xref: /freebsd/sys/dev/netmap/netmap.c (revision a6578a04e440f79f3b913660221caa9cde3e722c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this:
45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
46  *    a select()able file descriptor on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
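 *    The six steps above map onto a minimal userspace sketch like the
 *    following, using the legacy nmreq-based API from <net/netmap_user.h>
 *    (error handling omitted; the interface name "em0" is only an example):
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq nmr;
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strlcpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);				// step 2
 *	char *mem = mmap(NULL, nmr.nr_memsize,			// step 3
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);
 *	while (!nm_ring_empty(txr)) {				// step 4
 *		struct netmap_slot *slot = &txr->slot[txr->head];
 *		// fill NETMAP_BUF(txr, slot->buf_idx), set slot->len
 *		txr->head = txr->cur = nm_ring_next(txr, txr->head);
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, 1000);					// step 6
 *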
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
83   protecting multiple active senders for the same destination.
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out). (A conceptual sketch of the *xsync() busy-flag guard is given after this list.)
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100   the RING protection on rx_reset(); this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
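As a conceptual illustration only (the variable name below is made up; the
real code uses a small helper in netmap_kern.h), the "at most one instance
of *xsync() per ring" guarantee boils down to:

	if (atomic_testandset_int(&ring_busy, 0))
		return EBUSY;		// another thread is in *xsync()
	... run the txsync()/rxsync() body ...
	atomic_clear_int(&ring_busy, 1);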
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When a port is configured or deleted, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slots in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130 
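In rough pseudo-code (a simplification, not the actual nm_bdg_flush() logic):

	lock(dst_port);
	reserve a lease of N slots on the destination ring;
	unlock(dst_port);
	copy the packets into the leased slots;
	lock(dst_port);
	advance the ring past the completed leases;
	unlock(dst_port);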
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139  * >    a select()able file descriptor on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure, which will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support.
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must be created separately, and
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they had previously been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way, see VALE_CTL below
210  *
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	These actions do not involve the kernel.
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
272  *
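 *  For illustration only, an attach request with the newer nmreq_header
 *  API looks roughly as follows (error handling omitted; the switch and
 *  port names are just examples):
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_attach req;
 *	bzero(&hdr, sizeof(hdr));
 *	bzero(&req, sizeof(req));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_ATTACH;
 *	strlcpy(hdr.nr_name, "vale0:em0", sizeof(hdr.nr_name));
 *	req.reg.nr_mode = NR_REG_ALL_NIC;
 *	hdr.nr_body = (uintptr_t)&req;
 *	ioctl(fd, NIOCCTRL, &hdr);
 *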
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h>		/* BIOCIMMEDIATE */
449 #include <machine/bus.h>	/* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 
453 
454 #elif defined(linux)
455 
456 #include "bsd_glue.h"
457 
458 #elif defined(__APPLE__)
459 
460 #warning OSX support is only partial
461 #include "osx_glue.h"
462 
463 #elif defined (_WIN32)
464 
465 #include "win_glue.h"
466 
467 #else
468 
469 #error	Unsupported platform
470 
471 #endif /* unsupported */
472 
473 /*
474  * common headers
475  */
476 #include <net/netmap.h>
477 #include <dev/netmap/netmap_kern.h>
478 #include <dev/netmap/netmap_mem2.h>
479 
480 
481 /* user-controlled variables */
482 int netmap_verbose;
483 #ifdef CONFIG_NETMAP_DEBUG
484 int netmap_debug;
485 #endif /* CONFIG_NETMAP_DEBUG */
486 
487 static int netmap_no_timestamp; /* don't timestamp on rxsync */
488 int netmap_no_pendintr = 1;
489 int netmap_txsync_retry = 2;
490 static int netmap_fwd = 0;	/* force transparent forwarding */
491 
492 /*
493  * netmap_admode selects the netmap mode to use.
494  * Invalid values are reset to NETMAP_ADMODE_BEST
495  */
496 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
497 	NETMAP_ADMODE_NATIVE,	/* either native or none */
498 	NETMAP_ADMODE_GENERIC,	/* force generic */
499 	NETMAP_ADMODE_LAST };
500 static int netmap_admode = NETMAP_ADMODE_BEST;
501 
502 /* netmap_generic_mit controls mitigation of RX notifications for
503  * the generic netmap adapter. The value is a time interval in
504  * nanoseconds. */
505 int netmap_generic_mit = 100*1000;
506 
507 /* We use by default netmap-aware qdiscs with generic netmap adapters,
508  * even though there can be a small performance hit with hardware NICs.
509  * However, using the qdisc is the safer approach, for two reasons:
510  * 1) it prevents non-fifo qdiscs from breaking the TX notification
511  *    scheme, which is based on mbuf destructors when txqdisc is
512  *    not used.
513  * 2) it makes it possible to transmit over software devices that
514  *    change skb->dev, like bridge, veth, ...
515  *
516  * In any case, users looking for the best performance should
517  * use native adapters.
518  */
519 #ifdef linux
520 int netmap_generic_txqdisc = 1;
521 #endif
522 
523 /* Default number of slots and queues for generic adapters. */
524 int netmap_generic_ringsize = 1024;
525 int netmap_generic_rings = 1;
526 
527 /* Non-zero to enable checksum offloading in NIC drivers */
528 int netmap_generic_hwcsum = 0;
529 
530 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
531 int ptnet_vnet_hdr = 1;
532 
533 /*
534  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
535  * in some other operating systems
536  */
537 SYSBEGIN(main_init);
538 
539 SYSCTL_DECL(_dev_netmap);
540 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
541 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
542 		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
543 #ifdef CONFIG_NETMAP_DEBUG
544 SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
545 		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
546 #endif /* CONFIG_NETMAP_DEBUG */
547 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
548 		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
549 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
550 		0, "Always look for new received packets.");
551 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
552 		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
553 
554 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
555 		"Force NR_FORWARD mode");
556 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
557 		"Adapter mode. 0 selects the best option available, "
558 		"1 forces native adapter, 2 forces emulated adapter");
559 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
560 		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default), "
561 		"1 to enable checksum generation by the NIC");
562 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
563 		0, "RX notification interval in nanoseconds");
564 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
565 		&netmap_generic_ringsize, 0,
566 		"Number of per-ring slots for emulated netmap mode");
567 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
568 		&netmap_generic_rings, 0,
569 		"Number of TX/RX queues for emulated netmap adapters");
570 #ifdef linux
571 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
572 		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
573 #endif
574 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
575 		0, "Allow ptnet devices to use virtio-net headers");
576 
577 SYSEND;
578 
579 NMG_LOCK_T	netmap_global_lock;
580 
581 /*
582  * mark the ring as stopped, and run through the locks
583  * to make sure other users get to see it.
584  * stopped must be either NM_KR_STOPPED (for unbounded stop)
585  * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
586  */
587 static void
588 netmap_disable_ring(struct netmap_kring *kr, int stopped)
589 {
590 	nm_kr_stop(kr, stopped);
591 	// XXX check if nm_kr_stop is sufficient
592 	mtx_lock(&kr->q_lock);
593 	mtx_unlock(&kr->q_lock);
594 	nm_kr_put(kr);
595 }
596 
597 /* stop or enable a single ring */
598 void
599 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
600 {
601 	if (stopped)
602 		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
603 	else
604 		NMR(na, t)[ring_id]->nkr_stopped = 0;
605 }
606 
607 
608 /* stop or enable all the rings of na */
609 void
610 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
611 {
612 	int i;
613 	enum txrx t;
614 
615 	if (!nm_netmap_on(na))
616 		return;
617 
618 	for_rx_tx(t) {
619 		for (i = 0; i < netmap_real_rings(na, t); i++) {
620 			netmap_set_ring(na, i, t, stopped);
621 		}
622 	}
623 }
624 
625 /*
626  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
627  * to finish and prevents any new one from starting.  Call this before turning
628  * netmap mode off, or before removing the hardware rings (e.g., on module
629  * onload).
630  */
631 void
632 netmap_disable_all_rings(struct ifnet *ifp)
633 {
634 	if (NM_NA_VALID(ifp)) {
635 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
636 	}
637 }
638 
639 /*
640  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
641  * adapter's rings. In linux drivers, this should be placed near each
642  * napi_enable().
643  */
644 void
645 netmap_enable_all_rings(struct ifnet *ifp)
646 {
647 	if (NM_NA_VALID(ifp)) {
648 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
649 	}
650 }
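/*
 * A sketch of the typical driver-side usage of the two helpers above,
 * e.g. around a device reinitialization (the middle step is whatever the
 * specific driver needs to do):
 *
 *	netmap_disable_all_rings(ifp);
 *	... reset/reconfigure the hardware rings ...
 *	netmap_enable_all_rings(ifp);
 */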
651 
652 void
653 netmap_make_zombie(struct ifnet *ifp)
654 {
655 	if (NM_NA_VALID(ifp)) {
656 		struct netmap_adapter *na = NA(ifp);
657 		netmap_set_all_rings(na, NM_KR_LOCKED);
658 		na->na_flags |= NAF_ZOMBIE;
659 		netmap_set_all_rings(na, 0);
660 	}
661 }
662 
663 void
664 netmap_undo_zombie(struct ifnet *ifp)
665 {
666 	if (NM_NA_VALID(ifp)) {
667 		struct netmap_adapter *na = NA(ifp);
668 		if (na->na_flags & NAF_ZOMBIE) {
669 			netmap_set_all_rings(na, NM_KR_LOCKED);
670 			na->na_flags &= ~NAF_ZOMBIE;
671 			netmap_set_all_rings(na, 0);
672 		}
673 	}
674 }
675 
676 /*
677  * generic bound-checking function
678  */
679 u_int
680 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
681 {
682 	u_int oldv = *v;
683 	const char *op = NULL;
684 
685 	if (dflt < lo)
686 		dflt = lo;
687 	if (dflt > hi)
688 		dflt = hi;
689 	if (oldv < lo) {
690 		*v = dflt;
691 		op = "Bump";
692 	} else if (oldv > hi) {
693 		*v = hi;
694 		op = "Clamp";
695 	}
696 	if (op && msg)
697 		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
698 	return *v;
699 }
700 
701 
702 /*
703  * packet-dump function, user-supplied or static buffer.
704  * The destination buffer must be at least 30+4*len
705  */
706 const char *
707 nm_dump_buf(char *p, int len, int lim, char *dst)
708 {
709 	static char _dst[8192];
710 	int i, j, i0;
711 	static char hex[] ="0123456789abcdef";
712 	char *o;	/* output position */
713 
714 #define P_HI(x)	hex[((x) & 0xf0)>>4]
715 #define P_LO(x)	hex[((x) & 0xf)]
716 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
717 	if (!dst)
718 		dst = _dst;
719 	if (lim <= 0 || lim > len)
720 		lim = len;
721 	o = dst;
722 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
723 	o += strlen(o);
724 	/* hexdump routine */
725 	for (i = 0; i < lim; ) {
726 		sprintf(o, "%5d: ", i);
727 		o += strlen(o);
728 		memset(o, ' ', 48);
729 		i0 = i;
730 		for (j=0; j < 16 && i < lim; i++, j++) {
731 			o[j*3] = P_HI(p[i]);
732 			o[j*3+1] = P_LO(p[i]);
733 		}
734 		i = i0;
735 		for (j=0; j < 16 && i < lim; i++, j++)
736 			o[j + 48] = P_C(p[i]);
737 		o[j+48] = '\n';
738 		o += j+49;
739 	}
740 	*o = '\0';
741 #undef P_HI
742 #undef P_LO
743 #undef P_C
744 	return dst;
745 }
746 
747 
748 /*
749  * Fetch configuration from the device, to cope with dynamic
750  * reconfigurations after loading the module.
751  */
752 /* call with NMG_LOCK held */
753 int
754 netmap_update_config(struct netmap_adapter *na)
755 {
756 	struct nm_config_info info;
757 
758 	bzero(&info, sizeof(info));
759 	if (na->nm_config == NULL ||
760 	    na->nm_config(na, &info)) {
761 		/* take whatever we had at init time */
762 		info.num_tx_rings = na->num_tx_rings;
763 		info.num_tx_descs = na->num_tx_desc;
764 		info.num_rx_rings = na->num_rx_rings;
765 		info.num_rx_descs = na->num_rx_desc;
766 		info.rx_buf_maxsize = na->rx_buf_maxsize;
767 	}
768 
769 	if (na->num_tx_rings == info.num_tx_rings &&
770 	    na->num_tx_desc == info.num_tx_descs &&
771 	    na->num_rx_rings == info.num_rx_rings &&
772 	    na->num_rx_desc == info.num_rx_descs &&
773 	    na->rx_buf_maxsize == info.rx_buf_maxsize)
774 		return 0; /* nothing changed */
775 	if (na->active_fds == 0) {
776 		na->num_tx_rings = info.num_tx_rings;
777 		na->num_tx_desc = info.num_tx_descs;
778 		na->num_rx_rings = info.num_rx_rings;
779 		na->num_rx_desc = info.num_rx_descs;
780 		na->rx_buf_maxsize = info.rx_buf_maxsize;
781 		if (netmap_verbose)
782 			nm_prinf("configuration changed for %s: txring %d x %d, "
783 				"rxring %d x %d, rxbufsz %d",
784 				na->name, na->num_tx_rings, na->num_tx_desc,
785 				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
786 		return 0;
787 	}
788 	nm_prerr("WARNING: configuration changed for %s while active: "
789 		"txring %d x %d, rxring %d x %d, rxbufsz %d",
790 		na->name, info.num_tx_rings, info.num_tx_descs,
791 		info.num_rx_rings, info.num_rx_descs,
792 		info.rx_buf_maxsize);
793 	return 1;
794 }
795 
796 /* nm_sync callbacks for the host rings */
797 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
798 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
799 
800 /* create the krings array and initialize the fields common to all adapters.
801  * The array layout is this:
802  *
803  *                    +----------+
804  * na->tx_rings ----->|          | \
805  *                    |          |  } na->num_tx_rings
806  *                    |          | /
807  *                    +----------+
808  *                    |          |    host tx kring
809  * na->rx_rings ----> +----------+
810  *                    |          | \
811  *                    |          |  } na->num_rx_rings
812  *                    |          | /
813  *                    +----------+
814  *                    |          |    host rx kring
815  *                    +----------+
816  * na->tailroom ----->|          | \
817  *                    |          |  } tailroom bytes
818  *                    |          | /
819  *                    +----------+
820  *
821  * Note: for compatibility, host krings are created even when not needed.
822  * The tailroom space is currently used by vale ports for allocating leases.
823  */
824 /* call with NMG_LOCK held */
825 int
826 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
827 {
828 	u_int i, len, ndesc;
829 	struct netmap_kring *kring;
830 	u_int n[NR_TXRX];
831 	enum txrx t;
832 
833 	if (na->tx_rings != NULL) {
834 		if (netmap_debug & NM_DEBUG_ON)
835 			nm_prerr("warning: krings were already created");
836 		return 0;
837 	}
838 
839 	/* account for the (possibly fake) host rings */
840 	n[NR_TX] = netmap_all_rings(na, NR_TX);
841 	n[NR_RX] = netmap_all_rings(na, NR_RX);
842 
843 	len = (n[NR_TX] + n[NR_RX]) *
844 		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
845 		+ tailroom;
846 
847 	na->tx_rings = nm_os_malloc((size_t)len);
848 	if (na->tx_rings == NULL) {
849 		nm_prerr("Cannot allocate krings");
850 		return ENOMEM;
851 	}
852 	na->rx_rings = na->tx_rings + n[NR_TX];
853 	na->tailroom = na->rx_rings + n[NR_RX];
854 
855 	/* link the krings in the krings array */
856 	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
857 	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
858 		na->tx_rings[i] = kring;
859 		kring++;
860 	}
861 
862 	/*
863 	 * All fields in krings are 0 except the ones initialized below,
864 	 * but better be explicit on important kring fields.
865 	 */
866 	for_rx_tx(t) {
867 		ndesc = nma_get_ndesc(na, t);
868 		for (i = 0; i < n[t]; i++) {
869 			kring = NMR(na, t)[i];
870 			bzero(kring, sizeof(*kring));
871 			kring->na = na;
872 			kring->notify_na = na;
873 			kring->ring_id = i;
874 			kring->tx = t;
875 			kring->nkr_num_slots = ndesc;
876 			kring->nr_mode = NKR_NETMAP_OFF;
877 			kring->nr_pending_mode = NKR_NETMAP_OFF;
878 			if (i < nma_get_nrings(na, t)) {
879 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
880 			} else {
881 				if (!(na->na_flags & NAF_HOST_RINGS))
882 					kring->nr_kflags |= NKR_FAKERING;
883 				kring->nm_sync = (t == NR_TX ?
884 						netmap_txsync_to_host:
885 						netmap_rxsync_from_host);
886 			}
887 			kring->nm_notify = na->nm_notify;
888 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
889 			/*
890 			 * IMPORTANT: Always keep one slot empty.
891 			 */
892 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
893 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
894 					nm_txrx2str(t), i);
895 			ND("ktx %s h %d c %d t %d",
896 				kring->name, kring->rhead, kring->rcur, kring->rtail);
897 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
898 			nm_os_selinfo_init(&kring->si);
899 		}
900 		nm_os_selinfo_init(&na->si[t]);
901 	}
902 
903 
904 	return 0;
905 }
906 
907 
908 /* undo the actions performed by netmap_krings_create */
909 /* call with NMG_LOCK held */
910 void
911 netmap_krings_delete(struct netmap_adapter *na)
912 {
913 	struct netmap_kring **kring = na->tx_rings;
914 	enum txrx t;
915 
916 	if (na->tx_rings == NULL) {
917 		if (netmap_debug & NM_DEBUG_ON)
918 			nm_prerr("warning: krings were already deleted");
919 		return;
920 	}
921 
922 	for_rx_tx(t)
923 		nm_os_selinfo_uninit(&na->si[t]);
924 
925 	/* we rely on the krings layout described above */
926 	for ( ; kring != na->tailroom; kring++) {
927 		mtx_destroy(&(*kring)->q_lock);
928 		nm_os_selinfo_uninit(&(*kring)->si);
929 	}
930 	nm_os_free(na->tx_rings);
931 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
932 }
933 
934 
935 /*
936  * Destructor for NIC ports. They also have an mbuf queue
937  * on the rings connected to the host so we need to purge
938  * them first.
939  */
940 /* call with NMG_LOCK held */
941 void
942 netmap_hw_krings_delete(struct netmap_adapter *na)
943 {
944 	u_int lim = netmap_real_rings(na, NR_RX), i;
945 
946 	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
947 		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
948 		ND("destroy sw mbq with len %d", mbq_len(q));
949 		mbq_purge(q);
950 		mbq_safe_fini(q);
951 	}
952 	netmap_krings_delete(na);
953 }
954 
955 static void
956 netmap_mem_drop(struct netmap_adapter *na)
957 {
958 	int last = netmap_mem_deref(na->nm_mem, na);
959 	/* if the native allocator had been overridden on regif,
960 	 * restore it now and drop the temporary one
961 	 */
962 	if (last && na->nm_mem_prev) {
963 		netmap_mem_put(na->nm_mem);
964 		na->nm_mem = na->nm_mem_prev;
965 		na->nm_mem_prev = NULL;
966 	}
967 }
968 
969 /*
970  * Undo everything that was done in netmap_do_regif(). In particular,
971  * call nm_register(ifp,0) to stop netmap mode on the interface and
972  * revert to normal operation.
973  */
974 /* call with NMG_LOCK held */
975 static void netmap_unset_ringid(struct netmap_priv_d *);
976 static void netmap_krings_put(struct netmap_priv_d *);
977 void
978 netmap_do_unregif(struct netmap_priv_d *priv)
979 {
980 	struct netmap_adapter *na = priv->np_na;
981 
982 	NMG_LOCK_ASSERT();
983 	na->active_fds--;
984 	/* unset nr_pending_mode and possibly release exclusive mode */
985 	netmap_krings_put(priv);
986 
987 #ifdef	WITH_MONITOR
988 	/* XXX check whether we have to do something with monitor
989 	 * when rings change nr_mode. */
990 	if (na->active_fds <= 0) {
991 		/* walk through all the rings and tell any monitor
992 		 * that the port is going to exit netmap mode
993 		 */
994 		netmap_monitor_stop(na);
995 	}
996 #endif
997 
998 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
999 		na->nm_register(na, 0);
1000 	}
1001 
1002 	/* delete rings and buffers that are no longer needed */
1003 	netmap_mem_rings_delete(na);
1004 
1005 	if (na->active_fds <= 0) {	/* last instance */
1006 		/*
1007 		 * (TO CHECK) We enter here
1008 		 * when the last reference to this file descriptor goes
1009 		 * away. This means we cannot have any pending poll()
1010 		 * or interrupt routine operating on the structure.
1011 		 * XXX The file may be closed in a thread while
1012 		 * another thread is using it.
1013 		 * Linux keeps the file opened until the last reference
1014 		 * by any outstanding ioctl/poll or mmap is gone.
1015 		 * FreeBSD does not track mmap()s (but we do) and
1016 		 * wakes up any sleeping poll(). Need to check what
1017 		 * happens if the close() occurs while a concurrent
1018 		 * syscall is running.
1019 		 */
1020 		if (netmap_debug & NM_DEBUG_ON)
1021 			nm_prinf("deleting last instance for %s", na->name);
1022 
1023 		if (nm_netmap_on(na)) {
1024 			nm_prerr("BUG: netmap on while going to delete the krings");
1025 		}
1026 
1027 		na->nm_krings_delete(na);
1028 	}
1029 
1030 	/* possibly decrement counter of tx_si/rx_si users */
1031 	netmap_unset_ringid(priv);
1032 	/* delete the nifp */
1033 	netmap_mem_if_delete(na, priv->np_nifp);
1034 	/* drop the allocator */
1035 	netmap_mem_drop(na);
1036 	/* mark the priv as unregistered */
1037 	priv->np_na = NULL;
1038 	priv->np_nifp = NULL;
1039 }
1040 
1041 struct netmap_priv_d*
1042 netmap_priv_new(void)
1043 {
1044 	struct netmap_priv_d *priv;
1045 
1046 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1047 	if (priv == NULL)
1048 		return NULL;
1049 	priv->np_refs = 1;
1050 	nm_os_get_module();
1051 	return priv;
1052 }
1053 
1054 /*
1055  * Destructor of the netmap_priv_d, called when the fd is closed.
1056  * Action: undo all the things done by NIOCREGIF.
1057  * On FreeBSD we need to track whether there are active mmap()s,
1058  * and we use np_active_mmaps for that. On linux, the field is always 0.
1059  * The priv itself is freed here once its last reference is gone.
1060  *
1061  */
1062 /* call with NMG_LOCK held */
1063 void
1064 netmap_priv_delete(struct netmap_priv_d *priv)
1065 {
1066 	struct netmap_adapter *na = priv->np_na;
1067 
1068 	/* number of active references to this fd */
1069 	if (--priv->np_refs > 0) {
1070 		return;
1071 	}
1072 	nm_os_put_module();
1073 	if (na) {
1074 		netmap_do_unregif(priv);
1075 	}
1076 	netmap_unget_na(na, priv->np_ifp);
1077 	bzero(priv, sizeof(*priv));	/* for safety */
1078 	nm_os_free(priv);
1079 }
1080 
1081 
1082 /* call with NMG_LOCK *not* held */
1083 void
1084 netmap_dtor(void *data)
1085 {
1086 	struct netmap_priv_d *priv = data;
1087 
1088 	NMG_LOCK();
1089 	netmap_priv_delete(priv);
1090 	NMG_UNLOCK();
1091 }
1092 
1093 
1094 /*
1095  * Handlers for synchronization of the rings from/to the host stack.
1096  * These are associated to a network interface and are just another
1097  * ring pair managed by userspace.
1098  *
1099  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1100  * flags):
1101  *
1102  * - Before releasing buffers on hw RX rings, the application can mark
1103  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1104  *   will be forwarded to the host stack, just as if the application
1105  *   had moved them to the host TX ring.
1106  *
1107  * - Before releasing buffers on the host RX ring, the application can
1108  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1109  *   they will be forwarded to the hw TX rings, saving the application
1110  *   from doing the same task in user-space.
1111  *
1112  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1113  * flag, or globally with the netmap_fwd sysctl.
1114  *
1115  * The transfer NIC --> host is relatively easy, just encapsulate
1116  * into mbufs and we are done. The host --> NIC side is slightly
1117  * harder because there might not be room in the tx ring so it
1118  * might take a while before releasing the buffer.
1119  */
1120 
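/*
 * From the application's point of view, requesting transparent forwarding
 * of the packet in slot i of a hw RX ring is just (sketch; NR_FORWARD can
 * also be enabled globally with the dev.netmap.fwd sysctl):
 *
 *	ring->flags |= NR_FORWARD;
 *	ring->slot[i].flags |= NS_FORWARD;
 *	ring->head = ring->cur = nm_ring_next(ring, i);
 *	// the next NIOCRXSYNC/poll() pushes the marked buffer up
 */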
1121 
1122 /*
1123  * Pass a whole queue of mbufs to the host stack as coming from 'dst'.
1124  * We do not need to lock because the queue is private.
1125  * After this call the queue is empty.
1126  */
1127 static void
1128 netmap_send_up(struct ifnet *dst, struct mbq *q)
1129 {
1130 	struct mbuf *m;
1131 	struct mbuf *head = NULL, *prev = NULL;
1132 
1133 	/* Send packets up, outside the lock; head/prev machinery
1134 	 * is only useful for Windows. */
1135 	while ((m = mbq_dequeue(q)) != NULL) {
1136 		if (netmap_debug & NM_DEBUG_HOST)
1137 			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1138 		prev = nm_os_send_up(dst, m, prev);
1139 		if (head == NULL)
1140 			head = prev;
1141 	}
1142 	if (head)
1143 		nm_os_send_up(dst, NULL, head);
1144 	mbq_fini(q);
1145 }
1146 
1147 
1148 /*
1149  * Scan the buffers from hwcur to ring->head, and put a copy of those
1150  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1151  * Drop remaining packets in the unlikely event
1152  * of an mbuf shortage.
1153  */
1154 static void
1155 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1156 {
1157 	u_int const lim = kring->nkr_num_slots - 1;
1158 	u_int const head = kring->rhead;
1159 	u_int n;
1160 	struct netmap_adapter *na = kring->na;
1161 
1162 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1163 		struct mbuf *m;
1164 		struct netmap_slot *slot = &kring->ring->slot[n];
1165 
1166 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1167 			continue;
1168 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1169 			RD(5, "bad pkt at %d len %d", n, slot->len);
1170 			continue;
1171 		}
1172 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1173 		/* XXX TODO: adapt to the case of a multisegment packet */
1174 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1175 
1176 		if (m == NULL)
1177 			break;
1178 		mbq_enqueue(q, m);
1179 	}
1180 }
1181 
1182 static inline int
1183 _nm_may_forward(struct netmap_kring *kring)
1184 {
1185 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1186 		 kring->na->na_flags & NAF_HOST_RINGS &&
1187 		 kring->tx == NR_RX);
1188 }
1189 
1190 static inline int
1191 nm_may_forward_up(struct netmap_kring *kring)
1192 {
1193 	return	_nm_may_forward(kring) &&
1194 		 kring->ring_id != kring->na->num_rx_rings;
1195 }
1196 
1197 static inline int
1198 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1199 {
1200 	return	_nm_may_forward(kring) &&
1201 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1202 		 kring->ring_id == kring->na->num_rx_rings;
1203 }
1204 
1205 /*
1206  * Send to the NIC rings packets marked NS_FORWARD between
1207  * kring->nr_hwcur and kring->rhead.
1208  * Called under kring->rx_queue.lock on the sw rx ring.
1209  *
1210  * It can only be called if the user opened all the TX hw rings,
1211  * see NAF_CAN_FORWARD_DOWN flag.
1212  * We can touch the TX netmap rings (slots, head and cur) since
1213  * we are in poll/ioctl system call context, and the application
1214  * is not supposed to touch the ring (using a different thread)
1215  * during the execution of the system call.
1216  */
1217 static u_int
1218 netmap_sw_to_nic(struct netmap_adapter *na)
1219 {
1220 	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1221 	struct netmap_slot *rxslot = kring->ring->slot;
1222 	u_int i, rxcur = kring->nr_hwcur;
1223 	u_int const head = kring->rhead;
1224 	u_int const src_lim = kring->nkr_num_slots - 1;
1225 	u_int sent = 0;
1226 
1227 	/* scan rings to find space, then fill as much as possible */
1228 	for (i = 0; i < na->num_tx_rings; i++) {
1229 		struct netmap_kring *kdst = na->tx_rings[i];
1230 		struct netmap_ring *rdst = kdst->ring;
1231 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1232 
1233 		/* XXX do we trust ring or kring->rcur,rtail ? */
1234 		for (; rxcur != head && !nm_ring_empty(rdst);
1235 		     rxcur = nm_next(rxcur, src_lim) ) {
1236 			struct netmap_slot *src, *dst, tmp;
1237 			u_int dst_head = rdst->head;
1238 
1239 			src = &rxslot[rxcur];
1240 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1241 				continue;
1242 
1243 			sent++;
1244 
1245 			dst = &rdst->slot[dst_head];
1246 
1247 			tmp = *src;
1248 
1249 			src->buf_idx = dst->buf_idx;
1250 			src->flags = NS_BUF_CHANGED;
1251 
1252 			dst->buf_idx = tmp.buf_idx;
1253 			dst->len = tmp.len;
1254 			dst->flags = NS_BUF_CHANGED;
1255 
1256 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1257 		}
1258 		/* if (sent) XXX txsync ? it would be just an optimization */
1259 	}
1260 	return sent;
1261 }
1262 
1263 
1264 /*
1265  * netmap_txsync_to_host() passes packets up. We are called from a
1266  * system call in user process context, and the only contention
1267  * can be among multiple user threads erroneously calling
1268  * this routine concurrently.
1269  */
1270 static int
1271 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1272 {
1273 	struct netmap_adapter *na = kring->na;
1274 	u_int const lim = kring->nkr_num_slots - 1;
1275 	u_int const head = kring->rhead;
1276 	struct mbq q;
1277 
1278 	/* Take packets from hwcur to head and pass them up.
1279 	 * Force hwcur = head since netmap_grab_packets() stops at head
1280 	 */
1281 	mbq_init(&q);
1282 	netmap_grab_packets(kring, &q, 1 /* force */);
1283 	ND("have %d pkts in queue", mbq_len(&q));
1284 	kring->nr_hwcur = head;
1285 	kring->nr_hwtail = head + lim;
1286 	if (kring->nr_hwtail > lim)
1287 		kring->nr_hwtail -= lim + 1;
1288 
1289 	netmap_send_up(na->ifp, &q);
1290 	return 0;
1291 }
1292 
1293 
1294 /*
1295  * rxsync backend for packets coming from the host stack.
1296  * They have been put in kring->rx_queue by netmap_transmit().
1297  * We protect access to the kring using kring->rx_queue.lock
1298  *
1299  * It also moves to the NIC hw rings any packet the user has marked
1300  * for transparent-mode forwarding, then sets the NR_FORWARD
1301  * flag in the kring to let the caller push them out
1302  */
1303 static int
1304 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1305 {
1306 	struct netmap_adapter *na = kring->na;
1307 	struct netmap_ring *ring = kring->ring;
1308 	u_int nm_i, n;
1309 	u_int const lim = kring->nkr_num_slots - 1;
1310 	u_int const head = kring->rhead;
1311 	int ret = 0;
1312 	struct mbq *q = &kring->rx_queue, fq;
1313 
1314 	mbq_init(&fq); /* fq holds packets to be freed */
1315 
1316 	mbq_lock(q);
1317 
1318 	/* First part: import newly received packets */
1319 	n = mbq_len(q);
1320 	if (n) { /* grab packets from the queue */
1321 		struct mbuf *m;
1322 		uint32_t stop_i;
1323 
1324 		nm_i = kring->nr_hwtail;
1325 		stop_i = nm_prev(kring->nr_hwcur, lim);
1326 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1327 			int len = MBUF_LEN(m);
1328 			struct netmap_slot *slot = &ring->slot[nm_i];
1329 
1330 			m_copydata(m, 0, len, NMB(na, slot));
1331 			ND("nm %d len %d", nm_i, len);
1332 			if (netmap_debug & NM_DEBUG_HOST)
1333 				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1334 
1335 			slot->len = len;
1336 			slot->flags = 0;
1337 			nm_i = nm_next(nm_i, lim);
1338 			mbq_enqueue(&fq, m);
1339 		}
1340 		kring->nr_hwtail = nm_i;
1341 	}
1342 
1343 	/*
1344 	 * Second part: skip past packets that userspace has released.
1345 	 */
1346 	nm_i = kring->nr_hwcur;
1347 	if (nm_i != head) { /* something was released */
1348 		if (nm_may_forward_down(kring, flags)) {
1349 			ret = netmap_sw_to_nic(na);
1350 			if (ret > 0) {
1351 				kring->nr_kflags |= NR_FORWARD;
1352 				ret = 0;
1353 			}
1354 		}
1355 		kring->nr_hwcur = head;
1356 	}
1357 
1358 	mbq_unlock(q);
1359 
1360 	mbq_purge(&fq);
1361 	mbq_fini(&fq);
1362 
1363 	return ret;
1364 }
1365 
1366 
1367 /* Get a netmap adapter for the port.
1368  *
1369  * If it is possible to satisfy the request, return 0
1370  * with *na containing the netmap adapter found.
1371  * Otherwise return an error code, with *na containing NULL.
1372  *
1373  * When the port is attached to a bridge, we always return
1374  * EBUSY.
1375  * Otherwise, if the port is already bound to a file descriptor,
1376  * then we unconditionally return the existing adapter into *na.
1377  * In all the other cases, we return (into *na) either native,
1378  * generic or NULL, according to the following table:
1379  *
1380  *					native_support
1381  * active_fds   dev.netmap.admode         YES     NO
1382  * -------------------------------------------------------
1383  *    >0              *                 NA(ifp) NA(ifp)
1384  *
1385  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1386  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1387  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1388  *
1389  */
1390 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1391 int
1392 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1393 {
1394 	/* generic support */
1395 	int i = netmap_admode;	/* Take a snapshot. */
1396 	struct netmap_adapter *prev_na;
1397 	int error = 0;
1398 
1399 	*na = NULL; /* default */
1400 
1401 	/* reset in case of invalid value */
1402 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1403 		i = netmap_admode = NETMAP_ADMODE_BEST;
1404 
1405 	if (NM_NA_VALID(ifp)) {
1406 		prev_na = NA(ifp);
1407 		/* If an adapter already exists, return it if
1408 		 * there are active file descriptors or if
1409 		 * netmap is not forced to use generic
1410 		 * adapters.
1411 		 */
1412 		if (NETMAP_OWNED_BY_ANY(prev_na)
1413 			|| i != NETMAP_ADMODE_GENERIC
1414 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1415 #ifdef WITH_PIPES
1416 			/* ugly, but we cannot allow an adapter switch
1417 			 * if some pipe is referring to this one
1418 			 */
1419 			|| prev_na->na_next_pipe > 0
1420 #endif
1421 		) {
1422 			*na = prev_na;
1423 			goto assign_mem;
1424 		}
1425 	}
1426 
1427 	/* If there isn't native support and netmap is not allowed
1428 	 * to use generic adapters, we cannot satisfy the request.
1429 	 */
1430 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1431 		return EOPNOTSUPP;
1432 
1433 	/* Otherwise, create a generic adapter and return it,
1434 	 * saving the previously used netmap adapter, if any.
1435 	 *
1436 	 * Note that here 'prev_na', if not NULL, MUST be a
1437 	 * native adapter, and CANNOT be a generic one. This is
1438 	 * true because generic adapters are created on demand, and
1439 	 * destroyed when not used anymore. Therefore, if the adapter
1440 	 * currently attached to an interface 'ifp' is generic, it
1441 	 * must be that
1442 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1443 	 * Consequently, if NA(ifp) is generic, we will enter one of
1444 	 * the branches above. This ensures that we never override
1445 	 * a generic adapter with another generic adapter.
1446 	 */
1447 	error = generic_netmap_attach(ifp);
1448 	if (error)
1449 		return error;
1450 
1451 	*na = NA(ifp);
1452 
1453 assign_mem:
1454 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1455 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1456 		(*na)->nm_mem_prev = (*na)->nm_mem;
1457 		(*na)->nm_mem = netmap_mem_get(nmd);
1458 	}
1459 
1460 	return 0;
1461 }
1462 
1463 /*
1464  * MUST BE CALLED UNDER NMG_LOCK()
1465  *
1466  * Get a refcounted reference to a netmap adapter attached
1467  * to the interface specified by req.
1468  * This is always called in the execution of an ioctl().
1469  *
1470  * Return ENXIO if the interface specified by the request does
1471  * not exist, ENOTSUP if netmap is not supported by the interface,
1472  * EBUSY if the interface is already attached to a bridge,
1473  * EINVAL if parameters are invalid, ENOMEM if needed resources
1474  * could not be allocated.
1475  * If successful, hold a reference to the netmap adapter.
1476  *
1477  * If the interface specified by req is a system one, also keep
1478  * a reference to it and return a valid *ifp.
1479  */
1480 int
1481 netmap_get_na(struct nmreq_header *hdr,
1482 	      struct netmap_adapter **na, struct ifnet **ifp,
1483 	      struct netmap_mem_d *nmd, int create)
1484 {
1485 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1486 	int error = 0;
1487 	struct netmap_adapter *ret = NULL;
1488 	int nmd_ref = 0;
1489 
1490 	*na = NULL;     /* default return value */
1491 	*ifp = NULL;
1492 
1493 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1494 		return EINVAL;
1495 	}
1496 
1497 	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1498 			req->nr_mode == NR_REG_PIPE_SLAVE) {
1499 		/* Do not accept deprecated pipe modes. */
1500 		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1501 		return EINVAL;
1502 	}
1503 
1504 	NMG_LOCK_ASSERT();
1505 
1506 	/* if the request contains a memid, try to find the
1507 	 * corresponding memory region
1508 	 */
1509 	if (nmd == NULL && req->nr_mem_id) {
1510 		nmd = netmap_mem_find(req->nr_mem_id);
1511 		if (nmd == NULL)
1512 			return EINVAL;
1513 		/* keep the reference */
1514 		nmd_ref = 1;
1515 	}
1516 
1517 	/* We cascade through all possible types of netmap adapter.
1518 	 * All netmap_get_*_na() functions return an error and an na,
1519 	 * with the following combinations:
1520 	 *
1521 	 * error    na
1522 	 *   0	   NULL		type doesn't match
1523 	 *  !0	   NULL		type matches, but na creation/lookup failed
1524 	 *   0	  !NULL		type matches and na created/found
1525 	 *  !0    !NULL		impossible
1526 	 */
1527 	error = netmap_get_null_na(hdr, na, nmd, create);
1528 	if (error || *na != NULL)
1529 		goto out;
1530 
1531 	/* try to see if this is a monitor port */
1532 	error = netmap_get_monitor_na(hdr, na, nmd, create);
1533 	if (error || *na != NULL)
1534 		goto out;
1535 
1536 	/* try to see if this is a pipe port */
1537 	error = netmap_get_pipe_na(hdr, na, nmd, create);
1538 	if (error || *na != NULL)
1539 		goto out;
1540 
1541 	/* try to see if this is a bridge port */
1542 	error = netmap_get_vale_na(hdr, na, nmd, create);
1543 	if (error)
1544 		goto out;
1545 
1546 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1547 		goto out;
1548 
1549 	/*
1550 	 * This must be a hardware na, lookup the name in the system.
1551 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1552 	 * This may still be a tap, a veth/epair, or even a
1553 	 * persistent VALE port.
1554 	 */
1555 	*ifp = ifunit_ref(hdr->nr_name);
1556 	if (*ifp == NULL) {
1557 		error = ENXIO;
1558 		goto out;
1559 	}
1560 
1561 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1562 	if (error)
1563 		goto out;
1564 
1565 	*na = ret;
1566 	netmap_adapter_get(ret);
1567 
1568 out:
1569 	if (error) {
1570 		if (ret)
1571 			netmap_adapter_put(ret);
1572 		if (*ifp) {
1573 			if_rele(*ifp);
1574 			*ifp = NULL;
1575 		}
1576 	}
1577 	if (nmd_ref)
1578 		netmap_mem_put(nmd);
1579 
1580 	return error;
1581 }
1582 
1583 /* undo netmap_get_na() */
1584 void
1585 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1586 {
1587 	if (ifp)
1588 		if_rele(ifp);
1589 	if (na)
1590 		netmap_adapter_put(na);
1591 }
1592 
1593 
1594 #define NM_FAIL_ON(t) do {						\
1595 	if (unlikely(t)) {						\
1596 		RD(5, "%s: fail '" #t "' "				\
1597 			"h %d c %d t %d "				\
1598 			"rh %d rc %d rt %d "				\
1599 			"hc %d ht %d",					\
1600 			kring->name,					\
1601 			head, cur, ring->tail,				\
1602 			kring->rhead, kring->rcur, kring->rtail,	\
1603 			kring->nr_hwcur, kring->nr_hwtail);		\
1604 		return kring->nkr_num_slots;				\
1605 	}								\
1606 } while (0)
1607 
1608 /*
1609  * validate parameters on entry for *_txsync()
1610  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1611  * in case of error.
1612  *
1613  * rhead, rcur and rtail=hwtail are stored from previous round.
1614  * hwcur is the next packet to send to the ring.
1615  *
1616  * We want
1617  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1618  *
1619  * hwcur, rhead, rtail and hwtail are reliable
1620  */
1621 u_int
1622 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1623 {
1624 	u_int head = ring->head; /* read only once */
1625 	u_int cur = ring->cur; /* read only once */
1626 	u_int n = kring->nkr_num_slots;
1627 
1628 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1629 		kring->name,
1630 		kring->nr_hwcur, kring->nr_hwtail,
1631 		ring->head, ring->cur, ring->tail);
1632 #if 1 /* kernel sanity checks; but we can trust the kring. */
1633 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1634 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1635 #endif /* kernel sanity checks */
1636 	/*
1637 	 * user sanity checks. We only use head.
1638 	 * A, B, ... are possible positions for head:
1639 	 *
1640 	 *  0    A  rhead   B  rtail   C  n-1
1641 	 *  0    D  rtail   E  rhead   F  n-1
1642 	 *
1643 	 * B, F, D are valid. A, C, E are wrong
1644 	 */
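	/*
	 * Worked example (illustration only): with n = 1024, rhead = 100 and
	 * rtail = 900 (first line above), head = 500 falls in region B and is
	 * accepted, while head = 50 (region A) or head = 950 (region C) makes
	 * NM_FAIL_ON() trigger. In the wrapped case rtail = 100, rhead = 900
	 * (second line), head = 50 or head = 950 is accepted and head = 500
	 * is rejected.
	 */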
1645 	if (kring->rtail >= kring->rhead) {
1646 		/* want rhead <= head <= rtail */
1647 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1648 		/* and also head <= cur <= rtail */
1649 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1650 	} else { /* here rtail < rhead */
1651 		/* we need head outside rtail .. rhead */
1652 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1653 
1654 		/* two cases now: head <= rtail or head >= rhead  */
1655 		if (head <= kring->rtail) {
1656 			/* want head <= cur <= rtail */
1657 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1658 		} else { /* head >= rhead */
1659 			/* cur must be outside rtail..head */
1660 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1661 		}
1662 	}
1663 	if (ring->tail != kring->rtail) {
1664 		RD(5, "%s tail overwritten was %d need %d", kring->name,
1665 			ring->tail, kring->rtail);
1666 		ring->tail = kring->rtail;
1667 	}
1668 	kring->rhead = head;
1669 	kring->rcur = cur;
1670 	return head;
1671 }
1672 
1673 
1674 /*
1675  * validate parameters on entry for *_rxsync()
1676  * Returns ring->head if ok, kring->nkr_num_slots on error.
1677  *
1678  * For a valid configuration,
1679  * hwcur <= head <= cur <= tail <= hwtail
1680  *
1681  * We only consider head and cur.
1682  * hwcur and hwtail are reliable.
1683  *
1684  */
1685 u_int
1686 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1687 {
1688 	uint32_t const n = kring->nkr_num_slots;
1689 	uint32_t head, cur;
1690 
1691 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1692 		kring->name,
1693 		kring->nr_hwcur, kring->nr_hwtail,
1694 		ring->head, ring->cur, ring->tail);
1695 	/*
1696 	 * Before storing the new values, we should check they do not
1697 	 * move backwards. However:
1698 	 * - head is not an issue because the previous value is hwcur;
1699 	 * - cur could in principle go back, however it does not matter
1700 	 *   because we are processing a brand new rxsync()
1701 	 */
1702 	cur = kring->rcur = ring->cur;	/* read only once */
1703 	head = kring->rhead = ring->head;	/* read only once */
1704 #if 1 /* kernel sanity checks */
1705 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1706 #endif /* kernel sanity checks */
1707 	/* user sanity checks */
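	/*
	 * Worked example (illustration only): with nkr_num_slots = 512,
	 * hwcur = 400 and hwtail = 100 (the wrapped branch below), head = 450
	 * or head = 50 passes, while head = 200 lies in the forbidden region
	 * hwtail..hwcur and fails the prologue.
	 */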
1708 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1709 		/* want hwcur <= rhead <= hwtail */
1710 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1711 		/* and also rhead <= rcur <= hwtail */
1712 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1713 	} else {
1714 		/* we need rhead outside hwtail..hwcur */
1715 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1716 		/* two cases now: head <= hwtail or head >= hwcur  */
1717 		if (head <= kring->nr_hwtail) {
1718 			/* want head <= cur <= hwtail */
1719 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1720 		} else {
1721 			/* cur must be outside hwtail..head */
1722 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1723 		}
1724 	}
1725 	if (ring->tail != kring->rtail) {
1726 		RD(5, "%s tail overwritten was %d need %d",
1727 			kring->name,
1728 			ring->tail, kring->rtail);
1729 		ring->tail = kring->rtail;
1730 	}
1731 	return head;
1732 }
1733 
1734 
1735 /*
1736  * Error routine called when txsync/rxsync detects an error.
1737  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1738  * Return 1 on reinit.
1739  *
1740  * This routine is only called by the upper half of the kernel.
1741  * It only reads hwcur (which is changed only by the upper half, too)
1742  * and hwtail (which may be changed by the lower half, but only on
1743  * a tx ring and only to increase it, so any error will be recovered
1744  * on the next call). For the above, we don't strictly need to call
1745  * it under lock.
1746  */
1747 int
1748 netmap_ring_reinit(struct netmap_kring *kring)
1749 {
1750 	struct netmap_ring *ring = kring->ring;
1751 	u_int i, lim = kring->nkr_num_slots - 1;
1752 	int errors = 0;
1753 
1754 	// XXX KASSERT nm_kr_tryget
1755 	RD(10, "called for %s", kring->name);
1756 	// XXX probably wrong to trust userspace
1757 	kring->rhead = ring->head;
1758 	kring->rcur  = ring->cur;
1759 	kring->rtail = ring->tail;
1760 
1761 	if (ring->cur > lim)
1762 		errors++;
1763 	if (ring->head > lim)
1764 		errors++;
1765 	if (ring->tail > lim)
1766 		errors++;
1767 	for (i = 0; i <= lim; i++) {
1768 		u_int idx = ring->slot[i].buf_idx;
1769 		u_int len = ring->slot[i].len;
1770 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1771 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1772 			ring->slot[i].buf_idx = 0;
1773 			ring->slot[i].len = 0;
1774 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1775 			ring->slot[i].len = 0;
1776 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1777 		}
1778 	}
1779 	if (errors) {
1780 		RD(10, "total %d errors", errors);
1781 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1782 			kring->name,
1783 			ring->cur, kring->nr_hwcur,
1784 			ring->tail, kring->nr_hwtail);
1785 		ring->head = kring->rhead = kring->nr_hwcur;
1786 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1787 		ring->tail = kring->rtail = kring->nr_hwtail;
1788 	}
1789 	return (errors ? 1 : 0);
1790 }
1791 
1792 /* interpret the ringid and flags fields of an nmreq, by translating them
1793  * into a pair of intervals of ring indices:
1794  *
1795  * [priv->np_txqfirst, priv->np_txqlast) and
1796  * [priv->np_rxqfirst, priv->np_rxqlast)
1797  *
1798  */
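/*
 * Examples (a sketch, assuming a NIC with 4 hw tx/rx ring pairs and one
 * host ring pair, so that netmap_all_rings() returns 5):
 *
 *	NR_REG_ALL_NIC			-> tx [0,4)  rx [0,4)
 *	NR_REG_ONE_NIC, nr_ringid = 2	-> tx [2,3)  rx [2,3)
 *	NR_REG_SW			-> tx [4,5)  rx [4,5)
 *	NR_REG_NIC_SW			-> tx [0,5)  rx [0,5)
 *
 * The NR_TX_RINGS_ONLY/NR_RX_RINGS_ONLY flags collapse the excluded
 * direction to the empty interval [0,0).
 */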
1799 int
1800 netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1801 			uint16_t nr_ringid, uint64_t nr_flags)
1802 {
1803 	struct netmap_adapter *na = priv->np_na;
1804 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1805 	enum txrx t;
1806 	u_int j;
1807 
1808 	for_rx_tx(t) {
1809 		if (nr_flags & excluded_direction[t]) {
1810 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1811 			continue;
1812 		}
1813 		switch (nr_mode) {
1814 		case NR_REG_ALL_NIC:
1815 		case NR_REG_NULL:
1816 			priv->np_qfirst[t] = 0;
1817 			priv->np_qlast[t] = nma_get_nrings(na, t);
1818 			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1819 				priv->np_qfirst[t], priv->np_qlast[t]);
1820 			break;
1821 		case NR_REG_SW:
1822 		case NR_REG_NIC_SW:
1823 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1824 				nm_prerr("host rings not supported");
1825 				return EINVAL;
1826 			}
1827 			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1828 				nma_get_nrings(na, t) : 0);
1829 			priv->np_qlast[t] = netmap_all_rings(na, t);
1830 			ND("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1831 				nm_txrx2str(t),
1832 				priv->np_qfirst[t], priv->np_qlast[t]);
1833 			break;
1834 		case NR_REG_ONE_NIC:
1835 			if (nr_ringid >= na->num_tx_rings &&
1836 					nr_ringid >= na->num_rx_rings) {
1837 				nm_prerr("invalid ring id %d", nr_ringid);
1838 				return EINVAL;
1839 			}
1840 			/* if not enough rings, use the first one */
1841 			j = nr_ringid;
1842 			if (j >= nma_get_nrings(na, t))
1843 				j = 0;
1844 			priv->np_qfirst[t] = j;
1845 			priv->np_qlast[t] = j + 1;
1846 			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1847 				priv->np_qfirst[t], priv->np_qlast[t]);
1848 			break;
1849 		default:
1850 			nm_prerr("invalid regif type %d", nr_mode);
1851 			return EINVAL;
1852 		}
1853 	}
1854 	priv->np_flags = nr_flags;
1855 
1856 	/* Allow transparent forwarding mode in the host --> nic
1857 	 * direction only if all the TX hw rings have been opened. */
1858 	if (priv->np_qfirst[NR_TX] == 0 &&
1859 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1860 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1861 	}
1862 
1863 	if (netmap_verbose) {
1864 		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1865 			na->name,
1866 			priv->np_qfirst[NR_TX],
1867 			priv->np_qlast[NR_TX],
1868 			priv->np_qfirst[NR_RX],
1869 			priv->np_qlast[NR_RX],
1870 			nr_ringid);
1871 	}
1872 	return 0;
1873 }
1874 
1875 
1876 /*
1877  * Set the ring ID. For devices with a single queue, a request
1878  * for all rings is the same as a single ring.
1879  */
1880 static int
1881 netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1882 		uint16_t nr_ringid, uint64_t nr_flags)
1883 {
1884 	struct netmap_adapter *na = priv->np_na;
1885 	int error;
1886 	enum txrx t;
1887 
1888 	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
1889 	if (error) {
1890 		return error;
1891 	}
1892 
1893 	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1894 
1895 	/* optimization: count the users registered for more than
1896 	 * one ring, which are the ones sleeping on the global queue.
1897 	 * The default netmap_notify() callback will then
1898 	 * avoid signaling the global queue if nobody is using it
1899 	 */
1900 	for_rx_tx(t) {
1901 		if (nm_si_user(priv, t))
1902 			na->si_users[t]++;
1903 	}
1904 	return 0;
1905 }
1906 
1907 static void
1908 netmap_unset_ringid(struct netmap_priv_d *priv)
1909 {
1910 	struct netmap_adapter *na = priv->np_na;
1911 	enum txrx t;
1912 
1913 	for_rx_tx(t) {
1914 		if (nm_si_user(priv, t))
1915 			na->si_users[t]--;
1916 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1917 	}
1918 	priv->np_flags = 0;
1919 	priv->np_txpoll = 0;
1920 	priv->np_kloop_state = 0;
1921 }
1922 
1923 
1924 /* Set the nr_pending_mode for the requested rings.
1925  * If requested, also try to get exclusive access to the rings, provided
1926  * the rings we want to bind are not exclusively owned by a previous bind.
1927  */
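/* For instance (illustration only): if one file descriptor has bound ring 0
 * without NR_EXCLUSIVE, a second descriptor may still bind ring 0, but a
 * bind requesting NR_EXCLUSIVE on ring 0 fails with EBUSY; once a ring is
 * held with NR_EXCLUSIVE, any further bind that includes it fails with
 * EBUSY as well.
 */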
1928 static int
1929 netmap_krings_get(struct netmap_priv_d *priv)
1930 {
1931 	struct netmap_adapter *na = priv->np_na;
1932 	u_int i;
1933 	struct netmap_kring *kring;
1934 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1935 	enum txrx t;
1936 
1937 	if (netmap_debug & NM_DEBUG_ON)
1938 		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
1939 			na->name,
1940 			priv->np_qfirst[NR_TX],
1941 			priv->np_qlast[NR_TX],
1942 			priv->np_qfirst[NR_RX],
1943 			priv->np_qlast[NR_RX]);
1944 
1945 	/* first round: check that none of the requested rings
1946 	 * is already exclusively owned, and that we do not request
1947 	 * exclusive ownership of rings that are already in use
1948 	 */
1949 	for_rx_tx(t) {
1950 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1951 			kring = NMR(na, t)[i];
1952 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1953 			    (kring->users && excl))
1954 			{
1955 				ND("ring %s busy", kring->name);
1956 				return EBUSY;
1957 			}
1958 		}
1959 	}
1960 
1961 	/* second round: increment usage count (possibly marking them
1962 	 * as exclusive) and set the nr_pending_mode
1963 	 */
1964 	for_rx_tx(t) {
1965 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1966 			kring = NMR(na, t)[i];
1967 			kring->users++;
1968 			if (excl)
1969 				kring->nr_kflags |= NKR_EXCLUSIVE;
1970 			kring->nr_pending_mode = NKR_NETMAP_ON;
1971 		}
1972 	}
1973 
1974 	return 0;
1975 
1976 }
1977 
1978 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1979  * if it was asked on regif, and unsetting the nr_pending_mode if we are
1980  * the last users of the involved rings. */
1981 static void
1982 netmap_krings_put(struct netmap_priv_d *priv)
1983 {
1984 	struct netmap_adapter *na = priv->np_na;
1985 	u_int i;
1986 	struct netmap_kring *kring;
1987 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1988 	enum txrx t;
1989 
1990 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1991 			na->name,
1992 			priv->np_qfirst[NR_TX],
1993 			priv->np_qlast[NR_TX],
1994 			priv->np_qfirst[NR_RX],
1995 			priv->np_qlast[NR_RX]);
1996 
1997 	for_rx_tx(t) {
1998 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1999 			kring = NMR(na, t)[i];
2000 			if (excl)
2001 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
2002 			kring->users--;
2003 			if (kring->users == 0)
2004 				kring->nr_pending_mode = NKR_NETMAP_OFF;
2005 		}
2006 	}
2007 }
2008 
2009 static int
2010 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2011 {
2012 	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2013 }
2014 
2015 /* Validate the CSB entries for both directions (atok and ktoa).
2016  * To be called under NMG_LOCK(). */
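/* The application is expected to pass two contiguous arrays, one of
 * nm_csb_atok and one of nm_csb_ktoa entries, with one entry per bound
 * ring, TX rings first and RX rings after them. A sketch of the layout
 * validated below (illustration only, ntx/nrx = number of bound TX/RX rings):
 *
 *	csb_atok[0 .. ntx-1]		TX rings, application -> kernel
 *	csb_atok[ntx .. ntx+nrx-1]	RX rings, application -> kernel
 *	csb_ktoa[0 .. ntx-1]		TX rings, kernel -> application
 *	csb_ktoa[ntx .. ntx+nrx-1]	RX rings, kernel -> application
 *
 * Both base addresses must be aligned to their entry size.
 */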
2017 static int
2018 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2019 {
2020 	struct nm_csb_atok *csb_atok_base =
2021 		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2022 	struct nm_csb_ktoa *csb_ktoa_base =
2023 		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2024 	enum txrx t;
2025 	int num_rings[NR_TXRX], tot_rings;
2026 	size_t entry_size[2];
2027 	void *csb_start[2];
2028 	int i;
2029 
2030 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2031 		nm_prerr("Cannot update CSB while kloop is running");
2032 		return EBUSY;
2033 	}
2034 
2035 	tot_rings = 0;
2036 	for_rx_tx(t) {
2037 		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2038 		tot_rings += num_rings[t];
2039 	}
2040 	if (tot_rings <= 0)
2041 		return 0;
2042 
2043 	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2044 		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2045 		return EINVAL;
2046 	}
2047 
2048 	entry_size[0] = sizeof(*csb_atok_base);
2049 	entry_size[1] = sizeof(*csb_ktoa_base);
2050 	csb_start[0] = (void *)csb_atok_base;
2051 	csb_start[1] = (void *)csb_ktoa_base;
2052 
2053 	for (i = 0; i < 2; i++) {
2054 		/* On Linux we could use access_ok() to simplify
2055 		 * the validation. However, the advantage of
2056 		 * this approach is that it also works on
2057 		 * FreeBSD. */
2058 		size_t csb_size = tot_rings * entry_size[i];
2059 		void *tmp;
2060 		int err;
2061 
2062 		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2063 			nm_prerr("Unaligned CSB address");
2064 			return EINVAL;
2065 		}
2066 
2067 		tmp = nm_os_malloc(csb_size);
2068 		if (!tmp)
2069 			return ENOMEM;
2070 		if (i == 0) {
2071 			/* Application --> kernel direction. */
2072 			err = copyin(csb_start[i], tmp, csb_size);
2073 		} else {
2074 			/* Kernel --> application direction. */
2075 			memset(tmp, 0, csb_size);
2076 			err = copyout(tmp, csb_start[i], csb_size);
2077 		}
2078 		nm_os_free(tmp);
2079 		if (err) {
2080 			nm_prerr("Invalid CSB address");
2081 			return err;
2082 		}
2083 	}
2084 
2085 	priv->np_csb_atok_base = csb_atok_base;
2086 	priv->np_csb_ktoa_base = csb_ktoa_base;
2087 
2088 	/* Initialize the CSB. */
2089 	for_rx_tx(t) {
2090 		for (i = 0; i < num_rings[t]; i++) {
2091 			struct netmap_kring *kring =
2092 				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2093 			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2094 			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2095 
2096 			if (t == NR_RX) {
2097 				csb_atok += num_rings[NR_TX];
2098 				csb_ktoa += num_rings[NR_TX];
2099 			}
2100 
2101 			CSB_WRITE(csb_atok, head, kring->rhead);
2102 			CSB_WRITE(csb_atok, cur, kring->rcur);
2103 			CSB_WRITE(csb_atok, appl_need_kick, 1);
2104 			CSB_WRITE(csb_atok, sync_flags, 1);
2105 			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2106 			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2107 			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2108 
2109 			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2110 				"hwcur %u, hwtail %u", kring->name,
2111 				kring->rhead, kring->rcur, kring->nr_hwcur,
2112 				kring->nr_hwtail);
2113 		}
2114 	}
2115 
2116 	return 0;
2117 }
2118 
2119 /*
2120  * Possibly move the interface to netmap mode. On success it returns 0
2121  * and publishes the new netmap_if via priv->np_nifp; otherwise an error code.
2122  * This must be called with NMG_LOCK held.
2123  *
2124  * The following na callbacks are called in the process:
2125  *
2126  * na->nm_config()			[by netmap_update_config]
2127  * (get current number and size of rings)
2128  *
2129  *  	We have a generic one for linux (netmap_linux_config).
2130  *  	The bwrap has to override this, since it has to forward
2131  *  	the request to the wrapped adapter (netmap_bwrap_config).
2132  *
2133  *
2134  * na->nm_krings_create()
2135  * (create and init the krings array)
2136  *
2137  * 	One of the following:
2138  *
2139  *	* netmap_hw_krings_create, 			(hw ports)
2140  *		creates the standard layout for the krings
2141  * 		and adds the mbq (used for the host rings).
2142  *
2143  * 	* netmap_vp_krings_create			(VALE ports)
2144  * 		add leases and scratchpads
2145  *
2146  * 	* netmap_pipe_krings_create			(pipes)
2147  * 		create the krings and rings of both ends and
2148  * 		cross-link them
2149  *
2150  *      * netmap_monitor_krings_create 			(monitors)
2151  *      	avoid allocating the mbq
2152  *
2153  *      * netmap_bwrap_krings_create			(bwraps)
2154  *      	create both the bwrap krings array,
2155  *      	the krings array of the wrapped adapter, and
2156  *      	(if needed) the fake array for the host adapter
2157  *
2158  * na->nm_register(, 1)
2159  * (put the adapter in netmap mode)
2160  *
2161  * 	This may be one of the following:
2162  *
2163  * 	* netmap_hw_reg				        (hw ports)
2164  * 		checks that the ifp is still there, then calls
2165  * 		the hardware specific callback;
2166  *
2167  * 	* netmap_vp_reg					(VALE ports)
2168  *		If the port is connected to a bridge,
2169  *		set the NAF_NETMAP_ON flag under the
2170  *		bridge write lock.
2171  *
2172  *	* netmap_pipe_reg				(pipes)
2173  *		inform the other pipe end that it is no
2174  *		longer responsible for the lifetime of this
2175  *		pipe end
2176  *
2177  *	* netmap_monitor_reg				(monitors)
2178  *		intercept the sync callbacks of the monitored
2179  *		rings
2180  *
2181  *	* netmap_bwrap_reg				(bwraps)
2182  *		cross-link the bwrap and hwna rings,
2183  *		forward the request to the hwna, override
2184  *		the hwna notify callback (to get the frames
2185  *		coming from outside go through the bridge).
2186  *
2187  *
2188  */
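/*
 * Sketch of the successful path through netmap_do_regif() below; steps
 * marked (first user) only run when na->active_fds == 0, error unwinding
 * is omitted:
 *
 *	netmap_mem_finalize()		attach/finalize the memory allocator
 *	netmap_mem_get_lut()		cache the buffer lookup table (first user)
 *	netmap_update_config()		refresh ring counts/sizes (first user)
 *	netmap_set_ringid()		compute the tx/rx ring intervals
 *	na->nm_krings_create()		build the krings (first user)
 *	netmap_krings_get()		grab/flag the requested krings
 *	netmap_mem_rings_create()	create the missing netmap rings
 *	netmap_mem_if_new()		build the netmap_if for this binding
 *	na->nm_register(na, 1)		enter netmap mode (if krings pending)
 *	na->active_fds++		commit; priv->np_nifp is published last
 */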
2189 int
2190 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2191 	uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2192 {
2193 	struct netmap_if *nifp = NULL;
2194 	int error;
2195 
2196 	NMG_LOCK_ASSERT();
2197 	priv->np_na = na;     /* store the reference */
2198 	error = netmap_mem_finalize(na->nm_mem, na);
2199 	if (error)
2200 		goto err;
2201 
2202 	if (na->active_fds == 0) {
2203 
2204 		/* cache the allocator info in the na */
2205 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2206 		if (error)
2207 			goto err_drop_mem;
2208 		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2209 					    na->na_lut.objsize);
2210 
2211 		/* ring configuration may have changed, fetch from the card */
2212 		netmap_update_config(na);
2213 	}
2214 
2215 	/* compute the range of tx and rx rings to monitor */
2216 	error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2217 	if (error)
2218 		goto err_put_lut;
2219 
2220 	if (na->active_fds == 0) {
2221 		/*
2222 		 * If this is the first registration of the adapter,
2223 		 * perform sanity checks and create the in-kernel view
2224 		 * of the netmap rings (the netmap krings).
2225 		 */
2226 		if (na->ifp && nm_priv_rx_enabled(priv)) {
2227 			/* This netmap adapter is attached to an ifnet. */
2228 			unsigned nbs = NETMAP_BUF_SIZE(na);
2229 			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2230 
2231 			ND("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2232 					na->name, mtu, na->rx_buf_maxsize, nbs);
2233 
2234 			if (na->rx_buf_maxsize == 0) {
2235 				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2236 				error = EIO;
2237 				goto err_drop_mem;
2238 			}
2239 
2240 			if (mtu <= na->rx_buf_maxsize) {
2241 				/* The MTU fits a single NIC slot. We only
2242 				 * need to check that netmap buffers are
2243 				 * large enough to hold an MTU. NS_MOREFRAG
2244 				 * cannot be used in this case. */
2245 				if (nbs < mtu) {
2246 					nm_prerr("error: netmap buf size (%u) "
2247 						"< device MTU (%u)", nbs, mtu);
2248 					error = EINVAL;
2249 					goto err_drop_mem;
2250 				}
2251 			} else {
2252 				/* More NIC slots may be needed to receive
2253 				 * or transmit a single packet. Check that
2254 				 * the adapter supports NS_MOREFRAG and that
2255 				 * netmap buffers are large enough to hold
2256 				 * the maximum per-slot size. */
2257 				if (!(na->na_flags & NAF_MOREFRAG)) {
2258 					nm_prerr("error: large MTU (%d) needed "
2259 						"but %s does not support "
2260 						"NS_MOREFRAG", mtu,
2261 						na->ifp->if_xname);
2262 					error = EINVAL;
2263 					goto err_drop_mem;
2264 				} else if (nbs < na->rx_buf_maxsize) {
2265 					nm_prerr("error: using NS_MOREFRAG on "
2266 						"%s requires netmap buf size "
2267 						">= %u", na->ifp->if_xname,
2268 						na->rx_buf_maxsize);
2269 					error = EINVAL;
2270 					goto err_drop_mem;
2271 				} else {
2272 					nm_prinf("info: netmap application on "
2273 						"%s needs to support "
2274 						"NS_MOREFRAG "
2275 						"(MTU=%u,netmap_buf_size=%u)",
2276 						na->ifp->if_xname, mtu, nbs);
2277 				}
2278 			}
2279 		}
2280 
2281 		/*
2282 		 * Depending on the adapter, this may also create
2283 		 * the netmap rings themselves
2284 		 */
2285 		error = na->nm_krings_create(na);
2286 		if (error)
2287 			goto err_put_lut;
2288 
2289 	}
2290 
2291 	/* now the krings must exist and we can check whether some
2292 	 * previous bind has exclusive ownership on them, and set
2293 	 * nr_pending_mode
2294 	 */
2295 	error = netmap_krings_get(priv);
2296 	if (error)
2297 		goto err_del_krings;
2298 
2299 	/* create all needed missing netmap rings */
2300 	error = netmap_mem_rings_create(na);
2301 	if (error)
2302 		goto err_rel_excl;
2303 
2304 	/* in all cases, create a new netmap if */
2305 	nifp = netmap_mem_if_new(na, priv);
2306 	if (nifp == NULL) {
2307 		error = ENOMEM;
2308 		goto err_rel_excl;
2309 	}
2310 
2311 	if (nm_kring_pending(priv)) {
2312 		/* Some kring is switching mode, tell the adapter to
2313 		 * react on this. */
2314 		error = na->nm_register(na, 1);
2315 		if (error)
2316 			goto err_del_if;
2317 	}
2318 
2319 	/* Commit the reference. */
2320 	na->active_fds++;
2321 
2322 	/*
2323 	 * advertise that the interface is ready by setting np_nifp.
2324 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2325 	 * check for priv->np_nifp != NULL without locking
2326 	 */
2327 	mb(); /* make sure previous writes are visible to all CPUs */
2328 	priv->np_nifp = nifp;
2329 
2330 	return 0;
2331 
2332 err_del_if:
2333 	netmap_mem_if_delete(na, nifp);
2334 err_rel_excl:
2335 	netmap_krings_put(priv);
2336 	netmap_mem_rings_delete(na);
2337 err_del_krings:
2338 	if (na->active_fds == 0)
2339 		na->nm_krings_delete(na);
2340 err_put_lut:
2341 	if (na->active_fds == 0)
2342 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2343 err_drop_mem:
2344 	netmap_mem_drop(na);
2345 err:
2346 	priv->np_na = NULL;
2347 	return error;
2348 }
2349 
2350 
2351 /*
2352  * update kring and ring at the end of rxsync/txsync.
2353  */
2354 static inline void
2355 nm_sync_finalize(struct netmap_kring *kring)
2356 {
2357 	/*
2358 	 * Update ring tail to what the kernel knows
2359 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2360 	 * if no carrier.
2361 	 */
2362 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2363 
2364 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2365 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2366 		kring->rhead, kring->rcur, kring->rtail);
2367 }
2368 
2369 /* set ring timestamp */
2370 static inline void
2371 ring_timestamp_set(struct netmap_ring *ring)
2372 {
2373 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2374 		microtime(&ring->ts);
2375 	}
2376 }
2377 
2378 static int nmreq_copyin(struct nmreq_header *, int);
2379 static int nmreq_copyout(struct nmreq_header *, int);
2380 static int nmreq_checkoptions(struct nmreq_header *);
2381 
2382 /*
2383  * ioctl(2) support for the "netmap" device.
2384  *
2385  * The following is a list of accepted commands:
2386  * - NIOCCTRL		device control API
2387  * - NIOCTXSYNC		sync TX rings
2388  * - NIOCRXSYNC		sync RX rings
2389  * - SIOCGIFADDR	just for convenience
2390  * - NIOCGINFO		deprecated (legacy API)
2391  * - NIOCREGIF		deprecated (legacy API)
2392  *
2393  * Return 0 on success, errno otherwise.
2394  */
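/*
 * A minimal userspace sketch of a NIOCCTRL register request (illustration
 * only; "em0" is a placeholder interface name, includes and error handling
 * are omitted):
 *
 *	struct nmreq_register reg;
 *	struct nmreq_header hdr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&reg, 0, sizeof(reg));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_REGISTER;
 *	strlcpy(hdr.nr_name, "em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&reg;
 *	reg.nr_mode = NR_REG_ALL_NIC;
 *	ioctl(fd, NIOCCTRL, &hdr);
 *
 * After a successful registration, NIOCTXSYNC/NIOCRXSYNC on the same fd
 * synchronize the rings selected by the request, e.g.
 *
 *	ioctl(fd, NIOCTXSYNC, NULL);
 */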
2395 int
2396 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2397 		struct thread *td, int nr_body_is_user)
2398 {
2399 	struct mbq q;	/* packets from RX hw queues to host stack */
2400 	struct netmap_adapter *na = NULL;
2401 	struct netmap_mem_d *nmd = NULL;
2402 	struct ifnet *ifp = NULL;
2403 	int error = 0;
2404 	u_int i, qfirst, qlast;
2405 	struct netmap_kring **krings;
2406 	int sync_flags;
2407 	enum txrx t;
2408 
2409 	switch (cmd) {
2410 	case NIOCCTRL: {
2411 		struct nmreq_header *hdr = (struct nmreq_header *)data;
2412 
2413 		if (hdr->nr_version < NETMAP_MIN_API ||
2414 		    hdr->nr_version > NETMAP_MAX_API) {
2415 			nm_prerr("API mismatch: got %d need %d",
2416 				hdr->nr_version, NETMAP_API);
2417 			return EINVAL;
2418 		}
2419 
2420 		/* Make a kernel-space copy of the user-space nr_body.
2421 		 * For convenience, the nr_body pointer and the pointers
2422 		 * in the options list will be replaced with their
2423 		 * kernel-space counterparts. The original pointers are
2424 		 * saved internally and later restored by nmreq_copyout
2425 		 */
2426 		error = nmreq_copyin(hdr, nr_body_is_user);
2427 		if (error) {
2428 			return error;
2429 		}
2430 
2431 		/* Sanitize hdr->nr_name. */
2432 		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2433 
2434 		switch (hdr->nr_reqtype) {
2435 		case NETMAP_REQ_REGISTER: {
2436 			struct nmreq_register *req =
2437 				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2438 			struct netmap_if *nifp;
2439 
2440 			/* Protect access to priv from concurrent requests. */
2441 			NMG_LOCK();
2442 			do {
2443 				struct nmreq_option *opt;
2444 				u_int memflags;
2445 
2446 				if (priv->np_nifp != NULL) {	/* thread already registered */
2447 					error = EBUSY;
2448 					break;
2449 				}
2450 
2451 #ifdef WITH_EXTMEM
2452 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2453 						NETMAP_REQ_OPT_EXTMEM);
2454 				if (opt != NULL) {
2455 					struct nmreq_opt_extmem *e =
2456 						(struct nmreq_opt_extmem *)opt;
2457 
2458 					error = nmreq_checkduplicate(opt);
2459 					if (error) {
2460 						opt->nro_status = error;
2461 						break;
2462 					}
2463 					nmd = netmap_mem_ext_create(e->nro_usrptr,
2464 							&e->nro_info, &error);
2465 					opt->nro_status = error;
2466 					if (nmd == NULL)
2467 						break;
2468 				}
2469 #endif /* WITH_EXTMEM */
2470 
2471 				if (nmd == NULL && req->nr_mem_id) {
2472 					/* find the allocator and get a reference */
2473 					nmd = netmap_mem_find(req->nr_mem_id);
2474 					if (nmd == NULL) {
2475 						if (netmap_verbose) {
2476 							nm_prerr("%s: failed to find mem_id %u",
2477 									hdr->nr_name, req->nr_mem_id);
2478 						}
2479 						error = EINVAL;
2480 						break;
2481 					}
2482 				}
2483 				/* find the interface and a reference */
2484 				error = netmap_get_na(hdr, &na, &ifp, nmd,
2485 						      1 /* create */); /* keep reference */
2486 				if (error)
2487 					break;
2488 				if (NETMAP_OWNED_BY_KERN(na)) {
2489 					error = EBUSY;
2490 					break;
2491 				}
2492 
2493 				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2494 					nm_prerr("virt_hdr_len=%d, but application does "
2495 						"not accept it", na->virt_hdr_len);
2496 					error = EIO;
2497 					break;
2498 				}
2499 
2500 				error = netmap_do_regif(priv, na, req->nr_mode,
2501 							req->nr_ringid, req->nr_flags);
2502 				if (error) {    /* reg. failed, release priv and ref */
2503 					break;
2504 				}
2505 
2506 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2507 							NETMAP_REQ_OPT_CSB);
2508 				if (opt != NULL) {
2509 					struct nmreq_opt_csb *csbo =
2510 						(struct nmreq_opt_csb *)opt;
2511 					error = nmreq_checkduplicate(opt);
2512 					if (!error) {
2513 						error = netmap_csb_validate(priv, csbo);
2514 					}
2515 					opt->nro_status = error;
2516 					if (error) {
2517 						netmap_do_unregif(priv);
2518 						break;
2519 					}
2520 				}
2521 
2522 				nifp = priv->np_nifp;
2523 				priv->np_td = td; /* for debugging purposes */
2524 
2525 				/* return the offset of the netmap_if object */
2526 				req->nr_rx_rings = na->num_rx_rings;
2527 				req->nr_tx_rings = na->num_tx_rings;
2528 				req->nr_rx_slots = na->num_rx_desc;
2529 				req->nr_tx_slots = na->num_tx_desc;
2530 				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2531 					&req->nr_mem_id);
2532 				if (error) {
2533 					netmap_do_unregif(priv);
2534 					break;
2535 				}
2536 				if (memflags & NETMAP_MEM_PRIVATE) {
2537 					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2538 				}
2539 				for_rx_tx(t) {
2540 					priv->np_si[t] = nm_si_user(priv, t) ?
2541 						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2542 				}
2543 
2544 				if (req->nr_extra_bufs) {
2545 					if (netmap_verbose)
2546 						nm_prinf("requested %d extra buffers",
2547 							req->nr_extra_bufs);
2548 					req->nr_extra_bufs = netmap_extra_alloc(na,
2549 						&nifp->ni_bufs_head, req->nr_extra_bufs);
2550 					if (netmap_verbose)
2551 						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2552 				}
2553 				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2554 
2555 				error = nmreq_checkoptions(hdr);
2556 				if (error) {
2557 					netmap_do_unregif(priv);
2558 					break;
2559 				}
2560 
2561 				/* store ifp reference so that priv destructor may release it */
2562 				priv->np_ifp = ifp;
2563 			} while (0);
2564 			if (error) {
2565 				netmap_unget_na(na, ifp);
2566 			}
2567 			/* release the reference from netmap_mem_find() or
2568 			 * netmap_mem_ext_create()
2569 			 */
2570 			if (nmd)
2571 				netmap_mem_put(nmd);
2572 			NMG_UNLOCK();
2573 			break;
2574 		}
2575 
2576 		case NETMAP_REQ_PORT_INFO_GET: {
2577 			struct nmreq_port_info_get *req =
2578 				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2579 
2580 			NMG_LOCK();
2581 			do {
2582 				u_int memflags;
2583 
2584 				if (hdr->nr_name[0] != '\0') {
2585 					/* Build a nmreq_register out of the nmreq_port_info_get,
2586 					 * so that we can call netmap_get_na(). */
2587 					struct nmreq_register regreq;
2588 					bzero(&regreq, sizeof(regreq));
2589 					regreq.nr_mode = NR_REG_ALL_NIC;
2590 					regreq.nr_tx_slots = req->nr_tx_slots;
2591 					regreq.nr_rx_slots = req->nr_rx_slots;
2592 					regreq.nr_tx_rings = req->nr_tx_rings;
2593 					regreq.nr_rx_rings = req->nr_rx_rings;
2594 					regreq.nr_mem_id = req->nr_mem_id;
2595 
2596 					/* get a refcount */
2597 					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2598 					hdr->nr_body = (uintptr_t)&regreq;
2599 					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2600 					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2601 					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2602 					if (error) {
2603 						na = NULL;
2604 						ifp = NULL;
2605 						break;
2606 					}
2607 					nmd = na->nm_mem; /* get memory allocator */
2608 				} else {
2609 					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2610 					if (nmd == NULL) {
2611 						if (netmap_verbose)
2612 							nm_prerr("%s: failed to find mem_id %u",
2613 									hdr->nr_name,
2614 									req->nr_mem_id ? req->nr_mem_id : 1);
2615 						error = EINVAL;
2616 						break;
2617 					}
2618 				}
2619 
2620 				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2621 					&req->nr_mem_id);
2622 				if (error)
2623 					break;
2624 				if (na == NULL) /* only memory info */
2625 					break;
2626 				netmap_update_config(na);
2627 				req->nr_rx_rings = na->num_rx_rings;
2628 				req->nr_tx_rings = na->num_tx_rings;
2629 				req->nr_rx_slots = na->num_rx_desc;
2630 				req->nr_tx_slots = na->num_tx_desc;
2631 			} while (0);
2632 			netmap_unget_na(na, ifp);
2633 			NMG_UNLOCK();
2634 			break;
2635 		}
2636 #ifdef WITH_VALE
2637 		case NETMAP_REQ_VALE_ATTACH: {
2638 			error = netmap_vale_attach(hdr, NULL /* userspace request */);
2639 			break;
2640 		}
2641 
2642 		case NETMAP_REQ_VALE_DETACH: {
2643 			error = netmap_vale_detach(hdr, NULL /* userspace request */);
2644 			break;
2645 		}
2646 
2647 		case NETMAP_REQ_VALE_LIST: {
2648 			error = netmap_vale_list(hdr);
2649 			break;
2650 		}
2651 
2652 		case NETMAP_REQ_PORT_HDR_SET: {
2653 			struct nmreq_port_hdr *req =
2654 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2655 			/* Build a nmreq_register out of the nmreq_port_hdr,
2656 			 * so that we can call netmap_get_bdg_na(). */
2657 			struct nmreq_register regreq;
2658 			bzero(&regreq, sizeof(regreq));
2659 			regreq.nr_mode = NR_REG_ALL_NIC;
2660 
2661 			/* For now we only support virtio-net headers, and only for
2662 			 * VALE ports, but this may change in the future. Valid lengths
2663 			 * for the virtio-net header are 0 (no header), 10 and 12. */
2664 			if (req->nr_hdr_len != 0 &&
2665 				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2666 					req->nr_hdr_len != 12) {
2667 				if (netmap_verbose)
2668 					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2669 				error = EINVAL;
2670 				break;
2671 			}
2672 			NMG_LOCK();
2673 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2674 			hdr->nr_body = (uintptr_t)&regreq;
2675 			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2676 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2677 			hdr->nr_body = (uintptr_t)req;
2678 			if (na && !error) {
2679 				struct netmap_vp_adapter *vpna =
2680 					(struct netmap_vp_adapter *)na;
2681 				na->virt_hdr_len = req->nr_hdr_len;
2682 				if (na->virt_hdr_len) {
2683 					vpna->mfs = NETMAP_BUF_SIZE(na);
2684 				}
2685 				if (netmap_verbose)
2686 					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2687 				netmap_adapter_put(na);
2688 			} else if (!na) {
2689 				error = ENXIO;
2690 			}
2691 			NMG_UNLOCK();
2692 			break;
2693 		}
2694 
2695 		case NETMAP_REQ_PORT_HDR_GET: {
2696 			/* Get vnet-header length for this netmap port */
2697 			struct nmreq_port_hdr *req =
2698 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2699 			/* Build a nmreq_register out of the nmreq_port_hdr,
2700 			 * so that we can call netmap_get_bdg_na(). */
2701 			struct nmreq_register regreq;
2702 			struct ifnet *ifp;
2703 
2704 			bzero(&regreq, sizeof(regreq));
2705 			regreq.nr_mode = NR_REG_ALL_NIC;
2706 			NMG_LOCK();
2707 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2708 			hdr->nr_body = (uintptr_t)&regreq;
2709 			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2710 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2711 			hdr->nr_body = (uintptr_t)req;
2712 			if (na && !error) {
2713 				req->nr_hdr_len = na->virt_hdr_len;
2714 			}
2715 			netmap_unget_na(na, ifp);
2716 			NMG_UNLOCK();
2717 			break;
2718 		}
2719 
2720 		case NETMAP_REQ_VALE_NEWIF: {
2721 			error = nm_vi_create(hdr);
2722 			break;
2723 		}
2724 
2725 		case NETMAP_REQ_VALE_DELIF: {
2726 			error = nm_vi_destroy(hdr->nr_name);
2727 			break;
2728 		}
2729 
2730 		case NETMAP_REQ_VALE_POLLING_ENABLE:
2731 		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2732 			error = nm_bdg_polling(hdr);
2733 			break;
2734 		}
2735 #endif  /* WITH_VALE */
2736 		case NETMAP_REQ_POOLS_INFO_GET: {
2737 			/* Get information from the memory allocator used for
2738 			 * hdr->nr_name. */
2739 			struct nmreq_pools_info *req =
2740 				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2741 			NMG_LOCK();
2742 			do {
2743 				/* Build a nmreq_register out of the nmreq_pools_info,
2744 				 * so that we can call netmap_get_na(). */
2745 				struct nmreq_register regreq;
2746 				bzero(&regreq, sizeof(regreq));
2747 				regreq.nr_mem_id = req->nr_mem_id;
2748 				regreq.nr_mode = NR_REG_ALL_NIC;
2749 
2750 				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2751 				hdr->nr_body = (uintptr_t)&regreq;
2752 				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2753 				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2754 				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2755 				if (error) {
2756 					na = NULL;
2757 					ifp = NULL;
2758 					break;
2759 				}
2760 				nmd = na->nm_mem; /* grab the memory allocator */
2761 				if (nmd == NULL) {
2762 					error = EINVAL;
2763 					break;
2764 				}
2765 
2766 				/* Finalize the memory allocator, get the pools
2767 				 * information and release the allocator. */
2768 				error = netmap_mem_finalize(nmd, na);
2769 				if (error) {
2770 					break;
2771 				}
2772 				error = netmap_mem_pools_info_get(req, nmd);
2773 				netmap_mem_drop(na);
2774 			} while (0);
2775 			netmap_unget_na(na, ifp);
2776 			NMG_UNLOCK();
2777 			break;
2778 		}
2779 
2780 		case NETMAP_REQ_CSB_ENABLE: {
2781 			struct nmreq_option *opt;
2782 
2783 			opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2784 						NETMAP_REQ_OPT_CSB);
2785 			if (opt == NULL) {
2786 				error = EINVAL;
2787 			} else {
2788 				struct nmreq_opt_csb *csbo =
2789 					(struct nmreq_opt_csb *)opt;
2790 				error = nmreq_checkduplicate(opt);
2791 				if (!error) {
2792 					NMG_LOCK();
2793 					error = netmap_csb_validate(priv, csbo);
2794 					NMG_UNLOCK();
2795 				}
2796 				opt->nro_status = error;
2797 			}
2798 			break;
2799 		}
2800 
2801 		case NETMAP_REQ_SYNC_KLOOP_START: {
2802 			error = netmap_sync_kloop(priv, hdr);
2803 			break;
2804 		}
2805 
2806 		case NETMAP_REQ_SYNC_KLOOP_STOP: {
2807 			error = netmap_sync_kloop_stop(priv);
2808 			break;
2809 		}
2810 
2811 		default: {
2812 			error = EINVAL;
2813 			break;
2814 		}
2815 		}
2816 		/* Write back request body to userspace and reset the
2817 		 * user-space pointer. */
2818 		error = nmreq_copyout(hdr, error);
2819 		break;
2820 	}
2821 
2822 	case NIOCTXSYNC:
2823 	case NIOCRXSYNC: {
2824 		if (unlikely(priv->np_nifp == NULL)) {
2825 			error = ENXIO;
2826 			break;
2827 		}
2828 		mb(); /* make sure following reads are not from cache */
2829 
2830 		if (unlikely(priv->np_csb_atok_base)) {
2831 			nm_prerr("Invalid sync in CSB mode");
2832 			error = EBUSY;
2833 			break;
2834 		}
2835 
2836 		na = priv->np_na;      /* we have a reference */
2837 
2838 		mbq_init(&q);
2839 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2840 		krings = NMR(na, t);
2841 		qfirst = priv->np_qfirst[t];
2842 		qlast = priv->np_qlast[t];
2843 		sync_flags = priv->np_sync_flags;
2844 
2845 		for (i = qfirst; i < qlast; i++) {
2846 			struct netmap_kring *kring = krings[i];
2847 			struct netmap_ring *ring = kring->ring;
2848 
2849 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2850 				error = (error ? EIO : 0);
2851 				continue;
2852 			}
2853 
2854 			if (cmd == NIOCTXSYNC) {
2855 				if (netmap_debug & NM_DEBUG_TXSYNC)
2856 					nm_prinf("pre txsync ring %d cur %d hwcur %d",
2857 					    i, ring->cur,
2858 					    kring->nr_hwcur);
2859 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2860 					netmap_ring_reinit(kring);
2861 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2862 					nm_sync_finalize(kring);
2863 				}
2864 				if (netmap_debug & NM_DEBUG_TXSYNC)
2865 					nm_prinf("post txsync ring %d cur %d hwcur %d",
2866 					    i, ring->cur,
2867 					    kring->nr_hwcur);
2868 			} else {
2869 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2870 					netmap_ring_reinit(kring);
2871 				}
2872 				if (nm_may_forward_up(kring)) {
2873 					/* transparent forwarding, see netmap_poll() */
2874 					netmap_grab_packets(kring, &q, netmap_fwd);
2875 				}
2876 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2877 					nm_sync_finalize(kring);
2878 				}
2879 				ring_timestamp_set(ring);
2880 			}
2881 			nm_kr_put(kring);
2882 		}
2883 
2884 		if (mbq_peek(&q)) {
2885 			netmap_send_up(na->ifp, &q);
2886 		}
2887 
2888 		break;
2889 	}
2890 
2891 	default: {
2892 		return netmap_ioctl_legacy(priv, cmd, data, td);
2893 		break;
2894 	}
2895 	}
2896 
2897 	return (error);
2898 }
2899 
2900 size_t
2901 nmreq_size_by_type(uint16_t nr_reqtype)
2902 {
2903 	switch (nr_reqtype) {
2904 	case NETMAP_REQ_REGISTER:
2905 		return sizeof(struct nmreq_register);
2906 	case NETMAP_REQ_PORT_INFO_GET:
2907 		return sizeof(struct nmreq_port_info_get);
2908 	case NETMAP_REQ_VALE_ATTACH:
2909 		return sizeof(struct nmreq_vale_attach);
2910 	case NETMAP_REQ_VALE_DETACH:
2911 		return sizeof(struct nmreq_vale_detach);
2912 	case NETMAP_REQ_VALE_LIST:
2913 		return sizeof(struct nmreq_vale_list);
2914 	case NETMAP_REQ_PORT_HDR_SET:
2915 	case NETMAP_REQ_PORT_HDR_GET:
2916 		return sizeof(struct nmreq_port_hdr);
2917 	case NETMAP_REQ_VALE_NEWIF:
2918 		return sizeof(struct nmreq_vale_newif);
2919 	case NETMAP_REQ_VALE_DELIF:
2920 	case NETMAP_REQ_SYNC_KLOOP_STOP:
2921 	case NETMAP_REQ_CSB_ENABLE:
2922 		return 0;
2923 	case NETMAP_REQ_VALE_POLLING_ENABLE:
2924 	case NETMAP_REQ_VALE_POLLING_DISABLE:
2925 		return sizeof(struct nmreq_vale_polling);
2926 	case NETMAP_REQ_POOLS_INFO_GET:
2927 		return sizeof(struct nmreq_pools_info);
2928 	case NETMAP_REQ_SYNC_KLOOP_START:
2929 		return sizeof(struct nmreq_sync_kloop_start);
2930 	}
2931 	return 0;
2932 }
2933 
2934 static size_t
2935 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
2936 {
2937 	size_t rv = sizeof(struct nmreq_option);
2938 #ifdef NETMAP_REQ_OPT_DEBUG
2939 	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2940 		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2941 #endif /* NETMAP_REQ_OPT_DEBUG */
2942 	switch (nro_reqtype) {
2943 #ifdef WITH_EXTMEM
2944 	case NETMAP_REQ_OPT_EXTMEM:
2945 		rv = sizeof(struct nmreq_opt_extmem);
2946 		break;
2947 #endif /* WITH_EXTMEM */
2948 	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
2949 		if (nro_size >= rv)
2950 			rv = nro_size;
2951 		break;
2952 	case NETMAP_REQ_OPT_CSB:
2953 		rv = sizeof(struct nmreq_opt_csb);
2954 		break;
2955 	}
2956 	/* subtract the common header */
2957 	return rv - sizeof(struct nmreq_option);
2958 }
2959 
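/*
 * nmreq_copyin() (below) gathers the user-space request body and its option
 * list into a single kernel buffer. A sketch of the layout it builds:
 *
 *	[ saved nr_body ptr ][ saved nr_options ptr ][ request body ]
 *	[ saved nro_next ptr ][ option header ][ option body ]   (per option)
 *
 * Each "saved" slot holds the original user-space pointer so that
 * nmreq_copyout() can restore it; hdr->nr_body, hdr->nr_options and the
 * nro_next links are rewritten to point into this buffer in the meantime.
 */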
2960 int
2961 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
2962 {
2963 	size_t rqsz, optsz, bufsz;
2964 	int error;
2965 	char *ker = NULL, *p;
2966 	struct nmreq_option **next, *src;
2967 	struct nmreq_option buf;
2968 	uint64_t *ptrs;
2969 
2970 	if (hdr->nr_reserved) {
2971 		if (netmap_verbose)
2972 			nm_prerr("nr_reserved must be zero");
2973 		return EINVAL;
2974 	}
2975 
2976 	if (!nr_body_is_user)
2977 		return 0;
2978 
2979 	hdr->nr_reserved = nr_body_is_user;
2980 
2981 	/* compute the total size of the buffer */
2982 	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
2983 	if (rqsz > NETMAP_REQ_MAXSIZE) {
2984 		error = EMSGSIZE;
2985 		goto out_err;
2986 	}
2987 	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
2988 		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
2989 		/* Request body expected, but not found; or
2990 		 * request body found but unexpected. */
2991 		if (netmap_verbose)
2992 			nm_prerr("nr_body expected but not found, or vice versa");
2993 		error = EINVAL;
2994 		goto out_err;
2995 	}
2996 
2997 	bufsz = 2 * sizeof(void *) + rqsz;
2998 	optsz = 0;
2999 	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3000 	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3001 	{
3002 		error = copyin(src, &buf, sizeof(*src));
3003 		if (error)
3004 			goto out_err;
3005 		optsz += sizeof(*src);
3006 		optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3007 		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3008 			error = EMSGSIZE;
3009 			goto out_err;
3010 		}
3011 		bufsz += optsz + sizeof(void *);
3012 	}
3013 
3014 	ker = nm_os_malloc(bufsz);
3015 	if (ker == NULL) {
3016 		error = ENOMEM;
3017 		goto out_err;
3018 	}
3019 	p = ker;
3020 
3021 	/* make a copy of the user pointers */
3022 	ptrs = (uint64_t*)p;
3023 	*ptrs++ = hdr->nr_body;
3024 	*ptrs++ = hdr->nr_options;
3025 	p = (char *)ptrs;
3026 
3027 	/* copy the body */
3028 	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3029 	if (error)
3030 		goto out_restore;
3031 	/* overwrite the user pointer with the in-kernel one */
3032 	hdr->nr_body = (uintptr_t)p;
3033 	p += rqsz;
3034 
3035 	/* copy the options */
3036 	next = (struct nmreq_option **)&hdr->nr_options;
3037 	src = *next;
3038 	while (src) {
3039 		struct nmreq_option *opt;
3040 
3041 		/* copy the option header */
3042 		ptrs = (uint64_t *)p;
3043 		opt = (struct nmreq_option *)(ptrs + 1);
3044 		error = copyin(src, opt, sizeof(*src));
3045 		if (error)
3046 			goto out_restore;
3047 		/* make a copy of the user next pointer */
3048 		*ptrs = opt->nro_next;
3049 		/* overwrite the user pointer with the in-kernel one */
3050 		*next = opt;
3051 
3052 		/* initialize the option as not supported.
3053 		 * Recognized options will update this field.
3054 		 */
3055 		opt->nro_status = EOPNOTSUPP;
3056 
3057 		p = (char *)(opt + 1);
3058 
3059 		/* copy the option body */
3060 		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3061 						opt->nro_size);
3062 		if (optsz) {
3063 			/* the option body follows the option header */
3064 			error = copyin(src + 1, p, optsz);
3065 			if (error)
3066 				goto out_restore;
3067 			p += optsz;
3068 		}
3069 
3070 		/* move to next option */
3071 		next = (struct nmreq_option **)&opt->nro_next;
3072 		src = *next;
3073 	}
3074 	return 0;
3075 
3076 out_restore:
3077 	ptrs = (uint64_t *)ker;
3078 	hdr->nr_body = *ptrs++;
3079 	hdr->nr_options = *ptrs++;
3080 	hdr->nr_reserved = 0;
3081 	nm_os_free(ker);
3082 out_err:
3083 	return error;
3084 }
3085 
3086 static int
3087 nmreq_copyout(struct nmreq_header *hdr, int rerror)
3088 {
3089 	struct nmreq_option *src, *dst;
3090 	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3091 	uint64_t *ptrs;
3092 	size_t bodysz;
3093 	int error;
3094 
3095 	if (!hdr->nr_reserved)
3096 		return rerror;
3097 
3098 	/* restore the user pointers in the header */
3099 	ptrs = (uint64_t *)ker - 2;
3100 	bufstart = ptrs;
3101 	hdr->nr_body = *ptrs++;
3102 	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3103 	hdr->nr_options = *ptrs;
3104 
3105 	if (!rerror) {
3106 		/* copy the body */
3107 		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3108 		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3109 		if (error) {
3110 			rerror = error;
3111 			goto out;
3112 		}
3113 	}
3114 
3115 	/* copy the options */
3116 	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3117 	while (src) {
3118 		size_t optsz;
3119 		uint64_t next;
3120 
3121 		/* restore the user pointer */
3122 		next = src->nro_next;
3123 		ptrs = (uint64_t *)src - 1;
3124 		src->nro_next = *ptrs;
3125 
3126 		/* always copy the option header */
3127 		error = copyout(src, dst, sizeof(*src));
3128 		if (error) {
3129 			rerror = error;
3130 			goto out;
3131 		}
3132 
3133 		/* copy the option body only if there was no error */
3134 		if (!rerror && !src->nro_status) {
3135 			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3136 							src->nro_size);
3137 			if (optsz) {
3138 				error = copyout(src + 1, dst + 1, optsz);
3139 				if (error) {
3140 					rerror = error;
3141 					goto out;
3142 				}
3143 			}
3144 		}
3145 		src = (struct nmreq_option *)(uintptr_t)next;
3146 		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3147 	}
3148 
3149 
3150 out:
3151 	hdr->nr_reserved = 0;
3152 	nm_os_free(bufstart);
3153 	return rerror;
3154 }
3155 
3156 struct nmreq_option *
3157 nmreq_findoption(struct nmreq_option *opt, uint16_t reqtype)
3158 {
3159 	for ( ; opt; opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3160 		if (opt->nro_reqtype == reqtype)
3161 			return opt;
3162 	return NULL;
3163 }
3164 
3165 int
3166 nmreq_checkduplicate(struct nmreq_option *opt) {
3167 	uint16_t type = opt->nro_reqtype;
3168 	int dup = 0;
3169 
3170 	while ((opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)opt->nro_next,
3171 			type))) {
3172 		dup++;
3173 		opt->nro_status = EINVAL;
3174 	}
3175 	return (dup ? EINVAL : 0);
3176 }
3177 
3178 static int
3179 nmreq_checkoptions(struct nmreq_header *hdr)
3180 {
3181 	struct nmreq_option *opt;
3182 	/* return error if there is still any option
3183 	 * marked as not supported
3184 	 */
3185 
3186 	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3187 	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3188 		if (opt->nro_status == EOPNOTSUPP)
3189 			return EOPNOTSUPP;
3190 
3191 	return 0;
3192 }
3193 
3194 /*
3195  * select(2) and poll(2) handlers for the "netmap" device.
3196  *
3197  * Can be called for one or more queues.
3198  * Return the event mask corresponding to ready events.
3199  * If there are no ready events, do a selrecord on either individual
3200  * selinfo or on the global one.
3201  * Device-dependent parts (locking and sync of tx/rx rings)
3202  * are done through callbacks.
3203  *
3204  * On linux, the arguments are really pwait, the poll table, and 'td' is
3205  * a struct file *. The first one is remapped to pwait as selrecord()
3206  * uses the name as a hidden argument.
3207  */
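/*
 * Typical userspace usage (a sketch, illustration only; 'fd' is a netmap
 * file descriptor already registered via NIOCCTRL):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	poll(&pfd, 1, 1000);
 *
 * POLLIN is then reported when the bound RX rings have new packets,
 * POLLOUT when the bound TX rings have free slots.
 */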
3208 int
3209 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3210 {
3211 	struct netmap_adapter *na;
3212 	struct netmap_kring *kring;
3213 	struct netmap_ring *ring;
3214 	u_int i, want[NR_TXRX], revents = 0;
3215 	NM_SELINFO_T *si[NR_TXRX];
3216 #define want_tx want[NR_TX]
3217 #define want_rx want[NR_RX]
3218 	struct mbq q;	/* packets from RX hw queues to host stack */
3219 
3220 	/*
3221 	 * In order to avoid nested locks, we need to "double check"
3222 	 * txsync and rxsync if we decide to do a selrecord().
3223 	 * retry_tx (and retry_rx, later) prevent looping forever.
3224 	 */
3225 	int retry_tx = 1, retry_rx = 1;
3226 
3227 	/* Transparent mode: send_down is 1 if we have found some
3228 	 * packets to forward (host RX ring --> NIC) during the rx
3229 	 * scan and we have not sent them down to the NIC yet.
3230 	 * Transparent mode requires binding all rings to a single
3231 	 * file descriptor.
3232 	 */
3233 	int send_down = 0;
3234 	int sync_flags = priv->np_sync_flags;
3235 
3236 	mbq_init(&q);
3237 
3238 	if (unlikely(priv->np_nifp == NULL)) {
3239 		return POLLERR;
3240 	}
3241 	mb(); /* make sure following reads are not from cache */
3242 
3243 	na = priv->np_na;
3244 
3245 	if (unlikely(!nm_netmap_on(na)))
3246 		return POLLERR;
3247 
3248 	if (unlikely(priv->np_csb_atok_base)) {
3249 		nm_prerr("Invalid poll in CSB mode");
3250 		return POLLERR;
3251 	}
3252 
3253 	if (netmap_debug & NM_DEBUG_ON)
3254 		nm_prinf("device %s events 0x%x", na->name, events);
3255 	want_tx = events & (POLLOUT | POLLWRNORM);
3256 	want_rx = events & (POLLIN | POLLRDNORM);
3257 
3258 	/*
3259 	 * If the card has more than one queue AND the file descriptor is
3260 	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3261 	 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3262 	 * per file descriptor).
3263  * The interrupt routine in the driver wakes one or the other
3264 	 * (or both) depending on which clients are active.
3265 	 *
3266 	 * rxsync() is only called if we run out of buffers on a POLLIN.
3267 	 * txsync() is called if we run out of buffers on POLLOUT, or
3268 	 * there are pending packets to send. The latter can be disabled
3269 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3270 	 */
3271 	si[NR_RX] = nm_si_user(priv, NR_RX) ? &na->si[NR_RX] :
3272 				&na->rx_rings[priv->np_qfirst[NR_RX]]->si;
3273 	si[NR_TX] = nm_si_user(priv, NR_TX) ? &na->si[NR_TX] :
3274 				&na->tx_rings[priv->np_qfirst[NR_TX]]->si;
3275 
3276 #ifdef __FreeBSD__
3277 	/*
3278 	 * We start with a lock free round which is cheap if we have
3279 	 * slots available. If this fails, then lock and call the sync
3280 	 * routines. We can't do this on Linux, as the contract says
3281 	 * that we must call nm_os_selrecord() unconditionally.
3282 	 */
3283 	if (want_tx) {
3284 		enum txrx t = NR_TX;
3285 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
3286 			kring = NMR(na, t)[i];
3287 			/* XXX compare ring->cur and kring->tail */
3288 			if (!nm_ring_empty(kring->ring)) {
3289 				revents |= want[t];
3290 				want[t] = 0;	/* also breaks the loop */
3291 			}
3292 		}
3293 	}
3294 	if (want_rx) {
3295 		enum txrx t = NR_RX;
3296 		want_rx = 0; /* look for a reason to run the handlers */
3297 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3298 			kring = NMR(na, t)[i];
3299 			if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
3300 			    || kring->rhead != kring->ring->head /* release buffers */) {
3301 				want_rx = 1;
3302 			}
3303 		}
3304 		if (!want_rx)
3305 			revents |= events & (POLLIN | POLLRDNORM); /* we have data */
3306 	}
3307 #endif
3308 
3309 #ifdef linux
3310 	/* The selrecord must be unconditional on linux. */
3311 	nm_os_selrecord(sr, si[NR_RX]);
3312 	nm_os_selrecord(sr, si[NR_TX]);
3313 #endif /* linux */
3314 
3315 	/*
3316 	 * If we want to push packets out (priv->np_txpoll) or
3317 	 * want_tx is still set, we must issue txsync calls
3318 	 * (on all rings, to avoid that the tx rings stall).
3319 	 * Fortunately, normal tx mode has np_txpoll set.
3320 	 */
3321 	if (priv->np_txpoll || want_tx) {
3322 		/*
3323 		 * The first round checks if anyone is ready, if not
3324 		 * do a selrecord and another round to handle races.
3325 		 * want_tx goes to 0 if any space is found, and is
3326 		 * used to skip rings with no pending transmissions.
3327 		 */
3328 flush_tx:
3329 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3330 			int found = 0;
3331 
3332 			kring = na->tx_rings[i];
3333 			ring = kring->ring;
3334 
3335 			/*
3336 			 * Don't try to txsync this TX ring if we already found some
3337 			 * space in some of the TX rings (want_tx == 0) and there are no
3338 			 * TX slots in this ring that need to be flushed to the NIC
3339 			 * (head == hwcur).
3340 			 */
3341 			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3342 				continue;
3343 
3344 			if (nm_kr_tryget(kring, 1, &revents))
3345 				continue;
3346 
3347 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3348 				netmap_ring_reinit(kring);
3349 				revents |= POLLERR;
3350 			} else {
3351 				if (kring->nm_sync(kring, sync_flags))
3352 					revents |= POLLERR;
3353 				else
3354 					nm_sync_finalize(kring);
3355 			}
3356 
3357 			/*
3358 			 * If we found new slots, notify potential
3359 			 * listeners on the same ring.
3360 			 * Since we just did a txsync, look at the copies
3361 			 * of cur,tail in the kring.
3362 			 */
3363 			found = kring->rcur != kring->rtail;
3364 			nm_kr_put(kring);
3365 			if (found) { /* notify other listeners */
3366 				revents |= want_tx;
3367 				want_tx = 0;
3368 #ifndef linux
3369 				kring->nm_notify(kring, 0);
3370 #endif /* !linux */
3371 			}
3372 		}
3373 		/* if there were any packets to forward, we must have handled them by now */
3374 		send_down = 0;
3375 		if (want_tx && retry_tx && sr) {
3376 #ifndef linux
3377 			nm_os_selrecord(sr, si[NR_TX]);
3378 #endif /* !linux */
3379 			retry_tx = 0;
3380 			goto flush_tx;
3381 		}
3382 	}
3383 
3384 	/*
3385 	 * If want_rx is still set, scan the receive rings.
3386 	 * Do it on all rings because otherwise we starve.
3387 	 */
3388 	if (want_rx) {
3389 		/* two rounds here for race avoidance */
3390 do_retry_rx:
3391 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3392 			int found = 0;
3393 
3394 			kring = na->rx_rings[i];
3395 			ring = kring->ring;
3396 
3397 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3398 				continue;
3399 
3400 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3401 				netmap_ring_reinit(kring);
3402 				revents |= POLLERR;
3403 			}
3404 			/* now we can use kring->rcur, rtail */
3405 
3406 			/*
3407 			 * transparent mode support: collect packets from
3408 			 * hw rxring(s) that have been released by the user
3409 			 */
3410 			if (nm_may_forward_up(kring)) {
3411 				netmap_grab_packets(kring, &q, netmap_fwd);
3412 			}
3413 
3414 			/* Clear the NR_FORWARD flag anyway; it may be set by
3415 			 * the nm_sync() below, but only for the host RX ring (see
3416 			 * netmap_rxsync_from_host()). */
3417 			kring->nr_kflags &= ~NR_FORWARD;
3418 			if (kring->nm_sync(kring, sync_flags))
3419 				revents |= POLLERR;
3420 			else
3421 				nm_sync_finalize(kring);
3422 			send_down |= (kring->nr_kflags & NR_FORWARD);
3423 			ring_timestamp_set(ring);
3424 			found = kring->rcur != kring->rtail;
3425 			nm_kr_put(kring);
3426 			if (found) {
3427 				revents |= want_rx;
3428 				retry_rx = 0;
3429 #ifndef linux
3430 				kring->nm_notify(kring, 0);
3431 #endif /* !linux */
3432 			}
3433 		}
3434 
3435 #ifndef linux
3436 		if (retry_rx && sr) {
3437 			nm_os_selrecord(sr, si[NR_RX]);
3438 		}
3439 #endif /* !linux */
3440 		if (send_down || retry_rx) {
3441 			retry_rx = 0;
3442 			if (send_down)
3443 				goto flush_tx; /* and retry_rx */
3444 			else
3445 				goto do_retry_rx;
3446 		}
3447 	}
3448 
3449 	/*
3450 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3451 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3452 	 * to the host stack.
3453 	 */
3454 
3455 	if (mbq_peek(&q)) {
3456 		netmap_send_up(na->ifp, &q);
3457 	}
3458 
3459 	return (revents);
3460 #undef want_tx
3461 #undef want_rx
3462 }
3463 
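/*
 * Illustrative userspace sketch (not compiled here, not part of the
 * original file): how a client of the poll handler above typically
 * waits for RX traffic. The interface name "em0" and the omitted error
 * checks are placeholders; see netmap(4) and <net/netmap_user.h> for
 * the actual userspace API.
 *
 *	struct nmreq req;
 *	struct pollfd pfd;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&req, 0, sizeof(req));
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	req.nr_ringid = NETMAP_NO_TX_POLL;	// POLLOUT will not force txsync
 *	ioctl(fd, NIOCREGIF, &req);		// bind the fd to all rings of em0
 *
 *	pfd.fd = fd;
 *	pfd.events = POLLIN;			// the want_rx path above
 *	poll(&pfd, 1, 1000);			// rxsync runs only when needed
 */
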
3464 int
3465 nma_intr_enable(struct netmap_adapter *na, int onoff)
3466 {
3467 	bool changed = false;
3468 	enum txrx t;
3469 	int i;
3470 
3471 	for_rx_tx(t) {
3472 		for (i = 0; i < nma_get_nrings(na, t); i++) {
3473 			struct netmap_kring *kring = NMR(na, t)[i];
3474 			int on = !(kring->nr_kflags & NKR_NOINTR);
3475 
3476 			if (!!onoff != !!on) {
3477 				changed = true;
3478 			}
3479 			if (onoff) {
3480 				kring->nr_kflags &= ~NKR_NOINTR;
3481 			} else {
3482 				kring->nr_kflags |= NKR_NOINTR;
3483 			}
3484 		}
3485 	}
3486 
3487 	if (!changed) {
3488 		return 0; /* nothing to do */
3489 	}
3490 
3491 	if (!na->nm_intr) {
3492 		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3493 		  na->name);
3494 		return -1;
3495 	}
3496 
3497 	na->nm_intr(na, onoff);
3498 
3499 	return 0;
3500 }
3501 
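/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the nm_intr callback invoked above is expected to enable or disable
 * the device's rx/tx interrupts. The "foo_" names are placeholders.
 *
 *	static void
 *	foo_netmap_intr(struct netmap_adapter *na, int onoff)
 *	{
 *		struct foo_softc *sc = if_getsoftc(na->ifp);
 *
 *		if (onoff)
 *			foo_enable_intr(sc);
 *		else
 *			foo_disable_intr(sc);
 *	}
 *
 * A driver would install it at attach time with na->nm_intr = foo_netmap_intr.
 */
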
3502 
3503 /*-------------------- driver support routines -------------------*/
3504 
3505 /* default notify callback */
3506 static int
3507 netmap_notify(struct netmap_kring *kring, int flags)
3508 {
3509 	struct netmap_adapter *na = kring->notify_na;
3510 	enum txrx t = kring->tx;
3511 
3512 	nm_os_selwakeup(&kring->si);
3513 	/* optimization: avoid a wake up on the global
3514 	 * queue if nobody has registered for more
3515 	 * than one ring
3516 	 */
3517 	if (na->si_users[t] > 0)
3518 		nm_os_selwakeup(&na->si[t]);
3519 
3520 	return NM_IRQ_COMPLETED;
3521 }
3522 
3523 /* called by all routines that create netmap_adapters.
3524  * provide some defaults and get a reference to the
3525  * memory allocator
3526  */
3527 int
3528 netmap_attach_common(struct netmap_adapter *na)
3529 {
3530 	if (!na->rx_buf_maxsize) {
3531 		/* Set a conservative default (larger is safer). */
3532 		na->rx_buf_maxsize = PAGE_SIZE;
3533 	}
3534 
3535 #ifdef __FreeBSD__
3536 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3537 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3538 	}
3539 	na->pdev = na; /* make sure netmap_mem_map() is called */
3540 #endif /* __FreeBSD__ */
3541 	if (na->na_flags & NAF_HOST_RINGS) {
3542 		if (na->num_host_rx_rings == 0)
3543 			na->num_host_rx_rings = 1;
3544 		if (na->num_host_tx_rings == 0)
3545 			na->num_host_tx_rings = 1;
3546 	}
3547 	if (na->nm_krings_create == NULL) {
3548 		/* we assume that we have been called by a driver,
3549 		 * since other port types all provide their own
3550 		 * nm_krings_create
3551 		 */
3552 		na->nm_krings_create = netmap_hw_krings_create;
3553 		na->nm_krings_delete = netmap_hw_krings_delete;
3554 	}
3555 	if (na->nm_notify == NULL)
3556 		na->nm_notify = netmap_notify;
3557 	na->active_fds = 0;
3558 
3559 	if (na->nm_mem == NULL) {
3560 		/* use the global allocator */
3561 		na->nm_mem = netmap_mem_get(&nm_mem);
3562 	}
3563 #ifdef WITH_VALE
3564 	if (na->nm_bdg_attach == NULL)
3565 		/* no special nm_bdg_attach callback. On VALE
3566 		 * attach, we need to interpose a bwrap
3567 		 */
3568 		na->nm_bdg_attach = netmap_default_bdg_attach;
3569 #endif
3570 
3571 	return 0;
3572 }
3573 
3574 /* Wrapper for the register callback provided by netmap-enabled
3575  * hardware drivers.
3576  * nm_iszombie(na) means that the driver module has been
3577  * unloaded, so we cannot call into it.
3578  * nm_os_ifnet_lock() must guarantee mutual exclusion with
3579  * module unloading.
3580  */
3581 static int
3582 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3583 {
3584 	struct netmap_hw_adapter *hwna =
3585 		(struct netmap_hw_adapter*)na;
3586 	int error = 0;
3587 
3588 	nm_os_ifnet_lock();
3589 
3590 	if (nm_iszombie(na)) {
3591 		if (onoff) {
3592 			error = ENXIO;
3593 		} else if (na != NULL) {
3594 			na->na_flags &= ~NAF_NETMAP_ON;
3595 		}
3596 		goto out;
3597 	}
3598 
3599 	error = hwna->nm_hw_register(na, onoff);
3600 
3601 out:
3602 	nm_os_ifnet_unlock();
3603 
3604 	return error;
3605 }
3606 
3607 static void
3608 netmap_hw_dtor(struct netmap_adapter *na)
3609 {
3610 	if (na->ifp == NULL)
3611 		return;
3612 
3613 	NM_DETACH_NA(na->ifp);
3614 }
3615 
3616 
3617 /*
3618  * Allocate a netmap_adapter object, and initialize it from the
3619  * 'arg' passed by the driver on attach.
3620  * We allocate a block of memory of 'size' bytes, which has room
3621  * for struct netmap_adapter plus additional room private to
3622  * the caller.
3623  * Return 0 on success, ENOMEM otherwise.
3624  */
3625 int
3626 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3627 {
3628 	struct netmap_hw_adapter *hwna = NULL;
3629 	struct ifnet *ifp = NULL;
3630 
3631 	if (size < sizeof(struct netmap_hw_adapter)) {
3632 		if (netmap_debug & NM_DEBUG_ON)
3633 			nm_prerr("Invalid netmap adapter size %d", (int)size);
3634 		return EINVAL;
3635 	}
3636 
3637 	if (arg == NULL || arg->ifp == NULL) {
3638 		if (netmap_debug & NM_DEBUG_ON)
3639 			nm_prerr("either arg or arg->ifp is NULL");
3640 		return EINVAL;
3641 	}
3642 
3643 	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3644 		if (netmap_debug & NM_DEBUG_ON)
3645 			nm_prerr("%s: invalid rings tx %d rx %d",
3646 				arg->name, arg->num_tx_rings, arg->num_rx_rings);
3647 		return EINVAL;
3648 	}
3649 
3650 	ifp = arg->ifp;
3651 	if (NM_NA_CLASH(ifp)) {
3652 		/* If NA(ifp) is not null but there is no valid netmap
3653 		 * adapter, it means that someone else is using the same
3654 		 * pointer (e.g. ax25_ptr on linux). This happens for
3655 		 * instance when also PF_RING is in use. */
3656 		nm_prerr("Error: netmap adapter hook is busy");
3657 		return EBUSY;
3658 	}
3659 
3660 	hwna = nm_os_malloc(size);
3661 	if (hwna == NULL)
3662 		goto fail;
3663 	hwna->up = *arg;
3664 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3665 	strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3666 	if (override_reg) {
3667 		hwna->nm_hw_register = hwna->up.nm_register;
3668 		hwna->up.nm_register = netmap_hw_reg;
3669 	}
3670 	if (netmap_attach_common(&hwna->up)) {
3671 		nm_os_free(hwna);
3672 		goto fail;
3673 	}
3674 	netmap_adapter_get(&hwna->up);
3675 
3676 	NM_ATTACH_NA(ifp, &hwna->up);
3677 
3678 	nm_os_onattach(ifp);
3679 
3680 	if (arg->nm_dtor == NULL) {
3681 		hwna->up.nm_dtor = netmap_hw_dtor;
3682 	}
3683 
3684 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3685 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3686 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3687 	return 0;
3688 
3689 fail:
3690 	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3691 	return (hwna ? EINVAL : ENOMEM);
3692 }
3693 
3694 
3695 int
3696 netmap_attach(struct netmap_adapter *arg)
3697 {
3698 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3699 			1 /* override nm_reg */);
3700 }
3701 
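/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a native driver typically fills in a struct netmap_adapter in its
 * attach routine and hands it to netmap_attach(). The "foo_" callbacks
 * and the softc fields are placeholders.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_register = foo_netmap_reg;	// enter/leave netmap mode
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na);			// the struct is copied into the hwna
 */
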
3702 
3703 void
3704 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3705 {
3706 	if (!na) {
3707 		return;
3708 	}
3709 
3710 	refcount_acquire(&na->na_refcount);
3711 }
3712 
3713 
3714 /* returns 1 iff the netmap_adapter is destroyed */
3715 int
3716 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3717 {
3718 	if (!na)
3719 		return 1;
3720 
3721 	if (!refcount_release(&na->na_refcount))
3722 		return 0;
3723 
3724 	if (na->nm_dtor)
3725 		na->nm_dtor(na);
3726 
3727 	if (na->tx_rings) { /* XXX should not happen */
3728 		if (netmap_debug & NM_DEBUG_ON)
3729 			nm_prerr("freeing leftover tx_rings");
3730 		na->nm_krings_delete(na);
3731 	}
3732 	netmap_pipe_dealloc(na);
3733 	if (na->nm_mem)
3734 		netmap_mem_put(na->nm_mem);
3735 	bzero(na, sizeof(*na));
3736 	nm_os_free(na);
3737 
3738 	return 1;
3739 }
3740 
3741 /* nm_krings_create callback for all hardware native adapters */
3742 int
3743 netmap_hw_krings_create(struct netmap_adapter *na)
3744 {
3745 	int ret = netmap_krings_create(na, 0);
3746 	if (ret == 0) {
3747 		/* initialize the mbq for the sw rx ring */
3748 		/* initialize the mbqs for the host (sw) rx rings */
3749 		for (i = na->num_rx_rings; i < lim; i++) {
3750 			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3751 		}
3752 		ND("initialized sw rx queue %d", na->num_rx_rings);
3753 	}
3754 	return ret;
3755 }
3756 
3757 
3758 
3759 /*
3760  * Called on module unload by the netmap-enabled drivers
3761  */
3762 void
3763 netmap_detach(struct ifnet *ifp)
3764 {
3765 	struct netmap_adapter *na = NA(ifp);
3766 
3767 	if (!na)
3768 		return;
3769 
3770 	NMG_LOCK();
3771 	netmap_set_all_rings(na, NM_KR_LOCKED);
3772 	/*
3773 	 * if the netmap adapter is not native, somebody
3774 	 * changed it, so we cannot release it here.
3775 	 * The NAF_ZOMBIE flag will notify the new owner that
3776 	 * the driver is gone.
3777 	 */
3778 	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3779 		na->na_flags |= NAF_ZOMBIE;
3780 	}
3781 	/* give active users a chance to notice that NAF_ZOMBIE has been
3782 	 * turned on, so that they can stop and return an error to userspace.
3783 	 * Note that this becomes a NOP if there are no active users and,
3784 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3785 	 * NULL.
3786 	 */
3787 	netmap_enable_all_rings(ifp);
3788 	NMG_UNLOCK();
3789 }
3790 
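/*
 * Illustrative sketch (hypothetical driver code): the matching call on
 * the driver's detach path, issued before the ifnet is destroyed:
 *
 *	if (sc->ifp != NULL)
 *		netmap_detach(sc->ifp);
 */
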
3791 
3792 /*
3793  * Intercept packets from the network stack and pass them
3794  * to netmap as incoming packets on the 'software' ring.
3795  *
3796  * We only store packets in a bounded mbq and then copy them
3797  * in the relevant rxsync routine.
3798  *
3799  * We rely on the OS to make sure that the ifp and na do not go
3800  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3801  * In nm_register() or whenever there is a reinitialization,
3802  * we make sure to make the mode change visible here.
3803  */
3804 int
3805 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3806 {
3807 	struct netmap_adapter *na = NA(ifp);
3808 	struct netmap_kring *kring, *tx_kring;
3809 	u_int len = MBUF_LEN(m);
3810 	u_int error = ENOBUFS;
3811 	unsigned int txr;
3812 	struct mbq *q;
3813 	int busy;
3814 	u_int i;
3815 
3816 	i = MBUF_TXQ(m);
3817 	if (i >= na->num_host_rx_rings) {
3818 		i = i % na->num_host_rx_rings;
3819 	}
3820 	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3821 
3822 	// XXX [Linux] we do not need this lock
3823 	// if we follow the down/configure/up protocol -gl
3824 	// mtx_lock(&na->core_lock);
3825 
3826 	if (!nm_netmap_on(na)) {
3827 		nm_prerr("%s not in netmap mode anymore", na->name);
3828 		error = ENXIO;
3829 		goto done;
3830 	}
3831 
3832 	txr = MBUF_TXQ(m);
3833 	if (txr >= na->num_tx_rings) {
3834 		txr %= na->num_tx_rings;
3835 	}
3836 	tx_kring = NMR(na, NR_TX)[txr];
3837 
3838 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3839 		return MBUF_TRANSMIT(na, ifp, m);
3840 	}
3841 
3842 	q = &kring->rx_queue;
3843 
3844 	// XXX reconsider long packets if we handle fragments
3845 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3846 		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
3847 			len, NETMAP_BUF_SIZE(na));
3848 		goto done;
3849 	}
3850 
3851 	if (!netmap_generic_hwcsum) {
3852 		if (nm_os_mbuf_has_csum_offld(m)) {
3853 			RD(1, "%s drop mbuf that needs checksum offload", na->name);
3854 			goto done;
3855 		}
3856 	}
3857 
3858 	if (nm_os_mbuf_has_seg_offld(m)) {
3859 		RD(1, "%s drop mbuf that needs generic segmentation offload", na->name);
3860 		goto done;
3861 	}
3862 
3863 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3864 	 * and maybe other instances of netmap_transmit (the latter
3865 	 * not possible on Linux).
3866 	 * We enqueue the mbuf only if we are sure there is going to be
3867 	 * enough room in the host RX ring, otherwise we drop it.
3868 	 */
3869 	mbq_lock(q);
3870 
3871 	busy = kring->nr_hwtail - kring->nr_hwcur;
3872 	if (busy < 0)
3873 		busy += kring->nkr_num_slots;
3874 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3875 		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3876 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3877 	} else {
3878 		mbq_enqueue(q, m);
3879 		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3880 		/* notify outside the lock */
3881 		m = NULL;
3882 		error = 0;
3883 	}
3884 	mbq_unlock(q);
3885 
3886 done:
3887 	if (m)
3888 		m_freem(m);
3889 	/* unconditionally wake up listeners */
3890 	kring->nm_notify(kring, 0);
3891 	/* this is normally netmap_notify(), but for NICs
3892 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3893 	 * which possibly forwards the frames through the switch
3894 	 */
3895 
3896 	return (error);
3897 }
3898 
3899 
3900 /*
3901  * netmap_reset() is called by the driver routines when reinitializing
3902  * a ring. The driver is in charge of locking to protect the kring.
3903  * If native netmap mode is not set, just return NULL.
3904  * If native netmap mode is set, we also have to set nr_mode to
3905  * NKR_NETMAP_ON.
3906  */
3907 struct netmap_slot *
3908 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3909 	u_int new_cur)
3910 {
3911 	struct netmap_kring *kring;
3912 	int new_hwofs, lim;
3913 
3914 	if (!nm_native_on(na)) {
3915 		ND("interface not in native netmap mode");
3916 		return NULL;	/* nothing to reinitialize */
3917 	}
3918 
3919 	/* XXX note- in the new scheme, we are not guaranteed to be
3920 	 * under lock (e.g. when called on a device reset).
3921 	 * In this case, we should set a flag and not trust the
3922 	 * values too much. In practice: TODO
3923 	 * - set a RESET flag somewhere in the kring
3924 	 * - do the processing in a conservative way
3925 	 * - let the *sync() fixup at the end.
3926 	 */
3927 	if (tx == NR_TX) {
3928 		if (n >= na->num_tx_rings)
3929 			return NULL;
3930 
3931 		kring = na->tx_rings[n];
3932 
3933 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3934 			kring->nr_mode = NKR_NETMAP_OFF;
3935 			return NULL;
3936 		}
3937 
3938 		// XXX check whether we should use hwcur or rcur
3939 		new_hwofs = kring->nr_hwcur - new_cur;
3940 	} else {
3941 		if (n >= na->num_rx_rings)
3942 			return NULL;
3943 		kring = na->rx_rings[n];
3944 
3945 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3946 			kring->nr_mode = NKR_NETMAP_OFF;
3947 			return NULL;
3948 		}
3949 
3950 		new_hwofs = kring->nr_hwtail - new_cur;
3951 	}
3952 	lim = kring->nkr_num_slots - 1;
3953 	if (new_hwofs > lim)
3954 		new_hwofs -= lim + 1;
3955 
3956 	/* Always set the new offset value and realign the ring. */
3957 	if (netmap_debug & NM_DEBUG_ON)
3958 	    nm_prinf("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3959 		na->name,
3960 		tx == NR_TX ? "TX" : "RX", n,
3961 		kring->nkr_hwofs, new_hwofs,
3962 		kring->nr_hwtail,
3963 		tx == NR_TX ? lim : kring->nr_hwtail);
3964 	kring->nkr_hwofs = new_hwofs;
3965 	if (tx == NR_TX) {
3966 		kring->nr_hwtail = kring->nr_hwcur + lim;
3967 		if (kring->nr_hwtail > lim)
3968 			kring->nr_hwtail -= lim + 1;
3969 	}
3970 
3971 	/*
3972 	 * Wakeup on the individual and global selwait
3973 	 * We do the wakeup here, but the ring is not yet reconfigured.
3974 	 * However, we are under lock so there are no races.
3975 	 */
3976 	kring->nr_mode = NKR_NETMAP_ON;
3977 	kring->nm_notify(kring, 0);
3978 	return kring->ring->slot;
3979 }
3980 
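/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * typical use of netmap_reset() from a driver's RX ring initialization.
 * "rxr" and the descriptor layout are placeholders; the in-tree
 * netmap-enabled drivers contain complete examples.
 *
 *	struct netmap_adapter *na = NA(sc->ifp);
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, rxr->me, 0);
 *	u_int i;
 *
 *	if (slot != NULL) {	// native netmap mode is active on this ring
 *		for (i = 0; i < rxr->num_desc; i++) {
 *			int si = netmap_idx_n2k(na->rx_rings[rxr->me], i);
 *			uint64_t paddr;
 *
 *			PNMB(na, slot + si, &paddr);
 *			// program the NIC descriptor with the netmap buffer
 *			rxr->base[i].addr = htole64(paddr);
 *		}
 *	}
 */
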
3981 
3982 /*
3983  * Dispatch rx/tx interrupts to the netmap rings.
3984  *
3985  * "work_done" is non-null on the RX path, NULL for the TX path.
3986  * We rely on the OS to make sure that there is only one active
3987  * instance per queue, and that there is appropriate locking.
3988  *
3989  * The 'notify' routine depends on what the ring is attached to.
3990  * - for a netmap file descriptor, do a selwakeup on the individual
3991  *   waitqueue, plus one on the global one if needed
3992  *   (see netmap_notify)
3993  * - for a nic connected to a switch, call the proper forwarding routine
3994  *   (see netmap_bwrap_intr_notify)
3995  */
3996 int
3997 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
3998 {
3999 	struct netmap_kring *kring;
4000 	enum txrx t = (work_done ? NR_RX : NR_TX);
4001 
4002 	q &= NETMAP_RING_MASK;
4003 
4004 	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4005 	        nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
4006 	}
4007 
4008 	if (q >= nma_get_nrings(na, t))
4009 		return NM_IRQ_PASS; // not a physical queue
4010 
4011 	kring = NMR(na, t)[q];
4012 
4013 	if (kring->nr_mode == NKR_NETMAP_OFF) {
4014 		return NM_IRQ_PASS;
4015 	}
4016 
4017 	if (t == NR_RX) {
4018 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4019 		*work_done = 1; /* do not fire napi again */
4020 	}
4021 
4022 	return kring->nm_notify(kring, 0);
4023 }
4024 
4025 
4026 /*
4027  * Default functions to handle rx/tx interrupts from a physical device.
4028  * "work_done" is non-null on the RX path, NULL for the TX path.
4029  *
4030  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4031  * so that the caller proceeds with regular processing.
4032  * Otherwise call netmap_common_irq().
4033  *
4034  * If the card is connected to a netmap file descriptor,
4035  * do a selwakeup on the individual queue, plus one on the global one
4036  * if needed (multiqueue card _and_ there are multiqueue listeners),
4037  * and return NR_IRQ_COMPLETED.
4038  * and return NM_IRQ_COMPLETED.
4039  * Finally, if called on rx from an interface connected to a switch,
4040  * calls the proper forwarding routine.
4041  */
4042 int
4043 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4044 {
4045 	struct netmap_adapter *na = NA(ifp);
4046 
4047 	/*
4048 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4049 	 * we still use the regular driver even though the previous
4050 	 * check fails. It is unclear whether we should use
4051 	 * nm_native_on() here.
4052 	 */
4053 	if (!nm_netmap_on(na))
4054 		return NM_IRQ_PASS;
4055 
4056 	if (na->na_flags & NAF_SKIP_INTR) {
4057 		ND("use regular interrupt");
4058 		return NM_IRQ_PASS;
4059 	}
4060 
4061 	return netmap_common_irq(na, q, work_done);
4062 }
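
/*
 * Illustrative sketch (hypothetical driver code): how an RX interrupt
 * handler defers to netmap when the queue is in netmap mode; "rxr" is a
 * placeholder for the driver's per-queue state. On the TX side the
 * analogous helper is netmap_tx_irq(), which passes a NULL work_done.
 *
 *	u_int work_done = 0;
 *
 *	if (netmap_rx_irq(sc->ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *		return;		// netmap handled the interrupt
 *	// otherwise fall through to the regular rx processing
 */
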
4063 
4064 /* set/clear native flags and if_transmit/netdev_ops */
4065 void
4066 nm_set_native_flags(struct netmap_adapter *na)
4067 {
4068 	struct ifnet *ifp = na->ifp;
4069 
4070 	/* We do the setup for intercepting packets only if we are the
4071 	 * first user of this adapter. */
4072 	if (na->active_fds > 0) {
4073 		return;
4074 	}
4075 
4076 	na->na_flags |= NAF_NETMAP_ON;
4077 	nm_os_onenter(ifp);
4078 	nm_update_hostrings_mode(na);
4079 }
4080 
4081 void
4082 nm_clear_native_flags(struct netmap_adapter *na)
4083 {
4084 	struct ifnet *ifp = na->ifp;
4085 
4086 	/* We undo the setup for intercepting packets only if we are the
4087 	 * last user of this adapter. */
4088 	if (na->active_fds > 0) {
4089 		return;
4090 	}
4091 
4092 	nm_update_hostrings_mode(na);
4093 	nm_os_onexit(ifp);
4094 
4095 	na->na_flags &= ~NAF_NETMAP_ON;
4096 }
4097 
4098 /*
4099  * Module loader and unloader
4100  *
4101  * netmap_init() creates the /dev/netmap device and initializes
4102  * all global variables. Returns 0 on success, errno on failure
4103  * (though in practice this is not expected to happen).
4104  *
4105  * netmap_fini() destroys everything.
4106  */
4107 
4108 static struct cdev *netmap_dev; /* /dev/netmap character device. */
4109 extern struct cdevsw netmap_cdevsw;
4110 
4111 
4112 void
4113 netmap_fini(void)
4114 {
4115 	if (netmap_dev)
4116 		destroy_dev(netmap_dev);
4117 	/* we assume that there are no netmap users left */
4118 	nm_os_ifnet_fini();
4119 	netmap_uninit_bridges();
4120 	netmap_mem_fini();
4121 	NMG_LOCK_DESTROY();
4122 	nm_prinf("netmap: unloaded module.");
4123 }
4124 
4125 
4126 int
4127 netmap_init(void)
4128 {
4129 	int error;
4130 
4131 	NMG_LOCK_INIT();
4132 
4133 	error = netmap_mem_init();
4134 	if (error != 0)
4135 		goto fail;
4136 	/*
4137 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4138 	 * when the module is compiled in.
4139 	 * XXX could use make_dev_credv() to get error number
4140 	 */
4141 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4142 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4143 			      "netmap");
4144 	if (!netmap_dev)
4145 		goto fail;
4146 
4147 	error = netmap_init_bridges();
4148 	if (error)
4149 		goto fail;
4150 
4151 #ifdef __FreeBSD__
4152 	nm_os_vi_init_index();
4153 #endif
4154 
4155 	error = nm_os_ifnet_init();
4156 	if (error)
4157 		goto fail;
4158 
4159 	nm_prinf("netmap: loaded module");
4160 	return (0);
4161 fail:
4162 	netmap_fini();
4163 	return (EINVAL); /* may be incorrect */
4164 }
4165