xref: /freebsd/sys/dev/netmap/netmap.c (revision ff0ba87247820afbdfdc1b307c803f7923d0e4d3)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    select()able file descriptors on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
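 * As a quick illustration of steps 1-6, here is a hypothetical minimal
 * receive loop built on the netmap(4) userspace API ("em0", the single
 * ring and the lack of error handling are illustrative assumptions):
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	void *mem = mmap(0, req.nr_memsize,			// step 3
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	for (;;) {
 *		poll(&pfd, 1, -1);				// steps 5-6
 *		while (!nm_ring_empty(ring)) {			// step 4
 *			struct netmap_slot *slot = &ring->slot[ring->cur];
 *			char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *			// ... consume slot->len bytes starting at buf ...
 *			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *		}
 *	}
 *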
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When creating or deleting a port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
127 
128 /* --- internals ----
129  *
130  * Roadmap to the code that implements the above.
131  *
132  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133  * >    select()able file descriptors on which events are reported.
134  *
135  *  	Internally, we allocate a netmap_priv_d structure that will be
136  *  	initialized on ioctl(NIOCREGIF).
137  *
138  *      os-specific:
139  *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140  *  		     per-thread.
141  *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142  *  		     per-open.
143  *
144  * > 2. on each descriptor, the process issues an ioctl() to identify
145  * >    the interface that should report events to the file descriptor.
146  *
147  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148  * 	Most important things happen in netmap_get_na() and
149  * 	netmap_do_regif(), called from there. Additional details can be
150  * 	found in the comments above those functions.
151  *
152  * 	In all cases, this action creates (or takes a reference to) a
153  * 	netmap_*_adapter describing the port, and allocates a netmap_if
154  * 	and all necessary netmap rings, filling them with netmap buffers.
155  *
156  *      In this phase, the sync callbacks for each ring are set (these are used
157  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158  *      The adapter creation/initialization code puts them in the
159  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162  * 	actually call netmap_krings_create() to perform this and the other
163  * 	common stuff. netmap_krings_create() also takes care of the host rings,
164  * 	if needed, by setting their sync callbacks appropriately.
165  *
166  * 	Additional actions depend on the kind of netmap_adapter that has been
167  * 	registered:
168  *
169  * 	- netmap_hw_adapter:  	     [netmap.c]
170  * 	     This is a system netdev/ifp with native netmap support.
171  * 	     The ifp is detached from the host stack by redirecting:
172  * 	       - transmissions (from the network stack) to netmap_transmit()
173  * 	       - receive notifications to the nm_notify() callback for
174  * 	         this adapter. The callback is normally netmap_notify(), unless
175  * 	         the ifp is attached to a bridge using bwrap, in which case it
176  * 	         is netmap_bwrap_intr_notify().
177  *
178  * 	- netmap_generic_adapter:      [netmap_generic.c]
179  * 	      A system netdev/ifp without native netmap support.
180  *
181  * 	(the decision about native/non native support is taken in
182  * 	 netmap_get_hw_na(), called by netmap_get_na())
183  *
184  * 	- netmap_vp_adapter 		[netmap_vale.c]
185  * 	      Returned by netmap_get_bdg_na().
186  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187  * 	      are created on the fly if they don't already exist, and are
188  * 	      always attached to a bridge.
189  * 	      Persistent VALE ports must be created separately, and
190  * 	      then attached like normal NICs. The NIOCREGIF we are examining
191  * 	      will find them only if they had previously been created and
192  * 	      attached (see VALE_CTL below).
193  *
194  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195  * 	      Returned by netmap_get_pipe_na().
196  * 	      Both pipe ends are created, if they didn't already exist.
197  *
198  * 	- netmap_monitor_adapter      [netmap_monitor.c]
199  * 	      Returned by netmap_get_monitor_na().
200  * 	      If successful, the nm_sync callbacks of the monitored adapter
201  * 	      will be intercepted by the returned monitor.
202  *
203  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204  * 	      Cannot be obtained in this way; see VALE_CTL below.
205  *
206  *
207  * 	os-specific:
208  * 	    linux: we first go through linux_netmap_ioctl() to
209  * 	           adapt the FreeBSD interface to the linux one.
210  *
211  *
212  * > 3. on each descriptor, the process issues an mmap() request to
213  * >    map the shared memory region within the process' address space.
214  * >    The list of interesting queues is indicated by a location in
215  * >    the shared memory region.
216  *
217  *      os-specific:
218  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220  *
221  * > 4. using the functions in the netmap(4) userspace API, a process
222  * >    can look up the occupation state of a queue, access memory buffers,
223  * >    and retrieve received packets or enqueue packets to transmit.
224  *
225  * 	these actions do not involve the kernel.
226  *
227  * > 5. using some ioctl()s the process can synchronize the userspace view
228  * >    of the queue with the actual status in the kernel. This includes both
229  * >    receiving the notification of new packets, and transmitting new
230  * >    packets on the output interface.
231  *
232  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234  * 	structures, as initialized in step 2 and maybe later modified
235  * 	by a monitor. Monitors, however, will always call the original
236  * 	callback before doing anything else.
237  *
238  *
239  * > 6. select() or poll() can be used to wait for events on individual
240  * >    transmit or receive queues (or all queues for a given interface).
241  *
242  * 	Implemented in netmap_poll(). This will call the same nm_sync()
243  * 	callbacks as in step 5 above.
244  *
245  * 	os-specific:
246  * 		linux: we first go through linux_netmap_poll() to adapt
247  * 		       the FreeBSD interface to the linux one.
248  *
249  *
250  *  ----  VALE_CTL -----
251  *
252  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253  *  nr_cmd in the nmreq structure. These subcommands are handled by
254  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256  *  subcommands, respectively.
257  *
258  *  Any network interface known to the system (including a persistent VALE
259  *  port) can be attached to a VALE switch by issuing the
260  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262  *  attachment of other interfaces, instead, requires the creation of a
263  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264  *  netmap mode. This may require the creation of a netmap_generic_adapter if
265  *  we have no native support for the interface, or if generic adapters have
266  *  been forced by sysctl.
267  *
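 *  For instance, a hypothetical attach request (fd is an open /dev/netmap
 *  descriptor, error handling omitted) would look roughly like this:
 *
 *	struct nmreq req;
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "vale0:em0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &req);
 *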
268  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270  *  callback.  In the case of the bwrap, the callback creates the
271  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274  *  A generic adapter for the wrapped ifp will be created if needed, when
275  *  netmap_get_bdg_na() calls netmap_get_hw_na().
276  *
277  *
278  *  ---- DATAPATHS -----
279  *
280  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281  *
282  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283  *
284  *    - tx from netmap userspace:
285  *	 concurrently:
286  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287  *                kring->nm_sync() == DEVICE_netmap_txsync()
288  *           2) device interrupt handler
289  *                na->nm_notify()  == netmap_notify()
290  *    - rx from netmap userspace:
291  *       concurrently:
292  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293  *                kring->nm_sync() == DEVICE_netmap_rxsync()
294  *           2) device interrupt handler
295  *                na->nm_notify()  == netmap_notify()
296  *    - tx from host stack
297  *       concurrently:
298  *           1) host stack
299  *                netmap_transmit()
300  *                  na->nm_notify  == netmap_notify()
301  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302  *                kring->nm_sync() == netmap_rxsync_from_host_compat
303  *                  netmap_rxsync_from_host(na, NULL, NULL)
304  *    - tx to host stack
305  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306  *             kring->nm_sync() == netmap_txsync_to_host_compat
307  *               netmap_txsync_to_host(na)
308  *                 NM_SEND_UP()
309  *                   FreeBSD: na->if_input() == ?? XXX
310  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311  *
312  *
313  *
314  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315  *
316  *
317  *
318  *                           -= VALE PORT =-
319  *
320  *
321  *
322  *                           -= NETMAP PIPE =-
323  *
324  *
325  *
326  *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
327  *
328  *
329  *
330  *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
331  *
332  *
333  *
334  *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
335  *
336  *
337  *
338  *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
339  *
340  *
341  *
342  */
343 
344 /*
345  * OS-specific code that is used only within this file.
346  * Other OS-specific code that must be accessed by drivers
347  * is present in netmap_kern.h
348  */
349 
350 #if defined(__FreeBSD__)
351 #include <sys/cdefs.h> /* prerequisite */
352 #include <sys/types.h>
353 #include <sys/errno.h>
354 #include <sys/param.h>	/* defines used in kernel.h */
355 #include <sys/kernel.h>	/* types used in module initialization */
356 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
357 #include <sys/filio.h>	/* FIONBIO */
358 #include <sys/sockio.h>
359 #include <sys/socketvar.h>	/* struct socket */
360 #include <sys/malloc.h>
361 #include <sys/poll.h>
362 #include <sys/rwlock.h>
363 #include <sys/socket.h> /* sockaddrs */
364 #include <sys/selinfo.h>
365 #include <sys/sysctl.h>
366 #include <sys/jail.h>
367 #include <net/vnet.h>
368 #include <net/if.h>
369 #include <net/if_var.h>
370 #include <net/bpf.h>		/* BIOCIMMEDIATE */
371 #include <machine/bus.h>	/* bus_dmamap_* */
372 #include <sys/endian.h>
373 #include <sys/refcount.h>
374 
375 
376 /* reduce conditional code */
377 // linux API, use for the knlist in FreeBSD
378 /* use a private mutex for the knlist */
379 #define init_waitqueue_head(x) do {			\
380 	struct mtx *m = &(x)->m;			\
381 	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
382 	knlist_init_mtx(&(x)->si.si_note, m);		\
383     } while (0)
384 
385 #define OS_selrecord(a, b)	selrecord(a, &((b)->si))
386 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
387 
388 #elif defined(linux)
389 
390 #include "bsd_glue.h"
391 
392 
393 
394 #elif defined(__APPLE__)
395 
396 #warning OSX support is only partial
397 #include "osx_glue.h"
398 
399 #else
400 
401 #error	Unsupported platform
402 
403 #endif /* unsupported */
404 
405 /*
406  * common headers
407  */
408 #include <net/netmap.h>
409 #include <dev/netmap/netmap_kern.h>
410 #include <dev/netmap/netmap_mem2.h>
411 
412 
413 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
414 
415 /*
416  * The following variables are used by the drivers and replicate
417  * fields in the global memory pool. They only refer to buffers
418  * used by physical interfaces.
419  */
420 u_int netmap_total_buffers;
421 u_int netmap_buf_size;
422 char *netmap_buffer_base;	/* also address of an invalid buffer */
423 
424 /* user-controlled variables */
425 int netmap_verbose;
426 
427 static int netmap_no_timestamp; /* don't timestamp on rxsync */
428 
429 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
430 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
431     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
432 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
433     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
434 int netmap_mitigate = 1;
435 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
436 int netmap_no_pendintr = 1;
437 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
438     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
439 int netmap_txsync_retry = 2;
440 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
441     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
442 
443 int netmap_adaptive_io = 0;
444 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
445     &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
446 
447 int netmap_flags = 0;	/* debug flags */
448 int netmap_fwd = 0;	/* force transparent mode */
449 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
450 
451 /*
452  * netmap_admode selects the netmap mode to use.
453  * Invalid values are reset to NETMAP_ADMODE_BEST
454  */
455 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
456 	NETMAP_ADMODE_NATIVE,	/* either native or none */
457 	NETMAP_ADMODE_GENERIC,	/* force generic */
458 	NETMAP_ADMODE_LAST };
459 static int netmap_admode = NETMAP_ADMODE_BEST;
460 
461 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
462 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
463 int netmap_generic_rings = 1;   /* number of queues in generic. */
464 
465 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
466 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
467 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
468 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
469 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
470 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
471 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
472 
473 NMG_LOCK_T	netmap_global_lock;
474 
475 
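/*
 * Mark the kring as busy, waiting for any *_*xsync() in progress
 * to complete (the busy flag is cleared by nm_kr_put()).
 */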
476 static void
477 nm_kr_get(struct netmap_kring *kr)
478 {
479 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
480 		tsleep(kr, 0, "NM_KR_GET", 4);
481 }
482 
483 
484 /*
485  * mark the ring as stopped, and run through the locks
486  * to make sure other users get to see it.
487  */
488 static void
489 netmap_disable_ring(struct netmap_kring *kr)
490 {
491 	kr->nkr_stopped = 1;
492 	nm_kr_get(kr);
493 	mtx_lock(&kr->q_lock);
494 	mtx_unlock(&kr->q_lock);
495 	nm_kr_put(kr);
496 }
497 
498 /* stop or enable a single tx ring */
499 void
500 netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
501 {
502 	if (stopped)
503 		netmap_disable_ring(na->tx_rings + ring_id);
504 	else
505 		na->tx_rings[ring_id].nkr_stopped = 0;
506 	/* notify that the stopped state has changed. This is currently
507 	 * only used by bwrap to propagate the state to its own krings.
508 	 * (see netmap_bwrap_intr_notify).
509 	 */
510 	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
511 }
512 
513 /* stop or enable a single rx ring */
514 void
515 netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
516 {
517 	if (stopped)
518 		netmap_disable_ring(na->rx_rings + ring_id);
519 	else
520 		na->rx_rings[ring_id].nkr_stopped = 0;
521 	/* notify that the stopped state has changed. This is currently
522 	 * only used by bwrap to propagate the state to its own krings.
523 	 * (see netmap_bwrap_intr_notify).
524 	 */
525 	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
526 }
527 
528 
529 /* stop or enable all the rings of na */
530 void
531 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
532 {
533 	int i;
534 	u_int ntx, nrx;
535 
536 	if (!nm_netmap_on(na))
537 		return;
538 
539 	ntx = netmap_real_tx_rings(na);
540 	nrx = netmap_real_rx_rings(na);
541 
542 	for (i = 0; i < ntx; i++) {
543 		netmap_set_txring(na, i, stopped);
544 	}
545 
546 	for (i = 0; i < nrx; i++) {
547 		netmap_set_rxring(na, i, stopped);
548 	}
549 }
550 
551 /*
552  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
553  * to finish and prevents any new one from starting.  Call this before turning
554  * netmap mode off, or before removing the hardware rings (e.g., on module
555  * unload).  As a rule of thumb for linux drivers, this should be placed near
556  * each napi_disable().
557  */
558 void
559 netmap_disable_all_rings(struct ifnet *ifp)
560 {
561 	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
562 }
563 
564 /*
565  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
566  * adapter's rings.  In linux drivers, this should be placed near each
567  * napi_enable().
568  */
569 void
570 netmap_enable_all_rings(struct ifnet *ifp)
571 {
572 	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
573 }
574 
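/*
 * As an example, a hypothetical FreeBSD driver reset path could bracket
 * its reinitialization with these two helpers (sketch only; the FOO_*
 * and foo_* names are placeholders):
 *
 *	netmap_disable_all_rings(ifp);
 *	FOO_LOCK(sc);
 *	foo_stop(sc);
 *	foo_init_locked(sc);
 *	FOO_UNLOCK(sc);
 *	netmap_enable_all_rings(ifp);
 */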
575 
576 /*
577  * generic bounds-checking function
578  */
579 u_int
580 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
581 {
582 	u_int oldv = *v;
583 	const char *op = NULL;
584 
585 	if (dflt < lo)
586 		dflt = lo;
587 	if (dflt > hi)
588 		dflt = hi;
589 	if (oldv < lo) {
590 		*v = dflt;
591 		op = "Bump";
592 	} else if (oldv > hi) {
593 		*v = hi;
594 		op = "Clamp";
595 	}
596 	if (op && msg)
597 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
598 	return *v;
599 }
600 
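/*
 * Typical (hypothetical) use, e.g. forcing a tunable into a sane range
 * at attach time:
 *
 *	u_int ringsize = requested_ringsize;	// hypothetical input value
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "ring size");
 */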
601 
602 /*
603  * packet-dump function, writing into a user-supplied or a static buffer.
604  * The destination buffer must be at least 30+4*len bytes.
605  */
606 const char *
607 nm_dump_buf(char *p, int len, int lim, char *dst)
608 {
609 	static char _dst[8192];
610 	int i, j, i0;
611 	static char hex[] ="0123456789abcdef";
612 	char *o;	/* output position */
613 
614 #define P_HI(x)	hex[((x) & 0xf0)>>4]
615 #define P_LO(x)	hex[((x) & 0xf)]
616 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
617 	if (!dst)
618 		dst = _dst;
619 	if (lim <= 0 || lim > len)
620 		lim = len;
621 	o = dst;
622 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
623 	o += strlen(o);
624 	/* hexdump routine */
625 	for (i = 0; i < lim; ) {
626 		sprintf(o, "%5d: ", i);
627 		o += strlen(o);
628 		memset(o, ' ', 48);
629 		i0 = i;
630 		for (j=0; j < 16 && i < lim; i++, j++) {
631 			o[j*3] = P_HI(p[i]);
632 			o[j*3+1] = P_LO(p[i]);
633 		}
634 		i = i0;
635 		for (j=0; j < 16 && i < lim; i++, j++)
636 			o[j + 48] = P_C(p[i]);
637 		o[j+48] = '\n';
638 		o += j+49;
639 	}
640 	*o = '\0';
641 #undef P_HI
642 #undef P_LO
643 #undef P_C
644 	return dst;
645 }
646 
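/*
 * Example: dump (at most) the first 128 bytes of a netmap buffer using the
 * static scratch buffer, as done for debugging in netmap_rxsync_from_host():
 *
 *	D("%s", nm_dump_buf(NMB(na, slot), slot->len, 128, NULL));
 */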
647 
648 /*
649  * Fetch configuration from the device, to cope with dynamic
650  * reconfigurations after loading the module.
651  */
652 /* call with NMG_LOCK held */
653 int
654 netmap_update_config(struct netmap_adapter *na)
655 {
656 	u_int txr, txd, rxr, rxd;
657 
658 	txr = txd = rxr = rxd = 0;
659 	if (na->nm_config) {
660 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
661 	} else {
662 		/* take whatever we had at init time */
663 		txr = na->num_tx_rings;
664 		txd = na->num_tx_desc;
665 		rxr = na->num_rx_rings;
666 		rxd = na->num_rx_desc;
667 	}
668 
669 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
670 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
671 		return 0; /* nothing changed */
672 	if (netmap_verbose || na->active_fds > 0) {
673 		D("stored config %s: txring %d x %d, rxring %d x %d",
674 			na->name,
675 			na->num_tx_rings, na->num_tx_desc,
676 			na->num_rx_rings, na->num_rx_desc);
677 		D("new config %s: txring %d x %d, rxring %d x %d",
678 			na->name, txr, txd, rxr, rxd);
679 	}
680 	if (na->active_fds == 0) {
681 		D("configuration changed (but fine)");
682 		na->num_tx_rings = txr;
683 		na->num_tx_desc = txd;
684 		na->num_rx_rings = rxr;
685 		na->num_rx_desc = rxd;
686 		return 0;
687 	}
688 	D("configuration changed while active, this is bad...");
689 	return 1;
690 }
691 
692 /* kring->nm_sync callback for the host tx ring */
693 static int
694 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
695 {
696 	(void)flags; /* unused */
697 	netmap_txsync_to_host(kring->na);
698 	return 0;
699 }
700 
701 /* kring->nm_sync callback for the host rx ring */
702 static int
703 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
704 {
705 	(void)flags; /* unused */
706 	netmap_rxsync_from_host(kring->na, NULL, NULL);
707 	return 0;
708 }
709 
710 
711 
712 /* create the krings array and initialize the fields common to all adapters.
713  * The array layout is this:
714  *
715  *                    +----------+
716  * na->tx_rings ----->|          | \
717  *                    |          |  } na->num_tx_rings
718  *                    |          | /
719  *                    +----------+
720  *                    |          |    host tx kring
721  * na->rx_rings ----> +----------+
722  *                    |          | \
723  *                    |          |  } na->num_rx_rings
724  *                    |          | /
725  *                    +----------+
726  *                    |          |    host rx kring
727  *                    +----------+
728  * na->tailroom ----->|          | \
729  *                    |          |  } tailroom bytes
730  *                    |          | /
731  *                    +----------+
732  *
733  * Note: for compatibility, host krings are created even when not needed.
734  * The tailroom space is currently used by vale ports for allocating leases.
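 *
 * A hardware adapter's nm_krings_create callback is then typically a thin
 * wrapper around this function; a hypothetical sketch with no extra
 * tailroom:
 *
 *	static int
 *	foo_netmap_krings_create(struct netmap_adapter *na)
 *	{
 *		return netmap_krings_create(na, 0);
 *	}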
735  */
736 /* call with NMG_LOCK held */
737 int
738 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
739 {
740 	u_int i, len, ndesc;
741 	struct netmap_kring *kring;
742 	u_int ntx, nrx;
743 
744 	/* account for the (possibly fake) host rings */
745 	ntx = na->num_tx_rings + 1;
746 	nrx = na->num_rx_rings + 1;
747 
748 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
749 
750 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
751 	if (na->tx_rings == NULL) {
752 		D("Cannot allocate krings");
753 		return ENOMEM;
754 	}
755 	na->rx_rings = na->tx_rings + ntx;
756 
757 	/*
758 	 * All fields in krings are 0 except the ones initialized below,
759 	 * but better be explicit on important kring fields.
760 	 */
761 	ndesc = na->num_tx_desc;
762 	for (i = 0; i < ntx; i++) { /* Transmit rings */
763 		kring = &na->tx_rings[i];
764 		bzero(kring, sizeof(*kring));
765 		kring->na = na;
766 		kring->ring_id = i;
767 		kring->nkr_num_slots = ndesc;
768 		if (i < na->num_tx_rings) {
769 			kring->nm_sync = na->nm_txsync;
770 		} else if (i == na->num_tx_rings) {
771 			kring->nm_sync = netmap_txsync_to_host_compat;
772 		}
773 		/*
774 		 * IMPORTANT: Always keep one slot empty.
775 		 */
776 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
777 		kring->rtail = kring->nr_hwtail = ndesc - 1;
778 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
779 		ND("ktx %s h %d c %d t %d",
780 			kring->name, kring->rhead, kring->rcur, kring->rtail);
781 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
782 		init_waitqueue_head(&kring->si);
783 	}
784 
785 	ndesc = na->num_rx_desc;
786 	for (i = 0; i < nrx; i++) { /* Receive rings */
787 		kring = &na->rx_rings[i];
788 		bzero(kring, sizeof(*kring));
789 		kring->na = na;
790 		kring->ring_id = i;
791 		kring->nkr_num_slots = ndesc;
792 		if (i < na->num_rx_rings) {
793 			kring->nm_sync = na->nm_rxsync;
794 		} else if (i == na->num_rx_rings) {
795 			kring->nm_sync = netmap_rxsync_from_host_compat;
796 		}
797 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
798 		kring->rtail = kring->nr_hwtail = 0;
799 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
800 		ND("krx %s h %d c %d t %d",
801 			kring->name, kring->rhead, kring->rcur, kring->rtail);
802 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
803 		init_waitqueue_head(&kring->si);
804 	}
805 	init_waitqueue_head(&na->tx_si);
806 	init_waitqueue_head(&na->rx_si);
807 
808 	na->tailroom = na->rx_rings + nrx;
809 
810 	return 0;
811 }
812 
813 
814 #ifdef __FreeBSD__
815 static void
816 netmap_knlist_destroy(NM_SELINFO_T *si)
817 {
818 	/* XXX kqueue(9) needed; these will mirror knlist_init. */
819 	knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
820 	knlist_destroy(&si->si.si_note);
821 	/* now we don't need the mutex anymore */
822 	mtx_destroy(&si->m);
823 }
824 #endif /* __FreeBSD__ */
825 
826 
827 /* undo the actions performed by netmap_krings_create */
828 /* call with NMG_LOCK held */
829 void
830 netmap_krings_delete(struct netmap_adapter *na)
831 {
832 	struct netmap_kring *kring = na->tx_rings;
833 
834 	/* we rely on the krings layout described above */
835 	for ( ; kring != na->tailroom; kring++) {
836 		mtx_destroy(&kring->q_lock);
837 		netmap_knlist_destroy(&kring->si);
838 	}
839 	free(na->tx_rings, M_DEVBUF);
840 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
841 }
842 
843 
844 /*
845  * Destructor for NIC ports. They also have an mbuf queue
846  * on the rings connected to the host so we need to purge
847  * them first.
848  */
849 /* call with NMG_LOCK held */
850 static void
851 netmap_hw_krings_delete(struct netmap_adapter *na)
852 {
853 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
854 
855 	ND("destroy sw mbq with len %d", mbq_len(q));
856 	mbq_purge(q);
857 	mbq_safe_destroy(q);
858 	netmap_krings_delete(na);
859 }
860 
861 
862 /* create a new netmap_if for a newly registered fd.
863  * If this is the first registration of the adapter,
864  * also create the netmap rings and their in-kernel view,
865  * the netmap krings.
866  */
867 /* call with NMG_LOCK held */
868 static struct netmap_if*
869 netmap_if_new(struct netmap_adapter *na)
870 {
871 	struct netmap_if *nifp;
872 
873 	if (netmap_update_config(na)) {
874 		/* configuration mismatch, report and fail */
875 		return NULL;
876 	}
877 
878 	if (na->active_fds)	/* already registered */
879 		goto final;
880 
881 	/* create and init the krings arrays.
882 	 * Depending on the adapter, this may also create
883 	 * the netmap rings themselves
884 	 */
885 	if (na->nm_krings_create(na))
886 		return NULL;
887 
888 	/* create all missing netmap rings */
889 	if (netmap_mem_rings_create(na))
890 		goto cleanup;
891 
892 final:
893 
894 	/* in all cases, create a new netmap if */
895 	nifp = netmap_mem_if_new(na);
896 	if (nifp == NULL)
897 		goto cleanup;
898 
899 	return (nifp);
900 
901 cleanup:
902 
903 	if (na->active_fds == 0) {
904 		netmap_mem_rings_delete(na);
905 		na->nm_krings_delete(na);
906 	}
907 
908 	return NULL;
909 }
910 
911 
912 /* grab a reference to the memory allocator, if we don't have one already.  The
913  * reference is taken from the netmap_adapter registered with the priv.
914  */
915 /* call with NMG_LOCK held */
916 static int
917 netmap_get_memory_locked(struct netmap_priv_d* p)
918 {
919 	struct netmap_mem_d *nmd;
920 	int error = 0;
921 
922 	if (p->np_na == NULL) {
923 		if (!netmap_mmap_unreg)
924 			return ENODEV;
925 		/* for compatibility with older versions of the API
926  		 * we use the global allocator when no interface has been
927  		 * registered
928  		 */
929 		nmd = &nm_mem;
930 	} else {
931 		nmd = p->np_na->nm_mem;
932 	}
933 	if (p->np_mref == NULL) {
934 		error = netmap_mem_finalize(nmd, p->np_na);
935 		if (!error)
936 			p->np_mref = nmd;
937 	} else if (p->np_mref != nmd) {
938 		/* a virtual port has been registered, but previous
939  		 * syscalls already used the global allocator.
940  		 * We cannot continue
941  		 */
942 		error = ENODEV;
943 	}
944 	return error;
945 }
946 
947 
948 /* call with NMG_LOCK *not* held */
949 int
950 netmap_get_memory(struct netmap_priv_d* p)
951 {
952 	int error;
953 	NMG_LOCK();
954 	error = netmap_get_memory_locked(p);
955 	NMG_UNLOCK();
956 	return error;
957 }
958 
959 
960 /* call with NMG_LOCK held */
961 static int
962 netmap_have_memory_locked(struct netmap_priv_d* p)
963 {
964 	return p->np_mref != NULL;
965 }
966 
967 
968 /* call with NMG_LOCK held */
969 static void
970 netmap_drop_memory_locked(struct netmap_priv_d* p)
971 {
972 	if (p->np_mref) {
973 		netmap_mem_deref(p->np_mref, p->np_na);
974 		p->np_mref = NULL;
975 	}
976 }
977 
978 
979 /*
980  * Call nm_register(ifp,0) to stop netmap mode on the interface and
981  * revert to normal operation.
982  * The second argument is the nifp to work on. In some cases it is
983  * not attached yet to the netmap_priv_d so we need to pass it as
984  * a separate argument.
985  */
986 /* call with NMG_LOCK held */
987 static void
988 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
989 {
990 	struct netmap_adapter *na = priv->np_na;
991 
992 	NMG_LOCK_ASSERT();
993 	na->active_fds--;
994 	if (na->active_fds <= 0) {	/* last instance */
995 
996 		if (netmap_verbose)
997 			D("deleting last instance for %s", na->name);
998 		/*
999 		 * (TO CHECK) This function is only called
1000 		 * when the last reference to this file descriptor goes
1001 		 * away. This means we cannot have any pending poll()
1002 		 * or interrupt routine operating on the structure.
1003 		 * XXX The file may be closed in a thread while
1004 		 * another thread is using it.
1005 		 * Linux keeps the file opened until the last reference
1006 		 * by any outstanding ioctl/poll or mmap is gone.
1007 		 * FreeBSD does not track mmap()s (but we do) and
1008 		 * wakes up any sleeping poll(). Need to check what
1009 		 * happens if the close() occurs while a concurrent
1010 		 * syscall is running.
1011 		 */
1012 		na->nm_register(na, 0); /* off, clear flags */
1013 		/* Wake up any sleeping threads. netmap_poll will
1014 		 * then return POLLERR
1015 		 * XXX The wake up now must happen during *_down(), when
1016 		 * we order all activities to stop. -gl
1017 		 */
1018 		netmap_knlist_destroy(&na->tx_si);
1019 		netmap_knlist_destroy(&na->rx_si);
1020 
1021 		/* delete rings and buffers */
1022 		netmap_mem_rings_delete(na);
1023 		na->nm_krings_delete(na);
1024 	}
1025 	/* delete the nifp */
1026 	netmap_mem_if_delete(na, nifp);
1027 }
1028 
1029 /* call with NMG_LOCK held */
1030 static __inline int
1031 nm_tx_si_user(struct netmap_priv_d *priv)
1032 {
1033 	return (priv->np_na != NULL &&
1034 		(priv->np_txqlast - priv->np_txqfirst > 1));
1035 }
1036 
1037 /* call with NMG_LOCK held */
1038 static __inline int
1039 nm_rx_si_user(struct netmap_priv_d *priv)
1040 {
1041 	return (priv->np_na != NULL &&
1042 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
1043 }
1044 
1045 
1046 /*
1047  * Destructor of the netmap_priv_d, called when the fd has
1048  * no active open() and mmap(). Also called in error paths.
1049  *
1050  * returns 1 if this is the last instance and we can free priv
1051  */
1052 /* call with NMG_LOCK held */
1053 int
1054 netmap_dtor_locked(struct netmap_priv_d *priv)
1055 {
1056 	struct netmap_adapter *na = priv->np_na;
1057 
1058 #ifdef __FreeBSD__
1059 	/*
1060 	 * np_refcount is the number of active mmaps on
1061 	 * this file descriptor
1062 	 */
1063 	if (--priv->np_refcount > 0) {
1064 		return 0;
1065 	}
1066 #endif /* __FreeBSD__ */
1067 	if (!na) {
1068 	    return 1; //XXX is it correct?
1069 	}
1070 	netmap_do_unregif(priv, priv->np_nifp);
1071 	priv->np_nifp = NULL;
1072 	netmap_drop_memory_locked(priv);
1073 	if (priv->np_na) {
1074 		if (nm_tx_si_user(priv))
1075 			na->tx_si_users--;
1076 		if (nm_rx_si_user(priv))
1077 			na->rx_si_users--;
1078 		netmap_adapter_put(na);
1079 		priv->np_na = NULL;
1080 	}
1081 	return 1;
1082 }
1083 
1084 
1085 /* call with NMG_LOCK *not* held */
1086 void
1087 netmap_dtor(void *data)
1088 {
1089 	struct netmap_priv_d *priv = data;
1090 	int last_instance;
1091 
1092 	NMG_LOCK();
1093 	last_instance = netmap_dtor_locked(priv);
1094 	NMG_UNLOCK();
1095 	if (last_instance) {
1096 		bzero(priv, sizeof(*priv));	/* for safety */
1097 		free(priv, M_DEVBUF);
1098 	}
1099 }
1100 
1101 
1102 
1103 
1104 /*
1105  * Handlers for synchronization of the queues from/to the host.
1106  * Netmap has two operating modes:
1107  * - in the default mode, the rings connected to the host stack are
1108  *   just another ring pair managed by userspace;
1109  * - in transparent mode (XXX to be defined) incoming packets
1110  *   (from the host or the NIC) are marked as NS_FORWARD upon
1111  *   arrival, and the user application has a chance to reset the
1112  *   flag for packets that should be dropped.
1113  *   On the RXSYNC or poll(), packets in RX rings between
1114  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1115  *   to the other side.
1116  * The transfer NIC --> host is relatively easy, just encapsulate
1117  * into mbufs and we are done. The host --> NIC side is slightly
1118  * harder because there might not be room in the tx ring so it
1119  * might take a while before releasing the buffer.
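 *
 * From userspace, opting a packet out of forwarding in transparent mode
 * is just a matter of clearing the flag before the next sync; a
 * hypothetical sketch (should_drop() is a placeholder):
 *
 *	struct netmap_slot *slot = &ring->slot[i];
 *	if (should_drop(NETMAP_BUF(ring, slot->buf_idx), slot->len))
 *		slot->flags &= ~NS_FORWARD;	// do not forward this one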
1120  */
1121 
1122 
1123 /*
1124  * pass a chain of buffers to the host stack as coming from 'dst'
1125  * We do not need to lock because the queue is private.
1126  */
1127 static void
1128 netmap_send_up(struct ifnet *dst, struct mbq *q)
1129 {
1130 	struct mbuf *m;
1131 
1132 	/* send packets up, outside the lock */
1133 	while ((m = mbq_dequeue(q)) != NULL) {
1134 		if (netmap_verbose & NM_VERB_HOST)
1135 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1136 		NM_SEND_UP(dst, m);
1137 	}
1138 	mbq_destroy(q);
1139 }
1140 
1141 
1142 /*
1143  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1144  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1145  * and pass them up. Drop remaining packets in the unlikely event
1146  * of an mbuf shortage.
1147  */
1148 static void
1149 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1150 {
1151 	u_int const lim = kring->nkr_num_slots - 1;
1152 	u_int const head = kring->ring->head;
1153 	u_int n;
1154 	struct netmap_adapter *na = kring->na;
1155 
1156 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1157 		struct mbuf *m;
1158 		struct netmap_slot *slot = &kring->ring->slot[n];
1159 
1160 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1161 			continue;
1162 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1163 			RD(5, "bad pkt at %d len %d", n, slot->len);
1164 			continue;
1165 		}
1166 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1167 		/* XXX TODO: adapt to the case of a multisegment packet */
1168 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1169 
1170 		if (m == NULL)
1171 			break;
1172 		mbq_enqueue(q, m);
1173 	}
1174 }
1175 
1176 
1177 /*
1178  * Send to the NIC rings packets marked NS_FORWARD between
1179  * kring->nr_hwcur and kring->rhead
1180  * Called under kring->rx_queue.lock on the sw rx ring.
1181  */
1182 static u_int
1183 netmap_sw_to_nic(struct netmap_adapter *na)
1184 {
1185 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1186 	struct netmap_slot *rxslot = kring->ring->slot;
1187 	u_int i, rxcur = kring->nr_hwcur;
1188 	u_int const head = kring->rhead;
1189 	u_int const src_lim = kring->nkr_num_slots - 1;
1190 	u_int sent = 0;
1191 
1192 	/* scan rings to find space, then fill as much as possible */
1193 	for (i = 0; i < na->num_tx_rings; i++) {
1194 		struct netmap_kring *kdst = &na->tx_rings[i];
1195 		struct netmap_ring *rdst = kdst->ring;
1196 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1197 
1198 		/* XXX do we trust ring or kring->rcur,rtail ? */
1199 		for (; rxcur != head && !nm_ring_empty(rdst);
1200 		     rxcur = nm_next(rxcur, src_lim) ) {
1201 			struct netmap_slot *src, *dst, tmp;
1202 			u_int dst_cur = rdst->cur;
1203 
1204 			src = &rxslot[rxcur];
1205 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1206 				continue;
1207 
1208 			sent++;
1209 
1210 			dst = &rdst->slot[dst_cur];
1211 
1212 			tmp = *src;
1213 
1214 			src->buf_idx = dst->buf_idx;
1215 			src->flags = NS_BUF_CHANGED;
1216 
1217 			dst->buf_idx = tmp.buf_idx;
1218 			dst->len = tmp.len;
1219 			dst->flags = NS_BUF_CHANGED;
1220 
1221 			rdst->cur = nm_next(dst_cur, dst_lim);
1222 		}
1223 		/* if (sent) XXX txsync ? */
1224 	}
1225 	return sent;
1226 }
1227 
1228 
1229 /*
1230  * netmap_txsync_to_host() passes packets up. We are called from a
1231  * system call in user process context, and the only contention
1232  * can be among multiple user threads erroneously calling
1233  * this routine concurrently.
1234  */
1235 void
1236 netmap_txsync_to_host(struct netmap_adapter *na)
1237 {
1238 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1239 	struct netmap_ring *ring = kring->ring;
1240 	u_int const lim = kring->nkr_num_slots - 1;
1241 	u_int const head = kring->rhead;
1242 	struct mbq q;
1243 
1244 	/* Take packets from hwcur to head and pass them up.
1245 	 * force ring->cur = head, since netmap_grab_packets() stops at head.
1246 	 * In case of no buffers we give up. At the end of the loop,
1247 	 * the queue is drained in all cases.
1248 	 */
1249 	mbq_init(&q);
1250 	ring->cur = head;
1251 	netmap_grab_packets(kring, &q, 1 /* force */);
1252 	ND("have %d pkts in queue", mbq_len(&q));
1253 	kring->nr_hwcur = head;
1254 	kring->nr_hwtail = head + lim;
1255 	if (kring->nr_hwtail > lim)
1256 		kring->nr_hwtail -= lim + 1;
1257 	nm_txsync_finalize(kring);
1258 
1259 	netmap_send_up(na->ifp, &q);
1260 }
1261 
1262 
1263 /*
1264  * rxsync backend for packets coming from the host stack.
1265  * They have been put in kring->rx_queue by netmap_transmit().
1266  * We protect access to the kring using kring->rx_queue.lock
1267  *
1268  * This routine also does the selrecord if called from the poll handler
1269  * (we know because td != NULL).
1270  *
1271  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1272  *     as an additional hidden argument.
1273  * returns the number of packets delivered to tx queues in
1274  * transparent mode, or a negative value if error
1275  */
1276 int
1277 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1278 {
1279 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1280 	struct netmap_ring *ring = kring->ring;
1281 	u_int nm_i, n;
1282 	u_int const lim = kring->nkr_num_slots - 1;
1283 	u_int const head = kring->rhead;
1284 	int ret = 0;
1285 	struct mbq *q = &kring->rx_queue;
1286 
1287 	(void)pwait;	/* disable unused warnings */
1288 	(void)td;
1289 
1290 	mbq_lock(q);
1291 
1292 	/* First part: import newly received packets */
1293 	n = mbq_len(q);
1294 	if (n) { /* grab packets from the queue */
1295 		struct mbuf *m;
1296 		uint32_t stop_i;
1297 
1298 		nm_i = kring->nr_hwtail;
1299 		stop_i = nm_prev(nm_i, lim);
1300 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1301 			int len = MBUF_LEN(m);
1302 			struct netmap_slot *slot = &ring->slot[nm_i];
1303 
1304 			m_copydata(m, 0, len, NMB(na, slot));
1305 			ND("nm %d len %d", nm_i, len);
1306 			if (netmap_verbose)
1307                                 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1308 
1309 			slot->len = len;
1310 			slot->flags = kring->nkr_slot_flags;
1311 			nm_i = nm_next(nm_i, lim);
1312 			m_freem(m);
1313 		}
1314 		kring->nr_hwtail = nm_i;
1315 	}
1316 
1317 	/*
1318 	 * Second part: skip past packets that userspace has released.
1319 	 */
1320 	nm_i = kring->nr_hwcur;
1321 	if (nm_i != head) { /* something was released */
1322 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1323 			ret = netmap_sw_to_nic(na);
1324 		kring->nr_hwcur = head;
1325 	}
1326 
1327 	nm_rxsync_finalize(kring);
1328 
1329 	/* access copies of cur,tail in the kring */
1330 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1331 		OS_selrecord(td, &kring->si);
1332 
1333 	mbq_unlock(q);
1334 	return ret;
1335 }
1336 
1337 
1338 /* Get a netmap adapter for the port.
1339  *
1340  * If it is possible to satisfy the request, return 0
1341  * with *na containing the netmap adapter found.
1342  * Otherwise return an error code, with *na containing NULL.
1343  *
1344  * When the port is attached to a bridge, we always return
1345  * EBUSY.
1346  * Otherwise, if the port is already bound to a file descriptor,
1347  * then we unconditionally return the existing adapter into *na.
1348  * In all the other cases, we return (into *na) either native,
1349  * generic or NULL, according to the following table:
1350  *
1351  *					native_support
1352  * active_fds   dev.netmap.admode         YES     NO
1353  * -------------------------------------------------------
1354  *    >0              *                 NA(ifp) NA(ifp)
1355  *
1356  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1357  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1358  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1359  *
1360  */
1361 
1362 int
1363 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1364 {
1365 	/* generic support */
1366 	int i = netmap_admode;	/* Take a snapshot. */
1367 	int error = 0;
1368 	struct netmap_adapter *prev_na;
1369 	struct netmap_generic_adapter *gna;
1370 
1371 	*na = NULL; /* default */
1372 
1373 	/* reset in case of invalid value */
1374 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1375 		i = netmap_admode = NETMAP_ADMODE_BEST;
1376 
1377 	if (NETMAP_CAPABLE(ifp)) {
1378 		prev_na = NA(ifp);
1379 		/* If an adapter already exists, return it if
1380 		 * there are active file descriptors or if
1381 		 * netmap is not forced to use generic
1382 		 * adapters.
1383 		 */
1384 		if (NETMAP_OWNED_BY_ANY(prev_na)
1385 			|| i != NETMAP_ADMODE_GENERIC
1386 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1387 #ifdef WITH_PIPES
1388 			/* ugly, but we cannot allow an adapter switch
1389 			 * if some pipe is referring to this one
1390 			 */
1391 			|| prev_na->na_next_pipe > 0
1392 #endif
1393 		) {
1394 			*na = prev_na;
1395 			return 0;
1396 		}
1397 	}
1398 
1399 	/* If there isn't native support and netmap is not allowed
1400 	 * to use generic adapters, we cannot satisfy the request.
1401 	 */
1402 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1403 		return EOPNOTSUPP;
1404 
1405 	/* Otherwise, create a generic adapter and return it,
1406 	 * saving the previously used netmap adapter, if any.
1407 	 *
1408 	 * Note that here 'prev_na', if not NULL, MUST be a
1409 	 * native adapter, and CANNOT be a generic one. This is
1410 	 * true because generic adapters are created on demand, and
1411 	 * destroyed when not used anymore. Therefore, if the adapter
1412 	 * currently attached to an interface 'ifp' is generic, it
1413 	 * must be that
1414 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1415 	 * Consequently, if NA(ifp) is generic, we will enter one of
1416 	 * the branches above. This ensures that we never override
1417 	 * a generic adapter with another generic adapter.
1418 	 */
1419 	prev_na = NA(ifp);
1420 	error = generic_netmap_attach(ifp);
1421 	if (error)
1422 		return error;
1423 
1424 	*na = NA(ifp);
1425 	gna = (struct netmap_generic_adapter*)NA(ifp);
1426 	gna->prev = prev_na; /* save old na */
1427 	if (prev_na != NULL) {
1428 		ifunit_ref(ifp->if_xname);
1429 		// XXX add a refcount ?
1430 		netmap_adapter_get(prev_na);
1431 	}
1432 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1433 
1434 	return 0;
1435 }
1436 
1437 
1438 /*
1439  * MUST BE CALLED UNDER NMG_LOCK()
1440  *
1441  * Get a refcounted reference to a netmap adapter attached
1442  * to the interface specified by nmr.
1443  * This is always called in the execution of an ioctl().
1444  *
1445  * Return ENXIO if the interface specified by the request does
1446  * not exist, ENOTSUP if netmap is not supported by the interface,
1447  * EBUSY if the interface is already attached to a bridge,
1448  * EINVAL if parameters are invalid, ENOMEM if needed resources
1449  * could not be allocated.
1450  * If successful, hold a reference to the netmap adapter.
1451  *
1452  * No reference is kept on the real interface, which may then
1453  * disappear at any time.
1454  */
1455 int
1456 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1457 {
1458 	struct ifnet *ifp = NULL;
1459 	int error = 0;
1460 	struct netmap_adapter *ret = NULL;
1461 
1462 	*na = NULL;     /* default return value */
1463 
1464 	NMG_LOCK_ASSERT();
1465 
1466 	/* we cascade through all possible types of netmap adapter.
1467 	 * All netmap_get_*_na() functions return an error and an na,
1468 	 * with the following combinations:
1469 	 *
1470 	 * error    na
1471 	 *   0	   NULL		type doesn't match
1472 	 *  !0	   NULL		type matches, but na creation/lookup failed
1473 	 *   0	  !NULL		type matches and na created/found
1474 	 *  !0    !NULL		impossible
1475 	 */
1476 
1477 	/* try to see if this is a monitor port */
1478 	error = netmap_get_monitor_na(nmr, na, create);
1479 	if (error || *na != NULL)
1480 		return error;
1481 
1482 	/* try to see if this is a pipe port */
1483 	error = netmap_get_pipe_na(nmr, na, create);
1484 	if (error || *na != NULL)
1485 		return error;
1486 
1487 	/* try to see if this is a bridge port */
1488 	error = netmap_get_bdg_na(nmr, na, create);
1489 	if (error)
1490 		return error;
1491 
1492 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1493 		goto pipes;
1494 
1495 	/*
1496 	 * This must be a hardware na, lookup the name in the system.
1497 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1498 	 * This may still be a tap, a veth/epair, or even a
1499 	 * persistent VALE port.
1500 	 */
1501 	ifp = ifunit_ref(nmr->nr_name);
1502 	if (ifp == NULL) {
1503 	        return ENXIO;
1504 	}
1505 
1506 	error = netmap_get_hw_na(ifp, &ret);
1507 	if (error)
1508 		goto out;
1509 
1510 	*na = ret;
1511 	netmap_adapter_get(ret);
1512 
1513 pipes:
1514 	/*
1515 	 * If we are opening a pipe whose parent was not in netmap mode,
1516 	 * we have to allocate the pipe array now.
1517 	 * XXX get rid of this clumsiness (2014-03-15)
1518 	 */
1519 	error = netmap_pipe_alloc(*na, nmr);
1520 
1521 out:
1522 	if (error && ret != NULL)
1523 		netmap_adapter_put(ret);
1524 
1525 	if (ifp)
1526 		if_rele(ifp); /* allow live unloading of drivers modules */
1527 
1528 	return error;
1529 }
1530 
1531 
1532 /*
1533  * validate parameters on entry for *_txsync()
1534  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1535  * in case of error.
1536  *
1537  * rhead, rcur and rtail=hwtail are stored from previous round.
1538  * hwcur is the next packet to send to the ring.
1539  *
1540  * We want
1541  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1542  *
1543  * hwcur, rhead, rtail and hwtail are reliable
1544  */
1545 u_int
1546 nm_txsync_prologue(struct netmap_kring *kring)
1547 {
1548 	struct netmap_ring *ring = kring->ring;
1549 	u_int head = ring->head; /* read only once */
1550 	u_int cur = ring->cur; /* read only once */
1551 	u_int n = kring->nkr_num_slots;
1552 
1553 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1554 		kring->name,
1555 		kring->nr_hwcur, kring->nr_hwtail,
1556 		ring->head, ring->cur, ring->tail);
1557 #if 1 /* kernel sanity checks; but we can trust the kring. */
1558 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1559 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1560 		goto error;
1561 #endif /* kernel sanity checks */
1562 	/*
1563 	 * user sanity checks. We only use 'cur'.
1564 	 * A, B, ... are possible positions for cur:
1565 	 *
1566 	 *  0    A  cur   B  tail  C  n-1
1567 	 *  0    D  tail  E  cur   F  n-1
1568 	 *
1569 	 * B, F, D are valid. A, C, E are wrong
1570 	 */
1571 	if (kring->rtail >= kring->rhead) {
1572 		/* want rhead <= head <= rtail */
1573 		if (head < kring->rhead || head > kring->rtail)
1574 			goto error;
1575 		/* and also head <= cur <= rtail */
1576 		if (cur < head || cur > kring->rtail)
1577 			goto error;
1578 	} else { /* here rtail < rhead */
1579 		/* we need head outside rtail .. rhead */
1580 		if (head > kring->rtail && head < kring->rhead)
1581 			goto error;
1582 
1583 		/* two cases now: head <= rtail or head >= rhead  */
1584 		if (head <= kring->rtail) {
1585 			/* want head <= cur <= rtail */
1586 			if (cur < head || cur > kring->rtail)
1587 				goto error;
1588 		} else { /* head >= rhead */
1589 			/* cur must be outside rtail..head */
1590 			if (cur > kring->rtail && cur < head)
1591 				goto error;
1592 		}
1593 	}
1594 	if (ring->tail != kring->rtail) {
1595 		RD(5, "tail overwritten was %d need %d",
1596 			ring->tail, kring->rtail);
1597 		ring->tail = kring->rtail;
1598 	}
1599 	kring->rhead = head;
1600 	kring->rcur = cur;
1601 	return head;
1602 
1603 error:
1604 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1605 		kring->name,
1606 		kring->nr_hwcur,
1607 		kring->rcur, kring->nr_hwtail,
1608 		cur, ring->tail);
1609 	return n;
1610 }
1611 
1612 
1613 /*
1614  * validate parameters on entry for *_rxsync()
1615  * Returns ring->head if ok, kring->nkr_num_slots on error.
1616  *
1617  * For a valid configuration,
1618  * hwcur <= head <= cur <= tail <= hwtail
1619  *
1620  * We only consider head and cur.
1621  * hwcur and hwtail are reliable.
1622  *
1623  */
1624 u_int
1625 nm_rxsync_prologue(struct netmap_kring *kring)
1626 {
1627 	struct netmap_ring *ring = kring->ring;
1628 	uint32_t const n = kring->nkr_num_slots;
1629 	uint32_t head, cur;
1630 
1631 	ND("%s kc %d kt %d h %d c %d t %d",
1632 		kring->name,
1633 		kring->nr_hwcur, kring->nr_hwtail,
1634 		ring->head, ring->cur, ring->tail);
1635 	/*
1636 	 * Before storing the new values, we should check they do not
1637 	 * move backwards. However:
1638 	 * - head is not an issue because the previous value is hwcur;
1639 	 * - cur could in principle go back, however it does not matter
1640 	 *   because we are processing a brand new rxsync()
1641 	 */
1642 	cur = kring->rcur = ring->cur;	/* read only once */
1643 	head = kring->rhead = ring->head;	/* read only once */
1644 #if 1 /* kernel sanity checks */
1645 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1646 		goto error;
1647 #endif /* kernel sanity checks */
1648 	/* user sanity checks */
1649 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1650 		/* want hwcur <= rhead <= hwtail */
1651 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1652 			goto error;
1653 		/* and also rhead <= rcur <= hwtail */
1654 		if (cur < head || cur > kring->nr_hwtail)
1655 			goto error;
1656 	} else {
1657 		/* we need rhead outside hwtail..hwcur */
1658 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1659 			goto error;
1660 		/* two cases now: head <= hwtail or head >= hwcur  */
1661 		if (head <= kring->nr_hwtail) {
1662 			/* want head <= cur <= hwtail */
1663 			if (cur < head || cur > kring->nr_hwtail)
1664 				goto error;
1665 		} else {
1666 			/* cur must be outside hwtail..head */
1667 			if (cur < head && cur > kring->nr_hwtail)
1668 				goto error;
1669 		}
1670 	}
1671 	if (ring->tail != kring->rtail) {
1672 		RD(5, "%s tail overwritten was %d need %d",
1673 			kring->name,
1674 			ring->tail, kring->rtail);
1675 		ring->tail = kring->rtail;
1676 	}
1677 	return head;
1678 
1679 error:
1680 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1681 		kring->nr_hwcur,
1682 		kring->rcur, kring->nr_hwtail,
1683 		kring->rhead, kring->rcur, ring->tail);
1684 	return n;
1685 }
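
/*
 * Illustrative sketch (not part of this file): the matching userspace
 * receive loop. Slots between head and tail hold received packets; the
 * application returns them to the kernel by advancing head (and cur),
 * which is what the checks above validate on the next rxsync.
 * consume_packet() is a hypothetical consumer; the helpers come from
 * <net/netmap_user.h>.
 */
#if 0	/* example only, excluded from the build */
static void
example_recv(int fd, struct netmap_ring *rxring)
{
	ioctl(fd, NIOCRXSYNC, NULL);		/* refresh ring->tail */
	while (!nm_ring_empty(rxring)) {
		uint32_t i = rxring->cur;
		struct netmap_slot *slot = &rxring->slot[i];
		char *buf = NETMAP_BUF(rxring, slot->buf_idx);

		consume_packet(buf, slot->len);	/* hypothetical consumer */
		rxring->head = rxring->cur = nm_ring_next(rxring, i);
	}
}
#endif	/* example only */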
1686 
1687 
1688 /*
1689  * Error routine called when txsync/rxsync detects an error.
1690  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1691  * Return 1 on reinit.
1692  *
1693  * This routine is only called by the upper half of the kernel.
1694  * It only reads hwcur (which is changed only by the upper half, too)
1695  * and hwtail (which may be changed by the lower half, but only on
1696  * a tx ring and only to increase it, so any error will be recovered
1697  * on the next call). For these reasons, we don't strictly need to call
1698  * it under lock.
1699  */
1700 int
1701 netmap_ring_reinit(struct netmap_kring *kring)
1702 {
1703 	struct netmap_ring *ring = kring->ring;
1704 	u_int i, lim = kring->nkr_num_slots - 1;
1705 	int errors = 0;
1706 
1707 	// XXX KASSERT nm_kr_tryget
1708 	RD(10, "called for %s", kring->name);
1709 	// XXX probably wrong to trust userspace
1710 	kring->rhead = ring->head;
1711 	kring->rcur  = ring->cur;
1712 	kring->rtail = ring->tail;
1713 
1714 	if (ring->cur > lim)
1715 		errors++;
1716 	if (ring->head > lim)
1717 		errors++;
1718 	if (ring->tail > lim)
1719 		errors++;
1720 	for (i = 0; i <= lim; i++) {
1721 		u_int idx = ring->slot[i].buf_idx;
1722 		u_int len = ring->slot[i].len;
1723 		if (idx < 2 || idx >= netmap_total_buffers) {
1724 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1725 			ring->slot[i].buf_idx = 0;
1726 			ring->slot[i].len = 0;
1727 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1728 			ring->slot[i].len = 0;
1729 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1730 		}
1731 	}
1732 	if (errors) {
1733 		RD(10, "total %d errors", errors);
1734 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1735 			kring->name,
1736 			ring->cur, kring->nr_hwcur,
1737 			ring->tail, kring->nr_hwtail);
1738 		ring->head = kring->rhead = kring->nr_hwcur;
1739 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1740 		ring->tail = kring->rtail = kring->nr_hwtail;
1741 	}
1742 	return (errors ? 1 : 0);
1743 }
1744 
1745 /* interpret the ringid and flags fields of an nmreq, by translating them
1746  * into a pair of intervals of ring indices:
1747  *
1748  * [priv->np_txqfirst, priv->np_txqlast) and
1749  * [priv->np_rxqfirst, priv->np_rxqlast)
1750  *
1751  */
1752 int
1753 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1754 {
1755 	struct netmap_adapter *na = priv->np_na;
1756 	u_int j, i = ringid & NETMAP_RING_MASK;
1757 	u_int reg = flags & NR_REG_MASK;
1758 
1759 	if (reg == NR_REG_DEFAULT) {
1760 		/* convert from old ringid to flags */
1761 		if (ringid & NETMAP_SW_RING) {
1762 			reg = NR_REG_SW;
1763 		} else if (ringid & NETMAP_HW_RING) {
1764 			reg = NR_REG_ONE_NIC;
1765 		} else {
1766 			reg = NR_REG_ALL_NIC;
1767 		}
1768 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1769 	}
1770 	switch (reg) {
1771 	case NR_REG_ALL_NIC:
1772 	case NR_REG_PIPE_MASTER:
1773 	case NR_REG_PIPE_SLAVE:
1774 		priv->np_txqfirst = 0;
1775 		priv->np_txqlast = na->num_tx_rings;
1776 		priv->np_rxqfirst = 0;
1777 		priv->np_rxqlast = na->num_rx_rings;
1778 		ND("%s %d %d", "ALL/PIPE",
1779 			priv->np_rxqfirst, priv->np_rxqlast);
1780 		break;
1781 	case NR_REG_SW:
1782 	case NR_REG_NIC_SW:
1783 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1784 			D("host rings not supported");
1785 			return EINVAL;
1786 		}
1787 		priv->np_txqfirst = (reg == NR_REG_SW ?
1788 			na->num_tx_rings : 0);
1789 		priv->np_txqlast = na->num_tx_rings + 1;
1790 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1791 			na->num_rx_rings : 0);
1792 		priv->np_rxqlast = na->num_rx_rings + 1;
1793 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1794 			priv->np_rxqfirst, priv->np_rxqlast);
1795 		break;
1796 	case NR_REG_ONE_NIC:
1797 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1798 			D("invalid ring id %d", i);
1799 			return EINVAL;
1800 		}
1801 		/* if not enough rings, use the first one */
1802 		j = i;
1803 		if (j >= na->num_tx_rings)
1804 			j = 0;
1805 		priv->np_txqfirst = j;
1806 		priv->np_txqlast = j + 1;
1807 		j = i;
1808 		if (j >= na->num_rx_rings)
1809 			j = 0;
1810 		priv->np_rxqfirst = j;
1811 		priv->np_rxqlast = j + 1;
1812 		break;
1813 	default:
1814 		D("invalid regif type %d", reg);
1815 		return EINVAL;
1816 	}
1817 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1818 
1819 	if (netmap_verbose) {
1820 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1821 			na->name,
1822 			priv->np_txqfirst,
1823 			priv->np_txqlast,
1824 			priv->np_rxqfirst,
1825 			priv->np_rxqlast,
1826 			i);
1827 	}
1828 	return 0;
1829 }
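
/*
 * Illustrative sketch (not part of this file): how the ringid/flags pair
 * interpreted above is typically filled from userspace. The constants are
 * the ones tested in netmap_interp_ringid(); the specific combinations
 * below are only examples.
 */
#if 0	/* example only, excluded from the build */
	struct nmreq req;

	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));	/* example name */
	req.nr_version = NETMAP_API;

	/* bind all hardware rings: */
	req.nr_flags = NR_REG_ALL_NIC;

	/* ...or a single hardware ring pair, e.g. ring 2: */
	req.nr_flags = NR_REG_ONE_NIC;
	req.nr_ringid = 2;

	/* optionally suppress tx processing on poll(): */
	req.nr_ringid |= NETMAP_NO_TX_POLL;

	ioctl(fd, NIOCREGIF, &req);
#endif	/* example only */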
1830 
1831 
1832 /*
1833  * Set the ring ID. For devices with a single queue, a request
1834  * for all rings is the same as a single ring.
1835  */
1836 static int
1837 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1838 {
1839 	struct netmap_adapter *na = priv->np_na;
1840 	int error;
1841 
1842 	error = netmap_interp_ringid(priv, ringid, flags);
1843 	if (error) {
1844 		return error;
1845 	}
1846 
1847 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1848 
1849 	/* optimization: count the users registered for more than
1850 	 * one ring, which are the ones sleeping on the global queue.
1851 	 * The default netmap_notify() callback will then
1852 	 * avoid signaling the global queue if nobody is using it
1853 	 */
1854 	if (nm_tx_si_user(priv))
1855 		na->tx_si_users++;
1856 	if (nm_rx_si_user(priv))
1857 		na->rx_si_users++;
1858 	return 0;
1859 }
1860 
1861 /*
1862  * possibly move the interface to netmap-mode.
1863  * On success it returns a pointer to the netmap_if, otherwise NULL.
1864  * This must be called with NMG_LOCK held.
1865  *
1866  * The following na callbacks are called in the process:
1867  *
1868  * na->nm_config()			[by netmap_update_config]
1869  * (get current number and size of rings)
1870  *
1871  *  	We have a generic one for linux (netmap_linux_config).
1872  *  	The bwrap has to override this, since it has to forward
1873  *  	the request to the wrapped adapter (netmap_bwrap_config).
1874  *
1875  *    	XXX netmap_if_new calls this again (2014-03-15)
1876  *
1877  * na->nm_krings_create()		[by netmap_if_new]
1878  * (create and init the krings array)
1879  *
1880  * 	One of the following:
1881  *
1882  *	* netmap_hw_krings_create, 			(hw ports)
1883  *		creates the standard layout for the krings
1884  * 		and adds the mbq (used for the host rings).
1885  *
1886  * 	* netmap_vp_krings_create			(VALE ports)
1887  * 		add leases and scratchpads
1888  *
1889  * 	* netmap_pipe_krings_create			(pipes)
1890  * 		create the krings and rings of both ends and
1891  * 		cross-link them
1892  *
1893  *      * netmap_monitor_krings_create 			(monitors)
1894  *      	avoid allocating the mbq
1895  *
1896  *      * netmap_bwrap_krings_create			(bwraps)
1897  *      	create the bwrap krings array,
1898  *      	the krings array of the wrapped adapter, and
1899  *      	(if needed) the fake array for the host adapter
1900  *
1901  * na->nm_register(, 1)
1902  * (put the adapter in netmap mode)
1903  *
1904  * 	This may be one of the following:
1905  * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1906  *
1907  * 	* netmap_hw_register				(hw ports)
1908  * 		checks that the ifp is still there, then calls
1909  * 		the hardware specific callback;
1910  *
1911  * 	* netmap_vp_reg					(VALE ports)
1912  *		If the port is connected to a bridge,
1913  *		set the NAF_NETMAP_ON flag under the
1914  *		bridge write lock.
1915  *
1916  *	* netmap_pipe_reg				(pipes)
1917  *		inform the other pipe end that it is no
1918  *		longer responsible for the lifetime of this
1919  *		pipe end
1920  *
1921  *	* netmap_monitor_reg				(monitors)
1922  *		intercept the sync callbacks of the monitored
1923  *		rings
1924  *
1925  *	* netmap_bwrap_register				(bwraps)
1926  *		cross-link the bwrap and hwna rings,
1927  *		forward the request to the hwna, override
1928  *		the hwna notify callback (to get the frames
1929  *		coming from outside go through the bridge).
1930  *
1931  * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
1932  *
1933  */
1934 struct netmap_if *
1935 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1936 	uint16_t ringid, uint32_t flags, int *err)
1937 {
1938 	struct netmap_if *nifp = NULL;
1939 	int error, need_mem = 0;
1940 
1941 	NMG_LOCK_ASSERT();
1942 	/* ring configuration may have changed, fetch from the card */
1943 	netmap_update_config(na);
1944 	priv->np_na = na;     /* store the reference */
1945 	error = netmap_set_ringid(priv, ringid, flags);
1946 	if (error)
1947 		goto out;
1948 	/* ensure allocators are ready */
1949 	need_mem = !netmap_have_memory_locked(priv);
1950 	if (need_mem) {
1951 		error = netmap_get_memory_locked(priv);
1952 		ND("get_memory returned %d", error);
1953 		if (error)
1954 			goto out;
1955 	}
1956 	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1957 	nifp = netmap_if_new(na);
1958 	if (nifp == NULL) { /* allocation failed */
1959 		error = ENOMEM;
1960 		goto out;
1961 	}
1962 	na->active_fds++;
1963 	if (!nm_netmap_on(na)) {
1964 		/* Netmap not active, set the card in netmap mode
1965 		 * and make it use the shared buffers.
1966 		 */
1967 		/* cache the allocator info in the na */
1968 		na->na_lut = netmap_mem_get_lut(na->nm_mem);
1969 		ND("%p->na_lut == %p", na, na->na_lut);
1970 		na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
1971 		na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
1972 		error = na->nm_register(na, 1); /* mode on */
1973 		if (error) {
1974 			netmap_do_unregif(priv, nifp);
1975 			nifp = NULL;
1976 		}
1977 	}
1978 out:
1979 	*err = error;
1980 	if (error) {
1981 		/* we should drop the allocator, but only
1982 		 * if we were the ones who grabbed it
1983 		 */
1984 		if (need_mem)
1985 			netmap_drop_memory_locked(priv);
1986 		priv->np_na = NULL;
1987 	}
1988 	if (nifp != NULL) {
1989 		/*
1990 		 * advertise that the interface is ready by setting np_nifp.
1991 		 * The barrier is needed because readers (poll and *SYNC)
1992 		 * check for priv->np_nifp != NULL without locking
1993 		 */
1994 		wmb(); /* make sure previous writes are visible to all CPUs */
1995 		priv->np_nifp = nifp;
1996 	}
1997 	return nifp;
1998 }
1999 
2000 
2001 
2002 /*
2003  * ioctl(2) support for the "netmap" device.
2004  *
2005  * Following a list of accepted commands:
2006  * - NIOCGINFO
2007  * - SIOCGIFADDR	just for convenience
2008  * - NIOCREGIF
2009  * - NIOCTXSYNC
2010  * - NIOCRXSYNC
2011  *
2012  * Return 0 on success, errno otherwise.
2013  */
2014 int
2015 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2016 	int fflag, struct thread *td)
2017 {
2018 	struct netmap_priv_d *priv = NULL;
2019 	struct nmreq *nmr = (struct nmreq *) data;
2020 	struct netmap_adapter *na = NULL;
2021 	int error;
2022 	u_int i, qfirst, qlast;
2023 	struct netmap_if *nifp;
2024 	struct netmap_kring *krings;
2025 
2026 	(void)dev;	/* UNUSED */
2027 	(void)fflag;	/* UNUSED */
2028 
2029 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2030 		/* truncate name */
2031 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2032 		if (nmr->nr_version != NETMAP_API) {
2033 			D("API mismatch for %s got %d need %d",
2034 				nmr->nr_name,
2035 				nmr->nr_version, NETMAP_API);
2036 			nmr->nr_version = NETMAP_API;
2037 		}
2038 		if (nmr->nr_version < NETMAP_MIN_API ||
2039 		    nmr->nr_version > NETMAP_MAX_API) {
2040 			return EINVAL;
2041 		}
2042 	}
2043 	CURVNET_SET(TD_TO_VNET(td));
2044 
2045 	error = devfs_get_cdevpriv((void **)&priv);
2046 	if (error) {
2047 		CURVNET_RESTORE();
2048 		/* XXX ENOENT should be impossible, since the priv
2049 		 * is now created in the open */
2050 		return (error == ENOENT ? ENXIO : error);
2051 	}
2052 
2053 	switch (cmd) {
2054 	case NIOCGINFO:		/* return capabilities etc */
2055 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2056 			error = netmap_bdg_ctl(nmr, NULL);
2057 			break;
2058 		}
2059 
2060 		NMG_LOCK();
2061 		do {
2062 			/* memsize is always valid */
2063 			struct netmap_mem_d *nmd = &nm_mem;
2064 			u_int memflags;
2065 
2066 			if (nmr->nr_name[0] != '\0') {
2067 				/* get a refcount */
2068 				error = netmap_get_na(nmr, &na, 1 /* create */);
2069 				if (error)
2070 					break;
2071 				nmd = na->nm_mem; /* get memory allocator */
2072 			}
2073 
2074 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2075 				&nmr->nr_arg2);
2076 			if (error)
2077 				break;
2078 			if (na == NULL) /* only memory info */
2079 				break;
2080 			nmr->nr_offset = 0;
2081 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2082 			netmap_update_config(na);
2083 			nmr->nr_rx_rings = na->num_rx_rings;
2084 			nmr->nr_tx_rings = na->num_tx_rings;
2085 			nmr->nr_rx_slots = na->num_rx_desc;
2086 			nmr->nr_tx_slots = na->num_tx_desc;
2087 			netmap_adapter_put(na);
2088 		} while (0);
2089 		NMG_UNLOCK();
2090 		break;
2091 
2092 	case NIOCREGIF:
2093 		/* possibly attach/detach NIC and VALE switch */
2094 		i = nmr->nr_cmd;
2095 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2096 				|| i == NETMAP_BDG_VNET_HDR
2097 				|| i == NETMAP_BDG_NEWIF
2098 				|| i == NETMAP_BDG_DELIF) {
2099 			error = netmap_bdg_ctl(nmr, NULL);
2100 			break;
2101 		} else if (i != 0) {
2102 			D("nr_cmd must be 0 not %d", i);
2103 			error = EINVAL;
2104 			break;
2105 		}
2106 
2107 		/* protect access to priv from concurrent NIOCREGIF */
2108 		NMG_LOCK();
2109 		do {
2110 			u_int memflags;
2111 
2112 			if (priv->np_na != NULL) {	/* thread already registered */
2113 				error = EBUSY;
2114 				break;
2115 			}
2116 			/* find the interface and a reference */
2117 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2118 			if (error)
2119 				break;
2120 			if (NETMAP_OWNED_BY_KERN(na)) {
2121 				netmap_adapter_put(na);
2122 				error = EBUSY;
2123 				break;
2124 			}
2125 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
2126 			if (!nifp) {    /* reg. failed, release priv and ref */
2127 				netmap_adapter_put(na);
2128 				priv->np_nifp = NULL;
2129 				break;
2130 			}
2131 			priv->np_td = td; // XXX kqueue, debugging only
2132 
2133 			/* return the offset of the netmap_if object */
2134 			nmr->nr_rx_rings = na->num_rx_rings;
2135 			nmr->nr_tx_rings = na->num_tx_rings;
2136 			nmr->nr_rx_slots = na->num_rx_desc;
2137 			nmr->nr_tx_slots = na->num_tx_desc;
2138 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2139 				&nmr->nr_arg2);
2140 			if (error) {
2141 				netmap_adapter_put(na);
2142 				break;
2143 			}
2144 			if (memflags & NETMAP_MEM_PRIVATE) {
2145 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2146 			}
2147 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
2148 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
2149 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
2150 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
2151 
2152 			if (nmr->nr_arg3) {
2153 				D("requested %d extra buffers", nmr->nr_arg3);
2154 				nmr->nr_arg3 = netmap_extra_alloc(na,
2155 					&nifp->ni_bufs_head, nmr->nr_arg3);
2156 				D("got %d extra buffers", nmr->nr_arg3);
2157 			}
2158 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2159 		} while (0);
2160 		NMG_UNLOCK();
2161 		break;
2162 
2163 	case NIOCTXSYNC:
2164 	case NIOCRXSYNC:
2165 		nifp = priv->np_nifp;
2166 
2167 		if (nifp == NULL) {
2168 			error = ENXIO;
2169 			break;
2170 		}
2171 		rmb(); /* make sure following reads are not from cache */
2172 
2173 		na = priv->np_na;      /* we have a reference */
2174 
2175 		if (na == NULL) {
2176 			D("Internal error: nifp != NULL && na == NULL");
2177 			error = ENXIO;
2178 			break;
2179 		}
2180 
2181 		if (!nm_netmap_on(na)) {
2182 			error = ENXIO;
2183 			break;
2184 		}
2185 
2186 		if (cmd == NIOCTXSYNC) {
2187 			krings = na->tx_rings;
2188 			qfirst = priv->np_txqfirst;
2189 			qlast = priv->np_txqlast;
2190 		} else {
2191 			krings = na->rx_rings;
2192 			qfirst = priv->np_rxqfirst;
2193 			qlast = priv->np_rxqlast;
2194 		}
2195 
2196 		for (i = qfirst; i < qlast; i++) {
2197 			struct netmap_kring *kring = krings + i;
2198 			if (nm_kr_tryget(kring)) {
2199 				error = EBUSY;
2200 				goto out;
2201 			}
2202 			if (cmd == NIOCTXSYNC) {
2203 				if (netmap_verbose & NM_VERB_TXSYNC)
2204 					D("pre txsync ring %d cur %d hwcur %d",
2205 					    i, kring->ring->cur,
2206 					    kring->nr_hwcur);
2207 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2208 					netmap_ring_reinit(kring);
2209 				} else {
2210 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
2211 				}
2212 				if (netmap_verbose & NM_VERB_TXSYNC)
2213 					D("post txsync ring %d cur %d hwcur %d",
2214 					    i, kring->ring->cur,
2215 					    kring->nr_hwcur);
2216 			} else {
2217 				kring->nm_sync(kring, NAF_FORCE_READ);
2218 				microtime(&na->rx_rings[i].ring->ts);
2219 			}
2220 			nm_kr_put(kring);
2221 		}
2222 
2223 		break;
2224 
2225 	case NIOCCONFIG:
2226 		error = netmap_bdg_config(nmr);
2227 		break;
2228 #ifdef __FreeBSD__
2229 	case FIONBIO:
2230 	case FIOASYNC:
2231 		ND("FIONBIO/FIOASYNC are no-ops");
2232 		break;
2233 
2234 	case BIOCIMMEDIATE:
2235 	case BIOCGHDRCMPLT:
2236 	case BIOCSHDRCMPLT:
2237 	case BIOCSSEESENT:
2238 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2239 		break;
2240 
2241 	default:	/* allow device-specific ioctls */
2242 	    {
2243 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2244 		if (ifp == NULL) {
2245 			error = ENXIO;
2246 		} else {
2247 			struct socket so;
2248 
2249 			bzero(&so, sizeof(so));
2250 			so.so_vnet = ifp->if_vnet;
2251 			// so->so_proto not null.
2252 			error = ifioctl(&so, cmd, data, td);
2253 			if_rele(ifp);
2254 		}
2255 		break;
2256 	    }
2257 
2258 #else /* linux */
2259 	default:
2260 		error = EOPNOTSUPP;
2261 #endif /* linux */
2262 	}
2263 out:
2264 
2265 	CURVNET_RESTORE();
2266 	return (error);
2267 }
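
/*
 * Illustrative sketch (not part of this file): the typical userspace
 * sequence that exercises the ioctls handled above. Error handling is
 * omitted; NETMAP_IF and NETMAP_TXRING come from <net/netmap_user.h>,
 * and "em0" is just an example interface name.
 */
#if 0	/* example only, excluded from the build */
	int fd = open("/dev/netmap", O_RDWR);
	struct nmreq req;
	void *mem;
	struct netmap_if *nifp;
	struct netmap_ring *txring;

	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	req.nr_flags = NR_REG_ALL_NIC;
	ioctl(fd, NIOCREGIF, &req);			/* NIOCREGIF above */

	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	nifp = NETMAP_IF(mem, req.nr_offset);
	txring = NETMAP_TXRING(nifp, 0);

	/* ... fill slots and advance head/cur, then ... */
	ioctl(fd, NIOCTXSYNC, NULL);			/* NIOCTXSYNC above */
#endif	/* example only */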
2268 
2269 
2270 /*
2271  * select(2) and poll(2) handlers for the "netmap" device.
2272  *
2273  * Can be called for one or more queues.
2274  * Return the event mask corresponding to ready events.
2275  * If there are no ready events, do a selrecord on either the individual
2276  * selinfo or the global one.
2277  * Device-dependent parts (locking and sync of tx/rx rings)
2278  * are done through callbacks.
2279  *
2280  * On Linux, the arguments are really 'pwait', the poll table, and 'td'
2281  * is a struct file *. The first one is remapped to pwait as selrecord()
2282  * uses the name as a hidden argument.
2283  */
2284 int
2285 netmap_poll(struct cdev *dev, int events, struct thread *td)
2286 {
2287 	struct netmap_priv_d *priv = NULL;
2288 	struct netmap_adapter *na;
2289 	struct netmap_kring *kring;
2290 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2291 	struct mbq q;		/* packets from hw queues to host stack */
2292 	void *pwait = dev;	/* linux compatibility */
2293 	int is_kevent = 0;
2294 
2295 	/*
2296 	 * In order to avoid nested locks, we need to "double check"
2297 	 * txsync and rxsync if we decide to do a selrecord().
2298 	 * retry_tx (and retry_rx, later) prevent looping forever.
2299 	 */
2300 	int retry_tx = 1, retry_rx = 1;
2301 
2302 	(void)pwait;
2303 	mbq_init(&q);
2304 
2305 	/*
2306 	 * XXX kevent has curthread->td_fpop == NULL,
2307 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2308 	 * priv as the first argument, which is also useful to avoid
2309 	 * the selrecord() calls, which are not necessary in that case.
2310 	 */
2311 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2312 		is_kevent = 1;
2313 		if (netmap_verbose)
2314 			D("called from kevent");
2315 		priv = (struct netmap_priv_d *)dev;
2316 	}
2317 	if (priv == NULL)
2318 		return POLLERR;
2319 
2320 	if (priv->np_nifp == NULL) {
2321 		D("No if registered");
2322 		return POLLERR;
2323 	}
2324 	rmb(); /* make sure following reads are not from cache */
2325 
2326 	na = priv->np_na;
2327 
2328 	if (!nm_netmap_on(na))
2329 		return POLLERR;
2330 
2331 	if (netmap_verbose & 0x8000)
2332 		D("device %s events 0x%x", na->name, events);
2333 	want_tx = events & (POLLOUT | POLLWRNORM);
2334 	want_rx = events & (POLLIN | POLLRDNORM);
2335 
2336 
2337 	/*
2338 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2339 	 * the file descriptor is bound to all of them. If so, we sleep on
2340 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2341 	 * (FreeBSD only allows two selinfo's per file descriptor).
2342 	 * The interrupt routine in the driver wakes one or the other
2343 	 * (or both) depending on which clients are active.
2344 	 *
2345 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2346 	 * txsync() is called if we run out of buffers on POLLOUT, or
2347 	 * there are pending packets to send. The latter can be disabled
2348 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2349 	 */
2350 	check_all_tx = nm_tx_si_user(priv);
2351 	check_all_rx = nm_rx_si_user(priv);
2352 
2353 	/*
2354 	 * We start with a lock free round which is cheap if we have
2355 	 * slots available. If this fails, then lock and call the sync
2356 	 * routines.
2357 	 */
2358 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2359 		kring = &na->rx_rings[i];
2360 		/* XXX compare ring->cur and kring->tail */
2361 		if (!nm_ring_empty(kring->ring)) {
2362 			revents |= want_rx;
2363 			want_rx = 0;	/* also breaks the loop */
2364 		}
2365 	}
2366 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2367 		kring = &na->tx_rings[i];
2368 		/* XXX compare ring->cur and kring->tail */
2369 		if (!nm_ring_empty(kring->ring)) {
2370 			revents |= want_tx;
2371 			want_tx = 0;	/* also breaks the loop */
2372 		}
2373 	}
2374 
2375 	/*
2376 	 * If we want to push packets out (priv->np_txpoll) or
2377 	 * want_tx is still set, we must issue txsync calls
2378 	 * (on all rings, to avoid that the tx rings stall).
2379 	 * XXX should also check cur != hwcur on the tx rings.
2380 	 * Fortunately, normal tx mode has np_txpoll set.
2381 	 */
2382 	if (priv->np_txpoll || want_tx) {
2383 		/*
2384 		 * The first round checks if anyone is ready, if not
2385 		 * do a selrecord and another round to handle races.
2386 		 * want_tx goes to 0 if any space is found, and is
2387 		 * used to skip rings with no pending transmissions.
2388 		 */
2389 flush_tx:
2390 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2391 			int found = 0;
2392 
2393 			kring = &na->tx_rings[i];
2394 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2395 				continue;
2396 			/* only one thread does txsync */
2397 			if (nm_kr_tryget(kring)) {
2398 				/* either busy or stopped
2399 				 * XXX if the ring is stopped, sleeping would
2400 				 * be better. In current code, however, we only
2401 				 * stop the rings for brief intervals (2014-03-14)
2402 				 */
2403 				if (netmap_verbose)
2404 					RD(2, "%p lost race on txring %d, ok",
2405 					    priv, i);
2406 				continue;
2407 			}
2408 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2409 				netmap_ring_reinit(kring);
2410 				revents |= POLLERR;
2411 			} else {
2412 				if (kring->nm_sync(kring, 0))
2413 					revents |= POLLERR;
2414 			}
2415 
2416 			/*
2417 			 * If we found new slots, notify potential
2418 			 * listeners on the same ring.
2419 			 * Since we just did a txsync, look at the copies
2420 			 * of cur,tail in the kring.
2421 			 */
2422 			found = kring->rcur != kring->rtail;
2423 			nm_kr_put(kring);
2424 			if (found) { /* notify other listeners */
2425 				revents |= want_tx;
2426 				want_tx = 0;
2427 				na->nm_notify(na, i, NR_TX, 0);
2428 			}
2429 		}
2430 		if (want_tx && retry_tx && !is_kevent) {
2431 			OS_selrecord(td, check_all_tx ?
2432 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2433 			retry_tx = 0;
2434 			goto flush_tx;
2435 		}
2436 	}
2437 
2438 	/*
2439 	 * If want_rx is still set scan receive rings.
2440 	 * Do it on all rings because otherwise we starve.
2441 	 */
2442 	if (want_rx) {
2443 		int send_down = 0; /* transparent mode */
2444 		/* two rounds here for race avoidance */
2445 do_retry_rx:
2446 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2447 			int found = 0;
2448 
2449 			kring = &na->rx_rings[i];
2450 
2451 			if (nm_kr_tryget(kring)) {
2452 				if (netmap_verbose)
2453 					RD(2, "%p lost race on rxring %d, ok",
2454 					    priv, i);
2455 				continue;
2456 			}
2457 
2458 			/*
2459 			 * transparent mode support: collect packets
2460 			 * from the rxring(s).
2461 			 * XXX NR_FORWARD should only be read on
2462 			 * physical or NIC ports
2463 			 */
2464 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2465 				ND(10, "forwarding some buffers up %d to %d",
2466 				    kring->nr_hwcur, kring->ring->cur);
2467 				netmap_grab_packets(kring, &q, netmap_fwd);
2468 			}
2469 
2470 			if (kring->nm_sync(kring, 0))
2471 				revents |= POLLERR;
2472 			if (netmap_no_timestamp == 0 ||
2473 					kring->ring->flags & NR_TIMESTAMP) {
2474 				microtime(&kring->ring->ts);
2475 			}
2476 			/* after an rxsync we can use kring->rcur, rtail */
2477 			found = kring->rcur != kring->rtail;
2478 			nm_kr_put(kring);
2479 			if (found) {
2480 				revents |= want_rx;
2481 				retry_rx = 0;
2482 				na->nm_notify(na, i, NR_RX, 0);
2483 			}
2484 		}
2485 
2486 		/* transparent mode XXX only during first pass ? */
2487 		if (na->na_flags & NAF_HOST_RINGS) {
2488 			kring = &na->rx_rings[na->num_rx_rings];
2489 			if (check_all_rx
2490 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2491 				/* XXX fix to use kring fields */
2492 				if (nm_ring_empty(kring->ring))
2493 					send_down = netmap_rxsync_from_host(na, td, dev);
2494 				if (!nm_ring_empty(kring->ring))
2495 					revents |= want_rx;
2496 			}
2497 		}
2498 
2499 		if (retry_rx && !is_kevent)
2500 			OS_selrecord(td, check_all_rx ?
2501 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2502 		if (send_down > 0 || retry_rx) {
2503 			retry_rx = 0;
2504 			if (send_down)
2505 				goto flush_tx; /* and retry_rx */
2506 			else
2507 				goto do_retry_rx;
2508 		}
2509 	}
2510 
2511 	/*
2512 	 * Transparent mode: marked bufs on rx rings between
2513 	 * kring->nr_hwcur and ring->head
2514 	 * are passed to the other endpoint.
2515 	 *
2516 	 * In this mode we also scan the sw rxring, which in
2517 	 * turn passes packets up.
2518 	 *
2519 	 * XXX Transparent mode at the moment requires binding all
2520 	 * rings to a single file descriptor.
2521 	 */
2522 
2523 	if (q.head && na->ifp != NULL)
2524 		netmap_send_up(na->ifp, &q);
2525 
2526 	return (revents);
2527 }
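
/*
 * Illustrative sketch (not part of this file): waiting for ring events
 * from userspace with poll(2), which ends up in netmap_poll() above.
 */
#if 0	/* example only, excluded from the build */
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };

	if (poll(&pfd, 1, 1000) > 0) {
		if (pfd.revents & POLLIN) {
			/* rx rings have new packets */
		}
		if (pfd.revents & POLLOUT) {
			/* tx rings have free slots */
		}
	}
#endif	/* example only */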
2528 
2529 
2530 /*-------------------- driver support routines -------------------*/
2531 
2532 static int netmap_hw_krings_create(struct netmap_adapter *);
2533 
2534 /* default notify callback */
2535 static int
2536 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2537 	enum txrx tx, int flags)
2538 {
2539 	struct netmap_kring *kring;
2540 
2541 	if (tx == NR_TX) {
2542 		kring = na->tx_rings + n_ring;
2543 		OS_selwakeup(&kring->si, PI_NET);
2544 		/* optimization: avoid a wake up on the global
2545 		 * queue if nobody has registered for more
2546 		 * than one ring
2547 		 */
2548 		if (na->tx_si_users > 0)
2549 			OS_selwakeup(&na->tx_si, PI_NET);
2550 	} else {
2551 		kring = na->rx_rings + n_ring;
2552 		OS_selwakeup(&kring->si, PI_NET);
2553 		/* optimization: same as above */
2554 		if (na->rx_si_users > 0)
2555 			OS_selwakeup(&na->rx_si, PI_NET);
2556 	}
2557 	return 0;
2558 }
2559 
2560 
2561 /* called by all routines that create netmap_adapters.
2562  * Attach na to the ifp (if any) and provide defaults
2563  * for optional callbacks. Defaults assume that we
2564  * are creating a hardware netmap_adapter.
2565  */
2566 int
2567 netmap_attach_common(struct netmap_adapter *na)
2568 {
2569 	struct ifnet *ifp = na->ifp;
2570 
2571 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2572 		D("%s: invalid rings tx %d rx %d",
2573 			na->name, na->num_tx_rings, na->num_rx_rings);
2574 		return EINVAL;
2575 	}
2576 	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2577 	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2578 	 * use by the external modules, but that is set after this
2579 	 * function has been called.
2580 	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2581 	 */
2582 	if (ifp != NULL) {
2583 		WNA(ifp) = na;
2584 
2585 	/* the following is only needed for na that use the host port.
2586 	 * XXX do we have something similar for linux ?
2587 	 */
2588 #ifdef __FreeBSD__
2589 		na->if_input = ifp->if_input; /* for netmap_send_up */
2590 #endif /* __FreeBSD__ */
2591 
2592 		NETMAP_SET_CAPABLE(ifp);
2593 	}
2594 	if (na->nm_krings_create == NULL) {
2595 		/* we assume that we have been called by a driver,
2596 		 * since other port types all provide their own
2597 		 * nm_krings_create
2598 		 */
2599 		na->nm_krings_create = netmap_hw_krings_create;
2600 		na->nm_krings_delete = netmap_hw_krings_delete;
2601 	}
2602 	if (na->nm_notify == NULL)
2603 		na->nm_notify = netmap_notify;
2604 	na->active_fds = 0;
2605 
2606 	if (na->nm_mem == NULL)
2607 		/* use the global allocator */
2608 		na->nm_mem = &nm_mem;
2609 	if (na->nm_bdg_attach == NULL)
2610 		/* no special nm_bdg_attach callback. On VALE
2611 		 * attach, we need to interpose a bwrap
2612 		 */
2613 		na->nm_bdg_attach = netmap_bwrap_attach;
2614 	return 0;
2615 }
2616 
2617 
2618 /* standard cleanup, called by all destructors */
2619 void
2620 netmap_detach_common(struct netmap_adapter *na)
2621 {
2622 	if (na->ifp != NULL)
2623 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2624 
2625 	if (na->tx_rings) { /* XXX should not happen */
2626 		D("freeing leftover tx_rings");
2627 		na->nm_krings_delete(na);
2628 	}
2629 	netmap_pipe_dealloc(na);
2630 	if (na->na_flags & NAF_MEM_OWNER)
2631 		netmap_mem_private_delete(na->nm_mem);
2632 	bzero(na, sizeof(*na));
2633 	free(na, M_DEVBUF);
2634 }
2635 
2636 /* Wrapper for the register callback provided by hardware drivers.
2637  * na->ifp == NULL means the driver module has been
2638  * unloaded, so we cannot call into it.
2639  * Note that module unloading, in our patched linux drivers,
2640  * happens under NMG_LOCK and after having stopped all the
2641  * nic rings (see netmap_detach). This provides sufficient
2642  * protection for the other driver-provided callbacks
2643  * (i.e., nm_config and nm_*xsync), which therefore don't need
2644  * to be wrapped.
2645  */
2646 static int
2647 netmap_hw_register(struct netmap_adapter *na, int onoff)
2648 {
2649 	struct netmap_hw_adapter *hwna =
2650 		(struct netmap_hw_adapter*)na;
2651 
2652 	if (na->ifp == NULL)
2653 		return onoff ? ENXIO : 0;
2654 
2655 	return hwna->nm_hw_register(na, onoff);
2656 }
2657 
2658 
2659 /*
2660  * Initialize a ``netmap_adapter`` object created by driver on attach.
2661  * We allocate a block of memory with room for a struct netmap_adapter
2662  * plus two sets of N+2 struct netmap_kring (where N is the number
2663  * of hardware rings):
2664  * krings	0..N-1	are for the hardware queues.
2665  * kring	N	is for the host stack queue
2666  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2667  * Return 0 on success, ENOMEM otherwise.
2668  */
2669 int
2670 netmap_attach(struct netmap_adapter *arg)
2671 {
2672 	struct netmap_hw_adapter *hwna = NULL;
2673 	// XXX when is arg == NULL ?
2674 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2675 
2676 	if (arg == NULL || ifp == NULL)
2677 		goto fail;
2678 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2679 	if (hwna == NULL)
2680 		goto fail;
2681 	hwna->up = *arg;
2682 	hwna->up.na_flags |= NAF_HOST_RINGS;
2683 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2684 	hwna->nm_hw_register = hwna->up.nm_register;
2685 	hwna->up.nm_register = netmap_hw_register;
2686 	if (netmap_attach_common(&hwna->up)) {
2687 		free(hwna, M_DEVBUF);
2688 		goto fail;
2689 	}
2690 	netmap_adapter_get(&hwna->up);
2691 
2692 #ifdef linux
2693 	if (ifp->netdev_ops) {
2694 		/* prepare a clone of the netdev ops */
2695 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2696 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2697 #else
2698 		hwna->nm_ndo = *ifp->netdev_ops;
2699 #endif
2700 	}
2701 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2702 	if (ifp->ethtool_ops) {
2703 		hwna->nm_eto = *ifp->ethtool_ops;
2704 	}
2705 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2706 #ifdef ETHTOOL_SCHANNELS
2707 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2708 #endif
2709 	if (arg->nm_config == NULL) {
2710 		hwna->up.nm_config = netmap_linux_config;
2711 	}
2712 #endif /* linux */
2713 
2714 	D("success for %s tx %d/%d rx %d/%d queues/slots",
2715 		hwna->up.name,
2716 		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2717 		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2718 		);
2719 	return 0;
2720 
2721 fail:
2722 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2723 	if (ifp)
2724 		netmap_detach(ifp);
2725 	return (hwna ? EINVAL : ENOMEM);
2726 }
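
/*
 * Illustrative sketch (not part of this file): how a hardware driver is
 * expected to fill a struct netmap_adapter and call netmap_attach() from
 * its attach routine. struct foo_softc and the foo_netmap_* callbacks are
 * hypothetical driver code.
 */
#if 0	/* example only, excluded from the build */
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_txsync = foo_netmap_txsync;
	na.nm_rxsync = foo_netmap_rxsync;
	na.nm_register = foo_netmap_reg;
	netmap_attach(&na);
}
#endif	/* example only */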
2727 
2728 
2729 void
2730 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2731 {
2732 	if (!na) {
2733 		return;
2734 	}
2735 
2736 	refcount_acquire(&na->na_refcount);
2737 }
2738 
2739 
2740 /* returns 1 iff the netmap_adapter is destroyed */
2741 int
2742 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2743 {
2744 	if (!na)
2745 		return 1;
2746 
2747 	if (!refcount_release(&na->na_refcount))
2748 		return 0;
2749 
2750 	if (na->nm_dtor)
2751 		na->nm_dtor(na);
2752 
2753 	netmap_detach_common(na);
2754 
2755 	return 1;
2756 }
2757 
2758 /* nm_krings_create callback for all hardware native adapters */
2759 int
2760 netmap_hw_krings_create(struct netmap_adapter *na)
2761 {
2762 	int ret = netmap_krings_create(na, 0);
2763 	if (ret == 0) {
2764 		/* initialize the mbq for the sw rx ring */
2765 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2766 		ND("initialized sw rx queue %d", na->num_rx_rings);
2767 	}
2768 	return ret;
2769 }
2770 
2771 
2772 
2773 /*
2774  * Called on module unload by the netmap-enabled drivers
2775  */
2776 void
2777 netmap_detach(struct ifnet *ifp)
2778 {
2779 	struct netmap_adapter *na = NA(ifp);
2780 
2781 	if (!na)
2782 		return;
2783 
2784 	NMG_LOCK();
2785 	netmap_disable_all_rings(ifp);
2786 	if (!netmap_adapter_put(na)) {
2787 		/* someone is still using the adapter,
2788 		 * tell them that the interface is gone
2789 		 */
2790 		na->ifp = NULL;
2791 		// XXX also clear NAF_NATIVE_ON ?
2792 		na->na_flags &= ~NAF_NETMAP_ON;
2793 		/* give them a chance to notice */
2794 		netmap_enable_all_rings(ifp);
2795 	}
2796 	NMG_UNLOCK();
2797 }
2798 
2799 
2800 /*
2801  * Intercept packets from the network stack and pass them
2802  * to netmap as incoming packets on the 'software' ring.
2803  *
2804  * We only store packets in a bounded mbq and then copy them
2805  * in the relevant rxsync routine.
2806  *
2807  * We rely on the OS to make sure that the ifp and na do not go
2808  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2809  * In nm_register() or whenever there is a reinitialization,
2810  * we make sure to make the mode change visible here.
2811  */
2812 int
2813 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2814 {
2815 	struct netmap_adapter *na = NA(ifp);
2816 	struct netmap_kring *kring;
2817 	u_int len = MBUF_LEN(m);
2818 	u_int error = ENOBUFS;
2819 	struct mbq *q;
2820 	int space;
2821 
2822 	// XXX [Linux] we do not need this lock
2823 	// if we follow the down/configure/up protocol -gl
2824 	// mtx_lock(&na->core_lock);
2825 
2826 	if (!nm_netmap_on(na)) {
2827 		D("%s not in netmap mode anymore", na->name);
2828 		error = ENXIO;
2829 		goto done;
2830 	}
2831 
2832 	kring = &na->rx_rings[na->num_rx_rings];
2833 	q = &kring->rx_queue;
2834 
2835 	// XXX reconsider long packets if we handle fragments
2836 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2837 		D("%s from_host, drop packet size %d > %d", na->name,
2838 			len, NETMAP_BUF_SIZE(na));
2839 		goto done;
2840 	}
2841 
2842 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2843 	 * and maybe other instances of netmap_transmit (the latter
2844 	 * not possible on Linux).
2845 	 * Also avoid overflowing the queue.
2846 	 */
2847 	mbq_lock(q);
2848 
2849 	space = kring->nr_hwtail - kring->nr_hwcur;
2850 	if (space < 0)
2851 		space += kring->nkr_num_slots;
2852 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2853 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2854 			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2855 			len, m);
2856 	} else {
2857 		mbq_enqueue(q, m);
2858 		ND(10, "%s %d bufs in queue len %d m %p",
2859 			na->name, mbq_len(q), len, m);
2860 		/* notify outside the lock */
2861 		m = NULL;
2862 		error = 0;
2863 	}
2864 	mbq_unlock(q);
2865 
2866 done:
2867 	if (m)
2868 		m_freem(m);
2869 	/* unconditionally wake up listeners */
2870 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2871 	/* this is normally netmap_notify(), but for nics
2872 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2873 	 * that possibly forwards the frames through the switch
2874 	 */
2875 
2876 	return (error);
2877 }
2878 
2879 
2880 /*
2881  * netmap_reset() is called by the driver routines when reinitializing
2882  * a ring. The driver is in charge of locking to protect the kring.
2883  * If native netmap mode is not set just return NULL.
2884  */
2885 struct netmap_slot *
2886 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2887 	u_int new_cur)
2888 {
2889 	struct netmap_kring *kring;
2890 	int new_hwofs, lim;
2891 
2892 	if (!nm_native_on(na)) {
2893 		ND("interface not in native netmap mode");
2894 		return NULL;	/* nothing to reinitialize */
2895 	}
2896 
2897 	/* XXX note- in the new scheme, we are not guaranteed to be
2898 	 * under lock (e.g. when called on a device reset).
2899 	 * In this case, we should set a flag and do not trust too
2900 	 * much the values. In practice: TODO
2901 	 * - set a RESET flag somewhere in the kring
2902 	 * - do the processing in a conservative way
2903 	 * - let the *sync() fixup at the end.
2904 	 */
2905 	if (tx == NR_TX) {
2906 		if (n >= na->num_tx_rings)
2907 			return NULL;
2908 		kring = na->tx_rings + n;
2909 		// XXX check whether we should use hwcur or rcur
2910 		new_hwofs = kring->nr_hwcur - new_cur;
2911 	} else {
2912 		if (n >= na->num_rx_rings)
2913 			return NULL;
2914 		kring = na->rx_rings + n;
2915 		new_hwofs = kring->nr_hwtail - new_cur;
2916 	}
2917 	lim = kring->nkr_num_slots - 1;
2918 	if (new_hwofs > lim)
2919 		new_hwofs -= lim + 1;
2920 
2921 	/* Always set the new offset value and realign the ring. */
2922 	if (netmap_verbose)
2923 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2924 		na->name,
2925 		tx == NR_TX ? "TX" : "RX", n,
2926 		kring->nkr_hwofs, new_hwofs,
2927 		kring->nr_hwtail,
2928 		tx == NR_TX ? lim : kring->nr_hwtail);
2929 	kring->nkr_hwofs = new_hwofs;
2930 	if (tx == NR_TX) {
2931 		kring->nr_hwtail = kring->nr_hwcur + lim;
2932 		if (kring->nr_hwtail > lim)
2933 			kring->nr_hwtail -= lim + 1;
2934 	}
2935 
2936 #if 0 // def linux
2937 	/* XXX check that the mappings are correct */
2938 	/* need ring_nr, adapter->pdev, direction */
2939 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2940 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2941 		D("error mapping rx netmap buffer %d", i);
2942 		// XXX fix error handling
2943 	}
2944 
2945 #endif /* linux */
2946 	/*
2947 	 * Wakeup on the individual and global selwait
2948 	 * We do the wakeup here, but the ring is not yet reconfigured.
2949 	 * However, we are under lock so there are no races.
2950 	 */
2951 	na->nm_notify(na, n, tx, 0);
2952 	return kring->ring->slot;
2953 }
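
/*
 * Illustrative sketch (not part of this file): a driver reinitializing an
 * RX ring and reprogramming buffer addresses from the slots returned by
 * netmap_reset(). PNMB() comes from the netmap kernel headers;
 * foo_set_rxdesc() and struct foo_softc are hypothetical driver code.
 */
#if 0	/* example only, excluded from the build */
	struct netmap_adapter *na = NA(sc->ifp);
	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
	uint64_t paddr;
	void *vaddr;
	u_int i;

	if (slot != NULL) {	/* non-NULL only in native netmap mode */
		for (i = 0; i < na->num_rx_desc; i++) {
			vaddr = PNMB(na, slot + i, &paddr);
			foo_set_rxdesc(sc, ring_nr, i, paddr, vaddr);
		}
	}
#endif	/* example only */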
2954 
2955 
2956 /*
2957  * Dispatch rx/tx interrupts to the netmap rings.
2958  *
2959  * "work_done" is non-null on the RX path, NULL for the TX path.
2960  * We rely on the OS to make sure that there is only one active
2961  * instance per queue, and that there is appropriate locking.
2962  *
2963  * The 'notify' routine depends on what the ring is attached to.
2964  * - for a netmap file descriptor, do a selwakeup on the individual
2965  *   waitqueue, plus one on the global one if needed
2966  *   (see netmap_notify)
2967  * - for a nic connected to a switch, call the proper forwarding routine
2968  *   (see netmap_bwrap_intr_notify)
2969  */
2970 void
2971 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2972 {
2973 	struct netmap_adapter *na = NA(ifp);
2974 	struct netmap_kring *kring;
2975 
2976 	q &= NETMAP_RING_MASK;
2977 
2978 	if (netmap_verbose) {
2979 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
2980 	}
2981 
2982 	if (work_done) { /* RX path */
2983 		if (q >= na->num_rx_rings)
2984 			return;	// not a physical queue
2985 		kring = na->rx_rings + q;
2986 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2987 		na->nm_notify(na, q, NR_RX, 0);
2988 		*work_done = 1; /* do not fire napi again */
2989 	} else { /* TX path */
2990 		if (q >= na->num_tx_rings)
2991 			return;	// not a physical queue
2992 		kring = na->tx_rings + q;
2993 		na->nm_notify(na, q, NR_TX, 0);
2994 	}
2995 }
2996 
2997 
2998 /*
2999  * Default functions to handle rx/tx interrupts from a physical device.
3000  * "work_done" is non-null on the RX path, NULL for the TX path.
3001  *
3002  * If the card is not in netmap mode, simply return 0,
3003  * so that the caller proceeds with regular processing.
3004  * Otherwise call netmap_common_irq() and return 1.
3005  *
3006  * If the card is connected to a netmap file descriptor,
3007  * do a selwakeup on the individual queue, plus one on the global one
3008  * if needed (multiqueue card _and_ there are multiqueue listeners),
3009  * and return 1.
3010  *
3011  * Finally, if called on rx from an interface connected to a switch,
3012  * calls the proper forwarding routine, and return 1.
3013  */
3014 int
3015 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3016 {
3017 	struct netmap_adapter *na = NA(ifp);
3018 
3019 	/*
3020 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3021 	 * we still use the regular driver even though the previous
3022 	 * check fails. It is unclear whether we should use
3023 	 * nm_native_on() here.
3024 	 */
3025 	if (!nm_netmap_on(na))
3026 		return 0;
3027 
3028 	if (na->na_flags & NAF_SKIP_INTR) {
3029 		ND("use regular interrupt");
3030 		return 0;
3031 	}
3032 
3033 	netmap_common_irq(ifp, q, work_done);
3034 	return 1;
3035 }
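
/*
 * Illustrative sketch (not part of this file): how a driver's RX interrupt
 * (or NAPI) handler typically short-circuits into netmap through
 * netmap_rx_irq(). sc and ring_nr are hypothetical driver variables.
 */
#if 0	/* example only, excluded from the build */
	u_int work_done = 0;

	if (netmap_rx_irq(sc->ifp, ring_nr, &work_done))
		return;		/* netmap consumed the interrupt */
	/* ... otherwise fall through to the regular RX processing ... */
#endif	/* example only */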
3036 
3037 
3038 /*
3039  * Module loader and unloader
3040  *
3041  * netmap_init() creates the /dev/netmap device and initializes
3042  * all global variables. Returns 0 on success, errno on failure
3043  * (but there is no chance)
3044  * (which should not happen in practice).
3045  * netmap_fini() destroys everything.
3046  */
3047 
3048 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3049 extern struct cdevsw netmap_cdevsw;
3050 
3051 
3052 void
3053 netmap_fini(void)
3054 {
3055 	// XXX destroy_bridges() ?
3056 	if (netmap_dev)
3057 		destroy_dev(netmap_dev);
3058 	netmap_mem_fini();
3059 	NMG_LOCK_DESTROY();
3060 	printf("netmap: unloaded module.\n");
3061 }
3062 
3063 
3064 int
3065 netmap_init(void)
3066 {
3067 	int error;
3068 
3069 	NMG_LOCK_INIT();
3070 
3071 	error = netmap_mem_init();
3072 	if (error != 0)
3073 		goto fail;
3074 	/* XXX could use make_dev_credv() to get error number */
3075 #ifdef __FreeBSD__
3076 	/* support for the 'eternal' flag */
3077 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3078 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0660,
3079 			      "netmap");
3080 #else
3081 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
3082 			      "netmap");
3083 #endif
3084 	if (!netmap_dev)
3085 		goto fail;
3086 
3087 	netmap_init_bridges();
3088 #ifdef __FreeBSD__
3089 	nm_vi_init_index();
3090 #endif
3091 	printf("netmap: loaded module\n");
3092 	return (0);
3093 fail:
3094 	netmap_fini();
3095 	return (EINVAL); /* may be incorrect */
3096 }
3097