1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    select()able file descriptors on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
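 *
 *    As an illustration of steps 1-6, here is a minimal userspace sketch
 *    (error handling omitted; "em0" is only an example interface name;
 *    see netmap(4) and net/netmap_user.h for the authoritative API):
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	void *mem = mmap(0, req.nr_memsize,
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);		// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);	// step 4
 *	// ... fill slots between txr->head and txr->tail ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, -1);					// step 6
 *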
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring or deleting a port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
127 
128 /* --- internals ----
129  *
130  * Roadmap to the code that implements the above.
131  *
132  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133  * >    select()able file descriptors on which events are reported.
134  *
135  *  	Internally, we allocate a netmap_priv_d structure, which will be
136  *  	initialized on ioctl(NIOCREGIF).
137  *
138  *      os-specific:
139  *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140  *  		     per-thread.
141  *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142  *  		     per-open.
143  *
144  * > 2. on each descriptor, the process issues an ioctl() to identify
145  * >    the interface that should report events to the file descriptor.
146  *
147  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148  * 	Most important things happen in netmap_get_na() and
149  * 	netmap_do_regif(), called from there. Additional details can be
150  * 	found in the comments above those functions.
151  *
152  * 	In all cases, this action creates/takes-a-reference-to a
153  * 	netmap_*_adapter describing the port, and allocates a netmap_if
154  * 	and all necessary netmap rings, filling them with netmap buffers.
155  *
156  *      In this phase, the sync callbacks for each ring are set (these are used
157  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158  *      The adapter creation/initialization code puts them in the
159  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162  * 	actually call netmap_krings_create() to perform this and the other
163  * 	common stuff. netmap_krings_create() also takes care of the host rings,
164  * 	if needed, by setting their sync callbacks appropriately.
165  *
166  * 	Additional actions depend on the kind of netmap_adapter that has been
167  * 	registered:
168  *
169  * 	- netmap_hw_adapter:  	     [netmap.c]
170  * 	     This is a system netdev/ifp with native netmap support.
171  * 	     The ifp is detached from the host stack by redirecting:
172  * 	       - transmissions (from the network stack) to netmap_transmit()
173  * 	       - receive notifications to the nm_notify() callback for
174  * 	         this adapter. The callback is normally netmap_notify(), unless
175  * 	         the ifp is attached to a bridge using bwrap, in which case it
176  * 	         is netmap_bwrap_intr_notify().
177  *
178  * 	- netmap_generic_adapter:      [netmap_generic.c]
179  * 	      A system netdev/ifp without native netmap support.
180  *
181  * 	(the decision about native/non native support is taken in
182  * 	 netmap_get_hw_na(), called by netmap_get_na())
183  *
184  * 	- netmap_vp_adapter 		[netmap_vale.c]
185  * 	      Returned by netmap_get_bdg_na().
186  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187  * 	      are created on the fly if they don't already exist, and are
188  * 	      always attached to a bridge.
189  * 	      Persistent VALE ports must be created separately, and
190  * 	      then attached like normal NICs. The NIOCREGIF we are examining
191  * 	      will find them only if they had previously been created and
192  * 	      attached (see VALE_CTL below).
193  *
194  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195  * 	      Returned by netmap_get_pipe_na().
196  * 	      Both pipe ends are created, if they didn't already exist.
197  *
198  * 	- netmap_monitor_adapter      [netmap_monitor.c]
199  * 	      Returned by netmap_get_monitor_na().
200  * 	      If successful, the nm_sync callbacks of the monitored adapter
201  * 	      will be intercepted by the returned monitor.
202  *
203  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204  * 	      Cannot be obtained in this way, see VALE_CTL below
205  *
206  *
207  * 	os-specific:
208  * 	    linux: we first go through linux_netmap_ioctl() to
209  * 	           adapt the FreeBSD interface to the linux one.
210  *
211  *
212  * > 3. on each descriptor, the process issues an mmap() request to
213  * >    map the shared memory region within the process' address space.
214  * >    The list of interesting queues is indicated by a location in
215  * >    the shared memory region.
216  *
217  *      os-specific:
218  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220  *
221  * > 4. using the functions in the netmap(4) userspace API, a process
222  * >    can look up the occupation state of a queue, access memory buffers,
223  * >    and retrieve received packets or enqueue packets to transmit.
224  *
225  * 	these actions do not involve the kernel.
226  *
227  * > 5. using some ioctl()s the process can synchronize the userspace view
228  * >    of the queue with the actual status in the kernel. This includes both
229  * >    receiving the notification of new packets, and transmitting new
230  * >    packets on the output interface.
231  *
232  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234  * 	structures, as initialized in step 2 and maybe later modified
235  * 	by a monitor. Monitors, however, will always call the original
236  * 	callback before doing anything else.
237  *
238  *
239  * > 6. select() or poll() can be used to wait for events on individual
240  * >    transmit or receive queues (or all queues for a given interface).
241  *
242  * 	Implemented in netmap_poll(). This will call the same nm_sync()
243  * 	callbacks as in step 5 above.
244  *
245  * 	os-specific:
246  * 		linux: we first go through linux_netmap_poll() to adapt
247  * 		       the FreeBSD interface to the linux one.
248  *
249  *
250  *  ----  VALE_CTL -----
251  *
252  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253  *  nr_cmd in the nmreq structure. These subcommands are handled by
254  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256  *  subcommands, respectively.
257  *
258  *  Any network interface known to the system (including a persistent VALE
259  *  port) can be attached to a VALE switch by issuing the
260  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262  *  attachment of other interfaces, instead, requires the creation of a
263  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264  *  netmap mode. This may require the creation of a netmap_generic_adapter if
265  *  we have no native support for the interface, or if generic adapters have
266  *  been forced by sysctl.
267  *
268  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270  *  callback.  In the case of the bwrap, the callback creates the
271  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274  *  A generic adapter for the wrapped ifp will be created if needed, when
275  *  netmap_get_bdg_na() calls netmap_get_hw_na().
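 *
 *  A hedged sketch of the VALE_CTL sequence described above (the switch,
 *  port and NIC names are only examples):
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	strncpy(req.nr_name, "p0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_NEWIF;
 *	ioctl(fd, NIOCREGIF, &req);	// create persistent VALE port "p0"
 *
 *	strncpy(req.nr_name, "valeA:em0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &req);	// attach em0 to switch "valeA" (bwrap)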
276  *
277  *
278  *  ---- DATAPATHS -----
279  *
280  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281  *
282  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283  *
284  *    - tx from netmap userspace:
285  *	 concurrently:
286  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287  *                kring->nm_sync() == DEVICE_netmap_txsync()
288  *           2) device interrupt handler
289  *                na->nm_notify()  == netmap_notify()
290  *    - rx from netmap userspace:
291  *       concurrently:
292  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293  *                kring->nm_sync() == DEVICE_netmap_rxsync()
294  *           2) device interrupt handler
295  *                na->nm_notify()  == netmap_notify()
296  *    - tx from host stack
297  *       concurrently:
298  *           1) host stack
299  *                netmap_transmit()
300  *                  na->nm_notify  == netmap_notify()
301  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302  *                kring->nm_sync() == netmap_rxsync_from_host_compat
303  *                  netmap_rxsync_from_host(na, NULL, NULL)
304  *    - tx to host stack
305  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306  *             kring->nm_sync() == netmap_txsync_to_host_compat
307  *               netmap_txsync_to_host(na)
308  *                 NM_SEND_UP()
309  *                   FreeBSD: na->if_input() == ?? XXX
310  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311  *
312  *
313  *
314  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315  *
316  *
317  *
318  *                           -= VALE PORT =-
319  *
320  *
321  *
322  *                           -= NETMAP PIPE =-
323  *
324  *
325  *
326  *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
327  *
328  *
329  *
330  *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
331  *
332  *
333  *
334  *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
335  *
336  *
337  *
338  *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
339  *
340  *
341  *
342  */
343 
344 /*
345  * OS-specific code that is used only within this file.
346  * Other OS-specific code that must be accessed by drivers
347  * is present in netmap_kern.h
348  */
349 
350 #if defined(__FreeBSD__)
351 #include <sys/cdefs.h> /* prerequisite */
352 #include <sys/types.h>
353 #include <sys/errno.h>
354 #include <sys/param.h>	/* defines used in kernel.h */
355 #include <sys/kernel.h>	/* types used in module initialization */
356 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
357 #include <sys/filio.h>	/* FIONBIO */
358 #include <sys/sockio.h>
359 #include <sys/socketvar.h>	/* struct socket */
360 #include <sys/malloc.h>
361 #include <sys/poll.h>
362 #include <sys/rwlock.h>
363 #include <sys/socket.h> /* sockaddrs */
364 #include <sys/selinfo.h>
365 #include <sys/sysctl.h>
366 #include <sys/jail.h>
367 #include <net/vnet.h>
368 #include <net/if.h>
369 #include <net/if_var.h>
370 #include <net/bpf.h>		/* BIOCIMMEDIATE */
371 #include <machine/bus.h>	/* bus_dmamap_* */
372 #include <sys/endian.h>
373 #include <sys/refcount.h>
374 
375 
376 /* reduce conditional code */
377 // linux API, used for the knlist in FreeBSD
378 /* use a private mutex for the knlist */
379 #define init_waitqueue_head(x) do {			\
380 	struct mtx *m = &(x)->m;			\
381 	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
382 	knlist_init_mtx(&(x)->si.si_note, m);		\
383     } while (0)
384 
385 #define OS_selrecord(a, b)	selrecord(a, &((b)->si))
386 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
387 
388 #elif defined(linux)
389 
390 #include "bsd_glue.h"
391 
392 
393 
394 #elif defined(__APPLE__)
395 
396 #warning OSX support is only partial
397 #include "osx_glue.h"
398 
399 #else
400 
401 #error	Unsupported platform
402 
403 #endif /* unsupported */
404 
405 /*
406  * common headers
407  */
408 #include <net/netmap.h>
409 #include <dev/netmap/netmap_kern.h>
410 #include <dev/netmap/netmap_mem2.h>
411 
412 
413 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
414 
415 /*
416  * The following variables are used by the drivers and replicate
417  * fields in the global memory pool. They only refer to buffers
418  * used by physical interfaces.
419  */
420 u_int netmap_total_buffers;
421 u_int netmap_buf_size;
422 char *netmap_buffer_base;	/* also address of an invalid buffer */
423 
424 /* user-controlled variables */
425 int netmap_verbose;
426 
427 static int netmap_no_timestamp; /* don't timestamp on rxsync */
428 
429 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
430 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
431     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
432 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
433     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
434 int netmap_mitigate = 1;
435 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
436 int netmap_no_pendintr = 1;
437 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
438     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
439 int netmap_txsync_retry = 2;
440 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
441     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
442 
443 int netmap_adaptive_io = 0;
444 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
445     &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
446 
447 int netmap_flags = 0;	/* debug flags */
448 int netmap_fwd = 0;	/* force transparent mode */
449 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
450 
451 /*
452  * netmap_admode selects the netmap mode to use.
453  * Invalid values are reset to NETMAP_ADMODE_BEST
454  */
455 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
456 	NETMAP_ADMODE_NATIVE,	/* either native or none */
457 	NETMAP_ADMODE_GENERIC,	/* force generic */
458 	NETMAP_ADMODE_LAST };
459 static int netmap_admode = NETMAP_ADMODE_BEST;
460 
461 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
462 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
463 int netmap_generic_rings = 1;   /* number of queues in generic. */
464 
465 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
466 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
467 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
468 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
469 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
470 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
471 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
472 
473 NMG_LOCK_T	netmap_global_lock;
474 
475 
476 static void
477 nm_kr_get(struct netmap_kring *kr)
478 {
479 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
480 		tsleep(kr, 0, "NM_KR_GET", 4);
481 }
482 
483 
484 /*
485  * mark the ring as stopped, and run through the locks
486  * to make sure other users get to see it.
487  */
488 static void
489 netmap_disable_ring(struct netmap_kring *kr)
490 {
491 	kr->nkr_stopped = 1;
492 	nm_kr_get(kr);
493 	mtx_lock(&kr->q_lock);
494 	mtx_unlock(&kr->q_lock);
495 	nm_kr_put(kr);
496 }
497 
498 /* stop or enable a single tx ring */
499 void
500 netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
501 {
502 	if (stopped)
503 		netmap_disable_ring(na->tx_rings + ring_id);
504 	else
505 		na->tx_rings[ring_id].nkr_stopped = 0;
506 	/* notify that the stopped state has changed. This is currently
507 	 * only used by bwrap to propagate the state to its own krings.
508 	 * (see netmap_bwrap_intr_notify).
509 	 */
510 	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
511 }
512 
513 /* stop or enable a single rx ring */
514 void
515 netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
516 {
517 	if (stopped)
518 		netmap_disable_ring(na->rx_rings + ring_id);
519 	else
520 		na->rx_rings[ring_id].nkr_stopped = 0;
521 	/* notify that the stopped state has changed. This is currently
522 	 * only used by bwrap to propagate the state to its own krings.
523 	 * (see netmap_bwrap_intr_notify).
524 	 */
525 	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
526 }
527 
528 
529 /* stop or enable all the rings of na */
530 void
531 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
532 {
533 	int i;
534 	u_int ntx, nrx;
535 
536 	if (!nm_netmap_on(na))
537 		return;
538 
539 	ntx = netmap_real_tx_rings(na);
540 	nrx = netmap_real_rx_rings(na);
541 
542 	for (i = 0; i < ntx; i++) {
543 		netmap_set_txring(na, i, stopped);
544 	}
545 
546 	for (i = 0; i < nrx; i++) {
547 		netmap_set_rxring(na, i, stopped);
548 	}
549 }
550 
551 /*
552  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
553  * to finish and prevents any new one from starting.  Call this before turning
554  * netmap mode off, or before removing the hardware rings (e.g., on module
555  * unload).  As a rule of thumb for linux drivers, this should be placed near
556  * each napi_disable().
557  */
558 void
559 netmap_disable_all_rings(struct ifnet *ifp)
560 {
561 	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
562 }
563 
564 /*
565  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
566  * adapter's rings. In linux drivers, this should be placed near each
567  * napi_enable().
568  */
569 void
570 netmap_enable_all_rings(struct ifnet *ifp)
571 {
572 	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
573 }
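
/*
 * A hedged example of the intended usage in a FreeBSD driver (the
 * DEVICE_* function names and the 'sc' softc are illustrative only):
 * quiesce netmap around a hardware reset.
 *
 *	DEVICE_stop(sc);			// stop the hardware
 *	netmap_disable_all_rings(sc->ifp);	// wait for pending syncs
 *	// ... reprogram the hardware rings ...
 *	DEVICE_init(sc);
 *	netmap_enable_all_rings(sc->ifp);	// resume txsync/rxsync
 */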
574 
575 
576 /*
577  * generic bound_checking function
578  */
579 u_int
580 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
581 {
582 	u_int oldv = *v;
583 	const char *op = NULL;
584 
585 	if (dflt < lo)
586 		dflt = lo;
587 	if (dflt > hi)
588 		dflt = hi;
589 	if (oldv < lo) {
590 		*v = dflt;
591 		op = "Bump";
592 	} else if (oldv > hi) {
593 		*v = hi;
594 		op = "Clamp";
595 	}
596 	if (op && msg)
597 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
598 	return *v;
599 }
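
/*
 * Example (values are illustrative): clamp a user-controlled sysctl
 * into a sane range before using it, e.g.
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 */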
600 
601 
602 /*
603  * packet-dump function, user-supplied or static buffer.
604  * The destination buffer must be at least 30+4*len
605  */
606 const char *
607 nm_dump_buf(char *p, int len, int lim, char *dst)
608 {
609 	static char _dst[8192];
610 	int i, j, i0;
611 	static char hex[] ="0123456789abcdef";
612 	char *o;	/* output position */
613 
614 #define P_HI(x)	hex[((x) & 0xf0)>>4]
615 #define P_LO(x)	hex[((x) & 0xf)]
616 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
617 	if (!dst)
618 		dst = _dst;
619 	if (lim <= 0 || lim > len)
620 		lim = len;
621 	o = dst;
622 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
623 	o += strlen(o);
624 	/* hexdump routine */
625 	for (i = 0; i < lim; ) {
626 		sprintf(o, "%5d: ", i);
627 		o += strlen(o);
628 		memset(o, ' ', 48);
629 		i0 = i;
630 		for (j=0; j < 16 && i < lim; i++, j++) {
631 			o[j*3] = P_HI(p[i]);
632 			o[j*3+1] = P_LO(p[i]);
633 		}
634 		i = i0;
635 		for (j=0; j < 16 && i < lim; i++, j++)
636 			o[j + 48] = P_C(p[i]);
637 		o[j+48] = '\n';
638 		o += j+49;
639 	}
640 	*o = '\0';
641 #undef P_HI
642 #undef P_LO
643 #undef P_C
644 	return dst;
645 }
646 
647 
648 /*
649  * Fetch configuration from the device, to cope with dynamic
650  * reconfigurations after loading the module.
651  */
652 /* call with NMG_LOCK held */
653 int
654 netmap_update_config(struct netmap_adapter *na)
655 {
656 	u_int txr, txd, rxr, rxd;
657 
658 	txr = txd = rxr = rxd = 0;
659 	if (na->nm_config == NULL ||
660 	    na->nm_config(na, &txr, &txd, &rxr, &rxd)) {
661 		/* take whatever we had at init time */
662 		txr = na->num_tx_rings;
663 		txd = na->num_tx_desc;
664 		rxr = na->num_rx_rings;
665 		rxd = na->num_rx_desc;
666 	}
667 
668 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
669 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
670 		return 0; /* nothing changed */
671 	if (netmap_verbose || na->active_fds > 0) {
672 		D("stored config %s: txring %d x %d, rxring %d x %d",
673 			na->name,
674 			na->num_tx_rings, na->num_tx_desc,
675 			na->num_rx_rings, na->num_rx_desc);
676 		D("new config %s: txring %d x %d, rxring %d x %d",
677 			na->name, txr, txd, rxr, rxd);
678 	}
679 	if (na->active_fds == 0) {
680 		D("configuration changed (but fine)");
681 		na->num_tx_rings = txr;
682 		na->num_tx_desc = txd;
683 		na->num_rx_rings = rxr;
684 		na->num_rx_desc = rxd;
685 		return 0;
686 	}
687 	D("configuration changed while active, this is bad...");
688 	return 1;
689 }
690 
691 /* kring->nm_sync callback for the host tx ring */
692 static int
693 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
694 {
695 	(void)flags; /* unused */
696 	netmap_txsync_to_host(kring->na);
697 	return 0;
698 }
699 
700 /* kring->nm_sync callback for the host rx ring */
701 static int
702 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
703 {
704 	(void)flags; /* unused */
705 	netmap_rxsync_from_host(kring->na, NULL, NULL);
706 	return 0;
707 }
708 
709 
710 
711 /* create the krings array and initialize the fields common to all adapters.
712  * The array layout is this:
713  *
714  *                    +----------+
715  * na->tx_rings ----->|          | \
716  *                    |          |  } na->num_tx_rings
717  *                    |          | /
718  *                    +----------+
719  *                    |          |    host tx kring
720  * na->rx_rings ----> +----------+
721  *                    |          | \
722  *                    |          |  } na->num_rx_rings
723  *                    |          | /
724  *                    +----------+
725  *                    |          |    host rx kring
726  *                    +----------+
727  * na->tailroom ----->|          | \
728  *                    |          |  } tailroom bytes
729  *                    |          | /
730  *                    +----------+
731  *
732  * Note: for compatibility, host krings are created even when not needed.
733  * The tailroom space is currently used by vale ports for allocating leases.
734  */
735 /* call with NMG_LOCK held */
736 int
737 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
738 {
739 	u_int i, len, ndesc;
740 	struct netmap_kring *kring;
741 	u_int ntx, nrx;
742 
743 	/* account for the (possibly fake) host rings */
744 	ntx = na->num_tx_rings + 1;
745 	nrx = na->num_rx_rings + 1;
746 
747 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
748 
749 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
750 	if (na->tx_rings == NULL) {
751 		D("Cannot allocate krings");
752 		return ENOMEM;
753 	}
754 	na->rx_rings = na->tx_rings + ntx;
755 
756 	/*
757 	 * All fields in krings are 0 except the ones initialized below,
758 	 * but it is better to be explicit about important kring fields.
759 	 */
760 	ndesc = na->num_tx_desc;
761 	for (i = 0; i < ntx; i++) { /* Transmit rings */
762 		kring = &na->tx_rings[i];
763 		bzero(kring, sizeof(*kring));
764 		kring->na = na;
765 		kring->ring_id = i;
766 		kring->nkr_num_slots = ndesc;
767 		if (i < na->num_tx_rings) {
768 			kring->nm_sync = na->nm_txsync;
769 		} else if (i == na->num_tx_rings) {
770 			kring->nm_sync = netmap_txsync_to_host_compat;
771 		}
772 		/*
773 		 * IMPORTANT: Always keep one slot empty.
774 		 */
775 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
776 		kring->rtail = kring->nr_hwtail = ndesc - 1;
777 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
778 		ND("ktx %s h %d c %d t %d",
779 			kring->name, kring->rhead, kring->rcur, kring->rtail);
780 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
781 		init_waitqueue_head(&kring->si);
782 	}
783 
784 	ndesc = na->num_rx_desc;
785 	for (i = 0; i < nrx; i++) { /* Receive rings */
786 		kring = &na->rx_rings[i];
787 		bzero(kring, sizeof(*kring));
788 		kring->na = na;
789 		kring->ring_id = i;
790 		kring->nkr_num_slots = ndesc;
791 		if (i < na->num_rx_rings) {
792 			kring->nm_sync = na->nm_rxsync;
793 		} else if (i == na->num_rx_rings) {
794 			kring->nm_sync = netmap_rxsync_from_host_compat;
795 		}
796 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
797 		kring->rtail = kring->nr_hwtail = 0;
798 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
799 		ND("krx %s h %d c %d t %d",
800 			kring->name, kring->rhead, kring->rcur, kring->rtail);
801 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
802 		init_waitqueue_head(&kring->si);
803 	}
804 	init_waitqueue_head(&na->tx_si);
805 	init_waitqueue_head(&na->rx_si);
806 
807 	na->tailroom = na->rx_rings + nrx;
808 
809 	return 0;
810 }
811 
812 
813 #ifdef __FreeBSD__
814 static void
815 netmap_knlist_destroy(NM_SELINFO_T *si)
816 {
817 	/* XXX kqueue(9) needed; these will mirror knlist_init. */
818 	knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
819 	knlist_destroy(&si->si.si_note);
820 	/* now we don't need the mutex anymore */
821 	mtx_destroy(&si->m);
822 }
823 #endif /* __FreeBSD__ */
824 
825 
826 /* undo the actions performed by netmap_krings_create */
827 /* call with NMG_LOCK held */
828 void
829 netmap_krings_delete(struct netmap_adapter *na)
830 {
831 	struct netmap_kring *kring = na->tx_rings;
832 
833 	/* we rely on the krings layout described above */
834 	for ( ; kring != na->tailroom; kring++) {
835 		mtx_destroy(&kring->q_lock);
836 		netmap_knlist_destroy(&kring->si);
837 	}
838 	free(na->tx_rings, M_DEVBUF);
839 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
840 }
841 
842 
843 /*
844  * Destructor for NIC ports. They also have an mbuf queue
845  * on the rings connected to the host so we need to purge
846  * them first.
847  */
848 /* call with NMG_LOCK held */
849 static void
850 netmap_hw_krings_delete(struct netmap_adapter *na)
851 {
852 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
853 
854 	ND("destroy sw mbq with len %d", mbq_len(q));
855 	mbq_purge(q);
856 	mbq_safe_destroy(q);
857 	netmap_krings_delete(na);
858 }
859 
860 
861 /* create a new netmap_if for a newly registered fd.
862  * If this is the first registration of the adapter,
863  * also create the netmap rings and their in-kernel view,
864  * the netmap krings.
865  */
866 /* call with NMG_LOCK held */
867 static struct netmap_if*
868 netmap_if_new(struct netmap_adapter *na)
869 {
870 	struct netmap_if *nifp;
871 
872 	if (netmap_update_config(na)) {
873 		/* configuration mismatch, report and fail */
874 		return NULL;
875 	}
876 
877 	if (na->active_fds)	/* already registered */
878 		goto final;
879 
880 	/* create and init the krings arrays.
881 	 * Depending on the adapter, this may also create
882 	 * the netmap rings themselves
883 	 */
884 	if (na->nm_krings_create(na))
885 		return NULL;
886 
887 	/* create all missing netmap rings */
888 	if (netmap_mem_rings_create(na))
889 		goto cleanup;
890 
891 final:
892 
893 	/* in all cases, create a new netmap if */
894 	nifp = netmap_mem_if_new(na);
895 	if (nifp == NULL)
896 		goto cleanup;
897 
898 	return (nifp);
899 
900 cleanup:
901 
902 	if (na->active_fds == 0) {
903 		netmap_mem_rings_delete(na);
904 		na->nm_krings_delete(na);
905 	}
906 
907 	return NULL;
908 }
909 
910 
911 /* grab a reference to the memory allocator, if we don't have one already.  The
912  * reference is taken from the netmap_adapter registered with the priv.
913  */
914 /* call with NMG_LOCK held */
915 static int
916 netmap_get_memory_locked(struct netmap_priv_d* p)
917 {
918 	struct netmap_mem_d *nmd;
919 	int error = 0;
920 
921 	if (p->np_na == NULL) {
922 		if (!netmap_mmap_unreg)
923 			return ENODEV;
924 		/* for compatibility with older versions of the API
925  		 * we use the global allocator when no interface has been
926  		 * registered
927  		 */
928 		nmd = &nm_mem;
929 	} else {
930 		nmd = p->np_na->nm_mem;
931 	}
932 	if (p->np_mref == NULL) {
933 		error = netmap_mem_finalize(nmd, p->np_na);
934 		if (!error)
935 			p->np_mref = nmd;
936 	} else if (p->np_mref != nmd) {
937 		/* a virtual port has been registered, but previous
938  		 * syscalls already used the global allocator.
939  		 * We cannot continue
940  		 */
941 		error = ENODEV;
942 	}
943 	return error;
944 }
945 
946 
947 /* call with NMG_LOCK *not* held */
948 int
949 netmap_get_memory(struct netmap_priv_d* p)
950 {
951 	int error;
952 	NMG_LOCK();
953 	error = netmap_get_memory_locked(p);
954 	NMG_UNLOCK();
955 	return error;
956 }
957 
958 
959 /* call with NMG_LOCK held */
960 static int
961 netmap_have_memory_locked(struct netmap_priv_d* p)
962 {
963 	return p->np_mref != NULL;
964 }
965 
966 
967 /* call with NMG_LOCK held */
968 static void
969 netmap_drop_memory_locked(struct netmap_priv_d* p)
970 {
971 	if (p->np_mref) {
972 		netmap_mem_deref(p->np_mref, p->np_na);
973 		p->np_mref = NULL;
974 	}
975 }
976 
977 
978 /*
979  * Call nm_register(ifp,0) to stop netmap mode on the interface and
980  * revert to normal operation.
981  * The second argument is the nifp to work on. In some cases it is
982  * not attached yet to the netmap_priv_d so we need to pass it as
983  * a separate argument.
984  */
985 /* call with NMG_LOCK held */
986 static void
987 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
988 {
989 	struct netmap_adapter *na = priv->np_na;
990 
991 	NMG_LOCK_ASSERT();
992 	na->active_fds--;
993 	if (na->active_fds <= 0) {	/* last instance */
994 
995 		if (netmap_verbose)
996 			D("deleting last instance for %s", na->name);
997 		/*
998 		 * (TO CHECK) This function is only called
999 		 * when the last reference to this file descriptor goes
1000 		 * away. This means we cannot have any pending poll()
1001 		 * or interrupt routine operating on the structure.
1002 		 * XXX The file may be closed in a thread while
1003 		 * another thread is using it.
1004 		 * Linux keeps the file opened until the last reference
1005 		 * by any outstanding ioctl/poll or mmap is gone.
1006 		 * FreeBSD does not track mmap()s (but we do) and
1007 		 * wakes up any sleeping poll(). Need to check what
1008 		 * happens if the close() occurs while a concurrent
1009 		 * syscall is running.
1010 		 */
1011 		na->nm_register(na, 0); /* off, clear flags */
1012 		/* Wake up any sleeping threads. netmap_poll will
1013 		 * then return POLLERR
1014 		 * XXX The wake up now must happen during *_down(), when
1015 		 * we order all activities to stop. -gl
1016 		 */
1017 		netmap_knlist_destroy(&na->tx_si);
1018 		netmap_knlist_destroy(&na->rx_si);
1019 
1020 		/* delete rings and buffers */
1021 		netmap_mem_rings_delete(na);
1022 		na->nm_krings_delete(na);
1023 	}
1024 	/* delete the nifp */
1025 	netmap_mem_if_delete(na, nifp);
1026 }
1027 
1028 /* call with NMG_LOCK held */
1029 static __inline int
1030 nm_tx_si_user(struct netmap_priv_d *priv)
1031 {
1032 	return (priv->np_na != NULL &&
1033 		(priv->np_txqlast - priv->np_txqfirst > 1));
1034 }
1035 
1036 /* call with NMG_LOCK held */
1037 static __inline int
1038 nm_rx_si_user(struct netmap_priv_d *priv)
1039 {
1040 	return (priv->np_na != NULL &&
1041 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
1042 }
1043 
1044 
1045 /*
1046  * Destructor of the netmap_priv_d, called when the fd has
1047  * no active open() and mmap(). Also called in error paths.
1048  *
1049  * returns 1 if this is the last instance and we can free priv
1050  */
1051 /* call with NMG_LOCK held */
1052 int
1053 netmap_dtor_locked(struct netmap_priv_d *priv)
1054 {
1055 	struct netmap_adapter *na = priv->np_na;
1056 
1057 #ifdef __FreeBSD__
1058 	/*
1059 	 * np_refcount is the number of active mmaps on
1060 	 * this file descriptor
1061 	 */
1062 	if (--priv->np_refcount > 0) {
1063 		return 0;
1064 	}
1065 #endif /* __FreeBSD__ */
1066 	if (!na) {
1067 	    return 1; //XXX is it correct?
1068 	}
1069 	netmap_do_unregif(priv, priv->np_nifp);
1070 	priv->np_nifp = NULL;
1071 	netmap_drop_memory_locked(priv);
1072 	if (priv->np_na) {
1073 		if (nm_tx_si_user(priv))
1074 			na->tx_si_users--;
1075 		if (nm_rx_si_user(priv))
1076 			na->rx_si_users--;
1077 		netmap_adapter_put(na);
1078 		priv->np_na = NULL;
1079 	}
1080 	return 1;
1081 }
1082 
1083 
1084 /* call with NMG_LOCK *not* held */
1085 void
1086 netmap_dtor(void *data)
1087 {
1088 	struct netmap_priv_d *priv = data;
1089 	int last_instance;
1090 
1091 	NMG_LOCK();
1092 	last_instance = netmap_dtor_locked(priv);
1093 	NMG_UNLOCK();
1094 	if (last_instance) {
1095 		bzero(priv, sizeof(*priv));	/* for safety */
1096 		free(priv, M_DEVBUF);
1097 	}
1098 }
1099 
1100 
1101 
1102 
1103 /*
1104  * Handlers for synchronization of the queues from/to the host.
1105  * Netmap has two operating modes:
1106  * - in the default mode, the rings connected to the host stack are
1107  *   just another ring pair managed by userspace;
1108  * - in transparent mode (XXX to be defined) incoming packets
1109  *   (from the host or the NIC) are marked as NS_FORWARD upon
1110  *   arrival, and the user application has a chance to reset the
1111  *   flag for packets that should be dropped.
1112  *   On the RXSYNC or poll(), packets in RX rings between
1113  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1114  *   to the other side.
1115  * The transfer NIC --> host is relatively easy, just encapsulate
1116  * into mbufs and we are done. The host --> NIC side is slightly
1117  * harder because there might not be room in the tx ring so it
1118  * might take a while before releasing the buffer.
1119  */
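
/*
 * A hedged userspace-side sketch of transparent mode, following the
 * description above (should_drop() is a hypothetical application
 * predicate): clear NS_FORWARD on packets that must not be passed to
 * the other side, then release the slots with an rxsync.
 *
 *	while (ring->head != ring->tail) {
 *		struct netmap_slot *slot = &ring->slot[ring->head];
 *		if (should_drop(slot))
 *			slot->flags &= ~NS_FORWARD;
 *		ring->head = ring->cur = nm_ring_next(ring, ring->head);
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);	// slots still marked are forwarded
 */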
1120 
1121 
1122 /*
1123  * pass a chain of buffers to the host stack as coming from 'dst'
1124  * We do not need to lock because the queue is private.
1125  */
1126 static void
1127 netmap_send_up(struct ifnet *dst, struct mbq *q)
1128 {
1129 	struct mbuf *m;
1130 
1131 	/* send packets up, outside the lock */
1132 	while ((m = mbq_dequeue(q)) != NULL) {
1133 		if (netmap_verbose & NM_VERB_HOST)
1134 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1135 		NM_SEND_UP(dst, m);
1136 	}
1137 	mbq_destroy(q);
1138 }
1139 
1140 
1141 /*
1142  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1143  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1144  * and pass them up. Drop remaining packets in the unlikely event
1145  * of an mbuf shortage.
1146  */
1147 static void
1148 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1149 {
1150 	u_int const lim = kring->nkr_num_slots - 1;
1151 	u_int const head = kring->ring->head;
1152 	u_int n;
1153 	struct netmap_adapter *na = kring->na;
1154 
1155 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1156 		struct mbuf *m;
1157 		struct netmap_slot *slot = &kring->ring->slot[n];
1158 
1159 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1160 			continue;
1161 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1162 			RD(5, "bad pkt at %d len %d", n, slot->len);
1163 			continue;
1164 		}
1165 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1166 		/* XXX TODO: adapt to the case of a multisegment packet */
1167 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1168 
1169 		if (m == NULL)
1170 			break;
1171 		mbq_enqueue(q, m);
1172 	}
1173 }
1174 
1175 
1176 /*
1177  * Send to the NIC rings packets marked NS_FORWARD between
1178  * kring->nr_hwcur and kring->rhead
1179  * Called under kring->rx_queue.lock on the sw rx ring,
1180  */
1181 static u_int
1182 netmap_sw_to_nic(struct netmap_adapter *na)
1183 {
1184 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1185 	struct netmap_slot *rxslot = kring->ring->slot;
1186 	u_int i, rxcur = kring->nr_hwcur;
1187 	u_int const head = kring->rhead;
1188 	u_int const src_lim = kring->nkr_num_slots - 1;
1189 	u_int sent = 0;
1190 
1191 	/* scan rings to find space, then fill as much as possible */
1192 	for (i = 0; i < na->num_tx_rings; i++) {
1193 		struct netmap_kring *kdst = &na->tx_rings[i];
1194 		struct netmap_ring *rdst = kdst->ring;
1195 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1196 
1197 		/* XXX do we trust ring or kring->rcur,rtail ? */
1198 		for (; rxcur != head && !nm_ring_empty(rdst);
1199 		     rxcur = nm_next(rxcur, src_lim) ) {
1200 			struct netmap_slot *src, *dst, tmp;
1201 			u_int dst_cur = rdst->cur;
1202 
1203 			src = &rxslot[rxcur];
1204 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1205 				continue;
1206 
1207 			sent++;
1208 
1209 			dst = &rdst->slot[dst_cur];
1210 
1211 			tmp = *src;
1212 
1213 			src->buf_idx = dst->buf_idx;
1214 			src->flags = NS_BUF_CHANGED;
1215 
1216 			dst->buf_idx = tmp.buf_idx;
1217 			dst->len = tmp.len;
1218 			dst->flags = NS_BUF_CHANGED;
1219 
1220 			rdst->cur = nm_next(dst_cur, dst_lim);
1221 		}
1222 		/* if (sent) XXX txsync ? */
1223 	}
1224 	return sent;
1225 }
1226 
1227 
1228 /*
1229  * netmap_txsync_to_host() passes packets up. We are called from a
1230  * system call in user process context, and the only contention
1231  * can be among multiple user threads erroneously calling
1232  * this routine concurrently.
1233  */
1234 void
1235 netmap_txsync_to_host(struct netmap_adapter *na)
1236 {
1237 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1238 	struct netmap_ring *ring = kring->ring;
1239 	u_int const lim = kring->nkr_num_slots - 1;
1240 	u_int const head = kring->rhead;
1241 	struct mbq q;
1242 
1243 	/* Take packets from hwcur to head and pass them up.
1244 	 * force head = cur since netmap_grab_packets() stops at head.
1245 	 * In case of no buffers we give up. At the end of the loop,
1246 	 * the queue is drained in all cases.
1247 	 */
1248 	mbq_init(&q);
1249 	ring->cur = head;
1250 	netmap_grab_packets(kring, &q, 1 /* force */);
1251 	ND("have %d pkts in queue", mbq_len(&q));
1252 	kring->nr_hwcur = head;
1253 	kring->nr_hwtail = head + lim;
1254 	if (kring->nr_hwtail > lim)
1255 		kring->nr_hwtail -= lim + 1;
1256 	nm_txsync_finalize(kring);
1257 
1258 	netmap_send_up(na->ifp, &q);
1259 }
1260 
1261 
1262 /*
1263  * rxsync backend for packets coming from the host stack.
1264  * They have been put in kring->rx_queue by netmap_transmit().
1265  * We protect access to the kring using kring->rx_queue.lock
1266  *
1267  * This routine also does the selrecord if called from the poll handler
1268  * (we know because td != NULL).
1269  *
1270  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1271  *     as an additional hidden argument.
1272  * returns the number of packets delivered to tx queues in
1273  * transparent mode, or a negative value if error
1274  */
1275 int
1276 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1277 {
1278 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1279 	struct netmap_ring *ring = kring->ring;
1280 	u_int nm_i, n;
1281 	u_int const lim = kring->nkr_num_slots - 1;
1282 	u_int const head = kring->rhead;
1283 	int ret = 0;
1284 	struct mbq *q = &kring->rx_queue;
1285 
1286 	(void)pwait;	/* disable unused warnings */
1287 	(void)td;
1288 
1289 	mbq_lock(q);
1290 
1291 	/* First part: import newly received packets */
1292 	n = mbq_len(q);
1293 	if (n) { /* grab packets from the queue */
1294 		struct mbuf *m;
1295 		uint32_t stop_i;
1296 
1297 		nm_i = kring->nr_hwtail;
1298 		stop_i = nm_prev(nm_i, lim);
1299 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1300 			int len = MBUF_LEN(m);
1301 			struct netmap_slot *slot = &ring->slot[nm_i];
1302 
1303 			m_copydata(m, 0, len, NMB(na, slot));
1304 			ND("nm %d len %d", nm_i, len);
1305 			if (netmap_verbose)
1306                                 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1307 
1308 			slot->len = len;
1309 			slot->flags = kring->nkr_slot_flags;
1310 			nm_i = nm_next(nm_i, lim);
1311 			m_freem(m);
1312 		}
1313 		kring->nr_hwtail = nm_i;
1314 	}
1315 
1316 	/*
1317 	 * Second part: skip past packets that userspace has released.
1318 	 */
1319 	nm_i = kring->nr_hwcur;
1320 	if (nm_i != head) { /* something was released */
1321 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1322 			ret = netmap_sw_to_nic(na);
1323 		kring->nr_hwcur = head;
1324 	}
1325 
1326 	nm_rxsync_finalize(kring);
1327 
1328 	/* access copies of cur,tail in the kring */
1329 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1330 		OS_selrecord(td, &kring->si);
1331 
1332 	mbq_unlock(q);
1333 	return ret;
1334 }
1335 
1336 
1337 /* Get a netmap adapter for the port.
1338  *
1339  * If it is possible to satisfy the request, return 0
1340  * with *na containing the netmap adapter found.
1341  * Otherwise return an error code, with *na containing NULL.
1342  *
1343  * When the port is attached to a bridge, we always return
1344  * EBUSY.
1345  * Otherwise, if the port is already bound to a file descriptor,
1346  * then we unconditionally return the existing adapter into *na.
1347  * In all the other cases, we return (into *na) either native,
1348  * generic or NULL, according to the following table:
1349  *
1350  *					native_support
1351  * active_fds   dev.netmap.admode         YES     NO
1352  * -------------------------------------------------------
1353  *    >0              *                 NA(ifp) NA(ifp)
1354  *
1355  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1356  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1357  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1358  *
1359  */
1360 
1361 int
1362 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1363 {
1364 	/* generic support */
1365 	int i = netmap_admode;	/* Take a snapshot. */
1366 	int error = 0;
1367 	struct netmap_adapter *prev_na;
1368 	struct netmap_generic_adapter *gna;
1369 
1370 	*na = NULL; /* default */
1371 
1372 	/* reset in case of invalid value */
1373 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1374 		i = netmap_admode = NETMAP_ADMODE_BEST;
1375 
1376 	if (NETMAP_CAPABLE(ifp)) {
1377 		prev_na = NA(ifp);
1378 		/* If an adapter already exists, return it if
1379 		 * there are active file descriptors or if
1380 		 * netmap is not forced to use generic
1381 		 * adapters.
1382 		 */
1383 		if (NETMAP_OWNED_BY_ANY(prev_na)
1384 			|| i != NETMAP_ADMODE_GENERIC
1385 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1386 #ifdef WITH_PIPES
1387 			/* ugly, but we cannot allow an adapter switch
1388 			 * if some pipe is referring to this one
1389 			 */
1390 			|| prev_na->na_next_pipe > 0
1391 #endif
1392 		) {
1393 			*na = prev_na;
1394 			return 0;
1395 		}
1396 	}
1397 
1398 	/* If there isn't native support and netmap is not allowed
1399 	 * to use generic adapters, we cannot satisfy the request.
1400 	 */
1401 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1402 		return EOPNOTSUPP;
1403 
1404 	/* Otherwise, create a generic adapter and return it,
1405 	 * saving the previously used netmap adapter, if any.
1406 	 *
1407 	 * Note that here 'prev_na', if not NULL, MUST be a
1408 	 * native adapter, and CANNOT be a generic one. This is
1409 	 * true because generic adapters are created on demand, and
1410 	 * destroyed when not used anymore. Therefore, if the adapter
1411 	 * currently attached to an interface 'ifp' is generic, it
1412 	 * must be that
1413 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1414 	 * Consequently, if NA(ifp) is generic, we will enter one of
1415 	 * the branches above. This ensures that we never override
1416 	 * a generic adapter with another generic adapter.
1417 	 */
1418 	prev_na = NA(ifp);
1419 	error = generic_netmap_attach(ifp);
1420 	if (error)
1421 		return error;
1422 
1423 	*na = NA(ifp);
1424 	gna = (struct netmap_generic_adapter*)NA(ifp);
1425 	gna->prev = prev_na; /* save old na */
1426 	if (prev_na != NULL) {
1427 		ifunit_ref(ifp->if_xname);
1428 		// XXX add a refcount ?
1429 		netmap_adapter_get(prev_na);
1430 	}
1431 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1432 
1433 	return 0;
1434 }
1435 
1436 
1437 /*
1438  * MUST BE CALLED UNDER NMG_LOCK()
1439  *
1440  * Get a refcounted reference to a netmap adapter attached
1441  * to the interface specified by nmr.
1442  * This is always called in the execution of an ioctl().
1443  *
1444  * Return ENXIO if the interface specified by the request does
1445  * not exist, ENOTSUP if netmap is not supported by the interface,
1446  * EBUSY if the interface is already attached to a bridge,
1447  * EINVAL if parameters are invalid, ENOMEM if needed resources
1448  * could not be allocated.
1449  * If successful, hold a reference to the netmap adapter.
1450  *
1451  * No reference is kept on the real interface, which may then
1452  * disappear at any time.
1453  */
1454 int
1455 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1456 {
1457 	struct ifnet *ifp = NULL;
1458 	int error = 0;
1459 	struct netmap_adapter *ret = NULL;
1460 
1461 	*na = NULL;     /* default return value */
1462 
1463 	NMG_LOCK_ASSERT();
1464 
1465 	/* we cascade through all possible types of netmap adapter.
1466 	 * All netmap_get_*_na() functions return an error and an na,
1467 	 * with the following combinations:
1468 	 *
1469 	 * error    na
1470 	 *   0	   NULL		type doesn't match
1471 	 *  !0	   NULL		type matches, but na creation/lookup failed
1472 	 *   0	  !NULL		type matches and na created/found
1473 	 *  !0    !NULL		impossible
1474 	 */
1475 
1476 	/* try to see if this is a monitor port */
1477 	error = netmap_get_monitor_na(nmr, na, create);
1478 	if (error || *na != NULL)
1479 		return error;
1480 
1481 	/* try to see if this is a pipe port */
1482 	error = netmap_get_pipe_na(nmr, na, create);
1483 	if (error || *na != NULL)
1484 		return error;
1485 
1486 	/* try to see if this is a bridge port */
1487 	error = netmap_get_bdg_na(nmr, na, create);
1488 	if (error)
1489 		return error;
1490 
1491 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1492 		goto pipes;
1493 
1494 	/*
1495 	 * This must be a hardware na, lookup the name in the system.
1496 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1497 	 * This may still be a tap, a veth/epair, or even a
1498 	 * persistent VALE port.
1499 	 */
1500 	ifp = ifunit_ref(nmr->nr_name);
1501 	if (ifp == NULL) {
1502 	        return ENXIO;
1503 	}
1504 
1505 	error = netmap_get_hw_na(ifp, &ret);
1506 	if (error)
1507 		goto out;
1508 
1509 	*na = ret;
1510 	netmap_adapter_get(ret);
1511 
1512 pipes:
1513 	/*
1514 	 * If we are opening a pipe whose parent was not in netmap mode,
1515 	 * we have to allocate the pipe array now.
1516 	 * XXX get rid of this clumsiness (2014-03-15)
1517 	 */
1518 	error = netmap_pipe_alloc(*na, nmr);
1519 
1520 out:
1521 	if (error && ret != NULL)
1522 		netmap_adapter_put(ret);
1523 
1524 	if (ifp)
1525 		if_rele(ifp); /* allow live unloading of drivers modules */
1526 
1527 	return error;
1528 }
1529 
1530 
1531 /*
1532  * validate parameters on entry for *_txsync()
1533  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1534  * in case of error.
1535  *
1536  * rhead, rcur and rtail=hwtail are stored from previous round.
1537  * hwcur is the next packet to send to the ring.
1538  *
1539  * We want
1540  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1541  *
1542  * hwcur, rhead, rtail and hwtail are reliable
1543  */
1544 u_int
1545 nm_txsync_prologue(struct netmap_kring *kring)
1546 {
1547 	struct netmap_ring *ring = kring->ring;
1548 	u_int head = ring->head; /* read only once */
1549 	u_int cur = ring->cur; /* read only once */
1550 	u_int n = kring->nkr_num_slots;
1551 
1552 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1553 		kring->name,
1554 		kring->nr_hwcur, kring->nr_hwtail,
1555 		ring->head, ring->cur, ring->tail);
1556 #if 1 /* kernel sanity checks; but we can trust the kring. */
1557 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1558 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1559 		goto error;
1560 #endif /* kernel sanity checks */
1561 	/*
1562 	 * user sanity checks. We only use 'cur',
1563 	 * A, B, ... are possible positions for cur:
1564 	 *
1565 	 *  0    A  cur   B  tail  C  n-1
1566 	 *  0    D  tail  E  cur   F  n-1
1567 	 *
1568 	 * B, F, D are valid. A, C, E are wrong
1569 	 */
1570 	if (kring->rtail >= kring->rhead) {
1571 		/* want rhead <= head <= rtail */
1572 		if (head < kring->rhead || head > kring->rtail)
1573 			goto error;
1574 		/* and also head <= cur <= rtail */
1575 		if (cur < head || cur > kring->rtail)
1576 			goto error;
1577 	} else { /* here rtail < rhead */
1578 		/* we need head outside rtail .. rhead */
1579 		if (head > kring->rtail && head < kring->rhead)
1580 			goto error;
1581 
1582 		/* two cases now: head <= rtail or head >= rhead  */
1583 		if (head <= kring->rtail) {
1584 			/* want head <= cur <= rtail */
1585 			if (cur < head || cur > kring->rtail)
1586 				goto error;
1587 		} else { /* head >= rhead */
1588 			/* cur must be outside rtail..head */
1589 			if (cur > kring->rtail && cur < head)
1590 				goto error;
1591 		}
1592 	}
1593 	if (ring->tail != kring->rtail) {
1594 		RD(5, "tail overwritten was %d need %d",
1595 			ring->tail, kring->rtail);
1596 		ring->tail = kring->rtail;
1597 	}
1598 	kring->rhead = head;
1599 	kring->rcur = cur;
1600 	return head;
1601 
1602 error:
1603 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1604 		kring->name,
1605 		kring->nr_hwcur,
1606 		kring->rcur, kring->nr_hwtail,
1607 		cur, ring->tail);
1608 	return n;
1609 }
1610 
1611 
1612 /*
1613  * validate parameters on entry for *_rxsync()
1614  * Returns ring->head if ok, kring->nkr_num_slots on error.
1615  *
1616  * For a valid configuration,
1617  * hwcur <= head <= cur <= tail <= hwtail
1618  *
1619  * We only consider head and cur.
1620  * hwcur and hwtail are reliable.
1621  *
1622  */
1623 u_int
1624 nm_rxsync_prologue(struct netmap_kring *kring)
1625 {
1626 	struct netmap_ring *ring = kring->ring;
1627 	uint32_t const n = kring->nkr_num_slots;
1628 	uint32_t head, cur;
1629 
1630 	ND("%s kc %d kt %d h %d c %d t %d",
1631 		kring->name,
1632 		kring->nr_hwcur, kring->nr_hwtail,
1633 		ring->head, ring->cur, ring->tail);
1634 	/*
1635 	 * Before storing the new values, we should check they do not
1636 	 * move backwards. However:
1637 	 * - head is not an issue because the previous value is hwcur;
1638 	 * - cur could in principle go back, however it does not matter
1639 	 *   because we are processing a brand new rxsync()
1640 	 */
1641 	cur = kring->rcur = ring->cur;	/* read only once */
1642 	head = kring->rhead = ring->head;	/* read only once */
1643 #if 1 /* kernel sanity checks */
1644 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1645 		goto error;
1646 #endif /* kernel sanity checks */
1647 	/* user sanity checks */
1648 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1649 		/* want hwcur <= rhead <= hwtail */
1650 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1651 			goto error;
1652 		/* and also rhead <= rcur <= hwtail */
1653 		if (cur < head || cur > kring->nr_hwtail)
1654 			goto error;
1655 	} else {
1656 		/* we need rhead outside hwtail..hwcur */
1657 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1658 			goto error;
1659 		/* two cases now: head <= hwtail or head >= hwcur  */
1660 		if (head <= kring->nr_hwtail) {
1661 			/* want head <= cur <= hwtail */
1662 			if (cur < head || cur > kring->nr_hwtail)
1663 				goto error;
1664 		} else {
1665 			/* cur must be outside hwtail..head */
1666 			if (cur < head && cur > kring->nr_hwtail)
1667 				goto error;
1668 		}
1669 	}
1670 	if (ring->tail != kring->rtail) {
1671 		RD(5, "%s tail overwritten was %d need %d",
1672 			kring->name,
1673 			ring->tail, kring->rtail);
1674 		ring->tail = kring->rtail;
1675 	}
1676 	return head;
1677 
1678 error:
1679 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1680 		kring->nr_hwcur,
1681 		kring->rcur, kring->nr_hwtail,
1682 		kring->rhead, kring->rcur, ring->tail);
1683 	return n;
1684 }
1685 
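/*
 * The rx counterpart, again as a hedged userspace sketch (consume() is a
 * hypothetical application routine): received frames sit in [head..tail),
 * and the application releases them by advancing head/cur.
 *
 *	ioctl(fd, NIOCRXSYNC, NULL);		// kernel runs this prologue, then rxsync
 *	struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);
 *	unsigned int i = rxr->head;
 *	while (i != rxr->tail) {
 *		struct netmap_slot *slot = &rxr->slot[i];
 *		consume(NETMAP_BUF(rxr, slot->buf_idx), slot->len);
 *		i = nm_ring_next(rxr, i);
 *	}
 *	rxr->head = rxr->cur = i;		// give the slots back to the kernel
 */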
1686 
1687 /*
1688  * Error routine called when txsync/rxsync detects an error.
1689  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1690  * Return 1 on reinit.
1691  *
1692  * This routine is only called by the upper half of the kernel.
1693  * It only reads hwcur (which is changed only by the upper half, too)
1694  * and hwtail (which may be changed by the lower half, but only on
1695  * a tx ring and only to increase it, so any error will be recovered
1696  * on the next call). For the above, we don't strictly need to call
1697  * it under lock.
1698  */
1699 int
1700 netmap_ring_reinit(struct netmap_kring *kring)
1701 {
1702 	struct netmap_ring *ring = kring->ring;
1703 	u_int i, lim = kring->nkr_num_slots - 1;
1704 	int errors = 0;
1705 
1706 	// XXX KASSERT nm_kr_tryget
1707 	RD(10, "called for %s", kring->name);
1708 	// XXX probably wrong to trust userspace
1709 	kring->rhead = ring->head;
1710 	kring->rcur  = ring->cur;
1711 	kring->rtail = ring->tail;
1712 
1713 	if (ring->cur > lim)
1714 		errors++;
1715 	if (ring->head > lim)
1716 		errors++;
1717 	if (ring->tail > lim)
1718 		errors++;
1719 	for (i = 0; i <= lim; i++) {
1720 		u_int idx = ring->slot[i].buf_idx;
1721 		u_int len = ring->slot[i].len;
1722 		if (idx < 2 || idx >= netmap_total_buffers) {
1723 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1724 			ring->slot[i].buf_idx = 0;
1725 			ring->slot[i].len = 0;
1726 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1727 			ring->slot[i].len = 0;
1728 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1729 		}
1730 	}
1731 	if (errors) {
1732 		RD(10, "total %d errors", errors);
1733 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1734 			kring->name,
1735 			ring->cur, kring->nr_hwcur,
1736 			ring->tail, kring->nr_hwtail);
1737 		ring->head = kring->rhead = kring->nr_hwcur;
1738 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1739 		ring->tail = kring->rtail = kring->nr_hwtail;
1740 	}
1741 	return (errors ? 1 : 0);
1742 }
1743 
1744 /* interpret the ringid and flags fields of an nmreq, by translating them
1745  * into a pair of intervals of ring indices:
1746  *
1747  * [priv->np_txqfirst, priv->np_txqlast) and
1748  * [priv->np_rxqfirst, priv->np_rxqlast)
1749  *
1750  */
1751 int
1752 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1753 {
1754 	struct netmap_adapter *na = priv->np_na;
1755 	u_int j, i = ringid & NETMAP_RING_MASK;
1756 	u_int reg = flags & NR_REG_MASK;
1757 
1758 	if (reg == NR_REG_DEFAULT) {
1759 		/* convert from old ringid to flags */
1760 		if (ringid & NETMAP_SW_RING) {
1761 			reg = NR_REG_SW;
1762 		} else if (ringid & NETMAP_HW_RING) {
1763 			reg = NR_REG_ONE_NIC;
1764 		} else {
1765 			reg = NR_REG_ALL_NIC;
1766 		}
1767 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1768 	}
1769 	switch (reg) {
1770 	case NR_REG_ALL_NIC:
1771 	case NR_REG_PIPE_MASTER:
1772 	case NR_REG_PIPE_SLAVE:
1773 		priv->np_txqfirst = 0;
1774 		priv->np_txqlast = na->num_tx_rings;
1775 		priv->np_rxqfirst = 0;
1776 		priv->np_rxqlast = na->num_rx_rings;
1777 		ND("%s %d %d", "ALL/PIPE",
1778 			priv->np_rxqfirst, priv->np_rxqlast);
1779 		break;
1780 	case NR_REG_SW:
1781 	case NR_REG_NIC_SW:
1782 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1783 			D("host rings not supported");
1784 			return EINVAL;
1785 		}
1786 		priv->np_txqfirst = (reg == NR_REG_SW ?
1787 			na->num_tx_rings : 0);
1788 		priv->np_txqlast = na->num_tx_rings + 1;
1789 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1790 			na->num_rx_rings : 0);
1791 		priv->np_rxqlast = na->num_rx_rings + 1;
1792 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1793 			priv->np_rxqfirst, priv->np_rxqlast);
1794 		break;
1795 	case NR_REG_ONE_NIC:
1796 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1797 			D("invalid ring id %d", i);
1798 			return EINVAL;
1799 		}
1800 		/* if not enough rings, use the first one */
1801 		j = i;
1802 		if (j >= na->num_tx_rings)
1803 			j = 0;
1804 		priv->np_txqfirst = j;
1805 		priv->np_txqlast = j + 1;
1806 		j = i;
1807 		if (j >= na->num_rx_rings)
1808 			j = 0;
1809 		priv->np_rxqfirst = j;
1810 		priv->np_rxqlast = j + 1;
1811 		break;
1812 	default:
1813 		D("invalid regif type %d", reg);
1814 		return EINVAL;
1815 	}
1816 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1817 
1818 	if (netmap_verbose) {
1819 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1820 			na->name,
1821 			priv->np_txqfirst,
1822 			priv->np_txqlast,
1823 			priv->np_rxqfirst,
1824 			priv->np_rxqlast,
1825 			i);
1826 	}
1827 	return 0;
1828 }
1829 
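/*
 * Worked example of the mapping above, for a hypothetical adapter with
 * 4 tx and 4 rx hardware rings (ring 4 being the host ring):
 *
 *	nr_flags = NR_REG_ALL_NIC, nr_ringid = 0  ->  tx [0,4)  rx [0,4)
 *	nr_flags = NR_REG_ONE_NIC, nr_ringid = 2  ->  tx [2,3)  rx [2,3)
 *	nr_flags = NR_REG_SW,      nr_ringid = 0  ->  tx [4,5)  rx [4,5)
 */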
1830 
1831 /*
1832  * Set the ring ID. For devices with a single queue, a request
1833  * for all rings is the same as a single ring.
1834  */
1835 static int
1836 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1837 {
1838 	struct netmap_adapter *na = priv->np_na;
1839 	int error;
1840 
1841 	error = netmap_interp_ringid(priv, ringid, flags);
1842 	if (error) {
1843 		return error;
1844 	}
1845 
1846 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1847 
1848 	/* optimization: count the users registered for more than
1849 	 * one ring, which are the ones sleeping on the global queue.
1850 	 * The default netmap_notify() callback will then
1851 	 * avoid signaling the global queue if nobody is using it
1852 	 */
1853 	if (nm_tx_si_user(priv))
1854 		na->tx_si_users++;
1855 	if (nm_rx_si_user(priv))
1856 		na->rx_si_users++;
1857 	return 0;
1858 }
1859 
1860 /*
1861  * Possibly move the interface to netmap mode.
1862  * On success it returns a pointer to netmap_if, otherwise NULL.
1863  * This must be called with NMG_LOCK held.
1864  *
1865  * The following na callbacks are called in the process:
1866  *
1867  * na->nm_config()			[by netmap_update_config]
1868  * (get current number and size of rings)
1869  *
1870  *  	We have a generic one for linux (netmap_linux_config).
1871  *  	The bwrap has to override this, since it has to forward
1872  *  	the request to the wrapped adapter (netmap_bwrap_config).
1873  *
1874  *    	XXX netmap_if_new calls this again (2014-03-15)
1875  *
1876  * na->nm_krings_create()		[by netmap_if_new]
1877  * (create and init the krings array)
1878  *
1879  * 	One of the following:
1880  *
1881  *	* netmap_hw_krings_create, 			(hw ports)
1882  *		creates the standard layout for the krings
1883  * 		and adds the mbq (used for the host rings).
1884  *
1885  * 	* netmap_vp_krings_create			(VALE ports)
1886  * 		add leases and scratchpads
1887  *
1888  * 	* netmap_pipe_krings_create			(pipes)
1889  * 		create the krings and rings of both ends and
1890  * 		cross-link them
1891  *
1892  *      * netmap_monitor_krings_create 			(monitors)
1893  *      	avoid allocating the mbq
1894  *
1895  *      * netmap_bwrap_krings_create			(bwraps)
1896  *      	create both the bwrap krings array,
1897  *      	the krings array of the wrapped adapter, and
1898  *      	(if needed) the fake array for the host adapter
1899  *
1900  * na->nm_register(, 1)
1901  * (put the adapter in netmap mode)
1902  *
1903  * 	This may be one of the following:
1904  * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1905  *
1906  * 	* netmap_hw_register				(hw ports)
1907  * 		checks that the ifp is still there, then calls
1908  * 		the hardware specific callback;
1909  *
1910  * 	* netmap_vp_reg					(VALE ports)
1911  *		If the port is connected to a bridge,
1912  *		set the NAF_NETMAP_ON flag under the
1913  *		bridge write lock.
1914  *
1915  *	* netmap_pipe_reg				(pipes)
1916  *		inform the other pipe end that it is no
1917  *		longer responsible for the lifetime of this
1918  *		pipe end
1919  *
1920  *	* netmap_monitor_reg				(monitors)
1921  *		intercept the sync callbacks of the monitored
1922  *		rings
1923  *
1924  *	* netmap_bwrap_register				(bwraps)
1925  *		cross-link the bwrap and hwna rings,
1926  *		forward the request to the hwna, override
1927  *		the hwna notify callback (so that frames
1928  *		coming from outside go through the bridge).
1929  *
1930  * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
1931  *
1932  */
1933 struct netmap_if *
1934 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1935 	uint16_t ringid, uint32_t flags, int *err)
1936 {
1937 	struct netmap_if *nifp = NULL;
1938 	int error, need_mem = 0;
1939 
1940 	NMG_LOCK_ASSERT();
1941 	/* ring configuration may have changed, fetch from the card */
1942 	netmap_update_config(na);
1943 	priv->np_na = na;     /* store the reference */
1944 	error = netmap_set_ringid(priv, ringid, flags);
1945 	if (error)
1946 		goto out;
1947 	/* ensure allocators are ready */
1948 	need_mem = !netmap_have_memory_locked(priv);
1949 	if (need_mem) {
1950 		error = netmap_get_memory_locked(priv);
1951 		ND("get_memory returned %d", error);
1952 		if (error)
1953 			goto out;
1954 	}
1955 	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1956 	nifp = netmap_if_new(na);
1957 	if (nifp == NULL) { /* allocation failed */
1958 		error = ENOMEM;
1959 		goto out;
1960 	}
1961 	na->active_fds++;
1962 	if (!nm_netmap_on(na)) {
1963 		/* Netmap not active, set the card in netmap mode
1964 		 * and make it use the shared buffers.
1965 		 */
1966 		/* cache the allocator info in the na */
1967 		na->na_lut = netmap_mem_get_lut(na->nm_mem);
1968 		ND("%p->na_lut == %p", na, na->na_lut);
1969 		na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
1970 		na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
1971 		error = na->nm_register(na, 1); /* mode on */
1972 		if (error) {
1973 			netmap_do_unregif(priv, nifp);
1974 			nifp = NULL;
1975 		}
1976 	}
1977 out:
1978 	*err = error;
1979 	if (error) {
1980 		/* we should drop the allocator, but only
1981 		 * if we were the ones who grabbed it
1982 		 */
1983 		if (need_mem)
1984 			netmap_drop_memory_locked(priv);
1985 		priv->np_na = NULL;
1986 	}
1987 	if (nifp != NULL) {
1988 		/*
1989 		 * advertise that the interface is ready by setting np_nifp.
1990 		 * The barrier is needed because readers (poll and *SYNC)
1991 		 * check for priv->np_nifp != NULL without locking
1992 		 */
1993 		wmb(); /* make sure previous writes are visible to all CPUs */
1994 		priv->np_nifp = nifp;
1995 	}
1996 	return nifp;
1997 }
1998 
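/*
 * Seen from userspace, netmap_do_regif() is reached through a NIOCREGIF
 * ioctl, after which the fd is mmap()ed and nr_offset locates the
 * netmap_if. A minimal sketch ("em0" and fd are illustrative, the macros
 * come from net/netmap_user.h):
 *
 *	struct nmreq req;
 *
 *	memset(&req, 0, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
 *	req.nr_version = NETMAP_API;
 *	req.nr_flags = NR_REG_ALL_NIC;
 *	if (ioctl(fd, NIOCREGIF, &req) == -1)
 *		err(1, "NIOCREGIF");
 *	char *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 */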
1999 
2000 
2001 /*
2002  * ioctl(2) support for the "netmap" device.
2003  *
2004  * The following commands are accepted:
2005  * - NIOCGINFO
2006  * - SIOCGIFADDR	just for convenience
2007  * - NIOCREGIF
2008  * - NIOCTXSYNC
2009  * - NIOCRXSYNC
2010  *
2011  * Return 0 on success, errno otherwise.
2012  */
2013 int
2014 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2015 	int fflag, struct thread *td)
2016 {
2017 	struct netmap_priv_d *priv = NULL;
2018 	struct nmreq *nmr = (struct nmreq *) data;
2019 	struct netmap_adapter *na = NULL;
2020 	int error;
2021 	u_int i, qfirst, qlast;
2022 	struct netmap_if *nifp;
2023 	struct netmap_kring *krings;
2024 
2025 	(void)dev;	/* UNUSED */
2026 	(void)fflag;	/* UNUSED */
2027 
2028 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2029 		/* truncate name */
2030 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2031 		if (nmr->nr_version != NETMAP_API) {
2032 			D("API mismatch for %s got %d need %d",
2033 				nmr->nr_name,
2034 				nmr->nr_version, NETMAP_API);
2035 			nmr->nr_version = NETMAP_API;
2036 		}
2037 		if (nmr->nr_version < NETMAP_MIN_API ||
2038 		    nmr->nr_version > NETMAP_MAX_API) {
2039 			return EINVAL;
2040 		}
2041 	}
2042 	CURVNET_SET(TD_TO_VNET(td));
2043 
2044 	error = devfs_get_cdevpriv((void **)&priv);
2045 	if (error) {
2046 		CURVNET_RESTORE();
2047 		/* XXX ENOENT should be impossible, since the priv
2048 		 * is now created in the open */
2049 		return (error == ENOENT ? ENXIO : error);
2050 	}
2051 
2052 	switch (cmd) {
2053 	case NIOCGINFO:		/* return capabilities etc */
2054 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2055 			error = netmap_bdg_ctl(nmr, NULL);
2056 			break;
2057 		}
2058 
2059 		NMG_LOCK();
2060 		do {
2061 			/* memsize is always valid */
2062 			struct netmap_mem_d *nmd = &nm_mem;
2063 			u_int memflags;
2064 
2065 			if (nmr->nr_name[0] != '\0') {
2066 				/* get a refcount */
2067 				error = netmap_get_na(nmr, &na, 1 /* create */);
2068 				if (error)
2069 					break;
2070 				nmd = na->nm_mem; /* get memory allocator */
2071 			}
2072 
2073 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2074 				&nmr->nr_arg2);
2075 			if (error)
2076 				break;
2077 			if (na == NULL) /* only memory info */
2078 				break;
2079 			nmr->nr_offset = 0;
2080 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2081 			netmap_update_config(na);
2082 			nmr->nr_rx_rings = na->num_rx_rings;
2083 			nmr->nr_tx_rings = na->num_tx_rings;
2084 			nmr->nr_rx_slots = na->num_rx_desc;
2085 			nmr->nr_tx_slots = na->num_tx_desc;
2086 			netmap_adapter_put(na);
2087 		} while (0);
2088 		NMG_UNLOCK();
2089 		break;
2090 
2091 	case NIOCREGIF:
2092 		/* possibly attach/detach NIC and VALE switch */
2093 		i = nmr->nr_cmd;
2094 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2095 				|| i == NETMAP_BDG_VNET_HDR
2096 				|| i == NETMAP_BDG_NEWIF
2097 				|| i == NETMAP_BDG_DELIF) {
2098 			error = netmap_bdg_ctl(nmr, NULL);
2099 			break;
2100 		} else if (i != 0) {
2101 			D("nr_cmd must be 0 not %d", i);
2102 			error = EINVAL;
2103 			break;
2104 		}
2105 
2106 		/* protect access to priv from concurrent NIOCREGIF */
2107 		NMG_LOCK();
2108 		do {
2109 			u_int memflags;
2110 
2111 			if (priv->np_na != NULL) {	/* thread already registered */
2112 				error = EBUSY;
2113 				break;
2114 			}
2115 			/* find the interface and a reference */
2116 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2117 			if (error)
2118 				break;
2119 			if (NETMAP_OWNED_BY_KERN(na)) {
2120 				netmap_adapter_put(na);
2121 				error = EBUSY;
2122 				break;
2123 			}
2124 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
2125 			if (!nifp) {    /* reg. failed, release priv and ref */
2126 				netmap_adapter_put(na);
2127 				priv->np_nifp = NULL;
2128 				break;
2129 			}
2130 			priv->np_td = td; // XXX kqueue, debugging only
2131 
2132 			/* return the offset of the netmap_if object */
2133 			nmr->nr_rx_rings = na->num_rx_rings;
2134 			nmr->nr_tx_rings = na->num_tx_rings;
2135 			nmr->nr_rx_slots = na->num_rx_desc;
2136 			nmr->nr_tx_slots = na->num_tx_desc;
2137 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2138 				&nmr->nr_arg2);
2139 			if (error) {
2140 				netmap_adapter_put(na);
2141 				break;
2142 			}
2143 			if (memflags & NETMAP_MEM_PRIVATE) {
2144 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2145 			}
2146 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
2147 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
2148 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
2149 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
2150 
2151 			if (nmr->nr_arg3) {
2152 				D("requested %d extra buffers", nmr->nr_arg3);
2153 				nmr->nr_arg3 = netmap_extra_alloc(na,
2154 					&nifp->ni_bufs_head, nmr->nr_arg3);
2155 				D("got %d extra buffers", nmr->nr_arg3);
2156 			}
2157 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2158 		} while (0);
2159 		NMG_UNLOCK();
2160 		break;
2161 
2162 	case NIOCTXSYNC:
2163 	case NIOCRXSYNC:
2164 		nifp = priv->np_nifp;
2165 
2166 		if (nifp == NULL) {
2167 			error = ENXIO;
2168 			break;
2169 		}
2170 		mb(); /* make sure following reads are not from cache */
2171 
2172 		na = priv->np_na;      /* we have a reference */
2173 
2174 		if (na == NULL) {
2175 			D("Internal error: nifp != NULL && na == NULL");
2176 			error = ENXIO;
2177 			break;
2178 		}
2179 
2180 		if (!nm_netmap_on(na)) {
2181 			error = ENXIO;
2182 			break;
2183 		}
2184 
2185 		if (cmd == NIOCTXSYNC) {
2186 			krings = na->tx_rings;
2187 			qfirst = priv->np_txqfirst;
2188 			qlast = priv->np_txqlast;
2189 		} else {
2190 			krings = na->rx_rings;
2191 			qfirst = priv->np_rxqfirst;
2192 			qlast = priv->np_rxqlast;
2193 		}
2194 
2195 		for (i = qfirst; i < qlast; i++) {
2196 			struct netmap_kring *kring = krings + i;
2197 			if (nm_kr_tryget(kring)) {
2198 				error = EBUSY;
2199 				goto out;
2200 			}
2201 			if (cmd == NIOCTXSYNC) {
2202 				if (netmap_verbose & NM_VERB_TXSYNC)
2203 					D("pre txsync ring %d cur %d hwcur %d",
2204 					    i, kring->ring->cur,
2205 					    kring->nr_hwcur);
2206 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2207 					netmap_ring_reinit(kring);
2208 				} else {
2209 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
2210 				}
2211 				if (netmap_verbose & NM_VERB_TXSYNC)
2212 					D("post txsync ring %d cur %d hwcur %d",
2213 					    i, kring->ring->cur,
2214 					    kring->nr_hwcur);
2215 			} else {
2216 				kring->nm_sync(kring, NAF_FORCE_READ);
2217 				microtime(&na->rx_rings[i].ring->ts);
2218 			}
2219 			nm_kr_put(kring);
2220 		}
2221 
2222 		break;
2223 
2224 	case NIOCCONFIG:
2225 		error = netmap_bdg_config(nmr);
2226 		break;
2227 #ifdef __FreeBSD__
2228 	case FIONBIO:
2229 	case FIOASYNC:
2230 		ND("FIONBIO/FIOASYNC are no-ops");
2231 		break;
2232 
2233 	case BIOCIMMEDIATE:
2234 	case BIOCGHDRCMPLT:
2235 	case BIOCSHDRCMPLT:
2236 	case BIOCSSEESENT:
2237 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2238 		break;
2239 
2240 	default:	/* allow device-specific ioctls */
2241 	    {
2242 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2243 		if (ifp == NULL) {
2244 			error = ENXIO;
2245 		} else {
2246 			struct socket so;
2247 
2248 			bzero(&so, sizeof(so));
2249 			so.so_vnet = ifp->if_vnet;
2250 			// so->so_proto not null.
2251 			error = ifioctl(&so, cmd, data, td);
2252 			if_rele(ifp);
2253 		}
2254 		break;
2255 	    }
2256 
2257 #else /* linux */
2258 	default:
2259 		error = EOPNOTSUPP;
2260 #endif /* linux */
2261 	}
2262 out:
2263 
2264 	CURVNET_RESTORE();
2265 	return (error);
2266 }
2267 
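/*
 * The NIOCGINFO branch above can be used before (or without) a NIOCREGIF,
 * e.g. to size a receiver. A hedged sketch ("em0" and fd are illustrative):
 *
 *	struct nmreq req;
 *
 *	memset(&req, 0, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
 *	req.nr_version = NETMAP_API;
 *	if (ioctl(fd, NIOCGINFO, &req) == 0)
 *		printf("%u+%u tx/rx rings, %u+%u slots, %u bytes shared\n",
 *		    req.nr_tx_rings, req.nr_rx_rings,
 *		    req.nr_tx_slots, req.nr_rx_slots, req.nr_memsize);
 */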
2268 
2269 /*
2270  * select(2) and poll(2) handlers for the "netmap" device.
2271  *
2272  * Can be called for one or more queues.
2273  * Return the event mask corresponding to ready events.
2274  * If there are no ready events, do a selrecord on either individual
2275  * selinfo or on the global one.
2276  * Device-dependent parts (locking and sync of tx/rx rings)
2277  * are done through callbacks.
2278  *
2279  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *.
2280  * The first one is remapped to pwait as selrecord() uses the name as a
2281  * hidden argument.
2282  */
2283 int
2284 netmap_poll(struct cdev *dev, int events, struct thread *td)
2285 {
2286 	struct netmap_priv_d *priv = NULL;
2287 	struct netmap_adapter *na;
2288 	struct netmap_kring *kring;
2289 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2290 	struct mbq q;		/* packets from hw queues to host stack */
2291 	void *pwait = dev;	/* linux compatibility */
2292 	int is_kevent = 0;
2293 
2294 	/*
2295 	 * In order to avoid nested locks, we need to "double check"
2296 	 * txsync and rxsync if we decide to do a selrecord().
2297 	 * retry_tx (and retry_rx, later) prevent looping forever.
2298 	 */
2299 	int retry_tx = 1, retry_rx = 1;
2300 
2301 	(void)pwait;
2302 	mbq_init(&q);
2303 
2304 	/*
2305 	 * XXX kevent has curthread->td_fpop == NULL,
2306 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2307 	 * priv as the first argument, which is also useful to avoid
2308 	 * the selrecord() which are not necessary in that case.
2309 	 */
2310 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2311 		is_kevent = 1;
2312 		if (netmap_verbose)
2313 			D("called from kevent");
2314 		priv = (struct netmap_priv_d *)dev;
2315 	}
2316 	if (priv == NULL)
2317 		return POLLERR;
2318 
2319 	if (priv->np_nifp == NULL) {
2320 		D("No if registered");
2321 		return POLLERR;
2322 	}
2323 	rmb(); /* make sure following reads are not from cache */
2324 
2325 	na = priv->np_na;
2326 
2327 	if (!nm_netmap_on(na))
2328 		return POLLERR;
2329 
2330 	if (netmap_verbose & 0x8000)
2331 		D("device %s events 0x%x", na->name, events);
2332 	want_tx = events & (POLLOUT | POLLWRNORM);
2333 	want_rx = events & (POLLIN | POLLRDNORM);
2334 
2335 
2336 	/*
2337 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2338 	 * the file descriptor is bound to all of them. If so, we sleep on
2339 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2340 	 * (FreeBSD only allows two selinfo's per file descriptor).
2341 	 * The interrupt routine in the driver wakes one or the other
2342 	 * (or both) depending on which clients are active.
2343 	 *
2344 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2345 	 * txsync() is called if we run out of buffers on POLLOUT, or
2346 	 * there are pending packets to send. The latter can be disabled
2347 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2348 	 */
2349 	check_all_tx = nm_tx_si_user(priv);
2350 	check_all_rx = nm_rx_si_user(priv);
2351 
2352 	/*
2353 	 * We start with a lock free round which is cheap if we have
2354 	 * slots available. If this fails, then lock and call the sync
2355 	 * routines.
2356 	 */
2357 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2358 		kring = &na->rx_rings[i];
2359 		/* XXX compare ring->cur and kring->tail */
2360 		if (!nm_ring_empty(kring->ring)) {
2361 			revents |= want_rx;
2362 			want_rx = 0;	/* also breaks the loop */
2363 		}
2364 	}
2365 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2366 		kring = &na->tx_rings[i];
2367 		/* XXX compare ring->cur and kring->tail */
2368 		if (!nm_ring_empty(kring->ring)) {
2369 			revents |= want_tx;
2370 			want_tx = 0;	/* also breaks the loop */
2371 		}
2372 	}
2373 
2374 	/*
2375 	 * If we want to push packets out (priv->np_txpoll) or
2376 	 * want_tx is still set, we must issue txsync calls
2377 	 * (on all rings, to avoid stalling the tx rings).
2378 	 * XXX should also check cur != hwcur on the tx rings.
2379 	 * Fortunately, normal tx mode has np_txpoll set.
2380 	 */
2381 	if (priv->np_txpoll || want_tx) {
2382 		/*
2383 		 * The first round checks if anyone is ready, if not
2384 		 * do a selrecord and another round to handle races.
2385 		 * want_tx goes to 0 if any space is found, and is
2386 		 * used to skip rings with no pending transmissions.
2387 		 */
2388 flush_tx:
2389 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2390 			int found = 0;
2391 
2392 			kring = &na->tx_rings[i];
2393 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2394 				continue;
2395 			/* only one thread does txsync */
2396 			if (nm_kr_tryget(kring)) {
2397 				/* either busy or stopped
2398 				 * XXX if the ring is stopped, sleeping would
2399 				 * be better. In current code, however, we only
2400 				 * stop the rings for brief intervals (2014-03-14)
2401 				 */
2402 				if (netmap_verbose)
2403 					RD(2, "%p lost race on txring %d, ok",
2404 					    priv, i);
2405 				continue;
2406 			}
2407 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2408 				netmap_ring_reinit(kring);
2409 				revents |= POLLERR;
2410 			} else {
2411 				if (kring->nm_sync(kring, 0))
2412 					revents |= POLLERR;
2413 			}
2414 
2415 			/*
2416 			 * If we found new slots, notify potential
2417 			 * listeners on the same ring.
2418 			 * Since we just did a txsync, look at the copies
2419 			 * of cur,tail in the kring.
2420 			 */
2421 			found = kring->rcur != kring->rtail;
2422 			nm_kr_put(kring);
2423 			if (found) { /* notify other listeners */
2424 				revents |= want_tx;
2425 				want_tx = 0;
2426 				na->nm_notify(na, i, NR_TX, 0);
2427 			}
2428 		}
2429 		if (want_tx && retry_tx && !is_kevent) {
2430 			OS_selrecord(td, check_all_tx ?
2431 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2432 			retry_tx = 0;
2433 			goto flush_tx;
2434 		}
2435 	}
2436 
2437 	/*
2438 	 * If want_rx is still set scan receive rings.
2439 	 * Do it on all rings because otherwise we starve.
2440 	 */
2441 	if (want_rx) {
2442 		int send_down = 0; /* transparent mode */
2443 		/* two rounds here for race avoidance */
2444 do_retry_rx:
2445 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2446 			int found = 0;
2447 
2448 			kring = &na->rx_rings[i];
2449 
2450 			if (nm_kr_tryget(kring)) {
2451 				if (netmap_verbose)
2452 					RD(2, "%p lost race on rxring %d, ok",
2453 					    priv, i);
2454 				continue;
2455 			}
2456 
2457 			/*
2458 			 * transparent mode support: collect packets
2459 			 * from the rxring(s).
2460 			 * XXX NR_FORWARD should only be read on
2461 			 * physical or NIC ports
2462 			 */
2463 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2464 				ND(10, "forwarding some buffers up %d to %d",
2465 				    kring->nr_hwcur, kring->ring->cur);
2466 				netmap_grab_packets(kring, &q, netmap_fwd);
2467 			}
2468 
2469 			if (kring->nm_sync(kring, 0))
2470 				revents |= POLLERR;
2471 			if (netmap_no_timestamp == 0 ||
2472 					kring->ring->flags & NR_TIMESTAMP) {
2473 				microtime(&kring->ring->ts);
2474 			}
2475 			/* after an rxsync we can use kring->rcur, rtail */
2476 			found = kring->rcur != kring->rtail;
2477 			nm_kr_put(kring);
2478 			if (found) {
2479 				revents |= want_rx;
2480 				retry_rx = 0;
2481 				na->nm_notify(na, i, NR_RX, 0);
2482 			}
2483 		}
2484 
2485 		/* transparent mode XXX only during first pass ? */
2486 		if (na->na_flags & NAF_HOST_RINGS) {
2487 			kring = &na->rx_rings[na->num_rx_rings];
2488 			if (check_all_rx
2489 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2490 				/* XXX fix to use kring fields */
2491 				if (nm_ring_empty(kring->ring))
2492 					send_down = netmap_rxsync_from_host(na, td, dev);
2493 				if (!nm_ring_empty(kring->ring))
2494 					revents |= want_rx;
2495 			}
2496 		}
2497 
2498 		if (retry_rx && !is_kevent)
2499 			OS_selrecord(td, check_all_rx ?
2500 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2501 		if (send_down > 0 || retry_rx) {
2502 			retry_rx = 0;
2503 			if (send_down)
2504 				goto flush_tx; /* and retry_rx */
2505 			else
2506 				goto do_retry_rx;
2507 		}
2508 	}
2509 
2510 	/*
2511 	 * Transparent mode: marked bufs on rx rings between
2512 	 * kring->nr_hwcur and ring->head
2513 	 * are passed to the other endpoint.
2514 	 *
2515 	 * In this mode we also scan the sw rxring, which in
2516 	 * turn passes packets up.
2517 	 *
2518 	 * XXX Transparent mode at the moment requires binding all
2519 	 * rings to a single file descriptor.
2520 	 */
2521 
2522 	if (q.head && na->ifp != NULL)
2523 		netmap_send_up(na->ifp, &q);
2524 
2525 	return (revents);
2526 }
2527 
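/*
 * Typical userspace use of the poll handler above, as a sketch
 * (process_rx_rings() is a hypothetical application helper):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		if (poll(&pfd, 1, 1000) <= 0)	// 1s timeout, illustrative
 *			continue;
 *		if (pfd.revents & POLLIN)
 *			process_rx_rings(nifp);
 *	}
 */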
2528 
2529 /*-------------------- driver support routines -------------------*/
2530 
2531 static int netmap_hw_krings_create(struct netmap_adapter *);
2532 
2533 /* default notify callback */
2534 static int
2535 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2536 	enum txrx tx, int flags)
2537 {
2538 	struct netmap_kring *kring;
2539 
2540 	if (tx == NR_TX) {
2541 		kring = na->tx_rings + n_ring;
2542 		OS_selwakeup(&kring->si, PI_NET);
2543 		/* optimization: avoid a wake up on the global
2544 		 * queue if nobody has registered for more
2545 		 * than one ring
2546 		 */
2547 		if (na->tx_si_users > 0)
2548 			OS_selwakeup(&na->tx_si, PI_NET);
2549 	} else {
2550 		kring = na->rx_rings + n_ring;
2551 		OS_selwakeup(&kring->si, PI_NET);
2552 		/* optimization: same as above */
2553 		if (na->rx_si_users > 0)
2554 			OS_selwakeup(&na->rx_si, PI_NET);
2555 	}
2556 	return 0;
2557 }
2558 
2559 
2560 /* called by all routines that create netmap_adapters.
2561  * Attach na to the ifp (if any) and provide defaults
2562  * for optional callbacks. Defaults assume that we
2563  * are creating a hardware netmap_adapter.
2564  */
2565 int
2566 netmap_attach_common(struct netmap_adapter *na)
2567 {
2568 	struct ifnet *ifp = na->ifp;
2569 
2570 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2571 		D("%s: invalid rings tx %d rx %d",
2572 			na->name, na->num_tx_rings, na->num_rx_rings);
2573 		return EINVAL;
2574 	}
2575 	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2576 	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2577 	 * use by the external modules, but that is set after this
2578 	 * function has been called.
2579 	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2580 	 */
2581 	if (ifp != NULL) {
2582 		WNA(ifp) = na;
2583 
2584 	/* the following is only needed for na that use the host port.
2585 	 * XXX do we have something similar for linux ?
2586 	 */
2587 #ifdef __FreeBSD__
2588 		na->if_input = ifp->if_input; /* for netmap_send_up */
2589 #endif /* __FreeBSD__ */
2590 
2591 		NETMAP_SET_CAPABLE(ifp);
2592 	}
2593 	if (na->nm_krings_create == NULL) {
2594 		/* we assume that we have been called by a driver,
2595 		 * since other port types all provide their own
2596 		 * nm_krings_create
2597 		 */
2598 		na->nm_krings_create = netmap_hw_krings_create;
2599 		na->nm_krings_delete = netmap_hw_krings_delete;
2600 	}
2601 	if (na->nm_notify == NULL)
2602 		na->nm_notify = netmap_notify;
2603 	na->active_fds = 0;
2604 
2605 	if (na->nm_mem == NULL)
2606 		/* use the global allocator */
2607 		na->nm_mem = &nm_mem;
2608 	if (na->nm_bdg_attach == NULL)
2609 		/* no special nm_bdg_attach callback. On VALE
2610 		 * attach, we need to interpose a bwrap
2611 		 */
2612 		na->nm_bdg_attach = netmap_bwrap_attach;
2613 	return 0;
2614 }
2615 
2616 
2617 /* standard cleanup, called by all destructors */
2618 void
2619 netmap_detach_common(struct netmap_adapter *na)
2620 {
2621 	if (na->ifp != NULL)
2622 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2623 
2624 	if (na->tx_rings) { /* XXX should not happen */
2625 		D("freeing leftover tx_rings");
2626 		na->nm_krings_delete(na);
2627 	}
2628 	netmap_pipe_dealloc(na);
2629 	if (na->na_flags & NAF_MEM_OWNER)
2630 		netmap_mem_private_delete(na->nm_mem);
2631 	bzero(na, sizeof(*na));
2632 	free(na, M_DEVBUF);
2633 }
2634 
2635 /* Wrapper for the register callback provided by hardware drivers.
2636  * na->ifp == NULL means that the driver module has been
2637  * unloaded, so we cannot call into it.
2638  * Note that module unloading, in our patched linux drivers,
2639  * happens under NMG_LOCK and after having stopped all the
2640  * nic rings (see netmap_detach). This provides sufficient
2641  * protection for the other driver-provided callbacks
2642  * (i.e., nm_config and nm_*xsync), which therefore don't need
2643  * to be wrapped.
2644  */
2645 static int
2646 netmap_hw_register(struct netmap_adapter *na, int onoff)
2647 {
2648 	struct netmap_hw_adapter *hwna =
2649 		(struct netmap_hw_adapter*)na;
2650 
2651 	if (na->ifp == NULL)
2652 		return onoff ? ENXIO : 0;
2653 
2654 	return hwna->nm_hw_register(na, onoff);
2655 }
2656 
2657 
2658 /*
2659  * Initialize a ``netmap_adapter`` object created by a driver on attach.
2660  * We allocate a block of memory with room for a struct netmap_adapter
2661  * plus two sets of N+2 struct netmap_kring (where N is the number
2662  * of hardware rings):
2663  * krings	0..N-1	are for the hardware queues.
2664  * kring	N	is for the host stack queue
2665  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2666  * Return 0 on success, ENOMEM otherwise.
2667  */
2668 int
2669 netmap_attach(struct netmap_adapter *arg)
2670 {
2671 	struct netmap_hw_adapter *hwna = NULL;
2672 	// XXX when is arg == NULL ?
2673 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2674 
2675 	if (arg == NULL || ifp == NULL)
2676 		goto fail;
2677 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2678 	if (hwna == NULL)
2679 		goto fail;
2680 	hwna->up = *arg;
2681 	hwna->up.na_flags |= NAF_HOST_RINGS;
2682 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2683 	hwna->nm_hw_register = hwna->up.nm_register;
2684 	hwna->up.nm_register = netmap_hw_register;
2685 	if (netmap_attach_common(&hwna->up)) {
2686 		free(hwna, M_DEVBUF);
2687 		goto fail;
2688 	}
2689 	netmap_adapter_get(&hwna->up);
2690 
2691 #ifdef linux
2692 	if (ifp->netdev_ops) {
2693 		/* prepare a clone of the netdev ops */
2694 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2695 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2696 #else
2697 		hwna->nm_ndo = *ifp->netdev_ops;
2698 #endif
2699 	}
2700 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2701 	if (ifp->ethtool_ops) {
2702 		hwna->nm_eto = *ifp->ethtool_ops;
2703 	}
2704 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2705 #ifdef ETHTOOL_SCHANNELS
2706 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2707 #endif
2708 	if (arg->nm_config == NULL) {
2709 		hwna->up.nm_config = netmap_linux_config;
2710 	}
2711 #endif /* linux */
2712 
2713 	D("success for %s tx %d/%d rx %d/%d queues/slots",
2714 		hwna->up.name,
2715 		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2716 		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2717 		);
2718 	return 0;
2719 
2720 fail:
2721 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2722 	if (ifp)
2723 		netmap_detach(ifp);
2724 	return (hwna ? EINVAL : ENOMEM);
2725 }
2726 
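/*
 * Roughly how a native driver calls netmap_attach() at the end of its
 * attach routine; a hedged sketch where 'sc' and the foo_netmap_*
 * callbacks are hypothetical driver names:
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na);
 */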
2727 
2728 void
2729 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2730 {
2731 	if (!na) {
2732 		return;
2733 	}
2734 
2735 	refcount_acquire(&na->na_refcount);
2736 }
2737 
2738 
2739 /* returns 1 iff the netmap_adapter is destroyed */
2740 int
2741 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2742 {
2743 	if (!na)
2744 		return 1;
2745 
2746 	if (!refcount_release(&na->na_refcount))
2747 		return 0;
2748 
2749 	if (na->nm_dtor)
2750 		na->nm_dtor(na);
2751 
2752 	netmap_detach_common(na);
2753 
2754 	return 1;
2755 }
2756 
2757 /* nm_krings_create callback for all hardware native adapters */
2758 int
2759 netmap_hw_krings_create(struct netmap_adapter *na)
2760 {
2761 	int ret = netmap_krings_create(na, 0);
2762 	if (ret == 0) {
2763 		/* initialize the mbq for the sw rx ring */
2764 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2765 		ND("initialized sw rx queue %d", na->num_rx_rings);
2766 	}
2767 	return ret;
2768 }
2769 
2770 
2771 
2772 /*
2773  * Called on module unload by the netmap-enabled drivers
2774  */
2775 void
2776 netmap_detach(struct ifnet *ifp)
2777 {
2778 	struct netmap_adapter *na = NA(ifp);
2779 
2780 	if (!na)
2781 		return;
2782 
2783 	NMG_LOCK();
2784 	netmap_disable_all_rings(ifp);
2785 	if (!netmap_adapter_put(na)) {
2786 		/* someone is still using the adapter,
2787 		 * tell them that the interface is gone
2788 		 */
2789 		na->ifp = NULL;
2790 		// XXX also clear NAF_NATIVE_ON ?
2791 		na->na_flags &= ~NAF_NETMAP_ON;
2792 		/* give them a chance to notice */
2793 		netmap_enable_all_rings(ifp);
2794 	}
2795 	NMG_UNLOCK();
2796 }
2797 
2798 
2799 /*
2800  * Intercept packets from the network stack and pass them
2801  * to netmap as incoming packets on the 'software' ring.
2802  *
2803  * We only store packets in a bounded mbq and then copy them
2804  * in the relevant rxsync routine.
2805  *
2806  * We rely on the OS to make sure that the ifp and na do not go
2807  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2808  * In nm_register() or whenever there is a reinitialization,
2809  * we make sure to make the mode change visible here.
2810  */
2811 int
2812 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2813 {
2814 	struct netmap_adapter *na = NA(ifp);
2815 	struct netmap_kring *kring;
2816 	u_int len = MBUF_LEN(m);
2817 	u_int error = ENOBUFS;
2818 	struct mbq *q;
2819 	int space;
2820 
2821 	// XXX [Linux] we do not need this lock
2822 	// if we follow the down/configure/up protocol -gl
2823 	// mtx_lock(&na->core_lock);
2824 
2825 	if (!nm_netmap_on(na)) {
2826 		D("%s not in netmap mode anymore", na->name);
2827 		error = ENXIO;
2828 		goto done;
2829 	}
2830 
2831 	kring = &na->rx_rings[na->num_rx_rings];
2832 	q = &kring->rx_queue;
2833 
2834 	// XXX reconsider long packets if we handle fragments
2835 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2836 		D("%s from_host, drop packet size %d > %d", na->name,
2837 			len, NETMAP_BUF_SIZE(na));
2838 		goto done;
2839 	}
2840 
2841 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2842 	 * and maybe other instances of netmap_transmit (the latter
2843 	 * not possible on Linux).
2844 	 * Also avoid overflowing the queue.
2845 	 */
2846 	mbq_lock(q);
2847 
2848 	space = kring->nr_hwtail - kring->nr_hwcur;
2849 	if (space < 0)
2850 		space += kring->nkr_num_slots;
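	/*
	 * Example of the circular arithmetic (hypothetical numbers):
	 * with nkr_num_slots = 512, nr_hwcur = 500 and nr_hwtail = 10
	 * the difference is -490, and adding the ring size leaves
	 * 22 slots still usable by the host rx ring.
	 */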
2851 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2852 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2853 			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2854 			len, m);
2855 	} else {
2856 		mbq_enqueue(q, m);
2857 		ND(10, "%s %d bufs in queue len %d m %p",
2858 			na->name, mbq_len(q), len, m);
2859 		/* notify outside the lock */
2860 		m = NULL;
2861 		error = 0;
2862 	}
2863 	mbq_unlock(q);
2864 
2865 done:
2866 	if (m)
2867 		m_freem(m);
2868 	/* unconditionally wake up listeners */
2869 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2870 	/* this is normally netmap_notify(), but for nics
2871 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2872 	 * that possibly forwards the frames through the switch
2873 	 */
2874 
2875 	return (error);
2876 }
2877 
2878 
2879 /*
2880  * netmap_reset() is called by the driver routines when reinitializing
2881  * a ring. The driver is in charge of locking to protect the kring.
2882  * If native netmap mode is not set just return NULL.
2883  */
2884 struct netmap_slot *
2885 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2886 	u_int new_cur)
2887 {
2888 	struct netmap_kring *kring;
2889 	int new_hwofs, lim;
2890 
2891 	if (!nm_native_on(na)) {
2892 		ND("interface not in native netmap mode");
2893 		return NULL;	/* nothing to reinitialize */
2894 	}
2895 
2896 	/* XXX note: in the new scheme, we are not guaranteed to be
2897 	 * under lock (e.g. when called on a device reset).
2898 	 * In this case, we should set a flag and not trust the
2899 	 * values too much. In practice: TODO
2900 	 * - set a RESET flag somewhere in the kring
2901 	 * - do the processing in a conservative way
2902 	 * - let the *sync() fixup at the end.
2903 	 */
2904 	if (tx == NR_TX) {
2905 		if (n >= na->num_tx_rings)
2906 			return NULL;
2907 		kring = na->tx_rings + n;
2908 		// XXX check whether we should use hwcur or rcur
2909 		new_hwofs = kring->nr_hwcur - new_cur;
2910 	} else {
2911 		if (n >= na->num_rx_rings)
2912 			return NULL;
2913 		kring = na->rx_rings + n;
2914 		new_hwofs = kring->nr_hwtail - new_cur;
2915 	}
2916 	lim = kring->nkr_num_slots - 1;
2917 	if (new_hwofs > lim)
2918 		new_hwofs -= lim + 1;
2919 
2920 	/* Always set the new offset value and realign the ring. */
2921 	if (netmap_verbose)
2922 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2923 		na->name,
2924 		tx == NR_TX ? "TX" : "RX", n,
2925 		kring->nkr_hwofs, new_hwofs,
2926 		kring->nr_hwtail,
2927 		tx == NR_TX ? lim : kring->nr_hwtail);
2928 	kring->nkr_hwofs = new_hwofs;
2929 	if (tx == NR_TX) {
2930 		kring->nr_hwtail = kring->nr_hwcur + lim;
2931 		if (kring->nr_hwtail > lim)
2932 			kring->nr_hwtail -= lim + 1;
2933 	}
2934 
2935 #if 0 // def linux
2936 	/* XXX check that the mappings are correct */
2937 	/* need ring_nr, adapter->pdev, direction */
2938 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2939 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2940 		D("error mapping rx netmap buffer %d", i);
2941 		// XXX fix error handling
2942 	}
2943 
2944 #endif /* linux */
2945 	/*
2946 	 * Wakeup on the individual and global selwait
2947 	 * We do the wakeup here, but the ring is not yet reconfigured.
2948 	 * However, we are under lock so there are no races.
2949 	 */
2950 	na->nm_notify(na, n, tx, 0);
2951 	return kring->ring->slot;
2952 }
2953 
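/*
 * A patched driver typically calls netmap_reset() from its ring init path
 * and, if the return value is not NULL, points the NIC descriptors at the
 * netmap buffers. Hedged sketch (ring_nr, paddr and the descriptor
 * programming are driver specific):
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {	// ring is in native netmap mode
 *		for (i = 0; i < na->num_rx_desc; i++) {
 *			int si = netmap_idx_n2k(&na->rx_rings[ring_nr], i);
 *			uint64_t paddr;
 *			PNMB(na, slot + si, &paddr);
 *			// write paddr into rx descriptor i
 *		}
 *	}
 */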
2954 
2955 /*
2956  * Dispatch rx/tx interrupts to the netmap rings.
2957  *
2958  * "work_done" is non-null on the RX path, NULL for the TX path.
2959  * We rely on the OS to make sure that there is only one active
2960  * instance per queue, and that there is appropriate locking.
2961  *
2962  * The 'notify' routine depends on what the ring is attached to.
2963  * - for a netmap file descriptor, do a selwakeup on the individual
2964  *   waitqueue, plus one on the global one if needed
2965  *   (see netmap_notify)
2966  * - for a nic connected to a switch, call the proper forwarding routine
2967  *   (see netmap_bwrap_intr_notify)
2968  */
2969 void
2970 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2971 {
2972 	struct netmap_adapter *na = NA(ifp);
2973 	struct netmap_kring *kring;
2974 
2975 	q &= NETMAP_RING_MASK;
2976 
2977 	if (netmap_verbose) {
2978 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
2979 	}
2980 
2981 	if (work_done) { /* RX path */
2982 		if (q >= na->num_rx_rings)
2983 			return;	// not a physical queue
2984 		kring = na->rx_rings + q;
2985 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2986 		na->nm_notify(na, q, NR_RX, 0);
2987 		*work_done = 1; /* do not fire napi again */
2988 	} else { /* TX path */
2989 		if (q >= na->num_tx_rings)
2990 			return;	// not a physical queue
2991 		kring = na->tx_rings + q;
2992 		na->nm_notify(na, q, NR_TX, 0);
2993 	}
2994 }
2995 
2996 
2997 /*
2998  * Default functions to handle rx/tx interrupts from a physical device.
2999  * "work_done" is non-null on the RX path, NULL for the TX path.
3000  *
3001  * If the card is not in netmap mode, simply return 0,
3002  * so that the caller proceeds with regular processing.
3003  * Otherwise call netmap_common_irq() and return 1.
3004  *
3005  * If the card is connected to a netmap file descriptor,
3006  * do a selwakeup on the individual queue, plus one on the global one
3007  * if needed (multiqueue card _and_ there are multiqueue listeners),
3008  * and return 1.
3009  *
3010  * Finally, if called on rx from an interface connected to a switch,
3011  * calls the proper forwarding routine, and return 1.
3012  */
3013 int
3014 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3015 {
3016 	struct netmap_adapter *na = NA(ifp);
3017 
3018 	/*
3019 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3020 	 * we still use the regular driver even though the previous
3021 	 * check fails. It is unclear whether we should use
3022 	 * nm_native_on() here.
3023 	 */
3024 	if (!nm_netmap_on(na))
3025 		return 0;
3026 
3027 	if (na->na_flags & NAF_SKIP_INTR) {
3028 		ND("use regular interrupt");
3029 		return 0;
3030 	}
3031 
3032 	netmap_common_irq(ifp, q, work_done);
3033 	return 1;
3034 }
3035 
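/*
 * In a patched driver the call sits at the top of the rx interrupt/poll
 * routine, roughly (foo_* names are hypothetical):
 *
 *	static void
 *	foo_rxeof(struct foo_queue *que)
 *	{
 *		u_int work_done;
 *
 *		if (netmap_rx_irq(que->ifp, que->me, &work_done))
 *			return;		// handled by netmap, skip the mbuf path
 *		// ... normal mbuf processing ...
 *	}
 */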
3036 
3037 /*
3038  * Module loader and unloader
3039  *
3040  * netmap_init() creates the /dev/netmap device and initializes
3041  * all global variables. Returns 0 on success, errno on failure
3042  * (but there is no chance)
3043  * (though in practice it should not fail)
3044  * netmap_fini() destroys everything.
3045  */
3046 
3047 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3048 extern struct cdevsw netmap_cdevsw;
3049 
3050 
3051 void
3052 netmap_fini(void)
3053 {
3054 	// XXX destroy_bridges() ?
3055 	if (netmap_dev)
3056 		destroy_dev(netmap_dev);
3057 	netmap_mem_fini();
3058 	NMG_LOCK_DESTROY();
3059 	printf("netmap: unloaded module.\n");
3060 }
3061 
3062 
3063 int
3064 netmap_init(void)
3065 {
3066 	int error;
3067 
3068 	NMG_LOCK_INIT();
3069 
3070 	error = netmap_mem_init();
3071 	if (error != 0)
3072 		goto fail;
3073 	/*
3074 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3075 	 * when the module is compiled in.
3076 	 * XXX could use make_dev_credv() to get error number
3077 	 */
3078 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3079 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3080 			      "netmap");
3081 	if (!netmap_dev)
3082 		goto fail;
3083 
3084 	netmap_init_bridges();
3085 #ifdef __FreeBSD__
3086 	nm_vi_init_index();
3087 #endif
3088 	printf("netmap: loaded module\n");
3089 	return (0);
3090 fail:
3091 	netmap_fini();
3092 	return (EINVAL); /* may be incorrect */
3093 }
3094