xref: /freebsd/sys/dev/netmap/netmap.c (revision f02f7422801bb39f5eaab8fc383fa7b70c467ff9)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large, memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    select()able file descriptors on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
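 *
 *    A minimal userspace sketch of this sequence (illustrative fragment
 *    only, error handling omitted; the nmreq fields and the NETMAP_*
 *    helpers are the public netmap(4) API from <net/netmap_user.h>, not
 *    definitions from this file):
 *
 *	struct nmreq req;
 *	struct pollfd pfd;
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	bzero(&req, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	void *mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);					// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);	// step 4
 *	... fill slots between ring->head and ring->tail,
 *	    then advance ring->head and ring->cur ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	pfd.fd = fd; pfd.events = POLLOUT;
 *	poll(&pfd, 1, -1);					// step 6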
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   (protecting multiple active senders for the same destination).
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(), this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring or deleting a new port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
127 
128 /* --- internals ----
129  *
130  * Roadmap to the code that implements the above.
131  *
132  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133  * >    select()able file descriptors on which events are reported.
134  *
135  *  	Internally, we allocate a netmap_priv_d structure that will be
136  *  	initialized on ioctl(NIOCREGIF).
137  *
138  *      os-specific:
139  *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140  *  		     per-thread.
141  *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142  *  		     per-open.
143  *
144  * > 2. on each descriptor, the process issues an ioctl() to identify
145  * >    the interface that should report events to the file descriptor.
146  *
147  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148  * 	Most important things happen in netmap_get_na() and
149  * 	netmap_do_regif(), called from there. Additional details can be
150  * 	found in the comments above those functions.
151  *
152  * 	In all cases, this action creates/takes-a-reference-to a
153  * 	netmap_*_adapter describing the port, and allocates a netmap_if
154  * 	and all necessary netmap rings, filling them with netmap buffers.
155  *
156  *      In this phase, the sync callbacks for each ring are set (these are used
157  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158  *      The adapter creation/initialization code puts them in the
159  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162  * 	actually call netmap_krings_create() to perform this and the other
163  * 	common stuff. netmap_krings_create() also takes care of the host rings,
164  * 	if needed, by setting their sync callbacks appropriately.
165  *
166  * 	Additional actions depend on the kind of netmap_adapter that has been
167  * 	registered:
168  *
169  * 	- netmap_hw_adapter:  	     [netmap.c]
170  * 	     This is a system netdev/ifp with native netmap support.
171  * 	     The ifp is detached from the host stack by redirecting:
172  * 	       - transmissions (from the network stack) to netmap_transmit()
173  * 	       - receive notifications to the nm_notify() callback for
174  * 	         this adapter. The callback is normally netmap_notify(), unless
175  * 	         the ifp is attached to a bridge using bwrap, in which case it
176  * 	         is netmap_bwrap_intr_notify().
177  *
178  * 	- netmap_generic_adapter:      [netmap_generic.c]
179  * 	      A system netdev/ifp without native netmap support.
180  *
181  * 	(the decision about native/non native support is taken in
182  * 	 netmap_get_hw_na(), called by netmap_get_na())
183  *
184  * 	- netmap_vp_adapter 		[netmap_vale.c]
185  * 	      Returned by netmap_get_bdg_na().
186  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187  * 	      are created on the fly if they don't already exist, and are
188  * 	      always attached to a bridge.
189  * 	      Persistent VALE ports must be created separately, and
190  * 	      then attached like normal NICs. The NIOCREGIF we are examining
191  * 	      will find them only if they had previously been created and
192  * 	      attached (see VALE_CTL below).
193  *
194  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195  * 	      Returned by netmap_get_pipe_na().
196  * 	      Both pipe ends are created, if they didn't already exist.
197  *
198  * 	- netmap_monitor_adapter      [netmap_monitor.c]
199  * 	      Returned by netmap_get_monitor_na().
200  * 	      If successful, the nm_sync callbacks of the monitored adapter
201  * 	      will be intercepted by the returned monitor.
202  *
203  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204  * 	      Cannot be obtained in this way, see VALE_CTL below
205  *
206  *
207  * 	os-specific:
208  * 	    linux: we first go through linux_netmap_ioctl() to
209  * 	           adapt the FreeBSD interface to the linux one.
210  *
211  *
212  * > 3. on each descriptor, the process issues an mmap() request to
213  * >    map the shared memory region within the process' address space.
214  * >    The list of interesting queues is indicated by a location in
215  * >    the shared memory region.
216  *
217  *      os-specific:
218  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220  *
221  * > 4. using the functions in the netmap(4) userspace API, a process
222  * >    can look up the occupation state of a queue, access memory buffers,
223  * >    and retrieve received packets or enqueue packets to transmit.
224  *
225  * 	these actions do not involve the kernel.
226  *
227  * > 5. using some ioctl()s the process can synchronize the userspace view
228  * >    of the queue with the actual status in the kernel. This includes both
229  * >    receiving the notification of new packets, and transmitting new
230  * >    packets on the output interface.
231  *
232  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234  * 	structures, as initialized in step 2 and maybe later modified
235  * 	by a monitor. Monitors, however, will always call the original
236  * 	callback before doing anything else.
237  *
238  *
239  * > 6. select() or poll() can be used to wait for events on individual
240  * >    transmit or receive queues (or all queues for a given interface).
241  *
242  * 	Implemented in netmap_poll(). This will call the same nm_sync()
243  * 	callbacks as in step 5 above.
244  *
245  * 	os-specific:
246  * 		linux: we first go through linux_netmap_poll() to adapt
247  * 		       the FreeBSD interface to the linux one.
248  *
249  *
250  *  ----  VALE_CTL -----
251  *
252  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253  *  nr_cmd in the nmreq structure. These subcommands are handled by
254  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256  *  subcommands, respectively.
257  *
258  *  Any network interface known to the system (including a persistent VALE
259  *  port) can be attached to a VALE switch by issuing the
260  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262  *  attachment of other interfaces, instead, requires the creation of a
263  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264  *  netmap mode. This may require the creation of a netmap_generic_adapter if
265  *  we have no native support for the interface, or if generic adapters have
266  *  been forced by sysctl.
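 *
 *  As an illustration (the "vale0:port" name syntax is the netmap(4)/vale(4)
 *  convention and, like the nmreq fields, is not defined in this file),
 *  attaching a port to the switch "vale0" looks roughly like:
 *
 *	strncpy(req.nr_name, "vale0:myport", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &req);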
267  *
268  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270  *  callback.  In the case of the bwrap, the callback creates the
271  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274  *  A generic adapter for the wrapped ifp will be created if needed, when
275  *  netmap_get_bdg_na() calls netmap_get_hw_na().
276  *
277  *
278  *  ---- DATAPATHS -----
279  *
280  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281  *
282  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283  *
284  *    - tx from netmap userspace:
285  *	 concurrently:
286  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287  *                kring->nm_sync() == DEVICE_netmap_txsync()
288  *           2) device interrupt handler
289  *                na->nm_notify()  == netmap_notify()
290  *    - rx from netmap userspace:
291  *       concurrently:
292  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293  *                kring->nm_sync() == DEVICE_netmap_rxsync()
294  *           2) device interrupt handler
295  *                na->nm_notify()  == netmap_notify()
296  *    - tx from host stack
297  *       concurrently:
298  *           1) host stack
299  *                netmap_transmit()
300  *                  na->nm_notify  == netmap_notify()
301  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302  *                kring->nm_sync() == netmap_rxsync_from_host_compat
303  *                  netmap_rxsync_from_host(na, NULL, NULL)
304  *    - tx to host stack
305  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306  *             kring->nm_sync() == netmap_txsync_to_host_compat
307  *               netmap_txsync_to_host(na)
308  *                 NM_SEND_UP()
309  *                   FreeBSD: na->if_input() == ?? XXX
310  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311  *
312  *
313  *
314  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315  *
316  *
317  *
318  *                           -= VALE PORT =-
319  *
320  *
321  *
322  *                           -= NETMAP PIPE =-
323  *
324  *
325  *
326  *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
327  *
328  *
329  *
330  *  -= SYSTEM DEVICE WITH NATIVE SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
331  *
332  *
333  *
334  *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, NO HOST RINGS =-
335  *
336  *
337  *
338  *  -= SYSTEM DEVICE WITH GENERIC SUPPORT, CONNECTED TO VALE, WITH HOST RINGS =-
339  *
340  *
341  *
342  */
343 
344 /*
345  * OS-specific code that is used only within this file.
346  * Other OS-specific code that must be accessed by drivers
347  * is present in netmap_kern.h
348  */
349 
350 #if defined(__FreeBSD__)
351 #include <sys/cdefs.h> /* prerequisite */
352 #include <sys/types.h>
353 #include <sys/errno.h>
354 #include <sys/param.h>	/* defines used in kernel.h */
355 #include <sys/kernel.h>	/* types used in module initialization */
356 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
357 #include <sys/filio.h>	/* FIONBIO */
358 #include <sys/sockio.h>
359 #include <sys/socketvar.h>	/* struct socket */
360 #include <sys/malloc.h>
361 #include <sys/poll.h>
362 #include <sys/rwlock.h>
363 #include <sys/socket.h> /* sockaddrs */
364 #include <sys/selinfo.h>
365 #include <sys/sysctl.h>
366 #include <sys/jail.h>
367 #include <net/vnet.h>
368 #include <net/if.h>
369 #include <net/if_var.h>
370 #include <net/bpf.h>		/* BIOCIMMEDIATE */
371 #include <machine/bus.h>	/* bus_dmamap_* */
372 #include <sys/endian.h>
373 #include <sys/refcount.h>
374 
375 
376 /* reduce conditional code */
377 // linux API, use for the knlist in FreeBSD
378 #define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)
379 
380 void freebsd_selwakeup(struct selinfo *si, int pri);
381 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
382 
383 #elif defined(linux)
384 
385 #include "bsd_glue.h"
386 
387 
388 
389 #elif defined(__APPLE__)
390 
391 #warning OSX support is only partial
392 #include "osx_glue.h"
393 
394 #else
395 
396 #error	Unsupported platform
397 
398 #endif /* unsupported */
399 
400 /*
401  * common headers
402  */
403 #include <net/netmap.h>
404 #include <dev/netmap/netmap_kern.h>
405 #include <dev/netmap/netmap_mem2.h>
406 
407 
408 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
409 
410 /*
411  * The following variables are used by the drivers and replicate
412  * fields in the global memory pool. They only refer to buffers
413  * used by physical interfaces.
414  */
415 u_int netmap_total_buffers;
416 u_int netmap_buf_size;
417 char *netmap_buffer_base;	/* also address of an invalid buffer */
418 
419 /* user-controlled variables */
420 int netmap_verbose;
421 
422 static int netmap_no_timestamp; /* don't timestamp on rxsync */
423 
424 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
425 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
426     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
427 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
428     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
429 int netmap_mitigate = 1;
430 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
431 int netmap_no_pendintr = 1;
432 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
433     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
434 int netmap_txsync_retry = 2;
435 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
436     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
437 
438 int netmap_adaptive_io = 0;
439 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
440     &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
441 
442 int netmap_flags = 0;	/* debug flags */
443 int netmap_fwd = 0;	/* force transparent mode */
444 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
445 
446 /*
447  * netmap_admode selects the netmap mode to use.
448  * Invalid values are reset to NETMAP_ADMODE_BEST
449  */
450 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
451 	NETMAP_ADMODE_NATIVE,	/* either native or none */
452 	NETMAP_ADMODE_GENERIC,	/* force generic */
453 	NETMAP_ADMODE_LAST };
454 static int netmap_admode = NETMAP_ADMODE_BEST;
455 
456 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
457 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
458 int netmap_generic_rings = 1;   /* number of queues in generic. */
459 
460 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
461 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
462 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
463 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
464 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
465 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
466 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
467 
468 NMG_LOCK_T	netmap_global_lock;
469 
470 
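/*
 * Acquire exclusive use of a kring: sleep in short intervals until the
 * nr_busy flag can be set. The matching release is nm_kr_put().
 */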
471 static void
472 nm_kr_get(struct netmap_kring *kr)
473 {
474 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
475 		tsleep(kr, 0, "NM_KR_GET", 4);
476 }
477 
478 
479 /*
480  * mark the ring as stopped, and run through the locks
481  * to make sure other users get to see it.
482  */
483 static void
484 netmap_disable_ring(struct netmap_kring *kr)
485 {
486 	kr->nkr_stopped = 1;
487 	nm_kr_get(kr);
488 	mtx_lock(&kr->q_lock);
489 	mtx_unlock(&kr->q_lock);
490 	nm_kr_put(kr);
491 }
492 
493 /* stop or enable a single tx ring */
494 void
495 netmap_set_txring(struct netmap_adapter *na, u_int ring_id, int stopped)
496 {
497 	if (stopped)
498 		netmap_disable_ring(na->tx_rings + ring_id);
499 	else
500 		na->tx_rings[ring_id].nkr_stopped = 0;
501 	/* notify that the stopped state has changed. This is currently
502 	 * only used by bwrap to propagate the state to its own krings.
503 	 * (see netmap_bwrap_intr_notify).
504 	 */
505 	na->nm_notify(na, ring_id, NR_TX, NAF_DISABLE_NOTIFY);
506 }
507 
508 /* stop or enable a single rx ring */
509 void
510 netmap_set_rxring(struct netmap_adapter *na, u_int ring_id, int stopped)
511 {
512 	if (stopped)
513 		netmap_disable_ring(na->rx_rings + ring_id);
514 	else
515 		na->rx_rings[ring_id].nkr_stopped = 0;
516 	/* notify that the stopped state has changed. This is currently
517 	 * only used by bwrap to propagate the state to its own krings.
518 	 * (see netmap_bwrap_intr_notify).
519 	 */
520 	na->nm_notify(na, ring_id, NR_RX, NAF_DISABLE_NOTIFY);
521 }
522 
523 
524 /* stop or enable all the rings of na */
525 void
526 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
527 {
528 	int i;
529 	u_int ntx, nrx;
530 
531 	if (!nm_netmap_on(na))
532 		return;
533 
534 	ntx = netmap_real_tx_rings(na);
535 	nrx = netmap_real_rx_rings(na);
536 
537 	for (i = 0; i < ntx; i++) {
538 		netmap_set_txring(na, i, stopped);
539 	}
540 
541 	for (i = 0; i < nrx; i++) {
542 		netmap_set_rxring(na, i, stopped);
543 	}
544 }
545 
546 /*
547  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
548  * to finish and prevents any new one from starting.  Call this before turning
549  * netmap mode off, or before removing the hardware rings (e.g., on module
550  * unload).  As a rule of thumb for linux drivers, this should be placed near
551  * each napi_disable().
552  */
553 void
554 netmap_disable_all_rings(struct ifnet *ifp)
555 {
556 	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
557 }
558 
559 /*
560  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
561  * adapter's rings.  In linux drivers, this should be placed near each
562  * napi_enable().
563  */
564 void
565 netmap_enable_all_rings(struct ifnet *ifp)
566 {
567 	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
568 }
569 
570 
571 /*
572  * Generic bound-checking function: clamp *v into [lo, hi] (bumping it to dflt if below lo); log the change when msg is set.
573  */
574 u_int
575 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
576 {
577 	u_int oldv = *v;
578 	const char *op = NULL;
579 
580 	if (dflt < lo)
581 		dflt = lo;
582 	if (dflt > hi)
583 		dflt = hi;
584 	if (oldv < lo) {
585 		*v = dflt;
586 		op = "Bump";
587 	} else if (oldv > hi) {
588 		*v = hi;
589 		op = "Clamp";
590 	}
591 	if (op && msg)
592 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
593 	return *v;
594 }
595 
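/*
 * Hypothetical usage sketch (not an actual call site in this file):
 *
 *	u_int ringsize = requested_size;
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "ringsize");
 *
 * After the call, ringsize is guaranteed to lie in [64, 16384].
 */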
596 
597 /*
598  * packet-dump function, user-supplied or static buffer.
599  * The destination buffer must be at least 30+4*len bytes long.
600  */
601 const char *
602 nm_dump_buf(char *p, int len, int lim, char *dst)
603 {
604 	static char _dst[8192];
605 	int i, j, i0;
606 	static char hex[] ="0123456789abcdef";
607 	char *o;	/* output position */
608 
609 #define P_HI(x)	hex[((x) & 0xf0)>>4]
610 #define P_LO(x)	hex[((x) & 0xf)]
611 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
612 	if (!dst)
613 		dst = _dst;
614 	if (lim <= 0 || lim > len)
615 		lim = len;
616 	o = dst;
617 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
618 	o += strlen(o);
619 	/* hexdump routine */
620 	for (i = 0; i < lim; ) {
621 		sprintf(o, "%5d: ", i);
622 		o += strlen(o);
623 		memset(o, ' ', 48);
624 		i0 = i;
625 		for (j=0; j < 16 && i < lim; i++, j++) {
626 			o[j*3] = P_HI(p[i]);
627 			o[j*3+1] = P_LO(p[i]);
628 		}
629 		i = i0;
630 		for (j=0; j < 16 && i < lim; i++, j++)
631 			o[j + 48] = P_C(p[i]);
632 		o[j+48] = '\n';
633 		o += j+49;
634 	}
635 	*o = '\0';
636 #undef P_HI
637 #undef P_LO
638 #undef P_C
639 	return dst;
640 }
641 
642 
643 /*
644  * Fetch configuration from the device, to cope with dynamic
645  * reconfigurations after loading the module.
646  */
647 /* call with NMG_LOCK held */
648 int
649 netmap_update_config(struct netmap_adapter *na)
650 {
651 	u_int txr, txd, rxr, rxd;
652 
653 	txr = txd = rxr = rxd = 0;
654 	if (na->nm_config) {
655 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
656 	} else {
657 		/* take whatever we had at init time */
658 		txr = na->num_tx_rings;
659 		txd = na->num_tx_desc;
660 		rxr = na->num_rx_rings;
661 		rxd = na->num_rx_desc;
662 	}
663 
664 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
665 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
666 		return 0; /* nothing changed */
667 	if (netmap_verbose || na->active_fds > 0) {
668 		D("stored config %s: txring %d x %d, rxring %d x %d",
669 			na->name,
670 			na->num_tx_rings, na->num_tx_desc,
671 			na->num_rx_rings, na->num_rx_desc);
672 		D("new config %s: txring %d x %d, rxring %d x %d",
673 			na->name, txr, txd, rxr, rxd);
674 	}
675 	if (na->active_fds == 0) {
676 		D("configuration changed (but fine)");
677 		na->num_tx_rings = txr;
678 		na->num_tx_desc = txd;
679 		na->num_rx_rings = rxr;
680 		na->num_rx_desc = rxd;
681 		return 0;
682 	}
683 	D("configuration changed while active, this is bad...");
684 	return 1;
685 }
686 
687 /* kring->nm_sync callback for the host tx ring */
688 static int
689 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
690 {
691 	(void)flags; /* unused */
692 	netmap_txsync_to_host(kring->na);
693 	return 0;
694 }
695 
696 /* kring->nm_sync callback for the host rx ring */
697 static int
698 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
699 {
700 	(void)flags; /* unused */
701 	netmap_rxsync_from_host(kring->na, NULL, NULL);
702 	return 0;
703 }
704 
705 
706 
707 /* create the krings array and initialize the fields common to all adapters.
708  * The array layout is this:
709  *
710  *                    +----------+
711  * na->tx_rings ----->|          | \
712  *                    |          |  } na->num_tx_ring
713  *                    |          | /
714  *                    +----------+
715  *                    |          |    host tx kring
716  * na->rx_rings ----> +----------+
717  *                    |          | \
718  *                    |          |  } na->num_rx_rings
719  *                    |          | /
720  *                    +----------+
721  *                    |          |    host rx kring
722  *                    +----------+
723  * na->tailroom ----->|          | \
724  *                    |          |  } tailroom bytes
725  *                    |          | /
726  *                    +----------+
727  *
728  * Note: for compatibility, host krings are created even when not needed.
729  * The tailroom space is currently used by vale ports for allocating leases.
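 * In terms of indices, the host tx kring is na->tx_rings[na->num_tx_rings]
 * and the host rx kring is na->rx_rings[na->num_rx_rings]; this is what
 * netmap_txsync_to_host() and netmap_rxsync_from_host() below rely on.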
730  */
731 /* call with NMG_LOCK held */
732 int
733 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
734 {
735 	u_int i, len, ndesc;
736 	struct netmap_kring *kring;
737 	u_int ntx, nrx;
738 
739 	/* account for the (possibly fake) host rings */
740 	ntx = na->num_tx_rings + 1;
741 	nrx = na->num_rx_rings + 1;
742 
743 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
744 
745 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
746 	if (na->tx_rings == NULL) {
747 		D("Cannot allocate krings");
748 		return ENOMEM;
749 	}
750 	na->rx_rings = na->tx_rings + ntx;
751 
752 	/*
753 	 * All fields in krings are 0 except the ones initialized below,
754 	 * but it is better to be explicit on important kring fields.
755 	 */
756 	ndesc = na->num_tx_desc;
757 	for (i = 0; i < ntx; i++) { /* Transmit rings */
758 		kring = &na->tx_rings[i];
759 		bzero(kring, sizeof(*kring));
760 		kring->na = na;
761 		kring->ring_id = i;
762 		kring->nkr_num_slots = ndesc;
763 		if (i < na->num_tx_rings) {
764 			kring->nm_sync = na->nm_txsync;
765 		} else if (i == na->num_tx_rings) {
766 			kring->nm_sync = netmap_txsync_to_host_compat;
767 		}
768 		/*
769 		 * IMPORTANT: Always keep one slot empty.
770 		 */
771 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
772 		kring->rtail = kring->nr_hwtail = ndesc - 1;
773 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", na->name, i);
774 		ND("ktx %s h %d c %d t %d",
775 			kring->name, kring->rhead, kring->rcur, kring->rtail);
776 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
777 		init_waitqueue_head(&kring->si);
778 	}
779 
780 	ndesc = na->num_rx_desc;
781 	for (i = 0; i < nrx; i++) { /* Receive rings */
782 		kring = &na->rx_rings[i];
783 		bzero(kring, sizeof(*kring));
784 		kring->na = na;
785 		kring->ring_id = i;
786 		kring->nkr_num_slots = ndesc;
787 		if (i < na->num_rx_rings) {
788 			kring->nm_sync = na->nm_rxsync;
789 		} else if (i == na->num_rx_rings) {
790 			kring->nm_sync = netmap_rxsync_from_host_compat;
791 		}
792 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
793 		kring->rtail = kring->nr_hwtail = 0;
794 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", na->name, i);
795 		ND("krx %s h %d c %d t %d",
796 			kring->name, kring->rhead, kring->rcur, kring->rtail);
797 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
798 		init_waitqueue_head(&kring->si);
799 	}
800 	init_waitqueue_head(&na->tx_si);
801 	init_waitqueue_head(&na->rx_si);
802 
803 	na->tailroom = na->rx_rings + nrx;
804 
805 	return 0;
806 }
807 
808 
809 /* undo the actions performed by netmap_krings_create */
810 /* call with NMG_LOCK held */
811 void
812 netmap_krings_delete(struct netmap_adapter *na)
813 {
814 	struct netmap_kring *kring = na->tx_rings;
815 
816 	/* we rely on the krings layout described above */
817 	for ( ; kring != na->tailroom; kring++) {
818 		mtx_destroy(&kring->q_lock);
819 	}
820 	free(na->tx_rings, M_DEVBUF);
821 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
822 }
823 
824 
825 /*
826  * Destructor for NIC ports. They also have an mbuf queue
827  * on the rings connected to the host so we need to purge
828  * them first.
829  */
830 /* call with NMG_LOCK held */
831 static void
832 netmap_hw_krings_delete(struct netmap_adapter *na)
833 {
834 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
835 
836 	ND("destroy sw mbq with len %d", mbq_len(q));
837 	mbq_purge(q);
838 	mbq_safe_destroy(q);
839 	netmap_krings_delete(na);
840 }
841 
842 
843 /* create a new netmap_if for a newly registered fd.
844  * If this is the first registration of the adapter,
845  * also create the netmap rings and their in-kernel view,
846  * the netmap krings.
847  */
848 /* call with NMG_LOCK held */
849 static struct netmap_if*
850 netmap_if_new(struct netmap_adapter *na)
851 {
852 	struct netmap_if *nifp;
853 
854 	if (netmap_update_config(na)) {
855 		/* configuration mismatch, report and fail */
856 		return NULL;
857 	}
858 
859 	if (na->active_fds)	/* already registered */
860 		goto final;
861 
862 	/* create and init the krings arrays.
863 	 * Depending on the adapter, this may also create
864 	 * the netmap rings themselves
865 	 */
866 	if (na->nm_krings_create(na))
867 		return NULL;
868 
869 	/* create all missing netmap rings */
870 	if (netmap_mem_rings_create(na))
871 		goto cleanup;
872 
873 final:
874 
875 	/* in all cases, create a new netmap if */
876 	nifp = netmap_mem_if_new(na);
877 	if (nifp == NULL)
878 		goto cleanup;
879 
880 	return (nifp);
881 
882 cleanup:
883 
884 	if (na->active_fds == 0) {
885 		netmap_mem_rings_delete(na);
886 		na->nm_krings_delete(na);
887 	}
888 
889 	return NULL;
890 }
891 
892 
893 /* grab a reference to the memory allocator, if we don't have one already.  The
894  * reference is taken from the netmap_adapter registered with the priv.
895  */
896 /* call with NMG_LOCK held */
897 static int
898 netmap_get_memory_locked(struct netmap_priv_d* p)
899 {
900 	struct netmap_mem_d *nmd;
901 	int error = 0;
902 
903 	if (p->np_na == NULL) {
904 		if (!netmap_mmap_unreg)
905 			return ENODEV;
906 		/* for compatibility with older versions of the API
907  		 * we use the global allocator when no interface has been
908  		 * registered
909  		 */
910 		nmd = &nm_mem;
911 	} else {
912 		nmd = p->np_na->nm_mem;
913 	}
914 	if (p->np_mref == NULL) {
915 		error = netmap_mem_finalize(nmd, p->np_na);
916 		if (!error)
917 			p->np_mref = nmd;
918 	} else if (p->np_mref != nmd) {
919 		/* a virtual port has been registered, but previous
920  		 * syscalls already used the global allocator.
921  		 * We cannot continue
922  		 */
923 		error = ENODEV;
924 	}
925 	return error;
926 }
927 
928 
929 /* call with NMG_LOCK *not* held */
930 int
931 netmap_get_memory(struct netmap_priv_d* p)
932 {
933 	int error;
934 	NMG_LOCK();
935 	error = netmap_get_memory_locked(p);
936 	NMG_UNLOCK();
937 	return error;
938 }
939 
940 
941 /* call with NMG_LOCK held */
942 static int
943 netmap_have_memory_locked(struct netmap_priv_d* p)
944 {
945 	return p->np_mref != NULL;
946 }
947 
948 
949 /* call with NMG_LOCK held */
950 static void
951 netmap_drop_memory_locked(struct netmap_priv_d* p)
952 {
953 	if (p->np_mref) {
954 		netmap_mem_deref(p->np_mref, p->np_na);
955 		p->np_mref = NULL;
956 	}
957 }
958 
959 
960 /*
961  * Call nm_register(na, 0) to stop netmap mode on the interface and
962  * revert to normal operation.
963  * The second argument is the nifp to work on. In some cases it is
964  * not attached yet to the netmap_priv_d so we need to pass it as
965  * a separate argument.
966  */
967 /* call with NMG_LOCK held */
968 static void
969 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
970 {
971 	struct netmap_adapter *na = priv->np_na;
972 
973 	NMG_LOCK_ASSERT();
974 	na->active_fds--;
975 	if (na->active_fds <= 0) {	/* last instance */
976 
977 		if (netmap_verbose)
978 			D("deleting last instance for %s", na->name);
979 		/*
980 		 * (TO CHECK) This function is only called
981 		 * when the last reference to this file descriptor goes
982 		 * away. This means we cannot have any pending poll()
983 		 * or interrupt routine operating on the structure.
984 		 * XXX The file may be closed in a thread while
985 		 * another thread is using it.
986 		 * Linux keeps the file opened until the last reference
987 		 * by any outstanding ioctl/poll or mmap is gone.
988 		 * FreeBSD does not track mmap()s (but we do) and
989 		 * wakes up any sleeping poll(). Need to check what
990 		 * happens if the close() occurs while a concurrent
991 		 * syscall is running.
992 		 */
993 		na->nm_register(na, 0); /* off, clear flags */
994 		/* Wake up any sleeping threads. netmap_poll will
995 		 * then return POLLERR
996 		 * XXX The wake up now must happen during *_down(), when
997 		 * we order all activities to stop. -gl
998 		 */
999 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
1000 		/* knlist_destroy(&na->tx_si.si_note); */
1001 		/* knlist_destroy(&na->rx_si.si_note); */
1002 
1003 		/* delete rings and buffers */
1004 		netmap_mem_rings_delete(na);
1005 		na->nm_krings_delete(na);
1006 	}
1007 	/* delete the nifp */
1008 	netmap_mem_if_delete(na, nifp);
1009 }
1010 
1011 /* call with NMG_LOCK held */
1012 static __inline int
1013 nm_tx_si_user(struct netmap_priv_d *priv)
1014 {
1015 	return (priv->np_na != NULL &&
1016 		(priv->np_txqlast - priv->np_txqfirst > 1));
1017 }
1018 
1019 /* call with NMG_LOCK held */
1020 static __inline int
1021 nm_rx_si_user(struct netmap_priv_d *priv)
1022 {
1023 	return (priv->np_na != NULL &&
1024 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
1025 }
1026 
1027 
1028 /*
1029  * Destructor of the netmap_priv_d, called when the fd has
1030  * no active open() and mmap(). Also called in error paths.
1031  *
1032  * returns 1 if this is the last instance and we can free priv
1033  */
1034 /* call with NMG_LOCK held */
1035 int
1036 netmap_dtor_locked(struct netmap_priv_d *priv)
1037 {
1038 	struct netmap_adapter *na = priv->np_na;
1039 
1040 #ifdef __FreeBSD__
1041 	/*
1042 	 * np_refcount is the number of active mmaps on
1043 	 * this file descriptor
1044 	 */
1045 	if (--priv->np_refcount > 0) {
1046 		return 0;
1047 	}
1048 #endif /* __FreeBSD__ */
1049 	if (!na) {
1050 	    return 1; //XXX is it correct?
1051 	}
1052 	netmap_do_unregif(priv, priv->np_nifp);
1053 	priv->np_nifp = NULL;
1054 	netmap_drop_memory_locked(priv);
1055 	if (priv->np_na) {
1056 		if (nm_tx_si_user(priv))
1057 			na->tx_si_users--;
1058 		if (nm_rx_si_user(priv))
1059 			na->rx_si_users--;
1060 		netmap_adapter_put(na);
1061 		priv->np_na = NULL;
1062 	}
1063 	return 1;
1064 }
1065 
1066 
1067 /* call with NMG_LOCK *not* held */
1068 void
1069 netmap_dtor(void *data)
1070 {
1071 	struct netmap_priv_d *priv = data;
1072 	int last_instance;
1073 
1074 	NMG_LOCK();
1075 	last_instance = netmap_dtor_locked(priv);
1076 	NMG_UNLOCK();
1077 	if (last_instance) {
1078 		bzero(priv, sizeof(*priv));	/* for safety */
1079 		free(priv, M_DEVBUF);
1080 	}
1081 }
1082 
1083 
1084 
1085 
1086 /*
1087  * Handlers for synchronization of the queues from/to the host.
1088  * Netmap has two operating modes:
1089  * - in the default mode, the rings connected to the host stack are
1090  *   just another ring pair managed by userspace;
1091  * - in transparent mode (XXX to be defined) incoming packets
1092  *   (from the host or the NIC) are marked as NS_FORWARD upon
1093  *   arrival, and the user application has a chance to reset the
1094  *   flag for packets that should be dropped.
1095  *   On the RXSYNC or poll(), packets in RX rings between
1096  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1097  *   to the other side.
1098  * The transfer NIC --> host is relatively easy, just encapsulate
1099  * into mbufs and we are done. The host --> NIC side is slightly
1100  * harder because there might not be room in the tx ring so it
1101  * might take a while before releasing the buffer.
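 *
 * Forwarding can be requested globally (the netmap_fwd variable, exposed
 * as the dev.netmap.fwd sysctl) or per ring, by setting NR_FORWARD in
 * ring->flags and NS_FORWARD on individual slots; see netmap_grab_packets()
 * and netmap_sw_to_nic() below.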
1102  */
1103 
1104 
1105 /*
1106  * pass a chain of buffers to the host stack as coming from 'dst'
1107  * We do not need to lock because the queue is private.
1108  */
1109 static void
1110 netmap_send_up(struct ifnet *dst, struct mbq *q)
1111 {
1112 	struct mbuf *m;
1113 
1114 	/* send packets up, outside the lock */
1115 	while ((m = mbq_dequeue(q)) != NULL) {
1116 		if (netmap_verbose & NM_VERB_HOST)
1117 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1118 		NM_SEND_UP(dst, m);
1119 	}
1120 	mbq_destroy(q);
1121 }
1122 
1123 
1124 /*
1125  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1126  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1127  * and pass them up. Drop remaining packets in the unlikely event
1128  * of an mbuf shortage.
1129  */
1130 static void
1131 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1132 {
1133 	u_int const lim = kring->nkr_num_slots - 1;
1134 	u_int const head = kring->ring->head;
1135 	u_int n;
1136 	struct netmap_adapter *na = kring->na;
1137 
1138 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1139 		struct mbuf *m;
1140 		struct netmap_slot *slot = &kring->ring->slot[n];
1141 
1142 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1143 			continue;
1144 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1145 			RD(5, "bad pkt at %d len %d", n, slot->len);
1146 			continue;
1147 		}
1148 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1149 		/* XXX TODO: adapt to the case of a multisegment packet */
1150 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1151 
1152 		if (m == NULL)
1153 			break;
1154 		mbq_enqueue(q, m);
1155 	}
1156 }
1157 
1158 
1159 /*
1160  * Send to the NIC rings packets marked NS_FORWARD between
1161  * kring->nr_hwcur and kring->rhead
1162  * Called under kring->rx_queue.lock on the sw rx ring,
1163  */
1164 static u_int
1165 netmap_sw_to_nic(struct netmap_adapter *na)
1166 {
1167 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1168 	struct netmap_slot *rxslot = kring->ring->slot;
1169 	u_int i, rxcur = kring->nr_hwcur;
1170 	u_int const head = kring->rhead;
1171 	u_int const src_lim = kring->nkr_num_slots - 1;
1172 	u_int sent = 0;
1173 
1174 	/* scan rings to find space, then fill as much as possible */
1175 	for (i = 0; i < na->num_tx_rings; i++) {
1176 		struct netmap_kring *kdst = &na->tx_rings[i];
1177 		struct netmap_ring *rdst = kdst->ring;
1178 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1179 
1180 		/* XXX do we trust ring or kring->rcur,rtail ? */
1181 		for (; rxcur != head && !nm_ring_empty(rdst);
1182 		     rxcur = nm_next(rxcur, src_lim) ) {
1183 			struct netmap_slot *src, *dst, tmp;
1184 			u_int dst_cur = rdst->cur;
1185 
1186 			src = &rxslot[rxcur];
1187 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1188 				continue;
1189 
1190 			sent++;
1191 
1192 			dst = &rdst->slot[dst_cur];
1193 
1194 			tmp = *src;
1195 
1196 			src->buf_idx = dst->buf_idx;
1197 			src->flags = NS_BUF_CHANGED;
1198 
1199 			dst->buf_idx = tmp.buf_idx;
1200 			dst->len = tmp.len;
1201 			dst->flags = NS_BUF_CHANGED;
1202 
1203 			rdst->cur = nm_next(dst_cur, dst_lim);
1204 		}
1205 		/* if (sent) XXX txsync ? */
1206 	}
1207 	return sent;
1208 }
1209 
1210 
1211 /*
1212  * netmap_txsync_to_host() passes packets up. We are called from a
1213  * system call in user process context, and the only contention
1214  * can be among multiple user threads erroneously calling
1215  * this routine concurrently.
1216  */
1217 void
1218 netmap_txsync_to_host(struct netmap_adapter *na)
1219 {
1220 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1221 	struct netmap_ring *ring = kring->ring;
1222 	u_int const lim = kring->nkr_num_slots - 1;
1223 	u_int const head = kring->rhead;
1224 	struct mbq q;
1225 
1226 	/* Take packets from hwcur to head and pass them up.
1227 	 * force ring->cur = head since netmap_grab_packets() stops at head.
1228 	 * In case of no buffers we give up. At the end of the loop,
1229 	 * the queue is drained in all cases.
1230 	 */
1231 	mbq_init(&q);
1232 	ring->cur = head;
1233 	netmap_grab_packets(kring, &q, 1 /* force */);
1234 	ND("have %d pkts in queue", mbq_len(&q));
1235 	kring->nr_hwcur = head;
1236 	kring->nr_hwtail = head + lim;
1237 	if (kring->nr_hwtail > lim)
1238 		kring->nr_hwtail -= lim + 1;
1239 	nm_txsync_finalize(kring);
1240 
1241 	netmap_send_up(na->ifp, &q);
1242 }
1243 
1244 
1245 /*
1246  * rxsync backend for packets coming from the host stack.
1247  * They have been put in kring->rx_queue by netmap_transmit().
1248  * We protect access to the kring using kring->rx_queue.lock
1249  *
1250  * This routine also does the selrecord if called from the poll handler
1251  * (we know because td != NULL).
1252  *
1253  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1254  *     as an additional hidden argument.
1255  * returns the number of packets delivered to tx queues in
1256  * transparent mode, or a negative value if error
1257  */
1258 int
1259 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1260 {
1261 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1262 	struct netmap_ring *ring = kring->ring;
1263 	u_int nm_i, n;
1264 	u_int const lim = kring->nkr_num_slots - 1;
1265 	u_int const head = kring->rhead;
1266 	int ret = 0;
1267 	struct mbq *q = &kring->rx_queue;
1268 
1269 	(void)pwait;	/* disable unused warnings */
1270 	(void)td;
1271 
1272 	mbq_lock(q);
1273 
1274 	/* First part: import newly received packets */
1275 	n = mbq_len(q);
1276 	if (n) { /* grab packets from the queue */
1277 		struct mbuf *m;
1278 		uint32_t stop_i;
1279 
1280 		nm_i = kring->nr_hwtail;
1281 		stop_i = nm_prev(nm_i, lim);
1282 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1283 			int len = MBUF_LEN(m);
1284 			struct netmap_slot *slot = &ring->slot[nm_i];
1285 
1286 			m_copydata(m, 0, len, NMB(na, slot));
1287 			ND("nm %d len %d", nm_i, len);
1288 			if (netmap_verbose)
1289                                 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1290 
1291 			slot->len = len;
1292 			slot->flags = kring->nkr_slot_flags;
1293 			nm_i = nm_next(nm_i, lim);
1294 			m_freem(m);
1295 		}
1296 		kring->nr_hwtail = nm_i;
1297 	}
1298 
1299 	/*
1300 	 * Second part: skip past packets that userspace has released.
1301 	 */
1302 	nm_i = kring->nr_hwcur;
1303 	if (nm_i != head) { /* something was released */
1304 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1305 			ret = netmap_sw_to_nic(na);
1306 		kring->nr_hwcur = head;
1307 	}
1308 
1309 	nm_rxsync_finalize(kring);
1310 
1311 	/* access copies of cur,tail in the kring */
1312 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1313 		selrecord(td, &kring->si);
1314 
1315 	mbq_unlock(q);
1316 	return ret;
1317 }
1318 
1319 
1320 /* Get a netmap adapter for the port.
1321  *
1322  * If it is possible to satisfy the request, return 0
1323  * with *na containing the netmap adapter found.
1324  * Otherwise return an error code, with *na containing NULL.
1325  *
1326  * When the port is attached to a bridge, we always return
1327  * EBUSY.
1328  * Otherwise, if the port is already bound to a file descriptor,
1329  * then we unconditionally return the existing adapter into *na.
1330  * In all the other cases, we return (into *na) either native,
1331  * generic or NULL, according to the following table:
1332  *
1333  *					native_support
1334  * active_fds   dev.netmap.admode         YES     NO
1335  * -------------------------------------------------------
1336  *    >0              *                 NA(ifp) NA(ifp)
1337  *
1338  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1339  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1340  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
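 *
 * For example, setting the dev.netmap.admode sysctl to NETMAP_ADMODE_GENERIC
 * (2) normally forces a generic adapter even on ports with native support,
 * as long as the port has no active file descriptors.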
1341  *
1342  */
1343 
1344 int
1345 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1346 {
1347 	/* generic support */
1348 	int i = netmap_admode;	/* Take a snapshot. */
1349 	int error = 0;
1350 	struct netmap_adapter *prev_na;
1351 	struct netmap_generic_adapter *gna;
1352 
1353 	*na = NULL; /* default */
1354 
1355 	/* reset in case of invalid value */
1356 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1357 		i = netmap_admode = NETMAP_ADMODE_BEST;
1358 
1359 	if (NETMAP_CAPABLE(ifp)) {
1360 		prev_na = NA(ifp);
1361 		/* If an adapter already exists, return it if
1362 		 * there are active file descriptors or if
1363 		 * netmap is not forced to use generic
1364 		 * adapters.
1365 		 */
1366 		if (NETMAP_OWNED_BY_ANY(prev_na)
1367 			|| i != NETMAP_ADMODE_GENERIC
1368 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1369 #ifdef WITH_PIPES
1370 			/* ugly, but we cannot allow an adapter switch
1371 			 * if some pipe is referring to this one
1372 			 */
1373 			|| prev_na->na_next_pipe > 0
1374 #endif
1375 		) {
1376 			*na = prev_na;
1377 			return 0;
1378 		}
1379 	}
1380 
1381 	/* If there isn't native support and netmap is not allowed
1382 	 * to use generic adapters, we cannot satisfy the request.
1383 	 */
1384 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1385 		return EOPNOTSUPP;
1386 
1387 	/* Otherwise, create a generic adapter and return it,
1388 	 * saving the previously used netmap adapter, if any.
1389 	 *
1390 	 * Note that here 'prev_na', if not NULL, MUST be a
1391 	 * native adapter, and CANNOT be a generic one. This is
1392 	 * true because generic adapters are created on demand, and
1393 	 * destroyed when not used anymore. Therefore, if the adapter
1394 	 * currently attached to an interface 'ifp' is generic, it
1395 	 * must be that
1396 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1397 	 * Consequently, if NA(ifp) is generic, we will enter one of
1398 	 * the branches above. This ensures that we never override
1399 	 * a generic adapter with another generic adapter.
1400 	 */
1401 	prev_na = NA(ifp);
1402 	error = generic_netmap_attach(ifp);
1403 	if (error)
1404 		return error;
1405 
1406 	*na = NA(ifp);
1407 	gna = (struct netmap_generic_adapter*)NA(ifp);
1408 	gna->prev = prev_na; /* save old na */
1409 	if (prev_na != NULL) {
1410 		ifunit_ref(ifp->if_xname);
1411 		// XXX add a refcount ?
1412 		netmap_adapter_get(prev_na);
1413 	}
1414 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1415 
1416 	return 0;
1417 }
1418 
1419 
1420 /*
1421  * MUST BE CALLED UNDER NMG_LOCK()
1422  *
1423  * Get a refcounted reference to a netmap adapter attached
1424  * to the interface specified by nmr.
1425  * This is always called in the execution of an ioctl().
1426  *
1427  * Return ENXIO if the interface specified by the request does
1428  * not exist, ENOTSUP if netmap is not supported by the interface,
1429  * EBUSY if the interface is already attached to a bridge,
1430  * EINVAL if parameters are invalid, ENOMEM if needed resources
1431  * could not be allocated.
1432  * If successful, hold a reference to the netmap adapter.
1433  *
1434  * No reference is kept on the real interface, which may then
1435  * disappear at any time.
1436  */
1437 int
1438 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1439 {
1440 	struct ifnet *ifp = NULL;
1441 	int error = 0;
1442 	struct netmap_adapter *ret = NULL;
1443 
1444 	*na = NULL;     /* default return value */
1445 
1446 	NMG_LOCK_ASSERT();
1447 
1448 	/* we cascade through all possible types of netmap adapter.
1449 	 * All netmap_get_*_na() functions return an error and an na,
1450 	 * with the following combinations:
1451 	 *
1452 	 * error    na
1453 	 *   0	   NULL		type doesn't match
1454 	 *  !0	   NULL		type matches, but na creation/lookup failed
1455 	 *   0	  !NULL		type matches and na created/found
1456 	 *  !0    !NULL		impossible
1457 	 */
1458 
1459 	/* try to see if this is a monitor port */
1460 	error = netmap_get_monitor_na(nmr, na, create);
1461 	if (error || *na != NULL)
1462 		return error;
1463 
1464 	/* try to see if this is a pipe port */
1465 	error = netmap_get_pipe_na(nmr, na, create);
1466 	if (error || *na != NULL)
1467 		return error;
1468 
1469 	/* try to see if this is a bridge port */
1470 	error = netmap_get_bdg_na(nmr, na, create);
1471 	if (error)
1472 		return error;
1473 
1474 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1475 		goto pipes;
1476 
1477 	/*
1478 	 * This must be a hardware na, lookup the name in the system.
1479 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1480 	 * This may still be a tap, a veth/epair, or even a
1481 	 * persistent VALE port.
1482 	 */
1483 	ifp = ifunit_ref(nmr->nr_name);
1484 	if (ifp == NULL) {
1485 	        return ENXIO;
1486 	}
1487 
1488 	error = netmap_get_hw_na(ifp, &ret);
1489 	if (error)
1490 		goto out;
1491 
1492 	*na = ret;
1493 	netmap_adapter_get(ret);
1494 
1495 pipes:
1496 	/*
1497 	 * If we are opening a pipe whose parent was not in netmap mode,
1498 	 * we have to allocate the pipe array now.
1499 	 * XXX get rid of this clumsiness (2014-03-15)
1500 	 */
1501 	error = netmap_pipe_alloc(*na, nmr);
1502 
1503 out:
1504 	if (error && ret != NULL)
1505 		netmap_adapter_put(ret);
1506 
1507 	if (ifp)
1508 		if_rele(ifp); /* allow live unloading of drivers modules */
1509 
1510 	return error;
1511 }
1512 
1513 
1514 /*
1515  * validate parameters on entry for *_txsync()
1516  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1517  * in case of error.
1518  *
1519  * rhead, rcur and rtail=hwtail are stored from previous round.
1520  * hwcur is the next packet to send to the ring.
1521  *
1522  * We want
1523  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1524  *
1525  * hwcur, rhead, rtail and hwtail are reliable
1526  */
1527 u_int
1528 nm_txsync_prologue(struct netmap_kring *kring)
1529 {
1530 	struct netmap_ring *ring = kring->ring;
1531 	u_int head = ring->head; /* read only once */
1532 	u_int cur = ring->cur; /* read only once */
1533 	u_int n = kring->nkr_num_slots;
1534 
1535 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1536 		kring->name,
1537 		kring->nr_hwcur, kring->nr_hwtail,
1538 		ring->head, ring->cur, ring->tail);
1539 #if 1 /* kernel sanity checks; but we can trust the kring. */
1540 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1541 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1542 		goto error;
1543 #endif /* kernel sanity checks */
1544 	/*
1545 	 * user sanity checks. We only use 'cur',
1546 	 * A, B, ... are possible positions for cur:
1547 	 *
1548 	 *  0    A  cur   B  tail  C  n-1
1549 	 *  0    D  tail  E  cur   F  n-1
1550 	 *
1551 	 * B, F, D are valid. A, C, E are wrong
1552 	 */
1553 	if (kring->rtail >= kring->rhead) {
1554 		/* want rhead <= head <= rtail */
1555 		if (head < kring->rhead || head > kring->rtail)
1556 			goto error;
1557 		/* and also head <= cur <= rtail */
1558 		if (cur < head || cur > kring->rtail)
1559 			goto error;
1560 	} else { /* here rtail < rhead */
1561 		/* we need head outside rtail .. rhead */
1562 		if (head > kring->rtail && head < kring->rhead)
1563 			goto error;
1564 
1565 		/* two cases now: head <= rtail or head >= rhead  */
1566 		if (head <= kring->rtail) {
1567 			/* want head <= cur <= rtail */
1568 			if (cur < head || cur > kring->rtail)
1569 				goto error;
1570 		} else { /* head >= rhead */
1571 			/* cur must be outside rtail..head */
1572 			if (cur > kring->rtail && cur < head)
1573 				goto error;
1574 		}
1575 	}
1576 	if (ring->tail != kring->rtail) {
1577 		RD(5, "tail overwritten was %d need %d",
1578 			ring->tail, kring->rtail);
1579 		ring->tail = kring->rtail;
1580 	}
1581 	kring->rhead = head;
1582 	kring->rcur = cur;
1583 	return head;
1584 
1585 error:
1586 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1587 		kring->name,
1588 		kring->nr_hwcur,
1589 		kring->rcur, kring->nr_hwtail,
1590 		cur, ring->tail);
1591 	return n;
1592 }
1593 
1594 
1595 /*
1596  * validate parameters on entry for *_rxsync()
1597  * Returns ring->head if ok, kring->nkr_num_slots on error.
1598  *
1599  * For a valid configuration,
1600  * hwcur <= head <= cur <= tail <= hwtail
1601  *
1602  * We only consider head and cur.
1603  * hwcur and hwtail are reliable.
1604  *
1605  */
1606 u_int
1607 nm_rxsync_prologue(struct netmap_kring *kring)
1608 {
1609 	struct netmap_ring *ring = kring->ring;
1610 	uint32_t const n = kring->nkr_num_slots;
1611 	uint32_t head, cur;
1612 
1613 	ND("%s kc %d kt %d h %d c %d t %d",
1614 		kring->name,
1615 		kring->nr_hwcur, kring->nr_hwtail,
1616 		ring->head, ring->cur, ring->tail);
1617 	/*
1618 	 * Before storing the new values, we should check they do not
1619 	 * move backwards. However:
1620 	 * - head is not an issue because the previous value is hwcur;
1621 	 * - cur could in principle go back, however it does not matter
1622 	 *   because we are processing a brand new rxsync()
1623 	 */
1624 	cur = kring->rcur = ring->cur;	/* read only once */
1625 	head = kring->rhead = ring->head;	/* read only once */
1626 #if 1 /* kernel sanity checks */
1627 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1628 		goto error;
1629 #endif /* kernel sanity checks */
1630 	/* user sanity checks */
1631 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1632 		/* want hwcur <= rhead <= hwtail */
1633 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1634 			goto error;
1635 		/* and also rhead <= rcur <= hwtail */
1636 		if (cur < head || cur > kring->nr_hwtail)
1637 			goto error;
1638 	} else {
1639 		/* we need rhead outside hwtail..hwcur */
1640 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1641 			goto error;
1642 		/* two cases now: head <= hwtail or head >= hwcur  */
1643 		if (head <= kring->nr_hwtail) {
1644 			/* want head <= cur <= hwtail */
1645 			if (cur < head || cur > kring->nr_hwtail)
1646 				goto error;
1647 		} else {
1648 			/* cur must be outside hwtail..head */
1649 			if (cur < head && cur > kring->nr_hwtail)
1650 				goto error;
1651 		}
1652 	}
1653 	if (ring->tail != kring->rtail) {
1654 		RD(5, "%s tail overwritten was %d need %d",
1655 			kring->name,
1656 			ring->tail, kring->rtail);
1657 		ring->tail = kring->rtail;
1658 	}
1659 	return head;
1660 
1661 error:
1662 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1663 		kring->nr_hwcur,
1664 		kring->rcur, kring->nr_hwtail,
1665 		kring->rhead, kring->rcur, ring->tail);
1666 	return n;
1667 }
1668 
1669 
1670 /*
1671  * Error routine called when txsync/rxsync detects an error.
1672  * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
1673  * Return 1 on reinit.
1674  *
1675  * This routine is only called by the upper half of the kernel.
1676  * It only reads hwcur (which is changed only by the upper half, too)
1677  * and hwtail (which may be changed by the lower half, but only on
1678  * a tx ring and only to increase it, so any error will be recovered
1679  * on the next call). For the above, we don't strictly need to call
1680  * it under lock.
1681  */
1682 int
1683 netmap_ring_reinit(struct netmap_kring *kring)
1684 {
1685 	struct netmap_ring *ring = kring->ring;
1686 	u_int i, lim = kring->nkr_num_slots - 1;
1687 	int errors = 0;
1688 
1689 	// XXX KASSERT nm_kr_tryget
1690 	RD(10, "called for %s", kring->name);
1691 	// XXX probably wrong to trust userspace
1692 	kring->rhead = ring->head;
1693 	kring->rcur  = ring->cur;
1694 	kring->rtail = ring->tail;
1695 
1696 	if (ring->cur > lim)
1697 		errors++;
1698 	if (ring->head > lim)
1699 		errors++;
1700 	if (ring->tail > lim)
1701 		errors++;
1702 	for (i = 0; i <= lim; i++) {
1703 		u_int idx = ring->slot[i].buf_idx;
1704 		u_int len = ring->slot[i].len;
1705 		if (idx < 2 || idx >= netmap_total_buffers) {
1706 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1707 			ring->slot[i].buf_idx = 0;
1708 			ring->slot[i].len = 0;
1709 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1710 			ring->slot[i].len = 0;
1711 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1712 		}
1713 	}
1714 	if (errors) {
1715 		RD(10, "total %d errors", errors);
1716 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1717 			kring->name,
1718 			ring->cur, kring->nr_hwcur,
1719 			ring->tail, kring->nr_hwtail);
1720 		ring->head = kring->rhead = kring->nr_hwcur;
1721 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1722 		ring->tail = kring->rtail = kring->nr_hwtail;
1723 	}
1724 	return (errors ? 1 : 0);
1725 }
1726 
1727 /* interpret the ringid and flags fields of an nmreq, by translating them
1728  * into a pair of intervals of ring indices:
1729  *
1730  * [priv->np_txqfirst, priv->np_txqlast) and
1731  * [priv->np_rxqfirst, priv->np_rxqlast)
1732  *
1733  */
1734 int
1735 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1736 {
1737 	struct netmap_adapter *na = priv->np_na;
1738 	u_int j, i = ringid & NETMAP_RING_MASK;
1739 	u_int reg = flags & NR_REG_MASK;
1740 
1741 	if (reg == NR_REG_DEFAULT) {
1742 		/* convert from old ringid to flags */
1743 		if (ringid & NETMAP_SW_RING) {
1744 			reg = NR_REG_SW;
1745 		} else if (ringid & NETMAP_HW_RING) {
1746 			reg = NR_REG_ONE_NIC;
1747 		} else {
1748 			reg = NR_REG_ALL_NIC;
1749 		}
1750 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1751 	}
1752 	switch (reg) {
1753 	case NR_REG_ALL_NIC:
1754 	case NR_REG_PIPE_MASTER:
1755 	case NR_REG_PIPE_SLAVE:
1756 		priv->np_txqfirst = 0;
1757 		priv->np_txqlast = na->num_tx_rings;
1758 		priv->np_rxqfirst = 0;
1759 		priv->np_rxqlast = na->num_rx_rings;
1760 		ND("%s %d %d", "ALL/PIPE",
1761 			priv->np_rxqfirst, priv->np_rxqlast);
1762 		break;
1763 	case NR_REG_SW:
1764 	case NR_REG_NIC_SW:
1765 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1766 			D("host rings not supported");
1767 			return EINVAL;
1768 		}
1769 		priv->np_txqfirst = (reg == NR_REG_SW ?
1770 			na->num_tx_rings : 0);
1771 		priv->np_txqlast = na->num_tx_rings + 1;
1772 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1773 			na->num_rx_rings : 0);
1774 		priv->np_rxqlast = na->num_rx_rings + 1;
1775 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1776 			priv->np_rxqfirst, priv->np_rxqlast);
1777 		break;
1778 	case NR_REG_ONE_NIC:
1779 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1780 			D("invalid ring id %d", i);
1781 			return EINVAL;
1782 		}
1783 		/* if not enough rings, use the first one */
1784 		j = i;
1785 		if (j >= na->num_tx_rings)
1786 			j = 0;
1787 		priv->np_txqfirst = j;
1788 		priv->np_txqlast = j + 1;
1789 		j = i;
1790 		if (j >= na->num_rx_rings)
1791 			j = 0;
1792 		priv->np_rxqfirst = j;
1793 		priv->np_rxqlast = j + 1;
1794 		break;
1795 	default:
1796 		D("invalid regif type %d", reg);
1797 		return EINVAL;
1798 	}
1799 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1800 
1801 	if (netmap_verbose) {
1802 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1803 			na->name,
1804 			priv->np_txqfirst,
1805 			priv->np_txqlast,
1806 			priv->np_rxqfirst,
1807 			priv->np_rxqlast,
1808 			i);
1809 	}
1810 	return 0;
1811 }
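
/*
 * A hedged sketch of the userspace side of the translation above: how a
 * program fills the (nr_ringid, nr_flags) pair before NIOCREGIF.  "em0"
 * and the ring index are arbitrary example values; the usual net/netmap.h
 * and net/netmap_user.h declarations are assumed.
 */
#if 0	/* illustrative sketch, not compiled */
	struct nmreq req;

	memset(&req, 0, sizeof(req));
	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
	req.nr_version = NETMAP_API;
	req.nr_flags = NR_REG_ONE_NIC;		/* bind a single hardware ring pair */
	req.nr_ringid = 2 | NETMAP_NO_TX_POLL;	/* ring index 2, no implicit txsync on poll */
	ioctl(fd, NIOCREGIF, &req);		/* fd is an open /dev/netmap descriptor */
	/* netmap_interp_ringid() then yields tx [2,3) and rx [2,3) */
#endif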
1812 
1813 
1814 /*
1815  * Set the ring ID. For devices with a single queue, a request
1816  * for all rings is the same as a single ring.
1817  */
1818 static int
1819 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1820 {
1821 	struct netmap_adapter *na = priv->np_na;
1822 	int error;
1823 
1824 	error = netmap_interp_ringid(priv, ringid, flags);
1825 	if (error) {
1826 		return error;
1827 	}
1828 
1829 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1830 
1831 	/* optimization: count the users registered for more than
1832 	 * one ring, which are the ones sleeping on the global queue.
1833 	 * The default netmap_notify() callback will then
1834 	 * avoid signaling the global queue if nobody is using it
1835 	 */
1836 	if (nm_tx_si_user(priv))
1837 		na->tx_si_users++;
1838 	if (nm_rx_si_user(priv))
1839 		na->rx_si_users++;
1840 	return 0;
1841 }
1842 
1843 /*
1844  * possibly move the interface to netmap-mode.
1845  * On success it returns a pointer to the netmap_if, otherwise NULL.
1846  * This must be called with NMG_LOCK held.
1847  *
1848  * The following na callbacks are called in the process:
1849  *
1850  * na->nm_config()			[by netmap_update_config]
1851  * (get current number and size of rings)
1852  *
1853  *  	We have a generic one for linux (netmap_linux_config).
1854  *  	The bwrap has to override this, since it has to forward
1855  *  	the request to the wrapped adapter (netmap_bwrap_config).
1856  *
1857  *    	XXX netmap_if_new calls this again (2014-03-15)
1858  *
1859  * na->nm_krings_create()		[by netmap_if_new]
1860  * (create and init the krings array)
1861  *
1862  * 	One of the following:
1863  *
1864  *	* netmap_hw_krings_create, 			(hw ports)
1865  *		creates the standard layout for the krings
1866  * 		and adds the mbq (used for the host rings).
1867  *
1868  * 	* netmap_vp_krings_create			(VALE ports)
1869  * 		add leases and scratchpads
1870  *
1871  * 	* netmap_pipe_krings_create			(pipes)
1872  * 		create the krings and rings of both ends and
1873  * 		cross-link them
1874  *
1875  *      * netmap_monitor_krings_create 			(monitors)
1876  *      	avoid allocating the mbq
1877  *
1878  *      * netmap_bwrap_krings_create			(bwraps)
1879  *      	create both the bwrap krings array,
1880  *      	the krings array of the wrapped adapter, and
1881  *      	(if needed) the fake array for the host adapter
1882  *
1883  * na->nm_register(, 1)
1884  * (put the adapter in netmap mode)
1885  *
1886  * 	This may be one of the following:
1887  * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1888  *
1889  * 	* netmap_hw_register				(hw ports)
1890  * 		checks that the ifp is still there, then calls
1891  * 		the hardware specific callback;
1892  *
1893  * 	* netmap_vp_reg					(VALE ports)
1894  *		If the port is connected to a bridge,
1895  *		set the NAF_NETMAP_ON flag under the
1896  *		bridge write lock.
1897  *
1898  *	* netmap_pipe_reg				(pipes)
1899  *		inform the other pipe end that it is no
1900  *		longer responsible for the lifetime of this
1901  *		pipe end
1902  *
1903  *	* netmap_monitor_reg				(monitors)
1904  *		intercept the sync callbacks of the monitored
1905  *		rings
1906  *
1907  *	* netmap_bwrap_register				(bwraps)
1908  *		cross-link the bwrap and hwna rings,
1909  *		forward the request to the hwna, override
1910  *		the hwna notify callback (to get the frames
1911  *		coming from outside go through the bridge).
1912  *
1913  * XXX maybe netmap_if_new() should be merged with this (2014-03-15).
1914  *
1915  */
1916 struct netmap_if *
1917 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1918 	uint16_t ringid, uint32_t flags, int *err)
1919 {
1920 	struct netmap_if *nifp = NULL;
1921 	int error, need_mem = 0;
1922 
1923 	NMG_LOCK_ASSERT();
1924 	/* ring configuration may have changed, fetch from the card */
1925 	netmap_update_config(na);
1926 	priv->np_na = na;     /* store the reference */
1927 	error = netmap_set_ringid(priv, ringid, flags);
1928 	if (error)
1929 		goto out;
1930 	/* ensure allocators are ready */
1931 	need_mem = !netmap_have_memory_locked(priv);
1932 	if (need_mem) {
1933 		error = netmap_get_memory_locked(priv);
1934 		ND("get_memory returned %d", error);
1935 		if (error)
1936 			goto out;
1937 	}
1938 	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1939 	nifp = netmap_if_new(na);
1940 	if (nifp == NULL) { /* allocation failed */
1941 		error = ENOMEM;
1942 		goto out;
1943 	}
1944 	na->active_fds++;
1945 	if (!nm_netmap_on(na)) {
1946 		/* Netmap not active, set the card in netmap mode
1947 		 * and make it use the shared buffers.
1948 		 */
1949 		/* cache the allocator info in the na */
1950 		na->na_lut = netmap_mem_get_lut(na->nm_mem);
1951 		ND("%p->na_lut == %p", na, na->na_lut);
1952 		na->na_lut_objtotal = netmap_mem_get_buftotal(na->nm_mem);
1953 		na->na_lut_objsize = netmap_mem_get_bufsize(na->nm_mem);
1954 		error = na->nm_register(na, 1); /* mode on */
1955 		if (error) {
1956 			netmap_do_unregif(priv, nifp);
1957 			nifp = NULL;
1958 		}
1959 	}
1960 out:
1961 	*err = error;
1962 	if (error) {
1963 		/* we should drop the allocator, but only
1964 		 * if we were the ones who grabbed it
1965 		 */
1966 		if (need_mem)
1967 			netmap_drop_memory_locked(priv);
1968 		priv->np_na = NULL;
1969 	}
1970 	if (nifp != NULL) {
1971 		/*
1972 		 * advertise that the interface is ready by setting np_nifp.
1973 		 * The barrier is needed because readers (poll and *SYNC)
1974 		 * check for priv->np_nifp != NULL without locking
1975 		 */
1976 		wmb(); /* make sure previous writes are visible to all CPUs */
1977 		priv->np_nifp = nifp;
1978 	}
1979 	return nifp;
1980 }
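
/*
 * The wmb()/np_nifp publication above pairs with a lock-free check on the
 * reader side (the NIOC*SYNC and poll handlers below).  In sketch form,
 * the consumer does:
 */
#if 0	/* illustrative sketch, not compiled */
	if (priv->np_nifp == NULL)	/* not (yet) registered */
		return ENXIO;
	rmb();				/* pairs with the wmb() in netmap_do_regif() */
	na = priv->np_na;		/* now safe to use what regif published */
#endif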
1981 
1982 
1983 
1984 /*
1985  * ioctl(2) support for the "netmap" device.
1986  *
1987  * The following commands are accepted:
1988  * - NIOCGINFO
1989  * - SIOCGIFADDR	just for convenience
1990  * - NIOCREGIF
1991  * - NIOCTXSYNC
1992  * - NIOCRXSYNC
1993  *
1994  * Return 0 on success, errno otherwise.
1995  */
1996 int
1997 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1998 	int fflag, struct thread *td)
1999 {
2000 	struct netmap_priv_d *priv = NULL;
2001 	struct nmreq *nmr = (struct nmreq *) data;
2002 	struct netmap_adapter *na = NULL;
2003 	int error;
2004 	u_int i, qfirst, qlast;
2005 	struct netmap_if *nifp;
2006 	struct netmap_kring *krings;
2007 
2008 	(void)dev;	/* UNUSED */
2009 	(void)fflag;	/* UNUSED */
2010 
2011 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2012 		/* truncate name */
2013 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2014 		if (nmr->nr_version != NETMAP_API) {
2015 			D("API mismatch for %s got %d need %d",
2016 				nmr->nr_name,
2017 				nmr->nr_version, NETMAP_API);
2018 			nmr->nr_version = NETMAP_API;
2019 		}
2020 		if (nmr->nr_version < NETMAP_MIN_API ||
2021 		    nmr->nr_version > NETMAP_MAX_API) {
2022 			return EINVAL;
2023 		}
2024 	}
2025 	CURVNET_SET(TD_TO_VNET(td));
2026 
2027 	error = devfs_get_cdevpriv((void **)&priv);
2028 	if (error) {
2029 		CURVNET_RESTORE();
2030 		/* XXX ENOENT should be impossible, since the priv
2031 		 * is now created in the open */
2032 		return (error == ENOENT ? ENXIO : error);
2033 	}
2034 
2035 	switch (cmd) {
2036 	case NIOCGINFO:		/* return capabilities etc */
2037 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2038 			error = netmap_bdg_ctl(nmr, NULL);
2039 			break;
2040 		}
2041 
2042 		NMG_LOCK();
2043 		do {
2044 			/* memsize is always valid */
2045 			struct netmap_mem_d *nmd = &nm_mem;
2046 			u_int memflags;
2047 
2048 			if (nmr->nr_name[0] != '\0') {
2049 				/* get a refcount */
2050 				error = netmap_get_na(nmr, &na, 1 /* create */);
2051 				if (error)
2052 					break;
2053 				nmd = na->nm_mem; /* get memory allocator */
2054 			}
2055 
2056 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2057 				&nmr->nr_arg2);
2058 			if (error)
2059 				break;
2060 			if (na == NULL) /* only memory info */
2061 				break;
2062 			nmr->nr_offset = 0;
2063 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2064 			netmap_update_config(na);
2065 			nmr->nr_rx_rings = na->num_rx_rings;
2066 			nmr->nr_tx_rings = na->num_tx_rings;
2067 			nmr->nr_rx_slots = na->num_rx_desc;
2068 			nmr->nr_tx_slots = na->num_tx_desc;
2069 			netmap_adapter_put(na);
2070 		} while (0);
2071 		NMG_UNLOCK();
2072 		break;
2073 
2074 	case NIOCREGIF:
2075 		/* possibly attach/detach NIC and VALE switch */
2076 		i = nmr->nr_cmd;
2077 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2078 				|| i == NETMAP_BDG_VNET_HDR
2079 				|| i == NETMAP_BDG_NEWIF
2080 				|| i == NETMAP_BDG_DELIF) {
2081 			error = netmap_bdg_ctl(nmr, NULL);
2082 			break;
2083 		} else if (i != 0) {
2084 			D("nr_cmd must be 0 not %d", i);
2085 			error = EINVAL;
2086 			break;
2087 		}
2088 
2089 		/* protect access to priv from concurrent NIOCREGIF */
2090 		NMG_LOCK();
2091 		do {
2092 			u_int memflags;
2093 
2094 			if (priv->np_na != NULL) {	/* thread already registered */
2095 				error = EBUSY;
2096 				break;
2097 			}
2098 			/* find the interface and a reference */
2099 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2100 			if (error)
2101 				break;
2102 			if (NETMAP_OWNED_BY_KERN(na)) {
2103 				netmap_adapter_put(na);
2104 				error = EBUSY;
2105 				break;
2106 			}
2107 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
2108 			if (!nifp) {    /* reg. failed, release priv and ref */
2109 				netmap_adapter_put(na);
2110 				priv->np_nifp = NULL;
2111 				break;
2112 			}
2113 			priv->np_td = td; // XXX kqueue, debugging only
2114 
2115 			/* return the offset of the netmap_if object */
2116 			nmr->nr_rx_rings = na->num_rx_rings;
2117 			nmr->nr_tx_rings = na->num_tx_rings;
2118 			nmr->nr_rx_slots = na->num_rx_desc;
2119 			nmr->nr_tx_slots = na->num_tx_desc;
2120 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2121 				&nmr->nr_arg2);
2122 			if (error) {
2123 				netmap_adapter_put(na);
2124 				break;
2125 			}
2126 			if (memflags & NETMAP_MEM_PRIVATE) {
2127 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2128 			}
2129 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
2130 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
2131 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
2132 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
2133 
2134 			if (nmr->nr_arg3) {
2135 				D("requested %d extra buffers", nmr->nr_arg3);
2136 				nmr->nr_arg3 = netmap_extra_alloc(na,
2137 					&nifp->ni_bufs_head, nmr->nr_arg3);
2138 				D("got %d extra buffers", nmr->nr_arg3);
2139 			}
2140 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2141 		} while (0);
2142 		NMG_UNLOCK();
2143 		break;
2144 
2145 	case NIOCTXSYNC:
2146 	case NIOCRXSYNC:
2147 		nifp = priv->np_nifp;
2148 
2149 		if (nifp == NULL) {
2150 			error = ENXIO;
2151 			break;
2152 		}
2153 		rmb(); /* make sure following reads are not from cache */
2154 
2155 		na = priv->np_na;      /* we have a reference */
2156 
2157 		if (na == NULL) {
2158 			D("Internal error: nifp != NULL && na == NULL");
2159 			error = ENXIO;
2160 			break;
2161 		}
2162 
2163 		if (!nm_netmap_on(na)) {
2164 			error = ENXIO;
2165 			break;
2166 		}
2167 
2168 		if (cmd == NIOCTXSYNC) {
2169 			krings = na->tx_rings;
2170 			qfirst = priv->np_txqfirst;
2171 			qlast = priv->np_txqlast;
2172 		} else {
2173 			krings = na->rx_rings;
2174 			qfirst = priv->np_rxqfirst;
2175 			qlast = priv->np_rxqlast;
2176 		}
2177 
2178 		for (i = qfirst; i < qlast; i++) {
2179 			struct netmap_kring *kring = krings + i;
2180 			if (nm_kr_tryget(kring)) {
2181 				error = EBUSY;
2182 				goto out;
2183 			}
2184 			if (cmd == NIOCTXSYNC) {
2185 				if (netmap_verbose & NM_VERB_TXSYNC)
2186 					D("pre txsync ring %d cur %d hwcur %d",
2187 					    i, kring->ring->cur,
2188 					    kring->nr_hwcur);
2189 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2190 					netmap_ring_reinit(kring);
2191 				} else {
2192 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
2193 				}
2194 				if (netmap_verbose & NM_VERB_TXSYNC)
2195 					D("post txsync ring %d cur %d hwcur %d",
2196 					    i, kring->ring->cur,
2197 					    kring->nr_hwcur);
2198 			} else {
2199 				kring->nm_sync(kring, NAF_FORCE_READ);
2200 				microtime(&na->rx_rings[i].ring->ts);
2201 			}
2202 			nm_kr_put(kring);
2203 		}
2204 
2205 		break;
2206 
2207 	case NIOCCONFIG:
2208 		error = netmap_bdg_config(nmr);
2209 		break;
2210 #ifdef __FreeBSD__
2211 	case FIONBIO:
2212 	case FIOASYNC:
2213 		ND("FIONBIO/FIOASYNC are no-ops");
2214 		break;
2215 
2216 	case BIOCIMMEDIATE:
2217 	case BIOCGHDRCMPLT:
2218 	case BIOCSHDRCMPLT:
2219 	case BIOCSSEESENT:
2220 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2221 		break;
2222 
2223 	default:	/* allow device-specific ioctls */
2224 	    {
2225 		struct socket so;
2226 		struct ifnet *ifp;
2227 
2228 		bzero(&so, sizeof(so));
2229 		NMG_LOCK();
2230 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
2231 		if (error) {
2232 			netmap_adapter_put(na);
2233 			NMG_UNLOCK();
2234 			break;
2235 		}
2236 		ifp = na->ifp;
2237 		so.so_vnet = ifp->if_vnet;
2238 		// so->so_proto not null.
2239 		error = ifioctl(&so, cmd, data, td);
2240 		netmap_adapter_put(na);
2241 		NMG_UNLOCK();
2242 		break;
2243 	    }
2244 
2245 #else /* linux */
2246 	default:
2247 		error = EOPNOTSUPP;
2248 #endif /* linux */
2249 	}
2250 out:
2251 
2252 	CURVNET_RESTORE();
2253 	return (error);
2254 }
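
/*
 * A hedged sketch of the userspace counterpart of the commands handled
 * above, roughly following netmap(4).  Error handling is omitted, "em0" is
 * an arbitrary interface name, and the usual net/netmap_user.h,
 * sys/ioctl.h and sys/mman.h declarations are assumed.
 */
#if 0	/* illustrative sketch, not compiled */
	struct nmreq req;
	struct netmap_if *nifp;
	void *mem;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
	req.nr_flags = NR_REG_ALL_NIC;
	ioctl(fd, NIOCREGIF, &req);		/* bind all hardware rings of em0 */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);			/* map the shared memory pool */
	nifp = NETMAP_IF(mem, req.nr_offset);	/* our netmap_if inside the pool */
	ioctl(fd, NIOCTXSYNC, NULL);		/* flush slots queued for transmission */
	ioctl(fd, NIOCRXSYNC, NULL);		/* ask the kernel for newly received slots */
#endif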
2255 
2256 
2257 /*
2258  * select(2) and poll(2) handlers for the "netmap" device.
2259  *
2260  * Can be called for one or more queues.
2261  * Return the event mask corresponding to ready events.
2262  * If there are no ready events, do a selrecord on either individual
2263  * selinfo or on the global one.
2264  * Device-dependent parts (locking and sync of tx/rx rings)
2265  * are done through callbacks.
2266  *
2267  * On linux, the arguments are really pwait, the poll table, and 'td' is a struct file *.
2268  * The first one is remapped to pwait as selrecord() uses the name as a
2269  * hidden argument.
2270  */
2271 int
2272 netmap_poll(struct cdev *dev, int events, struct thread *td)
2273 {
2274 	struct netmap_priv_d *priv = NULL;
2275 	struct netmap_adapter *na;
2276 	struct netmap_kring *kring;
2277 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2278 	struct mbq q;		/* packets from hw queues to host stack */
2279 	void *pwait = dev;	/* linux compatibility */
2280 	int is_kevent = 0;
2281 
2282 	/*
2283 	 * In order to avoid nested locks, we need to "double check"
2284 	 * txsync and rxsync if we decide to do a selrecord().
2285 	 * retry_tx (and retry_rx, later) prevent looping forever.
2286 	 */
2287 	int retry_tx = 1, retry_rx = 1;
2288 
2289 	(void)pwait;
2290 	mbq_init(&q);
2291 
2292 	/*
2293 	 * XXX kevent has curthread->td_fpop == NULL,
2294 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2295 	 * priv as the first argument, which is also useful to avoid
2296 	 * the selrecord() calls, which are not necessary in that case.
2297 	 */
2298 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2299 		is_kevent = 1;
2300 		if (netmap_verbose)
2301 			D("called from kevent");
2302 		priv = (struct netmap_priv_d *)dev;
2303 	}
2304 	if (priv == NULL)
2305 		return POLLERR;
2306 
2307 	if (priv->np_nifp == NULL) {
2308 		D("No if registered");
2309 		return POLLERR;
2310 	}
2311 	rmb(); /* make sure following reads are not from cache */
2312 
2313 	na = priv->np_na;
2314 
2315 	if (!nm_netmap_on(na))
2316 		return POLLERR;
2317 
2318 	if (netmap_verbose & 0x8000)
2319 		D("device %s events 0x%x", na->name, events);
2320 	want_tx = events & (POLLOUT | POLLWRNORM);
2321 	want_rx = events & (POLLIN | POLLRDNORM);
2322 
2323 
2324 	/*
2325 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2326 	 * the file descriptor is bound to all of them. If so, we sleep on
2327 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2328 	 * (FreeBSD only allows two selinfo's per file descriptor).
2329 	 * The interrupt routine in the driver wakes one or the other
2330 	 * (or both) depending on which clients are active.
2331 	 *
2332 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2333 	 * txsync() is called if we run out of buffers on POLLOUT, or
2334 	 * there are pending packets to send. The latter can be disabled
2335 	 * passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2336 	 */
2337 	check_all_tx = nm_tx_si_user(priv);
2338 	check_all_rx = nm_rx_si_user(priv);
2339 
2340 	/*
2341 	 * We start with a lock free round which is cheap if we have
2342 	 * slots available. If this fails, then lock and call the sync
2343 	 * routines.
2344 	 */
2345 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2346 		kring = &na->rx_rings[i];
2347 		/* XXX compare ring->cur and kring->tail */
2348 		if (!nm_ring_empty(kring->ring)) {
2349 			revents |= want_rx;
2350 			want_rx = 0;	/* also breaks the loop */
2351 		}
2352 	}
2353 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2354 		kring = &na->tx_rings[i];
2355 		/* XXX compare ring->cur and kring->tail */
2356 		if (!nm_ring_empty(kring->ring)) {
2357 			revents |= want_tx;
2358 			want_tx = 0;	/* also breaks the loop */
2359 		}
2360 	}
2361 
2362 	/*
2363 	 * If we want to push packets out (priv->np_txpoll) or
2364 	 * want_tx is still set, we must issue txsync calls
2365 	 * (on all rings, to avoid stalling the tx rings).
2366 	 * XXX should also check cur != hwcur on the tx rings.
2367 	 * Fortunately, normal tx mode has np_txpoll set.
2368 	 */
2369 	if (priv->np_txpoll || want_tx) {
2370 		/*
2371 		 * The first round checks if anyone is ready, if not
2372 		 * do a selrecord and another round to handle races.
2373 		 * want_tx goes to 0 if any space is found, and is
2374 		 * used to skip rings with no pending transmissions.
2375 		 */
2376 flush_tx:
2377 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2378 			int found = 0;
2379 
2380 			kring = &na->tx_rings[i];
2381 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2382 				continue;
2383 			/* only one thread does txsync */
2384 			if (nm_kr_tryget(kring)) {
2385 				/* either busy or stopped
2386 				 * XXX if the ring is stopped, sleeping would
2387 				 * be better. In current code, however, we only
2388 				 * stop the rings for brief intervals (2014-03-14)
2389 				 */
2390 				if (netmap_verbose)
2391 					RD(2, "%p lost race on txring %d, ok",
2392 					    priv, i);
2393 				continue;
2394 			}
2395 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2396 				netmap_ring_reinit(kring);
2397 				revents |= POLLERR;
2398 			} else {
2399 				if (kring->nm_sync(kring, 0))
2400 					revents |= POLLERR;
2401 			}
2402 
2403 			/*
2404 			 * If we found new slots, notify potential
2405 			 * listeners on the same ring.
2406 			 * Since we just did a txsync, look at the copies
2407 			 * of cur,tail in the kring.
2408 			 */
2409 			found = kring->rcur != kring->rtail;
2410 			nm_kr_put(kring);
2411 			if (found) { /* notify other listeners */
2412 				revents |= want_tx;
2413 				want_tx = 0;
2414 				na->nm_notify(na, i, NR_TX, 0);
2415 			}
2416 		}
2417 		if (want_tx && retry_tx && !is_kevent) {
2418 			selrecord(td, check_all_tx ?
2419 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2420 			retry_tx = 0;
2421 			goto flush_tx;
2422 		}
2423 	}
2424 
2425 	/*
2426 	 * If want_rx is still set scan receive rings.
2427 	 * Do it on all rings because otherwise we starve.
2428 	 */
2429 	if (want_rx) {
2430 		int send_down = 0; /* transparent mode */
2431 		/* two rounds here for race avoidance */
2432 do_retry_rx:
2433 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2434 			int found = 0;
2435 
2436 			kring = &na->rx_rings[i];
2437 
2438 			if (nm_kr_tryget(kring)) {
2439 				if (netmap_verbose)
2440 					RD(2, "%p lost race on rxring %d, ok",
2441 					    priv, i);
2442 				continue;
2443 			}
2444 
2445 			/*
2446 			 * transparent mode support: collect packets
2447 			 * from the rxring(s).
2448 			 * XXX NR_FORWARD should only be read on
2449 			 * physical or NIC ports
2450 			 */
2451 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2452 				ND(10, "forwarding some buffers up %d to %d",
2453 				    kring->nr_hwcur, kring->ring->cur);
2454 				netmap_grab_packets(kring, &q, netmap_fwd);
2455 			}
2456 
2457 			if (kring->nm_sync(kring, 0))
2458 				revents |= POLLERR;
2459 			if (netmap_no_timestamp == 0 ||
2460 					kring->ring->flags & NR_TIMESTAMP) {
2461 				microtime(&kring->ring->ts);
2462 			}
2463 			/* after an rxsync we can use kring->rcur, rtail */
2464 			found = kring->rcur != kring->rtail;
2465 			nm_kr_put(kring);
2466 			if (found) {
2467 				revents |= want_rx;
2468 				retry_rx = 0;
2469 				na->nm_notify(na, i, NR_RX, 0);
2470 			}
2471 		}
2472 
2473 		/* transparent mode XXX only during first pass ? */
2474 		if (na->na_flags & NAF_HOST_RINGS) {
2475 			kring = &na->rx_rings[na->num_rx_rings];
2476 			if (check_all_rx
2477 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2478 				/* XXX fix to use kring fields */
2479 				if (nm_ring_empty(kring->ring))
2480 					send_down = netmap_rxsync_from_host(na, td, dev);
2481 				if (!nm_ring_empty(kring->ring))
2482 					revents |= want_rx;
2483 			}
2484 		}
2485 
2486 		if (retry_rx && !is_kevent)
2487 			selrecord(td, check_all_rx ?
2488 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2489 		if (send_down > 0 || retry_rx) {
2490 			retry_rx = 0;
2491 			if (send_down)
2492 				goto flush_tx; /* and retry_rx */
2493 			else
2494 				goto do_retry_rx;
2495 		}
2496 	}
2497 
2498 	/*
2499 	 * Transparent mode: marked bufs on rx rings between
2500 	 * kring->nr_hwcur and ring->head
2501 	 * are passed to the other endpoint.
2502 	 *
2503 	 * In this mode we also scan the sw rxring, which in
2504 	 * turn passes packets up.
2505 	 *
2506 	 * XXX Transparent mode at the moment requires binding all
2507 	 * rings to a single file descriptor.
2508 	 */
2509 
2510 	if (q.head && na->ifp != NULL)
2511 		netmap_send_up(na->ifp, &q);
2512 
2513 	return (revents);
2514 }
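
/*
 * The handler above is what backs the typical userspace receive loop.
 * A hedged sketch of that loop on a single ring, using the netmap_user.h
 * helpers; fd and nifp are those obtained at registration time, and
 * consume() is a hypothetical placeholder for the application.
 */
#if 0	/* illustrative sketch, not compiled */
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);

	for (;;) {
		poll(&pfd, 1, -1);	/* ends up in netmap_poll() above */
		while (!nm_ring_empty(rxring)) {
			uint32_t i = rxring->cur;
			char *buf = NETMAP_BUF(rxring, rxring->slot[i].buf_idx);

			consume(buf, rxring->slot[i].len);
			rxring->head = rxring->cur = nm_ring_next(rxring, i);
		}
	}
#endif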
2515 
2516 
2517 /*-------------------- driver support routines -------------------*/
2518 
2519 static int netmap_hw_krings_create(struct netmap_adapter *);
2520 
2521 /* default notify callback */
2522 static int
2523 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2524 	enum txrx tx, int flags)
2525 {
2526 	struct netmap_kring *kring;
2527 
2528 	if (tx == NR_TX) {
2529 		kring = na->tx_rings + n_ring;
2530 		OS_selwakeup(&kring->si, PI_NET);
2531 		/* optimization: avoid a wake up on the global
2532 		 * queue if nobody has registered for more
2533 		 * than one ring
2534 		 */
2535 		if (na->tx_si_users > 0)
2536 			OS_selwakeup(&na->tx_si, PI_NET);
2537 	} else {
2538 		kring = na->rx_rings + n_ring;
2539 		OS_selwakeup(&kring->si, PI_NET);
2540 		/* optimization: same as above */
2541 		if (na->rx_si_users > 0)
2542 			OS_selwakeup(&na->rx_si, PI_NET);
2543 	}
2544 	return 0;
2545 }
2546 
2547 
2548 /* called by all routines that create netmap_adapters.
2549  * Attach na to the ifp (if any) and provide defaults
2550  * for optional callbacks. Defaults assume that we
2551  * are creating a hardware netmap_adapter.
2552  */
2553 int
2554 netmap_attach_common(struct netmap_adapter *na)
2555 {
2556 	struct ifnet *ifp = na->ifp;
2557 
2558 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2559 		D("%s: invalid rings tx %d rx %d",
2560 			na->name, na->num_tx_rings, na->num_rx_rings);
2561 		return EINVAL;
2562 	}
2563 	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2564 	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2565 	 * use by the external modules, but that is set after this
2566 	 * function has been called.
2567 	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2568 	 */
2569 	if (ifp != NULL) {
2570 		WNA(ifp) = na;
2571 
2572 	/* the following is only needed for na that use the host port.
2573 	 * XXX do we have something similar for linux ?
2574 	 */
2575 #ifdef __FreeBSD__
2576 		na->if_input = ifp->if_input; /* for netmap_send_up */
2577 #endif /* __FreeBSD__ */
2578 
2579 		NETMAP_SET_CAPABLE(ifp);
2580 	}
2581 	if (na->nm_krings_create == NULL) {
2582 		/* we assume that we have been called by a driver,
2583 		 * since other port types all provide their own
2584 		 * nm_krings_create
2585 		 */
2586 		na->nm_krings_create = netmap_hw_krings_create;
2587 		na->nm_krings_delete = netmap_hw_krings_delete;
2588 	}
2589 	if (na->nm_notify == NULL)
2590 		na->nm_notify = netmap_notify;
2591 	na->active_fds = 0;
2592 
2593 	if (na->nm_mem == NULL)
2594 		/* use the global allocator */
2595 		na->nm_mem = &nm_mem;
2596 	if (na->nm_bdg_attach == NULL)
2597 		/* no special nm_bdg_attach callback. On VALE
2598 		 * attach, we need to interpose a bwrap
2599 		 */
2600 		na->nm_bdg_attach = netmap_bwrap_attach;
2601 	return 0;
2602 }
2603 
2604 
2605 /* standard cleanup, called by all destructors */
2606 void
2607 netmap_detach_common(struct netmap_adapter *na)
2608 {
2609 	if (na->ifp != NULL)
2610 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2611 
2612 	if (na->tx_rings) { /* XXX should not happen */
2613 		D("freeing leftover tx_rings");
2614 		na->nm_krings_delete(na);
2615 	}
2616 	netmap_pipe_dealloc(na);
2617 	if (na->na_flags & NAF_MEM_OWNER)
2618 		netmap_mem_private_delete(na->nm_mem);
2619 	bzero(na, sizeof(*na));
2620 	free(na, M_DEVBUF);
2621 }
2622 
2623 /* Wrapper for the register callback provided by hardware drivers.
2624  * na->ifp == NULL means the driver module has been
2625  * unloaded, so we cannot call into it.
2626  * Note that module unloading, in our patched linux drivers,
2627  * happens under NMG_LOCK and after having stopped all the
2628  * nic rings (see netmap_detach). This provides sufficient
2629  * protection for the other driver-provided callbacks
2630  * (i.e., nm_config and nm_*xsync), which therefore don't need
2631  * to be wrapped.
2632  */
2633 static int
2634 netmap_hw_register(struct netmap_adapter *na, int onoff)
2635 {
2636 	struct netmap_hw_adapter *hwna =
2637 		(struct netmap_hw_adapter*)na;
2638 
2639 	if (na->ifp == NULL)
2640 		return onoff ? ENXIO : 0;
2641 
2642 	return hwna->nm_hw_register(na, onoff);
2643 }
2644 
2645 
2646 /*
2647  * Initialize a ``netmap_adapter`` object created by a driver on attach.
2648  * We allocate a block of memory with room for a struct netmap_adapter
2649  * plus two sets of N+2 struct netmap_kring (where N is the number
2650  * of hardware rings):
2651  * krings	0..N-1	are for the hardware queues.
2652  * kring	N	is for the host stack queue
2653  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2654  * Return 0 on success, ENOMEM otherwise.
2655  */
2656 int
2657 netmap_attach(struct netmap_adapter *arg)
2658 {
2659 	struct netmap_hw_adapter *hwna = NULL;
2660 	// XXX when is arg == NULL ?
2661 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2662 
2663 	if (arg == NULL || ifp == NULL)
2664 		goto fail;
2665 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2666 	if (hwna == NULL)
2667 		goto fail;
2668 	hwna->up = *arg;
2669 	hwna->up.na_flags |= NAF_HOST_RINGS;
2670 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2671 	hwna->nm_hw_register = hwna->up.nm_register;
2672 	hwna->up.nm_register = netmap_hw_register;
2673 	if (netmap_attach_common(&hwna->up)) {
2674 		free(hwna, M_DEVBUF);
2675 		goto fail;
2676 	}
2677 	netmap_adapter_get(&hwna->up);
2678 
2679 #ifdef linux
2680 	if (ifp->netdev_ops) {
2681 		/* prepare a clone of the netdev ops */
2682 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2683 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2684 #else
2685 		hwna->nm_ndo = *ifp->netdev_ops;
2686 #endif
2687 	}
2688 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2689 	if (ifp->ethtool_ops) {
2690 		hwna->nm_eto = *ifp->ethtool_ops;
2691 	}
2692 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2693 #ifdef ETHTOOL_SCHANNELS
2694 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2695 #endif
2696 	if (arg->nm_config == NULL) {
2697 		hwna->up.nm_config = netmap_linux_config;
2698 	}
2699 #endif /* linux */
2700 
2701 	D("success for %s tx %d/%d rx %d/%d queues/slots",
2702 		hwna->up.name,
2703 		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2704 		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2705 		);
2706 	return 0;
2707 
2708 fail:
2709 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2710 	if (ifp)
2711 		netmap_detach(ifp);
2712 	return (hwna ? EINVAL : ENOMEM);
2713 }
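
/*
 * A hedged sketch of how a native driver typically calls netmap_attach()
 * at the end of its own attach routine.  The foo_netmap_* callbacks and
 * the softc fields are hypothetical placeholders for driver code.
 */
#if 0	/* illustrative sketch, not compiled */
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;			/* the driver's struct ifnet */
	na.num_tx_desc = sc->num_tx_desc;	/* ring geometry from the softc */
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_register = foo_netmap_reg;	/* switch the NIC in/out of netmap mode */
	na.nm_txsync = foo_netmap_txsync;	/* per-ring sync callbacks */
	na.nm_rxsync = foo_netmap_rxsync;
	netmap_attach(&na);			/* netmap copies na into its own hwna */
#endif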
2714 
2715 
2716 void
2717 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2718 {
2719 	if (!na) {
2720 		return;
2721 	}
2722 
2723 	refcount_acquire(&na->na_refcount);
2724 }
2725 
2726 
2727 /* returns 1 iff the netmap_adapter is destroyed */
2728 int
2729 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2730 {
2731 	if (!na)
2732 		return 1;
2733 
2734 	if (!refcount_release(&na->na_refcount))
2735 		return 0;
2736 
2737 	if (na->nm_dtor)
2738 		na->nm_dtor(na);
2739 
2740 	netmap_detach_common(na);
2741 
2742 	return 1;
2743 }
2744 
2745 /* nm_krings_create callback for all hardware native adapters */
2746 int
2747 netmap_hw_krings_create(struct netmap_adapter *na)
2748 {
2749 	int ret = netmap_krings_create(na, 0);
2750 	if (ret == 0) {
2751 		/* initialize the mbq for the sw rx ring */
2752 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2753 		ND("initialized sw rx queue %d", na->num_rx_rings);
2754 	}
2755 	return ret;
2756 }
2757 
2758 
2759 
2760 /*
2761  * Called on module unload by the netmap-enabled drivers
2762  */
2763 void
2764 netmap_detach(struct ifnet *ifp)
2765 {
2766 	struct netmap_adapter *na = NA(ifp);
2767 
2768 	if (!na)
2769 		return;
2770 
2771 	NMG_LOCK();
2772 	netmap_disable_all_rings(ifp);
2773 	if (!netmap_adapter_put(na)) {
2774 		/* someone is still using the adapter,
2775 		 * tell them that the interface is gone
2776 		 */
2777 		na->ifp = NULL;
2778 		// XXX also clear NAF_NATIVE_ON ?
2779 		na->na_flags &= ~NAF_NETMAP_ON;
2780 		/* give them a chance to notice */
2781 		netmap_enable_all_rings(ifp);
2782 	}
2783 	NMG_UNLOCK();
2784 }
2785 
2786 
2787 /*
2788  * Intercept packets from the network stack and pass them
2789  * to netmap as incoming packets on the 'software' ring.
2790  *
2791  * We only store packets in a bounded mbq and then copy them
2792  * in the relevant rxsync routine.
2793  *
2794  * We rely on the OS to make sure that the ifp and na do not go
2795  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2796  * In nm_register() or whenever there is a reinitialization,
2797  * we make sure to make the mode change visible here.
2798  */
2799 int
2800 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2801 {
2802 	struct netmap_adapter *na = NA(ifp);
2803 	struct netmap_kring *kring;
2804 	u_int len = MBUF_LEN(m);
2805 	u_int error = ENOBUFS;
2806 	struct mbq *q;
2807 	int space;
2808 
2809 	// XXX [Linux] we do not need this lock
2810 	// if we follow the down/configure/up protocol -gl
2811 	// mtx_lock(&na->core_lock);
2812 
2813 	if (!nm_netmap_on(na)) {
2814 		D("%s not in netmap mode anymore", na->name);
2815 		error = ENXIO;
2816 		goto done;
2817 	}
2818 
2819 	kring = &na->rx_rings[na->num_rx_rings];
2820 	q = &kring->rx_queue;
2821 
2822 	// XXX reconsider long packets if we handle fragments
2823 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2824 		D("%s from_host, drop packet size %d > %d", na->name,
2825 			len, NETMAP_BUF_SIZE(na));
2826 		goto done;
2827 	}
2828 
2829 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2830 	 * and maybe other instances of netmap_transmit (the latter
2831 	 * not possible on Linux).
2832 	 * Also avoid overflowing the queue.
2833 	 */
2834 	mbq_lock(q);
2835 
2836 	space = kring->nr_hwtail - kring->nr_hwcur;
2837 	if (space < 0)
2838 		space += kring->nkr_num_slots;
2839 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2840 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2841 			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2842 			len, m);
2843 	} else {
2844 		mbq_enqueue(q, m);
2845 		ND(10, "%s %d bufs in queue len %d m %p",
2846 			na->name, mbq_len(q), len, m);
2847 		/* notify outside the lock */
2848 		m = NULL;
2849 		error = 0;
2850 	}
2851 	mbq_unlock(q);
2852 
2853 done:
2854 	if (m)
2855 		m_freem(m);
2856 	/* unconditionally wake up listeners */
2857 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2858 	/* this is normally netmap_notify(), but for nics
2859 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2860 	 * that possibly forwards the frames through the switch
2861 	 */
2862 
2863 	return (error);
2864 }
2865 
2866 
2867 /*
2868  * netmap_reset() is called by the driver routines when reinitializing
2869  * a ring. The driver is in charge of locking to protect the kring.
2870  * If native netmap mode is not set just return NULL.
2871  */
2872 struct netmap_slot *
2873 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2874 	u_int new_cur)
2875 {
2876 	struct netmap_kring *kring;
2877 	int new_hwofs, lim;
2878 
2879 	if (!nm_native_on(na)) {
2880 		ND("interface not in native netmap mode");
2881 		return NULL;	/* nothing to reinitialize */
2882 	}
2883 
2884 	/* XXX note: in the new scheme, we are not guaranteed to be
2885 	 * under lock (e.g. when called on a device reset).
2886 	 * In this case, we should set a flag and not trust the
2887 	 * values too much. In practice: TODO
2888 	 * - set a RESET flag somewhere in the kring
2889 	 * - do the processing in a conservative way
2890 	 * - let the *sync() fixup at the end.
2891 	 */
2892 	if (tx == NR_TX) {
2893 		if (n >= na->num_tx_rings)
2894 			return NULL;
2895 		kring = na->tx_rings + n;
2896 		// XXX check whether we should use hwcur or rcur
2897 		new_hwofs = kring->nr_hwcur - new_cur;
2898 	} else {
2899 		if (n >= na->num_rx_rings)
2900 			return NULL;
2901 		kring = na->rx_rings + n;
2902 		new_hwofs = kring->nr_hwtail - new_cur;
2903 	}
2904 	lim = kring->nkr_num_slots - 1;
2905 	if (new_hwofs > lim)
2906 		new_hwofs -= lim + 1;
2907 
2908 	/* Always set the new offset value and realign the ring. */
2909 	if (netmap_verbose)
2910 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2911 		na->name,
2912 		tx == NR_TX ? "TX" : "RX", n,
2913 		kring->nkr_hwofs, new_hwofs,
2914 		kring->nr_hwtail,
2915 		tx == NR_TX ? lim : kring->nr_hwtail);
2916 	kring->nkr_hwofs = new_hwofs;
2917 	if (tx == NR_TX) {
2918 		kring->nr_hwtail = kring->nr_hwcur + lim;
2919 		if (kring->nr_hwtail > lim)
2920 			kring->nr_hwtail -= lim + 1;
2921 	}
2922 
2923 #if 0 // def linux
2924 	/* XXX check that the mappings are correct */
2925 	/* need ring_nr, adapter->pdev, direction */
2926 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2927 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2928 		D("error mapping rx netmap buffer %d", i);
2929 		// XXX fix error handling
2930 	}
2931 
2932 #endif /* linux */
2933 	/*
2934 	 * Wakeup on the individual and global selwait
2935 	 * We do the wakeup here, but the ring is not yet reconfigured.
2936 	 * However, we are under lock so there are no races.
2937 	 */
2938 	na->nm_notify(na, n, tx, 0);
2939 	return kring->ring->slot;
2940 }
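
/*
 * A hedged sketch of the typical use of netmap_reset() in a driver's rx
 * ring initialization: a non-NULL return means the ring is in netmap mode
 * and the NIC descriptors must point at the netmap buffers.  PNMB() and
 * netmap_idx_n2k() are the usual netmap_kern.h helpers; the descriptor
 * programming itself is driver specific and only hinted at here.
 */
#if 0	/* illustrative sketch, not compiled */
	struct netmap_adapter *na = NA(ifp);
	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
	u_int j;

	if (slot != NULL) {	/* NULL means not in native netmap mode */
		for (j = 0; j < na->num_rx_desc; j++) {
			int si = netmap_idx_n2k(&na->rx_rings[ring_nr], j);
			uint64_t paddr;

			PNMB(na, slot + si, &paddr);
			/* program NIC rx descriptor j with paddr (driver specific) */
		}
	}
#endif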
2941 
2942 
2943 /*
2944  * Dispatch rx/tx interrupts to the netmap rings.
2945  *
2946  * "work_done" is non-null on the RX path, NULL for the TX path.
2947  * We rely on the OS to make sure that there is only one active
2948  * instance per queue, and that there is appropriate locking.
2949  *
2950  * The 'notify' routine depends on what the ring is attached to.
2951  * - for a netmap file descriptor, do a selwakeup on the individual
2952  *   waitqueue, plus one on the global one if needed
2953  *   (see netmap_notify)
2954  * - for a nic connected to a switch, call the proper forwarding routine
2955  *   (see netmap_bwrap_intr_notify)
2956  */
2957 void
2958 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2959 {
2960 	struct netmap_adapter *na = NA(ifp);
2961 	struct netmap_kring *kring;
2962 
2963 	q &= NETMAP_RING_MASK;
2964 
2965 	if (netmap_verbose) {
2966 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2967 	}
2968 
2969 	if (work_done) { /* RX path */
2970 		if (q >= na->num_rx_rings)
2971 			return;	// not a physical queue
2972 		kring = na->rx_rings + q;
2973 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2974 		na->nm_notify(na, q, NR_RX, 0);
2975 		*work_done = 1; /* do not fire napi again */
2976 	} else { /* TX path */
2977 		if (q >= na->num_tx_rings)
2978 			return;	// not a physical queue
2979 		kring = na->tx_rings + q;
2980 		na->nm_notify(na, q, NR_TX, 0);
2981 	}
2982 }
2983 
2984 
2985 /*
2986  * Default functions to handle rx/tx interrupts from a physical device.
2987  * "work_done" is non-null on the RX path, NULL for the TX path.
2988  *
2989  * If the card is not in netmap mode, simply return 0,
2990  * so that the caller proceeds with regular processing.
2991  * Otherwise call netmap_common_irq() and return 1.
2992  *
2993  * If the card is connected to a netmap file descriptor,
2994  * do a selwakeup on the individual queue, plus one on the global one
2995  * if needed (multiqueue card _and_ there are multiqueue listeners),
2996  * and return 1.
2997  *
2998  * Finally, if called on rx from an interface connected to a switch,
2999  * calls the proper forwarding routine, and return 1.
3000  */
3001 int
3002 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3003 {
3004 	struct netmap_adapter *na = NA(ifp);
3005 
3006 	/*
3007 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3008 	 * we still use the regular driver even though the previous
3009 	 * check fails. It is unclear whether we should use
3010 	 * nm_native_on() here.
3011 	 */
3012 	if (!nm_netmap_on(na))
3013 		return 0;
3014 
3015 	if (na->na_flags & NAF_SKIP_INTR) {
3016 		ND("use regular interrupt");
3017 		return 0;
3018 	}
3019 
3020 	netmap_common_irq(ifp, q, work_done);
3021 	return 1;
3022 }
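
/*
 * The canonical use of the helper above in a netmap-aware driver: the rx
 * interrupt (or NAPI) handler bails out early when netmap has consumed
 * the event.  A minimal sketch:
 */
#if 0	/* illustrative sketch, not compiled */
	u_int work_done;

	if (netmap_rx_irq(ifp, ring_nr, &work_done))
		return;		/* netmap listeners were notified, skip the regular path */
	/* ... regular rx processing continues here ... */
#endif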
3023 
3024 
3025 /*
3026  * Module loader and unloader
3027  *
3028  * netmap_init() creates the /dev/netmap device and initializes
3029  * all global variables. Returns 0 on success, errno on failure
3030  * (but there is no chance)
3031  * (though in practice it should not fail).
3032  * netmap_fini() destroys everything.
3033  */
3034 
3035 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3036 extern struct cdevsw netmap_cdevsw;
3037 
3038 
3039 void
3040 netmap_fini(void)
3041 {
3042 	// XXX destroy_bridges() ?
3043 	if (netmap_dev)
3044 		destroy_dev(netmap_dev);
3045 	netmap_mem_fini();
3046 	NMG_LOCK_DESTROY();
3047 	printf("netmap: unloaded module.\n");
3048 }
3049 
3050 
3051 int
3052 netmap_init(void)
3053 {
3054 	int error;
3055 
3056 	NMG_LOCK_INIT();
3057 
3058 	error = netmap_mem_init();
3059 	if (error != 0)
3060 		goto fail;
3061 	/* XXX could use make_dev_credv() to get error number */
3062 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
3063 			      "netmap");
3064 	if (!netmap_dev)
3065 		goto fail;
3066 
3067 	netmap_init_bridges();
3068 #ifdef __FreeBSD__
3069 	nm_vi_init_index();
3070 #endif
3071 	printf("netmap: loaded module\n");
3072 	return (0);
3073 fail:
3074 	netmap_fini();
3075 	return (EINVAL); /* may be incorrect */
3076 }
3077