xref: /freebsd/sys/dev/netmap/netmap.c (revision f5fd950e35c962bad0aa31fdc4b4052e13207893)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    a select()able file descriptor on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
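	A minimal userspace sketch of steps 1-6 above, using the public
	netmap(4)/netmap_user.h API (error handling and includes omitted;
	"em0" and build_packet() are placeholders):

		struct nmreq req;
		struct netmap_if *nifp;
		struct netmap_ring *txring;
		char *mem;
		int fd = open("/dev/netmap", O_RDWR);			(step 1)

		bzero(&req, sizeof(req));
		req.nr_version = NETMAP_API;
		strncpy(req.nr_name, "em0", sizeof(req.nr_name));
		req.nr_flags = NR_REG_ALL_NIC;
		ioctl(fd, NIOCREGIF, &req);				(step 2)
		mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);					(step 3)
		nifp = NETMAP_IF(mem, req.nr_offset);
		txring = NETMAP_TXRING(nifp, 0);			(step 4)

		for (;;) {
			struct pollfd pfd = { .fd = fd, .events = POLLOUT };

			poll(&pfd, 1, -1);				(step 6)
			while (!nm_ring_empty(txring)) {
				struct netmap_slot *slot = &txring->slot[txring->cur];

				build_packet(NETMAP_BUF(txring, slot->buf_idx),
				    &slot->len);		(hypothetical helper)
				txring->head = txring->cur =
				    nm_ring_next(txring, txring->cur);
			}
			ioctl(fd, NIOCTXSYNC, NULL);			(step 5)
		}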
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this; it only guarantees that invalid usage will not crash
66 the system.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   to protect against multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When a port is configured or deleted, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/filio.h>	/* FIONBIO */
141 #include <sys/sockio.h>
142 #include <sys/socketvar.h>	/* struct socket */
143 #include <sys/malloc.h>
144 #include <sys/poll.h>
145 #include <sys/rwlock.h>
146 #include <sys/socket.h> /* sockaddrs */
147 #include <sys/selinfo.h>
148 #include <sys/sysctl.h>
149 #include <sys/jail.h>
150 #include <net/vnet.h>
151 #include <net/if.h>
152 #include <net/if_var.h>
153 #include <net/bpf.h>		/* BIOCIMMEDIATE */
154 #include <machine/bus.h>	/* bus_dmamap_* */
155 #include <sys/endian.h>
156 #include <sys/refcount.h>
157 
158 
159 /* reduce conditional code */
160 // linux API, used for the knlist in FreeBSD
161 #define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)
162 
163 void freebsd_selwakeup(struct selinfo *si, int pri);
164 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
165 
166 #elif defined(linux)
167 
168 #include "bsd_glue.h"
169 
170 
171 
172 #elif defined(__APPLE__)
173 
174 #warning OSX support is only partial
175 #include "osx_glue.h"
176 
177 #else
178 
179 #error	Unsupported platform
180 
181 #endif /* unsupported */
182 
183 /*
184  * common headers
185  */
186 #include <net/netmap.h>
187 #include <dev/netmap/netmap_kern.h>
188 #include <dev/netmap/netmap_mem2.h>
189 
190 
191 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
192 
193 /*
194  * The following variables are used by the drivers and replicate
195  * fields in the global memory pool. They only refer to buffers
196  * used by physical interfaces.
197  */
198 u_int netmap_total_buffers;
199 u_int netmap_buf_size;
200 char *netmap_buffer_base;	/* also address of an invalid buffer */
201 
202 /* user-controlled variables */
203 int netmap_verbose;
204 
205 static int netmap_no_timestamp; /* don't timestamp on rxsync */
206 
207 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
208 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
209     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
210 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
211     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
212 int netmap_mitigate = 1;
213 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
214 int netmap_no_pendintr = 1;
215 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
216     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
217 int netmap_txsync_retry = 2;
218 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
219     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
220 
221 int netmap_flags = 0;	/* debug flags */
222 int netmap_fwd = 0;	/* force transparent mode */
223 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
224 
225 /*
226  * netmap_admode selects the netmap mode to use.
227  * Invalid values are reset to NETMAP_ADMODE_BEST
228  */
229 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
230 	NETMAP_ADMODE_NATIVE,	/* either native or none */
231 	NETMAP_ADMODE_GENERIC,	/* force generic */
232 	NETMAP_ADMODE_LAST };
233 static int netmap_admode = NETMAP_ADMODE_BEST;
234 
235 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
236 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
237 int netmap_generic_rings = 1;   /* number of queues in generic. */
238 
239 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
240 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
241 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
242 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
243 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
244 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
245 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
246 
247 NMG_LOCK_T	netmap_global_lock;
248 
249 
250 static void
251 nm_kr_get(struct netmap_kring *kr)
252 {
253 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
254 		tsleep(kr, 0, "NM_KR_GET", 4);
255 }
256 
257 
258 /*
259  * mark the ring as stopped, and run through the locks
260  * to make sure other users get to see it.
261  */
262 void
263 netmap_disable_ring(struct netmap_kring *kr)
264 {
265 	kr->nkr_stopped = 1;
266 	nm_kr_get(kr);
267 	mtx_lock(&kr->q_lock);
268 	mtx_unlock(&kr->q_lock);
269 	nm_kr_put(kr);
270 }
271 
272 
273 static void
274 netmap_set_all_rings(struct ifnet *ifp, int stopped)
275 {
276 	struct netmap_adapter *na;
277 	int i;
278 	u_int ntx, nrx;
279 
280 	if (!(ifp->if_capenable & IFCAP_NETMAP))
281 		return;
282 
283 	na = NA(ifp);
284 
285 	ntx = netmap_real_tx_rings(na);
286 	nrx = netmap_real_rx_rings(na);
287 
288 	for (i = 0; i < ntx; i++) {
289 		if (stopped)
290 			netmap_disable_ring(na->tx_rings + i);
291 		else
292 			na->tx_rings[i].nkr_stopped = 0;
293 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
294 	}
295 
296 	for (i = 0; i < nrx; i++) {
297 		if (stopped)
298 			netmap_disable_ring(na->rx_rings + i);
299 		else
300 			na->rx_rings[i].nkr_stopped = 0;
301 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
302 	}
303 }
304 
305 
306 void
307 netmap_disable_all_rings(struct ifnet *ifp)
308 {
309 	netmap_set_all_rings(ifp, 1 /* stopped */);
310 }
311 
312 
313 void
314 netmap_enable_all_rings(struct ifnet *ifp)
315 {
316 	netmap_set_all_rings(ifp, 0 /* enabled */);
317 }
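/*
 * A hedged sketch (not taken from any in-tree driver) of how a NIC
 * reset path is expected to bracket a ring reinitialization with the
 * two helpers above, so that no thread is left inside a *xsync() while
 * the hardware rings are reprogrammed. foo_softc and foo_reset_hw()
 * are hypothetical names:
 *
 *	static void
 *	foo_reinit_locked(struct foo_softc *sc)
 *	{
 *		struct ifnet *ifp = sc->ifp;
 *
 *		netmap_disable_all_rings(ifp);	// stop krings, drain syncs
 *		foo_reset_hw(sc);		// reprogram the NIC rings
 *		netmap_enable_all_rings(ifp);	// clear nkr_stopped, notify
 *	}
 */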
318 
319 
320 /*
321  * generic bound_checking function
322  */
323 u_int
324 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
325 {
326 	u_int oldv = *v;
327 	const char *op = NULL;
328 
329 	if (dflt < lo)
330 		dflt = lo;
331 	if (dflt > hi)
332 		dflt = hi;
333 	if (oldv < lo) {
334 		*v = dflt;
335 		op = "Bump";
336 	} else if (oldv > hi) {
337 		*v = hi;
338 		op = "Clamp";
339 	}
340 	if (op && msg)
341 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
342 	return *v;
343 }
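/*
 * Typical use of nm_bound_var(): sanitize a user-settable value before
 * using it, e.g. (the default and bounds below are illustrative only,
 * not values chosen by this module):
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 */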
344 
345 
346 /*
347  * packet-dump function, user-supplied or static buffer.
348  * The destination buffer must be at least 30+4*len
349  */
350 const char *
351 nm_dump_buf(char *p, int len, int lim, char *dst)
352 {
353 	static char _dst[8192];
354 	int i, j, i0;
355 	static char hex[] ="0123456789abcdef";
356 	char *o;	/* output position */
357 
358 #define P_HI(x)	hex[((x) & 0xf0)>>4]
359 #define P_LO(x)	hex[((x) & 0xf)]
360 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
361 	if (!dst)
362 		dst = _dst;
363 	if (lim <= 0 || lim > len)
364 		lim = len;
365 	o = dst;
366 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
367 	o += strlen(o);
368 	/* hexdump routine */
369 	for (i = 0; i < lim; ) {
370 		sprintf(o, "%5d: ", i);
371 		o += strlen(o);
372 		memset(o, ' ', 48);
373 		i0 = i;
374 		for (j=0; j < 16 && i < lim; i++, j++) {
375 			o[j*3] = P_HI(p[i]);
376 			o[j*3+1] = P_LO(p[i]);
377 		}
378 		i = i0;
379 		for (j=0; j < 16 && i < lim; i++, j++)
380 			o[j + 48] = P_C(p[i]);
381 		o[j+48] = '\n';
382 		o += j+49;
383 	}
384 	*o = '\0';
385 #undef P_HI
386 #undef P_LO
387 #undef P_C
388 	return dst;
389 }
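/*
 * Example use, mirroring the call in netmap_rxsync_from_host() below:
 * dump at most the first 128 bytes of a packet buffer through D(),
 * letting nm_dump_buf() use its static destination buffer:
 *
 *	D("%s", nm_dump_buf(buf, len, 128, NULL));
 */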
390 
391 
392 /*
393  * Fetch configuration from the device, to cope with dynamic
394  * reconfigurations after loading the module.
395  */
396 int
397 netmap_update_config(struct netmap_adapter *na)
398 {
399 	struct ifnet *ifp = na->ifp;
400 	u_int txr, txd, rxr, rxd;
401 
402 	txr = txd = rxr = rxd = 0;
403 	if (na->nm_config) {
404 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
405 	} else {
406 		/* take whatever we had at init time */
407 		txr = na->num_tx_rings;
408 		txd = na->num_tx_desc;
409 		rxr = na->num_rx_rings;
410 		rxd = na->num_rx_desc;
411 	}
412 
413 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
414 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
415 		return 0; /* nothing changed */
416 	if (netmap_verbose || na->active_fds > 0) {
417 		D("stored config %s: txring %d x %d, rxring %d x %d",
418 			NM_IFPNAME(ifp),
419 			na->num_tx_rings, na->num_tx_desc,
420 			na->num_rx_rings, na->num_rx_desc);
421 		D("new config %s: txring %d x %d, rxring %d x %d",
422 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
423 	}
424 	if (na->active_fds == 0) {
425 		D("configuration changed (but fine)");
426 		na->num_tx_rings = txr;
427 		na->num_tx_desc = txd;
428 		na->num_rx_rings = rxr;
429 		na->num_rx_desc = rxd;
430 		return 0;
431 	}
432 	D("configuration changed while active, this is bad...");
433 	return 1;
434 }
435 
436 static int
437 netmap_txsync_compat(struct netmap_kring *kring, int flags)
438 {
439 	struct netmap_adapter *na = kring->na;
440 	return na->nm_txsync(na, kring->ring_id, flags);
441 }
442 
443 static int
444 netmap_rxsync_compat(struct netmap_kring *kring, int flags)
445 {
446 	struct netmap_adapter *na = kring->na;
447 	return na->nm_rxsync(na, kring->ring_id, flags);
448 }
449 
450 static int
451 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
452 {
453 	(void)flags;
454 	netmap_txsync_to_host(kring->na);
455 	return 0;
456 }
457 
458 static int
459 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
460 {
461 	(void)flags;
462 	netmap_rxsync_from_host(kring->na, NULL, NULL);
463 	return 0;
464 }
465 
466 
467 
468 /* create the krings array and initialize the fields common to all adapters.
469  * The array layout is this:
470  *
471  *                    +----------+
472  * na->tx_rings ----->|          | \
473  *                    |          |  } na->num_tx_rings
474  *                    |          | /
475  *                    +----------+
476  *                    |          |    host tx kring
477  * na->rx_rings ----> +----------+
478  *                    |          | \
479  *                    |          |  } na->num_rx_rings
480  *                    |          | /
481  *                    +----------+
482  *                    |          |    host rx kring
483  *                    +----------+
484  * na->tailroom ----->|          | \
485  *                    |          |  } tailroom bytes
486  *                    |          | /
487  *                    +----------+
488  *
489  * Note: for compatibility, host krings are created even when not needed.
490  * The tailroom space is currently used by vale ports for allocating leases.
491  */
492 int
493 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
494 {
495 	u_int i, len, ndesc;
496 	struct netmap_kring *kring;
497 	u_int ntx, nrx;
498 
499 	/* account for the (possibly fake) host rings */
500 	ntx = na->num_tx_rings + 1;
501 	nrx = na->num_rx_rings + 1;
502 
503 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
504 
505 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
506 	if (na->tx_rings == NULL) {
507 		D("Cannot allocate krings");
508 		return ENOMEM;
509 	}
510 	na->rx_rings = na->tx_rings + ntx;
511 
512 	/*
513 	 * All fields in krings are 0 except the ones initialized below,
514 	 * but it is better to be explicit on important kring fields.
515 	 */
516 	ndesc = na->num_tx_desc;
517 	for (i = 0; i < ntx; i++) { /* Transmit rings */
518 		kring = &na->tx_rings[i];
519 		bzero(kring, sizeof(*kring));
520 		kring->na = na;
521 		kring->ring_id = i;
522 		kring->nkr_num_slots = ndesc;
523 		if (i < na->num_tx_rings) {
524 			kring->nm_sync = netmap_txsync_compat; // XXX
525 		} else if (i == na->num_tx_rings) {
526 			kring->nm_sync = netmap_txsync_to_host_compat;
527 		}
528 		/*
529 		 * IMPORTANT: Always keep one slot empty.
530 		 */
531 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
532 		kring->rtail = kring->nr_hwtail = ndesc - 1;
533 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
534 		ND("ktx %s h %d c %d t %d",
535 			kring->name, kring->rhead, kring->rcur, kring->rtail);
536 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
537 		init_waitqueue_head(&kring->si);
538 	}
539 
540 	ndesc = na->num_rx_desc;
541 	for (i = 0; i < nrx; i++) { /* Receive rings */
542 		kring = &na->rx_rings[i];
543 		bzero(kring, sizeof(*kring));
544 		kring->na = na;
545 		kring->ring_id = i;
546 		kring->nkr_num_slots = ndesc;
547 		if (i < na->num_rx_rings) {
548 			kring->nm_sync = netmap_rxsync_compat; // XXX
549 		} else if (i == na->num_rx_rings) {
550 			kring->nm_sync = netmap_rxsync_from_host_compat;
551 		}
552 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
553 		kring->rtail = kring->nr_hwtail = 0;
554 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
555 		ND("krx %s h %d c %d t %d",
556 			kring->name, kring->rhead, kring->rcur, kring->rtail);
557 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
558 		init_waitqueue_head(&kring->si);
559 	}
560 	init_waitqueue_head(&na->tx_si);
561 	init_waitqueue_head(&na->rx_si);
562 
563 	na->tailroom = na->rx_rings + nrx;
564 
565 	return 0;
566 }
567 
568 
569 /* undo the actions performed by netmap_krings_create */
570 void
571 netmap_krings_delete(struct netmap_adapter *na)
572 {
573 	struct netmap_kring *kring = na->tx_rings;
574 
575 	/* we rely on the krings layout described above */
576 	for ( ; kring != na->tailroom; kring++) {
577 		mtx_destroy(&kring->q_lock);
578 	}
579 	free(na->tx_rings, M_DEVBUF);
580 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
581 }
582 
583 
584 /*
585  * Destructor for NIC ports. They also have an mbuf queue
586  * on the rings connected to the host so we need to purge
587  * them first.
588  */
589 static void
590 netmap_hw_krings_delete(struct netmap_adapter *na)
591 {
592 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
593 
594 	ND("destroy sw mbq with len %d", mbq_len(q));
595 	mbq_purge(q);
596 	mbq_safe_destroy(q);
597 	netmap_krings_delete(na);
598 }
599 
600 
601 static struct netmap_if*
602 netmap_if_new(const char *ifname, struct netmap_adapter *na)
603 {
604 	struct netmap_if *nifp;
605 
606 	if (netmap_update_config(na)) {
607 		/* configuration mismatch, report and fail */
608 		return NULL;
609 	}
610 
611 	if (na->active_fds)
612 		goto final;
613 
614 	if (na->nm_krings_create(na))
615 		goto cleanup;
616 
617 	if (netmap_mem_rings_create(na))
618 		goto cleanup;
619 
620 final:
621 
622 	nifp = netmap_mem_if_new(ifname, na);
623 	if (nifp == NULL)
624 		goto cleanup;
625 
626 	return (nifp);
627 
628 cleanup:
629 
630 	if (na->active_fds == 0) {
631 		netmap_mem_rings_delete(na);
632 		na->nm_krings_delete(na);
633 	}
634 
635 	return NULL;
636 }
637 
638 
639 /* grab a reference to the memory allocator, if we don't have one already.  The
640  * reference is taken from the netmap_adapter registered with the priv.
641  *
642  */
643 static int
644 netmap_get_memory_locked(struct netmap_priv_d* p)
645 {
646 	struct netmap_mem_d *nmd;
647 	int error = 0;
648 
649 	if (p->np_na == NULL) {
650 		if (!netmap_mmap_unreg)
651 			return ENODEV;
652 		/* for compatibility with older versions of the API
653  		 * we use the global allocator when no interface has been
654  		 * registered
655  		 */
656 		nmd = &nm_mem;
657 	} else {
658 		nmd = p->np_na->nm_mem;
659 	}
660 	if (p->np_mref == NULL) {
661 		error = netmap_mem_finalize(nmd);
662 		if (!error)
663 			p->np_mref = nmd;
664 	} else if (p->np_mref != nmd) {
665 		/* a virtual port has been registered, but previous
666  		 * syscalls already used the global allocator.
667  		 * We cannot continue
668  		 */
669 		error = ENODEV;
670 	}
671 	return error;
672 }
673 
674 
675 int
676 netmap_get_memory(struct netmap_priv_d* p)
677 {
678 	int error;
679 	NMG_LOCK();
680 	error = netmap_get_memory_locked(p);
681 	NMG_UNLOCK();
682 	return error;
683 }
684 
685 
686 static int
687 netmap_have_memory_locked(struct netmap_priv_d* p)
688 {
689 	return p->np_mref != NULL;
690 }
691 
692 
693 static void
694 netmap_drop_memory_locked(struct netmap_priv_d* p)
695 {
696 	if (p->np_mref) {
697 		netmap_mem_deref(p->np_mref);
698 		p->np_mref = NULL;
699 	}
700 }
701 
702 
703 /*
704  * File descriptor's private data destructor.
705  *
706  * Call nm_register(ifp,0) to stop netmap mode on the interface and
707  * revert to normal operation. We expect that np_na->ifp has not gone away.
708  * The second argument is the nifp to work on. In some cases it is
709  * not attached yet to the netmap_priv_d so we need to pass it as
710  * a separate argument.
711  */
712 /* call with NMG_LOCK held */
713 static void
714 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
715 {
716 	struct netmap_adapter *na = priv->np_na;
717 	struct ifnet *ifp = na->ifp;
718 
719 	NMG_LOCK_ASSERT();
720 	na->active_fds--;
721 	if (na->active_fds <= 0) {	/* last instance */
722 
723 		if (netmap_verbose)
724 			D("deleting last instance for %s", NM_IFPNAME(ifp));
725 		/*
726 		 * (TO CHECK) This function is only called
727 		 * when the last reference to this file descriptor goes
728 		 * away. This means we cannot have any pending poll()
729 		 * or interrupt routine operating on the structure.
730 		 * XXX The file may be closed in a thread while
731 		 * another thread is using it.
732 		 * Linux keeps the file opened until the last reference
733 		 * by any outstanding ioctl/poll or mmap is gone.
734 		 * FreeBSD does not track mmap()s (but we do) and
735 		 * wakes up any sleeping poll(). Need to check what
736 		 * happens if the close() occurs while a concurrent
737 		 * syscall is running.
738 		 */
739 		if (ifp)
740 			na->nm_register(na, 0); /* off, clear flags */
741 		/* Wake up any sleeping threads. netmap_poll will
742 		 * then return POLLERR
743 		 * XXX The wake up now must happen during *_down(), when
744 		 * we order all activities to stop. -gl
745 		 */
746 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
747 		/* knlist_destroy(&na->tx_si.si_note); */
748 		/* knlist_destroy(&na->rx_si.si_note); */
749 
750 		/* delete rings and buffers */
751 		netmap_mem_rings_delete(na);
752 		na->nm_krings_delete(na);
753 	}
754 	/* delete the nifp */
755 	netmap_mem_if_delete(na, nifp);
756 }
757 
758 static __inline int
759 nm_tx_si_user(struct netmap_priv_d *priv)
760 {
761 	return (priv->np_na != NULL &&
762 		(priv->np_txqlast - priv->np_txqfirst > 1));
763 }
764 
765 static __inline int
766 nm_rx_si_user(struct netmap_priv_d *priv)
767 {
768 	return (priv->np_na != NULL &&
769 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
770 }
771 
772 
773 /*
774  * returns 1 if this is the last instance and we can free priv
775  */
776 int
777 netmap_dtor_locked(struct netmap_priv_d *priv)
778 {
779 	struct netmap_adapter *na = priv->np_na;
780 
781 #ifdef __FreeBSD__
782 	/*
783 	 * np_refcount is the number of active mmaps on
784 	 * this file descriptor
785 	 */
786 	if (--priv->np_refcount > 0) {
787 		return 0;
788 	}
789 #endif /* __FreeBSD__ */
790 	if (!na) {
791 	    return 1; //XXX is it correct?
792 	}
793 	netmap_do_unregif(priv, priv->np_nifp);
794 	priv->np_nifp = NULL;
795 	netmap_drop_memory_locked(priv);
796 	if (priv->np_na) {
797 		if (nm_tx_si_user(priv))
798 			na->tx_si_users--;
799 		if (nm_rx_si_user(priv))
800 			na->rx_si_users--;
801 		netmap_adapter_put(na);
802 		priv->np_na = NULL;
803 	}
804 	return 1;
805 }
806 
807 
808 void
809 netmap_dtor(void *data)
810 {
811 	struct netmap_priv_d *priv = data;
812 	int last_instance;
813 
814 	NMG_LOCK();
815 	last_instance = netmap_dtor_locked(priv);
816 	NMG_UNLOCK();
817 	if (last_instance) {
818 		bzero(priv, sizeof(*priv));	/* for safety */
819 		free(priv, M_DEVBUF);
820 	}
821 }
822 
823 
824 
825 
826 /*
827  * Handlers for synchronization of the queues from/to the host.
828  * Netmap has two operating modes:
829  * - in the default mode, the rings connected to the host stack are
830  *   just another ring pair managed by userspace;
831  * - in transparent mode (XXX to be defined) incoming packets
832  *   (from the host or the NIC) are marked as NS_FORWARD upon
833  *   arrival, and the user application has a chance to reset the
834  *   flag for packets that should be dropped.
835  *   On the RXSYNC or poll(), packets in RX rings between
836  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
837  *   to the other side.
838  * The transfer NIC --> host is relatively easy, just encapsulate
839  * into mbufs and we are done. The host --> NIC side is slightly
840  * harder because there might not be room in the tx ring so it
841  * might take a while before releasing the buffer.
842  */
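/*
 * One way to drive transparent mode from userspace (a sketch;
 * keep_for_app() is a hypothetical filter): mark the received slots
 * that should reach the other side with NS_FORWARD, release them and
 * issue an rxsync. Slots carrying NS_FORWARD at that point are the
 * ones moved by the code below.
 *
 *	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);
 *	uint32_t i;
 *
 *	for (i = rxring->head; i != rxring->tail; i = nm_ring_next(rxring, i)) {
 *		struct netmap_slot *slot = &rxring->slot[i];
 *
 *		if (!keep_for_app(slot))
 *			slot->flags |= NS_FORWARD;	// hand to the other side
 *	}
 *	rxring->head = rxring->cur = rxring->tail;	// release the slots
 *	ioctl(fd, NIOCRXSYNC, NULL);
 */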
843 
844 
845 /*
846  * pass a chain of buffers to the host stack as coming from 'dst'.
847  * We do not need to lock because the queue is private.
848  */
849 static void
850 netmap_send_up(struct ifnet *dst, struct mbq *q)
851 {
852 	struct mbuf *m;
853 
854 	/* send packets up, outside the lock */
855 	while ((m = mbq_dequeue(q)) != NULL) {
856 		if (netmap_verbose & NM_VERB_HOST)
857 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
858 		NM_SEND_UP(dst, m);
859 	}
860 	mbq_destroy(q);
861 }
862 
863 
864 /*
865  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
866  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
867  * and pass them up. Drop remaining packets in the unlikely event
868  * of an mbuf shortage.
869  */
870 static void
871 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
872 {
873 	u_int const lim = kring->nkr_num_slots - 1;
874 	u_int const head = kring->ring->head;
875 	u_int n;
876 	struct netmap_adapter *na = kring->na;
877 
878 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
879 		struct mbuf *m;
880 		struct netmap_slot *slot = &kring->ring->slot[n];
881 
882 		if ((slot->flags & NS_FORWARD) == 0 && !force)
883 			continue;
884 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
885 			RD(5, "bad pkt at %d len %d", n, slot->len);
886 			continue;
887 		}
888 		slot->flags &= ~NS_FORWARD; // XXX needed ?
889 		/* XXX TODO: adapt to the case of a multisegment packet */
890 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
891 
892 		if (m == NULL)
893 			break;
894 		mbq_enqueue(q, m);
895 	}
896 }
897 
898 
899 /*
900  * Send to the NIC rings packets marked NS_FORWARD between
901  * kring->nr_hwcur and kring->rhead
902  * Called under kring->rx_queue.lock on the sw rx ring,
903  */
904 static u_int
905 netmap_sw_to_nic(struct netmap_adapter *na)
906 {
907 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
908 	struct netmap_slot *rxslot = kring->ring->slot;
909 	u_int i, rxcur = kring->nr_hwcur;
910 	u_int const head = kring->rhead;
911 	u_int const src_lim = kring->nkr_num_slots - 1;
912 	u_int sent = 0;
913 
914 	/* scan rings to find space, then fill as much as possible */
915 	for (i = 0; i < na->num_tx_rings; i++) {
916 		struct netmap_kring *kdst = &na->tx_rings[i];
917 		struct netmap_ring *rdst = kdst->ring;
918 		u_int const dst_lim = kdst->nkr_num_slots - 1;
919 
920 		/* XXX do we trust ring or kring->rcur,rtail ? */
921 		for (; rxcur != head && !nm_ring_empty(rdst);
922 		     rxcur = nm_next(rxcur, src_lim) ) {
923 			struct netmap_slot *src, *dst, tmp;
924 			u_int dst_cur = rdst->cur;
925 
926 			src = &rxslot[rxcur];
927 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
928 				continue;
929 
930 			sent++;
931 
932 			dst = &rdst->slot[dst_cur];
933 
934 			tmp = *src;
935 
936 			src->buf_idx = dst->buf_idx;
937 			src->flags = NS_BUF_CHANGED;
938 
939 			dst->buf_idx = tmp.buf_idx;
940 			dst->len = tmp.len;
941 			dst->flags = NS_BUF_CHANGED;
942 
943 			rdst->cur = nm_next(dst_cur, dst_lim);
944 		}
945 		/* if (sent) XXX txsync ? */
946 	}
947 	return sent;
948 }
949 
950 
951 /*
952  * netmap_txsync_to_host() passes packets up. We are called from a
953  * system call in user process context, and the only contention
954  * can be among multiple user threads erroneously calling
955  * this routine concurrently.
956  */
957 void
958 netmap_txsync_to_host(struct netmap_adapter *na)
959 {
960 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
961 	struct netmap_ring *ring = kring->ring;
962 	u_int const lim = kring->nkr_num_slots - 1;
963 	u_int const head = kring->rhead;
964 	struct mbq q;
965 
966 	/* Take packets from hwcur to head and pass them up.
967 	 * force head = cur since netmap_grab_packets() stops at head
968 	 * In case of no buffers we give up. At the end of the loop,
969 	 * the queue is drained in all cases.
970 	 */
971 	mbq_init(&q);
972 	ring->cur = head;
973 	netmap_grab_packets(kring, &q, 1 /* force */);
974 	ND("have %d pkts in queue", mbq_len(&q));
975 	kring->nr_hwcur = head;
976 	kring->nr_hwtail = head + lim;
977 	if (kring->nr_hwtail > lim)
978 		kring->nr_hwtail -= lim + 1;
979 	nm_txsync_finalize(kring);
980 
981 	netmap_send_up(na->ifp, &q);
982 }
983 
984 
985 /*
986  * rxsync backend for packets coming from the host stack.
987  * They have been put in kring->rx_queue by netmap_transmit().
988  * We protect access to the kring using kring->rx_queue.lock
989  *
990  * This routine also does the selrecord if called from the poll handler
991  * (we know because td != NULL).
992  *
993  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
994  *     as an additional hidden argument.
995  * returns the number of packets delivered to tx queues in
996  * transparent mode, or a negative value if error
997  */
998 int
999 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1000 {
1001 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1002 	struct netmap_ring *ring = kring->ring;
1003 	u_int nm_i, n;
1004 	u_int const lim = kring->nkr_num_slots - 1;
1005 	u_int const head = kring->rhead;
1006 	int ret = 0;
1007 	struct mbq *q = &kring->rx_queue;
1008 
1009 	(void)pwait;	/* disable unused warnings */
1010 	(void)td;
1011 
1012 	mtx_lock(&q->lock);
1013 
1014 	/* First part: import newly received packets */
1015 	n = mbq_len(q);
1016 	if (n) { /* grab packets from the queue */
1017 		struct mbuf *m;
1018 		uint32_t stop_i;
1019 
1020 		nm_i = kring->nr_hwtail;
1021 		stop_i = nm_prev(nm_i, lim);
1022 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1023 			int len = MBUF_LEN(m);
1024 			struct netmap_slot *slot = &ring->slot[nm_i];
1025 
1026 			m_copydata(m, 0, len, BDG_NMB(na, slot));
1027 			ND("nm %d len %d", nm_i, len);
1028 			if (netmap_verbose)
1029                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
1030 
1031 			slot->len = len;
1032 			slot->flags = kring->nkr_slot_flags;
1033 			nm_i = nm_next(nm_i, lim);
1034 		}
1035 		kring->nr_hwtail = nm_i;
1036 	}
1037 
1038 	/*
1039 	 * Second part: skip past packets that userspace has released.
1040 	 */
1041 	nm_i = kring->nr_hwcur;
1042 	if (nm_i != head) { /* something was released */
1043 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1044 			ret = netmap_sw_to_nic(na);
1045 		kring->nr_hwcur = head;
1046 	}
1047 
1048 	nm_rxsync_finalize(kring);
1049 
1050 	/* access copies of cur,tail in the kring */
1051 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1052 		selrecord(td, &kring->si);
1053 
1054 	mtx_unlock(&q->lock);
1055 	return ret;
1056 }
1057 
1058 
1059 /* Get a netmap adapter for the port.
1060  *
1061  * If it is possible to satisfy the request, return 0
1062  * with *na containing the netmap adapter found.
1063  * Otherwise return an error code, with *na containing NULL.
1064  *
1065  * When the port is attached to a bridge, we always return
1066  * EBUSY.
1067  * Otherwise, if the port is already bound to a file descriptor,
1068  * then we unconditionally return the existing adapter into *na.
1069  * In all the other cases, we return (into *na) either native,
1070  * generic or NULL, according to the following table:
1071  *
1072  *					native_support
1073  * active_fds   dev.netmap.admode         YES     NO
1074  * -------------------------------------------------------
1075  *    >0              *                 NA(ifp) NA(ifp)
1076  *
1077  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1078  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1079  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1080  *
1081  */
1082 
1083 int
1084 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1085 {
1086 	/* generic support */
1087 	int i = netmap_admode;	/* Take a snapshot. */
1088 	int error = 0;
1089 	struct netmap_adapter *prev_na;
1090 	struct netmap_generic_adapter *gna;
1091 
1092 	*na = NULL; /* default */
1093 
1094 	/* reset in case of invalid value */
1095 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1096 		i = netmap_admode = NETMAP_ADMODE_BEST;
1097 
1098 	if (NETMAP_CAPABLE(ifp)) {
1099 		/* If an adapter already exists, but is
1100 		 * attached to a vale port, we report that the
1101 		 * port is busy.
1102 		 */
1103 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1104 			return EBUSY;
1105 
1106 		/* If an adapter already exists, return it if
1107 		 * there are active file descriptors or if
1108 		 * netmap is not forced to use generic
1109 		 * adapters.
1110 		 */
1111 		if (NA(ifp)->active_fds > 0 ||
1112 				i != NETMAP_ADMODE_GENERIC) {
1113 			*na = NA(ifp);
1114 			return 0;
1115 		}
1116 	}
1117 
1118 	/* If there isn't native support and netmap is not allowed
1119 	 * to use generic adapters, we cannot satisfy the request.
1120 	 */
1121 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1122 		return EOPNOTSUPP;
1123 
1124 	/* Otherwise, create a generic adapter and return it,
1125 	 * saving the previously used netmap adapter, if any.
1126 	 *
1127 	 * Note that here 'prev_na', if not NULL, MUST be a
1128 	 * native adapter, and CANNOT be a generic one. This is
1129 	 * true because generic adapters are created on demand, and
1130 	 * destroyed when not used anymore. Therefore, if the adapter
1131 	 * currently attached to an interface 'ifp' is generic, it
1132 	 * must be that
1133 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1134 	 * Consequently, if NA(ifp) is generic, we will enter one of
1135 	 * the branches above. This ensures that we never override
1136 	 * a generic adapter with another generic adapter.
1137 	 */
1138 	prev_na = NA(ifp);
1139 	error = generic_netmap_attach(ifp);
1140 	if (error)
1141 		return error;
1142 
1143 	*na = NA(ifp);
1144 	gna = (struct netmap_generic_adapter*)NA(ifp);
1145 	gna->prev = prev_na; /* save old na */
1146 	if (prev_na != NULL) {
1147 		ifunit_ref(ifp->if_xname);
1148 		// XXX add a refcount ?
1149 		netmap_adapter_get(prev_na);
1150 	}
1151 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1152 
1153 	return 0;
1154 }
1155 
1156 
1157 /*
1158  * MUST BE CALLED UNDER NMG_LOCK()
1159  *
1160  * Get a refcounted reference to a netmap adapter attached
1161  * to the interface specified by nmr.
1162  * This is always called in the execution of an ioctl().
1163  *
1164  * Return ENXIO if the interface specified by the request does
1165  * not exist, ENOTSUP if netmap is not supported by the interface,
1166  * EBUSY if the interface is already attached to a bridge,
1167  * EINVAL if parameters are invalid, ENOMEM if needed resources
1168  * could not be allocated.
1169  * If successful, hold a reference to the netmap adapter.
1170  *
1171  * No reference is kept on the real interface, which may then
1172  * disappear at any time.
1173  */
1174 int
1175 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1176 {
1177 	struct ifnet *ifp = NULL;
1178 	int error = 0;
1179 	struct netmap_adapter *ret = NULL;
1180 
1181 	*na = NULL;     /* default return value */
1182 
1183 	/* first try to see if this is a bridge port. */
1184 	NMG_LOCK_ASSERT();
1185 
1186 	error = netmap_get_pipe_na(nmr, na, create);
1187 	if (error || *na != NULL)
1188 		return error;
1189 
1190 	error = netmap_get_bdg_na(nmr, na, create);
1191 	if (error)
1192 		return error;
1193 
1194 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1195 		goto pipes;
1196 
1197 	ifp = ifunit_ref(nmr->nr_name);
1198 	if (ifp == NULL) {
1199 	        return ENXIO;
1200 	}
1201 
1202 	error = netmap_get_hw_na(ifp, &ret);
1203 	if (error)
1204 		goto out;
1205 
1206 	/* Users cannot use the NIC attached to a bridge directly */
1207 	if (NETMAP_OWNED_BY_KERN(ret)) {
1208 		error = EBUSY;
1209 		goto out;
1210 	}
1211 	*na = ret;
1212 	netmap_adapter_get(ret);
1213 
1214 pipes:
1215 	error = netmap_pipe_alloc(*na, nmr);
1216 
1217 out:
1218 	if (error && ret != NULL)
1219 		netmap_adapter_put(ret);
1220 
1221 	if (ifp)
1222 		if_rele(ifp);
1223 
1224 	return error;
1225 }
1226 
1227 
1228 /*
1229  * validate parameters on entry for *_txsync()
1230  * Returns ring->head if ok, or a value >= kring->nkr_num_slots
1231  * in case of error.
1232  *
1233  * rhead, rcur and rtail=hwtail are stored from previous round.
1234  * hwcur is the next packet to send to the ring.
1235  *
1236  * We want
1237  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1238  *
1239  * hwcur, rhead, rtail and hwtail are reliable
1240  */
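/*
 * Small worked example of the checks below, with nkr_num_slots = 8.
 * If hwcur = rhead = 2 and rtail = 6 (no wraparound), any head/cur
 * with 2 <= head <= cur <= 6 passes; head = 1 or cur = 7 fails.
 * If instead rtail = 1 and rhead = 5 (wrapped ring), head must lie
 * outside 2..4; e.g. head = 6 with cur = 7, or head = 6 with cur = 0
 * (cur wrapped past the end) are both accepted.
 */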
1241 u_int
1242 nm_txsync_prologue(struct netmap_kring *kring)
1243 {
1244 	struct netmap_ring *ring = kring->ring;
1245 	u_int head = ring->head; /* read only once */
1246 	u_int cur = ring->cur; /* read only once */
1247 	u_int n = kring->nkr_num_slots;
1248 
1249 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1250 		kring->name,
1251 		kring->nr_hwcur, kring->nr_hwtail,
1252 		ring->head, ring->cur, ring->tail);
1253 #if 1 /* kernel sanity checks; but we can trust the kring. */
1254 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1255 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1256 		goto error;
1257 #endif /* kernel sanity checks */
1258 	/*
1259 	 * user sanity checks. We only use 'cur',
1260 	 * A, B, ... are possible positions for cur:
1261 	 *
1262 	 *  0    A  cur   B  tail  C  n-1
1263 	 *  0    D  tail  E  cur   F  n-1
1264 	 *
1265 	 * B, F, D are valid. A, C, E are wrong
1266 	 */
1267 	if (kring->rtail >= kring->rhead) {
1268 		/* want rhead <= head <= rtail */
1269 		if (head < kring->rhead || head > kring->rtail)
1270 			goto error;
1271 		/* and also head <= cur <= rtail */
1272 		if (cur < head || cur > kring->rtail)
1273 			goto error;
1274 	} else { /* here rtail < rhead */
1275 		/* we need head outside rtail .. rhead */
1276 		if (head > kring->rtail && head < kring->rhead)
1277 			goto error;
1278 
1279 		/* two cases now: head <= rtail or head >= rhead  */
1280 		if (head <= kring->rtail) {
1281 			/* want head <= cur <= rtail */
1282 			if (cur < head || cur > kring->rtail)
1283 				goto error;
1284 		} else { /* head >= rhead */
1285 			/* cur must be outside rtail..head */
1286 			if (cur > kring->rtail && cur < head)
1287 				goto error;
1288 		}
1289 	}
1290 	if (ring->tail != kring->rtail) {
1291 		RD(5, "tail overwritten was %d need %d",
1292 			ring->tail, kring->rtail);
1293 		ring->tail = kring->rtail;
1294 	}
1295 	kring->rhead = head;
1296 	kring->rcur = cur;
1297 	return head;
1298 
1299 error:
1300 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1301 		kring->name,
1302 		kring->nr_hwcur,
1303 		kring->rcur, kring->nr_hwtail,
1304 		cur, ring->tail);
1305 	return n;
1306 }
1307 
1308 
1309 /*
1310  * validate parameters on entry for *_rxsync()
1311  * Returns ring->head if ok, kring->nkr_num_slots on error.
1312  *
1313  * For a valid configuration,
1314  * hwcur <= head <= cur <= tail <= hwtail
1315  *
1316  * We only consider head and cur.
1317  * hwcur and hwtail are reliable.
1318  *
1319  */
1320 u_int
1321 nm_rxsync_prologue(struct netmap_kring *kring)
1322 {
1323 	struct netmap_ring *ring = kring->ring;
1324 	uint32_t const n = kring->nkr_num_slots;
1325 	uint32_t head, cur;
1326 
1327 	ND("%s kc %d kt %d h %d c %d t %d",
1328 		kring->name,
1329 		kring->nr_hwcur, kring->nr_hwtail,
1330 		ring->head, ring->cur, ring->tail);
1331 	/*
1332 	 * Before storing the new values, we should check they do not
1333 	 * move backwards. However:
1334 	 * - head is not an issue because the previous value is hwcur;
1335 	 * - cur could in principle go back, however it does not matter
1336 	 *   because we are processing a brand new rxsync()
1337 	 */
1338 	cur = kring->rcur = ring->cur;	/* read only once */
1339 	head = kring->rhead = ring->head;	/* read only once */
1340 #if 1 /* kernel sanity checks */
1341 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1342 		goto error;
1343 #endif /* kernel sanity checks */
1344 	/* user sanity checks */
1345 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1346 		/* want hwcur <= rhead <= hwtail */
1347 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1348 			goto error;
1349 		/* and also rhead <= rcur <= hwtail */
1350 		if (cur < head || cur > kring->nr_hwtail)
1351 			goto error;
1352 	} else {
1353 		/* we need rhead outside hwtail..hwcur */
1354 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1355 			goto error;
1356 		/* two cases now: head <= hwtail or head >= hwcur  */
1357 		if (head <= kring->nr_hwtail) {
1358 			/* want head <= cur <= hwtail */
1359 			if (cur < head || cur > kring->nr_hwtail)
1360 				goto error;
1361 		} else {
1362 			/* cur must be outside hwtail..head */
1363 			if (cur < head && cur > kring->nr_hwtail)
1364 				goto error;
1365 		}
1366 	}
1367 	if (ring->tail != kring->rtail) {
1368 		RD(5, "%s tail overwritten was %d need %d",
1369 			kring->name,
1370 			ring->tail, kring->rtail);
1371 		ring->tail = kring->rtail;
1372 	}
1373 	return head;
1374 
1375 error:
1376 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1377 		kring->nr_hwcur,
1378 		kring->rcur, kring->nr_hwtail,
1379 		kring->rhead, kring->rcur, ring->tail);
1380 	return n;
1381 }
1382 
1383 
1384 /*
1385  * Error routine called when txsync/rxsync detects an error.
1386  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1387  * Return 1 on reinit.
1388  *
1389  * This routine is only called by the upper half of the kernel.
1390  * It only reads hwcur (which is changed only by the upper half, too)
1391  * and hwtail (which may be changed by the lower half, but only on
1392  * a tx ring and only to increase it, so any error will be recovered
1393  * on the next call). For the above, we don't strictly need to call
1394  * it under lock.
1395  */
1396 int
1397 netmap_ring_reinit(struct netmap_kring *kring)
1398 {
1399 	struct netmap_ring *ring = kring->ring;
1400 	u_int i, lim = kring->nkr_num_slots - 1;
1401 	int errors = 0;
1402 
1403 	// XXX KASSERT nm_kr_tryget
1404 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1405 	// XXX probably wrong to trust userspace
1406 	kring->rhead = ring->head;
1407 	kring->rcur  = ring->cur;
1408 	kring->rtail = ring->tail;
1409 
1410 	if (ring->cur > lim)
1411 		errors++;
1412 	if (ring->head > lim)
1413 		errors++;
1414 	if (ring->tail > lim)
1415 		errors++;
1416 	for (i = 0; i <= lim; i++) {
1417 		u_int idx = ring->slot[i].buf_idx;
1418 		u_int len = ring->slot[i].len;
1419 		if (idx < 2 || idx >= netmap_total_buffers) {
1420 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1421 			ring->slot[i].buf_idx = 0;
1422 			ring->slot[i].len = 0;
1423 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1424 			ring->slot[i].len = 0;
1425 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1426 		}
1427 	}
1428 	if (errors) {
1429 		RD(10, "total %d errors", errors);
1430 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1431 			kring->name,
1432 			ring->cur, kring->nr_hwcur,
1433 			ring->tail, kring->nr_hwtail);
1434 		ring->head = kring->rhead = kring->nr_hwcur;
1435 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1436 		ring->tail = kring->rtail = kring->nr_hwtail;
1437 	}
1438 	return (errors ? 1 : 0);
1439 }
1440 
1441 
1442 /*
1443  * Set the ring ID. For devices with a single queue, a request
1444  * for all rings is the same as a single ring.
1445  */
1446 static int
1447 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1448 {
1449 	struct netmap_adapter *na = priv->np_na;
1450 	u_int j, i = ringid & NETMAP_RING_MASK;
1451 	u_int reg = flags & NR_REG_MASK;
1452 
1453 	if (reg == NR_REG_DEFAULT) {
1454 		/* convert from old ringid to flags */
1455 		if (ringid & NETMAP_SW_RING) {
1456 			reg = NR_REG_SW;
1457 		} else if (ringid & NETMAP_HW_RING) {
1458 			reg = NR_REG_ONE_NIC;
1459 		} else {
1460 			reg = NR_REG_ALL_NIC;
1461 		}
1462 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1463 	}
1464 	switch (reg) {
1465 	case NR_REG_ALL_NIC:
1466 	case NR_REG_PIPE_MASTER:
1467 	case NR_REG_PIPE_SLAVE:
1468 		priv->np_txqfirst = 0;
1469 		priv->np_txqlast = na->num_tx_rings;
1470 		priv->np_rxqfirst = 0;
1471 		priv->np_rxqlast = na->num_rx_rings;
1472 		ND("%s %d %d", "ALL/PIPE",
1473 			priv->np_rxqfirst, priv->np_rxqlast);
1474 		break;
1475 	case NR_REG_SW:
1476 	case NR_REG_NIC_SW:
1477 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1478 			D("host rings not supported");
1479 			return EINVAL;
1480 		}
1481 		priv->np_txqfirst = (reg == NR_REG_SW ?
1482 			na->num_tx_rings : 0);
1483 		priv->np_txqlast = na->num_tx_rings + 1;
1484 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1485 			na->num_rx_rings : 0);
1486 		priv->np_rxqlast = na->num_rx_rings + 1;
1487 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1488 			priv->np_rxqfirst, priv->np_rxqlast);
1489 		break;
1490 	case NR_REG_ONE_NIC:
1491 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1492 			D("invalid ring id %d", i);
1493 			return EINVAL;
1494 		}
1495 		/* if not enough rings, use the first one */
1496 		j = i;
1497 		if (j >= na->num_tx_rings)
1498 			j = 0;
1499 		priv->np_txqfirst = j;
1500 		priv->np_txqlast = j + 1;
1501 		j = i;
1502 		if (j >= na->num_rx_rings)
1503 			j = 0;
1504 		priv->np_rxqfirst = j;
1505 		priv->np_rxqlast = j + 1;
1506 		break;
1507 	default:
1508 		D("invalid regif type %d", reg);
1509 		return EINVAL;
1510 	}
1511 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1512 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1513 	if (nm_tx_si_user(priv))
1514 		na->tx_si_users++;
1515 	if (nm_rx_si_user(priv))
1516 		na->rx_si_users++;
1517 	if (netmap_verbose) {
1518 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1519 			NM_IFPNAME(na->ifp),
1520 			priv->np_txqfirst,
1521 			priv->np_txqlast,
1522 			priv->np_rxqfirst,
1523 			priv->np_rxqlast,
1524 			i);
1525 	}
1526 	return 0;
1527 }
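/*
 * Userspace examples (a sketch) of the nr_flags/nr_ringid combinations
 * parsed above, as filled into the nmreq passed to NIOCREGIF:
 *
 *	req.nr_flags = NR_REG_ALL_NIC;		// all hw rings
 *
 *	req.nr_flags = NR_REG_ONE_NIC;		// a single ring pair
 *	req.nr_ringid = 2;			// ring index
 *	req.nr_ringid |= NETMAP_NO_TX_POLL;	// optional: poll() does not txsync
 *
 *	req.nr_flags = NR_REG_SW;		// host (sw) rings only
 */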
1528 
1529 /*
1530  * possibly move the interface to netmap-mode.
1531  * If successful, it returns a pointer to the netmap_if, otherwise NULL.
1532  * This must be called with NMG_LOCK held.
1533  */
1534 struct netmap_if *
1535 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1536 	uint16_t ringid, uint32_t flags, int *err)
1537 {
1538 	struct ifnet *ifp = na->ifp;
1539 	struct netmap_if *nifp = NULL;
1540 	int error, need_mem = 0;
1541 
1542 	NMG_LOCK_ASSERT();
1543 	/* ring configuration may have changed, fetch from the card */
1544 	netmap_update_config(na);
1545 	priv->np_na = na;     /* store the reference */
1546 	error = netmap_set_ringid(priv, ringid, flags);
1547 	if (error)
1548 		goto out;
1549 	/* ensure allocators are ready */
1550 	need_mem = !netmap_have_memory_locked(priv);
1551 	if (need_mem) {
1552 		error = netmap_get_memory_locked(priv);
1553 		ND("get_memory returned %d", error);
1554 		if (error)
1555 			goto out;
1556 	}
1557 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1558 	if (nifp == NULL) { /* allocation failed */
1559 		/* we should drop the allocator, but only
1560 		 * if we were the ones who grabbed it
1561 		 */
1562 		error = ENOMEM;
1563 		goto out;
1564 	}
1565 	na->active_fds++;
1566 	if (ifp->if_capenable & IFCAP_NETMAP) {
1567 		/* was already set */
1568 	} else {
1569 		/* Otherwise set the card in netmap mode
1570 		 * and make it use the shared buffers.
1571 		 *
1572 		 * do not core lock because the race is harmless here;
1573 		 * there cannot be any traffic to netmap_transmit()
1574 		 */
1575 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1576 		ND("%p->na_lut == %p", na, na->na_lut);
1577 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1578 		error = na->nm_register(na, 1); /* mode on */
1579 		if (error) {
1580 			netmap_do_unregif(priv, nifp);
1581 			nifp = NULL;
1582 		}
1583 	}
1584 out:
1585 	*err = error;
1586 	if (error) {
1587 		priv->np_na = NULL;
1588 		if (need_mem)
1589 			netmap_drop_memory_locked(priv);
1590 	}
1591 	if (nifp != NULL) {
1592 		/*
1593 		 * advertise that the interface is ready by setting np_nifp.
1594 		 * The barrier is needed because readers (poll and *SYNC)
1595 		 * check for priv->np_nifp != NULL without locking
1596 		 */
1597 		wmb(); /* make sure previous writes are visible to all CPUs */
1598 		priv->np_nifp = nifp;
1599 	}
1600 	return nifp;
1601 }
1602 
1603 
1604 
1605 /*
1606  * ioctl(2) support for the "netmap" device.
1607  *
1608  * Following a list of accepted commands:
1609  * - NIOCGINFO
1610  * - SIOCGIFADDR	just for convenience
1611  * - NIOCREGIF
1612  * - NIOCTXSYNC
1613  * - NIOCRXSYNC
1614  *
1615  * Return 0 on success, errno otherwise.
1616  */
1617 int
1618 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1619 	int fflag, struct thread *td)
1620 {
1621 	struct netmap_priv_d *priv = NULL;
1622 	struct ifnet *ifp = NULL;
1623 	struct nmreq *nmr = (struct nmreq *) data;
1624 	struct netmap_adapter *na = NULL;
1625 	int error;
1626 	u_int i, qfirst, qlast;
1627 	struct netmap_if *nifp;
1628 	struct netmap_kring *krings;
1629 
1630 	(void)dev;	/* UNUSED */
1631 	(void)fflag;	/* UNUSED */
1632 
1633 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1634 		/* truncate name */
1635 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1636 		if (nmr->nr_version != NETMAP_API) {
1637 			D("API mismatch for %s got %d need %d",
1638 				nmr->nr_name,
1639 				nmr->nr_version, NETMAP_API);
1640 			nmr->nr_version = NETMAP_API;
1641 		}
1642 		if (nmr->nr_version < NETMAP_MIN_API ||
1643 		    nmr->nr_version > NETMAP_MAX_API) {
1644 			return EINVAL;
1645 		}
1646 	}
1647 	CURVNET_SET(TD_TO_VNET(td));
1648 
1649 	error = devfs_get_cdevpriv((void **)&priv);
1650 	if (error) {
1651 		CURVNET_RESTORE();
1652 		/* XXX ENOENT should be impossible, since the priv
1653 		 * is now created in the open */
1654 		return (error == ENOENT ? ENXIO : error);
1655 	}
1656 
1657 	switch (cmd) {
1658 	case NIOCGINFO:		/* return capabilities etc */
1659 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1660 			error = netmap_bdg_ctl(nmr, NULL);
1661 			break;
1662 		}
1663 
1664 		NMG_LOCK();
1665 		do {
1666 			/* memsize is always valid */
1667 			struct netmap_mem_d *nmd = &nm_mem;
1668 			u_int memflags;
1669 
1670 			if (nmr->nr_name[0] != '\0') {
1671 				/* get a refcount */
1672 				error = netmap_get_na(nmr, &na, 1 /* create */);
1673 				if (error)
1674 					break;
1675 				nmd = na->nm_mem; /* get memory allocator */
1676 			}
1677 
1678 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
1679 				&nmr->nr_arg2);
1680 			if (error)
1681 				break;
1682 			if (na == NULL) /* only memory info */
1683 				break;
1684 			nmr->nr_offset = 0;
1685 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1686 			netmap_update_config(na);
1687 			nmr->nr_rx_rings = na->num_rx_rings;
1688 			nmr->nr_tx_rings = na->num_tx_rings;
1689 			nmr->nr_rx_slots = na->num_rx_desc;
1690 			nmr->nr_tx_slots = na->num_tx_desc;
1691 			netmap_adapter_put(na);
1692 		} while (0);
1693 		NMG_UNLOCK();
1694 		break;
1695 
1696 	case NIOCREGIF:
1697 		/* possibly attach/detach NIC and VALE switch */
1698 		i = nmr->nr_cmd;
1699 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1700 				|| i == NETMAP_BDG_VNET_HDR) {
1701 			error = netmap_bdg_ctl(nmr, NULL);
1702 			break;
1703 		} else if (i != 0) {
1704 			D("nr_cmd must be 0 not %d", i);
1705 			error = EINVAL;
1706 			break;
1707 		}
1708 
1709 		/* protect access to priv from concurrent NIOCREGIF */
1710 		NMG_LOCK();
1711 		do {
1712 			u_int memflags;
1713 
1714 			if (priv->np_na != NULL) {	/* thread already registered */
1715 				error = EBUSY;
1716 				break;
1717 			}
1718 			/* find the interface and a reference */
1719 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1720 			if (error)
1721 				break;
1722 			ifp = na->ifp;
1723 			if (NETMAP_OWNED_BY_KERN(na)) {
1724 				netmap_adapter_put(na);
1725 				error = EBUSY;
1726 				break;
1727 			}
1728 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
1729 			if (!nifp) {    /* reg. failed, release priv and ref */
1730 				netmap_adapter_put(na);
1731 				priv->np_nifp = NULL;
1732 				break;
1733 			}
1734 			priv->np_td = td; // XXX kqueue, debugging only
1735 
1736 			/* return the offset of the netmap_if object */
1737 			nmr->nr_rx_rings = na->num_rx_rings;
1738 			nmr->nr_tx_rings = na->num_tx_rings;
1739 			nmr->nr_rx_slots = na->num_rx_desc;
1740 			nmr->nr_tx_slots = na->num_tx_desc;
1741 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
1742 				&nmr->nr_arg2);
1743 			if (error) {
1744 				netmap_adapter_put(na);
1745 				break;
1746 			}
1747 			if (memflags & NETMAP_MEM_PRIVATE) {
1748 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1749 			}
1750 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
1751 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
1752 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
1753 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
1754 
1755 			if (nmr->nr_arg3) {
1756 				D("requested %d extra buffers", nmr->nr_arg3);
1757 				nmr->nr_arg3 = netmap_extra_alloc(na,
1758 					&nifp->ni_bufs_head, nmr->nr_arg3);
1759 				D("got %d extra buffers", nmr->nr_arg3);
1760 			}
1761 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1762 		} while (0);
1763 		NMG_UNLOCK();
1764 		break;
1765 
1766 	case NIOCTXSYNC:
1767 	case NIOCRXSYNC:
1768 		nifp = priv->np_nifp;
1769 
1770 		if (nifp == NULL) {
1771 			error = ENXIO;
1772 			break;
1773 		}
1774 		rmb(); /* make sure following reads are not from cache */
1775 
1776 		na = priv->np_na;      /* we have a reference */
1777 
1778 		if (na == NULL) {
1779 			D("Internal error: nifp != NULL && na == NULL");
1780 			error = ENXIO;
1781 			break;
1782 		}
1783 
1784 		ifp = na->ifp;
1785 		if (ifp == NULL) {
1786 			RD(1, "the ifp is gone");
1787 			error = ENXIO;
1788 			break;
1789 		}
1790 
1791 		if (cmd == NIOCTXSYNC) {
1792 			krings = na->tx_rings;
1793 			qfirst = priv->np_txqfirst;
1794 			qlast = priv->np_txqlast;
1795 		} else {
1796 			krings = na->rx_rings;
1797 			qfirst = priv->np_rxqfirst;
1798 			qlast = priv->np_rxqlast;
1799 		}
1800 
1801 		for (i = qfirst; i < qlast; i++) {
1802 			struct netmap_kring *kring = krings + i;
1803 			if (nm_kr_tryget(kring)) {
1804 				error = EBUSY;
1805 				goto out;
1806 			}
1807 			if (cmd == NIOCTXSYNC) {
1808 				if (netmap_verbose & NM_VERB_TXSYNC)
1809 					D("pre txsync ring %d cur %d hwcur %d",
1810 					    i, kring->ring->cur,
1811 					    kring->nr_hwcur);
1812 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1813 					netmap_ring_reinit(kring);
1814 				} else {
1815 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
1816 				}
1817 				if (netmap_verbose & NM_VERB_TXSYNC)
1818 					D("post txsync ring %d cur %d hwcur %d",
1819 					    i, kring->ring->cur,
1820 					    kring->nr_hwcur);
1821 			} else {
1822 				kring->nm_sync(kring, NAF_FORCE_READ);
1823 				microtime(&na->rx_rings[i].ring->ts);
1824 			}
1825 			nm_kr_put(kring);
1826 		}
1827 
1828 		break;
1829 
1830 #ifdef __FreeBSD__
1831 	case FIONBIO:
1832 	case FIOASYNC:
1833 		ND("FIONBIO/FIOASYNC are no-ops");
1834 		break;
1835 
1836 	case BIOCIMMEDIATE:
1837 	case BIOCGHDRCMPLT:
1838 	case BIOCSHDRCMPLT:
1839 	case BIOCSSEESENT:
1840 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1841 		break;
1842 
1843 	default:	/* allow device-specific ioctls */
1844 	    {
1845 		struct socket so;
1846 
1847 		bzero(&so, sizeof(so));
1848 		NMG_LOCK();
1849 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1850 		if (error) {
1851 			netmap_adapter_put(na);
1852 			NMG_UNLOCK();
1853 			break;
1854 		}
1855 		ifp = na->ifp;
1856 		so.so_vnet = ifp->if_vnet;
1857 		// so->so_proto not null.
1858 		error = ifioctl(&so, cmd, data, td);
1859 		netmap_adapter_put(na);
1860 		NMG_UNLOCK();
1861 		break;
1862 	    }
1863 
1864 #else /* linux */
1865 	default:
1866 		error = EOPNOTSUPP;
1867 #endif /* linux */
1868 	}
1869 out:
1870 
1871 	CURVNET_RESTORE();
1872 	return (error);
1873 }
1874 
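/*
 * A minimal userspace sketch (not compiled here) of the ioctl protocol
 * handled above: NIOCREGIF to bind an interface, mmap() of the shared
 * region, NIOCTXSYNC to flush the tx rings. Error handling is omitted
 * and "em0" is only an example interface name.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static void
example_regif_and_txsync(void)
{
	struct nmreq req;
	void *mem;
	int fd = open("/dev/netmap", O_RDWR);

	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	req.nr_flags = NR_REG_ALL_NIC;		/* bind all hw rings */
	ioctl(fd, NIOCREGIF, &req);		/* NIOCREGIF path above */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	/* struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset); */
	/* ... fill tx slots through nifp ... */
	ioctl(fd, NIOCTXSYNC, NULL);		/* NIOCTXSYNC path above */
}
#endif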
1875 
1876 /*
1877  * select(2) and poll(2) handlers for the "netmap" device.
1878  *
1879  * Can be called for one or more queues.
1880  * Return the event mask corresponding to ready events.
1881  * If there are no ready events, do a selrecord on either individual
1882  * selinfo or on the global one.
1883  * Device-dependent parts (locking and sync of tx/rx rings)
1884  * are done through callbacks.
1885  *
1886  * On Linux, the arguments are really 'pwait', the poll table, and 'td' is a struct file *.
1887  * The first argument is remapped to pwait because selrecord() uses the name as a
1888  * hidden argument.
1889  */
1890 int
1891 netmap_poll(struct cdev *dev, int events, struct thread *td)
1892 {
1893 	struct netmap_priv_d *priv = NULL;
1894 	struct netmap_adapter *na;
1895 	struct ifnet *ifp;
1896 	struct netmap_kring *kring;
1897 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1898 	struct mbq q;		/* packets from hw queues to host stack */
1899 	void *pwait = dev;	/* linux compatibility */
1900 	int is_kevent = 0;
1901 
1902 	/*
1903 	 * In order to avoid nested locks, we need to "double check"
1904 	 * txsync and rxsync if we decide to do a selrecord().
1905 	 * retry_tx (and retry_rx, later) prevent looping forever.
1906 	 */
1907 	int retry_tx = 1, retry_rx = 1;
1908 
1909 	(void)pwait;
1910 	mbq_init(&q);
1911 
1912 	/*
1913 	 * XXX kevent has curthread->td_fpop == NULL,
1914 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
1915 	 * priv as the first argument, which is also useful to avoid
1916 	 * the selrecord() calls, which are not necessary in that case.
1917 	 */
1918 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
1919 		is_kevent = 1;
1920 		if (netmap_verbose)
1921 			D("called from kevent");
1922 		priv = (struct netmap_priv_d *)dev;
1923 	}
1924 	if (priv == NULL)
1925 		return POLLERR;
1926 
1927 	if (priv->np_nifp == NULL) {
1928 		D("No if registered");
1929 		return POLLERR;
1930 	}
1931 	rmb(); /* make sure following reads are not from cache */
1932 
1933 	na = priv->np_na;
1934 	ifp = na->ifp;
1935 	// check for deleted
1936 	if (ifp == NULL) {
1937 		RD(1, "the ifp is gone");
1938 		return POLLERR;
1939 	}
1940 
1941 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1942 		return POLLERR;
1943 
1944 	if (netmap_verbose & 0x8000)
1945 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1946 	want_tx = events & (POLLOUT | POLLWRNORM);
1947 	want_rx = events & (POLLIN | POLLRDNORM);
1948 
1949 
1950 	/*
1951 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1952 	 * the file descriptor is bound to all of them. If so, we sleep on
1953 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1954 	 * (FreeBSD only allows two selinfo's per file descriptor).
1955 	 * The interrupt routine in the driver wakes one or the other
1956 	 * (or both) depending on which clients are active.
1957 	 *
1958 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1959 	 * txsync() is called if we run out of buffers on POLLOUT, or
1960 	 * there are pending packets to send. The latter can be disabled
1961 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
1962 	 */
1963 	check_all_tx = nm_tx_si_user(priv);
1964 	check_all_rx = nm_rx_si_user(priv);
1965 
1966 	/*
1967 	 * We start with a lock-free round, which is cheap if we have
1968 	 * slots available. If this fails, then lock and call the sync
1969 	 * routines.
1970 	 */
1971 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
1972 		kring = &na->rx_rings[i];
1973 		/* XXX compare ring->cur and kring->tail */
1974 		if (!nm_ring_empty(kring->ring)) {
1975 			revents |= want_rx;
1976 			want_rx = 0;	/* also breaks the loop */
1977 		}
1978 	}
1979 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
1980 		kring = &na->tx_rings[i];
1981 		/* XXX compare ring->cur and kring->tail */
1982 		if (!nm_ring_empty(kring->ring)) {
1983 			revents |= want_tx;
1984 			want_tx = 0;	/* also breaks the loop */
1985 		}
1986 	}
1987 
1988 	/*
1989 	 * If we want to push packets out (priv->np_txpoll) or
1990 	 * want_tx is still set, we must issue txsync calls
1991 	 * (on all rings, to avoid stalling the tx rings).
1992 	 * XXX should also check cur != hwcur on the tx rings.
1993 	 * Fortunately, normal tx mode has np_txpoll set.
1994 	 */
1995 	if (priv->np_txpoll || want_tx) {
1996 		/*
1997 		 * The first round checks if anyone is ready, if not
1998 		 * do a selrecord and another round to handle races.
1999 		 * want_tx goes to 0 if any space is found, and is
2000 		 * used to skip rings with no pending transmissions.
2001 		 */
2002 flush_tx:
2003 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2004 			int found = 0;
2005 
2006 			kring = &na->tx_rings[i];
2007 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2008 				continue;
2009 			/* only one thread does txsync */
2010 			if (nm_kr_tryget(kring)) {
2011 				if (netmap_verbose)
2012 					RD(2, "%p lost race on txring %d, ok",
2013 					    priv, i);
2014 				continue;
2015 			}
2016 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2017 				netmap_ring_reinit(kring);
2018 				revents |= POLLERR;
2019 			} else {
2020 				if (kring->nm_sync(kring, 0))
2021 					revents |= POLLERR;
2022 			}
2023 
2024 			/*
2025 			 * If we found new slots, notify potential
2026 			 * listeners on the same ring.
2027 			 * Since we just did a txsync, look at the copies
2028 			 * of cur,tail in the kring.
2029 			 */
2030 			found = kring->rcur != kring->rtail;
2031 			nm_kr_put(kring);
2032 			if (found) { /* notify other listeners */
2033 				revents |= want_tx;
2034 				want_tx = 0;
2035 				na->nm_notify(na, i, NR_TX, 0);
2036 			}
2037 		}
2038 		if (want_tx && retry_tx && !is_kevent) {
2039 			selrecord(td, check_all_tx ?
2040 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2041 			retry_tx = 0;
2042 			goto flush_tx;
2043 		}
2044 	}
2045 
2046 	/*
2047 	 * If want_rx is still set scan receive rings.
2048 	 * Do it on all rings because otherwise we starve.
2049 	 */
2050 	if (want_rx) {
2051 		int send_down = 0; /* transparent mode */
2052 		/* two rounds here for race avoidance */
2053 do_retry_rx:
2054 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2055 			int found = 0;
2056 
2057 			kring = &na->rx_rings[i];
2058 
2059 			if (nm_kr_tryget(kring)) {
2060 				if (netmap_verbose)
2061 					RD(2, "%p lost race on rxring %d, ok",
2062 					    priv, i);
2063 				continue;
2064 			}
2065 
2066 			/*
2067 			 * transparent mode support: collect packets
2068 			 * from the rxring(s).
2069 			 * XXX NR_FORWARD should only be read on
2070 			 * physical or NIC ports
2071 			 */
2072 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2073 				ND(10, "forwarding some buffers up %d to %d",
2074 				    kring->nr_hwcur, kring->ring->cur);
2075 				netmap_grab_packets(kring, &q, netmap_fwd);
2076 			}
2077 
2078 			if (kring->nm_sync(kring, 0))
2079 				revents |= POLLERR;
2080 			if (netmap_no_timestamp == 0 ||
2081 					kring->ring->flags & NR_TIMESTAMP) {
2082 				microtime(&kring->ring->ts);
2083 			}
2084 			/* after an rxsync we can use kring->rcur, rtail */
2085 			found = kring->rcur != kring->rtail;
2086 			nm_kr_put(kring);
2087 			if (found) {
2088 				revents |= want_rx;
2089 				retry_rx = 0;
2090 				na->nm_notify(na, i, NR_RX, 0);
2091 			}
2092 		}
2093 
2094 		/* transparent mode XXX only during first pass ? */
2095 		if (na->na_flags & NAF_HOST_RINGS) {
2096 			kring = &na->rx_rings[na->num_rx_rings];
2097 			if (check_all_rx
2098 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2099 				/* XXX fix to use kring fields */
2100 				if (nm_ring_empty(kring->ring))
2101 					send_down = netmap_rxsync_from_host(na, td, dev);
2102 				if (!nm_ring_empty(kring->ring))
2103 					revents |= want_rx;
2104 			}
2105 		}
2106 
2107 		if (retry_rx && !is_kevent)
2108 			selrecord(td, check_all_rx ?
2109 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2110 		if (send_down > 0 || retry_rx) {
2111 			retry_rx = 0;
2112 			if (send_down)
2113 				goto flush_tx; /* and retry_rx */
2114 			else
2115 				goto do_retry_rx;
2116 		}
2117 	}
2118 
2119 	/*
2120 	 * Transparent mode: marked bufs on rx rings between
2121 	 * kring->nr_hwcur and ring->head
2122 	 * are passed to the other endpoint.
2123 	 *
2124 	 * In this mode we also scan the sw rxring, which in
2125 	 * turn passes packets up.
2126 	 *
2127 	 * XXX Transparent mode at the moment requires binding all
2128 	 * rings to a single file descriptor.
2129 	 */
2130 
2131 	if (q.head)
2132 		netmap_send_up(na->ifp, &q);
2133 
2134 	return (revents);
2135 }
2136 
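/*
 * A minimal userspace sketch (not compiled here) of how the handler above
 * is reached: poll(2) on a netmap file descriptor with POLLIN follows the
 * want_rx path of netmap_poll(). "netmap_fd" is assumed to be a descriptor
 * already bound with NIOCREGIF.
 */
#if 0
#include <poll.h>

static void
example_wait_for_rx(int netmap_fd)
{
	struct pollfd pfd;

	pfd.fd = netmap_fd;
	pfd.events = POLLIN;		/* becomes want_rx in netmap_poll() */
	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
		/* at least one bound rx ring has new slots:
		 * consume them, then poll again (which also rxsyncs). */
	}
}
#endif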
2137 
2138 /*-------------------- driver support routines -------------------*/
2139 
2140 static int netmap_hw_krings_create(struct netmap_adapter *);
2141 
2142 static int
2143 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2144 	enum txrx tx, int flags)
2145 {
2146 	struct netmap_kring *kring;
2147 
2148 	if (tx == NR_TX) {
2149 		kring = na->tx_rings + n_ring;
2150 		OS_selwakeup(&kring->si, PI_NET);
2151 		if (na->tx_si_users > 0)
2152 			OS_selwakeup(&na->tx_si, PI_NET);
2153 	} else {
2154 		kring = na->rx_rings + n_ring;
2155 		OS_selwakeup(&kring->si, PI_NET);
2156 		if (na->rx_si_users > 0)
2157 			OS_selwakeup(&na->rx_si, PI_NET);
2158 	}
2159 	return 0;
2160 }
2161 
2162 
2163 // XXX check handling of failures
2164 int
2165 netmap_attach_common(struct netmap_adapter *na)
2166 {
2167 	struct ifnet *ifp = na->ifp;
2168 
2169 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2170 		D("%s: invalid rings tx %d rx %d",
2171 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2172 		return EINVAL;
2173 	}
2174 	WNA(ifp) = na;
2175 
2176 	/* the following is only needed for adapters that use the host port.
2177 	 * XXX do we have something similar for linux ?
2178 	 */
2179 #ifdef __FreeBSD__
2180 	na->if_input = ifp->if_input; /* for netmap_send_up */
2181 #endif /* __FreeBSD__ */
2182 
2183 	NETMAP_SET_CAPABLE(ifp);
2184 	if (na->nm_krings_create == NULL) {
2185 		na->nm_krings_create = netmap_hw_krings_create;
2186 		na->nm_krings_delete = netmap_hw_krings_delete;
2187 	}
2188 	if (na->nm_notify == NULL)
2189 		na->nm_notify = netmap_notify;
2190 	na->active_fds = 0;
2191 
2192 	if (na->nm_mem == NULL)
2193 		na->nm_mem = &nm_mem;
2194 	return 0;
2195 }
2196 
2197 
2198 void
2199 netmap_detach_common(struct netmap_adapter *na)
2200 {
2201 	if (na->ifp)
2202 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2203 
2204 	if (na->tx_rings) { /* XXX should not happen */
2205 		D("freeing leftover tx_rings");
2206 		na->nm_krings_delete(na);
2207 	}
2208 	netmap_pipe_dealloc(na);
2209 	if (na->na_flags & NAF_MEM_OWNER)
2210 		netmap_mem_private_delete(na->nm_mem);
2211 	bzero(na, sizeof(*na));
2212 	free(na, M_DEVBUF);
2213 }
2214 
2215 
2216 /*
2217  * Initialize a ``netmap_adapter`` object created by driver on attach.
2218  * We allocate a block of memory with room for a struct netmap_adapter
2219  * plus two sets of N+2 struct netmap_kring (where N is the number
2220  * of hardware rings):
2221  * krings	0..N-1	are for the hardware queues.
2222  * kring	N	is for the host stack queue
2223  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2224  * Return 0 on success, ENOMEM otherwise.
2225  */
2226 int
2227 netmap_attach(struct netmap_adapter *arg)
2228 {
2229 	struct netmap_hw_adapter *hwna = NULL;
2230 	// XXX when is arg == NULL ?
2231 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2232 
2233 	if (arg == NULL || ifp == NULL)
2234 		goto fail;
2235 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2236 	if (hwna == NULL)
2237 		goto fail;
2238 	hwna->up = *arg;
2239 	hwna->up.na_flags |= NAF_HOST_RINGS;
2240 	if (netmap_attach_common(&hwna->up)) {
2241 		free(hwna, M_DEVBUF);
2242 		goto fail;
2243 	}
2244 	netmap_adapter_get(&hwna->up);
2245 
2246 #ifdef linux
2247 	if (ifp->netdev_ops) {
2248 		/* prepare a clone of the netdev ops */
2249 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2250 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2251 #else
2252 		hwna->nm_ndo = *ifp->netdev_ops;
2253 #endif
2254 	}
2255 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2256 #endif /* linux */
2257 
2258 	D("success for %s", NM_IFPNAME(ifp));
2259 	return 0;
2260 
2261 fail:
2262 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2263 	netmap_detach(ifp);
2264 	return (hwna ? EINVAL : ENOMEM);
2265 }
2266 
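/*
 * A sketch (not compiled here) of the typical driver-side use of
 * netmap_attach(): the NIC attach routine fills a netmap_adapter on the
 * stack and registers it. The "foo_*" names are hypothetical placeholders
 * for driver-specific callbacks and softc fields; real examples live in
 * dev/netmap/if_*_netmap.h.
 */
#if 0
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_txsync = foo_netmap_txsync;	/* per-ring tx sync */
	na.nm_rxsync = foo_netmap_rxsync;	/* per-ring rx sync */
	na.nm_register = foo_netmap_reg;	/* enter/exit netmap mode */
	netmap_attach(&na);			/* copies na, adds host rings */
}
#endif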
2267 
2268 void
2269 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2270 {
2271 	if (!na) {
2272 		return;
2273 	}
2274 
2275 	refcount_acquire(&na->na_refcount);
2276 }
2277 
2278 
2279 /* returns 1 iff the netmap_adapter is destroyed */
2280 int
2281 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2282 {
2283 	if (!na)
2284 		return 1;
2285 
2286 	if (!refcount_release(&na->na_refcount))
2287 		return 0;
2288 
2289 	if (na->nm_dtor)
2290 		na->nm_dtor(na);
2291 
2292 	netmap_detach_common(na);
2293 
2294 	return 1;
2295 }
2296 
2297 int
2298 netmap_hw_krings_create(struct netmap_adapter *na)
2299 {
2300 	int ret = netmap_krings_create(na, 0);
2301 	if (ret == 0) {
2302 		/* initialize the mbq for the sw rx ring */
2303 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2304 		ND("initialized sw rx queue %d", na->num_rx_rings);
2305 	}
2306 	return ret;
2307 }
2308 
2309 
2310 
2311 /*
2312  * Free the allocated memory linked to the given ``netmap_adapter``
2313  * object.
2314  */
2315 void
2316 netmap_detach(struct ifnet *ifp)
2317 {
2318 	struct netmap_adapter *na = NA(ifp);
2319 
2320 	if (!na)
2321 		return;
2322 
2323 	NMG_LOCK();
2324 	netmap_disable_all_rings(ifp);
2325 	if (!netmap_adapter_put(na)) {
2326 		/* someone is still using the adapter,
2327 		 * tell them that the interface is gone
2328 		 */
2329 		na->ifp = NULL;
2330 		/* give them a chance to notice */
2331 		netmap_enable_all_rings(ifp);
2332 	}
2333 	NMG_UNLOCK();
2334 }
2335 
2336 
2337 /*
2338  * Intercept packets from the network stack and pass them
2339  * to netmap as incoming packets on the 'software' ring.
2340  *
2341  * We only store packets in a bounded mbq and then copy them
2342  * in the relevant rxsync routine.
2343  *
2344  * We rely on the OS to make sure that the ifp and na do not go
2345  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2346  * In nm_register() or whenever there is a reinitialization,
2347  * we make sure to make the mode change visible here.
2348  */
2349 int
2350 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2351 {
2352 	struct netmap_adapter *na = NA(ifp);
2353 	struct netmap_kring *kring;
2354 	u_int len = MBUF_LEN(m);
2355 	u_int error = ENOBUFS;
2356 	struct mbq *q;
2357 	int space;
2358 
2359 	// XXX [Linux] we do not need this lock
2360 	// if we follow the down/configure/up protocol -gl
2361 	// mtx_lock(&na->core_lock);
2362 
2363 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2364 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2365 		error = ENXIO;
2366 		goto done;
2367 	}
2368 
2369 	kring = &na->rx_rings[na->num_rx_rings];
2370 	q = &kring->rx_queue;
2371 
2372 	// XXX reconsider long packets if we handle fragments
2373 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2374 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2375 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2376 		goto done;
2377 	}
2378 
2379 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2380 	 * and maybe other instances of netmap_transmit (the latter
2381 	 * not possible on Linux).
2382 	 * Also avoid overflowing the queue.
2383 	 */
2384 	mtx_lock(&q->lock);
2385 
2386 	space = kring->nr_hwtail - kring->nr_hwcur;
2387 	if (space < 0)
2388 		space += kring->nkr_num_slots;
2389 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2390 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2391 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2392 			len, m);
2393 	} else {
2394 		mbq_enqueue(q, m);
2395 		ND(10, "%s %d bufs in queue len %d m %p",
2396 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2397 		/* notify outside the lock */
2398 		m = NULL;
2399 		error = 0;
2400 	}
2401 	mtx_unlock(&q->lock);
2402 
2403 done:
2404 	if (m)
2405 		m_freem(m);
2406 	/* unconditionally wake up listeners */
2407 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2408 
2409 	return (error);
2410 }
2411 
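/*
 * A sketch (not compiled here) of how the host stack reaches
 * netmap_transmit() on FreeBSD: while the NIC is in native netmap mode the
 * glue code swaps the interface transmit method, with the original pointer
 * assumed here to be saved in na->if_transmit so it can be restored when
 * netmap mode is turned off.
 */
#if 0
static void
example_intercept_host_tx(struct netmap_adapter *na, int onoff)
{
	struct ifnet *ifp = na->ifp;

	if (onoff) {
		na->if_transmit = ifp->if_transmit;	/* save the native method */
		ifp->if_transmit = netmap_transmit;	/* mbufs -> host rx ring */
	} else {
		ifp->if_transmit = na->if_transmit;	/* hand back to the driver */
	}
}
#endif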
2412 
2413 /*
2414  * netmap_reset() is called by the driver routines when reinitializing
2415  * a ring. The driver is in charge of locking to protect the kring.
2416  * If native netmap mode is not set just return NULL.
2417  */
2418 struct netmap_slot *
2419 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2420 	u_int new_cur)
2421 {
2422 	struct netmap_kring *kring;
2423 	int new_hwofs, lim;
2424 
2425 	if (na == NULL) {
2426 		D("NULL na, should not happen");
2427 		return NULL;	/* no netmap support here */
2428 	}
2429 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2430 		ND("interface not in netmap mode");
2431 		return NULL;	/* nothing to reinitialize */
2432 	}
2433 
2434 	/* XXX note- in the new scheme, we are not guaranteed to be
2435 	 * under lock (e.g. when called on a device reset).
2436 	 * In this case, we should set a flag and not trust the
2437 	 * values too much. In practice: TODO
2438 	 * - set a RESET flag somewhere in the kring
2439 	 * - do the processing in a conservative way
2440 	 * - let the *sync() fixup at the end.
2441 	 */
2442 	if (tx == NR_TX) {
2443 		if (n >= na->num_tx_rings)
2444 			return NULL;
2445 		kring = na->tx_rings + n;
2446 		// XXX check whether we should use hwcur or rcur
2447 		new_hwofs = kring->nr_hwcur - new_cur;
2448 	} else {
2449 		if (n >= na->num_rx_rings)
2450 			return NULL;
2451 		kring = na->rx_rings + n;
2452 		new_hwofs = kring->nr_hwtail - new_cur;
2453 	}
2454 	lim = kring->nkr_num_slots - 1;
2455 	if (new_hwofs > lim)
2456 		new_hwofs -= lim + 1;
2457 
2458 	/* Always set the new offset value and realign the ring. */
2459 	if (netmap_verbose)
2460 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2461 		NM_IFPNAME(na->ifp),
2462 		tx == NR_TX ? "TX" : "RX", n,
2463 		kring->nkr_hwofs, new_hwofs,
2464 		kring->nr_hwtail,
2465 		tx == NR_TX ? lim : kring->nr_hwtail);
2466 	kring->nkr_hwofs = new_hwofs;
2467 	if (tx == NR_TX) {
2468 		kring->nr_hwtail = kring->nr_hwcur + lim;
2469 		if (kring->nr_hwtail > lim)
2470 			kring->nr_hwtail -= lim + 1;
2471 	}
2472 
2473 #if 0 // def linux
2474 	/* XXX check that the mappings are correct */
2475 	/* need ring_nr, adapter->pdev, direction */
2476 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2477 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2478 		D("error mapping rx netmap buffer %d", i);
2479 		// XXX fix error handling
2480 	}
2481 
2482 #endif /* linux */
2483 	/*
2484 	 * Wakeup on the individual and global selwait
2485 	 * We do the wakeup here, but the ring is not yet reconfigured.
2486 	 * However, we are under lock so there are no races.
2487 	 */
2488 	na->nm_notify(na, n, tx, 0);
2489 	return kring->ring->slot;
2490 }
2491 
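/*
 * A sketch (not compiled here) of a driver calling netmap_reset() while
 * re-initializing a tx ring. The "foo_*" names are hypothetical; the point
 * is that a non-NULL return means the ring is in netmap mode and the hw
 * descriptors must point at netmap buffers.
 */
#if 0
static void
foo_init_tx_ring(struct foo_softc *sc, u_int ring_nr)
{
	struct netmap_adapter *na = NA(sc->ifp);
	struct netmap_slot *slot;
	u_int i;

	slot = netmap_reset(na, NR_TX, ring_nr, 0);
	if (slot == NULL)
		return;			/* not in native netmap mode */
	for (i = 0; i < na->num_tx_desc; i++) {
		/* translate the netmap index into the kring index and
		 * point hw descriptor i at the corresponding buffer
		 * (foo_load_tx_desc is a hypothetical helper). */
		u_int si = netmap_idx_n2k(&na->tx_rings[ring_nr], i);
		foo_load_tx_desc(sc, ring_nr, i, &slot[si]);
	}
}
#endif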
2492 
2493 /*
2494  * Dispatch rx/tx interrupts to the netmap rings.
2495  *
2496  * "work_done" is non-null on the RX path, NULL for the TX path.
2497  * We rely on the OS to make sure that there is only one active
2498  * instance per queue, and that there is appropriate locking.
2499  *
2500  * The 'notify' routine depends on what the ring is attached to.
2501  * - for a netmap file descriptor, do a selwakeup on the individual
2502  *   waitqueue, plus one on the global one if needed
2503  * - for a switch, call the proper forwarding routine
2504  * - XXX more ?
2505  */
2506 void
2507 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2508 {
2509 	struct netmap_adapter *na = NA(ifp);
2510 	struct netmap_kring *kring;
2511 
2512 	q &= NETMAP_RING_MASK;
2513 
2514 	if (netmap_verbose) {
2515 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2516 	}
2517 
2518 	if (work_done) { /* RX path */
2519 		if (q >= na->num_rx_rings)
2520 			return;	// not a physical queue
2521 		kring = na->rx_rings + q;
2522 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2523 		na->nm_notify(na, q, NR_RX, 0);
2524 		*work_done = 1; /* do not fire napi again */
2525 	} else { /* TX path */
2526 		if (q >= na->num_tx_rings)
2527 			return;	// not a physical queue
2528 		kring = na->tx_rings + q;
2529 		na->nm_notify(na, q, NR_TX, 0);
2530 	}
2531 }
2532 
2533 
2534 /*
2535  * Default functions to handle rx/tx interrupts from a physical device.
2536  * "work_done" is non-null on the RX path, NULL for the TX path.
2537  *
2538  * If the card is not in netmap mode, simply return 0,
2539  * so that the caller proceeds with regular processing.
2540  * Otherwise call netmap_common_irq() and return 1.
2541  *
2542  * If the card is connected to a netmap file descriptor,
2543  * do a selwakeup on the individual queue, plus one on the global one
2544  * if needed (multiqueue card _and_ there are multiqueue listeners),
2545  * and return 1.
2546  *
2547  * Finally, if called on rx from an interface connected to a switch,
2548  * calls the proper forwarding routine, and return 1.
2549  */
2550 int
2551 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2552 {
2553 	// XXX could we check NAF_NATIVE_ON ?
2554 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2555 		return 0;
2556 
2557 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2558 		ND("use regular interrupt");
2559 		return 0;
2560 	}
2561 
2562 	netmap_common_irq(ifp, q, work_done);
2563 	return 1;
2564 }
2565 
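/*
 * A sketch (not compiled here) of the driver side of the functions above:
 * per-queue interrupt handlers call netmap_rx_irq() (or its tx-side
 * wrapper, which passes a NULL work_done) first, and skip the normal
 * datapath when netmap owns the ring. The "foo_*" structure and its
 * fields are hypothetical.
 */
#if 0
static void
foo_msix_rx(void *arg)
{
	struct foo_rx_queue *rxq = arg;	/* hypothetical per-queue state */
	u_int work_done;

	if (netmap_rx_irq(rxq->sc->ifp, rxq->me, &work_done))
		return;			/* handled by netmap */
	/* ... regular mbuf-based rx processing ... */
}
#endif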
2566 
2567 /*
2568  * Module loader and unloader
2569  *
2570  * netmap_init() creates the /dev/netmap device and initializes
2571  * all global variables. Returns 0 on success, errno on failure
2572  * (but there is no chance)
2573  * (though in practice failure is not expected).
2574  * netmap_fini() destroys everything.
2575  */
2576 
2577 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2578 extern struct cdevsw netmap_cdevsw;
2579 
2580 
2581 void
2582 netmap_fini(void)
2583 {
2584 	// XXX destroy_bridges() ?
2585 	if (netmap_dev)
2586 		destroy_dev(netmap_dev);
2587 	netmap_mem_fini();
2588 	NMG_LOCK_DESTROY();
2589 	printf("netmap: unloaded module.\n");
2590 }
2591 
2592 
2593 int
2594 netmap_init(void)
2595 {
2596 	int error;
2597 
2598 	NMG_LOCK_INIT();
2599 
2600 	error = netmap_mem_init();
2601 	if (error != 0)
2602 		goto fail;
2603 	/* XXX could use make_dev_credv() to get error number */
2604 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2605 			      "netmap");
2606 	if (!netmap_dev)
2607 		goto fail;
2608 
2609 	netmap_init_bridges();
2610 	printf("netmap: loaded module\n");
2611 	return (0);
2612 fail:
2613 	netmap_fini();
2614 	return (EINVAL); /* may be incorrect */
2615 }
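/*
 * A sketch (not compiled here) of how netmap_init()/netmap_fini() are
 * wired into the FreeBSD module framework; glue code of this kind lives
 * in netmap_freebsd.c.
 */
#if 0
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;
	case MOD_UNLOAD:
		netmap_fini();
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

DEV_MODULE(netmap, netmap_loader, NULL);
#endif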
2616