xref: /freebsd/sys/dev/netmap/netmap.c (revision b1f9167f94059fd55c630891d359bcff987bd7eb)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool, allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() calls on /dev/netmap, to create
40  *    a select()able file descriptor on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
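 * A minimal userspace sketch of steps 1-6 above (error handling omitted;
 * "em0" is just an example interface name, see netmap(4) and
 * <net/netmap_user.h> for the exact API):
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);			// step 2
 *	void *mem = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);	// step 4
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);				// step 6: wait for room
 *	// ... fill txr->slot[], advance txr->cur and txr->head (step 4) ...
 *	ioctl(fd, NIOCTXSYNC, NULL);			// step 5: push packets out
 *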
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this; it only guarantees that invalid usage will not crash
66 the system.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting against multiple active senders to the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring or deleting a port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
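
A rough sketch of the sequence just described (reserve_slots(),
copy_packets() and update_ring() are illustrative names, not the
actual helpers used by the VALE code; the lock is the ring spinlock
mentioned above):

	mtx_lock(&kring->q_lock);	// reserve slots on the dest ring
	lease = reserve_slots(kring, n);
	mtx_unlock(&kring->q_lock);
	copy_packets(src, lease, n);	// no lock held during the copy
	mtx_lock(&kring->q_lock);
	update_ring(kring, lease);	// make the new slots visible
	mtx_unlock(&kring->q_lock);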
124 
125  */
126 
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/filio.h>	/* FIONBIO */
141 #include <sys/sockio.h>
142 #include <sys/socketvar.h>	/* struct socket */
143 #include <sys/malloc.h>
144 #include <sys/poll.h>
145 #include <sys/rwlock.h>
146 #include <sys/socket.h> /* sockaddrs */
147 #include <sys/selinfo.h>
148 #include <sys/sysctl.h>
149 #include <sys/jail.h>
150 #include <net/vnet.h>
151 #include <net/if.h>
152 #include <net/if_var.h>
153 #include <net/bpf.h>		/* BIOCIMMEDIATE */
154 #include <machine/bus.h>	/* bus_dmamap_* */
155 #include <sys/endian.h>
156 #include <sys/refcount.h>
157 
158 
159 /* reduce conditional code */
160 // linux API, used for the knlist in FreeBSD
161 #define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)
162 
163 void freebsd_selwakeup(struct selinfo *si, int pri);
164 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
165 
166 #elif defined(linux)
167 
168 #include "bsd_glue.h"
169 
170 
171 
172 #elif defined(__APPLE__)
173 
174 #warning OSX support is only partial
175 #include "osx_glue.h"
176 
177 #else
178 
179 #error	Unsupported platform
180 
181 #endif /* unsupported */
182 
183 /*
184  * common headers
185  */
186 #include <net/netmap.h>
187 #include <dev/netmap/netmap_kern.h>
188 #include <dev/netmap/netmap_mem2.h>
189 
190 
191 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
192 
193 /*
194  * The following variables are used by the drivers and replicate
195  * fields in the global memory pool. They only refer to buffers
196  * used by physical interfaces.
197  */
198 u_int netmap_total_buffers;
199 u_int netmap_buf_size;
200 char *netmap_buffer_base;	/* also address of an invalid buffer */
201 
202 /* user-controlled variables */
203 int netmap_verbose;
204 
205 static int netmap_no_timestamp; /* don't timestamp on rxsync */
206 
207 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
208 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
209     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
210 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
211     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
212 int netmap_mitigate = 1;
213 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
214 int netmap_no_pendintr = 1;
215 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
216     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
217 int netmap_txsync_retry = 2;
218 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
219     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
220 
221 int netmap_flags = 0;	/* debug flags */
222 int netmap_fwd = 0;	/* force transparent mode */
223 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
224 
225 /*
226  * netmap_admode selects the netmap mode to use.
227  * Invalid values are reset to NETMAP_ADMODE_BEST
228  */
229 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
230 	NETMAP_ADMODE_NATIVE,	/* either native or none */
231 	NETMAP_ADMODE_GENERIC,	/* force generic */
232 	NETMAP_ADMODE_LAST };
233 static int netmap_admode = NETMAP_ADMODE_BEST;
234 
235 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
236 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
237 int netmap_generic_rings = 1;   /* number of queues in generic. */
238 
239 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
240 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
241 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
242 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
243 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
244 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
245 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
246 
247 NMG_LOCK_T	netmap_global_lock;
248 
249 
250 static void
251 nm_kr_get(struct netmap_kring *kr)
252 {
253 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
254 		tsleep(kr, 0, "NM_KR_GET", 4);
255 }
256 
257 
258 /*
259  * mark the ring as stopped, and run through the locks
260  * to make sure other users get to see it.
261  */
262 void
263 netmap_disable_ring(struct netmap_kring *kr)
264 {
265 	kr->nkr_stopped = 1;
266 	nm_kr_get(kr);
267 	mtx_lock(&kr->q_lock);
268 	mtx_unlock(&kr->q_lock);
269 	nm_kr_put(kr);
270 }
271 
272 
273 /* stop or enable all the rings of na */
274 static void
275 netmap_set_all_rings(struct ifnet *ifp, int stopped)
276 {
277 	struct netmap_adapter *na;
278 	int i;
279 	u_int ntx, nrx;
280 
281 	if (!(ifp->if_capenable & IFCAP_NETMAP))
282 		return;
283 
284 	na = NA(ifp);
285 
286 	ntx = netmap_real_tx_rings(na);
287 	nrx = netmap_real_rx_rings(na);
288 
289 	for (i = 0; i < ntx; i++) {
290 		if (stopped)
291 			netmap_disable_ring(na->tx_rings + i);
292 		else
293 			na->tx_rings[i].nkr_stopped = 0;
294 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
295 	}
296 
297 	for (i = 0; i < nrx; i++) {
298 		if (stopped)
299 			netmap_disable_ring(na->rx_rings + i);
300 		else
301 			na->rx_rings[i].nkr_stopped = 0;
302 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
303 	}
304 }
305 
306 
307 /*
308  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
309  * to finish and prevents any new one from starting.  Call this before turning
310  * netmap mode off, or before removing the hardware rings (e.g., on module
311  * unload).  As a rule of thumb for linux drivers, this should be placed near
312  * each napi_disable().
313  */
314 void
315 netmap_disable_all_rings(struct ifnet *ifp)
316 {
317 	netmap_set_all_rings(ifp, 1 /* stopped */);
318 }
319 
320 
321 /*
322  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
323  * adapter's rings.  In linux drivers, this should be placed near each
324  * napi_enable().
325  */
326 void
327 netmap_enable_all_rings(struct ifnet *ifp)
328 {
329 	netmap_set_all_rings(ifp, 0 /* enabled */);
330 }
331 
332 
333 /*
334  * generic bounds-checking function
335  */
336 u_int
337 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
338 {
339 	u_int oldv = *v;
340 	const char *op = NULL;
341 
342 	if (dflt < lo)
343 		dflt = lo;
344 	if (dflt > hi)
345 		dflt = hi;
346 	if (oldv < lo) {
347 		*v = dflt;
348 		op = "Bump";
349 	} else if (oldv > hi) {
350 		*v = hi;
351 		op = "Clamp";
352 	}
353 	if (op && msg)
354 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
355 	return *v;
356 }
357 
358 
359 /*
360  * packet-dump function, into a user-supplied or static buffer.
361  * The destination buffer must be at least 30+4*len bytes.
362  */
363 const char *
364 nm_dump_buf(char *p, int len, int lim, char *dst)
365 {
366 	static char _dst[8192];
367 	int i, j, i0;
368 	static char hex[] ="0123456789abcdef";
369 	char *o;	/* output position */
370 
371 #define P_HI(x)	hex[((x) & 0xf0)>>4]
372 #define P_LO(x)	hex[((x) & 0xf)]
373 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
374 	if (!dst)
375 		dst = _dst;
376 	if (lim <= 0 || lim > len)
377 		lim = len;
378 	o = dst;
379 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
380 	o += strlen(o);
381 	/* hexdump routine */
382 	for (i = 0; i < lim; ) {
383 		sprintf(o, "%5d: ", i);
384 		o += strlen(o);
385 		memset(o, ' ', 48);
386 		i0 = i;
387 		for (j=0; j < 16 && i < lim; i++, j++) {
388 			o[j*3] = P_HI(p[i]);
389 			o[j*3+1] = P_LO(p[i]);
390 		}
391 		i = i0;
392 		for (j=0; j < 16 && i < lim; i++, j++)
393 			o[j + 48] = P_C(p[i]);
394 		o[j+48] = '\n';
395 		o += j+49;
396 	}
397 	*o = '\0';
398 #undef P_HI
399 #undef P_LO
400 #undef P_C
401 	return dst;
402 }
403 
404 
405 /*
406  * Fetch configuration from the device, to cope with dynamic
407  * reconfigurations after loading the module.
408  */
409 /* call with NMG_LOCK held */
410 int
411 netmap_update_config(struct netmap_adapter *na)
412 {
413 	struct ifnet *ifp = na->ifp;
414 	u_int txr, txd, rxr, rxd;
415 
416 	txr = txd = rxr = rxd = 0;
417 	if (na->nm_config) {
418 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
419 	} else {
420 		/* take whatever we had at init time */
421 		txr = na->num_tx_rings;
422 		txd = na->num_tx_desc;
423 		rxr = na->num_rx_rings;
424 		rxd = na->num_rx_desc;
425 	}
426 
427 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
428 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
429 		return 0; /* nothing changed */
430 	if (netmap_verbose || na->active_fds > 0) {
431 		D("stored config %s: txring %d x %d, rxring %d x %d",
432 			NM_IFPNAME(ifp),
433 			na->num_tx_rings, na->num_tx_desc,
434 			na->num_rx_rings, na->num_rx_desc);
435 		D("new config %s: txring %d x %d, rxring %d x %d",
436 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
437 	}
438 	if (na->active_fds == 0) {
439 		D("configuration changed (but fine)");
440 		na->num_tx_rings = txr;
441 		na->num_tx_desc = txd;
442 		na->num_rx_rings = rxr;
443 		na->num_rx_desc = rxd;
444 		return 0;
445 	}
446 	D("configuration changed while active, this is bad...");
447 	return 1;
448 }
449 
450 static int
451 netmap_txsync_compat(struct netmap_kring *kring, int flags)
452 {
453 	struct netmap_adapter *na = kring->na;
454 	return na->nm_txsync(na, kring->ring_id, flags);
455 }
456 
457 static int
458 netmap_rxsync_compat(struct netmap_kring *kring, int flags)
459 {
460 	struct netmap_adapter *na = kring->na;
461 	return na->nm_rxsync(na, kring->ring_id, flags);
462 }
463 
464 /* kring->nm_sync callback for the host tx ring */
465 static int
466 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
467 {
468 	(void)flags; /* unused */
469 	netmap_txsync_to_host(kring->na);
470 	return 0;
471 }
472 
473 /* kring->nm_sync callback for the host rx ring */
474 static int
475 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
476 {
477 	(void)flags; /* unused */
478 	netmap_rxsync_from_host(kring->na, NULL, NULL);
479 	return 0;
480 }
481 
482 
483 
484 /* create the krings array and initialize the fields common to all adapters.
485  * The array layout is this:
486  *
487  *                    +----------+
488  * na->tx_rings ----->|          | \
489  *                    |          |  } na->num_tx_rings
490  *                    |          | /
491  *                    +----------+
492  *                    |          |    host tx kring
493  * na->rx_rings ----> +----------+
494  *                    |          | \
495  *                    |          |  } na->num_rx_rings
496  *                    |          | /
497  *                    +----------+
498  *                    |          |    host rx kring
499  *                    +----------+
500  * na->tailroom ----->|          | \
501  *                    |          |  } tailroom bytes
502  *                    |          | /
503  *                    +----------+
504  *
505  * Note: for compatibility, host krings are created even when not needed.
506  * The tailroom space is currently used by vale ports for allocating leases.
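506  *
506  * For example, with the layout above the host rings are reached as
506  * na->tx_rings[na->num_tx_rings] and na->rx_rings[na->num_rx_rings],
506  * and the tailroom area starts right after the host rx kring, i.e. at
506  * na->rx_rings + na->num_rx_rings + 1 (which is what na->tailroom is
506  * set to below).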
507  */
508 /* call with NMG_LOCK held */
509 int
510 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
511 {
512 	u_int i, len, ndesc;
513 	struct netmap_kring *kring;
514 	u_int ntx, nrx;
515 
516 	/* account for the (possibly fake) host rings */
517 	ntx = na->num_tx_rings + 1;
518 	nrx = na->num_rx_rings + 1;
519 
520 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
521 
522 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
523 	if (na->tx_rings == NULL) {
524 		D("Cannot allocate krings");
525 		return ENOMEM;
526 	}
527 	na->rx_rings = na->tx_rings + ntx;
528 
529 	/*
530 	 * All fields in krings are 0 except the ones initialized below,
531 	 * but it is better to be explicit about the important kring fields.
532 	 */
533 	ndesc = na->num_tx_desc;
534 	for (i = 0; i < ntx; i++) { /* Transmit rings */
535 		kring = &na->tx_rings[i];
536 		bzero(kring, sizeof(*kring));
537 		kring->na = na;
538 		kring->ring_id = i;
539 		kring->nkr_num_slots = ndesc;
540 		if (i < na->num_tx_rings) {
541 			kring->nm_sync = netmap_txsync_compat; // XXX
542 		} else if (i == na->num_tx_rings) {
543 			kring->nm_sync = netmap_txsync_to_host_compat;
544 		}
545 		/*
546 		 * IMPORTANT: Always keep one slot empty.
547 		 */
548 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
549 		kring->rtail = kring->nr_hwtail = ndesc - 1;
550 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
551 		ND("ktx %s h %d c %d t %d",
552 			kring->name, kring->rhead, kring->rcur, kring->rtail);
553 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
554 		init_waitqueue_head(&kring->si);
555 	}
556 
557 	ndesc = na->num_rx_desc;
558 	for (i = 0; i < nrx; i++) { /* Receive rings */
559 		kring = &na->rx_rings[i];
560 		bzero(kring, sizeof(*kring));
561 		kring->na = na;
562 		kring->ring_id = i;
563 		kring->nkr_num_slots = ndesc;
564 		if (i < na->num_rx_rings) {
565 			kring->nm_sync = netmap_rxsync_compat; // XXX
566 		} else if (i == na->num_rx_rings) {
567 			kring->nm_sync = netmap_rxsync_from_host_compat;
568 		}
569 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
570 		kring->rtail = kring->nr_hwtail = 0;
571 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
572 		ND("krx %s h %d c %d t %d",
573 			kring->name, kring->rhead, kring->rcur, kring->rtail);
574 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
575 		init_waitqueue_head(&kring->si);
576 	}
577 	init_waitqueue_head(&na->tx_si);
578 	init_waitqueue_head(&na->rx_si);
579 
580 	na->tailroom = na->rx_rings + nrx;
581 
582 	return 0;
583 }
584 
585 
586 /* undo the actions performed by netmap_krings_create */
587 /* call with NMG_LOCK held */
588 void
589 netmap_krings_delete(struct netmap_adapter *na)
590 {
591 	struct netmap_kring *kring = na->tx_rings;
592 
593 	/* we rely on the krings layout described above */
594 	for ( ; kring != na->tailroom; kring++) {
595 		mtx_destroy(&kring->q_lock);
596 	}
597 	free(na->tx_rings, M_DEVBUF);
598 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
599 }
600 
601 
602 /*
603  * Destructor for NIC ports. They also have an mbuf queue
604  * on the rings connected to the host, so we need to purge
605  * it first.
606  */
607 /* call with NMG_LOCK held */
608 static void
609 netmap_hw_krings_delete(struct netmap_adapter *na)
610 {
611 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
612 
613 	ND("destroy sw mbq with len %d", mbq_len(q));
614 	mbq_purge(q);
615 	mbq_safe_destroy(q);
616 	netmap_krings_delete(na);
617 }
618 
619 
620 /* create a new netmap_if for a newly registered fd.
621  * If this is the first registration of the adapter,
622  * also create the netmap rings and their in-kernel view,
623  * the netmap krings.
624  */
625 /* call with NMG_LOCK held */
626 static struct netmap_if*
627 netmap_if_new(const char *ifname, struct netmap_adapter *na)
628 {
629 	struct netmap_if *nifp;
630 
631 	if (netmap_update_config(na)) {
632 		/* configuration mismatch, report and fail */
633 		return NULL;
634 	}
635 
636 	if (na->active_fds)	/* already registered */
637 		goto final;
638 
639 	/* create and init the krings arrays.
640 	 * Depending on the adapter, this may also create
641 	 * the netmap rings themselves
642 	 */
643 	if (na->nm_krings_create(na))
644 		goto cleanup;
645 
646 	/* create all missing netmap rings */
647 	if (netmap_mem_rings_create(na))
648 		goto cleanup;
649 
650 final:
651 
652 	/* in all cases, create a new netmap if */
653 	nifp = netmap_mem_if_new(ifname, na);
654 	if (nifp == NULL)
655 		goto cleanup;
656 
657 	return (nifp);
658 
659 cleanup:
660 
661 	if (na->active_fds == 0) {
662 		netmap_mem_rings_delete(na);
663 		na->nm_krings_delete(na);
664 	}
665 
666 	return NULL;
667 }
668 
669 
670 /* grab a reference to the memory allocator, if we don't have one already.  The
671  * reference is taken from the netmap_adapter registered with the priv.
672  */
673 /* call with NMG_LOCK held */
674 static int
675 netmap_get_memory_locked(struct netmap_priv_d* p)
676 {
677 	struct netmap_mem_d *nmd;
678 	int error = 0;
679 
680 	if (p->np_na == NULL) {
681 		if (!netmap_mmap_unreg)
682 			return ENODEV;
683 		/* for compatibility with older versions of the API
684  		 * we use the global allocator when no interface has been
685  		 * registered
686  		 */
687 		nmd = &nm_mem;
688 	} else {
689 		nmd = p->np_na->nm_mem;
690 	}
691 	if (p->np_mref == NULL) {
692 		error = netmap_mem_finalize(nmd);
693 		if (!error)
694 			p->np_mref = nmd;
695 	} else if (p->np_mref != nmd) {
696 		/* a virtual port has been registered, but previous
697  		 * syscalls already used the global allocator.
698  		 * We cannot continue
699  		 */
700 		error = ENODEV;
701 	}
702 	return error;
703 }
704 
705 
706 /* call with NMG_LOCK *not* held */
707 int
708 netmap_get_memory(struct netmap_priv_d* p)
709 {
710 	int error;
711 	NMG_LOCK();
712 	error = netmap_get_memory_locked(p);
713 	NMG_UNLOCK();
714 	return error;
715 }
716 
717 
718 /* call with NMG_LOCK held */
719 static int
720 netmap_have_memory_locked(struct netmap_priv_d* p)
721 {
722 	return p->np_mref != NULL;
723 }
724 
725 
726 /* call with NMG_LOCK held */
727 static void
728 netmap_drop_memory_locked(struct netmap_priv_d* p)
729 {
730 	if (p->np_mref) {
731 		netmap_mem_deref(p->np_mref);
732 		p->np_mref = NULL;
733 	}
734 }
735 
736 
737 /*
738  * File descriptor's private data destructor.
739  *
740  * Call nm_register(ifp,0) to stop netmap mode on the interface and
741  * revert to normal operation. We expect that np_na->ifp has not gone away.
742  * The second argument is the nifp to work on. In some cases it is
743  * not attached yet to the netmap_priv_d so we need to pass it as
744  * a separate argument.
745  */
746 /* call with NMG_LOCK held */
747 static void
748 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
749 {
750 	struct netmap_adapter *na = priv->np_na;
751 	struct ifnet *ifp = na->ifp;
752 
753 	NMG_LOCK_ASSERT();
754 	na->active_fds--;
755 	if (na->active_fds <= 0) {	/* last instance */
756 
757 		if (netmap_verbose)
758 			D("deleting last instance for %s", NM_IFPNAME(ifp));
759 		/*
760 		 * (TO CHECK) This function is only called
761 		 * when the last reference to this file descriptor goes
762 		 * away. This means we cannot have any pending poll()
763 		 * or interrupt routine operating on the structure.
764 		 * XXX The file may be closed in a thread while
765 		 * another thread is using it.
766 		 * Linux keeps the file opened until the last reference
767 		 * by any outstanding ioctl/poll or mmap is gone.
768 		 * FreeBSD does not track mmap()s (but we do) and
769 		 * wakes up any sleeping poll(). Need to check what
770 		 * happens if the close() occurs while a concurrent
771 		 * syscall is running.
772 		 */
773 		if (ifp)
774 			na->nm_register(na, 0); /* off, clear flags */
775 		/* Wake up any sleeping threads. netmap_poll will
776 		 * then return POLLERR
777 		 * XXX The wake up now must happen during *_down(), when
778 		 * we order all activities to stop. -gl
779 		 */
780 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
781 		/* knlist_destroy(&na->tx_si.si_note); */
782 		/* knlist_destroy(&na->rx_si.si_note); */
783 
784 		/* delete rings and buffers */
785 		netmap_mem_rings_delete(na);
786 		na->nm_krings_delete(na);
787 	}
788 	/* delete the nifp */
789 	netmap_mem_if_delete(na, nifp);
790 }
791 
792 /* call with NMG_LOCK held */
793 static __inline int
794 nm_tx_si_user(struct netmap_priv_d *priv)
795 {
796 	return (priv->np_na != NULL &&
797 		(priv->np_txqlast - priv->np_txqfirst > 1));
798 }
799 
800 /* call with NMG_LOCK held */
801 static __inline int
802 nm_rx_si_user(struct netmap_priv_d *priv)
803 {
804 	return (priv->np_na != NULL &&
805 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
806 }
807 
808 
809 /*
810  * Destructor of the netmap_priv_d, called when the fd has
811  * no active open() or mmap() left. Also called in error paths.
812  *
813  * returns 1 if this is the last instance and we can free priv
814  */
815 /* call with NMG_LOCK held */
816 int
817 netmap_dtor_locked(struct netmap_priv_d *priv)
818 {
819 	struct netmap_adapter *na = priv->np_na;
820 
821 #ifdef __FreeBSD__
822 	/*
823 	 * np_refcount is the number of active mmaps on
824 	 * this file descriptor
825 	 */
826 	if (--priv->np_refcount > 0) {
827 		return 0;
828 	}
829 #endif /* __FreeBSD__ */
830 	if (!na) {
831 	    return 1; //XXX is it correct?
832 	}
833 	netmap_do_unregif(priv, priv->np_nifp);
834 	priv->np_nifp = NULL;
835 	netmap_drop_memory_locked(priv);
836 	if (priv->np_na) {
837 		if (nm_tx_si_user(priv))
838 			na->tx_si_users--;
839 		if (nm_rx_si_user(priv))
840 			na->rx_si_users--;
841 		netmap_adapter_put(na);
842 		priv->np_na = NULL;
843 	}
844 	return 1;
845 }
846 
847 
848 /* call with NMG_LOCK *not* held */
849 void
850 netmap_dtor(void *data)
851 {
852 	struct netmap_priv_d *priv = data;
853 	int last_instance;
854 
855 	NMG_LOCK();
856 	last_instance = netmap_dtor_locked(priv);
857 	NMG_UNLOCK();
858 	if (last_instance) {
859 		bzero(priv, sizeof(*priv));	/* for safety */
860 		free(priv, M_DEVBUF);
861 	}
862 }
863 
864 
865 
866 
867 /*
868  * Handlers for synchronization of the queues from/to the host.
869  * Netmap has two operating modes:
870  * - in the default mode, the rings connected to the host stack are
871  *   just another ring pair managed by userspace;
872  * - in transparent mode (XXX to be defined) incoming packets
873  *   (from the host or the NIC) are marked as NS_FORWARD upon
874  *   arrival, and the user application has a chance to reset the
875  *   flag for packets that should be dropped.
876  *   On the RXSYNC or poll(), packets in RX rings between
877  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
878  *   to the other side.
879  * The transfer NIC --> host is relatively easy, just encapsulate
880  * into mbufs and we are done. The host --> NIC side is slightly
881  * harder because there might not be room in the tx ring so it
882  * might take a while before releasing the buffer.
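 *
 * A sketch of how a userspace application can drive the NS_FORWARD
 * machinery, consistent with the checks in netmap_grab_packets() and
 * netmap_sw_to_nic() below (want_to_forward() is a hypothetical
 * predicate; NR_FORWARD and NS_FORWARD come from net/netmap.h):
 *
 *	uint32_t i;
 *
 *	ring->flags |= NR_FORWARD;
 *	for (i = ring->head; i != ring->tail;
 *	    i = (i + 1 == ring->num_slots) ? 0 : i + 1) {
 *		if (want_to_forward(&ring->slot[i]))
 *			ring->slot[i].flags |= NS_FORWARD;
 *	}
 *	ring->head = ring->cur = ring->tail;	// release the slots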
883  */
884 
885 
886 /*
887  * pass a chain of buffers to the host stack as coming from 'dst'.
888  * We do not need to lock because the queue is private.
889  */
890 static void
891 netmap_send_up(struct ifnet *dst, struct mbq *q)
892 {
893 	struct mbuf *m;
894 
895 	/* send packets up, outside the lock */
896 	while ((m = mbq_dequeue(q)) != NULL) {
897 		if (netmap_verbose & NM_VERB_HOST)
898 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
899 		NM_SEND_UP(dst, m);
900 	}
901 	mbq_destroy(q);
902 }
903 
904 
905 /*
906  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
907  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
908  * and pass them up. Drop remaining packets in the unlikely event
909  * of an mbuf shortage.
910  */
911 static void
912 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
913 {
914 	u_int const lim = kring->nkr_num_slots - 1;
915 	u_int const head = kring->ring->head;
916 	u_int n;
917 	struct netmap_adapter *na = kring->na;
918 
919 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
920 		struct mbuf *m;
921 		struct netmap_slot *slot = &kring->ring->slot[n];
922 
923 		if ((slot->flags & NS_FORWARD) == 0 && !force)
924 			continue;
925 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
926 			RD(5, "bad pkt at %d len %d", n, slot->len);
927 			continue;
928 		}
929 		slot->flags &= ~NS_FORWARD; // XXX needed ?
930 		/* XXX TODO: adapt to the case of a multisegment packet */
931 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
932 
933 		if (m == NULL)
934 			break;
935 		mbq_enqueue(q, m);
936 	}
937 }
938 
939 
940 /*
941  * Send to the NIC rings packets marked NS_FORWARD between
942  * kring->nr_hwcur and kring->rhead.
943  * Called under kring->rx_queue.lock on the sw rx ring.
944  */
945 static u_int
946 netmap_sw_to_nic(struct netmap_adapter *na)
947 {
948 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
949 	struct netmap_slot *rxslot = kring->ring->slot;
950 	u_int i, rxcur = kring->nr_hwcur;
951 	u_int const head = kring->rhead;
952 	u_int const src_lim = kring->nkr_num_slots - 1;
953 	u_int sent = 0;
954 
955 	/* scan rings to find space, then fill as much as possible */
956 	for (i = 0; i < na->num_tx_rings; i++) {
957 		struct netmap_kring *kdst = &na->tx_rings[i];
958 		struct netmap_ring *rdst = kdst->ring;
959 		u_int const dst_lim = kdst->nkr_num_slots - 1;
960 
961 		/* XXX do we trust ring or kring->rcur,rtail ? */
962 		for (; rxcur != head && !nm_ring_empty(rdst);
963 		     rxcur = nm_next(rxcur, src_lim) ) {
964 			struct netmap_slot *src, *dst, tmp;
965 			u_int dst_cur = rdst->cur;
966 
967 			src = &rxslot[rxcur];
968 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
969 				continue;
970 
971 			sent++;
972 
973 			dst = &rdst->slot[dst_cur];
974 
975 			tmp = *src;
976 
977 			src->buf_idx = dst->buf_idx;
978 			src->flags = NS_BUF_CHANGED;
979 
980 			dst->buf_idx = tmp.buf_idx;
981 			dst->len = tmp.len;
982 			dst->flags = NS_BUF_CHANGED;
983 
984 			rdst->head = rdst->cur = nm_next(dst_cur, dst_lim);
985 		}
986 		/* if (sent) XXX txsync ? */
987 	}
988 	return sent;
989 }
990 
991 
992 /*
993  * netmap_txsync_to_host() passes packets up. We are called from a
994  * system call in user process context, and the only contention
995  * can be among multiple user threads erroneously calling
996  * this routine concurrently.
997  */
998 void
999 netmap_txsync_to_host(struct netmap_adapter *na)
1000 {
1001 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1002 	struct netmap_ring *ring = kring->ring;
1003 	u_int const lim = kring->nkr_num_slots - 1;
1004 	u_int const head = kring->rhead;
1005 	struct mbq q;
1006 
1007 	/* Take packets from hwcur to head and pass them up.
1008 	 * (we force ring->cur = head, since netmap_grab_packets() stops at head).
1009 	 * In case of no buffers we give up. At the end of the loop,
1010 	 * the queue is drained in all cases.
1011 	 */
1012 	mbq_init(&q);
1013 	ring->cur = head;
1014 	netmap_grab_packets(kring, &q, 1 /* force */);
1015 	ND("have %d pkts in queue", mbq_len(&q));
1016 	kring->nr_hwcur = head;
1017 	kring->nr_hwtail = head + lim;
1018 	if (kring->nr_hwtail > lim)
1019 		kring->nr_hwtail -= lim + 1;
1020 	nm_txsync_finalize(kring);
1021 
1022 	netmap_send_up(na->ifp, &q);
1023 }
1024 
1025 
1026 /*
1027  * rxsync backend for packets coming from the host stack.
1028  * They have been put in kring->rx_queue by netmap_transmit().
1029  * We protect access to the kring using kring->rx_queue.lock.
1030  *
1031  * returns the number of packets delivered to tx queues in
1032  * transparent mode, or a negative value in case of error.
1033  */
1034 int
1035 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1036 {
1037 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1038 	struct netmap_ring *ring = kring->ring;
1039 	u_int nm_i, n;
1040 	u_int const lim = kring->nkr_num_slots - 1;
1041 	u_int const head = kring->rhead;
1042 	int ret = 0;
1043 	struct mbq *q = &kring->rx_queue;
1044 
1045 	(void)pwait;	/* disable unused warnings */
1046 	(void)td;
1047 
1048 	mbq_lock(q);
1049 
1050 	/* First part: import newly received packets */
1051 	n = mbq_len(q);
1052 	if (n) { /* grab packets from the queue */
1053 		struct mbuf *m;
1054 		uint32_t stop_i;
1055 
1056 		nm_i = kring->nr_hwtail;
1057 		stop_i = nm_prev(nm_i, lim);
1058 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1059 			int len = MBUF_LEN(m);
1060 			struct netmap_slot *slot = &ring->slot[nm_i];
1061 
1062 			m_copydata(m, 0, len, BDG_NMB(na, slot));
1063 			ND("nm %d len %d", nm_i, len);
1064 			if (netmap_verbose)
1065                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
1066 
1067 			slot->len = len;
1068 			slot->flags = kring->nkr_slot_flags;
1069 			nm_i = nm_next(nm_i, lim);
1070 		}
1071 		kring->nr_hwtail = nm_i;
1072 	}
1073 
1074 	/*
1075 	 * Second part: skip past packets that userspace has released.
1076 	 */
1077 	nm_i = kring->nr_hwcur;
1078 	if (nm_i != head) { /* something was released */
1079 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1080 			ret = netmap_sw_to_nic(na);
1081 		kring->nr_hwcur = head;
1082 	}
1083 
1084 	nm_rxsync_finalize(kring);
1085 
1086 	mbq_unlock(q);
1087 	return ret;
1088 }
1089 
1090 
1091 /* Get a netmap adapter for the port.
1092  *
1093  * If it is possible to satisfy the request, return 0
1094  * with *na containing the netmap adapter found.
1095  * Otherwise return an error code, with *na containing NULL.
1096  *
1097  * When the port is attached to a bridge, we always return
1098  * EBUSY.
1099  * Otherwise, if the port is already bound to a file descriptor,
1100  * then we unconditionally return the existing adapter into *na.
1101  * In all the other cases, we return (into *na) either native,
1102  * generic or NULL, according to the following table:
1103  *
1104  *					native_support
1105  * active_fds   dev.netmap.admode         YES     NO
1106  * -------------------------------------------------------
1107  *    >0              *                 NA(ifp) NA(ifp)
1108  *
1109  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1110  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1111  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1112  *
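 * For example, an administrator can force the emulated (generic)
 * adapters with
 *	sysctl dev.netmap.admode=2	# NETMAP_ADMODE_GENERIC
 * using the "admode" sysctl defined near the top of this file.
 *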
1113  */
1114 
1115 int
1116 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1117 {
1118 	/* generic support */
1119 	int i = netmap_admode;	/* Take a snapshot. */
1120 	int error = 0;
1121 	struct netmap_adapter *prev_na;
1122 	struct netmap_generic_adapter *gna;
1123 
1124 	*na = NULL; /* default */
1125 
1126 	/* reset in case of invalid value */
1127 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1128 		i = netmap_admode = NETMAP_ADMODE_BEST;
1129 
1130 	if (NETMAP_CAPABLE(ifp)) {
1131 		/* If an adapter already exists, but is
1132 		 * attached to a vale port, we report that the
1133 		 * port is busy.
1134 		 */
1135 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1136 			return EBUSY;
1137 
1138 		/* If an adapter already exists, return it if
1139 		 * there are active file descriptors or if
1140 		 * netmap is not forced to use generic
1141 		 * adapters.
1142 		 */
1143 		if (NA(ifp)->active_fds > 0 ||
1144 				i != NETMAP_ADMODE_GENERIC) {
1145 			*na = NA(ifp);
1146 			return 0;
1147 		}
1148 	}
1149 
1150 	/* If there isn't native support and netmap is not allowed
1151 	 * to use generic adapters, we cannot satisfy the request.
1152 	 */
1153 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1154 		return EOPNOTSUPP;
1155 
1156 	/* Otherwise, create a generic adapter and return it,
1157 	 * saving the previously used netmap adapter, if any.
1158 	 *
1159 	 * Note that here 'prev_na', if not NULL, MUST be a
1160 	 * native adapter, and CANNOT be a generic one. This is
1161 	 * true because generic adapters are created on demand, and
1162 	 * destroyed when not used anymore. Therefore, if the adapter
1163 	 * currently attached to an interface 'ifp' is generic, it
1164 	 * must be that
1165 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1166 	 * Consequently, if NA(ifp) is generic, we will enter one of
1167 	 * the branches above. This ensures that we never override
1168 	 * a generic adapter with another generic adapter.
1169 	 */
1170 	prev_na = NA(ifp);
1171 	error = generic_netmap_attach(ifp);
1172 	if (error)
1173 		return error;
1174 
1175 	*na = NA(ifp);
1176 	gna = (struct netmap_generic_adapter*)NA(ifp);
1177 	gna->prev = prev_na; /* save old na */
1178 	if (prev_na != NULL) {
1179 		ifunit_ref(ifp->if_xname);
1180 		// XXX add a refcount ?
1181 		netmap_adapter_get(prev_na);
1182 	}
1183 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1184 
1185 	return 0;
1186 }
1187 
1188 
1189 /*
1190  * MUST BE CALLED UNDER NMG_LOCK()
1191  *
1192  * Get a refcounted reference to a netmap adapter attached
1193  * to the interface specified by nmr.
1194  * This is always called in the execution of an ioctl().
1195  *
1196  * Return ENXIO if the interface specified by the request does
1197  * not exist, ENOTSUP if netmap is not supported by the interface,
1198  * EBUSY if the interface is already attached to a bridge,
1199  * EINVAL if parameters are invalid, ENOMEM if needed resources
1200  * could not be allocated.
1201  * If successful, hold a reference to the netmap adapter.
1202  *
1203  * No reference is kept on the real interface, which may then
1204  * disappear at any time.
1205  */
1206 int
1207 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1208 {
1209 	struct ifnet *ifp = NULL;
1210 	int error = 0;
1211 	struct netmap_adapter *ret = NULL;
1212 
1213 	*na = NULL;     /* default return value */
1214 
1215 	/* first try to see if this is a pipe or a bridge port. */
1216 	NMG_LOCK_ASSERT();
1217 
1218 	error = netmap_get_pipe_na(nmr, na, create);
1219 	if (error || *na != NULL)
1220 		return error;
1221 
1222 	error = netmap_get_bdg_na(nmr, na, create);
1223 	if (error)
1224 		return error;
1225 
1226 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1227 		goto pipes;
1228 
1229 	/*
1230 	 * This must be a hardware na; look up the name in the system.
1231 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1232 	 * This may still be a tap, a veth/epair, or even a
1233 	 * persistent VALE port.
1234 	 */
1235 	ifp = ifunit_ref(nmr->nr_name);
1236 	if (ifp == NULL) {
1237 	        return ENXIO;
1238 	}
1239 
1240 	error = netmap_get_hw_na(ifp, &ret);
1241 	if (error)
1242 		goto out;
1243 
1244 	/* Users cannot use the NIC attached to a bridge directly */
1245 	if (NETMAP_OWNED_BY_KERN(ret)) {
1246 		error = EBUSY;
1247 		goto out;
1248 	}
1249 	*na = ret;
1250 	netmap_adapter_get(ret);
1251 
1252 pipes:
1253 	/*
1254 	 * If we are opening a pipe whose parent was not in netmap mode,
1255 	 * we have to allocate the pipe array now.
1256 	 * XXX get rid of this clumsiness (2014-03-15)
1257 	 */
1258 	error = netmap_pipe_alloc(*na, nmr);
1259 
1260 out:
1261 	if (error && ret != NULL)
1262 		netmap_adapter_put(ret);
1263 
1264 	if (ifp)
1265 		if_rele(ifp); /* allow live unloading of drivers modules */
1266 
1267 	return error;
1268 }
1269 
1270 
1271 /*
1272  * validate parameters on entry for *_txsync()
1273  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1274  * in case of error.
1275  *
1276  * rhead, rcur and rtail=hwtail are stored from previous round.
1277  * hwcur is the next packet to send to the ring.
1278  *
1279  * We want
1280  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1281  *
1282  * hwcur, rhead, rtail and hwtail are reliable
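 *
 * A worked example with nkr_num_slots = 8: if rtail = hwtail = 2 and
 * rhead = 6 (the wrapped case, rtail < rhead), head = 7 with cur = 1
 * passes the checks below (head is outside rtail..rhead and cur is
 * outside rtail..head), while head = 4 is rejected because it falls
 * inside rtail..rhead.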
1283  */
1284 u_int
1285 nm_txsync_prologue(struct netmap_kring *kring)
1286 {
1287 	struct netmap_ring *ring = kring->ring;
1288 	u_int head = ring->head; /* read only once */
1289 	u_int cur = ring->cur; /* read only once */
1290 	u_int n = kring->nkr_num_slots;
1291 
1292 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1293 		kring->name,
1294 		kring->nr_hwcur, kring->nr_hwtail,
1295 		ring->head, ring->cur, ring->tail);
1296 #if 1 /* kernel sanity checks; but we can trust the kring. */
1297 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1298 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1299 		goto error;
1300 #endif /* kernel sanity checks */
1301 	/*
1302 	 * user sanity checks. We only use 'cur',
1303 	 * A, B, ... are possible positions for cur:
1304 	 *
1305 	 *  0    A  cur   B  tail  C  n-1
1306 	 *  0    D  tail  E  cur   F  n-1
1307 	 *
1308 	 * B, F, D are valid. A, C, E are wrong
1309 	 */
1310 	if (kring->rtail >= kring->rhead) {
1311 		/* want rhead <= head <= rtail */
1312 		if (head < kring->rhead || head > kring->rtail)
1313 			goto error;
1314 		/* and also head <= cur <= rtail */
1315 		if (cur < head || cur > kring->rtail)
1316 			goto error;
1317 	} else { /* here rtail < rhead */
1318 		/* we need head outside rtail .. rhead */
1319 		if (head > kring->rtail && head < kring->rhead)
1320 			goto error;
1321 
1322 		/* two cases now: head <= rtail or head >= rhead  */
1323 		if (head <= kring->rtail) {
1324 			/* want head <= cur <= rtail */
1325 			if (cur < head || cur > kring->rtail)
1326 				goto error;
1327 		} else { /* head >= rhead */
1328 			/* cur must be outside rtail..head */
1329 			if (cur > kring->rtail && cur < head)
1330 				goto error;
1331 		}
1332 	}
1333 	if (ring->tail != kring->rtail) {
1334 		RD(5, "tail overwritten was %d need %d",
1335 			ring->tail, kring->rtail);
1336 		ring->tail = kring->rtail;
1337 	}
1338 	kring->rhead = head;
1339 	kring->rcur = cur;
1340 	return head;
1341 
1342 error:
1343 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1344 		kring->name,
1345 		kring->nr_hwcur,
1346 		kring->rcur, kring->nr_hwtail,
1347 		cur, ring->tail);
1348 	return n;
1349 }
1350 
1351 
1352 /*
1353  * validate parameters on entry for *_rxsync()
1354  * Returns ring->head if ok, kring->nkr_num_slots on error.
1355  *
1356  * For a valid configuration,
1357  * hwcur <= head <= cur <= tail <= hwtail
1358  *
1359  * We only consider head and cur.
1360  * hwcur and hwtail are reliable.
1361  *
1362  */
1363 u_int
1364 nm_rxsync_prologue(struct netmap_kring *kring)
1365 {
1366 	struct netmap_ring *ring = kring->ring;
1367 	uint32_t const n = kring->nkr_num_slots;
1368 	uint32_t head, cur;
1369 
1370 	ND("%s kc %d kt %d h %d c %d t %d",
1371 		kring->name,
1372 		kring->nr_hwcur, kring->nr_hwtail,
1373 		ring->head, ring->cur, ring->tail);
1374 	/*
1375 	 * Before storing the new values, we should check they do not
1376 	 * move backwards. However:
1377 	 * - head is not an issue because the previous value is hwcur;
1378 	 * - cur could in principle go back, however it does not matter
1379 	 *   because we are processing a brand new rxsync()
1380 	 */
1381 	cur = kring->rcur = ring->cur;	/* read only once */
1382 	head = kring->rhead = ring->head;	/* read only once */
1383 #if 1 /* kernel sanity checks */
1384 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1385 		goto error;
1386 #endif /* kernel sanity checks */
1387 	/* user sanity checks */
1388 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1389 		/* want hwcur <= rhead <= hwtail */
1390 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1391 			goto error;
1392 		/* and also rhead <= rcur <= hwtail */
1393 		if (cur < head || cur > kring->nr_hwtail)
1394 			goto error;
1395 	} else {
1396 		/* we need rhead outside hwtail..hwcur */
1397 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1398 			goto error;
1399 		/* two cases now: head <= hwtail or head >= hwcur  */
1400 		if (head <= kring->nr_hwtail) {
1401 			/* want head <= cur <= hwtail */
1402 			if (cur < head || cur > kring->nr_hwtail)
1403 				goto error;
1404 		} else {
1405 			/* cur must be outside hwtail..head */
1406 			if (cur < head && cur > kring->nr_hwtail)
1407 				goto error;
1408 		}
1409 	}
1410 	if (ring->tail != kring->rtail) {
1411 		RD(5, "%s tail overwritten was %d need %d",
1412 			kring->name,
1413 			ring->tail, kring->rtail);
1414 		ring->tail = kring->rtail;
1415 	}
1416 	return head;
1417 
1418 error:
1419 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1420 		kring->nr_hwcur,
1421 		kring->rcur, kring->nr_hwtail,
1422 		kring->rhead, kring->rcur, ring->tail);
1423 	return n;
1424 }
1425 
1426 
1427 /*
1428  * Error routine called when txsync/rxsync detects an error.
1429  * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
1430  * Return 1 on reinit.
1431  *
1432  * This routine is only called by the upper half of the kernel.
1433  * It only reads hwcur (which is changed only by the upper half, too)
1434  * and hwtail (which may be changed by the lower half, but only on
1435  * a tx ring and only to increase it, so any error will be recovered
1436  * on the next call). For the above, we don't strictly need to call
1437  * it under lock.
1438  */
1439 int
1440 netmap_ring_reinit(struct netmap_kring *kring)
1441 {
1442 	struct netmap_ring *ring = kring->ring;
1443 	u_int i, lim = kring->nkr_num_slots - 1;
1444 	int errors = 0;
1445 
1446 	// XXX KASSERT nm_kr_tryget
1447 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1448 	// XXX probably wrong to trust userspace
1449 	kring->rhead = ring->head;
1450 	kring->rcur  = ring->cur;
1451 	kring->rtail = ring->tail;
1452 
1453 	if (ring->cur > lim)
1454 		errors++;
1455 	if (ring->head > lim)
1456 		errors++;
1457 	if (ring->tail > lim)
1458 		errors++;
1459 	for (i = 0; i <= lim; i++) {
1460 		u_int idx = ring->slot[i].buf_idx;
1461 		u_int len = ring->slot[i].len;
1462 		if (idx < 2 || idx >= netmap_total_buffers) {
1463 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1464 			ring->slot[i].buf_idx = 0;
1465 			ring->slot[i].len = 0;
1466 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1467 			ring->slot[i].len = 0;
1468 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1469 		}
1470 	}
1471 	if (errors) {
1472 		RD(10, "total %d errors", errors);
1473 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1474 			kring->name,
1475 			ring->cur, kring->nr_hwcur,
1476 			ring->tail, kring->nr_hwtail);
1477 		ring->head = kring->rhead = kring->nr_hwcur;
1478 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1479 		ring->tail = kring->rtail = kring->nr_hwtail;
1480 	}
1481 	return (errors ? 1 : 0);
1482 }
1483 
1484 
1485 /*
1486  * Set the ring ID. For devices with a single queue, a request
1487  * for all rings is the same as a single ring.
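 *
 * From userspace the choice is expressed in the nmreq passed to
 * NIOCREGIF; a sketch (constants from net/netmap.h):
 *
 *	nmr.nr_flags = NR_REG_ALL_NIC;		// all hw ring pairs
 * or
 *	nmr.nr_flags = NR_REG_ONE_NIC;		// a single ring pair
 *	nmr.nr_ringid = 2 | NETMAP_NO_TX_POLL;	// ring 2, no tx on poll()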
1488  */
1489 static int
1490 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1491 {
1492 	struct netmap_adapter *na = priv->np_na;
1493 	u_int j, i = ringid & NETMAP_RING_MASK;
1494 	u_int reg = flags & NR_REG_MASK;
1495 
1496 	if (reg == NR_REG_DEFAULT) {
1497 		/* convert from old ringid to flags */
1498 		if (ringid & NETMAP_SW_RING) {
1499 			reg = NR_REG_SW;
1500 		} else if (ringid & NETMAP_HW_RING) {
1501 			reg = NR_REG_ONE_NIC;
1502 		} else {
1503 			reg = NR_REG_ALL_NIC;
1504 		}
1505 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1506 	}
1507 	switch (reg) {
1508 	case NR_REG_ALL_NIC:
1509 	case NR_REG_PIPE_MASTER:
1510 	case NR_REG_PIPE_SLAVE:
1511 		priv->np_txqfirst = 0;
1512 		priv->np_txqlast = na->num_tx_rings;
1513 		priv->np_rxqfirst = 0;
1514 		priv->np_rxqlast = na->num_rx_rings;
1515 		ND("%s %d %d", "ALL/PIPE",
1516 			priv->np_rxqfirst, priv->np_rxqlast);
1517 		break;
1518 	case NR_REG_SW:
1519 	case NR_REG_NIC_SW:
1520 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1521 			D("host rings not supported");
1522 			return EINVAL;
1523 		}
1524 		priv->np_txqfirst = (reg == NR_REG_SW ?
1525 			na->num_tx_rings : 0);
1526 		priv->np_txqlast = na->num_tx_rings + 1;
1527 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1528 			na->num_rx_rings : 0);
1529 		priv->np_rxqlast = na->num_rx_rings + 1;
1530 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1531 			priv->np_rxqfirst, priv->np_rxqlast);
1532 		break;
1533 	case NR_REG_ONE_NIC:
1534 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1535 			D("invalid ring id %d", i);
1536 			return EINVAL;
1537 		}
1538 		/* if not enough rings, use the first one */
1539 		j = i;
1540 		if (j >= na->num_tx_rings)
1541 			j = 0;
1542 		priv->np_txqfirst = j;
1543 		priv->np_txqlast = j + 1;
1544 		j = i;
1545 		if (j >= na->num_rx_rings)
1546 			j = 0;
1547 		priv->np_rxqfirst = j;
1548 		priv->np_rxqlast = j + 1;
1549 		break;
1550 	default:
1551 		D("invalid regif type %d", reg);
1552 		return EINVAL;
1553 	}
1554 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1555 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1556 	if (nm_tx_si_user(priv))
1557 		na->tx_si_users++;
1558 	if (nm_rx_si_user(priv))
1559 		na->rx_si_users++;
1560 	if (netmap_verbose) {
1561 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1562 			NM_IFPNAME(na->ifp),
1563 			priv->np_txqfirst,
1564 			priv->np_txqlast,
1565 			priv->np_rxqfirst,
1566 			priv->np_rxqlast,
1567 			i);
1568 	}
1569 	return 0;
1570 }
1571 
1572 /*
1573  * possibly move the interface to netmap mode.
1574  * On success it returns a pointer to the netmap_if, otherwise NULL.
1575  * This must be called with NMG_LOCK held.
1576  */
1577 struct netmap_if *
1578 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1579 	uint16_t ringid, uint32_t flags, int *err)
1580 {
1581 	struct ifnet *ifp = na->ifp;
1582 	struct netmap_if *nifp = NULL;
1583 	int error, need_mem = 0;
1584 
1585 	NMG_LOCK_ASSERT();
1586 	/* ring configuration may have changed, fetch from the card */
1587 	netmap_update_config(na);
1588 	priv->np_na = na;     /* store the reference */
1589 	error = netmap_set_ringid(priv, ringid, flags);
1590 	if (error)
1591 		goto out;
1592 	/* ensure allocators are ready */
1593 	need_mem = !netmap_have_memory_locked(priv);
1594 	if (need_mem) {
1595 		error = netmap_get_memory_locked(priv);
1596 		ND("get_memory returned %d", error);
1597 		if (error)
1598 			goto out;
1599 	}
1600 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1601 
1602 	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1603 	if (nifp == NULL) { /* allocation failed */
1604 		error = ENOMEM;
1605 		goto out;
1606 	}
1607 	na->active_fds++;
1608 	if (ifp->if_capenable & IFCAP_NETMAP) {
1609 		/* was already set */
1610 	} else {
1611 		/* Otherwise set the card in netmap mode
1612 		 * and make it use the shared buffers.
1613 		 */
1614 		/* cache the allocator info in the na */
1615 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1616 		ND("%p->na_lut == %p", na, na->na_lut);
1617 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1618 		error = na->nm_register(na, 1); /* mode on */
1619 		if (error) {
1620 			netmap_do_unregif(priv, nifp);
1621 			nifp = NULL;
1622 		}
1623 	}
1624 out:
1625 	*err = error;
1626 	if (error) {
1627 		priv->np_na = NULL;
1628 		/* we should drop the allocator, but only
1629 		 * if we were the ones who grabbed it
1630 		 */
1631 		if (need_mem)
1632 			netmap_drop_memory_locked(priv);
1633 	}
1634 	if (nifp != NULL) {
1635 		/*
1636 		 * advertise that the interface is ready by setting np_nifp.
1637 		 * The barrier is needed because readers (poll and *SYNC)
1638 		 * check for priv->np_nifp != NULL without locking
1639 		 */
1640 		wmb(); /* make sure previous writes are visible to all CPUs */
1641 		priv->np_nifp = nifp;
1642 	}
1643 	return nifp;
1644 }
1645 
1646 
1647 
1648 /*
1649  * ioctl(2) support for the "netmap" device.
1650  *
1651  * The following commands are accepted:
1652  * - NIOCGINFO
1653  * - SIOCGIFADDR	just for convenience
1654  * - NIOCREGIF
1655  * - NIOCTXSYNC
1656  * - NIOCRXSYNC
1657  *
1658  * Return 0 on success, errno otherwise.
1659  */
1660 int
1661 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1662 	int fflag, struct thread *td)
1663 {
1664 	struct netmap_priv_d *priv = NULL;
1665 	struct ifnet *ifp = NULL;
1666 	struct nmreq *nmr = (struct nmreq *) data;
1667 	struct netmap_adapter *na = NULL;
1668 	int error;
1669 	u_int i, qfirst, qlast;
1670 	struct netmap_if *nifp;
1671 	struct netmap_kring *krings;
1672 
1673 	(void)dev;	/* UNUSED */
1674 	(void)fflag;	/* UNUSED */
1675 
1676 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1677 		/* truncate name */
1678 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1679 		if (nmr->nr_version != NETMAP_API) {
1680 			D("API mismatch for %s got %d need %d",
1681 				nmr->nr_name,
1682 				nmr->nr_version, NETMAP_API);
1683 			nmr->nr_version = NETMAP_API;
1684 		}
1685 		if (nmr->nr_version < NETMAP_MIN_API ||
1686 		    nmr->nr_version > NETMAP_MAX_API) {
1687 			return EINVAL;
1688 		}
1689 	}
1690 	CURVNET_SET(TD_TO_VNET(td));
1691 
1692 	error = devfs_get_cdevpriv((void **)&priv);
1693 	if (error) {
1694 		CURVNET_RESTORE();
1695 		/* XXX ENOENT should be impossible, since the priv
1696 		 * is now created in the open */
1697 		return (error == ENOENT ? ENXIO : error);
1698 	}
1699 
1700 	switch (cmd) {
1701 	case NIOCGINFO:		/* return capabilities etc */
1702 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1703 			error = netmap_bdg_ctl(nmr, NULL);
1704 			break;
1705 		}
1706 
1707 		NMG_LOCK();
1708 		do {
1709 			/* memsize is always valid */
1710 			struct netmap_mem_d *nmd = &nm_mem;
1711 			u_int memflags;
1712 
1713 			if (nmr->nr_name[0] != '\0') {
1714 				/* get a refcount */
1715 				error = netmap_get_na(nmr, &na, 1 /* create */);
1716 				if (error)
1717 					break;
1718 				nmd = na->nm_mem; /* get memory allocator */
1719 			}
1720 
1721 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
1722 				&nmr->nr_arg2);
1723 			if (error)
1724 				break;
1725 			if (na == NULL) /* only memory info */
1726 				break;
1727 			nmr->nr_offset = 0;
1728 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1729 			netmap_update_config(na);
1730 			nmr->nr_rx_rings = na->num_rx_rings;
1731 			nmr->nr_tx_rings = na->num_tx_rings;
1732 			nmr->nr_rx_slots = na->num_rx_desc;
1733 			nmr->nr_tx_slots = na->num_tx_desc;
1734 			netmap_adapter_put(na);
1735 		} while (0);
1736 		NMG_UNLOCK();
1737 		break;
1738 
1739 	case NIOCREGIF:
1740 		/* possibly attach/detach NIC and VALE switch */
1741 		i = nmr->nr_cmd;
1742 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1743 				|| i == NETMAP_BDG_VNET_HDR) {
1744 			error = netmap_bdg_ctl(nmr, NULL);
1745 			break;
1746 		} else if (i != 0) {
1747 			D("nr_cmd must be 0 not %d", i);
1748 			error = EINVAL;
1749 			break;
1750 		}
1751 
1752 		/* protect access to priv from concurrent NIOCREGIF */
1753 		NMG_LOCK();
1754 		do {
1755 			u_int memflags;
1756 
1757 			if (priv->np_na != NULL) {	/* thread already registered */
1758 				error = EBUSY;
1759 				break;
1760 			}
1761 			/* find the interface and a reference */
1762 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1763 			if (error)
1764 				break;
1765 			ifp = na->ifp;
1766 			if (NETMAP_OWNED_BY_KERN(na)) {
1767 				netmap_adapter_put(na);
1768 				error = EBUSY;
1769 				break;
1770 			}
1771 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
1772 			if (!nifp) {    /* reg. failed, release priv and ref */
1773 				netmap_adapter_put(na);
1774 				priv->np_nifp = NULL;
1775 				break;
1776 			}
1777 			priv->np_td = td; // XXX kqueue, debugging only
1778 
1779 			/* return the offset of the netmap_if object */
1780 			nmr->nr_rx_rings = na->num_rx_rings;
1781 			nmr->nr_tx_rings = na->num_tx_rings;
1782 			nmr->nr_rx_slots = na->num_rx_desc;
1783 			nmr->nr_tx_slots = na->num_tx_desc;
1784 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
1785 				&nmr->nr_arg2);
1786 			if (error) {
1787 				netmap_adapter_put(na);
1788 				break;
1789 			}
1790 			if (memflags & NETMAP_MEM_PRIVATE) {
1791 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1792 			}
1793 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
1794 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
1795 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
1796 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
1797 
1798 			if (nmr->nr_arg3) {
1799 				D("requested %d extra buffers", nmr->nr_arg3);
1800 				nmr->nr_arg3 = netmap_extra_alloc(na,
1801 					&nifp->ni_bufs_head, nmr->nr_arg3);
1802 				D("got %d extra buffers", nmr->nr_arg3);
1803 			}
1804 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1805 		} while (0);
1806 		NMG_UNLOCK();
1807 		break;
1808 
1809 	case NIOCTXSYNC:
1810 	case NIOCRXSYNC:
1811 		nifp = priv->np_nifp;
1812 
1813 		if (nifp == NULL) {
1814 			error = ENXIO;
1815 			break;
1816 		}
1817 		rmb(); /* make sure the following reads happen after the np_nifp check (pairs with the wmb() in netmap_do_regif()) */
1818 
1819 		na = priv->np_na;      /* we have a reference */
1820 
1821 		if (na == NULL) {
1822 			D("Internal error: nifp != NULL && na == NULL");
1823 			error = ENXIO;
1824 			break;
1825 		}
1826 
1827 		ifp = na->ifp;
1828 		if (ifp == NULL) {
1829 			RD(1, "the ifp is gone");
1830 			error = ENXIO;
1831 			break;
1832 		}
1833 
1834 		if (cmd == NIOCTXSYNC) {
1835 			krings = na->tx_rings;
1836 			qfirst = priv->np_txqfirst;
1837 			qlast = priv->np_txqlast;
1838 		} else {
1839 			krings = na->rx_rings;
1840 			qfirst = priv->np_rxqfirst;
1841 			qlast = priv->np_rxqlast;
1842 		}
1843 
1844 		for (i = qfirst; i < qlast; i++) {
1845 			struct netmap_kring *kring = krings + i;
1846 			if (nm_kr_tryget(kring)) {
1847 				error = EBUSY;
1848 				goto out;
1849 			}
1850 			if (cmd == NIOCTXSYNC) {
1851 				if (netmap_verbose & NM_VERB_TXSYNC)
1852 					D("pre txsync ring %d cur %d hwcur %d",
1853 					    i, kring->ring->cur,
1854 					    kring->nr_hwcur);
1855 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1856 					netmap_ring_reinit(kring);
1857 				} else {
1858 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
1859 				}
1860 				if (netmap_verbose & NM_VERB_TXSYNC)
1861 					D("post txsync ring %d cur %d hwcur %d",
1862 					    i, kring->ring->cur,
1863 					    kring->nr_hwcur);
1864 			} else {
1865 				kring->nm_sync(kring, NAF_FORCE_READ);
1866 				microtime(&na->rx_rings[i].ring->ts);
1867 			}
1868 			nm_kr_put(kring);
1869 		}
1870 
1871 		break;
1872 
1873 #ifdef __FreeBSD__
1874 	case FIONBIO:
1875 	case FIOASYNC:
1876 		ND("FIONBIO/FIOASYNC are no-ops");
1877 		break;
1878 
1879 	case BIOCIMMEDIATE:
1880 	case BIOCGHDRCMPLT:
1881 	case BIOCSHDRCMPLT:
1882 	case BIOCSSEESENT:
1883 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1884 		break;
1885 
1886 	default:	/* allow device-specific ioctls */
1887 	    {
1888 		struct socket so;
1889 
1890 		bzero(&so, sizeof(so));
1891 		NMG_LOCK();
1892 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1893 		if (error) {
1894 			netmap_adapter_put(na);
1895 			NMG_UNLOCK();
1896 			break;
1897 		}
1898 		ifp = na->ifp;
1899 		so.so_vnet = ifp->if_vnet;
1900 		// so->so_proto not null.
1901 		error = ifioctl(&so, cmd, data, td);
1902 		netmap_adapter_put(na);
1903 		NMG_UNLOCK();
1904 		break;
1905 	    }
1906 
1907 #else /* linux */
1908 	default:
1909 		error = EOPNOTSUPP;
1910 #endif /* linux */
1911 	}
1912 out:
1913 
1914 	CURVNET_RESTORE();
1915 	return (error);
1916 }
1917 
1918 
1919 /*
1920  * select(2) and poll(2) handlers for the "netmap" device.
1921  *
1922  * Can be called for one or more queues.
1923  * Return the event mask corresponding to ready events.
1924  * If there are no ready events, do a selrecord on either individual
1925  * selinfo or on the global one.
1926  * Device-dependent parts (locking and sync of tx/rx rings)
1927  * are done through callbacks.
1928  *
1929  * On linux, the arguments are really pwait, the poll table, and 'td' is a struct file *.
1930  * The first argument is remapped to pwait, since selrecord() uses that name as a
1931  * hidden argument.
1932  */
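/*
 * Userspace sketch of how this handler is typically exercised, assuming a
 * descriptor already bound with NIOCREGIF and mmap()ed as in the example
 * before netmap_ioctl():
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, 1000);
 *		if (pfd.revents & POLLIN) {
 *			// new packets: scan NETMAP_RXRING(nifp, i) between
 *			// cur and tail, then advance head/cur; the next
 *			// poll() (or NIOCRXSYNC) releases the slots.
 *		}
 *	}
 */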
1933 int
1934 netmap_poll(struct cdev *dev, int events, struct thread *td)
1935 {
1936 	struct netmap_priv_d *priv = NULL;
1937 	struct netmap_adapter *na;
1938 	struct ifnet *ifp;
1939 	struct netmap_kring *kring;
1940 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1941 	struct mbq q;		/* packets from hw queues to host stack */
1942 	void *pwait = dev;	/* linux compatibility */
1943 	int is_kevent = 0;
1944 
1945 	/*
1946 	 * In order to avoid nested locks, we need to "double check"
1947 	 * txsync and rxsync if we decide to do a selrecord().
1948 	 * retry_tx (and retry_rx, later) prevent looping forever.
1949 	 */
1950 	int retry_tx = 1, retry_rx = 1;
1951 
1952 	(void)pwait;
1953 	mbq_init(&q);
1954 
1955 	/*
1956 	 * XXX kevent has curthread->td_fpop == NULL,
1957 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
1958 	 * priv as the first argument, which also lets us skip
1959 	 * the selrecord() calls, which are not necessary in that case.
1960 	 */
1961 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
1962 		is_kevent = 1;
1963 		if (netmap_verbose)
1964 			D("called from kevent");
1965 		priv = (struct netmap_priv_d *)dev;
1966 	}
1967 	if (priv == NULL)
1968 		return POLLERR;
1969 
1970 	if (priv->np_nifp == NULL) {
1971 		D("No if registered");
1972 		return POLLERR;
1973 	}
1974 	rmb(); /* make sure the following reads happen after the np_nifp check (pairs with the wmb() in netmap_do_regif()) */
1975 
1976 	na = priv->np_na;
1977 	ifp = na->ifp;
1978 	// check for a deleted interface (na->ifp is cleared in netmap_detach())
1979 	if (ifp == NULL) {
1980 		RD(1, "the ifp is gone");
1981 		return POLLERR;
1982 	}
1983 
1984 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1985 		return POLLERR;
1986 
1987 	if (netmap_verbose & 0x8000)
1988 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1989 	want_tx = events & (POLLOUT | POLLWRNORM);
1990 	want_rx = events & (POLLIN | POLLRDNORM);
1991 
1992 
1993 	/*
1994 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1995 	 * the file descriptor is bound to all of them. If so, we sleep on
1996 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1997 	 * (FreeBSD only allows two selinfo's per file descriptor).
1998 	 * The interrupt routine in the driver wakes one or the other
1999 	 * (or both) depending on which clients are active.
2000 	 *
2001 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2002 	 * txsync() is called if we run out of buffers on POLLOUT, or
2003 	 * there are pending packets to send. The latter can be disabled
2004 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2005 	 */
2006 	check_all_tx = nm_tx_si_user(priv);
2007 	check_all_rx = nm_rx_si_user(priv);
2008 
2009 	/*
2010 	 * We start with a lock-free round, which is cheap if we have
2011 	 * slots available. If this fails, then lock and call the sync
2012 	 * routines.
2013 	 */
2014 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2015 		kring = &na->rx_rings[i];
2016 		/* XXX compare ring->cur and kring->tail */
2017 		if (!nm_ring_empty(kring->ring)) {
2018 			revents |= want_rx;
2019 			want_rx = 0;	/* also breaks the loop */
2020 		}
2021 	}
2022 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2023 		kring = &na->tx_rings[i];
2024 		/* XXX compare ring->cur and kring->tail */
2025 		if (!nm_ring_empty(kring->ring)) {
2026 			revents |= want_tx;
2027 			want_tx = 0;	/* also breaks the loop */
2028 		}
2029 	}
2030 
2031 	/*
2032 	 * If we want to push packets out (priv->np_txpoll) or
2033 	 * want_tx is still set, we must issue txsync calls
2034 	 * (on all rings, to avoid stalling the tx rings).
2035 	 * XXX should also check cur != hwcur on the tx rings.
2036 	 * Fortunately, normal tx mode has np_txpoll set.
2037 	 */
2038 	if (priv->np_txpoll || want_tx) {
2039 		/*
2040 		 * The first round checks if anyone is ready, if not
2041 		 * do a selrecord and another round to handle races.
2042 		 * want_tx goes to 0 if any space is found, and is
2043 		 * used to skip rings with no pending transmissions.
2044 		 */
2045 flush_tx:
2046 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2047 			int found = 0;
2048 
2049 			kring = &na->tx_rings[i];
2050 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2051 				continue;
2052 			/* only one thread does txsync */
2053 			if (nm_kr_tryget(kring)) {
2054 				/* either busy or stopped
2055 				 * XXX if the ring is stopped, sleeping would
2056 				 * be better. In current code, however, we only
2057 				 * stop the rings for brief intervals (2014-03-14)
2058 				 */
2059 
2060 				if (netmap_verbose)
2061 					RD(2, "%p lost race on txring %d, ok",
2062 					    priv, i);
2063 				continue;
2064 			}
2065 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2066 				netmap_ring_reinit(kring);
2067 				revents |= POLLERR;
2068 			} else {
2069 				if (kring->nm_sync(kring, 0))
2070 					revents |= POLLERR;
2071 			}
2072 
2073 			/*
2074 			 * If we found new slots, notify potential
2075 			 * listeners on the same ring.
2076 			 * Since we just did a txsync, look at the copies
2077 			 * of cur,tail in the kring.
2078 			 */
2079 			found = kring->rcur != kring->rtail;
2080 			nm_kr_put(kring);
2081 			if (found) { /* notify other listeners */
2082 				revents |= want_tx;
2083 				want_tx = 0;
2084 				na->nm_notify(na, i, NR_TX, 0);
2085 			}
2086 		}
2087 		if (want_tx && retry_tx && !is_kevent) {
2088 			selrecord(td, check_all_tx ?
2089 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2090 			retry_tx = 0;
2091 			goto flush_tx;
2092 		}
2093 	}
2094 
2095 	/*
2096 	 * If want_rx is still set, scan the receive rings.
2097 	 * Do it on all rings because otherwise we starve.
2098 	 */
2099 	if (want_rx) {
2100 		int send_down = 0; /* transparent mode */
2101 		/* two rounds here for race avoidance */
2102 do_retry_rx:
2103 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2104 			int found = 0;
2105 
2106 			kring = &na->rx_rings[i];
2107 
2108 			if (nm_kr_tryget(kring)) {
2109 				if (netmap_verbose)
2110 					RD(2, "%p lost race on rxring %d, ok",
2111 					    priv, i);
2112 				continue;
2113 			}
2114 
2115 			/*
2116 			 * transparent mode support: collect packets
2117 			 * from the rxring(s).
2118 			 */
2119 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2120 				ND(10, "forwarding some buffers up %d to %d",
2121 				    kring->nr_hwcur, kring->ring->cur);
2122 				netmap_grab_packets(kring, &q, netmap_fwd);
2123 			}
2124 
2125 			if (kring->nm_sync(kring, 0))
2126 				revents |= POLLERR;
2127 			if (netmap_no_timestamp == 0 ||
2128 					kring->ring->flags & NR_TIMESTAMP) {
2129 				microtime(&kring->ring->ts);
2130 			}
2131 			/* after an rxsync we can use kring->rcur, rtail */
2132 			found = kring->rcur != kring->rtail;
2133 			nm_kr_put(kring);
2134 			if (found) {
2135 				revents |= want_rx;
2136 				retry_rx = 0;
2137 				na->nm_notify(na, i, NR_RX, 0);
2138 			}
2139 		}
2140 
2141 		/* transparent mode XXX only during first pass ? */
2142 		if (na->na_flags & NAF_HOST_RINGS) {
2143 			kring = &na->rx_rings[na->num_rx_rings];
2144 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2145 				send_down = netmap_rxsync_from_host(na, td, dev);
2146 				if (send_down && (netmap_no_timestamp == 0 ||
2147 				    kring->ring->flags & NR_TIMESTAMP)) {
2148 					microtime(&kring->ring->ts);
2149 				}
2150 			}
2151 		}
2152 
2153 		if (retry_rx && !is_kevent)
2154 			selrecord(td, check_all_rx ?
2155 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2156 		if (send_down > 0 || retry_rx) {
2157 			retry_rx = 0;
2158 			if (send_down)
2159 				goto flush_tx; /* and retry_rx */
2160 			else
2161 				goto do_retry_rx;
2162 		}
2163 	}
2164 
2165 	/*
2166 	 * Transparent mode: marked bufs on rx rings between
2167 	 * kring->nr_hwcur and ring->head
2168 	 * are passed to the other endpoint.
2169 	 *
2170 	 * In this mode we also scan the sw rxring, which in
2171 	 * turn passes packets up.
2172 	 *
2173 	 * XXX Transparent mode at the moment requires binding all
2174 	 * rings to a single file descriptor.
2175 	 */
2176 
2177 	if (q.head)
2178 		netmap_send_up(na->ifp, &q);
2179 
2180 	return (revents);
2181 }
2182 
2183 
2184 /*-------------------- driver support routines -------------------*/
2185 
2186 static int netmap_hw_krings_create(struct netmap_adapter *);
2187 
2188 /* default notify callback */
2189 static int
2190 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2191 	enum txrx tx, int flags)
2192 {
2193 	struct netmap_kring *kring;
2194 
2195 	if (tx == NR_TX) {
2196 		kring = na->tx_rings + n_ring;
2197 		OS_selwakeup(&kring->si, PI_NET);
2198 		/* optimization: avoid a wake up on the global
2199 		 * queue if nobody has registered for more
2200 		 * than one ring
2201 		 */
2202 		if (na->tx_si_users > 0)
2203 			OS_selwakeup(&na->tx_si, PI_NET);
2204 	} else {
2205 		kring = na->rx_rings + n_ring;
2206 		OS_selwakeup(&kring->si, PI_NET);
2207 		/* optimization: same as above */
2208 		if (na->rx_si_users > 0)
2209 			OS_selwakeup(&na->rx_si, PI_NET);
2210 	}
2211 	return 0;
2212 }
2213 
2214 
2215 /* called by all routines that create netmap_adapters.
2216  * Attach na to the ifp (if any) and provide defaults
2217  * for optional callbacks. Defaults assume that we
2218  * are creating a hardware netmap_adapter.
2219  */
2220 int
2221 netmap_attach_common(struct netmap_adapter *na)
2222 {
2223 	struct ifnet *ifp = na->ifp;
2224 
2225 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2226 		D("%s: invalid rings tx %d rx %d",
2227 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2228 		return EINVAL;
2229 	}
2230 	WNA(ifp) = na;
2231 
2232 	/* the following is only needed for adapters that use the host port.
2233 	 * XXX do we have something similar for linux ?
2234 	 */
2235 #ifdef __FreeBSD__
2236 	na->if_input = ifp->if_input; /* for netmap_send_up */
2237 #endif /* __FreeBSD__ */
2238 
2239 	NETMAP_SET_CAPABLE(ifp);
2240 	if (na->nm_krings_create == NULL) {
2241 		/* we assume that we have been called by a driver,
2242 		 * since other port types all provide their own
2243 		 * nm_krings_create
2244 		 */
2245 		na->nm_krings_create = netmap_hw_krings_create;
2246 		na->nm_krings_delete = netmap_hw_krings_delete;
2247 	}
2248 	if (na->nm_notify == NULL)
2249 		na->nm_notify = netmap_notify;
2250 	na->active_fds = 0;
2251 
2252 	if (na->nm_mem == NULL)
2253 		na->nm_mem = &nm_mem;
2254 	return 0;
2255 }
2256 
2257 
2258 /* standard cleanup, called by all destructors */
2259 void
2260 netmap_detach_common(struct netmap_adapter *na)
2261 {
2262 	if (na->ifp != NULL)
2263 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2264 
2265 	if (na->tx_rings) { /* XXX should not happen */
2266 		D("freeing leftover tx_rings");
2267 		na->nm_krings_delete(na);
2268 	}
2269 	netmap_pipe_dealloc(na);
2270 	if (na->na_flags & NAF_MEM_OWNER)
2271 		netmap_mem_private_delete(na->nm_mem);
2272 	bzero(na, sizeof(*na));
2273 	free(na, M_DEVBUF);
2274 }
2275 
2276 
2277 /*
2278  * Initialize a ``netmap_adapter`` object created by a driver on attach.
2279  * We allocate a block of memory with room for a struct netmap_adapter
2280  * plus two sets of N+2 struct netmap_kring (where N is the number
2281  * of hardware rings):
2282  * krings	0..N-1	are for the hardware queues.
2283  * kring	N	is for the host stack queue
2284  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2285  * Return 0 on success, ENOMEM otherwise.
2286  */
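/*
 * Sketch of a typical call site in a driver attach routine. The foo_*
 * callbacks and the softc layout are hypothetical; only the
 * netmap_adapter fields are part of the API:
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na);
 */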
2287 int
2288 netmap_attach(struct netmap_adapter *arg)
2289 {
2290 	struct netmap_hw_adapter *hwna = NULL;
2291 	// XXX when is arg == NULL ?
2292 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2293 
2294 	if (arg == NULL || ifp == NULL)
2295 		goto fail;
2296 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2297 	if (hwna == NULL)
2298 		goto fail;
2299 	hwna->up = *arg;
2300 	hwna->up.na_flags |= NAF_HOST_RINGS;
2301 	if (netmap_attach_common(&hwna->up)) {
2302 		free(hwna, M_DEVBUF);
2303 		goto fail;
2304 	}
2305 	netmap_adapter_get(&hwna->up);
2306 
2307 #ifdef linux
2308 	if (ifp->netdev_ops) {
2309 		/* prepare a clone of the netdev ops */
2310 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2311 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2312 #else
2313 		hwna->nm_ndo = *ifp->netdev_ops;
2314 #endif
2315 	}
2316 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2317 #endif /* linux */
2318 
2319 	D("success for %s tx %d/%d rx %d/%d queues/slots",
2320 		NM_IFPNAME(ifp),
2321 		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2322 		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2323 		);
2324 	return 0;
2325 
2326 fail:
2327 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2328 	if (ifp)
2329 		netmap_detach(ifp);
2330 	return (hwna ? EINVAL : ENOMEM);
2331 }
2332 
2333 
2334 void
2335 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2336 {
2337 	if (!na) {
2338 		return;
2339 	}
2340 
2341 	refcount_acquire(&na->na_refcount);
2342 }
2343 
2344 
2345 /* returns 1 iff the netmap_adapter is destroyed */
2346 int
2347 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2348 {
2349 	if (!na)
2350 		return 1;
2351 
2352 	if (!refcount_release(&na->na_refcount))
2353 		return 0;
2354 
2355 	if (na->nm_dtor)
2356 		na->nm_dtor(na);
2357 
2358 	netmap_detach_common(na);
2359 
2360 	return 1;
2361 }
2362 
2363 /* nm_krings_create callback for all hardware native adapters */
2364 int
2365 netmap_hw_krings_create(struct netmap_adapter *na)
2366 {
2367 	int ret = netmap_krings_create(na, 0);
2368 	if (ret == 0) {
2369 		/* initialize the mbq for the sw rx ring */
2370 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2371 		ND("initialized sw rx queue %d", na->num_rx_rings);
2372 	}
2373 	return ret;
2374 }
2375 
2376 
2377 
2378 /*
2379  * Called on module unload by the netmap-enabled drivers
2380  */
2381 void
2382 netmap_detach(struct ifnet *ifp)
2383 {
2384 	struct netmap_adapter *na = NA(ifp);
2385 
2386 	if (!na)
2387 		return;
2388 
2389 	NMG_LOCK();
2390 	netmap_disable_all_rings(ifp);
2391 	if (!netmap_adapter_put(na)) {
2392 		/* someone is still using the adapter,
2393 		 * tell them that the interface is gone
2394 		 */
2395 		na->ifp = NULL;
2396 		/* give them a chance to notice */
2397 		netmap_enable_all_rings(ifp);
2398 	}
2399 	NMG_UNLOCK();
2400 }
2401 
2402 
2403 /*
2404  * Intercept packets from the network stack and pass them
2405  * to netmap as incoming packets on the 'software' ring.
2406  *
2407  * We only store packets in a bounded mbq and then copy them
2408  * into the ring slots in the relevant rxsync routine.
2409  *
2410  * We rely on the OS to make sure that the ifp and na do not go
2411  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2412  * In nm_register() or whenever there is a reinitialization,
2413  * we make sure to make the mode change visible here.
2414  */
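/*
 * A possible caller, shown only as an illustration (the exact hook point
 * is OS- and driver-specific): a transmit routine that diverts
 * host-generated mbufs here while the interface is in netmap mode, so
 * that they show up on the 'software' rx ring instead of going to the
 * hardware:
 *
 *	if (ifp->if_capenable & IFCAP_NETMAP)
 *		return (netmap_transmit(ifp, m));	// consumes 'm'
 */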
2415 int
2416 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2417 {
2418 	struct netmap_adapter *na = NA(ifp);
2419 	struct netmap_kring *kring;
2420 	u_int len = MBUF_LEN(m);
2421 	u_int error = ENOBUFS;
2422 	struct mbq *q;
2423 	int space;
2424 
2425 	// XXX [Linux] we do not need this lock
2426 	// if we follow the down/configure/up protocol -gl
2427 	// mtx_lock(&na->core_lock);
2428 
2429 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2430 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2431 		error = ENXIO;
2432 		goto done;
2433 	}
2434 
2435 	kring = &na->rx_rings[na->num_rx_rings];
2436 	q = &kring->rx_queue;
2437 
2438 	// XXX reconsider long packets if we handle fragments
2439 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2440 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2441 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2442 		goto done;
2443 	}
2444 
2445 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2446 	 * and maybe other instances of netmap_transmit (the latter
2447 	 * not possible on Linux).
2448 	 * Also avoid overflowing the queue.
2449 	 */
2450 	mbq_lock(q);
2451 
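	/*
	 * 'space' below is the number of slots already holding packets
	 * in the host rx kring (nr_hwtail - nr_hwcur, wrapped modulo the
	 * ring size); together with the mbufs already queued it must not
	 * fill the ring, which holds at most nkr_num_slots - 1 packets.
	 */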
2452 	space = kring->nr_hwtail - kring->nr_hwcur;
2453 	if (space < 0)
2454 		space += kring->nkr_num_slots;
2455 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2456 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2457 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2458 			len, m);
2459 	} else {
2460 		mbq_enqueue(q, m);
2461 		ND(10, "%s %d bufs in queue len %d m %p",
2462 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2463 		/* notify outside the lock */
2464 		m = NULL;
2465 		error = 0;
2466 	}
2467 	mbq_unlock(q);
2468 
2469 done:
2470 	if (m)
2471 		m_freem(m);
2472 	/* unconditionally wake up listeners */
2473 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2474 	/* this is normally netmap_notify(), but for nics
2475 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2476 	 * which possibly forwards the frames through the switch
2477 	 */
2478 
2479 	return (error);
2480 }
2481 
2482 
2483 /*
2484  * netmap_reset() is called by the driver routines when reinitializing
2485  * a ring. The driver is in charge of locking to protect the kring.
2486  * If native netmap mode is not set, just return NULL.
2487  */
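/*
 * Sketch of a typical call site in a driver ring (re)initialization
 * routine (the txr/adapter names are hypothetical):
 *
 *	struct netmap_adapter *na = NA(adapter->ifp);
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, txr->me, 0);
 *
 *	if (slot != NULL) {
 *		// netmap mode: program the hardware descriptors from the
 *		// netmap buffers attached to slot[], instead of mbufs
 *	}
 */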
2488 struct netmap_slot *
2489 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2490 	u_int new_cur)
2491 {
2492 	struct netmap_kring *kring;
2493 	int new_hwofs, lim;
2494 
2495 	if (na == NULL) {
2496 		D("NULL na, should not happen");
2497 		return NULL;	/* no netmap support here */
2498 	}
2499 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2500 		ND("interface not in netmap mode");
2501 		return NULL;	/* nothing to reinitialize */
2502 	}
2503 
2504 	/* XXX note: in the new scheme, we are not guaranteed to be
2505 	 * under lock (e.g. when called on a device reset).
2506 	 * In this case, we should set a flag and not trust the
2507 	 * values too much. In practice: TODO
2508 	 * - set a RESET flag somewhere in the kring
2509 	 * - do the processing in a conservative way
2510 	 * - let the *sync() fixup at the end.
2511 	 */
2512 	if (tx == NR_TX) {
2513 		if (n >= na->num_tx_rings)
2514 			return NULL;
2515 		kring = na->tx_rings + n;
2516 		// XXX check whether we should use hwcur or rcur
2517 		new_hwofs = kring->nr_hwcur - new_cur;
2518 	} else {
2519 		if (n >= na->num_rx_rings)
2520 			return NULL;
2521 		kring = na->rx_rings + n;
2522 		new_hwofs = kring->nr_hwtail - new_cur;
2523 	}
2524 	lim = kring->nkr_num_slots - 1;
2525 	if (new_hwofs > lim)
2526 		new_hwofs -= lim + 1;
2527 
2528 	/* Always set the new offset value and realign the ring. */
2529 	if (netmap_verbose)
2530 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2531 		NM_IFPNAME(na->ifp),
2532 		tx == NR_TX ? "TX" : "RX", n,
2533 		kring->nkr_hwofs, new_hwofs,
2534 		kring->nr_hwtail,
2535 		tx == NR_TX ? lim : kring->nr_hwtail);
2536 	kring->nkr_hwofs = new_hwofs;
2537 	if (tx == NR_TX) {
2538 		kring->nr_hwtail = kring->nr_hwcur + lim;
2539 		if (kring->nr_hwtail > lim)
2540 			kring->nr_hwtail -= lim + 1;
2541 	}
2542 
2543 #if 0 // def linux
2544 	/* XXX check that the mappings are correct */
2545 	/* need ring_nr, adapter->pdev, direction */
2546 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2547 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2548 		D("error mapping rx netmap buffer %d", i);
2549 		// XXX fix error handling
2550 	}
2551 
2552 #endif /* linux */
2553 	/*
2554 	 * Wakeup on the individual and global selwait
2555 	 * We do the wakeup here, but the ring is not yet reconfigured.
2556 	 * However, we are under lock so there are no races.
2557 	 */
2558 	na->nm_notify(na, n, tx, 0);
2559 	return kring->ring->slot;
2560 }
2561 
2562 
2563 /*
2564  * Dispatch rx/tx interrupts to the netmap rings.
2565  *
2566  * "work_done" is non-null on the RX path, NULL for the TX path.
2567  * We rely on the OS to make sure that there is only one active
2568  * instance per queue, and that there is appropriate locking.
2569  *
2570  * The 'notify' routine depends on what the ring is attached to.
2571  * - for a netmap file descriptor, do a selwakeup on the individual
2572  *   waitqueue, plus one on the global one if needed
2573  * - for a switch, call the proper forwarding routine
2574  * - XXX more ?
2575  */
2576 void
2577 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2578 {
2579 	struct netmap_adapter *na = NA(ifp);
2580 	struct netmap_kring *kring;
2581 
2582 	q &= NETMAP_RING_MASK;
2583 
2584 	if (netmap_verbose) {
2585 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2586 	}
2587 
2588 	if (work_done) { /* RX path */
2589 		if (q >= na->num_rx_rings)
2590 			return;	// not a physical queue
2591 		kring = na->rx_rings + q;
2592 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2593 		na->nm_notify(na, q, NR_RX, 0);
2594 		*work_done = 1; /* do not fire napi again */
2595 	} else { /* TX path */
2596 		if (q >= na->num_tx_rings)
2597 			return;	// not a physical queue
2598 		kring = na->tx_rings + q;
2599 		na->nm_notify(na, q, NR_TX, 0);
2600 	}
2601 }
2602 
2603 
2604 /*
2605  * Default functions to handle rx/tx interrupts from a physical device.
2606  * "work_done" is non-null on the RX path, NULL for the TX path.
2607  *
2608  * If the card is not in netmap mode, simply return 0,
2609  * so that the caller proceeds with regular processing.
2610  * Otherwise call netmap_common_irq() and return 1.
2611  *
2612  * If the card is connected to a netmap file descriptor,
2613  * do a selwakeup on the individual queue, plus one on the global one
2614  * if needed (multiqueue card _and_ there are multiqueue listeners),
2615  * and return 1.
2616  *
2617  * Finally, if called on rx from an interface connected to a switch,
2618  * calls the proper forwarding routine, and return 1.
2619  */
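/*
 * Sketch of a typical call site in a driver rx interrupt (or NAPI)
 * handler; the rxr/adapter names are hypothetical:
 *
 *	u_int work_done;
 *
 *	if (netmap_rx_irq(adapter->ifp, rxr->me, &work_done))
 *		return;		// netmap consumed the event
 *	// ... otherwise fall through to regular mbuf processing ...
 */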
2620 int
2621 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2622 {
2623 	// XXX could we check NAF_NATIVE_ON ?
2624 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2625 		return 0;
2626 
2627 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2628 		ND("use regular interrupt");
2629 		return 0;
2630 	}
2631 
2632 	netmap_common_irq(ifp, q, work_done);
2633 	return 1;
2634 }
2635 
2636 
2637 /*
2638  * Module loader and unloader
2639  *
2640  * netmap_init() creates the /dev/netmap device and initializes
2641  * all global variables. Returns 0 on success, errno on failure
2642  * (though in practice failure is not expected).
2643  *
2644  * netmap_fini() destroys everything.
2645  */
2646 
2647 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2648 extern struct cdevsw netmap_cdevsw;
2649 
2650 
2651 void
2652 netmap_fini(void)
2653 {
2654 	// XXX destroy_bridges() ?
2655 	if (netmap_dev)
2656 		destroy_dev(netmap_dev);
2657 	netmap_mem_fini();
2658 	NMG_LOCK_DESTROY();
2659 	printf("netmap: unloaded module.\n");
2660 }
2661 
2662 
2663 int
2664 netmap_init(void)
2665 {
2666 	int error;
2667 
2668 	NMG_LOCK_INIT();
2669 
2670 	error = netmap_mem_init();
2671 	if (error != 0)
2672 		goto fail;
2673 	/* XXX could use make_dev_credv() to get error number */
2674 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2675 			      "netmap");
2676 	if (!netmap_dev)
2677 		goto fail;
2678 
2679 	netmap_init_bridges();
2680 	printf("netmap: loaded module\n");
2681 	return (0);
2682 fail:
2683 	netmap_fini();
2684 	return (EINVAL); /* may be incorrect */
2685 }
2686