xref: /freebsd/sys/dev/netmap/netmap.c (revision 864c53ead899f7838cd2e1cca3b485a4a82f5cdc)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    a select()able file descriptor on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface);
 *    a brief usage sketch of these steps follows right after this comment.
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring a new port or deleting an existing one, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
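
/*
 * A minimal userspace sketch of steps 1..6 above. This is illustrative
 * only: error handling is omitted and the receive loop is simplified;
 * see netmap(4) and <net/netmap_user.h> for the authoritative API.
 */
#if 0	/* example code, not compiled with this module */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <net/netmap.h>
#include <net/netmap_user.h>	/* NETMAP_IF, NETMAP_RXRING, NETMAP_BUF */

static void
rx_loop(const char *ifname)	/* e.g. "em0" (any netmap-capable NIC) */
{
	struct nmreq req;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct pollfd pfd;
	char *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);		/* 1. get a descriptor */
	memset(&req, 0, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, ifname, sizeof(req.nr_name) - 1);
	req.nr_flags = NR_REG_ALL_NIC;
	ioctl(fd, NIOCREGIF, &req);			/* 2. bind the fd to the interface */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);				/* 3. map the shared region */
	nifp = NETMAP_IF(mem, req.nr_offset);		/* 4. locate rings and buffers */
	ring = NETMAP_RXRING(nifp, 0);

	pfd.fd = fd;
	pfd.events = POLLIN;
	for (;;) {
		poll(&pfd, 1, 1000);			/* 6. wait for new packets */
		while (!nm_ring_empty(ring)) {
			struct netmap_slot *slot = &ring->slot[ring->cur];
			char *buf = NETMAP_BUF(ring, slot->buf_idx);

			/* process buf[0 .. slot->len - 1] here */
			(void)buf;
			ring->head = ring->cur = nm_ring_next(ring, ring->cur);
		}
		/* 5. poll() already syncs; an explicit NIOCRXSYNC also works */
	}
}
#endif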
126 
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/filio.h>	/* FIONBIO */
141 #include <sys/sockio.h>
142 #include <sys/socketvar.h>	/* struct socket */
143 #include <sys/malloc.h>
144 #include <sys/poll.h>
145 #include <sys/rwlock.h>
146 #include <sys/socket.h> /* sockaddrs */
147 #include <sys/selinfo.h>
148 #include <sys/sysctl.h>
149 #include <sys/jail.h>
150 #include <net/vnet.h>
151 #include <net/if.h>
152 #include <net/if_var.h>
153 #include <net/bpf.h>		/* BIOCIMMEDIATE */
154 #include <machine/bus.h>	/* bus_dmamap_* */
155 #include <sys/endian.h>
156 #include <sys/refcount.h>
157 
158 
159 /* reduce conditional code */
160 // linux API, used for the knlist in FreeBSD
161 #define init_waitqueue_head(x)	knlist_init_mtx(&(x)->si_note, NULL)
162 
163 void freebsd_selwakeup(struct selinfo *si, int pri);
164 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
165 
166 #elif defined(linux)
167 
168 #include "bsd_glue.h"
169 
170 
171 
172 #elif defined(__APPLE__)
173 
174 #warning OSX support is only partial
175 #include "osx_glue.h"
176 
177 #else
178 
179 #error	Unsupported platform
180 
181 #endif /* unsupported */
182 
183 /*
184  * common headers
185  */
186 #include <net/netmap.h>
187 #include <dev/netmap/netmap_kern.h>
188 #include <dev/netmap/netmap_mem2.h>
189 
190 
191 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
192 
193 /*
194  * The following variables are used by the drivers and replicate
195  * fields in the global memory pool. They only refer to buffers
196  * used by physical interfaces.
197  */
198 u_int netmap_total_buffers;
199 u_int netmap_buf_size;
200 char *netmap_buffer_base;	/* also address of an invalid buffer */
201 
202 /* user-controlled variables */
203 int netmap_verbose;
204 
205 static int netmap_no_timestamp; /* don't timestamp on rxsync */
206 
207 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
208 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
209     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
210 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
211     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
212 int netmap_mitigate = 1;
213 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
214 int netmap_no_pendintr = 1;
215 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
216     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
217 int netmap_txsync_retry = 2;
218 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
219     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
220 
221 int netmap_flags = 0;	/* debug flags */
222 int netmap_fwd = 0;	/* force transparent mode */
223 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
224 
225 /*
226  * netmap_admode selects the netmap mode to use.
227  * Invalid values are reset to NETMAP_ADMODE_BEST
228  */
229 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
230 	NETMAP_ADMODE_NATIVE,	/* either native or none */
231 	NETMAP_ADMODE_GENERIC,	/* force generic */
232 	NETMAP_ADMODE_LAST };
233 static int netmap_admode = NETMAP_ADMODE_BEST;
234 
235 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
236 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
237 int netmap_generic_rings = 1;   /* number of queues in generic. */
238 
239 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
240 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
241 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
242 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
243 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
244 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
245 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
246 
247 NMG_LOCK_T	netmap_global_lock;
248 
249 
250 static void
251 nm_kr_get(struct netmap_kring *kr)
252 {
253 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
254 		tsleep(kr, 0, "NM_KR_GET", 4);
255 }
256 
257 
258 /*
259  * mark the ring as stopped, and run through the locks
260  * to make sure other users get to see it.
261  */
262 void
263 netmap_disable_ring(struct netmap_kring *kr)
264 {
265 	kr->nkr_stopped = 1;
266 	nm_kr_get(kr);
267 	mtx_lock(&kr->q_lock);
268 	mtx_unlock(&kr->q_lock);
269 	nm_kr_put(kr);
270 }
271 
272 
273 /* stop or enable all the rings of na */
274 static void
275 netmap_set_all_rings(struct ifnet *ifp, int stopped)
276 {
277 	struct netmap_adapter *na;
278 	int i;
279 	u_int ntx, nrx;
280 
281 	if (!(ifp->if_capenable & IFCAP_NETMAP))
282 		return;
283 
284 	na = NA(ifp);
285 
286 	ntx = netmap_real_tx_rings(na);
287 	nrx = netmap_real_rx_rings(na);
288 
289 	for (i = 0; i < ntx; i++) {
290 		if (stopped)
291 			netmap_disable_ring(na->tx_rings + i);
292 		else
293 			na->tx_rings[i].nkr_stopped = 0;
294 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY);
295 	}
296 
297 	for (i = 0; i < nrx; i++) {
298 		if (stopped)
299 			netmap_disable_ring(na->rx_rings + i);
300 		else
301 			na->rx_rings[i].nkr_stopped = 0;
302 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY);
303 	}
304 }
305 
306 
307 /*
308  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
309  * to finish and prevents any new one from starting.  Call this before turning
310  * netmap mode off, or before removing the hardware rings (e.g., on module
311  * unload).  As a rule of thumb for linux drivers, this should be placed near
312  * each napi_disable().
313  */
314 void
315 netmap_disable_all_rings(struct ifnet *ifp)
316 {
317 	netmap_set_all_rings(ifp, 1 /* stopped */);
318 }
319 
320 
321 /*
322  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
323  * adapter's rings.  In linux drivers, this should be placed near each
324  * napi_enable().
325  */
326 void
327 netmap_enable_all_rings(struct ifnet *ifp)
328 {
329 	netmap_set_all_rings(ifp, 0 /* enabled */);
330 }
331 
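/*
 * Sketch of how a hypothetical FreeBSD driver could use the two helpers
 * above around a hardware reinitialization. The foo_* names are
 * placeholders, not a real driver API.
 */
#if 0	/* example code, not compiled with this module */
static void
foo_reinit_locked(struct foo_softc *sc)
{
	struct ifnet *ifp = sc->foo_ifp;

	netmap_disable_all_rings(ifp);	/* wait for running *_sync(), block new ones */
	foo_stop(sc);			/* placeholder: tear down the hardware rings */
	foo_init(sc);			/* placeholder: bring them back up */
	netmap_enable_all_rings(ifp);	/* let txsync()/rxsync() run again */
}
#endif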
332 
333 /*
334  * generic bound_checking function
335  */
336 u_int
337 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
338 {
339 	u_int oldv = *v;
340 	const char *op = NULL;
341 
342 	if (dflt < lo)
343 		dflt = lo;
344 	if (dflt > hi)
345 		dflt = hi;
346 	if (oldv < lo) {
347 		*v = dflt;
348 		op = "Bump";
349 	} else if (oldv > hi) {
350 		*v = hi;
351 		op = "Clamp";
352 	}
353 	if (op && msg)
354 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
355 	return *v;
356 }
357 
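/*
 * A typical use of nm_bound_var() (hypothetical values) is clamping a
 * sysctl-controlled parameter before it is consumed, e.g.:
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 */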
358 
359 /*
360  * packet-dump function, user-supplied or static buffer.
361  * The destination buffer must be at least 30+4*len
362  */
363 const char *
364 nm_dump_buf(char *p, int len, int lim, char *dst)
365 {
366 	static char _dst[8192];
367 	int i, j, i0;
368 	static char hex[] ="0123456789abcdef";
369 	char *o;	/* output position */
370 
371 #define P_HI(x)	hex[((x) & 0xf0)>>4]
372 #define P_LO(x)	hex[((x) & 0xf)]
373 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
374 	if (!dst)
375 		dst = _dst;
376 	if (lim <= 0 || lim > len)
377 		lim = len;
378 	o = dst;
379 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
380 	o += strlen(o);
381 	/* hexdump routine */
382 	for (i = 0; i < lim; ) {
383 		sprintf(o, "%5d: ", i);
384 		o += strlen(o);
385 		memset(o, ' ', 48);
386 		i0 = i;
387 		for (j=0; j < 16 && i < lim; i++, j++) {
388 			o[j*3] = P_HI(p[i]);
389 			o[j*3+1] = P_LO(p[i]);
390 		}
391 		i = i0;
392 		for (j=0; j < 16 && i < lim; i++, j++)
393 			o[j + 48] = P_C(p[i]);
394 		o[j+48] = '\n';
395 		o += j+49;
396 	}
397 	*o = '\0';
398 #undef P_HI
399 #undef P_LO
400 #undef P_C
401 	return dst;
402 }
403 
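/*
 * Example use of nm_dump_buf(), mirroring the call in
 * netmap_rxsync_from_host() below: dump at most 128 bytes of a packet
 * buffer into the static scratch buffer.
 *
 *	D("%s", nm_dump_buf(buf, len, 128, NULL));
 */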
404 
405 /*
406  * Fetch configuration from the device, to cope with dynamic
407  * reconfigurations after loading the module.
408  */
409 /* call with NMG_LOCK held */
410 int
411 netmap_update_config(struct netmap_adapter *na)
412 {
413 	struct ifnet *ifp = na->ifp;
414 	u_int txr, txd, rxr, rxd;
415 
416 	txr = txd = rxr = rxd = 0;
417 	if (na->nm_config) {
418 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
419 	} else {
420 		/* take whatever we had at init time */
421 		txr = na->num_tx_rings;
422 		txd = na->num_tx_desc;
423 		rxr = na->num_rx_rings;
424 		rxd = na->num_rx_desc;
425 	}
426 
427 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
428 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
429 		return 0; /* nothing changed */
430 	if (netmap_verbose || na->active_fds > 0) {
431 		D("stored config %s: txring %d x %d, rxring %d x %d",
432 			NM_IFPNAME(ifp),
433 			na->num_tx_rings, na->num_tx_desc,
434 			na->num_rx_rings, na->num_rx_desc);
435 		D("new config %s: txring %d x %d, rxring %d x %d",
436 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
437 	}
438 	if (na->active_fds == 0) {
439 		D("configuration changed (but fine)");
440 		na->num_tx_rings = txr;
441 		na->num_tx_desc = txd;
442 		na->num_rx_rings = rxr;
443 		na->num_rx_desc = rxd;
444 		return 0;
445 	}
446 	D("configuration changed while active, this is bad...");
447 	return 1;
448 }
449 
450 static int
451 netmap_txsync_compat(struct netmap_kring *kring, int flags)
452 {
453 	struct netmap_adapter *na = kring->na;
454 	return na->nm_txsync(na, kring->ring_id, flags);
455 }
456 
457 static int
458 netmap_rxsync_compat(struct netmap_kring *kring, int flags)
459 {
460 	struct netmap_adapter *na = kring->na;
461 	return na->nm_rxsync(na, kring->ring_id, flags);
462 }
463 
464 /* kring->nm_sync callback for the host tx ring */
465 static int
466 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
467 {
468 	(void)flags; /* unused */
469 	netmap_txsync_to_host(kring->na);
470 	return 0;
471 }
472 
473 /* kring->nm_sync callback for the host rx ring */
474 static int
475 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
476 {
477 	(void)flags; /* unused */
478 	netmap_rxsync_from_host(kring->na, NULL, NULL);
479 	return 0;
480 }
481 
482 
483 
484 /* create the krings array and initialize the fields common to all adapters.
485  * The array layout is this:
486  *
487  *                    +----------+
488  * na->tx_rings ----->|          | \
489  *                    |          |  } na->num_tx_rings
490  *                    |          | /
491  *                    +----------+
492  *                    |          |    host tx kring
493  * na->rx_rings ----> +----------+
494  *                    |          | \
495  *                    |          |  } na->num_rx_rings
496  *                    |          | /
497  *                    +----------+
498  *                    |          |    host rx kring
499  *                    +----------+
500  * na->tailroom ----->|          | \
501  *                    |          |  } tailroom bytes
502  *                    |          | /
503  *                    +----------+
504  *
505  * Note: for compatibility, host krings are created even when not needed.
506  * The tailroom space is currently used by vale ports for allocating leases.
507  */
508 /* call with NMG_LOCK held */
509 int
510 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
511 {
512 	u_int i, len, ndesc;
513 	struct netmap_kring *kring;
514 	u_int ntx, nrx;
515 
516 	/* account for the (possibly fake) host rings */
517 	ntx = na->num_tx_rings + 1;
518 	nrx = na->num_rx_rings + 1;
519 
520 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
521 
522 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
523 	if (na->tx_rings == NULL) {
524 		D("Cannot allocate krings");
525 		return ENOMEM;
526 	}
527 	na->rx_rings = na->tx_rings + ntx;
528 
529 	/*
530 	 * All fields in krings are 0 except the ones initialized below,
531 	 * but it is better to be explicit on important kring fields.
532 	 */
533 	ndesc = na->num_tx_desc;
534 	for (i = 0; i < ntx; i++) { /* Transmit rings */
535 		kring = &na->tx_rings[i];
536 		bzero(kring, sizeof(*kring));
537 		kring->na = na;
538 		kring->ring_id = i;
539 		kring->nkr_num_slots = ndesc;
540 		if (i < na->num_tx_rings) {
541 			kring->nm_sync = netmap_txsync_compat; // XXX
542 		} else if (i == na->num_tx_rings) {
543 			kring->nm_sync = netmap_txsync_to_host_compat;
544 		}
545 		/*
546 		 * IMPORTANT: Always keep one slot empty.
547 		 */
548 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
549 		kring->rtail = kring->nr_hwtail = ndesc - 1;
550 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
551 		ND("ktx %s h %d c %d t %d",
552 			kring->name, kring->rhead, kring->rcur, kring->rtail);
553 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
554 		init_waitqueue_head(&kring->si);
555 	}
556 
557 	ndesc = na->num_rx_desc;
558 	for (i = 0; i < nrx; i++) { /* Receive rings */
559 		kring = &na->rx_rings[i];
560 		bzero(kring, sizeof(*kring));
561 		kring->na = na;
562 		kring->ring_id = i;
563 		kring->nkr_num_slots = ndesc;
564 		if (i < na->num_rx_rings) {
565 			kring->nm_sync = netmap_rxsync_compat; // XXX
566 		} else if (i == na->num_rx_rings) {
567 			kring->nm_sync = netmap_rxsync_from_host_compat;
568 		}
569 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
570 		kring->rtail = kring->nr_hwtail = 0;
571 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
572 		ND("krx %s h %d c %d t %d",
573 			kring->name, kring->rhead, kring->rcur, kring->rtail);
574 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
575 		init_waitqueue_head(&kring->si);
576 	}
577 	init_waitqueue_head(&na->tx_si);
578 	init_waitqueue_head(&na->rx_si);
579 
580 	na->tailroom = na->rx_rings + nrx;
581 
582 	return 0;
583 }
584 
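/*
 * Given the layout above, the host krings and the tailroom can be
 * reached with simple pointer arithmetic (a sketch, mirroring what
 * netmap_krings_create() just did):
 *
 *	struct netmap_kring *tx_host = &na->tx_rings[na->num_tx_rings];
 *	struct netmap_kring *rx_host = &na->rx_rings[na->num_rx_rings];
 *	char *leases = (char *)na->tailroom;	// == (char *)(rx_host + 1)
 */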
585 
586 /* undo the actions performed by netmap_krings_create */
587 /* call with NMG_LOCK held */
588 void
589 netmap_krings_delete(struct netmap_adapter *na)
590 {
591 	struct netmap_kring *kring = na->tx_rings;
592 
593 	/* we rely on the krings layout described above */
594 	for ( ; kring != na->tailroom; kring++) {
595 		mtx_destroy(&kring->q_lock);
596 	}
597 	free(na->tx_rings, M_DEVBUF);
598 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
599 }
600 
601 
602 /*
603  * Destructor for NIC ports. They also have an mbuf queue
604  * on the rings connected to the host so we need to purge
605  * them first.
606  */
607 /* call with NMG_LOCK held */
608 static void
609 netmap_hw_krings_delete(struct netmap_adapter *na)
610 {
611 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
612 
613 	ND("destroy sw mbq with len %d", mbq_len(q));
614 	mbq_purge(q);
615 	mbq_safe_destroy(q);
616 	netmap_krings_delete(na);
617 }
618 
619 
620 /* create a new netmap_if for a newly registered fd.
621  * If this is the first registration of the adapter,
622  * also create the netmap rings and their in-kernel view,
623  * the netmap krings.
624  */
625 /* call with NMG_LOCK held */
626 static struct netmap_if*
627 netmap_if_new(const char *ifname, struct netmap_adapter *na)
628 {
629 	struct netmap_if *nifp;
630 
631 	if (netmap_update_config(na)) {
632 		/* configuration mismatch, report and fail */
633 		return NULL;
634 	}
635 
636 	if (na->active_fds)	/* already registered */
637 		goto final;
638 
639 	/* create and init the krings arrays.
640 	 * Depending on the adapter, this may also create
641 	 * the netmap rings themselves
642 	 */
643 	if (na->nm_krings_create(na))
644 		goto cleanup;
645 
646 	/* create all missing netmap rings */
647 	if (netmap_mem_rings_create(na))
648 		goto cleanup;
649 
650 final:
651 
652 	/* in all cases, create a new netmap if */
653 	nifp = netmap_mem_if_new(ifname, na);
654 	if (nifp == NULL)
655 		goto cleanup;
656 
657 	return (nifp);
658 
659 cleanup:
660 
661 	if (na->active_fds == 0) {
662 		netmap_mem_rings_delete(na);
663 		na->nm_krings_delete(na);
664 	}
665 
666 	return NULL;
667 }
668 
669 
670 /* grab a reference to the memory allocator, if we don't have one already.  The
671  * reference is taken from the netmap_adapter registered with the priv.
672  */
673 /* call with NMG_LOCK held */
674 static int
675 netmap_get_memory_locked(struct netmap_priv_d* p)
676 {
677 	struct netmap_mem_d *nmd;
678 	int error = 0;
679 
680 	if (p->np_na == NULL) {
681 		if (!netmap_mmap_unreg)
682 			return ENODEV;
683 		/* for compatibility with older versions of the API
684  		 * we use the global allocator when no interface has been
685  		 * registered
686  		 */
687 		nmd = &nm_mem;
688 	} else {
689 		nmd = p->np_na->nm_mem;
690 	}
691 	if (p->np_mref == NULL) {
692 		error = netmap_mem_finalize(nmd);
693 		if (!error)
694 			p->np_mref = nmd;
695 	} else if (p->np_mref != nmd) {
696 		/* a virtual port has been registered, but previous
697  		 * syscalls already used the global allocator.
698  		 * We cannot continue
699  		 */
700 		error = ENODEV;
701 	}
702 	return error;
703 }
704 
705 
706 /* call with NMG_LOCK *not* held */
707 int
708 netmap_get_memory(struct netmap_priv_d* p)
709 {
710 	int error;
711 	NMG_LOCK();
712 	error = netmap_get_memory_locked(p);
713 	NMG_UNLOCK();
714 	return error;
715 }
716 
717 
718 /* call with NMG_LOCK held */
719 static int
720 netmap_have_memory_locked(struct netmap_priv_d* p)
721 {
722 	return p->np_mref != NULL;
723 }
724 
725 
726 /* call with NMG_LOCK held */
727 static void
728 netmap_drop_memory_locked(struct netmap_priv_d* p)
729 {
730 	if (p->np_mref) {
731 		netmap_mem_deref(p->np_mref);
732 		p->np_mref = NULL;
733 	}
734 }
735 
736 
737 /*
738  * File descriptor's private data destructor.
739  *
740  * Call nm_register(na, 0) to stop netmap mode on the interface and
741  * revert to normal operation. We expect that np_na->ifp has not gone away.
742  * The second argument is the nifp to work on. In some cases it is
743  * not attached yet to the netmap_priv_d so we need to pass it as
744  * a separate argument.
745  */
746 /* call with NMG_LOCK held */
747 static void
748 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
749 {
750 	struct netmap_adapter *na = priv->np_na;
751 	struct ifnet *ifp = na->ifp;
752 
753 	NMG_LOCK_ASSERT();
754 	na->active_fds--;
755 	if (na->active_fds <= 0) {	/* last instance */
756 
757 		if (netmap_verbose)
758 			D("deleting last instance for %s", NM_IFPNAME(ifp));
759 		/*
760 		 * (TO CHECK) This function is only called
761 		 * when the last reference to this file descriptor goes
762 		 * away. This means we cannot have any pending poll()
763 		 * or interrupt routine operating on the structure.
764 		 * XXX The file may be closed in a thread while
765 		 * another thread is using it.
766 		 * Linux keeps the file opened until the last reference
767 		 * by any outstanding ioctl/poll or mmap is gone.
768 		 * FreeBSD does not track mmap()s (but we do) and
769 		 * wakes up any sleeping poll(). Need to check what
770 		 * happens if the close() occurs while a concurrent
771 		 * syscall is running.
772 		 */
773 		if (ifp)
774 			na->nm_register(na, 0); /* off, clear flags */
775 		/* Wake up any sleeping threads. netmap_poll will
776 		 * then return POLLERR
777 		 * XXX The wake up now must happen during *_down(), when
778 		 * we order all activities to stop. -gl
779 		 */
780 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
781 		/* knlist_destroy(&na->tx_si.si_note); */
782 		/* knlist_destroy(&na->rx_si.si_note); */
783 
784 		/* delete rings and buffers */
785 		netmap_mem_rings_delete(na);
786 		na->nm_krings_delete(na);
787 	}
788 	/* delete the nifp */
789 	netmap_mem_if_delete(na, nifp);
790 }
791 
792 /* call with NMG_LOCK held */
793 static __inline int
794 nm_tx_si_user(struct netmap_priv_d *priv)
795 {
796 	return (priv->np_na != NULL &&
797 		(priv->np_txqlast - priv->np_txqfirst > 1));
798 }
799 
800 /* call with NMG_LOCK held */
801 static __inline int
802 nm_rx_si_user(struct netmap_priv_d *priv)
803 {
804 	return (priv->np_na != NULL &&
805 		(priv->np_rxqlast - priv->np_rxqfirst > 1));
806 }
807 
808 
809 /*
810  * Destructor of the netmap_priv_d, called when the fd has
811  * no active open() and mmap(). Also called in error paths.
812  *
813  * returns 1 if this is the last instance and we can free priv
814  */
815 /* call with NMG_LOCK held */
816 int
817 netmap_dtor_locked(struct netmap_priv_d *priv)
818 {
819 	struct netmap_adapter *na = priv->np_na;
820 
821 #ifdef __FreeBSD__
822 	/*
823 	 * np_refcount is the number of active mmaps on
824 	 * this file descriptor
825 	 */
826 	if (--priv->np_refcount > 0) {
827 		return 0;
828 	}
829 #endif /* __FreeBSD__ */
830 	if (!na) {
831 	    return 1; //XXX is it correct?
832 	}
833 	netmap_do_unregif(priv, priv->np_nifp);
834 	priv->np_nifp = NULL;
835 	netmap_drop_memory_locked(priv);
836 	if (priv->np_na) {
837 		if (nm_tx_si_user(priv))
838 			na->tx_si_users--;
839 		if (nm_rx_si_user(priv))
840 			na->rx_si_users--;
841 		netmap_adapter_put(na);
842 		priv->np_na = NULL;
843 	}
844 	return 1;
845 }
846 
847 
848 /* call with NMG_LOCK *not* held */
849 void
850 netmap_dtor(void *data)
851 {
852 	struct netmap_priv_d *priv = data;
853 	int last_instance;
854 
855 	NMG_LOCK();
856 	last_instance = netmap_dtor_locked(priv);
857 	NMG_UNLOCK();
858 	if (last_instance) {
859 		bzero(priv, sizeof(*priv));	/* for safety */
860 		free(priv, M_DEVBUF);
861 	}
862 }
863 
864 
865 
866 
867 /*
868  * Handlers for synchronization of the queues from/to the host.
869  * Netmap has two operating modes:
870  * - in the default mode, the rings connected to the host stack are
871  *   just another ring pair managed by userspace;
872  * - in transparent mode (XXX to be defined) incoming packets
873  *   (from the host or the NIC) are marked as NS_FORWARD upon
874  *   arrival, and the user application has a chance to reset the
875  *   flag for packets that should be dropped.
876  *   On the RXSYNC or poll(), packets in RX rings between
877  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
878  *   to the other side.
879  * The transfer NIC --> host is relatively easy, just encapsulate
880  * into mbufs and we are done. The host --> NIC side is slightly
881  * harder because there might not be room in the tx ring so it
882  * might take a while before releasing the buffer.
883  */
884 
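/*
 * Userspace sketch of transparent mode, following the description above
 * and using the <net/netmap_user.h> helpers as in the sketch near the top
 * of this file. It assumes the fd is bound to the host rings (NR_REG_SW)
 * and that should_drop() is a hypothetical application policy.
 */
#if 0	/* example code, not compiled with this module */
static void
filter_host_rx_ring(struct netmap_ring *ring)
{
	uint32_t cur = ring->cur;

	ring->flags |= NR_FORWARD;	/* opt in to transparent forwarding */
	while (cur != ring->tail) {
		struct netmap_slot *slot = &ring->slot[cur];
		char *buf = NETMAP_BUF(ring, slot->buf_idx);

		if (should_drop(buf, slot->len))
			slot->flags &= ~NS_FORWARD;	/* drop this packet */
		/* else leave NS_FORWARD set: it will cross to the other side */
		cur = nm_ring_next(ring, cur);
	}
	ring->head = ring->cur = cur;	/* release; the next sync forwards the rest */
}
#endif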
885 
886 /*
887  * pass a chain of buffers to the host stack as coming from 'dst'
888  * We do not need to lock because the queue is private.
889  */
890 static void
891 netmap_send_up(struct ifnet *dst, struct mbq *q)
892 {
893 	struct mbuf *m;
894 
895 	/* send packets up, outside the lock */
896 	while ((m = mbq_dequeue(q)) != NULL) {
897 		if (netmap_verbose & NM_VERB_HOST)
898 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
899 		NM_SEND_UP(dst, m);
900 	}
901 	mbq_destroy(q);
902 }
903 
904 
905 /*
906  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
907  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
908  * and pass them up. Drop remaining packets in the unlikely event
909  * of an mbuf shortage.
910  */
911 static void
912 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
913 {
914 	u_int const lim = kring->nkr_num_slots - 1;
915 	u_int const head = kring->ring->head;
916 	u_int n;
917 	struct netmap_adapter *na = kring->na;
918 
919 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
920 		struct mbuf *m;
921 		struct netmap_slot *slot = &kring->ring->slot[n];
922 
923 		if ((slot->flags & NS_FORWARD) == 0 && !force)
924 			continue;
925 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
926 			RD(5, "bad pkt at %d len %d", n, slot->len);
927 			continue;
928 		}
929 		slot->flags &= ~NS_FORWARD; // XXX needed ?
930 		/* XXX TODO: adapt to the case of a multisegment packet */
931 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
932 
933 		if (m == NULL)
934 			break;
935 		mbq_enqueue(q, m);
936 	}
937 }
938 
939 
940 /*
941  * Send to the NIC rings packets marked NS_FORWARD between
942  * kring->nr_hwcur and kring->rhead
943  * Called under kring->rx_queue.lock on the sw rx ring,
944  */
945 static u_int
946 netmap_sw_to_nic(struct netmap_adapter *na)
947 {
948 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
949 	struct netmap_slot *rxslot = kring->ring->slot;
950 	u_int i, rxcur = kring->nr_hwcur;
951 	u_int const head = kring->rhead;
952 	u_int const src_lim = kring->nkr_num_slots - 1;
953 	u_int sent = 0;
954 
955 	/* scan rings to find space, then fill as much as possible */
956 	for (i = 0; i < na->num_tx_rings; i++) {
957 		struct netmap_kring *kdst = &na->tx_rings[i];
958 		struct netmap_ring *rdst = kdst->ring;
959 		u_int const dst_lim = kdst->nkr_num_slots - 1;
960 
961 		/* XXX do we trust ring or kring->rcur,rtail ? */
962 		for (; rxcur != head && !nm_ring_empty(rdst);
963 		     rxcur = nm_next(rxcur, src_lim) ) {
964 			struct netmap_slot *src, *dst, tmp;
965 			u_int dst_cur = rdst->cur;
966 
967 			src = &rxslot[rxcur];
968 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
969 				continue;
970 
971 			sent++;
972 
973 			dst = &rdst->slot[dst_cur];
974 
975 			tmp = *src;
976 
977 			src->buf_idx = dst->buf_idx;
978 			src->flags = NS_BUF_CHANGED;
979 
980 			dst->buf_idx = tmp.buf_idx;
981 			dst->len = tmp.len;
982 			dst->flags = NS_BUF_CHANGED;
983 
984 			rdst->cur = nm_next(dst_cur, dst_lim);
985 		}
986 		/* if (sent) XXX txsync ? */
987 	}
988 	return sent;
989 }
990 
991 
992 /*
993  * netmap_txsync_to_host() passes packets up. We are called from a
994  * system call in user process context, and the only contention
995  * can be among multiple user threads erroneously calling
996  * this routine concurrently.
997  */
998 void
999 netmap_txsync_to_host(struct netmap_adapter *na)
1000 {
1001 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1002 	struct netmap_ring *ring = kring->ring;
1003 	u_int const lim = kring->nkr_num_slots - 1;
1004 	u_int const head = kring->rhead;
1005 	struct mbq q;
1006 
1007 	/* Take packets from hwcur to head and pass them up.
1008 	 * force ring->cur = head, since netmap_grab_packets() stops at head.
1009 	 * In case of no buffers we give up. At the end of the loop,
1010 	 * the queue is drained in all cases.
1011 	 */
1012 	mbq_init(&q);
1013 	ring->cur = head;
1014 	netmap_grab_packets(kring, &q, 1 /* force */);
1015 	ND("have %d pkts in queue", mbq_len(&q));
1016 	kring->nr_hwcur = head;
1017 	kring->nr_hwtail = head + lim;
1018 	if (kring->nr_hwtail > lim)
1019 		kring->nr_hwtail -= lim + 1;
1020 	nm_txsync_finalize(kring);
1021 
1022 	netmap_send_up(na->ifp, &q);
1023 }
1024 
1025 
1026 /*
1027  * rxsync backend for packets coming from the host stack.
1028  * They have been put in kring->rx_queue by netmap_transmit().
1029  * We protect access to the kring using kring->rx_queue.lock
1030  *
1031  * This routine also does the selrecord if called from the poll handler
1032  * (we know because td != NULL).
1033  *
1034  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1035  *     as an additional hidden argument.
1036  * returns the number of packets delivered to tx queues in
1037  * transparent mode, or a negative value if error
1038  */
1039 int
1040 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1041 {
1042 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1043 	struct netmap_ring *ring = kring->ring;
1044 	u_int nm_i, n;
1045 	u_int const lim = kring->nkr_num_slots - 1;
1046 	u_int const head = kring->rhead;
1047 	int ret = 0;
1048 	struct mbq *q = &kring->rx_queue;
1049 
1050 	(void)pwait;	/* disable unused warnings */
1051 	(void)td;
1052 
1053 	mbq_lock(q);
1054 
1055 	/* First part: import newly received packets */
1056 	n = mbq_len(q);
1057 	if (n) { /* grab packets from the queue */
1058 		struct mbuf *m;
1059 		uint32_t stop_i;
1060 
1061 		nm_i = kring->nr_hwtail;
1062 		stop_i = nm_prev(nm_i, lim);
1063 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1064 			int len = MBUF_LEN(m);
1065 			struct netmap_slot *slot = &ring->slot[nm_i];
1066 
1067 			m_copydata(m, 0, len, BDG_NMB(na, slot));
1068 			ND("nm %d len %d", nm_i, len);
1069 			if (netmap_verbose)
1070                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
1071 
1072 			slot->len = len;
1073 			slot->flags = kring->nkr_slot_flags;
1074 			nm_i = nm_next(nm_i, lim);
1075 		}
1076 		kring->nr_hwtail = nm_i;
1077 	}
1078 
1079 	/*
1080 	 * Second part: skip past packets that userspace has released.
1081 	 */
1082 	nm_i = kring->nr_hwcur;
1083 	if (nm_i != head) { /* something was released */
1084 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1085 			ret = netmap_sw_to_nic(na);
1086 		kring->nr_hwcur = head;
1087 	}
1088 
1089 	nm_rxsync_finalize(kring);
1090 
1091 	/* access copies of cur,tail in the kring */
1092 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1093 		selrecord(td, &kring->si);
1094 
1095 	mbq_unlock(q);
1096 	return ret;
1097 }
1098 
1099 
1100 /* Get a netmap adapter for the port.
1101  *
1102  * If it is possible to satisfy the request, return 0
1103  * with *na containing the netmap adapter found.
1104  * Otherwise return an error code, with *na containing NULL.
1105  *
1106  * When the port is attached to a bridge, we always return
1107  * EBUSY.
1108  * Otherwise, if the port is already bound to a file descriptor,
1109  * then we unconditionally return the existing adapter into *na.
1110  * In all the other cases, we return (into *na) either native,
1111  * generic or NULL, according to the following table:
1112  *
1113  *					native_support
1114  * active_fds   dev.netmap.admode         YES     NO
1115  * -------------------------------------------------------
1116  *    >0              *                 NA(ifp) NA(ifp)
1117  *
1118  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1119  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1120  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1121  *
1122  */
1123 
1124 int
1125 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1126 {
1127 	/* generic support */
1128 	int i = netmap_admode;	/* Take a snapshot. */
1129 	int error = 0;
1130 	struct netmap_adapter *prev_na;
1131 	struct netmap_generic_adapter *gna;
1132 
1133 	*na = NULL; /* default */
1134 
1135 	/* reset in case of invalid value */
1136 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1137 		i = netmap_admode = NETMAP_ADMODE_BEST;
1138 
1139 	if (NETMAP_CAPABLE(ifp)) {
1140 		/* If an adapter already exists, but is
1141 		 * attached to a vale port, we report that the
1142 		 * port is busy.
1143 		 */
1144 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1145 			return EBUSY;
1146 
1147 		/* If an adapter already exists, return it if
1148 		 * there are active file descriptors or if
1149 		 * netmap is not forced to use generic
1150 		 * adapters.
1151 		 */
1152 		if (NA(ifp)->active_fds > 0 ||
1153 				i != NETMAP_ADMODE_GENERIC) {
1154 			*na = NA(ifp);
1155 			return 0;
1156 		}
1157 	}
1158 
1159 	/* If there isn't native support and netmap is not allowed
1160 	 * to use generic adapters, we cannot satisfy the request.
1161 	 */
1162 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1163 		return EOPNOTSUPP;
1164 
1165 	/* Otherwise, create a generic adapter and return it,
1166 	 * saving the previously used netmap adapter, if any.
1167 	 *
1168 	 * Note that here 'prev_na', if not NULL, MUST be a
1169 	 * native adapter, and CANNOT be a generic one. This is
1170 	 * true because generic adapters are created on demand, and
1171 	 * destroyed when not used anymore. Therefore, if the adapter
1172 	 * currently attached to an interface 'ifp' is generic, it
1173 	 * must be that
1174 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1175 	 * Consequently, if NA(ifp) is generic, we will enter one of
1176 	 * the branches above. This ensures that we never override
1177 	 * a generic adapter with another generic adapter.
1178 	 */
1179 	prev_na = NA(ifp);
1180 	error = generic_netmap_attach(ifp);
1181 	if (error)
1182 		return error;
1183 
1184 	*na = NA(ifp);
1185 	gna = (struct netmap_generic_adapter*)NA(ifp);
1186 	gna->prev = prev_na; /* save old na */
1187 	if (prev_na != NULL) {
1188 		ifunit_ref(ifp->if_xname);
1189 		// XXX add a refcount ?
1190 		netmap_adapter_get(prev_na);
1191 	}
1192 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1193 
1194 	return 0;
1195 }
1196 
1197 
1198 /*
1199  * MUST BE CALLED UNDER NMG_LOCK()
1200  *
1201  * Get a refcounted reference to a netmap adapter attached
1202  * to the interface specified by nmr.
1203  * This is always called in the execution of an ioctl().
1204  *
1205  * Return ENXIO if the interface specified by the request does
1206  * not exist, ENOTSUP if netmap is not supported by the interface,
1207  * EBUSY if the interface is already attached to a bridge,
1208  * EINVAL if parameters are invalid, ENOMEM if needed resources
1209  * could not be allocated.
1210  * If successful, hold a reference to the netmap adapter.
1211  *
1212  * No reference is kept on the real interface, which may then
1213  * disappear at any time.
1214  */
1215 int
1216 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1217 {
1218 	struct ifnet *ifp = NULL;
1219 	int error = 0;
1220 	struct netmap_adapter *ret = NULL;
1221 
1222 	*na = NULL;     /* default return value */
1223 
1224 	/* first try to see if this is a bridge port. */
1225 	NMG_LOCK_ASSERT();
1226 
1227 	error = netmap_get_pipe_na(nmr, na, create);
1228 	if (error || *na != NULL)
1229 		return error;
1230 
1231 	error = netmap_get_bdg_na(nmr, na, create);
1232 	if (error)
1233 		return error;
1234 
1235 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1236 		goto pipes;
1237 
1238 	/*
1239 	 * This must be a hardware na, lookup the name in the system.
1240 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1241 	 * This may still be a tap, a veth/epair, or even a
1242 	 * persistent VALE port.
1243 	 */
1244 	ifp = ifunit_ref(nmr->nr_name);
1245 	if (ifp == NULL) {
1246 	        return ENXIO;
1247 	}
1248 
1249 	error = netmap_get_hw_na(ifp, &ret);
1250 	if (error)
1251 		goto out;
1252 
1253 	/* Users cannot use the NIC attached to a bridge directly */
1254 	if (NETMAP_OWNED_BY_KERN(ret)) {
1255 		error = EBUSY;
1256 		goto out;
1257 	}
1258 	*na = ret;
1259 	netmap_adapter_get(ret);
1260 
1261 pipes:
1262 	/*
1263 	 * If we are opening a pipe whose parent was not in netmap mode,
1264 	 * we have to allocate the pipe array now.
1265 	 * XXX get rid of this clumsiness (2014-03-15)
1266 	 */
1267 	error = netmap_pipe_alloc(*na, nmr);
1268 
1269 out:
1270 	if (error && ret != NULL)
1271 		netmap_adapter_put(ret);
1272 
1273 	if (ifp)
1274 		if_rele(ifp); /* allow live unloading of drivers modules */
1275 
1276 	return error;
1277 }
1278 
1279 
1280 /*
1281  * validate parameters on entry for *_txsync()
1282  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1283  * in case of error.
1284  *
1285  * rhead, rcur and rtail=hwtail are stored from previous round.
1286  * hwcur is the next packet to send to the ring.
1287  *
1288  * We want
1289  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1290  *
1291  * hwcur, rhead, rtail and hwtail are reliable
1292  */
1293 u_int
1294 nm_txsync_prologue(struct netmap_kring *kring)
1295 {
1296 	struct netmap_ring *ring = kring->ring;
1297 	u_int head = ring->head; /* read only once */
1298 	u_int cur = ring->cur; /* read only once */
1299 	u_int n = kring->nkr_num_slots;
1300 
1301 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1302 		kring->name,
1303 		kring->nr_hwcur, kring->nr_hwtail,
1304 		ring->head, ring->cur, ring->tail);
1305 #if 1 /* kernel sanity checks; but we can trust the kring. */
1306 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1307 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1308 		goto error;
1309 #endif /* kernel sanity checks */
1310 	/*
1311 	 * user sanity checks. We only use 'cur',
1312 	 * A, B, ... are possible positions for cur:
1313 	 *
1314 	 *  0    A  cur   B  tail  C  n-1
1315 	 *  0    D  tail  E  cur   F  n-1
1316 	 *
1317 	 * B, F, D are valid. A, C, E are wrong
1318 	 */
1319 	if (kring->rtail >= kring->rhead) {
1320 		/* want rhead <= head <= rtail */
1321 		if (head < kring->rhead || head > kring->rtail)
1322 			goto error;
1323 		/* and also head <= cur <= rtail */
1324 		if (cur < head || cur > kring->rtail)
1325 			goto error;
1326 	} else { /* here rtail < rhead */
1327 		/* we need head outside rtail .. rhead */
1328 		if (head > kring->rtail && head < kring->rhead)
1329 			goto error;
1330 
1331 		/* two cases now: head <= rtail or head >= rhead  */
1332 		if (head <= kring->rtail) {
1333 			/* want head <= cur <= rtail */
1334 			if (cur < head || cur > kring->rtail)
1335 				goto error;
1336 		} else { /* head >= rhead */
1337 			/* cur must be outside rtail..head */
1338 			if (cur > kring->rtail && cur < head)
1339 				goto error;
1340 		}
1341 	}
1342 	if (ring->tail != kring->rtail) {
1343 		RD(5, "tail overwritten was %d need %d",
1344 			ring->tail, kring->rtail);
1345 		ring->tail = kring->rtail;
1346 	}
1347 	kring->rhead = head;
1348 	kring->rcur = cur;
1349 	return head;
1350 
1351 error:
1352 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1353 		kring->name,
1354 		kring->nr_hwcur,
1355 		kring->rcur, kring->nr_hwtail,
1356 		cur, ring->tail);
1357 	return n;
1358 }
1359 
1360 
1361 /*
1362  * validate parameters on entry for *_rxsync()
1363  * Returns ring->head if ok, kring->nkr_num_slots on error.
1364  *
1365  * For a valid configuration,
1366  * hwcur <= head <= cur <= tail <= hwtail
1367  *
1368  * We only consider head and cur.
1369  * hwcur and hwtail are reliable.
1370  *
1371  */
1372 u_int
1373 nm_rxsync_prologue(struct netmap_kring *kring)
1374 {
1375 	struct netmap_ring *ring = kring->ring;
1376 	uint32_t const n = kring->nkr_num_slots;
1377 	uint32_t head, cur;
1378 
1379 	ND("%s kc %d kt %d h %d c %d t %d",
1380 		kring->name,
1381 		kring->nr_hwcur, kring->nr_hwtail,
1382 		ring->head, ring->cur, ring->tail);
1383 	/*
1384 	 * Before storing the new values, we should check they do not
1385 	 * move backwards. However:
1386 	 * - head is not an issue because the previous value is hwcur;
1387 	 * - cur could in principle go back, however it does not matter
1388 	 *   because we are processing a brand new rxsync()
1389 	 */
1390 	cur = kring->rcur = ring->cur;	/* read only once */
1391 	head = kring->rhead = ring->head;	/* read only once */
1392 #if 1 /* kernel sanity checks */
1393 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1394 		goto error;
1395 #endif /* kernel sanity checks */
1396 	/* user sanity checks */
1397 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1398 		/* want hwcur <= rhead <= hwtail */
1399 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1400 			goto error;
1401 		/* and also rhead <= rcur <= hwtail */
1402 		if (cur < head || cur > kring->nr_hwtail)
1403 			goto error;
1404 	} else {
1405 		/* we need rhead outside hwtail..hwcur */
1406 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1407 			goto error;
1408 		/* two cases now: head <= hwtail or head >= hwcur  */
1409 		if (head <= kring->nr_hwtail) {
1410 			/* want head <= cur <= hwtail */
1411 			if (cur < head || cur > kring->nr_hwtail)
1412 				goto error;
1413 		} else {
1414 			/* cur must be outside hwtail..head */
1415 			if (cur < head && cur > kring->nr_hwtail)
1416 				goto error;
1417 		}
1418 	}
1419 	if (ring->tail != kring->rtail) {
1420 		RD(5, "%s tail overwritten was %d need %d",
1421 			kring->name,
1422 			ring->tail, kring->rtail);
1423 		ring->tail = kring->rtail;
1424 	}
1425 	return head;
1426 
1427 error:
1428 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1429 		kring->nr_hwcur,
1430 		kring->rcur, kring->nr_hwtail,
1431 		kring->rhead, kring->rcur, ring->tail);
1432 	return n;
1433 }
1434 
1435 
1436 /*
1437  * Error routine called when txsync/rxsync detects an error.
1438  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1439  * Return 1 on reinit.
1440  *
1441  * This routine is only called by the upper half of the kernel.
1442  * It only reads hwcur (which is changed only by the upper half, too)
1443  * and hwtail (which may be changed by the lower half, but only on
1444  * a tx ring and only to increase it, so any error will be recovered
1445  * on the next call). For the above, we don't strictly need to call
1446  * it under lock.
1447  */
1448 int
1449 netmap_ring_reinit(struct netmap_kring *kring)
1450 {
1451 	struct netmap_ring *ring = kring->ring;
1452 	u_int i, lim = kring->nkr_num_slots - 1;
1453 	int errors = 0;
1454 
1455 	// XXX KASSERT nm_kr_tryget
1456 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1457 	// XXX probably wrong to trust userspace
1458 	kring->rhead = ring->head;
1459 	kring->rcur  = ring->cur;
1460 	kring->rtail = ring->tail;
1461 
1462 	if (ring->cur > lim)
1463 		errors++;
1464 	if (ring->head > lim)
1465 		errors++;
1466 	if (ring->tail > lim)
1467 		errors++;
1468 	for (i = 0; i <= lim; i++) {
1469 		u_int idx = ring->slot[i].buf_idx;
1470 		u_int len = ring->slot[i].len;
1471 		if (idx < 2 || idx >= netmap_total_buffers) {
1472 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1473 			ring->slot[i].buf_idx = 0;
1474 			ring->slot[i].len = 0;
1475 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1476 			ring->slot[i].len = 0;
1477 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1478 		}
1479 	}
1480 	if (errors) {
1481 		RD(10, "total %d errors", errors);
1482 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1483 			kring->name,
1484 			ring->cur, kring->nr_hwcur,
1485 			ring->tail, kring->nr_hwtail);
1486 		ring->head = kring->rhead = kring->nr_hwcur;
1487 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1488 		ring->tail = kring->rtail = kring->nr_hwtail;
1489 	}
1490 	return (errors ? 1 : 0);
1491 }
1492 
1493 
1494 /*
1495  * Set the ring ID. For devices with a single queue, a request
1496  * for all rings is the same as a single ring.
1497  */
1498 static int
1499 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1500 {
1501 	struct netmap_adapter *na = priv->np_na;
1502 	u_int j, i = ringid & NETMAP_RING_MASK;
1503 	u_int reg = flags & NR_REG_MASK;
1504 
1505 	if (reg == NR_REG_DEFAULT) {
1506 		/* convert from old ringid to flags */
1507 		if (ringid & NETMAP_SW_RING) {
1508 			reg = NR_REG_SW;
1509 		} else if (ringid & NETMAP_HW_RING) {
1510 			reg = NR_REG_ONE_NIC;
1511 		} else {
1512 			reg = NR_REG_ALL_NIC;
1513 		}
1514 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1515 	}
1516 	switch (reg) {
1517 	case NR_REG_ALL_NIC:
1518 	case NR_REG_PIPE_MASTER:
1519 	case NR_REG_PIPE_SLAVE:
1520 		priv->np_txqfirst = 0;
1521 		priv->np_txqlast = na->num_tx_rings;
1522 		priv->np_rxqfirst = 0;
1523 		priv->np_rxqlast = na->num_rx_rings;
1524 		ND("%s %d %d", "ALL/PIPE",
1525 			priv->np_rxqfirst, priv->np_rxqlast);
1526 		break;
1527 	case NR_REG_SW:
1528 	case NR_REG_NIC_SW:
1529 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1530 			D("host rings not supported");
1531 			return EINVAL;
1532 		}
1533 		priv->np_txqfirst = (reg == NR_REG_SW ?
1534 			na->num_tx_rings : 0);
1535 		priv->np_txqlast = na->num_tx_rings + 1;
1536 		priv->np_rxqfirst = (reg == NR_REG_SW ?
1537 			na->num_rx_rings : 0);
1538 		priv->np_rxqlast = na->num_rx_rings + 1;
1539 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1540 			priv->np_rxqfirst, priv->np_rxqlast);
1541 		break;
1542 	case NR_REG_ONE_NIC:
1543 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1544 			D("invalid ring id %d", i);
1545 			return EINVAL;
1546 		}
1547 		/* if not enough rings, use the first one */
1548 		j = i;
1549 		if (j >= na->num_tx_rings)
1550 			j = 0;
1551 		priv->np_txqfirst = j;
1552 		priv->np_txqlast = j + 1;
1553 		j = i;
1554 		if (j >= na->num_rx_rings)
1555 			j = 0;
1556 		priv->np_rxqfirst = j;
1557 		priv->np_rxqlast = j + 1;
1558 		break;
1559 	default:
1560 		D("invalid regif type %d", reg);
1561 		return EINVAL;
1562 	}
1563 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1564 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1565 	if (nm_tx_si_user(priv))
1566 		na->tx_si_users++;
1567 	if (nm_rx_si_user(priv))
1568 		na->rx_si_users++;
1569 	if (netmap_verbose) {
1570 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1571 			NM_IFPNAME(na->ifp),
1572 			priv->np_txqfirst,
1573 			priv->np_txqlast,
1574 			priv->np_rxqfirst,
1575 			priv->np_rxqlast,
1576 			i);
1577 	}
1578 	return 0;
1579 }
1580 
1581 /*
1582  * possibly move the interface to netmap mode.
1583  * On success it returns a pointer to netmap_if, otherwise NULL.
1584  * This must be called with NMG_LOCK held.
1585  */
1586 struct netmap_if *
1587 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1588 	uint16_t ringid, uint32_t flags, int *err)
1589 {
1590 	struct ifnet *ifp = na->ifp;
1591 	struct netmap_if *nifp = NULL;
1592 	int error, need_mem = 0;
1593 
1594 	NMG_LOCK_ASSERT();
1595 	/* ring configuration may have changed, fetch from the card */
1596 	netmap_update_config(na);
1597 	priv->np_na = na;     /* store the reference */
1598 	error = netmap_set_ringid(priv, ringid, flags);
1599 	if (error)
1600 		goto out;
1601 	/* ensure allocators are ready */
1602 	need_mem = !netmap_have_memory_locked(priv);
1603 	if (need_mem) {
1604 		error = netmap_get_memory_locked(priv);
1605 		ND("get_memory returned %d", error);
1606 		if (error)
1607 			goto out;
1608 	}
1609 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1610 
1611 	/* Allocate a netmap_if and, if necessary, all the netmap_ring's */
1612 	if (nifp == NULL) { /* allocation failed */
1613 		error = ENOMEM;
1614 		goto out;
1615 	}
1616 	na->active_fds++;
1617 	if (ifp->if_capenable & IFCAP_NETMAP) {
1618 		/* was already set */
1619 	} else {
1620 		/* Otherwise set the card in netmap mode
1621 		 * and make it use the shared buffers.
1622 		 */
1623 		/* cache the allocator info in the na */
1624 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1625 		ND("%p->na_lut == %p", na, na->na_lut);
1626 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1627 		error = na->nm_register(na, 1); /* mode on */
1628 		if (error) {
1629 			netmap_do_unregif(priv, nifp);
1630 			nifp = NULL;
1631 		}
1632 	}
1633 out:
1634 	*err = error;
1635 	if (error) {
1636 		priv->np_na = NULL;
1637 		/* we should drop the allocator, but only
1638 		 * if we were the ones who grabbed it
1639 		 */
1640 		if (need_mem)
1641 			netmap_drop_memory_locked(priv);
1642 	}
1643 	if (nifp != NULL) {
1644 		/*
1645 		 * advertise that the interface is ready by setting np_nifp.
1646 		 * The barrier is needed because readers (poll and *SYNC)
1647 		 * check for priv->np_nifp != NULL without locking
1648 		 */
1649 		wmb(); /* make sure previous writes are visible to all CPUs */
1650 		priv->np_nifp = nifp;
1651 	}
1652 	return nifp;
1653 }
1654 
1655 
1656 
1657 /*
1658  * ioctl(2) support for the "netmap" device.
1659  *
1660  * Below is a list of accepted commands:
1661  * - NIOCGINFO
1662  * - SIOCGIFADDR	just for convenience
1663  * - NIOCREGIF
1664  * - NIOCTXSYNC
1665  * - NIOCRXSYNC
1666  *
1667  * Return 0 on success, errno otherwise.
1668  */
1669 int
1670 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1671 	int fflag, struct thread *td)
1672 {
1673 	struct netmap_priv_d *priv = NULL;
1674 	struct ifnet *ifp = NULL;
1675 	struct nmreq *nmr = (struct nmreq *) data;
1676 	struct netmap_adapter *na = NULL;
1677 	int error;
1678 	u_int i, qfirst, qlast;
1679 	struct netmap_if *nifp;
1680 	struct netmap_kring *krings;
1681 
1682 	(void)dev;	/* UNUSED */
1683 	(void)fflag;	/* UNUSED */
1684 
1685 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1686 		/* truncate name */
1687 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1688 		if (nmr->nr_version != NETMAP_API) {
1689 			D("API mismatch for %s got %d need %d",
1690 				nmr->nr_name,
1691 				nmr->nr_version, NETMAP_API);
1692 			nmr->nr_version = NETMAP_API;
1693 		}
1694 		if (nmr->nr_version < NETMAP_MIN_API ||
1695 		    nmr->nr_version > NETMAP_MAX_API) {
1696 			return EINVAL;
1697 		}
1698 	}
1699 	CURVNET_SET(TD_TO_VNET(td));
1700 
1701 	error = devfs_get_cdevpriv((void **)&priv);
1702 	if (error) {
1703 		CURVNET_RESTORE();
1704 		/* XXX ENOENT should be impossible, since the priv
1705 		 * is now created in the open */
1706 		return (error == ENOENT ? ENXIO : error);
1707 	}
1708 
1709 	switch (cmd) {
1710 	case NIOCGINFO:		/* return capabilities etc */
1711 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1712 			error = netmap_bdg_ctl(nmr, NULL);
1713 			break;
1714 		}
1715 
1716 		NMG_LOCK();
1717 		do {
1718 			/* memsize is always valid */
1719 			struct netmap_mem_d *nmd = &nm_mem;
1720 			u_int memflags;
1721 
1722 			if (nmr->nr_name[0] != '\0') {
1723 				/* get a refcount */
1724 				error = netmap_get_na(nmr, &na, 1 /* create */);
1725 				if (error)
1726 					break;
1727 				nmd = na->nm_mem; /* get memory allocator */
1728 			}
1729 
1730 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
1731 				&nmr->nr_arg2);
1732 			if (error)
1733 				break;
1734 			if (na == NULL) /* only memory info */
1735 				break;
1736 			nmr->nr_offset = 0;
1737 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1738 			netmap_update_config(na);
1739 			nmr->nr_rx_rings = na->num_rx_rings;
1740 			nmr->nr_tx_rings = na->num_tx_rings;
1741 			nmr->nr_rx_slots = na->num_rx_desc;
1742 			nmr->nr_tx_slots = na->num_tx_desc;
1743 			netmap_adapter_put(na);
1744 		} while (0);
1745 		NMG_UNLOCK();
1746 		break;
1747 
1748 	case NIOCREGIF:
1749 		/* possibly attach/detach NIC and VALE switch */
1750 		i = nmr->nr_cmd;
1751 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1752 				|| i == NETMAP_BDG_VNET_HDR) {
1753 			error = netmap_bdg_ctl(nmr, NULL);
1754 			break;
1755 		} else if (i != 0) {
1756 			D("nr_cmd must be 0 not %d", i);
1757 			error = EINVAL;
1758 			break;
1759 		}
1760 
1761 		/* protect access to priv from concurrent NIOCREGIF */
1762 		NMG_LOCK();
1763 		do {
1764 			u_int memflags;
1765 
1766 			if (priv->np_na != NULL) {	/* thread already registered */
1767 				error = EBUSY;
1768 				break;
1769 			}
1770 			/* find the interface and a reference */
1771 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1772 			if (error)
1773 				break;
1774 			ifp = na->ifp;
1775 			if (NETMAP_OWNED_BY_KERN(na)) {
1776 				netmap_adapter_put(na);
1777 				error = EBUSY;
1778 				break;
1779 			}
1780 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error);
1781 			if (!nifp) {    /* reg. failed, release priv and ref */
1782 				netmap_adapter_put(na);
1783 				priv->np_nifp = NULL;
1784 				break;
1785 			}
1786 			priv->np_td = td; // XXX kqueue, debugging only
1787 
1788 			/* return the offset of the netmap_if object */
1789 			nmr->nr_rx_rings = na->num_rx_rings;
1790 			nmr->nr_tx_rings = na->num_tx_rings;
1791 			nmr->nr_rx_slots = na->num_rx_desc;
1792 			nmr->nr_tx_slots = na->num_tx_desc;
1793 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
1794 				&nmr->nr_arg2);
1795 			if (error) {
1796 				netmap_adapter_put(na);
1797 				break;
1798 			}
1799 			if (memflags & NETMAP_MEM_PRIVATE) {
1800 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1801 			}
1802 			priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ?
1803 				&na->tx_si : &na->tx_rings[priv->np_txqfirst].si;
1804 			priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ?
1805 				&na->rx_si : &na->rx_rings[priv->np_rxqfirst].si;
1806 
1807 			if (nmr->nr_arg3) {
1808 				D("requested %d extra buffers", nmr->nr_arg3);
1809 				nmr->nr_arg3 = netmap_extra_alloc(na,
1810 					&nifp->ni_bufs_head, nmr->nr_arg3);
1811 				D("got %d extra buffers", nmr->nr_arg3);
1812 			}
1813 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1814 		} while (0);
1815 		NMG_UNLOCK();
1816 		break;
1817 
1818 	case NIOCTXSYNC:
1819 	case NIOCRXSYNC:
1820 		nifp = priv->np_nifp;
1821 
1822 		if (nifp == NULL) {
1823 			error = ENXIO;
1824 			break;
1825 		}
1826 		rmb(); /* make sure following reads are not from cache */
1827 
1828 		na = priv->np_na;      /* we have a reference */
1829 
1830 		if (na == NULL) {
1831 			D("Internal error: nifp != NULL && na == NULL");
1832 			error = ENXIO;
1833 			break;
1834 		}
1835 
1836 		ifp = na->ifp;
1837 		if (ifp == NULL) {
1838 			RD(1, "the ifp is gone");
1839 			error = ENXIO;
1840 			break;
1841 		}
1842 
1843 		if (cmd == NIOCTXSYNC) {
1844 			krings = na->tx_rings;
1845 			qfirst = priv->np_txqfirst;
1846 			qlast = priv->np_txqlast;
1847 		} else {
1848 			krings = na->rx_rings;
1849 			qfirst = priv->np_rxqfirst;
1850 			qlast = priv->np_rxqlast;
1851 		}
1852 
1853 		for (i = qfirst; i < qlast; i++) {
1854 			struct netmap_kring *kring = krings + i;
1855 			if (nm_kr_tryget(kring)) {
1856 				error = EBUSY;
1857 				goto out;
1858 			}
1859 			if (cmd == NIOCTXSYNC) {
1860 				if (netmap_verbose & NM_VERB_TXSYNC)
1861 					D("pre txsync ring %d cur %d hwcur %d",
1862 					    i, kring->ring->cur,
1863 					    kring->nr_hwcur);
1864 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1865 					netmap_ring_reinit(kring);
1866 				} else {
1867 					kring->nm_sync(kring, NAF_FORCE_RECLAIM);
1868 				}
1869 				if (netmap_verbose & NM_VERB_TXSYNC)
1870 					D("post txsync ring %d cur %d hwcur %d",
1871 					    i, kring->ring->cur,
1872 					    kring->nr_hwcur);
1873 			} else {
1874 				kring->nm_sync(kring, NAF_FORCE_READ);
1875 				microtime(&na->rx_rings[i].ring->ts);
1876 			}
1877 			nm_kr_put(kring);
1878 		}
1879 
1880 		break;
1881 
1882 #ifdef __FreeBSD__
1883 	case FIONBIO:
1884 	case FIOASYNC:
1885 		ND("FIONBIO/FIOASYNC are no-ops");
1886 		break;
1887 
1888 	case BIOCIMMEDIATE:
1889 	case BIOCGHDRCMPLT:
1890 	case BIOCSHDRCMPLT:
1891 	case BIOCSSEESENT:
1892 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1893 		break;
1894 
1895 	default:	/* allow device-specific ioctls */
1896 	    {
1897 		struct socket so;
1898 
1899 		bzero(&so, sizeof(so));
1900 		NMG_LOCK();
1901 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1902 		if (error) {
1903 			netmap_adapter_put(na);
1904 			NMG_UNLOCK();
1905 			break;
1906 		}
1907 		ifp = na->ifp;
1908 		so.so_vnet = ifp->if_vnet;
1909 		// so->so_proto not null.
1910 		error = ifioctl(&so, cmd, data, td);
1911 		netmap_adapter_put(na);
1912 		NMG_UNLOCK();
1913 		break;
1914 	    }
1915 
1916 #else /* linux */
1917 	default:
1918 		error = EOPNOTSUPP;
1919 #endif /* linux */
1920 	}
1921 out:
1922 
1923 	CURVNET_RESTORE();
1924 	return (error);
1925 }
1926 
1927 
1928 /*
1929  * select(2) and poll(2) handlers for the "netmap" device.
1930  *
1931  * Can be called for one or more queues.
1932  * Return the event mask corresponding to ready events.
1933  * If there are no ready events, do a selrecord on either individual
1934  * selinfo or on the global one.
1935  * Device-dependent parts (locking and sync of tx/rx rings)
1936  * are done through callbacks.
1937  *
1938  * On linux, the arguments are really pwait, the poll table, and 'td' is a struct file *.
1939  * The 'dev' argument is remapped to pwait because selrecord() uses that name
1940  * as a hidden argument.
1941  */
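/*
 * Rough usage sketch (not part of the driver): a userspace receive
 * loop typically blocks in poll(2) and then drains the rings with the
 * netmap(4) helpers. 'fd' and 'ring' (a struct netmap_ring * obtained
 * via NETMAP_RXRING()) are only illustrative.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);	// the kernel rxsyncs before returning
 *		while (!nm_ring_empty(ring)) {
 *			struct netmap_slot *slot = &ring->slot[ring->cur];
 *			// ... process NETMAP_BUF(ring, slot->buf_idx) ...
 *			ring->cur = nm_ring_next(ring, ring->cur);
 *		}
 *		ring->head = ring->cur;	// release the consumed slots
 *	}
 */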
1942 int
1943 netmap_poll(struct cdev *dev, int events, struct thread *td)
1944 {
1945 	struct netmap_priv_d *priv = NULL;
1946 	struct netmap_adapter *na;
1947 	struct ifnet *ifp;
1948 	struct netmap_kring *kring;
1949 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1950 	struct mbq q;		/* packets from hw queues to host stack */
1951 	void *pwait = dev;	/* linux compatibility */
1952 	int is_kevent = 0;
1953 
1954 	/*
1955 	 * In order to avoid nested locks, we need to "double check"
1956 	 * txsync and rxsync if we decide to do a selrecord().
1957 	 * retry_tx (and retry_rx, later) prevent looping forever.
1958 	 */
1959 	int retry_tx = 1, retry_rx = 1;
1960 
1961 	(void)pwait;
1962 	mbq_init(&q);
1963 
1964 	/*
1965 	 * XXX kevent has curthread->td_fpop == NULL,
1966 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
1967 	 * priv as the first argument, which also lets us skip the
1968 	 * selrecord() calls, which are not necessary in that case.
1969 	 */
1970 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
1971 		is_kevent = 1;
1972 		if (netmap_verbose)
1973 			D("called from kevent");
1974 		priv = (struct netmap_priv_d *)dev;
1975 	}
1976 	if (priv == NULL)
1977 		return POLLERR;
1978 
1979 	if (priv->np_nifp == NULL) {
1980 		D("No if registered");
1981 		return POLLERR;
1982 	}
1983 	rmb(); /* make sure following reads are not from cache */
1984 
1985 	na = priv->np_na;
1986 	ifp = na->ifp;
1987 	// check for deleted
1988 	if (ifp == NULL) {
1989 		RD(1, "the ifp is gone");
1990 		return POLLERR;
1991 	}
1992 
1993 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1994 		return POLLERR;
1995 
1996 	if (netmap_verbose & 0x8000)
1997 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1998 	want_tx = events & (POLLOUT | POLLWRNORM);
1999 	want_rx = events & (POLLIN | POLLRDNORM);
2000 
2001 
2002 	/*
2003 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2004 	 * the file descriptor is bound to all of them. If so, we sleep on
2005 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2006 	 * (FreeBSD only allows two selinfo's per file descriptor).
2007 	 * The interrupt routine in the driver wakes one or the other
2008 	 * (or both) depending on which clients are active.
2009 	 *
2010 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2011 	 * txsync() is called if we run out of buffers on POLLOUT, or
2012 	 * there are pending packets to send. The latter can be disabled
2013 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2014 	 */
2015 	check_all_tx = nm_tx_si_user(priv);
2016 	check_all_rx = nm_rx_si_user(priv);
2017 
2018 	/*
2019 	 * We start with a lock free round which is cheap if we have
2020 	 * slots available. If this fails, then lock and call the sync
2021 	 * routines.
2022 	 */
2023 	for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) {
2024 		kring = &na->rx_rings[i];
2025 		/* XXX compare ring->cur and kring->tail */
2026 		if (!nm_ring_empty(kring->ring)) {
2027 			revents |= want_rx;
2028 			want_rx = 0;	/* also breaks the loop */
2029 		}
2030 	}
2031 	for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) {
2032 		kring = &na->tx_rings[i];
2033 		/* XXX compare ring->cur and kring->tail */
2034 		if (!nm_ring_empty(kring->ring)) {
2035 			revents |= want_tx;
2036 			want_tx = 0;	/* also breaks the loop */
2037 		}
2038 	}
2039 
2040 	/*
2041 	 * If we want to push packets out (priv->np_txpoll) or
2042 	 * want_tx is still set, we must issue txsync calls
2043 	 * (on all rings, to prevent the tx rings from stalling).
2044 	 * XXX should also check cur != hwcur on the tx rings.
2045 	 * Fortunately, normal tx mode has np_txpoll set.
2046 	 */
2047 	if (priv->np_txpoll || want_tx) {
2048 		/*
2049 		 * The first round checks if anyone is ready, if not
2050 		 * do a selrecord and another round to handle races.
2051 		 * want_tx goes to 0 if any space is found, and is
2052 		 * used to skip rings with no pending transmissions.
2053 		 */
2054 flush_tx:
2055 		for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) {
2056 			int found = 0;
2057 
2058 			kring = &na->tx_rings[i];
2059 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2060 				continue;
2061 			/* only one thread does txsync */
2062 			if (nm_kr_tryget(kring)) {
2063 				/* either busy or stopped
2064 				 * XXX if the ring is stopped, sleeping would
2065 				 * be better. In current code, however, we only
2066 				 * stop the rings for brief intervals (2014-03-14)
2067 				 */
2068 
2069 				if (netmap_verbose)
2070 					RD(2, "%p lost race on txring %d, ok",
2071 					    priv, i);
2072 				continue;
2073 			}
2074 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2075 				netmap_ring_reinit(kring);
2076 				revents |= POLLERR;
2077 			} else {
2078 				if (kring->nm_sync(kring, 0))
2079 					revents |= POLLERR;
2080 			}
2081 
2082 			/*
2083 			 * If we found new slots, notify potential
2084 			 * listeners on the same ring.
2085 			 * Since we just did a txsync, look at the copies
2086 			 * of cur,tail in the kring.
2087 			 */
2088 			found = kring->rcur != kring->rtail;
2089 			nm_kr_put(kring);
2090 			if (found) { /* notify other listeners */
2091 				revents |= want_tx;
2092 				want_tx = 0;
2093 				na->nm_notify(na, i, NR_TX, 0);
2094 			}
2095 		}
2096 		if (want_tx && retry_tx && !is_kevent) {
2097 			selrecord(td, check_all_tx ?
2098 			    &na->tx_si : &na->tx_rings[priv->np_txqfirst].si);
2099 			retry_tx = 0;
2100 			goto flush_tx;
2101 		}
2102 	}
2103 
2104 	/*
2105 	 * If want_rx is still set scan receive rings.
2106 	 * Do it on all rings because otherwise we starve.
2107 	 */
2108 	if (want_rx) {
2109 		int send_down = 0; /* transparent mode */
2110 		/* two rounds here for race avoidance */
2111 do_retry_rx:
2112 		for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) {
2113 			int found = 0;
2114 
2115 			kring = &na->rx_rings[i];
2116 
2117 			if (nm_kr_tryget(kring)) {
2118 				if (netmap_verbose)
2119 					RD(2, "%p lost race on rxring %d, ok",
2120 					    priv, i);
2121 				continue;
2122 			}
2123 
2124 			/*
2125 			 * transparent mode support: collect packets
2126 			 * from the rxring(s).
2127 			 * XXX NR_FORWARD should only be read on
2128 			 * physical or NIC ports
2129 			 */
2130 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
2131 				ND(10, "forwarding some buffers up %d to %d",
2132 				    kring->nr_hwcur, kring->ring->cur);
2133 				netmap_grab_packets(kring, &q, netmap_fwd);
2134 			}
2135 
2136 			if (kring->nm_sync(kring, 0))
2137 				revents |= POLLERR;
2138 			if (netmap_no_timestamp == 0 ||
2139 					kring->ring->flags & NR_TIMESTAMP) {
2140 				microtime(&kring->ring->ts);
2141 			}
2142 			/* after an rxsync we can use kring->rcur, rtail */
2143 			found = kring->rcur != kring->rtail;
2144 			nm_kr_put(kring);
2145 			if (found) {
2146 				revents |= want_rx;
2147 				retry_rx = 0;
2148 				na->nm_notify(na, i, NR_RX, 0);
2149 			}
2150 		}
2151 
2152 		/* transparent mode XXX only during first pass ? */
2153 		if (na->na_flags & NAF_HOST_RINGS) {
2154 			kring = &na->rx_rings[na->num_rx_rings];
2155 			if (check_all_rx
2156 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2157 				/* XXX fix to use kring fields */
2158 				if (nm_ring_empty(kring->ring))
2159 					send_down = netmap_rxsync_from_host(na, td, dev);
2160 				if (!nm_ring_empty(kring->ring))
2161 					revents |= want_rx;
2162 			}
2163 		}
2164 
2165 		if (retry_rx && !is_kevent)
2166 			selrecord(td, check_all_rx ?
2167 			    &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si);
2168 		if (send_down > 0 || retry_rx) {
2169 			retry_rx = 0;
2170 			if (send_down)
2171 				goto flush_tx; /* and retry_rx */
2172 			else
2173 				goto do_retry_rx;
2174 		}
2175 	}
2176 
2177 	/*
2178 	 * Transparent mode: marked bufs on rx rings between
2179 	 * kring->nr_hwcur and ring->head
2180 	 * are passed to the other endpoint.
2181 	 *
2182 	 * In this mode we also scan the sw rxring, which in
2183 	 * turn passes packets up.
2184 	 *
2185 	 * XXX Transparent mode at the moment requires binding all
2186 	 * rings to a single file descriptor.
2187 	 */
2188 
2189 	if (q.head)
2190 		netmap_send_up(na->ifp, &q);
2191 
2192 	return (revents);
2193 }
2194 
2195 
2196 /*-------------------- driver support routines -------------------*/
2197 
2198 static int netmap_hw_krings_create(struct netmap_adapter *);
2199 
2200 /* default notify callback */
2201 static int
2202 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2203 	enum txrx tx, int flags)
2204 {
2205 	struct netmap_kring *kring;
2206 
2207 	if (tx == NR_TX) {
2208 		kring = na->tx_rings + n_ring;
2209 		OS_selwakeup(&kring->si, PI_NET);
2210 		/* optimization: avoid a wake up on the global
2211 		 * queue if nobody has registered for more
2212 		 * than one ring
2213 		 */
2214 		if (na->tx_si_users > 0)
2215 			OS_selwakeup(&na->tx_si, PI_NET);
2216 	} else {
2217 		kring = na->rx_rings + n_ring;
2218 		OS_selwakeup(&kring->si, PI_NET);
2219 		/* optimization: same as above */
2220 		if (na->rx_si_users > 0)
2221 			OS_selwakeup(&na->rx_si, PI_NET);
2222 	}
2223 	return 0;
2224 }
2225 
2226 
2227 /* called by all routines that create netmap_adapters.
2228  * Attach na to the ifp (if any) and provide defaults
2229  * for optional callbacks. Defaults assume that we
2230  * are creating a hardware netmap_adapter.
2231  */
2232 int
2233 netmap_attach_common(struct netmap_adapter *na)
2234 {
2235 	struct ifnet *ifp = na->ifp;
2236 
2237 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2238 		D("%s: invalid rings tx %d rx %d",
2239 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2240 		return EINVAL;
2241 	}
2242 	WNA(ifp) = na;
2243 
2244 	/* the following is only needed for na that use the host port.
2245 	 * XXX do we have something similar for linux ?
2246 	 */
2247 #ifdef __FreeBSD__
2248 	na->if_input = ifp->if_input; /* for netmap_send_up */
2249 #endif /* __FreeBSD__ */
2250 
2251 	NETMAP_SET_CAPABLE(ifp);
2252 	if (na->nm_krings_create == NULL) {
2253 		/* we assume that we have been called by a driver,
2254 		 * since other port types all provide their own
2255 		 * nm_krings_create
2256 		 */
2257 		na->nm_krings_create = netmap_hw_krings_create;
2258 		na->nm_krings_delete = netmap_hw_krings_delete;
2259 	}
2260 	if (na->nm_notify == NULL)
2261 		na->nm_notify = netmap_notify;
2262 	na->active_fds = 0;
2263 
2264 	if (na->nm_mem == NULL)
2265 		na->nm_mem = &nm_mem;
2266 	return 0;
2267 }
2268 
2269 
2270 /* standard cleanup, called by all destructors */
2271 void
2272 netmap_detach_common(struct netmap_adapter *na)
2273 {
2274 	if (na->ifp != NULL)
2275 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2276 
2277 	if (na->tx_rings) { /* XXX should not happen */
2278 		D("freeing leftover tx_rings");
2279 		na->nm_krings_delete(na);
2280 	}
2281 	netmap_pipe_dealloc(na);
2282 	if (na->na_flags & NAF_MEM_OWNER)
2283 		netmap_mem_private_delete(na->nm_mem);
2284 	bzero(na, sizeof(*na));
2285 	free(na, M_DEVBUF);
2286 }
2287 
2288 
2289 /*
2290  * Initialize a ``netmap_adapter`` object created by driver on attach.
2291  * We allocate a block of memory with room for a struct netmap_adapter
2292  * plus two sets of N+2 struct netmap_kring (where N is the number
2293  * of hardware rings):
2294  * krings	0..N-1	are for the hardware queues.
2295  * kring	N	is for the host stack queue
2296  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2297  * Return 0 on success, ENOMEM otherwise.
2298  */
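/*
 * Illustrative driver-side sketch (the foo_* callbacks and the
 * 'adapter' fields are hypothetical): a NIC attach routine typically
 * fills a struct netmap_adapter on the stack and hands it to
 * netmap_attach(), which copies it into the netmap_hw_adapter.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = adapter->ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	netmap_attach(&na);
 */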
2299 int
2300 netmap_attach(struct netmap_adapter *arg)
2301 {
2302 	struct netmap_hw_adapter *hwna = NULL;
2303 	// XXX when is arg == NULL ?
2304 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2305 
2306 	if (arg == NULL || ifp == NULL)
2307 		goto fail;
2308 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2309 	if (hwna == NULL)
2310 		goto fail;
2311 	hwna->up = *arg;
2312 	hwna->up.na_flags |= NAF_HOST_RINGS;
2313 	if (netmap_attach_common(&hwna->up)) {
2314 		free(hwna, M_DEVBUF);
2315 		goto fail;
2316 	}
2317 	netmap_adapter_get(&hwna->up);
2318 
2319 #ifdef linux
2320 	if (ifp->netdev_ops) {
2321 		/* prepare a clone of the netdev ops */
2322 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2323 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2324 #else
2325 		hwna->nm_ndo = *ifp->netdev_ops;
2326 #endif
2327 	}
2328 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2329 #endif /* linux */
2330 
2331 	D("success for %s tx %d/%d rx %d/%d queues/slots",
2332 		NM_IFPNAME(ifp),
2333 		hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2334 		hwna->up.num_rx_rings, hwna->up.num_rx_desc
2335 		);
2336 	return 0;
2337 
2338 fail:
2339 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2340 	if (ifp)
2341 		netmap_detach(ifp);
2342 	return (hwna ? EINVAL : ENOMEM);
2343 }
2344 
2345 
2346 void
2347 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2348 {
2349 	if (!na) {
2350 		return;
2351 	}
2352 
2353 	refcount_acquire(&na->na_refcount);
2354 }
2355 
2356 
2357 /* returns 1 iff the netmap_adapter is destroyed */
2358 int
2359 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2360 {
2361 	if (!na)
2362 		return 1;
2363 
2364 	if (!refcount_release(&na->na_refcount))
2365 		return 0;
2366 
2367 	if (na->nm_dtor)
2368 		na->nm_dtor(na);
2369 
2370 	netmap_detach_common(na);
2371 
2372 	return 1;
2373 }
2374 
2375 /* nm_krings_create callback for all hardware native adapters */
2376 int
2377 netmap_hw_krings_create(struct netmap_adapter *na)
2378 {
2379 	int ret = netmap_krings_create(na, 0);
2380 	if (ret == 0) {
2381 		/* initialize the mbq for the sw rx ring */
2382 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2383 		ND("initialized sw rx queue %d", na->num_rx_rings);
2384 	}
2385 	return ret;
2386 }
2387 
2388 
2389 
2390 /*
2391  * Called on module unload by the netmap-enabled drivers
2392  */
2393 void
2394 netmap_detach(struct ifnet *ifp)
2395 {
2396 	struct netmap_adapter *na = NA(ifp);
2397 
2398 	if (!na)
2399 		return;
2400 
2401 	NMG_LOCK();
2402 	netmap_disable_all_rings(ifp);
2403 	if (!netmap_adapter_put(na)) {
2404 		/* someone is still using the adapter,
2405 		 * tell them that the interface is gone
2406 		 */
2407 		na->ifp = NULL;
2408 		/* give them a chance to notice */
2409 		netmap_enable_all_rings(ifp);
2410 	}
2411 	NMG_UNLOCK();
2412 }
2413 
2414 
2415 /*
2416  * Intercept packets from the network stack and pass them
2417  * to netmap as incoming packets on the 'software' ring.
2418  *
2419  * We only store packets in a bounded mbq and then copy them
2420  * in the relevant rxsync routine.
2421  *
2422  * We rely on the OS to make sure that the ifp and na do not go
2423  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2424  * In nm_register() or whenever there is a reinitialization,
2425  * we make sure to make the mode change visible here.
2426  */
2427 int
2428 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2429 {
2430 	struct netmap_adapter *na = NA(ifp);
2431 	struct netmap_kring *kring;
2432 	u_int len = MBUF_LEN(m);
2433 	u_int error = ENOBUFS;
2434 	struct mbq *q;
2435 	int space;
2436 
2437 	// XXX [Linux] we do not need this lock
2438 	// if we follow the down/configure/up protocol -gl
2439 	// mtx_lock(&na->core_lock);
2440 
2441 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2442 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2443 		error = ENXIO;
2444 		goto done;
2445 	}
2446 
2447 	kring = &na->rx_rings[na->num_rx_rings];
2448 	q = &kring->rx_queue;
2449 
2450 	// XXX reconsider long packets if we handle fragments
2451 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2452 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2453 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2454 		goto done;
2455 	}
2456 
2457 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2458 	 * and maybe other instances of netmap_transmit (the latter
2459 	 * not possible on Linux).
2460 	 * Also avoid overflowing the queue.
2461 	 */
2462 	mbq_lock(q);
2463 
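	/*
	 * The circular distance from nr_hwcur to nr_hwtail counts the rx
	 * slots that already hold packets waiting for the user; adding the
	 * mbufs already queued, the total must stay below the ring capacity
	 * (at most nkr_num_slots - 1 slots can be in use at any time),
	 * otherwise the packet is dropped below.
	 */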
2464 	space = kring->nr_hwtail - kring->nr_hwcur;
2465 	if (space < 0)
2466 		space += kring->nkr_num_slots;
2467 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2468 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2469 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2470 			len, m);
2471 	} else {
2472 		mbq_enqueue(q, m);
2473 		ND(10, "%s %d bufs in queue len %d m %p",
2474 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2475 		/* notify outside the lock */
2476 		m = NULL;
2477 		error = 0;
2478 	}
2479 	mbq_unlock(q);
2480 
2481 done:
2482 	if (m)
2483 		m_freem(m);
2484 	/* unconditionally wake up listeners */
2485 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2486 	/* this is normally netmap_notify(), but for NICs
2487 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2488 	 * which possibly forwards the frames through the switch
2489 	 */
2490 
2491 	return (error);
2492 }
2493 
2494 
2495 /*
2496  * netmap_reset() is called by the driver routines when reinitializing
2497  * a ring. The driver is in charge of locking to protect the kring.
2498  * If native netmap mode is not set just return NULL.
2499  */
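/*
 * Illustrative use from a driver ring-init routine (names are
 * driver-specific, this is only a sketch): the returned slot array,
 * adjusted by nkr_hwofs, tells the driver which netmap buffers to
 * program into the NIC descriptors.
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {	// interface is in native netmap mode
 *		// program each NIC rx descriptor j from the netmap buffer
 *		// of slot[netmap_idx_n2k(&na->rx_rings[ring_nr], j)]
 *	}
 */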
2500 struct netmap_slot *
2501 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2502 	u_int new_cur)
2503 {
2504 	struct netmap_kring *kring;
2505 	int new_hwofs, lim;
2506 
2507 	if (na == NULL) {
2508 		D("NULL na, should not happen");
2509 		return NULL;	/* no netmap support here */
2510 	}
2511 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2512 		ND("interface not in netmap mode");
2513 		return NULL;	/* nothing to reinitialize */
2514 	}
2515 
2516 	/* XXX note- in the new scheme, we are not guaranteed to be
2517 	 * under lock (e.g. when called on a device reset).
2518 	 * In this case, we should set a flag and not trust the
2519 	 * values too much. In practice: TODO
2520 	 * - set a RESET flag somewhere in the kring
2521 	 * - do the processing in a conservative way
2522 	 * - let the *sync() fixup at the end.
2523 	 */
2524 	if (tx == NR_TX) {
2525 		if (n >= na->num_tx_rings)
2526 			return NULL;
2527 		kring = na->tx_rings + n;
2528 		// XXX check whether we should use hwcur or rcur
2529 		new_hwofs = kring->nr_hwcur - new_cur;
2530 	} else {
2531 		if (n >= na->num_rx_rings)
2532 			return NULL;
2533 		kring = na->rx_rings + n;
2534 		new_hwofs = kring->nr_hwtail - new_cur;
2535 	}
2536 	lim = kring->nkr_num_slots - 1;
2537 	if (new_hwofs > lim)
2538 		new_hwofs -= lim + 1;
2539 
2540 	/* Always set the new offset value and realign the ring. */
2541 	if (netmap_verbose)
2542 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2543 		NM_IFPNAME(na->ifp),
2544 		tx == NR_TX ? "TX" : "RX", n,
2545 		kring->nkr_hwofs, new_hwofs,
2546 		kring->nr_hwtail,
2547 		tx == NR_TX ? lim : kring->nr_hwtail);
2548 	kring->nkr_hwofs = new_hwofs;
2549 	if (tx == NR_TX) {
2550 		kring->nr_hwtail = kring->nr_hwcur + lim;
2551 		if (kring->nr_hwtail > lim)
2552 			kring->nr_hwtail -= lim + 1;
2553 	}
2554 
2555 #if 0 // def linux
2556 	/* XXX check that the mappings are correct */
2557 	/* need ring_nr, adapter->pdev, direction */
2558 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2559 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2560 		D("error mapping rx netmap buffer %d", i);
2561 		// XXX fix error handling
2562 	}
2563 
2564 #endif /* linux */
2565 	/*
2566 	 * Wakeup on the individual and global selwait
2567 	 * We do the wakeup here, but the ring is not yet reconfigured.
2568 	 * However, we are under lock so there are no races.
2569 	 */
2570 	na->nm_notify(na, n, tx, 0);
2571 	return kring->ring->slot;
2572 }
2573 
2574 
2575 /*
2576  * Dispatch rx/tx interrupts to the netmap rings.
2577  *
2578  * "work_done" is non-null on the RX path, NULL for the TX path.
2579  * We rely on the OS to make sure that there is only one active
2580  * instance per queue, and that there is appropriate locking.
2581  *
2582  * The 'notify' routine depends on what the ring is attached to.
2583  * - for a netmap file descriptor, do a selwakeup on the individual
2584  *   waitqueue, plus one on the global one if needed
2585  * - for a switch, call the proper forwarding routine
2586  * - XXX more ?
2587  */
2588 void
2589 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2590 {
2591 	struct netmap_adapter *na = NA(ifp);
2592 	struct netmap_kring *kring;
2593 
2594 	q &= NETMAP_RING_MASK;
2595 
2596 	if (netmap_verbose) {
2597 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2598 	}
2599 
2600 	if (work_done) { /* RX path */
2601 		if (q >= na->num_rx_rings)
2602 			return;	// not a physical queue
2603 		kring = na->rx_rings + q;
2604 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2605 		na->nm_notify(na, q, NR_RX, 0);
2606 		*work_done = 1; /* do not fire napi again */
2607 	} else { /* TX path */
2608 		if (q >= na->num_tx_rings)
2609 			return;	// not a physical queue
2610 		kring = na->tx_rings + q;
2611 		na->nm_notify(na, q, NR_TX, 0);
2612 	}
2613 }
2614 
2615 
2616 /*
2617  * Default functions to handle rx/tx interrupts from a physical device.
2618  * "work_done" is non-null on the RX path, NULL for the TX path.
2619  *
2620  * If the card is not in netmap mode, simply return 0,
2621  * so that the caller proceeds with regular processing.
2622  * Otherwise call netmap_common_irq() and return 1.
2623  *
2624  * If the card is connected to a netmap file descriptor,
2625  * do a selwakeup on the individual queue, plus one on the global one
2626  * if needed (multiqueue card _and_ there are multiqueue listeners),
2627  * and return 1.
2628  *
2629  * Finally, if called on rx from an interface connected to a switch,
2630  * calls the proper forwarding routine, and return 1.
2631  */
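/*
 * Typical (illustrative) call site in a driver interrupt handler,
 * assuming 'que->me' is the hardware queue index:
 *
 *	if (netmap_rx_irq(ifp, que->me, &work_done))
 *		return;		// a netmap client consumed the event
 *
 * The transmit side normally goes through the netmap_tx_irq() wrapper,
 * which calls this function with a NULL work_done.
 */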
2632 int
2633 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2634 {
2635 	// XXX could we check NAF_NATIVE_ON ?
2636 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2637 		return 0;
2638 
2639 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2640 		ND("use regular interrupt");
2641 		return 0;
2642 	}
2643 
2644 	netmap_common_irq(ifp, q, work_done);
2645 	return 1;
2646 }
2647 
2648 
2649 /*
2650  * Module loader and unloader
2651  *
2652  * netmap_init() creates the /dev/netmap device and initializes
2653  * all global variables. Returns 0 on success, errno on failure
2654  * (in practice, this is not expected to fail)
2655  *
2656  * netmap_fini() destroys everything.
2657  */
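/*
 * Rough sketch of how these are normally driven on FreeBSD, through a
 * module event handler (handler name illustrative; the actual glue
 * lives in the FreeBSD-specific support code):
 *
 *	static int
 *	netmap_loader(struct module *mod, int event, void *arg)
 *	{
 *		int error = 0;
 *
 *		switch (event) {
 *		case MOD_LOAD:
 *			error = netmap_init();
 *			break;
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			break;
 *		default:
 *			error = EOPNOTSUPP;
 *			break;
 *		}
 *		return (error);
 *	}
 *
 *	DEV_MODULE(netmap, netmap_loader, NULL);
 */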
2658 
2659 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2660 extern struct cdevsw netmap_cdevsw;
2661 
2662 
2663 void
2664 netmap_fini(void)
2665 {
2666 	// XXX destroy_bridges() ?
2667 	if (netmap_dev)
2668 		destroy_dev(netmap_dev);
2669 	netmap_mem_fini();
2670 	NMG_LOCK_DESTROY();
2671 	printf("netmap: unloaded module.\n");
2672 }
2673 
2674 
2675 int
2676 netmap_init(void)
2677 {
2678 	int error;
2679 
2680 	NMG_LOCK_INIT();
2681 
2682 	error = netmap_mem_init();
2683 	if (error != 0)
2684 		goto fail;
2685 	/* XXX could use make_dev_credv() to get error number */
2686 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2687 			      "netmap");
2688 	if (!netmap_dev)
2689 		goto fail;
2690 
2691 	netmap_init_bridges();
2692 	printf("netmap: loaded module\n");
2693 	return (0);
2694 fail:
2695 	netmap_fini();
2696 	return (EINVAL); /* may be incorrect */
2697 }
2698