xref: /freebsd/sys/dev/netmap/netmap.c (revision 4ec234c813eed05c166859bba82c882e40826eb9)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() calls on /dev/netmap,
40  *    to create select()able file descriptors on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupancy state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
77   protecting against multiple active senders to the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When adding or deleting a port, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
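/*
 * The following is a minimal, illustrative userspace sketch of the
 * open()/NIOCREGIF/mmap()/poll() cycle described in steps 1-6 above.
 * It is not part of the kernel module (hence the #if 0 guard) and it
 * assumes the usual helpers from <net/netmap_user.h> (NETMAP_IF,
 * NETMAP_TXRING, NETMAP_BUF, nm_ring_next); error handling and
 * cleanup are omitted for brevity.
 */
#if 0	/* userspace example, never compiled with this module */
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdint.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <net/netmap_user.h>

static void
tx_one(const char *ifname, const void *pkt, size_t len)
{
	struct nmreq req;
	struct pollfd pfd;
	struct netmap_if *nifp;
	struct netmap_ring *ring;
	struct netmap_slot *slot;
	void *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);		/* step 1 */
	memset(&req, 0, sizeof(req));
	strlcpy(req.nr_name, ifname, sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	ioctl(fd, NIOCREGIF, &req);			/* step 2: bind to ifname */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);				/* step 3: map shared memory */
	nifp = NETMAP_IF(mem, req.nr_offset);
	ring = NETMAP_TXRING(nifp, 0);			/* step 4: first hw TX ring */

	pfd.fd = fd;
	pfd.events = POLLOUT;
	poll(&pfd, 1, -1);				/* step 6: wait for a free slot */

	slot = &ring->slot[ring->cur];			/* step 4: fill one buffer */
	memcpy(NETMAP_BUF(ring, slot->buf_idx), pkt, len);
	slot->len = (uint16_t)len;
	ring->head = ring->cur = nm_ring_next(ring, ring->cur);
	ioctl(fd, NIOCTXSYNC, NULL);			/* step 5: push it to the NIC */
}
#endif /* userspace example */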
126 
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/sockio.h>
141 #include <sys/socketvar.h>	/* struct socket */
142 #include <sys/malloc.h>
143 #include <sys/poll.h>
144 #include <sys/rwlock.h>
145 #include <sys/socket.h> /* sockaddrs */
146 #include <sys/selinfo.h>
147 #include <sys/sysctl.h>
148 #include <sys/jail.h>
149 #include <net/vnet.h>
150 #include <net/if.h>
151 #include <net/if_var.h>
152 #include <net/bpf.h>		/* BIOCIMMEDIATE */
153 #include <machine/bus.h>	/* bus_dmamap_* */
154 #include <sys/endian.h>
155 #include <sys/refcount.h>
156 
157 
158 /* reduce conditional code */
159 #define init_waitqueue_head(x)	// only needed in linux
160 
161 
162 
163 #elif defined(linux)
164 
165 #include "bsd_glue.h"
166 
167 
168 
169 #elif defined(__APPLE__)
170 
171 #warning OSX support is only partial
172 #include "osx_glue.h"
173 
174 #else
175 
176 #error	Unsupported platform
177 
178 #endif /* unsupported */
179 
180 /*
181  * common headers
182  */
183 #include <net/netmap.h>
184 #include <dev/netmap/netmap_kern.h>
185 #include <dev/netmap/netmap_mem2.h>
186 
187 
188 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
189 
190 /*
191  * The following variables are used by the drivers and replicate
192  * fields in the global memory pool. They only refer to buffers
193  * used by physical interfaces.
194  */
195 u_int netmap_total_buffers;
196 u_int netmap_buf_size;
197 char *netmap_buffer_base;	/* also address of an invalid buffer */
198 
199 /* user-controlled variables */
200 int netmap_verbose;
201 
202 static int netmap_no_timestamp; /* don't timestamp on rxsync */
203 
204 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
205 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
206     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
207 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
208     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
209 int netmap_mitigate = 1;
210 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
211 int netmap_no_pendintr = 1;
212 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
213     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
214 int netmap_txsync_retry = 2;
215 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
216     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
217 
218 int netmap_flags = 0;	/* debug flags */
219 int netmap_fwd = 0;	/* force transparent mode */
220 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
221 
222 /*
223  * netmap_admode selects the netmap mode to use.
224  * Invalid values are reset to NETMAP_ADMODE_BEST
225  */
226 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
227 	NETMAP_ADMODE_NATIVE,	/* either native or none */
228 	NETMAP_ADMODE_GENERIC,	/* force generic */
229 	NETMAP_ADMODE_LAST };
230 static int netmap_admode = NETMAP_ADMODE_BEST;
231 
232 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
233 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
234 
235 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
236 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
237 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
238 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
239 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
240 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
241 
242 NMG_LOCK_T	netmap_global_lock;
243 
244 
245 static void
246 nm_kr_get(struct netmap_kring *kr)
247 {
248 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
249 		tsleep(kr, 0, "NM_KR_GET", 4);
250 }
251 
252 
253 /*
254  * mark the ring as stopped, and run through the locks
255  * to make sure other users get to see it.
256  */
257 void
258 netmap_disable_ring(struct netmap_kring *kr)
259 {
260 	kr->nkr_stopped = 1;
261 	nm_kr_get(kr);
262 	mtx_lock(&kr->q_lock);
263 	mtx_unlock(&kr->q_lock);
264 	nm_kr_put(kr);
265 }
266 
267 
268 static void
269 netmap_set_all_rings(struct ifnet *ifp, int stopped)
270 {
271 	struct netmap_adapter *na;
272 	int i;
273 
274 	if (!(ifp->if_capenable & IFCAP_NETMAP))
275 		return;
276 
277 	na = NA(ifp);
278 
279 	for (i = 0; i <= na->num_tx_rings; i++) {
280 		if (stopped)
281 			netmap_disable_ring(na->tx_rings + i);
282 		else
283 			na->tx_rings[i].nkr_stopped = 0;
284 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
285 			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
286 	}
287 
288 	for (i = 0; i <= na->num_rx_rings; i++) {
289 		if (stopped)
290 			netmap_disable_ring(na->rx_rings + i);
291 		else
292 			na->rx_rings[i].nkr_stopped = 0;
293 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
294 			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
295 	}
296 }
297 
298 
299 void
300 netmap_disable_all_rings(struct ifnet *ifp)
301 {
302 	netmap_set_all_rings(ifp, 1 /* stopped */);
303 }
304 
305 
306 void
307 netmap_enable_all_rings(struct ifnet *ifp)
308 {
309 	netmap_set_all_rings(ifp, 0 /* enabled */);
310 }
311 
312 
313 /*
314  * generic bound_checking function
315  */
316 u_int
317 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
318 {
319 	u_int oldv = *v;
320 	const char *op = NULL;
321 
322 	if (dflt < lo)
323 		dflt = lo;
324 	if (dflt > hi)
325 		dflt = hi;
326 	if (oldv < lo) {
327 		*v = dflt;
328 		op = "Bump";
329 	} else if (oldv > hi) {
330 		*v = hi;
331 		op = "Clamp";
332 	}
333 	if (op && msg)
334 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
335 	return *v;
336 }
337 
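/*
 * Illustrative use of nm_bound_var(): clamp a user-controlled tunable
 * to a sane range before using it. The variable, bounds and message
 * below are made up for the example (guarded by #if 0, not compiled).
 */
#if 0	/* example only */
static u_int
example_ring_size(void)
{
	static u_int want_slots = 0;	/* e.g. set through a sysctl */

	/* values below 64 are bumped to the default 1024,
	 * values above 4096 are clamped to 4096
	 */
	return nm_bound_var(&want_slots, 1024, 64, 4096, "example ring size");
}
#endif /* example only */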
338 
339 /*
340  * packet-dump function, using either a user-supplied or a static buffer.
341  * The destination buffer must be at least 30+4*len
342  */
343 const char *
344 nm_dump_buf(char *p, int len, int lim, char *dst)
345 {
346 	static char _dst[8192];
347 	int i, j, i0;
348 	static char hex[] ="0123456789abcdef";
349 	char *o;	/* output position */
350 
351 #define P_HI(x)	hex[((x) & 0xf0)>>4]
352 #define P_LO(x)	hex[((x) & 0xf)]
353 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
354 	if (!dst)
355 		dst = _dst;
356 	if (lim <= 0 || lim > len)
357 		lim = len;
358 	o = dst;
359 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
360 	o += strlen(o);
361 	/* hexdump routine */
362 	for (i = 0; i < lim; ) {
363 		sprintf(o, "%5d: ", i);
364 		o += strlen(o);
365 		memset(o, ' ', 48);
366 		i0 = i;
367 		for (j=0; j < 16 && i < lim; i++, j++) {
368 			o[j*3] = P_HI(p[i]);
369 			o[j*3+1] = P_LO(p[i]);
370 		}
371 		i = i0;
372 		for (j=0; j < 16 && i < lim; i++, j++)
373 			o[j + 48] = P_C(p[i]);
374 		o[j+48] = '\n';
375 		o += j+49;
376 	}
377 	*o = '\0';
378 #undef P_HI
379 #undef P_LO
380 #undef P_C
381 	return dst;
382 }
383 
384 
385 /*
386  * Fetch configuration from the device, to cope with dynamic
387  * reconfigurations after loading the module.
388  */
389 int
390 netmap_update_config(struct netmap_adapter *na)
391 {
392 	struct ifnet *ifp = na->ifp;
393 	u_int txr, txd, rxr, rxd;
394 
395 	txr = txd = rxr = rxd = 0;
396 	if (na->nm_config) {
397 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
398 	} else {
399 		/* take whatever we had at init time */
400 		txr = na->num_tx_rings;
401 		txd = na->num_tx_desc;
402 		rxr = na->num_rx_rings;
403 		rxd = na->num_rx_desc;
404 	}
405 
406 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
407 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
408 		return 0; /* nothing changed */
409 	if (netmap_verbose || na->active_fds > 0) {
410 		D("stored config %s: txring %d x %d, rxring %d x %d",
411 			NM_IFPNAME(ifp),
412 			na->num_tx_rings, na->num_tx_desc,
413 			na->num_rx_rings, na->num_rx_desc);
414 		D("new config %s: txring %d x %d, rxring %d x %d",
415 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
416 	}
417 	if (na->active_fds == 0) {
418 		D("configuration changed (but fine)");
419 		na->num_tx_rings = txr;
420 		na->num_tx_desc = txd;
421 		na->num_rx_rings = rxr;
422 		na->num_rx_desc = rxd;
423 		return 0;
424 	}
425 	D("configuration changed while active, this is bad...");
426 	return 1;
427 }
428 
429 
430 int
431 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
432 {
433 	u_int i, len, ndesc;
434 	struct netmap_kring *kring;
435 
436 	// XXX additional space for extra rings ?
437 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
438 
439 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
440 	if (na->tx_rings == NULL) {
441 		D("Cannot allocate krings");
442 		return ENOMEM;
443 	}
444 	na->rx_rings = na->tx_rings + ntx;
445 
446 	/*
447 	 * All fields in krings are 0 except the ones initialized below,
448 	 * but it is better to be explicit about important kring fields.
449 	 */
450 	ndesc = na->num_tx_desc;
451 	for (i = 0; i < ntx; i++) { /* Transmit rings */
452 		kring = &na->tx_rings[i];
453 		bzero(kring, sizeof(*kring));
454 		kring->na = na;
455 		kring->ring_id = i;
456 		kring->nkr_num_slots = ndesc;
457 		/*
458 		 * IMPORTANT: Always keep one slot empty.
459 		 */
460 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
461 		kring->rtail = kring->nr_hwtail = ndesc - 1;
462 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
463 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
464 		init_waitqueue_head(&kring->si);
465 	}
466 
467 	ndesc = na->num_rx_desc;
468 	for (i = 0; i < nrx; i++) { /* Receive rings */
469 		kring = &na->rx_rings[i];
470 		bzero(kring, sizeof(*kring));
471 		kring->na = na;
472 		kring->ring_id = i;
473 		kring->nkr_num_slots = ndesc;
474 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
475 		kring->rtail = kring->nr_hwtail = 0;
476 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
477 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
478 		init_waitqueue_head(&kring->si);
479 	}
480 	init_waitqueue_head(&na->tx_si);
481 	init_waitqueue_head(&na->rx_si);
482 
483 	na->tailroom = na->rx_rings + nrx;
484 
485 	return 0;
486 }
487 
488 
489 /* XXX check boundaries */
490 void
491 netmap_krings_delete(struct netmap_adapter *na)
492 {
493 	int i;
494 
495 	for (i = 0; i < na->num_tx_rings + 1; i++) {
496 		mtx_destroy(&na->tx_rings[i].q_lock);
497 	}
498 	for (i = 0; i < na->num_rx_rings + 1; i++) {
499 		mtx_destroy(&na->rx_rings[i].q_lock);
500 	}
501 	free(na->tx_rings, M_DEVBUF);
502 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
503 }
504 
505 
506 /*
507  * Destructor for NIC ports. They also have an mbuf queue
508  * on the rings connected to the host so we need to purge
509  * them first.
510  */
511 static void
512 netmap_hw_krings_delete(struct netmap_adapter *na)
513 {
514 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
515 
516 	ND("destroy sw mbq with len %d", mbq_len(q));
517 	mbq_purge(q);
518 	mbq_safe_destroy(q);
519 	netmap_krings_delete(na);
520 }
521 
522 
523 static struct netmap_if*
524 netmap_if_new(const char *ifname, struct netmap_adapter *na)
525 {
526 	struct netmap_if *nifp;
527 
528 	if (netmap_update_config(na)) {
529 		/* configuration mismatch, report and fail */
530 		return NULL;
531 	}
532 
533 	if (na->active_fds)
534 		goto final;
535 
536 	if (na->nm_krings_create(na))
537 		goto cleanup;
538 
539 	if (netmap_mem_rings_create(na))
540 		goto cleanup;
541 
542 final:
543 
544 	nifp = netmap_mem_if_new(ifname, na);
545 	if (nifp == NULL)
546 		goto cleanup;
547 
548 	return (nifp);
549 
550 cleanup:
551 
552 	if (na->active_fds == 0) {
553 		netmap_mem_rings_delete(na);
554 		na->nm_krings_delete(na);
555 	}
556 
557 	return NULL;
558 }
559 
560 
561 /* grab a reference to the memory allocator, if we don't have one already.  The
562  * reference is taken from the netmap_adapter registered with the priv.
563  *
564  */
565 static int
566 netmap_get_memory_locked(struct netmap_priv_d* p)
567 {
568 	struct netmap_mem_d *nmd;
569 	int error = 0;
570 
571 	if (p->np_na == NULL) {
572 		if (!netmap_mmap_unreg)
573 			return ENODEV;
574 		/* for compatibility with older versions of the API
575  		 * we use the global allocator when no interface has been
576  		 * registered
577  		 */
578 		nmd = &nm_mem;
579 	} else {
580 		nmd = p->np_na->nm_mem;
581 	}
582 	if (p->np_mref == NULL) {
583 		error = netmap_mem_finalize(nmd);
584 		if (!error)
585 			p->np_mref = nmd;
586 	} else if (p->np_mref != nmd) {
587 		/* a virtual port has been registered, but previous
588  		 * syscalls already used the global allocator.
589  		 * We cannot continue
590  		 */
591 		error = ENODEV;
592 	}
593 	return error;
594 }
595 
596 
597 int
598 netmap_get_memory(struct netmap_priv_d* p)
599 {
600 	int error;
601 	NMG_LOCK();
602 	error = netmap_get_memory_locked(p);
603 	NMG_UNLOCK();
604 	return error;
605 }
606 
607 
608 static int
609 netmap_have_memory_locked(struct netmap_priv_d* p)
610 {
611 	return p->np_mref != NULL;
612 }
613 
614 
615 static void
616 netmap_drop_memory_locked(struct netmap_priv_d* p)
617 {
618 	if (p->np_mref) {
619 		netmap_mem_deref(p->np_mref);
620 		p->np_mref = NULL;
621 	}
622 }
623 
624 
625 /*
626  * File descriptor's private data destructor.
627  *
628  * Call nm_register(ifp,0) to stop netmap mode on the interface and
629  * revert to normal operation. We expect that np_na->ifp has not gone away.
630  * The second argument is the nifp to work on. In some cases it is
631  * not attached yet to the netmap_priv_d so we need to pass it as
632  * a separate argument.
633  */
634 /* call with NMG_LOCK held */
635 static void
636 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
637 {
638 	struct netmap_adapter *na = priv->np_na;
639 	struct ifnet *ifp = na->ifp;
640 
641 	NMG_LOCK_ASSERT();
642 	na->active_fds--;
643 	if (na->active_fds <= 0) {	/* last instance */
644 
645 		if (netmap_verbose)
646 			D("deleting last instance for %s", NM_IFPNAME(ifp));
647 		/*
648 		 * (TO CHECK) This function is only called
649 		 * when the last reference to this file descriptor goes
650 		 * away. This means we cannot have any pending poll()
651 		 * or interrupt routine operating on the structure.
652 		 * XXX The file may be closed in a thread while
653 		 * another thread is using it.
654 		 * Linux keeps the file opened until the last reference
655 		 * by any outstanding ioctl/poll or mmap is gone.
656 		 * FreeBSD does not track mmap()s (but we do) and
657 		 * wakes up any sleeping poll(). Need to check what
658 		 * happens if the close() occurs while a concurrent
659 		 * syscall is running.
660 		 */
661 		if (ifp)
662 			na->nm_register(na, 0); /* off, clear flags */
663 		/* Wake up any sleeping threads. netmap_poll will
664 		 * then return POLLERR
665 		 * XXX The wake up now must happen during *_down(), when
666 		 * we order all activities to stop. -gl
667 		 */
668 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
669 		/* knlist_destroy(&na->tx_si.si_note); */
670 		/* knlist_destroy(&na->rx_si.si_note); */
671 
672 		/* delete rings and buffers */
673 		netmap_mem_rings_delete(na);
674 		na->nm_krings_delete(na);
675 	}
676 	/* delete the nifp */
677 	netmap_mem_if_delete(na, nifp);
678 }
679 
680 
681 /*
682  * returns 1 if this is the last instance and we can free priv
683  */
684 int
685 netmap_dtor_locked(struct netmap_priv_d *priv)
686 {
687 	struct netmap_adapter *na = priv->np_na;
688 
689 #ifdef __FreeBSD__
690 	/*
691 	 * np_refcount is the number of active mmaps on
692 	 * this file descriptor
693 	 */
694 	if (--priv->np_refcount > 0) {
695 		return 0;
696 	}
697 #endif /* __FreeBSD__ */
698 	if (!na) {
699 	    return 1; //XXX is it correct?
700 	}
701 	netmap_do_unregif(priv, priv->np_nifp);
702 	priv->np_nifp = NULL;
703 	netmap_drop_memory_locked(priv);
704 	if (priv->np_na) {
705 		netmap_adapter_put(na);
706 		priv->np_na = NULL;
707 	}
708 	return 1;
709 }
710 
711 
712 void
713 netmap_dtor(void *data)
714 {
715 	struct netmap_priv_d *priv = data;
716 	int last_instance;
717 
718 	NMG_LOCK();
719 	last_instance = netmap_dtor_locked(priv);
720 	NMG_UNLOCK();
721 	if (last_instance) {
722 		bzero(priv, sizeof(*priv));	/* for safety */
723 		free(priv, M_DEVBUF);
724 	}
725 }
726 
727 
728 
729 
730 /*
731  * Handlers for synchronization of the queues from/to the host.
732  * Netmap has two operating modes:
733  * - in the default mode, the rings connected to the host stack are
734  *   just another ring pair managed by userspace;
735  * - in transparent mode (XXX to be defined) incoming packets
736  *   (from the host or the NIC) are marked as NS_FORWARD upon
737  *   arrival, and the user application has a chance to reset the
738  *   flag for packets that should be dropped.
739  *   On the RXSYNC or poll(), packets in RX rings between
740  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
741  *   to the other side (see the sketch after this comment block).
742  * The transfer NIC --> host is relatively easy, just encapsulate
743  * into mbufs and we are done. The host --> NIC side is slightly
744  * harder because there might not be room in the tx ring so it
745  * might take a while before releasing the buffer.
746  */
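/*
 * Illustrative userspace sketch of the transparent mode described
 * above: every received packet is forwarded to the other side unless
 * the application clears NS_FORWARD on its slot. The predicate
 * want_pkt() is a made-up application hook, the ring is assumed to
 * have forwarding enabled (NR_FORWARD on ring->flags or the
 * dev.netmap.fwd sysctl), and the helpers come from
 * <net/netmap_user.h> as in the example near the top of this file.
 * Not compiled with the module (#if 0).
 */
#if 0	/* userspace example */
/* application-supplied, hypothetical packet filter */
extern int want_pkt(const void *buf, unsigned int len);

static void
rx_filter_pass(struct netmap_ring *ring)
{
	while (ring->cur != ring->tail) {
		struct netmap_slot *slot = &ring->slot[ring->cur];

		if (want_pkt(NETMAP_BUF(ring, slot->buf_idx), slot->len))
			slot->flags |= NS_FORWARD;	/* forward this packet */
		else
			slot->flags &= ~NS_FORWARD;	/* drop this packet */
		ring->cur = nm_ring_next(ring, ring->cur);
	}
	/* release the slots: the next rxsync/poll moves the marked ones */
	ring->head = ring->cur;
}
#endif /* userspace example */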
747 
748 
749 /*
750  * pass a chain of buffers to the host stack as coming from 'dst'
751  * We do not need to lock because the queue is private.
752  */
753 static void
754 netmap_send_up(struct ifnet *dst, struct mbq *q)
755 {
756 	struct mbuf *m;
757 
758 	/* send packets up, outside the lock */
759 	while ((m = mbq_dequeue(q)) != NULL) {
760 		if (netmap_verbose & NM_VERB_HOST)
761 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
762 		NM_SEND_UP(dst, m);
763 	}
764 	mbq_destroy(q);
765 }
766 
767 
768 /*
769  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
770  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
771  * and pass them up. Drop remaining packets in the unlikely event
772  * of an mbuf shortage.
773  */
774 static void
775 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
776 {
777 	u_int const lim = kring->nkr_num_slots - 1;
778 	u_int const head = kring->ring->head;
779 	u_int n;
780 	struct netmap_adapter *na = kring->na;
781 
782 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
783 		struct mbuf *m;
784 		struct netmap_slot *slot = &kring->ring->slot[n];
785 
786 		if ((slot->flags & NS_FORWARD) == 0 && !force)
787 			continue;
788 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
789 			RD(5, "bad pkt at %d len %d", n, slot->len);
790 			continue;
791 		}
792 		slot->flags &= ~NS_FORWARD; // XXX needed ?
793 		/* XXX TODO: adapt to the case of a multisegment packet */
794 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
795 
796 		if (m == NULL)
797 			break;
798 		mbq_enqueue(q, m);
799 	}
800 }
801 
802 
803 /*
804  * Send to the NIC rings packets marked NS_FORWARD between
805  * kring->nr_hwcur and kring->rhead.
806  * Called under kring->rx_queue.lock on the sw rx ring.
807  */
808 static u_int
809 netmap_sw_to_nic(struct netmap_adapter *na)
810 {
811 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
812 	struct netmap_slot *rxslot = kring->ring->slot;
813 	u_int i, rxcur = kring->nr_hwcur;
814 	u_int const head = kring->rhead;
815 	u_int const src_lim = kring->nkr_num_slots - 1;
816 	u_int sent = 0;
817 
818 	/* scan rings to find space, then fill as much as possible */
819 	for (i = 0; i < na->num_tx_rings; i++) {
820 		struct netmap_kring *kdst = &na->tx_rings[i];
821 		struct netmap_ring *rdst = kdst->ring;
822 		u_int const dst_lim = kdst->nkr_num_slots - 1;
823 
824 		/* XXX do we trust ring or kring->rcur,rtail ? */
825 		for (; rxcur != head && !nm_ring_empty(rdst);
826 		     rxcur = nm_next(rxcur, src_lim) ) {
827 			struct netmap_slot *src, *dst, tmp;
828 			u_int dst_cur = rdst->cur;
829 
830 			src = &rxslot[rxcur];
831 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
832 				continue;
833 
834 			sent++;
835 
836 			dst = &rdst->slot[dst_cur];
837 
838 			tmp = *src;
839 
840 			src->buf_idx = dst->buf_idx;
841 			src->flags = NS_BUF_CHANGED;
842 
843 			dst->buf_idx = tmp.buf_idx;
844 			dst->len = tmp.len;
845 			dst->flags = NS_BUF_CHANGED;
846 
847 			rdst->cur = nm_next(dst_cur, dst_lim);
848 		}
849 		/* if (sent) XXX txsync ? */
850 	}
851 	return sent;
852 }
853 
854 
855 /*
856  * netmap_txsync_to_host() passes packets up. We are called from a
857  * system call in user process context, and the only contention
858  * can be among multiple user threads erroneously calling
859  * this routine concurrently.
860  */
861 void
862 netmap_txsync_to_host(struct netmap_adapter *na)
863 {
864 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
865 	struct netmap_ring *ring = kring->ring;
866 	u_int const lim = kring->nkr_num_slots - 1;
867 	u_int const head = nm_txsync_prologue(kring);
868 	struct mbq q;
869 	int error;
870 
871 	error = nm_kr_tryget(kring);
872 	if (error) {
873 		if (error == NM_KR_BUSY)
874 			D("ring %p busy (user error)", kring);
875 		return;
876 	}
877 	if (head > lim) {
878 		D("invalid ring index in stack TX kring %p", kring);
879 		netmap_ring_reinit(kring);
880 		nm_kr_put(kring);
881 		return;
882 	}
883 
884 	/* Take packets from hwcur to head and pass them up.
885 	 * Force ring->cur = head since netmap_grab_packets() stops at ring->head.
886 	 * In case of no buffers we give up. At the end of the loop,
887 	 * the queue is drained in all cases.
888 	 */
889 	mbq_init(&q);
890 	ring->cur = head;
891 	netmap_grab_packets(kring, &q, 1 /* force */);
892 	ND("have %d pkts in queue", mbq_len(&q));
893 	kring->nr_hwcur = head;
894 	kring->nr_hwtail = head + lim;
895 	if (kring->nr_hwtail > lim)
896 		kring->nr_hwtail -= lim + 1;
897 	nm_txsync_finalize(kring);
898 
899 	nm_kr_put(kring);
900 	netmap_send_up(na->ifp, &q);
901 }
902 
903 
904 /*
905  * rxsync backend for packets coming from the host stack.
906  * They have been put in kring->rx_queue by netmap_transmit().
907  * We protect access to the kring using kring->rx_queue.lock
908  *
909  * This routine also does the selrecord if called from the poll handler
910  * (we know because td != NULL).
911  *
912  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
913  *     as an additional hidden argument.
914  * Returns the number of packets delivered to tx queues in
915  * transparent mode, or a negative value in case of error
916  */
917 int
918 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
919 {
920 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
921 	struct netmap_ring *ring = kring->ring;
922 	u_int nm_i, n;
923 	u_int const lim = kring->nkr_num_slots - 1;
924 	u_int const head = nm_rxsync_prologue(kring);
925 	int ret = 0;
926 	struct mbq *q = &kring->rx_queue;
927 
928 	(void)pwait;	/* disable unused warnings */
929 
930 	if (head > lim) {
931 		netmap_ring_reinit(kring);
932 		return EINVAL;
933 	}
934 
935 	if (kring->nkr_stopped) /* check a first time without lock */
936 		return EBUSY;
937 
938 	mtx_lock(&q->lock);
939 
940 	if (kring->nkr_stopped) {  /* check again with lock held */
941 		ret = EBUSY;
942 		goto unlock_out;
943 	}
944 
945 	/* First part: import newly received packets */
946 	n = mbq_len(q);
947 	if (n) { /* grab packets from the queue */
948 		struct mbuf *m;
949 		uint32_t stop_i;
950 
951 		nm_i = kring->nr_hwtail;
952 		stop_i = nm_prev(nm_i, lim);
953 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
954 			int len = MBUF_LEN(m);
955 			struct netmap_slot *slot = &ring->slot[nm_i];
956 
957 			m_copydata(m, 0, len, BDG_NMB(na, slot));
958 			ND("nm %d len %d", nm_i, len);
959 			if (netmap_verbose)
960                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
961 
962 			slot->len = len;
963 			slot->flags = kring->nkr_slot_flags;
964 			nm_i = nm_next(nm_i, lim);
965 		}
966 		kring->nr_hwtail = nm_i;
967 	}
968 
969 	/*
970 	 * Second part: skip past packets that userspace has released.
971 	 */
972 	nm_i = kring->nr_hwcur;
973 	if (nm_i != head) { /* something was released */
974 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
975 			ret = netmap_sw_to_nic(na);
976 		kring->nr_hwcur = head;
977 	}
978 
979 	nm_rxsync_finalize(kring);
980 
981 	/* access copies of cur,tail in the kring */
982 	if (kring->rcur == kring->rtail && td) /* no bufs available */
983 		selrecord(td, &kring->si);
984 
985 unlock_out:
986 
987 	mtx_unlock(&q->lock);
988 	return ret;
989 }
990 
991 
992 /* Get a netmap adapter for the port.
993  *
994  * If it is possible to satisfy the request, return 0
995  * with *na containing the netmap adapter found.
996  * Otherwise return an error code, with *na containing NULL.
997  *
998  * When the port is attached to a bridge, we always return
999  * EBUSY.
1000  * Otherwise, if the port is already bound to a file descriptor,
1001  * then we unconditionally return the existing adapter into *na.
1002  * In all the other cases, we return (into *na) either native,
1003  * generic or NULL, according to the following table:
1004  *
1005  *					native_support
1006  * active_fds   dev.netmap.admode         YES     NO
1007  * -------------------------------------------------------
1008  *    >0              *                 NA(ifp) NA(ifp)
1009  *
1010  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1011  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1012  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1013  *
1014  */
1015 
1016 int
1017 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1018 {
1019 	/* generic support */
1020 	int i = netmap_admode;	/* Take a snapshot. */
1021 	int error = 0;
1022 	struct netmap_adapter *prev_na;
1023 	struct netmap_generic_adapter *gna;
1024 
1025 	*na = NULL; /* default */
1026 
1027 	/* reset in case of invalid value */
1028 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1029 		i = netmap_admode = NETMAP_ADMODE_BEST;
1030 
1031 	if (NETMAP_CAPABLE(ifp)) {
1032 		/* If an adapter already exists, but is
1033 		 * attached to a vale port, we report that the
1034 		 * port is busy.
1035 		 */
1036 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1037 			return EBUSY;
1038 
1039 		/* If an adapter already exists, return it if
1040 		 * there are active file descriptors or if
1041 		 * netmap is not forced to use generic
1042 		 * adapters.
1043 		 */
1044 		if (NA(ifp)->active_fds > 0 ||
1045 				i != NETMAP_ADMODE_GENERIC) {
1046 			*na = NA(ifp);
1047 			return 0;
1048 		}
1049 	}
1050 
1051 	/* If there isn't native support and netmap is not allowed
1052 	 * to use generic adapters, we cannot satisfy the request.
1053 	 */
1054 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1055 		return EINVAL;
1056 
1057 	/* Otherwise, create a generic adapter and return it,
1058 	 * saving the previously used netmap adapter, if any.
1059 	 *
1060 	 * Note that here 'prev_na', if not NULL, MUST be a
1061 	 * native adapter, and CANNOT be a generic one. This is
1062 	 * true because generic adapters are created on demand, and
1063 	 * destroyed when not used anymore. Therefore, if the adapter
1064 	 * currently attached to an interface 'ifp' is generic, it
1065 	 * must be that
1066 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1067 	 * Consequently, if NA(ifp) is generic, we will enter one of
1068 	 * the branches above. This ensures that we never override
1069 	 * a generic adapter with another generic adapter.
1070 	 */
1071 	prev_na = NA(ifp);
1072 	error = generic_netmap_attach(ifp);
1073 	if (error)
1074 		return error;
1075 
1076 	*na = NA(ifp);
1077 	gna = (struct netmap_generic_adapter*)NA(ifp);
1078 	gna->prev = prev_na; /* save old na */
1079 	if (prev_na != NULL) {
1080 		ifunit_ref(ifp->if_xname);
1081 		// XXX add a refcount ?
1082 		netmap_adapter_get(prev_na);
1083 	}
1084 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1085 
1086 	return 0;
1087 }
1088 
1089 
1090 /*
1091  * MUST BE CALLED UNDER NMG_LOCK()
1092  *
1093  * get a refcounted reference to an interface.
1094  * This is always called in the execution of an ioctl().
1095  *
1096  * Return ENXIO if the interface does not exist, EINVAL if netmap
1097  * is not supported by the interface.
1098  * If successful, hold a reference.
1099  *
1100  * When the NIC is attached to a bridge, reference is managed
1101  * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
1102  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1103  * is detached from the bridge, then ifp's refcount is dropped (this
1104  * is equivalent to destroying the ifp in the case of virtual ports).
1105  *
1106  * This function uses if_rele() when we want to prevent the NIC from
1107  * being detached from the bridge in error handling.  But once refcount
1108  * is acquired by this function, it must be released using nm_if_rele().
1109  */
1110 int
1111 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1112 {
1113 	struct ifnet *ifp;
1114 	int error = 0;
1115 	struct netmap_adapter *ret;
1116 
1117 	*na = NULL;     /* default return value */
1118 
1119 	/* first try to see if this is a bridge port. */
1120 	NMG_LOCK_ASSERT();
1121 
1122 	error = netmap_get_bdg_na(nmr, na, create);
1123 	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1124 		return error;
1125 
1126 	ifp = ifunit_ref(nmr->nr_name);
1127 	if (ifp == NULL) {
1128 	        return ENXIO;
1129 	}
1130 
1131 	error = netmap_get_hw_na(ifp, &ret);
1132 	if (error)
1133 		goto out;
1134 
1135 	if (ret != NULL) {
1136 		/* Users cannot use the NIC attached to a bridge directly */
1137 		if (NETMAP_OWNED_BY_KERN(ret)) {
1138 			error = EINVAL;
1139 			goto out;
1140 		}
1141 		error = 0;
1142 		*na = ret;
1143 		netmap_adapter_get(ret);
1144 	}
1145 out:
1146 	if_rele(ifp);
1147 
1148 	return error;
1149 }
1150 
1151 
1152 /*
1153  * validate parameters on entry for *_txsync()
1154  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1155  * in case of error.
1156  *
1157  * rhead, rcur and rtail=hwtail are stored from previous round.
1158  * hwcur is the next packet to send to the ring.
1159  *
1160  * We want
1161  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1162  *
1163  * hwcur, rhead, rtail and hwtail are reliable
1164  */
1165 u_int
1166 nm_txsync_prologue(struct netmap_kring *kring)
1167 {
1168 	struct netmap_ring *ring = kring->ring;
1169 	u_int head = ring->head; /* read only once */
1170 	u_int cur = ring->cur; /* read only once */
1171 	u_int n = kring->nkr_num_slots;
1172 
1173 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1174 		kring->name,
1175 		kring->nr_hwcur, kring->nr_hwtail,
1176 		ring->head, ring->cur, ring->tail);
1177 #if 1 /* kernel sanity checks; but we can trust the kring. */
1178 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1179 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1180 		goto error;
1181 #endif /* kernel sanity checks */
1182 	/*
1183 	 * user sanity checks. We only use 'cur',
1184 	 * A, B, ... are possible positions for cur:
1185 	 *
1186 	 *  0    A  cur   B  tail  C  n-1
1187 	 *  0    D  tail  E  cur   F  n-1
1188 	 *
1189 	 * B, F, D are valid. A, C, E are wrong
1190 	 */
1191 	if (kring->rtail >= kring->rhead) {
1192 		/* want rhead <= head <= rtail */
1193 		if (head < kring->rhead || head > kring->rtail)
1194 			goto error;
1195 		/* and also head <= cur <= rtail */
1196 		if (cur < head || cur > kring->rtail)
1197 			goto error;
1198 	} else { /* here rtail < rhead */
1199 		/* we need head outside rtail .. rhead */
1200 		if (head > kring->rtail && head < kring->rhead)
1201 			goto error;
1202 
1203 		/* two cases now: head <= rtail or head >= rhead  */
1204 		if (head <= kring->rtail) {
1205 			/* want head <= cur <= rtail */
1206 			if (cur < head || cur > kring->rtail)
1207 				goto error;
1208 		} else { /* head >= rhead */
1209 			/* cur must be outside rtail..head */
1210 			if (cur > kring->rtail && cur < head)
1211 				goto error;
1212 		}
1213 	}
1214 	if (ring->tail != kring->rtail) {
1215 		RD(5, "tail overwritten was %d need %d",
1216 			ring->tail, kring->rtail);
1217 		ring->tail = kring->rtail;
1218 	}
1219 	kring->rhead = head;
1220 	kring->rcur = cur;
1221 	return head;
1222 
1223 error:
1224 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1225 		kring->name,
1226 		kring->nr_hwcur,
1227 		kring->rcur, kring->nr_hwtail,
1228 		cur, ring->tail);
1229 	return n;
1230 }
1231 
1232 
1233 /*
1234  * validate parameters on entry for *_rxsync()
1235  * Returns ring->head if ok, kring->nkr_num_slots on error.
1236  *
1237  * For a valid configuration,
1238  * hwcur <= head <= cur <= tail <= hwtail
1239  *
1240  * We only consider head and cur.
1241  * hwcur and hwtail are reliable.
1242  *
1243  */
1244 u_int
1245 nm_rxsync_prologue(struct netmap_kring *kring)
1246 {
1247 	struct netmap_ring *ring = kring->ring;
1248 	uint32_t const n = kring->nkr_num_slots;
1249 	uint32_t head, cur;
1250 
1251 	ND("%s kc %d kt %d h %d c %d t %d",
1252 		kring->name,
1253 		kring->nr_hwcur, kring->nr_hwtail,
1254 		ring->head, ring->cur, ring->tail);
1255 	/*
1256 	 * Before storing the new values, we should check they do not
1257 	 * move backwards. However:
1258 	 * - head is not an issue because the previous value is hwcur;
1259 	 * - cur could in principle go back, however it does not matter
1260 	 *   because we are processing a brand new rxsync()
1261 	 */
1262 	cur = kring->rcur = ring->cur;	/* read only once */
1263 	head = kring->rhead = ring->head;	/* read only once */
1264 #if 1 /* kernel sanity checks */
1265 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1266 		goto error;
1267 #endif /* kernel sanity checks */
1268 	/* user sanity checks */
1269 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1270 		/* want hwcur <= rhead <= hwtail */
1271 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1272 			goto error;
1273 		/* and also rhead <= rcur <= hwtail */
1274 		if (cur < head || cur > kring->nr_hwtail)
1275 			goto error;
1276 	} else {
1277 		/* we need rhead outside hwtail..hwcur */
1278 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1279 			goto error;
1280 		/* two cases now: head <= hwtail or head >= hwcur  */
1281 		if (head <= kring->nr_hwtail) {
1282 			/* want head <= cur <= hwtail */
1283 			if (cur < head || cur > kring->nr_hwtail)
1284 				goto error;
1285 		} else {
1286 			/* cur must be outside hwtail..head */
1287 			if (cur < head && cur > kring->nr_hwtail)
1288 				goto error;
1289 		}
1290 	}
1291 	if (ring->tail != kring->rtail) {
1292 		RD(5, "%s tail overwritten was %d need %d",
1293 			kring->name,
1294 			ring->tail, kring->rtail);
1295 		ring->tail = kring->rtail;
1296 	}
1297 	return head;
1298 
1299 error:
1300 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1301 		kring->nr_hwcur,
1302 		kring->rcur, kring->nr_hwtail,
1303 		kring->rhead, kring->rcur, ring->tail);
1304 	return n;
1305 }
1306 
1307 
1308 /*
1309  * Error routine called when txsync/rxsync detects an error.
1310  * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
1311  * Return 1 on reinit.
1312  *
1313  * This routine is only called by the upper half of the kernel.
1314  * It only reads hwcur (which is changed only by the upper half, too)
1315  * and hwtail (which may be changed by the lower half, but only on
1316  * a tx ring and only to increase it, so any error will be recovered
1317  * on the next call). For the above, we don't strictly need to call
1318  * it under lock.
1319  */
1320 int
1321 netmap_ring_reinit(struct netmap_kring *kring)
1322 {
1323 	struct netmap_ring *ring = kring->ring;
1324 	u_int i, lim = kring->nkr_num_slots - 1;
1325 	int errors = 0;
1326 
1327 	// XXX KASSERT nm_kr_tryget
1328 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1329 	// XXX probably wrong to trust userspace
1330 	kring->rhead = ring->head;
1331 	kring->rcur  = ring->cur;
1332 	kring->rtail = ring->tail;
1333 
1334 	if (ring->cur > lim)
1335 		errors++;
1336 	if (ring->head > lim)
1337 		errors++;
1338 	if (ring->tail > lim)
1339 		errors++;
1340 	for (i = 0; i <= lim; i++) {
1341 		u_int idx = ring->slot[i].buf_idx;
1342 		u_int len = ring->slot[i].len;
1343 		if (idx < 2 || idx >= netmap_total_buffers) {
1344 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1345 			ring->slot[i].buf_idx = 0;
1346 			ring->slot[i].len = 0;
1347 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1348 			ring->slot[i].len = 0;
1349 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1350 		}
1351 	}
1352 	if (errors) {
1353 		RD(10, "total %d errors", errors);
1354 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1355 			kring->name,
1356 			ring->cur, kring->nr_hwcur,
1357 			ring->tail, kring->nr_hwtail);
1358 		ring->head = kring->rhead = kring->nr_hwcur;
1359 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1360 		ring->tail = kring->rtail = kring->nr_hwtail;
1361 	}
1362 	return (errors ? 1 : 0);
1363 }
1364 
1365 
1366 /*
1367  * Set the ring ID. For devices with a single queue, a request
1368  * for all rings is the same as a single ring.
1369  */
1370 static int
1371 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1372 {
1373 	struct netmap_adapter *na = priv->np_na;
1374 	struct ifnet *ifp = na->ifp;
1375 	u_int i = ringid & NETMAP_RING_MASK;
1376 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1377 	u_int lim = na->num_rx_rings;
1378 
1379 	if (na->num_tx_rings > lim)
1380 		lim = na->num_tx_rings;
1381 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1382 		D("invalid ring id %d", i);
1383 		return (EINVAL);
1384 	}
1385 	priv->np_ringid = ringid;
1386 	if (ringid & NETMAP_SW_RING) {
1387 		priv->np_qfirst = NETMAP_SW_RING;
1388 		priv->np_qlast = 0;
1389 	} else if (ringid & NETMAP_HW_RING) {
1390 		priv->np_qfirst = i;
1391 		priv->np_qlast = i + 1;
1392 	} else {
1393 		priv->np_qfirst = 0;
1394 		priv->np_qlast = NETMAP_HW_RING;
1395 	}
1396 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1397     if (netmap_verbose) {
1398 	if (ringid & NETMAP_SW_RING)
1399 		D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1400 	else if (ringid & NETMAP_HW_RING)
1401 		D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1402 			priv->np_qfirst);
1403 	else
1404 		D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1405     }
1406 	return 0;
1407 }
1408 
1409 
1410 /*
1411  * possibly move the interface to netmap-mode.
1412  * On success it returns a pointer to the netmap_if, otherwise NULL.
1413  * This must be called with NMG_LOCK held.
1414  */
1415 struct netmap_if *
1416 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1417 	uint16_t ringid, int *err)
1418 {
1419 	struct ifnet *ifp = na->ifp;
1420 	struct netmap_if *nifp = NULL;
1421 	int error, need_mem = 0;
1422 
1423 	NMG_LOCK_ASSERT();
1424 	/* ring configuration may have changed, fetch from the card */
1425 	netmap_update_config(na);
1426 	priv->np_na = na;     /* store the reference */
1427 	error = netmap_set_ringid(priv, ringid);
1428 	if (error)
1429 		goto out;
1430 	/* ensure allocators are ready */
1431 	need_mem = !netmap_have_memory_locked(priv);
1432 	if (need_mem) {
1433 		error = netmap_get_memory_locked(priv);
1434 		ND("get_memory returned %d", error);
1435 		if (error)
1436 			goto out;
1437 	}
1438 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1439 	if (nifp == NULL) { /* allocation failed */
1440 		/* we should drop the allocator, but only
1441 		 * if we were the ones who grabbed it
1442 		 */
1443 		error = ENOMEM;
1444 		goto out;
1445 	}
1446 	na->active_fds++;
1447 	if (ifp->if_capenable & IFCAP_NETMAP) {
1448 		/* was already set */
1449 	} else {
1450 		/* Otherwise set the card in netmap mode
1451 		 * and make it use the shared buffers.
1452 		 *
1453 		 * do not core lock because the race is harmless here,
1454 		 * there cannot be any traffic to netmap_transmit()
1455 		 */
1456 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1457 		ND("%p->na_lut == %p", na, na->na_lut);
1458 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1459 		error = na->nm_register(na, 1); /* mode on */
1460 		if (error) {
1461 			netmap_do_unregif(priv, nifp);
1462 			nifp = NULL;
1463 		}
1464 	}
1465 out:
1466 	*err = error;
1467 	if (error) {
1468 		priv->np_na = NULL;
1469 		if (need_mem)
1470 			netmap_drop_memory_locked(priv);
1471 	}
1472 	if (nifp != NULL) {
1473 		/*
1474 		 * advertise that the interface is ready by setting np_nifp.
1475 		 * The barrier is needed because readers (poll and *SYNC)
1476 		 * check for priv->np_nifp != NULL without locking
1477 		 */
1478 		wmb(); /* make sure previous writes are visible to all CPUs */
1479 		priv->np_nifp = nifp;
1480 	}
1481 	return nifp;
1482 }
1483 
1484 
1485 
1486 /*
1487  * ioctl(2) support for the "netmap" device.
1488  *
1489  * Following a list of accepted commands:
1490  * - NIOCGINFO
1491  * - SIOCGIFADDR	just for convenience
1492  * - NIOCREGIF
1493  * - NIOCTXSYNC
1494  * - NIOCRXSYNC
1495  *
1496  * Return 0 on success, errno otherwise.
1497  */
1498 int
1499 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1500 	int fflag, struct thread *td)
1501 {
1502 	struct netmap_priv_d *priv = NULL;
1503 	struct ifnet *ifp = NULL;
1504 	struct nmreq *nmr = (struct nmreq *) data;
1505 	struct netmap_adapter *na = NULL;
1506 	int error;
1507 	u_int i, lim;
1508 	struct netmap_if *nifp;
1509 	struct netmap_kring *krings;
1510 
1511 	(void)dev;	/* UNUSED */
1512 	(void)fflag;	/* UNUSED */
1513 #ifdef linux
1514 #define devfs_get_cdevpriv(pp)				\
1515 	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
1516 		(*pp ? 0 : ENOENT); })
1517 
1518 /* devfs_set_cdevpriv cannot fail on linux */
1519 #define devfs_set_cdevpriv(p, fn)				\
1520 	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
1521 
1522 
1523 #define devfs_clear_cdevpriv()	do {				\
1524 		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
1525 	} while (0)
1526 #endif /* linux */
1527 
1528 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1529 		/* truncate name */
1530 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1531 		if (nmr->nr_version != NETMAP_API) {
1532 			D("API mismatch for %s got %d need %d",
1533 				nmr->nr_name,
1534 				nmr->nr_version, NETMAP_API);
1535 			nmr->nr_version = NETMAP_API;
1536 			return EINVAL;
1537 		}
1538 	}
1539 	CURVNET_SET(TD_TO_VNET(td));
1540 
1541 	error = devfs_get_cdevpriv((void **)&priv);
1542 	if (error) {
1543 		CURVNET_RESTORE();
1544 		/* XXX ENOENT should be impossible, since the priv
1545 		 * is now created in the open */
1546 		return (error == ENOENT ? ENXIO : error);
1547 	}
1548 
1549 	switch (cmd) {
1550 	case NIOCGINFO:		/* return capabilities etc */
1551 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1552 			error = netmap_bdg_ctl(nmr, NULL);
1553 			break;
1554 		}
1555 
1556 		NMG_LOCK();
1557 		do {
1558 			/* memsize is always valid */
1559 			struct netmap_mem_d *nmd = &nm_mem;
1560 			u_int memflags;
1561 
1562 			if (nmr->nr_name[0] != '\0') {
1563 				/* get a refcount */
1564 				error = netmap_get_na(nmr, &na, 1 /* create */);
1565 				if (error)
1566 					break;
1567 				nmd = na->nm_mem; /* get memory allocator */
1568 			}
1569 
1570 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1571 			if (error)
1572 				break;
1573 			if (na == NULL) /* only memory info */
1574 				break;
1575 			nmr->nr_offset = 0;
1576 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1577 			netmap_update_config(na);
1578 			nmr->nr_rx_rings = na->num_rx_rings;
1579 			nmr->nr_tx_rings = na->num_tx_rings;
1580 			nmr->nr_rx_slots = na->num_rx_desc;
1581 			nmr->nr_tx_slots = na->num_tx_desc;
1582 			if (memflags & NETMAP_MEM_PRIVATE)
1583 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1584 			netmap_adapter_put(na);
1585 		} while (0);
1586 		NMG_UNLOCK();
1587 		break;
1588 
1589 	case NIOCREGIF:
1590 		/* possibly attach/detach NIC and VALE switch */
1591 		i = nmr->nr_cmd;
1592 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1593 				|| i == NETMAP_BDG_OFFSET) {
1594 			error = netmap_bdg_ctl(nmr, NULL);
1595 			break;
1596 		} else if (i != 0) {
1597 			D("nr_cmd must be 0 not %d", i);
1598 			error = EINVAL;
1599 			break;
1600 		}
1601 
1602 		/* protect access to priv from concurrent NIOCREGIF */
1603 		NMG_LOCK();
1604 		do {
1605 			u_int memflags;
1606 
1607 			if (priv->np_na != NULL) {	/* thread already registered */
1608 				error = netmap_set_ringid(priv, nmr->nr_ringid);
1609 				break;
1610 			}
1611 			/* find the interface and a reference */
1612 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1613 			if (error)
1614 				break;
1615 			ifp = na->ifp;
1616 			if (NETMAP_OWNED_BY_KERN(na)) {
1617 				netmap_adapter_put(na);
1618 				error = EBUSY;
1619 				break;
1620 			}
1621 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1622 			if (!nifp) {    /* reg. failed, release priv and ref */
1623 				netmap_adapter_put(na);
1624 				priv->np_nifp = NULL;
1625 				break;
1626 			}
1627 
1628 			/* return the offset of the netmap_if object */
1629 			nmr->nr_rx_rings = na->num_rx_rings;
1630 			nmr->nr_tx_rings = na->num_tx_rings;
1631 			nmr->nr_rx_slots = na->num_rx_desc;
1632 			nmr->nr_tx_slots = na->num_tx_desc;
1633 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1634 			if (error) {
1635 				netmap_adapter_put(na);
1636 				break;
1637 			}
1638 			if (memflags & NETMAP_MEM_PRIVATE) {
1639 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1640 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1641 			}
1642 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1643 		} while (0);
1644 		NMG_UNLOCK();
1645 		break;
1646 
1647 	case NIOCTXSYNC:
1648 	case NIOCRXSYNC:
1649 		nifp = priv->np_nifp;
1650 
1651 		if (nifp == NULL) {
1652 			error = ENXIO;
1653 			break;
1654 		}
1655 		rmb(); /* make sure following reads are not from cache */
1656 
1657 		na = priv->np_na;      /* we have a reference */
1658 
1659 		if (na == NULL) {
1660 			D("Internal error: nifp != NULL && na == NULL");
1661 			error = ENXIO;
1662 			break;
1663 		}
1664 
1665 		ifp = na->ifp;
1666 		if (ifp == NULL) {
1667 			RD(1, "the ifp is gone");
1668 			error = ENXIO;
1669 			break;
1670 		}
1671 
1672 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1673 			if (cmd == NIOCTXSYNC)
1674 				netmap_txsync_to_host(na);
1675 			else
1676 				netmap_rxsync_from_host(na, NULL, NULL);
1677 			break;
1678 		}
1679 		/* find the last ring to scan */
1680 		lim = priv->np_qlast;
1681 		if (lim == NETMAP_HW_RING)
1682 			lim = (cmd == NIOCTXSYNC) ?
1683 			    na->num_tx_rings : na->num_rx_rings;
1684 
1685 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1686 		for (i = priv->np_qfirst; i < lim; i++) {
1687 			struct netmap_kring *kring = krings + i;
1688 			if (nm_kr_tryget(kring)) {
1689 				error = EBUSY;
1690 				goto out;
1691 			}
1692 			if (cmd == NIOCTXSYNC) {
1693 				if (netmap_verbose & NM_VERB_TXSYNC)
1694 					D("pre txsync ring %d cur %d hwcur %d",
1695 					    i, kring->ring->cur,
1696 					    kring->nr_hwcur);
1697 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1698 					netmap_ring_reinit(kring);
1699 				} else {
1700 					na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1701 				}
1702 				if (netmap_verbose & NM_VERB_TXSYNC)
1703 					D("post txsync ring %d cur %d hwcur %d",
1704 					    i, kring->ring->cur,
1705 					    kring->nr_hwcur);
1706 			} else {
1707 				na->nm_rxsync(na, i, NAF_FORCE_READ);
1708 				microtime(&na->rx_rings[i].ring->ts);
1709 			}
1710 			nm_kr_put(kring);
1711 		}
1712 
1713 		break;
1714 
1715 #ifdef __FreeBSD__
1716 	case BIOCIMMEDIATE:
1717 	case BIOCGHDRCMPLT:
1718 	case BIOCSHDRCMPLT:
1719 	case BIOCSSEESENT:
1720 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1721 		break;
1722 
1723 	default:	/* allow device-specific ioctls */
1724 	    {
1725 		struct socket so;
1726 
1727 		bzero(&so, sizeof(so));
1728 		NMG_LOCK();
1729 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1730 		if (error) {
1731 			netmap_adapter_put(na);
1732 			NMG_UNLOCK();
1733 			break;
1734 		}
1735 		ifp = na->ifp;
1736 		so.so_vnet = ifp->if_vnet;
1737 		// so->so_proto not null.
1738 		error = ifioctl(&so, cmd, data, td);
1739 		netmap_adapter_put(na);
1740 		NMG_UNLOCK();
1741 		break;
1742 	    }
1743 
1744 #else /* linux */
1745 	default:
1746 		error = EOPNOTSUPP;
1747 #endif /* linux */
1748 	}
1749 out:
1750 
1751 	CURVNET_RESTORE();
1752 	return (error);
1753 }
1754 
1755 
1756 /*
1757  * select(2) and poll(2) handlers for the "netmap" device.
1758  *
1759  * Can be called for one or more queues.
1760  * Return the event mask corresponding to ready events.
1761  * If there are no ready events, do a selrecord on either individual
1762  * selinfo or on the global one.
1763  * Device-dependent parts (locking and sync of tx/rx rings)
1764  * are done through callbacks.
1765  *
1766  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
1767  * The first one is remapped to pwait as selrecord() uses the name as a
1768  * hidden argument.
1769  */
1770 int
1771 netmap_poll(struct cdev *dev, int events, struct thread *td)
1772 {
1773 	struct netmap_priv_d *priv = NULL;
1774 	struct netmap_adapter *na;
1775 	struct ifnet *ifp;
1776 	struct netmap_kring *kring;
1777 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1778 	u_int lim_tx, lim_rx;
1779 	struct mbq q;		/* packets from hw queues to host stack */
1780 	void *pwait = dev;	/* linux compatibility */
1781 
1782 	/*
1783 	 * In order to avoid nested locks, we need to "double check"
1784 	 * txsync and rxsync if we decide to do a selrecord().
1785 	 * retry_tx (and retry_rx, later) prevent looping forever.
1786 	 */
1787 	int retry_tx = 1, retry_rx = 1;
1788 
1789 	(void)pwait;
1790 	mbq_init(&q);
1791 
1792 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1793 		return POLLERR;
1794 
1795 	if (priv->np_nifp == NULL) {
1796 		D("No if registered");
1797 		return POLLERR;
1798 	}
1799 	rmb(); /* make sure following reads are not from cache */
1800 
1801 	na = priv->np_na;
1802 	ifp = na->ifp;
1803 	// check for deleted
1804 	if (ifp == NULL) {
1805 		RD(1, "the ifp is gone");
1806 		return POLLERR;
1807 	}
1808 
1809 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1810 		return POLLERR;
1811 
1812 	if (netmap_verbose & 0x8000)
1813 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1814 	want_tx = events & (POLLOUT | POLLWRNORM);
1815 	want_rx = events & (POLLIN | POLLRDNORM);
1816 
1817 	lim_tx = na->num_tx_rings;
1818 	lim_rx = na->num_rx_rings;
1819 
1820 	if (priv->np_qfirst == NETMAP_SW_RING) {
1821 		// XXX locking ?
1822 		/* handle the host stack ring */
1823 		if (priv->np_txpoll || want_tx) {
1824 			/* push any packets up, then we are always ready */
1825 			netmap_txsync_to_host(na);
1826 			revents |= want_tx;
1827 		}
1828 		if (want_rx) {
1829 			kring = &na->rx_rings[lim_rx];
1830 			/* XXX replace with rxprologue etc. */
1831 			if (nm_ring_empty(kring->ring))
1832 				netmap_rxsync_from_host(na, td, dev);
1833 			if (!nm_ring_empty(kring->ring))
1834 				revents |= want_rx;
1835 		}
1836 		return (revents);
1837 	}
1838 
1839 
1840 	/*
1841 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1842 	 * the file descriptor is bound to all of them. If so, we sleep on
1843 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1844 	 * (FreeBSD only allows two selinfo's per file descriptor).
1845 	 * The interrupt routine in the driver wakes one or the other
1846 	 * (or both) depending on which clients are active.
1847 	 *
1848 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1849 	 * txsync() is called if we run out of buffers on POLLOUT, or
1850 	 * there are pending packets to send. The latter can be disabled
1851 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call (see the sketch below).
1852 	 */
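	/*
	 * Example (sketch): a userspace client that only cares about rx
	 * wakeups can disable the txsync-on-poll behaviour described above
	 * by OR-ing NETMAP_NO_TX_POLL into nr_ringid at registration time
	 * ("em0" is just a placeholder interface name):
	 *
	 *	struct nmreq req;
	 *
	 *	bzero(&req, sizeof(req));
	 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	 *	req.nr_version = NETMAP_API;
	 *	req.nr_ringid = NETMAP_NO_TX_POLL;	// all hw rings, no txsync on poll
	 *	ioctl(fd, NIOCREGIF, &req);
	 */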
1853 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1854 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1855 
1856 	if (priv->np_qlast != NETMAP_HW_RING) {
1857 		lim_tx = lim_rx = priv->np_qlast;
1858 	}
1859 
1860 	/*
1861 	 * We start with a lock-free round, which is cheap if we have
1862 	 * slots available. If this fails, then lock and call the sync
1863 	 * routines.
1864 	 */
1865 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1866 		kring = &na->rx_rings[i];
1867 		/* XXX compare ring->cur and kring->tail */
1868 		if (!nm_ring_empty(kring->ring)) {
1869 			revents |= want_rx;
1870 			want_rx = 0;	/* also breaks the loop */
1871 		}
1872 	}
1873 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1874 		kring = &na->tx_rings[i];
1875 		/* XXX compare ring->cur and kring->tail */
1876 		if (!nm_ring_empty(kring->ring)) {
1877 			revents |= want_tx;
1878 			want_tx = 0;	/* also breaks the loop */
1879 		}
1880 	}
1881 
1882 	/*
1883 	 * If we want to push packets out (priv->np_txpoll) or
1884 	 * want_tx is still set, we must issue txsync calls
1885 	 * (on all rings, to avoid stalling the tx rings).
1886 	 * XXX should also check cur != hwcur on the tx rings.
1887 	 * Fortunately, normal tx mode has np_txpoll set.
1888 	 */
1889 	if (priv->np_txpoll || want_tx) {
1890 		/*
1891 		 * The first round checks if anyone is ready; if not,
1892 		 * do a selrecord and another round to handle races.
1893 		 * want_tx goes to 0 if any space is found, and is
1894 		 * used to skip rings with no pending transmissions.
1895 		 */
1896 flush_tx:
1897 		for (i = priv->np_qfirst; i < lim_tx; i++) {
1898 			int found = 0;
1899 
1900 			kring = &na->tx_rings[i];
1901 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1902 				continue;
1903 			/* only one thread does txsync */
1904 			if (nm_kr_tryget(kring)) {
1905 				D("%p lost race on txring %d, ok", priv, i);
1906 				continue;
1907 			}
1908 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1909 				netmap_ring_reinit(kring);
1910 				revents |= POLLERR;
1911 			} else {
1912 				if (na->nm_txsync(na, i, 0))
1913 					revents |= POLLERR;
1914 			}
1915 
1916 			/*
1917 			 * If we found new slots, notify potential
1918 			 * listeners on the same ring.
1919 			 * Since we just did a txsync, look at the copies
1920 			 * of cur,tail in the kring.
1921 			 */
1922 			found = kring->rcur != kring->rtail;
1923 			nm_kr_put(kring);
1924 			if (found) { /* notify other listeners */
1925 				revents |= want_tx;
1926 				want_tx = 0;
1927 				na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY);
1928 			}
1929 		}
1930 		if (want_tx && retry_tx) {
1931 			selrecord(td, check_all_tx ?
1932 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
1933 			retry_tx = 0;
1934 			goto flush_tx;
1935 		}
1936 	}
1937 
1938 	/*
1939 	 * If want_rx is still set scan receive rings.
1940 	 * Do it on all rings because otherwise we starve.
1941 	 */
1942 	if (want_rx) {
1943 		int send_down = 0; /* transparent mode */
1944 		/* two rounds here for race avoidance */
1945 do_retry_rx:
1946 		for (i = priv->np_qfirst; i < lim_rx; i++) {
1947 			int found = 0;
1948 
1949 			kring = &na->rx_rings[i];
1950 
1951 			if (nm_kr_tryget(kring)) {
1952 				D("%p lost race on rxring %d, ok", priv, i);
1953 				continue;
1954 			}
1955 
1956 			/*
1957 			 * transparent mode support: collect packets
1958 			 * from the rxring(s).
1959 			 * XXX NR_FORWARD should only be read on
1960 			 * physical or NIC ports
1961 			 */
1962 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
1963 				ND(10, "forwarding some buffers up %d to %d",
1964 				    kring->nr_hwcur, kring->ring->cur);
1965 				netmap_grab_packets(kring, &q, netmap_fwd);
1966 			}
1967 
1968 			if (na->nm_rxsync(na, i, 0))
1969 				revents |= POLLERR;
1970 			if (netmap_no_timestamp == 0 ||
1971 					kring->ring->flags & NR_TIMESTAMP) {
1972 				microtime(&kring->ring->ts);
1973 			}
1974 			/* after an rxsync we can use kring->rcur, rtail */
1975 			found = kring->rcur != kring->rtail;
1976 			nm_kr_put(kring);
1977 			if (found) {
1978 				revents |= want_rx;
1979 				retry_rx = 0;
1980 				na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
1981 			}
1982 		}
1983 
1984 		/* transparent mode XXX only during first pass ? */
1985 		kring = &na->rx_rings[lim_rx];
1986 		if (check_all_rx
1987 		    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
1988 			/* XXX fix to use kring fields */
1989 			if (nm_ring_empty(kring->ring))
1990 				send_down = netmap_rxsync_from_host(na, td, dev);
1991 			if (!nm_ring_empty(kring->ring))
1992 				revents |= want_rx;
1993 		}
1994 
1995 		if (retry_rx)
1996 			selrecord(td, check_all_rx ?
1997 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
1998 		if (send_down > 0 || retry_rx) {
1999 			retry_rx = 0;
2000 			if (send_down)
2001 				goto flush_tx; /* and retry_rx */
2002 			else
2003 				goto do_retry_rx;
2004 		}
2005 	}
2006 
2007 	/*
2008 	 * Transparent mode: marked bufs on rx rings between
2009 	 * kring->nr_hwcur and ring->head
2010 	 * are passed to the other endpoint.
2011 	 *
2012 	 * In this mode we also scan the sw rxring, which in
2013 	 * turn passes packets up.
2014 	 *
2015 	 * XXX Transparent mode at the moment requires binding all
2016 	 * rings to a single file descriptor.
2017 	 */
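	/*
	 * Example (sketch): transparent mode is requested from userspace,
	 * per ring, by setting NR_FORWARD in the ring flags after the
	 * descriptor has been registered and mmap()ed (or globally through
	 * the netmap_fwd sysctl); individual slots are then marked with
	 * NS_FORWARD by the application:
	 *
	 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
	 *
	 *	ring->flags |= NR_FORWARD;
	 *	ring->slot[i].flags |= NS_FORWARD;	// pass this buf to the stack
	 */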
2018 
2019 	if (q.head)
2020 		netmap_send_up(na->ifp, &q);
2021 
2022 	return (revents);
2023 }
2024 
2025 
2026 /*-------------------- driver support routines -------------------*/
2027 
2028 static int netmap_hw_krings_create(struct netmap_adapter *);
2029 
2030 static int
2031 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2032 	enum txrx tx, int flags)
2033 {
2034 	struct netmap_kring *kring;
2035 
2036 	if (tx == NR_TX) {
2037 		kring = na->tx_rings + n_ring;
2038 		selwakeuppri(&kring->si, PI_NET);
2039 		if (flags & NAF_GLOBAL_NOTIFY)
2040 			selwakeuppri(&na->tx_si, PI_NET);
2041 	} else {
2042 		kring = na->rx_rings + n_ring;
2043 		selwakeuppri(&kring->si, PI_NET);
2044 		if (flags & NAF_GLOBAL_NOTIFY)
2045 			selwakeuppri(&na->rx_si, PI_NET);
2046 	}
2047 	return 0;
2048 }
2049 
2050 
2051 // XXX check handling of failures
2052 int
2053 netmap_attach_common(struct netmap_adapter *na)
2054 {
2055 	struct ifnet *ifp = na->ifp;
2056 
2057 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2058 		D("%s: invalid rings tx %d rx %d",
2059 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2060 		return EINVAL;
2061 	}
2062 	WNA(ifp) = na;
2063 
2064 	/* the following is only needed for adapters that use the host port.
2065 	 * XXX do we have something similar for linux?
2066 	 */
2067 #ifdef __FreeBSD__
2068 	na->if_input = ifp->if_input; /* for netmap_send_up */
2069 #endif /* __FreeBSD__ */
2070 
2071 	NETMAP_SET_CAPABLE(ifp);
2072 	if (na->nm_krings_create == NULL) {
2073 		na->nm_krings_create = netmap_hw_krings_create;
2074 		na->nm_krings_delete = netmap_hw_krings_delete;
2075 	}
2076 	if (na->nm_notify == NULL)
2077 		na->nm_notify = netmap_notify;
2078 	na->active_fds = 0;
2079 
2080 	if (na->nm_mem == NULL)
2081 		na->nm_mem = &nm_mem;
2082 	return 0;
2083 }
2084 
2085 
2086 void
2087 netmap_detach_common(struct netmap_adapter *na)
2088 {
2089 	if (na->ifp)
2090 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2091 
2092 	if (na->tx_rings) { /* XXX should not happen */
2093 		D("freeing leftover tx_rings");
2094 		na->nm_krings_delete(na);
2095 	}
2096 	if (na->na_flags & NAF_MEM_OWNER)
2097 		netmap_mem_private_delete(na->nm_mem);
2098 	bzero(na, sizeof(*na));
2099 	free(na, M_DEVBUF);
2100 }
2101 
2102 
2103 /*
2104  * Initialize a ``netmap_adapter`` object created by a driver on attach.
2105  * We allocate a block of memory with room for a struct netmap_adapter
2106  * plus two sets of N+2 struct netmap_kring (where N is the number
2107  * of hardware rings):
2108  * krings	0..N-1	are for the hardware queues.
2109  * kring	N	is for the host stack queue
2110  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2111  * Return 0 on success, ENOMEM otherwise.
2112  */
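/*
 * Example (sketch): a typical native driver attach path, loosely modelled
 * on the in-tree adapters. The 'foo_*' names and softc fields are
 * placeholders for the driver's own state and sync callbacks.
 *
 *	static void
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_register = foo_netmap_reg;
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		netmap_attach(&na);
 *	}
 */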
2113 int
2114 netmap_attach(struct netmap_adapter *arg)
2115 {
2116 	struct netmap_hw_adapter *hwna = NULL;
2117 	// XXX when is arg == NULL ?
2118 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2119 
2120 	if (arg == NULL || ifp == NULL)
2121 		goto fail;
2122 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2123 	if (hwna == NULL)
2124 		goto fail;
2125 	hwna->up = *arg;
2126 	if (netmap_attach_common(&hwna->up)) {
2127 		free(hwna, M_DEVBUF);
2128 		goto fail;
2129 	}
2130 	netmap_adapter_get(&hwna->up);
2131 
2132 #ifdef linux
2133 	if (ifp->netdev_ops) {
2134 		/* prepare a clone of the netdev ops */
2135 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2136 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2137 #else
2138 		hwna->nm_ndo = *ifp->netdev_ops;
2139 #endif
2140 	}
2141 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2142 #endif /* linux */
2143 
2144 	D("success for %s", NM_IFPNAME(ifp));
2145 	return 0;
2146 
2147 fail:
2148 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2149 	netmap_detach(ifp);
2150 	return (hwna ? EINVAL : ENOMEM);
2151 }
2152 
2153 
2154 void
2155 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2156 {
2157 	if (!na) {
2158 		return;
2159 	}
2160 
2161 	refcount_acquire(&na->na_refcount);
2162 }
2163 
2164 
2165 /* returns 1 iff the netmap_adapter is destroyed */
2166 int
2167 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2168 {
2169 	if (!na)
2170 		return 1;
2171 
2172 	if (!refcount_release(&na->na_refcount))
2173 		return 0;
2174 
2175 	if (na->nm_dtor)
2176 		na->nm_dtor(na);
2177 
2178 	netmap_detach_common(na);
2179 
2180 	return 1;
2181 }
2182 
2183 
2184 int
2185 netmap_hw_krings_create(struct netmap_adapter *na)
2186 {
2187 	int ret = netmap_krings_create(na,
2188 		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
2189 	if (ret == 0) {
2190 		/* initialize the mbq for the sw rx ring */
2191 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2192 		ND("initialized sw rx queue %d", na->num_rx_rings);
2193 	}
2194 	return ret;
2195 }
2196 
2197 
2198 
2199 /*
2200  * Free the allocated memory linked to the given ``netmap_adapter``
2201  * object.
2202  */
2203 void
2204 netmap_detach(struct ifnet *ifp)
2205 {
2206 	struct netmap_adapter *na = NA(ifp);
2207 
2208 	if (!na)
2209 		return;
2210 
2211 	NMG_LOCK();
2212 	netmap_disable_all_rings(ifp);
2213 	if (!netmap_adapter_put(na)) {
2214 		/* someone is still using the adapter,
2215 		 * tell them that the interface is gone
2216 		 */
2217 		na->ifp = NULL;
2218 		/* give them a chance to notice */
2219 		netmap_enable_all_rings(ifp);
2220 	}
2221 	NMG_UNLOCK();
2222 }
2223 
2224 
2225 /*
2226  * Intercept packets from the network stack and pass them
2227  * to netmap as incoming packets on the 'software' ring.
2228  *
2229  * We only store packets in a bounded mbq and then copy them
2230  * into the ring slots in the relevant rxsync routine.
2231  *
2232  * We rely on the OS to make sure that the ifp and na do not go
2233  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2234  * In nm_register() or whenever there is a reinitialization,
2235  * we make sure to make the mode change visible here.
2236  */
2237 int
2238 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2239 {
2240 	struct netmap_adapter *na = NA(ifp);
2241 	struct netmap_kring *kring;
2242 	u_int len = MBUF_LEN(m);
2243 	u_int error = ENOBUFS;
2244 	struct mbq *q;
2245 	int space;
2246 
2247 	// XXX [Linux] we do not need this lock
2248 	// if we follow the down/configure/up protocol -gl
2249 	// mtx_lock(&na->core_lock);
2250 
2251 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2252 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2253 		error = ENXIO;
2254 		goto done;
2255 	}
2256 
2257 	kring = &na->rx_rings[na->num_rx_rings];
2258 	q = &kring->rx_queue;
2259 
2260 	// XXX reconsider long packets if we handle fragments
2261 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2262 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2263 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2264 		goto done;
2265 	}
2266 
2267 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2268 	 * and maybe other instances of netmap_transmit (the latter
2269 	 * not possible on Linux).
2270 	 * Also avoid overflowing the queue.
2271 	 */
2272 	mtx_lock(&q->lock);
2273 
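	/*
	 * Worked example of the check below: with nkr_num_slots = 256,
	 * nr_hwcur = 200 and nr_hwtail = 10, space = 10 - 200 = -190,
	 * corrected to -190 + 256 = 66; the new mbuf is queued only while
	 * space + mbq_len(q) stays below nkr_num_slots - 1 = 255,
	 * otherwise it is dropped.
	 */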
2274 	space = kring->nr_hwtail - kring->nr_hwcur;
2275 	if (space < 0)
2276 		space += kring->nkr_num_slots;
2277 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2278 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2279 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2280 			len, m);
2281 	} else {
2282 		mbq_enqueue(q, m);
2283 		ND(10, "%s %d bufs in queue len %d m %p",
2284 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2285 		/* notify outside the lock */
2286 		m = NULL;
2287 		error = 0;
2288 	}
2289 	mtx_unlock(&q->lock);
2290 
2291 done:
2292 	if (m)
2293 		m_freem(m);
2294 	/* unconditionally wake up listeners */
2295 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2296 
2297 	return (error);
2298 }
2299 
2300 
2301 /*
2302  * netmap_reset() is called by the driver routines when reinitializing
2303  * a ring. The driver is in charge of locking to protect the kring.
2304  * If native netmap mode is not set just return NULL.
2305  */
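/*
 * Example (sketch): how a driver's ring-init routine typically uses this,
 * loosely following the in-tree adapters ('ring_nr' is the driver's own
 * ring index):
 *
 *	struct netmap_slot *slot;
 *
 *	slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *	if (slot != NULL) {
 *		// ring is in netmap mode: for each hw descriptor j,
 *		// look up the netmap buffer attached to slot[j]
 *		// (e.g. with the NMB()/PNMB() helpers) and program
 *		// its address into the NIC rx descriptor.
 *	}
 */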
2306 struct netmap_slot *
2307 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2308 	u_int new_cur)
2309 {
2310 	struct netmap_kring *kring;
2311 	int new_hwofs, lim;
2312 
2313 	if (na == NULL) {
2314 		D("NULL na, should not happen");
2315 		return NULL;	/* no netmap support here */
2316 	}
2317 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2318 		ND("interface not in netmap mode");
2319 		return NULL;	/* nothing to reinitialize */
2320 	}
2321 
2322 	/* XXX note- in the new scheme, we are not guaranteed to be
2323 	 * under lock (e.g. when called on a device reset).
2324 	 * In this case, we should set a flag and not trust the
2325 	 * values too much. In practice: TODO
2326 	 * - set a RESET flag somewhere in the kring
2327 	 * - do the processing in a conservative way
2328 	 * - let the *sync() fixup at the end.
2329 	 */
2330 	if (tx == NR_TX) {
2331 		if (n >= na->num_tx_rings)
2332 			return NULL;
2333 		kring = na->tx_rings + n;
2334 		// XXX check whether we should use hwcur or rcur
2335 		new_hwofs = kring->nr_hwcur - new_cur;
2336 	} else {
2337 		if (n >= na->num_rx_rings)
2338 			return NULL;
2339 		kring = na->rx_rings + n;
2340 		new_hwofs = kring->nr_hwtail - new_cur;
2341 	}
2342 	lim = kring->nkr_num_slots - 1;
2343 	if (new_hwofs > lim)
2344 		new_hwofs -= lim + 1;
2345 
2346 	/* Always set the new offset value and realign the ring. */
2347 	if (netmap_verbose)
2348 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2349 		NM_IFPNAME(na->ifp),
2350 		tx == NR_TX ? "TX" : "RX", n,
2351 		kring->nkr_hwofs, new_hwofs,
2352 		kring->nr_hwtail,
2353 		tx == NR_TX ? lim : kring->nr_hwtail);
2354 	kring->nkr_hwofs = new_hwofs;
2355 	if (tx == NR_TX) {
2356 		kring->nr_hwtail = kring->nr_hwcur + lim;
2357 		if (kring->nr_hwtail > lim)
2358 			kring->nr_hwtail -= lim + 1;
2359 	}
2360 
2361 #if 0 // def linux
2362 	/* XXX check that the mappings are correct */
2363 	/* need ring_nr, adapter->pdev, direction */
2364 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2365 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2366 		D("error mapping rx netmap buffer %d", i);
2367 		// XXX fix error handling
2368 	}
2369 
2370 #endif /* linux */
2371 	/*
2372 	 * Wakeup on the individual and global selwait
2373 	 * We do the wakeup here, but the ring is not yet reconfigured.
2374 	 * However, we are under lock so there are no races.
2375 	 */
2376 	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2377 	return kring->ring->slot;
2378 }
2379 
2380 
2381 /*
2382  * Dispatch rx/tx interrupts to the netmap rings.
2383  *
2384  * "work_done" is non-null on the RX path, NULL for the TX path.
2385  * We rely on the OS to make sure that there is only one active
2386  * instance per queue, and that there is appropriate locking.
2387  *
2388  * The 'notify' routine depends on what the ring is attached to.
2389  * - for a netmap file descriptor, do a selwakeup on the individual
2390  *   waitqueue, plus one on the global one if needed
2391  * - for a switch, call the proper forwarding routine
2392  * - XXX more ?
2393  */
2394 void
2395 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2396 {
2397 	struct netmap_adapter *na = NA(ifp);
2398 	struct netmap_kring *kring;
2399 
2400 	q &= NETMAP_RING_MASK;
2401 
2402 	if (netmap_verbose) {
2403 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2404 	}
2405 
2406 	if (work_done) { /* RX path */
2407 		if (q >= na->num_rx_rings)
2408 			return;	// not a physical queue
2409 		kring = na->rx_rings + q;
2410 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2411 		na->nm_notify(na, q, NR_RX,
2412 			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2413 		*work_done = 1; /* do not fire napi again */
2414 	} else { /* TX path */
2415 		if (q >= na->num_tx_rings)
2416 			return;	// not a physical queue
2417 		kring = na->tx_rings + q;
2418 		na->nm_notify(na, q, NR_TX,
2419 			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2420 	}
2421 }
2422 
2423 
2424 /*
2425  * Default functions to handle rx/tx interrupts from a physical device.
2426  * "work_done" is non-null on the RX path, NULL for the TX path.
2427  *
2428  * If the card is not in netmap mode, simply return 0,
2429  * so that the caller proceeds with regular processing.
2430  * Otherwise call netmap_common_irq() and return 1.
2431  *
2432  * If the card is connected to a netmap file descriptor,
2433  * do a selwakeup on the individual queue, plus one on the global one
2434  * if needed (multiqueue card _and_ there are multiqueue listeners),
2435  * and return 1.
2436  *
2437  * Finally, if called on rx from an interface connected to a switch,
2438  * calls the proper forwarding routine, and return 1.
2439  */
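/*
 * Example (sketch): how a driver interrupt handler typically hooks in
 * ('sc' and 'que' are placeholders for the driver's own state):
 *
 *	// in the rx interrupt / taskqueue handler
 *	if (netmap_rx_irq(sc->ifp, que->me, &work_done))
 *		return;		// netmap consumed the event
 *	// ... otherwise fall through to the regular rx processing
 */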
2440 int
2441 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2442 {
2443 	// XXX could we check NAF_NATIVE_ON ?
2444 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2445 		return 0;
2446 
2447 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2448 		ND("use regular interrupt");
2449 		return 0;
2450 	}
2451 
2452 	netmap_common_irq(ifp, q, work_done);
2453 	return 1;
2454 }
2455 
2456 
2457 /*
2458  * Module loader and unloader
2459  *
2460  * netmap_init() creates the /dev/netmap device and initializes
2461  * all global variables. Returns 0 on success, errno on failure
2462  * all global variables. Returns 0 on success, an errno on failure
2463  * (failure is not expected in practice).
2464  * netmap_fini() destroys everything.
2465  */
2466 
2467 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2468 extern struct cdevsw netmap_cdevsw;
2469 
2470 
2471 void
2472 netmap_fini(void)
2473 {
2474 	// XXX destroy_bridges() ?
2475 	if (netmap_dev)
2476 		destroy_dev(netmap_dev);
2477 	netmap_mem_fini();
2478 	NMG_LOCK_DESTROY();
2479 	printf("netmap: unloaded module.\n");
2480 }
2481 
2482 
2483 int
2484 netmap_init(void)
2485 {
2486 	int error;
2487 
2488 	NMG_LOCK_INIT();
2489 
2490 	error = netmap_mem_init();
2491 	if (error != 0)
2492 		goto fail;
2493 	/* XXX could use make_dev_credv() to get error number */
2494 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2495 			      "netmap");
2496 	if (!netmap_dev)
2497 		goto fail;
2498 
2499 	netmap_init_bridges();
2500 	printf("netmap: loaded module\n");
2501 	return (0);
2502 fail:
2503 	netmap_fini();
2504 	return (EINVAL); /* may be incorrect */
2505 }
2506