xref: /freebsd/sys/dev/netmap/netmap.c (revision 1a61beb0549e05b33df31380e427d90f6e46ff7e)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    select()able file descriptors on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
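A minimal userspace sketch of steps 1-6 above (error handling omitted;
the structure layouts and helper macros are the ones documented in
netmap(4) and provided by <net/netmap_user.h>, and "em0" is just an
example interface name):

	int fd = open("/dev/netmap", O_RDWR);
	struct nmreq req;

	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
	ioctl(fd, NIOCREGIF, &req);		// bind fd to the rings of em0
	char *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	for (;;) {
		poll(&pfd, 1, 1000);	// wait for received packets
		// scan NETMAP_RXRING(nifp, r) for each ring r, consume
		// slots and advance head/cur, then let the next poll()
		// or an explicit NIOCRXSYNC pick up the changes
		ioctl(fd, NIOCRXSYNC, NULL);
	}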
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When a port is added or deleted, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
127 /*
128  * OS-specific code that is used only within this file.
129  * Other OS-specific code that must be accessed by drivers
130  * is present in netmap_kern.h
131  */
132 
133 #if defined(__FreeBSD__)
134 #include <sys/cdefs.h> /* prerequisite */
135 #include <sys/types.h>
136 #include <sys/errno.h>
137 #include <sys/param.h>	/* defines used in kernel.h */
138 #include <sys/kernel.h>	/* types used in module initialization */
139 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
140 #include <sys/sockio.h>
141 #include <sys/socketvar.h>	/* struct socket */
142 #include <sys/malloc.h>
143 #include <sys/poll.h>
144 #include <sys/rwlock.h>
145 #include <sys/socket.h> /* sockaddrs */
146 #include <sys/selinfo.h>
147 #include <sys/sysctl.h>
148 #include <sys/jail.h>
149 #include <net/vnet.h>
150 #include <net/if.h>
151 #include <net/if_var.h>
152 #include <net/bpf.h>		/* BIOCIMMEDIATE */
153 #include <machine/bus.h>	/* bus_dmamap_* */
154 #include <sys/endian.h>
155 #include <sys/refcount.h>
156 
157 
158 /* reduce conditional code */
159 #define init_waitqueue_head(x)	// only needed in linux
160 
161 
162 
163 #elif defined(linux)
164 
165 #include "bsd_glue.h"
166 
167 
168 
169 #elif defined(__APPLE__)
170 
171 #warning OSX support is only partial
172 #include "osx_glue.h"
173 
174 #else
175 
176 #error	Unsupported platform
177 
178 #endif /* unsupported */
179 
180 /*
181  * common headers
182  */
183 #include <net/netmap.h>
184 #include <dev/netmap/netmap_kern.h>
185 #include <dev/netmap/netmap_mem2.h>
186 
187 
188 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
189 
190 /*
191  * The following variables are used by the drivers and replicate
192  * fields in the global memory pool. They only refer to buffers
193  * used by physical interfaces.
194  */
195 u_int netmap_total_buffers;
196 u_int netmap_buf_size;
197 char *netmap_buffer_base;	/* also address of an invalid buffer */
198 
199 /* user-controlled variables */
200 int netmap_verbose;
201 
202 static int netmap_no_timestamp; /* don't timestamp on rxsync */
203 
204 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
205 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
206     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
207 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
208     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
209 int netmap_mitigate = 1;
210 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
211 int netmap_no_pendintr = 1;
212 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
213     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
214 int netmap_txsync_retry = 2;
215 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
216     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
217 
218 int netmap_flags = 0;	/* debug flags */
219 int netmap_fwd = 0;	/* force transparent mode */
220 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
221 
222 /*
223  * netmap_admode selects the netmap mode to use.
224  * Invalid values are reset to NETMAP_ADMODE_BEST
225  */
226 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
227 	NETMAP_ADMODE_NATIVE,	/* either native or none */
228 	NETMAP_ADMODE_GENERIC,	/* force generic */
229 	NETMAP_ADMODE_LAST };
230 static int netmap_admode = NETMAP_ADMODE_BEST;
231 
232 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
233 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
234 
235 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
236 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
237 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
238 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
239 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
240 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
241 
242 NMG_LOCK_T	netmap_global_lock;
243 
244 
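/*
 * Acquire exclusive use of a kring: sleep in short naps until the
 * nr_busy flag can be test-and-set. Released by the matching
 * nm_kr_put().
 */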
245 static void
246 nm_kr_get(struct netmap_kring *kr)
247 {
248 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
249 		tsleep(kr, 0, "NM_KR_GET", 4);
250 }
251 
252 
253 /*
254  * mark the ring as stopped, and run through the locks
255  * to make sure other users get to see it.
256  */
257 void
258 netmap_disable_ring(struct netmap_kring *kr)
259 {
260 	kr->nkr_stopped = 1;
261 	nm_kr_get(kr);
262 	mtx_lock(&kr->q_lock);
263 	mtx_unlock(&kr->q_lock);
264 	nm_kr_put(kr);
265 }
266 
267 
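/*
 * Stop (stopped != 0) or re-enable all the rings of ifp, and notify
 * any threads sleeping on the ring selinfos so that they can observe
 * the new state. Used by netmap_{disable,enable}_all_rings() below.
 */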
268 static void
269 netmap_set_all_rings(struct ifnet *ifp, int stopped)
270 {
271 	struct netmap_adapter *na;
272 	int i;
273 
274 	if (!(ifp->if_capenable & IFCAP_NETMAP))
275 		return;
276 
277 	na = NA(ifp);
278 
279 	for (i = 0; i <= na->num_tx_rings; i++) {
280 		if (stopped)
281 			netmap_disable_ring(na->tx_rings + i);
282 		else
283 			na->tx_rings[i].nkr_stopped = 0;
284 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
285 			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
286 	}
287 
288 	for (i = 0; i <= na->num_rx_rings; i++) {
289 		if (stopped)
290 			netmap_disable_ring(na->rx_rings + i);
291 		else
292 			na->rx_rings[i].nkr_stopped = 0;
293 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
294 			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
295 	}
296 }
297 
298 
299 void
300 netmap_disable_all_rings(struct ifnet *ifp)
301 {
302 	netmap_set_all_rings(ifp, 1 /* stopped */);
303 }
304 
305 
306 void
307 netmap_enable_all_rings(struct ifnet *ifp)
308 {
309 	netmap_set_all_rings(ifp, 0 /* enabled */);
310 }
311 
312 
313 /*
314  * generic bound_checking function
315  */
316 u_int
317 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
318 {
319 	u_int oldv = *v;
320 	const char *op = NULL;
321 
322 	if (dflt < lo)
323 		dflt = lo;
324 	if (dflt > hi)
325 		dflt = hi;
326 	if (oldv < lo) {
327 		*v = dflt;
328 		op = "Bump";
329 	} else if (oldv > hi) {
330 		*v = hi;
331 		op = "Clamp";
332 	}
333 	if (op && msg)
334 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
335 	return *v;
336 }
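/*
 * Hypothetical usage example (the bounds shown are arbitrary),
 * clamping a sysctl-controlled value before it is used:
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 */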
337 
338 
339 /*
340  * packet-dump function, user-supplied or static buffer.
341  * The destination buffer must be at least 30+4*len bytes.
342  */
343 const char *
344 nm_dump_buf(char *p, int len, int lim, char *dst)
345 {
346 	static char _dst[8192];
347 	int i, j, i0;
348 	static char hex[] ="0123456789abcdef";
349 	char *o;	/* output position */
350 
351 #define P_HI(x)	hex[((x) & 0xf0)>>4]
352 #define P_LO(x)	hex[((x) & 0xf)]
353 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
354 	if (!dst)
355 		dst = _dst;
356 	if (lim <= 0 || lim > len)
357 		lim = len;
358 	o = dst;
359 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
360 	o += strlen(o);
361 	/* hexdump routine */
362 	for (i = 0; i < lim; ) {
363 		sprintf(o, "%5d: ", i);
364 		o += strlen(o);
365 		memset(o, ' ', 48);
366 		i0 = i;
367 		for (j=0; j < 16 && i < lim; i++, j++) {
368 			o[j*3] = P_HI(p[i]);
369 			o[j*3+1] = P_LO(p[i]);
370 		}
371 		i = i0;
372 		for (j=0; j < 16 && i < lim; i++, j++)
373 			o[j + 48] = P_C(p[i]);
374 		o[j+48] = '\n';
375 		o += j+49;
376 	}
377 	*o = '\0';
378 #undef P_HI
379 #undef P_LO
380 #undef P_C
381 	return dst;
382 }
383 
384 
385 /*
386  * Fetch configuration from the device, to cope with dynamic
387  * reconfigurations after loading the module.
388  */
389 int
390 netmap_update_config(struct netmap_adapter *na)
391 {
392 	struct ifnet *ifp = na->ifp;
393 	u_int txr, txd, rxr, rxd;
394 
395 	txr = txd = rxr = rxd = 0;
396 	if (na->nm_config) {
397 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
398 	} else {
399 		/* take whatever we had at init time */
400 		txr = na->num_tx_rings;
401 		txd = na->num_tx_desc;
402 		rxr = na->num_rx_rings;
403 		rxd = na->num_rx_desc;
404 	}
405 
406 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
407 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
408 		return 0; /* nothing changed */
409 	if (netmap_verbose || na->active_fds > 0) {
410 		D("stored config %s: txring %d x %d, rxring %d x %d",
411 			NM_IFPNAME(ifp),
412 			na->num_tx_rings, na->num_tx_desc,
413 			na->num_rx_rings, na->num_rx_desc);
414 		D("new config %s: txring %d x %d, rxring %d x %d",
415 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
416 	}
417 	if (na->active_fds == 0) {
418 		D("configuration changed (but fine)");
419 		na->num_tx_rings = txr;
420 		na->num_tx_desc = txd;
421 		na->num_rx_rings = rxr;
422 		na->num_rx_desc = rxd;
423 		return 0;
424 	}
425 	D("configuration changed while active, this is bad...");
426 	return 1;
427 }
428 
429 
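/*
 * Allocate and initialize the array of krings for this adapter:
 * ntx TX krings followed by nrx RX krings, plus 'tailroom' extra
 * bytes that callers may use for private per-adapter state
 * (exported through na->tailroom).
 */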
430 int
431 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
432 {
433 	u_int i, len, ndesc;
434 	struct netmap_kring *kring;
435 
436 	// XXX additional space for extra rings ?
437 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
438 
439 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
440 	if (na->tx_rings == NULL) {
441 		D("Cannot allocate krings");
442 		return ENOMEM;
443 	}
444 	na->rx_rings = na->tx_rings + ntx;
445 
446 	/*
447 	 * All fields in krings are 0 except the ones initialized below,
448 	 * but better be explicit on important kring fields.
449 	 */
450 	ndesc = na->num_tx_desc;
451 	for (i = 0; i < ntx; i++) { /* Transmit rings */
452 		kring = &na->tx_rings[i];
453 		bzero(kring, sizeof(*kring));
454 		kring->na = na;
455 		kring->ring_id = i;
456 		kring->nkr_num_slots = ndesc;
457 		/*
458 		 * IMPORTANT: Always keep one slot empty (so a full ring is never mistaken for an empty one).
459 		 */
460 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
461 		kring->rtail = kring->nr_hwtail = ndesc - 1;
462 		snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i);
463 		mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF);
464 		init_waitqueue_head(&kring->si);
465 	}
466 
467 	ndesc = na->num_rx_desc;
468 	for (i = 0; i < nrx; i++) { /* Receive rings */
469 		kring = &na->rx_rings[i];
470 		bzero(kring, sizeof(*kring));
471 		kring->na = na;
472 		kring->ring_id = i;
473 		kring->nkr_num_slots = ndesc;
474 		kring->rhead = kring->rcur = kring->nr_hwcur = 0;
475 		kring->rtail = kring->nr_hwtail = 0;
476 		snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i);
477 		mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF);
478 		init_waitqueue_head(&kring->si);
479 	}
480 	init_waitqueue_head(&na->tx_si);
481 	init_waitqueue_head(&na->rx_si);
482 
483 	na->tailroom = na->rx_rings + nrx;
484 
485 	return 0;
486 }
487 
488 
489 /* XXX check boundaries */
490 void
491 netmap_krings_delete(struct netmap_adapter *na)
492 {
493 	int i;
494 
495 	for (i = 0; i < na->num_tx_rings + 1; i++) {
496 		mtx_destroy(&na->tx_rings[i].q_lock);
497 	}
498 	for (i = 0; i < na->num_rx_rings + 1; i++) {
499 		mtx_destroy(&na->rx_rings[i].q_lock);
500 	}
501 	free(na->tx_rings, M_DEVBUF);
502 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
503 }
504 
505 
506 /*
507  * Destructor for NIC ports. They also have an mbuf queue
508  * on the rings connected to the host stack, so we need to purge
509  * them first.
510  */
511 static void
512 netmap_hw_krings_delete(struct netmap_adapter *na)
513 {
514 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
515 
516 	ND("destroy sw mbq with len %d", mbq_len(q));
517 	mbq_purge(q);
518 	mbq_safe_destroy(q);
519 	netmap_krings_delete(na);
520 }
521 
522 
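/*
 * Create the krings, netmap rings and buffers for 'na' on its first
 * registration (later registrations share them), and return the
 * netmap_if that userspace reaches through mmap(). Returns NULL on
 * failure.
 */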
523 static struct netmap_if*
524 netmap_if_new(const char *ifname, struct netmap_adapter *na)
525 {
526 	struct netmap_if *nifp;
527 
528 	if (netmap_update_config(na)) {
529 		/* configuration mismatch, report and fail */
530 		return NULL;
531 	}
532 
533 	if (na->active_fds)
534 		goto final;
535 
536 	if (na->nm_krings_create(na))
537 		goto cleanup;
538 
539 	if (netmap_mem_rings_create(na))
540 		goto cleanup;
541 
542 final:
543 
544 	nifp = netmap_mem_if_new(ifname, na);
545 	if (nifp == NULL)
546 		goto cleanup;
547 
548 	return (nifp);
549 
550 cleanup:
551 
552 	if (na->active_fds == 0) {
553 		netmap_mem_rings_delete(na);
554 		na->nm_krings_delete(na);
555 	}
556 
557 	return NULL;
558 }
559 
560 
561 /* grab a reference to the memory allocator, if we don't have one already.  The
562  * reference is taken from the netmap_adapter registered with the priv.
563  *
564  */
565 static int
566 netmap_get_memory_locked(struct netmap_priv_d* p)
567 {
568 	struct netmap_mem_d *nmd;
569 	int error = 0;
570 
571 	if (p->np_na == NULL) {
572 		if (!netmap_mmap_unreg)
573 			return ENODEV;
574 		/* for compatibility with older versions of the API
575  		 * we use the global allocator when no interface has been
576  		 * registered
577  		 */
578 		nmd = &nm_mem;
579 	} else {
580 		nmd = p->np_na->nm_mem;
581 	}
582 	if (p->np_mref == NULL) {
583 		error = netmap_mem_finalize(nmd);
584 		if (!error)
585 			p->np_mref = nmd;
586 	} else if (p->np_mref != nmd) {
587 		/* a virtual port has been registered, but previous
588  		 * syscalls already used the global allocator.
589  		 * We cannot continue
590  		 */
591 		error = ENODEV;
592 	}
593 	return error;
594 }
595 
596 
597 int
598 netmap_get_memory(struct netmap_priv_d* p)
599 {
600 	int error;
601 	NMG_LOCK();
602 	error = netmap_get_memory_locked(p);
603 	NMG_UNLOCK();
604 	return error;
605 }
606 
607 
608 static int
609 netmap_have_memory_locked(struct netmap_priv_d* p)
610 {
611 	return p->np_mref != NULL;
612 }
613 
614 
615 static void
616 netmap_drop_memory_locked(struct netmap_priv_d* p)
617 {
618 	if (p->np_mref) {
619 		netmap_mem_deref(p->np_mref);
620 		p->np_mref = NULL;
621 	}
622 }
623 
624 
625 /*
626  * File descriptor's private data destructor.
627  *
628  * Call nm_register(ifp,0) to stop netmap mode on the interface and
629  * revert to normal operation. We expect that np_na->ifp has not gone away.
630  * The second argument is the nifp to work on. In some cases it is
631  * not attached yet to the netmap_priv_d so we need to pass it as
632  * a separate argument.
633  */
634 /* call with NMG_LOCK held */
635 static void
636 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
637 {
638 	struct netmap_adapter *na = priv->np_na;
639 	struct ifnet *ifp = na->ifp;
640 
641 	NMG_LOCK_ASSERT();
642 	na->active_fds--;
643 	if (na->active_fds <= 0) {	/* last instance */
644 
645 		if (netmap_verbose)
646 			D("deleting last instance for %s", NM_IFPNAME(ifp));
647 		/*
648 		 * (TO CHECK) This function is only called
649 		 * when the last reference to this file descriptor goes
650 		 * away. This means we cannot have any pending poll()
651 		 * or interrupt routine operating on the structure.
652 		 * XXX The file may be closed in a thread while
653 		 * another thread is using it.
654 		 * Linux keeps the file opened until the last reference
655 		 * by any outstanding ioctl/poll or mmap is gone.
656 		 * FreeBSD does not track mmap()s (but we do) and
657 		 * wakes up any sleeping poll(). Need to check what
658 		 * happens if the close() occurs while a concurrent
659 		 * syscall is running.
660 		 */
661 		if (ifp)
662 			na->nm_register(na, 0); /* off, clear flags */
663 		/* Wake up any sleeping threads. netmap_poll will
664 		 * then return POLLERR
665 		 * XXX The wake up now must happen during *_down(), when
666 		 * we order all activities to stop. -gl
667 		 */
668 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
669 		/* knlist_destroy(&na->tx_si.si_note); */
670 		/* knlist_destroy(&na->rx_si.si_note); */
671 
672 		/* delete rings and buffers */
673 		netmap_mem_rings_delete(na);
674 		na->nm_krings_delete(na);
675 	}
676 	/* delete the nifp */
677 	netmap_mem_if_delete(na, nifp);
678 }
679 
680 
681 /*
682  * returns 1 if this is the last instance and we can free priv
683  */
684 int
685 netmap_dtor_locked(struct netmap_priv_d *priv)
686 {
687 	struct netmap_adapter *na = priv->np_na;
688 
689 #ifdef __FreeBSD__
690 	/*
691 	 * np_refcount is the number of active mmaps on
692 	 * this file descriptor
693 	 */
694 	if (--priv->np_refcount > 0) {
695 		return 0;
696 	}
697 #endif /* __FreeBSD__ */
698 	if (!na) {
699 	    return 1; //XXX is it correct?
700 	}
701 	netmap_do_unregif(priv, priv->np_nifp);
702 	priv->np_nifp = NULL;
703 	netmap_drop_memory_locked(priv);
704 	if (priv->np_na) {
705 		netmap_adapter_put(na);
706 		priv->np_na = NULL;
707 	}
708 	return 1;
709 }
710 
711 
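/*
 * Final destructor of the per-file-descriptor private data, called on
 * last close: drop the references under NMG_LOCK and free the priv
 * if this was the last instance.
 */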
712 void
713 netmap_dtor(void *data)
714 {
715 	struct netmap_priv_d *priv = data;
716 	int last_instance;
717 
718 	NMG_LOCK();
719 	last_instance = netmap_dtor_locked(priv);
720 	NMG_UNLOCK();
721 	if (last_instance) {
722 		bzero(priv, sizeof(*priv));	/* for safety */
723 		free(priv, M_DEVBUF);
724 	}
725 }
726 
727 
728 
729 
730 /*
731  * Handlers for synchronization of the queues from/to the host.
732  * Netmap has two operating modes:
733  * - in the default mode, the rings connected to the host stack are
734  *   just another ring pair managed by userspace;
735  * - in transparent mode (XXX to be defined) incoming packets
736  *   (from the host or the NIC) are marked as NS_FORWARD upon
737  *   arrival, and the user application has a chance to reset the
738  *   flag for packets that should be dropped.
739  *   On the RXSYNC or poll(), packets in RX rings between
740  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
741  *   to the other side.
742  * The transfer NIC --> host is relatively easy, just encapsulate
743  * into mbufs and we are done. The host --> NIC side is slightly
744  * harder because there might not be room in the tx ring so it
745  * might take a while before releasing the buffer.
746  */
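/*
 * A minimal userspace sketch of transparent mode on the host rings,
 * following the description above. Assumptions: the fd was bound with
 * NETMAP_SW_RING, forwarding is enabled (dev.netmap.fwd or the ring's
 * NR_FORWARD flag), and keep_packet() is a hypothetical user policy
 * function.
 *
 *	struct netmap_ring *r = NETMAP_RXRING(nifp, nifp->ni_rx_rings);
 *
 *	while (!nm_ring_empty(r)) {
 *		struct netmap_slot *s = &r->slot[r->cur];
 *
 *		if (!keep_packet(s))
 *			s->flags &= ~NS_FORWARD;  // drop instead of forwarding
 *		r->cur = (r->cur + 1 == r->num_slots) ? 0 : r->cur + 1;
 *		r->head = r->cur;
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);	// released slots still marked
 *					// NS_FORWARD go out to the NIC
 */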
747 
748 
749 /*
750  * pass a chain of buffers to the host stack as coming from 'dst'
751  * We do not need to lock because the queue is private.
752  */
753 static void
754 netmap_send_up(struct ifnet *dst, struct mbq *q)
755 {
756 	struct mbuf *m;
757 
758 	/* send packets up, outside the lock */
759 	while ((m = mbq_dequeue(q)) != NULL) {
760 		if (netmap_verbose & NM_VERB_HOST)
761 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
762 		NM_SEND_UP(dst, m);
763 	}
764 	mbq_destroy(q);
765 }
766 
767 
768 /*
769  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
770  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
771  * and pass them up. Drop remaining packets in the unlikely event
772  * of an mbuf shortage.
773  */
774 static void
775 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
776 {
777 	u_int const lim = kring->nkr_num_slots - 1;
778 	u_int const head = kring->ring->head;
779 	u_int n;
780 	struct netmap_adapter *na = kring->na;
781 
782 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
783 		struct mbuf *m;
784 		struct netmap_slot *slot = &kring->ring->slot[n];
785 
786 		if ((slot->flags & NS_FORWARD) == 0 && !force)
787 			continue;
788 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
789 			RD(5, "bad pkt at %d len %d", n, slot->len);
790 			continue;
791 		}
792 		slot->flags &= ~NS_FORWARD; // XXX needed ?
793 		/* XXX TODO: adapt to the case of a multisegment packet */
794 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
795 
796 		if (m == NULL)
797 			break;
798 		mbq_enqueue(q, m);
799 	}
800 }
801 
802 
803 /*
804  * Send to the NIC rings packets marked NS_FORWARD between
805  * kring->nr_hwcur and kring->rhead
806  * Called under kring->rx_queue.lock on the sw rx ring.
807  */
808 static u_int
809 netmap_sw_to_nic(struct netmap_adapter *na)
810 {
811 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
812 	struct netmap_slot *rxslot = kring->ring->slot;
813 	u_int i, rxcur = kring->nr_hwcur;
814 	u_int const head = kring->rhead;
815 	u_int const src_lim = kring->nkr_num_slots - 1;
816 	u_int sent = 0;
817 
818 	/* scan rings to find space, then fill as much as possible */
819 	for (i = 0; i < na->num_tx_rings; i++) {
820 		struct netmap_kring *kdst = &na->tx_rings[i];
821 		struct netmap_ring *rdst = kdst->ring;
822 		u_int const dst_lim = kdst->nkr_num_slots - 1;
823 
824 		/* XXX do we trust ring or kring->rcur,rtail ? */
825 		for (; rxcur != head && !nm_ring_empty(rdst);
826 		     rxcur = nm_next(rxcur, src_lim) ) {
827 			struct netmap_slot *src, *dst, tmp;
828 			u_int dst_cur = rdst->cur;
829 
830 			src = &rxslot[rxcur];
831 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
832 				continue;
833 
834 			sent++;
835 
836 			dst = &rdst->slot[dst_cur];
837 
838 			tmp = *src;
839 
840 			src->buf_idx = dst->buf_idx;
841 			src->flags = NS_BUF_CHANGED;
842 
843 			dst->buf_idx = tmp.buf_idx;
844 			dst->len = tmp.len;
845 			dst->flags = NS_BUF_CHANGED;
846 
847 			rdst->cur = nm_next(dst_cur, dst_lim);
848 		}
849 		/* if (sent) XXX txsync ? */
850 	}
851 	return sent;
852 }
853 
854 
855 /*
856  * netmap_txsync_to_host() passes packets up. We are called from a
857  * system call in user process context, and the only contention
858  * can be among multiple user threads erroneously calling
859  * this routine concurrently.
860  */
861 void
862 netmap_txsync_to_host(struct netmap_adapter *na)
863 {
864 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
865 	struct netmap_ring *ring = kring->ring;
866 	u_int const lim = kring->nkr_num_slots - 1;
867 	u_int const head = nm_txsync_prologue(kring);
868 	struct mbq q;
869 	int error;
870 
871 	error = nm_kr_tryget(kring);
872 	if (error) {
873 		if (error == NM_KR_BUSY)
874 			D("ring %p busy (user error)", kring);
875 		return;
876 	}
877 	if (head > lim) {
878 		D("invalid ring index in stack TX kring %p", kring);
879 		netmap_ring_reinit(kring);
880 		nm_kr_put(kring);
881 		return;
882 	}
883 
884 	/* Take packets from hwcur to head and pass them up.
885 	 * force ring->cur = head since netmap_grab_packets() stops at head.
886 	 * In case of no buffers we give up. At the end of the loop,
887 	 * the queue is drained in all cases.
888 	 */
889 	mbq_init(&q);
890 	ring->cur = head;
891 	netmap_grab_packets(kring, &q, 1 /* force */);
892 	ND("have %d pkts in queue", mbq_len(&q));
893 	kring->nr_hwcur = head;
894 	kring->nr_hwtail = head + lim;
895 	if (kring->nr_hwtail > lim)
896 		kring->nr_hwtail -= lim + 1;
897 	nm_txsync_finalize(kring);
898 
899 	nm_kr_put(kring);
900 	netmap_send_up(na->ifp, &q);
901 }
902 
903 
904 /*
905  * rxsync backend for packets coming from the host stack.
906  * They have been put in kring->rx_queue by netmap_transmit().
907  * We protect access to the kring using kring->rx_queue.lock
908  *
909  * This routine also does the selrecord if called from the poll handler
910  * (we know because td != NULL).
911  *
912  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
913  *     as an additional hidden argument.
914  * returns the number of packets delivered to tx queues in
915  * transparent mode, or a negative value if error
916  */
917 int
918 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
919 {
920 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
921 	struct netmap_ring *ring = kring->ring;
922 	u_int nm_i, n;
923 	u_int const lim = kring->nkr_num_slots - 1;
924 	u_int const head = nm_rxsync_prologue(kring);
925 	int ret = 0;
926 	struct mbq *q = &kring->rx_queue;
927 
928 	(void)pwait;	/* disable unused warnings */
929 
930 	if (head > lim) {
931 		netmap_ring_reinit(kring);
932 		return EINVAL;
933 	}
934 
935 	if (kring->nkr_stopped) /* check a first time without lock */
936 		return EBUSY;
937 
938 	mtx_lock(&q->lock);
939 
940 	if (kring->nkr_stopped) {  /* check again with lock held */
941 		ret = EBUSY;
942 		goto unlock_out;
943 	}
944 
945 	/* First part: import newly received packets */
946 	n = mbq_len(q);
947 	if (n) { /* grab packets from the queue */
948 		struct mbuf *m;
949 		uint32_t stop_i;
950 
951 		nm_i = kring->nr_hwtail;
952 		stop_i = nm_prev(nm_i, lim);
953 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
954 			int len = MBUF_LEN(m);
955 			struct netmap_slot *slot = &ring->slot[nm_i];
956 
957 			m_copydata(m, 0, len, BDG_NMB(na, slot));
958 			ND("nm %d len %d", nm_i, len);
959 			if (netmap_verbose)
960                                 D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL));
961 
962 			slot->len = len;
963 			slot->flags = kring->nkr_slot_flags;
964 			nm_i = nm_next(nm_i, lim);
965 		}
966 		kring->nr_hwtail = nm_i;
967 	}
968 
969 	/*
970 	 * Second part: skip past packets that userspace has released.
971 	 */
972 	nm_i = kring->nr_hwcur;
973 	if (nm_i != head) { /* something was released */
974 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
975 			ret = netmap_sw_to_nic(na);
976 		kring->nr_hwcur = head;
977 	}
978 
979 	nm_rxsync_finalize(kring);
980 
981 	/* access copies of cur,tail in the kring */
982 	if (kring->rcur == kring->rtail && td) /* no bufs available */
983 		selrecord(td, &kring->si);
984 
985 unlock_out:
986 
987 	mtx_unlock(&q->lock);
988 	return ret;
989 }
990 
991 
992 /* Get a netmap adapter for the port.
993  *
994  * If it is possible to satisfy the request, return 0
995  * with *na containing the netmap adapter found.
996  * Otherwise return an error code, with *na containing NULL.
997  *
998  * When the port is attached to a bridge, we always return
999  * EBUSY.
1000  * Otherwise, if the port is already bound to a file descriptor,
1001  * then we unconditionally return the existing adapter into *na.
1002  * In all the other cases, we return (into *na) either native,
1003  * generic or NULL, according to the following table:
1004  *
1005  *					native_support
1006  * active_fds   dev.netmap.admode         YES     NO
1007  * -------------------------------------------------------
1008  *    >0              *                 NA(ifp) NA(ifp)
1009  *
1010  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1011  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1012  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1013  *
1014  */
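/*
 * For example, with the sysctl defined earlier, "dev.netmap.admode=2"
 * (NETMAP_ADMODE_GENERIC) forces the generic adapter even on NICs with
 * native netmap support, while the default 0 (NETMAP_ADMODE_BEST)
 * prefers native and falls back to generic.
 */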
1015 
1016 int
1017 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1018 {
1019 	/* generic support */
1020 	int i = netmap_admode;	/* Take a snapshot. */
1021 	int error = 0;
1022 	struct netmap_adapter *prev_na;
1023 	struct netmap_generic_adapter *gna;
1024 
1025 	*na = NULL; /* default */
1026 
1027 	/* reset in case of invalid value */
1028 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1029 		i = netmap_admode = NETMAP_ADMODE_BEST;
1030 
1031 	if (NETMAP_CAPABLE(ifp)) {
1032 		/* If an adapter already exists, but is
1033 		 * attached to a vale port, we report that the
1034 		 * port is busy.
1035 		 */
1036 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
1037 			return EBUSY;
1038 
1039 		/* If an adapter already exists, return it if
1040 		 * there are active file descriptors or if
1041 		 * netmap is not forced to use generic
1042 		 * adapters.
1043 		 */
1044 		if (NA(ifp)->active_fds > 0 ||
1045 				i != NETMAP_ADMODE_GENERIC) {
1046 			*na = NA(ifp);
1047 			return 0;
1048 		}
1049 	}
1050 
1051 	/* If there isn't native support and netmap is not allowed
1052 	 * to use generic adapters, we cannot satisfy the request.
1053 	 */
1054 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1055 		return EOPNOTSUPP;
1056 
1057 	/* Otherwise, create a generic adapter and return it,
1058 	 * saving the previously used netmap adapter, if any.
1059 	 *
1060 	 * Note that here 'prev_na', if not NULL, MUST be a
1061 	 * native adapter, and CANNOT be a generic one. This is
1062 	 * true because generic adapters are created on demand, and
1063 	 * destroyed when not used anymore. Therefore, if the adapter
1064 	 * currently attached to an interface 'ifp' is generic, it
1065 	 * must be that
1066 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1067 	 * Consequently, if NA(ifp) is generic, we will enter one of
1068 	 * the branches above. This ensures that we never override
1069 	 * a generic adapter with another generic adapter.
1070 	 */
1071 	prev_na = NA(ifp);
1072 	error = generic_netmap_attach(ifp);
1073 	if (error)
1074 		return error;
1075 
1076 	*na = NA(ifp);
1077 	gna = (struct netmap_generic_adapter*)NA(ifp);
1078 	gna->prev = prev_na; /* save old na */
1079 	if (prev_na != NULL) {
1080 		ifunit_ref(ifp->if_xname);
1081 		// XXX add a refcount ?
1082 		netmap_adapter_get(prev_na);
1083 	}
1084 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1085 
1086 	return 0;
1087 }
1088 
1089 
1090 /*
1091  * MUST BE CALLED UNDER NMG_LOCK()
1092  *
1093  * Get a refcounted reference to a netmap adapter attached
1094  * to the interface specified by nmr.
1095  * This is always called in the execution of an ioctl().
1096  *
1097  * Return ENXIO if the interface specified by the request does
1098  * not exist, ENOTSUP if netmap is not supported by the interface,
1099  * EBUSY if the interface is already attached to a bridge,
1100  * EINVAL if parameters are invalid, ENOMEM if needed resources
1101  * could not be allocated.
1102  * If successful, hold a reference to the netmap adapter.
1103  *
1104  * No reference is kept on the real interface, which may then
1105  * disappear at any time.
1106  */
1107 int
1108 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1109 {
1110 	struct ifnet *ifp;
1111 	int error = 0;
1112 	struct netmap_adapter *ret;
1113 
1114 	*na = NULL;     /* default return value */
1115 
1116 	/* first try to see if this is a bridge port. */
1117 	NMG_LOCK_ASSERT();
1118 
1119 	error = netmap_get_bdg_na(nmr, na, create);
1120 	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1121 		return error;
1122 
1123 	ifp = ifunit_ref(nmr->nr_name);
1124 	if (ifp == NULL) {
1125 	        return ENXIO;
1126 	}
1127 
1128 	error = netmap_get_hw_na(ifp, &ret);
1129 	if (error)
1130 		goto out;
1131 
1132 	if (ret != NULL) {
1133 		/* Users cannot use the NIC attached to a bridge directly */
1134 		if (NETMAP_OWNED_BY_KERN(ret)) {
1135 			error = EBUSY;
1136 			goto out;
1137 		}
1138 		error = 0;
1139 		*na = ret;
1140 		netmap_adapter_get(ret);
1141 	}
1142 out:
1143 	if_rele(ifp);
1144 
1145 	return error;
1146 }
1147 
1148 
1149 /*
1150  * validate parameters on entry for *_txsync()
1151  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1152  * in case of error.
1153  *
1154  * rhead, rcur and rtail=hwtail are stored from previous round.
1155  * hwcur is the next packet to send to the ring.
1156  *
1157  * We want
1158  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1159  *
1160  * hwcur, rhead, rtail and hwtail are reliable
1161  */
1162 u_int
1163 nm_txsync_prologue(struct netmap_kring *kring)
1164 {
1165 	struct netmap_ring *ring = kring->ring;
1166 	u_int head = ring->head; /* read only once */
1167 	u_int cur = ring->cur; /* read only once */
1168 	u_int n = kring->nkr_num_slots;
1169 
1170 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1171 		kring->name,
1172 		kring->nr_hwcur, kring->nr_hwtail,
1173 		ring->head, ring->cur, ring->tail);
1174 #if 1 /* kernel sanity checks; but we can trust the kring. */
1175 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1176 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1177 		goto error;
1178 #endif /* kernel sanity checks */
1179 	/*
1180 	 * user sanity checks. We only use 'cur',
1181 	 * A, B, ... are possible positions for cur:
1182 	 *
1183 	 *  0    A  cur   B  tail  C  n-1
1184 	 *  0    D  tail  E  cur   F  n-1
1185 	 *
1186 	 * B, F, D are valid. A, C, E are wrong
1187 	 */
1188 	if (kring->rtail >= kring->rhead) {
1189 		/* want rhead <= head <= rtail */
1190 		if (head < kring->rhead || head > kring->rtail)
1191 			goto error;
1192 		/* and also head <= cur <= rtail */
1193 		if (cur < head || cur > kring->rtail)
1194 			goto error;
1195 	} else { /* here rtail < rhead */
1196 		/* we need head outside rtail .. rhead */
1197 		if (head > kring->rtail && head < kring->rhead)
1198 			goto error;
1199 
1200 		/* two cases now: head <= rtail or head >= rhead  */
1201 		if (head <= kring->rtail) {
1202 			/* want head <= cur <= rtail */
1203 			if (cur < head || cur > kring->rtail)
1204 				goto error;
1205 		} else { /* head >= rhead */
1206 			/* cur must be outside rtail..head */
1207 			if (cur > kring->rtail && cur < head)
1208 				goto error;
1209 		}
1210 	}
1211 	if (ring->tail != kring->rtail) {
1212 		RD(5, "tail overwritten was %d need %d",
1213 			ring->tail, kring->rtail);
1214 		ring->tail = kring->rtail;
1215 	}
1216 	kring->rhead = head;
1217 	kring->rcur = cur;
1218 	return head;
1219 
1220 error:
1221 	RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d",
1222 		kring->name,
1223 		kring->nr_hwcur,
1224 		kring->rcur, kring->nr_hwtail,
1225 		cur, ring->tail);
1226 	return n;
1227 }
1228 
1229 
1230 /*
1231  * validate parameters on entry for *_rxsync()
1232  * Returns ring->head if ok, kring->nkr_num_slots on error.
1233  *
1234  * For a valid configuration,
1235  * hwcur <= head <= cur <= tail <= hwtail
1236  *
1237  * We only consider head and cur.
1238  * hwcur and hwtail are reliable.
1239  *
1240  */
1241 u_int
1242 nm_rxsync_prologue(struct netmap_kring *kring)
1243 {
1244 	struct netmap_ring *ring = kring->ring;
1245 	uint32_t const n = kring->nkr_num_slots;
1246 	uint32_t head, cur;
1247 
1248 	ND("%s kc %d kt %d h %d c %d t %d",
1249 		kring->name,
1250 		kring->nr_hwcur, kring->nr_hwtail,
1251 		ring->head, ring->cur, ring->tail);
1252 	/*
1253 	 * Before storing the new values, we should check they do not
1254 	 * move backwards. However:
1255 	 * - head is not an issue because the previous value is hwcur;
1256 	 * - cur could in principle go back, however it does not matter
1257 	 *   because we are processing a brand new rxsync()
1258 	 */
1259 	cur = kring->rcur = ring->cur;	/* read only once */
1260 	head = kring->rhead = ring->head;	/* read only once */
1261 #if 1 /* kernel sanity checks */
1262 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1263 		goto error;
1264 #endif /* kernel sanity checks */
1265 	/* user sanity checks */
1266 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1267 		/* want hwcur <= rhead <= hwtail */
1268 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1269 			goto error;
1270 		/* and also rhead <= rcur <= hwtail */
1271 		if (cur < head || cur > kring->nr_hwtail)
1272 			goto error;
1273 	} else {
1274 		/* we need rhead outside hwtail..hwcur */
1275 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1276 			goto error;
1277 		/* two cases now: head <= hwtail or head >= hwcur  */
1278 		if (head <= kring->nr_hwtail) {
1279 			/* want head <= cur <= hwtail */
1280 			if (cur < head || cur > kring->nr_hwtail)
1281 				goto error;
1282 		} else {
1283 			/* cur must be outside hwtail..head */
1284 			if (cur < head && cur > kring->nr_hwtail)
1285 				goto error;
1286 		}
1287 	}
1288 	if (ring->tail != kring->rtail) {
1289 		RD(5, "%s tail overwritten was %d need %d",
1290 			kring->name,
1291 			ring->tail, kring->rtail);
1292 		ring->tail = kring->rtail;
1293 	}
1294 	return head;
1295 
1296 error:
1297 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1298 		kring->nr_hwcur,
1299 		kring->rcur, kring->nr_hwtail,
1300 		kring->rhead, kring->rcur, ring->tail);
1301 	return n;
1302 }
1303 
1304 
1305 /*
1306  * Error routine called when txsync/rxsync detects an error.
1307  * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
1308  * Return 1 on reinit.
1309  *
1310  * This routine is only called by the upper half of the kernel.
1311  * It only reads hwcur (which is changed only by the upper half, too)
1312  * and hwtail (which may be changed by the lower half, but only on
1313  * a tx ring and only to increase it, so any error will be recovered
1314  * on the next call). For the above, we don't strictly need to call
1315  * it under lock.
1316  */
1317 int
1318 netmap_ring_reinit(struct netmap_kring *kring)
1319 {
1320 	struct netmap_ring *ring = kring->ring;
1321 	u_int i, lim = kring->nkr_num_slots - 1;
1322 	int errors = 0;
1323 
1324 	// XXX KASSERT nm_kr_tryget
1325 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1326 	// XXX probably wrong to trust userspace
1327 	kring->rhead = ring->head;
1328 	kring->rcur  = ring->cur;
1329 	kring->rtail = ring->tail;
1330 
1331 	if (ring->cur > lim)
1332 		errors++;
1333 	if (ring->head > lim)
1334 		errors++;
1335 	if (ring->tail > lim)
1336 		errors++;
1337 	for (i = 0; i <= lim; i++) {
1338 		u_int idx = ring->slot[i].buf_idx;
1339 		u_int len = ring->slot[i].len;
1340 		if (idx < 2 || idx >= netmap_total_buffers) {
1341 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1342 			ring->slot[i].buf_idx = 0;
1343 			ring->slot[i].len = 0;
1344 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1345 			ring->slot[i].len = 0;
1346 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1347 		}
1348 	}
1349 	if (errors) {
1350 		RD(10, "total %d errors", errors);
1351 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1352 			kring->name,
1353 			ring->cur, kring->nr_hwcur,
1354 			ring->tail, kring->nr_hwtail);
1355 		ring->head = kring->rhead = kring->nr_hwcur;
1356 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1357 		ring->tail = kring->rtail = kring->nr_hwtail;
1358 	}
1359 	return (errors ? 1 : 0);
1360 }
1361 
1362 
1363 /*
1364  * Set the ring ID. For devices with a single queue, a request
1365  * for all rings is the same as a single ring.
1366  */
1367 static int
1368 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1369 {
1370 	struct netmap_adapter *na = priv->np_na;
1371 	struct ifnet *ifp = na->ifp;
1372 	u_int i = ringid & NETMAP_RING_MASK;
1373 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1374 	u_int lim = na->num_rx_rings;
1375 
1376 	if (na->num_tx_rings > lim)
1377 		lim = na->num_tx_rings;
1378 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1379 		D("invalid ring id %d", i);
1380 		return (EINVAL);
1381 	}
1382 	priv->np_ringid = ringid;
1383 	if (ringid & NETMAP_SW_RING) {
1384 		priv->np_qfirst = NETMAP_SW_RING;
1385 		priv->np_qlast = 0;
1386 	} else if (ringid & NETMAP_HW_RING) {
1387 		priv->np_qfirst = i;
1388 		priv->np_qlast = i + 1;
1389 	} else {
1390 		priv->np_qfirst = 0;
1391 		priv->np_qlast = NETMAP_HW_RING ;
1392 	}
1393 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1394     if (netmap_verbose) {
1395 	if (ringid & NETMAP_SW_RING)
1396 		D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1397 	else if (ringid & NETMAP_HW_RING)
1398 		D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1399 			priv->np_qfirst);
1400 	else
1401 		D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1402     }
1403 	return 0;
1404 }
1405 
1406 
1407 /*
1408  * possibly move the interface to netmap-mode.
1409  * If success it returns a pointer to netmap_if, otherwise NULL.
1410  * This must be called with NMG_LOCK held.
1411  */
1412 struct netmap_if *
1413 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1414 	uint16_t ringid, int *err)
1415 {
1416 	struct ifnet *ifp = na->ifp;
1417 	struct netmap_if *nifp = NULL;
1418 	int error, need_mem = 0;
1419 
1420 	NMG_LOCK_ASSERT();
1421 	/* ring configuration may have changed, fetch from the card */
1422 	netmap_update_config(na);
1423 	priv->np_na = na;     /* store the reference */
1424 	error = netmap_set_ringid(priv, ringid);
1425 	if (error)
1426 		goto out;
1427 	/* ensure allocators are ready */
1428 	need_mem = !netmap_have_memory_locked(priv);
1429 	if (need_mem) {
1430 		error = netmap_get_memory_locked(priv);
1431 		ND("get_memory returned %d", error);
1432 		if (error)
1433 			goto out;
1434 	}
1435 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1436 	if (nifp == NULL) { /* allocation failed */
1437 		/* we should drop the allocator, but only
1438 		 * if we were the ones who grabbed it
1439 		 */
1440 		error = ENOMEM;
1441 		goto out;
1442 	}
1443 	na->active_fds++;
1444 	if (ifp->if_capenable & IFCAP_NETMAP) {
1445 		/* was already set */
1446 	} else {
1447 		/* Otherwise set the card in netmap mode
1448 		 * and make it use the shared buffers.
1449 		 *
1450 		 * do not core lock because the race is harmless here,
1451 		 * there cannot be any traffic to netmap_transmit()
1452 		 */
1453 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1454 		ND("%p->na_lut == %p", na, na->na_lut);
1455 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1456 		error = na->nm_register(na, 1); /* mode on */
1457 		if (error) {
1458 			netmap_do_unregif(priv, nifp);
1459 			nifp = NULL;
1460 		}
1461 	}
1462 out:
1463 	*err = error;
1464 	if (error) {
1465 		priv->np_na = NULL;
1466 		if (need_mem)
1467 			netmap_drop_memory_locked(priv);
1468 	}
1469 	if (nifp != NULL) {
1470 		/*
1471 		 * advertise that the interface is ready by setting np_nifp.
1472 		 * The barrier is needed because readers (poll and *SYNC)
1473 		 * check for priv->np_nifp != NULL without locking
1474 		 */
1475 		wmb(); /* make sure previous writes are visible to all CPUs */
1476 		priv->np_nifp = nifp;
1477 	}
1478 	return nifp;
1479 }
1480 
1481 
1482 
1483 /*
1484  * ioctl(2) support for the "netmap" device.
1485  *
1486  * Following a list of accepted commands:
1487  * - NIOCGINFO
1488  * - SIOCGIFADDR	just for convenience
1489  * - NIOCREGIF
1490  * - NIOCTXSYNC
1491  * - NIOCRXSYNC
1492  *
1493  * Return 0 on success, errno otherwise.
1494  */
1495 int
1496 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1497 	int fflag, struct thread *td)
1498 {
1499 	struct netmap_priv_d *priv = NULL;
1500 	struct ifnet *ifp = NULL;
1501 	struct nmreq *nmr = (struct nmreq *) data;
1502 	struct netmap_adapter *na = NULL;
1503 	int error;
1504 	u_int i, lim;
1505 	struct netmap_if *nifp;
1506 	struct netmap_kring *krings;
1507 
1508 	(void)dev;	/* UNUSED */
1509 	(void)fflag;	/* UNUSED */
1510 #ifdef linux
1511 #define devfs_get_cdevpriv(pp)				\
1512 	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
1513 		(*pp ? 0 : ENOENT); })
1514 
1515 /* devfs_set_cdevpriv cannot fail on linux */
1516 #define devfs_set_cdevpriv(p, fn)				\
1517 	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
1518 
1519 
1520 #define devfs_clear_cdevpriv()	do {				\
1521 		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
1522 	} while (0)
1523 #endif /* linux */
1524 
1525 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
1526 		/* truncate name */
1527 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
1528 		if (nmr->nr_version != NETMAP_API) {
1529 			D("API mismatch for %s got %d need %d",
1530 				nmr->nr_name,
1531 				nmr->nr_version, NETMAP_API);
1532 			nmr->nr_version = NETMAP_API;
1533 			return EINVAL;
1534 		}
1535 	}
1536 	CURVNET_SET(TD_TO_VNET(td));
1537 
1538 	error = devfs_get_cdevpriv((void **)&priv);
1539 	if (error) {
1540 		CURVNET_RESTORE();
1541 		/* XXX ENOENT should be impossible, since the priv
1542 		 * is now created in the open */
1543 		return (error == ENOENT ? ENXIO : error);
1544 	}
1545 
1546 	switch (cmd) {
1547 	case NIOCGINFO:		/* return capabilities etc */
1548 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1549 			error = netmap_bdg_ctl(nmr, NULL);
1550 			break;
1551 		}
1552 
1553 		NMG_LOCK();
1554 		do {
1555 			/* memsize is always valid */
1556 			struct netmap_mem_d *nmd = &nm_mem;
1557 			u_int memflags;
1558 
1559 			if (nmr->nr_name[0] != '\0') {
1560 				/* get a refcount */
1561 				error = netmap_get_na(nmr, &na, 1 /* create */);
1562 				if (error)
1563 					break;
1564 				nmd = na->nm_mem; /* get memory allocator */
1565 			}
1566 
1567 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1568 			if (error)
1569 				break;
1570 			if (na == NULL) /* only memory info */
1571 				break;
1572 			nmr->nr_offset = 0;
1573 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1574 			netmap_update_config(na);
1575 			nmr->nr_rx_rings = na->num_rx_rings;
1576 			nmr->nr_tx_rings = na->num_tx_rings;
1577 			nmr->nr_rx_slots = na->num_rx_desc;
1578 			nmr->nr_tx_slots = na->num_tx_desc;
1579 			if (memflags & NETMAP_MEM_PRIVATE)
1580 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1581 			netmap_adapter_put(na);
1582 		} while (0);
1583 		NMG_UNLOCK();
1584 		break;
1585 
1586 	case NIOCREGIF:
1587 		/* possibly attach/detach NIC and VALE switch */
1588 		i = nmr->nr_cmd;
1589 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
1590 				|| i == NETMAP_BDG_OFFSET) {
1591 			error = netmap_bdg_ctl(nmr, NULL);
1592 			break;
1593 		} else if (i != 0) {
1594 			D("nr_cmd must be 0 not %d", i);
1595 			error = EINVAL;
1596 			break;
1597 		}
1598 
1599 		/* protect access to priv from concurrent NIOCREGIF */
1600 		NMG_LOCK();
1601 		do {
1602 			u_int memflags;
1603 
1604 			if (priv->np_na != NULL) {	/* thread already registered */
1605 				error = netmap_set_ringid(priv, nmr->nr_ringid);
1606 				break;
1607 			}
1608 			/* find the interface and a reference */
1609 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1610 			if (error)
1611 				break;
1612 			ifp = na->ifp;
1613 			if (NETMAP_OWNED_BY_KERN(na)) {
1614 				netmap_adapter_put(na);
1615 				error = EBUSY;
1616 				break;
1617 			}
1618 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1619 			if (!nifp) {    /* reg. failed, release priv and ref */
1620 				netmap_adapter_put(na);
1621 				priv->np_nifp = NULL;
1622 				break;
1623 			}
1624 
1625 			/* return the offset of the netmap_if object */
1626 			nmr->nr_rx_rings = na->num_rx_rings;
1627 			nmr->nr_tx_rings = na->num_tx_rings;
1628 			nmr->nr_rx_slots = na->num_rx_desc;
1629 			nmr->nr_tx_slots = na->num_tx_desc;
1630 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1631 			if (error) {
1632 				netmap_adapter_put(na);
1633 				break;
1634 			}
1635 			if (memflags & NETMAP_MEM_PRIVATE) {
1636 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1637 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1638 			}
1639 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1640 		} while (0);
1641 		NMG_UNLOCK();
1642 		break;
1643 
1644 	case NIOCTXSYNC:
1645 	case NIOCRXSYNC:
1646 		nifp = priv->np_nifp;
1647 
1648 		if (nifp == NULL) {
1649 			error = ENXIO;
1650 			break;
1651 		}
1652 		rmb(); /* make sure following reads are not from cache */
1653 
1654 		na = priv->np_na;      /* we have a reference */
1655 
1656 		if (na == NULL) {
1657 			D("Internal error: nifp != NULL && na == NULL");
1658 			error = ENXIO;
1659 			break;
1660 		}
1661 
1662 		ifp = na->ifp;
1663 		if (ifp == NULL) {
1664 			RD(1, "the ifp is gone");
1665 			error = ENXIO;
1666 			break;
1667 		}
1668 
1669 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1670 			if (cmd == NIOCTXSYNC)
1671 				netmap_txsync_to_host(na);
1672 			else
1673 				netmap_rxsync_from_host(na, NULL, NULL);
1674 			break;
1675 		}
1676 		/* find the last ring to scan */
1677 		lim = priv->np_qlast;
1678 		if (lim == NETMAP_HW_RING)
1679 			lim = (cmd == NIOCTXSYNC) ?
1680 			    na->num_tx_rings : na->num_rx_rings;
1681 
1682 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1683 		for (i = priv->np_qfirst; i < lim; i++) {
1684 			struct netmap_kring *kring = krings + i;
1685 			if (nm_kr_tryget(kring)) {
1686 				error = EBUSY;
1687 				goto out;
1688 			}
1689 			if (cmd == NIOCTXSYNC) {
1690 				if (netmap_verbose & NM_VERB_TXSYNC)
1691 					D("pre txsync ring %d cur %d hwcur %d",
1692 					    i, kring->ring->cur,
1693 					    kring->nr_hwcur);
1694 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1695 					netmap_ring_reinit(kring);
1696 				} else {
1697 					na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1698 				}
1699 				if (netmap_verbose & NM_VERB_TXSYNC)
1700 					D("post txsync ring %d cur %d hwcur %d",
1701 					    i, kring->ring->cur,
1702 					    kring->nr_hwcur);
1703 			} else {
1704 				na->nm_rxsync(na, i, NAF_FORCE_READ);
1705 				microtime(&na->rx_rings[i].ring->ts);
1706 			}
1707 			nm_kr_put(kring);
1708 		}
1709 
1710 		break;
1711 
1712 #ifdef __FreeBSD__
1713 	case BIOCIMMEDIATE:
1714 	case BIOCGHDRCMPLT:
1715 	case BIOCSHDRCMPLT:
1716 	case BIOCSSEESENT:
1717 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1718 		break;
1719 
1720 	default:	/* allow device-specific ioctls */
1721 	    {
1722 		struct socket so;
1723 
1724 		bzero(&so, sizeof(so));
1725 		NMG_LOCK();
1726 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1727 		if (error) {
1728 			netmap_adapter_put(na);
1729 			NMG_UNLOCK();
1730 			break;
1731 		}
1732 		ifp = na->ifp;
1733 		so.so_vnet = ifp->if_vnet;
1734 		// so->so_proto not null.
1735 		error = ifioctl(&so, cmd, data, td);
1736 		netmap_adapter_put(na);
1737 		NMG_UNLOCK();
1738 		break;
1739 	    }
1740 
1741 #else /* linux */
1742 	default:
1743 		error = EOPNOTSUPP;
1744 #endif /* linux */
1745 	}
1746 out:
1747 
1748 	CURVNET_RESTORE();
1749 	return (error);
1750 }
1751 
1752 
1753 /*
1754  * select(2) and poll(2) handlers for the "netmap" device.
1755  *
1756  * Can be called for one or more queues.
1757  * Return the event mask corresponding to ready events.
1758  * If there are no ready events, do a selrecord on either individual
1759  * selinfo or on the global one.
1760  * Device-dependent parts (locking and sync of tx/rx rings)
1761  * are done through callbacks.
1762  *
1763  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
1764  * The first one is remapped to pwait as selrecord() uses the name as a
1765  * hidden argument.
1766  */
1767 int
1768 netmap_poll(struct cdev *dev, int events, struct thread *td)
1769 {
1770 	struct netmap_priv_d *priv = NULL;
1771 	struct netmap_adapter *na;
1772 	struct ifnet *ifp;
1773 	struct netmap_kring *kring;
1774 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1775 	u_int lim_tx, lim_rx;
1776 	struct mbq q;		/* packets from hw queues to host stack */
1777 	void *pwait = dev;	/* linux compatibility */
1778 
1779 	/*
1780 	 * In order to avoid nested locks, we need to "double check"
1781 	 * txsync and rxsync if we decide to do a selrecord().
1782 	 * retry_tx (and retry_rx, later) prevent looping forever.
1783 	 */
1784 	int retry_tx = 1, retry_rx = 1;
1785 
1786 	(void)pwait;
1787 	mbq_init(&q);
1788 
1789 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1790 		return POLLERR;
1791 
1792 	if (priv->np_nifp == NULL) {
1793 		D("No if registered");
1794 		return POLLERR;
1795 	}
1796 	rmb(); /* make sure following reads are not from cache */
1797 
1798 	na = priv->np_na;
1799 	ifp = na->ifp;
1800 	// check for deleted
1801 	if (ifp == NULL) {
1802 		RD(1, "the ifp is gone");
1803 		return POLLERR;
1804 	}
1805 
1806 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1807 		return POLLERR;
1808 
1809 	if (netmap_verbose & 0x8000)
1810 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1811 	want_tx = events & (POLLOUT | POLLWRNORM);
1812 	want_rx = events & (POLLIN | POLLRDNORM);
1813 
1814 	lim_tx = na->num_tx_rings;
1815 	lim_rx = na->num_rx_rings;
1816 
1817 	if (priv->np_qfirst == NETMAP_SW_RING) {
1818 		// XXX locking ?
1819 		/* handle the host stack ring */
1820 		if (priv->np_txpoll || want_tx) {
1821 			/* push any packets up, then we are always ready */
1822 			netmap_txsync_to_host(na);
1823 			revents |= want_tx;
1824 		}
1825 		if (want_rx) {
1826 			kring = &na->rx_rings[lim_rx];
1827 			/* XXX replace with rxprologue etc. */
1828 			if (nm_ring_empty(kring->ring))
1829 				netmap_rxsync_from_host(na, td, dev);
1830 			if (!nm_ring_empty(kring->ring))
1831 				revents |= want_rx;
1832 		}
1833 		return (revents);
1834 	}
1835 
1836 
1837 	/*
1838 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1839 	 * the file descriptor is bound to all of them. If so, we sleep on
1840 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1841 	 * (FreeBSD only allows two selinfo's per file descriptor).
1842 	 * The interrupt routine in the driver wakes one or the other
1843 	 * (or both) depending on which clients are active.
1844 	 *
1845 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1846 	 * txsync() is called if we run out of buffers on POLLOUT, or
1847 	 * there are pending packets to send. The latter can be disabled
1848 	 * by passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
1849 	 */
1850 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1851 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1852 
1853 	if (priv->np_qlast != NETMAP_HW_RING) {
1854 		lim_tx = lim_rx = priv->np_qlast;
1855 	}
1856 
1857 	/*
1858 	 * We start with a lock-free round, which is cheap if we have
1859 	 * slots available. If this fails, then lock and call the sync
1860 	 * routines.
1861 	 */
1862 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1863 		kring = &na->rx_rings[i];
1864 		/* XXX compare ring->cur and kring->tail */
1865 		if (!nm_ring_empty(kring->ring)) {
1866 			revents |= want_rx;
1867 			want_rx = 0;	/* also breaks the loop */
1868 		}
1869 	}
1870 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1871 		kring = &na->tx_rings[i];
1872 		/* XXX compare ring->cur and kring->tail */
1873 		if (!nm_ring_empty(kring->ring)) {
1874 			revents |= want_tx;
1875 			want_tx = 0;	/* also breaks the loop */
1876 		}
1877 	}
1878 
1879 	/*
1880 	 * If we want to push packets out (priv->np_txpoll) or
1881 	 * want_tx is still set, we must issue txsync calls
1882 	 * (on all rings, to avoid stalling the tx rings).
1883 	 * XXX should also check cur != hwcur on the tx rings.
1884 	 * Fortunately, normal tx mode has np_txpoll set.
1885 	 */
1886 	if (priv->np_txpoll || want_tx) {
1887 		/*
1888 		 * The first round checks if anyone is ready, if not
1889 		 * do a selrecord and another round to handle races.
1890 		 * want_tx goes to 0 if any space is found, and is
1891 		 * used to skip rings with no pending transmissions.
1892 		 */
1893 flush_tx:
1894 		for (i = priv->np_qfirst; i < lim_tx; i++) {
1895 			int found = 0;
1896 
1897 			kring = &na->tx_rings[i];
1898 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1899 				continue;
1900 			/* only one thread does txsync */
1901 			if (nm_kr_tryget(kring)) {
1902 				D("%p lost race on txring %d, ok", priv, i);
1903 				continue;
1904 			}
1905 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
1906 				netmap_ring_reinit(kring);
1907 				revents |= POLLERR;
1908 			} else {
1909 				if (na->nm_txsync(na, i, 0))
1910 					revents |= POLLERR;
1911 			}
1912 
1913 			/*
1914 			 * If we found new slots, notify potential
1915 			 * listeners on the same ring.
1916 			 * Since we just did a txsync, look at the copies
1917 			 * of cur,tail in the kring.
1918 			 */
1919 			found = kring->rcur != kring->rtail;
1920 			nm_kr_put(kring);
1921 			if (found) { /* notify other listeners */
1922 				revents |= want_tx;
1923 				want_tx = 0;
1924 				na->nm_notify(na, i, NR_TX, NAF_GLOBAL_NOTIFY);
1925 			}
1926 		}
1927 		if (want_tx && retry_tx) {
1928 			selrecord(td, check_all_tx ?
1929 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
1930 			retry_tx = 0;
1931 			goto flush_tx;
1932 		}
1933 	}
1934 
1935 	/*
1936 	 * If want_rx is still set scan receive rings.
1937 	 * Do it on all rings because otherwise we starve.
1938 	 */
1939 	if (want_rx) {
1940 		int send_down = 0; /* transparent mode */
1941 		/* two rounds here for race avoidance */
1942 do_retry_rx:
1943 		for (i = priv->np_qfirst; i < lim_rx; i++) {
1944 			int found = 0;
1945 
1946 			kring = &na->rx_rings[i];
1947 
1948 			if (nm_kr_tryget(kring)) {
1949 				D("%p lost race on rxring %d, ok", priv, i);
1950 				continue;
1951 			}
1952 
1953 			/*
1954 			 * transparent mode support: collect packets
1955 			 * from the rxring(s).
1956 			 * XXX NR_FORWARD should only be read on
1957 			 * physical or NIC ports
1958 			 */
1959 			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
1960 				ND(10, "forwarding some buffers up %d to %d",
1961 				    kring->nr_hwcur, kring->ring->cur);
1962 				netmap_grab_packets(kring, &q, netmap_fwd);
1963 			}
1964 
1965 			if (na->nm_rxsync(na, i, 0))
1966 				revents |= POLLERR;
1967 			if (netmap_no_timestamp == 0 ||
1968 					kring->ring->flags & NR_TIMESTAMP) {
1969 				microtime(&kring->ring->ts);
1970 			}
1971 			/* after an rxsync we can use kring->rcur, rtail */
1972 			found = kring->rcur != kring->rtail;
1973 			nm_kr_put(kring);
1974 			if (found) {
1975 				revents |= want_rx;
1976 				retry_rx = 0;
1977 				na->nm_notify(na, i, NR_RX, NAF_GLOBAL_NOTIFY);
1978 			}
1979 		}
1980 
1981 		/* transparent mode XXX only during first pass ? */
1982 		kring = &na->rx_rings[lim_rx];
1983 		if (check_all_rx
1984 		    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
1985 			/* XXX fix to use kring fields */
1986 			if (nm_ring_empty(kring->ring))
1987 				send_down = netmap_rxsync_from_host(na, td, dev);
1988 			if (!nm_ring_empty(kring->ring))
1989 				revents |= want_rx;
1990 		}
1991 
1992 		if (retry_rx)
1993 			selrecord(td, check_all_rx ?
1994 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
1995 		if (send_down > 0 || retry_rx) {
1996 			retry_rx = 0;
1997 			if (send_down)
1998 				goto flush_tx; /* and retry_rx */
1999 			else
2000 				goto do_retry_rx;
2001 		}
2002 	}
2003 
2004 	/*
2005 	 * Transparent mode: marked bufs on rx rings between
2006 	 * kring->nr_hwcur and ring->head
2007 	 * are passed to the other endpoint.
2008 	 *
2009 	 * In this mode we also scan the sw rxring, which in
2010 	 * turn passes packets up.
2011 	 *
2012 	 * XXX Transparent mode at the moment requires binding all
2013 	 * rings to a single file descriptor.
2014 	 */
2015 
2016 	if (q.head)
2017 		netmap_send_up(na->ifp, &q);
2018 
2019 	return (revents);
2020 }
2021 
2022 
2023 /*-------------------- driver support routines -------------------*/
2024 
2025 static int netmap_hw_krings_create(struct netmap_adapter *);
2026 
2027 static int
2028 netmap_notify(struct netmap_adapter *na, u_int n_ring,
2029 	enum txrx tx, int flags)
2030 {
2031 	struct netmap_kring *kring;
2032 
2033 	if (tx == NR_TX) {
2034 		kring = na->tx_rings + n_ring;
2035 		selwakeuppri(&kring->si, PI_NET);
2036 		if (flags & NAF_GLOBAL_NOTIFY)
2037 			selwakeuppri(&na->tx_si, PI_NET);
2038 	} else {
2039 		kring = na->rx_rings + n_ring;
2040 		selwakeuppri(&kring->si, PI_NET);
2041 		if (flags & NAF_GLOBAL_NOTIFY)
2042 			selwakeuppri(&na->rx_si, PI_NET);
2043 	}
2044 	return 0;
2045 }
2046 
2047 
2048 // XXX check handling of failures
2049 int
2050 netmap_attach_common(struct netmap_adapter *na)
2051 {
2052 	struct ifnet *ifp = na->ifp;
2053 
2054 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2055 		D("%s: invalid rings tx %d rx %d",
2056 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
2057 		return EINVAL;
2058 	}
2059 	WNA(ifp) = na;
2060 
2061 	/* the following is only needed for adapters that use the host port.
2062 	 * XXX do we have something similar for linux ?
2063 	 */
2064 #ifdef __FreeBSD__
2065 	na->if_input = ifp->if_input; /* for netmap_send_up */
2066 #endif /* __FreeBSD__ */
2067 
2068 	NETMAP_SET_CAPABLE(ifp);
2069 	if (na->nm_krings_create == NULL) {
2070 		na->nm_krings_create = netmap_hw_krings_create;
2071 		na->nm_krings_delete = netmap_hw_krings_delete;
2072 	}
2073 	if (na->nm_notify == NULL)
2074 		na->nm_notify = netmap_notify;
2075 	na->active_fds = 0;
2076 
2077 	if (na->nm_mem == NULL)
2078 		na->nm_mem = &nm_mem;
2079 	return 0;
2080 }
2081 
2082 
2083 void
2084 netmap_detach_common(struct netmap_adapter *na)
2085 {
2086 	if (na->ifp)
2087 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2088 
2089 	if (na->tx_rings) { /* XXX should not happen */
2090 		D("freeing leftover tx_rings");
2091 		na->nm_krings_delete(na);
2092 	}
2093 	if (na->na_flags & NAF_MEM_OWNER)
2094 		netmap_mem_private_delete(na->nm_mem);
2095 	bzero(na, sizeof(*na));
2096 	free(na, M_DEVBUF);
2097 }
2098 
2099 
2100 /*
2101  * Initialize a ``netmap_adapter`` object created by a driver at attach time.
2102  * We allocate a block of memory with room for a struct netmap_adapter
2103  * plus two sets of N+2 struct netmap_kring (where N is the number
2104  * of hardware rings):
2105  * krings	0..N-1	are for the hardware queues.
2106  * kring	N	is for the host stack queue
2107  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2108  * Return 0 on success, EINVAL or ENOMEM on failure.
2109  */
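/*
 * Illustrative sketch (not compiled): how a hypothetical driver's
 * attach path might describe its rings and register them with netmap.
 * The foo_* softc fields and callbacks are assumptions; real drivers
 * also implement nm_register() to switch the NIC in and out of
 * netmap mode.
 */
#if 0
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_register = foo_netmap_reg;
	na.nm_txsync = foo_netmap_txsync;
	na.nm_rxsync = foo_netmap_rxsync;
	netmap_attach(&na);	/* netmap_attach() copies 'na' into its own hwna */
}
#endif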
2110 int
2111 netmap_attach(struct netmap_adapter *arg)
2112 {
2113 	struct netmap_hw_adapter *hwna = NULL;
2114 	// XXX when is arg == NULL ?
2115 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2116 
2117 	if (arg == NULL || ifp == NULL)
2118 		goto fail;
2119 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2120 	if (hwna == NULL)
2121 		goto fail;
2122 	hwna->up = *arg;
2123 	if (netmap_attach_common(&hwna->up)) {
2124 		free(hwna, M_DEVBUF);
2125 		goto fail;
2126 	}
2127 	netmap_adapter_get(&hwna->up);
2128 
2129 #ifdef linux
2130 	if (ifp->netdev_ops) {
2131 		/* prepare a clone of the netdev ops */
2132 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2133 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2134 #else
2135 		hwna->nm_ndo = *ifp->netdev_ops;
2136 #endif
2137 	}
2138 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2139 #endif /* linux */
2140 
2141 	D("success for %s", NM_IFPNAME(ifp));
2142 	return 0;
2143 
2144 fail:
2145 	D("fail, arg %p ifp %p hwna %p", arg, ifp, hwna);
2146 	if (ifp != NULL)	/* ifp may be NULL if the argument checks above failed */
		netmap_detach(ifp);
2147 	return (hwna ? EINVAL : ENOMEM);
2148 }
2149 
2150 
2151 void
2152 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2153 {
2154 	if (!na) {
2155 		return;
2156 	}
2157 
2158 	refcount_acquire(&na->na_refcount);
2159 }
2160 
2161 
2162 /* returns 1 iff the netmap_adapter is destroyed */
2163 int
2164 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2165 {
2166 	if (!na)
2167 		return 1;
2168 
2169 	if (!refcount_release(&na->na_refcount))
2170 		return 0;
2171 
2172 	if (na->nm_dtor)
2173 		na->nm_dtor(na);
2174 
2175 	netmap_detach_common(na);
2176 
2177 	return 1;
2178 }
2179 
2180 
2181 int
2182 netmap_hw_krings_create(struct netmap_adapter *na)
2183 {
2184 	int ret = netmap_krings_create(na,
2185 		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
2186 	if (ret == 0) {
2187 		/* initialize the mbq for the sw rx ring */
2188 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2189 		ND("initialized sw rx queue %d", na->num_rx_rings);
2190 	}
2191 	return ret;
2192 }
2193 
2194 
2195 
2196 /*
2197  * Free the allocated memory linked to the given ``netmap_adapter``
2198  * object.
2199  */
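/*
 * Illustrative sketch (not compiled): a hypothetical driver detach
 * path releasing the netmap state before tearing down the ifnet.
 */
#if 0
static int
foo_detach(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);

	netmap_detach(sc->ifp);
	ether_ifdetach(sc->ifp);
	/* ... stop the NIC and release driver resources ... */
	return (0);
}
#endif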
2200 void
2201 netmap_detach(struct ifnet *ifp)
2202 {
2203 	struct netmap_adapter *na = NA(ifp);
2204 
2205 	if (!na)
2206 		return;
2207 
2208 	NMG_LOCK();
2209 	netmap_disable_all_rings(ifp);
2210 	if (!netmap_adapter_put(na)) {
2211 		/* someone is still using the adapter,
2212 		 * tell them that the interface is gone
2213 		 */
2214 		na->ifp = NULL;
2215 		/* give them a chance to notice */
2216 		netmap_enable_all_rings(ifp);
2217 	}
2218 	NMG_UNLOCK();
2219 }
2220 
2221 
2222 /*
2223  * Intercept packets from the network stack and pass them
2224  * to netmap as incoming packets on the 'software' ring.
2225  *
2226  * We only store packets in a bounded mbq and then copy them
2227  * in the relevant rxsync routine.
2228  *
2229  * We rely on the OS to make sure that the ifp and na do not go
2230  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2231  * In nm_register() or whenever there is a reinitialization,
2232  * we make sure to make the mode change visible here.
2233  */
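/*
 * Illustrative sketch (not compiled): a hypothetical driver transmit
 * hook diverting host-stack packets to the netmap host rx ring while
 * the NIC is in netmap mode. foo_legacy_transmit() stands in for the
 * driver's normal transmit path.
 */
#if 0
static int
foo_transmit(struct ifnet *ifp, struct mbuf *m)
{
	if (ifp->if_capenable & IFCAP_NETMAP)
		return (netmap_transmit(ifp, m));	/* enqueues or frees m */
	return (foo_legacy_transmit(ifp, m));
}
#endif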
2234 int
2235 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2236 {
2237 	struct netmap_adapter *na = NA(ifp);
2238 	struct netmap_kring *kring;
2239 	u_int len = MBUF_LEN(m);
2240 	u_int error = ENOBUFS;
2241 	struct mbq *q;
2242 	int space;
2243 
2244 	// XXX [Linux] we do not need this lock
2245 	// if we follow the down/configure/up protocol -gl
2246 	// mtx_lock(&na->core_lock);
2247 
2248 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2249 		D("%s not in netmap mode anymore", NM_IFPNAME(ifp));
2250 		error = ENXIO;
2251 		goto done;
2252 	}
2253 
2254 	kring = &na->rx_rings[na->num_rx_rings];
2255 	q = &kring->rx_queue;
2256 
2257 	// XXX reconsider long packets if we handle fragments
2258 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2259 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2260 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2261 		goto done;
2262 	}
2263 
2264 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2265 	 * and maybe other instances of netmap_transmit (the latter
2266 	 * not possible on Linux).
2267 	 * Also avoid overflowing the queue.
2268 	 */
2269 	mtx_lock(&q->lock);
2270 
2271 	space = kring->nr_hwtail - kring->nr_hwcur;
2272 	if (space < 0)
2273 		space += kring->nkr_num_slots;
2274 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2275 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2276 			 NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2277 			len, m);
2278 	} else {
2279 		mbq_enqueue(q, m);
2280 		ND(10, "%s %d bufs in queue len %d m %p",
2281 			NM_IFPNAME(ifp), mbq_len(q), len, m);
2282 		/* notify outside the lock */
2283 		m = NULL;
2284 		error = 0;
2285 	}
2286 	mtx_unlock(&q->lock);
2287 
2288 done:
2289 	if (m)
2290 		m_freem(m);
2291 	/* unconditionally wake up listeners */
2292 	na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2293 
2294 	return (error);
2295 }
2296 
2297 
2298 /*
2299  * netmap_reset() is called by the driver routines when reinitializing
2300  * a ring. The driver is in charge of locking to protect the kring.
2301  * If native netmap mode is not set just return NULL.
2302  */
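/*
 * Illustrative sketch (not compiled): a hypothetical ring-init routine
 * asking netmap which buffers to program into the NIC. PNMB() yields
 * the buffer addresses; real drivers also translate indexes with
 * netmap_idx_n2k() when the kring has a non-zero offset.
 */
#if 0
static void
foo_init_rx_ring(struct foo_rx_ring *rxr)
{
	struct netmap_adapter *na = NA(rxr->ifp);
	struct netmap_slot *slot;
	uint64_t paddr;
	u_int j;

	slot = netmap_reset(na, NR_RX, rxr->me, 0);
	if (slot == NULL)	/* not in native netmap mode */
		return;
	for (j = 0; j < na->num_rx_desc; j++) {
		PNMB(slot + j, &paddr);
		/* program hardware rx descriptor j with paddr */
	}
}
#endif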
2303 struct netmap_slot *
2304 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2305 	u_int new_cur)
2306 {
2307 	struct netmap_kring *kring;
2308 	int new_hwofs, lim;
2309 
2310 	if (na == NULL) {
2311 		D("NULL na, should not happen");
2312 		return NULL;	/* no netmap support here */
2313 	}
2314 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
2315 		ND("interface not in netmap mode");
2316 		return NULL;	/* nothing to reinitialize */
2317 	}
2318 
2319 	/* XXX note- in the new scheme, we are not guaranteed to be
2320 	 * under lock (e.g. when called on a device reset).
2321 	 * In this case, we should set a flag and not trust the
2322 	 * values too much. In practice: TODO
2323 	 * - set a RESET flag somewhere in the kring
2324 	 * - do the processing in a conservative way
2325 	 * - let the *sync() fixup at the end.
2326 	 */
2327 	if (tx == NR_TX) {
2328 		if (n >= na->num_tx_rings)
2329 			return NULL;
2330 		kring = na->tx_rings + n;
2331 		// XXX check whether we should use hwcur or rcur
2332 		new_hwofs = kring->nr_hwcur - new_cur;
2333 	} else {
2334 		if (n >= na->num_rx_rings)
2335 			return NULL;
2336 		kring = na->rx_rings + n;
2337 		new_hwofs = kring->nr_hwtail - new_cur;
2338 	}
2339 	lim = kring->nkr_num_slots - 1;
2340 	if (new_hwofs > lim)
2341 		new_hwofs -= lim + 1;
2342 
2343 	/* Always set the new offset value and realign the ring. */
2344 	if (netmap_verbose)
2345 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2346 		NM_IFPNAME(na->ifp),
2347 		tx == NR_TX ? "TX" : "RX", n,
2348 		kring->nkr_hwofs, new_hwofs,
2349 		kring->nr_hwtail,
2350 		tx == NR_TX ? lim : kring->nr_hwtail);
2351 	kring->nkr_hwofs = new_hwofs;
2352 	if (tx == NR_TX) {
2353 		kring->nr_hwtail = kring->nr_hwcur + lim;
2354 		if (kring->nr_hwtail > lim)
2355 			kring->nr_hwtail -= lim + 1;
2356 	}
2357 
2358 #if 0 // def linux
2359 	/* XXX check that the mappings are correct */
2360 	/* need ring_nr, adapter->pdev, direction */
2361 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
2362 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
2363 		D("error mapping rx netmap buffer %d", i);
2364 		// XXX fix error handling
2365 	}
2366 
2367 #endif /* linux */
2368 	/*
2369 	 * Wakeup on the individual and global selwait
2370 	 * We do the wakeup here, but the ring is not yet reconfigured.
2371 	 * However, we are under lock so there are no races.
2372 	 */
2373 	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2374 	return kring->ring->slot;
2375 }
2376 
2377 
2378 /*
2379  * Dispatch rx/tx interrupts to the netmap rings.
2380  *
2381  * "work_done" is non-null on the RX path, NULL for the TX path.
2382  * We rely on the OS to make sure that there is only one active
2383  * instance per queue, and that there is appropriate locking.
2384  *
2385  * The 'notify' routine depends on what the ring is attached to.
2386  * - for a netmap file descriptor, do a selwakeup on the individual
2387  *   waitqueue, plus one on the global one if needed
2388  * - for a switch, call the proper forwarding routine
2389  * - XXX more ?
2390  */
2391 void
2392 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2393 {
2394 	struct netmap_adapter *na = NA(ifp);
2395 	struct netmap_kring *kring;
2396 
2397 	q &= NETMAP_RING_MASK;
2398 
2399 	if (netmap_verbose) {
2400 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2401 	}
2402 
2403 	if (work_done) { /* RX path */
2404 		if (q >= na->num_rx_rings)
2405 			return;	// not a physical queue
2406 		kring = na->rx_rings + q;
2407 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2408 		na->nm_notify(na, q, NR_RX,
2409 			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2410 		*work_done = 1; /* do not fire napi again */
2411 	} else { /* TX path */
2412 		if (q >= na->num_tx_rings)
2413 			return;	// not a physical queue
2414 		kring = na->tx_rings + q;
2415 		na->nm_notify(na, q, NR_TX,
2416 			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2417 	}
2418 }
2419 
2420 
2421 /*
2422  * Default functions to handle rx/tx interrupts from a physical device.
2423  * "work_done" is non-null on the RX path, NULL for the TX path.
2424  *
2425  * If the card is not in netmap mode, simply return 0,
2426  * so that the caller proceeds with regular processing.
2427  * Otherwise call netmap_common_irq() and return 1.
2428  *
2429  * If the card is connected to a netmap file descriptor,
2430  * do a selwakeup on the individual queue, plus one on the global one
2431  * if needed (multiqueue card _and_ there are multiqueue listeners),
2432  * and return 1.
2433  *
2434  * Finally, if called on rx from an interface connected to a switch,
2435  * it calls the proper forwarding routine and returns 1.
2436  */
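/*
 * Illustrative sketch (not compiled): how a hypothetical driver's
 * per-queue rx cleanup routine hands the interrupt to netmap and
 * skips the regular mbuf processing when netmap consumed the event.
 * The tx completion path is analogous, with work_done == NULL.
 */
#if 0
static void
foo_rxeof(struct foo_rx_ring *rxr)
{
	u_int work_done;

	if (netmap_rx_irq(rxr->ifp, rxr->me, &work_done))
		return;		/* netmap listeners have been woken up */
	/* ... regular mbuf receive processing ... */
}
#endif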
2437 int
2438 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2439 {
2440 	// XXX could we check NAF_NATIVE_ON ?
2441 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2442 		return 0;
2443 
2444 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2445 		ND("use regular interrupt");
2446 		return 0;
2447 	}
2448 
2449 	netmap_common_irq(ifp, q, work_done);
2450 	return 1;
2451 }
2452 
2453 
2454 /*
2455  * Module loader and unloader
2456  *
2457  * netmap_init() creates the /dev/netmap device and initializes
2458  * all global variables. Returns 0 on success, errno on failure
2459  * (though in practice this is not expected to fail).
2460  *
2461  * netmap_fini() destroys everything.
2462  */
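/*
 * Illustrative sketch (not compiled): typical FreeBSD module glue
 * driving the two entry points; the actual handler lives in the
 * OS-specific code (e.g. netmap_freebsd.c).
 */
#if 0
static int
netmap_loader(__unused struct module *module, int event, __unused void *arg)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		error = netmap_init();
		break;
	case MOD_UNLOAD:
		netmap_fini();
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

DEV_MODULE(netmap, netmap_loader, NULL);
#endif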
2463 
2464 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2465 extern struct cdevsw netmap_cdevsw;
2466 
2467 
2468 void
2469 netmap_fini(void)
2470 {
2471 	// XXX destroy_bridges() ?
2472 	if (netmap_dev)
2473 		destroy_dev(netmap_dev);
2474 	netmap_mem_fini();
2475 	NMG_LOCK_DESTROY();
2476 	printf("netmap: unloaded module.\n");
2477 }
2478 
2479 
2480 int
2481 netmap_init(void)
2482 {
2483 	int error;
2484 
2485 	NMG_LOCK_INIT();
2486 
2487 	error = netmap_mem_init();
2488 	if (error != 0)
2489 		goto fail;
2490 	/* XXX could use make_dev_credv() to get error number */
2491 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2492 			      "netmap");
2493 	if (!netmap_dev)
2494 		goto fail;
2495 
2496 	netmap_init_bridges();
2497 	printf("netmap: loaded module\n");
2498 	return (0);
2499 fail:
2500 	netmap_fini();
2501 	return (EINVAL); /* may be incorrect */
2502 }
2503