xref: /freebsd/sys/dev/netmap/netmap.c (revision 68b8534bdfeb5078e84d668124e7585e43b03502)
1*68b8534bSLuigi Rizzo /*
2*68b8534bSLuigi Rizzo  * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved.
3*68b8534bSLuigi Rizzo  *
4*68b8534bSLuigi Rizzo  * Redistribution and use in source and binary forms, with or without
5*68b8534bSLuigi Rizzo  * modification, are permitted provided that the following conditions
6*68b8534bSLuigi Rizzo  * are met:
7*68b8534bSLuigi Rizzo  * 1. Redistributions of source code must retain the above copyright
8*68b8534bSLuigi Rizzo  *    notice, this list of conditions and the following disclaimer.
9*68b8534bSLuigi Rizzo  * 2. Redistributions in binary form must reproduce the above copyright
10*68b8534bSLuigi Rizzo  *    notice, this list of conditions and the following disclaimer in the
11*68b8534bSLuigi Rizzo  *    documentation and/or other materials provided with the distribution.
12*68b8534bSLuigi Rizzo  *
13*68b8534bSLuigi Rizzo  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14*68b8534bSLuigi Rizzo  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15*68b8534bSLuigi Rizzo  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16*68b8534bSLuigi Rizzo  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17*68b8534bSLuigi Rizzo  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18*68b8534bSLuigi Rizzo  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19*68b8534bSLuigi Rizzo  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20*68b8534bSLuigi Rizzo  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21*68b8534bSLuigi Rizzo  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22*68b8534bSLuigi Rizzo  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23*68b8534bSLuigi Rizzo  * SUCH DAMAGE.
24*68b8534bSLuigi Rizzo  */
25*68b8534bSLuigi Rizzo 
26*68b8534bSLuigi Rizzo /*
27*68b8534bSLuigi Rizzo  * $FreeBSD$
28*68b8534bSLuigi Rizzo  * $Id: netmap.c 9662 2011-11-16 13:18:06Z luigi $
29*68b8534bSLuigi Rizzo  *
30*68b8534bSLuigi Rizzo  * This module supports memory mapped access to network devices,
31*68b8534bSLuigi Rizzo  * see netmap(4).
32*68b8534bSLuigi Rizzo  *
33*68b8534bSLuigi Rizzo  * The module uses a large memory pool allocated by the kernel
34*68b8534bSLuigi Rizzo  * and accessible as mmapped memory by multiple userspace threads/processes.
35*68b8534bSLuigi Rizzo  * The memory pool contains packet buffers and "netmap rings",
36*68b8534bSLuigi Rizzo  * i.e. user-accessible copies of the interface's queues.
37*68b8534bSLuigi Rizzo  *
38*68b8534bSLuigi Rizzo  * Access to the network card works like this:
39*68b8534bSLuigi Rizzo  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40*68b8534bSLuigi Rizzo  *    select()able file descriptors on which events are reported.
41*68b8534bSLuigi Rizzo  * 2. on each descriptor, the process issues an ioctl() to identify
42*68b8534bSLuigi Rizzo  *    the interface that should report events to the file descriptor.
43*68b8534bSLuigi Rizzo  * 3. on each descriptor, the process issues an mmap() request to
44*68b8534bSLuigi Rizzo  *    map the shared memory region within the process' address space.
45*68b8534bSLuigi Rizzo  *    The list of interesting queues is indicated by a location in
46*68b8534bSLuigi Rizzo  *    the shared memory region.
47*68b8534bSLuigi Rizzo  * 4. using the functions in the netmap(4) userspace API, a process
48*68b8534bSLuigi Rizzo  *    can look up the occupation state of a queue, access memory buffers,
49*68b8534bSLuigi Rizzo  *    and retrieve received packets or enqueue packets to transmit.
50*68b8534bSLuigi Rizzo  * 5. using some ioctl()s the process can synchronize the userspace view
51*68b8534bSLuigi Rizzo  *    of the queue with the actual status in the kernel. This includes both
52*68b8534bSLuigi Rizzo  *    receiving the notification of new packets, and transmitting new
53*68b8534bSLuigi Rizzo  *    packets on the output interface.
54*68b8534bSLuigi Rizzo  * 6. select() or poll() can be used to wait for events on individual
55*68b8534bSLuigi Rizzo  *    transmit or receive queues (or all queues for a given interface).
56*68b8534bSLuigi Rizzo  */
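
/*
 * A minimal userspace sketch of steps 1-6 above (illustrative only:
 * "em0" is a placeholder, error handling is omitted, and the sketch
 * assumes the nmreq/netmap_if/netmap_ring layouts declared in
 * <net/netmap.h> at this revision):
 *
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <net/netmap.h>
 *
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *	struct nmreq req;
 *	memset(&req, 0, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);			// step 2, all hw rings
 *	char *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	struct netmap_if *nifp =
 *	    (struct netmap_if *)(mem + req.nr_offset);
 *	// step 4: ring_ofs[] lists the tx ring offsets first, then the
 *	// rx ones, so hw rx ring 0 sits at index nr_numrings + 1.
 *	struct netmap_ring *rxr = (struct netmap_ring *)
 *	    ((char *)nifp + nifp->ring_ofs[req.nr_numrings + 1]);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	for (;;) {
 *		poll(&pfd, 1, -1);			// step 6
 *		while (rxr->avail > 0) {		// step 5
 *			struct netmap_slot *slot = &rxr->slot[rxr->cur];
 *			// payload: slot->len bytes in the buffer reached
 *			// via rxr->buf_ofs and slot->buf_idx
 *			rxr->cur = (rxr->cur + 1) % rxr->num_slots;
 *			rxr->avail--;
 *		}
 *	}
 */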
57*68b8534bSLuigi Rizzo 
58*68b8534bSLuigi Rizzo #include <sys/cdefs.h> /* prerequisite */
59*68b8534bSLuigi Rizzo __FBSDID("$FreeBSD$");
60*68b8534bSLuigi Rizzo 
61*68b8534bSLuigi Rizzo #include <sys/types.h>
62*68b8534bSLuigi Rizzo #include <sys/module.h>
63*68b8534bSLuigi Rizzo #include <sys/errno.h>
64*68b8534bSLuigi Rizzo #include <sys/param.h>	/* defines used in kernel.h */
65*68b8534bSLuigi Rizzo #include <sys/kernel.h>	/* types used in module initialization */
66*68b8534bSLuigi Rizzo #include <sys/conf.h>	/* cdevsw struct */
67*68b8534bSLuigi Rizzo #include <sys/uio.h>	/* uio struct */
68*68b8534bSLuigi Rizzo #include <sys/sockio.h>
69*68b8534bSLuigi Rizzo #include <sys/socketvar.h>	/* struct socket */
70*68b8534bSLuigi Rizzo #include <sys/malloc.h>
71*68b8534bSLuigi Rizzo #include <sys/mman.h>	/* PROT_EXEC */
72*68b8534bSLuigi Rizzo #include <sys/poll.h>
73*68b8534bSLuigi Rizzo #include <vm/vm.h>	/* vtophys */
74*68b8534bSLuigi Rizzo #include <vm/pmap.h>	/* vtophys */
75*68b8534bSLuigi Rizzo #include <sys/socket.h> /* sockaddrs */
76*68b8534bSLuigi Rizzo #include <machine/bus.h>
77*68b8534bSLuigi Rizzo #include <sys/selinfo.h>
78*68b8534bSLuigi Rizzo #include <sys/sysctl.h>
79*68b8534bSLuigi Rizzo #include <net/if.h>
80*68b8534bSLuigi Rizzo #include <net/bpf.h>		/* BIOCIMMEDIATE */
81*68b8534bSLuigi Rizzo #include <net/netmap.h>
82*68b8534bSLuigi Rizzo #include <dev/netmap/netmap_kern.h>
83*68b8534bSLuigi Rizzo #include <machine/bus.h>	/* bus_dmamap_* */
84*68b8534bSLuigi Rizzo 
85*68b8534bSLuigi Rizzo MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
86*68b8534bSLuigi Rizzo 
87*68b8534bSLuigi Rizzo /*
88*68b8534bSLuigi Rizzo  * lock and unlock for the netmap memory allocator
89*68b8534bSLuigi Rizzo  */
90*68b8534bSLuigi Rizzo #define NMA_LOCK()	mtx_lock(&netmap_mem_d->nm_mtx);
91*68b8534bSLuigi Rizzo #define NMA_UNLOCK()	mtx_unlock(&netmap_mem_d->nm_mtx);
92*68b8534bSLuigi Rizzo 
93*68b8534bSLuigi Rizzo /*
94*68b8534bSLuigi Rizzo  * Default amount of memory pre-allocated by the module.
95*68b8534bSLuigi Rizzo  * We start with a large size and then shrink our demand
96*68b8534bSLuigi Rizzo  * according to what is available when the module is loaded.
97*68b8534bSLuigi Rizzo  * At the moment the block is contiguous, but we can easily
98*68b8534bSLuigi Rizzo  * restrict our demand to smaller units (16..64k).
99*68b8534bSLuigi Rizzo  */
100*68b8534bSLuigi Rizzo #define NETMAP_MEMORY_SIZE (64 * 1024 * PAGE_SIZE)	/* 256 MB with 4 KB pages */
101*68b8534bSLuigi Rizzo static void * netmap_malloc(size_t size, const char *msg);
102*68b8534bSLuigi Rizzo static void netmap_free(void *addr, const char *msg);
103*68b8534bSLuigi Rizzo 
104*68b8534bSLuigi Rizzo /*
105*68b8534bSLuigi Rizzo  * Allocator for a pool of packet buffers. For each buffer we have
106*68b8534bSLuigi Rizzo  * one entry in the bitmap to signal the state. Allocation scans
107*68b8534bSLuigi Rizzo  * the bitmap, but since this is done only on attach, we are not
108*68b8534bSLuigi Rizzo  * too worried about performance.
109*68b8534bSLuigi Rizzo  * XXX if we need to allocate small blocks, a translation
110*68b8534bSLuigi Rizzo  * table is used both for kernel virtual address and physical
111*68b8534bSLuigi Rizzo  * addresses.
112*68b8534bSLuigi Rizzo  */
113*68b8534bSLuigi Rizzo struct netmap_buf_pool {
114*68b8534bSLuigi Rizzo 	u_int total_buffers;	/* total buffers. */
115*68b8534bSLuigi Rizzo 	u_int free;
116*68b8534bSLuigi Rizzo 	u_int bufsize;
117*68b8534bSLuigi Rizzo 	char *base;		/* buffer base address */
118*68b8534bSLuigi Rizzo 	uint32_t *bitmap;	/* one bit per buffer, 1 means free */
119*68b8534bSLuigi Rizzo };
120*68b8534bSLuigi Rizzo struct netmap_buf_pool nm_buf_pool;
121*68b8534bSLuigi Rizzo /* XXX move these two vars back into netmap_buf_pool */
122*68b8534bSLuigi Rizzo u_int netmap_total_buffers;
123*68b8534bSLuigi Rizzo char *netmap_buffer_base;
124*68b8534bSLuigi Rizzo 
125*68b8534bSLuigi Rizzo /* user-controlled variables */
126*68b8534bSLuigi Rizzo int netmap_verbose;
127*68b8534bSLuigi Rizzo 
128*68b8534bSLuigi Rizzo static int no_timestamp; /* don't timestamp on rxsync */
129*68b8534bSLuigi Rizzo 
130*68b8534bSLuigi Rizzo SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
131*68b8534bSLuigi Rizzo SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
132*68b8534bSLuigi Rizzo     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
133*68b8534bSLuigi Rizzo SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
134*68b8534bSLuigi Rizzo     CTLFLAG_RW, &no_timestamp, 0, "no_timestamp");
135*68b8534bSLuigi Rizzo SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers,
136*68b8534bSLuigi Rizzo     CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
137*68b8534bSLuigi Rizzo SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
138*68b8534bSLuigi Rizzo     CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
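
/*
 * The knobs above are exported under the dev.netmap sysctl tree, so
 * they can be read or tuned from userland once the module is loaded.
 * A minimal sketch using sysctlbyname(3):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	int on = 1;
 *	sysctlbyname("dev.netmap.verbose", NULL, NULL, &on, sizeof(on));
 *
 *	u_int nfree;
 *	size_t len = sizeof(nfree);
 *	sysctlbyname("dev.netmap.free_buffers", &nfree, &len, NULL, 0);
 */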
139*68b8534bSLuigi Rizzo 
140*68b8534bSLuigi Rizzo /*
141*68b8534bSLuigi Rizzo  * Allocate n buffers from the ring, and fill the slot.
142*68b8534bSLuigi Rizzo  * Buffer 0 is the 'junk' buffer.
143*68b8534bSLuigi Rizzo  */
144*68b8534bSLuigi Rizzo static void
145*68b8534bSLuigi Rizzo netmap_new_bufs(struct netmap_buf_pool *p, struct netmap_slot *slot, u_int n)
146*68b8534bSLuigi Rizzo {
147*68b8534bSLuigi Rizzo 	uint32_t bi = 0;		/* index in the bitmap */
148*68b8534bSLuigi Rizzo 	uint32_t mask, j, i = 0;	/* slot counter */
149*68b8534bSLuigi Rizzo 
150*68b8534bSLuigi Rizzo 	if (n > p->free) {
151*68b8534bSLuigi Rizzo 		D("only %d out of %d buffers available", p->free, n);
152*68b8534bSLuigi Rizzo 		return;
153*68b8534bSLuigi Rizzo 	}
154*68b8534bSLuigi Rizzo 	/* termination is guaranteed by p->free */
155*68b8534bSLuigi Rizzo 	while (i < n && p->free > 0) {
156*68b8534bSLuigi Rizzo 		uint32_t cur = p->bitmap[bi];
157*68b8534bSLuigi Rizzo 		if (cur == 0) { /* bitmask is fully used */
158*68b8534bSLuigi Rizzo 			bi++;
159*68b8534bSLuigi Rizzo 			continue;
160*68b8534bSLuigi Rizzo 		}
161*68b8534bSLuigi Rizzo 		/* locate a slot */
162*68b8534bSLuigi Rizzo 		for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ;
163*68b8534bSLuigi Rizzo 		p->bitmap[bi] &= ~mask;		/* slot in use */
164*68b8534bSLuigi Rizzo 		p->free--;
165*68b8534bSLuigi Rizzo 		slot[i].buf_idx = bi*32+j;
166*68b8534bSLuigi Rizzo 		slot[i].len = p->bufsize;
167*68b8534bSLuigi Rizzo 		slot[i].flags = NS_BUF_CHANGED;
168*68b8534bSLuigi Rizzo 		i++;
169*68b8534bSLuigi Rizzo 	}
170*68b8534bSLuigi Rizzo 	ND("allocated %d buffers, %d available", n, p->free);
171*68b8534bSLuigi Rizzo }
172*68b8534bSLuigi Rizzo 
173*68b8534bSLuigi Rizzo 
174*68b8534bSLuigi Rizzo static void
175*68b8534bSLuigi Rizzo netmap_free_buf(struct netmap_buf_pool *p, uint32_t i)
176*68b8534bSLuigi Rizzo {
177*68b8534bSLuigi Rizzo 	uint32_t pos, mask;
178*68b8534bSLuigi Rizzo 	if (i >= p->total_buffers) {
179*68b8534bSLuigi Rizzo 		D("invalid free index %d", i);
180*68b8534bSLuigi Rizzo 		return;
181*68b8534bSLuigi Rizzo 	}
182*68b8534bSLuigi Rizzo 	pos = i / 32;
183*68b8534bSLuigi Rizzo 	mask = 1 << (i % 32);
184*68b8534bSLuigi Rizzo 	if (p->bitmap[pos] & mask) {
185*68b8534bSLuigi Rizzo 		D("slot %d already free", i);
186*68b8534bSLuigi Rizzo 		return;
187*68b8534bSLuigi Rizzo 	}
188*68b8534bSLuigi Rizzo 	p->bitmap[pos] |= mask;
189*68b8534bSLuigi Rizzo 	p->free++;
190*68b8534bSLuigi Rizzo }
191*68b8534bSLuigi Rizzo 
192*68b8534bSLuigi Rizzo 
193*68b8534bSLuigi Rizzo /* Descriptor of the memory objects handled by our memory allocator. */
194*68b8534bSLuigi Rizzo struct netmap_mem_obj {
195*68b8534bSLuigi Rizzo 	TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the
196*68b8534bSLuigi Rizzo 						 chain. */
197*68b8534bSLuigi Rizzo 	int nmo_used; /* flag set on used memory objects. */
198*68b8534bSLuigi Rizzo 	size_t nmo_size; /* size of the memory area reserved for the
199*68b8534bSLuigi Rizzo 			    object. */
200*68b8534bSLuigi Rizzo 	void *nmo_data; /* pointer to the memory area. */
201*68b8534bSLuigi Rizzo };
202*68b8534bSLuigi Rizzo 
203*68b8534bSLuigi Rizzo /* Wrap our memory objects to make them ``chainable``. */
204*68b8534bSLuigi Rizzo TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj);
205*68b8534bSLuigi Rizzo 
206*68b8534bSLuigi Rizzo 
207*68b8534bSLuigi Rizzo /* Descriptor of our custom memory allocator. */
208*68b8534bSLuigi Rizzo struct netmap_mem_d {
209*68b8534bSLuigi Rizzo 	struct mtx nm_mtx; /* lock used to handle the chain of memory
210*68b8534bSLuigi Rizzo 			      objects. */
211*68b8534bSLuigi Rizzo 	struct netmap_mem_obj_h nm_molist; /* list of memory objects */
212*68b8534bSLuigi Rizzo 	size_t nm_size; /* total amount of memory used for rings etc. */
213*68b8534bSLuigi Rizzo 	size_t nm_totalsize; /* total amount of allocated memory
214*68b8534bSLuigi Rizzo 		(the difference is used for buffers) */
215*68b8534bSLuigi Rizzo 	size_t nm_buf_start; /* offset of packet buffers.
216*68b8534bSLuigi Rizzo 			This is page-aligned. */
217*68b8534bSLuigi Rizzo 	size_t nm_buf_len; /* total memory for buffers */
218*68b8534bSLuigi Rizzo 	void *nm_buffer; /* pointer to the whole pre-allocated memory
219*68b8534bSLuigi Rizzo 			    area. */
220*68b8534bSLuigi Rizzo };
221*68b8534bSLuigi Rizzo 
222*68b8534bSLuigi Rizzo 
223*68b8534bSLuigi Rizzo /* Structure associated to each thread which registered an interface. */
224*68b8534bSLuigi Rizzo struct netmap_priv_d {
225*68b8534bSLuigi Rizzo 	struct netmap_if *np_nifp;	/* netmap interface descriptor. */
226*68b8534bSLuigi Rizzo 
227*68b8534bSLuigi Rizzo 	struct ifnet	*np_ifp;	/* device for which we hold a reference */
228*68b8534bSLuigi Rizzo 	int		np_ringid;	/* from the ioctl */
229*68b8534bSLuigi Rizzo 	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
230*68b8534bSLuigi Rizzo 	uint16_t	np_txpoll;
231*68b8534bSLuigi Rizzo };
232*68b8534bSLuigi Rizzo 
233*68b8534bSLuigi Rizzo 
234*68b8534bSLuigi Rizzo static struct cdev *netmap_dev; /* /dev/netmap character device. */
235*68b8534bSLuigi Rizzo static struct netmap_mem_d *netmap_mem_d; /* Our memory allocator. */
236*68b8534bSLuigi Rizzo 
237*68b8534bSLuigi Rizzo 
238*68b8534bSLuigi Rizzo static d_mmap_t netmap_mmap;
239*68b8534bSLuigi Rizzo static d_ioctl_t netmap_ioctl;
240*68b8534bSLuigi Rizzo static d_poll_t netmap_poll;
241*68b8534bSLuigi Rizzo 
242*68b8534bSLuigi Rizzo #ifdef NETMAP_KEVENT
243*68b8534bSLuigi Rizzo static d_kqfilter_t netmap_kqfilter;
244*68b8534bSLuigi Rizzo #endif
245*68b8534bSLuigi Rizzo 
246*68b8534bSLuigi Rizzo static struct cdevsw netmap_cdevsw = {
247*68b8534bSLuigi Rizzo 	.d_version = D_VERSION,
248*68b8534bSLuigi Rizzo 	.d_name = "netmap",
249*68b8534bSLuigi Rizzo 	.d_mmap = netmap_mmap,
250*68b8534bSLuigi Rizzo 	.d_ioctl = netmap_ioctl,
251*68b8534bSLuigi Rizzo 	.d_poll = netmap_poll,
252*68b8534bSLuigi Rizzo #ifdef NETMAP_KEVENT
253*68b8534bSLuigi Rizzo 	.d_kqfilter = netmap_kqfilter,
254*68b8534bSLuigi Rizzo #endif
255*68b8534bSLuigi Rizzo };
256*68b8534bSLuigi Rizzo 
257*68b8534bSLuigi Rizzo #ifdef NETMAP_KEVENT
258*68b8534bSLuigi Rizzo static int              netmap_kqread(struct knote *, long);
259*68b8534bSLuigi Rizzo static int              netmap_kqwrite(struct knote *, long);
260*68b8534bSLuigi Rizzo static void             netmap_kqdetach(struct knote *);
261*68b8534bSLuigi Rizzo 
262*68b8534bSLuigi Rizzo static struct filterops netmap_read_filterops = {
263*68b8534bSLuigi Rizzo 	.f_isfd =       1,
264*68b8534bSLuigi Rizzo 	.f_attach =     NULL,
265*68b8534bSLuigi Rizzo 	.f_detach =     netmap_kqdetach,
266*68b8534bSLuigi Rizzo 	.f_event =      netmap_kqread,
267*68b8534bSLuigi Rizzo };
268*68b8534bSLuigi Rizzo 
269*68b8534bSLuigi Rizzo static struct filterops netmap_write_filterops = {
270*68b8534bSLuigi Rizzo 	.f_isfd =       1,
271*68b8534bSLuigi Rizzo 	.f_attach =     NULL,
272*68b8534bSLuigi Rizzo 	.f_detach =     netmap_kqdetach,
273*68b8534bSLuigi Rizzo 	.f_event =      netmap_kqwrite,
274*68b8534bSLuigi Rizzo };
275*68b8534bSLuigi Rizzo 
276*68b8534bSLuigi Rizzo /*
277*68b8534bSLuigi Rizzo  * support for the kevent() system call.
278*68b8534bSLuigi Rizzo  *
279*68b8534bSLuigi Rizzo  * This is the kevent filter, and is executed each time a new event
280*68b8534bSLuigi Rizzo  * is triggered on the device. This function executes some operations
281*68b8534bSLuigi Rizzo  * depending on the received filter.
282*68b8534bSLuigi Rizzo  *
283*68b8534bSLuigi Rizzo  * The implementation should test the filters and should implement
284*68b8534bSLuigi Rizzo  * filter operations we are interested in (a full list is in /sys/event.h).
285*68b8534bSLuigi Rizzo  *
286*68b8534bSLuigi Rizzo  * On a match we should:
287*68b8534bSLuigi Rizzo  * - set kn->kn_fop
288*68b8534bSLuigi Rizzo  * - set kn->kn_hook
289*68b8534bSLuigi Rizzo  * - call knlist_add() to deliver the event to the application.
290*68b8534bSLuigi Rizzo  *
291*68b8534bSLuigi Rizzo  * Return 0 if the event should be delivered to the application.
292*68b8534bSLuigi Rizzo  */
293*68b8534bSLuigi Rizzo static int
294*68b8534bSLuigi Rizzo netmap_kqfilter(struct cdev *dev, struct knote *kn)
295*68b8534bSLuigi Rizzo {
296*68b8534bSLuigi Rizzo 	/* declare variables needed to read/write */
297*68b8534bSLuigi Rizzo 
298*68b8534bSLuigi Rizzo 	switch(kn->kn_filter) {
299*68b8534bSLuigi Rizzo 	case EVFILT_READ:
300*68b8534bSLuigi Rizzo 		if (netmap_verbose)
301*68b8534bSLuigi Rizzo 			D("%s kqfilter: EVFILT_READ", ifp->if_xname);
302*68b8534bSLuigi Rizzo 
303*68b8534bSLuigi Rizzo 		/* read operations */
304*68b8534bSLuigi Rizzo 		kn->kn_fop = &netmap_read_filterops;
305*68b8534bSLuigi Rizzo 		break;
306*68b8534bSLuigi Rizzo 
307*68b8534bSLuigi Rizzo 	case EVFILT_WRITE:
308*68b8534bSLuigi Rizzo 		if (netmap_verbose)
309*68b8534bSLuigi Rizzo 			D("%s kqfilter: EVFILT_WRITE", ifp->if_xname);
310*68b8534bSLuigi Rizzo 
311*68b8534bSLuigi Rizzo 		/* write operations */
312*68b8534bSLuigi Rizzo 		kn->kn_fop = &netmap_write_filterops;
313*68b8534bSLuigi Rizzo 		break;
314*68b8534bSLuigi Rizzo 
315*68b8534bSLuigi Rizzo 	default:
316*68b8534bSLuigi Rizzo 		if (netmap_verbose)
317*68b8534bSLuigi Rizzo 			D("%s kqfilter: invalid filter", ifp->if_xname);
318*68b8534bSLuigi Rizzo 		return(EINVAL);
319*68b8534bSLuigi Rizzo 	}
320*68b8534bSLuigi Rizzo 
321*68b8534bSLuigi Rizzo 	kn->kn_hook = 0;
322*68b8534bSLuigi Rizzo 	knlist_add(&netmap_sc->tun_rsel.si_note, kn, 0);
323*68b8534bSLuigi Rizzo 
324*68b8534bSLuigi Rizzo 	return (0);
325*68b8534bSLuigi Rizzo }
326*68b8534bSLuigi Rizzo #endif /* NETMAP_KEVENT */
327*68b8534bSLuigi Rizzo 
328*68b8534bSLuigi Rizzo /*
329*68b8534bSLuigi Rizzo  * File descriptor's private data destructor.
330*68b8534bSLuigi Rizzo  *
331*68b8534bSLuigi Rizzo  * Call nm_register(ifp,0) to stop netmap mode on the interface and
332*68b8534bSLuigi Rizzo  * revert to normal operation. We expect that np_ifp has not gone away.
333*68b8534bSLuigi Rizzo  */
334*68b8534bSLuigi Rizzo static void
335*68b8534bSLuigi Rizzo netmap_dtor(void *data)
336*68b8534bSLuigi Rizzo {
337*68b8534bSLuigi Rizzo 	struct netmap_priv_d *priv = data;
338*68b8534bSLuigi Rizzo 	struct ifnet *ifp = priv->np_ifp;
339*68b8534bSLuigi Rizzo 	struct netmap_adapter *na = NA(ifp);
340*68b8534bSLuigi Rizzo 	struct netmap_if *nifp = priv->np_nifp;
341*68b8534bSLuigi Rizzo 
342*68b8534bSLuigi Rizzo 	if (0)
343*68b8534bSLuigi Rizzo 	    printf("%s starting for %p ifp %p\n", __FUNCTION__, priv,
344*68b8534bSLuigi Rizzo 		priv ? priv->np_ifp : NULL);
345*68b8534bSLuigi Rizzo 
346*68b8534bSLuigi Rizzo 	na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
347*68b8534bSLuigi Rizzo 
348*68b8534bSLuigi Rizzo 	na->refcount--;
349*68b8534bSLuigi Rizzo 	if (na->refcount <= 0) {	/* last instance */
350*68b8534bSLuigi Rizzo 		u_int i;
351*68b8534bSLuigi Rizzo 
352*68b8534bSLuigi Rizzo 		D("deleting last netmap instance for %s", ifp->if_xname);
353*68b8534bSLuigi Rizzo 		/*
354*68b8534bSLuigi Rizzo 		 * there is a race here with *_netmap_task() and
355*68b8534bSLuigi Rizzo 		 * netmap_poll(), which don't run under NETMAP_CORE_LOCK.
356*68b8534bSLuigi Rizzo 		 * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
357*68b8534bSLuigi Rizzo 		 * (aka NETMAP_DELETING(na)) are a unique marker that the
358*68b8534bSLuigi Rizzo 		 * device is dying.
359*68b8534bSLuigi Rizzo 		 * Before destroying stuff we sleep a bit, and then complete
360*68b8534bSLuigi Rizzo 		 * the job. NIOCREG should realize the condition and
361*68b8534bSLuigi Rizzo 		 * loop until they can continue; the other routines
362*68b8534bSLuigi Rizzo 		 * should check the condition at entry and quit if
363*68b8534bSLuigi Rizzo 		 * they cannot run.
364*68b8534bSLuigi Rizzo 		 */
365*68b8534bSLuigi Rizzo 		na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
366*68b8534bSLuigi Rizzo 		tsleep(na, 0, "NIOCUNREG", 4);
367*68b8534bSLuigi Rizzo 		na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
368*68b8534bSLuigi Rizzo 		na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
369*68b8534bSLuigi Rizzo 		/* Wake up any sleeping threads. netmap_poll will
370*68b8534bSLuigi Rizzo 		 * then return POLLERR
371*68b8534bSLuigi Rizzo 		 */
372*68b8534bSLuigi Rizzo 		for (i = 0; i < na->num_queues + 2; i++) {
373*68b8534bSLuigi Rizzo 			selwakeuppri(&na->tx_rings[i].si, PI_NET);
374*68b8534bSLuigi Rizzo 			selwakeuppri(&na->rx_rings[i].si, PI_NET);
375*68b8534bSLuigi Rizzo 		}
376*68b8534bSLuigi Rizzo 		/* release all buffers */
377*68b8534bSLuigi Rizzo 		NMA_LOCK();
378*68b8534bSLuigi Rizzo 		for (i = 0; i < na->num_queues + 1; i++) {
379*68b8534bSLuigi Rizzo 			int j, lim;
380*68b8534bSLuigi Rizzo 			struct netmap_ring *ring;
381*68b8534bSLuigi Rizzo 
382*68b8534bSLuigi Rizzo 			ND("tx queue %d", i);
383*68b8534bSLuigi Rizzo 			ring = na->tx_rings[i].ring;
384*68b8534bSLuigi Rizzo 			lim = na->tx_rings[i].nkr_num_slots;
385*68b8534bSLuigi Rizzo 			for (j = 0; j < lim; j++)
386*68b8534bSLuigi Rizzo 				netmap_free_buf(&nm_buf_pool,
387*68b8534bSLuigi Rizzo 					ring->slot[j].buf_idx);
388*68b8534bSLuigi Rizzo 
389*68b8534bSLuigi Rizzo 			ND("rx queue %d", i);
390*68b8534bSLuigi Rizzo 			ring = na->rx_rings[i].ring;
391*68b8534bSLuigi Rizzo 			lim = na->rx_rings[i].nkr_num_slots;
392*68b8534bSLuigi Rizzo 			for (j = 0; j < lim; j++)
393*68b8534bSLuigi Rizzo 				netmap_free_buf(&nm_buf_pool,
394*68b8534bSLuigi Rizzo 					ring->slot[j].buf_idx);
395*68b8534bSLuigi Rizzo 		}
396*68b8534bSLuigi Rizzo 		NMA_UNLOCK();
397*68b8534bSLuigi Rizzo 		netmap_free(na->tx_rings[0].ring, "shadow rings");
398*68b8534bSLuigi Rizzo 		wakeup(na);
399*68b8534bSLuigi Rizzo 	}
400*68b8534bSLuigi Rizzo 	netmap_free(nifp, "nifp");
401*68b8534bSLuigi Rizzo 
402*68b8534bSLuigi Rizzo 	na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
403*68b8534bSLuigi Rizzo 
404*68b8534bSLuigi Rizzo 	if_rele(ifp);
405*68b8534bSLuigi Rizzo 
406*68b8534bSLuigi Rizzo 	bzero(priv, sizeof(*priv));	/* XXX for safety */
407*68b8534bSLuigi Rizzo 	free(priv, M_DEVBUF);
408*68b8534bSLuigi Rizzo }
409*68b8534bSLuigi Rizzo 
410*68b8534bSLuigi Rizzo 
411*68b8534bSLuigi Rizzo 
412*68b8534bSLuigi Rizzo /*
413*68b8534bSLuigi Rizzo  * Create and return a new ``netmap_if`` object, and possibly also
414*68b8534bSLuigi Rizzo  * rings and packet buffers.
415*68b8534bSLuigi Rizzo  *
416*68b8534bSLuigi Rizzo  * Return NULL on failure.
417*68b8534bSLuigi Rizzo  */
418*68b8534bSLuigi Rizzo static void *
419*68b8534bSLuigi Rizzo netmap_if_new(const char *ifname, struct netmap_adapter *na)
420*68b8534bSLuigi Rizzo {
421*68b8534bSLuigi Rizzo 	struct netmap_if *nifp;
422*68b8534bSLuigi Rizzo 	struct netmap_ring *ring;
423*68b8534bSLuigi Rizzo 	char *buff;
424*68b8534bSLuigi Rizzo 	u_int i, len, ofs;
425*68b8534bSLuigi Rizzo 	u_int n = na->num_queues + 1; /* shorthand, include stack queue */
426*68b8534bSLuigi Rizzo 
427*68b8534bSLuigi Rizzo 	/*
428*68b8534bSLuigi Rizzo 	 * the descriptor is followed inline by an array of offsets
429*68b8534bSLuigi Rizzo 	 * to the tx and rx rings in the shared memory region.
430*68b8534bSLuigi Rizzo 	 */
431*68b8534bSLuigi Rizzo 	len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t);
432*68b8534bSLuigi Rizzo 	nifp = netmap_malloc(len, "nifp");
433*68b8534bSLuigi Rizzo 	if (nifp == NULL)
434*68b8534bSLuigi Rizzo 		return (NULL);
435*68b8534bSLuigi Rizzo 
436*68b8534bSLuigi Rizzo 	/* initialize base fields */
437*68b8534bSLuigi Rizzo 	*(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues;
438*68b8534bSLuigi Rizzo 	strncpy(nifp->ni_name, ifname, IFNAMSIZ);
439*68b8534bSLuigi Rizzo 
440*68b8534bSLuigi Rizzo 	(na->refcount)++;	/* XXX atomic ? we are under lock */
441*68b8534bSLuigi Rizzo 	if (na->refcount > 1)
442*68b8534bSLuigi Rizzo 		goto final;
443*68b8534bSLuigi Rizzo 
444*68b8534bSLuigi Rizzo 	/*
445*68b8534bSLuigi Rizzo 	 * If this is the first instance, allocate the shadow rings and
446*68b8534bSLuigi Rizzo 	 * buffers for this card (one for each hw queue, one for the host).
447*68b8534bSLuigi Rizzo 	 * The rings are contiguous, but have variable size.
448*68b8534bSLuigi Rizzo 	 * The entire block is reachable at
449*68b8534bSLuigi Rizzo 	 *	na->tx_rings[0].ring
450*68b8534bSLuigi Rizzo 	 */
451*68b8534bSLuigi Rizzo 
452*68b8534bSLuigi Rizzo 	len = n * (2 * sizeof(struct netmap_ring) +
453*68b8534bSLuigi Rizzo 		  (na->num_tx_desc + na->num_rx_desc) *
454*68b8534bSLuigi Rizzo 		   sizeof(struct netmap_slot) );
455*68b8534bSLuigi Rizzo 	buff = netmap_malloc(len, "shadow rings");
456*68b8534bSLuigi Rizzo 	if (buff == NULL) {
457*68b8534bSLuigi Rizzo 		D("failed to allocate %d bytes for %s shadow ring",
458*68b8534bSLuigi Rizzo 			len, ifname);
459*68b8534bSLuigi Rizzo error:
460*68b8534bSLuigi Rizzo 		(na->refcount)--;
461*68b8534bSLuigi Rizzo 		netmap_free(nifp, "nifp, rings failed");
462*68b8534bSLuigi Rizzo 		return (NULL);
463*68b8534bSLuigi Rizzo 	}
464*68b8534bSLuigi Rizzo 	/* do we have the buffers? we need num_tx_desc buffers for
465*68b8534bSLuigi Rizzo 	 * each tx ring and num_rx_desc buffers for each rx ring. */
466*68b8534bSLuigi Rizzo 	len = n * (na->num_tx_desc + na->num_rx_desc);
467*68b8534bSLuigi Rizzo 	NMA_LOCK();
468*68b8534bSLuigi Rizzo 	if (nm_buf_pool.free < len) {
469*68b8534bSLuigi Rizzo 		NMA_UNLOCK();
470*68b8534bSLuigi Rizzo 		netmap_free(buff, "not enough bufs");
471*68b8534bSLuigi Rizzo 		goto error;
472*68b8534bSLuigi Rizzo 	}
473*68b8534bSLuigi Rizzo 	/*
474*68b8534bSLuigi Rizzo 	 * in the kring, store the pointers to the shared rings
475*68b8534bSLuigi Rizzo 	 * and initialize the rings. We are under NMA_LOCK().
476*68b8534bSLuigi Rizzo 	 */
477*68b8534bSLuigi Rizzo 	ofs = 0;
478*68b8534bSLuigi Rizzo 	for (i = 0; i < n; i++) {
479*68b8534bSLuigi Rizzo 		struct netmap_kring *kring;
480*68b8534bSLuigi Rizzo 		int numdesc;
481*68b8534bSLuigi Rizzo 
482*68b8534bSLuigi Rizzo 		/* Transmit rings */
483*68b8534bSLuigi Rizzo 		kring = &na->tx_rings[i];
484*68b8534bSLuigi Rizzo 		numdesc = na->num_tx_desc;
485*68b8534bSLuigi Rizzo 		bzero(kring, sizeof(*kring));
486*68b8534bSLuigi Rizzo 		kring->na = na;
487*68b8534bSLuigi Rizzo 
488*68b8534bSLuigi Rizzo 		ring = kring->ring = (struct netmap_ring *)(buff + ofs);
489*68b8534bSLuigi Rizzo 		*(ssize_t *)(uintptr_t)&ring->buf_ofs =
490*68b8534bSLuigi Rizzo 			nm_buf_pool.base - (char *)ring;
491*68b8534bSLuigi Rizzo 		ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs);
492*68b8534bSLuigi Rizzo 		*(int *)(uintptr_t)&ring->num_slots =
493*68b8534bSLuigi Rizzo 			kring->nkr_num_slots = numdesc;
494*68b8534bSLuigi Rizzo 
495*68b8534bSLuigi Rizzo 		/*
496*68b8534bSLuigi Rizzo 		 * IMPORTANT:
497*68b8534bSLuigi Rizzo 		 * Always keep one slot empty, so we can detect new
498*68b8534bSLuigi Rizzo 		 * transmissions comparing cur and nr_hwcur (they are
499*68b8534bSLuigi Rizzo 		 * the same only if there are no new transmissions).
500*68b8534bSLuigi Rizzo 		 */
501*68b8534bSLuigi Rizzo 		ring->avail = kring->nr_hwavail = numdesc - 1;
502*68b8534bSLuigi Rizzo 		ring->cur = kring->nr_hwcur = 0;
503*68b8534bSLuigi Rizzo 		netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc);
504*68b8534bSLuigi Rizzo 
505*68b8534bSLuigi Rizzo 		ofs += sizeof(struct netmap_ring) +
506*68b8534bSLuigi Rizzo 			numdesc * sizeof(struct netmap_slot);
507*68b8534bSLuigi Rizzo 
508*68b8534bSLuigi Rizzo 		/* Receive rings */
509*68b8534bSLuigi Rizzo 		kring = &na->rx_rings[i];
510*68b8534bSLuigi Rizzo 		numdesc = na->num_rx_desc;
511*68b8534bSLuigi Rizzo 		bzero(kring, sizeof(*kring));
512*68b8534bSLuigi Rizzo 		kring->na = na;
513*68b8534bSLuigi Rizzo 
514*68b8534bSLuigi Rizzo 		ring = kring->ring = (struct netmap_ring *)(buff + ofs);
515*68b8534bSLuigi Rizzo 		*(ssize_t *)(uintptr_t)&ring->buf_ofs =
516*68b8534bSLuigi Rizzo 			nm_buf_pool.base - (char *)ring;
517*68b8534bSLuigi Rizzo 		ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs);
518*68b8534bSLuigi Rizzo 		*(int *)(uintptr_t)&ring->num_slots =
519*68b8534bSLuigi Rizzo 			kring->nkr_num_slots = numdesc;
520*68b8534bSLuigi Rizzo 		ring->cur = kring->nr_hwcur = 0;
521*68b8534bSLuigi Rizzo 		ring->avail = kring->nr_hwavail = 0; /* empty */
522*68b8534bSLuigi Rizzo 		netmap_new_bufs(&nm_buf_pool, ring->slot, numdesc);
523*68b8534bSLuigi Rizzo 		ofs += sizeof(struct netmap_ring) +
524*68b8534bSLuigi Rizzo 			numdesc * sizeof(struct netmap_slot);
525*68b8534bSLuigi Rizzo 	}
526*68b8534bSLuigi Rizzo 	NMA_UNLOCK();
527*68b8534bSLuigi Rizzo 	for (i = 0; i < n+1; i++) {
528*68b8534bSLuigi Rizzo 		// XXX initialize the selrecord structs.
529*68b8534bSLuigi Rizzo 	}
530*68b8534bSLuigi Rizzo final:
531*68b8534bSLuigi Rizzo 	/*
532*68b8534bSLuigi Rizzo 	 * fill the slots for the rx and tx queues. They contain the offset
533*68b8534bSLuigi Rizzo 	 * between the ring and nifp, so the information is usable in
534*68b8534bSLuigi Rizzo 	 * userspace to reach the ring from the nifp.
535*68b8534bSLuigi Rizzo 	 */
536*68b8534bSLuigi Rizzo 	for (i = 0; i < n; i++) {
537*68b8534bSLuigi Rizzo 		char *base = (char *)nifp;
538*68b8534bSLuigi Rizzo 		*(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
539*68b8534bSLuigi Rizzo 			(char *)na->tx_rings[i].ring - base;
540*68b8534bSLuigi Rizzo 		*(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] =
541*68b8534bSLuigi Rizzo 			(char *)na->rx_rings[i].ring - base;
542*68b8534bSLuigi Rizzo 	}
543*68b8534bSLuigi Rizzo 	return (nifp);
544*68b8534bSLuigi Rizzo }
545*68b8534bSLuigi Rizzo 
546*68b8534bSLuigi Rizzo 
547*68b8534bSLuigi Rizzo /*
548*68b8534bSLuigi Rizzo  * mmap(2) support for the "netmap" device.
549*68b8534bSLuigi Rizzo  *
550*68b8534bSLuigi Rizzo  * Expose all the memory previously allocated by our custom memory
551*68b8534bSLuigi Rizzo  * allocator: this way the user has only to issue a single mmap(2), and
552*68b8534bSLuigi Rizzo  * can work on all the data structures flawlessly.
553*68b8534bSLuigi Rizzo  *
554*68b8534bSLuigi Rizzo  * Return 0 on success, -1 otherwise.
555*68b8534bSLuigi Rizzo  */
556*68b8534bSLuigi Rizzo static int
557*68b8534bSLuigi Rizzo #if __FreeBSD_version < 900000
558*68b8534bSLuigi Rizzo netmap_mmap(__unused struct cdev *dev, vm_offset_t offset, vm_paddr_t *paddr,
559*68b8534bSLuigi Rizzo 	    int nprot)
560*68b8534bSLuigi Rizzo #else
561*68b8534bSLuigi Rizzo netmap_mmap(__unused struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
562*68b8534bSLuigi Rizzo 	    int nprot, __unused vm_memattr_t *memattr)
563*68b8534bSLuigi Rizzo #endif
564*68b8534bSLuigi Rizzo {
565*68b8534bSLuigi Rizzo 	if (nprot & PROT_EXEC)
566*68b8534bSLuigi Rizzo 		return (-1);	// XXX -1 or EINVAL ?
567*68b8534bSLuigi Rizzo 	ND("request for offset 0x%x", (uint32_t)offset);
568*68b8534bSLuigi Rizzo 	*paddr = vtophys(netmap_mem_d->nm_buffer) + offset;
569*68b8534bSLuigi Rizzo 
570*68b8534bSLuigi Rizzo 	return (0);
571*68b8534bSLuigi Rizzo }
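
/*
 * Note: the d_mmap handler above is invoked by the VM system one page
 * at a time, so it only has to translate each offset into a physical
 * address. Returning vtophys(nm_buffer) + offset is valid only because
 * the whole pool is allocated as a single physically contiguous block
 * (see the allocator comments near NETMAP_MEMORY_SIZE above).
 */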
572*68b8534bSLuigi Rizzo 
573*68b8534bSLuigi Rizzo 
574*68b8534bSLuigi Rizzo /*
575*68b8534bSLuigi Rizzo  * handler for synchronization of the queues from/to the host
576*68b8534bSLuigi Rizzo  */
577*68b8534bSLuigi Rizzo static void
578*68b8534bSLuigi Rizzo netmap_sync_to_host(struct netmap_adapter *na)
579*68b8534bSLuigi Rizzo {
580*68b8534bSLuigi Rizzo 	struct netmap_kring *kring = &na->tx_rings[na->num_queues];
581*68b8534bSLuigi Rizzo 	struct netmap_ring *ring = kring->ring;
582*68b8534bSLuigi Rizzo 	struct mbuf *head = NULL, *tail = NULL, *m;
583*68b8534bSLuigi Rizzo 	u_int n, lim = kring->nkr_num_slots - 1;
584*68b8534bSLuigi Rizzo 
585*68b8534bSLuigi Rizzo 	na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0);
586*68b8534bSLuigi Rizzo 
587*68b8534bSLuigi Rizzo 	/* Take packets from hwcur to cur and pass them up.
588*68b8534bSLuigi Rizzo 	 * In case of no buffers we give up. At the end of the loop,
589*68b8534bSLuigi Rizzo 	 * the queue is drained in all cases.
590*68b8534bSLuigi Rizzo 	 */
591*68b8534bSLuigi Rizzo 	for (n = kring->nr_hwcur; n != ring->cur;) {
592*68b8534bSLuigi Rizzo 		struct netmap_slot *slot = &ring->slot[n];
593*68b8534bSLuigi Rizzo 
594*68b8534bSLuigi Rizzo 		n = (n == lim) ? 0 : n + 1;
595*68b8534bSLuigi Rizzo 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE) {
596*68b8534bSLuigi Rizzo 			D("bad pkt at %d len %d", n, slot->len);
597*68b8534bSLuigi Rizzo 			continue;
598*68b8534bSLuigi Rizzo 		}
599*68b8534bSLuigi Rizzo 		m = m_devget(NMB(slot), slot->len, 0, na->ifp, NULL);
600*68b8534bSLuigi Rizzo 
601*68b8534bSLuigi Rizzo 		if (m == NULL)
602*68b8534bSLuigi Rizzo 			break;
603*68b8534bSLuigi Rizzo 		if (tail)
604*68b8534bSLuigi Rizzo 			tail->m_nextpkt = m;
605*68b8534bSLuigi Rizzo 		else
606*68b8534bSLuigi Rizzo 			head = m;
607*68b8534bSLuigi Rizzo 		tail = m;
608*68b8534bSLuigi Rizzo 		m->m_nextpkt = NULL;
609*68b8534bSLuigi Rizzo 	}
610*68b8534bSLuigi Rizzo 	kring->nr_hwcur = ring->cur;
611*68b8534bSLuigi Rizzo 	kring->nr_hwavail = ring->avail = lim;
612*68b8534bSLuigi Rizzo 	na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
613*68b8534bSLuigi Rizzo 
614*68b8534bSLuigi Rizzo 	/* send packets up, outside the lock */
615*68b8534bSLuigi Rizzo 	while ((m = head) != NULL) {
616*68b8534bSLuigi Rizzo 		head = head->m_nextpkt;
617*68b8534bSLuigi Rizzo 		m->m_nextpkt = NULL;
618*68b8534bSLuigi Rizzo 		m->m_pkthdr.rcvif = na->ifp;
619*68b8534bSLuigi Rizzo 		if (netmap_verbose & NM_VERB_HOST)
620*68b8534bSLuigi Rizzo 			D("sending up pkt %p size %d", m, m->m_pkthdr.len);
621*68b8534bSLuigi Rizzo 		(na->ifp->if_input)(na->ifp, m);
622*68b8534bSLuigi Rizzo 	}
623*68b8534bSLuigi Rizzo }
624*68b8534bSLuigi Rizzo 
625*68b8534bSLuigi Rizzo /*
626*68b8534bSLuigi Rizzo  * This routine also does the selrecord if called from the poll handler
627*68b8534bSLuigi Rizzo  * (we know because td != NULL).
628*68b8534bSLuigi Rizzo  */
629*68b8534bSLuigi Rizzo static void
630*68b8534bSLuigi Rizzo netmap_sync_from_host(struct netmap_adapter *na, struct thread *td)
631*68b8534bSLuigi Rizzo {
632*68b8534bSLuigi Rizzo 	struct netmap_kring *kring = &na->rx_rings[na->num_queues];
633*68b8534bSLuigi Rizzo 	struct netmap_ring *ring = kring->ring;
634*68b8534bSLuigi Rizzo 	int delta;
635*68b8534bSLuigi Rizzo 
636*68b8534bSLuigi Rizzo 	na->nm_lock(na->ifp->if_softc, NETMAP_CORE_LOCK, 0);
637*68b8534bSLuigi Rizzo 
638*68b8534bSLuigi Rizzo 	/* skip past packets processed by userspace,
639*68b8534bSLuigi Rizzo 	 * and then sync cur/avail with hwcur/hwavail
640*68b8534bSLuigi Rizzo 	 */
641*68b8534bSLuigi Rizzo 	delta = ring->cur - kring->nr_hwcur;
642*68b8534bSLuigi Rizzo 	if (delta < 0)
643*68b8534bSLuigi Rizzo 		delta += kring->nkr_num_slots;
644*68b8534bSLuigi Rizzo 	kring->nr_hwavail -= delta;
645*68b8534bSLuigi Rizzo 	kring->nr_hwcur = ring->cur;
646*68b8534bSLuigi Rizzo 	ring->avail = kring->nr_hwavail;
647*68b8534bSLuigi Rizzo 	if (ring->avail == 0 && td)
648*68b8534bSLuigi Rizzo 		selrecord(td, &kring->si);
649*68b8534bSLuigi Rizzo 	if (ring->avail && (netmap_verbose & NM_VERB_HOST))
650*68b8534bSLuigi Rizzo 		D("%d pkts from stack", ring->avail);
651*68b8534bSLuigi Rizzo 	na->nm_lock(na->ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
652*68b8534bSLuigi Rizzo }
653*68b8534bSLuigi Rizzo 
654*68b8534bSLuigi Rizzo 
655*68b8534bSLuigi Rizzo /*
656*68b8534bSLuigi Rizzo  * get a refcounted reference to an interface.
657*68b8534bSLuigi Rizzo  * Return ENXIO if the interface does not exist, EINVAL if netmap
658*68b8534bSLuigi Rizzo  * is not supported by the interface.
659*68b8534bSLuigi Rizzo  * If successful, hold a reference.
660*68b8534bSLuigi Rizzo  */
661*68b8534bSLuigi Rizzo static int
662*68b8534bSLuigi Rizzo get_ifp(const char *name, struct ifnet **ifp)
663*68b8534bSLuigi Rizzo {
664*68b8534bSLuigi Rizzo 	*ifp = ifunit_ref(name);
665*68b8534bSLuigi Rizzo 	if (*ifp == NULL)
666*68b8534bSLuigi Rizzo 		return (ENXIO);
667*68b8534bSLuigi Rizzo 	/* can do this if the capability exists and if_pspare[0]
668*68b8534bSLuigi Rizzo 	 * points to the netmap descriptor.
669*68b8534bSLuigi Rizzo 	 */
670*68b8534bSLuigi Rizzo 	if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp))
671*68b8534bSLuigi Rizzo 		return 0;	/* valid pointer, we hold the refcount */
672*68b8534bSLuigi Rizzo 	if_rele(*ifp);
673*68b8534bSLuigi Rizzo 	return EINVAL;	// not NETMAP capable
674*68b8534bSLuigi Rizzo }
675*68b8534bSLuigi Rizzo 
676*68b8534bSLuigi Rizzo 
677*68b8534bSLuigi Rizzo /*
678*68b8534bSLuigi Rizzo  * Error routine called when txsync/rxsync detects an error.
679*68b8534bSLuigi Rizzo  * Can't do much more than resetting cur = hwcur, avail = hwavail.
680*68b8534bSLuigi Rizzo  * Return 1 on reinit.
681*68b8534bSLuigi Rizzo  */
682*68b8534bSLuigi Rizzo int
683*68b8534bSLuigi Rizzo netmap_ring_reinit(struct netmap_kring *kring)
684*68b8534bSLuigi Rizzo {
685*68b8534bSLuigi Rizzo 	struct netmap_ring *ring = kring->ring;
686*68b8534bSLuigi Rizzo 	u_int i, lim = kring->nkr_num_slots - 1;
687*68b8534bSLuigi Rizzo 	int errors = 0;
688*68b8534bSLuigi Rizzo 
689*68b8534bSLuigi Rizzo 	D("called for %s", kring->na->ifp->if_xname);
690*68b8534bSLuigi Rizzo 	if (ring->cur > lim)
691*68b8534bSLuigi Rizzo 		errors++;
692*68b8534bSLuigi Rizzo 	for (i = 0; i <= lim; i++) {
693*68b8534bSLuigi Rizzo 		u_int idx = ring->slot[i].buf_idx;
694*68b8534bSLuigi Rizzo 		u_int len = ring->slot[i].len;
695*68b8534bSLuigi Rizzo 		if (idx < 2 || idx >= netmap_total_buffers) {
696*68b8534bSLuigi Rizzo 			if (!errors++)
697*68b8534bSLuigi Rizzo 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
698*68b8534bSLuigi Rizzo 			ring->slot[i].buf_idx = 0;
699*68b8534bSLuigi Rizzo 			ring->slot[i].len = 0;
700*68b8534bSLuigi Rizzo 		} else if (len > NETMAP_BUF_SIZE) {
701*68b8534bSLuigi Rizzo 			ring->slot[i].len = 0;
702*68b8534bSLuigi Rizzo 			if (!errors++)
703*68b8534bSLuigi Rizzo 				D("bad len %d at slot %d idx %d",
704*68b8534bSLuigi Rizzo 					len, i, idx);
705*68b8534bSLuigi Rizzo 		}
706*68b8534bSLuigi Rizzo 	}
707*68b8534bSLuigi Rizzo 	if (errors) {
708*68b8534bSLuigi Rizzo 		int pos = kring - kring->na->tx_rings;
709*68b8534bSLuigi Rizzo 		int n = kring->na->num_queues + 2;
710*68b8534bSLuigi Rizzo 
711*68b8534bSLuigi Rizzo 		D("total %d errors", errors);
712*68b8534bSLuigi Rizzo 		errors++;
713*68b8534bSLuigi Rizzo 		D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
714*68b8534bSLuigi Rizzo 			kring->na->ifp->if_xname,
715*68b8534bSLuigi Rizzo 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
716*68b8534bSLuigi Rizzo 			ring->cur, kring->nr_hwcur,
717*68b8534bSLuigi Rizzo 			ring->avail, kring->nr_hwavail);
718*68b8534bSLuigi Rizzo 		ring->cur = kring->nr_hwcur;
719*68b8534bSLuigi Rizzo 		ring->avail = kring->nr_hwavail;
720*68b8534bSLuigi Rizzo 		ring->flags |= NR_REINIT;
721*68b8534bSLuigi Rizzo 		kring->na->flags |= NR_REINIT;
722*68b8534bSLuigi Rizzo 	}
723*68b8534bSLuigi Rizzo 	return (errors ? 1 : 0);
724*68b8534bSLuigi Rizzo }
725*68b8534bSLuigi Rizzo 
726*68b8534bSLuigi Rizzo /*
727*68b8534bSLuigi Rizzo  * Clean the reinit flag for our rings.
728*68b8534bSLuigi Rizzo  * XXX at the moment, clear for all rings
729*68b8534bSLuigi Rizzo  */
730*68b8534bSLuigi Rizzo static void
731*68b8534bSLuigi Rizzo netmap_clean_reinit(struct netmap_adapter *na)
732*68b8534bSLuigi Rizzo {
733*68b8534bSLuigi Rizzo 	//struct netmap_kring *kring;
734*68b8534bSLuigi Rizzo 	u_int i;
735*68b8534bSLuigi Rizzo 
736*68b8534bSLuigi Rizzo 	na->flags &= ~NR_REINIT;
737*68b8534bSLuigi Rizzo 	D("--- NR_REINIT reset on %s", na->ifp->if_xname);
738*68b8534bSLuigi Rizzo 	for (i = 0; i < na->num_queues + 1; i++) {
739*68b8534bSLuigi Rizzo 		na->tx_rings[i].ring->flags &= ~NR_REINIT;
740*68b8534bSLuigi Rizzo 		na->rx_rings[i].ring->flags &= ~NR_REINIT;
741*68b8534bSLuigi Rizzo 	}
742*68b8534bSLuigi Rizzo }
743*68b8534bSLuigi Rizzo 
744*68b8534bSLuigi Rizzo /*
745*68b8534bSLuigi Rizzo  * Set the ring ID. For devices with a single queue, a request
746*68b8534bSLuigi Rizzo  * for all rings is the same as a single ring.
747*68b8534bSLuigi Rizzo  */
748*68b8534bSLuigi Rizzo static int
749*68b8534bSLuigi Rizzo netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
750*68b8534bSLuigi Rizzo {
751*68b8534bSLuigi Rizzo 	struct ifnet *ifp = priv->np_ifp;
752*68b8534bSLuigi Rizzo 	struct netmap_adapter *na = NA(ifp);
753*68b8534bSLuigi Rizzo 	void *adapter = na->ifp->if_softc;	/* shorthand */
754*68b8534bSLuigi Rizzo 	u_int i = ringid & NETMAP_RING_MASK;
755*68b8534bSLuigi Rizzo 	/* first time we don't lock */
756*68b8534bSLuigi Rizzo 	int need_lock = (priv->np_qfirst != priv->np_qlast);
757*68b8534bSLuigi Rizzo 
758*68b8534bSLuigi Rizzo 	if ( (ringid & NETMAP_HW_RING) && i >= na->num_queues) {
759*68b8534bSLuigi Rizzo 		D("invalid ring id %d", i);
760*68b8534bSLuigi Rizzo 		return (EINVAL);
761*68b8534bSLuigi Rizzo 	}
762*68b8534bSLuigi Rizzo 	if (need_lock)
763*68b8534bSLuigi Rizzo 		na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
764*68b8534bSLuigi Rizzo 	priv->np_ringid = ringid;
765*68b8534bSLuigi Rizzo 	if (ringid & NETMAP_SW_RING) {
766*68b8534bSLuigi Rizzo 		priv->np_qfirst = na->num_queues;
767*68b8534bSLuigi Rizzo 		priv->np_qlast = na->num_queues + 1;
768*68b8534bSLuigi Rizzo 	} else if (ringid & NETMAP_HW_RING) {
769*68b8534bSLuigi Rizzo 		priv->np_qfirst = i;
770*68b8534bSLuigi Rizzo 		priv->np_qlast = i + 1;
771*68b8534bSLuigi Rizzo 	} else {
772*68b8534bSLuigi Rizzo 		priv->np_qfirst = 0;
773*68b8534bSLuigi Rizzo 		priv->np_qlast = na->num_queues;
774*68b8534bSLuigi Rizzo 	}
775*68b8534bSLuigi Rizzo 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
776*68b8534bSLuigi Rizzo 	if (need_lock)
777*68b8534bSLuigi Rizzo 		na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
778*68b8534bSLuigi Rizzo 	if (ringid & NETMAP_SW_RING)
779*68b8534bSLuigi Rizzo 		D("ringid %s set to SW RING", ifp->if_xname);
780*68b8534bSLuigi Rizzo 	else if (ringid & NETMAP_HW_RING)
781*68b8534bSLuigi Rizzo 		D("ringid %s set to HW RING %d", ifp->if_xname,
782*68b8534bSLuigi Rizzo 			priv->np_qfirst);
783*68b8534bSLuigi Rizzo 	else
784*68b8534bSLuigi Rizzo 		D("ringid %s set to all %d HW RINGS", ifp->if_xname,
785*68b8534bSLuigi Rizzo 			priv->np_qlast);
786*68b8534bSLuigi Rizzo 	return 0;
787*68b8534bSLuigi Rizzo }
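
/*
 * A minimal userspace sketch of how nr_ringid selects the rings handled
 * by a descriptor, matching the three cases above (illustrative only:
 * "em0", the ring number and the already-open fd are placeholders):
 *
 *	struct nmreq req;
 *	memset(&req, 0, sizeof(req));
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *
 *	req.nr_ringid = 0;			// all hardware rings
 *	// or: req.nr_ringid = NETMAP_HW_RING | 2;	// hw ring 2 only
 *	// or: req.nr_ringid = NETMAP_SW_RING;		// host (stack) ring
 *	req.nr_ringid |= NETMAP_NO_TX_POLL;	// don't txsync on poll()
 *
 *	ioctl(fd, NIOCREGIF, &req);
 */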
788*68b8534bSLuigi Rizzo 
789*68b8534bSLuigi Rizzo /*
790*68b8534bSLuigi Rizzo  * ioctl(2) support for the "netmap" device.
791*68b8534bSLuigi Rizzo  *
792*68b8534bSLuigi Rizzo  * Following a list of accepted commands:
793*68b8534bSLuigi Rizzo  * - NIOCGINFO
794*68b8534bSLuigi Rizzo  * - SIOCGIFADDR	just for convenience
795*68b8534bSLuigi Rizzo  * - NIOCREGIF
796*68b8534bSLuigi Rizzo  * - NIOCUNREGIF
797*68b8534bSLuigi Rizzo  * - NIOCTXSYNC
798*68b8534bSLuigi Rizzo  * - NIOCRXSYNC
799*68b8534bSLuigi Rizzo  *
800*68b8534bSLuigi Rizzo  * Return 0 on success, errno otherwise.
801*68b8534bSLuigi Rizzo  */
802*68b8534bSLuigi Rizzo static int
803*68b8534bSLuigi Rizzo netmap_ioctl(__unused struct cdev *dev, u_long cmd, caddr_t data,
804*68b8534bSLuigi Rizzo 	__unused int fflag, struct thread *td)
805*68b8534bSLuigi Rizzo {
806*68b8534bSLuigi Rizzo 	struct netmap_priv_d *priv = NULL;
807*68b8534bSLuigi Rizzo 	struct ifnet *ifp;
808*68b8534bSLuigi Rizzo 	struct nmreq *nmr = (struct nmreq *) data;
809*68b8534bSLuigi Rizzo 	struct netmap_adapter *na;
810*68b8534bSLuigi Rizzo 	void *adapter;
811*68b8534bSLuigi Rizzo 	int error;
812*68b8534bSLuigi Rizzo 	u_int i;
813*68b8534bSLuigi Rizzo 	struct netmap_if *nifp;
814*68b8534bSLuigi Rizzo 
815*68b8534bSLuigi Rizzo 	error = devfs_get_cdevpriv((void **)&priv);
816*68b8534bSLuigi Rizzo 	if (error != ENOENT && error != 0)
817*68b8534bSLuigi Rizzo 		return (error);
818*68b8534bSLuigi Rizzo 
819*68b8534bSLuigi Rizzo 	error = 0;	/* Could be ENOENT */
820*68b8534bSLuigi Rizzo 	switch (cmd) {
821*68b8534bSLuigi Rizzo 	case NIOCGINFO:		/* return capabilities etc */
822*68b8534bSLuigi Rizzo 		/* memsize is always valid */
823*68b8534bSLuigi Rizzo 		nmr->nr_memsize = netmap_mem_d->nm_totalsize;
824*68b8534bSLuigi Rizzo 		nmr->nr_offset = 0;
825*68b8534bSLuigi Rizzo 		nmr->nr_numrings = 0;
826*68b8534bSLuigi Rizzo 		nmr->nr_numslots = 0;
827*68b8534bSLuigi Rizzo 		if (nmr->nr_name[0] == '\0')	/* just get memory info */
828*68b8534bSLuigi Rizzo 			break;
829*68b8534bSLuigi Rizzo 		error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */
830*68b8534bSLuigi Rizzo 		if (error)
831*68b8534bSLuigi Rizzo 			break;
832*68b8534bSLuigi Rizzo 		na = NA(ifp); /* retrieve netmap_adapter */
833*68b8534bSLuigi Rizzo 		nmr->nr_numrings = na->num_queues;
834*68b8534bSLuigi Rizzo 		nmr->nr_numslots = na->num_tx_desc;
835*68b8534bSLuigi Rizzo 		if_rele(ifp);	/* return the refcount */
836*68b8534bSLuigi Rizzo 		break;
837*68b8534bSLuigi Rizzo 
838*68b8534bSLuigi Rizzo 	case NIOCREGIF:
839*68b8534bSLuigi Rizzo 		if (priv != NULL)	/* thread already registered */
840*68b8534bSLuigi Rizzo 			return netmap_set_ringid(priv, nmr->nr_ringid);
841*68b8534bSLuigi Rizzo 		/* find the interface and a reference */
842*68b8534bSLuigi Rizzo 		error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
843*68b8534bSLuigi Rizzo 		if (error)
844*68b8534bSLuigi Rizzo 			break;
845*68b8534bSLuigi Rizzo 		na = NA(ifp); /* retrieve netmap adapter */
846*68b8534bSLuigi Rizzo 		adapter = na->ifp->if_softc;	/* shorthand */
847*68b8534bSLuigi Rizzo 		/*
848*68b8534bSLuigi Rizzo 		 * Allocate the private per-thread structure.
849*68b8534bSLuigi Rizzo 		 * XXX perhaps we can use a blocking malloc ?
850*68b8534bSLuigi Rizzo 		 */
851*68b8534bSLuigi Rizzo 		priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
852*68b8534bSLuigi Rizzo 			      M_NOWAIT | M_ZERO);
853*68b8534bSLuigi Rizzo 		if (priv == NULL) {
854*68b8534bSLuigi Rizzo 			error = ENOMEM;
855*68b8534bSLuigi Rizzo 			if_rele(ifp);   /* return the refcount */
856*68b8534bSLuigi Rizzo 			break;
857*68b8534bSLuigi Rizzo 		}
858*68b8534bSLuigi Rizzo 
859*68b8534bSLuigi Rizzo 
860*68b8534bSLuigi Rizzo 		for (i = 10; i > 0; i--) {
861*68b8534bSLuigi Rizzo 			na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
862*68b8534bSLuigi Rizzo 			if (!NETMAP_DELETING(na))
863*68b8534bSLuigi Rizzo 				break;
864*68b8534bSLuigi Rizzo 			na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
865*68b8534bSLuigi Rizzo 			tsleep(na, 0, "NIOCREGIF", hz/10);
866*68b8534bSLuigi Rizzo 		}
867*68b8534bSLuigi Rizzo 		if (i == 0) {
868*68b8534bSLuigi Rizzo 			D("too many NIOCREGIF attempts, give up");
869*68b8534bSLuigi Rizzo 			error = EINVAL;
870*68b8534bSLuigi Rizzo 			free(priv, M_DEVBUF);
871*68b8534bSLuigi Rizzo 			if_rele(ifp);	/* return the refcount */
872*68b8534bSLuigi Rizzo 			break;
873*68b8534bSLuigi Rizzo 		}
874*68b8534bSLuigi Rizzo 
875*68b8534bSLuigi Rizzo 		priv->np_ifp = ifp;	/* store the reference */
876*68b8534bSLuigi Rizzo 		error = netmap_set_ringid(priv, nmr->nr_ringid);
877*68b8534bSLuigi Rizzo 		if (error)
878*68b8534bSLuigi Rizzo 			goto error;
879*68b8534bSLuigi Rizzo 		priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na);
880*68b8534bSLuigi Rizzo 		if (nifp == NULL) { /* allocation failed */
881*68b8534bSLuigi Rizzo 			error = ENOMEM;
882*68b8534bSLuigi Rizzo 		} else if (ifp->if_capenable & IFCAP_NETMAP) {
883*68b8534bSLuigi Rizzo 			/* was already set */
884*68b8534bSLuigi Rizzo 		} else {
885*68b8534bSLuigi Rizzo 			/* Otherwise set the card in netmap mode
886*68b8534bSLuigi Rizzo 			 * and make it use the shared buffers.
887*68b8534bSLuigi Rizzo 			 */
888*68b8534bSLuigi Rizzo 			error = na->nm_register(ifp, 1); /* mode on */
889*68b8534bSLuigi Rizzo 			if (error) {
890*68b8534bSLuigi Rizzo 				/*
891*68b8534bSLuigi Rizzo 				 * do something similar to netmap_dtor().
892*68b8534bSLuigi Rizzo 				 */
893*68b8534bSLuigi Rizzo 				netmap_free(na->tx_rings[0].ring, "rings, reg.failed");
894*68b8534bSLuigi Rizzo 				free(na->tx_rings, M_DEVBUF);
895*68b8534bSLuigi Rizzo 				na->tx_rings = na->rx_rings = NULL;
896*68b8534bSLuigi Rizzo 				na->refcount--;
897*68b8534bSLuigi Rizzo 				netmap_free(nifp, "nifp, rings failed");
898*68b8534bSLuigi Rizzo 				nifp = NULL;
899*68b8534bSLuigi Rizzo 			}
900*68b8534bSLuigi Rizzo 		}
901*68b8534bSLuigi Rizzo 
902*68b8534bSLuigi Rizzo 		if (error) {	/* reg. failed, release priv and ref */
903*68b8534bSLuigi Rizzo error:
904*68b8534bSLuigi Rizzo 			na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
905*68b8534bSLuigi Rizzo 			free(priv, M_DEVBUF);
906*68b8534bSLuigi Rizzo 			if_rele(ifp);	/* return the refcount */
907*68b8534bSLuigi Rizzo 			break;
908*68b8534bSLuigi Rizzo 		}
909*68b8534bSLuigi Rizzo 
910*68b8534bSLuigi Rizzo 		na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
911*68b8534bSLuigi Rizzo 		error = devfs_set_cdevpriv(priv, netmap_dtor);
912*68b8534bSLuigi Rizzo 
913*68b8534bSLuigi Rizzo 		if (error != 0) {
914*68b8534bSLuigi Rizzo 			/* could not assign the private storage for the
915*68b8534bSLuigi Rizzo 			 * thread, call the destructor explicitly.
916*68b8534bSLuigi Rizzo 			 */
917*68b8534bSLuigi Rizzo 			netmap_dtor(priv);
918*68b8534bSLuigi Rizzo 			break;
919*68b8534bSLuigi Rizzo 		}
920*68b8534bSLuigi Rizzo 
921*68b8534bSLuigi Rizzo 		/* return the offset of the netmap_if object */
922*68b8534bSLuigi Rizzo 		nmr->nr_numrings = na->num_queues;
923*68b8534bSLuigi Rizzo 		nmr->nr_numslots = na->num_tx_desc;
924*68b8534bSLuigi Rizzo 		nmr->nr_memsize = netmap_mem_d->nm_totalsize;
925*68b8534bSLuigi Rizzo 		nmr->nr_offset =
926*68b8534bSLuigi Rizzo 			((char *) nifp - (char *) netmap_mem_d->nm_buffer);
927*68b8534bSLuigi Rizzo 		break;
928*68b8534bSLuigi Rizzo 
929*68b8534bSLuigi Rizzo 	case NIOCUNREGIF:
930*68b8534bSLuigi Rizzo 		if (priv == NULL)
931*68b8534bSLuigi Rizzo 			return (ENXIO);
932*68b8534bSLuigi Rizzo 
933*68b8534bSLuigi Rizzo 		/* the interface is unregistered inside the
934*68b8534bSLuigi Rizzo 		   destructor of the private data. */
935*68b8534bSLuigi Rizzo 		devfs_clear_cdevpriv();
936*68b8534bSLuigi Rizzo 		break;
937*68b8534bSLuigi Rizzo 
938*68b8534bSLuigi Rizzo 	case NIOCTXSYNC:
939*68b8534bSLuigi Rizzo         case NIOCRXSYNC:
940*68b8534bSLuigi Rizzo 		if (priv == NULL)
941*68b8534bSLuigi Rizzo 			return (ENXIO);
942*68b8534bSLuigi Rizzo 		ifp = priv->np_ifp;	/* we have a reference */
943*68b8534bSLuigi Rizzo 		na = NA(ifp); /* retrieve netmap adapter */
944*68b8534bSLuigi Rizzo 		adapter = ifp->if_softc;	/* shorthand */
945*68b8534bSLuigi Rizzo 
946*68b8534bSLuigi Rizzo 		if (na->flags & NR_REINIT)
947*68b8534bSLuigi Rizzo 			netmap_clean_reinit(na);
948*68b8534bSLuigi Rizzo 
949*68b8534bSLuigi Rizzo 		if (priv->np_qfirst == na->num_queues) {
950*68b8534bSLuigi Rizzo 			/* queues to/from host */
951*68b8534bSLuigi Rizzo 			if (cmd == NIOCTXSYNC)
952*68b8534bSLuigi Rizzo 				netmap_sync_to_host(na);
953*68b8534bSLuigi Rizzo 			else
954*68b8534bSLuigi Rizzo 				netmap_sync_from_host(na, NULL);
955*68b8534bSLuigi Rizzo 			return error;
956*68b8534bSLuigi Rizzo 		}
957*68b8534bSLuigi Rizzo 
958*68b8534bSLuigi Rizzo 		for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
959*68b8534bSLuigi Rizzo 		    if (cmd == NIOCTXSYNC) {
960*68b8534bSLuigi Rizzo 			struct netmap_kring *kring = &na->tx_rings[i];
961*68b8534bSLuigi Rizzo 			if (netmap_verbose & NM_VERB_TXSYNC)
962*68b8534bSLuigi Rizzo 				D("sync tx ring %d cur %d hwcur %d",
963*68b8534bSLuigi Rizzo 					i, kring->ring->cur,
964*68b8534bSLuigi Rizzo 					kring->nr_hwcur);
965*68b8534bSLuigi Rizzo                         na->nm_txsync(adapter, i, 1 /* do lock */);
966*68b8534bSLuigi Rizzo 			if (netmap_verbose & NM_VERB_TXSYNC)
967*68b8534bSLuigi Rizzo 				D("after sync tx ring %d cur %d hwcur %d",
968*68b8534bSLuigi Rizzo 					i, kring->ring->cur,
969*68b8534bSLuigi Rizzo 					kring->nr_hwcur);
970*68b8534bSLuigi Rizzo 		    } else {
971*68b8534bSLuigi Rizzo 			na->nm_rxsync(adapter, i, 1 /* do lock */);
972*68b8534bSLuigi Rizzo 			microtime(&na->rx_rings[i].ring->ts);
973*68b8534bSLuigi Rizzo 		    }
974*68b8534bSLuigi Rizzo 		}
975*68b8534bSLuigi Rizzo 
976*68b8534bSLuigi Rizzo                 break;
977*68b8534bSLuigi Rizzo 
978*68b8534bSLuigi Rizzo 	case BIOCIMMEDIATE:
979*68b8534bSLuigi Rizzo 	case BIOCGHDRCMPLT:
980*68b8534bSLuigi Rizzo 	case BIOCSHDRCMPLT:
981*68b8534bSLuigi Rizzo 	case BIOCSSEESENT:
982*68b8534bSLuigi Rizzo 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
983*68b8534bSLuigi Rizzo 		break;
984*68b8534bSLuigi Rizzo 
985*68b8534bSLuigi Rizzo 	default:
986*68b8534bSLuigi Rizzo 	    {
987*68b8534bSLuigi Rizzo 		/*
988*68b8534bSLuigi Rizzo 		 * allow device calls
989*68b8534bSLuigi Rizzo 		 */
990*68b8534bSLuigi Rizzo 		struct socket so;
991*68b8534bSLuigi Rizzo 		bzero(&so, sizeof(so));
992*68b8534bSLuigi Rizzo 		error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
993*68b8534bSLuigi Rizzo 		if (error)
994*68b8534bSLuigi Rizzo 			break;
995*68b8534bSLuigi Rizzo 		so.so_vnet = ifp->if_vnet;
996*68b8534bSLuigi Rizzo 		// so->so_proto not null.
997*68b8534bSLuigi Rizzo 		error = ifioctl(&so, cmd, data, td);
998*68b8534bSLuigi Rizzo 		if_rele(ifp);
999*68b8534bSLuigi Rizzo 	    }
1000*68b8534bSLuigi Rizzo 	}
1001*68b8534bSLuigi Rizzo 
1002*68b8534bSLuigi Rizzo 	return (error);
1003*68b8534bSLuigi Rizzo }
1004*68b8534bSLuigi Rizzo 
1005*68b8534bSLuigi Rizzo 
1006*68b8534bSLuigi Rizzo /*
1007*68b8534bSLuigi Rizzo  * select(2) and poll(2) handlers for the "netmap" device.
1008*68b8534bSLuigi Rizzo  *
1009*68b8534bSLuigi Rizzo  * Can be called for one or more queues.
1010*68b8534bSLuigi Rizzo  * Return the event mask corresponding to ready events.
1011*68b8534bSLuigi Rizzo  * If there are no ready events, do a selrecord on either individual
1012*68b8534bSLuigi Rizzo  * selfd or on the global one.
1013*68b8534bSLuigi Rizzo  * Device-dependent parts (locking and sync of tx/rx rings)
1014*68b8534bSLuigi Rizzo  * are done through callbacks.
1015*68b8534bSLuigi Rizzo  */
1016*68b8534bSLuigi Rizzo static int
1017*68b8534bSLuigi Rizzo netmap_poll(__unused struct cdev *dev, int events, struct thread *td)
1018*68b8534bSLuigi Rizzo {
1019*68b8534bSLuigi Rizzo 	struct netmap_priv_d *priv = NULL;
1020*68b8534bSLuigi Rizzo 	struct netmap_adapter *na;
1021*68b8534bSLuigi Rizzo 	struct ifnet *ifp;
1022*68b8534bSLuigi Rizzo 	struct netmap_kring *kring;
1023*68b8534bSLuigi Rizzo 	u_int i, check_all, want_tx, want_rx, revents = 0;
1024*68b8534bSLuigi Rizzo 	void *adapter;
1025*68b8534bSLuigi Rizzo 
1026*68b8534bSLuigi Rizzo 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1027*68b8534bSLuigi Rizzo 		return POLLERR;
1028*68b8534bSLuigi Rizzo 
1029*68b8534bSLuigi Rizzo 	ifp = priv->np_ifp;
1030*68b8534bSLuigi Rizzo 	// XXX check for deleting() ?
1031*68b8534bSLuigi Rizzo 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1032*68b8534bSLuigi Rizzo 		return POLLERR;
1033*68b8534bSLuigi Rizzo 
1034*68b8534bSLuigi Rizzo 	if (netmap_verbose & 0x8000)
1035*68b8534bSLuigi Rizzo 		D("device %s events 0x%x", ifp->if_xname, events);
1036*68b8534bSLuigi Rizzo 	want_tx = events & (POLLOUT | POLLWRNORM);
1037*68b8534bSLuigi Rizzo 	want_rx = events & (POLLIN | POLLRDNORM);
1038*68b8534bSLuigi Rizzo 
1039*68b8534bSLuigi Rizzo 	adapter = ifp->if_softc;
1040*68b8534bSLuigi Rizzo 	na = NA(ifp); /* retrieve netmap adapter */
1041*68b8534bSLuigi Rizzo 
1042*68b8534bSLuigi Rizzo 	/* pending reinit, report up as a poll error. Pending
1043*68b8534bSLuigi Rizzo 	 * reads and writes are lost.
1044*68b8534bSLuigi Rizzo 	 */
1045*68b8534bSLuigi Rizzo 	if (na->flags & NR_REINIT) {
1046*68b8534bSLuigi Rizzo 		netmap_clean_reinit(na);
1047*68b8534bSLuigi Rizzo 		revents |= POLLERR;
1048*68b8534bSLuigi Rizzo 	}
1049*68b8534bSLuigi Rizzo 	/* how many queues we are scanning */
1050*68b8534bSLuigi Rizzo 	i = priv->np_qfirst;
1051*68b8534bSLuigi Rizzo 	if (i == na->num_queues) { /* from/to host */
1052*68b8534bSLuigi Rizzo 		if (priv->np_txpoll || want_tx) {
1053*68b8534bSLuigi Rizzo 			/* push any packets up, then we are always ready */
1054*68b8534bSLuigi Rizzo 			kring = &na->tx_rings[i];
1055*68b8534bSLuigi Rizzo 			netmap_sync_to_host(na);
1056*68b8534bSLuigi Rizzo 			revents |= want_tx;
1057*68b8534bSLuigi Rizzo 		}
1058*68b8534bSLuigi Rizzo 		if (want_rx) {
1059*68b8534bSLuigi Rizzo 			kring = &na->rx_rings[i];
1060*68b8534bSLuigi Rizzo 			if (kring->ring->avail == 0)
1061*68b8534bSLuigi Rizzo 				netmap_sync_from_host(na, td);
1062*68b8534bSLuigi Rizzo 			if (kring->ring->avail > 0) {
1063*68b8534bSLuigi Rizzo 				revents |= want_rx;
1064*68b8534bSLuigi Rizzo 			}
1065*68b8534bSLuigi Rizzo 		}
1066*68b8534bSLuigi Rizzo 		return (revents);
1067*68b8534bSLuigi Rizzo 	}
1068*68b8534bSLuigi Rizzo 
1069*68b8534bSLuigi Rizzo 	/*
1070*68b8534bSLuigi Rizzo 	 * check_all is set if the card has more than one queue and
1071*68b8534bSLuigi Rizzo 	 * the client is polling all of them. If true, we sleep on
1072*68b8534bSLuigi Rizzo 	 * the "global" selfd, otherwise we sleep on individual selfd
1073*68b8534bSLuigi Rizzo 	 * (we can only sleep on one of them per direction).
1074*68b8534bSLuigi Rizzo 	 * The interrupt routine in the driver should always wake on
1075*68b8534bSLuigi Rizzo 	 * the individual selfd, and also on the global one if the card
1076*68b8534bSLuigi Rizzo 	 * has more than one ring.
1077*68b8534bSLuigi Rizzo 	 *
1078*68b8534bSLuigi Rizzo 	 * If the card has only one lock, we just use that.
1079*68b8534bSLuigi Rizzo 	 * If the card has separate ring locks, we just use those
1080*68b8534bSLuigi Rizzo 	 * unless we are doing check_all, in which case the whole
1081*68b8534bSLuigi Rizzo 	 * loop is wrapped by the global lock.
1082*68b8534bSLuigi Rizzo 	 * We acquire locks only when necessary: if poll is called
1083*68b8534bSLuigi Rizzo 	 * when buffers are available, we can just return without locks.
1084*68b8534bSLuigi Rizzo 	 *
1085*68b8534bSLuigi Rizzo 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1086*68b8534bSLuigi Rizzo 	 * txsync() is called if we run out of buffers on POLLOUT, or
1087*68b8534bSLuigi Rizzo 	 * there are pending packets to send. The latter can be disabled
	 * by passing NETMAP_NO_TX_POLL in the NIOCREG call.
1089*68b8534bSLuigi Rizzo 	 */
1090*68b8534bSLuigi Rizzo 	check_all = (i + 1 != priv->np_qlast);
1091*68b8534bSLuigi Rizzo 
1092*68b8534bSLuigi Rizzo 	/*
1093*68b8534bSLuigi Rizzo 	 * core_lock indicates what to do with the core lock.
1094*68b8534bSLuigi Rizzo 	 * The core lock is used when either the card has no individual
	 * locks, or it has individual locks but we are checking all
1096*68b8534bSLuigi Rizzo 	 * rings so we need the core lock to avoid missing wakeup events.
1097*68b8534bSLuigi Rizzo 	 *
1098*68b8534bSLuigi Rizzo 	 * It has three possible states:
1099*68b8534bSLuigi Rizzo 	 * NO_CL	we don't need to use the core lock, e.g.
1100*68b8534bSLuigi Rizzo 	 *		because we are protected by individual locks.
1101*68b8534bSLuigi Rizzo 	 * NEED_CL	we need the core lock. In this case, when we
1102*68b8534bSLuigi Rizzo 	 *		call the lock routine, move to LOCKED_CL
1103*68b8534bSLuigi Rizzo 	 *		to remember to release the lock once done.
1104*68b8534bSLuigi Rizzo 	 * LOCKED_CL	core lock is set, so we need to release it.
1105*68b8534bSLuigi Rizzo 	 */
1106*68b8534bSLuigi Rizzo 	enum {NO_CL, NEED_CL, LOCKED_CL };
1107*68b8534bSLuigi Rizzo 	int core_lock = (check_all || !na->separate_locks) ?
1108*68b8534bSLuigi Rizzo 			NEED_CL:NO_CL;
1109*68b8534bSLuigi Rizzo 	/*
1110*68b8534bSLuigi Rizzo 	 * We start with a lock free round which is good if we have
1111*68b8534bSLuigi Rizzo 	 * data available. If this fails, then lock and call the sync
1112*68b8534bSLuigi Rizzo 	 * routines.
1113*68b8534bSLuigi Rizzo 	 */
1114*68b8534bSLuigi Rizzo 	for (i = priv->np_qfirst; want_rx && i < priv->np_qlast; i++) {
1115*68b8534bSLuigi Rizzo 		kring = &na->rx_rings[i];
1116*68b8534bSLuigi Rizzo 		if (kring->ring->avail > 0) {
1117*68b8534bSLuigi Rizzo 			revents |= want_rx;
1118*68b8534bSLuigi Rizzo 			want_rx = 0;	/* also breaks the loop */
1119*68b8534bSLuigi Rizzo 		}
1120*68b8534bSLuigi Rizzo 	}
1121*68b8534bSLuigi Rizzo 	for (i = priv->np_qfirst; want_tx && i < priv->np_qlast; i++) {
1122*68b8534bSLuigi Rizzo 		kring = &na->tx_rings[i];
1123*68b8534bSLuigi Rizzo 		if (kring->ring->avail > 0) {
1124*68b8534bSLuigi Rizzo 			revents |= want_tx;
1125*68b8534bSLuigi Rizzo 			want_tx = 0;	/* also breaks the loop */
1126*68b8534bSLuigi Rizzo 		}
1127*68b8534bSLuigi Rizzo 	}
1128*68b8534bSLuigi Rizzo 
1129*68b8534bSLuigi Rizzo 	/*
	 * If we want to push packets out (priv->np_txpoll) or want_tx is
	 * still set, we do need to run the txsync calls (on all rings,
	 * so that the tx rings do not stall).
1133*68b8534bSLuigi Rizzo 	 */
1134*68b8534bSLuigi Rizzo 	if (priv->np_txpoll || want_tx) {
1135*68b8534bSLuigi Rizzo 		for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
1136*68b8534bSLuigi Rizzo 			kring = &na->tx_rings[i];
1137*68b8534bSLuigi Rizzo 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1138*68b8534bSLuigi Rizzo 				continue;
1139*68b8534bSLuigi Rizzo 			if (core_lock == NEED_CL) {
1140*68b8534bSLuigi Rizzo 				na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
1141*68b8534bSLuigi Rizzo 				core_lock = LOCKED_CL;
1142*68b8534bSLuigi Rizzo 			}
1143*68b8534bSLuigi Rizzo 			if (na->separate_locks)
1144*68b8534bSLuigi Rizzo 				na->nm_lock(adapter, NETMAP_TX_LOCK, i);
1145*68b8534bSLuigi Rizzo 			if (netmap_verbose & NM_VERB_TXSYNC)
1146*68b8534bSLuigi Rizzo 				D("send %d on %s %d",
1147*68b8534bSLuigi Rizzo 					kring->ring->cur,
1148*68b8534bSLuigi Rizzo 					ifp->if_xname, i);
1149*68b8534bSLuigi Rizzo 			if (na->nm_txsync(adapter, i, 0 /* no lock */))
1150*68b8534bSLuigi Rizzo 				revents |= POLLERR;
1151*68b8534bSLuigi Rizzo 
1152*68b8534bSLuigi Rizzo 			if (want_tx) {
1153*68b8534bSLuigi Rizzo 				if (kring->ring->avail > 0) {
1154*68b8534bSLuigi Rizzo 					/* stop at the first ring. We don't risk
1155*68b8534bSLuigi Rizzo 					 * starvation.
1156*68b8534bSLuigi Rizzo 					 */
1157*68b8534bSLuigi Rizzo 					revents |= want_tx;
1158*68b8534bSLuigi Rizzo 					want_tx = 0;
1159*68b8534bSLuigi Rizzo 				} else if (!check_all)
1160*68b8534bSLuigi Rizzo 					selrecord(td, &kring->si);
1161*68b8534bSLuigi Rizzo 			}
1162*68b8534bSLuigi Rizzo 			if (na->separate_locks)
1163*68b8534bSLuigi Rizzo 				na->nm_lock(adapter, NETMAP_TX_UNLOCK, i);
1164*68b8534bSLuigi Rizzo 		}
1165*68b8534bSLuigi Rizzo 	}
1166*68b8534bSLuigi Rizzo 
1167*68b8534bSLuigi Rizzo 	/*
1168*68b8534bSLuigi Rizzo 	 * now if want_rx is still set we need to lock and rxsync.
1169*68b8534bSLuigi Rizzo 	 * Do it on all rings because otherwise we starve.
1170*68b8534bSLuigi Rizzo 	 */
1171*68b8534bSLuigi Rizzo 	if (want_rx) {
1172*68b8534bSLuigi Rizzo 		for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
1173*68b8534bSLuigi Rizzo 			kring = &na->rx_rings[i];
1174*68b8534bSLuigi Rizzo 			if (core_lock == NEED_CL) {
1175*68b8534bSLuigi Rizzo 				na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
1176*68b8534bSLuigi Rizzo 				core_lock = LOCKED_CL;
1177*68b8534bSLuigi Rizzo 			}
1178*68b8534bSLuigi Rizzo 			if (na->separate_locks)
1179*68b8534bSLuigi Rizzo 				na->nm_lock(adapter, NETMAP_RX_LOCK, i);
1180*68b8534bSLuigi Rizzo 
1181*68b8534bSLuigi Rizzo 			if (na->nm_rxsync(adapter, i, 0 /* no lock */))
1182*68b8534bSLuigi Rizzo 				revents |= POLLERR;
1183*68b8534bSLuigi Rizzo 			if (no_timestamp == 0 ||
1184*68b8534bSLuigi Rizzo 					kring->ring->flags & NR_TIMESTAMP)
1185*68b8534bSLuigi Rizzo 				microtime(&kring->ring->ts);
1186*68b8534bSLuigi Rizzo 
1187*68b8534bSLuigi Rizzo 			if (kring->ring->avail > 0)
1188*68b8534bSLuigi Rizzo 				revents |= want_rx;
1189*68b8534bSLuigi Rizzo 			else if (!check_all)
1190*68b8534bSLuigi Rizzo 				selrecord(td, &kring->si);
1191*68b8534bSLuigi Rizzo 			if (na->separate_locks)
1192*68b8534bSLuigi Rizzo 				na->nm_lock(adapter, NETMAP_RX_UNLOCK, i);
1193*68b8534bSLuigi Rizzo 		}
1194*68b8534bSLuigi Rizzo 	}
1195*68b8534bSLuigi Rizzo 	if (check_all && revents == 0) {
1196*68b8534bSLuigi Rizzo 		i = na->num_queues + 1; /* the global queue */
1197*68b8534bSLuigi Rizzo 		if (want_tx)
1198*68b8534bSLuigi Rizzo 			selrecord(td, &na->tx_rings[i].si);
1199*68b8534bSLuigi Rizzo 		if (want_rx)
1200*68b8534bSLuigi Rizzo 			selrecord(td, &na->rx_rings[i].si);
1201*68b8534bSLuigi Rizzo 	}
1202*68b8534bSLuigi Rizzo 	if (core_lock == LOCKED_CL)
1203*68b8534bSLuigi Rizzo 		na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
1204*68b8534bSLuigi Rizzo 
1205*68b8534bSLuigi Rizzo 	return (revents);
1206*68b8534bSLuigi Rizzo }
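
/*
 * Editorial usage sketch (not part of this file's API): a userspace
 * client typically reaches the handler above through poll(2) on a
 * /dev/netmap descriptor already bound to an interface (see netmap(4));
 * "nmfd" below is such a descriptor and the ring manipulation is
 * omitted.
 *
 *	struct pollfd pfd = { .fd = nmfd, .events = POLLIN | POLLOUT };
 *
 *	for (;;) {
 *		if (poll(&pfd, 1, 1000) <= 0)
 *			continue;
 *		if (pfd.revents & POLLIN)
 *			;	// consume slots from the rx ring(s)
 *		if (pfd.revents & POLLOUT)
 *			;	// fill slots in the tx ring(s)
 *	}
 */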
1207*68b8534bSLuigi Rizzo 
1208*68b8534bSLuigi Rizzo /*------- driver support routines ------*/
1209*68b8534bSLuigi Rizzo 
1210*68b8534bSLuigi Rizzo /*
 * Initialize a ``netmap_adapter`` object created by the driver on attach.
1212*68b8534bSLuigi Rizzo  * We allocate a block of memory with room for a struct netmap_adapter
1213*68b8534bSLuigi Rizzo  * plus two sets of N+2 struct netmap_kring (where N is the number
1214*68b8534bSLuigi Rizzo  * of hardware rings):
1215*68b8534bSLuigi Rizzo  * krings	0..N-1	are for the hardware queues.
 * kring	N	is for the host stack queue.
1217*68b8534bSLuigi Rizzo  * kring	N+1	is only used for the selinfo for all queues.
1218*68b8534bSLuigi Rizzo  * Return 0 on success, ENOMEM otherwise.
1219*68b8534bSLuigi Rizzo  */
1220*68b8534bSLuigi Rizzo int
1221*68b8534bSLuigi Rizzo netmap_attach(struct netmap_adapter *na, int num_queues)
1222*68b8534bSLuigi Rizzo {
1223*68b8534bSLuigi Rizzo 	int n = num_queues + 2;
1224*68b8534bSLuigi Rizzo 	int size = sizeof(*na) + 2 * n * sizeof(struct netmap_kring);
1225*68b8534bSLuigi Rizzo 	void *buf;
1226*68b8534bSLuigi Rizzo 	struct ifnet *ifp = na->ifp;
1227*68b8534bSLuigi Rizzo 
1228*68b8534bSLuigi Rizzo 	if (ifp == NULL) {
1229*68b8534bSLuigi Rizzo 		D("ifp not set, giving up");
1230*68b8534bSLuigi Rizzo 		return EINVAL;
1231*68b8534bSLuigi Rizzo 	}
1232*68b8534bSLuigi Rizzo 	na->refcount = 0;
1233*68b8534bSLuigi Rizzo 	na->num_queues = num_queues;
1234*68b8534bSLuigi Rizzo 
1235*68b8534bSLuigi Rizzo 	buf = malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO);
1236*68b8534bSLuigi Rizzo 	if (buf) {
1237*68b8534bSLuigi Rizzo 		ifp->if_pspare[0] = buf;
1238*68b8534bSLuigi Rizzo 		na->tx_rings = (void *)((char *)buf + sizeof(*na));
1239*68b8534bSLuigi Rizzo 		na->rx_rings = na->tx_rings + n;
1240*68b8534bSLuigi Rizzo 		bcopy(na, buf, sizeof(*na));
1241*68b8534bSLuigi Rizzo 		ifp->if_capabilities |= IFCAP_NETMAP;
1242*68b8534bSLuigi Rizzo 	}
1243*68b8534bSLuigi Rizzo 	D("%s for %s", buf ? "ok" : "failed", ifp->if_xname);
1244*68b8534bSLuigi Rizzo 
1245*68b8534bSLuigi Rizzo 	return (buf ? 0 : ENOMEM);
1246*68b8534bSLuigi Rizzo }
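
/*
 * Editorial sketch of the driver side: field names follow struct
 * netmap_adapter as used by the in-tree drivers of this version; the
 * "sc" softc and the foo_* callbacks are hypothetical, and other
 * fields (e.g. descriptor counts) are omitted. At the end of its own
 * attach routine a netmap-aware driver fills a netmap_adapter and
 * calls netmap_attach(), e.g.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.separate_locks = 0;
 *	na.buff_size = NETMAP_BUF_SIZE;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_lock = foo_netmap_lock;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na, 1);	// one tx/rx hardware queue
 */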
1247*68b8534bSLuigi Rizzo 
1248*68b8534bSLuigi Rizzo 
1249*68b8534bSLuigi Rizzo /*
1250*68b8534bSLuigi Rizzo  * Free the allocated memory linked to the given ``netmap_adapter``
1251*68b8534bSLuigi Rizzo  * object.
1252*68b8534bSLuigi Rizzo  */
1253*68b8534bSLuigi Rizzo void
1254*68b8534bSLuigi Rizzo netmap_detach(struct ifnet *ifp)
1255*68b8534bSLuigi Rizzo {
1256*68b8534bSLuigi Rizzo 	u_int i;
1257*68b8534bSLuigi Rizzo 	struct netmap_adapter *na = NA(ifp);
1258*68b8534bSLuigi Rizzo 
1259*68b8534bSLuigi Rizzo 	if (!na)
1260*68b8534bSLuigi Rizzo 		return;
1261*68b8534bSLuigi Rizzo 
1262*68b8534bSLuigi Rizzo 	for (i = 0; i < na->num_queues + 2; i++) {
1263*68b8534bSLuigi Rizzo 		knlist_destroy(&na->tx_rings[i].si.si_note);
1264*68b8534bSLuigi Rizzo 		knlist_destroy(&na->rx_rings[i].si.si_note);
1265*68b8534bSLuigi Rizzo 	}
1266*68b8534bSLuigi Rizzo 	bzero(na, sizeof(*na));
1267*68b8534bSLuigi Rizzo 	ifp->if_pspare[0] = NULL;
1268*68b8534bSLuigi Rizzo 	free(na, M_DEVBUF);
1269*68b8534bSLuigi Rizzo }
1270*68b8534bSLuigi Rizzo 
1271*68b8534bSLuigi Rizzo 
1272*68b8534bSLuigi Rizzo /*
 * Intercept packets coming from the network stack and present
 * them to netmap as incoming packets on a separate ring.
 * No lock is held when we are called.
1276*68b8534bSLuigi Rizzo  */
1277*68b8534bSLuigi Rizzo int
1278*68b8534bSLuigi Rizzo netmap_start(struct ifnet *ifp, struct mbuf *m)
1279*68b8534bSLuigi Rizzo {
1280*68b8534bSLuigi Rizzo 	struct netmap_adapter *na = NA(ifp);
1281*68b8534bSLuigi Rizzo 	u_int i, len, n = na->num_queues;
1282*68b8534bSLuigi Rizzo 	int error = EBUSY;
1283*68b8534bSLuigi Rizzo 	struct netmap_kring *kring = &na->rx_rings[n];
1284*68b8534bSLuigi Rizzo 	struct netmap_slot *slot;
1285*68b8534bSLuigi Rizzo 
1286*68b8534bSLuigi Rizzo 	len = m->m_pkthdr.len;
1287*68b8534bSLuigi Rizzo 	if (netmap_verbose & NM_VERB_HOST)
1288*68b8534bSLuigi Rizzo 		D("%s packet %d len %d from the stack", ifp->if_xname,
1289*68b8534bSLuigi Rizzo 			kring->nr_hwcur + kring->nr_hwavail, len);
1290*68b8534bSLuigi Rizzo 	na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
1291*68b8534bSLuigi Rizzo 	if (kring->nr_hwavail >= (int)kring->nkr_num_slots - 1) {
1292*68b8534bSLuigi Rizzo 		D("stack ring %s full\n", ifp->if_xname);
1293*68b8534bSLuigi Rizzo 		goto done;	/* no space */
1294*68b8534bSLuigi Rizzo 	}
1295*68b8534bSLuigi Rizzo 	if (len > na->buff_size) {
1296*68b8534bSLuigi Rizzo 		D("drop packet size %d > %d", len, na->buff_size);
1297*68b8534bSLuigi Rizzo 		goto done;	/* too long for us */
1298*68b8534bSLuigi Rizzo 	}
1299*68b8534bSLuigi Rizzo 
1300*68b8534bSLuigi Rizzo 	/* compute the insert position */
1301*68b8534bSLuigi Rizzo 	i = kring->nr_hwcur + kring->nr_hwavail;
1302*68b8534bSLuigi Rizzo 	if (i >= kring->nkr_num_slots)
1303*68b8534bSLuigi Rizzo 		i -= kring->nkr_num_slots;
1304*68b8534bSLuigi Rizzo 	slot = &kring->ring->slot[i];
1305*68b8534bSLuigi Rizzo 	m_copydata(m, 0, len, NMB(slot));
1306*68b8534bSLuigi Rizzo 	slot->len = len;
1307*68b8534bSLuigi Rizzo 	kring->nr_hwavail++;
1308*68b8534bSLuigi Rizzo 	if (netmap_verbose  & NM_VERB_HOST)
1309*68b8534bSLuigi Rizzo 		D("wake up host ring %s %d", na->ifp->if_xname, na->num_queues);
1310*68b8534bSLuigi Rizzo 	selwakeuppri(&kring->si, PI_NET);
1311*68b8534bSLuigi Rizzo 	error = 0;
1312*68b8534bSLuigi Rizzo done:
1313*68b8534bSLuigi Rizzo 	na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
1314*68b8534bSLuigi Rizzo 
	/* Release the mbuf whether we succeeded or failed. As an
	 * alternative, we could put the mbuf on a free list and drain
	 * the list only when really necessary.
1318*68b8534bSLuigi Rizzo 	 */
1319*68b8534bSLuigi Rizzo 	m_freem(m);
1320*68b8534bSLuigi Rizzo 
1321*68b8534bSLuigi Rizzo 	return (error);
1322*68b8534bSLuigi Rizzo }
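
/*
 * Editorial sketch (hypothetical driver code): a driver's start
 * routine can divert stack-originated traffic to netmap_start()
 * while the interface is in netmap mode, e.g.
 *
 *	if (ifp->if_capenable & IFCAP_NETMAP) {
 *		struct mbuf *m;
 *
 *		while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
 *			IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
 *			netmap_start(ifp, m);	// always consumes the mbuf
 *		}
 *		return;
 *	}
 */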
1323*68b8534bSLuigi Rizzo 
1324*68b8534bSLuigi Rizzo 
1325*68b8534bSLuigi Rizzo /*
1326*68b8534bSLuigi Rizzo  * netmap_reset() is called by the driver routines when reinitializing
1327*68b8534bSLuigi Rizzo  * a ring. The driver is in charge of locking to protect the kring.
1328*68b8534bSLuigi Rizzo  * If netmap mode is not set just return NULL.
1329*68b8534bSLuigi Rizzo  * Otherwise set NR_REINIT (in the ring and in na) to signal
 * that a ring has been reinitialized; set cur = hwcur = new_cur,
 * and avail = hwavail = num_slots - 1 on a TX ring (0 on an RX ring).
 * IT IS IMPORTANT to leave one slot free even in the tx ring because
 * we rely on cur == hwcur only for empty rings.
1334*68b8534bSLuigi Rizzo  * These are good defaults but can be overridden later in the device
1335*68b8534bSLuigi Rizzo  * specific code if, after a reinit, the ring does not start from 0
1336*68b8534bSLuigi Rizzo  * (e.g. if_em.c does this).
1337*68b8534bSLuigi Rizzo  *
1338*68b8534bSLuigi Rizzo  * XXX we shouldn't be touching the ring, but there is a
 * race anyway and this is our best option.
1340*68b8534bSLuigi Rizzo  *
1341*68b8534bSLuigi Rizzo  * XXX setting na->flags makes the syscall code faster, as there is
1342*68b8534bSLuigi Rizzo  * only one place to check. On the other hand, we will need a better
1343*68b8534bSLuigi Rizzo  * way to notify multiple threads that rings have been reset.
1344*68b8534bSLuigi Rizzo  * One way is to increment na->rst_count at each ring reset.
1345*68b8534bSLuigi Rizzo  * Each thread in its own priv structure will keep a matching counter,
1346*68b8534bSLuigi Rizzo  * and on a reset will acknowledge and clean its own rings.
1347*68b8534bSLuigi Rizzo  */
1348*68b8534bSLuigi Rizzo struct netmap_slot *
1349*68b8534bSLuigi Rizzo netmap_reset(struct netmap_adapter *na, enum txrx tx, int n,
1350*68b8534bSLuigi Rizzo 	u_int new_cur)
1351*68b8534bSLuigi Rizzo {
1352*68b8534bSLuigi Rizzo 	struct netmap_kring *kring;
1353*68b8534bSLuigi Rizzo 	struct netmap_ring *ring;
1354*68b8534bSLuigi Rizzo 	struct netmap_slot *slot;
1355*68b8534bSLuigi Rizzo 	u_int i;
1356*68b8534bSLuigi Rizzo 
1357*68b8534bSLuigi Rizzo 	if (na == NULL)
1358*68b8534bSLuigi Rizzo 		return NULL;	/* no netmap support here */
1359*68b8534bSLuigi Rizzo 	if (!(na->ifp->if_capenable & IFCAP_NETMAP))
1360*68b8534bSLuigi Rizzo 		return NULL;	/* nothing to reinitialize */
1361*68b8534bSLuigi Rizzo 	kring = tx == NR_TX ?  na->tx_rings + n : na->rx_rings + n;
1362*68b8534bSLuigi Rizzo 	ring = kring->ring;
1363*68b8534bSLuigi Rizzo     if (tx == NR_TX) {
1364*68b8534bSLuigi Rizzo 	/*
1365*68b8534bSLuigi Rizzo 	 * The last argument is the new value of next_to_clean.
1366*68b8534bSLuigi Rizzo 	 *
1367*68b8534bSLuigi Rizzo 	 * In the TX ring, we have P pending transmissions (from
1368*68b8534bSLuigi Rizzo 	 * next_to_clean to nr_hwcur) followed by nr_hwavail free slots.
1369*68b8534bSLuigi Rizzo 	 * Generally we can use all the slots in the ring so
1370*68b8534bSLuigi Rizzo 	 * P = ring_size - nr_hwavail hence (modulo ring_size):
1371*68b8534bSLuigi Rizzo 	 *	next_to_clean == nr_hwcur + nr_hwavail
1372*68b8534bSLuigi Rizzo 	 *
	 * If, upon a reset, nr_hwavail == ring_size - 1 and next_to_clean
	 * does not change, we have nothing to report. Otherwise some
	 * pending or newly injected packets may be lost.
1376*68b8534bSLuigi Rizzo 	 */
1377*68b8534bSLuigi Rizzo 	/* if hwcur does not change, nothing to report.
1378*68b8534bSLuigi Rizzo 	 * otherwise remember the change so perhaps we can
1379*68b8534bSLuigi Rizzo 	 * shift the block at the next reinit
1380*68b8534bSLuigi Rizzo 	 */
1381*68b8534bSLuigi Rizzo 	if (new_cur == kring->nr_hwcur &&
1382*68b8534bSLuigi Rizzo 		    kring->nr_hwavail == kring->nkr_num_slots - 1) {
1383*68b8534bSLuigi Rizzo 		/* all ok */
1384*68b8534bSLuigi Rizzo 		D("+++ NR_REINIT ok on %s TX[%d]", na->ifp->if_xname, n);
1385*68b8534bSLuigi Rizzo 	} else {
1386*68b8534bSLuigi Rizzo 		D("+++ NR_REINIT set on %s TX[%d]", na->ifp->if_xname, n);
1387*68b8534bSLuigi Rizzo 	}
	ring->flags |= NR_REINIT;
	na->flags |= NR_REINIT;
	ring->avail = kring->nr_hwavail = kring->nkr_num_slots - 1;
	ring->cur = kring->nr_hwcur = new_cur;
1392*68b8534bSLuigi Rizzo     } else {
1393*68b8534bSLuigi Rizzo 	/*
1394*68b8534bSLuigi Rizzo 	 * The last argument is the next free slot.
1395*68b8534bSLuigi Rizzo 	 * In the RX ring we have nr_hwavail full buffers starting
1396*68b8534bSLuigi Rizzo 	 * from nr_hwcur.
1397*68b8534bSLuigi Rizzo 	 * If nr_hwavail == 0 and nr_hwcur does not change we are ok
1398*68b8534bSLuigi Rizzo 	 * otherwise we might be in trouble as the buffers are
1399*68b8534bSLuigi Rizzo 	 * changing.
1400*68b8534bSLuigi Rizzo 	 */
1401*68b8534bSLuigi Rizzo 	if (new_cur == kring->nr_hwcur && kring->nr_hwavail == 0) {
1402*68b8534bSLuigi Rizzo 		/* all ok */
1403*68b8534bSLuigi Rizzo 		D("+++ NR_REINIT ok on %s RX[%d]", na->ifp->if_xname, n);
1404*68b8534bSLuigi Rizzo 	} else {
1405*68b8534bSLuigi Rizzo 		D("+++ NR_REINIT set on %s RX[%d]", na->ifp->if_xname, n);
1406*68b8534bSLuigi Rizzo 	}
1407*68b8534bSLuigi Rizzo 	ring->flags |= NR_REINIT;
1408*68b8534bSLuigi Rizzo 	na->flags |= NR_REINIT;
1409*68b8534bSLuigi Rizzo 	ring->avail = kring->nr_hwavail = 0; /* no data */
1410*68b8534bSLuigi Rizzo 	ring->cur = kring->nr_hwcur = new_cur;
1411*68b8534bSLuigi Rizzo     }
1412*68b8534bSLuigi Rizzo 
1413*68b8534bSLuigi Rizzo 	slot = ring->slot;
1414*68b8534bSLuigi Rizzo 	/*
1415*68b8534bSLuigi Rizzo 	 * Check that buffer indexes are correct. If we find a
1416*68b8534bSLuigi Rizzo 	 * bogus value we are a bit in trouble because we cannot
1417*68b8534bSLuigi Rizzo 	 * recover easily. Best we can do is (probably) persistently
1418*68b8534bSLuigi Rizzo 	 * reset the ring.
1419*68b8534bSLuigi Rizzo 	 */
1420*68b8534bSLuigi Rizzo 	for (i = 0; i < kring->nkr_num_slots; i++) {
1421*68b8534bSLuigi Rizzo 		if (slot[i].buf_idx >= netmap_total_buffers) {
1422*68b8534bSLuigi Rizzo 			D("invalid buf_idx %d at slot %d", slot[i].buf_idx, i);
1423*68b8534bSLuigi Rizzo 			slot[i].buf_idx = 0; /* XXX reset */
1424*68b8534bSLuigi Rizzo 		}
1425*68b8534bSLuigi Rizzo 		/* XXX we don't really need to set the length */
1426*68b8534bSLuigi Rizzo 		slot[i].len = 0;
1427*68b8534bSLuigi Rizzo 	}
1428*68b8534bSLuigi Rizzo 	/* wakeup possible waiters, both on the ring and on the global
1429*68b8534bSLuigi Rizzo 	 * selfd. Perhaps a bit early now but the device specific
1430*68b8534bSLuigi Rizzo 	 * routine is locked so hopefully we won't have a race.
1431*68b8534bSLuigi Rizzo 	 */
1432*68b8534bSLuigi Rizzo 	selwakeuppri(&kring->si, PI_NET);
1433*68b8534bSLuigi Rizzo 	selwakeuppri(&kring[na->num_queues + 1 - n].si, PI_NET);
1434*68b8534bSLuigi Rizzo 	return kring->ring->slot;
1435*68b8534bSLuigi Rizzo }
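
/*
 * Editorial sketch (hypothetical driver code; ring_nr, lim and the
 * descriptor setup are placeholders): a driver's TX ring init path
 * calls netmap_reset() and, if the interface is in netmap mode,
 * points the hardware descriptors at the netmap buffers, e.g.
 *
 *	struct netmap_adapter *na = NA(ifp);
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *
 *	if (slot != NULL) {	// interface is in netmap mode
 *		for (j = 0; j < lim; j++) {
 *			void *addr = NMB(slot + j);
 *			// ... point descriptor j at addr ...
 *		}
 *	}
 */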
1436*68b8534bSLuigi Rizzo 
1437*68b8534bSLuigi Rizzo static void
1438*68b8534bSLuigi Rizzo ns_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs,
1439*68b8534bSLuigi Rizzo 	__unused int nseg, __unused int error)
1440*68b8534bSLuigi Rizzo {
1441*68b8534bSLuigi Rizzo }
1442*68b8534bSLuigi Rizzo 
/* Unload a bus_dmamap and reload it with a new buffer. Used when the
1444*68b8534bSLuigi Rizzo  * buffer in the slot is changed.
1445*68b8534bSLuigi Rizzo  * XXX buflen is probably not needed, buffers have constant size.
1446*68b8534bSLuigi Rizzo  */
1447*68b8534bSLuigi Rizzo void
1448*68b8534bSLuigi Rizzo netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map,
1449*68b8534bSLuigi Rizzo 	void *buf, bus_size_t buflen)
1450*68b8534bSLuigi Rizzo {
1451*68b8534bSLuigi Rizzo 	bus_addr_t paddr;
1452*68b8534bSLuigi Rizzo 	bus_dmamap_unload(tag, map);
1453*68b8534bSLuigi Rizzo 	bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr,
1454*68b8534bSLuigi Rizzo 				BUS_DMA_NOWAIT);
1455*68b8534bSLuigi Rizzo }
1456*68b8534bSLuigi Rizzo 
1457*68b8534bSLuigi Rizzo void
1458*68b8534bSLuigi Rizzo netmap_load_map(bus_dma_tag_t tag, bus_dmamap_t map,
1459*68b8534bSLuigi Rizzo 	void *buf, bus_size_t buflen)
1460*68b8534bSLuigi Rizzo {
1461*68b8534bSLuigi Rizzo 	bus_addr_t paddr;
1462*68b8534bSLuigi Rizzo 	bus_dmamap_load(tag, map, buf, buflen, ns_dmamap_cb, &paddr,
1463*68b8534bSLuigi Rizzo 				BUS_DMA_NOWAIT);
1464*68b8534bSLuigi Rizzo }
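
/*
 * Editorial sketch (hypothetical driver code; NS_BUF_CHANGED, txtag
 * and txbuf follow the in-tree drivers and may differ): a txsync
 * routine reloads the DMA map when userspace has installed a
 * different buffer in a slot, e.g.
 *
 *	void *addr = NMB(slot);
 *
 *	if (slot->flags & NS_BUF_CHANGED) {
 *		netmap_reload_map(txtag, txbuf->map, addr, na->buff_size);
 *		slot->flags &= ~NS_BUF_CHANGED;
 *	}
 */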
1465*68b8534bSLuigi Rizzo 
1466*68b8534bSLuigi Rizzo /*------ netmap memory allocator -------*/
1467*68b8534bSLuigi Rizzo /*
1468*68b8534bSLuigi Rizzo  * Request for a chunk of memory.
1469*68b8534bSLuigi Rizzo  *
 * Memory objects are arranged into a list, so we walk the list
 * until we find an object with enough free space.
 * This sounds inefficient, but allocations are only done once,
 * at setup time, so the cost does not matter.
1475*68b8534bSLuigi Rizzo  *
1476*68b8534bSLuigi Rizzo  * Return NULL on failure.
1477*68b8534bSLuigi Rizzo  */
1478*68b8534bSLuigi Rizzo static void *
1479*68b8534bSLuigi Rizzo netmap_malloc(size_t size, __unused const char *msg)
1480*68b8534bSLuigi Rizzo {
1481*68b8534bSLuigi Rizzo 	struct netmap_mem_obj *mem_obj, *new_mem_obj;
1482*68b8534bSLuigi Rizzo 	void *ret = NULL;
1483*68b8534bSLuigi Rizzo 
1484*68b8534bSLuigi Rizzo 	NMA_LOCK();
1485*68b8534bSLuigi Rizzo 	TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) {
1486*68b8534bSLuigi Rizzo 		if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size)
1487*68b8534bSLuigi Rizzo 			continue;
1488*68b8534bSLuigi Rizzo 
1489*68b8534bSLuigi Rizzo 		new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
1490*68b8534bSLuigi Rizzo 				     M_WAITOK | M_ZERO);
1491*68b8534bSLuigi Rizzo 		TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next);
1492*68b8534bSLuigi Rizzo 
1493*68b8534bSLuigi Rizzo 		new_mem_obj->nmo_used = 1;
1494*68b8534bSLuigi Rizzo 		new_mem_obj->nmo_size = size;
1495*68b8534bSLuigi Rizzo 		new_mem_obj->nmo_data = mem_obj->nmo_data;
1496*68b8534bSLuigi Rizzo 		memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size);
1497*68b8534bSLuigi Rizzo 
1498*68b8534bSLuigi Rizzo 		mem_obj->nmo_size -= size;
1499*68b8534bSLuigi Rizzo 		mem_obj->nmo_data = (char *) mem_obj->nmo_data + size;
1500*68b8534bSLuigi Rizzo 		if (mem_obj->nmo_size == 0) {
1501*68b8534bSLuigi Rizzo 			TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj,
1502*68b8534bSLuigi Rizzo 				     nmo_next);
1503*68b8534bSLuigi Rizzo 			free(mem_obj, M_NETMAP);
1504*68b8534bSLuigi Rizzo 		}
1505*68b8534bSLuigi Rizzo 
1506*68b8534bSLuigi Rizzo 		ret = new_mem_obj->nmo_data;
1507*68b8534bSLuigi Rizzo 
1508*68b8534bSLuigi Rizzo 		break;
1509*68b8534bSLuigi Rizzo 	}
1510*68b8534bSLuigi Rizzo 	NMA_UNLOCK();
1511*68b8534bSLuigi Rizzo 	ND("%s: %d bytes at %p", msg, size, ret);
1512*68b8534bSLuigi Rizzo 
1513*68b8534bSLuigi Rizzo 	return (ret);
1514*68b8534bSLuigi Rizzo }
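
/*
 * Worked example (hypothetical sizes): with a single free object
 * { nmo_data = base, nmo_size = 4096, nmo_used = 0 } on the list,
 *
 *	p = netmap_malloc(1024, "example");
 *
 * inserts { nmo_data = base, nmo_size = 1024, nmo_used = 1 } in front
 * of it, shrinks the free object to { nmo_data = base + 1024,
 * nmo_size = 3072, nmo_used = 0 }, and returns p == base.
 */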
1515*68b8534bSLuigi Rizzo 
1516*68b8534bSLuigi Rizzo /*
1517*68b8534bSLuigi Rizzo  * Return the memory to the allocator.
1518*68b8534bSLuigi Rizzo  *
1519*68b8534bSLuigi Rizzo  * While freeing a memory object, we try to merge adjacent chunks in
1520*68b8534bSLuigi Rizzo  * order to reduce memory fragmentation.
1521*68b8534bSLuigi Rizzo  */
1522*68b8534bSLuigi Rizzo static void
1523*68b8534bSLuigi Rizzo netmap_free(void *addr, const char *msg)
1524*68b8534bSLuigi Rizzo {
1525*68b8534bSLuigi Rizzo 	size_t size;
1526*68b8534bSLuigi Rizzo 	struct netmap_mem_obj *cur, *prev, *next;
1527*68b8534bSLuigi Rizzo 
1528*68b8534bSLuigi Rizzo 	if (addr == NULL) {
1529*68b8534bSLuigi Rizzo 		D("NULL addr for %s", msg);
1530*68b8534bSLuigi Rizzo 		return;
1531*68b8534bSLuigi Rizzo 	}
1532*68b8534bSLuigi Rizzo 
1533*68b8534bSLuigi Rizzo 	NMA_LOCK();
1534*68b8534bSLuigi Rizzo 	TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) {
1535*68b8534bSLuigi Rizzo 		if (cur->nmo_data == addr && cur->nmo_used)
1536*68b8534bSLuigi Rizzo 			break;
1537*68b8534bSLuigi Rizzo 	}
1538*68b8534bSLuigi Rizzo 	if (cur == NULL) {
1539*68b8534bSLuigi Rizzo 		NMA_UNLOCK();
1540*68b8534bSLuigi Rizzo 		D("invalid addr %s %p", msg, addr);
1541*68b8534bSLuigi Rizzo 		return;
1542*68b8534bSLuigi Rizzo 	}
1543*68b8534bSLuigi Rizzo 
1544*68b8534bSLuigi Rizzo 	size = cur->nmo_size;
1545*68b8534bSLuigi Rizzo 	cur->nmo_used = 0;
1546*68b8534bSLuigi Rizzo 
1547*68b8534bSLuigi Rizzo 	/* merge current chunk of memory with the previous one,
1548*68b8534bSLuigi Rizzo 	   if present. */
1549*68b8534bSLuigi Rizzo 	prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next);
1550*68b8534bSLuigi Rizzo 	if (prev && prev->nmo_used == 0) {
1551*68b8534bSLuigi Rizzo 		TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next);
1552*68b8534bSLuigi Rizzo 		prev->nmo_size += cur->nmo_size;
1553*68b8534bSLuigi Rizzo 		free(cur, M_NETMAP);
1554*68b8534bSLuigi Rizzo 		cur = prev;
1555*68b8534bSLuigi Rizzo 	}
1556*68b8534bSLuigi Rizzo 
1557*68b8534bSLuigi Rizzo 	/* merge with the next one */
1558*68b8534bSLuigi Rizzo 	next = TAILQ_NEXT(cur, nmo_next);
1559*68b8534bSLuigi Rizzo 	if (next && next->nmo_used == 0) {
1560*68b8534bSLuigi Rizzo 		TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next);
1561*68b8534bSLuigi Rizzo 		cur->nmo_size += next->nmo_size;
1562*68b8534bSLuigi Rizzo 		free(next, M_NETMAP);
1563*68b8534bSLuigi Rizzo 	}
1564*68b8534bSLuigi Rizzo 	NMA_UNLOCK();
1565*68b8534bSLuigi Rizzo 	ND("freed %s %d bytes at %p", msg, size, addr);
1566*68b8534bSLuigi Rizzo }
1567*68b8534bSLuigi Rizzo 
1568*68b8534bSLuigi Rizzo 
1569*68b8534bSLuigi Rizzo /*
1570*68b8534bSLuigi Rizzo  * Initialize the memory allocator.
1571*68b8534bSLuigi Rizzo  *
 * Create the descriptor for the memory, allocate the pool of memory
1573*68b8534bSLuigi Rizzo  * and initialize the list of memory objects with a single chunk
1574*68b8534bSLuigi Rizzo  * containing the whole pre-allocated memory marked as free.
1575*68b8534bSLuigi Rizzo  *
1576*68b8534bSLuigi Rizzo  * Start with a large size, then halve as needed if we fail to
1577*68b8534bSLuigi Rizzo  * allocate the block. While halving, always add one extra page
1578*68b8534bSLuigi Rizzo  * because buffers 0 and 1 are used for special purposes.
1579*68b8534bSLuigi Rizzo  * Return 0 on success, errno otherwise.
1580*68b8534bSLuigi Rizzo  */
1581*68b8534bSLuigi Rizzo static int
1582*68b8534bSLuigi Rizzo netmap_memory_init(void)
1583*68b8534bSLuigi Rizzo {
1584*68b8534bSLuigi Rizzo 	struct netmap_mem_obj *mem_obj;
1585*68b8534bSLuigi Rizzo 	void *buf = NULL;
1586*68b8534bSLuigi Rizzo 	int i, n, sz = NETMAP_MEMORY_SIZE;
1587*68b8534bSLuigi Rizzo 	int extra_sz = 0; // space for rings and two spare buffers
1588*68b8534bSLuigi Rizzo 
	while (buf == NULL && sz >= 1<<20) {
		extra_sz = sz/200;
		extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
		buf = contigmalloc(sz + extra_sz,
			     M_NETMAP,
			     M_WAITOK | M_ZERO,
			     0, /* low address */
			     -1UL, /* high address */
			     PAGE_SIZE, /* alignment */
			     0 /* boundary */
			    );
		if (buf == NULL)
			sz >>= 1;	/* halve only on failure */
	}
1601*68b8534bSLuigi Rizzo 	if (buf == NULL)
1602*68b8534bSLuigi Rizzo 		return (ENOMEM);
1603*68b8534bSLuigi Rizzo 	sz += extra_sz;
1604*68b8534bSLuigi Rizzo 	netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP,
1605*68b8534bSLuigi Rizzo 			      M_WAITOK | M_ZERO);
1606*68b8534bSLuigi Rizzo 	mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL,
1607*68b8534bSLuigi Rizzo 		 MTX_DEF);
1608*68b8534bSLuigi Rizzo 	TAILQ_INIT(&netmap_mem_d->nm_molist);
1609*68b8534bSLuigi Rizzo 	netmap_mem_d->nm_buffer = buf;
1610*68b8534bSLuigi Rizzo 	netmap_mem_d->nm_totalsize = sz;
1611*68b8534bSLuigi Rizzo 
1612*68b8534bSLuigi Rizzo 	/*
1613*68b8534bSLuigi Rizzo 	 * A buffer takes 2k, a slot takes 8 bytes + ring overhead,
1614*68b8534bSLuigi Rizzo 	 * so the ratio is 200:1. In other words, we can use 1/200 of
1615*68b8534bSLuigi Rizzo 	 * the memory for the rings, and the rest for the buffers,
1616*68b8534bSLuigi Rizzo 	 * and be sure we never run out.
1617*68b8534bSLuigi Rizzo 	 */
1618*68b8534bSLuigi Rizzo 	netmap_mem_d->nm_size = sz/200;
1619*68b8534bSLuigi Rizzo 	netmap_mem_d->nm_buf_start =
1620*68b8534bSLuigi Rizzo 		(netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
1621*68b8534bSLuigi Rizzo 	netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start;
1622*68b8534bSLuigi Rizzo 
1623*68b8534bSLuigi Rizzo 	nm_buf_pool.base = netmap_mem_d->nm_buffer;
1624*68b8534bSLuigi Rizzo 	nm_buf_pool.base += netmap_mem_d->nm_buf_start;
1625*68b8534bSLuigi Rizzo 	netmap_buffer_base = nm_buf_pool.base;
1626*68b8534bSLuigi Rizzo 	D("netmap_buffer_base %p (offset %d)",
1627*68b8534bSLuigi Rizzo 		netmap_buffer_base, netmap_mem_d->nm_buf_start);
1628*68b8534bSLuigi Rizzo 	/* number of buffers, they all start as free */
1629*68b8534bSLuigi Rizzo 
1630*68b8534bSLuigi Rizzo 	netmap_total_buffers = nm_buf_pool.total_buffers =
1631*68b8534bSLuigi Rizzo 		netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE;
1632*68b8534bSLuigi Rizzo 	nm_buf_pool.bufsize = NETMAP_BUF_SIZE;
1633*68b8534bSLuigi Rizzo 
1634*68b8534bSLuigi Rizzo 	D("Have %d MB, use %dKB for rings, %d buffers at %p",
1635*68b8534bSLuigi Rizzo 		(sz >> 20), (netmap_mem_d->nm_size >> 10),
1636*68b8534bSLuigi Rizzo 		nm_buf_pool.total_buffers, nm_buf_pool.base);
1637*68b8534bSLuigi Rizzo 
	/* allocate and initialize the bitmap. Entries 0 and 1 are considered
	 * always busy (entry 0 is also the default when no buffers are left).
1640*68b8534bSLuigi Rizzo 	 */
1641*68b8534bSLuigi Rizzo 	n = (nm_buf_pool.total_buffers + 31) / 32;
1642*68b8534bSLuigi Rizzo 	nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP,
1643*68b8534bSLuigi Rizzo 			 M_WAITOK | M_ZERO);
1644*68b8534bSLuigi Rizzo 	nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */
1645*68b8534bSLuigi Rizzo 	for (i = 1; i < n; i++)
1646*68b8534bSLuigi Rizzo 		nm_buf_pool.bitmap[i] = ~0;
1647*68b8534bSLuigi Rizzo 	nm_buf_pool.free = nm_buf_pool.total_buffers - 2;
1648*68b8534bSLuigi Rizzo 
1649*68b8534bSLuigi Rizzo 	mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
1650*68b8534bSLuigi Rizzo 			 M_WAITOK | M_ZERO);
1651*68b8534bSLuigi Rizzo 	TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
1652*68b8534bSLuigi Rizzo 	mem_obj->nmo_used = 0;
1653*68b8534bSLuigi Rizzo 	mem_obj->nmo_size = netmap_mem_d->nm_size;
1654*68b8534bSLuigi Rizzo 	mem_obj->nmo_data = netmap_mem_d->nm_buffer;
1655*68b8534bSLuigi Rizzo 
1656*68b8534bSLuigi Rizzo 	return (0);
1657*68b8534bSLuigi Rizzo }
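
/*
 * Worked example (editorial; assumes the first contigmalloc() above
 * succeeds, that PAGE_SIZE == 4096, NETMAP_MEMORY_SIZE is 64 MB and
 * NETMAP_BUF_SIZE is 2048; the actual constants may differ):
 *
 *	extra_sz     = 67108864 / 200 = 335544, rounded up to a page
 *		       boundary plus one spare page = 339968
 *	sz           = 67108864 + 339968 = 67448832 bytes allocated
 *	nm_size      = 67448832 / 200 = 337244 bytes for the rings
 *	nm_buf_start = 339968 (nm_size rounded up to a page)
 *	nm_buf_len   = 67108864 bytes, i.e. 32768 buffers of 2048 bytes,
 *		       of which buffers 0 and 1 stay reserved.
 */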
1658*68b8534bSLuigi Rizzo 
1659*68b8534bSLuigi Rizzo 
1660*68b8534bSLuigi Rizzo /*
1661*68b8534bSLuigi Rizzo  * Finalize the memory allocator.
1662*68b8534bSLuigi Rizzo  *
1663*68b8534bSLuigi Rizzo  * Free all the memory objects contained inside the list, and deallocate
1664*68b8534bSLuigi Rizzo  * the pool of memory; finally free the memory allocator descriptor.
1665*68b8534bSLuigi Rizzo  */
1666*68b8534bSLuigi Rizzo static void
1667*68b8534bSLuigi Rizzo netmap_memory_fini(void)
1668*68b8534bSLuigi Rizzo {
1669*68b8534bSLuigi Rizzo 	struct netmap_mem_obj *mem_obj;
1670*68b8534bSLuigi Rizzo 
1671*68b8534bSLuigi Rizzo 	while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) {
1672*68b8534bSLuigi Rizzo 		mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist);
1673*68b8534bSLuigi Rizzo 		TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
1674*68b8534bSLuigi Rizzo 		if (mem_obj->nmo_used == 1) {
1675*68b8534bSLuigi Rizzo 			printf("netmap: leaked %d bytes at %p\n",
1676*68b8534bSLuigi Rizzo 			       mem_obj->nmo_size,
1677*68b8534bSLuigi Rizzo 			       mem_obj->nmo_data);
1678*68b8534bSLuigi Rizzo 		}
1679*68b8534bSLuigi Rizzo 		free(mem_obj, M_NETMAP);
1680*68b8534bSLuigi Rizzo 	}
1681*68b8534bSLuigi Rizzo 	contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP);
1682*68b8534bSLuigi Rizzo 	// XXX mutex_destroy(nm_mtx);
1683*68b8534bSLuigi Rizzo 	free(netmap_mem_d, M_NETMAP);
1684*68b8534bSLuigi Rizzo }
1685*68b8534bSLuigi Rizzo 
1686*68b8534bSLuigi Rizzo 
1687*68b8534bSLuigi Rizzo /*
1688*68b8534bSLuigi Rizzo  * Module loader.
1689*68b8534bSLuigi Rizzo  *
1690*68b8534bSLuigi Rizzo  * Create the /dev/netmap device and initialize all global
1691*68b8534bSLuigi Rizzo  * variables.
1692*68b8534bSLuigi Rizzo  *
1693*68b8534bSLuigi Rizzo  * Return 0 on success, errno on failure.
1694*68b8534bSLuigi Rizzo  */
1695*68b8534bSLuigi Rizzo static int
1696*68b8534bSLuigi Rizzo netmap_init(void)
1697*68b8534bSLuigi Rizzo {
1698*68b8534bSLuigi Rizzo 	int error;
1699*68b8534bSLuigi Rizzo 
1700*68b8534bSLuigi Rizzo 
1701*68b8534bSLuigi Rizzo 	error = netmap_memory_init();
1702*68b8534bSLuigi Rizzo 	if (error != 0) {
		printf("netmap: unable to initialize the memory allocator.\n");
1704*68b8534bSLuigi Rizzo 		return (error);
1705*68b8534bSLuigi Rizzo 	}
1706*68b8534bSLuigi Rizzo 	printf("netmap: loaded module with %d Mbytes\n",
1707*68b8534bSLuigi Rizzo 		netmap_mem_d->nm_totalsize >> 20);
1708*68b8534bSLuigi Rizzo 
1709*68b8534bSLuigi Rizzo 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
1710*68b8534bSLuigi Rizzo 			      "netmap");
1711*68b8534bSLuigi Rizzo 
1712*68b8534bSLuigi Rizzo 	return (0);
1713*68b8534bSLuigi Rizzo }
1714*68b8534bSLuigi Rizzo 
1715*68b8534bSLuigi Rizzo 
1716*68b8534bSLuigi Rizzo /*
1717*68b8534bSLuigi Rizzo  * Module unloader.
1718*68b8534bSLuigi Rizzo  *
1719*68b8534bSLuigi Rizzo  * Free all the memory, and destroy the ``/dev/netmap`` device.
1720*68b8534bSLuigi Rizzo  */
1721*68b8534bSLuigi Rizzo static void
1722*68b8534bSLuigi Rizzo netmap_fini(void)
1723*68b8534bSLuigi Rizzo {
1724*68b8534bSLuigi Rizzo 	destroy_dev(netmap_dev);
1725*68b8534bSLuigi Rizzo 
1726*68b8534bSLuigi Rizzo 	netmap_memory_fini();
1727*68b8534bSLuigi Rizzo 
1728*68b8534bSLuigi Rizzo 	printf("netmap: unloaded module.\n");
1729*68b8534bSLuigi Rizzo }
1730*68b8534bSLuigi Rizzo 
1731*68b8534bSLuigi Rizzo 
1732*68b8534bSLuigi Rizzo /*
1733*68b8534bSLuigi Rizzo  * Kernel entry point.
1734*68b8534bSLuigi Rizzo  *
1735*68b8534bSLuigi Rizzo  * Initialize/finalize the module and return.
1736*68b8534bSLuigi Rizzo  *
1737*68b8534bSLuigi Rizzo  * Return 0 on success, errno on failure.
1738*68b8534bSLuigi Rizzo  */
1739*68b8534bSLuigi Rizzo static int
1740*68b8534bSLuigi Rizzo netmap_loader(__unused struct module *module, int event, __unused void *arg)
1741*68b8534bSLuigi Rizzo {
1742*68b8534bSLuigi Rizzo 	int error = 0;
1743*68b8534bSLuigi Rizzo 
1744*68b8534bSLuigi Rizzo 	switch (event) {
1745*68b8534bSLuigi Rizzo 	case MOD_LOAD:
1746*68b8534bSLuigi Rizzo 		error = netmap_init();
1747*68b8534bSLuigi Rizzo 		break;
1748*68b8534bSLuigi Rizzo 
1749*68b8534bSLuigi Rizzo 	case MOD_UNLOAD:
1750*68b8534bSLuigi Rizzo 		netmap_fini();
1751*68b8534bSLuigi Rizzo 		break;
1752*68b8534bSLuigi Rizzo 
1753*68b8534bSLuigi Rizzo 	default:
1754*68b8534bSLuigi Rizzo 		error = EOPNOTSUPP;
1755*68b8534bSLuigi Rizzo 		break;
1756*68b8534bSLuigi Rizzo 	}
1757*68b8534bSLuigi Rizzo 
1758*68b8534bSLuigi Rizzo 	return (error);
1759*68b8534bSLuigi Rizzo }
1760*68b8534bSLuigi Rizzo 
1761*68b8534bSLuigi Rizzo 
1762*68b8534bSLuigi Rizzo DEV_MODULE(netmap, netmap_loader, NULL);
1763