xref: /freebsd/sys/net/netmap.h (revision 4ec234c813eed05c166859bba82c882e40826eb9)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /*
28  * $FreeBSD$
29  *
30  * Definitions of constants and the structures used by the netmap
31  * framework, for the part visible to both kernel and userspace.
32  * Detailed info on netmap is available with "man netmap" or at
33  *
34  *	http://info.iet.unipi.it/~luigi/netmap/
35  *
36  * This API is also used to communicate with the VALE software switch
37  */
38 
39 #ifndef _NET_NETMAP_H_
40 #define _NET_NETMAP_H_
41 
42 #define	NETMAP_API	10		/* current API version, checked against nmreq.nr_version */
43 
44 /*
45  * Some fields should be cache-aligned to reduce contention.
46  * The alignment is architecture and OS dependent, but rather than
47  * digging into OS headers to find the exact value we use an estimate
48  * that should cover most architectures.
49  */
50 #define NM_CACHE_ALIGN	128	/* passed to __aligned__() for netmap_ring.sem */
51 
52 /*
53  * --- Netmap data structures ---
54  *
55  * The userspace data structures used by netmap are shown below.
56  * They are allocated by the kernel and mmap()ed by userspace threads.
57  * Pointers are implemented as memory offsets or indexes,
58  * so that they can be easily dereferenced in kernel and userspace.
59 
60    KERNEL (opaque, obviously)
61 
62   ====================================================================
63                                          |
64    USERSPACE                             |      struct netmap_ring
65                                          +---->+---------------+
66                                              / | head,cur,tail |
67    struct netmap_if (nifp, 1 per fd)        /  | buf_ofs       |
68     +---------------+                      /   | other fields  |
69     | ni_tx_rings   |                     /    +===============+
70     | ni_rx_rings   |                    /     | buf_idx, len  | slot[0]
71     |               |                   /      | flags, ptr    |
72     |               |                  /       +---------------+
73     +===============+                 /        | buf_idx, len  | slot[1]
74     | txring_ofs[0] | (rel.to nifp)--'         | flags, ptr    |
75     | txring_ofs[1] |                          +---------------+
76   (tx+1+extra_tx entries)                     (num_slots entries)
77     | txring_ofs[t] |                          | buf_idx, len  | slot[n-1]
78     +---------------+                          | flags, ptr    |
79     | rxring_ofs[0] |                          +---------------+
80     | rxring_ofs[1] |
81   (rx+1+extra_rx entries)
82     | rxring_ofs[r] |
83     +---------------+
84 
85  * For each "interface" (NIC, host stack, VALE switch port) attached to a
86  * file descriptor, the mmap()ed region contains a (logically readonly)
87  * struct netmap_if pointing to struct netmap_ring's.
88  * There is one netmap_ring per physical NIC ring, plus one tx/rx ring
89  * pair attached to the host stack (this pair is unused for VALE ports).
90  *
91  * All physical/host stack ports share the same memory region,
92  * so that zero-copy can be implemented between them.
93  * VALE switch ports instead have separate memory regions.
94  *
95  * The netmap_ring is the userspace-visible replica of the NIC ring.
96  * Each slot has the index of a buffer (MTU-sized and residing in the
97  * mmapped region), its length and some flags. An extra 64-bit pointer
98  * is provided for user-supplied buffers in the tx path.
99  *
100  * In user space, the buffer address is computed as
101  *	(char *)ring + buf_ofs + index*NETMAP_BUF_SIZE
102  */
103 
104 /*
105  * struct netmap_slot is a buffer descriptor, one element of netmap_ring.slot[]
106  */
107 struct netmap_slot {
108 	uint32_t buf_idx;	/* index of the buffer used by this slot */
109 	uint16_t len;		/* length for this slot (of the single fragment with NS_MOREFRAG) */
110 	uint16_t flags;		/* NS_* flags; high 8 bits: VALE dst port or nr. of fragments */
111 	uint64_t ptr;		/* address of the userspace buffer, used with NS_INDIRECT */
112 };
113 
114 /*
115  * The following flags control how the slot is used
116  */
117 
118 #define	NS_BUF_CHANGED	0x0001	/* buf_idx changed */
119 	/*
120 	 * Must be set whenever buf_idx is changed (as it might be
121 	 * necessary to recompute the physical address and mapping).
122 	 */
123 
124 #define	NS_REPORT	0x0002	/* ask the hardware to report results */
125 	/*
126 	 * Request notification when slot is used by the hardware.
127 	 * Normally transmit completions are handled lazily and
128 	 * may be unreported. This flag lets us know when a slot
129 	 * has been sent (e.g. to terminate the sender).
130 	 */
131 
132 #define	NS_FORWARD	0x0004	/* pass packet 'forward' */
133 	/*
134 	 * (Only for physical ports, rx rings with NR_FORWARD set).
135 	 * Slots released to the kernel (i.e. before ring->head) with
136 	 * this flag set are passed to the peer ring (host/NIC),
137 	 * thus restoring the host-NIC connection for these slots.
138 	 * This supports efficient traffic monitoring or firewalling.
139 	 */
140 
141 #define	NS_NO_LEARN	0x0008	/* disable bridge learning */
142 	/*
143 	 * On a VALE switch, do not 'learn' the source port for
144 	 * this buffer.
145 	 */
146 
147 #define	NS_INDIRECT	0x0010	/* userspace buffer */
148 	/*
149 	 * (VALE tx rings only) data is in a userspace buffer,
150 	 * whose address is in the 'ptr' field in the slot.
151 	 */
152 
153 #define	NS_MOREFRAG	0x0020	/* packet has more fragments */
154 	/*
155 	 * (VALE ports only)
156 	 * Set on all but the last slot of a multi-segment packet.
157 	 * The 'len' field refers to the individual fragment.
158 	 */
159 
160 #define	NS_PORT_SHIFT	8
161 #define	NS_PORT_MASK	(0xff << NS_PORT_SHIFT)
162 	/*
163 	 * The high 8 bits of the 'flags' field, if not zero, indicate
164 	 * the destination port for the VALE switch, overriding
165 	 * the lookup table.
166 	 */
167 
168 #define	NS_RFRAGS(_slot)	( ((_slot)->flags >> 8) & 0xff)
169 	/*
170 	 * (VALE rx rings only) the high 8 bits of the 'flags' field
171 	 *  are the number of fragments.
172 	 */
173 
174 
175 /*
176  * struct netmap_ring
177  *
178  * Netmap representation of a TX or RX ring (also known as "queue").
179  * This is a queue implemented as a fixed-size circular array.
180  * At the software level the important fields are: head, cur, tail.
181  *
182  * In TX rings:
183  *
184  *	head	first slot available for transmission.
185  *	cur	wakeup point. select() and poll() will unblock
186  *		when 'tail' moves past 'cur'
187  *	tail	(readonly) first slot reserved to the kernel
188  *
189  *	[head .. tail-1] can be used for new packets to send;
190  *	'head' and 'cur' must be incremented as slots are filled
191  *	    with new packets to be sent;
192  *	'cur' can be moved further ahead if we need more space
193  *	for new transmissions.
194  *
195  * In RX rings:
196  *
197  *	head	first valid received packet
198  *	cur	wakeup point. select() and poll() will unblock
199  *		when 'tail' moves past 'cur'
200  *	tail	(readonly) first slot reserved to the kernel
201  *
202  *	[head .. tail-1] contain received packets;
203  *	'head' and 'cur' must be incremented as slots are consumed
204  *		and can be returned to the kernel;
205  *	'cur' can be moved further ahead if we want to wait for
206  *		new packets without returning the previous ones.
207  *
208  * DATA OWNERSHIP/LOCKING:
209  *	The netmap_ring, and all slots and buffers in the range
210  *	[head .. tail-1] are owned by the user program;
211  *	the kernel only accesses them during a netmap system call
212  *	and in the user thread context.
213  *
214  *	Other slots and buffers are reserved for use by the kernel
215  */
216 struct netmap_ring {
217 	/*
218 	 * buf_ofs is meant to be used through macros.
219 	 * It contains the offset of the buffer region from this
220 	 * descriptor.
221 	 */
222 	const int64_t	buf_ofs;
223 	const uint32_t	num_slots;	/* number of slots in the ring. */
224 	const uint32_t	nr_buf_size;	/* NOTE(review): presumably the size of each buffer; confirm */
225 	const uint16_t	ringid;		/* NOTE(review): presumably identifies this ring; confirm */
226 	const uint16_t	dir;		/* 0: tx, 1: rx */
227 	/* below, (u) marks fields updated by userspace, (k) fields updated by the kernel */
228 	uint32_t        head;		/* (u) first user slot */
229 	uint32_t        cur;		/* (u) wakeup point */
230 	uint32_t	tail;		/* (k) first kernel slot */
231 
232 	uint32_t	flags;		/* NR_* ring flags (NR_TIMESTAMP, NR_FORWARD) */
233 
234 	struct timeval	ts;		/* (k) time of last *sync() */
235 
236 	/* opaque room for a mutex or similar object */
237 	uint8_t		sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN)));
238 
239 	/* the slots follow. This struct has variable size */
240 	struct netmap_slot slot[0];	/* array of num_slots slots. */
241 };
242 
243 
244 /*
245  * RING FLAGS
246  */
247 #define	NR_TIMESTAMP	0x0002		/* set timestamp on *sync() */
248 	/*
249 	 * Updates the 'ts' field on each netmap syscall. This saves
250 	 * a separate gettimeofday(), and is not much worse than
251 	 * software timestamps generated in the interrupt handler.
252 	 */
253 
254 #define	NR_FORWARD	0x0004		/* enable NS_FORWARD for ring */
255 	/*
256 	 * Enables the NS_FORWARD slot flag for the ring.
257 	 */
258 
259 
260 /*
261  * Netmap representation of an interface and its queue(s).
262  * This is initialized by the kernel when binding a file
263  * descriptor to a port, and should be considered as readonly
264  * by user programs. The kernel never uses it.
265  *
266  * There is one netmap_if for each file descriptor on which we want
267  * to select/poll.
268  * select/poll operates on one or all pairs depending on the value of
269  * nr_ringid passed on the ioctl.
270  */
271 struct netmap_if {
272 	char		ni_name[IFNAMSIZ]; /* name of the interface. */
273 	const uint32_t	ni_version;	/* API version, currently unused */
274 	const uint32_t	ni_flags;	/* properties (NI_* flags) */
275 #define	NI_PRIV_MEM	0x1		/* private memory region */
276 
277 	/*
278 	 * The number of packet rings available in netmap mode.
279 	 * Physical NICs can have different numbers of tx and rx rings.
280 	 * Physical NICs also have a 'host' ring pair.
281 	 * Additionally, clients can request additional ring pairs to
282 	 * be used for internal communication.
283 	 */
284 	const uint32_t	ni_tx_rings;	/* number of HW tx rings */
285 	const uint32_t	ni_rx_rings;	/* number of HW rx rings */
286 
287 	const uint32_t	ni_extra_tx_rings;	/* number of extra tx rings */
288 	const uint32_t	ni_extra_rx_rings;	/* number of extra rx rings */
289 	/*
290 	 * The following array contains the offset of each netmap ring
291 	 * from this structure, in the following order:
292 	 * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings;
293 	 * NIC rx rings (ni_rx_rings); host rx ring (1); extra rx rings.
294 	 *
295 	 * The area is filled up by the kernel on NIOCREGIF,
296 	 * and then only read by userspace code.
297 	 */
298 	const ssize_t	ring_ofs[0];	/* all tx ring offsets first, then all rx ring offsets */
299 };
300 
301 
302 #ifndef NIOCREGIF
303 /*
304  * ioctl names and related fields
305  *
306  * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues,
307  *	whose identity is set in NIOCREGIF through nr_ringid.
308  *	These are non blocking and take no argument.
309  *
310  * NIOCGINFO takes a struct ifreq, the interface name is the input,
311  *	the outputs are number of queues and number of descriptors
312  *	for each queue (useful to set number of threads etc.).
313  *	The info returned is only advisory and may change before
314  *	the interface is bound to a file descriptor.
315  *
316  * NIOCREGIF takes an interface name within a struct nmreq,
317  *	and activates netmap mode on the interface (if possible).
318  *
319  * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we
320  * can pass it down to other NIC-related ioctls.
321  *
322  * The actual argument (struct nmreq) has a number of options to request
323  * different functions.
324  *
325  * nr_name	(in)
326  *	The name of the port (em0, valeXXX:YYY, etc.)
327  *	limited to IFNAMSIZ for backward compatibility.
328  *
329  * nr_version	(in/out)
330  *	Must match NETMAP_API as used in the kernel, error otherwise.
331  *	Always returns the desired value on output.
332  *
333  * nr_tx_slots, nr_rx_slots, nr_tx_rings, nr_rx_rings (in/out)
334  *	On input, non-zero values may be used to reconfigure the port
335  *	according to the requested values, but this is not guaranteed.
336  *	On output the actual values in use are reported.
337  *
338  * nr_ringid (in)
339  *	Indicates how rings should be bound to the file descriptors.
340  *	0 (default)			binds all physical rings
341  *	NETMAP_HW_RING | ring number	binds a single ring pair
342  *	NETMAP_SW_RING			binds only the host tx/rx rings
343  *
344  *	NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push
345  *		packets on tx rings only if POLLOUT is set.
346  *		The default is to push any pending packet.
347  *
348  *	NETMAP_PRIV_MEM is set on return for ports that use private
349  *		memory regions and cannot use buffer swapping.
350  *
351  * nr_cmd (in)	if non-zero indicates a special command:
352  *	NETMAP_BDG_ATTACH	 and nr_name = vale*:ifname
353  *		attaches the NIC to the switch; nr_ringid specifies
354  *		which rings to use. Used by vale-ctl -a ...
355  *	    nr_arg1 = NETMAP_BDG_HOST also attaches the host port
356  *		as in vale-ctl -h ...
357  *
358  *	NETMAP_BDG_DETACH	and nr_name = vale*:ifname
359  *		disconnects a previously attached NIC.
360  *		Used by vale-ctl -d ...
361  *
362  *	NETMAP_BDG_LIST
363  *		list the configuration of VALE switches.
364  *
365  *	NETMAP_BDG_OFFSET	XXX ?
366  *		Set the offset of data in packets. Used with VALE
367  *		switches where the clients use the vhost header.
368  *
369  * nr_arg1, nr_arg2 (in/out)		command specific
370  *
371  */
372 
373 
374 /*
375  * struct nmreq overlays a struct ifreq
376  */
377 struct nmreq {
378 	char		nr_name[IFNAMSIZ];	/* port name (em0, valeXXX:YYY, etc.) */
379 	uint32_t	nr_version;	/* API version, must match NETMAP_API */
380 	uint32_t	nr_offset;	/* nifp offset in the shared region */
381 	uint32_t	nr_memsize;	/* size of the shared region */
382 	uint32_t	nr_tx_slots;	/* slots in tx rings */
383 	uint32_t	nr_rx_slots;	/* slots in rx rings */
384 	uint16_t	nr_tx_rings;	/* number of tx rings */
385 	uint16_t	nr_rx_rings;	/* number of rx rings */
386 	uint16_t	nr_ringid;	/* ring(s) we care about */
387 #define NETMAP_PRIV_MEM	0x8000		/* rings use private memory */
388 #define NETMAP_HW_RING	0x4000		/* low bits indicate one hw ring */
389 #define NETMAP_SW_RING	0x2000		/* process the sw ring */
390 #define NETMAP_NO_TX_POLL	0x1000	/* no automatic txsync on poll */
391 #define NETMAP_RING_MASK 0xfff		/* the ring number */
392 
393 	uint16_t	nr_cmd;		/* special command, 0 if none */
394 #define NETMAP_BDG_ATTACH	1	/* attach the NIC */
395 #define NETMAP_BDG_DETACH	2	/* detach the NIC */
396 #define NETMAP_BDG_LOOKUP_REG	3	/* register lookup function */
397 #define NETMAP_BDG_LIST		4	/* get bridge's info */
398 #define NETMAP_BDG_OFFSET       5       /* set the port offset */
399 
400 	uint16_t	nr_arg1;	/* command-specific argument */
401 #define NETMAP_BDG_HOST		1	/* attach the host stack on ATTACH */
402 #define NETMAP_BDG_MAX_OFFSET	12	/* NOTE(review): presumably the largest offset for NETMAP_BDG_OFFSET; confirm */
403 
404 	uint16_t	nr_arg2;	/* command-specific argument */
405 	uint32_t	spare2[3];	/* spare room to ease compatibility with other versions */
406 };
407 
408 
409 /*
410  * FreeBSD uses the size value embedded in the _IOWR to determine
411  * how much to copy in/out. So we need it to match the actual
412  * data structure we pass. We put some spares in the structure
413  * to ease compatibility with other versions.
414  */
415 #define NIOCGINFO	_IOWR('i', 145, struct nmreq) /* return IF info */
416 #define NIOCREGIF	_IOWR('i', 146, struct nmreq) /* interface register */
417 #define NIOCTXSYNC	_IO('i', 148) /* sync tx queues, takes no argument */
418 #define NIOCRXSYNC	_IO('i', 149) /* sync rx queues, takes no argument */
419 #endif /* !NIOCREGIF */
420 
421 
422 /*
423  * Helper functions for kernel and userspace
424  */
425 
426 /*
427  * Return non-zero when cur has reached tail, i.e. no slot is left in [cur .. tail-1]; 0 otherwise.
428  */
429 static inline int
430 nm_ring_empty(const struct netmap_ring *ring)
431 {
432 	return (ring->cur == ring->tail);	/* read-only check, the ring is not modified */
433 }
434 
435 #endif /* _NET_NETMAP_H_ */
436