xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.h (revision fba27d8741c08c38aa9cf5fd383633304ddad810)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * VIRTIO NETWORK DRIVER
18  */
19 
20 #ifndef _VIOIF_H
21 #define	_VIOIF_H
22 
23 #include "virtio.h"
24 
25 #ifdef __cplusplus
26 extern "C" {
27 #endif
28 
29 /*
30  * VIRTIO NETWORK CONFIGURATION REGISTERS
31  *
32  * Offsets into the device-specific configuration space available through the
33  * virtio_dev_*() functions; each is tagged with its bit width and access mode.
34  */
35 #define	VIRTIO_NET_CONFIG_MAC		0x00	/* 48 R/W */
36 #define	VIRTIO_NET_CONFIG_STATUS	0x06	/* 16 R   */
37 #define	VIRTIO_NET_CONFIG_MAX_VQ_PAIRS	0x08	/* 16 R   */
38 #define	VIRTIO_NET_CONFIG_MTU		0x0A	/* 16 R   */
39 
40 /*
41  * VIRTIO NETWORK VIRTQUEUES
42  *
43  * Note that the control queue is only present if VIRTIO_NET_F_CTRL_VQ is
44  * negotiated with the device (it is not in VIRTIO_NET_WANTED_FEATURES).
45  */
46 #define	VIRTIO_NET_VIRTQ_RX		0	/* receive queue */
47 #define	VIRTIO_NET_VIRTQ_TX		1	/* transmit queue */
48 #define	VIRTIO_NET_VIRTQ_CONTROL	2	/* needs CTRL_VQ */
49 
50 /*
51  * VIRTIO NETWORK FEATURE BITS
52  */
53 
54 /*
55  * CSUM, GUEST_CSUM:
56  *	Partial checksum support.  These features signal that the device will
57  *	accept packets with partial checksums (CSUM), and that the driver will
58  *	accept packets with partial checksums (GUEST_CSUM).  These features
59  *	combine the use of the VIRTIO_NET_HDR_F_NEEDS_CSUM flag, and the
60  *	"vnh_csum_start" and "vnh_csum_offset" fields, in the virtio net header.
61  */
62 #define	VIRTIO_NET_F_CSUM		(1ULL << 0)
63 #define	VIRTIO_NET_F_GUEST_CSUM		(1ULL << 1)
64 
65 /*
66  * MTU:
67  *	The device offers a maximum MTU value at VIRTIO_NET_CONFIG_MTU.  If
68  *	this is not negotiated, we allow the largest possible MTU that our
69  *	buffer allocations support in case jumbo frames are tacitly supported
70  *	by the device.  The default MTU is always 1500.
71  */
72 #define	VIRTIO_NET_F_MTU		(1ULL << 3)
73 
74 /*
75  * MAC:
76  *	The device has an assigned primary MAC address.  If this feature bit is
77  *	not set, the driver must provide a locally assigned MAC address.  See
78  *	IEEE 802, "48-bit universal LAN MAC addresses" for more details on
79  *	assignment.
80  */
81 #define	VIRTIO_NET_F_MAC		(1ULL << 5)
82 
83 /*
84  * GUEST_TSO4, GUEST_TSO6, GUEST_UFO:
85  *	Inbound segmentation offload support.  These features depend on having
86  *	VIRTIO_NET_F_GUEST_CSUM and signal that the driver can accept large
87  *	combined TCP (v4 or v6) packets, or reassembled UDP fragments.
88  */
89 #define	VIRTIO_NET_F_GUEST_TSO4		(1ULL << 7)
90 #define	VIRTIO_NET_F_GUEST_TSO6		(1ULL << 8)
91 #define	VIRTIO_NET_F_GUEST_UFO		(1ULL << 10)
92 
93 /*
94  * GUEST_ECN:
95  *	Depends on either VIRTIO_NET_F_GUEST_TSO4 or VIRTIO_NET_F_GUEST_TSO6.
96  *	This feature means the driver will look for the VIRTIO_NET_HDR_GSO_ECN
97  *	bit in the "vnh_gso_type" of the virtio net header.  This bit tells the
98  *	driver that the Explicit Congestion Notification (ECN) bit was set in
99  *	the original TCP packets.
100  */
101 #define	VIRTIO_NET_F_GUEST_ECN		(1ULL << 9)
102 
103 /*
104  * HOST_TSO4, HOST_TSO6, HOST_UFO:
105  *	Outbound segmentation offload support.  These features depend on having
106  *	VIRTIO_NET_F_CSUM and signal that the device will accept large combined
107  *	TCP (v4 or v6) packets that require segmentation offload, or large
108  *	combined UDP packets that require fragmentation offload.
109  */
110 #define	VIRTIO_NET_F_HOST_TSO4		(1ULL << 11)
111 #define	VIRTIO_NET_F_HOST_TSO6		(1ULL << 12)
112 #define	VIRTIO_NET_F_HOST_UFO		(1ULL << 14)
113 
114 /*
115  * HOST_ECN:
116  *	Depends on either VIRTIO_NET_F_HOST_TSO4 or VIRTIO_NET_F_HOST_TSO6.
117  *	This feature means the device will accept packets that both require
118  *	segmentation offload and have the Explicit Congestion Notification
119  *	(ECN) bit set.  If this feature is not present, the driver must not
120  *	send large segments that require ECN to be set.
121  */
122 #define	VIRTIO_NET_F_HOST_ECN		(1ULL << 13)
123 
124 /*
125  * GSO:
126  *	The GSO feature is, in theory, the combination of HOST_TSO4, HOST_TSO6,
127  *	and HOST_ECN.  This is only useful for legacy devices; newer devices
128  *	should be using the more specific bits above.
129  */
130 #define	VIRTIO_NET_F_GSO		(1ULL << 6)
131 
132 /*
133  * MRG_RXBUF:
134  *	This feature allows the receipt of large packets without needing to
135  *	allocate large buffers.  The "virtio_net_hdr" will include an extra
136  *	value: the number of buffers to gang together.
137  */
138 #define	VIRTIO_NET_F_MRG_RXBUF		(1ULL << 15)
139 
140 /*
141  * STATUS:
142  *	The VIRTIO_NET_CONFIG_STATUS configuration register is available, which
143  *	allows the driver to read the link state from the device.
144  */
145 #define	VIRTIO_NET_F_STATUS		(1ULL << 16)
146 
147 /*
148  * CTRL_VQ, CTRL_RX, CTRL_VLAN, CTRL_RX_EXTRA:
149  *	These features signal that the device exposes the control queue
150  *	(VIRTIO_NET_VIRTQ_CONTROL), in the case of CTRL_VQ; and that the queue
151  *	supports extra commands (CTRL_RX, CTRL_VLAN, CTRL_RX_EXTRA).
152  */
153 #define	VIRTIO_NET_F_CTRL_VQ		(1ULL << 17)
154 #define	VIRTIO_NET_F_CTRL_RX		(1ULL << 18)
155 #define	VIRTIO_NET_F_CTRL_VLAN		(1ULL << 19)
156 #define	VIRTIO_NET_F_CTRL_RX_EXTRA	(1ULL << 20)
157 
158 /*
159  * These features are supported by the driver and we will request them from the
160  * device.  Note that we do not currently request GUEST_CSUM, as the driver
161  * does not presently support receiving frames with any offload features from
162  * the device.  Nor do we request STATUS, MRG_RXBUF, or any CTRL_* feature.
163  */
164 #define	VIRTIO_NET_WANTED_FEATURES	(VIRTIO_NET_F_CSUM |		\
165 					VIRTIO_NET_F_GSO |		\
166 					VIRTIO_NET_F_HOST_TSO4 |	\
167 					VIRTIO_NET_F_HOST_TSO6 |	\
168 					VIRTIO_NET_F_HOST_ECN |		\
169 					VIRTIO_NET_F_MAC |		\
170 					VIRTIO_NET_F_MTU)
171 
172 /*
173  * VIRTIO NETWORK HEADER
174  *
175  * This structure appears at the start of each transmit or receive packet
176  * buffer.  We do not request MRG_RXBUF, so no buffer count is appended.
177  */
178 struct virtio_net_hdr {
179 	uint8_t				vnh_flags;	/* VIRTIO_NET_HDR_F_* */
180 	uint8_t				vnh_gso_type;	/* VIRTIO_NET_HDR_GSO_* */
181 	uint16_t			vnh_hdr_len;	/* NOTE(review): presumably total header length; confirm */
182 	uint16_t			vnh_gso_size;	/* NOTE(review): presumably segment size; confirm */
183 	uint16_t			vnh_csum_start;	/* used with NEEDS_CSUM */
184 	uint16_t			vnh_csum_offset; /* used with NEEDS_CSUM */
185 } __packed;	/* no compiler padding */
186 
187 /*
188  * VIRTIO NETWORK HEADER: FLAGS (vnh_flags)
189  */
190 #define	VIRTIO_NET_HDR_F_NEEDS_CSUM	0x01	/* uses vnh_csum_start, vnh_csum_offset */
191 
192 /*
193  * VIRTIO NETWORK HEADER: OFFLOAD OPTIONS (vnh_gso_type)
194  *
195  * Each of these is an offload type, except for the ECN value, which is a
196  * flag bit logically OR-ed with one of the other types.
197  */
198 #define	VIRTIO_NET_HDR_GSO_NONE		0
199 #define	VIRTIO_NET_HDR_GSO_TCPV4	1
200 #define	VIRTIO_NET_HDR_GSO_UDP		3
201 #define	VIRTIO_NET_HDR_GSO_TCPV6	4
202 #define	VIRTIO_NET_HDR_GSO_ECN		0x80
203 
204 
205 /*
206  * DRIVER PARAMETERS
207  */
208 
209 /*
210  * At attach, we allocate a fixed pool of buffers for receipt and transmission
211  * of frames.  The maximum number of buffers of each type that we will allocate
212  * is specified here.  If the ring size is smaller than this number, we will
213  * use the ring size instead.
214  */
215 #define	VIRTIO_NET_TX_BUFS		256	/* transmit buffers */
216 #define	VIRTIO_NET_RX_BUFS		256	/* receive buffers */
217 
218 /*
219  * The virtio net header and the first buffer segment share the same DMA
220  * allocation.  We round up the virtio header size to a multiple of 4 and add 2
221  * bytes so that the IP header, which starts immediately after the 14 or 18
222  * byte Ethernet header, is then correctly aligned:
223  *
224  *   0                10      12   14                              28/32
225  *   | virtio_net_hdr | %4==0 | +2 | Ethernet header (14/18 bytes) | IPv4 ...
226  *
227  * Note that for this to work correctly, the DMA allocation must also be 4 byte
228  * aligned.
229  */
230 #define	VIOIF_HEADER_ALIGN		4
231 #define	VIOIF_HEADER_SKIP		(P2ROUNDUP( \
232 					    sizeof (struct virtio_net_hdr), \
233 					    VIOIF_HEADER_ALIGN) + 2)	/* 12 + 2 == 14 */
234 
235 /*
236  * Given we are not negotiating VIRTIO_NET_F_MRG_RXBUF, the specification says
237  * we must be able to accept a 1514 byte packet, or if any segmentation offload
238  * features have been negotiated a 65550 byte packet.  To keep things simple,
239  * we'll assume segmentation offload is possible in most cases.  In addition to
240  * the packet payload, we need to account for the Ethernet header and the
241  * virtio_net_hdr, including its alignment padding (VIOIF_HEADER_SKIP).
242  */
243 #define	VIOIF_RX_DATA_SIZE		65550
244 #define	VIOIF_RX_BUF_SIZE		(VIOIF_RX_DATA_SIZE + \
245 					    sizeof (struct ether_header) + \
246 					    VIOIF_HEADER_SKIP)
247 
248 /*
249  * If we assume that a large allocation will probably have mostly 4K page sized
250  * cookies, 64 segments allow us 256KB for a single frame.  We're in control
251  * of the allocation we use for receive buffers, so this value only has an
252  * impact on the length of chain we're able to create for external transmit
253  * buffer mappings.
254  */
255 #define	VIOIF_MAX_SEGS			64
256 
257 /*
258  * We pre-allocate a reasonably large buffer into which small packets are
259  * copied.  Bigger packets are mapped instead, and packets with multiple
260  * cookies are mapped as indirect buffers.
261  */
262 #define	VIOIF_TX_INLINE_SIZE		(2 * 1024)
263 
264 
265 /*
266  * TYPE DEFINITIONS
267  */
268 
269 typedef struct vioif vioif_t;	/* forward declaration; struct vioif is defined below */
270 
271 /*
272  * Receive buffers are allocated in advance as a combination of DMA memory and
273  * a descriptor chain.  Receive buffers can be loaned to the networking stack
274  * to avoid copying, and this object contains the free routine to pass to
275  * desballoc().
276  *
277  * When receive buffers are not in use, they are linked into the per-instance
278  * free list, "vif_rxbufs", via "rb_link".  Under normal conditions, we expect
279  * the free list to be empty much of the time; most buffers will be in the ring
280  * or on loan.
281  */
282 typedef struct vioif_rxbuf {
283 	vioif_t				*rb_vioif;	/* driver instance */
284 	frtn_t				rb_frtn;	/* for desballoc() */
285 
286 	virtio_dma_t			*rb_dma;	/* DMA memory */
287 	virtio_chain_t			*rb_chain;	/* descriptor chain */
288 
289 	list_node_t			rb_link;	/* vif_rxbufs linkage */
290 } vioif_rxbuf_t;
291 
292 /*
293  * Transmit buffers are also allocated in advance.  DMA memory is allocated for
294  * the virtio net header, and to hold small packets.  Larger packets are mapped
295  * from storage loaned to the driver by the network stack.
296  *
297  * When transmit buffers are not in use, they are linked into the per-instance
298  * free list, "vif_txbufs", via "tb_link".
299  */
300 typedef struct vioif_txbuf {
301 	mblk_t				*tb_mp;	/* NOTE(review): presumably the loaned mblk; confirm */
302 
303 	/*
304 	 * Inline buffer space (VIOIF_TX_INLINE_SIZE) for storage of the virtio
305 	 * net header, and to hold copied (rather than mapped) packet data.
306 	 */
307 	virtio_dma_t			*tb_dma;
308 	virtio_chain_t			*tb_chain;	/* virtio chain for this buffer */
309 
310 	/*
311 	 * External buffer mapping.  The capacity is fixed at allocation time,
312 	 * and "tb_ndmaext" tracks the current number of mappings.
313 	 */
314 	virtio_dma_t			**tb_dmaext;
315 	uint_t				tb_dmaext_capacity;
316 	uint_t				tb_ndmaext;
317 
318 	list_node_t			tb_link;	/* vif_txbufs linkage */
319 } vioif_txbuf_t;
320 
321 typedef enum vioif_runstate {
322 	VIOIF_RUNSTATE_STOPPED = 1,	/* not RUNNING and not draining */
323 	VIOIF_RUNSTATE_STOPPING,	/* draining resources */
324 	VIOIF_RUNSTATE_RUNNING		/* mc_start(9E) .. mc_stop(9E) */
325 } vioif_runstate_t;
326 
327 /*
328  * Per-instance driver object (vioif_t).
329  */
330 struct vioif {
331 	dev_info_t			*vif_dip;	/* devinfo node */
332 	virtio_t			*vif_virtio;	/* virtio framework handle */
333 
334 	kmutex_t			vif_mutex;
335 
336 	/*
337 	 * The NIC is considered RUNNING between the mc_start(9E) and
338 	 * mc_stop(9E) calls.  Otherwise it is STOPPING (while draining
339 	 * resources) then STOPPED.  When not RUNNING, we will drop incoming
340 	 * frames and refuse to insert more receive buffers into the receive
341 	 * queue.
342 	 */
343 	vioif_runstate_t		vif_runstate;
344 
345 	mac_handle_t			vif_mac_handle;	/* MAC framework handle */
346 
347 	virtio_queue_t			*vif_rx_vq;	/* receive virtqueue */
348 	virtio_queue_t			*vif_tx_vq;	/* transmit virtqueue */
349 
350 	/* TX virtqueue management resources */
351 	boolean_t			vif_tx_corked;
352 	boolean_t			vif_tx_drain;
353 	timeout_id_t			vif_tx_reclaim_tid;
354 
355 	/*
356 	 * Configured offload features:
357 	 */
358 	unsigned int			vif_tx_csum:1;
359 	unsigned int			vif_tx_tso4:1;
360 	unsigned int			vif_tx_tso6:1;
361 
362 	/*
363 	 * For debugging, it is useful to know whether the MAC address we
364 	 * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
365 	 * was otherwise generated or set from within the guest.
366 	 */
367 	unsigned int			vif_mac_from_host:1;
368 
369 	uint_t				vif_mtu;
370 	uint_t				vif_mtu_max;
371 	uint8_t				vif_mac[ETHERADDRL];	/* MAC address in use */
372 
373 	/*
374 	 * Receive buffer free list and accounting:
375 	 */
376 	list_t				vif_rxbufs;
377 	uint_t				vif_nrxbufs_alloc;
378 	uint_t				vif_nrxbufs_onloan;
379 	uint_t				vif_nrxbufs_onloan_max;
380 	uint_t				vif_rxbufs_capacity;
381 	vioif_rxbuf_t			*vif_rxbufs_mem;
382 
383 	/*
384 	 * Transmit buffer free list and accounting:
385 	 */
386 	list_t				vif_txbufs;
387 	uint_t				vif_ntxbufs_alloc;
388 	uint_t				vif_txbufs_capacity;
389 	vioif_txbuf_t			*vif_txbufs_mem;
390 
391 	/*
392 	 * These copy size thresholds are exposed as private MAC properties so
393 	 * that they can be tuned without rebooting.
394 	 */
395 	uint_t				vif_rxcopy_thresh;
396 	uint_t				vif_txcopy_thresh;
397 
398 	/*
399 	 * Statistics visible through mac:
400 	 */
401 	uint64_t			vif_ipackets;
402 	uint64_t			vif_opackets;
403 	uint64_t			vif_rbytes;
404 	uint64_t			vif_obytes;
405 	uint64_t			vif_brdcstxmt;
406 	uint64_t			vif_brdcstrcv;
407 	uint64_t			vif_multixmt;
408 	uint64_t			vif_multircv;
409 	uint64_t			vif_norecvbuf;
410 	uint64_t			vif_notxbuf;
411 	uint64_t			vif_ierrors;
412 	uint64_t			vif_oerrors;
413 
414 	/*
415 	 * Internal debugging statistics:
416 	 */
417 	uint64_t			vif_rxfail_dma_handle;
418 	uint64_t			vif_rxfail_dma_buffer;
419 	uint64_t			vif_rxfail_dma_bind;
420 	uint64_t			vif_rxfail_chain_undersize;
421 	uint64_t			vif_rxfail_no_descriptors;
422 	uint64_t			vif_txfail_dma_handle;
423 	uint64_t			vif_txfail_dma_bind;
424 	uint64_t			vif_txfail_indirect_limit;
425 
426 	uint64_t			vif_stat_tx_reclaim;
427 };
428 
429 #ifdef __cplusplus
430 }
431 #endif
432 
433 #endif /* _VIOIF_H */
434