xref: /illumos-gate/usr/src/uts/common/io/vioif/vioif.h (revision c21bd51d7acbaf77116c4cc3a23dfc6d16c637c2)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2021 Joyent, Inc.
14  */
15 
16 /*
17  * VIRTIO NETWORK DRIVER
18  */
19 
20 #ifndef _VIOIF_H
21 #define	_VIOIF_H
22 
23 #include "virtio.h"
24 
25 #ifdef __cplusplus
26 extern "C" {
27 #endif
28 
29 /*
30  * VIRTIO NETWORK CONFIGURATION REGISTERS
31  *
32  * These are offsets into the device-specific configuration space available
33  * through the virtio_dev_*() family of functions.
34  */
35 #define	VIRTIO_NET_CONFIG_MAC		0x00	/* 48 bits, R/W: MAC address */
36 #define	VIRTIO_NET_CONFIG_STATUS	0x06	/* 16 bits, R: link state */
37 #define	VIRTIO_NET_CONFIG_MAX_VQ_PAIRS	0x08	/* 16 bits, R: max queue pairs */
38 #define	VIRTIO_NET_CONFIG_MTU		0x0A	/* 16 bits, R: maximum MTU */
39 
40 /*
41  * VIRTIO NETWORK VIRTQUEUES
42  *
43  * Note that the control queue is only present if VIRTIO_NET_F_CTRL_VQ is
44  * negotiated with the device.
45  */
46 #define	VIRTIO_NET_VIRTQ_RX		0	/* receive queue */
47 #define	VIRTIO_NET_VIRTQ_TX		1	/* transmit queue */
48 #define	VIRTIO_NET_VIRTQ_CONTROL	2	/* needs VIRTIO_NET_F_CTRL_VQ */
49 
50 /*
51  * VIRTIO NETWORK FEATURE BITS
52  */
53 
54 /*
55  * CSUM, GUEST_CSUM:
56  *	Partial checksum support.  These features signal that the device will
57  *	accept packets with partial checksums (CSUM), and that the driver will
58  *	accept packets with partial checksums (GUEST_CSUM).  These features
59  *	combine the use of the VIRTIO_NET_HDR_F_NEEDS_CSUM flag, and the
60  *	"csum_start" and "csum_offset" fields, in the virtio net header.
61  */
62 #define	VIRTIO_NET_F_CSUM		(1ULL << 0)
63 #define	VIRTIO_NET_F_GUEST_CSUM		(1ULL << 1)
64 
65 /*
66  * MTU:
67  *	The device offers a maximum MTU value at VIRTIO_NET_CONFIG_MTU.  If
68  *	this is not negotiated, we allow the largest possible MTU that our
69  *	buffer allocations support in case jumbo frames are tacitly supported
70  *	by the device.  The default MTU is always 1500.
71  */
72 #define	VIRTIO_NET_F_MTU		(1ULL << 3)
73 
74 /*
75  * MAC:
76  *	The device has an assigned primary MAC address.  If this feature bit is
77  *	not set, the driver must provide a locally assigned MAC address.  See
78  *	IEEE 802, "48-bit universal LAN MAC addresses" for more details on
79  *	assignment.
80  */
81 #define	VIRTIO_NET_F_MAC		(1ULL << 5)
82 
83 /*
84  * GUEST_TSO4, GUEST_TSO6, GUEST_UFO:
85  *	Inbound segmentation offload support.  These features depend on having
86  *	VIRTIO_NET_F_GUEST_CSUM and signal that the driver can accept large
87  *	combined TCP (v4 or v6) packets, or reassembled UDP fragments.
88  */
89 #define	VIRTIO_NET_F_GUEST_TSO4		(1ULL << 7)
90 #define	VIRTIO_NET_F_GUEST_TSO6		(1ULL << 8)
91 #define	VIRTIO_NET_F_GUEST_UFO		(1ULL << 10)
92 
93 /*
94  * GUEST_ECN:
95  *	Depends on either VIRTIO_NET_F_GUEST_TSO4 or VIRTIO_NET_F_GUEST_TSO6.
96  *	This feature means the driver will look for the VIRTIO_NET_HDR_GSO_ECN
97  *	bit in the "gso_type" of the virtio net header.  This bit tells the
98  *	driver that the Explicit Congestion Notification (ECN) bit was set in
99  *	the original TCP packets.
100  */
101 #define	VIRTIO_NET_F_GUEST_ECN		(1ULL << 9)
102 
103 /*
104  * HOST_TSO4, HOST_TSO6, HOST_UFO:
105  *	Outbound segmentation offload support.  These features depend on having
106  *	VIRTIO_NET_F_CSUM and signal that the device will accept large combined
107  *	TCP (v4 or v6) packets that require segmentation offload, or large
108  *	combined UDP packets that require fragmentation offload.
109  */
110 #define	VIRTIO_NET_F_HOST_TSO4		(1ULL << 11)
111 #define	VIRTIO_NET_F_HOST_TSO6		(1ULL << 12)
112 #define	VIRTIO_NET_F_HOST_UFO		(1ULL << 14)
113 
114 /*
115  * HOST_ECN:
116  *	Depends on either VIRTIO_NET_F_HOST_TSO4 or VIRTIO_NET_F_HOST_TSO6.
117  *	This feature means the device will accept packets that both require
118  *	segmentation offload and have the Explicit Congestion Notification
119  *	(ECN) bit set.  If this feature is not present, the driver must not
120  *	send large segments that require ECN to be set.
121  */
122 #define	VIRTIO_NET_F_HOST_ECN		(1ULL << 13)
123 
124 /*
125  * GSO:
126  *	The GSO feature is, in theory, the combination of HOST_TSO4, HOST_TSO6,
127  *	and HOST_ECN.  This is only useful for legacy devices; newer devices
128  *	should be using the more specific bits above.
129  */
130 #define	VIRTIO_NET_F_GSO		(1ULL << 6)
131 
132 /*
133  * MRG_RXBUF:
134  *	This feature allows the receipt of large packets without needing to
135  *	allocate large buffers.  The "virtio_net_hdr" will include an extra
136  *	value: the number of buffers to gang together.
137  */
138 #define	VIRTIO_NET_F_MRG_RXBUF		(1ULL << 15)
139 
140 /*
141  * STATUS:
142  *	The VIRTIO_NET_CONFIG_STATUS configuration register is available, which
143  *	allows the driver to read the link state from the device.
144  */
145 #define	VIRTIO_NET_F_STATUS		(1ULL << 16)
146 
147 /*
148  * CTRL_VQ, CTRL_RX, CTRL_VLAN, CTRL_RX_EXTRA:
149  *	These features signal that the device exposes the control queue
150  *	(VIRTIO_NET_VIRTQ_CONTROL), in the case of CTRL_VQ; and that the control
151  *	queue supports extra commands (CTRL_RX, CTRL_VLAN, CTRL_RX_EXTRA).
152  */
153 #define	VIRTIO_NET_F_CTRL_VQ		(1ULL << 17)
154 #define	VIRTIO_NET_F_CTRL_RX		(1ULL << 18)
155 #define	VIRTIO_NET_F_CTRL_VLAN		(1ULL << 19)
156 #define	VIRTIO_NET_F_CTRL_RX_EXTRA	(1ULL << 20)
157 
158 /*
159  * These features are supported by the driver and we will request them from the
160  * device.  Note that we do not currently request GUEST_CSUM, or any other
161  * GUEST_* feature, as the driver does not presently support receiving frames
162  * with any offload features from the device.
163  */
164 #define	VIRTIO_NET_WANTED_FEATURES	(VIRTIO_NET_F_CSUM |		\
165 					VIRTIO_NET_F_GSO |		\
166 					VIRTIO_NET_F_HOST_TSO4 |	\
167 					VIRTIO_NET_F_HOST_TSO6 |	\
168 					VIRTIO_NET_F_HOST_ECN |		\
169 					VIRTIO_NET_F_MAC |		\
170 					VIRTIO_NET_F_MTU |		\
171 					VIRTIO_NET_F_CTRL_VQ |		\
172 					VIRTIO_NET_F_CTRL_RX)
173 
174 /*
175  * VIRTIO NETWORK HEADER
176  *
177  * This structure appears at the start of each transmit or receive packet
178  * buffer.
179  */
180 struct virtio_net_hdr {
181 	uint8_t				vnh_flags;	/* VIRTIO_NET_HDR_F_* */
182 	uint8_t				vnh_gso_type;	/* VIRTIO_NET_HDR_GSO_* */
183 	uint16_t			vnh_hdr_len;
184 	uint16_t			vnh_gso_size;
185 	uint16_t			vnh_csum_start;	/* with NEEDS_CSUM */
186 	uint16_t			vnh_csum_offset; /* with NEEDS_CSUM */
187 } __packed;
188 
189 /*
190  * VIRTIO NETWORK HEADER: FLAGS (vnh_flags)
191  */
192 #define	VIRTIO_NET_HDR_F_NEEDS_CSUM	0x01	/* partial checksum; see CSUM */
193 
194 /*
195  * VIRTIO NETWORK HEADER: OFFLOAD OPTIONS (vnh_gso_type)
196  *
197  * Each of these is an offload type, except for the ECN value which is
198  * logically OR-ed with one of the other types.
199  */
200 #define	VIRTIO_NET_HDR_GSO_NONE		0	/* no offload */
201 #define	VIRTIO_NET_HDR_GSO_TCPV4	1	/* TCP over IPv4 */
202 #define	VIRTIO_NET_HDR_GSO_UDP		3	/* UDP */
203 #define	VIRTIO_NET_HDR_GSO_TCPV6	4	/* TCP over IPv6 */
204 #define	VIRTIO_NET_HDR_GSO_ECN		0x80	/* flag: OR-ed with a type */
205 
206 /*
207  * VIRTIO CONTROL VIRTQUEUE HEADER
208  *
209  * This structure appears at the start of each control virtqueue request.
210  */
211 struct virtio_net_ctrlq_hdr {
212 	uint8_t		vnch_class;	/* e.g. VIRTIO_NET_CTRL_RX */
213 	uint8_t		vnch_command;	/* e.g. VIRTIO_NET_CTRL_RX_* */
214 } __packed;
215 
216 /*
217  * Control Queue Classes (vnch_class)
218  */
219 #define	VIRTIO_NET_CTRL_RX		0
220 
221 /*
222  * CTRL_RX commands (vnch_command values for class VIRTIO_NET_CTRL_RX)
223  */
224 #define	VIRTIO_NET_CTRL_RX_PROMISC	0
225 #define	VIRTIO_NET_CTRL_RX_ALLMULTI	1
226 #define	VIRTIO_NET_CTRL_RX_ALLUNI	2
227 #define	VIRTIO_NET_CTRL_RX_NOMULTI	3
228 #define	VIRTIO_NET_CTRL_RX_NOUNI	4
229 #define	VIRTIO_NET_CTRL_RX_NOBCAST	5
230 
231 /*
232  * Control queue ack values
233  */
234 #define	VIRTIO_NET_CQ_OK		0
235 #define	VIRTIO_NET_CQ_ERR		1
236 
237 
238 /*
239  * DRIVER PARAMETERS
240  */
241 
242 /*
243  * At attach, we allocate a fixed pool of buffers for receipt and transmission
244  * of frames.  The maximum number of buffers of each type that we will allocate
245  * is specified here.  If the ring size is smaller than this number, we will
246  * use the ring size instead.
247  */
248 #define	VIRTIO_NET_TX_BUFS		256
249 #define	VIRTIO_NET_RX_BUFS		256
250 
251 /*
252  * Initially, only use a single buffer for control queue requests (when the
253  * control queue is present). If this becomes a bottleneck, we can simply
254  * increase this value as necessary.
255  */
256 #define	VIRTIO_NET_CTRL_BUFS		1
257 
258 /*
259  * The virtio net header and the first buffer segment share the same DMA
260  * allocation.  We round up the virtio header size to a multiple of 4 and add 2
261  * bytes so that the IP header, which starts immediately after the 14 or 18
262  * byte Ethernet header, is then correctly aligned:
263  *
264  *   0                10      12   14                              28/32
265  *   | virtio_net_hdr | %4==0 | +2 | Ethernet header (14/18 bytes) | IPv4 ...
266  *
267  * Note that for this to work correctly, the DMA allocation must also be 4 byte
268  * aligned.
269  */
270 #define	VIOIF_HEADER_ALIGN		4
271 #define	VIOIF_HEADER_SKIP		(P2ROUNDUP( \
272 					    sizeof (struct virtio_net_hdr), \
273 					    VIOIF_HEADER_ALIGN) + 2)
274 
275 /*
276  * Given we are not negotiating VIRTIO_NET_F_MRG_RXBUF, the specification says
277  * we must be able to accept a 1514 byte packet, or if any segmentation offload
278  * features have been negotiated a 65550 byte packet.  To keep things simple,
279  * we'll assume segmentation offload is possible in most cases.  In addition to
280  * the packet payload, we need to account for the Ethernet header and the
281  * virtio_net_hdr.
282  */
283 #define	VIOIF_RX_DATA_SIZE		65550
284 #define	VIOIF_RX_BUF_SIZE		(VIOIF_RX_DATA_SIZE + \
285 					    sizeof (struct ether_header) + \
286 					    VIOIF_HEADER_SKIP)
287 
288 /*
289  * If we assume that a large allocation will probably have mostly 4K page sized
290  * cookies, 64 segments allows us 256KB for a single frame.  We're in control
291  * of the allocation we use for receive buffers, so this value only has an
292  * impact on the length of chain we're able to create for external transmit
293  * buffer mappings.
294  */
295 #define	VIOIF_MAX_SEGS			64
296 
297 /*
298  * We pre-allocate a reasonably large inline buffer into which small packets
299  * are copied.  Bigger packets are mapped; packets with multiple cookies are
300  * mapped as indirect buffers.
301  */
302 #define	VIOIF_TX_INLINE_SIZE		(2 * 1024)
303 
304 /*
305  * Control queue messages are very small. This is a rather arbitrary small
306  * buffer size that should be sufficiently large for any control queue
307  * messages we will send.
308  */
309 #define	VIOIF_CTRL_SIZE			256
310 
311 /*
312  * TYPE DEFINITIONS
313  */
314 
315 typedef struct vioif vioif_t;	/* per-instance object; defined below */
316 
317 /*
318  * Receive buffers are allocated in advance as a combination of DMA memory and
319  * a descriptor chain.  Receive buffers can be loaned to the networking stack
320  * to avoid copying, and this object contains the free routine to pass to
321  * desballoc().
322  *
323  * When receive buffers are not in use, they are linked into the per-instance
324  * free list, "vif_rxbufs" via "rb_link".  Under normal conditions, we expect
325  * the free list to be empty much of the time; most buffers will be in the ring
326  * or on loan.
327  */
328 typedef struct vioif_rxbuf {
329 	vioif_t				*rb_vioif;
330 	frtn_t				rb_frtn;	/* for desballoc() */
331 
332 	virtio_dma_t			*rb_dma;	/* DMA memory */
333 	virtio_chain_t			*rb_chain;	/* descriptor chain */
334 
335 	list_node_t			rb_link;	/* vif_rxbufs linkage */
336 } vioif_rxbuf_t;
337 
338 typedef struct vioif_ctrlbuf {	/* buffer for a control queue request */
339 	vioif_t				*cb_vioif;
340 
341 	virtio_dma_t			*cb_dma;	/* cf. rb_dma */
342 	virtio_chain_t			*cb_chain;	/* cf. rb_chain */
343 
344 	list_node_t			cb_link; /* presumably for vif_ctrlbufs */
345 } vioif_ctrlbuf_t;
346 
347 /*
348  * Transmit buffers are also allocated in advance.  DMA memory is allocated for
349  * the virtio net header, and to hold small packets.  Larger packets are mapped
350  * from storage loaned to the driver by the network stack.
351  *
352  * When transmit buffers are not in use, they are linked into the per-instance
353  * free list, "vif_txbufs" via "tb_link".
354  */
355 typedef struct vioif_txbuf {
356 	mblk_t				*tb_mp;	/* presumably the frame being sent */
357 
358 	/*
359 	 * Inline buffer space (VIOIF_TX_INLINE_SIZE) for storage of the virtio
360 	 * net header, and to hold copied (rather than mapped) packet data.
361 	 */
362 	virtio_dma_t			*tb_dma;
363 	virtio_chain_t			*tb_chain;
364 
365 	/*
366 	 * External buffer mapping.  The capacity is fixed at allocation time,
367 	 * and "tb_ndmaext" tracks the current number of mappings.
368 	 */
369 	virtio_dma_t			**tb_dmaext;
370 	uint_t				tb_dmaext_capacity;
371 	uint_t				tb_ndmaext;
372 
373 	list_node_t			tb_link;	/* vif_txbufs linkage */
374 } vioif_txbuf_t;
375 
376 typedef enum vioif_runstate {
377 	VIOIF_RUNSTATE_STOPPED = 1,	/* follows STOPPING */
378 	VIOIF_RUNSTATE_STOPPING,	/* draining resources */
379 	VIOIF_RUNSTATE_RUNNING		/* mc_start(9E) to mc_stop(9E) */
380 } vioif_runstate_t;
381 
382 /*
383  * Per-instance driver object.
384  */
385 struct vioif {
386 	dev_info_t			*vif_dip;	/* devinfo node */
387 	virtio_t			*vif_virtio;
388 
389 	kmutex_t			vif_mutex;
390 
391 	/*
392 	 * The NIC is considered RUNNING between the mc_start(9E) and
393 	 * mc_stop(9E) calls.  Otherwise it is STOPPING (while draining
394 	 * resources) then STOPPED.  When not RUNNING, we will drop incoming
395 	 * frames and refuse to insert more receive buffers into the receive
396 	 * queue.
397 	 */
398 	vioif_runstate_t		vif_runstate;
399 
400 	mac_handle_t			vif_mac_handle;
401 
402 	virtio_queue_t			*vif_rx_vq;	/* receive queue */
403 	virtio_queue_t			*vif_tx_vq;	/* transmit queue */
404 	virtio_queue_t			*vif_ctrl_vq;	/* control queue */
405 
406 	/* TX virtqueue management resources */
407 	boolean_t			vif_tx_corked;
408 	boolean_t			vif_tx_drain;
409 	timeout_id_t			vif_tx_reclaim_tid;
410 
411 	/*
412 	 * Configured offload features:
413 	 */
414 	unsigned int			vif_tx_csum:1;
415 	unsigned int			vif_tx_tso4:1;
416 	unsigned int			vif_tx_tso6:1;
417 
418 	/*
419 	 * For debugging, it is useful to know whether the MAC address we
420 	 * are using came from the host (via VIRTIO_NET_CONFIG_MAC) or
421 	 * was otherwise generated or set from within the guest.
422 	 */
423 	unsigned int			vif_mac_from_host:1;
424 
425 	unsigned int			vif_has_ctrlq:1; /* presumably CTRL_VQ */
426 	unsigned int			vif_has_ctrlq_rx:1; /* and CTRL_RX */
427 
428 	uint_t				vif_mtu;
429 	uint_t				vif_mtu_max;
430 	uint8_t				vif_mac[ETHERADDRL];
431 
432 	/*
433 	 * Receive buffer free list and accounting:
434 	 */
435 	list_t				vif_rxbufs;
436 	uint_t				vif_nrxbufs_alloc;
437 	uint_t				vif_nrxbufs_onloan;
438 	uint_t				vif_nrxbufs_onloan_max;
439 	uint_t				vif_rxbufs_capacity;
440 	vioif_rxbuf_t			*vif_rxbufs_mem;
441 
442 	/*
443 	 * Transmit buffer free list and accounting:
444 	 */
445 	list_t				vif_txbufs;
446 	uint_t				vif_ntxbufs_alloc;
447 	uint_t				vif_txbufs_capacity;
448 	vioif_txbuf_t			*vif_txbufs_mem;
449 
450 	/*
451 	 * These copy size thresholds are exposed as private MAC properties so
452 	 * that they can be tuned without rebooting.
453 	 */
454 	uint_t				vif_rxcopy_thresh;
455 	uint_t				vif_txcopy_thresh;
456 
457 	list_t				vif_ctrlbufs;	/* control buffers */
458 	uint_t				vif_nctrlbufs_alloc;
459 	uint_t				vif_ctrlbufs_capacity;
460 	vioif_ctrlbuf_t			*vif_ctrlbufs_mem;
461 
462 	/*
463 	 * Statistics visible through mac:
464 	 */
465 	uint64_t			vif_ipackets;
466 	uint64_t			vif_opackets;
467 	uint64_t			vif_rbytes;
468 	uint64_t			vif_obytes;
469 	uint64_t			vif_brdcstxmt;
470 	uint64_t			vif_brdcstrcv;
471 	uint64_t			vif_multixmt;
472 	uint64_t			vif_multircv;
473 	uint64_t			vif_norecvbuf;
474 	uint64_t			vif_notxbuf;
475 	uint64_t			vif_ierrors;
476 	uint64_t			vif_oerrors;
477 
478 	/*
479 	 * Internal debugging statistics:
480 	 */
481 	uint64_t			vif_rxfail_dma_handle;
482 	uint64_t			vif_rxfail_dma_buffer;
483 	uint64_t			vif_rxfail_dma_bind;
484 	uint64_t			vif_rxfail_chain_undersize;
485 	uint64_t			vif_rxfail_no_descriptors;
486 	uint64_t			vif_txfail_dma_handle;
487 	uint64_t			vif_txfail_dma_bind;
488 	uint64_t			vif_txfail_indirect_limit;
489 
490 	uint64_t			vif_stat_tx_reclaim;
491 
492 	uint64_t			vif_noctrlbuf;
493 	uint64_t			vif_ctrlbuf_toosmall;
494 };
495 
496 #ifdef __cplusplus
497 }
498 #endif
499 
500 #endif /* _VIOIF_H */
501