xref: /freebsd/sys/dev/virtio/network/virtio_net.h (revision 4133f23624058951a3b66e3ad735de980a485f36)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * This header is BSD licensed so anyone can use the definitions to implement
5  * compatible drivers/servers.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of IBM nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * $FreeBSD$
31  */
32 
33 #ifndef _VIRTIO_NET_H
34 #define _VIRTIO_NET_H
35 
/*
 * The feature bitmap for virtio net.  Each value is a single bit that the
 * guest driver and the host device may negotiate.
 */
#define VIRTIO_NET_F_CSUM	0x00001 /* Host handles pkts w/ partial csum */
#define VIRTIO_NET_F_GUEST_CSUM 0x00002 /* Guest handles pkts w/ partial csum*/
#define VIRTIO_NET_F_MAC	0x00020 /* Host has given MAC address. */
#define VIRTIO_NET_F_GSO	0x00040 /* Host handles pkts w/ any GSO type */
#define VIRTIO_NET_F_GUEST_TSO4	0x00080 /* Guest can handle TSOv4 in. */
#define VIRTIO_NET_F_GUEST_TSO6	0x00100 /* Guest can handle TSOv6 in. */
#define VIRTIO_NET_F_GUEST_ECN	0x00200 /* Guest can handle TSO[6] w/ ECN in.*/
#define VIRTIO_NET_F_GUEST_UFO	0x00400 /* Guest can handle UFO in. */
#define VIRTIO_NET_F_HOST_TSO4	0x00800 /* Host can handle TSOv4 in. */
#define VIRTIO_NET_F_HOST_TSO6	0x01000 /* Host can handle TSOv6 in. */
#define VIRTIO_NET_F_HOST_ECN	0x02000 /* Host can handle TSO[6] w/ ECN in. */
#define VIRTIO_NET_F_HOST_UFO	0x04000 /* Host can handle UFO in. */
#define VIRTIO_NET_F_MRG_RXBUF	0x08000 /* Host can merge receive buffers. */
#define VIRTIO_NET_F_STATUS	0x10000 /* virtio_net_config.status available*/
#define VIRTIO_NET_F_CTRL_VQ	0x20000 /* Control channel available */
#define VIRTIO_NET_F_CTRL_RX	0x40000 /* Control channel RX mode support */
#define VIRTIO_NET_F_CTRL_VLAN	0x80000 /* Control channel VLAN filtering */
#define VIRTIO_NET_F_CTRL_RX_EXTRA 0x100000 /* Extra RX mode control support */
#define VIRTIO_NET_F_GUEST_ANNOUNCE 0x200000 /* Announce device on network */
#define VIRTIO_NET_F_MQ		0x400000 /* Device supports RFS */
#define VIRTIO_NET_F_CTRL_MAC_ADDR 0x800000 /* Set MAC address */

/*
 * Bits of virtio_net_config.status; the field is only available when
 * VIRTIO_NET_F_STATUS has been negotiated.
 */
#define VIRTIO_NET_S_LINK_UP	1	/* Link is up */
#define VIRTIO_NET_S_ANNOUNCE	2	/* Announcement is needed */
60 
61 struct virtio_net_config {
62 	/* The config defining mac address (if VIRTIO_NET_F_MAC) */
63 	uint8_t		mac[ETHER_ADDR_LEN];
64 	/* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
65 	uint16_t	status;
66 	/* Maximum number of each of transmit and receive queues;
67 	 * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
68 	 * Legal values are between 1 and 0x8000.
69 	 */
70 	uint16_t	max_virtqueue_pairs;
71 } __packed;
72 
/*
 * Per-packet header.  It is the first element of the scatter-gather
 * list; if neither the GSO nor the CSUM features are specified, it can
 * simply be ignored.
 */

/* Bits of virtio_net_hdr.flags. */
#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* Use csum_start,csum_offset*/
#define VIRTIO_NET_HDR_F_DATA_VALID	2	/* Csum is valid */

/* Values of virtio_net_hdr.gso_type; the ECN bit is or'ed into the type. */
#define VIRTIO_NET_HDR_GSO_NONE		0	/* Not a GSO frame */
#define VIRTIO_NET_HDR_GSO_TCPV4	1	/* GSO frame, IPv4 TCP (TSO) */
#define VIRTIO_NET_HDR_GSO_UDP		3	/* GSO frame, IPv4 UDP (UFO) */
#define VIRTIO_NET_HDR_GSO_TCPV6	4	/* GSO frame, IPv6 TCP */
#define VIRTIO_NET_HDR_GSO_ECN		0x80	/* TCP has ECN set */

struct virtio_net_hdr {
	uint8_t		flags;		/* VIRTIO_NET_HDR_F_* */
	uint8_t		gso_type;	/* VIRTIO_NET_HDR_GSO_* */
	uint16_t	hdr_len;	/* Ethernet + IP + tcp/udp hdrs */
	uint16_t	gso_size;	/* Bytes to append to hdr_len per frame */
	uint16_t	csum_start;	/* Position to start checksumming from */
	uint16_t	csum_offset;	/* Offset after that to place checksum */
};
92 
93 /*
94  * This is the version of the header to use when the MRG_RXBUF
95  * feature has been negotiated.
96  */
97 struct virtio_net_hdr_mrg_rxbuf {
98 	struct virtio_net_hdr hdr;
99 	uint16_t num_buffers;	/* Number of merged rx buffers */
100 };
101 
/*
 * Control virtqueue data structures
 *
 * A control request starts with this header in the first sg entry and
 * ends with an ack/status response in the last entry.  The data for
 * the command goes in between.
 */
struct virtio_net_ctrl_hdr {
	uint8_t	class;	/* One of the VIRTIO_NET_CTRL_* classes */
	uint8_t	cmd;	/* Command within that class */
} __packed;

/* Values of the ack/status response. */
#define VIRTIO_NET_OK	0
#define VIRTIO_NET_ERR	1
116 
/*
 * RX mode control (promiscuous, allmulti, etc.).
 *
 * Every command requires an "out" sg entry containing a 1 byte state
 * value: zero disables the mode, non-zero enables it.  Commands 0 and 1
 * are supported with the VIRTIO_NET_F_CTRL_RX feature; commands 2-5 are
 * added with VIRTIO_NET_F_CTRL_RX_EXTRA.
 */
#define VIRTIO_NET_CTRL_RX		0	/* Class */
#define VIRTIO_NET_CTRL_RX_PROMISC	0
#define VIRTIO_NET_CTRL_RX_ALLMULTI	1
#define VIRTIO_NET_CTRL_RX_ALLUNI	2
#define VIRTIO_NET_CTRL_RX_NOMULTI	3
#define VIRTIO_NET_CTRL_RX_NOUNI	4
#define VIRTIO_NET_CTRL_RX_NOBCAST	5
131 
132 /*
133  * Control the MAC filter table.
134  *
135  * The MAC filter table is managed by the hypervisor, the guest should
136  * assume the size is infinite.  Filtering should be considered
137  * non-perfect, ie. based on hypervisor resources, the guest may
138  * received packets from sources not specified in the filter list.
139  *
140  * In addition to the class/cmd header, the TABLE_SET command requires
141  * two out scatterlists.  Each contains a 4 byte count of entries followed
142  * by a concatenated byte stream of the ETH_ALEN MAC addresses.  The
143  * first sg list contains unicast addresses, the second is for multicast.
144  * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
145  * is available.
146  *
147  * The ADDR_SET command requests one out scatterlist, it contains a
148  * 6 bytes MAC address. This functionality is present if the
149  * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available.
150  */
151 struct virtio_net_ctrl_mac {
152 	uint32_t	entries;
153 	uint8_t		macs[][ETHER_ADDR_LEN];
154 } __packed;
155 
/* MAC filtering class and its commands. */
#define VIRTIO_NET_CTRL_MAC		1	/* Class */
#define VIRTIO_NET_CTRL_MAC_TABLE_SET	0
#define VIRTIO_NET_CTRL_MAC_ADDR_SET	1
159 
/*
 * VLAN filtering control.
 *
 * The VLAN filter table is driven through a simple ADD/DEL interface;
 * DEL is the opposite of ADD.  VLAN IDs that have not been added may be
 * filtered by the hypervisor.  Both commands expect an out entry
 * containing a 2 byte VLAN ID.  VLAN filtering is available with the
 * VIRTIO_NET_F_CTRL_VLAN feature bit.
 */
#define VIRTIO_NET_CTRL_VLAN		2	/* Class */
#define VIRTIO_NET_CTRL_VLAN_ADD	0
#define VIRTIO_NET_CTRL_VLAN_DEL	1
172 
/*
 * Link announce acknowledgement control.
 *
 * The driver issues VIRTIO_NET_CTRL_ANNOUNCE_ACK to indicate that it
 * has received the notification; the device clears the
 * VIRTIO_NET_S_ANNOUNCE bit in the status field after it receives this
 * command.
 */
#define VIRTIO_NET_CTRL_ANNOUNCE	3	/* Class */
#define VIRTIO_NET_CTRL_ANNOUNCE_ACK	0
183 
/*
 * Receive Flow Steering control.
 *
 * VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET enables Receive Flow Steering and
 * specifies the number of transmit and receive queues that will be
 * used.  Once the device has consumed and acked the command, it will
 * not steer new packets onto receive virtqueues other than those
 * specified, nor read from transmit virtqueues other than those
 * specified.  Accordingly, the driver should not transmit new packets
 * on virtqueues other than those specified.
 */
struct virtio_net_ctrl_mq {
	uint16_t	virtqueue_pairs;	/* Number of queue pairs to use */
} __packed;

#define VIRTIO_NET_CTRL_MQ			4	/* Class */
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET		0
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN		1
#define VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX		0x8000
203 
204 /*
205  * Use the checksum offset in the VirtIO header to set the
206  * correct CSUM_* flags.
207  */
208 static inline int
209 virtio_net_rx_csum_by_offset(struct mbuf *m, uint16_t eth_type, int ip_start,
210 			struct virtio_net_hdr *hdr)
211 {
212 #if defined(INET) || defined(INET6)
213 	int offset = hdr->csum_start + hdr->csum_offset;
214 #endif
215 
216 	/* Only do a basic sanity check on the offset. */
217 	switch (eth_type) {
218 #if defined(INET)
219 	case ETHERTYPE_IP:
220 		if (__predict_false(offset < ip_start + sizeof(struct ip)))
221 			return (1);
222 		break;
223 #endif
224 #if defined(INET6)
225 	case ETHERTYPE_IPV6:
226 		if (__predict_false(offset < ip_start + sizeof(struct ip6_hdr)))
227 			return (1);
228 		break;
229 #endif
230 	default:
231 		/* Here we should increment the rx_csum_bad_ethtype counter. */
232 		return (1);
233 	}
234 
235 	/*
236 	 * Use the offset to determine the appropriate CSUM_* flags. This is
237 	 * a bit dirty, but we can get by with it since the checksum offsets
238 	 * happen to be different. We assume the host host does not do IPv4
239 	 * header checksum offloading.
240 	 */
241 	switch (hdr->csum_offset) {
242 	case offsetof(struct udphdr, uh_sum):
243 	case offsetof(struct tcphdr, th_sum):
244 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
245 		m->m_pkthdr.csum_data = 0xFFFF;
246 		break;
247 	default:
248 		/* Here we should increment the rx_csum_bad_offset counter. */
249 		return (1);
250 	}
251 
252 	return (0);
253 }
254 
255 static inline int
256 virtio_net_rx_csum_by_parse(struct mbuf *m, uint16_t eth_type, int ip_start,
257 		       struct virtio_net_hdr *hdr)
258 {
259 	int offset, proto;
260 
261 	switch (eth_type) {
262 #if defined(INET)
263 	case ETHERTYPE_IP: {
264 		struct ip *ip;
265 		if (__predict_false(m->m_len < ip_start + sizeof(struct ip)))
266 			return (1);
267 		ip = (struct ip *)(m->m_data + ip_start);
268 		proto = ip->ip_p;
269 		offset = ip_start + (ip->ip_hl << 2);
270 		break;
271 	}
272 #endif
273 #if defined(INET6)
274 	case ETHERTYPE_IPV6:
275 		if (__predict_false(m->m_len < ip_start +
276 		    sizeof(struct ip6_hdr)))
277 			return (1);
278 		offset = ip6_lasthdr(m, ip_start, IPPROTO_IPV6, &proto);
279 		if (__predict_false(offset < 0))
280 			return (1);
281 		break;
282 #endif
283 	default:
284 		/* Here we should increment the rx_csum_bad_ethtype counter. */
285 		return (1);
286 	}
287 
288 	switch (proto) {
289 	case IPPROTO_TCP:
290 		if (__predict_false(m->m_len < offset + sizeof(struct tcphdr)))
291 			return (1);
292 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
293 		m->m_pkthdr.csum_data = 0xFFFF;
294 		break;
295 	case IPPROTO_UDP:
296 		if (__predict_false(m->m_len < offset + sizeof(struct udphdr)))
297 			return (1);
298 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
299 		m->m_pkthdr.csum_data = 0xFFFF;
300 		break;
301 	default:
302 		/*
303 		 * For the remaining protocols, FreeBSD does not support
304 		 * checksum offloading, so the checksum will be recomputed.
305 		 */
306 #if 0
307 		if_printf(ifp, "cksum offload of unsupported "
308 		    "protocol eth_type=%#x proto=%d csum_start=%d "
309 		    "csum_offset=%d\n", __func__, eth_type, proto,
310 		    hdr->csum_start, hdr->csum_offset);
311 #endif
312 		break;
313 	}
314 
315 	return (0);
316 }
317 
318 /*
319  * Set the appropriate CSUM_* flags. Unfortunately, the information
320  * provided is not directly useful to us. The VirtIO header gives the
321  * offset of the checksum, which is all Linux needs, but this is not
322  * how FreeBSD does things. We are forced to peek inside the packet
323  * a bit.
324  *
325  * It would be nice if VirtIO gave us the L4 protocol or if FreeBSD
326  * could accept the offsets and let the stack figure it out.
327  */
328 static inline int
329 virtio_net_rx_csum(struct mbuf *m, struct virtio_net_hdr *hdr)
330 {
331 	struct ether_header *eh;
332 	struct ether_vlan_header *evh;
333 	uint16_t eth_type;
334 	int offset, error;
335 
336 	if ((hdr->flags & (VIRTIO_NET_HDR_F_NEEDS_CSUM |
337 	    VIRTIO_NET_HDR_F_DATA_VALID)) == 0) {
338 		return (0);
339 	}
340 
341 	eh = mtod(m, struct ether_header *);
342 	eth_type = ntohs(eh->ether_type);
343 	if (eth_type == ETHERTYPE_VLAN) {
344 		/* BMV: We should handle nested VLAN tags too. */
345 		evh = mtod(m, struct ether_vlan_header *);
346 		eth_type = ntohs(evh->evl_proto);
347 		offset = sizeof(struct ether_vlan_header);
348 	} else
349 		offset = sizeof(struct ether_header);
350 
351 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
352 		error = virtio_net_rx_csum_by_offset(m, eth_type, offset, hdr);
353 	else
354 		error = virtio_net_rx_csum_by_parse(m, eth_type, offset, hdr);
355 
356 	return (error);
357 }
358 
359 static inline int
360 virtio_net_tx_offload_ctx(struct mbuf *m, int *etype, int *proto, int *start)
361 {
362 	struct ether_vlan_header *evh;
363 	int offset;
364 
365 	evh = mtod(m, struct ether_vlan_header *);
366 	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
367 		/* BMV: We should handle nested VLAN tags too. */
368 		*etype = ntohs(evh->evl_proto);
369 		offset = sizeof(struct ether_vlan_header);
370 	} else {
371 		*etype = ntohs(evh->evl_encap_proto);
372 		offset = sizeof(struct ether_header);
373 	}
374 
375 	switch (*etype) {
376 #if defined(INET)
377 	case ETHERTYPE_IP: {
378 		struct ip *ip, iphdr;
379 		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
380 			m_copydata(m, offset, sizeof(struct ip),
381 			    (caddr_t) &iphdr);
382 			ip = &iphdr;
383 		} else
384 			ip = (struct ip *)(m->m_data + offset);
385 		*proto = ip->ip_p;
386 		*start = offset + (ip->ip_hl << 2);
387 		break;
388 	}
389 #endif
390 #if defined(INET6)
391 	case ETHERTYPE_IPV6:
392 		*proto = -1;
393 		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
394 		/* Assert the network stack sent us a valid packet. */
395 		KASSERT(*start > offset,
396 		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
397 		    *start, offset, *proto));
398 		break;
399 #endif
400 	default:
401 		/* Here we should increment the tx_csum_bad_ethtype counter. */
402 		return (EINVAL);
403 	}
404 
405 	return (0);
406 }
407 
408 static inline int
409 virtio_net_tx_offload_tso(if_t ifp, struct mbuf *m, int eth_type,
410 		     int offset, bool allow_ecn, struct virtio_net_hdr *hdr)
411 {
412 	static struct timeval lastecn;
413 	static int curecn;
414 	struct tcphdr *tcp, tcphdr;
415 
416 	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
417 		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
418 		tcp = &tcphdr;
419 	} else
420 		tcp = (struct tcphdr *)(m->m_data + offset);
421 
422 	hdr->hdr_len = offset + (tcp->th_off << 2);
423 	hdr->gso_size = m->m_pkthdr.tso_segsz;
424 	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
425 	    VIRTIO_NET_HDR_GSO_TCPV6;
426 
427 	if (tcp->th_flags & TH_CWR) {
428 		/*
429 		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In FreeBSD,
430 		 * ECN support is not on a per-interface basis, but globally via
431 		 * the net.inet.tcp.ecn.enable sysctl knob. The default is off.
432 		 */
433 		if (!allow_ecn) {
434 			if (ppsratecheck(&lastecn, &curecn, 1))
435 				if_printf(ifp,
436 				    "TSO with ECN not negotiated with host\n");
437 			return (ENOTSUP);
438 		}
439 		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
440 	}
441 
442 	/* Here we should increment tx_tso counter. */
443 
444 	return (0);
445 }
446 
447 static inline struct mbuf *
448 virtio_net_tx_offload(if_t ifp, struct mbuf *m, bool allow_ecn,
449 		 struct virtio_net_hdr *hdr)
450 {
451 	int flags, etype, csum_start, proto, error;
452 
453 	flags = m->m_pkthdr.csum_flags;
454 
455 	error = virtio_net_tx_offload_ctx(m, &etype, &proto, &csum_start);
456 	if (error)
457 		goto drop;
458 
459 	if ((etype == ETHERTYPE_IP && (flags & (CSUM_TCP | CSUM_UDP))) ||
460 	    (etype == ETHERTYPE_IPV6 &&
461 	        (flags & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6)))) {
462 		/*
463 		 * We could compare the IP protocol vs the CSUM_ flag too,
464 		 * but that really should not be necessary.
465 		 */
466 		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
467 		hdr->csum_start = csum_start;
468 		hdr->csum_offset = m->m_pkthdr.csum_data;
469 		/* Here we should increment the tx_csum counter. */
470 	}
471 
472 	if (flags & CSUM_TSO) {
473 		if (__predict_false(proto != IPPROTO_TCP)) {
474 			/* Likely failed to correctly parse the mbuf.
475 			 * Here we should increment the tx_tso_not_tcp
476 			 * counter. */
477 			goto drop;
478 		}
479 
480 		KASSERT(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM,
481 		    ("%s: mbuf %p TSO without checksum offload %#x",
482 		    __func__, m, flags));
483 
484 		error = virtio_net_tx_offload_tso(ifp, m, etype, csum_start,
485 					     allow_ecn, hdr);
486 		if (error)
487 			goto drop;
488 	}
489 
490 	return (m);
491 
492 drop:
493 	m_freem(m);
494 	return (NULL);
495 }
496 
497 #endif /* _VIRTIO_NET_H */
498