xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_impl.h (revision 7f3d7c9289dee6488b3cd2848a68c0b8580d750c)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
39  * Copyright 2025 Oxide Computer Company
40  */
41 
42 #ifndef	_VIONA_IMPL_H
43 #define	_VIONA_IMPL_H
44 
45 #include <sys/ddi.h>
46 #include <sys/list.h>
47 #include <sys/sunddi.h>
48 #include <sys/sunndi.h>
49 #include <sys/strsun.h>
50 #include <sys/sysmacros.h>
51 #include <sys/uio.h>
52 
53 #include <sys/mac_client.h>
54 #include <sys/mac_provider.h>
55 #include <sys/mac_client_priv.h>
56 #include <sys/neti.h>
57 #include <inet/ip.h>
58 #include <inet/tcp.h>
59 
60 #include <sys/vmm_drv.h>
61 #include <sys/viona_io.h>
62 
/*
 * Forward declarations, so that the types below can be referenced by pointer
 * ahead of their definitions.  Note that struct viona_desb is not defined in
 * this header at all.
 */
struct viona_link;
typedef struct viona_link viona_link_t;
struct viona_desb;
typedef struct viona_desb viona_desb_t;
struct viona_neti;
typedef struct viona_neti viona_neti_t;
69 
/*
 * Transfer outcome counters.  Embedded per-ring (vr_stats) and per-link for
 * each direction (vls_rx, vls_tx).
 */
typedef struct viona_transfer_stats {
	uint64_t vts_packets;	/* packets transferred successfully */
	uint64_t vts_bytes;	/* bytes transferred successfully */

	/* Transfers which hit an error, other than lack of space in ring */
	uint64_t vts_errors;

	/*
	 * Packets dropped, either because the ring lacked space or because
	 * the associated hook ordered it.
	 */
	uint64_t vts_drops;
} viona_transfer_stats_t;
86 
/* States which a ring moves through. */
enum viona_ring_state {
	VRS_RESET = 0,	/* freshly allocated, or has been reset */
	VRS_SETUP = 1,	/* addrs are set up; worker thread is being started */
	VRS_INIT = 2,	/* worker thread has started and waits to run */
	VRS_RUN = 3,	/* work routine is running */
	VRS_STOP = 4,	/* worker is exiting */
};
/* Request/status flags for a ring; each occupies a distinct bit. */
enum viona_ring_state_flags {
	VRSF_REQ_START = (1 << 0),	/* from INIT state, start running */
	VRSF_REQ_STOP = (1 << 1),	/* stop, clean up, go to RESET state */
	VRSF_REQ_PAUSE = (1 << 2),	/* stop, go to INIT state */
	VRSF_RENEW = (1 << 3),		/* ring is renewing its lease */
};
100 
101 typedef struct viona_vring_tx {
102 	/*
103 	 * Temporary store of kernel-virtual addresses of guest buffers in a
104 	 * descriptor chain undergoing transmission.
105 	 *
106 	 * Length stored in vrt_iov_cnt.
107 	 */
108 	struct iovec	*vrt_iov;
109 	/*
110 	 * When device is configured to "loan" guest memory for transmitted
111 	 * packets, rather than allocating and copying them in their entirety,
112 	 * this holds a ring-sized array of viona_desb_t entries.
113 	 *
114 	 * In addition to the desballoc() accounting, those descriptors also
115 	 * hold a pre-allocated buffer sized to receive the packet headers
116 	 * (which must be copied despite for TOCTOU reasons).
117 	 */
118 	viona_desb_t	*vrt_desb;
119 	/*
120 	 * Length of vrt_iov
121 	 */
122 	uint_t		vrt_iov_cnt;
123 	/*
124 	 * Length in bytes to leave "empty" in front of the headers for each
125 	 * transmitted packet.  This allows subsequent encapsulation (such as
126 	 * vlans, VXLAN, etc) to use the space without requiring an additional
127 	 * allocation and header copy.
128 	 */
129 	uint_t		vrt_header_pad;
130 } viona_vring_tx_t;
131 
132 typedef struct viona_vring {
133 	viona_link_t	*vr_link;
134 
135 	kmutex_t	vr_lock;
136 	kcondvar_t	vr_cv;
137 	uint16_t	vr_state;
138 	uint16_t	vr_state_flags;
139 	uint_t		vr_xfer_outstanding;
140 	kthread_t	*vr_worker_thread;
141 	vmm_lease_t	*vr_lease;
142 
143 	/* Resources required for transmission on TX ring(s) */
144 	struct viona_vring_tx vr_tx;
145 
146 	uint_t		vr_intr_enabled;
147 	uint64_t	vr_msi_addr;
148 	uint64_t	vr_msi_msg;
149 
150 	/* Internal ring-related state */
151 	kmutex_t	vr_a_mutex;	/* sync consumers of 'avail' */
152 	kmutex_t	vr_u_mutex;	/* sync consumers of 'used' */
153 	uint64_t	vr_pa;
154 	uint16_t	vr_size;
155 	uint16_t	vr_mask;	/* cached from vr_size */
156 	uint16_t	vr_cur_aidx;	/* trails behind 'avail_idx' */
157 	uint16_t	vr_cur_uidx;	/* drives 'used_idx' */
158 
159 	/* Reference to guest pages holding virtqueue */
160 	void		**vr_map_pages;
161 	vmm_page_t	*vr_map_hold;
162 
163 	/* Per-ring general statistics */
164 	struct viona_transfer_stats vr_stats;
165 
166 	/* Per-ring error condition statistics */
167 	struct viona_ring_err_stats {
168 		uint64_t	rs_ndesc_too_high;
169 		uint64_t	rs_bad_idx;
170 		uint64_t	rs_indir_bad_len;
171 		uint64_t	rs_indir_bad_nest;
172 		uint64_t	rs_indir_bad_next;
173 		uint64_t	rs_no_space;
174 		uint64_t	rs_too_many_desc;
175 		uint64_t	rs_desc_bad_len;
176 		uint64_t	rs_len_overflow;
177 
178 		uint64_t	rs_bad_ring_addr;
179 
180 		uint64_t	rs_fail_hcksum;
181 		uint64_t	rs_fail_hcksum6;
182 		uint64_t	rs_fail_hcksum_proto;
183 
184 		uint64_t	rs_bad_rx_frame;
185 		uint64_t	rs_rx_merge_overrun;
186 		uint64_t	rs_rx_merge_underrun;
187 		uint64_t	rs_rx_pad_short;
188 		uint64_t	rs_rx_mcast_check;
189 		uint64_t	rs_rx_drop_over_mtu;
190 		uint64_t	rs_rx_gro_fallback;
191 		uint64_t	rs_rx_gro_fallback_fail;
192 		uint64_t	rs_too_short;
193 		uint64_t	rs_tx_absent;
194 		uint64_t	rs_tx_gso_fail;
195 
196 		uint64_t	rs_rx_hookdrop;
197 		uint64_t	rs_tx_hookdrop;
198 	} vr_err_stats;
199 } viona_vring_t;
200 
201 typedef struct viona_link_params {
202 	/* Amount of free space to prepend to TX header mblk */
203 	uint16_t	vlp_tx_header_pad;
204 	/* Force copying of TX data, rather than "loaning" guest memory */
205 	boolean_t	vlp_tx_copy_data;
206 } viona_link_params_t;
207 
208 struct viona_link {
209 	vmm_hold_t		*l_vm_hold;
210 	boolean_t		l_destroyed;
211 
212 	viona_vring_t		l_vrings[VIONA_VQ_MAX];
213 
214 	uint32_t		l_features;
215 	uint32_t		l_features_hw;
216 	uint32_t		l_cap_csum;
217 	viona_link_params_t	l_params;
218 	uint16_t		l_mtu;
219 
220 	uint16_t		l_notify_ioport;
221 	void			*l_notify_cookie;
222 
223 	datalink_id_t		l_linkid;
224 	mac_handle_t		l_mh;
225 	mac_client_handle_t	l_mch;
226 	mac_promisc_handle_t	l_mph;
227 	mac_unicast_handle_t	l_muh;
228 	viona_promisc_t		l_promisc;
229 
230 	pollhead_t		l_pollhead;
231 
232 	viona_neti_t		*l_neti;
233 
234 	kmutex_t		l_stats_lock;
235 	struct viona_link_stats {
236 		struct viona_transfer_stats vls_rx;
237 		struct viona_transfer_stats vls_tx;
238 	} l_stats;
239 };
240 
241 typedef struct viona_nethook {
242 	net_handle_t		vnh_neti;
243 	hook_family_t		vnh_family;
244 	hook_event_t		vnh_event_in;
245 	hook_event_t		vnh_event_out;
246 	hook_event_token_t	vnh_token_in;
247 	hook_event_token_t	vnh_token_out;
248 	boolean_t		vnh_hooked;
249 } viona_nethook_t;
250 
251 struct viona_neti {
252 	list_node_t		vni_node;
253 
254 	netid_t			vni_netid;
255 	zoneid_t		vni_zid;
256 
257 	viona_nethook_t		vni_nethook;
258 
259 	kmutex_t		vni_lock;	/* Protects remaining members */
260 	kcondvar_t		vni_ref_change; /* Protected by vni_lock */
261 	uint_t			vni_ref;	/* Protected by vni_lock */
262 	list_t			vni_dev_list;	/* Protected by vni_lock */
263 };
264 
265 typedef struct viona_kstats {
266 	kstat_named_t	vk_rx_packets;
267 	kstat_named_t	vk_rx_bytes;
268 	kstat_named_t	vk_rx_errors;
269 	kstat_named_t	vk_rx_drops;
270 	kstat_named_t	vk_tx_packets;
271 	kstat_named_t	vk_tx_bytes;
272 	kstat_named_t	vk_tx_errors;
273 	kstat_named_t	vk_tx_drops;
274 } viona_kstats_t;
275 
/* An id/len pair; vq_pushchain_many() accepts a pointer to these. */
typedef struct used_elem {
	uint16_t	id;
	uint32_t	len;
} used_elem_t;
280 
/*
 * Cursor used when copying out of an array of iovec entries.
 */
typedef struct iov_bunch {
	/*
	 * First entry of the iovec array.  The iov_len values of the entries
	 * sum to cover ib_remain bytes.
	 */
	struct iovec	*ib_iov;

	/* Offset, in bytes, into the current ib_iov entry */
	uint32_t	ib_offset;

	/*
	 * Bytes left in the entries covered by ib_iov, not counting the
	 * offset given by ib_offset.
	 */
	uint32_t	ib_remain;
} iov_bunch_t;
298 
299 typedef struct viona_soft_state {
300 	kmutex_t		ss_lock;
301 	viona_link_t		*ss_link;
302 	list_node_t		ss_node;
303 	kstat_t			*ss_kstat;
304 	minor_t			ss_minor;
305 } viona_soft_state_t;
306 
/*
 * The structures in this region are packed to 1-byte alignment, so that the
 * compiler inserts no padding between (or after) their members.
 */
#pragma pack(1)

/* Descriptor: address, length, flags (VRING_DESC_F_*), and next index. */
struct virtio_desc {
	uint64_t	vd_addr;
	uint32_t	vd_len;
	uint16_t	vd_flags;
	uint16_t	vd_next;
};

/* Used element: index and total length. */
struct virtio_used {
	uint32_t	vu_idx;
	uint32_t	vu_tlen;
};

/*
 * Network header which includes a trailing buffer count (vrh_bufs).  The
 * leading members are identical to those of struct virtio_net_hdr.
 */
struct virtio_net_mrgrxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
};

/* Network header, without the buffer count. */
struct virtio_net_hdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
};

#pragma pack()
339 
/*
 * Yield the he_interested member of the inbound/outbound hook event of a
 * viona_neti_t.  'neti' is a pointer; the expansions are fully parenthesized
 * so that they behave as a single operand wherever they are used.
 */
#define	VNETHOOK_INTERESTED_IN(neti) \
	((neti)->vni_nethook.vnh_event_in.he_interested)
#define	VNETHOOK_INTERESTED_OUT(neti) \
	((neti)->vni_nethook.vnh_event_out.he_interested)
344 
345 
/*
 * Wrappers for DTRACE_PROBE*() which prefix the probe name with "viona__".
 * After the name, arguments are passed as (type, value) pairs: VIONA_PROBEn
 * takes n such pairs.
 */
#define	VIONA_PROBE(name)	DTRACE_PROBE(viona__##name)
#define	VIONA_PROBE1(name, t1, v1)	\
	DTRACE_PROBE1(viona__##name, t1, v1)
#define	VIONA_PROBE2(name, t1, v1, t2, v2)	\
	DTRACE_PROBE2(viona__##name, t1, v1, t2, v2)
#define	VIONA_PROBE3(name, t1, v1, t2, v2, t3, v3)	\
	DTRACE_PROBE3(viona__##name, t1, v1, t2, v2, t3, v3)
#define	VIONA_PROBE4(name, t1, v1, t2, v2, t3, v3, t4, v4) \
	DTRACE_PROBE4(viona__##name, t1, v1, t2, v2, t3, v3, t4, v4)
#define	VIONA_PROBE5(name, t1, v1, t2, v2, t3, v3, t4, v4, t5, v5) \
	DTRACE_PROBE5(viona__##name, t1, v1, t2, v2, t3, v3, t4, v4, t5, v5)

/* Fire the bad_ring_addr probe with ring 'r' and address 'a' */
#define	VIONA_PROBE_BAD_RING_ADDR(r, a)		\
	VIONA_PROBE2(bad_ring_addr, viona_vring_t *, r, void *, (void *)(a))
362 
/*
 * Post-increment the error stat rs_<stat> within the vr_err_stats of the
 * ring pointed to by 'ring'.
 */
#define	VIONA_RING_STAT_INCR(ring, stat)	\
	(((ring)->vr_err_stats.rs_ ## stat)++)
366 
/*
 * Combined size of a VLAN-tagged ethernet header plus the maximum IP and TCP
 * header lengths.
 */
#define	VIONA_MAX_HDRS_LEN	\
	(sizeof (struct ether_vlan_header) + \
	IP_MAX_HDR_LENGTH + TCP_MAX_HDR_LENGTH)
369 
/* Flags for the avail and used rings */
#define	VRING_AVAIL_F_NO_INTERRUPT	1
#define	VRING_USED_F_NO_NOTIFY		1

/* Descriptor flags (see vd_flags in struct virtio_desc) */
#define	VRING_DESC_F_NEXT	(1 << 0)
#define	VRING_DESC_F_WRITE	(1 << 1)
#define	VRING_DESC_F_INDIRECT	(1 << 2)

/* Network header flags (vrh_flags) */
#define	VIRTIO_NET_HDR_F_NEEDS_CSUM	(1 << 0)
#define	VIRTIO_NET_HDR_F_DATA_VALID	(1 << 1)

/* Network header GSO types (vrh_gso_type) */
#define	VIRTIO_NET_HDR_GSO_NONE		0
#define	VIRTIO_NET_HDR_GSO_TCPV4	1

/* Feature bits */
#define	VIRTIO_NET_F_CSUM		(1 << 0)
#define	VIRTIO_NET_F_GUEST_CSUM		(1 << 1)
#define	VIRTIO_NET_F_MAC		(1 << 5) /* MAC supplied by host */
#define	VIRTIO_NET_F_GUEST_TSO4		(1 << 7) /* TSO accepted by guest */
#define	VIRTIO_NET_F_HOST_TSO4		(1 << 11) /* TSO accepted by host */
#define	VIRTIO_NET_F_MRG_RXBUF		(1 << 15) /* RX buf merge by host */
#define	VIRTIO_NET_F_STATUS		(1 << 16) /* cfg has status field */
#define	VIRTIO_F_RING_NOTIFY_ON_EMPTY	(1 << 24)
#define	VIRTIO_F_RING_INDIRECT_DESC	(1 << 28)
#define	VIRTIO_F_RING_EVENT_IDX		(1 << 29)
393 
/*
 * Upper bound on the size of packets which viona will handle.  This matters
 * most for TX, where the sizing of buffer allocations is directed by guest
 * behavior.
 * For RX, the upper bounds come from the MTU (below) and from a GRO-specific
 * limit provided by the specification.
 */
#define	VIONA_MAX_PACKET_SIZE		UINT16_MAX
#define	VIONA_GRO_MAX_PACKET_SIZE	65550

/*
 * From virtio v1.1 onward, the host may communicate its underlying MTU to the
 * guest, and the guest uses it to determine the largest packet it is able to
 * receive.  Devices/drivers (legacy and otherwise) which do not negotiate
 * this behave as though the MTU is 1500.
 * The max MTU value is part of the virtio spec (v1.0--1.3).  It may need to
 * change in future to account for IPv6 jumbograms (64 KiB..4GiB).
 */
#define	VIONA_MIN_MTU			68
#define	VIONA_MAX_MTU			UINT16_MAX
#define	VIONA_DEFAULT_MTU		1500
415 
/*
 * Ring parameters, as exchanged with viona_ring_get_state(),
 * viona_ring_set_state(), and viona_ring_init().
 */
struct viona_ring_params {
	uint64_t	vrp_pa;
	uint16_t	vrp_size;
	uint16_t	vrp_avail_idx;
	uint16_t	vrp_used_idx;
};
422 
423 void viona_ring_alloc(viona_link_t *, viona_vring_t *);
424 void viona_ring_free(viona_vring_t *);
425 int viona_ring_get_state(viona_link_t *, uint16_t, struct viona_ring_params *);
426 int viona_ring_set_state(viona_link_t *, uint16_t,
427     const struct viona_ring_params *);
428 int viona_ring_reset(viona_vring_t *, boolean_t);
429 int viona_ring_init(viona_link_t *, uint16_t, const struct viona_ring_params *);
430 boolean_t viona_ring_lease_renew(viona_vring_t *);
431 bool vring_need_bail(const viona_vring_t *);
432 int viona_ring_pause(viona_vring_t *);
433 
434 int vq_popchain(viona_vring_t *, struct iovec *, uint_t, uint16_t *,
435     vmm_page_t **, uint32_t *);
436 void vq_pushchain(viona_vring_t *, uint32_t, uint16_t);
437 void vq_pushchain_many(viona_vring_t *, uint_t, used_elem_t *);
438 
439 void viona_intr_ring(viona_vring_t *ring, boolean_t);
440 void viona_ring_set_no_notify(viona_vring_t *, boolean_t);
441 void viona_ring_disable_notify(viona_vring_t *);
442 void viona_ring_enable_notify(viona_vring_t *);
443 uint16_t viona_ring_num_avail(viona_vring_t *);
444 
445 void viona_ring_stat_accept(viona_vring_t *, size_t, size_t);
446 void viona_ring_stat_drop(viona_vring_t *, size_t);
447 void viona_ring_stat_error(viona_vring_t *);
448 
449 bool iov_bunch_copy(iov_bunch_t *, void *, uint32_t);
450 bool iov_bunch_next_chunk(iov_bunch_t *, caddr_t *, uint32_t *);
451 
452 void viona_rx_init(void);
453 void viona_rx_fini(void);
454 int viona_rx_set(viona_link_t *, viona_promisc_t);
455 void viona_rx_clear(viona_link_t *);
456 void viona_worker_rx(viona_vring_t *, viona_link_t *);
457 
458 extern kmutex_t viona_force_copy_lock;
459 extern uint_t viona_max_header_pad;
460 boolean_t viona_tx_copy_needed(void);
461 void viona_worker_tx(viona_vring_t *, viona_link_t *);
462 void viona_tx_ring_alloc(viona_vring_t *, const uint16_t);
463 void viona_tx_ring_free(viona_vring_t *, const uint16_t);
464 
465 void viona_neti_attach(void);
466 void viona_neti_detach(void);
467 viona_neti_t *viona_neti_lookup_by_zid(zoneid_t);
468 void viona_neti_rele(viona_neti_t *);
469 int viona_hook(viona_link_t *, viona_vring_t *, mblk_t **, boolean_t);
470 
471 #endif	/* _VIONA_IMPL_H */
472