xref: /illumos-gate/usr/src/uts/sun4v/sys/vsw_ldc.h (revision 5c25b6f10367f6cb7b3d8b8bc7e374712eb977c2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * This header file contains the basic data structures which the
28  * virtual switch (vsw) uses to communicate with vnet clients.
29  *
30  * The virtual switch reads the machine description (MD) to
31  * determine how many port_t structures to create (each port_t
32  * can support communications to a single network device). The
33  * port_t's are maintained in a linked list.
34  *
35  * Each port in turn contains a number of logical domain channels
36  * (ldc's) which are inter domain communications channels which
37  * are used for passing small messages between the domains. There
38  * may be any number of channels associated with each port, though
39  * currently most devices only have a single channel. The current
40  * implementation provides support for only one channel per port.
41  *
42  * The ldc is a bi-directional channel, which is divided up into
43  * two directional 'lanes', one outbound from the switch to the
44  * virtual network device, the other inbound to the switch.
45  * Depending on the type of device each lane may have separate
46  * communication parameters (such as mtu etc).
47  *
48  * For those network clients which use descriptor rings the
49  * rings are associated with the appropriate lane. I.e. rings
50  * which the switch exports are associated with the outbound lanes
51  * while those which the network clients are exporting to the switch
52  * are associated with the inbound lane.
53  *
54  * In diagram form the data structures look as follows:
55  *
56  * vsw instance
57  *     |
58  *     +----->port_t----->port_t----->port_t----->
59  *		|
60  *		+--->ldc_t
61  *		       |
62  *		       +--->lane_t (inbound)
63  *		       |       |
64  *		       |       +--->dring
65  *		       |
66  *		       +--->lane_t (outbound)
67  *			       |
68  *			       +--->dring
69  *
70  */
71 
72 #ifndef	_VSW_LDC_H
73 #define	_VSW_LDC_H
74 
75 #ifdef	__cplusplus
76 extern "C" {
77 #endif
78 
/*
 * LDC packet transfer MTU: the largest message size used.
 */
#define	VSW_LDC_MTU		64

/*
 * Number of 64-bit words in a default message, i.e.
 * VNET_DRING_REG_EXT_MSG_SIZE_MAX divided by sizeof (uint64_t).
 */
#define	VSW_DEF_MSG_WORDS	\
	(VNET_DRING_REG_EXT_MSG_SIZE_MAX / sizeof (uint64_t))
86 
87 /*
88  * Default message type.
89  */
90 typedef struct def_msg {
91 	uint64_t	data[VSW_DEF_MSG_WORDS];
92 } def_msg_t;
93 
/*
 * Number of supported versions. Only a single major/minor
 * pair is supported at present.
 */
#define	VSW_NUM_VER	1

/*
 * A supported version, expressed as a major/minor pair.
 */
typedef struct ver_sup {
	uint16_t	ver_major;	/* major part of the version */
	uint16_t	ver_minor;	/* minor part of the version */
} ver_sup_t;
103 
/*
 * Lane states.
 *
 * Every state other than VSW_LANE_INACTIV occupies its own bit,
 * so the values are written as shifts of 1.
 */
#define	VSW_LANE_INACTIV	0		/* No params set for lane */

/* Version (VER) flags: bits 0-5 */
#define	VSW_VER_INFO_SENT	(1 << 0)	/* Version # sent to peer */
#define	VSW_VER_INFO_RECV	(1 << 1)	/* Version # recv from peer */
#define	VSW_VER_ACK_RECV	(1 << 2)
#define	VSW_VER_ACK_SENT	(1 << 3)
#define	VSW_VER_NACK_RECV	(1 << 4)
#define	VSW_VER_NACK_SENT	(1 << 5)

/* Attribute (ATTR) flags: bits 6-11 */
#define	VSW_ATTR_INFO_SENT	(1 << 6)	/* Attributes sent to peer */
#define	VSW_ATTR_INFO_RECV	(1 << 7)	/* Peer attributes received */
#define	VSW_ATTR_ACK_SENT	(1 << 8)
#define	VSW_ATTR_ACK_RECV	(1 << 9)
#define	VSW_ATTR_NACK_SENT	(1 << 10)
#define	VSW_ATTR_NACK_RECV	(1 << 11)

/* Descriptor ring (DRING) flags: bits 12-17 */
#define	VSW_DRING_INFO_SENT	(1 << 12)	/* Dring info sent to peer */
#define	VSW_DRING_INFO_RECV	(1 << 13)	/* Dring info received */
#define	VSW_DRING_ACK_SENT	(1 << 14)
#define	VSW_DRING_ACK_RECV	(1 << 15)
#define	VSW_DRING_NACK_SENT	(1 << 16)
#define	VSW_DRING_NACK_RECV	(1 << 17)

/* RDX flags: bits 18-23 */
#define	VSW_RDX_INFO_SENT	(1 << 18)	/* RDX sent to peer */
#define	VSW_RDX_INFO_RECV	(1 << 19)	/* RDX received from peer */
#define	VSW_RDX_ACK_SENT	(1 << 20)
#define	VSW_RDX_ACK_RECV	(1 << 21)
#define	VSW_RDX_NACK_SENT	(1 << 22)
#define	VSW_RDX_NACK_RECV	(1 << 23)

/* MCST flags: bits 24-29 */
#define	VSW_MCST_INFO_SENT	(1 << 24)
#define	VSW_MCST_INFO_RECV	(1 << 25)
#define	VSW_MCST_ACK_SENT	(1 << 26)
#define	VSW_MCST_ACK_RECV	(1 << 27)
#define	VSW_MCST_NACK_SENT	(1 << 28)
#define	VSW_MCST_NACK_RECV	(1 << 29)

#define	VSW_LANE_ACTIVE		(1 << 30)	/* Lane open to xmit data */
145 
/*
 * Handshake milestones, one bit each.
 */
#define	VSW_MILESTONE0		(1 << 0)	/* ver info exchanged */
#define	VSW_MILESTONE1		(1 << 1)	/* attribute exchanged */
#define	VSW_MILESTONE2		(1 << 2)	/* dring info exchanged */
#define	VSW_MILESTONE3		(1 << 3)	/* rdx exchanged */
#define	VSW_MILESTONE4		(1 << 4)	/* handshake complete */
152 
/*
 * Direction of a lane, as seen from the switch itself.
 */
#define	INBOUND			0x1
#define	OUTBOUND		0x2

/*
 * Set once the peer's session id has been received.
 */
#define	VSW_PEER_SESSION	0x1

/*
 * Upper bound on the number of consecutive reads of data
 * from a channel.
 */
#define	VSW_MAX_CHAN_READ	50

/*
 * Maximum number of ldcs per port; only one ldc per port
 * is supported at present.
 */
#define	VSW_PORT_MAX_LDCS	1

/*
 * Flag used when ports are added or deleted.
 */
#define	VSW_PORT_UPDATED	0x1

/*
 * Result codes for an ldc transmit.
 */
#define	LDC_TX_SUCCESS		0	/* transmit succeeded */
#define	LDC_TX_FAILURE		1	/* transmit failed */
#define	LDC_TX_NORESOURCES	2	/* ran out of descriptors */
180 
/*
 * Descriptor ring info
 *
 * A data buffer is pre-allocated for every descriptor element and
 * data being transmitted is copied into it; pre-allocation speeds
 * up the copy. Once the peer indicates it has finished with the
 * descriptor the buffer is re-used.
 */
#define	VSW_RING_EL_DATA_SZ	2048	/* Size of data section (bytes) */

/*
 * NOTE(review): this sizes vnet_private_desc_t, not the
 * vsw_private_desc_t declared in this header - confirm that is intended.
 */
#define	VSW_PRIV_SIZE		(sizeof (vnet_private_desc_t))

/* Number of pages spanned by ETHERMTU (ETHERMTU >> MMU_PAGESHIFT) plus 2 */
#define	VSW_MAX_COOKIES		((ETHERMTU >> MMU_PAGESHIFT) + 2)

/*
 * mblk sizes, one per mblk pool.
 */
#define	VSW_MBLK_SZ_128		128
#define	VSW_MBLK_SZ_256		256
#define	VSW_MBLK_SZ_2048	2048

/*
 * How many mblks each mblk pool holds.
 */
#define	VSW_NUM_MBLKS		1024

/*
 * Number of rcv buffers in RxDringData mode: the product of the
 * vsw_num_descriptors and vsw_nrbufs_factor variables, both of
 * which must be in scope wherever this macro is used.
 */
#define	VSW_RXDRING_NRBUFS	(vsw_num_descriptors * vsw_nrbufs_factor)
211 
/*
 * Descriptor index helpers.
 *
 * Wrap-around is done by masking with (num_descriptors - 1), which
 * only yields a valid index when num_descriptors is a power of two.
 * Both 'dp' and 'i' may be evaluated more than once, so neither
 * argument should have side effects.
 */
#define	VSW_DESC_INDEX_MASK(dp)	((dp)->num_descriptors - 1)

/* advance index 'i' by one, wrapping at the end of the ring */
#define	INCR_DESC_INDEX(dp, i)	\
		((i) = (((i) + 1) & VSW_DESC_INDEX_MASK(dp)))

/* step index 'i' back by one, wrapping at the start of the ring */
#define	DECR_DESC_INDEX(dp, i)	\
		((i) = (((i) - 1) & VSW_DESC_INDEX_MASK(dp)))

/* the tx and rx indices share the same arithmetic */
#define	INCR_TXI	INCR_DESC_INDEX
#define	DECR_TXI	DECR_DESC_INDEX
#define	INCR_RXI	INCR_DESC_INDEX
#define	DECR_RXI	DECR_DESC_INDEX

/* non-zero when index 'i' lies within [0, num_descriptors) */
#define	CHECK_DESC_INDEX(dp, i)	\
		(!(((i) < 0) || ((i) >= (dp)->num_descriptors)))

#define	CHECK_RXI	CHECK_DESC_INDEX
#define	CHECK_TXI	CHECK_DESC_INDEX
231 
232 /*
233  * Private descriptor
234  */
235 typedef struct vsw_private_desc {
236 	/*
237 	 * Below lock must be held when accessing the state of
238 	 * a descriptor on either the private or public sections
239 	 * of the ring.
240 	 */
241 	kmutex_t		dstate_lock;
242 	uint64_t		dstate;
243 	vnet_public_desc_t	*descp;
244 	ldc_mem_handle_t	memhandle;
245 	void			*datap;
246 	uint64_t		datalen;
247 	uint64_t		ncookies;
248 	ldc_mem_cookie_t	memcookie[VSW_MAX_COOKIES];
249 	int			bound;
250 } vsw_private_desc_t;
251 
252 /*
253  * Descriptor ring structure
254  */
255 typedef struct dring_info {
256 	kmutex_t		dlock;		/* sync access */
257 	uint32_t		num_descriptors; /* # of descriptors */
258 	uint32_t		descriptor_size; /* size of descriptor */
259 	uint32_t		options;	/* dring options (mode) */
260 	ldc_dring_handle_t	dring_handle;	/* dring LDC handle */
261 	uint32_t		dring_ncookies;	/* # of dring cookies */
262 	ldc_mem_cookie_t	dring_cookie[1]; /* LDC cookie of dring */
263 	ldc_mem_handle_t	data_handle;	/* data area  LDC handle */
264 	uint32_t		data_ncookies;	/* # of data area cookies */
265 	ldc_mem_cookie_t	*data_cookie;	/* data area LDC cookies */
266 	uint64_t		ident;		/* identifier sent to peer */
267 	uint64_t		end_idx;	/* last idx processed */
268 	int64_t			last_ack_recv;	/* last ack received */
269 	kmutex_t		txlock;		/* protect tx desc alloc */
270 	uint32_t		next_txi;	/* next tx descriptor index */
271 	uint32_t		next_rxi;	/* next expected recv index */
272 	kmutex_t		restart_lock;	/* protect restart_reqd */
273 	boolean_t		restart_reqd;	/* send restart msg */
274 	uint32_t		restart_peer_txi; /* index to restart peer */
275 	void			*pub_addr;	/* base of public section */
276 	void			*priv_addr;	/* base of private section */
277 	void			*data_addr;	/* base of data section */
278 	size_t			data_sz;	/* size of data section */
279 	size_t			desc_data_sz;	/* size of descr data blk */
280 	uint8_t			dring_mtype;	/* dring mem map type */
281 	uint32_t		num_bufs;	/* # of buffers */
282 	vio_mblk_pool_t		*rx_vmp;	/* rx mblk pool */
283 	vio_mblk_t		**rxdp_to_vmp;	/* descr to buf map tbl */
284 } dring_info_t;
285 
286 /*
287  * Each ldc connection is comprised of two lanes, incoming
288  * from a peer, and outgoing to that peer. Each lane shares
289  * common ldc parameters and also has private lane-specific
290  * parameters.
291  */
292 typedef struct lane {
293 	uint64_t	lstate;		/* Lane state */
294 	uint16_t	ver_major;	/* Version major number */
295 	uint16_t	ver_minor;	/* Version minor number */
296 	uint64_t	seq_num;	/* Sequence number */
297 	uint64_t	mtu;		/* ETHERMTU */
298 	uint64_t	addr;		/* Unique physical address */
299 	uint8_t		addr_type;	/* Only MAC address at moment */
300 	uint8_t		xfer_mode;	/* Dring or Pkt based */
301 	uint8_t		ack_freq;	/* Only non zero for Pkt based xfer */
302 	uint32_t	physlink_update;	/* physlink updates */
303 	uint8_t		dring_mode;	/* Descriptor ring mode */
304 	dring_info_t	*dringp;	/* List of drings for this lane */
305 } lane_t;
306 
/*
 * Drain states of a channel.
 */
#define	VSW_LDC_INIT		0x1	/* initial state, not draining */
#define	VSW_LDC_DRAINING	0x2	/* channel is draining */
310 
311 /*
312  * vnet-protocol-version dependent function prototypes.
313  */
314 typedef int	(*vsw_ldctx_t) (void *, mblk_t *, mblk_t *, uint32_t);
315 typedef void	(*vsw_ldcrx_pktdata_t) (void *, void *, uint32_t);
316 typedef void	(*vsw_ldcrx_dringdata_t) (void *, void *);
317 
318 /* ldc information associated with a vsw-port */
319 typedef struct vsw_ldc {
320 	struct vsw_ldc		*ldc_next;	/* next ldc in the list */
321 	struct vsw_port		*ldc_port;	/* associated port */
322 	struct vsw		*ldc_vswp;	/* associated vsw */
323 	kmutex_t		ldc_cblock;	/* sync callback processing */
324 	kmutex_t		ldc_txlock;	/* sync transmits */
325 	kmutex_t		ldc_rxlock;	/* sync rx */
326 	uint64_t		ldc_id;		/* channel number */
327 	ldc_handle_t		ldc_handle;	/* channel handle */
328 	kmutex_t		drain_cv_lock;
329 	kcondvar_t		drain_cv;	/* channel draining */
330 	int			drain_state;
331 	uint32_t		hphase;		/* handshake phase */
332 	int			hcnt;		/* # handshake attempts */
333 	kmutex_t		status_lock;
334 	ldc_status_t		ldc_status;	/* channel status */
335 	uint8_t			reset_active;	/* reset flag */
336 	uint64_t		local_session;	/* Our session id */
337 	uint64_t		peer_session;	/* Our peers session id */
338 	uint8_t			session_status;	/* Session recv'd, sent */
339 	uint32_t		hss_id;		/* Handshake session id */
340 	uint64_t		next_ident;	/* Next dring ident # to use */
341 	lane_t			lane_in;	/* Inbound lane */
342 	lane_t			lane_out;	/* Outbound lane */
343 	uint8_t			dev_class;	/* Peer device class */
344 	boolean_t		pls_negotiated;	/* phys link state update ? */
345 	vio_multi_pool_t	vmp;		/* Receive mblk pools */
346 	uint32_t		max_rxpool_size; /* max size of rxpool in use */
347 	uint64_t		*ldcmsg;	/* msg buffer for ldc_read() */
348 	uint64_t		msglen;		/* size of ldcmsg */
349 	uint32_t		dringdata_msgid; /* msgid in RxDringData mode */
350 
351 	/* tx thread fields */
352 	kthread_t		*tx_thread;	/* tx thread */
353 	uint32_t		tx_thr_flags;	/* tx thread flags */
354 	kmutex_t		tx_thr_lock;	/* lock for tx thread */
355 	kcondvar_t		tx_thr_cv;	/* cond.var for tx thread */
356 	mblk_t			*tx_mhead;	/* tx mblks head */
357 	mblk_t			*tx_mtail;	/* tx mblks tail */
358 	uint32_t		tx_cnt;		/* # of pkts queued for tx */
359 
360 	/* message thread fields */
361 	kthread_t		*msg_thread;	/* message thread */
362 	uint32_t		msg_thr_flags;	/* message thread flags */
363 	kmutex_t		msg_thr_lock;	/* lock for message thread */
364 	kcondvar_t		msg_thr_cv;	/* cond.var for msg thread */
365 
366 	/* receive thread fields */
367 	kthread_t		*rcv_thread;	/* receive thread */
368 	uint32_t		rcv_thr_flags;	/* receive thread flags */
369 	kmutex_t		rcv_thr_lock;	/* lock for receive thread */
370 	kcondvar_t		rcv_thr_cv;	/* cond.var for recv thread */
371 
372 	vsw_ldctx_t		tx;		/* transmit function */
373 	vsw_ldcrx_pktdata_t	rx_pktdata;	/* process raw data msg */
374 	vsw_ldcrx_dringdata_t	rx_dringdata;	/* process dring data msg */
375 
376 	/* channel statistics */
377 	vgen_stats_t		ldc_stats;	/* channel statistics */
378 	kstat_t			*ksp;		/* channel kstats */
379 } vsw_ldc_t;
380 
/*
 * Flags for the worker threads.
 */
#define	VSW_WTHR_DATARCVD	0x01	/* data has been received */
#define	VSW_WTHR_STOP		0x02	/* worker thread asked to stop */
384 
385 /* multicast addresses port is interested in */
386 typedef struct mcst_addr {
387 	struct mcst_addr	*nextp;
388 	struct ether_addr	mca;	/* multicast address */
389 	uint64_t		addr;	/* mcast addr converted to hash key */
390 	boolean_t		mac_added; /* added into physical device */
391 } mcst_addr_t;
392 
/*
 * Detach states of a port, one bit each.
 */
#define	VSW_PORT_INIT		(1 << 0)	/* initial non-detach state */
#define	VSW_PORT_DETACHING	(1 << 1)	/* detach is in progress */
#define	VSW_PORT_DETACHABLE	(1 << 2)	/* safe to detach */
397 
398 /* port information associated with a vsw */
399 typedef struct vsw_port {
400 	int			p_instance;	/* port instance */
401 	struct vsw_port		*p_next;	/* next port in the list */
402 	struct vsw		*p_vswp;	/* associated vsw */
403 	int			num_ldcs;	/* # of ldcs in the port */
404 	uint64_t		*ldc_ids;	/* ldc ids */
405 	vsw_ldc_t		*ldcp;		/* ldc for this port */
406 
407 	kmutex_t		tx_lock;	/* transmit lock */
408 	int			(*transmit)(vsw_ldc_t *, mblk_t *);
409 
410 	int			state;		/* port state */
411 	kmutex_t		state_lock;
412 	kcondvar_t		state_cv;
413 
414 	krwlock_t		maccl_rwlock;	/* protect fields below */
415 	mac_client_handle_t	p_mch;		/* mac client handle */
416 	mac_unicast_handle_t	p_muh;		/* mac unicast handle */
417 
418 	kmutex_t		mca_lock;	/* multicast lock */
419 	mcst_addr_t		*mcap;		/* list of multicast addrs */
420 
421 	boolean_t		addr_set;	/* Addr set where */
422 
423 	/*
424 	 * mac address of the port & connected device
425 	 */
426 	struct ether_addr	p_macaddr;
427 	uint16_t		pvid;	/* port vlan id (untagged) */
428 	struct vsw_vlanid	*vids;	/* vlan ids (tagged) */
429 	uint16_t		nvids;	/* # of vids */
430 	mod_hash_t		*vlan_hashp;	/* vlan hash table */
431 	uint32_t		vlan_nchains;	/* # of vlan hash chains */
432 
433 	/* HybridIO related info */
434 	uint32_t		p_hio_enabled;	/* Hybrid mode enabled? */
435 	uint32_t		p_hio_capable;	/* Port capable of HIO */
436 
437 	/* bandwidth limit */
438 	uint64_t		p_bandwidth;	/* bandwidth limit */
439 } vsw_port_t;
440 
441 /* list of ports per vsw */
442 typedef struct vsw_port_list {
443 	vsw_port_t	*head;		/* head of the list */
444 	krwlock_t	lockrw;		/* sync access(rw) to the list */
445 	int		num_ports;	/* number of ports in the list */
446 } vsw_port_list_t;
447 
448 /*
449  * Taskq control message
450  */
451 typedef struct vsw_ctrl_task {
452 	vsw_ldc_t	*ldcp;
453 	def_msg_t	pktp;
454 	uint32_t	hss_id;
455 } vsw_ctrl_task_t;
456 
/*
 * Connection-to-peer states. Two of them correspond to LDC events:
 *
 * VSW_CONN_RESET -> LDC_RESET_EVT
 * VSW_CONN_UP    -> LDC_UP_EVT
 */
#define	VSW_CONN_UP		(1 << 0)	/* Connection has come up */
#define	VSW_CONN_RESET		(1 << 1)	/* Connection reset */
#define	VSW_CONN_RESTART	(1 << 2)	/* Handshake being restarted */
466 #define	VSW_CONN_RESTART	0x4	/* Restarting handshake on connection */
467 
468 typedef struct vsw_conn_evt {
469 	uint16_t	evt;		/* Connection event */
470 	vsw_ldc_t	*ldcp;
471 } vsw_conn_evt_t;
472 
/*
 * Ethernet broadcast address definition (every octet is 0xff).
 */
static	struct	ether_addr	etherbroadcastaddr = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/*
 * Destination address tests. 'ehp' is a pointer to a structure with an
 * 'ether_dhost' member of type struct ether_addr.
 *
 * IS_BROADCAST is true when the first ETHERADDRL bytes of ether_dhost
 * equal etherbroadcastaddr; IS_MULTICAST is true when the low-order
 * bit of the first octet of ether_dhost is set.
 *
 * The argument is parenthesized so that expressions such as 'ehp + 1'
 * expand correctly.
 */
#define	IS_BROADCAST(ehp) \
	(bcmp(&(ehp)->ether_dhost, &etherbroadcastaddr, ETHERADDRL) == 0)
#define	IS_MULTICAST(ehp) \
	(((ehp)->ether_dhost.ether_addr_octet[0] & 01) == 1)
484 
/*
 * Shorthand for entering a reader/writer lock as reader or as
 * writer, and for leaving it again.
 */
#define	READ_ENTER(x)	rw_enter((x), RW_READER)
#define	WRITE_ENTER(x)	rw_enter((x), RW_WRITER)
#define	RW_EXIT(x)	rw_exit((x))

/*
 * Atomically increment / decrement the 32-bit 'ref_cnt' member of a port.
 *
 * NOTE(review): the vsw_port_t declared in this header has no 'ref_cnt'
 * member, so these macros cannot compile against it as written -
 * confirm whether they are still used.
 */
#define	VSW_PORT_REFHOLD(portp)	atomic_inc_32(&((portp)->ref_cnt))
#define	VSW_PORT_REFRELE(portp)	atomic_dec_32(&((portp)->ref_cnt))
491 
492 #ifdef	__cplusplus
493 }
494 #endif
495 
496 #endif	/* _VSW_LDC_H */
497