xref: /illumos-gate/usr/src/uts/sun4v/sys/vsw.h (revision 5bc0fc0e85213e08d6b0388ae0690b7377d409a2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * This header file contains the basic data structures which the
29  * virtual switch (vsw) uses to communicate with its clients and
30  * the outside world.
31  *
32  * The virtual switch reads the machine description (MD) to
33  * determine how many port_t structures to create (each port_t
34  * can support communications to a single network device). The
35  * port_t's are maintained in a linked list.
36  *
37  * Each port in turn contains a number of logical domain channels
38  * (ldc's) which are inter domain communications channels which
39  * are used for passing small messages between the domains. There
40  * may be an unlimited number of channels associated with each port,
41  * though most devices only use a single channel.
42  *
43  * The ldc is a bi-directional channel, which is divided up into
44  * two directional 'lanes', one outbound from the switch to the
45  * virtual network device, the other inbound to the switch.
46  * Depending on the type of device each lane may have separate
47  * communication parameters (such as mtu etc).
48  *
49  * For those network clients which use descriptor rings the
50  * rings are associated with the appropriate lane. I.e. rings
51  * which the switch exports are associated with the outbound lanes
52  * while those which the network clients are exporting to the switch
53  * are associated with the inbound lane.
54  *
55  * In diagram form the data structures look as follows:
56  *
57  * vsw instance
58  *     |
59  *     +----->port_t----->port_t----->port_t----->
60  *		|
61  *		+--->ldc_t--->ldc_t--->ldc_t--->
62  *		       |
63  *		       +--->lane_t (inbound)
64  *		       |       |
65  *		       |       +--->dring--->dring--->
66  *		       |
67  *		       +--->lane_t (outbound)
68  *			       |
69  *			       +--->dring--->dring--->
70  *
71  */
72 
73 #ifndef	_VSW_H
74 #define	_VSW_H
75 
76 #pragma ident	"%Z%%M%	%I%	%E% SMI"
77 
78 #ifdef	__cplusplus
79 extern "C" {
80 #endif
81 
82 #include <sys/vio_mailbox.h>
83 #include <sys/vnet_common.h>
84 #include <sys/ethernet.h>
85 #include <sys/vio_util.h>
86 
/*
 * Default message type: a fixed-size container of eight 64-bit
 * words (64 bytes in total).
 */
typedef struct def_msg def_msg_t;

struct def_msg {
	uint64_t	data[8];
};
93 
/*
 * Only a single major/minor version pair is supported at present.
 */
#define	VSW_NUM_VER	1

/*
 * A version pair, held as two 16-bit bit-fields of a 32-bit word.
 */
typedef struct ver_sup {
	uint32_t	ver_major:16;
	uint32_t	ver_minor:16;
} ver_sup_t;
103 
/*
 * The ethernet MTU is the only one supported at the moment.
 */
#define	VSW_MTU		ETHERMAX
108 
/*
 * Lane states.
 *
 * VSW_LANE_INACTIV is zero; every other state below is a distinct
 * single-bit value.
 */
#define	VSW_LANE_INACTIV	0x00000000	/* No params set for lane */

/* Version exchange */
#define	VSW_VER_INFO_SENT	0x00000001	/* Version # sent to peer */
#define	VSW_VER_INFO_RECV	0x00000002	/* Version # recv from peer */
#define	VSW_VER_ACK_RECV	0x00000004	/* Version ACK received */
#define	VSW_VER_ACK_SENT	0x00000008	/* Version ACK sent */
#define	VSW_VER_NACK_RECV	0x00000010	/* Version NACK received */
#define	VSW_VER_NACK_SENT	0x00000020	/* Version NACK sent */

/* Attribute exchange */
#define	VSW_ATTR_INFO_SENT	0x00000040	/* Attributes sent to peer */
#define	VSW_ATTR_INFO_RECV	0x00000080	/* Peer attributes received */
#define	VSW_ATTR_ACK_SENT	0x00000100	/* Attribute ACK sent */
#define	VSW_ATTR_ACK_RECV	0x00000200	/* Attribute ACK received */
#define	VSW_ATTR_NACK_SENT	0x00000400	/* Attribute NACK sent */
#define	VSW_ATTR_NACK_RECV	0x00000800	/* Attribute NACK received */

/* Descriptor ring info exchange */
#define	VSW_DRING_INFO_SENT	0x00001000	/* Dring info sent to peer */
#define	VSW_DRING_INFO_RECV	0x00002000	/* Dring info received */
#define	VSW_DRING_ACK_SENT	0x00004000	/* Dring ACK sent */
#define	VSW_DRING_ACK_RECV	0x00008000	/* Dring ACK received */
#define	VSW_DRING_NACK_SENT	0x00010000	/* Dring NACK sent */
#define	VSW_DRING_NACK_RECV	0x00020000	/* Dring NACK received */

/* RDX exchange */
#define	VSW_RDX_INFO_SENT	0x00040000	/* RDX sent to peer */
#define	VSW_RDX_INFO_RECV	0x00080000	/* RDX received from peer */
#define	VSW_RDX_ACK_SENT	0x00100000	/* RDX ACK sent */
#define	VSW_RDX_ACK_RECV	0x00200000	/* RDX ACK received */
#define	VSW_RDX_NACK_SENT	0x00400000	/* RDX NACK sent */
#define	VSW_RDX_NACK_RECV	0x00800000	/* RDX NACK received */

/* Multicast (MCST) exchange */
#define	VSW_MCST_INFO_SENT	0x01000000	/* MCST info sent */
#define	VSW_MCST_INFO_RECV	0x02000000	/* MCST info received */
#define	VSW_MCST_ACK_SENT	0x04000000	/* MCST ACK sent */
#define	VSW_MCST_ACK_RECV	0x08000000	/* MCST ACK received */
#define	VSW_MCST_NACK_SENT	0x10000000	/* MCST NACK sent */
#define	VSW_MCST_NACK_RECV	0x20000000	/* MCST NACK received */

#define	VSW_LANE_ACTIVE		0x40000000	/* Lane open to xmit data */
150 
/*
 * Handshake milestones, one bit per milestone.
 */
#define	VSW_MILESTONE0		0x01	/* ver info exchanged */
#define	VSW_MILESTONE1		0x02	/* attribute exchanged */
#define	VSW_MILESTONE2		0x04	/* dring info exchanged */
#define	VSW_MILESTONE3		0x08	/* rdx exchanged */
#define	VSW_MILESTONE4		0x10	/* handshake complete */
157 
/*
 * Direction of a lane, as seen from the switch itself.
 */
#define	INBOUND			0x1
#define	OUTBOUND		0x2

/*
 * Flag: the peer's session id has been received.
 */
#define	VSW_PEER_SESSION	0x1
166 
/*
 * Upper bound on the number of back-to-back reads of data from a
 * channel.
 */
#define	VSW_MAX_CHAN_READ	50

/*
 * Maximum number of ldcs per port; at present only a single ldc per
 * port is supported.
 */
#define	VSW_PORT_MAX_LDCS	1

/*
 * Flag used when ports are added or deleted.
 */
#define	VSW_PORT_UPDATED	0x1
181 
/*
 * Result codes for a transmit over an ldc.
 */
#define	LDC_TX_SUCCESS		0	/* transmit succeeded */
#define	LDC_TX_FAILURE		1	/* transmit failed */
#define	LDC_TX_NORESOURCES	2	/* ran out of descriptors */

/*
 * Identifies where a frame being switched came from.
 */
#define	VSW_PHYSDEV		1	/* physical device associated */
#define	VSW_VNETPORT		2	/* port connected to vnet (over ldc) */
#define	VSW_LOCALDEV		4	/* vsw configured as an eth interface */
190 
/*
 * Descriptor ring parameters.
 *
 * A data buffer is pre-allocated for every descriptor element and
 * the data being transmitted is copied into it; pre-allocating the
 * buffers speeds up the copy. A buffer is re-used once the peer has
 * indicated that it is finished with the descriptor.
 */
#define	VSW_RING_NUM_EL		512	/* entries per ring */
#define	VSW_RING_EL_DATA_SZ	2048	/* bytes in each data section */

/*
 * Sizes of the private and public descriptor types.
 *
 * NOTE(review): VSW_PRIV_SIZE names vnet_private_desc_t, whereas the
 * private descriptor declared in this header is vsw_private_desc_t;
 * confirm which type is intended.
 */
#define	VSW_PRIV_SIZE		(sizeof (vnet_private_desc_t))
#define	VSW_PUB_SIZE		(sizeof (vnet_public_desc_t))

/* Size of the memcookie[] array in vsw_private_desc_t. */
#define	VSW_MAX_COOKIES		((ETHERMTU >> MMU_PAGESHIFT) + 2)
206 
/*
 * MTU for LDC packet transfers: the size of one default message.
 */
#define	VSW_LDC_MTU	(sizeof (def_msg_t))

/*
 * Free pool geometry: size of each mblk and how many mblks to create.
 */
#define	VSW_MBLK_SIZE	2048
#define	VSW_NUM_MBLKS	1024
217 
218 /*
219  * Private descriptor
220  */
221 typedef struct vsw_private_desc {
222 	/*
223 	 * Below lock must be held when accessing the state of
224 	 * a descriptor on either the private or public sections
225 	 * of the ring.
226 	 */
227 	kmutex_t		dstate_lock;
228 	uint64_t		dstate;
229 	vnet_public_desc_t	*descp;
230 	ldc_mem_handle_t	memhandle;
231 	void			*datap;
232 	uint64_t		datalen;
233 	uint64_t		ncookies;
234 	ldc_mem_cookie_t	memcookie[VSW_MAX_COOKIES];
235 	int			bound;
236 } vsw_private_desc_t;
237 
238 /*
239  * Descriptor ring structure
240  */
241 typedef struct dring_info {
242 	struct	dring_info	*next;	/* next ring in chain */
243 	kmutex_t		dlock;
244 	uint32_t		num_descriptors;
245 	uint32_t		descriptor_size;
246 	uint32_t		options;
247 	uint32_t		ncookies;
248 	ldc_mem_cookie_t	cookie[1];
249 
250 	ldc_dring_handle_t	handle;
251 	uint64_t		ident;	/* identifier sent to peer */
252 	uint64_t		end_idx;	/* last idx processed */
253 	int64_t			last_ack_recv;
254 
255 	kmutex_t		restart_lock;
256 	boolean_t		restart_reqd;	/* send restart msg */
257 
258 	/*
259 	 * base address of private and public portions of the
260 	 * ring (where appropriate), and data block.
261 	 */
262 	void			*pub_addr;	/* base of public section */
263 	void			*priv_addr;	/* base of private section */
264 	void			*data_addr;	/* base of data section */
265 	size_t			data_sz;	/* size of data section */
266 } dring_info_t;
267 
268 /*
269  * Each ldc connection is comprised of two lanes, incoming
270  * from a peer, and outgoing to that peer. Each lane shares
271  * common ldc parameters and also has private lane-specific
272  * parameters.
273  */
274 typedef struct lane {
275 	uint64_t	lstate;		/* Lane state */
276 	uint32_t	ver_major:16,	/* Version major number */
277 			ver_minor:16;	/* Version minor number */
278 	kmutex_t	seq_lock;
279 	uint64_t	seq_num;	/* Sequence number */
280 	uint64_t	mtu;		/* ETHERMTU */
281 	uint64_t	addr;		/* Unique physical address */
282 	uint8_t		addr_type;	/* Only MAC address at moment */
283 	uint8_t		xfer_mode;	/* Dring or Pkt based */
284 	uint8_t		ack_freq;	/* Only non zero for Pkt based xfer */
285 	krwlock_t	dlistrw;	/* Lock for dring list */
286 	dring_info_t	*dringp;	/* List of drings for this lane */
287 } lane_t;
288 
/*
 * Channel drain states.
 */
#define	VSW_LDC_INIT		0x1	/* Initial non-drain state */
#define	VSW_LDC_DRAINING	0x2	/* Channel draining */
292 
293 /* ldc information associated with a vsw-port */
294 typedef struct vsw_ldc {
295 	struct vsw_ldc		*ldc_next;	/* next ldc in the list */
296 	struct vsw_port		*ldc_port;	/* associated port */
297 	struct vsw		*ldc_vswp;	/* associated vsw */
298 	kmutex_t		ldc_cblock;	/* sync callback processing */
299 	kmutex_t		ldc_txlock;	/* sync transmits */
300 	uint64_t		ldc_id;		/* channel number */
301 	ldc_handle_t		ldc_handle;	/* channel handle */
302 	kmutex_t		drain_cv_lock;
303 	kcondvar_t		drain_cv;	/* channel draining */
304 	int			drain_state;
305 	uint32_t		hphase;		/* handshake phase */
306 	int			hcnt;		/* # handshake attempts */
307 	kmutex_t		status_lock;
308 	ldc_status_t		ldc_status;	/* channel status */
309 	uint8_t			reset_active;	/* reset flag */
310 	uint64_t		local_session;	/* Our session id */
311 	uint64_t		peer_session;	/* Our peers session id */
312 	uint8_t			session_status;	/* Session recv'd, sent */
313 	kmutex_t		hss_lock;
314 	uint32_t		hss_id;		/* Handshake session id */
315 	uint64_t		next_ident;	/* Next dring ident # to use */
316 	lane_t			lane_in;	/* Inbound lane */
317 	lane_t			lane_out;	/* Outbound lane */
318 	uint8_t			dev_class;	/* Peer device class */
319 	vio_mblk_pool_t		*rxh;		/* Receive pool handle */
320 } vsw_ldc_t;
321 
322 /* list of ldcs per port */
323 typedef struct vsw_ldc_list {
324 	vsw_ldc_t	*head;		/* head of the list */
325 	krwlock_t	lockrw;		/* sync access(rw) to the list */
326 	int		num_ldcs;	/* number of ldcs in the list */
327 } vsw_ldc_list_t;
328 
329 /* multicast addresses port is interested in */
330 typedef struct mcst_addr {
331 	struct mcst_addr	*nextp;
332 	struct ether_addr	mca;	/* multicast address */
333 	uint64_t		addr;	/* mcast addr converted to hash key */
334 	boolean_t		mac_added; /* added into physical device */
335 } mcst_addr_t;
336 
/*
 * Port detach states.
 */
#define	VSW_PORT_INIT		0x1	/* Initial non-detach state */
#define	VSW_PORT_DETACHING	0x2	/* In process of being detached */
#define	VSW_PORT_DETACHABLE	0x4	/* Safe to detach */

/*
 * Where (if anywhere) an address has been set.
 */
#define	VSW_ADDR_UNSET		0x0	/* Addr not set */
#define	VSW_ADDR_HW		0x1	/* Addr programmed in HW */
#define	VSW_ADDR_PROMISC	0x2	/* Card in promisc to see addr */
345 
346 /* port information associated with a vsw */
347 typedef struct vsw_port {
348 	int			p_instance;	/* port instance */
349 	struct vsw_port		*p_next;	/* next port in the list */
350 	struct vsw		*p_vswp;	/* associated vsw */
351 	vsw_ldc_list_t		p_ldclist;	/* list of ldcs for this port */
352 
353 	kmutex_t		tx_lock;	/* transmit lock */
354 	int			(*transmit)(vsw_ldc_t *, mblk_t *);
355 
356 	int			state;		/* port state */
357 	kmutex_t		state_lock;
358 	kcondvar_t		state_cv;
359 
360 	int			ref_cnt;	/* # of active references */
361 	kmutex_t		ref_lock;
362 	kcondvar_t		ref_cv;
363 
364 	kmutex_t		mca_lock;	/* multicast lock */
365 	mcst_addr_t		*mcap;		/* list of multicast addrs */
366 
367 	mac_addr_slot_t		addr_slot;	/* Unicast address slot */
368 	int			addr_set;	/* Addr set where */
369 
370 	/*
371 	 * mac address of the port & connected device
372 	 */
373 	struct ether_addr	p_macaddr;
374 } vsw_port_t;
375 
376 /* list of ports per vsw */
377 typedef struct vsw_port_list {
378 	vsw_port_t	*head;		/* head of the list */
379 	krwlock_t	lockrw;		/* sync access(rw) to the list */
380 	int		num_ports;	/* number of ports in the list */
381 } vsw_port_list_t;
382 
383 /*
384  * Taskq control message
385  */
386 typedef struct vsw_ctrl_task {
387 	vsw_ldc_t	*ldcp;
388 	def_msg_t	pktp;
389 	uint32_t	hss_id;
390 } vsw_ctrl_task_t;
391 
/*
 * Connection-to-peer states. Two of them correspond to LDC events:
 *
 *	VSW_CONN_RESET	<->	LDC_RESET_EVT
 *	VSW_CONN_UP	<->	LDC_UP_EVT
 */
#define	VSW_CONN_UP		0x1	/* Connection come up */
#define	VSW_CONN_RESET		0x2	/* Connection reset */
#define	VSW_CONN_RESTART	0x4	/* Restarting handshake on connection */
401 #define	VSW_CONN_RESTART	0x4	/* Restarting handshake on connection */
402 
403 typedef struct vsw_conn_evt {
404 	uint16_t	evt;		/* Connection event */
405 	vsw_ldc_t	*ldcp;
406 } vsw_conn_evt_t;
407 
/*
 * Vsw queue -- largely modeled after squeue.
 *
 * State flags:
 *	VSW_QUEUE_RUNNING	the queue's vqueue thread is running
 *	VSW_QUEUE_DRAINED	the vqueue thread has drained its current
 *				work and is exiting
 *	VSW_QUEUE_STOP		the vqueue thread has been asked to stop
 *	VSW_QUEUE_STOPPED	the vqueue thread is not running
 */
#define	VSW_QUEUE_RUNNING	0x1
#define	VSW_QUEUE_DRAINED	0x2
#define	VSW_QUEUE_STOP		0x4
#define	VSW_QUEUE_STOPPED	0x8
420 
421 typedef struct vsw_queue_s {
422 	kmutex_t	vq_lock;	/* Lock, before using any member. */
423 	kcondvar_t	vq_cv;		/* Async threads block on. */
424 	uint32_t	vq_state;	/* State flags. */
425 
426 	mblk_t		*vq_first;	/* First mblk chain or NULL. */
427 	mblk_t		*vq_last;	/* Last mblk chain. */
428 
429 	processorid_t	vq_bind;	/* Process to bind to */
430 	kthread_t	*vq_worker;	/* Queue's thread */
431 } vsw_queue_t;
432 
433 /*
434  * VSW MAC Ring Resources.
435  *	MAC Ring resource is composed of this state structure and
436  *	a kernel thread to perform the processing of the ring.
437  */
438 typedef struct vsw_mac_ring_s {
439 	uint32_t	ring_state;
440 
441 	mac_blank_t	ring_blank;
442 	void		*ring_arg;
443 
444 	vsw_queue_t	*ring_vqp;
445 	struct vsw	*ring_vswp;
446 } vsw_mac_ring_t;
447 
/*
 * Maximum number of ring resources.
 */
#define	VSW_MAC_RX_RINGS	0x40

/*
 * State of an entry in the ring table.
 */
#define	VSW_MAC_RING_FREE	1
#define	VSW_MAC_RING_INUSE	2

/*
 * How many hash chains the multicast forwarding database has.
 */
#define	VSW_NCHAINS		8
463 
/*
 * State of interface if switch plumbed as network device.
 */
#define	VSW_IF_REG	0x1	/* interface was registered */
#define	VSW_IF_UP	0x2	/* Interface UP */
#define	VSW_IF_PROMISC	0x4	/* Interface in promiscuous mode */

/*
 * True (1) when 'state' is exactly VSW_IF_UP | VSW_IF_PROMISC, and
 * false (0) otherwise.
 *
 * The argument is parenthesized so that an expression argument such
 * as VSW_U_P(a | b) is compared as a whole; '==' binds tighter than
 * '|' and '&', so without the parentheses only the last operand
 * would be compared.
 *
 * NOTE(review): this is an exact comparison, so any additional bit
 * in 'state' (VSW_IF_REG included) makes it false -- confirm that
 * this is what callers expect.
 */
#define	VSW_U_P(state)	\
	((state) == (VSW_IF_UP | VSW_IF_PROMISC))
473 
/*
 * Switching modes.
 */
#define	VSW_LAYER2		0x1	/* Layer 2 - MAC switching */
#define	VSW_LAYER2_PROMISC	0x2	/* Layer 2 + promisc mode */
#define	VSW_LAYER3		0x4	/* Layer 3 - IP switching */

/* Number of switching modes; sizes the smode[] array in vsw_t. */
#define	NUM_SMODES		3
482 
483 /*
484  * vsw instance state information.
485  */
486 typedef struct	vsw {
487 	int			instance;	/* instance # */
488 	dev_info_t		*dip;		/* associated dev_info */
489 	uint64_t		regprop;	/* "reg" property */
490 	struct vsw		*next;		/* next in list */
491 	char			physname[LIFNAMSIZ];	/* phys-dev */
492 	uint8_t			smode[NUM_SMODES];	/* switching mode */
493 	int			smode_idx;	/* curr pos in smode array */
494 	int			smode_num;	/* # of modes specified */
495 	kmutex_t		swtmout_lock;	/* setup switching tmout lock */
496 	boolean_t		swtmout_enabled; /* setup switching tmout on */
497 	timeout_id_t		swtmout_id;	/* setup switching tmout id */
498 	uint32_t		switching_setup_done; /* setup switching done */
499 	int			mac_open_retries; /* mac_open() retry count */
500 	vsw_port_list_t		plist;		/* associated ports */
501 	ddi_taskq_t		*taskq_p;	/* VIO ctrl msg taskq */
502 	mod_hash_t		*fdb;		/* forwarding database */
503 
504 	mod_hash_t		*mfdb;		/* multicast FDB */
505 	krwlock_t		mfdbrw;		/* rwlock for mFDB */
506 
507 	vio_mblk_pool_t		*rxh;		/* Receive pool handle */
508 	void			(*vsw_switch_frame)
509 					(struct vsw *, mblk_t *, int,
510 					vsw_port_t *, mac_resource_handle_t);
511 
512 	/* mac layer */
513 	kmutex_t		mac_lock;	/* protect fields below */
514 	mac_handle_t		mh;
515 	mac_rx_handle_t		mrh;
516 	multiaddress_capab_t	maddr;		/* Multiple uni addr capable */
517 	const mac_txinfo_t	*txinfo;	/* MAC tx routine */
518 	boolean_t		mstarted;	/* Mac Started? */
519 	boolean_t		mresources;	/* Mac Resources cb? */
520 
521 	/*
522 	 * MAC Ring Resources.
523 	 */
524 	kmutex_t		mac_ring_lock;	/* Lock for the table. */
525 	uint32_t		mac_ring_tbl_sz;
526 	vsw_mac_ring_t		*mac_ring_tbl;	/* Mac ring table. */
527 
528 	kmutex_t		hw_lock;	/* sync access to HW */
529 	boolean_t		recfg_reqd;	/* Reconfig of addrs needed */
530 	int			promisc_cnt;
531 
532 	/* Machine Description updates  */
533 	mdeg_node_spec_t	*inst_spec;
534 	mdeg_handle_t		mdeg_hdl;
535 	mdeg_handle_t		mdeg_port_hdl;
536 
537 	/* if configured as an ethernet interface */
538 	mac_handle_t		if_mh;		/* MAC handle */
539 	struct ether_addr	if_addr;	/* interface address */
540 	krwlock_t		if_lockrw;
541 	uint8_t			if_state;	/* interface state */
542 
543 	mac_addr_slot_t		addr_slot;	/* Unicast address slot */
544 	int			addr_set;	/* Addr set where */
545 
546 	/* multicast addresses when configured as eth interface */
547 	kmutex_t		mca_lock;	/* multicast lock */
548 	mcst_addr_t		*mcap;		/* list of multicast addrs */
549 } vsw_t;
550 
551 
552 /*
553  * Ethernet broadcast address definition.
554  */
555 static	struct	ether_addr	etherbroadcastaddr = {
556 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
557 };
558 
/*
 * Destination address classification for an ethernet header.
 *
 * 'ehp' is a pointer to a header with an 'ether_dhost' member. It is
 * parenthesized in the expansions so that any pointer expression
 * (e.g. '&hdr' or 'p + 1') can be passed, not just a plain
 * identifier.
 *
 * IS_BROADCAST(ehp): true when ether_cmp() reports the destination
 *	equal to etherbroadcastaddr.
 * IS_MULTICAST(ehp): true when the low-order bit of the first octet
 *	of the destination address is set.
 */
#define	IS_BROADCAST(ehp) \
	(ether_cmp(&(ehp)->ether_dhost, &etherbroadcastaddr) == 0)
#define	IS_MULTICAST(ehp) \
	(((ehp)->ether_dhost.ether_addr_octet[0] & 01) == 1)
563 
/*
 * Shorthand for the reader/writer lock calls: enter 'x' as a reader,
 * enter 'x' as a writer, and exit 'x'.
 */
#define	READ_ENTER(x)	rw_enter((x), RW_READER)
#define	WRITE_ENTER(x)	rw_enter((x), RW_WRITER)
#define	RW_EXIT(x)	rw_exit((x))
567 
568 #ifdef	__cplusplus
569 }
570 #endif
571 
572 #endif	/* _VSW_H */
573