xref: /titanic_51/usr/src/uts/sun4v/io/vsw.c (revision 8f514e743bde41fe7e0ca48510a6d4c40ca51c23)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/errno.h>
28 #include <sys/debug.h>
29 #include <sys/time.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/user.h>
33 #include <sys/stropts.h>
34 #include <sys/stream.h>
35 #include <sys/strlog.h>
36 #include <sys/strsubr.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cpu.h>
39 #include <sys/kmem.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/ksynch.h>
44 #include <sys/stat.h>
45 #include <sys/kstat.h>
46 #include <sys/vtrace.h>
47 #include <sys/strsun.h>
48 #include <sys/dlpi.h>
49 #include <sys/ethernet.h>
50 #include <net/if.h>
51 #include <sys/varargs.h>
52 #include <sys/machsystm.h>
53 #include <sys/modctl.h>
54 #include <sys/modhash.h>
55 #include <sys/mac_provider.h>
56 #include <sys/mac_ether.h>
57 #include <sys/taskq.h>
58 #include <sys/note.h>
59 #include <sys/mach_descrip.h>
60 #include <sys/mac_provider.h>
61 #include <sys/mdeg.h>
62 #include <sys/ldc.h>
63 #include <sys/vsw_fdb.h>
64 #include <sys/vsw.h>
65 #include <sys/vio_mailbox.h>
66 #include <sys/vnet_mailbox.h>
67 #include <sys/vnet_common.h>
68 #include <sys/vio_util.h>
69 #include <sys/sdt.h>
70 #include <sys/atomic.h>
71 #include <sys/callb.h>
72 #include <sys/vlan.h>
73 
74 /*
75  * Function prototypes.
76  */
77 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
78 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
79 static	int vsw_unattach(vsw_t *vswp);
80 static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
81 static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *);
82 void vsw_destroy_rxpools(void *);
83 
84 /* MDEG routines */
85 static	int vsw_mdeg_register(vsw_t *vswp);
86 static	void vsw_mdeg_unregister(vsw_t *vswp);
87 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
88 static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
89 static	int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
90 static	int vsw_read_mdprops(vsw_t *vswp);
91 static	void vsw_vlan_read_ids(void *arg, int type, md_t *mdp,
92 	mde_cookie_t node, uint16_t *pvidp, vsw_vlanid_t **vidspp,
93 	uint16_t *nvidsp, uint16_t *default_idp);
94 static	void vsw_port_read_bandwidth(vsw_port_t *portp, md_t *mdp,
95 	mde_cookie_t node, uint64_t *bw);
96 static	int vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp,
97 	md_t *mdp, mde_cookie_t *node);
98 static	void vsw_read_pri_eth_types(vsw_t *vswp, md_t *mdp,
99 	mde_cookie_t node);
100 static	void vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
101 	uint32_t *mtu);
102 static	int vsw_mtu_update(vsw_t *vswp, uint32_t mtu);
103 static	void vsw_linkprop_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
104 	boolean_t *pls);
105 static	void vsw_bandwidth_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
106 	uint64_t *bw);
107 static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
108 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
109 static boolean_t vsw_cmp_vids(vsw_vlanid_t *vids1,
110 	vsw_vlanid_t *vids2, int nvids);
111 
112 /* Mac driver related routines */
113 static int vsw_mac_register(vsw_t *);
114 static int vsw_mac_unregister(vsw_t *);
115 static int vsw_m_stat(void *, uint_t, uint64_t *);
116 static void vsw_m_stop(void *arg);
117 static int vsw_m_start(void *arg);
118 static int vsw_m_unicst(void *arg, const uint8_t *);
119 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
120 static int vsw_m_promisc(void *arg, boolean_t);
121 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
122 void vsw_mac_link_update(vsw_t *vswp, link_state_t link_state);
123 void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
124     mblk_t *mp, vsw_macrx_flags_t flags);
125 void vsw_physlink_state_update(vsw_t *vswp);
126 
127 /*
128  * Functions imported from other files.
129  */
130 extern void vsw_setup_switching_thread(void *arg);
131 extern int vsw_setup_switching_start(vsw_t *vswp);
132 extern void vsw_setup_switching_stop(vsw_t *vswp);
133 extern int vsw_setup_switching(vsw_t *);
134 extern void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
135     vsw_port_t *port, mac_resource_handle_t mrh);
136 extern int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
137 extern int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
138 extern void vsw_del_mcst_vsw(vsw_t *);
139 extern mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
140 extern void vsw_detach_ports(vsw_t *vswp);
141 extern int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
142 extern int vsw_port_detach(vsw_t *vswp, int p_instance);
143 static int vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
144 	md_t *prev_mdp, mde_cookie_t prev_mdex);
145 extern	int vsw_port_attach(vsw_port_t *port);
146 extern vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
147 extern int vsw_mac_open(vsw_t *vswp);
148 extern void vsw_mac_close(vsw_t *vswp);
149 extern void vsw_mac_cleanup_ports(vsw_t *vswp);
150 extern void vsw_unset_addrs(vsw_t *vswp);
151 extern void vsw_setup_switching_post_process(vsw_t *vswp);
152 extern void vsw_create_vlans(void *arg, int type);
153 extern void vsw_destroy_vlans(void *arg, int type);
154 extern void vsw_vlan_add_ids(void *arg, int type);
155 extern void vsw_vlan_remove_ids(void *arg, int type);
156 extern void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
157 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
158 	mblk_t **npt);
159 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
160 extern void vsw_hio_cleanup(vsw_t *vswp);
161 extern void vsw_hio_start_ports(vsw_t *vswp);
162 extern void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled);
163 extern int vsw_mac_multicast_add(vsw_t *, vsw_port_t *, mcst_addr_t *, int);
164 extern void vsw_mac_multicast_remove(vsw_t *, vsw_port_t *, mcst_addr_t *, int);
165 extern void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
166     vsw_vlanid_t *new_vids, int new_nvids);
167 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
168 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
169 extern void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
170     uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
171 extern void vsw_reset_ports(vsw_t *vswp);
172 extern void vsw_port_reset(vsw_port_t *portp);
173 extern void vsw_physlink_update_ports(vsw_t *vswp);
174 extern void vsw_update_bandwidth(vsw_t *vswp, vsw_port_t *port, int type,
175     uint64_t maxbw);
176 
177 /*
178  * Internal tunables.
179  */
180 int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
181 int	vsw_wretries = 100;		/* # of write attempts */
182 int	vsw_setup_switching_delay = 3;	/* setup sw timeout interval in sec */
183 int	vsw_mac_open_retries = 300;	/* max # of mac_open() retries */
184 					/* 300*3 = 900sec(15min) of max tmout */
185 int	vsw_ldc_tx_delay = 5;		/* delay(ticks) for tx retries */
186 int	vsw_ldc_tx_retries = 10;	/* # of ldc tx retries */
187 int	vsw_ldc_retries = 5;		/* # of ldc_close() retries */
188 int	vsw_ldc_delay = 1000;		/* 1 ms delay for ldc_close() */
189 boolean_t vsw_ldc_rxthr_enabled = B_TRUE;	/* LDC Rx thread enabled */
190 boolean_t vsw_ldc_txthr_enabled = B_TRUE;	/* LDC Tx thread enabled */
191 int	vsw_rxpool_cleanup_delay = 100000;	/* 100ms */
192 
193 
194 uint32_t	vsw_fdb_nchains = 8;	/* # of chains in fdb hash table */
195 uint32_t	vsw_vlan_nchains = 4;	/* # of chains in vlan id hash table */
196 uint32_t	vsw_ethermtu = 1500;	/* mtu of the device */
197 
198 /* delay in usec to wait for all references on a fdb entry to be dropped */
199 uint32_t vsw_fdbe_refcnt_delay = 10;
200 
201 /*
202  * Default vlan id. This is only used internally when the "default-vlan-id"
203  * property is not present in the MD device node. Therefore, this should not be
204  * used as a tunable; if this value is changed, the corresponding variable
205  * should be updated to the same value in all vnets connected to this vsw.
206  */
207 uint16_t	vsw_default_vlan_id = 1;
208 
209 /*
210  * Workaround for a version handshake bug in obp's vnet.
211  * If vsw initiates version negotiation starting from the highest version,
212  * obp sends a nack and terminates version handshake. To workaround
213  * this, we do not initiate version handshake when the channel comes up.
214  * Instead, we wait for the peer to send its version info msg and go through
215  * the version protocol exchange. If we successfully negotiate a version,
216  * before sending the ack, we send our version info msg to the peer
217  * using the <major,minor> version that we are about to ack.
218  */
219 boolean_t vsw_obp_ver_proto_workaround = B_TRUE;
220 
221 /*
222  * In the absence of "priority-ether-types" property in MD, the following
223  * internal tunable can be set to specify a single priority ethertype.
224  */
225 uint64_t vsw_pri_eth_type = 0;
226 
227 /*
228  * Number of transmit priority buffers that are preallocated per device.
229  * This number is chosen to be a small value to throttle transmission
230  * of priority packets. Note: Must be a power of 2 for vio_create_mblks().
231  */
232 uint32_t vsw_pri_tx_nmblks = 64;
233 
234 /*
235  * Number of RARP packets sent to announce macaddr to the physical switch,
236  * after vsw's physical device is changed dynamically or after a guest (client
237  * vnet) is live migrated in.
238  */
239 uint32_t vsw_publish_macaddr_count = 3;
240 
241 /*
242  * Enable/disable HybridIO
243  */
244 boolean_t vsw_hio_enabled = B_TRUE;
245 
246 /*
247  * Max retries for HybridIO cleanup
248  */
249 int vsw_hio_max_cleanup_retries = 10;
250 
251 /*
252  * 10ms delay for HybridIO cleanup
253  */
254 int vsw_hio_cleanup_delay = 10000;
255 
256 /*
257  * Descriptor ring modes of LDC data transfer:
258  *
259  * 1) TxDring mode:
260  * In versions < v1.6 of VIO Protocol, we support only TxDring mode. In this
261  * mode, we create a transmit descriptor ring and export it to the peer through
262  * dring registration process of handshake. The descriptor ring is exported
263  * using LDC shared memory. Each descriptor is associated with a data buffer.
264  * The data buffer is also exported over LDC and the cookies for this data
265  * buffer are provided in the descriptor. The peer maps this ring as its
266  * receive ring. Similarly, the peer exports a transmit descriptor ring which
267  * is mapped by this device as its receive ring. In this mode, in a given data
268  * transfer direction, the transmitter copies the data to the exported data
269  * buffer (owned by itself), bound to the descriptor. The receiver uses the LDC
270  * cookies specified in the descriptor to copy the data into the receiving
271  * guest through the hypervisor (ldc_mem_copy()).
272  *
273  * 2) RxDringData mode:
274  * In versions >= v1.6 of VIO Protocol, we also support RxDringData mode. In
275  * this mode, we create a receive descriptor ring and export it to the peer
276  * through dring registration process of handshake. In addition, we export a
277  * receive buffer area and provide that information also in the dring
278  * registration message. The descriptor ring and the data buffer area are
279  * exported using LDC shared memory. Each descriptor is associated with a data
280  * buffer in the data buffer area and the offset of the specific data buffer
281  * within this area is specified in the descriptor. The peer maps this ring
282  * along with the data buffer area as its transmit ring. Similarly, the peer
283  * exports a receive ring which is mapped by this device as its transmit ring,
284  * along with its buffer area. In this mode, in a given data transfer
285  * direction, the transmitter copies the data to the data buffer offset
286  * specified in the descriptor. The receiver simply picks up the data buffer
287  * (owned by itself) without any copy operation into the receiving guest.
288  *
289  * We provide a tunable to enable RxDringData mode for versions >= v1.6 of VIO
290  * Protocol. By default, this tunable is set to 1 (VIO_TX_DRING). To enable
291  * RxDringData mode set this tunable to 4 (VIO_RX_DRING_DATA). This enables us
292  * to negotiate RxDringData mode with peers that support versions >= v1.6. For
293  * peers that support version < v1.6, we continue to operate in TxDring mode
294  * with them though the tunable is enabled.
295  */
296 uint8_t  vsw_dring_mode = VIO_TX_DRING;
297 
298 /*
299  * Number of descriptors;  must be power of 2.
300  */
301 uint32_t vsw_num_descriptors = VSW_NUM_DESCRIPTORS;
302 
303 /*
304  * In RxDringData mode, # of buffers is determined by multiplying the # of
305  * descriptors with the factor below. Note that the factor must be > 1; i.e,
306  * the # of buffers must always be > # of descriptors. This is needed because,
307  * while the shared memory buffers are sent up the stack on the receiver, the
308  * sender needs additional buffers that can be used for further transmits.
309  * See vsw_setup_rx_dring() for details.
310  */
311 uint32_t vsw_nrbufs_factor = 2;
312 
313 /*
314  * Delay when rx descr not ready; used in both dring modes.
315  */
316 int	vsw_recv_delay = 0;
317 
318 /*
319  * Retry when rx descr not ready; used in both dring modes.
320  */
321 int	vsw_recv_retries = 5;
322 
323 /*
324  * Max number of mblks received in one receive operation.
325  */
326 uint32_t vsw_chain_len = (VSW_NUM_MBLKS * 0.6);
327 
328 /*
329  * Internal tunables for receive buffer pools, that is,  the size and number of
330  * mblks for each pool. At least 3 sizes must be specified if these are used.
331  * The sizes must be specified in increasing order. Non-zero value of the first
332  * size will be used as a hint to use these values instead of the algorithm
333  * that determines the sizes based on MTU. Used in TxDring mode only.
334  */
335 uint32_t vsw_mblk_size1 = 0;
336 uint32_t vsw_mblk_size2 = 0;
337 uint32_t vsw_mblk_size3 = 0;
338 uint32_t vsw_mblk_size4 = 0;
339 uint32_t vsw_num_mblks1 = VSW_NUM_MBLKS;	/* number of mblks for pool1 */
340 uint32_t vsw_num_mblks2 = VSW_NUM_MBLKS;	/* number of mblks for pool2 */
341 uint32_t vsw_num_mblks3 = VSW_NUM_MBLKS;	/* number of mblks for pool3 */
342 uint32_t vsw_num_mblks4 = VSW_NUM_MBLKS;	/* number of mblks for pool4 */
343 
344 /*
345  * Set this to non-zero to enable additional internal receive buffer pools
346  * based on the MTU of the device for better performance at the cost of more
347  * memory consumption. This is turned off by default, to use allocb(9F) for
348  * receive buffer allocations of sizes > 2K.
349  */
350 boolean_t vsw_jumbo_rxpools = B_FALSE;
351 
352 /*
353  * vsw_max_tx_qcount is the maximum # of packets that can be queued
354  * before the tx worker thread begins processing the queue. Its value
355  * is chosen to be 4x the default length of tx descriptor ring.
356  */
357 uint32_t vsw_max_tx_qcount = 4 * VSW_NUM_DESCRIPTORS;
358 
359 /*
360  * MAC callbacks
361  */
362 static	mac_callbacks_t	vsw_m_callbacks = {
363 	0,
364 	vsw_m_stat,
365 	vsw_m_start,
366 	vsw_m_stop,
367 	vsw_m_promisc,
368 	vsw_m_multicst,
369 	vsw_m_unicst,
370 	vsw_m_tx
371 };
372 
373 static	struct	cb_ops	vsw_cb_ops = {
374 	nulldev,			/* cb_open */
375 	nulldev,			/* cb_close */
376 	nodev,				/* cb_strategy */
377 	nodev,				/* cb_print */
378 	nodev,				/* cb_dump */
379 	nodev,				/* cb_read */
380 	nodev,				/* cb_write */
381 	nodev,				/* cb_ioctl */
382 	nodev,				/* cb_devmap */
383 	nodev,				/* cb_mmap */
384 	nodev,				/* cb_segmap */
385 	nochpoll,			/* cb_chpoll */
386 	ddi_prop_op,			/* cb_prop_op */
387 	NULL,				/* cb_stream */
388 	D_MP,				/* cb_flag */
389 	CB_REV,				/* rev */
390 	nodev,				/* int (*cb_aread)() */
391 	nodev				/* int (*cb_awrite)() */
392 };
393 
394 static	struct	dev_ops	vsw_ops = {
395 	DEVO_REV,		/* devo_rev */
396 	0,			/* devo_refcnt */
397 	NULL,			/* devo_getinfo */
398 	nulldev,		/* devo_identify */
399 	nulldev,		/* devo_probe */
400 	vsw_attach,		/* devo_attach */
401 	vsw_detach,		/* devo_detach */
402 	nodev,			/* devo_reset */
403 	&vsw_cb_ops,		/* devo_cb_ops */
404 	(struct bus_ops *)NULL,	/* devo_bus_ops */
405 	ddi_power		/* devo_power */
406 };
407 
408 extern	struct	mod_ops	mod_driverops;
409 static struct modldrv vswmodldrv = {
410 	&mod_driverops,
411 	"sun4v Virtual Switch",
412 	&vsw_ops,
413 };
414 
/*
 * Acquire / release all three per-channel locks of an ldc structure.
 *
 * Locks are taken in the order ldc_cblock -> ldc_rxlock -> ldc_txlock and
 * dropped in the reverse order. Each macro is wrapped in do { } while (0)
 * so that it expands to exactly one statement; this keeps it safe to use
 * as the unbraced body of an if/else. Invoke with a trailing semicolon.
 */
#define	LDC_ENTER_LOCK(ldcp)	\
	do {						\
		mutex_enter(&((ldcp)->ldc_cblock));	\
		mutex_enter(&((ldcp)->ldc_rxlock));	\
		mutex_enter(&((ldcp)->ldc_txlock));	\
	} while (0)
#define	LDC_EXIT_LOCK(ldcp)	\
	do {						\
		mutex_exit(&((ldcp)->ldc_txlock));	\
		mutex_exit(&((ldcp)->ldc_rxlock));	\
		mutex_exit(&((ldcp)->ldc_cblock));	\
	} while (0)
423 
424 /* Driver soft state ptr  */
425 static void	*vsw_state;
426 
427 /*
428  * Linked list of "vsw_t" structures - one per instance.
429  */
430 vsw_t		*vsw_head = NULL;
431 krwlock_t	vsw_rw;
432 
433 /*
434  * Property names
435  */
436 static char vdev_propname[] = "virtual-device";
437 static char vsw_propname[] = "virtual-network-switch";
438 static char physdev_propname[] = "vsw-phys-dev";
439 static char smode_propname[] = "vsw-switch-mode";
440 static char macaddr_propname[] = "local-mac-address";
441 static char remaddr_propname[] = "remote-mac-address";
442 static char ldcids_propname[] = "ldc-ids";
443 static char chan_propname[] = "channel-endpoint";
444 static char id_propname[] = "id";
445 static char reg_propname[] = "reg";
446 static char pri_types_propname[] = "priority-ether-types";
447 static char vsw_pvid_propname[] = "port-vlan-id";
448 static char vsw_vid_propname[] = "vlan-id";
449 static char vsw_dvid_propname[] = "default-vlan-id";
450 static char port_pvid_propname[] = "remote-port-vlan-id";
451 static char port_vid_propname[] = "remote-vlan-id";
452 static char hybrid_propname[] = "hybrid";
453 static char vsw_mtu_propname[] = "mtu";
454 static char vsw_linkprop_propname[] = "linkprop";
455 static char vsw_maxbw_propname[] = "maxbw";
456 static char port_maxbw_propname[] = "maxbw";
457 
458 /*
459  * Matching criteria passed to the MDEG to register interest
460  * in changes to 'virtual-device-port' nodes identified by their
461  * 'id' property.
462  */
463 static md_prop_match_t vport_prop_match[] = {
464 	{ MDET_PROP_VAL,    "id"   },
465 	{ MDET_LIST_END,    NULL    }
466 };
467 
468 static mdeg_node_match_t vport_match = { "virtual-device-port",
469 						vport_prop_match };
470 
471 /*
472  * Matching criteria passed to the MDEG to register interest
473  * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
474  * by their 'name' and 'cfg-handle' properties.
475  */
476 static md_prop_match_t vdev_prop_match[] = {
477 	{ MDET_PROP_STR,    "name"   },
478 	{ MDET_PROP_VAL,    "cfg-handle" },
479 	{ MDET_LIST_END,    NULL    }
480 };
481 
482 static mdeg_node_match_t vdev_match = { "virtual-device",
483 						vdev_prop_match };
484 
485 
486 /*
487  * Specification of an MD node passed to the MDEG to filter any
488  * 'vport' nodes that do not belong to the specified node. This
489  * template is copied for each vsw instance and filled in with
490  * the appropriate 'cfg-handle' value before being passed to the MDEG.
491  */
492 static mdeg_prop_spec_t vsw_prop_template[] = {
493 	{ MDET_PROP_STR,    "name",		vsw_propname },
494 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
495 	{ MDET_LIST_END,    NULL,		NULL	}
496 };
497 
498 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
499 
500 #ifdef	DEBUG
501 /*
502  * Print debug messages - set to 0x1f to enable all msgs
503  * or 0x0 to turn all off.
504  */
505 int vswdbg = 0x0;
506 
507 /*
508  * debug levels:
509  * 0x01:	Function entry/exit tracing
510  * 0x02:	Internal function messages
511  * 0x04:	Verbose internal messages
512  * 0x08:	Warning messages
513  * 0x10:	Error messages
514  */
515 
516 void
517 vswdebug(vsw_t *vswp, const char *fmt, ...)
518 {
519 	char buf[512];
520 	va_list ap;
521 
522 	va_start(ap, fmt);
523 	(void) vsprintf(buf, fmt, ap);
524 	va_end(ap);
525 
526 	if (vswp == NULL)
527 		cmn_err(CE_CONT, "%s\n", buf);
528 	else
529 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
530 }
531 
532 #endif	/* DEBUG */
533 
534 static struct modlinkage modlinkage = {
535 	MODREV_1,
536 	&vswmodldrv,
537 	NULL
538 };
539 
540 int
541 _init(void)
542 {
543 	int status;
544 
545 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
546 
547 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
548 	if (status != 0) {
549 		return (status);
550 	}
551 
552 	mac_init_ops(&vsw_ops, DRV_NAME);
553 	status = mod_install(&modlinkage);
554 	if (status != 0) {
555 		ddi_soft_state_fini(&vsw_state);
556 	}
557 	return (status);
558 }
559 
560 int
561 _fini(void)
562 {
563 	int status;
564 
565 	status = mod_remove(&modlinkage);
566 	if (status != 0)
567 		return (status);
568 	mac_fini_ops(&vsw_ops);
569 	ddi_soft_state_fini(&vsw_state);
570 
571 	rw_destroy(&vsw_rw);
572 
573 	return (status);
574 }
575 
576 int
577 _info(struct modinfo *modinfop)
578 {
579 	return (mod_info(&modlinkage, modinfop));
580 }
581 
582 static int
583 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
584 {
585 	vsw_t			*vswp;
586 	int			instance;
587 	char			hashname[MAXNAMELEN];
588 	char			qname[TASKQ_NAMELEN];
589 	vsw_attach_progress_t	progress = PROG_init;
590 	int			rv;
591 
592 	switch (cmd) {
593 	case DDI_ATTACH:
594 		break;
595 	case DDI_RESUME:
596 		/* nothing to do for this non-device */
597 		return (DDI_SUCCESS);
598 	case DDI_PM_RESUME:
599 	default:
600 		return (DDI_FAILURE);
601 	}
602 
603 	instance = ddi_get_instance(dip);
604 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
605 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
606 		return (DDI_FAILURE);
607 	}
608 	vswp = ddi_get_soft_state(vsw_state, instance);
609 
610 	if (vswp == NULL) {
611 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
612 		goto vsw_attach_fail;
613 	}
614 
615 	vswp->dip = dip;
616 	vswp->instance = instance;
617 	vswp->phys_link_state = LINK_STATE_UNKNOWN;
618 	ddi_set_driver_private(dip, (caddr_t)vswp);
619 
620 	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
621 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
622 	mutex_init(&vswp->sw_thr_lock, NULL, MUTEX_DRIVER, NULL);
623 	cv_init(&vswp->sw_thr_cv, NULL, CV_DRIVER, NULL);
624 	rw_init(&vswp->maccl_rwlock, NULL, RW_DRIVER, NULL);
625 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
626 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
627 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
628 
629 	progress |= PROG_locks;
630 
631 	rv = vsw_read_mdprops(vswp);
632 	if (rv != 0)
633 		goto vsw_attach_fail;
634 
635 	progress |= PROG_readmd;
636 
637 	/* setup the unicast forwarding database  */
638 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
639 	    vswp->instance);
640 	D2(vswp, "creating unicast hash table (%s)...", hashname);
641 	vswp->fdb_nchains = vsw_fdb_nchains;
642 	vswp->fdb_hashp = mod_hash_create_ptrhash(hashname, vswp->fdb_nchains,
643 	    mod_hash_null_valdtor, sizeof (void *));
644 	vsw_create_vlans((void *)vswp, VSW_LOCALDEV);
645 	progress |= PROG_fdb;
646 
647 	/* setup the multicast fowarding database */
648 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
649 	    vswp->instance);
650 	D2(vswp, "creating multicast hash table %s)...", hashname);
651 	vswp->mfdb = mod_hash_create_ptrhash(hashname, vsw_fdb_nchains,
652 	    mod_hash_null_valdtor, sizeof (void *));
653 
654 	progress |= PROG_mfdb;
655 
656 	/*
657 	 * Create the taskq which will process all the VIO
658 	 * control messages.
659 	 */
660 	(void) snprintf(qname, TASKQ_NAMELEN, "taskq%d", vswp->instance);
661 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
662 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
663 		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
664 		    vswp->instance);
665 		goto vsw_attach_fail;
666 	}
667 
668 	progress |= PROG_taskq;
669 
670 	(void) snprintf(qname, TASKQ_NAMELEN, "rxpool_taskq%d",
671 	    vswp->instance);
672 	if ((vswp->rxp_taskq = ddi_taskq_create(vswp->dip, qname, 1,
673 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
674 		cmn_err(CE_WARN, "!vsw%d: Unable to create rxp task queue",
675 		    vswp->instance);
676 		goto vsw_attach_fail;
677 	}
678 
679 	progress |= PROG_rxp_taskq;
680 
681 	/* prevent auto-detaching */
682 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
683 	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
684 		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
685 		    "instance %u", DDI_NO_AUTODETACH, instance);
686 	}
687 
688 	/*
689 	 * The null switching function is set to avoid panic until
690 	 * switch mode is setup.
691 	 */
692 	vswp->vsw_switch_frame = vsw_switch_frame_nop;
693 
694 	/*
695 	 * Setup the required switching mode, based on the mdprops that we read
696 	 * earlier. We start a thread to do this, to avoid calling mac_open()
697 	 * directly from attach().
698 	 */
699 	rv = vsw_setup_switching_start(vswp);
700 	if (rv != 0) {
701 		goto vsw_attach_fail;
702 	}
703 
704 	progress |= PROG_swmode;
705 
706 	/* Register with mac layer as a provider */
707 	rv = vsw_mac_register(vswp);
708 	if (rv != 0)
709 		goto vsw_attach_fail;
710 
711 	progress |= PROG_macreg;
712 
713 	/*
714 	 * Now we have everything setup, register an interest in
715 	 * specific MD nodes.
716 	 *
717 	 * The callback is invoked in 2 cases, firstly if upon mdeg
718 	 * registration there are existing nodes which match our specified
719 	 * criteria, and secondly if the MD is changed (and again, there
720 	 * are nodes which we are interested in present within it. Note
721 	 * that our callback will be invoked even if our specified nodes
722 	 * have not actually changed).
723 	 *
724 	 */
725 	rv = vsw_mdeg_register(vswp);
726 	if (rv != 0)
727 		goto vsw_attach_fail;
728 
729 	progress |= PROG_mdreg;
730 
731 	vswp->attach_progress = progress;
732 
733 	WRITE_ENTER(&vsw_rw);
734 	vswp->next = vsw_head;
735 	vsw_head = vswp;
736 	RW_EXIT(&vsw_rw);
737 
738 	ddi_report_dev(vswp->dip);
739 	return (DDI_SUCCESS);
740 
741 vsw_attach_fail:
742 	DERR(NULL, "vsw_attach: failed");
743 
744 	vswp->attach_progress = progress;
745 	(void) vsw_unattach(vswp);
746 	ddi_soft_state_free(vsw_state, instance);
747 	return (DDI_FAILURE);
748 }
749 
750 static int
751 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
752 {
753 	vsw_t			**vswpp, *vswp;
754 	int 			instance;
755 
756 	instance = ddi_get_instance(dip);
757 	vswp = ddi_get_soft_state(vsw_state, instance);
758 
759 	if (vswp == NULL) {
760 		return (DDI_FAILURE);
761 	}
762 
763 	switch (cmd) {
764 	case DDI_DETACH:
765 		break;
766 	case DDI_SUSPEND:
767 	case DDI_PM_SUSPEND:
768 	default:
769 		return (DDI_FAILURE);
770 	}
771 
772 	D2(vswp, "detaching instance %d", instance);
773 
774 	if (vsw_unattach(vswp) != 0) {
775 		return (DDI_FAILURE);
776 	}
777 
778 	ddi_remove_minor_node(dip, NULL);
779 
780 	WRITE_ENTER(&vsw_rw);
781 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
782 		if (*vswpp == vswp) {
783 			*vswpp = vswp->next;
784 			break;
785 		}
786 	}
787 	RW_EXIT(&vsw_rw);
788 
789 	ddi_soft_state_free(vsw_state, instance);
790 
791 	return (DDI_SUCCESS);
792 }
793 
794 /*
795  * Common routine to handle vsw_attach() failure and vsw_detach(). Note that
796  * the only reason this function could fail is if mac_unregister() fails.
797  * Otherwise, this function must ensure that all resources are freed and return
798  * success.
799  */
800 static int
801 vsw_unattach(vsw_t *vswp)
802 {
803 	vsw_attach_progress_t	progress;
804 
805 	progress = vswp->attach_progress;
806 
807 	/*
808 	 * Unregister from the gldv3 subsystem. This can fail, in particular
809 	 * if there are still any open references to this mac device; in which
810 	 * case we just return failure without continuing to detach further.
811 	 */
812 	if (progress & PROG_macreg) {
813 		if (vsw_mac_unregister(vswp) != 0) {
814 			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
815 			    "MAC layer", vswp->instance);
816 			return (1);
817 		}
818 		progress &= ~PROG_macreg;
819 	}
820 
821 	/*
822 	 * Now that we have unregistered from gldv3, we must finish all other
823 	 * steps and successfully return from this function; otherwise we will
824 	 * end up leaving the device in a broken/unusable state.
825 	 *
826 	 * If we have registered with mdeg, unregister now to stop further
827 	 * callbacks to this vsw device and/or its ports. Then, detach any
828 	 * existing ports.
829 	 */
830 	if (progress & PROG_mdreg) {
831 		vsw_mdeg_unregister(vswp);
832 		vsw_detach_ports(vswp);
833 		progress &= ~PROG_mdreg;
834 	}
835 
836 	/*
837 	 * If we have started a thread to setup the switching mode, stop it, if
838 	 * it is still running. If it has finished setting up the switching
839 	 * mode, then we need to clean up some additional things if we are
840 	 * running in L2 mode: first free up any hybrid resources; then stop
841 	 * and close the underlying physical device. Note that we would have
842 	 * already released all per mac_client resources (ucast, mcast addrs,
843 	 * hio-shares etc) as all the ports are detached and if the vsw device
844 	 * itself was in use as an interface, it has been unplumbed (otherwise
845 	 * mac_unregister() above would fail).
846 	 */
847 	if (progress & PROG_swmode) {
848 
849 		vsw_setup_switching_stop(vswp);
850 
851 		if (vswp->hio_capable == B_TRUE) {
852 			vsw_hio_cleanup(vswp);
853 			vswp->hio_capable = B_FALSE;
854 		}
855 
856 		mutex_enter(&vswp->mac_lock);
857 		vsw_mac_close(vswp);
858 		mutex_exit(&vswp->mac_lock);
859 
860 		progress &= ~PROG_swmode;
861 	}
862 
863 	/*
864 	 * We now destroy the taskq used to clean up rx mblk pools that
865 	 * couldn't be destroyed when the ports/channels were detached.
866 	 * We implicitly wait for those tasks to complete in
867 	 * ddi_taskq_destroy().
868 	 */
869 	if (progress & PROG_rxp_taskq) {
870 		ddi_taskq_destroy(vswp->rxp_taskq);
871 		progress &= ~PROG_rxp_taskq;
872 	}
873 
874 	/*
875 	 * By now any pending tasks have finished and the underlying
876 	 * ldc's have been destroyed, so its safe to delete the control
877 	 * message taskq.
878 	 */
879 	if (progress & PROG_taskq) {
880 		ddi_taskq_destroy(vswp->taskq_p);
881 		progress &= ~PROG_taskq;
882 	}
883 
884 	/* Destroy the multicast hash table */
885 	if (progress & PROG_mfdb) {
886 		mod_hash_destroy_hash(vswp->mfdb);
887 		progress &= ~PROG_mfdb;
888 	}
889 
890 	/* Destroy the vlan hash table and fdb */
891 	if (progress & PROG_fdb) {
892 		vsw_destroy_vlans(vswp, VSW_LOCALDEV);
893 		mod_hash_destroy_hash(vswp->fdb_hashp);
894 		progress &= ~PROG_fdb;
895 	}
896 
897 	if (progress & PROG_readmd) {
898 		if (VSW_PRI_ETH_DEFINED(vswp)) {
899 			kmem_free(vswp->pri_types,
900 			    sizeof (uint16_t) * vswp->pri_num_types);
901 			(void) vio_destroy_mblks(vswp->pri_tx_vmp);
902 		}
903 		progress &= ~PROG_readmd;
904 	}
905 
906 	if (progress & PROG_locks) {
907 		rw_destroy(&vswp->plist.lockrw);
908 		rw_destroy(&vswp->mfdbrw);
909 		rw_destroy(&vswp->if_lockrw);
910 		rw_destroy(&vswp->maccl_rwlock);
911 		cv_destroy(&vswp->sw_thr_cv);
912 		mutex_destroy(&vswp->sw_thr_lock);
913 		mutex_destroy(&vswp->mca_lock);
914 		mutex_destroy(&vswp->mac_lock);
915 		progress &= ~PROG_locks;
916 	}
917 
918 	vswp->attach_progress = progress;
919 
920 	return (0);
921 }
922 
923 void
924 vsw_destroy_rxpools(void *arg)
925 {
926 	vio_mblk_pool_t	*poolp = (vio_mblk_pool_t *)arg;
927 	vio_mblk_pool_t	*npoolp;
928 
929 	while (poolp != NULL) {
930 		npoolp =  poolp->nextp;
931 		while (vio_destroy_mblks(poolp) != 0) {
932 			delay(drv_usectohz(vsw_rxpool_cleanup_delay));
933 		}
934 		poolp = npoolp;
935 	}
936 }
937 
938 /*
939  * Get the value of the "vsw-phys-dev" property in the specified
940  * node. This property is the name of the physical device that
941  * the virtual switch will use to talk to the outside world.
942  *
943  * Note it is valid for this property to be NULL (but the property
944  * itself must exist). Callers of this routine should verify that
945  * the value returned is what they expected (i.e. either NULL or non NULL).
946  *
947  * On success returns value of the property in region pointed to by
948  * the 'name' argument, and with return value of 0. Otherwise returns 1.
949  */
950 static int
951 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
952 {
953 	int		len = 0;
954 	int		instance;
955 	char		*physname = NULL;
956 	char		*dev;
957 	const char	*dev_name;
958 	char		myname[MAXNAMELEN];
959 
960 	dev_name = ddi_driver_name(vswp->dip);
961 	instance = ddi_get_instance(vswp->dip);
962 	(void) snprintf(myname, MAXNAMELEN, "%s%d", dev_name, instance);
963 
964 	if (md_get_prop_data(mdp, node, physdev_propname,
965 	    (uint8_t **)(&physname), &len) != 0) {
966 		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
967 		    "device(s) from MD", vswp->instance);
968 		return (1);
969 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
970 		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
971 		    vswp->instance, physname);
972 		return (1);
973 	} else if (strcmp(myname, physname) == 0) {
974 		/*
975 		 * Prevent the vswitch from opening itself as the
976 		 * network device.
977 		 */
978 		cmn_err(CE_WARN, "!vsw%d: %s is an invalid device name",
979 		    vswp->instance, physname);
980 		return (1);
981 	} else {
982 		(void) strncpy(name, physname, strlen(physname) + 1);
983 		D2(vswp, "%s: using first device specified (%s)",
984 		    __func__, physname);
985 	}
986 
987 #ifdef DEBUG
988 	/*
989 	 * As a temporary measure to aid testing we check to see if there
990 	 * is a vsw.conf file present. If there is we use the value of the
991 	 * vsw_physname property in the file as the name of the physical
992 	 * device, overriding the value from the MD.
993 	 *
994 	 * There may be multiple devices listed, but for the moment
995 	 * we just use the first one.
996 	 */
997 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
998 	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
999 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
1000 			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
1001 			    vswp->instance, dev);
1002 			ddi_prop_free(dev);
1003 			return (1);
1004 		} else {
1005 			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
1006 			    "config file", vswp->instance, dev);
1007 
1008 			(void) strncpy(name, dev, strlen(dev) + 1);
1009 		}
1010 
1011 		ddi_prop_free(dev);
1012 	}
1013 #endif
1014 
1015 	return (0);
1016 }
1017 
1018 /*
1019  * Read the 'vsw-switch-mode' property from the specified MD node.
1020  *
1021  * Returns 0 on success, otherwise returns 1.
1022  */
1023 static int
1024 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint8_t *mode)
1025 {
1026 	int		len = 0;
1027 	char		*smode = NULL;
1028 	char		*curr_mode = NULL;
1029 
1030 	D1(vswp, "%s: enter", __func__);
1031 
1032 	/*
1033 	 * Get the switch-mode property. The modes are listed in
1034 	 * decreasing order of preference, i.e. prefered mode is
1035 	 * first item in list.
1036 	 */
1037 	len = 0;
1038 	if (md_get_prop_data(mdp, node, smode_propname,
1039 	    (uint8_t **)(&smode), &len) != 0) {
1040 		/*
1041 		 * Unable to get switch-mode property from MD, nothing
1042 		 * more we can do.
1043 		 */
1044 		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
1045 		    " from the MD", vswp->instance);
1046 		return (1);
1047 	}
1048 
1049 	curr_mode = smode;
1050 	/*
1051 	 * Modes of operation:
1052 	 * 'switched'	 - layer 2 switching, underlying HW in
1053 	 *			programmed mode.
1054 	 * 'promiscuous' - layer 2 switching, underlying HW in
1055 	 *			promiscuous mode.
1056 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1057 	 *			in non-promiscuous mode.
1058 	 */
1059 	while (curr_mode < (smode + len)) {
1060 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1061 		if (strcmp(curr_mode, "switched") == 0) {
1062 			*mode = VSW_LAYER2;
1063 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
1064 			*mode = VSW_LAYER2 | VSW_LAYER2_PROMISC;
1065 		} else if (strcmp(curr_mode, "routed") == 0) {
1066 			*mode = VSW_LAYER3;
1067 		} else {
1068 			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
1069 			    "setting to default switched mode",
1070 			    vswp->instance, curr_mode);
1071 			*mode = VSW_LAYER2;
1072 		}
1073 		curr_mode += strlen(curr_mode) + 1;
1074 	}
1075 
1076 	D2(vswp, "%s: %d mode", __func__, *mode);
1077 
1078 	D1(vswp, "%s: exit", __func__);
1079 
1080 	return (0);
1081 }
1082 
1083 /*
1084  * Register with the MAC layer as a network device, so we
1085  * can be plumbed if necessary.
1086  */
1087 static int
1088 vsw_mac_register(vsw_t *vswp)
1089 {
1090 	mac_register_t	*macp;
1091 	int		rv;
1092 
1093 	D1(vswp, "%s: enter", __func__);
1094 
1095 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1096 		return (EINVAL);
1097 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1098 	macp->m_driver = vswp;
1099 	macp->m_dip = vswp->dip;
1100 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
1101 	macp->m_callbacks = &vsw_m_callbacks;
1102 	macp->m_min_sdu = 0;
1103 	macp->m_max_sdu = vswp->mtu;
1104 	macp->m_margin = VLAN_TAGSZ;
1105 	rv = mac_register(macp, &vswp->if_mh);
1106 	mac_free(macp);
1107 	if (rv != 0) {
1108 		/*
1109 		 * Treat this as a non-fatal error as we may be
1110 		 * able to operate in some other mode.
1111 		 */
1112 		cmn_err(CE_NOTE, "!vsw%d: Unable to register as "
1113 		    "a provider with MAC layer", vswp->instance);
1114 		return (rv);
1115 	}
1116 
1117 	vswp->if_state |= VSW_IF_REG;
1118 
1119 	D1(vswp, "%s: exit", __func__);
1120 
1121 	return (rv);
1122 }
1123 
1124 static int
1125 vsw_mac_unregister(vsw_t *vswp)
1126 {
1127 	int		rv = 0;
1128 
1129 	D1(vswp, "%s: enter", __func__);
1130 
1131 	WRITE_ENTER(&vswp->if_lockrw);
1132 
1133 	if (vswp->if_state & VSW_IF_REG) {
1134 		rv = mac_unregister(vswp->if_mh);
1135 		if (rv != 0) {
1136 			DWARN(vswp, "%s: unable to unregister from MAC "
1137 			    "framework", __func__);
1138 
1139 			RW_EXIT(&vswp->if_lockrw);
1140 			D1(vswp, "%s: fail exit", __func__);
1141 			return (rv);
1142 		}
1143 
1144 		/* mark i/f as down and unregistered */
1145 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
1146 	}
1147 	RW_EXIT(&vswp->if_lockrw);
1148 
1149 	D1(vswp, "%s: exit", __func__);
1150 
1151 	return (rv);
1152 }
1153 
1154 static int
1155 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
1156 {
1157 	vsw_t			*vswp = (vsw_t *)arg;
1158 
1159 	D1(vswp, "%s: enter", __func__);
1160 
1161 	mutex_enter(&vswp->mac_lock);
1162 	if (vswp->mh == NULL) {
1163 		mutex_exit(&vswp->mac_lock);
1164 		return (EINVAL);
1165 	}
1166 
1167 	/* return stats from underlying device */
1168 	*val = mac_stat_get(vswp->mh, stat);
1169 
1170 	mutex_exit(&vswp->mac_lock);
1171 
1172 	return (0);
1173 }
1174 
1175 static void
1176 vsw_m_stop(void *arg)
1177 {
1178 	vsw_t	*vswp = (vsw_t *)arg;
1179 
1180 	D1(vswp, "%s: enter", __func__);
1181 
1182 	WRITE_ENTER(&vswp->if_lockrw);
1183 	vswp->if_state &= ~VSW_IF_UP;
1184 	RW_EXIT(&vswp->if_lockrw);
1185 
1186 	/* Cleanup and close the mac client */
1187 	vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV);
1188 
1189 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1190 }
1191 
1192 static int
1193 vsw_m_start(void *arg)
1194 {
1195 	int		rv;
1196 	vsw_t		*vswp = (vsw_t *)arg;
1197 
1198 	D1(vswp, "%s: enter", __func__);
1199 
1200 	WRITE_ENTER(&vswp->if_lockrw);
1201 
1202 	vswp->if_state |= VSW_IF_UP;
1203 
1204 	if (vswp->switching_setup_done == B_FALSE) {
1205 		/*
1206 		 * If the switching mode has not been setup yet, just
1207 		 * return. The unicast address will be programmed
1208 		 * after the physical device is successfully setup by the
1209 		 * timeout handler.
1210 		 */
1211 		RW_EXIT(&vswp->if_lockrw);
1212 		return (0);
1213 	}
1214 
1215 	/* if in layer2 mode, program unicast address. */
1216 	if (vswp->mh != NULL) {
1217 		/* Init a mac client and program addresses */
1218 		rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV);
1219 		if (rv != 0) {
1220 			cmn_err(CE_NOTE,
1221 			    "!vsw%d: failed to program interface "
1222 			    "unicast address\n", vswp->instance);
1223 		}
1224 	}
1225 
1226 	RW_EXIT(&vswp->if_lockrw);
1227 
1228 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1229 	return (0);
1230 }
1231 
1232 /*
1233  * Change the local interface address.
1234  *
1235  * Note: we don't support this entry point. The local
1236  * mac address of the switch can only be changed via its
1237  * MD node properties.
1238  */
1239 static int
1240 vsw_m_unicst(void *arg, const uint8_t *macaddr)
1241 {
1242 	_NOTE(ARGUNUSED(arg, macaddr))
1243 
1244 	return (DDI_FAILURE);
1245 }
1246 
1247 static int
1248 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
1249 {
1250 	vsw_t		*vswp = (vsw_t *)arg;
1251 	mcst_addr_t	*mcst_p = NULL;
1252 	uint64_t	addr = 0x0;
1253 	int		i, ret = 0;
1254 
1255 	D1(vswp, "%s: enter", __func__);
1256 
1257 	/*
1258 	 * Convert address into form that can be used
1259 	 * as hash table key.
1260 	 */
1261 	for (i = 0; i < ETHERADDRL; i++) {
1262 		addr = (addr << 8) | mca[i];
1263 	}
1264 
1265 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
1266 
1267 	if (add) {
1268 		D2(vswp, "%s: adding multicast", __func__);
1269 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1270 			/*
1271 			 * Update the list of multicast addresses
1272 			 * contained within the vsw_t structure to
1273 			 * include this new one.
1274 			 */
1275 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
1276 			if (mcst_p == NULL) {
1277 				DERR(vswp, "%s unable to alloc mem", __func__);
1278 				(void) vsw_del_mcst(vswp,
1279 				    VSW_LOCALDEV, addr, NULL);
1280 				return (1);
1281 			}
1282 			mcst_p->addr = addr;
1283 			ether_copy(mca, &mcst_p->mca);
1284 
1285 			/*
1286 			 * Call into the underlying driver to program the
1287 			 * address into HW.
1288 			 */
1289 			ret = vsw_mac_multicast_add(vswp, NULL, mcst_p,
1290 			    VSW_LOCALDEV);
1291 			if (ret != 0) {
1292 				(void) vsw_del_mcst(vswp,
1293 				    VSW_LOCALDEV, addr, NULL);
1294 				kmem_free(mcst_p, sizeof (*mcst_p));
1295 				return (ret);
1296 			}
1297 
1298 			mutex_enter(&vswp->mca_lock);
1299 			mcst_p->nextp = vswp->mcap;
1300 			vswp->mcap = mcst_p;
1301 			mutex_exit(&vswp->mca_lock);
1302 		} else {
1303 			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
1304 			    "address", vswp->instance);
1305 		}
1306 		return (ret);
1307 	}
1308 
1309 	D2(vswp, "%s: removing multicast", __func__);
1310 	/*
1311 	 * Remove the address from the hash table..
1312 	 */
1313 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1314 
1315 		/*
1316 		 * ..and then from the list maintained in the
1317 		 * vsw_t structure.
1318 		 */
1319 		mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr);
1320 		ASSERT(mcst_p != NULL);
1321 
1322 		vsw_mac_multicast_remove(vswp, NULL, mcst_p, VSW_LOCALDEV);
1323 		kmem_free(mcst_p, sizeof (*mcst_p));
1324 	}
1325 
1326 	D1(vswp, "%s: exit", __func__);
1327 
1328 	return (0);
1329 }
1330 
1331 static int
1332 vsw_m_promisc(void *arg, boolean_t on)
1333 {
1334 	vsw_t		*vswp = (vsw_t *)arg;
1335 
1336 	D1(vswp, "%s: enter", __func__);
1337 
1338 	WRITE_ENTER(&vswp->if_lockrw);
1339 	if (on)
1340 		vswp->if_state |= VSW_IF_PROMISC;
1341 	else
1342 		vswp->if_state &= ~VSW_IF_PROMISC;
1343 	RW_EXIT(&vswp->if_lockrw);
1344 
1345 	D1(vswp, "%s: exit", __func__);
1346 
1347 	return (0);
1348 }
1349 
1350 static mblk_t *
1351 vsw_m_tx(void *arg, mblk_t *mp)
1352 {
1353 	vsw_t		*vswp = (vsw_t *)arg;
1354 
1355 	D1(vswp, "%s: enter", __func__);
1356 
1357 	mp = vsw_vlan_frame_pretag(vswp, VSW_LOCALDEV, mp);
1358 
1359 	if (mp == NULL) {
1360 		return (NULL);
1361 	}
1362 
1363 	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
1364 
1365 	D1(vswp, "%s: exit", __func__);
1366 
1367 	return (NULL);
1368 }
1369 
1370 /*
1371  * Register for machine description (MD) updates.
1372  *
1373  * Returns 0 on success, 1 on failure.
1374  */
1375 static int
1376 vsw_mdeg_register(vsw_t *vswp)
1377 {
1378 	mdeg_prop_spec_t	*pspecp;
1379 	mdeg_node_spec_t	*inst_specp;
1380 	mdeg_handle_t		mdeg_hdl, mdeg_port_hdl;
1381 	size_t			templatesz;
1382 	int			rv;
1383 
1384 	D1(vswp, "%s: enter", __func__);
1385 
1386 	/*
1387 	 * Allocate and initialize a per-instance copy
1388 	 * of the global property spec array that will
1389 	 * uniquely identify this vsw instance.
1390 	 */
1391 	templatesz = sizeof (vsw_prop_template);
1392 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
1393 
1394 	bcopy(vsw_prop_template, pspecp, templatesz);
1395 
1396 	VSW_SET_MDEG_PROP_INST(pspecp, vswp->regprop);
1397 
1398 	/* initialize the complete prop spec structure */
1399 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
1400 	inst_specp->namep = "virtual-device";
1401 	inst_specp->specp = pspecp;
1402 
1403 	D2(vswp, "%s: instance %d registering with mdeg", __func__,
1404 	    vswp->regprop);
1405 	/*
1406 	 * Register an interest in 'virtual-device' nodes with a
1407 	 * 'name' property of 'virtual-network-switch'
1408 	 */
1409 	rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb,
1410 	    (void *)vswp, &mdeg_hdl);
1411 	if (rv != MDEG_SUCCESS) {
1412 		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
1413 		    __func__, rv);
1414 		goto mdeg_reg_fail;
1415 	}
1416 
1417 	/*
1418 	 * Register an interest in 'vsw-port' nodes.
1419 	 */
1420 	rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb,
1421 	    (void *)vswp, &mdeg_port_hdl);
1422 	if (rv != MDEG_SUCCESS) {
1423 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
1424 		(void) mdeg_unregister(mdeg_hdl);
1425 		goto mdeg_reg_fail;
1426 	}
1427 
1428 	/* save off data that will be needed later */
1429 	vswp->inst_spec = inst_specp;
1430 	vswp->mdeg_hdl = mdeg_hdl;
1431 	vswp->mdeg_port_hdl = mdeg_port_hdl;
1432 
1433 	D1(vswp, "%s: exit", __func__);
1434 	return (0);
1435 
1436 mdeg_reg_fail:
1437 	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
1438 	    vswp->instance);
1439 	kmem_free(pspecp, templatesz);
1440 	kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
1441 
1442 	vswp->mdeg_hdl = NULL;
1443 	vswp->mdeg_port_hdl = NULL;
1444 
1445 	return (1);
1446 }
1447 
1448 static void
1449 vsw_mdeg_unregister(vsw_t *vswp)
1450 {
1451 	D1(vswp, "vsw_mdeg_unregister: enter");
1452 
1453 	if (vswp->mdeg_hdl != NULL)
1454 		(void) mdeg_unregister(vswp->mdeg_hdl);
1455 
1456 	if (vswp->mdeg_port_hdl != NULL)
1457 		(void) mdeg_unregister(vswp->mdeg_port_hdl);
1458 
1459 	if (vswp->inst_spec != NULL) {
1460 		if (vswp->inst_spec->specp != NULL) {
1461 			(void) kmem_free(vswp->inst_spec->specp,
1462 			    sizeof (vsw_prop_template));
1463 			vswp->inst_spec->specp = NULL;
1464 		}
1465 
1466 		(void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t));
1467 		vswp->inst_spec = NULL;
1468 	}
1469 
1470 	D1(vswp, "vsw_mdeg_unregister: exit");
1471 }
1472 
1473 /*
1474  * Mdeg callback invoked for the vsw node itself.
1475  */
1476 static int
1477 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1478 {
1479 	vsw_t		*vswp;
1480 	md_t		*mdp;
1481 	mde_cookie_t	node;
1482 	uint64_t	inst;
1483 	char		*node_name = NULL;
1484 
1485 	if (resp == NULL)
1486 		return (MDEG_FAILURE);
1487 
1488 	vswp = (vsw_t *)cb_argp;
1489 
1490 	D1(vswp, "%s: added %d : removed %d : curr matched %d"
1491 	    " : prev matched %d", __func__, resp->added.nelem,
1492 	    resp->removed.nelem, resp->match_curr.nelem,
1493 	    resp->match_prev.nelem);
1494 
1495 	/*
1496 	 * We get an initial callback for this node as 'added'
1497 	 * after registering with mdeg. Note that we would have
1498 	 * already gathered information about this vsw node by
1499 	 * walking MD earlier during attach (in vsw_read_mdprops()).
1500 	 * So, there is a window where the properties of this
1501 	 * node might have changed when we get this initial 'added'
1502 	 * callback. We handle this as if an update occured
1503 	 * and invoke the same function which handles updates to
1504 	 * the properties of this vsw-node if any.
1505 	 *
1506 	 * A non-zero 'match' value indicates that the MD has been
1507 	 * updated and that a virtual-network-switch node is
1508 	 * present which may or may not have been updated. It is
1509 	 * up to the clients to examine their own nodes and
1510 	 * determine if they have changed.
1511 	 */
1512 	if (resp->added.nelem != 0) {
1513 
1514 		if (resp->added.nelem != 1) {
1515 			cmn_err(CE_NOTE, "!vsw%d: number of nodes added "
1516 			    "invalid: %d\n", vswp->instance, resp->added.nelem);
1517 			return (MDEG_FAILURE);
1518 		}
1519 
1520 		mdp = resp->added.mdp;
1521 		node = resp->added.mdep[0];
1522 
1523 	} else if (resp->match_curr.nelem != 0) {
1524 
1525 		if (resp->match_curr.nelem != 1) {
1526 			cmn_err(CE_NOTE, "!vsw%d: number of nodes updated "
1527 			    "invalid: %d\n", vswp->instance,
1528 			    resp->match_curr.nelem);
1529 			return (MDEG_FAILURE);
1530 		}
1531 
1532 		mdp = resp->match_curr.mdp;
1533 		node = resp->match_curr.mdep[0];
1534 
1535 	} else {
1536 		return (MDEG_FAILURE);
1537 	}
1538 
1539 	/* Validate name and instance */
1540 	if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
1541 		DERR(vswp, "%s: unable to get node name\n",  __func__);
1542 		return (MDEG_FAILURE);
1543 	}
1544 
1545 	/* is this a virtual-network-switch? */
1546 	if (strcmp(node_name, vsw_propname) != 0) {
1547 		DERR(vswp, "%s: Invalid node name: %s\n",
1548 		    __func__, node_name);
1549 		return (MDEG_FAILURE);
1550 	}
1551 
1552 	if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
1553 		DERR(vswp, "%s: prop(cfg-handle) not found\n",
1554 		    __func__);
1555 		return (MDEG_FAILURE);
1556 	}
1557 
1558 	/* is this the right instance of vsw? */
1559 	if (inst != vswp->regprop) {
1560 		DERR(vswp, "%s: Invalid cfg-handle: %lx\n",
1561 		    __func__, inst);
1562 		return (MDEG_FAILURE);
1563 	}
1564 
1565 	vsw_update_md_prop(vswp, mdp, node);
1566 
1567 	return (MDEG_SUCCESS);
1568 }
1569 
1570 /*
1571  * Mdeg callback invoked for changes to the vsw-port nodes
1572  * under the vsw node.
1573  */
1574 static int
1575 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1576 {
1577 	vsw_t		*vswp;
1578 	int		idx;
1579 	md_t		*mdp;
1580 	mde_cookie_t	node;
1581 	uint64_t	inst;
1582 	int		rv;
1583 
1584 	if ((resp == NULL) || (cb_argp == NULL))
1585 		return (MDEG_FAILURE);
1586 
1587 	vswp = (vsw_t *)cb_argp;
1588 
1589 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
1590 	    " : prev matched %d", __func__, resp->added.nelem,
1591 	    resp->removed.nelem, resp->match_curr.nelem,
1592 	    resp->match_prev.nelem);
1593 
1594 	/* process added ports */
1595 	for (idx = 0; idx < resp->added.nelem; idx++) {
1596 		mdp = resp->added.mdp;
1597 		node = resp->added.mdep[idx];
1598 
1599 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
1600 
1601 		if ((rv = vsw_port_add(vswp, mdp, &node)) != 0) {
1602 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
1603 			    "(0x%lx), err=%d", vswp->instance, node, rv);
1604 		}
1605 	}
1606 
1607 	/* process removed ports */
1608 	for (idx = 0; idx < resp->removed.nelem; idx++) {
1609 		mdp = resp->removed.mdp;
1610 		node = resp->removed.mdep[idx];
1611 
1612 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
1613 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
1614 			    __func__, id_propname, idx);
1615 			continue;
1616 		}
1617 
1618 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
1619 
1620 		if (vsw_port_detach(vswp, inst) != 0) {
1621 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
1622 			    vswp->instance, inst);
1623 		}
1624 	}
1625 
1626 	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
1627 		(void) vsw_port_update(vswp, resp->match_curr.mdp,
1628 		    resp->match_curr.mdep[idx],
1629 		    resp->match_prev.mdp,
1630 		    resp->match_prev.mdep[idx]);
1631 	}
1632 
1633 	D1(vswp, "%s: exit", __func__);
1634 
1635 	return (MDEG_SUCCESS);
1636 }
1637 
1638 /*
1639  * Scan the machine description for this instance of vsw
1640  * and read its properties. Called only from vsw_attach().
1641  * Returns: 0 on success, 1 on failure.
1642  */
1643 static int
1644 vsw_read_mdprops(vsw_t *vswp)
1645 {
1646 	md_t		*mdp = NULL;
1647 	mde_cookie_t	rootnode;
1648 	mde_cookie_t	*listp = NULL;
1649 	uint64_t	inst;
1650 	uint64_t	cfgh;
1651 	char		*name;
1652 	int		rv = 1;
1653 	int		num_nodes = 0;
1654 	int		num_devs = 0;
1655 	int		listsz = 0;
1656 	int		i;
1657 
1658 	/*
1659 	 * In each 'virtual-device' node in the MD there is a
1660 	 * 'cfg-handle' property which is the MD's concept of
1661 	 * an instance number (this may be completely different from
1662 	 * the device drivers instance #). OBP reads that value and
1663 	 * stores it in the 'reg' property of the appropriate node in
1664 	 * the device tree. We first read this reg property and use this
1665 	 * to compare against the 'cfg-handle' property of vsw nodes
1666 	 * in MD to get to this specific vsw instance and then read
1667 	 * other properties that we are interested in.
1668 	 * We also cache the value of 'reg' property and use it later
1669 	 * to register callbacks with mdeg (see vsw_mdeg_register())
1670 	 */
1671 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
1672 	    DDI_PROP_DONTPASS, reg_propname, -1);
1673 	if (inst == -1) {
1674 		cmn_err(CE_NOTE, "!vsw%d: Unable to read %s property from "
1675 		    "OBP device tree", vswp->instance, reg_propname);
1676 		return (rv);
1677 	}
1678 
1679 	vswp->regprop = inst;
1680 
1681 	if ((mdp = md_get_handle()) == NULL) {
1682 		DWARN(vswp, "%s: cannot init MD\n", __func__);
1683 		return (rv);
1684 	}
1685 
1686 	num_nodes = md_node_count(mdp);
1687 	ASSERT(num_nodes > 0);
1688 
1689 	listsz = num_nodes * sizeof (mde_cookie_t);
1690 	listp = (mde_cookie_t *)kmem_zalloc(listsz, KM_SLEEP);
1691 
1692 	rootnode = md_root_node(mdp);
1693 
1694 	/* search for all "virtual_device" nodes */
1695 	num_devs = md_scan_dag(mdp, rootnode,
1696 	    md_find_name(mdp, vdev_propname),
1697 	    md_find_name(mdp, "fwd"), listp);
1698 	if (num_devs <= 0) {
1699 		DWARN(vswp, "%s: invalid num_devs:%d\n", __func__, num_devs);
1700 		goto vsw_readmd_exit;
1701 	}
1702 
1703 	/*
1704 	 * Now loop through the list of virtual-devices looking for
1705 	 * devices with name "virtual-network-switch" and for each
1706 	 * such device compare its instance with what we have from
1707 	 * the 'reg' property to find the right node in MD and then
1708 	 * read all its properties.
1709 	 */
1710 	for (i = 0; i < num_devs; i++) {
1711 
1712 		if (md_get_prop_str(mdp, listp[i], "name", &name) != 0) {
1713 			DWARN(vswp, "%s: name property not found\n",
1714 			    __func__);
1715 			goto vsw_readmd_exit;
1716 		}
1717 
1718 		/* is this a virtual-network-switch? */
1719 		if (strcmp(name, vsw_propname) != 0)
1720 			continue;
1721 
1722 		if (md_get_prop_val(mdp, listp[i], "cfg-handle", &cfgh) != 0) {
1723 			DWARN(vswp, "%s: cfg-handle property not found\n",
1724 			    __func__);
1725 			goto vsw_readmd_exit;
1726 		}
1727 
1728 		/* is this the required instance of vsw? */
1729 		if (inst != cfgh)
1730 			continue;
1731 
1732 		/* now read all properties of this vsw instance */
1733 		rv = vsw_get_initial_md_properties(vswp, mdp, listp[i]);
1734 		break;
1735 	}
1736 
1737 vsw_readmd_exit:
1738 
1739 	kmem_free(listp, listsz);
1740 	(void) md_fini_handle(mdp);
1741 	return (rv);
1742 }
1743 
1744 /*
1745  * Read the initial start-of-day values from the specified MD node.
1746  */
1747 static int
1748 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
1749 {
1750 	uint64_t	macaddr = 0;
1751 
1752 	D1(vswp, "%s: enter", __func__);
1753 
1754 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) != 0) {
1755 		return (1);
1756 	}
1757 
1758 	/* mac address for vswitch device itself */
1759 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
1760 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
1761 		    vswp->instance);
1762 		return (1);
1763 	}
1764 
1765 	vsw_save_lmacaddr(vswp, macaddr);
1766 
1767 	if (vsw_get_md_smodes(vswp, mdp, node, &vswp->smode)) {
1768 		DWARN(vswp, "%s: Unable to read %s property from MD, "
1769 		    "defaulting to 'switched' mode",
1770 		    __func__, smode_propname);
1771 
1772 		vswp->smode = VSW_LAYER2;
1773 	}
1774 
1775 	/*
1776 	 * Read the 'linkprop' property to know if this
1777 	 * vsw device wants to get physical link updates.
1778 	 */
1779 	vsw_linkprop_read(vswp, mdp, node, &vswp->pls_update);
1780 
1781 	/* read mtu */
1782 	vsw_mtu_read(vswp, mdp, node, &vswp->mtu);
1783 	if (vswp->mtu < ETHERMTU || vswp->mtu > VNET_MAX_MTU) {
1784 		vswp->mtu = ETHERMTU;
1785 	}
1786 	vswp->max_frame_size = vswp->mtu + sizeof (struct ether_header) +
1787 	    VLAN_TAGSZ;
1788 
1789 	/* read vlan id properties of this vsw instance */
1790 	vsw_vlan_read_ids(vswp, VSW_LOCALDEV, mdp, node, &vswp->pvid,
1791 	    &vswp->vids, &vswp->nvids, &vswp->default_vlan_id);
1792 
1793 	/* read priority-ether-types */
1794 	vsw_read_pri_eth_types(vswp, mdp, node);
1795 
1796 	/* read bandwidth property of this vsw instance */
1797 	vsw_bandwidth_read(vswp, mdp, node, &vswp->bandwidth);
1798 
1799 	D1(vswp, "%s: exit", __func__);
1800 	return (0);
1801 }
1802 
1803 /*
1804  * Read vlan id properties of the given MD node.
1805  * Arguments:
1806  *   arg:          device argument(vsw device or a port)
1807  *   type:         type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port)
1808  *   mdp:          machine description
1809  *   node:         md node cookie
1810  *
1811  * Returns:
1812  *   pvidp:        port-vlan-id of the node
1813  *   vidspp:       list of vlan-ids of the node
1814  *   nvidsp:       # of vlan-ids in the list
1815  *   default_idp:  default-vlan-id of the node(if node is vsw device)
1816  */
1817 static void
1818 vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node,
1819 	uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp,
1820 	uint16_t *default_idp)
1821 {
1822 	vsw_t		*vswp;
1823 	vsw_port_t	*portp;
1824 	char		*pvid_propname;
1825 	char		*vid_propname;
1826 	uint_t		nvids = 0;
1827 	uint32_t	vids_size;
1828 	int		rv;
1829 	int		i;
1830 	uint64_t	*data;
1831 	uint64_t	val;
1832 	int		size;
1833 	int		inst;
1834 
1835 	if (type == VSW_LOCALDEV) {
1836 
1837 		vswp = (vsw_t *)arg;
1838 		pvid_propname = vsw_pvid_propname;
1839 		vid_propname = vsw_vid_propname;
1840 		inst = vswp->instance;
1841 
1842 	} else if (type == VSW_VNETPORT) {
1843 
1844 		portp = (vsw_port_t *)arg;
1845 		vswp = portp->p_vswp;
1846 		pvid_propname = port_pvid_propname;
1847 		vid_propname = port_vid_propname;
1848 		inst = portp->p_instance;
1849 
1850 	} else {
1851 		return;
1852 	}
1853 
1854 	if (type == VSW_LOCALDEV && default_idp != NULL) {
1855 		rv = md_get_prop_val(mdp, node, vsw_dvid_propname, &val);
1856 		if (rv != 0) {
1857 			DWARN(vswp, "%s: prop(%s) not found", __func__,
1858 			    vsw_dvid_propname);
1859 
1860 			*default_idp = vsw_default_vlan_id;
1861 		} else {
1862 			*default_idp = val & 0xFFF;
1863 			D2(vswp, "%s: %s(%d): (%d)\n", __func__,
1864 			    vsw_dvid_propname, inst, *default_idp);
1865 		}
1866 	}
1867 
1868 	rv = md_get_prop_val(mdp, node, pvid_propname, &val);
1869 	if (rv != 0) {
1870 		DWARN(vswp, "%s: prop(%s) not found", __func__, pvid_propname);
1871 		*pvidp = vsw_default_vlan_id;
1872 	} else {
1873 
1874 		*pvidp = val & 0xFFF;
1875 		D2(vswp, "%s: %s(%d): (%d)\n", __func__,
1876 		    pvid_propname, inst, *pvidp);
1877 	}
1878 
1879 	rv = md_get_prop_data(mdp, node, vid_propname, (uint8_t **)&data,
1880 	    &size);
1881 	if (rv != 0) {
1882 		D2(vswp, "%s: prop(%s) not found", __func__, vid_propname);
1883 		size = 0;
1884 	} else {
1885 		size /= sizeof (uint64_t);
1886 	}
1887 	nvids = size;
1888 
1889 	if (nvids != 0) {
1890 		D2(vswp, "%s: %s(%d): ", __func__, vid_propname, inst);
1891 		vids_size = sizeof (vsw_vlanid_t) * nvids;
1892 		*vidspp = kmem_zalloc(vids_size, KM_SLEEP);
1893 		for (i = 0; i < nvids; i++) {
1894 			(*vidspp)[i].vl_vid = data[i] & 0xFFFF;
1895 			(*vidspp)[i].vl_set = B_FALSE;
1896 			D2(vswp, " %d ", (*vidspp)[i].vl_vid);
1897 		}
1898 		D2(vswp, "\n");
1899 	}
1900 
1901 	*nvidsp = nvids;
1902 }
1903 
1904 static void
1905 vsw_port_read_bandwidth(vsw_port_t *portp, md_t *mdp, mde_cookie_t node,
1906     uint64_t *bw)
1907 {
1908 	int		rv;
1909 	uint64_t	val;
1910 	vsw_t		*vswp;
1911 
1912 	vswp = portp->p_vswp;
1913 
1914 	rv = md_get_prop_val(mdp, node, port_maxbw_propname, &val);
1915 
1916 	if (rv != 0) {
1917 		*bw = 0;
1918 		D3(vswp, "%s: prop(%s) not found\n", __func__,
1919 		    port_maxbw_propname);
1920 	} else {
1921 		*bw = val;
1922 		D3(vswp, "%s: %s nodes found", __func__, port_maxbw_propname);
1923 	}
1924 }
1925 
1926 /*
1927  * This function reads "priority-ether-types" property from md. This property
1928  * is used to enable support for priority frames. Applications which need
1929  * guaranteed and timely delivery of certain high priority frames to/from
1930  * a vnet or vsw within ldoms, should configure this property by providing
1931  * the ether type(s) for which the priority facility is needed.
1932  * Normal data frames are delivered over a ldc channel using the descriptor
1933  * ring mechanism which is constrained by factors such as descriptor ring size,
1934  * the rate at which the ring is processed at the peer ldc end point, etc.
1935  * The priority mechanism provides an Out-Of-Band path to send/receive frames
1936  * as raw pkt data (VIO_PKT_DATA) messages over the channel, avoiding the
1937  * descriptor ring path and enables a more reliable and timely delivery of
1938  * frames to the peer.
1939  */
1940 static void
1941 vsw_read_pri_eth_types(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
1942 {
1943 	int		rv;
1944 	uint16_t	*types;
1945 	uint64_t	*data;
1946 	int		size;
1947 	int		i;
1948 	size_t		mblk_sz;
1949 
1950 	rv = md_get_prop_data(mdp, node, pri_types_propname,
1951 	    (uint8_t **)&data, &size);
1952 	if (rv != 0) {
1953 		/*
1954 		 * Property may not exist if we are running pre-ldoms1.1 f/w.
1955 		 * Check if 'vsw_pri_eth_type' has been set in that case.
1956 		 */
1957 		if (vsw_pri_eth_type != 0) {
1958 			size = sizeof (vsw_pri_eth_type);
1959 			data = &vsw_pri_eth_type;
1960 		} else {
1961 			D3(vswp, "%s: prop(%s) not found", __func__,
1962 			    pri_types_propname);
1963 			size = 0;
1964 		}
1965 	}
1966 
1967 	if (size == 0) {
1968 		vswp->pri_num_types = 0;
1969 		return;
1970 	}
1971 
1972 	/*
1973 	 * we have some priority-ether-types defined;
1974 	 * allocate a table of these types and also
1975 	 * allocate a pool of mblks to transmit these
1976 	 * priority packets.
1977 	 */
1978 	size /= sizeof (uint64_t);
1979 	vswp->pri_num_types = size;
1980 	vswp->pri_types = kmem_zalloc(size * sizeof (uint16_t), KM_SLEEP);
1981 	for (i = 0, types = vswp->pri_types; i < size; i++) {
1982 		types[i] = data[i] & 0xFFFF;
1983 	}
1984 	mblk_sz = (VIO_PKT_DATA_HDRSIZE + ETHERMAX + 7) & ~7;
1985 	(void) vio_create_mblks(vsw_pri_tx_nmblks, mblk_sz, NULL,
1986 	    &vswp->pri_tx_vmp);
1987 }
1988 
1989 static void
1990 vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint32_t *mtu)
1991 {
1992 	int		rv;
1993 	int		inst;
1994 	uint64_t	val;
1995 	char		*mtu_propname;
1996 
1997 	mtu_propname = vsw_mtu_propname;
1998 	inst = vswp->instance;
1999 
2000 	rv = md_get_prop_val(mdp, node, mtu_propname, &val);
2001 	if (rv != 0) {
2002 		D3(vswp, "%s: prop(%s) not found", __func__, mtu_propname);
2003 		*mtu = vsw_ethermtu;
2004 	} else {
2005 
2006 		*mtu = val & 0xFFFF;
2007 		D2(vswp, "%s: %s(%d): (%d)\n", __func__,
2008 		    mtu_propname, inst, *mtu);
2009 	}
2010 }
2011 
2012 /*
2013  * Update the mtu of the vsw device. We first check if the device has been
2014  * plumbed and if so fail the mtu update. Otherwise, we continue to update the
2015  * new mtu and reset all ports to initiate handshake re-negotiation with peers
2016  * using the new mtu.
2017  */
2018 static int
2019 vsw_mtu_update(vsw_t *vswp, uint32_t mtu)
2020 {
2021 	int	rv;
2022 
2023 	WRITE_ENTER(&vswp->if_lockrw);
2024 
2025 	if (vswp->if_state & VSW_IF_UP) {
2026 
2027 		RW_EXIT(&vswp->if_lockrw);
2028 
2029 		cmn_err(CE_NOTE, "!vsw%d: Unable to process mtu update"
2030 		    " as the device is plumbed\n", vswp->instance);
2031 		return (EBUSY);
2032 
2033 	} else {
2034 
2035 		D2(vswp, "%s: curr_mtu(%d) new_mtu(%d)\n",
2036 		    __func__, vswp->mtu, mtu);
2037 
2038 		vswp->mtu = mtu;
2039 		vswp->max_frame_size = vswp->mtu +
2040 		    sizeof (struct ether_header) + VLAN_TAGSZ;
2041 
2042 		rv = mac_maxsdu_update(vswp->if_mh, mtu);
2043 		if (rv != 0) {
2044 			cmn_err(CE_NOTE,
2045 			    "!vsw%d: Unable to update mtu with mac"
2046 			    " layer\n", vswp->instance);
2047 		}
2048 
2049 		RW_EXIT(&vswp->if_lockrw);
2050 
2051 		/* Reset ports to renegotiate with the new mtu */
2052 		vsw_reset_ports(vswp);
2053 
2054 	}
2055 
2056 	return (0);
2057 }
2058 
2059 static void
2060 vsw_linkprop_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
2061 	boolean_t *pls)
2062 {
2063 	int		rv;
2064 	uint64_t	val;
2065 	char		*linkpropname;
2066 
2067 	linkpropname = vsw_linkprop_propname;
2068 
2069 	rv = md_get_prop_val(mdp, node, linkpropname, &val);
2070 	if (rv != 0) {
2071 		D3(vswp, "%s: prop(%s) not found", __func__, linkpropname);
2072 		*pls = B_FALSE;
2073 	} else {
2074 
2075 		*pls = (val & 0x1) ? B_TRUE : B_FALSE;
2076 		D2(vswp, "%s: %s(%d): (%d)\n", __func__, linkpropname,
2077 		    vswp->instance, *pls);
2078 	}
2079 }
2080 
2081 void
2082 vsw_mac_link_update(vsw_t *vswp, link_state_t link_state)
2083 {
2084 	READ_ENTER(&vswp->if_lockrw);
2085 
2086 	if (vswp->if_state & VSW_IF_REG) {
2087 		mac_link_update(vswp->if_mh, link_state);
2088 	}
2089 
2090 	RW_EXIT(&vswp->if_lockrw);
2091 }
2092 
2093 void
2094 vsw_physlink_state_update(vsw_t *vswp)
2095 {
2096 	if (vswp->pls_update == B_TRUE) {
2097 		vsw_mac_link_update(vswp, vswp->phys_link_state);
2098 	}
2099 	vsw_physlink_update_ports(vswp);
2100 }
2101 
2102 static void
2103 vsw_bandwidth_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint64_t *bw)
2104 {
2105 	/* read the vsw bandwidth from md */
2106 	int		rv;
2107 	uint64_t	val;
2108 
2109 	rv = md_get_prop_val(mdp, node, vsw_maxbw_propname, &val);
2110 	if (rv != 0) {
2111 		*bw = 0;
2112 		D3(vswp, "%s: prop(%s) not found", __func__,
2113 		    vsw_maxbw_propname);
2114 	} else {
2115 		*bw = val;
2116 		D3(vswp, "%s: %s(%d): (%ld)\n", __func__,
2117 		    vsw_maxbw_propname, vswp->instance, *bw);
2118 	}
2119 }
2120 
2121 /*
2122  * Check to see if the relevant properties in the specified node have
2123  * changed, and if so take the appropriate action.
2124  *
2125  * If any of the properties are missing or invalid we don't take
2126  * any action, as this function should only be invoked when modifications
2127  * have been made to what we assume is a working configuration, which
2128  * we leave active.
2129  *
2130  * Note it is legal for this routine to be invoked even if none of the
2131  * properties in the port node within the MD have actually changed.
2132  */
2133 static void
2134 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2135 {
2136 	char		physname[LIFNAMSIZ];
2137 	char		drv[LIFNAMSIZ];
2138 	uint_t		ddi_instance;
2139 	uint8_t		new_smode;
2140 	int		i;
2141 	uint64_t 	macaddr = 0;
2142 	enum		{MD_init = 0x1,
2143 				MD_physname = 0x2,
2144 				MD_macaddr = 0x4,
2145 				MD_smode = 0x8,
2146 				MD_vlans = 0x10,
2147 				MD_mtu = 0x20,
2148 				MD_pls = 0x40,
2149 				MD_bw = 0x80} updated;
2150 	int		rv;
2151 	uint16_t	pvid;
2152 	vsw_vlanid_t	*vids;
2153 	uint16_t	nvids;
2154 	uint32_t	mtu;
2155 	boolean_t	pls_update;
2156 	uint64_t	maxbw;
2157 
2158 	updated = MD_init;
2159 
2160 	D1(vswp, "%s: enter", __func__);
2161 
2162 	/*
2163 	 * Check if name of physical device in MD has changed.
2164 	 */
2165 	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
2166 		/*
2167 		 * Do basic sanity check on new device name/instance,
2168 		 * if its non NULL. It is valid for the device name to
2169 		 * have changed from a non NULL to a NULL value, i.e.
2170 		 * the vsw is being changed to 'routed' mode.
2171 		 */
2172 		if ((strlen(physname) != 0) &&
2173 		    (ddi_parse(physname, drv,
2174 		    &ddi_instance) != DDI_SUCCESS)) {
2175 			cmn_err(CE_WARN, "!vsw%d: physical device %s is not"
2176 			    " a valid device name/instance",
2177 			    vswp->instance, physname);
2178 			goto fail_reconf;
2179 		}
2180 
2181 		if (strcmp(physname, vswp->physname)) {
2182 			D2(vswp, "%s: device name changed from %s to %s",
2183 			    __func__, vswp->physname, physname);
2184 
2185 			updated |= MD_physname;
2186 		} else {
2187 			D2(vswp, "%s: device name unchanged at %s",
2188 			    __func__, vswp->physname);
2189 		}
2190 	} else {
2191 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2192 		    "device from updated MD.", vswp->instance);
2193 		goto fail_reconf;
2194 	}
2195 
2196 	/*
2197 	 * Check if MAC address has changed.
2198 	 */
2199 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2200 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2201 		    vswp->instance);
2202 		goto fail_reconf;
2203 	} else {
2204 		uint64_t maddr = macaddr;
2205 		READ_ENTER(&vswp->if_lockrw);
2206 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2207 			if (vswp->if_addr.ether_addr_octet[i]
2208 			    != (macaddr & 0xFF)) {
2209 				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
2210 				    __func__, i,
2211 				    vswp->if_addr.ether_addr_octet[i],
2212 				    (macaddr & 0xFF));
2213 				updated |= MD_macaddr;
2214 				macaddr = maddr;
2215 				break;
2216 			}
2217 			macaddr >>= 8;
2218 		}
2219 		RW_EXIT(&vswp->if_lockrw);
2220 		if (updated & MD_macaddr) {
2221 			vsw_save_lmacaddr(vswp, macaddr);
2222 		}
2223 	}
2224 
2225 	/*
2226 	 * Check if switching modes have changed.
2227 	 */
2228 	if (vsw_get_md_smodes(vswp, mdp, node, &new_smode)) {
2229 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
2230 		    vswp->instance, smode_propname);
2231 		goto fail_reconf;
2232 	} else {
2233 		if (new_smode != vswp->smode) {
2234 			D2(vswp, "%s: switching mode changed from %d to %d",
2235 			    __func__, vswp->smode, new_smode);
2236 
2237 			updated |= MD_smode;
2238 		}
2239 	}
2240 
2241 	/* Read the vlan ids */
2242 	vsw_vlan_read_ids(vswp, VSW_LOCALDEV, mdp, node, &pvid, &vids,
2243 	    &nvids, NULL);
2244 
2245 	/* Determine if there are any vlan id updates */
2246 	if ((pvid != vswp->pvid) ||		/* pvid changed? */
2247 	    (nvids != vswp->nvids) ||		/* # of vids changed? */
2248 	    ((nvids != 0) && (vswp->nvids != 0) &&	/* vids changed? */
2249 	    !vsw_cmp_vids(vids, vswp->vids, nvids))) {
2250 		updated |= MD_vlans;
2251 	}
2252 
2253 	/* Read mtu */
2254 	vsw_mtu_read(vswp, mdp, node, &mtu);
2255 	if (mtu != vswp->mtu) {
2256 		if (mtu >= ETHERMTU && mtu <= VNET_MAX_MTU) {
2257 			updated |= MD_mtu;
2258 		} else {
2259 			cmn_err(CE_NOTE, "!vsw%d: Unable to process mtu update"
2260 			    " as the specified value:%d is invalid\n",
2261 			    vswp->instance, mtu);
2262 		}
2263 	}
2264 
2265 	/*
2266 	 * Read the 'linkprop' property.
2267 	 */
2268 	vsw_linkprop_read(vswp, mdp, node, &pls_update);
2269 	if (pls_update != vswp->pls_update) {
2270 		updated |= MD_pls;
2271 	}
2272 
2273 	/* Read bandwidth */
2274 	vsw_bandwidth_read(vswp, mdp, node, &maxbw);
2275 	if (maxbw != vswp->bandwidth) {
2276 		if (maxbw >= MRP_MAXBW_MINVAL || maxbw == 0) {
2277 			updated |= MD_bw;
2278 		} else {
2279 			cmn_err(CE_NOTE, "!vsw%d: Unable to process bandwidth"
2280 			    " update as the specified value:%ld is invalid\n",
2281 			    vswp->instance, maxbw);
2282 		}
2283 	}
2284 
2285 	/*
2286 	 * Now make any changes which are needed...
2287 	 */
2288 	if (updated & MD_pls) {
2289 
2290 		/* save the updated property. */
2291 		vswp->pls_update = pls_update;
2292 
2293 		if (pls_update == B_FALSE) {
2294 			/*
2295 			 * Phys link state update is now disabled for this vsw
2296 			 * interface. If we had previously reported a link-down
2297 			 * to the stack, undo that by sending a link-up.
2298 			 */
2299 			if (vswp->phys_link_state == LINK_STATE_DOWN) {
2300 				vsw_mac_link_update(vswp, LINK_STATE_UP);
2301 			}
2302 		} else {
2303 			/*
2304 			 * Phys link state update is now enabled. Send up an
2305 			 * update based on the current phys link state.
2306 			 */
2307 			if (vswp->smode & VSW_LAYER2) {
2308 				vsw_mac_link_update(vswp,
2309 				    vswp->phys_link_state);
2310 			}
2311 		}
2312 
2313 	}
2314 
2315 	if (updated & (MD_physname | MD_smode | MD_mtu)) {
2316 
2317 		/*
2318 		 * Stop any pending thread to setup switching mode.
2319 		 */
2320 		vsw_setup_switching_stop(vswp);
2321 
2322 		/* Cleanup HybridIO */
2323 		vsw_hio_cleanup(vswp);
2324 
2325 		/*
2326 		 * Remove unicst, mcst addrs of vsw interface
2327 		 * and ports from the physdev. This also closes
2328 		 * the corresponding mac clients.
2329 		 */
2330 		vsw_unset_addrs(vswp);
2331 
2332 		/*
2333 		 * Stop, detach and close the old device..
2334 		 */
2335 		mutex_enter(&vswp->mac_lock);
2336 		vsw_mac_close(vswp);
2337 		mutex_exit(&vswp->mac_lock);
2338 
2339 		/*
2340 		 * Update phys name.
2341 		 */
2342 		if (updated & MD_physname) {
2343 			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
2344 			    vswp->instance, vswp->physname, physname);
2345 			(void) strncpy(vswp->physname,
2346 			    physname, strlen(physname) + 1);
2347 		}
2348 
2349 		/*
2350 		 * Update array with the new switch mode values.
2351 		 */
2352 		if (updated & MD_smode) {
2353 			vswp->smode = new_smode;
2354 		}
2355 
2356 		/* Update mtu */
2357 		if (updated & MD_mtu) {
2358 			rv = vsw_mtu_update(vswp, mtu);
2359 			if (rv != 0) {
2360 				goto fail_update;
2361 			}
2362 		}
2363 
2364 		/*
2365 		 * ..and attach, start the new device.
2366 		 */
2367 		rv = vsw_setup_switching(vswp);
2368 		if (rv == EAGAIN) {
2369 			/*
2370 			 * Unable to setup switching mode.
2371 			 * As the error is EAGAIN, schedule a thread to retry
2372 			 * and return. Programming addresses of ports and
2373 			 * vsw interface will be done by the thread when the
2374 			 * switching setup completes successfully.
2375 			 */
2376 			if (vsw_setup_switching_start(vswp) != 0) {
2377 				goto fail_update;
2378 			}
2379 			return;
2380 
2381 		} else if (rv) {
2382 			goto fail_update;
2383 		}
2384 
2385 		vsw_setup_switching_post_process(vswp);
2386 	} else if (updated & MD_macaddr) {
2387 		/*
2388 		 * We enter here if only MD_macaddr is exclusively updated.
2389 		 * If MD_physname and/or MD_smode are also updated, then
2390 		 * as part of that, we would have implicitly processed
2391 		 * MD_macaddr update (above).
2392 		 */
2393 		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
2394 		    vswp->instance, macaddr);
2395 
2396 		READ_ENTER(&vswp->if_lockrw);
2397 		if (vswp->if_state & VSW_IF_UP) {
2398 			/* reconfigure with new address */
2399 			vsw_if_mac_reconfig(vswp, B_FALSE, 0, NULL, 0);
2400 
2401 			/*
2402 			 * Notify the MAC layer of the changed address.
2403 			 */
2404 			mac_unicst_update(vswp->if_mh,
2405 			    (uint8_t *)&vswp->if_addr);
2406 
2407 		}
2408 		RW_EXIT(&vswp->if_lockrw);
2409 
2410 	}
2411 
2412 	if (updated & MD_vlans) {
2413 		/* Remove existing vlan ids from the hash table. */
2414 		vsw_vlan_remove_ids(vswp, VSW_LOCALDEV);
2415 
2416 		if (vswp->if_state & VSW_IF_UP) {
2417 			vsw_if_mac_reconfig(vswp, B_TRUE, pvid, vids, nvids);
2418 		} else {
2419 			if (vswp->nvids != 0) {
2420 				kmem_free(vswp->vids,
2421 				    sizeof (vsw_vlanid_t) * vswp->nvids);
2422 			}
2423 			vswp->vids = vids;
2424 			vswp->nvids = nvids;
2425 			vswp->pvid = pvid;
2426 		}
2427 
2428 		/* add these new vlan ids into hash table */
2429 		vsw_vlan_add_ids(vswp, VSW_LOCALDEV);
2430 	} else {
2431 		if (nvids != 0) {
2432 			kmem_free(vids, sizeof (vsw_vlanid_t) * nvids);
2433 		}
2434 	}
2435 
2436 	if (updated & MD_bw) {
2437 		vsw_update_bandwidth(vswp, NULL, VSW_LOCALDEV, maxbw);
2438 	}
2439 
2440 	return;
2441 
2442 fail_reconf:
2443 	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
2444 	return;
2445 
2446 fail_update:
2447 	cmn_err(CE_WARN, "!vsw%d: re-configuration failed",
2448 	    vswp->instance);
2449 }
2450 
2451 /*
2452  * Read the port's md properties.
2453  */
2454 static int
2455 vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp,
2456 	md_t *mdp, mde_cookie_t *node)
2457 {
2458 	uint64_t		ldc_id;
2459 	uint8_t			*addrp;
2460 	int			i, addrsz;
2461 	int			num_nodes = 0, nchan = 0;
2462 	int			listsz = 0;
2463 	mde_cookie_t		*listp = NULL;
2464 	struct ether_addr	ea;
2465 	uint64_t		macaddr;
2466 	uint64_t		inst = 0;
2467 	uint64_t		val;
2468 
2469 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
2470 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2471 		    id_propname);
2472 		return (1);
2473 	}
2474 
2475 	/*
2476 	 * Find the channel endpoint node(s) (which should be under this
2477 	 * port node) which contain the channel id(s).
2478 	 */
2479 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2480 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2481 		    __func__, num_nodes);
2482 		return (1);
2483 	}
2484 
2485 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
2486 
2487 	/* allocate enough space for node list */
2488 	listsz = num_nodes * sizeof (mde_cookie_t);
2489 	listp = kmem_zalloc(listsz, KM_SLEEP);
2490 
2491 	nchan = md_scan_dag(mdp, *node, md_find_name(mdp, chan_propname),
2492 	    md_find_name(mdp, "fwd"), listp);
2493 
2494 	if (nchan <= 0) {
2495 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2496 		kmem_free(listp, listsz);
2497 		return (1);
2498 	}
2499 
2500 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2501 
2502 	/* use property from first node found */
2503 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2504 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
2505 		    id_propname);
2506 		kmem_free(listp, listsz);
2507 		return (1);
2508 	}
2509 
2510 	/* don't need list any more */
2511 	kmem_free(listp, listsz);
2512 
2513 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2514 
2515 	/* read mac-address property */
2516 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2517 	    &addrp, &addrsz)) {
2518 		DWARN(vswp, "%s: prop(%s) not found",
2519 		    __func__, remaddr_propname);
2520 		return (1);
2521 	}
2522 
2523 	if (addrsz < ETHERADDRL) {
2524 		DWARN(vswp, "%s: invalid address size", __func__);
2525 		return (1);
2526 	}
2527 
2528 	macaddr = *((uint64_t *)addrp);
2529 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2530 
2531 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2532 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2533 		macaddr >>= 8;
2534 	}
2535 
2536 	/* now update all properties into the port */
2537 	portp->p_vswp = vswp;
2538 	portp->p_instance = inst;
2539 	portp->addr_set = B_FALSE;
2540 	ether_copy(&ea, &portp->p_macaddr);
2541 	if (nchan > VSW_PORT_MAX_LDCS) {
2542 		D2(vswp, "%s: using first of %d ldc ids",
2543 		    __func__, nchan);
2544 		nchan = VSW_PORT_MAX_LDCS;
2545 	}
2546 	portp->num_ldcs = nchan;
2547 	portp->ldc_ids =
2548 	    kmem_zalloc(sizeof (uint64_t) * nchan, KM_SLEEP);
2549 	bcopy(&ldc_id, (portp->ldc_ids), sizeof (uint64_t) * nchan);
2550 
2551 	/* read vlan id properties of this port node */
2552 	vsw_vlan_read_ids(portp, VSW_VNETPORT, mdp, *node, &portp->pvid,
2553 	    &portp->vids, &portp->nvids, NULL);
2554 
2555 	/* Check if hybrid property is present */
2556 	if (md_get_prop_val(mdp, *node, hybrid_propname, &val) == 0) {
2557 		D1(vswp, "%s: prop(%s) found\n", __func__, hybrid_propname);
2558 		portp->p_hio_enabled = B_TRUE;
2559 	} else {
2560 		portp->p_hio_enabled = B_FALSE;
2561 	}
2562 	/*
2563 	 * Port hio capability determined after version
2564 	 * negotiation, i.e., when we know the peer is HybridIO capable.
2565 	 */
2566 	portp->p_hio_capable = B_FALSE;
2567 
2568 	/* Read bandwidth of this port */
2569 	vsw_port_read_bandwidth(portp, mdp, *node, &portp->p_bandwidth);
2570 
2571 	return (0);
2572 }
2573 
2574 /*
2575  * Add a new port to the system.
2576  *
2577  * Returns 0 on success, 1 on failure.
2578  */
2579 int
2580 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
2581 {
2582 	vsw_port_t	*portp;
2583 	int		rv;
2584 
2585 	portp = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
2586 
2587 	rv = vsw_port_read_props(portp, vswp, mdp, node);
2588 	if (rv != 0) {
2589 		kmem_free(portp, sizeof (*portp));
2590 		return (1);
2591 	}
2592 
2593 	rv = vsw_port_attach(portp);
2594 	if (rv != 0) {
2595 		DERR(vswp, "%s: failed to attach port", __func__);
2596 		return (1);
2597 	}
2598 
2599 	return (0);
2600 }
2601 
2602 static int
2603 vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
2604 	md_t *prev_mdp, mde_cookie_t prev_mdex)
2605 {
2606 	uint64_t	cport_num;
2607 	uint64_t	pport_num;
2608 	vsw_port_list_t	*plistp;
2609 	vsw_port_t	*portp;
2610 	uint16_t	pvid;
2611 	vsw_vlanid_t	*vids;
2612 	uint16_t	nvids;
2613 	uint64_t	val;
2614 	boolean_t	hio_enabled = B_FALSE;
2615 	uint64_t	maxbw;
2616 	enum		{P_MD_init = 0x1,
2617 				P_MD_vlans = 0x2,
2618 				P_MD_hio = 0x4,
2619 				P_MD_maxbw = 0x8} updated;
2620 
2621 	updated = P_MD_init;
2622 
2623 	/*
2624 	 * For now, we get port updates only if vlan ids changed.
2625 	 * We read the port num and do some sanity check.
2626 	 */
2627 	if (md_get_prop_val(curr_mdp, curr_mdex, id_propname, &cport_num)) {
2628 		return (1);
2629 	}
2630 
2631 	if (md_get_prop_val(prev_mdp, prev_mdex, id_propname, &pport_num)) {
2632 		return (1);
2633 	}
2634 	if (cport_num != pport_num)
2635 		return (1);
2636 
2637 	plistp = &(vswp->plist);
2638 
2639 	READ_ENTER(&plistp->lockrw);
2640 
2641 	portp = vsw_lookup_port(vswp, cport_num);
2642 	if (portp == NULL) {
2643 		RW_EXIT(&plistp->lockrw);
2644 		return (1);
2645 	}
2646 
2647 	/* Read the vlan ids */
2648 	vsw_vlan_read_ids(portp, VSW_VNETPORT, curr_mdp, curr_mdex, &pvid,
2649 	    &vids, &nvids, NULL);
2650 
2651 	/* Determine if there are any vlan id updates */
2652 	if ((pvid != portp->pvid) ||		/* pvid changed? */
2653 	    (nvids != portp->nvids) ||		/* # of vids changed? */
2654 	    ((nvids != 0) && (portp->nvids != 0) &&	/* vids changed? */
2655 	    !vsw_cmp_vids(vids, portp->vids, nvids))) {
2656 		updated |= P_MD_vlans;
2657 	}
2658 
2659 	/* Check if hybrid property is present */
2660 	if (md_get_prop_val(curr_mdp, curr_mdex, hybrid_propname, &val) == 0) {
2661 		D1(vswp, "%s: prop(%s) found\n", __func__, hybrid_propname);
2662 		hio_enabled = B_TRUE;
2663 	}
2664 
2665 	if (portp->p_hio_enabled != hio_enabled) {
2666 		updated |= P_MD_hio;
2667 	}
2668 
2669 	/* Check if maxbw property is present */
2670 	vsw_port_read_bandwidth(portp, curr_mdp, curr_mdex, &maxbw);
2671 	if (maxbw != portp->p_bandwidth) {
2672 		if (maxbw >= MRP_MAXBW_MINVAL || maxbw == 0) {
2673 			updated |= P_MD_maxbw;
2674 		} else {
2675 			cmn_err(CE_NOTE, "!vsw%d: Unable to process bandwidth"
2676 			    " update for port %d as the specified value:%ld"
2677 			    " is invalid\n",
2678 			    vswp->instance, portp->p_instance, maxbw);
2679 		}
2680 	}
2681 
2682 	if (updated & P_MD_vlans) {
2683 		/* Remove existing vlan ids from the hash table. */
2684 		vsw_vlan_remove_ids(portp, VSW_VNETPORT);
2685 
2686 		/* Reconfigure vlans with network device */
2687 		vsw_mac_port_reconfig_vlans(portp, pvid, vids, nvids);
2688 
2689 		/* add these new vlan ids into hash table */
2690 		vsw_vlan_add_ids(portp, VSW_VNETPORT);
2691 
2692 		/* reset the port if it is vlan unaware (ver < 1.3) */
2693 		vsw_vlan_unaware_port_reset(portp);
2694 	}
2695 
2696 	if (updated & P_MD_hio) {
2697 		vsw_hio_port_update(portp, hio_enabled);
2698 	}
2699 
2700 	if (updated & P_MD_maxbw) {
2701 		vsw_update_bandwidth(NULL, portp, VSW_VNETPORT, maxbw);
2702 	}
2703 
2704 	RW_EXIT(&plistp->lockrw);
2705 
2706 	return (0);
2707 }
2708 
2709 /*
2710  * vsw_mac_rx -- A common function to send packets to the interface.
2711  * By default this function check if the interface is UP or not, the
2712  * rest of the behaviour depends on the flags as below:
2713  *
2714  *	VSW_MACRX_PROMISC -- Check if the promisc mode set or not.
2715  *	VSW_MACRX_COPYMSG -- Make a copy of the message(s).
2716  *	VSW_MACRX_FREEMSG -- Free if the messages cannot be sent up the stack.
2717  */
2718 void
2719 vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
2720     mblk_t *mp, vsw_macrx_flags_t flags)
2721 {
2722 	mblk_t		*mpt;
2723 
2724 	D1(vswp, "%s:enter\n", __func__);
2725 	READ_ENTER(&vswp->if_lockrw);
2726 	/* Check if the interface is up */
2727 	if (!(vswp->if_state & VSW_IF_UP)) {
2728 		RW_EXIT(&vswp->if_lockrw);
2729 		/* Free messages only if FREEMSG flag specified */
2730 		if (flags & VSW_MACRX_FREEMSG) {
2731 			freemsgchain(mp);
2732 		}
2733 		D1(vswp, "%s:exit\n", __func__);
2734 		return;
2735 	}
2736 	/*
2737 	 * If PROMISC flag is passed, then check if
2738 	 * the interface is in the PROMISC mode.
2739 	 * If not, drop the messages.
2740 	 */
2741 	if (flags & VSW_MACRX_PROMISC) {
2742 		if (!(vswp->if_state & VSW_IF_PROMISC)) {
2743 			RW_EXIT(&vswp->if_lockrw);
2744 			/* Free messages only if FREEMSG flag specified */
2745 			if (flags & VSW_MACRX_FREEMSG) {
2746 				freemsgchain(mp);
2747 			}
2748 			D1(vswp, "%s:exit\n", __func__);
2749 			return;
2750 		}
2751 	}
2752 	RW_EXIT(&vswp->if_lockrw);
2753 	/*
2754 	 * If COPYMSG flag is passed, then make a copy
2755 	 * of the message chain and send up the copy.
2756 	 */
2757 	if (flags & VSW_MACRX_COPYMSG) {
2758 		mp = copymsgchain(mp);
2759 		if (mp == NULL) {
2760 			D1(vswp, "%s:exit\n", __func__);
2761 			return;
2762 		}
2763 	}
2764 
2765 	D2(vswp, "%s: sending up stack", __func__);
2766 
2767 	mpt = NULL;
2768 	(void) vsw_vlan_frame_untag(vswp, VSW_LOCALDEV, &mp, &mpt);
2769 	if (mp != NULL) {
2770 		mac_rx(vswp->if_mh, mrh, mp);
2771 	}
2772 	D1(vswp, "%s:exit\n", __func__);
2773 }
2774 
2775 /* copy mac address of vsw into soft state structure */
2776 static void
2777 vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr)
2778 {
2779 	int	i;
2780 
2781 	WRITE_ENTER(&vswp->if_lockrw);
2782 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2783 		vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2784 		macaddr >>= 8;
2785 	}
2786 	RW_EXIT(&vswp->if_lockrw);
2787 }
2788 
2789 /* Compare VLAN ids, array size expected to be same. */
2790 static boolean_t
2791 vsw_cmp_vids(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids)
2792 {
2793 	int i, j;
2794 	uint16_t vid;
2795 
2796 	for (i = 0; i < nvids; i++) {
2797 		vid = vids1[i].vl_vid;
2798 		for (j = 0; j < nvids; j++) {
2799 			if (vid == vids2[i].vl_vid)
2800 				break;
2801 		}
2802 		if (j == nvids) {
2803 			return (B_FALSE);
2804 		}
2805 	}
2806 	return (B_TRUE);
2807 }
2808