xref: /titanic_44/usr/src/uts/sun4v/io/vsw.c (revision 0db3240d392634cfff2f95fb6da34b56b8dc574f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/debug.h>
30 #include <sys/time.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/user.h>
34 #include <sys/stropts.h>
35 #include <sys/stream.h>
36 #include <sys/strlog.h>
37 #include <sys/strsubr.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cpu.h>
40 #include <sys/kmem.h>
41 #include <sys/conf.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/ksynch.h>
45 #include <sys/stat.h>
46 #include <sys/kstat.h>
47 #include <sys/vtrace.h>
48 #include <sys/strsun.h>
49 #include <sys/dlpi.h>
50 #include <sys/ethernet.h>
51 #include <net/if.h>
52 #include <sys/varargs.h>
53 #include <sys/machsystm.h>
54 #include <sys/modctl.h>
55 #include <sys/modhash.h>
56 #include <sys/mac_provider.h>
57 #include <sys/mac_ether.h>
58 #include <sys/taskq.h>
59 #include <sys/note.h>
60 #include <sys/mach_descrip.h>
61 #include <sys/mac_provider.h>
62 #include <sys/mdeg.h>
63 #include <sys/ldc.h>
64 #include <sys/vsw_fdb.h>
65 #include <sys/vsw.h>
66 #include <sys/vio_mailbox.h>
67 #include <sys/vnet_mailbox.h>
68 #include <sys/vnet_common.h>
69 #include <sys/vio_util.h>
70 #include <sys/sdt.h>
71 #include <sys/atomic.h>
72 #include <sys/callb.h>
73 #include <sys/vlan.h>
74 
75 /*
76  * Function prototypes.
77  */
78 static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
79 static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
80 static	int vsw_unattach(vsw_t *vswp);
81 static	int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *);
82 static	int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *);
83 void vsw_destroy_rxpools(void *);
84 
85 /* MDEG routines */
86 static	int vsw_mdeg_register(vsw_t *vswp);
87 static	void vsw_mdeg_unregister(vsw_t *vswp);
88 static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
89 static	int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *);
90 static	int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t);
91 static	int vsw_read_mdprops(vsw_t *vswp);
92 static	void vsw_vlan_read_ids(void *arg, int type, md_t *mdp,
93 	mde_cookie_t node, uint16_t *pvidp, vsw_vlanid_t **vidspp,
94 	uint16_t *nvidsp, uint16_t *default_idp);
95 static	void vsw_port_read_bandwidth(vsw_port_t *portp, md_t *mdp,
96 	mde_cookie_t node, uint64_t *bw);
97 static	int vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp,
98 	md_t *mdp, mde_cookie_t *node);
99 static	void vsw_read_pri_eth_types(vsw_t *vswp, md_t *mdp,
100 	mde_cookie_t node);
101 static	void vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
102 	uint32_t *mtu);
103 static	int vsw_mtu_update(vsw_t *vswp, uint32_t mtu);
104 static	void vsw_linkprop_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
105 	boolean_t *pls);
106 static	void vsw_bandwidth_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
107 	uint64_t *bw);
108 static	void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t);
109 static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr);
110 static boolean_t vsw_cmp_vids(vsw_vlanid_t *vids1,
111 	vsw_vlanid_t *vids2, int nvids);
112 
113 /* Mac driver related routines */
114 static int vsw_mac_register(vsw_t *);
115 static int vsw_mac_unregister(vsw_t *);
116 static int vsw_m_stat(void *, uint_t, uint64_t *);
117 static void vsw_m_stop(void *arg);
118 static int vsw_m_start(void *arg);
119 static int vsw_m_unicst(void *arg, const uint8_t *);
120 static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
121 static int vsw_m_promisc(void *arg, boolean_t);
122 static mblk_t *vsw_m_tx(void *arg, mblk_t *);
123 void vsw_mac_link_update(vsw_t *vswp, link_state_t link_state);
124 void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
125     mblk_t *mp, vsw_macrx_flags_t flags);
126 void vsw_physlink_state_update(vsw_t *vswp);
127 
128 /*
129  * Functions imported from other files.
130  */
131 extern void vsw_setup_switching_thread(void *arg);
132 extern int vsw_setup_switching_start(vsw_t *vswp);
133 extern void vsw_setup_switching_stop(vsw_t *vswp);
134 extern int vsw_setup_switching(vsw_t *);
135 extern void vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller,
136     vsw_port_t *port, mac_resource_handle_t mrh);
137 extern int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
138 extern int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
139 extern void vsw_del_mcst_vsw(vsw_t *);
140 extern mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr);
141 extern void vsw_detach_ports(vsw_t *vswp);
142 extern int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
143 extern int vsw_port_detach(vsw_t *vswp, int p_instance);
144 static int vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
145 	md_t *prev_mdp, mde_cookie_t prev_mdex);
146 extern	int vsw_port_attach(vsw_port_t *port);
147 extern vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
148 extern int vsw_mac_open(vsw_t *vswp);
149 extern void vsw_mac_close(vsw_t *vswp);
150 extern void vsw_mac_cleanup_ports(vsw_t *vswp);
151 extern void vsw_unset_addrs(vsw_t *vswp);
152 extern void vsw_setup_switching_post_process(vsw_t *vswp);
153 extern void vsw_create_vlans(void *arg, int type);
154 extern void vsw_destroy_vlans(void *arg, int type);
155 extern void vsw_vlan_add_ids(void *arg, int type);
156 extern void vsw_vlan_remove_ids(void *arg, int type);
157 extern void vsw_vlan_unaware_port_reset(vsw_port_t *portp);
158 extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np,
159 	mblk_t **npt);
160 extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp);
161 extern void vsw_hio_cleanup(vsw_t *vswp);
162 extern void vsw_hio_start_ports(vsw_t *vswp);
163 extern void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled);
164 extern int vsw_mac_multicast_add(vsw_t *, vsw_port_t *, mcst_addr_t *, int);
165 extern void vsw_mac_multicast_remove(vsw_t *, vsw_port_t *, mcst_addr_t *, int);
166 extern void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid,
167     vsw_vlanid_t *new_vids, int new_nvids);
168 extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type);
169 extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type);
170 extern void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans,
171     uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids);
172 extern void vsw_reset_ports(vsw_t *vswp);
173 extern void vsw_port_reset(vsw_port_t *portp);
174 extern void vsw_physlink_update_ports(vsw_t *vswp);
175 extern void vsw_update_bandwidth(vsw_t *vswp, vsw_port_t *port, int type,
176     uint64_t maxbw);
177 
178 /*
179  * Internal tunables.
180  */
181 int	vsw_num_handshakes = VNET_NUM_HANDSHAKES; /* # of handshake attempts */
182 int	vsw_wretries = 100;		/* # of write attempts */
183 int	vsw_setup_switching_delay = 3;	/* setup sw timeout interval in sec */
184 int	vsw_mac_open_retries = 300;	/* max # of mac_open() retries */
185 					/* 300*3 = 900sec(15min) of max tmout */
186 int	vsw_ldc_tx_delay = 5;		/* delay(ticks) for tx retries */
187 int	vsw_ldc_tx_retries = 10;	/* # of ldc tx retries */
188 int	vsw_ldc_retries = 5;		/* # of ldc_close() retries */
189 int	vsw_ldc_delay = 1000;		/* 1 ms delay for ldc_close() */
190 boolean_t vsw_ldc_rxthr_enabled = B_TRUE;	/* LDC Rx thread enabled */
191 boolean_t vsw_ldc_txthr_enabled = B_TRUE;	/* LDC Tx thread enabled */
192 int	vsw_rxpool_cleanup_delay = 100000;	/* 100ms */
193 
194 
195 uint32_t	vsw_fdb_nchains = 8;	/* # of chains in fdb hash table */
196 uint32_t	vsw_vlan_nchains = 4;	/* # of chains in vlan id hash table */
197 uint32_t	vsw_ethermtu = 1500;	/* mtu of the device */
198 
199 /* delay in usec to wait for all references on a fdb entry to be dropped */
200 uint32_t vsw_fdbe_refcnt_delay = 10;
201 
202 /*
203  * Default vlan id. This is only used internally when the "default-vlan-id"
204  * property is not present in the MD device node. Therefore, this should not be
205  * used as a tunable; if this value is changed, the corresponding variable
206  * should be updated to the same value in all vnets connected to this vsw.
207  */
208 uint16_t	vsw_default_vlan_id = 1;
209 
210 /*
211  * Workaround for a version handshake bug in obp's vnet.
212  * If vsw initiates version negotiation starting from the highest version,
213  * obp sends a nack and terminates version handshake. To workaround
214  * this, we do not initiate version handshake when the channel comes up.
215  * Instead, we wait for the peer to send its version info msg and go through
216  * the version protocol exchange. If we successfully negotiate a version,
217  * before sending the ack, we send our version info msg to the peer
218  * using the <major,minor> version that we are about to ack.
219  */
220 boolean_t vsw_obp_ver_proto_workaround = B_TRUE;
221 
222 /*
223  * In the absence of "priority-ether-types" property in MD, the following
224  * internal tunable can be set to specify a single priority ethertype.
225  */
226 uint64_t vsw_pri_eth_type = 0;
227 
228 /*
229  * Number of transmit priority buffers that are preallocated per device.
230  * This number is chosen to be a small value to throttle transmission
231  * of priority packets. Note: Must be a power of 2 for vio_create_mblks().
232  */
233 uint32_t vsw_pri_tx_nmblks = 64;
234 
235 /*
236  * Number of RARP packets sent to announce macaddr to the physical switch,
237  * after vsw's physical device is changed dynamically or after a guest (client
238  * vnet) is live migrated in.
239  */
240 uint32_t vsw_publish_macaddr_count = 3;
241 
242 /*
243  * Enable/disable HybridIO
244  */
245 boolean_t vsw_hio_enabled = B_TRUE;
246 
247 /*
248  * Max retries for HybridIO cleanup
249  */
250 int vsw_hio_max_cleanup_retries = 10;
251 
252 /*
253  * 10ms delay for HybridIO cleanup
254  */
255 int vsw_hio_cleanup_delay = 10000;
256 
257 /*
258  * Descriptor ring modes of LDC data transfer:
259  *
260  * 1) TxDring mode:
261  * In versions < v1.6 of VIO Protocol, we support only TxDring mode. In this
262  * mode, we create a transmit descriptor ring and export it to the peer through
263  * dring registration process of handshake. The descriptor ring is exported
264  * using LDC shared memory. Each descriptor is associated with a data buffer.
265  * The data buffer is also exported over LDC and the cookies for this data
266  * buffer are provided in the descriptor. The peer maps this ring as its
267  * receive ring. Similarly, the peer exports a transmit descriptor ring which
268  * is mapped by this device as its receive ring. In this mode, in a given data
269  * transfer direction, the transmitter copies the data to the exported data
270  * buffer (owned by itself), bound to the descriptor. The receiver uses the LDC
271  * cookies specified in the descriptor to copy the data into the receiving
272  * guest through the hypervisor (ldc_mem_copy()).
273  *
274  * 2) RxDringData mode:
275  * In versions >= v1.6 of VIO Protocol, we also support RxDringData mode. In
276  * this mode, we create a receive descriptor ring and export it to the peer
277  * through dring registration process of handshake. In addition, we export a
278  * receive buffer area and provide that information also in the dring
279  * registration message. The descriptor ring and the data buffer area are
280  * exported using LDC shared memory. Each descriptor is associated with a data
281  * buffer in the data buffer area and the offset of the specific data buffer
282  * within this area is specified in the descriptor. The peer maps this ring
283  * along with the data buffer area as its transmit ring. Similarly, the peer
284  * exports a receive ring which is mapped by this device as its transmit ring,
285  * along with its buffer area. In this mode, in a given data transfer
286  * direction, the transmitter copies the data to the data buffer offset
287  * specified in the descriptor. The receiver simply picks up the data buffer
288  * (owned by itself) without any copy operation into the receiving guest.
289  *
290  * We provide a tunable to enable RxDringData mode for versions >= v1.6 of VIO
291  * Protocol. By default, this tunable is set to 1 (VIO_TX_DRING). To enable
292  * RxDringData mode set this tunable to 4 (VIO_RX_DRING_DATA). This enables us
293  * to negotiate RxDringData mode with peers that support versions >= v1.6. For
294  * peers that support version < v1.6, we continue to operate in TxDring mode
295  * with them though the tunable is enabled.
296  */
297 uint8_t  vsw_dring_mode = VIO_TX_DRING;
298 
299 /*
300  * Number of descriptors;  must be power of 2.
301  */
302 uint32_t vsw_num_descriptors = VSW_NUM_DESCRIPTORS;
303 
304 /*
305  * In RxDringData mode, # of buffers is determined by multiplying the # of
306  * descriptors with the factor below. Note that the factor must be > 1; i.e.,
307  * the # of buffers must always be > # of descriptors. This is needed because,
308  * while the shared memory buffers are sent up the stack on the receiver, the
309  * sender needs additional buffers that can be used for further transmits.
310  * See vsw_setup_rx_dring() for details.
311  */
312 uint32_t vsw_nrbufs_factor = 2;
313 
314 /*
315  * Delay when rx descr not ready; used in both dring modes.
316  */
317 int	vsw_recv_delay = 0;
318 
319 /*
320  * Retry when rx descr not ready; used in both dring modes.
321  */
322 int	vsw_recv_retries = 5;
323 
324 /*
325  * Max number of mblks received in one receive operation.
326  */
327 uint32_t vsw_chain_len = (VSW_NUM_MBLKS * 0.6);
328 
329 /*
330  * Internal tunables for receive buffer pools, that is,  the size and number of
331  * mblks for each pool. At least 3 sizes must be specified if these are used.
332  * The sizes must be specified in increasing order. Non-zero value of the first
333  * size will be used as a hint to use these values instead of the algorithm
334  * that determines the sizes based on MTU. Used in TxDring mode only.
335  */
336 uint32_t vsw_mblk_size1 = 0;
337 uint32_t vsw_mblk_size2 = 0;
338 uint32_t vsw_mblk_size3 = 0;
339 uint32_t vsw_mblk_size4 = 0;
340 uint32_t vsw_num_mblks1 = VSW_NUM_MBLKS;	/* number of mblks for pool1 */
341 uint32_t vsw_num_mblks2 = VSW_NUM_MBLKS;	/* number of mblks for pool2 */
342 uint32_t vsw_num_mblks3 = VSW_NUM_MBLKS;	/* number of mblks for pool3 */
343 uint32_t vsw_num_mblks4 = VSW_NUM_MBLKS;	/* number of mblks for pool4 */
344 
345 /*
346  * Set this to non-zero to enable additional internal receive buffer pools
347  * based on the MTU of the device for better performance at the cost of more
348  * memory consumption. This is turned off by default, to use allocb(9F) for
349  * receive buffer allocations of sizes > 2K.
350  */
351 boolean_t vsw_jumbo_rxpools = B_FALSE;
352 
353 /*
354  * vsw_max_tx_qcount is the maximum # of packets that can be queued
355  * before the tx worker thread begins processing the queue. Its value
356  * is chosen to be 4x the default length of tx descriptor ring.
357  */
358 uint32_t vsw_max_tx_qcount = 4 * VSW_NUM_DESCRIPTORS;
359 
360 /*
361  * MAC callbacks
362  */
363 static	mac_callbacks_t	vsw_m_callbacks = {
364 	0,
365 	vsw_m_stat,
366 	vsw_m_start,
367 	vsw_m_stop,
368 	vsw_m_promisc,
369 	vsw_m_multicst,
370 	vsw_m_unicst,
371 	vsw_m_tx
372 };
373 
374 static	struct	cb_ops	vsw_cb_ops = {
375 	nulldev,			/* cb_open */
376 	nulldev,			/* cb_close */
377 	nodev,				/* cb_strategy */
378 	nodev,				/* cb_print */
379 	nodev,				/* cb_dump */
380 	nodev,				/* cb_read */
381 	nodev,				/* cb_write */
382 	nodev,				/* cb_ioctl */
383 	nodev,				/* cb_devmap */
384 	nodev,				/* cb_mmap */
385 	nodev,				/* cb_segmap */
386 	nochpoll,			/* cb_chpoll */
387 	ddi_prop_op,			/* cb_prop_op */
388 	NULL,				/* cb_stream */
389 	D_MP,				/* cb_flag */
390 	CB_REV,				/* rev */
391 	nodev,				/* int (*cb_aread)() */
392 	nodev				/* int (*cb_awrite)() */
393 };
394 
395 static	struct	dev_ops	vsw_ops = {
396 	DEVO_REV,		/* devo_rev */
397 	0,			/* devo_refcnt */
398 	NULL,			/* devo_getinfo */
399 	nulldev,		/* devo_identify */
400 	nulldev,		/* devo_probe */
401 	vsw_attach,		/* devo_attach */
402 	vsw_detach,		/* devo_detach */
403 	nodev,			/* devo_reset */
404 	&vsw_cb_ops,		/* devo_cb_ops */
405 	(struct bus_ops *)NULL,	/* devo_bus_ops */
406 	ddi_power		/* devo_power */
407 };
408 
409 extern	struct	mod_ops	mod_driverops;
410 static struct modldrv vswmodldrv = {
411 	&mod_driverops,
412 	"sun4v Virtual Switch",
413 	&vsw_ops,
414 };
415 
416 #define	LDC_ENTER_LOCK(ldcp)	\
417 				mutex_enter(&((ldcp)->ldc_cblock));\
418 				mutex_enter(&((ldcp)->ldc_rxlock));\
419 				mutex_enter(&((ldcp)->ldc_txlock));
420 #define	LDC_EXIT_LOCK(ldcp)	\
421 				mutex_exit(&((ldcp)->ldc_txlock));\
422 				mutex_exit(&((ldcp)->ldc_rxlock));\
423 				mutex_exit(&((ldcp)->ldc_cblock));
424 
425 /* Driver soft state ptr  */
426 static void	*vsw_state;
427 
428 /*
429  * Linked list of "vsw_t" structures - one per instance.
430  */
431 vsw_t		*vsw_head = NULL;
432 krwlock_t	vsw_rw;
433 
434 /*
435  * Property names
436  */
437 static char vdev_propname[] = "virtual-device";
438 static char vsw_propname[] = "virtual-network-switch";
439 static char physdev_propname[] = "vsw-phys-dev";
440 static char smode_propname[] = "vsw-switch-mode";
441 static char macaddr_propname[] = "local-mac-address";
442 static char remaddr_propname[] = "remote-mac-address";
443 static char ldcids_propname[] = "ldc-ids";
444 static char chan_propname[] = "channel-endpoint";
445 static char id_propname[] = "id";
446 static char reg_propname[] = "reg";
447 static char pri_types_propname[] = "priority-ether-types";
448 static char vsw_pvid_propname[] = "port-vlan-id";
449 static char vsw_vid_propname[] = "vlan-id";
450 static char vsw_dvid_propname[] = "default-vlan-id";
451 static char port_pvid_propname[] = "remote-port-vlan-id";
452 static char port_vid_propname[] = "remote-vlan-id";
453 static char hybrid_propname[] = "hybrid";
454 static char vsw_mtu_propname[] = "mtu";
455 static char vsw_linkprop_propname[] = "linkprop";
456 static char vsw_maxbw_propname[] = "maxbw";
457 static char port_maxbw_propname[] = "maxbw";
458 
459 /*
460  * Matching criteria passed to the MDEG to register interest
461  * in changes to 'virtual-device-port' nodes identified by their
462  * 'id' property.
463  */
464 static md_prop_match_t vport_prop_match[] = {
465 	{ MDET_PROP_VAL,    "id"   },
466 	{ MDET_LIST_END,    NULL    }
467 };
468 
469 static mdeg_node_match_t vport_match = { "virtual-device-port",
470 						vport_prop_match };
471 
472 /*
473  * Matching criteria passed to the MDEG to register interest
474  * in changes to 'virtual-device' nodes (i.e. vsw nodes) identified
475  * by their 'name' and 'cfg-handle' properties.
476  */
477 static md_prop_match_t vdev_prop_match[] = {
478 	{ MDET_PROP_STR,    "name"   },
479 	{ MDET_PROP_VAL,    "cfg-handle" },
480 	{ MDET_LIST_END,    NULL    }
481 };
482 
483 static mdeg_node_match_t vdev_match = { "virtual-device",
484 						vdev_prop_match };
485 
486 
487 /*
488  * Specification of an MD node passed to the MDEG to filter any
489  * 'vport' nodes that do not belong to the specified node. This
490  * template is copied for each vsw instance and filled in with
491  * the appropriate 'cfg-handle' value before being passed to the MDEG.
492  */
493 static mdeg_prop_spec_t vsw_prop_template[] = {
494 	{ MDET_PROP_STR,    "name",		vsw_propname },
495 	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
496 	{ MDET_LIST_END,    NULL,		NULL	}
497 };
498 
499 #define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
500 
501 #ifdef	DEBUG
502 /*
503  * Print debug messages - set to 0x1f to enable all msgs
504  * or 0x0 to turn all off.
505  */
506 int vswdbg = 0x0;
507 
508 /*
509  * debug levels:
510  * 0x01:	Function entry/exit tracing
511  * 0x02:	Internal function messages
512  * 0x04:	Verbose internal messages
513  * 0x08:	Warning messages
514  * 0x10:	Error messages
515  */
516 
517 void
518 vswdebug(vsw_t *vswp, const char *fmt, ...)
519 {
520 	char buf[512];
521 	va_list ap;
522 
523 	va_start(ap, fmt);
524 	(void) vsprintf(buf, fmt, ap);
525 	va_end(ap);
526 
527 	if (vswp == NULL)
528 		cmn_err(CE_CONT, "%s\n", buf);
529 	else
530 		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
531 }
532 
533 #endif	/* DEBUG */
534 
535 static struct modlinkage modlinkage = {
536 	MODREV_1,
537 	&vswmodldrv,
538 	NULL
539 };
540 
541 int
542 _init(void)
543 {
544 	int status;
545 
546 	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
547 
548 	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
549 	if (status != 0) {
550 		return (status);
551 	}
552 
553 	mac_init_ops(&vsw_ops, DRV_NAME);
554 	status = mod_install(&modlinkage);
555 	if (status != 0) {
556 		ddi_soft_state_fini(&vsw_state);
557 	}
558 	return (status);
559 }
560 
561 int
562 _fini(void)
563 {
564 	int status;
565 
566 	status = mod_remove(&modlinkage);
567 	if (status != 0)
568 		return (status);
569 	mac_fini_ops(&vsw_ops);
570 	ddi_soft_state_fini(&vsw_state);
571 
572 	rw_destroy(&vsw_rw);
573 
574 	return (status);
575 }
576 
577 int
578 _info(struct modinfo *modinfop)
579 {
580 	return (mod_info(&modlinkage, modinfop));
581 }
582 
583 static int
584 vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
585 {
586 	vsw_t			*vswp;
587 	int			instance;
588 	char			hashname[MAXNAMELEN];
589 	char			qname[TASKQ_NAMELEN];
590 	vsw_attach_progress_t	progress = PROG_init;
591 	int			rv;
592 
593 	switch (cmd) {
594 	case DDI_ATTACH:
595 		break;
596 	case DDI_RESUME:
597 		/* nothing to do for this non-device */
598 		return (DDI_SUCCESS);
599 	case DDI_PM_RESUME:
600 	default:
601 		return (DDI_FAILURE);
602 	}
603 
604 	instance = ddi_get_instance(dip);
605 	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
606 		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
607 		return (DDI_FAILURE);
608 	}
609 	vswp = ddi_get_soft_state(vsw_state, instance);
610 
611 	if (vswp == NULL) {
612 		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
613 		goto vsw_attach_fail;
614 	}
615 
616 	vswp->dip = dip;
617 	vswp->instance = instance;
618 	vswp->phys_link_state = LINK_STATE_UNKNOWN;
619 	ddi_set_driver_private(dip, (caddr_t)vswp);
620 
621 	mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL);
622 	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
623 	mutex_init(&vswp->sw_thr_lock, NULL, MUTEX_DRIVER, NULL);
624 	cv_init(&vswp->sw_thr_cv, NULL, CV_DRIVER, NULL);
625 	rw_init(&vswp->maccl_rwlock, NULL, RW_DRIVER, NULL);
626 	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
627 	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
628 	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
629 
630 	progress |= PROG_locks;
631 
632 	rv = vsw_read_mdprops(vswp);
633 	if (rv != 0)
634 		goto vsw_attach_fail;
635 
636 	progress |= PROG_readmd;
637 
638 	/* setup the unicast forwarding database  */
639 	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
640 	    vswp->instance);
641 	D2(vswp, "creating unicast hash table (%s)...", hashname);
642 	vswp->fdb_nchains = vsw_fdb_nchains;
643 	vswp->fdb_hashp = mod_hash_create_ptrhash(hashname, vswp->fdb_nchains,
644 	    mod_hash_null_valdtor, sizeof (void *));
645 	vsw_create_vlans((void *)vswp, VSW_LOCALDEV);
646 	progress |= PROG_fdb;
647 
648 	/* setup the multicast fowarding database */
649 	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
650 	    vswp->instance);
651 	D2(vswp, "creating multicast hash table %s)...", hashname);
652 	vswp->mfdb = mod_hash_create_ptrhash(hashname, vsw_fdb_nchains,
653 	    mod_hash_null_valdtor, sizeof (void *));
654 
655 	progress |= PROG_mfdb;
656 
657 	/*
658 	 * Create the taskq which will process all the VIO
659 	 * control messages.
660 	 */
661 	(void) snprintf(qname, TASKQ_NAMELEN, "taskq%d", vswp->instance);
662 	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
663 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
664 		cmn_err(CE_WARN, "!vsw%d: Unable to create task queue",
665 		    vswp->instance);
666 		goto vsw_attach_fail;
667 	}
668 
669 	progress |= PROG_taskq;
670 
671 	(void) snprintf(qname, TASKQ_NAMELEN, "rxpool_taskq%d",
672 	    vswp->instance);
673 	if ((vswp->rxp_taskq = ddi_taskq_create(vswp->dip, qname, 1,
674 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
675 		cmn_err(CE_WARN, "!vsw%d: Unable to create rxp task queue",
676 		    vswp->instance);
677 		goto vsw_attach_fail;
678 	}
679 
680 	progress |= PROG_rxp_taskq;
681 
682 	/* prevent auto-detaching */
683 	if (ddi_prop_update_int(DDI_DEV_T_NONE, vswp->dip,
684 	    DDI_NO_AUTODETACH, 1) != DDI_SUCCESS) {
685 		cmn_err(CE_NOTE, "!Unable to set \"%s\" property for "
686 		    "instance %u", DDI_NO_AUTODETACH, instance);
687 	}
688 
689 	/*
690 	 * The null switching function is set to avoid panic until
691 	 * switch mode is setup.
692 	 */
693 	vswp->vsw_switch_frame = vsw_switch_frame_nop;
694 
695 	/*
696 	 * Setup the required switching mode, based on the mdprops that we read
697 	 * earlier. We start a thread to do this, to avoid calling mac_open()
698 	 * directly from attach().
699 	 */
700 	rv = vsw_setup_switching_start(vswp);
701 	if (rv != 0) {
702 		goto vsw_attach_fail;
703 	}
704 
705 	progress |= PROG_swmode;
706 
707 	/* Register with mac layer as a provider */
708 	rv = vsw_mac_register(vswp);
709 	if (rv != 0)
710 		goto vsw_attach_fail;
711 
712 	progress |= PROG_macreg;
713 
714 	/*
715 	 * Now we have everything setup, register an interest in
716 	 * specific MD nodes.
717 	 *
718 	 * The callback is invoked in 2 cases, firstly if upon mdeg
719 	 * registration there are existing nodes which match our specified
720 	 * criteria, and secondly if the MD is changed (and again, there
721 	 * are nodes which we are interested in present within it. Note
722 	 * that our callback will be invoked even if our specified nodes
723 	 * have not actually changed).
724 	 *
725 	 */
726 	rv = vsw_mdeg_register(vswp);
727 	if (rv != 0)
728 		goto vsw_attach_fail;
729 
730 	progress |= PROG_mdreg;
731 
732 	vswp->attach_progress = progress;
733 
734 	WRITE_ENTER(&vsw_rw);
735 	vswp->next = vsw_head;
736 	vsw_head = vswp;
737 	RW_EXIT(&vsw_rw);
738 
739 	ddi_report_dev(vswp->dip);
740 	return (DDI_SUCCESS);
741 
742 vsw_attach_fail:
743 	DERR(NULL, "vsw_attach: failed");
744 
745 	vswp->attach_progress = progress;
746 	(void) vsw_unattach(vswp);
747 	ddi_soft_state_free(vsw_state, instance);
748 	return (DDI_FAILURE);
749 }
750 
751 static int
752 vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
753 {
754 	vsw_t			**vswpp, *vswp;
755 	int 			instance;
756 
757 	instance = ddi_get_instance(dip);
758 	vswp = ddi_get_soft_state(vsw_state, instance);
759 
760 	if (vswp == NULL) {
761 		return (DDI_FAILURE);
762 	}
763 
764 	switch (cmd) {
765 	case DDI_DETACH:
766 		break;
767 	case DDI_SUSPEND:
768 	case DDI_PM_SUSPEND:
769 	default:
770 		return (DDI_FAILURE);
771 	}
772 
773 	D2(vswp, "detaching instance %d", instance);
774 
775 	if (vsw_unattach(vswp) != 0) {
776 		return (DDI_FAILURE);
777 	}
778 
779 	ddi_remove_minor_node(dip, NULL);
780 
781 	WRITE_ENTER(&vsw_rw);
782 	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
783 		if (*vswpp == vswp) {
784 			*vswpp = vswp->next;
785 			break;
786 		}
787 	}
788 	RW_EXIT(&vsw_rw);
789 
790 	ddi_soft_state_free(vsw_state, instance);
791 
792 	return (DDI_SUCCESS);
793 }
794 
795 /*
796  * Common routine to handle vsw_attach() failure and vsw_detach(). Note that
797  * the only reason this function could fail is if mac_unregister() fails.
798  * Otherwise, this function must ensure that all resources are freed and return
799  * success.
800  */
801 static int
802 vsw_unattach(vsw_t *vswp)
803 {
804 	vsw_attach_progress_t	progress;
805 
806 	progress = vswp->attach_progress;
807 
808 	/*
809 	 * Unregister from the gldv3 subsystem. This can fail, in particular
810 	 * if there are still any open references to this mac device; in which
811 	 * case we just return failure without continuing to detach further.
812 	 */
813 	if (progress & PROG_macreg) {
814 		if (vsw_mac_unregister(vswp) != 0) {
815 			cmn_err(CE_WARN, "!vsw%d: Unable to detach from "
816 			    "MAC layer", vswp->instance);
817 			return (1);
818 		}
819 		progress &= ~PROG_macreg;
820 	}
821 
822 	/*
823 	 * Now that we have unregistered from gldv3, we must finish all other
824 	 * steps and successfully return from this function; otherwise we will
825 	 * end up leaving the device in a broken/unusable state.
826 	 *
827 	 * If we have registered with mdeg, unregister now to stop further
828 	 * callbacks to this vsw device and/or its ports. Then, detach any
829 	 * existing ports.
830 	 */
831 	if (progress & PROG_mdreg) {
832 		vsw_mdeg_unregister(vswp);
833 		vsw_detach_ports(vswp);
834 		progress &= ~PROG_mdreg;
835 	}
836 
837 	/*
838 	 * If we have started a thread to setup the switching mode, stop it, if
839 	 * it is still running. If it has finished setting up the switching
840 	 * mode, then we need to clean up some additional things if we are
841 	 * running in L2 mode: first free up any hybrid resources; then stop
842 	 * and close the underlying physical device. Note that we would have
843 	 * already released all per mac_client resources (ucast, mcast addrs,
844 	 * hio-shares etc) as all the ports are detached and if the vsw device
845 	 * itself was in use as an interface, it has been unplumbed (otherwise
846 	 * mac_unregister() above would fail).
847 	 */
848 	if (progress & PROG_swmode) {
849 
850 		vsw_setup_switching_stop(vswp);
851 
852 		if (vswp->hio_capable == B_TRUE) {
853 			vsw_hio_cleanup(vswp);
854 			vswp->hio_capable = B_FALSE;
855 		}
856 
857 		mutex_enter(&vswp->mac_lock);
858 		vsw_mac_close(vswp);
859 		mutex_exit(&vswp->mac_lock);
860 
861 		progress &= ~PROG_swmode;
862 	}
863 
864 	/*
865 	 * We now destroy the taskq used to clean up rx mblk pools that
866 	 * couldn't be destroyed when the ports/channels were detached.
867 	 * We implicitly wait for those tasks to complete in
868 	 * ddi_taskq_destroy().
869 	 */
870 	if (progress & PROG_rxp_taskq) {
871 		ddi_taskq_destroy(vswp->rxp_taskq);
872 		progress &= ~PROG_rxp_taskq;
873 	}
874 
875 	/*
876 	 * By now any pending tasks have finished and the underlying
877 	 * ldc's have been destroyed, so its safe to delete the control
878 	 * message taskq.
879 	 */
880 	if (progress & PROG_taskq) {
881 		ddi_taskq_destroy(vswp->taskq_p);
882 		progress &= ~PROG_taskq;
883 	}
884 
885 	/* Destroy the multicast hash table */
886 	if (progress & PROG_mfdb) {
887 		mod_hash_destroy_hash(vswp->mfdb);
888 		progress &= ~PROG_mfdb;
889 	}
890 
891 	/* Destroy the vlan hash table and fdb */
892 	if (progress & PROG_fdb) {
893 		vsw_destroy_vlans(vswp, VSW_LOCALDEV);
894 		mod_hash_destroy_hash(vswp->fdb_hashp);
895 		progress &= ~PROG_fdb;
896 	}
897 
898 	if (progress & PROG_readmd) {
899 		if (VSW_PRI_ETH_DEFINED(vswp)) {
900 			kmem_free(vswp->pri_types,
901 			    sizeof (uint16_t) * vswp->pri_num_types);
902 			(void) vio_destroy_mblks(vswp->pri_tx_vmp);
903 		}
904 		progress &= ~PROG_readmd;
905 	}
906 
907 	if (progress & PROG_locks) {
908 		rw_destroy(&vswp->plist.lockrw);
909 		rw_destroy(&vswp->mfdbrw);
910 		rw_destroy(&vswp->if_lockrw);
911 		rw_destroy(&vswp->maccl_rwlock);
912 		cv_destroy(&vswp->sw_thr_cv);
913 		mutex_destroy(&vswp->sw_thr_lock);
914 		mutex_destroy(&vswp->mca_lock);
915 		mutex_destroy(&vswp->mac_lock);
916 		progress &= ~PROG_locks;
917 	}
918 
919 	vswp->attach_progress = progress;
920 
921 	return (0);
922 }
923 
924 void
925 vsw_destroy_rxpools(void *arg)
926 {
927 	vio_mblk_pool_t	*poolp = (vio_mblk_pool_t *)arg;
928 	vio_mblk_pool_t	*npoolp;
929 
930 	while (poolp != NULL) {
931 		npoolp =  poolp->nextp;
932 		while (vio_destroy_mblks(poolp) != 0) {
933 			drv_usecwait(vsw_rxpool_cleanup_delay);
934 		}
935 		poolp = npoolp;
936 	}
937 }
938 
939 /*
940  * Get the value of the "vsw-phys-dev" property in the specified
941  * node. This property is the name of the physical device that
942  * the virtual switch will use to talk to the outside world.
943  *
944  * Note it is valid for this property to be NULL (but the property
945  * itself must exist). Callers of this routine should verify that
946  * the value returned is what they expected (i.e. either NULL or non NULL).
947  *
948  * On success returns value of the property in region pointed to by
949  * the 'name' argument, and with return value of 0. Otherwise returns 1.
950  */
951 static int
952 vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name)
953 {
954 	int		len = 0;
955 	int		instance;
956 	char		*physname = NULL;
957 	char		*dev;
958 	const char	*dev_name;
959 	char		myname[MAXNAMELEN];
960 
961 	dev_name = ddi_driver_name(vswp->dip);
962 	instance = ddi_get_instance(vswp->dip);
963 	(void) snprintf(myname, MAXNAMELEN, "%s%d", dev_name, instance);
964 
965 	if (md_get_prop_data(mdp, node, physdev_propname,
966 	    (uint8_t **)(&physname), &len) != 0) {
967 		cmn_err(CE_WARN, "!vsw%d: Unable to get name(s) of physical "
968 		    "device(s) from MD", vswp->instance);
969 		return (1);
970 	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
971 		cmn_err(CE_WARN, "!vsw%d: %s is too long a device name",
972 		    vswp->instance, physname);
973 		return (1);
974 	} else if (strcmp(myname, physname) == 0) {
975 		/*
976 		 * Prevent the vswitch from opening itself as the
977 		 * network device.
978 		 */
979 		cmn_err(CE_WARN, "!vsw%d: %s is an invalid device name",
980 		    vswp->instance, physname);
981 		return (1);
982 	} else {
983 		(void) strncpy(name, physname, strlen(physname) + 1);
984 		D2(vswp, "%s: using first device specified (%s)",
985 		    __func__, physname);
986 	}
987 
988 #ifdef DEBUG
989 	/*
990 	 * As a temporary measure to aid testing we check to see if there
991 	 * is a vsw.conf file present. If there is we use the value of the
992 	 * vsw_physname property in the file as the name of the physical
993 	 * device, overriding the value from the MD.
994 	 *
995 	 * There may be multiple devices listed, but for the moment
996 	 * we just use the first one.
997 	 */
998 	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
999 	    "vsw_physname", &dev) == DDI_PROP_SUCCESS) {
1000 		if ((strlen(dev) + 1) > LIFNAMSIZ) {
1001 			cmn_err(CE_WARN, "vsw%d: %s is too long a device name",
1002 			    vswp->instance, dev);
1003 			ddi_prop_free(dev);
1004 			return (1);
1005 		} else {
1006 			cmn_err(CE_NOTE, "vsw%d: Using device name (%s) from "
1007 			    "config file", vswp->instance, dev);
1008 
1009 			(void) strncpy(name, dev, strlen(dev) + 1);
1010 		}
1011 
1012 		ddi_prop_free(dev);
1013 	}
1014 #endif
1015 
1016 	return (0);
1017 }
1018 
1019 /*
1020  * Read the 'vsw-switch-mode' property from the specified MD node.
1021  *
1022  * Returns 0 on success, otherwise returns 1.
1023  */
1024 static int
1025 vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint8_t *mode)
1026 {
1027 	int		len = 0;
1028 	char		*smode = NULL;
1029 	char		*curr_mode = NULL;
1030 
1031 	D1(vswp, "%s: enter", __func__);
1032 
1033 	/*
1034 	 * Get the switch-mode property. The modes are listed in
1035 	 * decreasing order of preference, i.e. prefered mode is
1036 	 * first item in list.
1037 	 */
1038 	len = 0;
1039 	if (md_get_prop_data(mdp, node, smode_propname,
1040 	    (uint8_t **)(&smode), &len) != 0) {
1041 		/*
1042 		 * Unable to get switch-mode property from MD, nothing
1043 		 * more we can do.
1044 		 */
1045 		cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property"
1046 		    " from the MD", vswp->instance);
1047 		return (1);
1048 	}
1049 
1050 	curr_mode = smode;
1051 	/*
1052 	 * Modes of operation:
1053 	 * 'switched'	 - layer 2 switching, underlying HW in
1054 	 *			programmed mode.
1055 	 * 'promiscuous' - layer 2 switching, underlying HW in
1056 	 *			promiscuous mode.
1057 	 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
1058 	 *			in non-promiscuous mode.
1059 	 */
1060 	while (curr_mode < (smode + len)) {
1061 		D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
1062 		if (strcmp(curr_mode, "switched") == 0) {
1063 			*mode = VSW_LAYER2;
1064 		} else if (strcmp(curr_mode, "promiscuous") == 0) {
1065 			*mode = VSW_LAYER2 | VSW_LAYER2_PROMISC;
1066 		} else if (strcmp(curr_mode, "routed") == 0) {
1067 			*mode = VSW_LAYER3;
1068 		} else {
1069 			cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, "
1070 			    "setting to default switched mode",
1071 			    vswp->instance, curr_mode);
1072 			*mode = VSW_LAYER2;
1073 		}
1074 		curr_mode += strlen(curr_mode) + 1;
1075 	}
1076 
1077 	D2(vswp, "%s: %d mode", __func__, *mode);
1078 
1079 	D1(vswp, "%s: exit", __func__);
1080 
1081 	return (0);
1082 }
1083 
1084 /*
1085  * Register with the MAC layer as a network device, so we
1086  * can be plumbed if necessary.
1087  */
1088 static int
1089 vsw_mac_register(vsw_t *vswp)
1090 {
1091 	mac_register_t	*macp;
1092 	int		rv;
1093 
1094 	D1(vswp, "%s: enter", __func__);
1095 
1096 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1097 		return (EINVAL);
1098 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1099 	macp->m_driver = vswp;
1100 	macp->m_dip = vswp->dip;
1101 	macp->m_src_addr = (uint8_t *)&vswp->if_addr;
1102 	macp->m_callbacks = &vsw_m_callbacks;
1103 	macp->m_min_sdu = 0;
1104 	macp->m_max_sdu = vswp->mtu;
1105 	macp->m_margin = VLAN_TAGSZ;
1106 	rv = mac_register(macp, &vswp->if_mh);
1107 	mac_free(macp);
1108 	if (rv != 0) {
1109 		/*
1110 		 * Treat this as a non-fatal error as we may be
1111 		 * able to operate in some other mode.
1112 		 */
1113 		cmn_err(CE_NOTE, "!vsw%d: Unable to register as "
1114 		    "a provider with MAC layer", vswp->instance);
1115 		return (rv);
1116 	}
1117 
1118 	vswp->if_state |= VSW_IF_REG;
1119 
1120 	D1(vswp, "%s: exit", __func__);
1121 
1122 	return (rv);
1123 }
1124 
1125 static int
1126 vsw_mac_unregister(vsw_t *vswp)
1127 {
1128 	int		rv = 0;
1129 
1130 	D1(vswp, "%s: enter", __func__);
1131 
1132 	WRITE_ENTER(&vswp->if_lockrw);
1133 
1134 	if (vswp->if_state & VSW_IF_REG) {
1135 		rv = mac_unregister(vswp->if_mh);
1136 		if (rv != 0) {
1137 			DWARN(vswp, "%s: unable to unregister from MAC "
1138 			    "framework", __func__);
1139 
1140 			RW_EXIT(&vswp->if_lockrw);
1141 			D1(vswp, "%s: fail exit", __func__);
1142 			return (rv);
1143 		}
1144 
1145 		/* mark i/f as down and unregistered */
1146 		vswp->if_state &= ~(VSW_IF_UP | VSW_IF_REG);
1147 	}
1148 	RW_EXIT(&vswp->if_lockrw);
1149 
1150 	D1(vswp, "%s: exit", __func__);
1151 
1152 	return (rv);
1153 }
1154 
1155 static int
1156 vsw_m_stat(void *arg, uint_t stat, uint64_t *val)
1157 {
1158 	vsw_t			*vswp = (vsw_t *)arg;
1159 
1160 	D1(vswp, "%s: enter", __func__);
1161 
1162 	mutex_enter(&vswp->mac_lock);
1163 	if (vswp->mh == NULL) {
1164 		mutex_exit(&vswp->mac_lock);
1165 		return (EINVAL);
1166 	}
1167 
1168 	/* return stats from underlying device */
1169 	*val = mac_stat_get(vswp->mh, stat);
1170 
1171 	mutex_exit(&vswp->mac_lock);
1172 
1173 	return (0);
1174 }
1175 
1176 static void
1177 vsw_m_stop(void *arg)
1178 {
1179 	vsw_t	*vswp = (vsw_t *)arg;
1180 
1181 	D1(vswp, "%s: enter", __func__);
1182 
1183 	WRITE_ENTER(&vswp->if_lockrw);
1184 	vswp->if_state &= ~VSW_IF_UP;
1185 	RW_EXIT(&vswp->if_lockrw);
1186 
1187 	/* Cleanup and close the mac client */
1188 	vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV);
1189 
1190 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1191 }
1192 
1193 static int
1194 vsw_m_start(void *arg)
1195 {
1196 	int		rv;
1197 	vsw_t		*vswp = (vsw_t *)arg;
1198 
1199 	D1(vswp, "%s: enter", __func__);
1200 
1201 	WRITE_ENTER(&vswp->if_lockrw);
1202 
1203 	vswp->if_state |= VSW_IF_UP;
1204 
1205 	if (vswp->switching_setup_done == B_FALSE) {
1206 		/*
1207 		 * If the switching mode has not been setup yet, just
1208 		 * return. The unicast address will be programmed
1209 		 * after the physical device is successfully setup by the
1210 		 * timeout handler.
1211 		 */
1212 		RW_EXIT(&vswp->if_lockrw);
1213 		return (0);
1214 	}
1215 
1216 	/* if in layer2 mode, program unicast address. */
1217 	if (vswp->mh != NULL) {
1218 		/* Init a mac client and program addresses */
1219 		rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV);
1220 		if (rv != 0) {
1221 			cmn_err(CE_NOTE,
1222 			    "!vsw%d: failed to program interface "
1223 			    "unicast address\n", vswp->instance);
1224 		}
1225 	}
1226 
1227 	RW_EXIT(&vswp->if_lockrw);
1228 
1229 	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
1230 	return (0);
1231 }
1232 
1233 /*
1234  * Change the local interface address.
1235  *
1236  * Note: we don't support this entry point. The local
1237  * mac address of the switch can only be changed via its
1238  * MD node properties.
1239  */
1240 static int
1241 vsw_m_unicst(void *arg, const uint8_t *macaddr)
1242 {
1243 	_NOTE(ARGUNUSED(arg, macaddr))
1244 
1245 	return (DDI_FAILURE);
1246 }
1247 
1248 static int
1249 vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
1250 {
1251 	vsw_t		*vswp = (vsw_t *)arg;
1252 	mcst_addr_t	*mcst_p = NULL;
1253 	uint64_t	addr = 0x0;
1254 	int		i, ret = 0;
1255 
1256 	D1(vswp, "%s: enter", __func__);
1257 
1258 	/*
1259 	 * Convert address into form that can be used
1260 	 * as hash table key.
1261 	 */
1262 	for (i = 0; i < ETHERADDRL; i++) {
1263 		addr = (addr << 8) | mca[i];
1264 	}
1265 
1266 	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
1267 
1268 	if (add) {
1269 		D2(vswp, "%s: adding multicast", __func__);
1270 		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1271 			/*
1272 			 * Update the list of multicast addresses
1273 			 * contained within the vsw_t structure to
1274 			 * include this new one.
1275 			 */
1276 			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
1277 			if (mcst_p == NULL) {
1278 				DERR(vswp, "%s unable to alloc mem", __func__);
1279 				(void) vsw_del_mcst(vswp,
1280 				    VSW_LOCALDEV, addr, NULL);
1281 				return (1);
1282 			}
1283 			mcst_p->addr = addr;
1284 			ether_copy(mca, &mcst_p->mca);
1285 
1286 			/*
1287 			 * Call into the underlying driver to program the
1288 			 * address into HW.
1289 			 */
1290 			ret = vsw_mac_multicast_add(vswp, NULL, mcst_p,
1291 			    VSW_LOCALDEV);
1292 			if (ret != 0) {
1293 				(void) vsw_del_mcst(vswp,
1294 				    VSW_LOCALDEV, addr, NULL);
1295 				kmem_free(mcst_p, sizeof (*mcst_p));
1296 				return (ret);
1297 			}
1298 
1299 			mutex_enter(&vswp->mca_lock);
1300 			mcst_p->nextp = vswp->mcap;
1301 			vswp->mcap = mcst_p;
1302 			mutex_exit(&vswp->mca_lock);
1303 		} else {
1304 			cmn_err(CE_WARN, "!vsw%d: unable to add multicast "
1305 			    "address", vswp->instance);
1306 		}
1307 		return (ret);
1308 	}
1309 
1310 	D2(vswp, "%s: removing multicast", __func__);
1311 	/*
1312 	 * Remove the address from the hash table..
1313 	 */
1314 	if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
1315 
1316 		/*
1317 		 * ..and then from the list maintained in the
1318 		 * vsw_t structure.
1319 		 */
1320 		mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr);
1321 		ASSERT(mcst_p != NULL);
1322 
1323 		vsw_mac_multicast_remove(vswp, NULL, mcst_p, VSW_LOCALDEV);
1324 		kmem_free(mcst_p, sizeof (*mcst_p));
1325 	}
1326 
1327 	D1(vswp, "%s: exit", __func__);
1328 
1329 	return (0);
1330 }
1331 
1332 static int
1333 vsw_m_promisc(void *arg, boolean_t on)
1334 {
1335 	vsw_t		*vswp = (vsw_t *)arg;
1336 
1337 	D1(vswp, "%s: enter", __func__);
1338 
1339 	WRITE_ENTER(&vswp->if_lockrw);
1340 	if (on)
1341 		vswp->if_state |= VSW_IF_PROMISC;
1342 	else
1343 		vswp->if_state &= ~VSW_IF_PROMISC;
1344 	RW_EXIT(&vswp->if_lockrw);
1345 
1346 	D1(vswp, "%s: exit", __func__);
1347 
1348 	return (0);
1349 }
1350 
1351 static mblk_t *
1352 vsw_m_tx(void *arg, mblk_t *mp)
1353 {
1354 	vsw_t		*vswp = (vsw_t *)arg;
1355 
1356 	D1(vswp, "%s: enter", __func__);
1357 
1358 	mp = vsw_vlan_frame_pretag(vswp, VSW_LOCALDEV, mp);
1359 
1360 	if (mp == NULL) {
1361 		return (NULL);
1362 	}
1363 
1364 	vswp->vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
1365 
1366 	D1(vswp, "%s: exit", __func__);
1367 
1368 	return (NULL);
1369 }
1370 
1371 /*
1372  * Register for machine description (MD) updates.
1373  *
1374  * Returns 0 on success, 1 on failure.
1375  */
1376 static int
1377 vsw_mdeg_register(vsw_t *vswp)
1378 {
1379 	mdeg_prop_spec_t	*pspecp;
1380 	mdeg_node_spec_t	*inst_specp;
1381 	mdeg_handle_t		mdeg_hdl, mdeg_port_hdl;
1382 	size_t			templatesz;
1383 	int			rv;
1384 
1385 	D1(vswp, "%s: enter", __func__);
1386 
1387 	/*
1388 	 * Allocate and initialize a per-instance copy
1389 	 * of the global property spec array that will
1390 	 * uniquely identify this vsw instance.
1391 	 */
1392 	templatesz = sizeof (vsw_prop_template);
1393 	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
1394 
1395 	bcopy(vsw_prop_template, pspecp, templatesz);
1396 
1397 	VSW_SET_MDEG_PROP_INST(pspecp, vswp->regprop);
1398 
1399 	/* initialize the complete prop spec structure */
1400 	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
1401 	inst_specp->namep = "virtual-device";
1402 	inst_specp->specp = pspecp;
1403 
1404 	D2(vswp, "%s: instance %d registering with mdeg", __func__,
1405 	    vswp->regprop);
1406 	/*
1407 	 * Register an interest in 'virtual-device' nodes with a
1408 	 * 'name' property of 'virtual-network-switch'
1409 	 */
1410 	rv = mdeg_register(inst_specp, &vdev_match, vsw_mdeg_cb,
1411 	    (void *)vswp, &mdeg_hdl);
1412 	if (rv != MDEG_SUCCESS) {
1413 		DERR(vswp, "%s: mdeg_register failed (%d) for vsw node",
1414 		    __func__, rv);
1415 		goto mdeg_reg_fail;
1416 	}
1417 
1418 	/*
1419 	 * Register an interest in 'vsw-port' nodes.
1420 	 */
1421 	rv = mdeg_register(inst_specp, &vport_match, vsw_port_mdeg_cb,
1422 	    (void *)vswp, &mdeg_port_hdl);
1423 	if (rv != MDEG_SUCCESS) {
1424 		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
1425 		(void) mdeg_unregister(mdeg_hdl);
1426 		goto mdeg_reg_fail;
1427 	}
1428 
1429 	/* save off data that will be needed later */
1430 	vswp->inst_spec = inst_specp;
1431 	vswp->mdeg_hdl = mdeg_hdl;
1432 	vswp->mdeg_port_hdl = mdeg_port_hdl;
1433 
1434 	D1(vswp, "%s: exit", __func__);
1435 	return (0);
1436 
1437 mdeg_reg_fail:
1438 	cmn_err(CE_WARN, "!vsw%d: Unable to register MDEG callbacks",
1439 	    vswp->instance);
1440 	kmem_free(pspecp, templatesz);
1441 	kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
1442 
1443 	vswp->mdeg_hdl = NULL;
1444 	vswp->mdeg_port_hdl = NULL;
1445 
1446 	return (1);
1447 }
1448 
1449 static void
1450 vsw_mdeg_unregister(vsw_t *vswp)
1451 {
1452 	D1(vswp, "vsw_mdeg_unregister: enter");
1453 
1454 	if (vswp->mdeg_hdl != NULL)
1455 		(void) mdeg_unregister(vswp->mdeg_hdl);
1456 
1457 	if (vswp->mdeg_port_hdl != NULL)
1458 		(void) mdeg_unregister(vswp->mdeg_port_hdl);
1459 
1460 	if (vswp->inst_spec != NULL) {
1461 		if (vswp->inst_spec->specp != NULL) {
1462 			(void) kmem_free(vswp->inst_spec->specp,
1463 			    sizeof (vsw_prop_template));
1464 			vswp->inst_spec->specp = NULL;
1465 		}
1466 
1467 		(void) kmem_free(vswp->inst_spec, sizeof (mdeg_node_spec_t));
1468 		vswp->inst_spec = NULL;
1469 	}
1470 
1471 	D1(vswp, "vsw_mdeg_unregister: exit");
1472 }
1473 
1474 /*
1475  * Mdeg callback invoked for the vsw node itself.
1476  */
1477 static int
1478 vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1479 {
1480 	vsw_t		*vswp;
1481 	md_t		*mdp;
1482 	mde_cookie_t	node;
1483 	uint64_t	inst;
1484 	char		*node_name = NULL;
1485 
1486 	if (resp == NULL)
1487 		return (MDEG_FAILURE);
1488 
1489 	vswp = (vsw_t *)cb_argp;
1490 
1491 	D1(vswp, "%s: added %d : removed %d : curr matched %d"
1492 	    " : prev matched %d", __func__, resp->added.nelem,
1493 	    resp->removed.nelem, resp->match_curr.nelem,
1494 	    resp->match_prev.nelem);
1495 
1496 	/*
1497 	 * We get an initial callback for this node as 'added'
1498 	 * after registering with mdeg. Note that we would have
1499 	 * already gathered information about this vsw node by
1500 	 * walking MD earlier during attach (in vsw_read_mdprops()).
1501 	 * So, there is a window where the properties of this
1502 	 * node might have changed when we get this initial 'added'
1503 	 * callback. We handle this as if an update occured
1504 	 * and invoke the same function which handles updates to
1505 	 * the properties of this vsw-node if any.
1506 	 *
1507 	 * A non-zero 'match' value indicates that the MD has been
1508 	 * updated and that a virtual-network-switch node is
1509 	 * present which may or may not have been updated. It is
1510 	 * up to the clients to examine their own nodes and
1511 	 * determine if they have changed.
1512 	 */
1513 	if (resp->added.nelem != 0) {
1514 
1515 		if (resp->added.nelem != 1) {
1516 			cmn_err(CE_NOTE, "!vsw%d: number of nodes added "
1517 			    "invalid: %d\n", vswp->instance, resp->added.nelem);
1518 			return (MDEG_FAILURE);
1519 		}
1520 
1521 		mdp = resp->added.mdp;
1522 		node = resp->added.mdep[0];
1523 
1524 	} else if (resp->match_curr.nelem != 0) {
1525 
1526 		if (resp->match_curr.nelem != 1) {
1527 			cmn_err(CE_NOTE, "!vsw%d: number of nodes updated "
1528 			    "invalid: %d\n", vswp->instance,
1529 			    resp->match_curr.nelem);
1530 			return (MDEG_FAILURE);
1531 		}
1532 
1533 		mdp = resp->match_curr.mdp;
1534 		node = resp->match_curr.mdep[0];
1535 
1536 	} else {
1537 		return (MDEG_FAILURE);
1538 	}
1539 
1540 	/* Validate name and instance */
1541 	if (md_get_prop_str(mdp, node, "name", &node_name) != 0) {
1542 		DERR(vswp, "%s: unable to get node name\n",  __func__);
1543 		return (MDEG_FAILURE);
1544 	}
1545 
1546 	/* is this a virtual-network-switch? */
1547 	if (strcmp(node_name, vsw_propname) != 0) {
1548 		DERR(vswp, "%s: Invalid node name: %s\n",
1549 		    __func__, node_name);
1550 		return (MDEG_FAILURE);
1551 	}
1552 
1553 	if (md_get_prop_val(mdp, node, "cfg-handle", &inst)) {
1554 		DERR(vswp, "%s: prop(cfg-handle) not found\n",
1555 		    __func__);
1556 		return (MDEG_FAILURE);
1557 	}
1558 
1559 	/* is this the right instance of vsw? */
1560 	if (inst != vswp->regprop) {
1561 		DERR(vswp, "%s: Invalid cfg-handle: %lx\n",
1562 		    __func__, inst);
1563 		return (MDEG_FAILURE);
1564 	}
1565 
1566 	vsw_update_md_prop(vswp, mdp, node);
1567 
1568 	return (MDEG_SUCCESS);
1569 }
1570 
1571 /*
1572  * Mdeg callback invoked for changes to the vsw-port nodes
1573  * under the vsw node.
1574  */
1575 static int
1576 vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
1577 {
1578 	vsw_t		*vswp;
1579 	int		idx;
1580 	md_t		*mdp;
1581 	mde_cookie_t	node;
1582 	uint64_t	inst;
1583 	int		rv;
1584 
1585 	if ((resp == NULL) || (cb_argp == NULL))
1586 		return (MDEG_FAILURE);
1587 
1588 	vswp = (vsw_t *)cb_argp;
1589 
1590 	D2(vswp, "%s: added %d : removed %d : curr matched %d"
1591 	    " : prev matched %d", __func__, resp->added.nelem,
1592 	    resp->removed.nelem, resp->match_curr.nelem,
1593 	    resp->match_prev.nelem);
1594 
1595 	/* process added ports */
1596 	for (idx = 0; idx < resp->added.nelem; idx++) {
1597 		mdp = resp->added.mdp;
1598 		node = resp->added.mdep[idx];
1599 
1600 		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
1601 
1602 		if ((rv = vsw_port_add(vswp, mdp, &node)) != 0) {
1603 			cmn_err(CE_WARN, "!vsw%d: Unable to add new port "
1604 			    "(0x%lx), err=%d", vswp->instance, node, rv);
1605 		}
1606 	}
1607 
1608 	/* process removed ports */
1609 	for (idx = 0; idx < resp->removed.nelem; idx++) {
1610 		mdp = resp->removed.mdp;
1611 		node = resp->removed.mdep[idx];
1612 
1613 		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
1614 			DERR(vswp, "%s: prop(%s) not found in port(%d)",
1615 			    __func__, id_propname, idx);
1616 			continue;
1617 		}
1618 
1619 		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
1620 
1621 		if (vsw_port_detach(vswp, inst) != 0) {
1622 			cmn_err(CE_WARN, "!vsw%d: Unable to remove port %ld",
1623 			    vswp->instance, inst);
1624 		}
1625 	}
1626 
1627 	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
1628 		(void) vsw_port_update(vswp, resp->match_curr.mdp,
1629 		    resp->match_curr.mdep[idx],
1630 		    resp->match_prev.mdp,
1631 		    resp->match_prev.mdep[idx]);
1632 	}
1633 
1634 	D1(vswp, "%s: exit", __func__);
1635 
1636 	return (MDEG_SUCCESS);
1637 }
1638 
1639 /*
1640  * Scan the machine description for this instance of vsw
1641  * and read its properties. Called only from vsw_attach().
1642  * Returns: 0 on success, 1 on failure.
1643  */
1644 static int
1645 vsw_read_mdprops(vsw_t *vswp)
1646 {
1647 	md_t		*mdp = NULL;
1648 	mde_cookie_t	rootnode;
1649 	mde_cookie_t	*listp = NULL;
1650 	uint64_t	inst;
1651 	uint64_t	cfgh;
1652 	char		*name;
1653 	int		rv = 1;
1654 	int		num_nodes = 0;
1655 	int		num_devs = 0;
1656 	int		listsz = 0;
1657 	int		i;
1658 
1659 	/*
1660 	 * In each 'virtual-device' node in the MD there is a
1661 	 * 'cfg-handle' property which is the MD's concept of
1662 	 * an instance number (this may be completely different from
1663 	 * the device drivers instance #). OBP reads that value and
1664 	 * stores it in the 'reg' property of the appropriate node in
1665 	 * the device tree. We first read this reg property and use this
1666 	 * to compare against the 'cfg-handle' property of vsw nodes
1667 	 * in MD to get to this specific vsw instance and then read
1668 	 * other properties that we are interested in.
1669 	 * We also cache the value of 'reg' property and use it later
1670 	 * to register callbacks with mdeg (see vsw_mdeg_register())
1671 	 */
1672 	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
1673 	    DDI_PROP_DONTPASS, reg_propname, -1);
1674 	if (inst == -1) {
1675 		cmn_err(CE_NOTE, "!vsw%d: Unable to read %s property from "
1676 		    "OBP device tree", vswp->instance, reg_propname);
1677 		return (rv);
1678 	}
1679 
1680 	vswp->regprop = inst;
1681 
1682 	if ((mdp = md_get_handle()) == NULL) {
1683 		DWARN(vswp, "%s: cannot init MD\n", __func__);
1684 		return (rv);
1685 	}
1686 
1687 	num_nodes = md_node_count(mdp);
1688 	ASSERT(num_nodes > 0);
1689 
1690 	listsz = num_nodes * sizeof (mde_cookie_t);
1691 	listp = (mde_cookie_t *)kmem_zalloc(listsz, KM_SLEEP);
1692 
1693 	rootnode = md_root_node(mdp);
1694 
1695 	/* search for all "virtual_device" nodes */
1696 	num_devs = md_scan_dag(mdp, rootnode,
1697 	    md_find_name(mdp, vdev_propname),
1698 	    md_find_name(mdp, "fwd"), listp);
1699 	if (num_devs <= 0) {
1700 		DWARN(vswp, "%s: invalid num_devs:%d\n", __func__, num_devs);
1701 		goto vsw_readmd_exit;
1702 	}
1703 
1704 	/*
1705 	 * Now loop through the list of virtual-devices looking for
1706 	 * devices with name "virtual-network-switch" and for each
1707 	 * such device compare its instance with what we have from
1708 	 * the 'reg' property to find the right node in MD and then
1709 	 * read all its properties.
1710 	 */
1711 	for (i = 0; i < num_devs; i++) {
1712 
1713 		if (md_get_prop_str(mdp, listp[i], "name", &name) != 0) {
1714 			DWARN(vswp, "%s: name property not found\n",
1715 			    __func__);
1716 			goto vsw_readmd_exit;
1717 		}
1718 
1719 		/* is this a virtual-network-switch? */
1720 		if (strcmp(name, vsw_propname) != 0)
1721 			continue;
1722 
1723 		if (md_get_prop_val(mdp, listp[i], "cfg-handle", &cfgh) != 0) {
1724 			DWARN(vswp, "%s: cfg-handle property not found\n",
1725 			    __func__);
1726 			goto vsw_readmd_exit;
1727 		}
1728 
1729 		/* is this the required instance of vsw? */
1730 		if (inst != cfgh)
1731 			continue;
1732 
1733 		/* now read all properties of this vsw instance */
1734 		rv = vsw_get_initial_md_properties(vswp, mdp, listp[i]);
1735 		break;
1736 	}
1737 
1738 vsw_readmd_exit:
1739 
1740 	kmem_free(listp, listsz);
1741 	(void) md_fini_handle(mdp);
1742 	return (rv);
1743 }
1744 
1745 /*
1746  * Read the initial start-of-day values from the specified MD node.
1747  */
1748 static int
1749 vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
1750 {
1751 	uint64_t	macaddr = 0;
1752 
1753 	D1(vswp, "%s: enter", __func__);
1754 
1755 	if (vsw_get_md_physname(vswp, mdp, node, vswp->physname) != 0) {
1756 		return (1);
1757 	}
1758 
1759 	/* mac address for vswitch device itself */
1760 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
1761 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
1762 		    vswp->instance);
1763 		return (1);
1764 	}
1765 
1766 	vsw_save_lmacaddr(vswp, macaddr);
1767 
1768 	if (vsw_get_md_smodes(vswp, mdp, node, &vswp->smode)) {
1769 		DWARN(vswp, "%s: Unable to read %s property from MD, "
1770 		    "defaulting to 'switched' mode",
1771 		    __func__, smode_propname);
1772 
1773 		vswp->smode = VSW_LAYER2;
1774 	}
1775 
1776 	/*
1777 	 * Read the 'linkprop' property to know if this
1778 	 * vsw device wants to get physical link updates.
1779 	 */
1780 	vsw_linkprop_read(vswp, mdp, node, &vswp->pls_update);
1781 
1782 	/* read mtu */
1783 	vsw_mtu_read(vswp, mdp, node, &vswp->mtu);
1784 	if (vswp->mtu < ETHERMTU || vswp->mtu > VNET_MAX_MTU) {
1785 		vswp->mtu = ETHERMTU;
1786 	}
1787 	vswp->max_frame_size = vswp->mtu + sizeof (struct ether_header) +
1788 	    VLAN_TAGSZ;
1789 
1790 	/* read vlan id properties of this vsw instance */
1791 	vsw_vlan_read_ids(vswp, VSW_LOCALDEV, mdp, node, &vswp->pvid,
1792 	    &vswp->vids, &vswp->nvids, &vswp->default_vlan_id);
1793 
1794 	/* read priority-ether-types */
1795 	vsw_read_pri_eth_types(vswp, mdp, node);
1796 
1797 	/* read bandwidth property of this vsw instance */
1798 	vsw_bandwidth_read(vswp, mdp, node, &vswp->bandwidth);
1799 
1800 	D1(vswp, "%s: exit", __func__);
1801 	return (0);
1802 }
1803 
1804 /*
1805  * Read vlan id properties of the given MD node.
1806  * Arguments:
1807  *   arg:          device argument(vsw device or a port)
1808  *   type:         type of arg; VSW_LOCALDEV(vsw device) or VSW_VNETPORT(port)
1809  *   mdp:          machine description
1810  *   node:         md node cookie
1811  *
1812  * Returns:
1813  *   pvidp:        port-vlan-id of the node
1814  *   vidspp:       list of vlan-ids of the node
1815  *   nvidsp:       # of vlan-ids in the list
1816  *   default_idp:  default-vlan-id of the node(if node is vsw device)
1817  */
1818 static void
1819 vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node,
1820 	uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp,
1821 	uint16_t *default_idp)
1822 {
1823 	vsw_t		*vswp;
1824 	vsw_port_t	*portp;
1825 	char		*pvid_propname;
1826 	char		*vid_propname;
1827 	uint_t		nvids = 0;
1828 	uint32_t	vids_size;
1829 	int		rv;
1830 	int		i;
1831 	uint64_t	*data;
1832 	uint64_t	val;
1833 	int		size;
1834 	int		inst;
1835 
1836 	if (type == VSW_LOCALDEV) {
1837 
1838 		vswp = (vsw_t *)arg;
1839 		pvid_propname = vsw_pvid_propname;
1840 		vid_propname = vsw_vid_propname;
1841 		inst = vswp->instance;
1842 
1843 	} else if (type == VSW_VNETPORT) {
1844 
1845 		portp = (vsw_port_t *)arg;
1846 		vswp = portp->p_vswp;
1847 		pvid_propname = port_pvid_propname;
1848 		vid_propname = port_vid_propname;
1849 		inst = portp->p_instance;
1850 
1851 	} else {
1852 		return;
1853 	}
1854 
1855 	if (type == VSW_LOCALDEV && default_idp != NULL) {
1856 		rv = md_get_prop_val(mdp, node, vsw_dvid_propname, &val);
1857 		if (rv != 0) {
1858 			DWARN(vswp, "%s: prop(%s) not found", __func__,
1859 			    vsw_dvid_propname);
1860 
1861 			*default_idp = vsw_default_vlan_id;
1862 		} else {
1863 			*default_idp = val & 0xFFF;
1864 			D2(vswp, "%s: %s(%d): (%d)\n", __func__,
1865 			    vsw_dvid_propname, inst, *default_idp);
1866 		}
1867 	}
1868 
1869 	rv = md_get_prop_val(mdp, node, pvid_propname, &val);
1870 	if (rv != 0) {
1871 		DWARN(vswp, "%s: prop(%s) not found", __func__, pvid_propname);
1872 		*pvidp = vsw_default_vlan_id;
1873 	} else {
1874 
1875 		*pvidp = val & 0xFFF;
1876 		D2(vswp, "%s: %s(%d): (%d)\n", __func__,
1877 		    pvid_propname, inst, *pvidp);
1878 	}
1879 
1880 	rv = md_get_prop_data(mdp, node, vid_propname, (uint8_t **)&data,
1881 	    &size);
1882 	if (rv != 0) {
1883 		D2(vswp, "%s: prop(%s) not found", __func__, vid_propname);
1884 		size = 0;
1885 	} else {
1886 		size /= sizeof (uint64_t);
1887 	}
1888 	nvids = size;
1889 
1890 	if (nvids != 0) {
1891 		D2(vswp, "%s: %s(%d): ", __func__, vid_propname, inst);
1892 		vids_size = sizeof (vsw_vlanid_t) * nvids;
1893 		*vidspp = kmem_zalloc(vids_size, KM_SLEEP);
1894 		for (i = 0; i < nvids; i++) {
1895 			(*vidspp)[i].vl_vid = data[i] & 0xFFFF;
1896 			(*vidspp)[i].vl_set = B_FALSE;
1897 			D2(vswp, " %d ", (*vidspp)[i].vl_vid);
1898 		}
1899 		D2(vswp, "\n");
1900 	}
1901 
1902 	*nvidsp = nvids;
1903 }
1904 
1905 static void
1906 vsw_port_read_bandwidth(vsw_port_t *portp, md_t *mdp, mde_cookie_t node,
1907     uint64_t *bw)
1908 {
1909 	int		rv;
1910 	uint64_t	val;
1911 	vsw_t		*vswp;
1912 
1913 	vswp = portp->p_vswp;
1914 
1915 	rv = md_get_prop_val(mdp, node, port_maxbw_propname, &val);
1916 
1917 	if (rv != 0) {
1918 		*bw = 0;
1919 		D3(vswp, "%s: prop(%s) not found\n", __func__,
1920 		    port_maxbw_propname);
1921 	} else {
1922 		*bw = val;
1923 		D3(vswp, "%s: %s nodes found", __func__, port_maxbw_propname);
1924 	}
1925 }
1926 
1927 /*
1928  * This function reads "priority-ether-types" property from md. This property
1929  * is used to enable support for priority frames. Applications which need
1930  * guaranteed and timely delivery of certain high priority frames to/from
1931  * a vnet or vsw within ldoms, should configure this property by providing
1932  * the ether type(s) for which the priority facility is needed.
1933  * Normal data frames are delivered over a ldc channel using the descriptor
1934  * ring mechanism which is constrained by factors such as descriptor ring size,
1935  * the rate at which the ring is processed at the peer ldc end point, etc.
1936  * The priority mechanism provides an Out-Of-Band path to send/receive frames
1937  * as raw pkt data (VIO_PKT_DATA) messages over the channel, avoiding the
1938  * descriptor ring path and enables a more reliable and timely delivery of
1939  * frames to the peer.
1940  */
1941 static void
1942 vsw_read_pri_eth_types(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
1943 {
1944 	int		rv;
1945 	uint16_t	*types;
1946 	uint64_t	*data;
1947 	int		size;
1948 	int		i;
1949 	size_t		mblk_sz;
1950 
1951 	rv = md_get_prop_data(mdp, node, pri_types_propname,
1952 	    (uint8_t **)&data, &size);
1953 	if (rv != 0) {
1954 		/*
1955 		 * Property may not exist if we are running pre-ldoms1.1 f/w.
1956 		 * Check if 'vsw_pri_eth_type' has been set in that case.
1957 		 */
1958 		if (vsw_pri_eth_type != 0) {
1959 			size = sizeof (vsw_pri_eth_type);
1960 			data = &vsw_pri_eth_type;
1961 		} else {
1962 			D3(vswp, "%s: prop(%s) not found", __func__,
1963 			    pri_types_propname);
1964 			size = 0;
1965 		}
1966 	}
1967 
1968 	if (size == 0) {
1969 		vswp->pri_num_types = 0;
1970 		return;
1971 	}
1972 
1973 	/*
1974 	 * we have some priority-ether-types defined;
1975 	 * allocate a table of these types and also
1976 	 * allocate a pool of mblks to transmit these
1977 	 * priority packets.
1978 	 */
1979 	size /= sizeof (uint64_t);
1980 	vswp->pri_num_types = size;
1981 	vswp->pri_types = kmem_zalloc(size * sizeof (uint16_t), KM_SLEEP);
1982 	for (i = 0, types = vswp->pri_types; i < size; i++) {
1983 		types[i] = data[i] & 0xFFFF;
1984 	}
1985 	mblk_sz = (VIO_PKT_DATA_HDRSIZE + ETHERMAX + 7) & ~7;
1986 	(void) vio_create_mblks(vsw_pri_tx_nmblks, mblk_sz, NULL,
1987 	    &vswp->pri_tx_vmp);
1988 }
1989 
1990 static void
1991 vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint32_t *mtu)
1992 {
1993 	int		rv;
1994 	int		inst;
1995 	uint64_t	val;
1996 	char		*mtu_propname;
1997 
1998 	mtu_propname = vsw_mtu_propname;
1999 	inst = vswp->instance;
2000 
2001 	rv = md_get_prop_val(mdp, node, mtu_propname, &val);
2002 	if (rv != 0) {
2003 		D3(vswp, "%s: prop(%s) not found", __func__, mtu_propname);
2004 		*mtu = vsw_ethermtu;
2005 	} else {
2006 
2007 		*mtu = val & 0xFFFF;
2008 		D2(vswp, "%s: %s(%d): (%d)\n", __func__,
2009 		    mtu_propname, inst, *mtu);
2010 	}
2011 }
2012 
2013 /*
2014  * Update the mtu of the vsw device. We first check if the device has been
2015  * plumbed and if so fail the mtu update. Otherwise, we continue to update the
2016  * new mtu and reset all ports to initiate handshake re-negotiation with peers
2017  * using the new mtu.
2018  */
2019 static int
2020 vsw_mtu_update(vsw_t *vswp, uint32_t mtu)
2021 {
2022 	int	rv;
2023 
2024 	WRITE_ENTER(&vswp->if_lockrw);
2025 
2026 	if (vswp->if_state & VSW_IF_UP) {
2027 
2028 		RW_EXIT(&vswp->if_lockrw);
2029 
2030 		cmn_err(CE_NOTE, "!vsw%d: Unable to process mtu update"
2031 		    " as the device is plumbed\n", vswp->instance);
2032 		return (EBUSY);
2033 
2034 	} else {
2035 
2036 		D2(vswp, "%s: curr_mtu(%d) new_mtu(%d)\n",
2037 		    __func__, vswp->mtu, mtu);
2038 
2039 		vswp->mtu = mtu;
2040 		vswp->max_frame_size = vswp->mtu +
2041 		    sizeof (struct ether_header) + VLAN_TAGSZ;
2042 
2043 		rv = mac_maxsdu_update(vswp->if_mh, mtu);
2044 		if (rv != 0) {
2045 			cmn_err(CE_NOTE,
2046 			    "!vsw%d: Unable to update mtu with mac"
2047 			    " layer\n", vswp->instance);
2048 		}
2049 
2050 		RW_EXIT(&vswp->if_lockrw);
2051 
2052 		/* Reset ports to renegotiate with the new mtu */
2053 		vsw_reset_ports(vswp);
2054 
2055 	}
2056 
2057 	return (0);
2058 }
2059 
2060 static void
2061 vsw_linkprop_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node,
2062 	boolean_t *pls)
2063 {
2064 	int		rv;
2065 	uint64_t	val;
2066 	char		*linkpropname;
2067 
2068 	linkpropname = vsw_linkprop_propname;
2069 
2070 	rv = md_get_prop_val(mdp, node, linkpropname, &val);
2071 	if (rv != 0) {
2072 		D3(vswp, "%s: prop(%s) not found", __func__, linkpropname);
2073 		*pls = B_FALSE;
2074 	} else {
2075 
2076 		*pls = (val & 0x1) ? B_TRUE : B_FALSE;
2077 		D2(vswp, "%s: %s(%d): (%d)\n", __func__, linkpropname,
2078 		    vswp->instance, *pls);
2079 	}
2080 }
2081 
2082 void
2083 vsw_mac_link_update(vsw_t *vswp, link_state_t link_state)
2084 {
2085 	READ_ENTER(&vswp->if_lockrw);
2086 
2087 	if (vswp->if_state & VSW_IF_REG) {
2088 		mac_link_update(vswp->if_mh, link_state);
2089 	}
2090 
2091 	RW_EXIT(&vswp->if_lockrw);
2092 }
2093 
2094 void
2095 vsw_physlink_state_update(vsw_t *vswp)
2096 {
2097 	if (vswp->pls_update == B_TRUE) {
2098 		vsw_mac_link_update(vswp, vswp->phys_link_state);
2099 	}
2100 	vsw_physlink_update_ports(vswp);
2101 }
2102 
2103 static void
2104 vsw_bandwidth_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint64_t *bw)
2105 {
2106 	/* read the vsw bandwidth from md */
2107 	int		rv;
2108 	uint64_t	val;
2109 
2110 	rv = md_get_prop_val(mdp, node, vsw_maxbw_propname, &val);
2111 	if (rv != 0) {
2112 		*bw = 0;
2113 		D3(vswp, "%s: prop(%s) not found", __func__,
2114 		    vsw_maxbw_propname);
2115 	} else {
2116 		*bw = val;
2117 		D3(vswp, "%s: %s(%d): (%ld)\n", __func__,
2118 		    vsw_maxbw_propname, vswp->instance, *bw);
2119 	}
2120 }
2121 
2122 /*
2123  * Check to see if the relevant properties in the specified node have
2124  * changed, and if so take the appropriate action.
2125  *
2126  * If any of the properties are missing or invalid we don't take
2127  * any action, as this function should only be invoked when modifications
2128  * have been made to what we assume is a working configuration, which
2129  * we leave active.
2130  *
2131  * Note it is legal for this routine to be invoked even if none of the
2132  * properties in the port node within the MD have actually changed.
2133  */
2134 static void
2135 vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node)
2136 {
2137 	char		physname[LIFNAMSIZ];
2138 	char		drv[LIFNAMSIZ];
2139 	uint_t		ddi_instance;
2140 	uint8_t		new_smode;
2141 	int		i;
2142 	uint64_t 	macaddr = 0;
2143 	enum		{MD_init = 0x1,
2144 				MD_physname = 0x2,
2145 				MD_macaddr = 0x4,
2146 				MD_smode = 0x8,
2147 				MD_vlans = 0x10,
2148 				MD_mtu = 0x20,
2149 				MD_pls = 0x40,
2150 				MD_bw = 0x80} updated;
2151 	int		rv;
2152 	uint16_t	pvid;
2153 	vsw_vlanid_t	*vids;
2154 	uint16_t	nvids;
2155 	uint32_t	mtu;
2156 	boolean_t	pls_update;
2157 	uint64_t	maxbw;
2158 
2159 	updated = MD_init;
2160 
2161 	D1(vswp, "%s: enter", __func__);
2162 
2163 	/*
2164 	 * Check if name of physical device in MD has changed.
2165 	 */
2166 	if (vsw_get_md_physname(vswp, mdp, node, (char *)&physname) == 0) {
2167 		/*
2168 		 * Do basic sanity check on new device name/instance,
2169 		 * if its non NULL. It is valid for the device name to
2170 		 * have changed from a non NULL to a NULL value, i.e.
2171 		 * the vsw is being changed to 'routed' mode.
2172 		 */
2173 		if ((strlen(physname) != 0) &&
2174 		    (ddi_parse(physname, drv,
2175 		    &ddi_instance) != DDI_SUCCESS)) {
2176 			cmn_err(CE_WARN, "!vsw%d: physical device %s is not"
2177 			    " a valid device name/instance",
2178 			    vswp->instance, physname);
2179 			goto fail_reconf;
2180 		}
2181 
2182 		if (strcmp(physname, vswp->physname)) {
2183 			D2(vswp, "%s: device name changed from %s to %s",
2184 			    __func__, vswp->physname, physname);
2185 
2186 			updated |= MD_physname;
2187 		} else {
2188 			D2(vswp, "%s: device name unchanged at %s",
2189 			    __func__, vswp->physname);
2190 		}
2191 	} else {
2192 		cmn_err(CE_WARN, "!vsw%d: Unable to read name of physical "
2193 		    "device from updated MD.", vswp->instance);
2194 		goto fail_reconf;
2195 	}
2196 
2197 	/*
2198 	 * Check if MAC address has changed.
2199 	 */
2200 	if (md_get_prop_val(mdp, node, macaddr_propname, &macaddr) != 0) {
2201 		cmn_err(CE_WARN, "!vsw%d: Unable to get MAC address from MD",
2202 		    vswp->instance);
2203 		goto fail_reconf;
2204 	} else {
2205 		uint64_t maddr = macaddr;
2206 		READ_ENTER(&vswp->if_lockrw);
2207 		for (i = ETHERADDRL - 1; i >= 0; i--) {
2208 			if (vswp->if_addr.ether_addr_octet[i]
2209 			    != (macaddr & 0xFF)) {
2210 				D2(vswp, "%s: octet[%d] 0x%x != 0x%x",
2211 				    __func__, i,
2212 				    vswp->if_addr.ether_addr_octet[i],
2213 				    (macaddr & 0xFF));
2214 				updated |= MD_macaddr;
2215 				macaddr = maddr;
2216 				break;
2217 			}
2218 			macaddr >>= 8;
2219 		}
2220 		RW_EXIT(&vswp->if_lockrw);
2221 		if (updated & MD_macaddr) {
2222 			vsw_save_lmacaddr(vswp, macaddr);
2223 		}
2224 	}
2225 
2226 	/*
2227 	 * Check if switching modes have changed.
2228 	 */
2229 	if (vsw_get_md_smodes(vswp, mdp, node, &new_smode)) {
2230 		cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD",
2231 		    vswp->instance, smode_propname);
2232 		goto fail_reconf;
2233 	} else {
2234 		if (new_smode != vswp->smode) {
2235 			D2(vswp, "%s: switching mode changed from %d to %d",
2236 			    __func__, vswp->smode, new_smode);
2237 
2238 			updated |= MD_smode;
2239 		}
2240 	}
2241 
2242 	/* Read the vlan ids */
2243 	vsw_vlan_read_ids(vswp, VSW_LOCALDEV, mdp, node, &pvid, &vids,
2244 	    &nvids, NULL);
2245 
2246 	/* Determine if there are any vlan id updates */
2247 	if ((pvid != vswp->pvid) ||		/* pvid changed? */
2248 	    (nvids != vswp->nvids) ||		/* # of vids changed? */
2249 	    ((nvids != 0) && (vswp->nvids != 0) &&	/* vids changed? */
2250 	    !vsw_cmp_vids(vids, vswp->vids, nvids))) {
2251 		updated |= MD_vlans;
2252 	}
2253 
2254 	/* Read mtu */
2255 	vsw_mtu_read(vswp, mdp, node, &mtu);
2256 	if (mtu != vswp->mtu) {
2257 		if (mtu >= ETHERMTU && mtu <= VNET_MAX_MTU) {
2258 			updated |= MD_mtu;
2259 		} else {
2260 			cmn_err(CE_NOTE, "!vsw%d: Unable to process mtu update"
2261 			    " as the specified value:%d is invalid\n",
2262 			    vswp->instance, mtu);
2263 		}
2264 	}
2265 
2266 	/*
2267 	 * Read the 'linkprop' property.
2268 	 */
2269 	vsw_linkprop_read(vswp, mdp, node, &pls_update);
2270 	if (pls_update != vswp->pls_update) {
2271 		updated |= MD_pls;
2272 	}
2273 
2274 	/* Read bandwidth */
2275 	vsw_bandwidth_read(vswp, mdp, node, &maxbw);
2276 	if (maxbw != vswp->bandwidth) {
2277 		if (maxbw >= MRP_MAXBW_MINVAL || maxbw == 0) {
2278 			updated |= MD_bw;
2279 		} else {
2280 			cmn_err(CE_NOTE, "!vsw%d: Unable to process bandwidth"
2281 			    " update as the specified value:%ld is invalid\n",
2282 			    vswp->instance, maxbw);
2283 		}
2284 	}
2285 
2286 	/*
2287 	 * Now make any changes which are needed...
2288 	 */
2289 	if (updated & MD_pls) {
2290 
2291 		/* save the updated property. */
2292 		vswp->pls_update = pls_update;
2293 
2294 		if (pls_update == B_FALSE) {
2295 			/*
2296 			 * Phys link state update is now disabled for this vsw
2297 			 * interface. If we had previously reported a link-down
2298 			 * to the stack, undo that by sending a link-up.
2299 			 */
2300 			if (vswp->phys_link_state == LINK_STATE_DOWN) {
2301 				vsw_mac_link_update(vswp, LINK_STATE_UP);
2302 			}
2303 		} else {
2304 			/*
2305 			 * Phys link state update is now enabled. Send up an
2306 			 * update based on the current phys link state.
2307 			 */
2308 			if (vswp->smode & VSW_LAYER2) {
2309 				vsw_mac_link_update(vswp,
2310 				    vswp->phys_link_state);
2311 			}
2312 		}
2313 
2314 	}
2315 
2316 	if (updated & (MD_physname | MD_smode | MD_mtu)) {
2317 
2318 		/*
2319 		 * Stop any pending thread to setup switching mode.
2320 		 */
2321 		vsw_setup_switching_stop(vswp);
2322 
2323 		/* Cleanup HybridIO */
2324 		vsw_hio_cleanup(vswp);
2325 
2326 		/*
2327 		 * Remove unicst, mcst addrs of vsw interface
2328 		 * and ports from the physdev. This also closes
2329 		 * the corresponding mac clients.
2330 		 */
2331 		vsw_unset_addrs(vswp);
2332 
2333 		/*
2334 		 * Stop, detach and close the old device..
2335 		 */
2336 		mutex_enter(&vswp->mac_lock);
2337 		vsw_mac_close(vswp);
2338 		mutex_exit(&vswp->mac_lock);
2339 
2340 		/*
2341 		 * Update phys name.
2342 		 */
2343 		if (updated & MD_physname) {
2344 			cmn_err(CE_NOTE, "!vsw%d: changing from %s to %s",
2345 			    vswp->instance, vswp->physname, physname);
2346 			(void) strncpy(vswp->physname,
2347 			    physname, strlen(physname) + 1);
2348 		}
2349 
2350 		/*
2351 		 * Update array with the new switch mode values.
2352 		 */
2353 		if (updated & MD_smode) {
2354 			vswp->smode = new_smode;
2355 		}
2356 
2357 		/* Update mtu */
2358 		if (updated & MD_mtu) {
2359 			rv = vsw_mtu_update(vswp, mtu);
2360 			if (rv != 0) {
2361 				goto fail_update;
2362 			}
2363 		}
2364 
2365 		/*
2366 		 * ..and attach, start the new device.
2367 		 */
2368 		rv = vsw_setup_switching(vswp);
2369 		if (rv == EAGAIN) {
2370 			/*
2371 			 * Unable to setup switching mode.
2372 			 * As the error is EAGAIN, schedule a thread to retry
2373 			 * and return. Programming addresses of ports and
2374 			 * vsw interface will be done by the thread when the
2375 			 * switching setup completes successfully.
2376 			 */
2377 			if (vsw_setup_switching_start(vswp) != 0) {
2378 				goto fail_update;
2379 			}
2380 			return;
2381 
2382 		} else if (rv) {
2383 			goto fail_update;
2384 		}
2385 
2386 		vsw_setup_switching_post_process(vswp);
2387 	} else if (updated & MD_macaddr) {
2388 		/*
2389 		 * We enter here if only MD_macaddr is exclusively updated.
2390 		 * If MD_physname and/or MD_smode are also updated, then
2391 		 * as part of that, we would have implicitly processed
2392 		 * MD_macaddr update (above).
2393 		 */
2394 		cmn_err(CE_NOTE, "!vsw%d: changing mac address to 0x%lx",
2395 		    vswp->instance, macaddr);
2396 
2397 		READ_ENTER(&vswp->if_lockrw);
2398 		if (vswp->if_state & VSW_IF_UP) {
2399 			/* reconfigure with new address */
2400 			vsw_if_mac_reconfig(vswp, B_FALSE, 0, NULL, 0);
2401 
2402 			/*
2403 			 * Notify the MAC layer of the changed address.
2404 			 */
2405 			mac_unicst_update(vswp->if_mh,
2406 			    (uint8_t *)&vswp->if_addr);
2407 
2408 		}
2409 		RW_EXIT(&vswp->if_lockrw);
2410 
2411 	}
2412 
2413 	if (updated & MD_vlans) {
2414 		/* Remove existing vlan ids from the hash table. */
2415 		vsw_vlan_remove_ids(vswp, VSW_LOCALDEV);
2416 
2417 		if (vswp->if_state & VSW_IF_UP) {
2418 			vsw_if_mac_reconfig(vswp, B_TRUE, pvid, vids, nvids);
2419 		} else {
2420 			if (vswp->nvids != 0) {
2421 				kmem_free(vswp->vids,
2422 				    sizeof (vsw_vlanid_t) * vswp->nvids);
2423 			}
2424 			vswp->vids = vids;
2425 			vswp->nvids = nvids;
2426 			vswp->pvid = pvid;
2427 		}
2428 
2429 		/* add these new vlan ids into hash table */
2430 		vsw_vlan_add_ids(vswp, VSW_LOCALDEV);
2431 	} else {
2432 		if (nvids != 0) {
2433 			kmem_free(vids, sizeof (vsw_vlanid_t) * nvids);
2434 		}
2435 	}
2436 
2437 	if (updated & MD_bw) {
2438 		vsw_update_bandwidth(vswp, NULL, VSW_LOCALDEV, maxbw);
2439 	}
2440 
2441 	return;
2442 
2443 fail_reconf:
2444 	cmn_err(CE_WARN, "!vsw%d: configuration unchanged", vswp->instance);
2445 	return;
2446 
2447 fail_update:
2448 	cmn_err(CE_WARN, "!vsw%d: re-configuration failed",
2449 	    vswp->instance);
2450 }
2451 
2452 /*
2453  * Read the port's md properties.
2454  */
2455 static int
2456 vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp,
2457 	md_t *mdp, mde_cookie_t *node)
2458 {
2459 	uint64_t		ldc_id;
2460 	uint8_t			*addrp;
2461 	int			i, addrsz;
2462 	int			num_nodes = 0, nchan = 0;
2463 	int			listsz = 0;
2464 	mde_cookie_t		*listp = NULL;
2465 	struct ether_addr	ea;
2466 	uint64_t		macaddr;
2467 	uint64_t		inst = 0;
2468 	uint64_t		val;
2469 
2470 	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
2471 		DWARN(vswp, "%s: prop(%s) not found", __func__,
2472 		    id_propname);
2473 		return (1);
2474 	}
2475 
2476 	/*
2477 	 * Find the channel endpoint node(s) (which should be under this
2478 	 * port node) which contain the channel id(s).
2479 	 */
2480 	if ((num_nodes = md_node_count(mdp)) <= 0) {
2481 		DERR(vswp, "%s: invalid number of nodes found (%d)",
2482 		    __func__, num_nodes);
2483 		return (1);
2484 	}
2485 
2486 	D2(vswp, "%s: %d nodes found", __func__, num_nodes);
2487 
2488 	/* allocate enough space for node list */
2489 	listsz = num_nodes * sizeof (mde_cookie_t);
2490 	listp = kmem_zalloc(listsz, KM_SLEEP);
2491 
2492 	nchan = md_scan_dag(mdp, *node, md_find_name(mdp, chan_propname),
2493 	    md_find_name(mdp, "fwd"), listp);
2494 
2495 	if (nchan <= 0) {
2496 		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
2497 		kmem_free(listp, listsz);
2498 		return (1);
2499 	}
2500 
2501 	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
2502 
2503 	/* use property from first node found */
2504 	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
2505 		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
2506 		    id_propname);
2507 		kmem_free(listp, listsz);
2508 		return (1);
2509 	}
2510 
2511 	/* don't need list any more */
2512 	kmem_free(listp, listsz);
2513 
2514 	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
2515 
2516 	/* read mac-address property */
2517 	if (md_get_prop_data(mdp, *node, remaddr_propname,
2518 	    &addrp, &addrsz)) {
2519 		DWARN(vswp, "%s: prop(%s) not found",
2520 		    __func__, remaddr_propname);
2521 		return (1);
2522 	}
2523 
2524 	if (addrsz < ETHERADDRL) {
2525 		DWARN(vswp, "%s: invalid address size", __func__);
2526 		return (1);
2527 	}
2528 
2529 	macaddr = *((uint64_t *)addrp);
2530 	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
2531 
2532 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2533 		ea.ether_addr_octet[i] = macaddr & 0xFF;
2534 		macaddr >>= 8;
2535 	}
2536 
2537 	/* now update all properties into the port */
2538 	portp->p_vswp = vswp;
2539 	portp->p_instance = inst;
2540 	portp->addr_set = B_FALSE;
2541 	ether_copy(&ea, &portp->p_macaddr);
2542 	if (nchan > VSW_PORT_MAX_LDCS) {
2543 		D2(vswp, "%s: using first of %d ldc ids",
2544 		    __func__, nchan);
2545 		nchan = VSW_PORT_MAX_LDCS;
2546 	}
2547 	portp->num_ldcs = nchan;
2548 	portp->ldc_ids =
2549 	    kmem_zalloc(sizeof (uint64_t) * nchan, KM_SLEEP);
2550 	bcopy(&ldc_id, (portp->ldc_ids), sizeof (uint64_t) * nchan);
2551 
2552 	/* read vlan id properties of this port node */
2553 	vsw_vlan_read_ids(portp, VSW_VNETPORT, mdp, *node, &portp->pvid,
2554 	    &portp->vids, &portp->nvids, NULL);
2555 
2556 	/* Check if hybrid property is present */
2557 	if (md_get_prop_val(mdp, *node, hybrid_propname, &val) == 0) {
2558 		D1(vswp, "%s: prop(%s) found\n", __func__, hybrid_propname);
2559 		portp->p_hio_enabled = B_TRUE;
2560 	} else {
2561 		portp->p_hio_enabled = B_FALSE;
2562 	}
2563 	/*
2564 	 * Port hio capability determined after version
2565 	 * negotiation, i.e., when we know the peer is HybridIO capable.
2566 	 */
2567 	portp->p_hio_capable = B_FALSE;
2568 
2569 	/* Read bandwidth of this port */
2570 	vsw_port_read_bandwidth(portp, mdp, *node, &portp->p_bandwidth);
2571 
2572 	return (0);
2573 }
2574 
2575 /*
2576  * Add a new port to the system.
2577  *
2578  * Returns 0 on success, 1 on failure.
2579  */
2580 int
2581 vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
2582 {
2583 	vsw_port_t	*portp;
2584 	int		rv;
2585 
2586 	portp = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
2587 
2588 	rv = vsw_port_read_props(portp, vswp, mdp, node);
2589 	if (rv != 0) {
2590 		kmem_free(portp, sizeof (*portp));
2591 		return (1);
2592 	}
2593 
2594 	rv = vsw_port_attach(portp);
2595 	if (rv != 0) {
2596 		DERR(vswp, "%s: failed to attach port", __func__);
2597 		return (1);
2598 	}
2599 
2600 	return (0);
2601 }
2602 
2603 static int
2604 vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex,
2605 	md_t *prev_mdp, mde_cookie_t prev_mdex)
2606 {
2607 	uint64_t	cport_num;
2608 	uint64_t	pport_num;
2609 	vsw_port_list_t	*plistp;
2610 	vsw_port_t	*portp;
2611 	uint16_t	pvid;
2612 	vsw_vlanid_t	*vids;
2613 	uint16_t	nvids;
2614 	uint64_t	val;
2615 	boolean_t	hio_enabled = B_FALSE;
2616 	uint64_t	maxbw;
2617 	enum		{P_MD_init = 0x1,
2618 				P_MD_vlans = 0x2,
2619 				P_MD_hio = 0x4,
2620 				P_MD_maxbw = 0x8} updated;
2621 
2622 	updated = P_MD_init;
2623 
2624 	/*
2625 	 * For now, we get port updates only if vlan ids changed.
2626 	 * We read the port num and do some sanity check.
2627 	 */
2628 	if (md_get_prop_val(curr_mdp, curr_mdex, id_propname, &cport_num)) {
2629 		return (1);
2630 	}
2631 
2632 	if (md_get_prop_val(prev_mdp, prev_mdex, id_propname, &pport_num)) {
2633 		return (1);
2634 	}
2635 	if (cport_num != pport_num)
2636 		return (1);
2637 
2638 	plistp = &(vswp->plist);
2639 
2640 	READ_ENTER(&plistp->lockrw);
2641 
2642 	portp = vsw_lookup_port(vswp, cport_num);
2643 	if (portp == NULL) {
2644 		RW_EXIT(&plistp->lockrw);
2645 		return (1);
2646 	}
2647 
2648 	/* Read the vlan ids */
2649 	vsw_vlan_read_ids(portp, VSW_VNETPORT, curr_mdp, curr_mdex, &pvid,
2650 	    &vids, &nvids, NULL);
2651 
2652 	/* Determine if there are any vlan id updates */
2653 	if ((pvid != portp->pvid) ||		/* pvid changed? */
2654 	    (nvids != portp->nvids) ||		/* # of vids changed? */
2655 	    ((nvids != 0) && (portp->nvids != 0) &&	/* vids changed? */
2656 	    !vsw_cmp_vids(vids, portp->vids, nvids))) {
2657 		updated |= P_MD_vlans;
2658 	}
2659 
2660 	/* Check if hybrid property is present */
2661 	if (md_get_prop_val(curr_mdp, curr_mdex, hybrid_propname, &val) == 0) {
2662 		D1(vswp, "%s: prop(%s) found\n", __func__, hybrid_propname);
2663 		hio_enabled = B_TRUE;
2664 	}
2665 
2666 	if (portp->p_hio_enabled != hio_enabled) {
2667 		updated |= P_MD_hio;
2668 	}
2669 
2670 	/* Check if maxbw property is present */
2671 	vsw_port_read_bandwidth(portp, curr_mdp, curr_mdex, &maxbw);
2672 	if (maxbw != portp->p_bandwidth) {
2673 		if (maxbw >= MRP_MAXBW_MINVAL || maxbw == 0) {
2674 			updated |= P_MD_maxbw;
2675 		} else {
2676 			cmn_err(CE_NOTE, "!vsw%d: Unable to process bandwidth"
2677 			    " update for port %d as the specified value:%ld"
2678 			    " is invalid\n",
2679 			    vswp->instance, portp->p_instance, maxbw);
2680 		}
2681 	}
2682 
2683 	if (updated & P_MD_vlans) {
2684 		/* Remove existing vlan ids from the hash table. */
2685 		vsw_vlan_remove_ids(portp, VSW_VNETPORT);
2686 
2687 		/* Reconfigure vlans with network device */
2688 		vsw_mac_port_reconfig_vlans(portp, pvid, vids, nvids);
2689 
2690 		/* add these new vlan ids into hash table */
2691 		vsw_vlan_add_ids(portp, VSW_VNETPORT);
2692 
2693 		/* reset the port if it is vlan unaware (ver < 1.3) */
2694 		vsw_vlan_unaware_port_reset(portp);
2695 	}
2696 
2697 	if (updated & P_MD_hio) {
2698 		vsw_hio_port_update(portp, hio_enabled);
2699 	}
2700 
2701 	if (updated & P_MD_maxbw) {
2702 		vsw_update_bandwidth(NULL, portp, VSW_VNETPORT, maxbw);
2703 	}
2704 
2705 	RW_EXIT(&plistp->lockrw);
2706 
2707 	return (0);
2708 }
2709 
2710 /*
2711  * vsw_mac_rx -- A common function to send packets to the interface.
2712  * By default this function check if the interface is UP or not, the
2713  * rest of the behaviour depends on the flags as below:
2714  *
2715  *	VSW_MACRX_PROMISC -- Check if the promisc mode set or not.
2716  *	VSW_MACRX_COPYMSG -- Make a copy of the message(s).
2717  *	VSW_MACRX_FREEMSG -- Free if the messages cannot be sent up the stack.
2718  */
2719 void
2720 vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh,
2721     mblk_t *mp, vsw_macrx_flags_t flags)
2722 {
2723 	mblk_t		*mpt;
2724 
2725 	D1(vswp, "%s:enter\n", __func__);
2726 	READ_ENTER(&vswp->if_lockrw);
2727 	/* Check if the interface is up */
2728 	if (!(vswp->if_state & VSW_IF_UP)) {
2729 		RW_EXIT(&vswp->if_lockrw);
2730 		/* Free messages only if FREEMSG flag specified */
2731 		if (flags & VSW_MACRX_FREEMSG) {
2732 			freemsgchain(mp);
2733 		}
2734 		D1(vswp, "%s:exit\n", __func__);
2735 		return;
2736 	}
2737 	/*
2738 	 * If PROMISC flag is passed, then check if
2739 	 * the interface is in the PROMISC mode.
2740 	 * If not, drop the messages.
2741 	 */
2742 	if (flags & VSW_MACRX_PROMISC) {
2743 		if (!(vswp->if_state & VSW_IF_PROMISC)) {
2744 			RW_EXIT(&vswp->if_lockrw);
2745 			/* Free messages only if FREEMSG flag specified */
2746 			if (flags & VSW_MACRX_FREEMSG) {
2747 				freemsgchain(mp);
2748 			}
2749 			D1(vswp, "%s:exit\n", __func__);
2750 			return;
2751 		}
2752 	}
2753 	RW_EXIT(&vswp->if_lockrw);
2754 	/*
2755 	 * If COPYMSG flag is passed, then make a copy
2756 	 * of the message chain and send up the copy.
2757 	 */
2758 	if (flags & VSW_MACRX_COPYMSG) {
2759 		mp = copymsgchain(mp);
2760 		if (mp == NULL) {
2761 			D1(vswp, "%s:exit\n", __func__);
2762 			return;
2763 		}
2764 	}
2765 
2766 	D2(vswp, "%s: sending up stack", __func__);
2767 
2768 	mpt = NULL;
2769 	(void) vsw_vlan_frame_untag(vswp, VSW_LOCALDEV, &mp, &mpt);
2770 	if (mp != NULL) {
2771 		mac_rx(vswp->if_mh, mrh, mp);
2772 	}
2773 	D1(vswp, "%s:exit\n", __func__);
2774 }
2775 
2776 /* copy mac address of vsw into soft state structure */
2777 static void
2778 vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr)
2779 {
2780 	int	i;
2781 
2782 	WRITE_ENTER(&vswp->if_lockrw);
2783 	for (i = ETHERADDRL - 1; i >= 0; i--) {
2784 		vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
2785 		macaddr >>= 8;
2786 	}
2787 	RW_EXIT(&vswp->if_lockrw);
2788 }
2789 
2790 /* Compare VLAN ids, array size expected to be same. */
2791 static boolean_t
2792 vsw_cmp_vids(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids)
2793 {
2794 	int i, j;
2795 	uint16_t vid;
2796 
2797 	for (i = 0; i < nvids; i++) {
2798 		vid = vids1[i].vl_vid;
2799 		for (j = 0; j < nvids; j++) {
2800 			if (vid == vids2[i].vl_vid)
2801 				break;
2802 		}
2803 		if (j == nvids) {
2804 			return (B_FALSE);
2805 		}
2806 	}
2807 	return (B_TRUE);
2808 }
2809