xref: /illumos-gate/usr/src/uts/sun4v/io/vnet.c (revision ed093b41a93e8563e6e1e5dae0768dda2a7bcc27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2018 Joyent, Inc.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/errno.h>
30 #include <sys/param.h>
31 #include <sys/callb.h>
32 #include <sys/stream.h>
33 #include <sys/kmem.h>
34 #include <sys/conf.h>
35 #include <sys/devops.h>
36 #include <sys/ksynch.h>
37 #include <sys/stat.h>
38 #include <sys/modctl.h>
39 #include <sys/modhash.h>
40 #include <sys/debug.h>
41 #include <sys/ethernet.h>
42 #include <sys/dlpi.h>
43 #include <net/if.h>
44 #include <sys/mac_provider.h>
45 #include <sys/mac_client.h>
46 #include <sys/mac_client_priv.h>
47 #include <sys/mac_ether.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/strsun.h>
51 #include <sys/note.h>
52 #include <sys/atomic.h>
53 #include <sys/vnet.h>
54 #include <sys/vlan.h>
55 #include <sys/vnet_mailbox.h>
56 #include <sys/vnet_common.h>
57 #include <sys/dds.h>
58 #include <sys/strsubr.h>
59 #include <sys/taskq.h>
60 
61 /*
62  * Function prototypes.
63  */
64 
65 /* DDI entrypoints */
66 static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
67 static int vnetattach(dev_info_t *, ddi_attach_cmd_t);
68 static int vnetdetach(dev_info_t *, ddi_detach_cmd_t);
69 
70 /* MAC entrypoints  */
71 static int vnet_m_stat(void *, uint_t, uint64_t *);
72 static int vnet_m_start(void *);
73 static void vnet_m_stop(void *);
74 static int vnet_m_promisc(void *, boolean_t);
75 static int vnet_m_multicst(void *, boolean_t, const uint8_t *);
76 static int vnet_m_unicst(void *, const uint8_t *);
77 mblk_t *vnet_m_tx(void *, mblk_t *);
78 static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
79 #ifdef	VNET_IOC_DEBUG
80 static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
81 #endif
82 static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
83 static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
84 	const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
85 static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
86 	mac_group_info_t *infop, mac_group_handle_t handle);
87 static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
88 static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
89 static int vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
90 	uint64_t *val);
91 static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
92 static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
93 static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
94 	uint64_t *val);
95 static int vnet_ring_enable_intr(void *arg);
96 static int vnet_ring_disable_intr(void *arg);
97 static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
98 static int vnet_addmac(void *arg, const uint8_t *mac_addr);
99 static int vnet_remmac(void *arg, const uint8_t *mac_addr);
100 
101 /* vnet internal functions */
102 static int vnet_unattach(vnet_t *vnetp);
103 static void vnet_ring_grp_init(vnet_t *vnetp);
104 static void vnet_ring_grp_uninit(vnet_t *vnetp);
105 static int vnet_mac_register(vnet_t *);
106 static int vnet_read_mac_address(vnet_t *vnetp);
107 static int vnet_bind_vgenring(vnet_res_t *vresp);
108 static void vnet_unbind_vgenring(vnet_res_t *vresp);
109 static int vnet_bind_hwrings(vnet_t *vnetp);
110 static void vnet_unbind_hwrings(vnet_t *vnetp);
111 static int vnet_bind_rings(vnet_res_t *vresp);
112 static void vnet_unbind_rings(vnet_res_t *vresp);
113 static int vnet_hio_stat(void *, uint_t, uint64_t *);
114 static int vnet_hio_start(void *);
115 static void vnet_hio_stop(void *);
116 mblk_t *vnet_hio_tx(void *, mblk_t *);
117 
118 /* Forwarding database (FDB) routines */
119 static void vnet_fdb_create(vnet_t *vnetp);
120 static void vnet_fdb_destroy(vnet_t *vnetp);
121 static vnet_res_t *vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp);
122 static void vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
123 void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp);
124 static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp);
125 
126 static void vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp);
127 static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp);
128 static void vnet_tx_update(vio_net_handle_t vrh);
129 static void vnet_res_start_task(void *arg);
130 static void vnet_start_resources(vnet_t *vnetp);
131 static void vnet_stop_resources(vnet_t *vnetp);
132 static void vnet_dispatch_res_task(vnet_t *vnetp);
133 static void vnet_res_start_task(void *arg);
134 static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
135 static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
136 static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
137 static void vnet_tx_notify_thread(void *);
138 
139 /* Exported to vnet_gen */
140 int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
141 void vnet_link_update(vnet_t *vnetp, link_state_t link_state);
142 void vnet_dds_cleanup_hio(vnet_t *vnetp);
143 
144 static kstat_t *vnet_hio_setup_kstats(char *ks_mod, char *ks_name,
145     vnet_res_t *vresp);
146 static int vnet_hio_update_kstats(kstat_t *ksp, int rw);
147 static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp);
148 static void vnet_hio_destroy_kstats(kstat_t *ksp);
149 
/* Exported to vnet_dds */
151 int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
152 int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
153 void vnet_hio_mac_cleanup(vnet_t *vnetp);
154 
155 /* Externs that are imported from vnet_gen */
156 extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
157     const uint8_t *macaddr, void **vgenhdl);
158 extern int vgen_init_mdeg(void *arg);
159 extern void vgen_uninit(void *arg);
160 extern int vgen_dds_tx(void *arg, void *dmsg);
161 extern int vgen_enable_intr(void *arg);
162 extern int vgen_disable_intr(void *arg);
163 extern mblk_t *vgen_rx_poll(void *arg, int bytes_to_pickup);
164 
165 /* Externs that are imported from vnet_dds */
166 extern void vdds_mod_init(void);
167 extern void vdds_mod_fini(void);
168 extern int vdds_init(vnet_t *vnetp);
169 extern void vdds_cleanup(vnet_t *vnetp);
170 extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg);
171 extern void vdds_cleanup_hybrid_res(void *arg);
172 extern void vdds_cleanup_hio(vnet_t *vnetp);
173 
174 extern pri_t	minclsyspri;
175 
176 #define	DRV_NAME	"vnet"
177 #define	VNET_FDBE_REFHOLD(p)						\
178 {									\
179 	atomic_inc_32(&(p)->refcnt);					\
180 	ASSERT((p)->refcnt != 0);					\
181 }
182 
183 #define	VNET_FDBE_REFRELE(p)						\
184 {									\
185 	ASSERT((p)->refcnt != 0);					\
186 	atomic_dec_32(&(p)->refcnt);					\
187 }
188 
189 #ifdef	VNET_IOC_DEBUG
190 #define	VNET_M_CALLBACK_FLAGS	(MC_IOCTL | MC_GETCAPAB)
191 #else
192 #define	VNET_M_CALLBACK_FLAGS	(MC_GETCAPAB)
193 #endif
194 
195 static mac_callbacks_t vnet_m_callbacks = {
196 	VNET_M_CALLBACK_FLAGS,
197 	vnet_m_stat,
198 	vnet_m_start,
199 	vnet_m_stop,
200 	vnet_m_promisc,
201 	vnet_m_multicst,
202 	NULL,	/* m_unicst entry must be NULL while rx rings are exposed */
203 	NULL,	/* m_tx entry must be NULL while tx rings are exposed */
204 	NULL,
205 	vnet_m_ioctl,
206 	vnet_m_capab,
207 	NULL
208 };
209 
210 static mac_callbacks_t vnet_hio_res_callbacks = {
211 	0,
212 	vnet_hio_stat,
213 	vnet_hio_start,
214 	vnet_hio_stop,
215 	NULL,
216 	NULL,
217 	NULL,
218 	vnet_hio_tx,
219 	NULL,
220 	NULL,
221 	NULL
222 };
223 
224 /*
225  * Linked list of "vnet_t" structures - one per instance.
226  */
227 static vnet_t	*vnet_headp = NULL;
228 static krwlock_t vnet_rw;
229 
230 /* Tunables */
231 uint32_t vnet_num_descriptors = VNET_NUM_DESCRIPTORS;
232 
233 /*
234  * Configure tx serialization in mac layer for the vnet device. This tunable
235  * should be enabled to improve performance only if HybridIO is configured for
236  * the vnet device.
237  */
238 boolean_t vnet_mac_tx_serialize = B_FALSE;
239 
/* Configure enqueuing at Rx soft rings in mac layer for the vnet device */
241 boolean_t vnet_mac_rx_queuing = B_TRUE;
242 
243 /*
244  * Set this to non-zero to enable additional internal receive buffer pools
245  * based on the MTU of the device for better performance at the cost of more
246  * memory consumption. This is turned off by default, to use allocb(9F) for
247  * receive buffer allocations of sizes > 2K.
248  */
249 boolean_t vnet_jumbo_rxpools = B_FALSE;
250 
251 /* # of chains in fdb hash table */
252 uint32_t	vnet_fdb_nchains = VNET_NFDB_HASH;
253 
254 /* Internal tunables */
255 uint32_t	vnet_ethermtu = 1500;	/* mtu of the device */
256 
257 /*
258  * Default vlan id. This is only used internally when the "default-vlan-id"
259  * property is not present in the MD device node. Therefore, this should not be
260  * used as a tunable; if this value is changed, the corresponding variable
261  * should be updated to the same value in vsw and also other vnets connected to
262  * the same vsw.
263  */
264 uint16_t	vnet_default_vlan_id = 1;
265 
266 /* delay in usec to wait for all references on a fdb entry to be dropped */
267 uint32_t vnet_fdbe_refcnt_delay = 10;
268 
269 static struct ether_addr etherbroadcastaddr = {
270 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
271 };
272 
273 /* mac_open() retry delay in usec */
274 uint32_t vnet_mac_open_delay = 100;	/* 0.1 ms */
275 
276 /* max # of mac_open() retries */
277 uint32_t vnet_mac_open_retries = 100;
278 
279 /*
280  * Property names
281  */
282 static char macaddr_propname[] = "local-mac-address";
283 
284 /*
285  * This is the string displayed by modinfo(8).
286  */
287 static char vnet_ident[] = "vnet driver";
288 extern struct mod_ops mod_driverops;
289 static struct cb_ops cb_vnetops = {
290 	nulldev,		/* cb_open */
291 	nulldev,		/* cb_close */
292 	nodev,			/* cb_strategy */
293 	nodev,			/* cb_print */
294 	nodev,			/* cb_dump */
295 	nodev,			/* cb_read */
296 	nodev,			/* cb_write */
297 	nodev,			/* cb_ioctl */
298 	nodev,			/* cb_devmap */
299 	nodev,			/* cb_mmap */
300 	nodev,			/* cb_segmap */
301 	nochpoll,		/* cb_chpoll */
302 	ddi_prop_op,		/* cb_prop_op */
303 	NULL,			/* cb_stream */
304 	(int)(D_MP)		/* cb_flag */
305 };
306 
307 static struct dev_ops vnetops = {
308 	DEVO_REV,		/* devo_rev */
309 	0,			/* devo_refcnt */
310 	NULL,			/* devo_getinfo */
311 	nulldev,		/* devo_identify */
312 	nulldev,		/* devo_probe */
313 	vnetattach,		/* devo_attach */
314 	vnetdetach,		/* devo_detach */
315 	nodev,			/* devo_reset */
316 	&cb_vnetops,		/* devo_cb_ops */
317 	(struct bus_ops *)NULL,	/* devo_bus_ops */
318 	NULL,			/* devo_power */
319 	ddi_quiesce_not_supported,	/* devo_quiesce */
320 };
321 
322 static struct modldrv modldrv = {
323 	&mod_driverops,		/* Type of module.  This one is a driver */
324 	vnet_ident,		/* ID string */
325 	&vnetops		/* driver specific ops */
326 };
327 
328 static struct modlinkage modlinkage = {
329 	MODREV_1, (void *)&modldrv, NULL
330 };
331 
332 #ifdef DEBUG
333 
334 #define	DEBUG_PRINTF	debug_printf
335 
336 /*
337  * Print debug messages - set to 0xf to enable all msgs
338  */
339 int vnet_dbglevel = 0x8;
340 
341 static void
342 debug_printf(const char *fname, void *arg, const char *fmt, ...)
343 {
344 	char    buf[512];
345 	va_list ap;
346 	vnet_t *vnetp = (vnet_t *)arg;
347 	char    *bufp = buf;
348 
349 	if (vnetp == NULL) {
350 		(void) sprintf(bufp, "%s: ", fname);
351 		bufp += strlen(bufp);
352 	} else {
353 		(void) sprintf(bufp, "vnet%d:%s: ", vnetp->instance, fname);
354 		bufp += strlen(bufp);
355 	}
356 	va_start(ap, fmt);
357 	(void) vsprintf(bufp, fmt, ap);
358 	va_end(ap);
359 	cmn_err(CE_CONT, "%s\n", buf);
360 }
361 
362 #endif
363 
364 /* _init(9E): initialize the loadable module */
365 int
366 _init(void)
367 {
368 	int status;
369 
370 	DBG1(NULL, "enter\n");
371 
372 	mac_init_ops(&vnetops, "vnet");
373 	status = mod_install(&modlinkage);
374 	if (status != 0) {
375 		mac_fini_ops(&vnetops);
376 	}
377 	vdds_mod_init();
378 	DBG1(NULL, "exit(%d)\n", status);
379 	return (status);
380 }
381 
382 /* _fini(9E): prepare the module for unloading. */
383 int
384 _fini(void)
385 {
386 	int		status;
387 
388 	DBG1(NULL, "enter\n");
389 
390 	status = mod_remove(&modlinkage);
391 	if (status != 0)
392 		return (status);
393 	mac_fini_ops(&vnetops);
394 	vdds_mod_fini();
395 
396 	DBG1(NULL, "exit(%d)\n", status);
397 	return (status);
398 }
399 
400 /* _info(9E): return information about the loadable module */
401 int
402 _info(struct modinfo *modinfop)
403 {
404 	return (mod_info(&modlinkage, modinfop));
405 }
406 
407 /*
408  * attach(9E): attach a device to the system.
409  * called once for each instance of the device on the system.
410  */
411 static int
412 vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
413 {
414 	vnet_t			*vnetp;
415 	int			status;
416 	int			instance;
417 	uint64_t		reg;
418 	char			qname[TASKQ_NAMELEN];
419 	vnet_attach_progress_t	attach_progress;
420 
421 	attach_progress = AST_init;
422 
423 	switch (cmd) {
424 	case DDI_ATTACH:
425 		break;
426 	case DDI_RESUME:
427 	case DDI_PM_RESUME:
428 	default:
429 		goto vnet_attach_fail;
430 	}
431 
432 	instance = ddi_get_instance(dip);
433 	DBG1(NULL, "instance(%d) enter\n", instance);
434 
435 	/* allocate vnet_t and mac_t structures */
436 	vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP);
437 	vnetp->dip = dip;
438 	vnetp->instance = instance;
439 	rw_init(&vnetp->vrwlock, NULL, RW_DRIVER, NULL);
440 	rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
441 	attach_progress |= AST_vnet_alloc;
442 
443 	vnet_ring_grp_init(vnetp);
444 	attach_progress |= AST_ring_init;
445 
446 	status = vdds_init(vnetp);
447 	if (status != 0) {
448 		goto vnet_attach_fail;
449 	}
450 	attach_progress |= AST_vdds_init;
451 
452 	/* setup links to vnet_t from both devinfo and mac_t */
453 	ddi_set_driver_private(dip, (caddr_t)vnetp);
454 
455 	/* read the mac address */
456 	status = vnet_read_mac_address(vnetp);
457 	if (status != DDI_SUCCESS) {
458 		goto vnet_attach_fail;
459 	}
460 	attach_progress |= AST_read_macaddr;
461 
462 	reg = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
463 	    DDI_PROP_DONTPASS, "reg", -1);
464 	if (reg == -1) {
465 		goto vnet_attach_fail;
466 	}
467 	vnetp->reg = reg;
468 
469 	vnet_fdb_create(vnetp);
470 	attach_progress |= AST_fdbh_alloc;
471 
472 	(void) snprintf(qname, TASKQ_NAMELEN, "vres_taskq%d", instance);
473 	if ((vnetp->taskqp = ddi_taskq_create(dip, qname, 1,
474 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
475 		cmn_err(CE_WARN, "!vnet%d: Unable to create task queue",
476 		    instance);
477 		goto vnet_attach_fail;
478 	}
479 	attach_progress |= AST_taskq_create;
480 
481 	/* add to the list of vnet devices */
482 	WRITE_ENTER(&vnet_rw);
483 	vnetp->nextp = vnet_headp;
484 	vnet_headp = vnetp;
485 	RW_EXIT(&vnet_rw);
486 
487 	attach_progress |= AST_vnet_list;
488 
489 	/*
490 	 * Initialize the generic vnet plugin which provides communication via
491 	 * sun4v LDC (logical domain channel) based resources. This involves 2
492 	 * steps; first, vgen_init() is invoked to read the various properties
493 	 * of the vnet device from its MD node (including its mtu which is
494 	 * needed to mac_register()) and obtain a handle to the vgen layer.
495 	 * After mac_register() is done and we have a mac handle, we then
496 	 * invoke vgen_init_mdeg() which registers with the the MD event
497 	 * generator (mdeg) framework to allow LDC resource notifications.
498 	 * Note: this sequence also allows us to report the correct default #
499 	 * of pseudo rings (2TX and 3RX) in vnet_m_capab() which gets invoked
500 	 * in the context of mac_register(); and avoids conflicting with
501 	 * dynamic pseudo rx rings which get added/removed as a result of mdeg
502 	 * events in vgen.
503 	 */
504 	status = vgen_init(vnetp, reg, vnetp->dip,
505 	    (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
506 	if (status != DDI_SUCCESS) {
507 		DERR(vnetp, "vgen_init() failed\n");
508 		goto vnet_attach_fail;
509 	}
510 	attach_progress |= AST_vgen_init;
511 
512 	status = vnet_mac_register(vnetp);
513 	if (status != DDI_SUCCESS) {
514 		goto vnet_attach_fail;
515 	}
516 	vnetp->link_state = LINK_STATE_UNKNOWN;
517 	attach_progress |= AST_macreg;
518 
519 	status = vgen_init_mdeg(vnetp->vgenhdl);
520 	if (status != DDI_SUCCESS) {
521 		goto vnet_attach_fail;
522 	}
523 	attach_progress |= AST_init_mdeg;
524 
525 	vnetp->attach_progress = attach_progress;
526 
527 	DBG1(NULL, "instance(%d) exit\n", instance);
528 	return (DDI_SUCCESS);
529 
530 vnet_attach_fail:
531 	vnetp->attach_progress = attach_progress;
532 	status = vnet_unattach(vnetp);
533 	ASSERT(status == 0);
534 	return (DDI_FAILURE);
535 }
536 
537 /*
538  * detach(9E): detach a device from the system.
539  */
540 static int
541 vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
542 {
543 	vnet_t		*vnetp;
544 	int		instance;
545 
546 	instance = ddi_get_instance(dip);
547 	DBG1(NULL, "instance(%d) enter\n", instance);
548 
549 	vnetp = ddi_get_driver_private(dip);
550 	if (vnetp == NULL) {
551 		goto vnet_detach_fail;
552 	}
553 
554 	switch (cmd) {
555 	case DDI_DETACH:
556 		break;
557 	case DDI_SUSPEND:
558 	case DDI_PM_SUSPEND:
559 	default:
560 		goto vnet_detach_fail;
561 	}
562 
563 	if (vnet_unattach(vnetp) != 0) {
564 		goto vnet_detach_fail;
565 	}
566 
567 	return (DDI_SUCCESS);
568 
569 vnet_detach_fail:
570 	return (DDI_FAILURE);
571 }
572 
573 /*
574  * Common routine to handle vnetattach() failure and vnetdetach(). Note that
575  * the only reason this function could fail is if mac_unregister() fails.
576  * Otherwise, this function must ensure that all resources are freed and return
577  * success.
578  */
579 static int
580 vnet_unattach(vnet_t *vnetp)
581 {
582 	vnet_attach_progress_t	attach_progress;
583 
584 	attach_progress = vnetp->attach_progress;
585 
586 	/*
587 	 * Disable the mac device in the gldv3 subsystem. This can fail, in
588 	 * particular if there are still any open references to this mac
589 	 * device; in which case we just return failure without continuing to
590 	 * detach further.
591 	 * If it succeeds, we then invoke vgen_uninit() which should unregister
592 	 * any pseudo rings registered with the mac layer. Note we keep the
593 	 * AST_macreg flag on, so we can unregister with the mac layer at
594 	 * the end of this routine.
595 	 */
596 	if (attach_progress & AST_macreg) {
597 		if (mac_disable(vnetp->mh) != 0) {
598 			return (1);
599 		}
600 	}
601 
602 	/*
603 	 * Now that we have disabled the device, we must finish all other steps
604 	 * and successfully return from this function; otherwise we will end up
605 	 * leaving the device in a broken/unusable state.
606 	 *
607 	 * First, release any hybrid resources assigned to this vnet device.
608 	 */
609 	if (attach_progress & AST_vdds_init) {
610 		vdds_cleanup(vnetp);
611 		attach_progress &= ~AST_vdds_init;
612 	}
613 
614 	/*
615 	 * Uninit vgen. This stops further mdeg callbacks to this vnet
616 	 * device and/or its ports; and detaches any existing ports.
617 	 */
618 	if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
619 		vgen_uninit(vnetp->vgenhdl);
620 		attach_progress &= ~AST_vgen_init;
621 		attach_progress &= ~AST_init_mdeg;
622 	}
623 
624 	/* Destroy the taskq. */
625 	if (attach_progress & AST_taskq_create) {
626 		ddi_taskq_destroy(vnetp->taskqp);
627 		attach_progress &= ~AST_taskq_create;
628 	}
629 
630 	/* Destroy fdb. */
631 	if (attach_progress & AST_fdbh_alloc) {
632 		vnet_fdb_destroy(vnetp);
633 		attach_progress &= ~AST_fdbh_alloc;
634 	}
635 
636 	/* Remove from the device list */
637 	if (attach_progress & AST_vnet_list) {
638 		vnet_t		**vnetpp;
639 		/* unlink from instance(vnet_t) list */
640 		WRITE_ENTER(&vnet_rw);
641 		for (vnetpp = &vnet_headp; *vnetpp;
642 		    vnetpp = &(*vnetpp)->nextp) {
643 			if (*vnetpp == vnetp) {
644 				*vnetpp = vnetp->nextp;
645 				break;
646 			}
647 		}
648 		RW_EXIT(&vnet_rw);
649 		attach_progress &= ~AST_vnet_list;
650 	}
651 
652 	if (attach_progress & AST_ring_init) {
653 		vnet_ring_grp_uninit(vnetp);
654 		attach_progress &= ~AST_ring_init;
655 	}
656 
657 	if (attach_progress & AST_macreg) {
658 		VERIFY(mac_unregister(vnetp->mh) == 0);
659 		vnetp->mh = NULL;
660 		attach_progress &= ~AST_macreg;
661 	}
662 
663 	if (attach_progress & AST_vnet_alloc) {
664 		rw_destroy(&vnetp->vrwlock);
665 		rw_destroy(&vnetp->vsw_fp_rw);
666 		attach_progress &= ~AST_vnet_list;
667 		KMEM_FREE(vnetp);
668 	}
669 
670 	return (0);
671 }
672 
673 /* enable the device for transmit/receive */
674 static int
675 vnet_m_start(void *arg)
676 {
677 	vnet_t		*vnetp = arg;
678 
679 	DBG1(vnetp, "enter\n");
680 
681 	WRITE_ENTER(&vnetp->vrwlock);
682 	vnetp->flags |= VNET_STARTED;
683 	vnet_start_resources(vnetp);
684 	RW_EXIT(&vnetp->vrwlock);
685 
686 	DBG1(vnetp, "exit\n");
687 	return (VNET_SUCCESS);
688 
689 }
690 
691 /* stop transmit/receive for the device */
692 static void
693 vnet_m_stop(void *arg)
694 {
695 	vnet_t		*vnetp = arg;
696 
697 	DBG1(vnetp, "enter\n");
698 
699 	WRITE_ENTER(&vnetp->vrwlock);
700 	if (vnetp->flags & VNET_STARTED) {
701 		/*
702 		 * Set the flags appropriately; this should prevent starting of
703 		 * any new resources that are added(see vnet_res_start_task()),
704 		 * while we release the vrwlock in vnet_stop_resources() before
705 		 * stopping each resource.
706 		 */
707 		vnetp->flags &= ~VNET_STARTED;
708 		vnetp->flags |= VNET_STOPPING;
709 		vnet_stop_resources(vnetp);
710 		vnetp->flags &= ~VNET_STOPPING;
711 	}
712 	RW_EXIT(&vnetp->vrwlock);
713 
714 	DBG1(vnetp, "exit\n");
715 }
716 
717 /* set the unicast mac address of the device */
718 static int
719 vnet_m_unicst(void *arg, const uint8_t *macaddr)
720 {
721 	_NOTE(ARGUNUSED(macaddr))
722 
723 	vnet_t *vnetp = arg;
724 
725 	DBG1(vnetp, "enter\n");
726 	/*
727 	 * NOTE: setting mac address dynamically is not supported.
728 	 */
729 	DBG1(vnetp, "exit\n");
730 
731 	return (VNET_FAILURE);
732 }
733 
734 /* enable/disable a multicast address */
735 static int
736 vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
737 {
738 	_NOTE(ARGUNUSED(add, mca))
739 
740 	vnet_t		*vnetp = arg;
741 	vnet_res_t	*vresp;
742 	mac_register_t	*macp;
743 	mac_callbacks_t	*cbp;
744 	int		rv = VNET_SUCCESS;
745 
746 	DBG1(vnetp, "enter\n");
747 
748 	READ_ENTER(&vnetp->vsw_fp_rw);
749 	if (vnetp->vsw_fp == NULL) {
750 		RW_EXIT(&vnetp->vsw_fp_rw);
751 		return (EAGAIN);
752 	}
753 	VNET_FDBE_REFHOLD(vnetp->vsw_fp);
754 	RW_EXIT(&vnetp->vsw_fp_rw);
755 
756 	vresp = vnetp->vsw_fp;
757 	macp = &vresp->macreg;
758 	cbp = macp->m_callbacks;
759 	rv = cbp->mc_multicst(macp->m_driver, add, mca);
760 
761 	VNET_FDBE_REFRELE(vnetp->vsw_fp);
762 
763 	DBG1(vnetp, "exit(%d)\n", rv);
764 	return (rv);
765 }
766 
767 /* set or clear promiscuous mode on the device */
768 static int
769 vnet_m_promisc(void *arg, boolean_t on)
770 {
771 	_NOTE(ARGUNUSED(on))
772 
773 	vnet_t *vnetp = arg;
774 	DBG1(vnetp, "enter\n");
775 	/*
776 	 * NOTE: setting promiscuous mode is not supported, just return success.
777 	 */
778 	DBG1(vnetp, "exit\n");
779 	return (VNET_SUCCESS);
780 }
781 
782 /*
783  * Transmit a chain of packets. This function provides switching functionality
784  * based on the destination mac address to reach other guests (within ldoms) or
785  * external hosts.
786  */
787 mblk_t *
788 vnet_tx_ring_send(void *arg, mblk_t *mp)
789 {
790 	vnet_pseudo_tx_ring_t	*tx_ringp;
791 	vnet_tx_ring_stats_t	*statsp;
792 	vnet_t			*vnetp;
793 	vnet_res_t		*vresp;
794 	mblk_t			*next;
795 	mblk_t			*resid_mp;
796 	mac_register_t		*macp;
797 	struct ether_header	*ehp;
798 	boolean_t		is_unicast;
799 	boolean_t		is_pvid;	/* non-default pvid ? */
800 	boolean_t		hres;		/* Hybrid resource ? */
801 	void			*tx_arg;
802 	size_t			size;
803 
804 	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
805 	statsp = &tx_ringp->tx_ring_stats;
806 	vnetp = (vnet_t *)tx_ringp->vnetp;
807 	DBG1(vnetp, "enter\n");
808 	ASSERT(mp != NULL);
809 
810 	is_pvid = (vnetp->pvid != vnetp->default_vlan_id) ? B_TRUE : B_FALSE;
811 
812 	while (mp != NULL) {
813 
814 		next = mp->b_next;
815 		mp->b_next = NULL;
816 
817 		/* update stats */
818 		size = msgsize(mp);
819 
820 		/*
821 		 * Find fdb entry for the destination
822 		 * and hold a reference to it.
823 		 */
824 		ehp = (struct ether_header *)mp->b_rptr;
825 		vresp = vnet_fdbe_find(vnetp, &ehp->ether_dhost);
826 		if (vresp != NULL) {
827 
828 			/*
829 			 * Destination found in FDB.
830 			 * The destination is a vnet device within ldoms
831 			 * and directly reachable, invoke the tx function
832 			 * in the fdb entry.
833 			 */
834 			macp = &vresp->macreg;
835 			resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);
836 
837 			/* tx done; now release ref on fdb entry */
838 			VNET_FDBE_REFRELE(vresp);
839 
840 			if (resid_mp != NULL) {
841 				/* m_tx failed */
842 				mp->b_next = next;
843 				break;
844 			}
845 		} else {
846 			is_unicast = !(IS_BROADCAST(ehp) ||
847 			    (IS_MULTICAST(ehp)));
848 			/*
849 			 * Destination is not in FDB.
850 			 * If the destination is broadcast or multicast,
851 			 * then forward the packet to vswitch.
852 			 * If a Hybrid resource avilable, then send the
853 			 * unicast packet via hybrid resource, otherwise
854 			 * forward it to vswitch.
855 			 */
856 			READ_ENTER(&vnetp->vsw_fp_rw);
857 
858 			if ((is_unicast) && (vnetp->hio_fp != NULL)) {
859 				vresp = vnetp->hio_fp;
860 				hres = B_TRUE;
861 			} else {
862 				vresp = vnetp->vsw_fp;
863 				hres = B_FALSE;
864 			}
865 			if (vresp == NULL) {
866 				/*
867 				 * no fdb entry to vsw? drop the packet.
868 				 */
869 				RW_EXIT(&vnetp->vsw_fp_rw);
870 				freemsg(mp);
871 				mp = next;
872 				continue;
873 			}
874 
875 			/* ref hold the fdb entry to vsw */
876 			VNET_FDBE_REFHOLD(vresp);
877 
878 			RW_EXIT(&vnetp->vsw_fp_rw);
879 
880 			/*
881 			 * In the case of a hybrid resource we need to insert
882 			 * the tag for the pvid case here; unlike packets that
883 			 * are destined to a vnet/vsw in which case the vgen
884 			 * layer does the tagging before sending it over ldc.
885 			 */
886 			if (hres == B_TRUE) {
887 				/*
888 				 * Determine if the frame being transmitted
889 				 * over the hybrid resource is untagged. If so,
890 				 * insert the tag before transmitting.
891 				 */
892 				if (is_pvid == B_TRUE &&
893 				    ehp->ether_type != htons(ETHERTYPE_VLAN)) {
894 
895 					mp = vnet_vlan_insert_tag(mp,
896 					    vnetp->pvid);
897 					if (mp == NULL) {
898 						VNET_FDBE_REFRELE(vresp);
899 						mp = next;
900 						continue;
901 					}
902 
903 				}
904 
905 				macp = &vresp->macreg;
906 				tx_arg = tx_ringp;
907 			} else {
908 				macp = &vresp->macreg;
909 				tx_arg = macp->m_driver;
910 			}
911 			resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);
912 
913 			/* tx done; now release ref on fdb entry */
914 			VNET_FDBE_REFRELE(vresp);
915 
916 			if (resid_mp != NULL) {
917 				/* m_tx failed */
918 				mp->b_next = next;
919 				break;
920 			}
921 		}
922 
923 		statsp->obytes += size;
924 		statsp->opackets++;
925 		mp = next;
926 	}
927 
928 	DBG1(vnetp, "exit\n");
929 	return (mp);
930 }
931 
932 /* get statistics from the device */
933 int
934 vnet_m_stat(void *arg, uint_t stat, uint64_t *val)
935 {
936 	vnet_t *vnetp = arg;
937 	vnet_res_t	*vresp;
938 	mac_register_t	*macp;
939 	mac_callbacks_t	*cbp;
940 	uint64_t val_total = 0;
941 
942 	DBG1(vnetp, "enter\n");
943 
944 	/*
945 	 * get the specified statistic from each transport and return the
946 	 * aggregate val.  This obviously only works for counters.
947 	 */
948 	if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
949 	    (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
950 		return (ENOTSUP);
951 	}
952 
953 	READ_ENTER(&vnetp->vrwlock);
954 	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
955 		macp = &vresp->macreg;
956 		cbp = macp->m_callbacks;
957 		if (cbp->mc_getstat(macp->m_driver, stat, val) == 0)
958 			val_total += *val;
959 	}
960 	RW_EXIT(&vnetp->vrwlock);
961 
962 	*val = val_total;
963 
964 	DBG1(vnetp, "exit\n");
965 	return (0);
966 }
967 
968 static void
969 vnet_ring_grp_init(vnet_t *vnetp)
970 {
971 	vnet_pseudo_rx_group_t	*rx_grp;
972 	vnet_pseudo_rx_ring_t	*rx_ringp;
973 	vnet_pseudo_tx_group_t	*tx_grp;
974 	vnet_pseudo_tx_ring_t	*tx_ringp;
975 	int			i;
976 
977 	tx_grp = &vnetp->tx_grp[0];
978 	tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
979 	    VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
980 	for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
981 		tx_ringp[i].state |= VNET_TXRING_SHARED;
982 	}
983 	tx_grp->rings = tx_ringp;
984 	tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
985 	mutex_init(&tx_grp->flowctl_lock, NULL, MUTEX_DRIVER, NULL);
986 	cv_init(&tx_grp->flowctl_cv, NULL, CV_DRIVER, NULL);
987 	tx_grp->flowctl_thread = thread_create(NULL, 0,
988 	    vnet_tx_notify_thread, tx_grp, 0, &p0, TS_RUN, minclsyspri);
989 
990 	rx_grp = &vnetp->rx_grp[0];
991 	rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
992 	rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
993 	rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
994 	    rx_grp->max_ring_cnt, KM_SLEEP);
995 
996 	/*
997 	 * Setup the first 3 Pseudo RX Rings that are reserved;
998 	 * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource.
999 	 */
1000 	rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
1001 	rx_ringp[0].index = 0;
1002 	rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
1003 	rx_ringp[1].index = 1;
1004 	rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
1005 	rx_ringp[2].index = 2;
1006 
1007 	rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
1008 	rx_grp->rings = rx_ringp;
1009 
1010 	for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
1011 	    i < rx_grp->max_ring_cnt; i++) {
1012 		rx_ringp = &rx_grp->rings[i];
1013 		rx_ringp->state = VNET_RXRING_FREE;
1014 		rx_ringp->index = i;
1015 	}
1016 }
1017 
1018 static void
1019 vnet_ring_grp_uninit(vnet_t *vnetp)
1020 {
1021 	vnet_pseudo_rx_group_t	*rx_grp;
1022 	vnet_pseudo_tx_group_t	*tx_grp;
1023 	kt_did_t		tid = 0;
1024 
1025 	tx_grp = &vnetp->tx_grp[0];
1026 
1027 	/* Inform tx_notify_thread to exit */
1028 	mutex_enter(&tx_grp->flowctl_lock);
1029 	if (tx_grp->flowctl_thread != NULL) {
1030 		tid = tx_grp->flowctl_thread->t_did;
1031 		tx_grp->flowctl_done = B_TRUE;
1032 		cv_signal(&tx_grp->flowctl_cv);
1033 	}
1034 	mutex_exit(&tx_grp->flowctl_lock);
1035 	if (tid != 0)
1036 		thread_join(tid);
1037 
1038 	if (tx_grp->rings != NULL) {
1039 		ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
1040 		kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
1041 		    tx_grp->ring_cnt);
1042 		tx_grp->rings = NULL;
1043 	}
1044 
1045 	rx_grp = &vnetp->rx_grp[0];
1046 	if (rx_grp->rings != NULL) {
1047 		ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
1048 		ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
1049 		kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
1050 		    rx_grp->max_ring_cnt);
1051 		rx_grp->rings = NULL;
1052 	}
1053 }
1054 
1055 static vnet_pseudo_rx_ring_t *
1056 vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
1057 {
1058 	vnet_pseudo_rx_group_t  *rx_grp;
1059 	vnet_pseudo_rx_ring_t	*rx_ringp;
1060 	int			index;
1061 
1062 	rx_grp = &vnetp->rx_grp[0];
1063 	WRITE_ENTER(&rx_grp->lock);
1064 
1065 	if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
1066 		/* no rings available */
1067 		RW_EXIT(&rx_grp->lock);
1068 		return (NULL);
1069 	}
1070 
1071 	for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
1072 	    index < rx_grp->max_ring_cnt; index++) {
1073 		rx_ringp = &rx_grp->rings[index];
1074 		if (rx_ringp->state == VNET_RXRING_FREE) {
1075 			rx_ringp->state |= VNET_RXRING_INUSE;
1076 			rx_grp->ring_cnt++;
1077 			break;
1078 		}
1079 	}
1080 
1081 	RW_EXIT(&rx_grp->lock);
1082 	return (rx_ringp);
1083 }
1084 
1085 static void
1086 vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
1087 {
1088 	vnet_pseudo_rx_group_t  *rx_grp;
1089 
1090 	ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
1091 	rx_grp = &vnetp->rx_grp[0];
1092 	WRITE_ENTER(&rx_grp->lock);
1093 
1094 	if (ringp->state != VNET_RXRING_FREE) {
1095 		ringp->state = VNET_RXRING_FREE;
1096 		ringp->handle = NULL;
1097 		rx_grp->ring_cnt--;
1098 	}
1099 
1100 	RW_EXIT(&rx_grp->lock);
1101 }
1102 
1103 /* wrapper function for mac_register() */
1104 static int
1105 vnet_mac_register(vnet_t *vnetp)
1106 {
1107 	mac_register_t	*macp;
1108 	int		err;
1109 
1110 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1111 		return (DDI_FAILURE);
1112 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1113 	macp->m_driver = vnetp;
1114 	macp->m_dip = vnetp->dip;
1115 	macp->m_src_addr = vnetp->curr_macaddr;
1116 	macp->m_callbacks = &vnet_m_callbacks;
1117 	macp->m_min_sdu = 0;
1118 	macp->m_max_sdu = vnetp->mtu;
1119 	macp->m_margin = VLAN_TAGSZ;
1120 
1121 	macp->m_v12n = MAC_VIRT_LEVEL1;
1122 
1123 	/*
1124 	 * Finally, we're ready to register ourselves with the MAC layer
1125 	 * interface; if this succeeds, we're all ready to start()
1126 	 */
1127 	err = mac_register(macp, &vnetp->mh);
1128 	mac_free(macp);
1129 	return (err == 0 ? DDI_SUCCESS : DDI_FAILURE);
1130 }
1131 
1132 /* read the mac address of the device */
1133 static int
1134 vnet_read_mac_address(vnet_t *vnetp)
1135 {
1136 	uchar_t		*macaddr;
1137 	uint32_t	size;
1138 	int		rv;
1139 
1140 	rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip,
1141 	    DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size);
1142 	if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) {
1143 		DWARN(vnetp, "prop_lookup failed(%s) err(%d)\n",
1144 		    macaddr_propname, rv);
1145 		return (DDI_FAILURE);
1146 	}
1147 	bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL);
1148 	bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL);
1149 	ddi_prop_free(macaddr);
1150 
1151 	return (DDI_SUCCESS);
1152 }
1153 
1154 static void
1155 vnet_fdb_create(vnet_t *vnetp)
1156 {
1157 	char		hashname[MAXNAMELEN];
1158 
1159 	(void) snprintf(hashname, MAXNAMELEN, "vnet%d-fdbhash",
1160 	    vnetp->instance);
1161 	vnetp->fdb_nchains = vnet_fdb_nchains;
1162 	vnetp->fdb_hashp = mod_hash_create_ptrhash(hashname, vnetp->fdb_nchains,
1163 	    mod_hash_null_valdtor, sizeof (void *));
1164 }
1165 
1166 static void
1167 vnet_fdb_destroy(vnet_t *vnetp)
1168 {
1169 	/* destroy fdb-hash-table */
1170 	if (vnetp->fdb_hashp != NULL) {
1171 		mod_hash_destroy_hash(vnetp->fdb_hashp);
1172 		vnetp->fdb_hashp = NULL;
1173 		vnetp->fdb_nchains = 0;
1174 	}
1175 }
1176 
1177 /*
1178  * Add an entry into the fdb.
1179  */
1180 void
1181 vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp)
1182 {
1183 	uint64_t	addr = 0;
1184 	int		rv;
1185 
1186 	KEY_HASH(addr, vresp->rem_macaddr);
1187 
1188 	/*
1189 	 * If the entry being added corresponds to LDC_SERVICE resource,
1190 	 * that is, vswitch connection, it is added to the hash and also
1191 	 * the entry is cached, an additional reference count reflects
1192 	 * this. The HYBRID resource is not added to the hash, but only
1193 	 * cached, as it is only used for sending out packets for unknown
1194 	 * unicast destinations.
1195 	 */
1196 	(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
1197 	    (vresp->refcnt = 1) : (vresp->refcnt = 0);
1198 
1199 	/*
1200 	 * Note: duplicate keys will be rejected by mod_hash.
1201 	 */
1202 	if (vresp->type != VIO_NET_RES_HYBRID) {
1203 		rv = mod_hash_insert(vnetp->fdb_hashp, (mod_hash_key_t)addr,
1204 		    (mod_hash_val_t)vresp);
1205 		if (rv != 0) {
1206 			DWARN(vnetp, "Duplicate macaddr key(%lx)\n", addr);
1207 			return;
1208 		}
1209 	}
1210 
1211 	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
1212 		/* Cache the fdb entry to vsw-port */
1213 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1214 		if (vnetp->vsw_fp == NULL)
1215 			vnetp->vsw_fp = vresp;
1216 		RW_EXIT(&vnetp->vsw_fp_rw);
1217 	} else if (vresp->type == VIO_NET_RES_HYBRID) {
1218 		/* Cache the fdb entry to hybrid resource */
1219 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1220 		if (vnetp->hio_fp == NULL)
1221 			vnetp->hio_fp = vresp;
1222 		RW_EXIT(&vnetp->vsw_fp_rw);
1223 	}
1224 }
1225 
1226 /*
1227  * Remove an entry from fdb.
1228  */
1229 static void
1230 vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp)
1231 {
1232 	uint64_t	addr = 0;
1233 	int		rv;
1234 	uint32_t	refcnt;
1235 	vnet_res_t	*tmp;
1236 
1237 	KEY_HASH(addr, vresp->rem_macaddr);
1238 
1239 	/*
1240 	 * Remove the entry from fdb hash table.
1241 	 * This prevents further references to this fdb entry.
1242 	 */
1243 	if (vresp->type != VIO_NET_RES_HYBRID) {
1244 		rv = mod_hash_remove(vnetp->fdb_hashp, (mod_hash_key_t)addr,
1245 		    (mod_hash_val_t *)&tmp);
1246 		if (rv != 0) {
1247 			/*
1248 			 * As the resources are added to the hash only
1249 			 * after they are started, this can occur if
1250 			 * a resource unregisters before it is ever started.
1251 			 */
1252 			return;
1253 		}
1254 	}
1255 
1256 	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
1257 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1258 
1259 		ASSERT(tmp == vnetp->vsw_fp);
1260 		vnetp->vsw_fp = NULL;
1261 
1262 		RW_EXIT(&vnetp->vsw_fp_rw);
1263 	} else if (vresp->type == VIO_NET_RES_HYBRID) {
1264 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1265 
1266 		vnetp->hio_fp = NULL;
1267 
1268 		RW_EXIT(&vnetp->vsw_fp_rw);
1269 	}
1270 
1271 	/*
1272 	 * If there are threads already ref holding before the entry was
1273 	 * removed from hash table, then wait for ref count to drop to zero.
1274 	 */
1275 	(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
1276 	    (refcnt = 1) : (refcnt = 0);
1277 	while (vresp->refcnt > refcnt) {
1278 		delay(drv_usectohz(vnet_fdbe_refcnt_delay));
1279 	}
1280 }
1281 
1282 /*
1283  * Search fdb for a given mac address. If an entry is found, hold
1284  * a reference to it and return the entry; else returns NULL.
1285  */
1286 static vnet_res_t *
1287 vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp)
1288 {
1289 	uint64_t	key = 0;
1290 	vnet_res_t	*vresp;
1291 	int		rv;
1292 
1293 	KEY_HASH(key, addrp->ether_addr_octet);
1294 
1295 	rv = mod_hash_find_cb(vnetp->fdb_hashp, (mod_hash_key_t)key,
1296 	    (mod_hash_val_t *)&vresp, vnet_fdbe_find_cb);
1297 
1298 	if (rv != 0)
1299 		return (NULL);
1300 
1301 	return (vresp);
1302 }
1303 
1304 /*
1305  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1306  * entry corresponding to the key (macaddr), this callback will be invoked by
1307  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1308  * entry before returning the found entry.
1309  */
1310 static void
1311 vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1312 {
1313 	_NOTE(ARGUNUSED(key))
1314 	VNET_FDBE_REFHOLD((vnet_res_t *)val);
1315 }
1316 
1317 /*
1318  * Frames received that are tagged with the pvid of the vnet device must be
1319  * untagged before sending up the stack. This function walks the chain of rx
1320  * frames, untags any such frames and returns the updated chain.
1321  *
1322  * Arguments:
1323  *    pvid:  pvid of the vnet device for which packets are being received
1324  *    mp:    head of pkt chain to be validated and untagged
1325  *
1326  * Returns:
1327  *    mp:    head of updated chain of packets
1328  */
1329 static void
1330 vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
1331 {
1332 	struct ether_vlan_header	*evhp;
1333 	mblk_t				*bp;
1334 	mblk_t				*bpt;
1335 	mblk_t				*bph;
1336 	mblk_t				*bpn;
1337 
1338 	bpn = bph = bpt = NULL;
1339 
1340 	for (bp = *mp; bp != NULL; bp = bpn) {
1341 
1342 		bpn = bp->b_next;
1343 		bp->b_next = bp->b_prev = NULL;
1344 
1345 		evhp = (struct ether_vlan_header *)bp->b_rptr;
1346 
1347 		if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN &&
1348 		    VLAN_ID(ntohs(evhp->ether_tci)) == pvid) {
1349 
1350 			bp = vnet_vlan_remove_tag(bp);
1351 			if (bp == NULL) {
1352 				continue;
1353 			}
1354 
1355 		}
1356 
1357 		/* build a chain of processed packets */
1358 		if (bph == NULL) {
1359 			bph = bpt = bp;
1360 		} else {
1361 			bpt->b_next = bp;
1362 			bpt = bp;
1363 		}
1364 
1365 	}
1366 
1367 	*mp = bph;
1368 }
1369 
1370 static void
1371 vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
1372 {
1373 	vnet_res_t		*vresp = (vnet_res_t *)vrh;
1374 	vnet_t			*vnetp = vresp->vnetp;
1375 	vnet_pseudo_rx_ring_t	*ringp;
1376 
1377 	if ((vnetp == NULL) || (vnetp->mh == 0)) {
1378 		freemsgchain(mp);
1379 		return;
1380 	}
1381 
1382 	ringp = vresp->rx_ringp;
1383 	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
1384 }
1385 
1386 void
1387 vnet_tx_update(vio_net_handle_t vrh)
1388 {
1389 	vnet_res_t		*vresp = (vnet_res_t *)vrh;
1390 	vnet_t			*vnetp = vresp->vnetp;
1391 	vnet_pseudo_tx_ring_t	*tx_ringp;
1392 	vnet_pseudo_tx_group_t	*tx_grp;
1393 	int			i;
1394 
1395 	if (vnetp == NULL || vnetp->mh == NULL) {
1396 		return;
1397 	}
1398 
1399 	/*
1400 	 * Currently, the tx hwring API (used to access rings that belong to
1401 	 * a Hybrid IO resource) does not provide us a per ring flow ctrl
1402 	 * update; also the pseudo rings are shared by the ports/ldcs in the
1403 	 * vgen layer. Thus we can't figure out which pseudo ring is being
1404 	 * re-enabled for transmits. To work around this, when we get a tx
1405 	 * restart notification from below, we simply propagate that to all
1406 	 * the tx pseudo rings registered with the mac layer above.
1407 	 *
1408 	 * There are a couple of side effects with this approach, but they are
1409 	 * not harmful, as outlined below:
1410 	 *
1411 	 * A) We might send an invalid ring_update() for a ring that is not
1412 	 * really flow controlled. This will not have any effect in the mac
1413 	 * layer and packets will continue to be transmitted on that ring.
1414 	 *
1415 	 * B) We might end up clearing the flow control in the mac layer for
1416 	 * a ring that is still flow controlled in the underlying resource.
1417 	 * This will result in the mac layer restarting	transmit, only to be
1418 	 * flow controlled again on that ring.
1419 	 */
1420 	tx_grp = &vnetp->tx_grp[0];
1421 	for (i = 0; i < tx_grp->ring_cnt; i++) {
1422 		tx_ringp = &tx_grp->rings[i];
1423 		mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
1424 	}
1425 }
1426 
1427 /*
1428  * vnet_tx_notify_thread:
1429  *
1430  * vnet_tx_ring_update() callback function wakes up this thread when
1431  * it gets called. This thread will call mac_tx_ring_update() to
1432  * notify upper mac of flow control getting relieved. Note that
1433  * vnet_tx_ring_update() cannot call mac_tx_ring_update() directly
1434  * because vnet_tx_ring_update() is called from lower mac with
1435  * mi_rw_lock held and mac_tx_ring_update() would also try to grab
1436  * the same lock.
1437  */
1438 static void
1439 vnet_tx_notify_thread(void *arg)
1440 {
1441 	callb_cpr_t		cprinfo;
1442 	vnet_pseudo_tx_group_t	*tx_grp = (vnet_pseudo_tx_group_t *)arg;
1443 	vnet_pseudo_tx_ring_t	*tx_ringp;
1444 	vnet_t			*vnetp;
1445 	int			i;
1446 
1447 	CALLB_CPR_INIT(&cprinfo, &tx_grp->flowctl_lock, callb_generic_cpr,
1448 	    "vnet_tx_notify_thread");
1449 
1450 	mutex_enter(&tx_grp->flowctl_lock);
1451 	while (!tx_grp->flowctl_done) {
1452 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1453 		cv_wait(&tx_grp->flowctl_cv, &tx_grp->flowctl_lock);
1454 		CALLB_CPR_SAFE_END(&cprinfo, &tx_grp->flowctl_lock);
1455 
1456 		for (i = 0; i < tx_grp->ring_cnt; i++) {
1457 			tx_ringp = &tx_grp->rings[i];
1458 			if (tx_ringp->woken_up) {
1459 				tx_ringp->woken_up = B_FALSE;
1460 				vnetp = tx_ringp->vnetp;
1461 				mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
1462 			}
1463 		}
1464 	}
1465 	/*
1466 	 * The tx_grp is being destroyed, exit the thread.
1467 	 */
1468 	tx_grp->flowctl_thread = NULL;
1469 	CALLB_CPR_EXIT(&cprinfo);
1470 	thread_exit();
1471 }
1472 
1473 void
1474 vnet_tx_ring_update(void *arg1, uintptr_t arg2)
1475 {
1476 	vnet_t			*vnetp = (vnet_t *)arg1;
1477 	vnet_pseudo_tx_group_t	*tx_grp;
1478 	vnet_pseudo_tx_ring_t	*tx_ringp;
1479 	int			i;
1480 
1481 	tx_grp = &vnetp->tx_grp[0];
1482 	for (i = 0; i < tx_grp->ring_cnt; i++) {
1483 		tx_ringp = &tx_grp->rings[i];
1484 		if (tx_ringp->hw_rh == (mac_ring_handle_t)arg2) {
1485 			mutex_enter(&tx_grp->flowctl_lock);
1486 			tx_ringp->woken_up = B_TRUE;
1487 			cv_signal(&tx_grp->flowctl_cv);
1488 			mutex_exit(&tx_grp->flowctl_lock);
1489 			break;
1490 		}
1491 	}
1492 }
1493 
1494 /*
1495  * Update the new mtu of vnet into the mac layer. First check if the device has
1496  * been plumbed and if so fail the mtu update. Returns 0 on success.
1497  */
1498 int
1499 vnet_mtu_update(vnet_t *vnetp, uint32_t mtu)
1500 {
1501 	int	rv;
1502 
1503 	if (vnetp == NULL || vnetp->mh == NULL) {
1504 		return (EINVAL);
1505 	}
1506 
1507 	WRITE_ENTER(&vnetp->vrwlock);
1508 
1509 	if (vnetp->flags & VNET_STARTED) {
1510 		RW_EXIT(&vnetp->vrwlock);
1511 		cmn_err(CE_NOTE, "!vnet%d: Unable to process mtu "
1512 		    "update as the device is plumbed\n",
1513 		    vnetp->instance);
1514 		return (EBUSY);
1515 	}
1516 
1517 	/* update mtu in the mac layer */
1518 	rv = mac_maxsdu_update(vnetp->mh, mtu);
1519 	if (rv != 0) {
1520 		RW_EXIT(&vnetp->vrwlock);
1521 		cmn_err(CE_NOTE,
1522 		    "!vnet%d: Unable to update mtu with mac layer\n",
1523 		    vnetp->instance);
1524 		return (EIO);
1525 	}
1526 
1527 	vnetp->mtu = mtu;
1528 
1529 	RW_EXIT(&vnetp->vrwlock);
1530 
1531 	return (0);
1532 }
1533 
1534 /*
1535  * Update the link state of vnet to the mac layer.
1536  */
1537 void
1538 vnet_link_update(vnet_t *vnetp, link_state_t link_state)
1539 {
1540 	if (vnetp == NULL || vnetp->mh == NULL) {
1541 		return;
1542 	}
1543 
1544 	WRITE_ENTER(&vnetp->vrwlock);
1545 	if (vnetp->link_state == link_state) {
1546 		RW_EXIT(&vnetp->vrwlock);
1547 		return;
1548 	}
1549 	vnetp->link_state = link_state;
1550 	RW_EXIT(&vnetp->vrwlock);
1551 
1552 	mac_link_update(vnetp->mh, link_state);
1553 }
1554 
1555 /*
1556  * vio_net_resource_reg -- An interface called to register a resource
1557  *	with vnet.
1558  *	macp -- a GLDv3 mac_register that has all the details of
1559  *		a resource and its callbacks etc.
1560  *	type -- resource type.
1561  *	local_macaddr -- resource's MAC address. This is used to
1562  *			 associate a resource with a corresponding vnet.
1563  *	remote_macaddr -- remote side MAC address. This is ignored for
1564  *			  the Hybrid resources.
1565  *	vhp -- A handle returned to the caller.
1566  *	vcb -- A set of callbacks provided to the callers.
1567  */
1568 int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
1569     ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
1570     vio_net_callbacks_t *vcb)
1571 {
1572 	vnet_t		*vnetp;
1573 	vnet_res_t	*vresp;
1574 
1575 	vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
1576 	ether_copy(local_macaddr, vresp->local_macaddr);
1577 	ether_copy(rem_macaddr, vresp->rem_macaddr);
1578 	vresp->type = type;
1579 	bcopy(macp, &vresp->macreg, sizeof (mac_register_t));
1580 
1581 	DBG1(NULL, "Resource Registerig type=0%X\n", type);
1582 
1583 	READ_ENTER(&vnet_rw);
1584 	vnetp = vnet_headp;
1585 	while (vnetp != NULL) {
1586 		if (VNET_MATCH_RES(vresp, vnetp)) {
1587 			vresp->vnetp = vnetp;
1588 
1589 			/* Setup kstats for hio resource */
1590 			if (vresp->type == VIO_NET_RES_HYBRID) {
1591 				vresp->ksp = vnet_hio_setup_kstats(DRV_NAME,
1592 				    "hio", vresp);
1593 				if (vresp->ksp == NULL) {
1594 					cmn_err(CE_NOTE, "!vnet%d: Cannot "
1595 					    "create kstats for hio resource",
1596 					    vnetp->instance);
1597 				}
1598 			}
1599 			vnet_add_resource(vnetp, vresp);
1600 			break;
1601 		}
1602 		vnetp = vnetp->nextp;
1603 	}
1604 	RW_EXIT(&vnet_rw);
1605 	if (vresp->vnetp == NULL) {
1606 		DWARN(NULL, "No vnet instance");
1607 		kmem_free(vresp, sizeof (vnet_res_t));
1608 		return (ENXIO);
1609 	}
1610 
1611 	*vhp = vresp;
1612 	vcb->vio_net_rx_cb = vnet_rx;
1613 	vcb->vio_net_tx_update = vnet_tx_update;
1614 	vcb->vio_net_report_err = vnet_handle_res_err;
1615 
1616 	/* Bind the resource to pseudo ring(s) */
1617 	if (vnet_bind_rings(vresp) != 0) {
1618 		(void) vnet_rem_resource(vnetp, vresp);
1619 		vnet_hio_destroy_kstats(vresp->ksp);
1620 		KMEM_FREE(vresp);
1621 		return (1);
1622 	}
1623 
1624 	/* Dispatch a task to start resources */
1625 	vnet_dispatch_res_task(vnetp);
1626 	return (0);
1627 }
1628 
1629 /*
1630  * vio_net_resource_unreg -- An interface to unregister a resource.
1631  */
1632 void
1633 vio_net_resource_unreg(vio_net_handle_t vhp)
1634 {
1635 	vnet_res_t	*vresp = (vnet_res_t *)vhp;
1636 	vnet_t		*vnetp = vresp->vnetp;
1637 
1638 	DBG1(NULL, "Resource Registerig hdl=0x%p", vhp);
1639 
1640 	ASSERT(vnetp != NULL);
1641 	/*
1642 	 * Remove the resource from fdb; this ensures
1643 	 * there are no references to the resource.
1644 	 */
1645 	vnet_fdbe_del(vnetp, vresp);
1646 
1647 	vnet_unbind_rings(vresp);
1648 
1649 	/* Now remove the resource from the list */
1650 	(void) vnet_rem_resource(vnetp, vresp);
1651 
1652 	vnet_hio_destroy_kstats(vresp->ksp);
1653 	KMEM_FREE(vresp);
1654 }
1655 
1656 static void
1657 vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
1658 {
1659 	WRITE_ENTER(&vnetp->vrwlock);
1660 	vresp->nextp = vnetp->vres_list;
1661 	vnetp->vres_list = vresp;
1662 	RW_EXIT(&vnetp->vrwlock);
1663 }
1664 
1665 static vnet_res_t *
1666 vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
1667 {
1668 	vnet_res_t	*vrp;
1669 
1670 	WRITE_ENTER(&vnetp->vrwlock);
1671 	if (vresp == vnetp->vres_list) {
1672 		vnetp->vres_list = vresp->nextp;
1673 	} else {
1674 		vrp = vnetp->vres_list;
1675 		while (vrp->nextp != NULL) {
1676 			if (vrp->nextp == vresp) {
1677 				vrp->nextp = vresp->nextp;
1678 				break;
1679 			}
1680 			vrp = vrp->nextp;
1681 		}
1682 	}
1683 	vresp->vnetp = NULL;
1684 	vresp->nextp = NULL;
1685 
1686 	RW_EXIT(&vnetp->vrwlock);
1687 
1688 	return (vresp);
1689 }
1690 
1691 /*
1692  * vnet_dds_rx -- an interface called by vgen to DDS messages.
1693  */
1694 void
1695 vnet_dds_rx(void *arg, void *dmsg)
1696 {
1697 	vnet_t *vnetp = arg;
1698 	vdds_process_dds_msg(vnetp, dmsg);
1699 }
1700 
1701 /*
1702  * vnet_send_dds_msg -- An interface provided to DDS to send
1703  *	DDS messages. This simply sends meessages via vgen.
1704  */
1705 int
1706 vnet_send_dds_msg(vnet_t *vnetp, void *dmsg)
1707 {
1708 	int rv;
1709 
1710 	if (vnetp->vgenhdl != NULL) {
1711 		rv = vgen_dds_tx(vnetp->vgenhdl, dmsg);
1712 	}
1713 	return (rv);
1714 }
1715 
1716 /*
1717  * vnet_cleanup_hio -- an interface called by vgen to cleanup hio resources.
1718  */
1719 void
1720 vnet_dds_cleanup_hio(vnet_t *vnetp)
1721 {
1722 	vdds_cleanup_hio(vnetp);
1723 }
1724 
1725 /*
1726  * vnet_handle_res_err -- A callback function called by a resource
1727  *	to report an error. For example, vgen can call to report
1728  *	an LDC down/reset event. This will trigger cleanup of associated
1729  *	Hybrid resource.
1730  */
1731 /* ARGSUSED */
1732 static void
1733 vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err)
1734 {
1735 	vnet_res_t *vresp = (vnet_res_t *)vrh;
1736 	vnet_t *vnetp = vresp->vnetp;
1737 
1738 	if (vnetp == NULL) {
1739 		return;
1740 	}
1741 	if ((vresp->type != VIO_NET_RES_LDC_SERVICE) &&
1742 	    (vresp->type != VIO_NET_RES_HYBRID)) {
1743 		return;
1744 	}
1745 
1746 	vdds_cleanup_hio(vnetp);
1747 }
1748 
1749 /*
1750  * vnet_dispatch_res_task -- A function to dispatch tasks start resources.
1751  */
1752 static void
1753 vnet_dispatch_res_task(vnet_t *vnetp)
1754 {
1755 	int rv;
1756 
1757 	/*
1758 	 * Dispatch the task. It could be the case that vnetp->flags does
1759 	 * not have VNET_STARTED set. This is ok as vnet_rest_start_task()
1760 	 * can abort the task when the task is started. See related comments
1761 	 * in vnet_m_stop() and vnet_stop_resources().
1762 	 */
1763 	rv = ddi_taskq_dispatch(vnetp->taskqp, vnet_res_start_task,
1764 	    vnetp, DDI_NOSLEEP);
1765 	if (rv != DDI_SUCCESS) {
1766 		cmn_err(CE_WARN,
1767 		    "vnet%d:Can't dispatch start resource task",
1768 		    vnetp->instance);
1769 	}
1770 }
1771 
1772 /*
1773  * vnet_res_start_task -- A taskq callback function that starts a resource.
1774  */
1775 static void
1776 vnet_res_start_task(void *arg)
1777 {
1778 	vnet_t *vnetp = arg;
1779 
1780 	WRITE_ENTER(&vnetp->vrwlock);
1781 	if (vnetp->flags & VNET_STARTED) {
1782 		vnet_start_resources(vnetp);
1783 	}
1784 	RW_EXIT(&vnetp->vrwlock);
1785 }
1786 
1787 /*
1788  * vnet_start_resources -- starts all resources associated with
1789  *	a vnet.
1790  */
1791 static void
1792 vnet_start_resources(vnet_t *vnetp)
1793 {
1794 	mac_register_t	*macp;
1795 	mac_callbacks_t	*cbp;
1796 	vnet_res_t	*vresp;
1797 	int rv;
1798 
1799 	DBG1(vnetp, "enter\n");
1800 
1801 	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));
1802 
1803 	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
1804 		/* skip if it is already started */
1805 		if (vresp->flags & VNET_STARTED) {
1806 			continue;
1807 		}
1808 		macp = &vresp->macreg;
1809 		cbp = macp->m_callbacks;
1810 		rv = cbp->mc_start(macp->m_driver);
1811 		if (rv == 0) {
1812 			/*
1813 			 * Successfully started the resource, so now
1814 			 * add it to the fdb.
1815 			 */
1816 			vresp->flags |= VNET_STARTED;
1817 			vnet_fdbe_add(vnetp, vresp);
1818 		}
1819 	}
1820 
1821 	DBG1(vnetp, "exit\n");
1822 
1823 }
1824 
1825 /*
1826  * vnet_stop_resources -- stop all resources associated with a vnet.
1827  */
1828 static void
1829 vnet_stop_resources(vnet_t *vnetp)
1830 {
1831 	vnet_res_t	*vresp;
1832 	mac_register_t	*macp;
1833 	mac_callbacks_t	*cbp;
1834 
1835 	DBG1(vnetp, "enter\n");
1836 
1837 	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));
1838 
1839 	for (vresp = vnetp->vres_list; vresp != NULL; ) {
1840 		if (vresp->flags & VNET_STARTED) {
1841 			/*
1842 			 * Release the lock while invoking mc_stop() of the
1843 			 * underlying resource. We hold a reference to this
1844 			 * resource to prevent being removed from the list in
1845 			 * vio_net_resource_unreg(). Note that new resources
1846 			 * can be added to the head of the list while the lock
1847 			 * is released, but they won't be started, as
1848 			 * VNET_STARTED flag has been cleared for the vnet
1849 			 * device in vnet_m_stop(). Also, while the lock is
1850 			 * released a resource could be removed from the list
1851 			 * in vio_net_resource_unreg(); but that is ok, as we
1852 			 * re-acquire the lock and only then access the forward
1853 			 * link (vresp->nextp) to continue with the next
1854 			 * resource.
1855 			 */
1856 			vresp->flags &= ~VNET_STARTED;
1857 			vresp->flags |= VNET_STOPPING;
1858 			macp = &vresp->macreg;
1859 			cbp = macp->m_callbacks;
1860 			VNET_FDBE_REFHOLD(vresp);
1861 			RW_EXIT(&vnetp->vrwlock);
1862 
1863 			cbp->mc_stop(macp->m_driver);
1864 
1865 			WRITE_ENTER(&vnetp->vrwlock);
1866 			vresp->flags &= ~VNET_STOPPING;
1867 			VNET_FDBE_REFRELE(vresp);
1868 		}
1869 		vresp = vresp->nextp;
1870 	}
1871 	DBG1(vnetp, "exit\n");
1872 }
1873 
1874 /*
1875  * Setup kstats for the HIO statistics.
1876  * NOTE: the synchronization for the statistics is the
1877  * responsibility of the caller.
1878  */
1879 kstat_t *
1880 vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp)
1881 {
1882 	kstat_t *ksp;
1883 	vnet_t *vnetp = vresp->vnetp;
1884 	vnet_hio_kstats_t *hiokp;
1885 	size_t size;
1886 
1887 	ASSERT(vnetp != NULL);
1888 	size = sizeof (vnet_hio_kstats_t) / sizeof (kstat_named_t);
1889 	ksp = kstat_create(ks_mod, vnetp->instance, ks_name, "net",
1890 	    KSTAT_TYPE_NAMED, size, 0);
1891 	if (ksp == NULL) {
1892 		return (NULL);
1893 	}
1894 
1895 	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
1896 	kstat_named_init(&hiokp->ipackets,		"ipackets",
1897 	    KSTAT_DATA_ULONG);
1898 	kstat_named_init(&hiokp->ierrors,		"ierrors",
1899 	    KSTAT_DATA_ULONG);
1900 	kstat_named_init(&hiokp->opackets,		"opackets",
1901 	    KSTAT_DATA_ULONG);
1902 	kstat_named_init(&hiokp->oerrors,		"oerrors",
1903 	    KSTAT_DATA_ULONG);
1904 
1905 
1906 	/* MIB II kstat variables */
1907 	kstat_named_init(&hiokp->rbytes,		"rbytes",
1908 	    KSTAT_DATA_ULONG);
1909 	kstat_named_init(&hiokp->obytes,		"obytes",
1910 	    KSTAT_DATA_ULONG);
1911 	kstat_named_init(&hiokp->multircv,		"multircv",
1912 	    KSTAT_DATA_ULONG);
1913 	kstat_named_init(&hiokp->multixmt,		"multixmt",
1914 	    KSTAT_DATA_ULONG);
1915 	kstat_named_init(&hiokp->brdcstrcv,		"brdcstrcv",
1916 	    KSTAT_DATA_ULONG);
1917 	kstat_named_init(&hiokp->brdcstxmt,		"brdcstxmt",
1918 	    KSTAT_DATA_ULONG);
1919 	kstat_named_init(&hiokp->norcvbuf,		"norcvbuf",
1920 	    KSTAT_DATA_ULONG);
1921 	kstat_named_init(&hiokp->noxmtbuf,		"noxmtbuf",
1922 	    KSTAT_DATA_ULONG);
1923 
1924 	ksp->ks_update = vnet_hio_update_kstats;
1925 	ksp->ks_private = (void *)vresp;
1926 	kstat_install(ksp);
1927 	return (ksp);
1928 }
1929 
1930 /*
1931  * Destroy kstats.
1932  */
1933 static void
1934 vnet_hio_destroy_kstats(kstat_t *ksp)
1935 {
1936 	if (ksp != NULL)
1937 		kstat_delete(ksp);
1938 }
1939 
1940 /*
1941  * Update the kstats.
1942  */
1943 static int
1944 vnet_hio_update_kstats(kstat_t *ksp, int rw)
1945 {
1946 	vnet_t *vnetp;
1947 	vnet_res_t *vresp;
1948 	vnet_hio_stats_t statsp;
1949 	vnet_hio_kstats_t *hiokp;
1950 
1951 	vresp = (vnet_res_t *)ksp->ks_private;
1952 	vnetp = vresp->vnetp;
1953 
1954 	bzero(&statsp, sizeof (vnet_hio_stats_t));
1955 
1956 	READ_ENTER(&vnetp->vsw_fp_rw);
1957 	if (vnetp->hio_fp == NULL) {
1958 		/* not using hio resources, just return */
1959 		RW_EXIT(&vnetp->vsw_fp_rw);
1960 		return (0);
1961 	}
1962 	VNET_FDBE_REFHOLD(vnetp->hio_fp);
1963 	RW_EXIT(&vnetp->vsw_fp_rw);
1964 	vnet_hio_get_stats(vnetp->hio_fp, &statsp);
1965 	VNET_FDBE_REFRELE(vnetp->hio_fp);
1966 
1967 	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
1968 
1969 	if (rw == KSTAT_READ) {
1970 		/* Link Input/Output stats */
1971 		hiokp->ipackets.value.ul	= (uint32_t)statsp.ipackets;
1972 		hiokp->ipackets64.value.ull	= statsp.ipackets;
1973 		hiokp->ierrors.value.ul		= statsp.ierrors;
1974 		hiokp->opackets.value.ul	= (uint32_t)statsp.opackets;
1975 		hiokp->opackets64.value.ull	= statsp.opackets;
1976 		hiokp->oerrors.value.ul		= statsp.oerrors;
1977 
1978 		/* MIB II kstat variables */
1979 		hiokp->rbytes.value.ul		= (uint32_t)statsp.rbytes;
1980 		hiokp->rbytes64.value.ull	= statsp.rbytes;
1981 		hiokp->obytes.value.ul		= (uint32_t)statsp.obytes;
1982 		hiokp->obytes64.value.ull	= statsp.obytes;
1983 		hiokp->multircv.value.ul	= statsp.multircv;
1984 		hiokp->multixmt.value.ul	= statsp.multixmt;
1985 		hiokp->brdcstrcv.value.ul	= statsp.brdcstrcv;
1986 		hiokp->brdcstxmt.value.ul	= statsp.brdcstxmt;
1987 		hiokp->norcvbuf.value.ul	= statsp.norcvbuf;
1988 		hiokp->noxmtbuf.value.ul	= statsp.noxmtbuf;
1989 	} else {
1990 		return (EACCES);
1991 	}
1992 
1993 	return (0);
1994 }
1995 
1996 static void
1997 vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp)
1998 {
1999 	mac_register_t		*macp;
2000 	mac_callbacks_t		*cbp;
2001 	uint64_t		val;
2002 	int			stat;
2003 
2004 	/*
2005 	 * get the specified statistics from the underlying nxge.
2006 	 */
2007 	macp = &vresp->macreg;
2008 	cbp = macp->m_callbacks;
2009 	for (stat = MAC_STAT_MIN; stat < MAC_STAT_OVERFLOWS; stat++) {
2010 		if (cbp->mc_getstat(macp->m_driver, stat, &val) == 0) {
2011 			switch (stat) {
2012 			case MAC_STAT_IPACKETS:
2013 				statsp->ipackets = val;
2014 				break;
2015 
2016 			case MAC_STAT_IERRORS:
2017 				statsp->ierrors = val;
2018 				break;
2019 
2020 			case MAC_STAT_OPACKETS:
2021 				statsp->opackets = val;
2022 				break;
2023 
2024 			case MAC_STAT_OERRORS:
2025 				statsp->oerrors = val;
2026 				break;
2027 
2028 			case MAC_STAT_RBYTES:
2029 				statsp->rbytes = val;
2030 				break;
2031 
2032 			case MAC_STAT_OBYTES:
2033 				statsp->obytes = val;
2034 				break;
2035 
2036 			case MAC_STAT_MULTIRCV:
2037 				statsp->multircv = val;
2038 				break;
2039 
2040 			case MAC_STAT_MULTIXMT:
2041 				statsp->multixmt = val;
2042 				break;
2043 
2044 			case MAC_STAT_BRDCSTRCV:
2045 				statsp->brdcstrcv = val;
2046 				break;
2047 
2048 			case MAC_STAT_BRDCSTXMT:
2049 				statsp->brdcstxmt = val;
2050 				break;
2051 
2052 			case MAC_STAT_NOXMTBUF:
2053 				statsp->noxmtbuf = val;
2054 				break;
2055 
2056 			case MAC_STAT_NORCVBUF:
2057 				statsp->norcvbuf = val;
2058 				break;
2059 
2060 			default:
2061 				/*
2062 				 * parameters not interested.
2063 				 */
2064 				break;
2065 			}
2066 		}
2067 	}
2068 }
2069 
2070 static boolean_t
2071 vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
2072 {
2073 	vnet_t	*vnetp = (vnet_t *)arg;
2074 
2075 	if (vnetp == NULL) {
2076 		return (0);
2077 	}
2078 
2079 	switch (cap) {
2080 
2081 	case MAC_CAPAB_RINGS: {
2082 
2083 		mac_capab_rings_t *cap_rings = cap_data;
2084 		/*
2085 		 * Rings Capability Notes:
2086 		 * We advertise rings to make use of the rings framework in
2087 		 * gldv3 mac layer, to improve the performance. This is
2088 		 * specifically needed when a Hybrid resource (with multiple
2089 		 * tx/rx hardware rings) is assigned to a vnet device. We also
2090 		 * leverage this for the normal case when no Hybrid resource is
2091 		 * assigned.
2092 		 *
2093 		 * Ring Allocation:
2094 		 * - TX path:
2095 		 * We expose a pseudo ring group with 2 pseudo tx rings (as
2096 		 * currently HybridIO exports only 2 rings) In the normal case,
2097 		 * transmit traffic that comes down to the driver through the
2098 		 * mri_tx (vnet_tx_ring_send()) entry point goes through the
2099 		 * distributed switching algorithm in vnet and gets transmitted
2100 		 * over a port/LDC in the vgen layer to either the vswitch or a
2101 		 * peer vnet. If and when a Hybrid resource is assigned to the
2102 		 * vnet, we obtain the tx ring information of the Hybrid device
2103 		 * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
2104 		 * Traffic being sent over the Hybrid resource by the mac layer
2105 		 * gets spread across both hw rings, as they are mapped to the
2106 		 * 2 pseudo tx rings in vnet.
2107 		 *
2108 		 * - RX path:
2109 		 * We expose a pseudo ring group with 3 pseudo rx rings (static
2110 		 * rings) initially. The first (default) pseudo rx ring is
2111 		 * reserved for the resource that connects to the vswitch
2112 		 * service. The next 2 rings are reserved for a Hybrid resource
2113 		 * that may be assigned to the vnet device. If and when a
2114 		 * Hybrid resource is assigned to the vnet, we obtain the rx
2115 		 * ring information of the Hybrid device (nxge) and map these
2116 		 * pseudo rings 1:1 to the 2 hw rx rings. For each additional
2117 		 * resource that connects to a peer vnet, we dynamically
2118 		 * allocate a pseudo rx ring and map it to that resource, when
2119 		 * the resource gets added; and the pseudo rx ring is
2120 		 * dynamically registered with the upper mac layer. We do the
2121 		 * reverse and unregister the ring with the mac layer when
2122 		 * the resource gets removed.
2123 		 *
2124 		 * Synchronization notes:
2125 		 * We don't need any lock to protect members of ring structure,
2126 		 * specifically ringp->hw_rh, in either the TX or the RX ring,
2127 		 * as explained below.
2128 		 * - TX ring:
2129 		 * ring->hw_rh is initialized only when a Hybrid resource is
2130 		 * associated; and gets referenced only in vnet_hio_tx(). The
2131 		 * Hybrid resource itself is available in fdb only after tx
2132 		 * hwrings are found and mapped; i.e, in vio_net_resource_reg()
2133 		 * we call vnet_bind_rings() first and then call
2134 		 * vnet_start_resources() which adds an entry to fdb. For
2135 		 * traffic going over LDC resources, we don't reference
2136 		 * ring->hw_rh at all.
2137 		 * - RX ring:
2138 		 * For rings mapped to Hybrid resource ring->hw_rh is
2139 		 * initialized and only then do we add the rx callback for
2140 		 * the underlying Hybrid resource; we disable callbacks before
2141 		 * we unmap ring->hw_rh. For rings mapped to LDC resources, we
2142 		 * stop the rx callbacks (in vgen) before we remove ring->hw_rh
2143 		 * (vio_net_resource_unreg()).
2144 		 * Also, we access ring->hw_rh in vnet_rx_ring_stat().
2145 		 * Note that for rings mapped to Hybrid resource, though the
2146 		 * rings are statically registered with the mac layer, its
2147 		 * hardware ring mapping (ringp->hw_rh) can be torn down in
2148 		 * vnet_unbind_hwrings() while the kstat operation is in
2149 		 * progress. To protect against this, we hold a reference to
2150 		 * the resource in FDB; this ensures that the thread in
2151 		 * vio_net_resource_unreg() waits for the reference to be
2152 		 * dropped before unbinding the ring.
2153 		 *
2154 		 * We don't need to do this for rings mapped to LDC resources.
2155 		 * These rings are registered/unregistered dynamically with
2156 		 * the mac layer and so any attempt to unregister the ring
2157 		 * while kstat operation is in progress will block in
2158 		 * mac_group_rem_ring(). Thus implicitly protects the
2159 		 * resource (ringp->hw_rh) from disappearing.
2160 		 */
2161 
2162 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2163 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2164 
2165 			/*
2166 			 * The ring_cnt for rx grp is initialized in
2167 			 * vnet_ring_grp_init(). Later, the ring_cnt gets
2168 			 * updated dynamically whenever LDC resources are added
2169 			 * or removed.
2170 			 */
2171 			cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
2172 			cap_rings->mr_rget = vnet_get_ring;
2173 
2174 			cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
2175 			cap_rings->mr_gget = vnet_get_group;
2176 			cap_rings->mr_gaddring = NULL;
2177 			cap_rings->mr_gremring = NULL;
2178 		} else {
2179 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2180 
2181 			/*
2182 			 * The ring_cnt for tx grp is initialized in
2183 			 * vnet_ring_grp_init() and remains constant, as we
			 * do not support dynamic tx rings for now.
2185 			 */
2186 			cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
2187 			cap_rings->mr_rget = vnet_get_ring;
2188 
2189 			/*
2190 			 * Transmit rings are not grouped; i.e, the number of
2191 			 * transmit ring groups advertised should be set to 0.
2192 			 */
2193 			cap_rings->mr_gnum = 0;
2194 
2195 			cap_rings->mr_gget = vnet_get_group;
2196 			cap_rings->mr_gaddring = NULL;
2197 			cap_rings->mr_gremring = NULL;
2198 		}
2199 		return (B_TRUE);
2200 
2201 	}
2202 
2203 	default:
2204 		break;
2205 
2206 	}
2207 
2208 	return (B_FALSE);
2209 }
2210 
2211 /*
2212  * Callback funtion for MAC layer to get ring information.
2213  */
2214 static void
2215 vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
2216     const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
2217 {
2218 	vnet_t	*vnetp = arg;
2219 
2220 	switch (rtype) {
2221 
2222 	case MAC_RING_TYPE_RX: {
2223 
2224 		vnet_pseudo_rx_group_t	*rx_grp;
2225 		vnet_pseudo_rx_ring_t	*rx_ringp;
2226 		mac_intr_t		*mintr;
2227 
2228 		/* We advertised only one RX group */
2229 		ASSERT(g_index == 0);
2230 		rx_grp = &vnetp->rx_grp[g_index];
2231 
2232 		/* Check the current # of rings in the rx group */
2233 		ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));
2234 
2235 		/* Get the ring based on the index */
2236 		rx_ringp = &rx_grp->rings[r_index];
2237 
2238 		rx_ringp->handle = r_handle;
2239 		/*
2240 		 * Note: we don't need to save the incoming r_index in rx_ring,
2241 		 * as vnet_ring_grp_init() would have initialized the index for
2242 		 * each ring in the array.
2243 		 */
2244 		rx_ringp->grp = rx_grp;
2245 		rx_ringp->vnetp = vnetp;
2246 
2247 		mintr = &infop->mri_intr;
2248 		mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
2249 		mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
2250 		mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;
2251 
2252 		infop->mri_driver = (mac_ring_driver_t)rx_ringp;
2253 		infop->mri_start = vnet_rx_ring_start;
2254 		infop->mri_stop = vnet_rx_ring_stop;
2255 		infop->mri_stat = vnet_rx_ring_stat;
2256 
2257 		/* Set the poll function, as this is an rx ring */
2258 		infop->mri_poll = vnet_rx_poll;
2259 		/*
2260 		 * MAC_RING_RX_ENQUEUE bit needed to be set for nxge
2261 		 * which was not sending packet chains in interrupt
2262 		 * context. For such drivers, packets are queued in
2263 		 * Rx soft rings so that we get a chance to switch
2264 		 * into a polling mode under backlog. This bug (not
2265 		 * sending packet chains) has now been fixed. Once
2266 		 * the performance impact is measured, this change
2267 		 * will be removed.
2268 		 */
2269 		infop->mri_flags = (vnet_mac_rx_queuing ?
2270 		    MAC_RING_RX_ENQUEUE : 0);
2271 		break;
2272 	}
2273 
2274 	case MAC_RING_TYPE_TX: {
2275 		vnet_pseudo_tx_group_t	*tx_grp;
2276 		vnet_pseudo_tx_ring_t	*tx_ringp;
2277 
2278 		/*
2279 		 * No need to check grp index; mac layer passes -1 for it.
2280 		 */
2281 		tx_grp = &vnetp->tx_grp[0];
2282 
2283 		/* Check the # of rings in the tx group */
2284 		ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));
2285 
2286 		/* Get the ring based on the index */
2287 		tx_ringp = &tx_grp->rings[r_index];
2288 
2289 		tx_ringp->handle = r_handle;
2290 		tx_ringp->index = r_index;
2291 		tx_ringp->grp = tx_grp;
2292 		tx_ringp->vnetp = vnetp;
2293 
2294 		infop->mri_driver = (mac_ring_driver_t)tx_ringp;
2295 		infop->mri_start = vnet_tx_ring_start;
2296 		infop->mri_stop = vnet_tx_ring_stop;
2297 		infop->mri_stat = vnet_tx_ring_stat;
2298 
2299 		/* Set the transmit function, as this is a tx ring */
2300 		infop->mri_tx = vnet_tx_ring_send;
2301 		/*
2302 		 * MAC_RING_TX_SERIALIZE bit needs to be set while
2303 		 * hybridIO is enabled to workaround tx lock
2304 		 * contention issues in nxge.
2305 		 */
2306 		infop->mri_flags = (vnet_mac_tx_serialize ?
2307 		    MAC_RING_TX_SERIALIZE : 0);
2308 		break;
2309 	}
2310 
2311 	default:
2312 		break;
2313 	}
2314 }
2315 
2316 /*
2317  * Callback funtion for MAC layer to get group information.
2318  */
2319 static void
2320 vnet_get_group(void *arg, mac_ring_type_t type, const int index,
2321     mac_group_info_t *infop, mac_group_handle_t handle)
2322 {
2323 	vnet_t	*vnetp = (vnet_t *)arg;
2324 
2325 	switch (type) {
2326 
2327 	case MAC_RING_TYPE_RX:
2328 	{
2329 		vnet_pseudo_rx_group_t	*rx_grp;
2330 
2331 		/* We advertised only one RX group */
2332 		ASSERT(index == 0);
2333 
2334 		rx_grp = &vnetp->rx_grp[index];
2335 		rx_grp->handle = handle;
2336 		rx_grp->index = index;
2337 		rx_grp->vnetp = vnetp;
2338 
2339 		infop->mgi_driver = (mac_group_driver_t)rx_grp;
2340 		infop->mgi_start = NULL;
2341 		infop->mgi_stop = NULL;
2342 		infop->mgi_addmac = vnet_addmac;
2343 		infop->mgi_remmac = vnet_remmac;
2344 		infop->mgi_count = rx_grp->ring_cnt;
2345 
2346 		break;
2347 	}
2348 
2349 	case MAC_RING_TYPE_TX:
2350 	{
2351 		vnet_pseudo_tx_group_t	*tx_grp;
2352 
2353 		/* We advertised only one TX group */
2354 		ASSERT(index == 0);
2355 
2356 		tx_grp = &vnetp->tx_grp[index];
2357 		tx_grp->handle = handle;
2358 		tx_grp->index = index;
2359 		tx_grp->vnetp = vnetp;
2360 
2361 		infop->mgi_driver = (mac_group_driver_t)tx_grp;
2362 		infop->mgi_start = NULL;
2363 		infop->mgi_stop = NULL;
2364 		infop->mgi_addmac = NULL;
2365 		infop->mgi_remmac = NULL;
2366 		infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;
2367 
2368 		break;
2369 	}
2370 
2371 	default:
2372 		break;
2373 
2374 	}
2375 }
2376 
2377 static int
2378 vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
2379 {
2380 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2381 	int			err;
2382 
2383 	/*
2384 	 * If this ring is mapped to a LDC resource, simply mark the state to
2385 	 * indicate the ring is started and return.
2386 	 */
2387 	if ((rx_ringp->state &
2388 	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
2389 		rx_ringp->gen_num = mr_gen_num;
2390 		rx_ringp->state |= VNET_RXRING_STARTED;
2391 		return (0);
2392 	}
2393 
2394 	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2395 
2396 	/*
2397 	 * This must be a ring reserved for a hwring. If the hwring is not
2398 	 * bound yet, simply mark the state to indicate the ring is started and
2399 	 * return. If and when a hybrid resource is activated for this vnet
2400 	 * device, we will bind the hwring and start it then. If a hwring is
2401 	 * already bound, start it now.
2402 	 */
2403 	if (rx_ringp->hw_rh == NULL) {
2404 		rx_ringp->gen_num = mr_gen_num;
2405 		rx_ringp->state |= VNET_RXRING_STARTED;
2406 		return (0);
2407 	}
2408 
2409 	err = mac_hwring_activate(rx_ringp->hw_rh);
2410 	if (err == 0) {
2411 		rx_ringp->gen_num = mr_gen_num;
2412 		rx_ringp->state |= VNET_RXRING_STARTED;
2413 	} else {
2414 		err = ENXIO;
2415 	}
2416 
2417 	return (err);
2418 }
2419 
2420 static void
2421 vnet_rx_ring_stop(mac_ring_driver_t arg)
2422 {
2423 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2424 
2425 	/*
2426 	 * If this ring is mapped to a LDC resource, simply mark the state to
2427 	 * indicate the ring is now stopped and return.
2428 	 */
2429 	if ((rx_ringp->state &
2430 	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
2431 		rx_ringp->state &= ~VNET_RXRING_STARTED;
2432 		return;
2433 	}
2434 
2435 	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2436 
2437 	/*
2438 	 * This must be a ring reserved for a hwring. If the hwring is not
2439 	 * bound yet, simply mark the state to indicate the ring is stopped and
2440 	 * return. If a hwring is already bound, stop it now.
2441 	 */
2442 	if (rx_ringp->hw_rh == NULL) {
2443 		rx_ringp->state &= ~VNET_RXRING_STARTED;
2444 		return;
2445 	}
2446 
2447 	mac_hwring_quiesce(rx_ringp->hw_rh);
2448 	rx_ringp->state &= ~VNET_RXRING_STARTED;
2449 }
2450 
2451 static int
2452 vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2453 {
2454 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)rdriver;
2455 	vnet_t			*vnetp = (vnet_t *)rx_ringp->vnetp;
2456 	vnet_res_t		*vresp;
2457 	mac_register_t		*macp;
2458 	mac_callbacks_t		*cbp;
2459 
2460 	/*
2461 	 * Refer to vnet_m_capab() function for detailed comments on ring
2462 	 * synchronization.
2463 	 */
2464 	if ((rx_ringp->state & VNET_RXRING_HYBRID) != 0) {
2465 		READ_ENTER(&vnetp->vsw_fp_rw);
2466 		if (vnetp->hio_fp == NULL) {
2467 			RW_EXIT(&vnetp->vsw_fp_rw);
2468 			return (0);
2469 		}
2470 
2471 		VNET_FDBE_REFHOLD(vnetp->hio_fp);
2472 		RW_EXIT(&vnetp->vsw_fp_rw);
2473 		(void) mac_hwring_getstat(rx_ringp->hw_rh, stat, val);
2474 		VNET_FDBE_REFRELE(vnetp->hio_fp);
2475 		return (0);
2476 	}
2477 
2478 	ASSERT((rx_ringp->state &
2479 	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0);
2480 	vresp = (vnet_res_t *)rx_ringp->hw_rh;
2481 	macp = &vresp->macreg;
2482 	cbp = macp->m_callbacks;
2483 
2484 	(void) cbp->mc_getstat(macp->m_driver, stat, val);
2485 
2486 	return (0);
2487 }
2488 
2489 /* ARGSUSED */
2490 static int
2491 vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
2492 {
2493 	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
2494 
2495 	tx_ringp->state |= VNET_TXRING_STARTED;
2496 	return (0);
2497 }
2498 
2499 static void
2500 vnet_tx_ring_stop(mac_ring_driver_t arg)
2501 {
2502 	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
2503 
2504 	tx_ringp->state &= ~VNET_TXRING_STARTED;
2505 }
2506 
2507 static int
2508 vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2509 {
2510 	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)rdriver;
2511 	vnet_tx_ring_stats_t	*statsp;
2512 
2513 	statsp = &tx_ringp->tx_ring_stats;
2514 
2515 	switch (stat) {
2516 	case MAC_STAT_OPACKETS:
2517 		*val = statsp->opackets;
2518 		break;
2519 
2520 	case MAC_STAT_OBYTES:
2521 		*val = statsp->obytes;
2522 		break;
2523 
2524 	default:
2525 		*val = 0;
2526 		return (ENOTSUP);
2527 	}
2528 
2529 	return (0);
2530 }
2531 
2532 /*
2533  * Disable polling for a ring and enable its interrupt.
2534  */
2535 static int
2536 vnet_ring_enable_intr(void *arg)
2537 {
2538 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2539 	vnet_res_t		*vresp;
2540 
2541 	if (rx_ringp->hw_rh == NULL) {
2542 		/*
2543 		 * Ring enable intr func is being invoked, but the ring is
2544 		 * not bound to any underlying resource ? This must be a ring
2545 		 * reserved for Hybrid resource and no such resource has been
2546 		 * assigned to this vnet device yet. We simply return success.
2547 		 */
2548 		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2549 		return (0);
2550 	}
2551 
2552 	/*
2553 	 * The rx ring has been bound to either a LDC or a Hybrid resource.
2554 	 * Call the appropriate function to enable interrupts for the ring.
2555 	 */
2556 	if (rx_ringp->state & VNET_RXRING_HYBRID) {
2557 		return (mac_hwring_enable_intr(rx_ringp->hw_rh));
2558 	} else {
2559 		vresp = (vnet_res_t *)rx_ringp->hw_rh;
2560 		return (vgen_enable_intr(vresp->macreg.m_driver));
2561 	}
2562 }
2563 
2564 /*
2565  * Enable polling for a ring and disable its interrupt.
2566  */
2567 static int
2568 vnet_ring_disable_intr(void *arg)
2569 {
2570 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2571 	vnet_res_t		*vresp;
2572 
2573 	if (rx_ringp->hw_rh == NULL) {
2574 		/*
2575 		 * Ring disable intr func is being invoked, but the ring is
2576 		 * not bound to any underlying resource ? This must be a ring
2577 		 * reserved for Hybrid resource and no such resource has been
2578 		 * assigned to this vnet device yet. We simply return success.
2579 		 */
2580 		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2581 		return (0);
2582 	}
2583 
2584 	/*
2585 	 * The rx ring has been bound to either a LDC or a Hybrid resource.
2586 	 * Call the appropriate function to disable interrupts for the ring.
2587 	 */
2588 	if (rx_ringp->state & VNET_RXRING_HYBRID) {
2589 		return (mac_hwring_disable_intr(rx_ringp->hw_rh));
2590 	} else {
2591 		vresp = (vnet_res_t *)rx_ringp->hw_rh;
2592 		return (vgen_disable_intr(vresp->macreg.m_driver));
2593 	}
2594 }
2595 
2596 /*
2597  * Poll 'bytes_to_pickup' bytes of message from the rx ring.
2598  */
2599 static mblk_t *
2600 vnet_rx_poll(void *arg, int bytes_to_pickup)
2601 {
2602 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2603 	mblk_t			*mp = NULL;
2604 	vnet_res_t		*vresp;
2605 	vnet_t			*vnetp = rx_ringp->vnetp;
2606 
2607 	if (rx_ringp->hw_rh == NULL) {
2608 		return (NULL);
2609 	}
2610 
2611 	if (rx_ringp->state & VNET_RXRING_HYBRID) {
2612 		mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
2613 		/*
2614 		 * Packets received over a hybrid resource need additional
2615 		 * processing to remove the tag, for the pvid case. The
2616 		 * underlying resource is not aware of the vnet's pvid and thus
2617 		 * packets are received with the vlan tag in the header; unlike
2618 		 * packets that are received over a ldc channel in which case
2619 		 * the peer vnet/vsw would have already removed the tag.
2620 		 */
2621 		if (vnetp->pvid != vnetp->default_vlan_id) {
2622 			vnet_rx_frames_untag(vnetp->pvid, &mp);
2623 		}
2624 	} else {
2625 		vresp = (vnet_res_t *)rx_ringp->hw_rh;
2626 		mp = vgen_rx_poll(vresp->macreg.m_driver, bytes_to_pickup);
2627 	}
2628 	return (mp);
2629 }
2630 
2631 /* ARGSUSED */
2632 void
2633 vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
2634     boolean_t loopback)
2635 {
2636 	vnet_t			*vnetp = (vnet_t *)arg;
2637 	vnet_pseudo_rx_ring_t	*ringp = (vnet_pseudo_rx_ring_t *)mrh;
2638 
2639 	/*
2640 	 * Packets received over a hybrid resource need additional processing
2641 	 * to remove the tag, for the pvid case. The underlying resource is
2642 	 * not aware of the vnet's pvid and thus packets are received with the
2643 	 * vlan tag in the header; unlike packets that are received over a ldc
2644 	 * channel in which case the peer vnet/vsw would have already removed
2645 	 * the tag.
2646 	 */
2647 	if (vnetp->pvid != vnetp->default_vlan_id) {
2648 		vnet_rx_frames_untag(vnetp->pvid, &mp);
2649 		if (mp == NULL) {
2650 			return;
2651 		}
2652 	}
2653 	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
2654 }
2655 
2656 static int
2657 vnet_addmac(void *arg, const uint8_t *mac_addr)
2658 {
2659 	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
2660 	vnet_t			*vnetp;
2661 
2662 	vnetp = rx_grp->vnetp;
2663 
2664 	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
2665 		return (0);
2666 	}
2667 
2668 	cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
2669 	    vnetp->instance, __func__);
2670 	return (EINVAL);
2671 }
2672 
2673 static int
2674 vnet_remmac(void *arg, const uint8_t *mac_addr)
2675 {
2676 	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
2677 	vnet_t			*vnetp;
2678 
2679 	vnetp = rx_grp->vnetp;
2680 
2681 	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
2682 		return (0);
2683 	}
2684 
2685 	cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
2686 	    vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
2687 	return (EINVAL);
2688 }
2689 
2690 int
2691 vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
2692 {
2693 	mac_handle_t		mh;
2694 	mac_client_handle_t	mch = NULL;
2695 	mac_unicast_handle_t	muh = NULL;
2696 	mac_diag_t		diag;
2697 	mac_register_t		*macp;
2698 	char			client_name[MAXNAMELEN];
2699 	int			rv;
2700 	uint16_t		mac_flags = MAC_UNICAST_TAG_DISABLE |
2701 	    MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
2702 	vio_net_callbacks_t	vcb;
2703 	ether_addr_t		rem_addr =
2704 		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
2705 	uint32_t		retries = 0;
2706 
2707 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2708 		return (EAGAIN);
2709 	}
2710 
2711 	do {
2712 		rv = mac_open_by_linkname(ifname, &mh);
2713 		if (rv == 0) {
2714 			break;
2715 		}
2716 		if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
2717 			mac_free(macp);
2718 			return (rv);
2719 		}
2720 		drv_usecwait(vnet_mac_open_delay);
2721 	} while (rv == ENOENT);
2722 
2723 	vnetp->hio_mh = mh;
2724 
2725 	(void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
2726 	    ifname);
2727 	rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
2728 	if (rv != 0) {
2729 		goto fail;
2730 	}
2731 	vnetp->hio_mch = mch;
2732 
2733 	rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
2734 	    &diag);
2735 	if (rv != 0) {
2736 		goto fail;
2737 	}
2738 	vnetp->hio_muh = muh;
2739 
2740 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2741 	macp->m_driver = vnetp;
2742 	macp->m_dip = NULL;
2743 	macp->m_src_addr = NULL;
2744 	macp->m_callbacks = &vnet_hio_res_callbacks;
2745 	macp->m_min_sdu = 0;
2746 	macp->m_max_sdu = ETHERMTU;
2747 
2748 	rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
2749 	    vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
2750 	if (rv != 0) {
2751 		goto fail;
2752 	}
2753 	mac_free(macp);
2754 
2755 	/* add the recv callback */
2756 	mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);
2757 
2758 	return (0);
2759 
2760 fail:
2761 	mac_free(macp);
2762 	vnet_hio_mac_cleanup(vnetp);
2763 	return (1);
2764 }
2765 
/*
 * Tear down whatever vnet_hio_mac_init() managed to set up for the Hybrid
 * resource. Each step runs only if the corresponding handle in vnetp is
 * non-NULL, and the handle is cleared afterwards, so this is safe to call
 * on a partially initialized vnet (as the failure path of
 * vnet_hio_mac_init() does) and to call more than once.
 *
 * The steps run in the reverse order of the setup: resource registration,
 * unicast address, mac client, and finally the mac handle itself.
 */
void
vnet_hio_mac_cleanup(vnet_t *vnetp)
{
	/* Unregister the Hybrid resource */
	if (vnetp->hio_vhp != NULL) {
		vio_net_resource_unreg(vnetp->hio_vhp);
		vnetp->hio_vhp = NULL;
	}

	/* Remove the unicast address; the result is deliberately ignored */
	if (vnetp->hio_muh != NULL) {
		(void) mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
		vnetp->hio_muh = NULL;
	}

	/* Close the mac client */
	if (vnetp->hio_mch != NULL) {
		mac_client_close(vnetp->hio_mch, 0);
		vnetp->hio_mch = NULL;
	}

	/* Close the underlying mac */
	if (vnetp->hio_mh != NULL) {
		mac_close(vnetp->hio_mh);
		vnetp->hio_mh = NULL;
	}
}
2789 
/*
 * Bind pseudo rings to hwrings
 *
 * Maps the pseudo rx rings reserved for the Hybrid resource, and the
 * pseudo tx rings, onto the hw rings of the underlying Hybrid device. All
 * of the work is done inside the mac perimeter of the Hybrid device
 * (vnetp->hio_mh).
 *
 * Returns 0 on success. On any failure the perimeter is dropped,
 * vnet_unbind_hwrings() is called to undo whatever was bound, and 1 is
 * returned.
 */
static int
vnet_bind_hwrings(vnet_t *vnetp)
{
	mac_ring_handle_t	hw_rh[VNET_NUM_HYBRID_RINGS];
	mac_perim_handle_t	mph1;
	vnet_pseudo_rx_group_t	*rx_grp;
	vnet_pseudo_rx_ring_t	*rx_ringp;
	vnet_pseudo_tx_group_t	*tx_grp;
	vnet_pseudo_tx_ring_t	*tx_ringp;
	int			hw_ring_cnt;
	int			i;
	int			rv;

	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);

	/* Get the list of the underlying RX rings. */
	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
	    MAC_RING_TYPE_RX);

	/* We expect the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
		cmn_err(CE_WARN,
		    "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
		    vnetp->instance, hw_ring_cnt);
		goto fail;
	}

	if (vnetp->rx_hwgh != NULL) {
		/*
		 * Quiesce the HW ring and the mac srs on the ring. Note
		 * that the HW ring will be restarted when the pseudo ring
		 * is started. At that time all the packets will be
		 * directly passed up to the pseudo RX ring and handled
		 * by mac srs created over the pseudo RX ring.
		 */
		mac_rx_client_quiesce(vnetp->hio_mch);
		mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
	}

	/*
	 * Bind the pseudo rings to the hwrings and start the hwrings.
	 * Note we don't need to register these with the upper mac, as we have
	 * statically exported these pseudo rxrings which are reserved for
	 * rxrings of Hybrid resource.
	 */
	rx_grp = &vnetp->rx_grp[0];
	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
		/* Pick the rxrings reserved for Hybrid resource */
		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];

		/* Store the hw ring handle */
		rx_ringp->hw_rh = hw_rh[i];

		/* Bind the pseudo ring to the underlying hwring */
		mac_hwring_setup(rx_ringp->hw_rh,
		    (mac_resource_handle_t)rx_ringp, NULL);

		/* Start the hwring if needed */
		if (rx_ringp->state & VNET_RXRING_STARTED) {
			rv = mac_hwring_activate(rx_ringp->hw_rh);
			if (rv != 0) {
				/*
				 * Undo the setup of this ring here; rings
				 * bound in earlier iterations are undone by
				 * vnet_unbind_hwrings() on the fail path.
				 */
				mac_hwring_teardown(rx_ringp->hw_rh);
				rx_ringp->hw_rh = NULL;
				goto fail;
			}
		}
	}

	/* Get the list of the underlying TX rings. */
	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
	    MAC_RING_TYPE_TX);

	/* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
		cmn_err(CE_WARN,
		    "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
		    vnetp->instance, hw_ring_cnt);
		goto fail;
	}

	/*
	 * Now map the pseudo txrings to the hw txrings. Note we don't need
	 * to register these with the upper mac, as we have statically exported
	 * these rings. Note that these rings will continue to be used for LDC
	 * resources to peer vnets and vswitch (shared ring).
	 *
	 * NOTE(review): this loop is bounded by tx_grp->ring_cnt while hw_rh[]
	 * holds VNET_NUM_HYBRID_RINGS entries (and vnet_unbind_hwrings()
	 * iterates VNET_NUM_HYBRID_RINGS tx rings); this presumably relies on
	 * the two counts being equal -- confirm.
	 */
	tx_grp = &vnetp->tx_grp[0];
	for (i = 0; i < tx_grp->ring_cnt; i++) {
		tx_ringp = &tx_grp->rings[i];
		tx_ringp->hw_rh = hw_rh[i];
		tx_ringp->state |= VNET_TXRING_HYBRID;
	}
	/* Register for tx notifications from the Hybrid device's client */
	tx_grp->tx_notify_handle =
	    mac_client_tx_notify(vnetp->hio_mch, vnet_tx_ring_update, vnetp);

	mac_perim_exit(mph1);
	return (0);

fail:
	/*
	 * The perimeter is dropped first because vnet_unbind_hwrings()
	 * enters it itself. Note that every path to this label is taken
	 * before tx_notify_handle is assigned above.
	 */
	mac_perim_exit(mph1);
	vnet_unbind_hwrings(vnetp);
	return (1);
}
2894 
2895 /* Unbind pseudo rings from hwrings */
2896 static void
2897 vnet_unbind_hwrings(vnet_t *vnetp)
2898 {
2899 	mac_perim_handle_t	mph1;
2900 	vnet_pseudo_rx_ring_t	*rx_ringp;
2901 	vnet_pseudo_rx_group_t	*rx_grp;
2902 	vnet_pseudo_tx_group_t	*tx_grp;
2903 	vnet_pseudo_tx_ring_t	*tx_ringp;
2904 	int			i;
2905 
2906 	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
2907 
2908 	tx_grp = &vnetp->tx_grp[0];
2909 	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2910 		tx_ringp = &tx_grp->rings[i];
2911 		if (tx_ringp->state & VNET_TXRING_HYBRID) {
2912 			tx_ringp->state &= ~VNET_TXRING_HYBRID;
2913 			tx_ringp->hw_rh = NULL;
2914 		}
2915 	}
2916 	(void) mac_client_tx_notify(vnetp->hio_mch, NULL,
2917 	    tx_grp->tx_notify_handle);
2918 
2919 	rx_grp = &vnetp->rx_grp[0];
2920 	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2921 		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
2922 		if (rx_ringp->hw_rh != NULL) {
2923 			/* Stop the hwring */
2924 			mac_hwring_quiesce(rx_ringp->hw_rh);
2925 
2926 			/* Teardown the hwring */
2927 			mac_hwring_teardown(rx_ringp->hw_rh);
2928 			rx_ringp->hw_rh = NULL;
2929 		}
2930 	}
2931 
2932 	if (vnetp->rx_hwgh != NULL) {
2933 		vnetp->rx_hwgh = NULL;
2934 		/*
2935 		 * First clear the permanent-quiesced flag of the RX srs then
2936 		 * restart the HW ring and the mac srs on the ring.
2937 		 */
2938 		mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
2939 		mac_rx_client_restart(vnetp->hio_mch);
2940 	}
2941 
2942 	mac_perim_exit(mph1);
2943 }
2944 
2945 /* Bind pseudo ring to a LDC resource */
2946 static int
2947 vnet_bind_vgenring(vnet_res_t *vresp)
2948 {
2949 	vnet_t			*vnetp;
2950 	vnet_pseudo_rx_group_t	*rx_grp;
2951 	vnet_pseudo_rx_ring_t	*rx_ringp;
2952 	mac_perim_handle_t	mph1;
2953 	int			rv;
2954 	int			type;
2955 
2956 	vnetp = vresp->vnetp;
2957 	type = vresp->type;
2958 	rx_grp = &vnetp->rx_grp[0];
2959 
2960 	if (type == VIO_NET_RES_LDC_SERVICE) {
2961 		/*
2962 		 * Ring Index 0 is the default ring in the group and is
2963 		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
2964 		 * is allocated statically and is reported to the mac layer
2965 		 * in vnet_m_capab(). So, all we need to do here, is save a
2966 		 * reference to the associated vresp.
2967 		 */
2968 		rx_ringp = &rx_grp->rings[0];
2969 		rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
2970 		vresp->rx_ringp = (void *)rx_ringp;
2971 		return (0);
2972 	}
2973 	ASSERT(type == VIO_NET_RES_LDC_GUEST);
2974 
2975 	mac_perim_enter_by_mh(vnetp->mh, &mph1);
2976 
2977 	rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
2978 	if (rx_ringp == NULL) {
2979 		cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
2980 		    vnetp->instance);
2981 		goto fail;
2982 	}
2983 
2984 	/* Store the LDC resource itself as the ring handle */
2985 	rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
2986 
2987 	/*
2988 	 * Save a reference to the ring in the resource for lookup during
2989 	 * unbind. Note this is only done for LDC resources. We don't need this
2990 	 * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
2991 	 * rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
2992 	 */
2993 	vresp->rx_ringp = (void *)rx_ringp;
2994 	rx_ringp->state |= VNET_RXRING_LDC_GUEST;
2995 
2996 	/* Register the pseudo ring with upper-mac */
2997 	rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
2998 	if (rv != 0) {
2999 		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
3000 		rx_ringp->hw_rh = NULL;
3001 		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
3002 		goto fail;
3003 	}
3004 
3005 	mac_perim_exit(mph1);
3006 	return (0);
3007 fail:
3008 	mac_perim_exit(mph1);
3009 	return (1);
3010 }
3011 
3012 /* Unbind pseudo ring from a LDC resource */
3013 static void
3014 vnet_unbind_vgenring(vnet_res_t *vresp)
3015 {
3016 	vnet_t			*vnetp;
3017 	vnet_pseudo_rx_group_t	*rx_grp;
3018 	vnet_pseudo_rx_ring_t	*rx_ringp;
3019 	mac_perim_handle_t	mph1;
3020 	int			type;
3021 
3022 	vnetp = vresp->vnetp;
3023 	type = vresp->type;
3024 	rx_grp = &vnetp->rx_grp[0];
3025 
3026 	if (vresp->rx_ringp == NULL) {
3027 		return;
3028 	}
3029 
3030 	if (type == VIO_NET_RES_LDC_SERVICE) {
3031 		/*
3032 		 * Ring Index 0 is the default ring in the group and is
3033 		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
3034 		 * is allocated statically and is reported to the mac layer
3035 		 * in vnet_m_capab(). So, all we need to do here, is remove its
3036 		 * reference to the associated vresp.
3037 		 */
3038 		rx_ringp = &rx_grp->rings[0];
3039 		rx_ringp->hw_rh = NULL;
3040 		vresp->rx_ringp = NULL;
3041 		return;
3042 	}
3043 	ASSERT(type == VIO_NET_RES_LDC_GUEST);
3044 
3045 	mac_perim_enter_by_mh(vnetp->mh, &mph1);
3046 
3047 	rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
3048 	vresp->rx_ringp = NULL;
3049 
3050 	if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
3051 		/* Unregister the pseudo ring with upper-mac */
3052 		mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);
3053 
3054 		rx_ringp->hw_rh = NULL;
3055 		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
3056 
3057 		/* Free the pseudo rx ring */
3058 		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
3059 	}
3060 
3061 	mac_perim_exit(mph1);
3062 }
3063 
3064 static void
3065 vnet_unbind_rings(vnet_res_t *vresp)
3066 {
3067 	switch (vresp->type) {
3068 
3069 	case VIO_NET_RES_LDC_SERVICE:
3070 	case VIO_NET_RES_LDC_GUEST:
3071 		vnet_unbind_vgenring(vresp);
3072 		break;
3073 
3074 	case VIO_NET_RES_HYBRID:
3075 		vnet_unbind_hwrings(vresp->vnetp);
3076 		break;
3077 
3078 	default:
3079 		break;
3080 
3081 	}
3082 }
3083 
3084 static int
3085 vnet_bind_rings(vnet_res_t *vresp)
3086 {
3087 	int	rv;
3088 
3089 	switch (vresp->type) {
3090 
3091 	case VIO_NET_RES_LDC_SERVICE:
3092 	case VIO_NET_RES_LDC_GUEST:
3093 		rv = vnet_bind_vgenring(vresp);
3094 		break;
3095 
3096 	case VIO_NET_RES_HYBRID:
3097 		rv = vnet_bind_hwrings(vresp->vnetp);
3098 		break;
3099 
3100 	default:
3101 		rv = 1;
3102 		break;
3103 
3104 	}
3105 
3106 	return (rv);
3107 }
3108 
3109 /* ARGSUSED */
3110 int
3111 vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
3112 {
3113 	vnet_t	*vnetp = (vnet_t *)arg;
3114 
3115 	*val = mac_stat_get(vnetp->hio_mh, stat);
3116 	return (0);
3117 }
3118 
/*
 * The start() and stop() routines for the Hybrid resource below, are just
 * dummy functions. This is provided to avoid resource type specific code in
 * vnet_start_resources() and vnet_stop_resources(). The starting and stopping
 * of the Hybrid resource happens in the context of the mac_client interfaces
 * that are invoked in vnet_hio_mac_init() and vnet_hio_mac_cleanup().
 *
 * vnet_hio_start() ignores its argument and always returns 0.
 */
/* ARGSUSED */
static int
vnet_hio_start(void *arg)
{
	return (0);
}
3132 
/*
 * Dummy stop routine for the Hybrid resource: ignores its argument and
 * does nothing.
 */
/* ARGSUSED */
static void
vnet_hio_stop(void *arg)
{
}
3138 
3139 mblk_t *
3140 vnet_hio_tx(void *arg, mblk_t *mp)
3141 {
3142 	vnet_pseudo_tx_ring_t	*tx_ringp;
3143 	mblk_t			*nextp;
3144 	mblk_t			*ret_mp;
3145 
3146 	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
3147 	for (;;) {
3148 		nextp = mp->b_next;
3149 		mp->b_next = NULL;
3150 
3151 		ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
3152 		if (ret_mp != NULL) {
3153 			ret_mp->b_next = nextp;
3154 			mp = ret_mp;
3155 			break;
3156 		}
3157 
3158 		if ((mp = nextp) == NULL)
3159 			break;
3160 	}
3161 	return (mp);
3162 }
3163 
3164 #ifdef	VNET_IOC_DEBUG
3165 
3166 /*
3167  * The ioctl entry point is used only for debugging for now. The ioctl commands
3168  * can be used to force the link state of the channel connected to vsw.
3169  */
3170 static void
3171 vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
3172 {
3173 	struct iocblk	*iocp;
3174 	vnet_t		*vnetp;
3175 
3176 	iocp = (struct iocblk *)(uintptr_t)mp->b_rptr;
3177 	iocp->ioc_error = 0;
3178 	vnetp = (vnet_t *)arg;
3179 
3180 	if (vnetp == NULL) {
3181 		miocnak(q, mp, 0, EINVAL);
3182 		return;
3183 	}
3184 
3185 	switch (iocp->ioc_cmd) {
3186 
3187 	case VNET_FORCE_LINK_DOWN:
3188 	case VNET_FORCE_LINK_UP:
3189 		vnet_force_link_state(vnetp, q, mp);
3190 		break;
3191 
3192 	default:
3193 		iocp->ioc_error = EINVAL;
3194 		miocnak(q, mp, 0, iocp->ioc_error);
3195 		break;
3196 
3197 	}
3198 }
3199 
3200 static void
3201 vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp)
3202 {
3203 	mac_register_t	*macp;
3204 	mac_callbacks_t	*cbp;
3205 	vnet_res_t	*vresp;
3206 
3207 	READ_ENTER(&vnetp->vsw_fp_rw);
3208 
3209 	vresp = vnetp->vsw_fp;
3210 	if (vresp == NULL) {
3211 		RW_EXIT(&vnetp->vsw_fp_rw);
3212 		return;
3213 	}
3214 
3215 	macp = &vresp->macreg;
3216 	cbp = macp->m_callbacks;
3217 	cbp->mc_ioctl(macp->m_driver, q, mp);
3218 
3219 	RW_EXIT(&vnetp->vsw_fp_rw);
3220 }
3221 
3222 #else
3223 
3224 static void
3225 vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
3226 {
3227 	vnet_t		*vnetp;
3228 
3229 	vnetp = (vnet_t *)arg;
3230 
3231 	if (vnetp == NULL) {
3232 		miocnak(q, mp, 0, EINVAL);
3233 		return;
3234 	}
3235 
3236 	/* ioctl support only for debugging */
3237 	miocnak(q, mp, 0, ENOTSUP);
3238 }
3239 
3240 #endif
3241