xref: /titanic_44/usr/src/uts/sun4v/io/vnet.c (revision efaadbbfd41d2a35674006a9d58a2812beba0ea8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/errno.h>
29 #include <sys/param.h>
30 #include <sys/callb.h>
31 #include <sys/stream.h>
32 #include <sys/kmem.h>
33 #include <sys/conf.h>
34 #include <sys/devops.h>
35 #include <sys/ksynch.h>
36 #include <sys/stat.h>
37 #include <sys/modctl.h>
38 #include <sys/modhash.h>
39 #include <sys/debug.h>
40 #include <sys/ethernet.h>
41 #include <sys/dlpi.h>
42 #include <net/if.h>
43 #include <sys/mac_provider.h>
44 #include <sys/mac_client.h>
45 #include <sys/mac_client_priv.h>
46 #include <sys/mac_ether.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/strsun.h>
50 #include <sys/note.h>
51 #include <sys/atomic.h>
52 #include <sys/vnet.h>
53 #include <sys/vlan.h>
54 #include <sys/vnet_mailbox.h>
55 #include <sys/vnet_common.h>
56 #include <sys/dds.h>
57 #include <sys/strsubr.h>
58 #include <sys/taskq.h>
59 
60 /*
61  * Function prototypes.
62  */
63 
64 /* DDI entrypoints */
65 static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
66 static int vnetattach(dev_info_t *, ddi_attach_cmd_t);
67 static int vnetdetach(dev_info_t *, ddi_detach_cmd_t);
68 
69 /* MAC entrypoints  */
70 static int vnet_m_stat(void *, uint_t, uint64_t *);
71 static int vnet_m_start(void *);
72 static void vnet_m_stop(void *);
73 static int vnet_m_promisc(void *, boolean_t);
74 static int vnet_m_multicst(void *, boolean_t, const uint8_t *);
75 static int vnet_m_unicst(void *, const uint8_t *);
76 mblk_t *vnet_m_tx(void *, mblk_t *);
77 static void vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
78 #ifdef	VNET_IOC_DEBUG
79 static void vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp);
80 #endif
81 static boolean_t vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data);
82 static void vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
83 	const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle);
84 static void vnet_get_group(void *arg, mac_ring_type_t type, const int index,
85 	mac_group_info_t *infop, mac_group_handle_t handle);
86 static int vnet_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
87 static void vnet_rx_ring_stop(mac_ring_driver_t rdriver);
88 static int vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
89 	uint64_t *val);
90 static int vnet_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num);
91 static void vnet_tx_ring_stop(mac_ring_driver_t rdriver);
92 static int vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat,
93 	uint64_t *val);
94 static int vnet_ring_enable_intr(void *arg);
95 static int vnet_ring_disable_intr(void *arg);
96 static mblk_t *vnet_rx_poll(void *arg, int bytes_to_pickup);
97 static int vnet_addmac(void *arg, const uint8_t *mac_addr);
98 static int vnet_remmac(void *arg, const uint8_t *mac_addr);
99 
100 /* vnet internal functions */
101 static int vnet_unattach(vnet_t *vnetp);
102 static void vnet_ring_grp_init(vnet_t *vnetp);
103 static void vnet_ring_grp_uninit(vnet_t *vnetp);
104 static int vnet_mac_register(vnet_t *);
105 static int vnet_read_mac_address(vnet_t *vnetp);
106 static int vnet_bind_vgenring(vnet_res_t *vresp);
107 static void vnet_unbind_vgenring(vnet_res_t *vresp);
108 static int vnet_bind_hwrings(vnet_t *vnetp);
109 static void vnet_unbind_hwrings(vnet_t *vnetp);
110 static int vnet_bind_rings(vnet_res_t *vresp);
111 static void vnet_unbind_rings(vnet_res_t *vresp);
112 static int vnet_hio_stat(void *, uint_t, uint64_t *);
113 static int vnet_hio_start(void *);
114 static void vnet_hio_stop(void *);
115 mblk_t *vnet_hio_tx(void *, mblk_t *);
116 
117 /* Forwarding database (FDB) routines */
118 static void vnet_fdb_create(vnet_t *vnetp);
119 static void vnet_fdb_destroy(vnet_t *vnetp);
120 static vnet_res_t *vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp);
121 static void vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val);
122 void vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp);
123 static void vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp);
124 
125 static void vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp);
126 static void vnet_rx(vio_net_handle_t vrh, mblk_t *mp);
127 static void vnet_tx_update(vio_net_handle_t vrh);
128 static void vnet_res_start_task(void *arg);
129 static void vnet_start_resources(vnet_t *vnetp);
130 static void vnet_stop_resources(vnet_t *vnetp);
131 static void vnet_dispatch_res_task(vnet_t *vnetp);
132 static void vnet_res_start_task(void *arg);
133 static void vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err);
134 static void vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp);
135 static vnet_res_t *vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp);
136 static void vnet_tx_notify_thread(void *);
137 
138 /* Exported to vnet_gen */
139 int vnet_mtu_update(vnet_t *vnetp, uint32_t mtu);
140 void vnet_link_update(vnet_t *vnetp, link_state_t link_state);
141 void vnet_dds_cleanup_hio(vnet_t *vnetp);
142 
143 static kstat_t *vnet_hio_setup_kstats(char *ks_mod, char *ks_name,
144     vnet_res_t *vresp);
145 static int vnet_hio_update_kstats(kstat_t *ksp, int rw);
146 static void vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp);
147 static void vnet_hio_destroy_kstats(kstat_t *ksp);
148 
/* Exported to vnet_dds */
150 int vnet_send_dds_msg(vnet_t *vnetp, void *dmsg);
151 int vnet_hio_mac_init(vnet_t *vnetp, char *ifname);
152 void vnet_hio_mac_cleanup(vnet_t *vnetp);
153 
154 /* Externs that are imported from vnet_gen */
155 extern int vgen_init(void *vnetp, uint64_t regprop, dev_info_t *vnetdip,
156     const uint8_t *macaddr, void **vgenhdl);
157 extern int vgen_init_mdeg(void *arg);
158 extern void vgen_uninit(void *arg);
159 extern int vgen_dds_tx(void *arg, void *dmsg);
160 extern void vgen_mod_init(void);
161 extern int vgen_mod_cleanup(void);
162 extern void vgen_mod_fini(void);
163 extern int vgen_enable_intr(void *arg);
164 extern int vgen_disable_intr(void *arg);
165 extern mblk_t *vgen_poll(void *arg, int bytes_to_pickup);
166 
167 /* Externs that are imported from vnet_dds */
168 extern void vdds_mod_init(void);
169 extern void vdds_mod_fini(void);
170 extern int vdds_init(vnet_t *vnetp);
171 extern void vdds_cleanup(vnet_t *vnetp);
172 extern void vdds_process_dds_msg(vnet_t *vnetp, vio_dds_msg_t *dmsg);
173 extern void vdds_cleanup_hybrid_res(void *arg);
174 extern void vdds_cleanup_hio(vnet_t *vnetp);
175 
176 extern pri_t	minclsyspri;
177 
178 #define	DRV_NAME	"vnet"
/*
 * Take a reference on a forwarding database entry (any object with a 32-bit
 * 'refcnt' member). The count is bumped atomically; the ASSERT catches a
 * wrap-around to zero on DEBUG kernels.
 *
 * Both macros expand to a single statement (do/while(0)) so that they can be
 * used safely as the body of an unbraced if/else. Note that 'p' is evaluated
 * more than once; do not pass an expression with side effects.
 */
#define	VNET_FDBE_REFHOLD(p)						\
do {									\
	atomic_inc_32(&(p)->refcnt);					\
	ASSERT((p)->refcnt != 0);					\
_NOTE(CONSTCOND)							\
} while (0)

/*
 * Drop a reference previously taken with VNET_FDBE_REFHOLD(). The ASSERT
 * catches a release without a matching hold on DEBUG kernels.
 */
#define	VNET_FDBE_REFRELE(p)						\
do {									\
	ASSERT((p)->refcnt != 0);					\
	atomic_dec_32(&(p)->refcnt);					\
_NOTE(CONSTCOND)							\
} while (0)
190 
191 #ifdef	VNET_IOC_DEBUG
192 #define	VNET_M_CALLBACK_FLAGS	(MC_IOCTL | MC_GETCAPAB)
193 #else
194 #define	VNET_M_CALLBACK_FLAGS	(MC_GETCAPAB)
195 #endif
196 
/*
 * MAC entry points of the vnet device itself. The initializer is positional;
 * the first member is the flags word (VNET_M_CALLBACK_FLAGS: MC_GETCAPAB,
 * plus MC_IOCTL when VNET_IOC_DEBUG is defined), followed by the callbacks.
 * NOTE(review): presumably handed to the mac layer by vnet_mac_register();
 * that function is not fully visible here - confirm.
 */
static mac_callbacks_t vnet_m_callbacks = {
	VNET_M_CALLBACK_FLAGS,
	vnet_m_stat,		/* get statistics */
	vnet_m_start,		/* start the device */
	vnet_m_stop,		/* stop the device */
	vnet_m_promisc,		/* promiscuous mode (no-op, see function) */
	vnet_m_multicst,	/* add/remove a multicast address */
	NULL,	/* m_unicst entry must be NULL while rx rings are exposed */
	NULL,	/* m_tx entry must be NULL while tx rings are exposed */
	NULL,			/* unused slot */
	vnet_m_ioctl,		/* ioctl handler */
	vnet_m_capab,		/* capability query */
	NULL			/* remaining slot unused */
};
211 
/*
 * Callbacks for the Hybrid I/O resource. Only the stat, start, stop and tx
 * entries are provided; the flags word is 0 and every other slot is NULL.
 * The slot positions mirror those of vnet_m_callbacks.
 */
static mac_callbacks_t vnet_hio_res_callbacks = {
	0,			/* no optional callbacks advertised */
	vnet_hio_stat,		/* get statistics */
	vnet_hio_start,		/* start the resource */
	vnet_hio_stop,		/* stop the resource */
	NULL,
	NULL,
	NULL,
	vnet_hio_tx,		/* transmit */
	NULL,
	NULL,
	NULL
};
225 
226 /*
227  * Linked list of "vnet_t" structures - one per instance.
228  */
229 static vnet_t	*vnet_headp = NULL;
230 static krwlock_t vnet_rw;
231 
232 /* Tunables */
233 uint32_t vnet_ntxds = VNET_NTXDS;	/* power of 2 transmit descriptors */
234 uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */
235 uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT;  /* tx timeout in msec */
236 uint32_t vnet_ldc_mtu = VNET_LDC_MTU;		/* ldc mtu */
237 
238 /* Configure tx serialization in mac layer for the vnet device */
239 boolean_t vnet_mac_tx_serialize = B_TRUE;
/* Configure enqueuing at Rx soft rings in mac layer for the vnet device */
241 boolean_t vnet_mac_rx_queuing = B_TRUE;
242 
243 /*
244  * Set this to non-zero to enable additional internal receive buffer pools
245  * based on the MTU of the device for better performance at the cost of more
246  * memory consumption. This is turned off by default, to use allocb(9F) for
247  * receive buffer allocations of sizes > 2K.
248  */
249 boolean_t vnet_jumbo_rxpools = B_FALSE;
250 
251 /* # of chains in fdb hash table */
252 uint32_t	vnet_fdb_nchains = VNET_NFDB_HASH;
253 
254 /* Internal tunables */
255 uint32_t	vnet_ethermtu = 1500;	/* mtu of the device */
256 
257 /*
258  * Default vlan id. This is only used internally when the "default-vlan-id"
259  * property is not present in the MD device node. Therefore, this should not be
260  * used as a tunable; if this value is changed, the corresponding variable
261  * should be updated to the same value in vsw and also other vnets connected to
262  * the same vsw.
263  */
264 uint16_t	vnet_default_vlan_id = 1;
265 
266 /* delay in usec to wait for all references on a fdb entry to be dropped */
267 uint32_t vnet_fdbe_refcnt_delay = 10;
268 
269 static struct ether_addr etherbroadcastaddr = {
270 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
271 };
272 
273 /* mac_open() retry delay in usec */
274 uint32_t vnet_mac_open_delay = 100;	/* 0.1 ms */
275 
276 /* max # of mac_open() retries */
277 uint32_t vnet_mac_open_retries = 100;
278 
279 /*
280  * Property names
281  */
282 static char macaddr_propname[] = "local-mac-address";
283 
284 /*
285  * This is the string displayed by modinfo(1m).
286  */
287 static char vnet_ident[] = "vnet driver";
288 extern struct mod_ops mod_driverops;
/*
 * Character/block entry points. vnet provides none of its own: open and
 * close are nulldev, and every data path entry is nodev.
 */
static struct cb_ops cb_vnetops = {
	nulldev,		/* cb_open */
	nulldev,		/* cb_close */
	nodev,			/* cb_strategy */
	nodev,			/* cb_print */
	nodev,			/* cb_dump */
	nodev,			/* cb_read */
	nodev,			/* cb_write */
	nodev,			/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* cb_stream */
	(int)(D_MP)		/* cb_flag */
};

/*
 * Device operations. Only attach and detach are implemented by this driver;
 * quiesce is explicitly not supported. This structure is passed to
 * mac_init_ops() in _init() and mac_fini_ops() in _fini().
 */
static struct dev_ops vnetops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	NULL,			/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	vnetattach,		/* devo_attach */
	vnetdetach,		/* devo_detach */
	nodev,			/* devo_reset */
	&cb_vnetops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops */
	NULL,			/* devo_power */
	ddi_quiesce_not_supported,	/* devo_quiesce */
};

/* Driver linkage: ties the module type, ident string and dev_ops together. */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	vnet_ident,		/* ID string */
	&vnetops		/* driver specific ops */
};

/* Module linkage used by mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
331 
332 #ifdef DEBUG
333 
334 /*
335  * Print debug messages - set to 0xf to enable all msgs
336  */
337 int vnet_dbglevel = 0x8;
338 
339 static void
340 debug_printf(const char *fname, void *arg, const char *fmt, ...)
341 {
342 	char    buf[512];
343 	va_list ap;
344 	vnet_t *vnetp = (vnet_t *)arg;
345 	char    *bufp = buf;
346 
347 	if (vnetp == NULL) {
348 		(void) sprintf(bufp, "%s: ", fname);
349 		bufp += strlen(bufp);
350 	} else {
351 		(void) sprintf(bufp, "vnet%d:%s: ", vnetp->instance, fname);
352 		bufp += strlen(bufp);
353 	}
354 	va_start(ap, fmt);
355 	(void) vsprintf(bufp, fmt, ap);
356 	va_end(ap);
357 	cmn_err(CE_CONT, "%s\n", buf);
358 }
359 
360 #endif
361 
362 /* _init(9E): initialize the loadable module */
363 int
364 _init(void)
365 {
366 	int status;
367 
368 	DBG1(NULL, "enter\n");
369 
370 	mac_init_ops(&vnetops, "vnet");
371 	status = mod_install(&modlinkage);
372 	if (status != 0) {
373 		mac_fini_ops(&vnetops);
374 	}
375 	vdds_mod_init();
376 	vgen_mod_init();
377 	DBG1(NULL, "exit(%d)\n", status);
378 	return (status);
379 }
380 
381 /* _fini(9E): prepare the module for unloading. */
382 int
383 _fini(void)
384 {
385 	int		status;
386 
387 	DBG1(NULL, "enter\n");
388 
389 	status = vgen_mod_cleanup();
390 	if (status != 0)
391 		return (status);
392 
393 	status = mod_remove(&modlinkage);
394 	if (status != 0)
395 		return (status);
396 	mac_fini_ops(&vnetops);
397 	vgen_mod_fini();
398 	vdds_mod_fini();
399 
400 	DBG1(NULL, "exit(%d)\n", status);
401 	return (status);
402 }
403 
404 /* _info(9E): return information about the loadable module */
405 int
406 _info(struct modinfo *modinfop)
407 {
408 	return (mod_info(&modlinkage, modinfop));
409 }
410 
411 /*
412  * attach(9E): attach a device to the system.
413  * called once for each instance of the device on the system.
414  */
415 static int
416 vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
417 {
418 	vnet_t			*vnetp;
419 	int			status;
420 	int			instance;
421 	uint64_t		reg;
422 	char			qname[TASKQ_NAMELEN];
423 	vnet_attach_progress_t	attach_progress;
424 
425 	attach_progress = AST_init;
426 
427 	switch (cmd) {
428 	case DDI_ATTACH:
429 		break;
430 	case DDI_RESUME:
431 	case DDI_PM_RESUME:
432 	default:
433 		goto vnet_attach_fail;
434 	}
435 
436 	instance = ddi_get_instance(dip);
437 	DBG1(NULL, "instance(%d) enter\n", instance);
438 
439 	/* allocate vnet_t and mac_t structures */
440 	vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP);
441 	vnetp->dip = dip;
442 	vnetp->instance = instance;
443 	rw_init(&vnetp->vrwlock, NULL, RW_DRIVER, NULL);
444 	rw_init(&vnetp->vsw_fp_rw, NULL, RW_DRIVER, NULL);
445 	attach_progress |= AST_vnet_alloc;
446 
447 	vnet_ring_grp_init(vnetp);
448 	attach_progress |= AST_ring_init;
449 
450 	status = vdds_init(vnetp);
451 	if (status != 0) {
452 		goto vnet_attach_fail;
453 	}
454 	attach_progress |= AST_vdds_init;
455 
456 	/* setup links to vnet_t from both devinfo and mac_t */
457 	ddi_set_driver_private(dip, (caddr_t)vnetp);
458 
459 	/* read the mac address */
460 	status = vnet_read_mac_address(vnetp);
461 	if (status != DDI_SUCCESS) {
462 		goto vnet_attach_fail;
463 	}
464 	attach_progress |= AST_read_macaddr;
465 
466 	reg = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
467 	    DDI_PROP_DONTPASS, "reg", -1);
468 	if (reg == -1) {
469 		goto vnet_attach_fail;
470 	}
471 	vnetp->reg = reg;
472 
473 	vnet_fdb_create(vnetp);
474 	attach_progress |= AST_fdbh_alloc;
475 
476 	(void) snprintf(qname, TASKQ_NAMELEN, "vnet_taskq%d", instance);
477 	if ((vnetp->taskqp = ddi_taskq_create(dip, qname, 1,
478 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
479 		cmn_err(CE_WARN, "!vnet%d: Unable to create task queue",
480 		    instance);
481 		goto vnet_attach_fail;
482 	}
483 	attach_progress |= AST_taskq_create;
484 
485 	/* add to the list of vnet devices */
486 	WRITE_ENTER(&vnet_rw);
487 	vnetp->nextp = vnet_headp;
488 	vnet_headp = vnetp;
489 	RW_EXIT(&vnet_rw);
490 
491 	attach_progress |= AST_vnet_list;
492 
493 	/*
494 	 * Initialize the generic vnet plugin which provides communication via
495 	 * sun4v LDC (logical domain channel) based resources. This involves 2
496 	 * steps; first, vgen_init() is invoked to read the various properties
497 	 * of the vnet device from its MD node (including its mtu which is
498 	 * needed to mac_register()) and obtain a handle to the vgen layer.
499 	 * After mac_register() is done and we have a mac handle, we then
500 	 * invoke vgen_init_mdeg() which registers with the the MD event
501 	 * generator (mdeg) framework to allow LDC resource notifications.
502 	 * Note: this sequence also allows us to report the correct default #
503 	 * of pseudo rings (2TX and 3RX) in vnet_m_capab() which gets invoked
504 	 * in the context of mac_register(); and avoids conflicting with
505 	 * dynamic pseudo rx rings which get added/removed as a result of mdeg
506 	 * events in vgen.
507 	 */
508 	status = vgen_init(vnetp, reg, vnetp->dip,
509 	    (uint8_t *)vnetp->curr_macaddr, &vnetp->vgenhdl);
510 	if (status != DDI_SUCCESS) {
511 		DERR(vnetp, "vgen_init() failed\n");
512 		goto vnet_attach_fail;
513 	}
514 	attach_progress |= AST_vgen_init;
515 
516 	status = vnet_mac_register(vnetp);
517 	if (status != DDI_SUCCESS) {
518 		goto vnet_attach_fail;
519 	}
520 	vnetp->link_state = LINK_STATE_UNKNOWN;
521 	attach_progress |= AST_macreg;
522 
523 	status = vgen_init_mdeg(vnetp->vgenhdl);
524 	if (status != DDI_SUCCESS) {
525 		goto vnet_attach_fail;
526 	}
527 	attach_progress |= AST_init_mdeg;
528 
529 	vnetp->attach_progress = attach_progress;
530 
531 	DBG1(NULL, "instance(%d) exit\n", instance);
532 	return (DDI_SUCCESS);
533 
534 vnet_attach_fail:
535 	vnetp->attach_progress = attach_progress;
536 	status = vnet_unattach(vnetp);
537 	ASSERT(status == 0);
538 	return (DDI_FAILURE);
539 }
540 
541 /*
542  * detach(9E): detach a device from the system.
543  */
544 static int
545 vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
546 {
547 	vnet_t		*vnetp;
548 	int		instance;
549 
550 	instance = ddi_get_instance(dip);
551 	DBG1(NULL, "instance(%d) enter\n", instance);
552 
553 	vnetp = ddi_get_driver_private(dip);
554 	if (vnetp == NULL) {
555 		goto vnet_detach_fail;
556 	}
557 
558 	switch (cmd) {
559 	case DDI_DETACH:
560 		break;
561 	case DDI_SUSPEND:
562 	case DDI_PM_SUSPEND:
563 	default:
564 		goto vnet_detach_fail;
565 	}
566 
567 	if (vnet_unattach(vnetp) != 0) {
568 		goto vnet_detach_fail;
569 	}
570 
571 	return (DDI_SUCCESS);
572 
573 vnet_detach_fail:
574 	return (DDI_FAILURE);
575 }
576 
577 /*
578  * Common routine to handle vnetattach() failure and vnetdetach(). Note that
579  * the only reason this function could fail is if mac_unregister() fails.
580  * Otherwise, this function must ensure that all resources are freed and return
581  * success.
582  */
583 static int
584 vnet_unattach(vnet_t *vnetp)
585 {
586 	vnet_attach_progress_t	attach_progress;
587 
588 	attach_progress = vnetp->attach_progress;
589 
590 	/*
591 	 * Disable the mac device in the gldv3 subsystem. This can fail, in
592 	 * particular if there are still any open references to this mac
593 	 * device; in which case we just return failure without continuing to
594 	 * detach further.
595 	 * If it succeeds, we then invoke vgen_uninit() which should unregister
596 	 * any pseudo rings registered with the mac layer. Note we keep the
597 	 * AST_macreg flag on, so we can unregister with the mac layer at
598 	 * the end of this routine.
599 	 */
600 	if (attach_progress & AST_macreg) {
601 		if (mac_disable(vnetp->mh) != 0) {
602 			return (1);
603 		}
604 	}
605 
606 	/*
607 	 * Now that we have disabled the device, we must finish all other steps
608 	 * and successfully return from this function; otherwise we will end up
609 	 * leaving the device in a broken/unusable state.
610 	 *
611 	 * First, release any hybrid resources assigned to this vnet device.
612 	 */
613 	if (attach_progress & AST_vdds_init) {
614 		vdds_cleanup(vnetp);
615 		attach_progress &= ~AST_vdds_init;
616 	}
617 
618 	/*
619 	 * Uninit vgen. This stops further mdeg callbacks to this vnet
620 	 * device and/or its ports; and detaches any existing ports.
621 	 */
622 	if (attach_progress & (AST_vgen_init|AST_init_mdeg)) {
623 		vgen_uninit(vnetp->vgenhdl);
624 		attach_progress &= ~AST_vgen_init;
625 		attach_progress &= ~AST_init_mdeg;
626 	}
627 
628 	/* Destroy the taskq. */
629 	if (attach_progress & AST_taskq_create) {
630 		ddi_taskq_destroy(vnetp->taskqp);
631 		attach_progress &= ~AST_taskq_create;
632 	}
633 
634 	/* Destroy fdb. */
635 	if (attach_progress & AST_fdbh_alloc) {
636 		vnet_fdb_destroy(vnetp);
637 		attach_progress &= ~AST_fdbh_alloc;
638 	}
639 
640 	/* Remove from the device list */
641 	if (attach_progress & AST_vnet_list) {
642 		vnet_t		**vnetpp;
643 		/* unlink from instance(vnet_t) list */
644 		WRITE_ENTER(&vnet_rw);
645 		for (vnetpp = &vnet_headp; *vnetpp;
646 		    vnetpp = &(*vnetpp)->nextp) {
647 			if (*vnetpp == vnetp) {
648 				*vnetpp = vnetp->nextp;
649 				break;
650 			}
651 		}
652 		RW_EXIT(&vnet_rw);
653 		attach_progress &= ~AST_vnet_list;
654 	}
655 
656 	if (attach_progress & AST_ring_init) {
657 		vnet_ring_grp_uninit(vnetp);
658 		attach_progress &= ~AST_ring_init;
659 	}
660 
661 	if (attach_progress & AST_macreg) {
662 		VERIFY(mac_unregister(vnetp->mh) == 0);
663 		vnetp->mh = NULL;
664 		attach_progress &= ~AST_macreg;
665 	}
666 
667 	if (attach_progress & AST_vnet_alloc) {
668 		rw_destroy(&vnetp->vrwlock);
669 		rw_destroy(&vnetp->vsw_fp_rw);
670 		attach_progress &= ~AST_vnet_list;
671 		KMEM_FREE(vnetp);
672 	}
673 
674 	return (0);
675 }
676 
677 /* enable the device for transmit/receive */
678 static int
679 vnet_m_start(void *arg)
680 {
681 	vnet_t		*vnetp = arg;
682 
683 	DBG1(vnetp, "enter\n");
684 
685 	WRITE_ENTER(&vnetp->vrwlock);
686 	vnetp->flags |= VNET_STARTED;
687 	vnet_start_resources(vnetp);
688 	RW_EXIT(&vnetp->vrwlock);
689 
690 	DBG1(vnetp, "exit\n");
691 	return (VNET_SUCCESS);
692 
693 }
694 
695 /* stop transmit/receive for the device */
696 static void
697 vnet_m_stop(void *arg)
698 {
699 	vnet_t		*vnetp = arg;
700 
701 	DBG1(vnetp, "enter\n");
702 
703 	WRITE_ENTER(&vnetp->vrwlock);
704 	if (vnetp->flags & VNET_STARTED) {
705 		/*
706 		 * Set the flags appropriately; this should prevent starting of
707 		 * any new resources that are added(see vnet_res_start_task()),
708 		 * while we release the vrwlock in vnet_stop_resources() before
709 		 * stopping each resource.
710 		 */
711 		vnetp->flags &= ~VNET_STARTED;
712 		vnetp->flags |= VNET_STOPPING;
713 		vnet_stop_resources(vnetp);
714 		vnetp->flags &= ~VNET_STOPPING;
715 	}
716 	RW_EXIT(&vnetp->vrwlock);
717 
718 	DBG1(vnetp, "exit\n");
719 }
720 
721 /* set the unicast mac address of the device */
722 static int
723 vnet_m_unicst(void *arg, const uint8_t *macaddr)
724 {
725 	_NOTE(ARGUNUSED(macaddr))
726 
727 	vnet_t *vnetp = arg;
728 
729 	DBG1(vnetp, "enter\n");
730 	/*
731 	 * NOTE: setting mac address dynamically is not supported.
732 	 */
733 	DBG1(vnetp, "exit\n");
734 
735 	return (VNET_FAILURE);
736 }
737 
738 /* enable/disable a multicast address */
739 static int
740 vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
741 {
742 	_NOTE(ARGUNUSED(add, mca))
743 
744 	vnet_t		*vnetp = arg;
745 	vnet_res_t	*vresp;
746 	mac_register_t	*macp;
747 	mac_callbacks_t	*cbp;
748 	int		rv = VNET_SUCCESS;
749 
750 	DBG1(vnetp, "enter\n");
751 
752 	READ_ENTER(&vnetp->vsw_fp_rw);
753 	if (vnetp->vsw_fp == NULL) {
754 		RW_EXIT(&vnetp->vsw_fp_rw);
755 		return (EAGAIN);
756 	}
757 	VNET_FDBE_REFHOLD(vnetp->vsw_fp);
758 	RW_EXIT(&vnetp->vsw_fp_rw);
759 
760 	vresp = vnetp->vsw_fp;
761 	macp = &vresp->macreg;
762 	cbp = macp->m_callbacks;
763 	rv = cbp->mc_multicst(macp->m_driver, add, mca);
764 
765 	VNET_FDBE_REFRELE(vnetp->vsw_fp);
766 
767 	DBG1(vnetp, "exit(%d)\n", rv);
768 	return (rv);
769 }
770 
771 /* set or clear promiscuous mode on the device */
772 static int
773 vnet_m_promisc(void *arg, boolean_t on)
774 {
775 	_NOTE(ARGUNUSED(on))
776 
777 	vnet_t *vnetp = arg;
778 	DBG1(vnetp, "enter\n");
779 	/*
780 	 * NOTE: setting promiscuous mode is not supported, just return success.
781 	 */
782 	DBG1(vnetp, "exit\n");
783 	return (VNET_SUCCESS);
784 }
785 
786 /*
787  * Transmit a chain of packets. This function provides switching functionality
788  * based on the destination mac address to reach other guests (within ldoms) or
789  * external hosts.
790  */
791 mblk_t *
792 vnet_tx_ring_send(void *arg, mblk_t *mp)
793 {
794 	vnet_pseudo_tx_ring_t	*tx_ringp;
795 	vnet_tx_ring_stats_t	*statsp;
796 	vnet_t			*vnetp;
797 	vnet_res_t		*vresp;
798 	mblk_t			*next;
799 	mblk_t			*resid_mp;
800 	mac_register_t		*macp;
801 	struct ether_header	*ehp;
802 	boolean_t		is_unicast;
803 	boolean_t		is_pvid;	/* non-default pvid ? */
804 	boolean_t		hres;		/* Hybrid resource ? */
805 	void			*tx_arg;
806 	size_t			size;
807 
808 	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
809 	statsp = &tx_ringp->tx_ring_stats;
810 	vnetp = (vnet_t *)tx_ringp->vnetp;
811 	DBG1(vnetp, "enter\n");
812 	ASSERT(mp != NULL);
813 
814 	is_pvid = (vnetp->pvid != vnetp->default_vlan_id) ? B_TRUE : B_FALSE;
815 
816 	while (mp != NULL) {
817 
818 		next = mp->b_next;
819 		mp->b_next = NULL;
820 
821 		/* update stats */
822 		size = msgsize(mp);
823 
824 		/*
825 		 * Find fdb entry for the destination
826 		 * and hold a reference to it.
827 		 */
828 		ehp = (struct ether_header *)mp->b_rptr;
829 		vresp = vnet_fdbe_find(vnetp, &ehp->ether_dhost);
830 		if (vresp != NULL) {
831 
832 			/*
833 			 * Destination found in FDB.
834 			 * The destination is a vnet device within ldoms
835 			 * and directly reachable, invoke the tx function
836 			 * in the fdb entry.
837 			 */
838 			macp = &vresp->macreg;
839 			resid_mp = macp->m_callbacks->mc_tx(macp->m_driver, mp);
840 
841 			/* tx done; now release ref on fdb entry */
842 			VNET_FDBE_REFRELE(vresp);
843 
844 			if (resid_mp != NULL) {
845 				/* m_tx failed */
846 				mp->b_next = next;
847 				break;
848 			}
849 		} else {
850 			is_unicast = !(IS_BROADCAST(ehp) ||
851 			    (IS_MULTICAST(ehp)));
852 			/*
853 			 * Destination is not in FDB.
854 			 * If the destination is broadcast or multicast,
855 			 * then forward the packet to vswitch.
856 			 * If a Hybrid resource avilable, then send the
857 			 * unicast packet via hybrid resource, otherwise
858 			 * forward it to vswitch.
859 			 */
860 			READ_ENTER(&vnetp->vsw_fp_rw);
861 
862 			if ((is_unicast) && (vnetp->hio_fp != NULL)) {
863 				vresp = vnetp->hio_fp;
864 				hres = B_TRUE;
865 			} else {
866 				vresp = vnetp->vsw_fp;
867 				hres = B_FALSE;
868 			}
869 			if (vresp == NULL) {
870 				/*
871 				 * no fdb entry to vsw? drop the packet.
872 				 */
873 				RW_EXIT(&vnetp->vsw_fp_rw);
874 				freemsg(mp);
875 				mp = next;
876 				continue;
877 			}
878 
879 			/* ref hold the fdb entry to vsw */
880 			VNET_FDBE_REFHOLD(vresp);
881 
882 			RW_EXIT(&vnetp->vsw_fp_rw);
883 
884 			/*
885 			 * In the case of a hybrid resource we need to insert
886 			 * the tag for the pvid case here; unlike packets that
887 			 * are destined to a vnet/vsw in which case the vgen
888 			 * layer does the tagging before sending it over ldc.
889 			 */
890 			if (hres == B_TRUE) {
891 				/*
892 				 * Determine if the frame being transmitted
893 				 * over the hybrid resource is untagged. If so,
894 				 * insert the tag before transmitting.
895 				 */
896 				if (is_pvid == B_TRUE &&
897 				    ehp->ether_type != htons(ETHERTYPE_VLAN)) {
898 
899 					mp = vnet_vlan_insert_tag(mp,
900 					    vnetp->pvid);
901 					if (mp == NULL) {
902 						VNET_FDBE_REFRELE(vresp);
903 						mp = next;
904 						continue;
905 					}
906 
907 				}
908 
909 				macp = &vresp->macreg;
910 				tx_arg = tx_ringp;
911 			} else {
912 				macp = &vresp->macreg;
913 				tx_arg = macp->m_driver;
914 			}
915 			resid_mp = macp->m_callbacks->mc_tx(tx_arg, mp);
916 
917 			/* tx done; now release ref on fdb entry */
918 			VNET_FDBE_REFRELE(vresp);
919 
920 			if (resid_mp != NULL) {
921 				/* m_tx failed */
922 				mp->b_next = next;
923 				break;
924 			}
925 		}
926 
927 		statsp->obytes += size;
928 		statsp->opackets++;
929 		mp = next;
930 	}
931 
932 	DBG1(vnetp, "exit\n");
933 	return (mp);
934 }
935 
936 /* get statistics from the device */
937 int
938 vnet_m_stat(void *arg, uint_t stat, uint64_t *val)
939 {
940 	vnet_t *vnetp = arg;
941 	vnet_res_t	*vresp;
942 	mac_register_t	*macp;
943 	mac_callbacks_t	*cbp;
944 	uint64_t val_total = 0;
945 
946 	DBG1(vnetp, "enter\n");
947 
948 	/*
949 	 * get the specified statistic from each transport and return the
950 	 * aggregate val.  This obviously only works for counters.
951 	 */
952 	if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
953 	    (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
954 		return (ENOTSUP);
955 	}
956 
957 	READ_ENTER(&vnetp->vrwlock);
958 	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
959 		macp = &vresp->macreg;
960 		cbp = macp->m_callbacks;
961 		if (cbp->mc_getstat(macp->m_driver, stat, val) == 0)
962 			val_total += *val;
963 	}
964 	RW_EXIT(&vnetp->vrwlock);
965 
966 	*val = val_total;
967 
968 	DBG1(vnetp, "exit\n");
969 	return (0);
970 }
971 
972 static void
973 vnet_ring_grp_init(vnet_t *vnetp)
974 {
975 	vnet_pseudo_rx_group_t	*rx_grp;
976 	vnet_pseudo_rx_ring_t	*rx_ringp;
977 	vnet_pseudo_tx_group_t	*tx_grp;
978 	vnet_pseudo_tx_ring_t	*tx_ringp;
979 	int			i;
980 
981 	tx_grp = &vnetp->tx_grp[0];
982 	tx_ringp = kmem_zalloc(sizeof (vnet_pseudo_tx_ring_t) *
983 	    VNET_NUM_PSEUDO_TXRINGS, KM_SLEEP);
984 	for (i = 0; i < VNET_NUM_PSEUDO_TXRINGS; i++) {
985 		tx_ringp[i].state |= VNET_TXRING_SHARED;
986 	}
987 	tx_grp->rings = tx_ringp;
988 	tx_grp->ring_cnt = VNET_NUM_PSEUDO_TXRINGS;
989 	mutex_init(&tx_grp->flowctl_lock, NULL, MUTEX_DRIVER, NULL);
990 	cv_init(&tx_grp->flowctl_cv, NULL, CV_DRIVER, NULL);
991 	tx_grp->flowctl_thread = thread_create(NULL, 0,
992 	    vnet_tx_notify_thread, tx_grp, 0, &p0, TS_RUN, minclsyspri);
993 
994 	rx_grp = &vnetp->rx_grp[0];
995 	rx_grp->max_ring_cnt = MAX_RINGS_PER_GROUP;
996 	rw_init(&rx_grp->lock, NULL, RW_DRIVER, NULL);
997 	rx_ringp = kmem_zalloc(sizeof (vnet_pseudo_rx_ring_t) *
998 	    rx_grp->max_ring_cnt, KM_SLEEP);
999 
1000 	/*
1001 	 * Setup the first 3 Pseudo RX Rings that are reserved;
1002 	 * 1 for LDC resource to vswitch + 2 for RX rings of Hybrid resource.
1003 	 */
1004 	rx_ringp[0].state |= VNET_RXRING_INUSE|VNET_RXRING_LDC_SERVICE;
1005 	rx_ringp[0].index = 0;
1006 	rx_ringp[1].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
1007 	rx_ringp[1].index = 1;
1008 	rx_ringp[2].state |= VNET_RXRING_INUSE|VNET_RXRING_HYBRID;
1009 	rx_ringp[2].index = 2;
1010 
1011 	rx_grp->ring_cnt = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
1012 	rx_grp->rings = rx_ringp;
1013 
1014 	for (i = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
1015 	    i < rx_grp->max_ring_cnt; i++) {
1016 		rx_ringp = &rx_grp->rings[i];
1017 		rx_ringp->state = VNET_RXRING_FREE;
1018 		rx_ringp->index = i;
1019 	}
1020 }
1021 
1022 static void
1023 vnet_ring_grp_uninit(vnet_t *vnetp)
1024 {
1025 	vnet_pseudo_rx_group_t	*rx_grp;
1026 	vnet_pseudo_tx_group_t	*tx_grp;
1027 	kt_did_t		tid = 0;
1028 
1029 	tx_grp = &vnetp->tx_grp[0];
1030 
1031 	/* Inform tx_notify_thread to exit */
1032 	mutex_enter(&tx_grp->flowctl_lock);
1033 	if (tx_grp->flowctl_thread != NULL) {
1034 		tid = tx_grp->flowctl_thread->t_did;
1035 		tx_grp->flowctl_done = B_TRUE;
1036 		cv_signal(&tx_grp->flowctl_cv);
1037 	}
1038 	mutex_exit(&tx_grp->flowctl_lock);
1039 	if (tid != 0)
1040 		thread_join(tid);
1041 
1042 	if (tx_grp->rings != NULL) {
1043 		ASSERT(tx_grp->ring_cnt == VNET_NUM_PSEUDO_TXRINGS);
1044 		kmem_free(tx_grp->rings, sizeof (vnet_pseudo_tx_ring_t) *
1045 		    tx_grp->ring_cnt);
1046 		tx_grp->rings = NULL;
1047 	}
1048 
1049 	rx_grp = &vnetp->rx_grp[0];
1050 	if (rx_grp->rings != NULL) {
1051 		ASSERT(rx_grp->max_ring_cnt == MAX_RINGS_PER_GROUP);
1052 		ASSERT(rx_grp->ring_cnt == VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
1053 		kmem_free(rx_grp->rings, sizeof (vnet_pseudo_rx_ring_t) *
1054 		    rx_grp->max_ring_cnt);
1055 		rx_grp->rings = NULL;
1056 	}
1057 }
1058 
1059 static vnet_pseudo_rx_ring_t *
1060 vnet_alloc_pseudo_rx_ring(vnet_t *vnetp)
1061 {
1062 	vnet_pseudo_rx_group_t  *rx_grp;
1063 	vnet_pseudo_rx_ring_t	*rx_ringp;
1064 	int			index;
1065 
1066 	rx_grp = &vnetp->rx_grp[0];
1067 	WRITE_ENTER(&rx_grp->lock);
1068 
1069 	if (rx_grp->ring_cnt == rx_grp->max_ring_cnt) {
1070 		/* no rings available */
1071 		RW_EXIT(&rx_grp->lock);
1072 		return (NULL);
1073 	}
1074 
1075 	for (index = VNET_NUM_PSEUDO_RXRINGS_DEFAULT;
1076 	    index < rx_grp->max_ring_cnt; index++) {
1077 		rx_ringp = &rx_grp->rings[index];
1078 		if (rx_ringp->state == VNET_RXRING_FREE) {
1079 			rx_ringp->state |= VNET_RXRING_INUSE;
1080 			rx_grp->ring_cnt++;
1081 			break;
1082 		}
1083 	}
1084 
1085 	RW_EXIT(&rx_grp->lock);
1086 	return (rx_ringp);
1087 }
1088 
1089 static void
1090 vnet_free_pseudo_rx_ring(vnet_t *vnetp, vnet_pseudo_rx_ring_t *ringp)
1091 {
1092 	vnet_pseudo_rx_group_t  *rx_grp;
1093 
1094 	ASSERT(ringp->index >= VNET_NUM_PSEUDO_RXRINGS_DEFAULT);
1095 	rx_grp = &vnetp->rx_grp[0];
1096 	WRITE_ENTER(&rx_grp->lock);
1097 
1098 	if (ringp->state != VNET_RXRING_FREE) {
1099 		ringp->state = VNET_RXRING_FREE;
1100 		ringp->handle = NULL;
1101 		rx_grp->ring_cnt--;
1102 	}
1103 
1104 	RW_EXIT(&rx_grp->lock);
1105 }
1106 
1107 /* wrapper function for mac_register() */
1108 static int
1109 vnet_mac_register(vnet_t *vnetp)
1110 {
1111 	mac_register_t	*macp;
1112 	int		err;
1113 
1114 	if ((macp = mac_alloc(MAC_VERSION)) == NULL)
1115 		return (DDI_FAILURE);
1116 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1117 	macp->m_driver = vnetp;
1118 	macp->m_dip = vnetp->dip;
1119 	macp->m_src_addr = vnetp->curr_macaddr;
1120 	macp->m_callbacks = &vnet_m_callbacks;
1121 	macp->m_min_sdu = 0;
1122 	macp->m_max_sdu = vnetp->mtu;
1123 	macp->m_margin = VLAN_TAGSZ;
1124 
1125 	macp->m_v12n = MAC_VIRT_LEVEL1;
1126 
1127 	/*
1128 	 * Finally, we're ready to register ourselves with the MAC layer
1129 	 * interface; if this succeeds, we're all ready to start()
1130 	 */
1131 	err = mac_register(macp, &vnetp->mh);
1132 	mac_free(macp);
1133 	return (err == 0 ? DDI_SUCCESS : DDI_FAILURE);
1134 }
1135 
1136 /* read the mac address of the device */
1137 static int
1138 vnet_read_mac_address(vnet_t *vnetp)
1139 {
1140 	uchar_t 	*macaddr;
1141 	uint32_t 	size;
1142 	int 		rv;
1143 
1144 	rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip,
1145 	    DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size);
1146 	if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) {
1147 		DWARN(vnetp, "prop_lookup failed(%s) err(%d)\n",
1148 		    macaddr_propname, rv);
1149 		return (DDI_FAILURE);
1150 	}
1151 	bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL);
1152 	bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL);
1153 	ddi_prop_free(macaddr);
1154 
1155 	return (DDI_SUCCESS);
1156 }
1157 
1158 static void
1159 vnet_fdb_create(vnet_t *vnetp)
1160 {
1161 	char		hashname[MAXNAMELEN];
1162 
1163 	(void) snprintf(hashname, MAXNAMELEN, "vnet%d-fdbhash",
1164 	    vnetp->instance);
1165 	vnetp->fdb_nchains = vnet_fdb_nchains;
1166 	vnetp->fdb_hashp = mod_hash_create_ptrhash(hashname, vnetp->fdb_nchains,
1167 	    mod_hash_null_valdtor, sizeof (void *));
1168 }
1169 
1170 static void
1171 vnet_fdb_destroy(vnet_t *vnetp)
1172 {
1173 	/* destroy fdb-hash-table */
1174 	if (vnetp->fdb_hashp != NULL) {
1175 		mod_hash_destroy_hash(vnetp->fdb_hashp);
1176 		vnetp->fdb_hashp = NULL;
1177 		vnetp->fdb_nchains = 0;
1178 	}
1179 }
1180 
1181 /*
1182  * Add an entry into the fdb.
1183  */
1184 void
1185 vnet_fdbe_add(vnet_t *vnetp, vnet_res_t *vresp)
1186 {
1187 	uint64_t	addr = 0;
1188 	int		rv;
1189 
1190 	KEY_HASH(addr, vresp->rem_macaddr);
1191 
1192 	/*
1193 	 * If the entry being added corresponds to LDC_SERVICE resource,
1194 	 * that is, vswitch connection, it is added to the hash and also
1195 	 * the entry is cached, an additional reference count reflects
1196 	 * this. The HYBRID resource is not added to the hash, but only
1197 	 * cached, as it is only used for sending out packets for unknown
1198 	 * unicast destinations.
1199 	 */
1200 	(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
1201 	    (vresp->refcnt = 1) : (vresp->refcnt = 0);
1202 
1203 	/*
1204 	 * Note: duplicate keys will be rejected by mod_hash.
1205 	 */
1206 	if (vresp->type != VIO_NET_RES_HYBRID) {
1207 		rv = mod_hash_insert(vnetp->fdb_hashp, (mod_hash_key_t)addr,
1208 		    (mod_hash_val_t)vresp);
1209 		if (rv != 0) {
1210 			DWARN(vnetp, "Duplicate macaddr key(%lx)\n", addr);
1211 			return;
1212 		}
1213 	}
1214 
1215 	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
1216 		/* Cache the fdb entry to vsw-port */
1217 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1218 		if (vnetp->vsw_fp == NULL)
1219 			vnetp->vsw_fp = vresp;
1220 		RW_EXIT(&vnetp->vsw_fp_rw);
1221 	} else if (vresp->type == VIO_NET_RES_HYBRID) {
1222 		/* Cache the fdb entry to hybrid resource */
1223 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1224 		if (vnetp->hio_fp == NULL)
1225 			vnetp->hio_fp = vresp;
1226 		RW_EXIT(&vnetp->vsw_fp_rw);
1227 	}
1228 }
1229 
1230 /*
1231  * Remove an entry from fdb.
1232  */
1233 static void
1234 vnet_fdbe_del(vnet_t *vnetp, vnet_res_t *vresp)
1235 {
1236 	uint64_t	addr = 0;
1237 	int		rv;
1238 	uint32_t	refcnt;
1239 	vnet_res_t	*tmp;
1240 
1241 	KEY_HASH(addr, vresp->rem_macaddr);
1242 
1243 	/*
1244 	 * Remove the entry from fdb hash table.
1245 	 * This prevents further references to this fdb entry.
1246 	 */
1247 	if (vresp->type != VIO_NET_RES_HYBRID) {
1248 		rv = mod_hash_remove(vnetp->fdb_hashp, (mod_hash_key_t)addr,
1249 		    (mod_hash_val_t *)&tmp);
1250 		if (rv != 0) {
1251 			/*
1252 			 * As the resources are added to the hash only
1253 			 * after they are started, this can occur if
1254 			 * a resource unregisters before it is ever started.
1255 			 */
1256 			return;
1257 		}
1258 	}
1259 
1260 	if (vresp->type == VIO_NET_RES_LDC_SERVICE) {
1261 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1262 
1263 		ASSERT(tmp == vnetp->vsw_fp);
1264 		vnetp->vsw_fp = NULL;
1265 
1266 		RW_EXIT(&vnetp->vsw_fp_rw);
1267 	} else if (vresp->type == VIO_NET_RES_HYBRID) {
1268 		WRITE_ENTER(&vnetp->vsw_fp_rw);
1269 
1270 		vnetp->hio_fp = NULL;
1271 
1272 		RW_EXIT(&vnetp->vsw_fp_rw);
1273 	}
1274 
1275 	/*
1276 	 * If there are threads already ref holding before the entry was
1277 	 * removed from hash table, then wait for ref count to drop to zero.
1278 	 */
1279 	(vresp->type == VIO_NET_RES_LDC_SERVICE) ?
1280 	    (refcnt = 1) : (refcnt = 0);
1281 	while (vresp->refcnt > refcnt) {
1282 		delay(drv_usectohz(vnet_fdbe_refcnt_delay));
1283 	}
1284 }
1285 
1286 /*
1287  * Search fdb for a given mac address. If an entry is found, hold
1288  * a reference to it and return the entry; else returns NULL.
1289  */
1290 static vnet_res_t *
1291 vnet_fdbe_find(vnet_t *vnetp, struct ether_addr *addrp)
1292 {
1293 	uint64_t	key = 0;
1294 	vnet_res_t	*vresp;
1295 	int		rv;
1296 
1297 	KEY_HASH(key, addrp->ether_addr_octet);
1298 
1299 	rv = mod_hash_find_cb(vnetp->fdb_hashp, (mod_hash_key_t)key,
1300 	    (mod_hash_val_t *)&vresp, vnet_fdbe_find_cb);
1301 
1302 	if (rv != 0)
1303 		return (NULL);
1304 
1305 	return (vresp);
1306 }
1307 
1308 /*
1309  * Callback function provided to mod_hash_find_cb(). After finding the fdb
1310  * entry corresponding to the key (macaddr), this callback will be invoked by
1311  * mod_hash_find_cb() to atomically increment the reference count on the fdb
1312  * entry before returning the found entry.
1313  */
1314 static void
1315 vnet_fdbe_find_cb(mod_hash_key_t key, mod_hash_val_t val)
1316 {
1317 	_NOTE(ARGUNUSED(key))
1318 	VNET_FDBE_REFHOLD((vnet_res_t *)val);
1319 }
1320 
1321 /*
1322  * Frames received that are tagged with the pvid of the vnet device must be
1323  * untagged before sending up the stack. This function walks the chain of rx
1324  * frames, untags any such frames and returns the updated chain.
1325  *
1326  * Arguments:
1327  *    pvid:  pvid of the vnet device for which packets are being received
1328  *    mp:    head of pkt chain to be validated and untagged
1329  *
1330  * Returns:
1331  *    mp:    head of updated chain of packets
1332  */
1333 static void
1334 vnet_rx_frames_untag(uint16_t pvid, mblk_t **mp)
1335 {
1336 	struct ether_vlan_header	*evhp;
1337 	mblk_t				*bp;
1338 	mblk_t				*bpt;
1339 	mblk_t				*bph;
1340 	mblk_t				*bpn;
1341 
1342 	bpn = bph = bpt = NULL;
1343 
1344 	for (bp = *mp; bp != NULL; bp = bpn) {
1345 
1346 		bpn = bp->b_next;
1347 		bp->b_next = bp->b_prev = NULL;
1348 
1349 		evhp = (struct ether_vlan_header *)bp->b_rptr;
1350 
1351 		if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN &&
1352 		    VLAN_ID(ntohs(evhp->ether_tci)) == pvid) {
1353 
1354 			bp = vnet_vlan_remove_tag(bp);
1355 			if (bp == NULL) {
1356 				continue;
1357 			}
1358 
1359 		}
1360 
1361 		/* build a chain of processed packets */
1362 		if (bph == NULL) {
1363 			bph = bpt = bp;
1364 		} else {
1365 			bpt->b_next = bp;
1366 			bpt = bp;
1367 		}
1368 
1369 	}
1370 
1371 	*mp = bph;
1372 }
1373 
1374 static void
1375 vnet_rx(vio_net_handle_t vrh, mblk_t *mp)
1376 {
1377 	vnet_res_t		*vresp = (vnet_res_t *)vrh;
1378 	vnet_t			*vnetp = vresp->vnetp;
1379 	vnet_pseudo_rx_ring_t	*ringp;
1380 
1381 	if ((vnetp == NULL) || (vnetp->mh == 0)) {
1382 		freemsgchain(mp);
1383 		return;
1384 	}
1385 
1386 	ringp = vresp->rx_ringp;
1387 	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
1388 }
1389 
1390 void
1391 vnet_tx_update(vio_net_handle_t vrh)
1392 {
1393 	vnet_res_t		*vresp = (vnet_res_t *)vrh;
1394 	vnet_t			*vnetp = vresp->vnetp;
1395 	vnet_pseudo_tx_ring_t	*tx_ringp;
1396 	vnet_pseudo_tx_group_t	*tx_grp;
1397 	int			i;
1398 
1399 	if (vnetp == NULL || vnetp->mh == NULL) {
1400 		return;
1401 	}
1402 
1403 	/*
1404 	 * Currently, the tx hwring API (used to access rings that belong to
1405 	 * a Hybrid IO resource) does not provide us a per ring flow ctrl
1406 	 * update; also the pseudo rings are shared by the ports/ldcs in the
1407 	 * vgen layer. Thus we can't figure out which pseudo ring is being
1408 	 * re-enabled for transmits. To work around this, when we get a tx
1409 	 * restart notification from below, we simply propagate that to all
1410 	 * the tx pseudo rings registered with the mac layer above.
1411 	 *
1412 	 * There are a couple of side effects with this approach, but they are
1413 	 * not harmful, as outlined below:
1414 	 *
1415 	 * A) We might send an invalid ring_update() for a ring that is not
1416 	 * really flow controlled. This will not have any effect in the mac
1417 	 * layer and packets will continue to be transmitted on that ring.
1418 	 *
1419 	 * B) We might end up clearing the flow control in the mac layer for
1420 	 * a ring that is still flow controlled in the underlying resource.
1421 	 * This will result in the mac layer restarting	transmit, only to be
1422 	 * flow controlled again on that ring.
1423 	 */
1424 	tx_grp = &vnetp->tx_grp[0];
1425 	for (i = 0; i < tx_grp->ring_cnt; i++) {
1426 		tx_ringp = &tx_grp->rings[i];
1427 		mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
1428 	}
1429 }
1430 
1431 /*
1432  * vnet_tx_notify_thread:
1433  *
1434  * vnet_tx_ring_update() callback function wakes up this thread when
1435  * it gets called. This thread will call mac_tx_ring_update() to
1436  * notify upper mac of flow control getting relieved. Note that
1437  * vnet_tx_ring_update() cannot call mac_tx_ring_update() directly
1438  * because vnet_tx_ring_update() is called from lower mac with
1439  * mi_rw_lock held and mac_tx_ring_update() would also try to grab
1440  * the same lock.
1441  */
1442 static void
1443 vnet_tx_notify_thread(void *arg)
1444 {
1445 	callb_cpr_t		cprinfo;
1446 	vnet_pseudo_tx_group_t	*tx_grp = (vnet_pseudo_tx_group_t *)arg;
1447 	vnet_pseudo_tx_ring_t	*tx_ringp;
1448 	vnet_t			*vnetp;
1449 	int			i;
1450 
1451 	CALLB_CPR_INIT(&cprinfo, &tx_grp->flowctl_lock, callb_generic_cpr,
1452 	    "vnet_tx_notify_thread");
1453 
1454 	mutex_enter(&tx_grp->flowctl_lock);
1455 	while (!tx_grp->flowctl_done) {
1456 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1457 		cv_wait(&tx_grp->flowctl_cv, &tx_grp->flowctl_lock);
1458 		CALLB_CPR_SAFE_END(&cprinfo, &tx_grp->flowctl_lock);
1459 
1460 		for (i = 0; i < tx_grp->ring_cnt; i++) {
1461 			tx_ringp = &tx_grp->rings[i];
1462 			if (tx_ringp->woken_up) {
1463 				tx_ringp->woken_up = B_FALSE;
1464 				vnetp = tx_ringp->vnetp;
1465 				mac_tx_ring_update(vnetp->mh, tx_ringp->handle);
1466 			}
1467 		}
1468 	}
1469 	/*
1470 	 * The tx_grp is being destroyed, exit the thread.
1471 	 */
1472 	tx_grp->flowctl_thread = NULL;
1473 	CALLB_CPR_EXIT(&cprinfo);
1474 	thread_exit();
1475 }
1476 
1477 void
1478 vnet_tx_ring_update(void *arg1, uintptr_t arg2)
1479 {
1480 	vnet_t			*vnetp = (vnet_t *)arg1;
1481 	vnet_pseudo_tx_group_t	*tx_grp;
1482 	vnet_pseudo_tx_ring_t	*tx_ringp;
1483 	int			i;
1484 
1485 	tx_grp = &vnetp->tx_grp[0];
1486 	for (i = 0; i < tx_grp->ring_cnt; i++) {
1487 		tx_ringp = &tx_grp->rings[i];
1488 		if (tx_ringp->hw_rh == (mac_ring_handle_t)arg2) {
1489 			mutex_enter(&tx_grp->flowctl_lock);
1490 			tx_ringp->woken_up = B_TRUE;
1491 			cv_signal(&tx_grp->flowctl_cv);
1492 			mutex_exit(&tx_grp->flowctl_lock);
1493 			break;
1494 		}
1495 	}
1496 }
1497 
1498 /*
1499  * Update the new mtu of vnet into the mac layer. First check if the device has
1500  * been plumbed and if so fail the mtu update. Returns 0 on success.
1501  */
1502 int
1503 vnet_mtu_update(vnet_t *vnetp, uint32_t mtu)
1504 {
1505 	int	rv;
1506 
1507 	if (vnetp == NULL || vnetp->mh == NULL) {
1508 		return (EINVAL);
1509 	}
1510 
1511 	WRITE_ENTER(&vnetp->vrwlock);
1512 
1513 	if (vnetp->flags & VNET_STARTED) {
1514 		RW_EXIT(&vnetp->vrwlock);
1515 		cmn_err(CE_NOTE, "!vnet%d: Unable to process mtu "
1516 		    "update as the device is plumbed\n",
1517 		    vnetp->instance);
1518 		return (EBUSY);
1519 	}
1520 
1521 	/* update mtu in the mac layer */
1522 	rv = mac_maxsdu_update(vnetp->mh, mtu);
1523 	if (rv != 0) {
1524 		RW_EXIT(&vnetp->vrwlock);
1525 		cmn_err(CE_NOTE,
1526 		    "!vnet%d: Unable to update mtu with mac layer\n",
1527 		    vnetp->instance);
1528 		return (EIO);
1529 	}
1530 
1531 	vnetp->mtu = mtu;
1532 
1533 	RW_EXIT(&vnetp->vrwlock);
1534 
1535 	return (0);
1536 }
1537 
1538 /*
1539  * Update the link state of vnet to the mac layer.
1540  */
1541 void
1542 vnet_link_update(vnet_t *vnetp, link_state_t link_state)
1543 {
1544 	if (vnetp == NULL || vnetp->mh == NULL) {
1545 		return;
1546 	}
1547 
1548 	WRITE_ENTER(&vnetp->vrwlock);
1549 	if (vnetp->link_state == link_state) {
1550 		RW_EXIT(&vnetp->vrwlock);
1551 		return;
1552 	}
1553 	vnetp->link_state = link_state;
1554 	RW_EXIT(&vnetp->vrwlock);
1555 
1556 	mac_link_update(vnetp->mh, link_state);
1557 }
1558 
1559 /*
1560  * vio_net_resource_reg -- An interface called to register a resource
1561  *	with vnet.
1562  *	macp -- a GLDv3 mac_register that has all the details of
1563  *		a resource and its callbacks etc.
1564  *	type -- resource type.
1565  *	local_macaddr -- resource's MAC address. This is used to
1566  *			 associate a resource with a corresponding vnet.
1567  *	remote_macaddr -- remote side MAC address. This is ignored for
1568  *			  the Hybrid resources.
1569  *	vhp -- A handle returned to the caller.
1570  *	vcb -- A set of callbacks provided to the callers.
1571  */
1572 int vio_net_resource_reg(mac_register_t *macp, vio_net_res_type_t type,
1573     ether_addr_t local_macaddr, ether_addr_t rem_macaddr, vio_net_handle_t *vhp,
1574     vio_net_callbacks_t *vcb)
1575 {
1576 	vnet_t		*vnetp;
1577 	vnet_res_t	*vresp;
1578 
1579 	vresp = kmem_zalloc(sizeof (vnet_res_t), KM_SLEEP);
1580 	ether_copy(local_macaddr, vresp->local_macaddr);
1581 	ether_copy(rem_macaddr, vresp->rem_macaddr);
1582 	vresp->type = type;
1583 	bcopy(macp, &vresp->macreg, sizeof (mac_register_t));
1584 
1585 	DBG1(NULL, "Resource Registerig type=0%X\n", type);
1586 
1587 	READ_ENTER(&vnet_rw);
1588 	vnetp = vnet_headp;
1589 	while (vnetp != NULL) {
1590 		if (VNET_MATCH_RES(vresp, vnetp)) {
1591 			vresp->vnetp = vnetp;
1592 
1593 			/* Setup kstats for hio resource */
1594 			if (vresp->type == VIO_NET_RES_HYBRID) {
1595 				vresp->ksp = vnet_hio_setup_kstats(DRV_NAME,
1596 				    "hio", vresp);
1597 				if (vresp->ksp == NULL) {
1598 					cmn_err(CE_NOTE, "!vnet%d: Cannot "
1599 					    "create kstats for hio resource",
1600 					    vnetp->instance);
1601 				}
1602 			}
1603 			vnet_add_resource(vnetp, vresp);
1604 			break;
1605 		}
1606 		vnetp = vnetp->nextp;
1607 	}
1608 	RW_EXIT(&vnet_rw);
1609 	if (vresp->vnetp == NULL) {
1610 		DWARN(NULL, "No vnet instance");
1611 		kmem_free(vresp, sizeof (vnet_res_t));
1612 		return (ENXIO);
1613 	}
1614 
1615 	*vhp = vresp;
1616 	vcb->vio_net_rx_cb = vnet_rx;
1617 	vcb->vio_net_tx_update = vnet_tx_update;
1618 	vcb->vio_net_report_err = vnet_handle_res_err;
1619 
1620 	/* Bind the resource to pseudo ring(s) */
1621 	if (vnet_bind_rings(vresp) != 0) {
1622 		(void) vnet_rem_resource(vnetp, vresp);
1623 		vnet_hio_destroy_kstats(vresp->ksp);
1624 		KMEM_FREE(vresp);
1625 		return (1);
1626 	}
1627 
1628 	/* Dispatch a task to start resources */
1629 	vnet_dispatch_res_task(vnetp);
1630 	return (0);
1631 }
1632 
1633 /*
1634  * vio_net_resource_unreg -- An interface to unregister a resource.
1635  */
1636 void
1637 vio_net_resource_unreg(vio_net_handle_t vhp)
1638 {
1639 	vnet_res_t	*vresp = (vnet_res_t *)vhp;
1640 	vnet_t		*vnetp = vresp->vnetp;
1641 
1642 	DBG1(NULL, "Resource Registerig hdl=0x%p", vhp);
1643 
1644 	ASSERT(vnetp != NULL);
1645 	/*
1646 	 * Remove the resource from fdb; this ensures
1647 	 * there are no references to the resource.
1648 	 */
1649 	vnet_fdbe_del(vnetp, vresp);
1650 
1651 	vnet_unbind_rings(vresp);
1652 
1653 	/* Now remove the resource from the list */
1654 	(void) vnet_rem_resource(vnetp, vresp);
1655 
1656 	vnet_hio_destroy_kstats(vresp->ksp);
1657 	KMEM_FREE(vresp);
1658 }
1659 
1660 static void
1661 vnet_add_resource(vnet_t *vnetp, vnet_res_t *vresp)
1662 {
1663 	WRITE_ENTER(&vnetp->vrwlock);
1664 	vresp->nextp = vnetp->vres_list;
1665 	vnetp->vres_list = vresp;
1666 	RW_EXIT(&vnetp->vrwlock);
1667 }
1668 
1669 static vnet_res_t *
1670 vnet_rem_resource(vnet_t *vnetp, vnet_res_t *vresp)
1671 {
1672 	vnet_res_t	*vrp;
1673 
1674 	WRITE_ENTER(&vnetp->vrwlock);
1675 	if (vresp == vnetp->vres_list) {
1676 		vnetp->vres_list = vresp->nextp;
1677 	} else {
1678 		vrp = vnetp->vres_list;
1679 		while (vrp->nextp != NULL) {
1680 			if (vrp->nextp == vresp) {
1681 				vrp->nextp = vresp->nextp;
1682 				break;
1683 			}
1684 			vrp = vrp->nextp;
1685 		}
1686 	}
1687 	vresp->vnetp = NULL;
1688 	vresp->nextp = NULL;
1689 
1690 	RW_EXIT(&vnetp->vrwlock);
1691 
1692 	return (vresp);
1693 }
1694 
1695 /*
1696  * vnet_dds_rx -- an interface called by vgen to DDS messages.
1697  */
1698 void
1699 vnet_dds_rx(void *arg, void *dmsg)
1700 {
1701 	vnet_t *vnetp = arg;
1702 	vdds_process_dds_msg(vnetp, dmsg);
1703 }
1704 
1705 /*
1706  * vnet_send_dds_msg -- An interface provided to DDS to send
1707  *	DDS messages. This simply sends meessages via vgen.
1708  */
1709 int
1710 vnet_send_dds_msg(vnet_t *vnetp, void *dmsg)
1711 {
1712 	int rv;
1713 
1714 	if (vnetp->vgenhdl != NULL) {
1715 		rv = vgen_dds_tx(vnetp->vgenhdl, dmsg);
1716 	}
1717 	return (rv);
1718 }
1719 
1720 /*
1721  * vnet_cleanup_hio -- an interface called by vgen to cleanup hio resources.
1722  */
1723 void
1724 vnet_dds_cleanup_hio(vnet_t *vnetp)
1725 {
1726 	vdds_cleanup_hio(vnetp);
1727 }
1728 
1729 /*
1730  * vnet_handle_res_err -- A callback function called by a resource
1731  *	to report an error. For example, vgen can call to report
1732  *	an LDC down/reset event. This will trigger cleanup of associated
1733  *	Hybrid resource.
1734  */
1735 /* ARGSUSED */
1736 static void
1737 vnet_handle_res_err(vio_net_handle_t vrh, vio_net_err_val_t err)
1738 {
1739 	vnet_res_t *vresp = (vnet_res_t *)vrh;
1740 	vnet_t *vnetp = vresp->vnetp;
1741 
1742 	if (vnetp == NULL) {
1743 		return;
1744 	}
1745 	if ((vresp->type != VIO_NET_RES_LDC_SERVICE) &&
1746 	    (vresp->type != VIO_NET_RES_HYBRID)) {
1747 		return;
1748 	}
1749 
1750 	vdds_cleanup_hio(vnetp);
1751 }
1752 
1753 /*
1754  * vnet_dispatch_res_task -- A function to dispatch tasks start resources.
1755  */
1756 static void
1757 vnet_dispatch_res_task(vnet_t *vnetp)
1758 {
1759 	int rv;
1760 
1761 	/*
1762 	 * Dispatch the task. It could be the case that vnetp->flags does
1763 	 * not have VNET_STARTED set. This is ok as vnet_rest_start_task()
1764 	 * can abort the task when the task is started. See related comments
1765 	 * in vnet_m_stop() and vnet_stop_resources().
1766 	 */
1767 	rv = ddi_taskq_dispatch(vnetp->taskqp, vnet_res_start_task,
1768 	    vnetp, DDI_NOSLEEP);
1769 	if (rv != DDI_SUCCESS) {
1770 		cmn_err(CE_WARN,
1771 		    "vnet%d:Can't dispatch start resource task",
1772 		    vnetp->instance);
1773 	}
1774 }
1775 
1776 /*
1777  * vnet_res_start_task -- A taskq callback function that starts a resource.
1778  */
1779 static void
1780 vnet_res_start_task(void *arg)
1781 {
1782 	vnet_t *vnetp = arg;
1783 
1784 	WRITE_ENTER(&vnetp->vrwlock);
1785 	if (vnetp->flags & VNET_STARTED) {
1786 		vnet_start_resources(vnetp);
1787 	}
1788 	RW_EXIT(&vnetp->vrwlock);
1789 }
1790 
1791 /*
1792  * vnet_start_resources -- starts all resources associated with
1793  *	a vnet.
1794  */
1795 static void
1796 vnet_start_resources(vnet_t *vnetp)
1797 {
1798 	mac_register_t	*macp;
1799 	mac_callbacks_t	*cbp;
1800 	vnet_res_t	*vresp;
1801 	int rv;
1802 
1803 	DBG1(vnetp, "enter\n");
1804 
1805 	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));
1806 
1807 	for (vresp = vnetp->vres_list; vresp != NULL; vresp = vresp->nextp) {
1808 		/* skip if it is already started */
1809 		if (vresp->flags & VNET_STARTED) {
1810 			continue;
1811 		}
1812 		macp = &vresp->macreg;
1813 		cbp = macp->m_callbacks;
1814 		rv = cbp->mc_start(macp->m_driver);
1815 		if (rv == 0) {
1816 			/*
1817 			 * Successfully started the resource, so now
1818 			 * add it to the fdb.
1819 			 */
1820 			vresp->flags |= VNET_STARTED;
1821 			vnet_fdbe_add(vnetp, vresp);
1822 		}
1823 	}
1824 
1825 	DBG1(vnetp, "exit\n");
1826 
1827 }
1828 
1829 /*
1830  * vnet_stop_resources -- stop all resources associated with a vnet.
1831  */
1832 static void
1833 vnet_stop_resources(vnet_t *vnetp)
1834 {
1835 	vnet_res_t	*vresp;
1836 	mac_register_t	*macp;
1837 	mac_callbacks_t	*cbp;
1838 
1839 	DBG1(vnetp, "enter\n");
1840 
1841 	ASSERT(RW_WRITE_HELD(&vnetp->vrwlock));
1842 
1843 	for (vresp = vnetp->vres_list; vresp != NULL; ) {
1844 		if (vresp->flags & VNET_STARTED) {
1845 			/*
1846 			 * Release the lock while invoking mc_stop() of the
1847 			 * underlying resource. We hold a reference to this
1848 			 * resource to prevent being removed from the list in
1849 			 * vio_net_resource_unreg(). Note that new resources
1850 			 * can be added to the head of the list while the lock
1851 			 * is released, but they won't be started, as
1852 			 * VNET_STARTED flag has been cleared for the vnet
1853 			 * device in vnet_m_stop(). Also, while the lock is
1854 			 * released a resource could be removed from the list
1855 			 * in vio_net_resource_unreg(); but that is ok, as we
1856 			 * re-acquire the lock and only then access the forward
1857 			 * link (vresp->nextp) to continue with the next
1858 			 * resource.
1859 			 */
1860 			vresp->flags &= ~VNET_STARTED;
1861 			vresp->flags |= VNET_STOPPING;
1862 			macp = &vresp->macreg;
1863 			cbp = macp->m_callbacks;
1864 			VNET_FDBE_REFHOLD(vresp);
1865 			RW_EXIT(&vnetp->vrwlock);
1866 
1867 			cbp->mc_stop(macp->m_driver);
1868 
1869 			WRITE_ENTER(&vnetp->vrwlock);
1870 			vresp->flags &= ~VNET_STOPPING;
1871 			VNET_FDBE_REFRELE(vresp);
1872 		}
1873 		vresp = vresp->nextp;
1874 	}
1875 	DBG1(vnetp, "exit\n");
1876 }
1877 
1878 /*
1879  * Setup kstats for the HIO statistics.
1880  * NOTE: the synchronization for the statistics is the
1881  * responsibility of the caller.
1882  */
1883 kstat_t *
1884 vnet_hio_setup_kstats(char *ks_mod, char *ks_name, vnet_res_t *vresp)
1885 {
1886 	kstat_t *ksp;
1887 	vnet_t *vnetp = vresp->vnetp;
1888 	vnet_hio_kstats_t *hiokp;
1889 	size_t size;
1890 
1891 	ASSERT(vnetp != NULL);
1892 	size = sizeof (vnet_hio_kstats_t) / sizeof (kstat_named_t);
1893 	ksp = kstat_create(ks_mod, vnetp->instance, ks_name, "net",
1894 	    KSTAT_TYPE_NAMED, size, 0);
1895 	if (ksp == NULL) {
1896 		return (NULL);
1897 	}
1898 
1899 	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
1900 	kstat_named_init(&hiokp->ipackets,		"ipackets",
1901 	    KSTAT_DATA_ULONG);
1902 	kstat_named_init(&hiokp->ierrors,		"ierrors",
1903 	    KSTAT_DATA_ULONG);
1904 	kstat_named_init(&hiokp->opackets,		"opackets",
1905 	    KSTAT_DATA_ULONG);
1906 	kstat_named_init(&hiokp->oerrors,		"oerrors",
1907 	    KSTAT_DATA_ULONG);
1908 
1909 
1910 	/* MIB II kstat variables */
1911 	kstat_named_init(&hiokp->rbytes,		"rbytes",
1912 	    KSTAT_DATA_ULONG);
1913 	kstat_named_init(&hiokp->obytes,		"obytes",
1914 	    KSTAT_DATA_ULONG);
1915 	kstat_named_init(&hiokp->multircv,		"multircv",
1916 	    KSTAT_DATA_ULONG);
1917 	kstat_named_init(&hiokp->multixmt,		"multixmt",
1918 	    KSTAT_DATA_ULONG);
1919 	kstat_named_init(&hiokp->brdcstrcv,		"brdcstrcv",
1920 	    KSTAT_DATA_ULONG);
1921 	kstat_named_init(&hiokp->brdcstxmt,		"brdcstxmt",
1922 	    KSTAT_DATA_ULONG);
1923 	kstat_named_init(&hiokp->norcvbuf,		"norcvbuf",
1924 	    KSTAT_DATA_ULONG);
1925 	kstat_named_init(&hiokp->noxmtbuf,		"noxmtbuf",
1926 	    KSTAT_DATA_ULONG);
1927 
1928 	ksp->ks_update = vnet_hio_update_kstats;
1929 	ksp->ks_private = (void *)vresp;
1930 	kstat_install(ksp);
1931 	return (ksp);
1932 }
1933 
1934 /*
1935  * Destroy kstats.
1936  */
1937 static void
1938 vnet_hio_destroy_kstats(kstat_t *ksp)
1939 {
1940 	if (ksp != NULL)
1941 		kstat_delete(ksp);
1942 }
1943 
1944 /*
1945  * Update the kstats.
1946  */
1947 static int
1948 vnet_hio_update_kstats(kstat_t *ksp, int rw)
1949 {
1950 	vnet_t *vnetp;
1951 	vnet_res_t *vresp;
1952 	vnet_hio_stats_t statsp;
1953 	vnet_hio_kstats_t *hiokp;
1954 
1955 	vresp = (vnet_res_t *)ksp->ks_private;
1956 	vnetp = vresp->vnetp;
1957 
1958 	bzero(&statsp, sizeof (vnet_hio_stats_t));
1959 
1960 	READ_ENTER(&vnetp->vsw_fp_rw);
1961 	if (vnetp->hio_fp == NULL) {
1962 		/* not using hio resources, just return */
1963 		RW_EXIT(&vnetp->vsw_fp_rw);
1964 		return (0);
1965 	}
1966 	VNET_FDBE_REFHOLD(vnetp->hio_fp);
1967 	RW_EXIT(&vnetp->vsw_fp_rw);
1968 	vnet_hio_get_stats(vnetp->hio_fp, &statsp);
1969 	VNET_FDBE_REFRELE(vnetp->hio_fp);
1970 
1971 	hiokp = (vnet_hio_kstats_t *)ksp->ks_data;
1972 
1973 	if (rw == KSTAT_READ) {
1974 		/* Link Input/Output stats */
1975 		hiokp->ipackets.value.ul	= (uint32_t)statsp.ipackets;
1976 		hiokp->ipackets64.value.ull	= statsp.ipackets;
1977 		hiokp->ierrors.value.ul		= statsp.ierrors;
1978 		hiokp->opackets.value.ul	= (uint32_t)statsp.opackets;
1979 		hiokp->opackets64.value.ull	= statsp.opackets;
1980 		hiokp->oerrors.value.ul		= statsp.oerrors;
1981 
1982 		/* MIB II kstat variables */
1983 		hiokp->rbytes.value.ul		= (uint32_t)statsp.rbytes;
1984 		hiokp->rbytes64.value.ull	= statsp.rbytes;
1985 		hiokp->obytes.value.ul		= (uint32_t)statsp.obytes;
1986 		hiokp->obytes64.value.ull	= statsp.obytes;
1987 		hiokp->multircv.value.ul	= statsp.multircv;
1988 		hiokp->multixmt.value.ul	= statsp.multixmt;
1989 		hiokp->brdcstrcv.value.ul	= statsp.brdcstrcv;
1990 		hiokp->brdcstxmt.value.ul	= statsp.brdcstxmt;
1991 		hiokp->norcvbuf.value.ul	= statsp.norcvbuf;
1992 		hiokp->noxmtbuf.value.ul	= statsp.noxmtbuf;
1993 	} else {
1994 		return (EACCES);
1995 	}
1996 
1997 	return (0);
1998 }
1999 
2000 static void
2001 vnet_hio_get_stats(vnet_res_t *vresp, vnet_hio_stats_t *statsp)
2002 {
2003 	mac_register_t		*macp;
2004 	mac_callbacks_t		*cbp;
2005 	uint64_t		val;
2006 	int			stat;
2007 
2008 	/*
2009 	 * get the specified statistics from the underlying nxge.
2010 	 */
2011 	macp = &vresp->macreg;
2012 	cbp = macp->m_callbacks;
2013 	for (stat = MAC_STAT_MIN; stat < MAC_STAT_OVERFLOWS; stat++) {
2014 		if (cbp->mc_getstat(macp->m_driver, stat, &val) == 0) {
2015 			switch (stat) {
2016 			case MAC_STAT_IPACKETS:
2017 				statsp->ipackets = val;
2018 				break;
2019 
2020 			case MAC_STAT_IERRORS:
2021 				statsp->ierrors = val;
2022 				break;
2023 
2024 			case MAC_STAT_OPACKETS:
2025 				statsp->opackets = val;
2026 				break;
2027 
2028 			case MAC_STAT_OERRORS:
2029 				statsp->oerrors = val;
2030 				break;
2031 
2032 			case MAC_STAT_RBYTES:
2033 				statsp->rbytes = val;
2034 				break;
2035 
2036 			case MAC_STAT_OBYTES:
2037 				statsp->obytes = val;
2038 				break;
2039 
2040 			case MAC_STAT_MULTIRCV:
2041 				statsp->multircv = val;
2042 				break;
2043 
2044 			case MAC_STAT_MULTIXMT:
2045 				statsp->multixmt = val;
2046 				break;
2047 
2048 			case MAC_STAT_BRDCSTRCV:
2049 				statsp->brdcstrcv = val;
2050 				break;
2051 
2052 			case MAC_STAT_BRDCSTXMT:
2053 				statsp->brdcstxmt = val;
2054 				break;
2055 
2056 			case MAC_STAT_NOXMTBUF:
2057 				statsp->noxmtbuf = val;
2058 				break;
2059 
2060 			case MAC_STAT_NORCVBUF:
2061 				statsp->norcvbuf = val;
2062 				break;
2063 
2064 			default:
2065 				/*
2066 				 * parameters not interested.
2067 				 */
2068 				break;
2069 			}
2070 		}
2071 	}
2072 }
2073 
2074 static boolean_t
2075 vnet_m_capab(void *arg, mac_capab_t cap, void *cap_data)
2076 {
2077 	vnet_t	*vnetp = (vnet_t *)arg;
2078 
2079 	if (vnetp == NULL) {
2080 		return (0);
2081 	}
2082 
2083 	switch (cap) {
2084 
2085 	case MAC_CAPAB_RINGS: {
2086 
2087 		mac_capab_rings_t *cap_rings = cap_data;
2088 		/*
2089 		 * Rings Capability Notes:
2090 		 * We advertise rings to make use of the rings framework in
2091 		 * gldv3 mac layer, to improve the performance. This is
2092 		 * specifically needed when a Hybrid resource (with multiple
2093 		 * tx/rx hardware rings) is assigned to a vnet device. We also
2094 		 * leverage this for the normal case when no Hybrid resource is
2095 		 * assigned.
2096 		 *
2097 		 * Ring Allocation:
2098 		 * - TX path:
2099 		 * We expose a pseudo ring group with 2 pseudo tx rings (as
2100 		 * currently HybridIO exports only 2 rings) In the normal case,
2101 		 * transmit traffic that comes down to the driver through the
2102 		 * mri_tx (vnet_tx_ring_send()) entry point goes through the
2103 		 * distributed switching algorithm in vnet and gets transmitted
2104 		 * over a port/LDC in the vgen layer to either the vswitch or a
2105 		 * peer vnet. If and when a Hybrid resource is assigned to the
2106 		 * vnet, we obtain the tx ring information of the Hybrid device
2107 		 * (nxge) and map the pseudo rings 1:1 to the 2 hw tx rings.
2108 		 * Traffic being sent over the Hybrid resource by the mac layer
2109 		 * gets spread across both hw rings, as they are mapped to the
2110 		 * 2 pseudo tx rings in vnet.
2111 		 *
2112 		 * - RX path:
2113 		 * We expose a pseudo ring group with 3 pseudo rx rings (static
2114 		 * rings) initially. The first (default) pseudo rx ring is
2115 		 * reserved for the resource that connects to the vswitch
2116 		 * service. The next 2 rings are reserved for a Hybrid resource
2117 		 * that may be assigned to the vnet device. If and when a
2118 		 * Hybrid resource is assigned to the vnet, we obtain the rx
2119 		 * ring information of the Hybrid device (nxge) and map these
2120 		 * pseudo rings 1:1 to the 2 hw rx rings. For each additional
2121 		 * resource that connects to a peer vnet, we dynamically
2122 		 * allocate a pseudo rx ring and map it to that resource, when
2123 		 * the resource gets added; and the pseudo rx ring is
2124 		 * dynamically registered with the upper mac layer. We do the
2125 		 * reverse and unregister the ring with the mac layer when
2126 		 * the resource gets removed.
2127 		 *
2128 		 * Synchronization notes:
2129 		 * We don't need any lock to protect members of ring structure,
2130 		 * specifically ringp->hw_rh, in either the TX or the RX ring,
2131 		 * as explained below.
2132 		 * - TX ring:
2133 		 * ring->hw_rh is initialized only when a Hybrid resource is
2134 		 * associated; and gets referenced only in vnet_hio_tx(). The
2135 		 * Hybrid resource itself is available in fdb only after tx
2136 		 * hwrings are found and mapped; i.e, in vio_net_resource_reg()
2137 		 * we call vnet_bind_rings() first and then call
2138 		 * vnet_start_resources() which adds an entry to fdb. For
2139 		 * traffic going over LDC resources, we don't reference
2140 		 * ring->hw_rh at all.
2141 		 * - RX ring:
2142 		 * For rings mapped to Hybrid resource ring->hw_rh is
2143 		 * initialized and only then do we add the rx callback for
2144 		 * the underlying Hybrid resource; we disable callbacks before
2145 		 * we unmap ring->hw_rh. For rings mapped to LDC resources, we
2146 		 * stop the rx callbacks (in vgen) before we remove ring->hw_rh
2147 		 * (vio_net_resource_unreg()).
2148 		 * Also, we access ring->hw_rh in vnet_rx_ring_stat().
2149 		 * Note that for rings mapped to Hybrid resource, though the
2150 		 * rings are statically registered with the mac layer, its
2151 		 * hardware ring mapping (ringp->hw_rh) can be torn down in
2152 		 * vnet_unbind_hwrings() while the kstat operation is in
2153 		 * progress. To protect against this, we hold a reference to
2154 		 * the resource in FDB; this ensures that the thread in
2155 		 * vio_net_resource_unreg() waits for the reference to be
2156 		 * dropped before unbinding the ring.
2157 		 *
2158 		 * We don't need to do this for rings mapped to LDC resources.
2159 		 * These rings are registered/unregistered dynamically with
2160 		 * the mac layer and so any attempt to unregister the ring
2161 		 * while kstat operation is in progress will block in
2162 		 * mac_group_rem_ring(). Thus implicitly protects the
2163 		 * resource (ringp->hw_rh) from disappearing.
2164 		 */
2165 
2166 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2167 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2168 
2169 			/*
2170 			 * The ring_cnt for rx grp is initialized in
2171 			 * vnet_ring_grp_init(). Later, the ring_cnt gets
2172 			 * updated dynamically whenever LDC resources are added
2173 			 * or removed.
2174 			 */
2175 			cap_rings->mr_rnum = vnetp->rx_grp[0].ring_cnt;
2176 			cap_rings->mr_rget = vnet_get_ring;
2177 
2178 			cap_rings->mr_gnum = VNET_NUM_PSEUDO_GROUPS;
2179 			cap_rings->mr_gget = vnet_get_group;
2180 			cap_rings->mr_gaddring = NULL;
2181 			cap_rings->mr_gremring = NULL;
2182 		} else {
2183 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2184 
2185 			/*
2186 			 * The ring_cnt for tx grp is initialized in
2187 			 * vnet_ring_grp_init() and remains constant, as we
2188 			 * do not support dymanic tx rings for now.
2189 			 */
2190 			cap_rings->mr_rnum = vnetp->tx_grp[0].ring_cnt;
2191 			cap_rings->mr_rget = vnet_get_ring;
2192 
2193 			/*
2194 			 * Transmit rings are not grouped; i.e, the number of
2195 			 * transmit ring groups advertised should be set to 0.
2196 			 */
2197 			cap_rings->mr_gnum = 0;
2198 
2199 			cap_rings->mr_gget = vnet_get_group;
2200 			cap_rings->mr_gaddring = NULL;
2201 			cap_rings->mr_gremring = NULL;
2202 		}
2203 		return (B_TRUE);
2204 
2205 	}
2206 
2207 	default:
2208 		break;
2209 
2210 	}
2211 
2212 	return (B_FALSE);
2213 }
2214 
2215 /*
2216  * Callback funtion for MAC layer to get ring information.
2217  */
2218 static void
2219 vnet_get_ring(void *arg, mac_ring_type_t rtype, const int g_index,
2220     const int r_index, mac_ring_info_t *infop, mac_ring_handle_t r_handle)
2221 {
2222 	vnet_t	*vnetp = arg;
2223 
2224 	switch (rtype) {
2225 
2226 	case MAC_RING_TYPE_RX: {
2227 
2228 		vnet_pseudo_rx_group_t	*rx_grp;
2229 		vnet_pseudo_rx_ring_t	*rx_ringp;
2230 		mac_intr_t		*mintr;
2231 
2232 		/* We advertised only one RX group */
2233 		ASSERT(g_index == 0);
2234 		rx_grp = &vnetp->rx_grp[g_index];
2235 
2236 		/* Check the current # of rings in the rx group */
2237 		ASSERT((r_index >= 0) && (r_index < rx_grp->max_ring_cnt));
2238 
2239 		/* Get the ring based on the index */
2240 		rx_ringp = &rx_grp->rings[r_index];
2241 
2242 		rx_ringp->handle = r_handle;
2243 		/*
2244 		 * Note: we don't need to save the incoming r_index in rx_ring,
2245 		 * as vnet_ring_grp_init() would have initialized the index for
2246 		 * each ring in the array.
2247 		 */
2248 		rx_ringp->grp = rx_grp;
2249 		rx_ringp->vnetp = vnetp;
2250 
2251 		mintr = &infop->mri_intr;
2252 		mintr->mi_handle = (mac_intr_handle_t)rx_ringp;
2253 		mintr->mi_enable = (mac_intr_enable_t)vnet_ring_enable_intr;
2254 		mintr->mi_disable = (mac_intr_disable_t)vnet_ring_disable_intr;
2255 
2256 		infop->mri_driver = (mac_ring_driver_t)rx_ringp;
2257 		infop->mri_start = vnet_rx_ring_start;
2258 		infop->mri_stop = vnet_rx_ring_stop;
2259 		infop->mri_stat = vnet_rx_ring_stat;
2260 
2261 		/* Set the poll function, as this is an rx ring */
2262 		infop->mri_poll = vnet_rx_poll;
2263 		/*
2264 		 * MAC_RING_RX_ENQUEUE bit needed to be set for nxge
2265 		 * which was not sending packet chains in interrupt
2266 		 * context. For such drivers, packets are queued in
2267 		 * Rx soft rings so that we get a chance to switch
2268 		 * into a polling mode under backlog. This bug (not
2269 		 * sending packet chains) has now been fixed. Once
2270 		 * the performance impact is measured, this change
2271 		 * will be removed.
2272 		 */
2273 		infop->mri_flags = (vnet_mac_rx_queuing ?
2274 		    MAC_RING_RX_ENQUEUE : 0);
2275 		break;
2276 	}
2277 
2278 	case MAC_RING_TYPE_TX: {
2279 		vnet_pseudo_tx_group_t	*tx_grp;
2280 		vnet_pseudo_tx_ring_t	*tx_ringp;
2281 
2282 		/*
2283 		 * No need to check grp index; mac layer passes -1 for it.
2284 		 */
2285 		tx_grp = &vnetp->tx_grp[0];
2286 
2287 		/* Check the # of rings in the tx group */
2288 		ASSERT((r_index >= 0) && (r_index < tx_grp->ring_cnt));
2289 
2290 		/* Get the ring based on the index */
2291 		tx_ringp = &tx_grp->rings[r_index];
2292 
2293 		tx_ringp->handle = r_handle;
2294 		tx_ringp->index = r_index;
2295 		tx_ringp->grp = tx_grp;
2296 		tx_ringp->vnetp = vnetp;
2297 
2298 		infop->mri_driver = (mac_ring_driver_t)tx_ringp;
2299 		infop->mri_start = vnet_tx_ring_start;
2300 		infop->mri_stop = vnet_tx_ring_stop;
2301 		infop->mri_stat = vnet_tx_ring_stat;
2302 
2303 		/* Set the transmit function, as this is a tx ring */
2304 		infop->mri_tx = vnet_tx_ring_send;
2305 		/*
2306 		 * MAC_RING_TX_SERIALIZE bit needs to be set while
2307 		 * hybridIO is enabled to workaround tx lock
2308 		 * contention issues in nxge.
2309 		 */
2310 		infop->mri_flags = (vnet_mac_tx_serialize ?
2311 		    MAC_RING_TX_SERIALIZE : 0);
2312 		break;
2313 	}
2314 
2315 	default:
2316 		break;
2317 	}
2318 }
2319 
2320 /*
2321  * Callback funtion for MAC layer to get group information.
2322  */
2323 static void
2324 vnet_get_group(void *arg, mac_ring_type_t type, const int index,
2325 	mac_group_info_t *infop, mac_group_handle_t handle)
2326 {
2327 	vnet_t	*vnetp = (vnet_t *)arg;
2328 
2329 	switch (type) {
2330 
2331 	case MAC_RING_TYPE_RX:
2332 	{
2333 		vnet_pseudo_rx_group_t	*rx_grp;
2334 
2335 		/* We advertised only one RX group */
2336 		ASSERT(index == 0);
2337 
2338 		rx_grp = &vnetp->rx_grp[index];
2339 		rx_grp->handle = handle;
2340 		rx_grp->index = index;
2341 		rx_grp->vnetp = vnetp;
2342 
2343 		infop->mgi_driver = (mac_group_driver_t)rx_grp;
2344 		infop->mgi_start = NULL;
2345 		infop->mgi_stop = NULL;
2346 		infop->mgi_addmac = vnet_addmac;
2347 		infop->mgi_remmac = vnet_remmac;
2348 		infop->mgi_count = rx_grp->ring_cnt;
2349 
2350 		break;
2351 	}
2352 
2353 	case MAC_RING_TYPE_TX:
2354 	{
2355 		vnet_pseudo_tx_group_t	*tx_grp;
2356 
2357 		/* We advertised only one TX group */
2358 		ASSERT(index == 0);
2359 
2360 		tx_grp = &vnetp->tx_grp[index];
2361 		tx_grp->handle = handle;
2362 		tx_grp->index = index;
2363 		tx_grp->vnetp = vnetp;
2364 
2365 		infop->mgi_driver = (mac_group_driver_t)tx_grp;
2366 		infop->mgi_start = NULL;
2367 		infop->mgi_stop = NULL;
2368 		infop->mgi_addmac = NULL;
2369 		infop->mgi_remmac = NULL;
2370 		infop->mgi_count = VNET_NUM_PSEUDO_TXRINGS;
2371 
2372 		break;
2373 	}
2374 
2375 	default:
2376 		break;
2377 
2378 	}
2379 }
2380 
2381 static int
2382 vnet_rx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
2383 {
2384 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2385 	int			err;
2386 
2387 	/*
2388 	 * If this ring is mapped to a LDC resource, simply mark the state to
2389 	 * indicate the ring is started and return.
2390 	 */
2391 	if ((rx_ringp->state &
2392 	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
2393 		rx_ringp->gen_num = mr_gen_num;
2394 		rx_ringp->state |= VNET_RXRING_STARTED;
2395 		return (0);
2396 	}
2397 
2398 	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2399 
2400 	/*
2401 	 * This must be a ring reserved for a hwring. If the hwring is not
2402 	 * bound yet, simply mark the state to indicate the ring is started and
2403 	 * return. If and when a hybrid resource is activated for this vnet
2404 	 * device, we will bind the hwring and start it then. If a hwring is
2405 	 * already bound, start it now.
2406 	 */
2407 	if (rx_ringp->hw_rh == NULL) {
2408 		rx_ringp->gen_num = mr_gen_num;
2409 		rx_ringp->state |= VNET_RXRING_STARTED;
2410 		return (0);
2411 	}
2412 
2413 	err = mac_hwring_start(rx_ringp->hw_rh);
2414 	if (err == 0) {
2415 		rx_ringp->gen_num = mr_gen_num;
2416 		rx_ringp->state |= VNET_RXRING_STARTED;
2417 	} else {
2418 		err = ENXIO;
2419 	}
2420 
2421 	return (err);
2422 }
2423 
2424 static void
2425 vnet_rx_ring_stop(mac_ring_driver_t arg)
2426 {
2427 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2428 
2429 	/*
2430 	 * If this ring is mapped to a LDC resource, simply mark the state to
2431 	 * indicate the ring is now stopped and return.
2432 	 */
2433 	if ((rx_ringp->state &
2434 	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0) {
2435 		rx_ringp->state &= ~VNET_RXRING_STARTED;
2436 		return;
2437 	}
2438 
2439 	ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2440 
2441 	/*
2442 	 * This must be a ring reserved for a hwring. If the hwring is not
2443 	 * bound yet, simply mark the state to indicate the ring is stopped and
2444 	 * return. If a hwring is already bound, stop it now.
2445 	 */
2446 	if (rx_ringp->hw_rh == NULL) {
2447 		rx_ringp->state &= ~VNET_RXRING_STARTED;
2448 		return;
2449 	}
2450 
2451 	mac_hwring_stop(rx_ringp->hw_rh);
2452 	rx_ringp->state &= ~VNET_RXRING_STARTED;
2453 }
2454 
2455 static int
2456 vnet_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2457 {
2458 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)rdriver;
2459 	vnet_t			*vnetp = (vnet_t *)rx_ringp->vnetp;
2460 	vnet_res_t		*vresp;
2461 	mac_register_t		*macp;
2462 	mac_callbacks_t		*cbp;
2463 
2464 	/*
2465 	 * Refer to vnet_m_capab() function for detailed comments on ring
2466 	 * synchronization.
2467 	 */
2468 	if ((rx_ringp->state & VNET_RXRING_HYBRID) != 0) {
2469 		READ_ENTER(&vnetp->vsw_fp_rw);
2470 		if (vnetp->hio_fp == NULL) {
2471 			RW_EXIT(&vnetp->vsw_fp_rw);
2472 			return (0);
2473 		}
2474 
2475 		VNET_FDBE_REFHOLD(vnetp->hio_fp);
2476 		RW_EXIT(&vnetp->vsw_fp_rw);
2477 		(void) mac_hwring_getstat(rx_ringp->hw_rh, stat, val);
2478 		VNET_FDBE_REFRELE(vnetp->hio_fp);
2479 		return (0);
2480 	}
2481 
2482 	ASSERT((rx_ringp->state &
2483 	    (VNET_RXRING_LDC_SERVICE|VNET_RXRING_LDC_GUEST)) != 0);
2484 	vresp = (vnet_res_t *)rx_ringp->hw_rh;
2485 	macp = &vresp->macreg;
2486 	cbp = macp->m_callbacks;
2487 
2488 	cbp->mc_getstat(macp->m_driver, stat, val);
2489 
2490 	return (0);
2491 }
2492 
2493 /* ARGSUSED */
2494 static int
2495 vnet_tx_ring_start(mac_ring_driver_t arg, uint64_t mr_gen_num)
2496 {
2497 	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
2498 
2499 	tx_ringp->state |= VNET_TXRING_STARTED;
2500 	return (0);
2501 }
2502 
2503 static void
2504 vnet_tx_ring_stop(mac_ring_driver_t arg)
2505 {
2506 	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
2507 
2508 	tx_ringp->state &= ~VNET_TXRING_STARTED;
2509 }
2510 
2511 static int
2512 vnet_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2513 {
2514 	vnet_pseudo_tx_ring_t	*tx_ringp = (vnet_pseudo_tx_ring_t *)rdriver;
2515 	vnet_tx_ring_stats_t	*statsp;
2516 
2517 	statsp = &tx_ringp->tx_ring_stats;
2518 
2519 	switch (stat) {
2520 	case MAC_STAT_OPACKETS:
2521 		*val = statsp->opackets;
2522 		break;
2523 
2524 	case MAC_STAT_OBYTES:
2525 		*val = statsp->obytes;
2526 		break;
2527 
2528 	default:
2529 		*val = 0;
2530 		return (ENOTSUP);
2531 	}
2532 
2533 	return (0);
2534 }
2535 
2536 /*
2537  * Disable polling for a ring and enable its interrupt.
2538  */
2539 static int
2540 vnet_ring_enable_intr(void *arg)
2541 {
2542 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2543 	vnet_res_t		*vresp;
2544 
2545 	if (rx_ringp->hw_rh == NULL) {
2546 		/*
2547 		 * Ring enable intr func is being invoked, but the ring is
2548 		 * not bound to any underlying resource ? This must be a ring
2549 		 * reserved for Hybrid resource and no such resource has been
2550 		 * assigned to this vnet device yet. We simply return success.
2551 		 */
2552 		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2553 		return (0);
2554 	}
2555 
2556 	/*
2557 	 * The rx ring has been bound to either a LDC or a Hybrid resource.
2558 	 * Call the appropriate function to enable interrupts for the ring.
2559 	 */
2560 	if (rx_ringp->state & VNET_RXRING_HYBRID) {
2561 		return (mac_hwring_enable_intr(rx_ringp->hw_rh));
2562 	} else {
2563 		vresp = (vnet_res_t *)rx_ringp->hw_rh;
2564 		return (vgen_enable_intr(vresp->macreg.m_driver));
2565 	}
2566 }
2567 
2568 /*
2569  * Enable polling for a ring and disable its interrupt.
2570  */
2571 static int
2572 vnet_ring_disable_intr(void *arg)
2573 {
2574 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2575 	vnet_res_t		*vresp;
2576 
2577 	if (rx_ringp->hw_rh == NULL) {
2578 		/*
2579 		 * Ring disable intr func is being invoked, but the ring is
2580 		 * not bound to any underlying resource ? This must be a ring
2581 		 * reserved for Hybrid resource and no such resource has been
2582 		 * assigned to this vnet device yet. We simply return success.
2583 		 */
2584 		ASSERT((rx_ringp->state & VNET_RXRING_HYBRID) != 0);
2585 		return (0);
2586 	}
2587 
2588 	/*
2589 	 * The rx ring has been bound to either a LDC or a Hybrid resource.
2590 	 * Call the appropriate function to disable interrupts for the ring.
2591 	 */
2592 	if (rx_ringp->state & VNET_RXRING_HYBRID) {
2593 		return (mac_hwring_disable_intr(rx_ringp->hw_rh));
2594 	} else {
2595 		vresp = (vnet_res_t *)rx_ringp->hw_rh;
2596 		return (vgen_disable_intr(vresp->macreg.m_driver));
2597 	}
2598 }
2599 
2600 /*
2601  * Poll 'bytes_to_pickup' bytes of message from the rx ring.
2602  */
2603 static mblk_t *
2604 vnet_rx_poll(void *arg, int bytes_to_pickup)
2605 {
2606 	vnet_pseudo_rx_ring_t	*rx_ringp = (vnet_pseudo_rx_ring_t *)arg;
2607 	mblk_t			*mp = NULL;
2608 	vnet_res_t		*vresp;
2609 	vnet_t			*vnetp = rx_ringp->vnetp;
2610 
2611 	if (rx_ringp->hw_rh == NULL) {
2612 		return (NULL);
2613 	}
2614 
2615 	if (rx_ringp->state & VNET_RXRING_HYBRID) {
2616 		mp = mac_hwring_poll(rx_ringp->hw_rh, bytes_to_pickup);
2617 		/*
2618 		 * Packets received over a hybrid resource need additional
2619 		 * processing to remove the tag, for the pvid case. The
2620 		 * underlying resource is not aware of the vnet's pvid and thus
2621 		 * packets are received with the vlan tag in the header; unlike
2622 		 * packets that are received over a ldc channel in which case
2623 		 * the peer vnet/vsw would have already removed the tag.
2624 		 */
2625 		if (vnetp->pvid != vnetp->default_vlan_id) {
2626 			vnet_rx_frames_untag(vnetp->pvid, &mp);
2627 		}
2628 	} else {
2629 		vresp = (vnet_res_t *)rx_ringp->hw_rh;
2630 		mp = vgen_poll(vresp->macreg.m_driver, bytes_to_pickup);
2631 	}
2632 	return (mp);
2633 }
2634 
2635 /* ARGSUSED */
2636 void
2637 vnet_hio_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
2638 	boolean_t loopback)
2639 {
2640 	vnet_t			*vnetp = (vnet_t *)arg;
2641 	vnet_pseudo_rx_ring_t	*ringp = (vnet_pseudo_rx_ring_t *)mrh;
2642 
2643 	/*
2644 	 * Packets received over a hybrid resource need additional processing
2645 	 * to remove the tag, for the pvid case. The underlying resource is
2646 	 * not aware of the vnet's pvid and thus packets are received with the
2647 	 * vlan tag in the header; unlike packets that are received over a ldc
2648 	 * channel in which case the peer vnet/vsw would have already removed
2649 	 * the tag.
2650 	 */
2651 	if (vnetp->pvid != vnetp->default_vlan_id) {
2652 		vnet_rx_frames_untag(vnetp->pvid, &mp);
2653 		if (mp == NULL) {
2654 			return;
2655 		}
2656 	}
2657 	mac_rx_ring(vnetp->mh, ringp->handle, mp, ringp->gen_num);
2658 }
2659 
2660 static int
2661 vnet_addmac(void *arg, const uint8_t *mac_addr)
2662 {
2663 	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
2664 	vnet_t			*vnetp;
2665 
2666 	vnetp = rx_grp->vnetp;
2667 
2668 	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
2669 		return (0);
2670 	}
2671 
2672 	cmn_err(CE_CONT, "!vnet%d: %s: Multiple macaddr unsupported\n",
2673 	    vnetp->instance, __func__);
2674 	return (EINVAL);
2675 }
2676 
2677 static int
2678 vnet_remmac(void *arg, const uint8_t *mac_addr)
2679 {
2680 	vnet_pseudo_rx_group_t  *rx_grp = (vnet_pseudo_rx_group_t *)arg;
2681 	vnet_t			*vnetp;
2682 
2683 	vnetp = rx_grp->vnetp;
2684 
2685 	if (bcmp(mac_addr, vnetp->curr_macaddr, ETHERADDRL) == 0) {
2686 		return (0);
2687 	}
2688 
2689 	cmn_err(CE_CONT, "!vnet%d: %s: Invalid macaddr: %s\n",
2690 	    vnetp->instance, __func__, ether_sprintf((void *)mac_addr));
2691 	return (EINVAL);
2692 }
2693 
2694 int
2695 vnet_hio_mac_init(vnet_t *vnetp, char *ifname)
2696 {
2697 	mac_handle_t		mh;
2698 	mac_client_handle_t	mch = NULL;
2699 	mac_unicast_handle_t	muh = NULL;
2700 	mac_diag_t		diag;
2701 	mac_register_t		*macp;
2702 	char			client_name[MAXNAMELEN];
2703 	int			rv;
2704 	uint16_t		mac_flags = MAC_UNICAST_TAG_DISABLE |
2705 	    MAC_UNICAST_STRIP_DISABLE | MAC_UNICAST_PRIMARY;
2706 	vio_net_callbacks_t	vcb;
2707 	ether_addr_t		rem_addr =
2708 		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
2709 	uint32_t		retries = 0;
2710 
2711 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2712 		return (EAGAIN);
2713 	}
2714 
2715 	do {
2716 		rv = mac_open_by_linkname(ifname, &mh);
2717 		if (rv == 0) {
2718 			break;
2719 		}
2720 		if (rv != ENOENT || (retries++ >= vnet_mac_open_retries)) {
2721 			mac_free(macp);
2722 			return (rv);
2723 		}
2724 		drv_usecwait(vnet_mac_open_delay);
2725 	} while (rv == ENOENT);
2726 
2727 	vnetp->hio_mh = mh;
2728 
2729 	(void) snprintf(client_name, MAXNAMELEN, "vnet%d-%s", vnetp->instance,
2730 	    ifname);
2731 	rv = mac_client_open(mh, &mch, client_name, MAC_OPEN_FLAGS_EXCLUSIVE);
2732 	if (rv != 0) {
2733 		goto fail;
2734 	}
2735 	vnetp->hio_mch = mch;
2736 
2737 	rv = mac_unicast_add(mch, vnetp->curr_macaddr, mac_flags, &muh, 0,
2738 	    &diag);
2739 	if (rv != 0) {
2740 		goto fail;
2741 	}
2742 	vnetp->hio_muh = muh;
2743 
2744 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
2745 	macp->m_driver = vnetp;
2746 	macp->m_dip = NULL;
2747 	macp->m_src_addr = NULL;
2748 	macp->m_callbacks = &vnet_hio_res_callbacks;
2749 	macp->m_min_sdu = 0;
2750 	macp->m_max_sdu = ETHERMTU;
2751 
2752 	rv = vio_net_resource_reg(macp, VIO_NET_RES_HYBRID,
2753 	    vnetp->curr_macaddr, rem_addr, &vnetp->hio_vhp, &vcb);
2754 	if (rv != 0) {
2755 		goto fail;
2756 	}
2757 	mac_free(macp);
2758 
2759 	/* add the recv callback */
2760 	mac_rx_set(vnetp->hio_mch, vnet_hio_rx_cb, vnetp);
2761 
2762 	return (0);
2763 
2764 fail:
2765 	mac_free(macp);
2766 	vnet_hio_mac_cleanup(vnetp);
2767 	return (1);
2768 }
2769 
2770 void
2771 vnet_hio_mac_cleanup(vnet_t *vnetp)
2772 {
2773 	if (vnetp->hio_vhp != NULL) {
2774 		vio_net_resource_unreg(vnetp->hio_vhp);
2775 		vnetp->hio_vhp = NULL;
2776 	}
2777 
2778 	if (vnetp->hio_muh != NULL) {
2779 		(void) mac_unicast_remove(vnetp->hio_mch, vnetp->hio_muh);
2780 		vnetp->hio_muh = NULL;
2781 	}
2782 
2783 	if (vnetp->hio_mch != NULL) {
2784 		mac_client_close(vnetp->hio_mch, 0);
2785 		vnetp->hio_mch = NULL;
2786 	}
2787 
2788 	if (vnetp->hio_mh != NULL) {
2789 		mac_close(vnetp->hio_mh);
2790 		vnetp->hio_mh = NULL;
2791 	}
2792 }
2793 
2794 /* Bind pseudo rings to hwrings */
2795 static int
2796 vnet_bind_hwrings(vnet_t *vnetp)
2797 {
2798 	mac_ring_handle_t	hw_rh[VNET_NUM_HYBRID_RINGS];
2799 	mac_perim_handle_t	mph1;
2800 	vnet_pseudo_rx_group_t	*rx_grp;
2801 	vnet_pseudo_rx_ring_t	*rx_ringp;
2802 	vnet_pseudo_tx_group_t	*tx_grp;
2803 	vnet_pseudo_tx_ring_t	*tx_ringp;
2804 	int			hw_ring_cnt;
2805 	int			i;
2806 	int			rv;
2807 
2808 	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
2809 
2810 	/* Get the list of the underlying RX rings. */
2811 	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->rx_hwgh, hw_rh,
2812 	    MAC_RING_TYPE_RX);
2813 
2814 	/* We expect the the # of hw rx rings to match VNET_NUM_HYBRID_RINGS */
2815 	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
2816 		cmn_err(CE_WARN,
2817 		    "!vnet%d: vnet_bind_hwrings: bad rx hw_ring_cnt(%d)\n",
2818 		    vnetp->instance, hw_ring_cnt);
2819 		goto fail;
2820 	}
2821 
2822 	if (vnetp->rx_hwgh != NULL) {
2823 		/*
2824 		 * Quiesce the HW ring and the mac srs on the ring. Note
2825 		 * that the HW ring will be restarted when the pseudo ring
2826 		 * is started. At that time all the packets will be
2827 		 * directly passed up to the pseudo RX ring and handled
2828 		 * by mac srs created over the pseudo RX ring.
2829 		 */
2830 		mac_rx_client_quiesce(vnetp->hio_mch);
2831 		mac_srs_perm_quiesce(vnetp->hio_mch, B_TRUE);
2832 	}
2833 
2834 	/*
2835 	 * Bind the pseudo rings to the hwrings and start the hwrings.
2836 	 * Note we don't need to register these with the upper mac, as we have
2837 	 * statically exported these pseudo rxrings which are reserved for
2838 	 * rxrings of Hybrid resource.
2839 	 */
2840 	rx_grp = &vnetp->rx_grp[0];
2841 	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2842 		/* Pick the rxrings reserved for Hybrid resource */
2843 		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
2844 
2845 		/* Store the hw ring handle */
2846 		rx_ringp->hw_rh = hw_rh[i];
2847 
2848 		/* Bind the pseudo ring to the underlying hwring */
2849 		mac_hwring_setup(rx_ringp->hw_rh,
2850 		    (mac_resource_handle_t)rx_ringp, NULL);
2851 
2852 		/* Start the hwring if needed */
2853 		if (rx_ringp->state & VNET_RXRING_STARTED) {
2854 			rv = mac_hwring_start(rx_ringp->hw_rh);
2855 			if (rv != 0) {
2856 				mac_hwring_teardown(rx_ringp->hw_rh);
2857 				rx_ringp->hw_rh = NULL;
2858 				goto fail;
2859 			}
2860 		}
2861 	}
2862 
2863 	/* Get the list of the underlying TX rings. */
2864 	hw_ring_cnt = mac_hwrings_get(vnetp->hio_mch, &vnetp->tx_hwgh, hw_rh,
2865 	    MAC_RING_TYPE_TX);
2866 
2867 	/* We expect the # of hw tx rings to match VNET_NUM_HYBRID_RINGS */
2868 	if (hw_ring_cnt != VNET_NUM_HYBRID_RINGS) {
2869 		cmn_err(CE_WARN,
2870 		    "!vnet%d: vnet_bind_hwrings: bad tx hw_ring_cnt(%d)\n",
2871 		    vnetp->instance, hw_ring_cnt);
2872 		goto fail;
2873 	}
2874 
2875 	/*
2876 	 * Now map the pseudo txrings to the hw txrings. Note we don't need
2877 	 * to register these with the upper mac, as we have statically exported
2878 	 * these rings. Note that these rings will continue to be used for LDC
2879 	 * resources to peer vnets and vswitch (shared ring).
2880 	 */
2881 	tx_grp = &vnetp->tx_grp[0];
2882 	for (i = 0; i < tx_grp->ring_cnt; i++) {
2883 		tx_ringp = &tx_grp->rings[i];
2884 		tx_ringp->hw_rh = hw_rh[i];
2885 		tx_ringp->state |= VNET_TXRING_HYBRID;
2886 	}
2887 	tx_grp->tx_notify_handle =
2888 	    mac_client_tx_notify(vnetp->hio_mch, vnet_tx_ring_update, vnetp);
2889 
2890 	mac_perim_exit(mph1);
2891 	return (0);
2892 
2893 fail:
2894 	mac_perim_exit(mph1);
2895 	vnet_unbind_hwrings(vnetp);
2896 	return (1);
2897 }
2898 
2899 /* Unbind pseudo rings from hwrings */
2900 static void
2901 vnet_unbind_hwrings(vnet_t *vnetp)
2902 {
2903 	mac_perim_handle_t	mph1;
2904 	vnet_pseudo_rx_ring_t	*rx_ringp;
2905 	vnet_pseudo_rx_group_t	*rx_grp;
2906 	vnet_pseudo_tx_group_t	*tx_grp;
2907 	vnet_pseudo_tx_ring_t	*tx_ringp;
2908 	int			i;
2909 
2910 	mac_perim_enter_by_mh(vnetp->hio_mh, &mph1);
2911 
2912 	tx_grp = &vnetp->tx_grp[0];
2913 	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2914 		tx_ringp = &tx_grp->rings[i];
2915 		if (tx_ringp->state & VNET_TXRING_HYBRID) {
2916 			tx_ringp->state &= ~VNET_TXRING_HYBRID;
2917 			tx_ringp->hw_rh = NULL;
2918 		}
2919 	}
2920 	(void) mac_client_tx_notify(vnetp->hio_mch, NULL,
2921 	    tx_grp->tx_notify_handle);
2922 
2923 	rx_grp = &vnetp->rx_grp[0];
2924 	for (i = 0; i < VNET_NUM_HYBRID_RINGS; i++) {
2925 		rx_ringp = &rx_grp->rings[i + VNET_HYBRID_RXRING_INDEX];
2926 		if (rx_ringp->hw_rh != NULL) {
2927 			/* Stop the hwring */
2928 			mac_hwring_stop(rx_ringp->hw_rh);
2929 
2930 			/* Teardown the hwring */
2931 			mac_hwring_teardown(rx_ringp->hw_rh);
2932 			rx_ringp->hw_rh = NULL;
2933 		}
2934 	}
2935 
2936 	if (vnetp->rx_hwgh != NULL) {
2937 		vnetp->rx_hwgh = NULL;
2938 		/*
2939 		 * First clear the permanent-quiesced flag of the RX srs then
2940 		 * restart the HW ring and the mac srs on the ring.
2941 		 */
2942 		mac_srs_perm_quiesce(vnetp->hio_mch, B_FALSE);
2943 		mac_rx_client_restart(vnetp->hio_mch);
2944 	}
2945 
2946 	mac_perim_exit(mph1);
2947 }
2948 
2949 /* Bind pseudo ring to a LDC resource */
2950 static int
2951 vnet_bind_vgenring(vnet_res_t *vresp)
2952 {
2953 	vnet_t			*vnetp;
2954 	vnet_pseudo_rx_group_t	*rx_grp;
2955 	vnet_pseudo_rx_ring_t	*rx_ringp;
2956 	mac_perim_handle_t	mph1;
2957 	int			rv;
2958 	int			type;
2959 
2960 	vnetp = vresp->vnetp;
2961 	type = vresp->type;
2962 	rx_grp = &vnetp->rx_grp[0];
2963 
2964 	if (type == VIO_NET_RES_LDC_SERVICE) {
2965 		/*
2966 		 * Ring Index 0 is the default ring in the group and is
2967 		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
2968 		 * is allocated statically and is reported to the mac layer
2969 		 * in vnet_m_capab(). So, all we need to do here, is save a
2970 		 * reference to the associated vresp.
2971 		 */
2972 		rx_ringp = &rx_grp->rings[0];
2973 		rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
2974 		vresp->rx_ringp = (void *)rx_ringp;
2975 		return (0);
2976 	}
2977 	ASSERT(type == VIO_NET_RES_LDC_GUEST);
2978 
2979 	mac_perim_enter_by_mh(vnetp->mh, &mph1);
2980 
2981 	rx_ringp = vnet_alloc_pseudo_rx_ring(vnetp);
2982 	if (rx_ringp == NULL) {
2983 		cmn_err(CE_WARN, "!vnet%d: Failed to allocate pseudo rx ring",
2984 		    vnetp->instance);
2985 		goto fail;
2986 	}
2987 
2988 	/* Store the LDC resource itself as the ring handle */
2989 	rx_ringp->hw_rh = (mac_ring_handle_t)vresp;
2990 
2991 	/*
2992 	 * Save a reference to the ring in the resource for lookup during
2993 	 * unbind. Note this is only done for LDC resources. We don't need this
2994 	 * in the case of a Hybrid resource (see vnet_bind_hwrings()), as its
2995 	 * rx rings are mapped to reserved pseudo rx rings (index 1 and 2).
2996 	 */
2997 	vresp->rx_ringp = (void *)rx_ringp;
2998 	rx_ringp->state |= VNET_RXRING_LDC_GUEST;
2999 
3000 	/* Register the pseudo ring with upper-mac */
3001 	rv = mac_group_add_ring(rx_grp->handle, rx_ringp->index);
3002 	if (rv != 0) {
3003 		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
3004 		rx_ringp->hw_rh = NULL;
3005 		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
3006 		goto fail;
3007 	}
3008 
3009 	mac_perim_exit(mph1);
3010 	return (0);
3011 fail:
3012 	mac_perim_exit(mph1);
3013 	return (1);
3014 }
3015 
3016 /* Unbind pseudo ring from a LDC resource */
3017 static void
3018 vnet_unbind_vgenring(vnet_res_t *vresp)
3019 {
3020 	vnet_t			*vnetp;
3021 	vnet_pseudo_rx_group_t	*rx_grp;
3022 	vnet_pseudo_rx_ring_t	*rx_ringp;
3023 	mac_perim_handle_t	mph1;
3024 	int			type;
3025 
3026 	vnetp = vresp->vnetp;
3027 	type = vresp->type;
3028 	rx_grp = &vnetp->rx_grp[0];
3029 
3030 	if (vresp->rx_ringp == NULL) {
3031 		return;
3032 	}
3033 
3034 	if (type == VIO_NET_RES_LDC_SERVICE) {
3035 		/*
3036 		 * Ring Index 0 is the default ring in the group and is
3037 		 * reserved for LDC_SERVICE in vnet_ring_grp_init(). This ring
3038 		 * is allocated statically and is reported to the mac layer
3039 		 * in vnet_m_capab(). So, all we need to do here, is remove its
3040 		 * reference to the associated vresp.
3041 		 */
3042 		rx_ringp = &rx_grp->rings[0];
3043 		rx_ringp->hw_rh = NULL;
3044 		vresp->rx_ringp = NULL;
3045 		return;
3046 	}
3047 	ASSERT(type == VIO_NET_RES_LDC_GUEST);
3048 
3049 	mac_perim_enter_by_mh(vnetp->mh, &mph1);
3050 
3051 	rx_ringp = (vnet_pseudo_rx_ring_t *)vresp->rx_ringp;
3052 	vresp->rx_ringp = NULL;
3053 
3054 	if (rx_ringp != NULL && (rx_ringp->state & VNET_RXRING_LDC_GUEST)) {
3055 		/* Unregister the pseudo ring with upper-mac */
3056 		mac_group_rem_ring(rx_grp->handle, rx_ringp->handle);
3057 
3058 		rx_ringp->hw_rh = NULL;
3059 		rx_ringp->state &= ~VNET_RXRING_LDC_GUEST;
3060 
3061 		/* Free the pseudo rx ring */
3062 		vnet_free_pseudo_rx_ring(vnetp, rx_ringp);
3063 	}
3064 
3065 	mac_perim_exit(mph1);
3066 }
3067 
3068 static void
3069 vnet_unbind_rings(vnet_res_t *vresp)
3070 {
3071 	switch (vresp->type) {
3072 
3073 	case VIO_NET_RES_LDC_SERVICE:
3074 	case VIO_NET_RES_LDC_GUEST:
3075 		vnet_unbind_vgenring(vresp);
3076 		break;
3077 
3078 	case VIO_NET_RES_HYBRID:
3079 		vnet_unbind_hwrings(vresp->vnetp);
3080 		break;
3081 
3082 	default:
3083 		break;
3084 
3085 	}
3086 }
3087 
3088 static int
3089 vnet_bind_rings(vnet_res_t *vresp)
3090 {
3091 	int	rv;
3092 
3093 	switch (vresp->type) {
3094 
3095 	case VIO_NET_RES_LDC_SERVICE:
3096 	case VIO_NET_RES_LDC_GUEST:
3097 		rv = vnet_bind_vgenring(vresp);
3098 		break;
3099 
3100 	case VIO_NET_RES_HYBRID:
3101 		rv = vnet_bind_hwrings(vresp->vnetp);
3102 		break;
3103 
3104 	default:
3105 		rv = 1;
3106 		break;
3107 
3108 	}
3109 
3110 	return (rv);
3111 }
3112 
3113 /* ARGSUSED */
3114 int
3115 vnet_hio_stat(void *arg, uint_t stat, uint64_t *val)
3116 {
3117 	vnet_t	*vnetp = (vnet_t *)arg;
3118 
3119 	*val = mac_stat_get(vnetp->hio_mh, stat);
3120 	return (0);
3121 }
3122 
/*
 * The start() and stop() routines for the Hybrid resource are just dummy
 * functions. They exist to avoid resource type specific code in
 * vnet_start_resources() and vnet_stop_resources(). The actual starting and
 * stopping of the Hybrid resource happens in the context of the mac_client
 * interfaces that are invoked in vnet_hio_mac_init() and
 * vnet_hio_mac_cleanup().
 *
 * Dummy start routine; always returns 0.
 */
/* ARGSUSED */
static int
vnet_hio_start(void *arg)
{
	/* Nothing to start here */
	return (0);
}
3136 
/*
 * Dummy stop routine for the Hybrid resource; intentionally does nothing.
 */
/* ARGSUSED */
static void
vnet_hio_stop(void *arg)
{
	/* Nothing to stop here */
}
3142 
3143 mblk_t *
3144 vnet_hio_tx(void *arg, mblk_t *mp)
3145 {
3146 	vnet_pseudo_tx_ring_t	*tx_ringp;
3147 	mblk_t			*nextp;
3148 	mblk_t			*ret_mp;
3149 
3150 	tx_ringp = (vnet_pseudo_tx_ring_t *)arg;
3151 	for (;;) {
3152 		nextp = mp->b_next;
3153 		mp->b_next = NULL;
3154 
3155 		ret_mp = mac_hwring_tx(tx_ringp->hw_rh, mp);
3156 		if (ret_mp != NULL) {
3157 			ret_mp->b_next = nextp;
3158 			mp = ret_mp;
3159 			break;
3160 		}
3161 
3162 		if ((mp = nextp) == NULL)
3163 			break;
3164 	}
3165 	return (mp);
3166 }
3167 
3168 #ifdef	VNET_IOC_DEBUG
3169 
3170 /*
3171  * The ioctl entry point is used only for debugging for now. The ioctl commands
3172  * can be used to force the link state of the channel connected to vsw.
3173  */
3174 static void
3175 vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
3176 {
3177 	struct iocblk	*iocp;
3178 	vnet_t		*vnetp;
3179 
3180 	iocp = (struct iocblk *)(uintptr_t)mp->b_rptr;
3181 	iocp->ioc_error = 0;
3182 	vnetp = (vnet_t *)arg;
3183 
3184 	if (vnetp == NULL) {
3185 		miocnak(q, mp, 0, EINVAL);
3186 		return;
3187 	}
3188 
3189 	switch (iocp->ioc_cmd) {
3190 
3191 	case VNET_FORCE_LINK_DOWN:
3192 	case VNET_FORCE_LINK_UP:
3193 		vnet_force_link_state(vnetp, q, mp);
3194 		break;
3195 
3196 	default:
3197 		iocp->ioc_error = EINVAL;
3198 		miocnak(q, mp, 0, iocp->ioc_error);
3199 		break;
3200 
3201 	}
3202 }
3203 
3204 static void
3205 vnet_force_link_state(vnet_t *vnetp, queue_t *q, mblk_t *mp)
3206 {
3207 	mac_register_t	*macp;
3208 	mac_callbacks_t	*cbp;
3209 	vnet_res_t	*vresp;
3210 
3211 	READ_ENTER(&vnetp->vsw_fp_rw);
3212 
3213 	vresp = vnetp->vsw_fp;
3214 	if (vresp == NULL) {
3215 		RW_EXIT(&vnetp->vsw_fp_rw);
3216 		return;
3217 	}
3218 
3219 	macp = &vresp->macreg;
3220 	cbp = macp->m_callbacks;
3221 	cbp->mc_ioctl(macp->m_driver, q, mp);
3222 
3223 	RW_EXIT(&vnetp->vsw_fp_rw);
3224 }
3225 
3226 #else
3227 
3228 static void
3229 vnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
3230 {
3231 	vnet_t		*vnetp;
3232 
3233 	vnetp = (vnet_t *)arg;
3234 
3235 	if (vnetp == NULL) {
3236 		miocnak(q, mp, 0, EINVAL);
3237 		return;
3238 	}
3239 
3240 	/* ioctl support only for debugging */
3241 	miocnak(q, mp, 0, ENOTSUP);
3242 }
3243 
3244 #endif
3245