xref: /titanic_52/usr/src/uts/common/io/vnic/vnic_dev.c (revision 296a41153b384aff24610e491670659621352e81)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/conf.h>
31 #include <sys/cmn_err.h>
32 #include <sys/list.h>
33 #include <sys/ksynch.h>
34 #include <sys/kmem.h>
35 #include <sys/stream.h>
36 #include <sys/modctl.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/atomic.h>
40 #include <sys/stat.h>
41 #include <sys/modhash.h>
42 #include <sys/strsubr.h>
43 #include <sys/strsun.h>
44 #include <sys/dlpi.h>
45 #include <sys/mac.h>
46 #include <sys/mac_ether.h>
47 #include <sys/pattr.h>
48 #if 0
49 #include <sys/vlan.h>
50 #endif
51 #include <sys/vnic.h>
52 #include <sys/vnic_impl.h>
53 #include <sys/gld.h>
54 #include <inet/ip.h>
55 #include <inet/ip_impl.h>
56 
/*
 * Forward declarations. The vnic_m_*() routines are the entry points
 * referenced from the vnic_m_callbacks vector below; the remaining
 * prototypes are file-local helpers used before their definitions.
 */
static int vnic_m_start(void *);
static void vnic_m_stop(void *);
static int vnic_m_promisc(void *, boolean_t);
static int vnic_m_multicst(void *, boolean_t, const uint8_t *);
static int vnic_m_unicst(void *, const uint8_t *);
static int vnic_m_stat(void *, uint_t, uint64_t *);
static void vnic_m_resources(void *);
static mblk_t *vnic_m_tx(void *, mblk_t *);
static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *);
static void vnic_mac_free(vnic_mac_t *);
static uint_t vnic_info_walker(mod_hash_key_t, mod_hash_val_t *, void *);
static void vnic_notify_cb(void *, mac_notify_type_t);
static int vnic_modify_mac_addr(vnic_t *, uint_t, uchar_t *);
static mblk_t *vnic_active_tx(void *, mblk_t *);
static int vnic_promisc_set(vnic_t *, boolean_t);
72 
/* kmem caches from which vnic_t and vnic_mac_t objects are allocated */
static kmem_cache_t	*vnic_cache;
static kmem_cache_t	*vnic_mac_cache;
/*
 * vnic_lock is held as writer while vnic_hash and vnic_count are
 * modified (create/modify/delete). vnic_mac_lock is held around
 * vnic_mac_hash accesses and vnic_mac_t reference count changes.
 */
static krwlock_t	vnic_lock;
static kmutex_t		vnic_mac_lock;
/* number of VNICs currently present in vnic_hash */
static uint_t		vnic_count;

/* hash of VNICs (vnic_t's), keyed by VNIC id */
static mod_hash_t	*vnic_hash;
#define	VNIC_HASHSZ	64
#define	VNIC_HASH_KEY(vnic_id)	((mod_hash_key_t)(uintptr_t)vnic_id)

/*
 * Hash of underlying open MACs (vnic_mac_t's), keyed by the string
 * "<device name><instance number>/<port number>".
 */
static mod_hash_t	*vnic_mac_hash;
#define	VNIC_MAC_HASHSZ	64
90 
/*
 * Take a reference on an open lower MAC. The caller must hold
 * vnic_mac_lock. The macro expands to a brace-enclosed block, so it
 * must be used as a complete statement (not as the body of an if
 * that is followed by an else).
 */
#define	VNIC_MAC_REFHOLD(va) {			\
	ASSERT(MUTEX_HELD(&vnic_mac_lock));	\
	(va)->va_refs++;			\
	ASSERT((va)->va_refs != 0);		\
}

/*
 * Drop a reference on an open lower MAC. The caller must hold
 * vnic_mac_lock. When the last reference goes away the vnic_mac_t is
 * torn down through vnic_mac_free(), so the caller must not touch
 * (va) afterwards. Same statement-only usage restriction as above.
 */
#define	VNIC_MAC_REFRELE(va) {			\
	ASSERT(MUTEX_HELD(&vnic_mac_lock));	\
	ASSERT((va)->va_refs != 0);		\
	if (--((va)->va_refs) == 0)		\
		vnic_mac_free(va);		\
}
103 
/* all-ones Ethernet broadcast address, used to join the broadcast group */
static uchar_t vnic_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
105 
/*
 * State handed to vnic_info_walker() through its void * argument.
 * NOTE(review): field roles below are inferred from their names; the
 * walker itself is outside this view -- confirm against it.
 */
typedef struct vnic_info_state {
	uint32_t	vs_vnic_id;	/* presumably: VNIC id filter */
	char		vs_dev_name[MAXNAMELEN]; /* presumably: device filter */
	boolean_t	vs_vnic_found;
	vnic_info_new_vnic_fn_t	vs_new_vnic_fn;	/* per-VNIC callback */
	void		*vs_fn_arg;	/* argument for vs_new_vnic_fn */
	int		vs_rc;
} vnic_info_state_t;
115 
/* optional callbacks implemented by this driver */
#define	VNIC_M_CALLBACK_FLAGS	(MC_RESOURCES | MC_GETCAPAB)

/*
 * MAC callback vector for VNICs. It is attached to every VNIC's
 * registration (mac->m_callbacks) in vnic_dev_create().
 */
static mac_callbacks_t vnic_m_callbacks = {
	VNIC_M_CALLBACK_FLAGS,
	vnic_m_stat,
	vnic_m_start,
	vnic_m_stop,
	vnic_m_promisc,
	vnic_m_multicst,
	vnic_m_unicst,
	vnic_m_tx,
	vnic_m_resources,
	NULL,			/* m_ioctl */
	vnic_m_capab_get
};
131 
132 /* ARGSUSED */
133 static int
134 vnic_mac_ctor(void *buf, void *arg, int kmflag)
135 {
136 	vnic_mac_t *vnic_mac = buf;
137 
138 	bzero(vnic_mac, sizeof (vnic_mac_t));
139 	rw_init(&vnic_mac->va_bcast_grp_lock, NULL, RW_DRIVER, NULL);
140 	rw_init(&vnic_mac->va_promisc_lock, NULL, RW_DRIVER, NULL);
141 
142 	return (0);
143 }
144 
145 /* ARGSUSED */
146 static void
147 vnic_mac_dtor(void *buf, void *arg)
148 {
149 	vnic_mac_t *vnic_mac = buf;
150 
151 	rw_destroy(&vnic_mac->va_promisc_lock);
152 	rw_destroy(&vnic_mac->va_bcast_grp_lock);
153 }
154 
155 void
156 vnic_dev_init(void)
157 {
158 	vnic_cache = kmem_cache_create("vnic_cache",
159 	    sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
160 
161 	vnic_mac_cache = kmem_cache_create("vnic_mac_cache",
162 	    sizeof (vnic_mac_t), 0, vnic_mac_ctor, vnic_mac_dtor,
163 	    NULL, NULL, NULL, 0);
164 
165 	vnic_hash = mod_hash_create_idhash("vnic_hash",
166 	    VNIC_HASHSZ, mod_hash_null_valdtor);
167 
168 	vnic_mac_hash = mod_hash_create_strhash("vnic_mac_hash",
169 	    VNIC_MAC_HASHSZ, mod_hash_null_valdtor);
170 
171 	rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL);
172 
173 	mutex_init(&vnic_mac_lock, NULL, MUTEX_DEFAULT, NULL);
174 
175 	vnic_count = 0;
176 }
177 
178 void
179 vnic_dev_fini(void)
180 {
181 	ASSERT(vnic_count == 0);
182 
183 	mutex_destroy(&vnic_mac_lock);
184 	rw_destroy(&vnic_lock);
185 	mod_hash_destroy_strhash(vnic_mac_hash);
186 	mod_hash_destroy_idhash(vnic_hash);
187 	kmem_cache_destroy(vnic_mac_cache);
188 	kmem_cache_destroy(vnic_cache);
189 }
190 
/*
 * Return the current number of VNICs. The counter is read without
 * taking vnic_lock, so the value is only a snapshot.
 */
uint_t
vnic_dev_count(void)
{
	return (vnic_count);
}
196 
197 static int
198 vnic_mac_open(const char *dev_name, vnic_mac_t **vmp)
199 {
200 	char *str_key;
201 	int err;
202 	vnic_mac_t *vnic_mac = NULL;
203 	char driver[MAXNAMELEN];
204 	uint_t ddi_instance;
205 	const mac_info_t *mip;
206 
207 	*vmp = NULL;
208 
209 	if (ddi_parse(dev_name, driver, &ddi_instance) != DDI_SUCCESS)
210 		return (EINVAL);
211 
212 	mutex_enter(&vnic_mac_lock);
213 
214 	err = mod_hash_find(vnic_mac_hash, (mod_hash_key_t)dev_name,
215 	    (mod_hash_val_t *)&vnic_mac);
216 	if (err == 0) {
217 		/* this MAC is already opened, increment reference count */
218 		VNIC_MAC_REFHOLD(vnic_mac);
219 		mutex_exit(&vnic_mac_lock);
220 		*vmp = vnic_mac;
221 		return (0);
222 	}
223 
224 	vnic_mac = kmem_cache_alloc(vnic_mac_cache, KM_SLEEP);
225 
226 	if ((err = mac_open(dev_name, ddi_instance, &vnic_mac->va_mh)) != 0) {
227 		vnic_mac->va_mh = NULL;
228 		goto bail;
229 	}
230 
231 	/* only ethernet support, for now */
232 	mip = mac_info(vnic_mac->va_mh);
233 	if (mip->mi_media != DL_ETHER) {
234 		err = ENOTSUP;
235 		goto bail;
236 	}
237 	if (mip->mi_media != mip->mi_nativemedia) {
238 		err = ENOTSUP;
239 		goto bail;
240 	}
241 
242 	(void) strcpy(vnic_mac->va_dev_name, dev_name);
243 
244 	/* add entry to hash table */
245 	str_key = kmem_alloc(strlen(dev_name) + 1, KM_SLEEP);
246 	(void) strcpy(str_key, dev_name);
247 	err = mod_hash_insert(vnic_mac_hash, (mod_hash_key_t)str_key,
248 	    (mod_hash_val_t)vnic_mac);
249 	ASSERT(err == 0);
250 
251 	/* initialize the flow table associated with lower MAC */
252 	vnic_mac->va_addr_len = ETHERADDRL;
253 	(void) vnic_classifier_flow_tab_init(vnic_mac, vnic_mac->va_addr_len,
254 	    KM_SLEEP);
255 
256 	vnic_mac->va_txinfo = mac_vnic_tx_get(vnic_mac->va_mh);
257 	vnic_mac->va_notify_hdl = mac_notify_add(vnic_mac->va_mh,
258 	    vnic_notify_cb, vnic_mac);
259 
260 	VNIC_MAC_REFHOLD(vnic_mac);
261 	*vmp = vnic_mac;
262 	mutex_exit(&vnic_mac_lock);
263 	return (0);
264 
265 bail:
266 	if (vnic_mac != NULL) {
267 		if (vnic_mac->va_mh != NULL)
268 			mac_close(vnic_mac->va_mh);
269 		kmem_cache_free(vnic_mac_cache, vnic_mac);
270 	}
271 	mutex_exit(&vnic_mac_lock);
272 	return (err);
273 }
274 
275 /*
276  * Create a new flow for the active MAC client sharing the NIC
277  * with the VNICs. This allows the unicast packets for that NIC
278  * to be classified and passed up to the active MAC client. It
279  * also allows packets sent from a VNIC to the active link to
280  * be classified by the VNIC transmit function and delivered via
281  * the MAC module locally. Returns B_TRUE on success, B_FALSE on
282  * failure.
283  */
284 static int
285 vnic_init_active_rx(vnic_mac_t *vnic_mac)
286 {
287 	uchar_t nic_mac_addr[MAXMACADDRLEN];
288 
289 	if (vnic_mac->va_active_flow != NULL)
290 		return (B_TRUE);
291 
292 	mac_unicst_get(vnic_mac->va_mh, nic_mac_addr);
293 
294 	vnic_mac->va_active_flow = vnic_classifier_flow_create(
295 	    vnic_mac->va_addr_len, nic_mac_addr, NULL, B_TRUE, KM_SLEEP);
296 
297 	vnic_classifier_flow_add(vnic_mac, vnic_mac->va_active_flow,
298 	    (vnic_rx_fn_t)mac_active_rx, vnic_mac->va_mh, NULL);
299 	return (B_TRUE);
300 }
301 
302 static void
303 vnic_fini_active_rx(vnic_mac_t *vnic_mac)
304 {
305 	if (vnic_mac->va_active_flow == NULL)
306 		return;
307 
308 	vnic_classifier_flow_remove(vnic_mac, vnic_mac->va_active_flow);
309 	vnic_classifier_flow_destroy(vnic_mac->va_active_flow);
310 	vnic_mac->va_active_flow = NULL;
311 }
312 
313 static void
314 vnic_update_active_rx(vnic_mac_t *vnic_mac)
315 {
316 	if (vnic_mac->va_active_flow == NULL)
317 		return;
318 
319 	vnic_fini_active_rx(vnic_mac);
320 	(void) vnic_init_active_rx(vnic_mac);
321 }
322 
323 /*
324  * Copy an mblk, preserving its hardware checksum flags.
325  */
326 mblk_t *
327 vnic_copymsg_cksum(mblk_t *mp)
328 {
329 	mblk_t *mp1;
330 	uint32_t start, stuff, end, value, flags;
331 
332 	mp1 = copymsg(mp);
333 	if (mp1 == NULL)
334 		return (NULL);
335 
336 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
337 	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
338 	    flags, KM_NOSLEEP);
339 
340 	return (mp1);
341 }
342 
343 /*
344  * Copy an mblk chain, presenting the hardware checksum flags of the
345  * individual mblks.
346  */
347 mblk_t *
348 vnic_copymsgchain_cksum(mblk_t *mp)
349 {
350 	mblk_t *nmp = NULL;
351 	mblk_t **nmpp = &nmp;
352 
353 	for (; mp != NULL; mp = mp->b_next) {
354 		if ((*nmpp = vnic_copymsg_cksum(mp)) == NULL) {
355 			freemsgchain(nmp);
356 			return (NULL);
357 		}
358 
359 		nmpp = &((*nmpp)->b_next);
360 	}
361 
362 	return (nmp);
363 }
364 
365 
366 /*
367  * Process the specified mblk chain for proper handling of hardware
368  * checksum offload. This routine is invoked for loopback VNIC traffic.
369  * The function handles a NULL mblk chain passed as argument.
370  */
371 mblk_t *
372 vnic_fix_cksum(mblk_t *mp_chain)
373 {
374 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
375 	uint32_t flags, start, stuff, end, value;
376 
377 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
378 		uint16_t len;
379 		uint32_t offset;
380 		struct ether_header *ehp;
381 		uint16_t sap;
382 
383 		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
384 		    &flags);
385 		if (flags == 0)
386 			continue;
387 
388 		/*
389 		 * Since the processing of checksum offload for loopback
390 		 * traffic requires modification of the packet contents,
391 		 * ensure sure that we are always modifying our own copy.
392 		 */
393 		if (DB_REF(mp) > 1) {
394 			mp1 = copymsg(mp);
395 			if (mp1 == NULL)
396 				continue;
397 			mp1->b_next = mp->b_next;
398 			mp->b_next = NULL;
399 			freemsg(mp);
400 			if (prev != NULL)
401 				prev->b_next = mp1;
402 			else
403 				new_chain = mp1;
404 			mp = mp1;
405 		}
406 
407 		/*
408 		 * Ethernet, and optionally VLAN header.
409 		 */
410 		/*LINTED*/
411 		ehp = (struct ether_header *)mp->b_rptr;
412 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
413 			struct ether_vlan_header *evhp;
414 
415 			ASSERT(MBLKL(mp) >=
416 			    sizeof (struct ether_vlan_header));
417 			/*LINTED*/
418 			evhp = (struct ether_vlan_header *)mp->b_rptr;
419 			sap = ntohs(evhp->ether_type);
420 			offset = sizeof (struct ether_vlan_header);
421 		} else {
422 			sap = ntohs(ehp->ether_type);
423 			offset = sizeof (struct ether_header);
424 		}
425 
426 		if (MBLKL(mp) <= offset) {
427 			offset -= MBLKL(mp);
428 			if (mp->b_cont == NULL) {
429 				/* corrupted packet, skip it */
430 				if (prev != NULL)
431 					prev->b_next = mp->b_next;
432 				else
433 					new_chain = mp->b_next;
434 				mp1 = mp->b_next;
435 				mp->b_next = NULL;
436 				freemsg(mp);
437 				mp = mp1;
438 				continue;
439 			}
440 			mp = mp->b_cont;
441 		}
442 
443 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
444 			ipha_t *ipha = NULL;
445 
446 			/*
447 			 * In order to compute the full and header
448 			 * checksums, we need to find and parse
449 			 * the IP and/or ULP headers.
450 			 */
451 
452 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
453 
454 			/*
455 			 * IP header.
456 			 */
457 			if (sap != ETHERTYPE_IP)
458 				continue;
459 
460 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
461 			/*LINTED*/
462 			ipha = (ipha_t *)(mp->b_rptr + offset);
463 
464 			if (flags & HCK_FULLCKSUM) {
465 				ipaddr_t src, dst;
466 				uint32_t cksum;
467 				uint16_t *up;
468 				uint8_t proto;
469 
470 				/*
471 				 * Pointer to checksum field in ULP header.
472 				 */
473 				proto = ipha->ipha_protocol;
474 				ASSERT(ipha->ipha_version_and_hdr_length ==
475 				    IP_SIMPLE_HDR_VERSION);
476 				if (proto == IPPROTO_TCP) {
477 					/*LINTED*/
478 					up = IPH_TCPH_CHECKSUMP(ipha,
479 					    IP_SIMPLE_HDR_LENGTH);
480 				} else {
481 					ASSERT(proto == IPPROTO_UDP);
482 					/*LINTED*/
483 					up = IPH_UDPH_CHECKSUMP(ipha,
484 					    IP_SIMPLE_HDR_LENGTH);
485 				}
486 
487 				/*
488 				 * Pseudo-header checksum.
489 				 */
490 				src = ipha->ipha_src;
491 				dst = ipha->ipha_dst;
492 				len = ntohs(ipha->ipha_length) -
493 				    IP_SIMPLE_HDR_LENGTH;
494 
495 				cksum = (dst >> 16) + (dst & 0xFFFF) +
496 				    (src >> 16) + (src & 0xFFFF);
497 				cksum += htons(len);
498 
499 				/*
500 				 * The checksum value stored in the packet needs
501 				 * to be correct. Compute it here.
502 				 */
503 				*up = 0;
504 				cksum += (((proto) == IPPROTO_UDP) ?
505 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
506 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
507 				    offset, cksum);
508 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
509 
510 				flags |= HCK_FULLCKSUM_OK;
511 				value = 0xffff;
512 			}
513 
514 			if (flags & HCK_IPV4_HDRCKSUM) {
515 				ASSERT(ipha != NULL);
516 				ipha->ipha_hdr_checksum =
517 				    (uint16_t)ip_csum_hdr(ipha);
518 			}
519 		}
520 
521 		if (flags & HCK_PARTIALCKSUM) {
522 			uint16_t *up, partial, cksum;
523 			uchar_t *ipp; /* ptr to beginning of IP header */
524 
525 			if (mp->b_cont != NULL) {
526 				mblk_t *mp1;
527 
528 				mp1 = msgpullup(mp, offset + end);
529 				if (mp1 == NULL)
530 					continue;
531 				mp1->b_next = mp->b_next;
532 				mp->b_next = NULL;
533 				freemsg(mp);
534 				if (prev != NULL)
535 					prev->b_next = mp1;
536 				else
537 					new_chain = mp1;
538 				mp = mp1;
539 			}
540 
541 			ipp = mp->b_rptr + offset;
542 			/*LINTED*/
543 			up = (uint16_t *)((uchar_t *)ipp + stuff);
544 			partial = *up;
545 			*up = 0;
546 
547 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
548 			    end - start, partial);
549 			cksum = ~cksum;
550 			*up = cksum ? cksum : ~cksum;
551 
552 			/*
553 			 * Since we already computed the whole checksum,
554 			 * indicate to the stack that it has already
555 			 * been verified by the hardware.
556 			 */
557 			flags &= ~HCK_PARTIALCKSUM;
558 			flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
559 			value = 0xffff;
560 		}
561 
562 		(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
563 		    value, flags, KM_NOSLEEP);
564 	}
565 
566 	return (new_chain);
567 }
568 
/*
 * Drop one reference on a lower MAC obtained from vnic_mac_open().
 * The reference is released under vnic_mac_lock; releasing the last
 * one frees the vnic_mac_t (see VNIC_MAC_REFRELE), so the caller must
 * not use vnic_mac after this returns.
 */
static void
vnic_mac_close(vnic_mac_t *vnic_mac)
{
	mutex_enter(&vnic_mac_lock);
	VNIC_MAC_REFRELE(vnic_mac);
	mutex_exit(&vnic_mac_lock);
}
576 
577 static void
578 vnic_mac_free(vnic_mac_t *vnic_mac)
579 {
580 	mod_hash_val_t val;
581 
582 	ASSERT(MUTEX_HELD(&vnic_mac_lock));
583 	vnic_fini_active_rx(vnic_mac);
584 	mac_notify_remove(vnic_mac->va_mh, vnic_mac->va_notify_hdl);
585 	if (vnic_mac->va_mac_set) {
586 		vnic_mac->va_mac_set = B_FALSE;
587 		mac_vnic_clear(vnic_mac->va_mh);
588 	}
589 	vnic_classifier_flow_tab_fini(vnic_mac);
590 	mac_close(vnic_mac->va_mh);
591 
592 	(void) mod_hash_remove(vnic_mac_hash,
593 	    (mod_hash_key_t)vnic_mac->va_dev_name, &val);
594 	ASSERT(vnic_mac == (vnic_mac_t *)val);
595 
596 	kmem_cache_free(vnic_mac_cache, vnic_mac);
597 }
598 
599 /*
600  * Initial VNIC receive routine. Invoked for packets that are steered
601  * to a VNIC but the VNIC has not been started yet.
602  */
603 /* ARGSUSED */
604 static void
605 vnic_rx_initial(void *arg1, void *arg2, mblk_t *mp_chain)
606 {
607 	vnic_t *vnic = arg1;
608 	mblk_t *mp;
609 
610 	/* update stats */
611 	for (mp = mp_chain; mp != NULL; mp = mp->b_next)
612 		vnic->vn_stat_ierrors++;
613 	freemsgchain(mp_chain);
614 }
615 
616 /*
617  * VNIC receive routine invoked after the classifier for the VNIC
618  * has been initialized and the VNIC has been started.
619  */
620 /* ARGSUSED */
621 void
622 vnic_rx(void *arg1, void *arg2, mblk_t *mp_chain)
623 {
624 	vnic_t *vnic = arg1;
625 	mblk_t *mp;
626 
627 	/* update stats */
628 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
629 		vnic->vn_stat_ipackets++;
630 		vnic->vn_stat_rbytes += msgdsize(mp);
631 	}
632 
633 	/* pass packet up */
634 	mac_rx(vnic->vn_mh, NULL, mp_chain);
635 }
636 
637 /*
638  * Routine to create a MAC-based VNIC. Adds the passed MAC address
639  * to an unused slot in the NIC if one is available. Otherwise it
640  * sets the NIC in promiscuous mode and assigns the MAC address to
641  * a Rx ring if available or a soft ring.
642  */
643 static int
644 vnic_add_unicstaddr(vnic_t *vnic, mac_multi_addr_t *maddr)
645 {
646 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
647 	int err;
648 
649 	if (mac_unicst_verify(vnic_mac->va_mh, maddr->mma_addr,
650 	    maddr->mma_addrlen) == B_FALSE)
651 		return (EINVAL);
652 
653 	if (mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_MULTIADDRESS,
654 	    &(vnic->vn_mma_capab))) {
655 		if (vnic->vn_maddr_naddrfree == 0) {
656 			/*
657 			 * No free address slots available.
658 			 * Enable promiscuous mode.
659 			 */
660 			goto set_promisc;
661 		}
662 
663 		err = vnic->vn_maddr_add(vnic->vn_maddr_handle, maddr);
664 		if (err != 0) {
665 			if (err == ENOSPC) {
666 				/*
667 				 * There was a race to add addresses
668 				 * with other multiple address consumers,
669 				 * and we lost out. Use promisc mode.
670 				 */
671 				goto set_promisc;
672 			}
673 
674 			return (err);
675 		}
676 
677 		vnic->vn_slot_id = maddr->mma_slot;
678 		vnic->vn_multi_mac = B_TRUE;
679 	} else {
680 		/*
681 		 * Either multiple MAC address support is not
682 		 * available or all available addresses have
683 		 * been used up.
684 		 */
685 	set_promisc:
686 		err = mac_promisc_set(vnic_mac->va_mh, B_TRUE, MAC_DEVPROMISC);
687 		if (err != 0) {
688 			return (err);
689 		}
690 
691 		vnic->vn_promisc_mac = B_TRUE;
692 	}
693 	return (err);
694 }
695 
696 /*
697  * VNIC is getting deleted. Remove the MAC address from the slot.
698  * If promiscuous mode was being used, then unset the promiscuous mode.
699  */
700 static int
701 vnic_remove_unicstaddr(vnic_t *vnic)
702 {
703 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
704 	int err;
705 
706 	if (vnic->vn_multi_mac) {
707 		ASSERT(vnic->vn_promisc_mac == B_FALSE);
708 		err = vnic->vn_maddr_remove(vnic->vn_maddr_handle,
709 		    vnic->vn_slot_id);
710 		vnic->vn_multi_mac = B_FALSE;
711 	}
712 
713 	if (vnic->vn_promisc_mac) {
714 		ASSERT(vnic->vn_multi_mac == B_FALSE);
715 		err = mac_promisc_set(vnic_mac->va_mh, B_FALSE, MAC_DEVPROMISC);
716 		vnic->vn_promisc_mac = B_FALSE;
717 	}
718 
719 	return (err);
720 }
721 
722 /*
723  * Create a new VNIC upon request from administrator.
724  * Returns 0 on success, an errno on failure.
725  */
726 int
727 vnic_dev_create(uint_t vnic_id, char *dev_name, int mac_len, uchar_t *mac_addr)
728 {
729 	vnic_t *vnic = NULL;
730 	mac_register_t *mac;
731 	int err;
732 	vnic_mac_t *vnic_mac;
733 	const mac_info_t *lower_mac_info;
734 	mac_multi_addr_t maddr;
735 	mac_txinfo_t tx_info;
736 
737 	if (mac_len != ETHERADDRL) {
738 		/* currently only ethernet NICs are supported */
739 		return (EINVAL);
740 	}
741 
742 	rw_enter(&vnic_lock, RW_WRITER);
743 
744 	/* does a VNIC with the same id already exist? */
745 	err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
746 	    (mod_hash_val_t *)&vnic);
747 	if (err == 0) {
748 		rw_exit(&vnic_lock);
749 		return (EEXIST);
750 	}
751 
752 	vnic = kmem_cache_alloc(vnic_cache, KM_NOSLEEP);
753 	if (vnic == NULL) {
754 		rw_exit(&vnic_lock);
755 		return (ENOMEM);
756 	}
757 
758 	/* open underlying MAC */
759 	err = vnic_mac_open(dev_name, &vnic_mac);
760 	if (err != 0) {
761 		kmem_cache_free(vnic_cache, vnic);
762 		rw_exit(&vnic_lock);
763 		return (err);
764 	}
765 
766 	bzero(vnic, sizeof (*vnic));
767 	vnic->vn_id = vnic_id;
768 	vnic->vn_vnic_mac = vnic_mac;
769 
770 	vnic->vn_started = B_FALSE;
771 	vnic->vn_promisc = B_FALSE;
772 	vnic->vn_multi_mac = B_FALSE;
773 	vnic->vn_bcast_grp = B_FALSE;
774 
775 	/* set the VNIC MAC address */
776 	maddr.mma_addrlen = mac_len;
777 	maddr.mma_slot = 0;
778 	maddr.mma_flags = 0;
779 	bcopy(mac_addr, maddr.mma_addr, mac_len);
780 	if ((err = vnic_add_unicstaddr(vnic, &maddr)) != 0)
781 		goto bail;
782 	bcopy(mac_addr, vnic->vn_addr, mac_len);
783 
784 	/* set the initial VNIC capabilities */
785 	if (!mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_HCKSUM,
786 	    &vnic->vn_hcksum_txflags))
787 		vnic->vn_hcksum_txflags = 0;
788 
789 	/* register with the MAC module */
790 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
791 		goto bail;
792 
793 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
794 	mac->m_driver = vnic;
795 	mac->m_dip = vnic_get_dip();
796 	mac->m_instance = vnic_id;
797 	mac->m_src_addr = vnic->vn_addr;
798 	mac->m_callbacks = &vnic_m_callbacks;
799 
800 	lower_mac_info = mac_info(vnic_mac->va_mh);
801 	mac->m_min_sdu = lower_mac_info->mi_sdu_min;
802 	mac->m_max_sdu = lower_mac_info->mi_sdu_max;
803 
804 	err = mac_register(mac, &vnic->vn_mh);
805 	mac_free(mac);
806 	if (err != 0)
807 		goto bail;
808 
809 	/* add new VNIC to hash table */
810 	err = mod_hash_insert(vnic_hash, VNIC_HASH_KEY(vnic_id),
811 	    (mod_hash_val_t)vnic);
812 	ASSERT(err == 0);
813 	vnic_count++;
814 
815 	rw_exit(&vnic_lock);
816 
817 	/* Create a flow, initialized with the MAC address of the VNIC */
818 	if ((vnic->vn_flow_ent = vnic_classifier_flow_create(mac_len, mac_addr,
819 	    NULL, B_FALSE, KM_SLEEP)) == NULL) {
820 		(void) vnic_dev_delete(vnic_id);
821 		vnic = NULL;
822 		err = ENOMEM;
823 		goto bail_unlocked;
824 	}
825 
826 	vnic_classifier_flow_add(vnic_mac, vnic->vn_flow_ent, vnic_rx_initial,
827 	    vnic, vnic);
828 
829 	/* setup VNIC to receive broadcast packets */
830 	err = vnic_bcast_add(vnic, vnic_brdcst_mac, MAC_ADDRTYPE_BROADCAST);
831 	if (err != 0) {
832 		(void) vnic_dev_delete(vnic_id);
833 		vnic = NULL;
834 		goto bail_unlocked;
835 	}
836 	vnic->vn_bcast_grp = B_TRUE;
837 
838 	mutex_enter(&vnic_mac_lock);
839 	if (!vnic_mac->va_mac_set) {
840 		/*
841 		 * We want to MAC layer to call the VNIC tx outbound
842 		 * routine, so that local broadcast packets sent by
843 		 * the active interface sharing the underlying NIC (if
844 		 * any), can be broadcast to every VNIC.
845 		 */
846 		tx_info.mt_fn = vnic_active_tx;
847 		tx_info.mt_arg = vnic_mac;
848 		if (!mac_vnic_set(vnic_mac->va_mh, &tx_info,
849 		    vnic_m_capab_get, vnic)) {
850 			mutex_exit(&vnic_mac_lock);
851 			(void) vnic_dev_delete(vnic_id);
852 			vnic = NULL;
853 			err = EBUSY;
854 			goto bail_unlocked;
855 		}
856 		vnic_mac->va_mac_set = B_TRUE;
857 	}
858 	mutex_exit(&vnic_mac_lock);
859 
860 	/* allow passing packets to NIC's active MAC client */
861 	if (!vnic_init_active_rx(vnic_mac)) {
862 		(void) vnic_dev_delete(vnic_id);
863 		vnic = NULL;
864 		err = ENOMEM;
865 		goto bail_unlocked;
866 	}
867 
868 	return (0);
869 
870 bail:
871 	(void) vnic_remove_unicstaddr(vnic);
872 	vnic_mac_close(vnic_mac);
873 	rw_exit(&vnic_lock);
874 
875 bail_unlocked:
876 	if (vnic != NULL) {
877 		kmem_cache_free(vnic_cache, vnic);
878 	}
879 
880 	return (err);
881 }
882 
883 /*
884  * Modify the properties of an existing VNIC.
885  */
886 /* ARGSUSED */
887 int
888 vnic_dev_modify(uint_t vnic_id, uint_t modify_mask,
889     vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr)
890 {
891 	vnic_t *vnic = NULL;
892 	int rv = 0;
893 	boolean_t notify_mac_addr = B_FALSE;
894 
895 	rw_enter(&vnic_lock, RW_WRITER);
896 
897 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
898 	    (mod_hash_val_t *)&vnic) != 0) {
899 		rw_exit(&vnic_lock);
900 		return (ENOENT);
901 	}
902 
903 	if (modify_mask & VNIC_IOC_MODIFY_ADDR) {
904 		rv = vnic_modify_mac_addr(vnic, mac_len, mac_addr);
905 		if (rv == 0)
906 			notify_mac_addr = B_TRUE;
907 	}
908 
909 	rw_exit(&vnic_lock);
910 
911 	if (notify_mac_addr)
912 		mac_unicst_update(vnic->vn_mh, mac_addr);
913 
914 	return (rv);
915 }
916 
917 int
918 vnic_dev_delete(uint_t vnic_id)
919 {
920 	vnic_t *vnic = NULL;
921 	mod_hash_val_t val;
922 	vnic_flow_t *flent;
923 	int rc;
924 	vnic_mac_t *vnic_mac;
925 
926 	rw_enter(&vnic_lock, RW_WRITER);
927 
928 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
929 	    (mod_hash_val_t *)&vnic) != 0) {
930 		rw_exit(&vnic_lock);
931 		return (ENOENT);
932 	}
933 
934 	/*
935 	 * We cannot unregister the MAC yet. Unregistering would
936 	 * free up mac_impl_t which should not happen at this time.
937 	 * Packets could be entering vnic_rx() through the
938 	 * flow entry and so mac_impl_t cannot be NULL. So disable
939 	 * mac_impl_t by calling mac_disable(). This will prevent any
940 	 * new claims on mac_impl_t.
941 	 */
942 	if (mac_disable(vnic->vn_mh) != 0) {
943 		rw_exit(&vnic_lock);
944 		return (EBUSY);
945 	}
946 
947 	(void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val);
948 	ASSERT(vnic == (vnic_t *)val);
949 
950 	if (vnic->vn_bcast_grp)
951 		(void) vnic_bcast_delete(vnic, vnic_brdcst_mac);
952 
953 	flent = vnic->vn_flow_ent;
954 	if (flent != NULL) {
955 		/*
956 		 * vnic_classifier_flow_destroy() ensures that the
957 		 * flow is no longer used.
958 		 */
959 		vnic_classifier_flow_remove(vnic->vn_vnic_mac, flent);
960 		vnic_classifier_flow_destroy(flent);
961 	}
962 
963 	rc = mac_unregister(vnic->vn_mh);
964 	ASSERT(rc == 0);
965 	(void) vnic_remove_unicstaddr(vnic);
966 	vnic_mac = vnic->vn_vnic_mac;
967 	kmem_cache_free(vnic_cache, vnic);
968 	vnic_count--;
969 	rw_exit(&vnic_lock);
970 	vnic_mac_close(vnic_mac);
971 	return (0);
972 }
973 
974 /*
975  * For the specified packet chain, return a sub-chain to be sent
976  * and the transmit function to be used to send the packet. Also
977  * return a pointer to the sub-chain of packets that should
978  * be re-classified. If the function returns NULL, the packet
979  * should be sent using the underlying NIC.
980  */
981 static vnic_flow_t *
982 vnic_classify(vnic_mac_t *vnic_mac, mblk_t *mp, mblk_t **mp_chain_rest)
983 {
984 	vnic_flow_t *flow_ent;
985 
986 	/* one packet at a time */
987 	*mp_chain_rest = mp->b_next;
988 	mp->b_next = NULL;
989 
990 	/* do classification on the packet */
991 	flow_ent = vnic_classifier_get_flow(vnic_mac, mp);
992 
993 	return (flow_ent);
994 }
995 
996 /*
997  * Send a packet chain to a local VNIC or an active MAC client.
998  */
999 static void
1000 vnic_local_tx(vnic_mac_t *vnic_mac, vnic_flow_t *flow_ent, mblk_t *mp_chain)
1001 {
1002 	mblk_t *mp1;
1003 	const vnic_flow_fn_info_t *fn_info;
1004 	vnic_t *vnic;
1005 
1006 	if (!vnic_classifier_is_active(flow_ent) &&
1007 	    mac_promisc_get(vnic_mac->va_mh, MAC_PROMISC)) {
1008 		/*
1009 		 * If the MAC is in promiscous mode,
1010 		 * send a copy of the active client.
1011 		 */
1012 		if ((mp1 = vnic_copymsgchain_cksum(mp_chain)) == NULL)
1013 			goto sendit;
1014 		if ((mp1 = vnic_fix_cksum(mp1)) == NULL)
1015 			goto sendit;
1016 		mac_active_rx(vnic_mac->va_mh, NULL, mp1);
1017 	}
1018 sendit:
1019 	fn_info = vnic_classifier_get_fn_info(flow_ent);
1020 	/*
1021 	 * If the vnic to which we would deliver this packet is in
1022 	 * promiscuous mode then it already received the packet via
1023 	 * vnic_promisc_rx().
1024 	 *
1025 	 * XXX assumes that ff_arg2 is a vnic_t pointer if it is
1026 	 * non-NULL (currently always true).
1027 	 */
1028 	vnic = (vnic_t *)fn_info->ff_arg2;
1029 	if ((vnic != NULL) && vnic->vn_promisc)
1030 		freemsg(mp_chain);
1031 	else if ((mp1 = vnic_fix_cksum(mp_chain)) != NULL)
1032 		(fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp1);
1033 }
1034 
1035 /*
1036  * This function is invoked when a MAC client needs to send a packet
1037  * to a NIC which is shared by VNICs. It is passed to the MAC layer
1038  * by a call to mac_vnic_set() when the NIC is opened, and is returned
1039  * to MAC clients by mac_tx_get() when VNICs are present.
1040  */
1041 mblk_t *
1042 vnic_active_tx(void *arg, mblk_t *mp_chain)
1043 {
1044 	vnic_mac_t *vnic_mac = arg;
1045 	mblk_t *mp, *extra_mp = NULL;
1046 	vnic_flow_t *flow_ent;
1047 	void *flow_cookie;
1048 	const mac_txinfo_t *mtp = vnic_mac->va_txinfo;
1049 
1050 	for (mp = mp_chain; mp != NULL; mp = extra_mp) {
1051 		mblk_t *next;
1052 
1053 		next = mp->b_next;
1054 		mp->b_next = NULL;
1055 
1056 		vnic_promisc_rx(vnic_mac, (vnic_t *)-1, mp);
1057 
1058 		flow_ent = vnic_classify(vnic_mac, mp, &extra_mp);
1059 		ASSERT(extra_mp == NULL);
1060 		extra_mp = next;
1061 
1062 		if (flow_ent != NULL) {
1063 			flow_cookie = vnic_classifier_get_client_cookie(
1064 			    flow_ent);
1065 			if (flow_cookie != NULL) {
1066 				/*
1067 				 * Send a copy to every VNIC defined on the
1068 				 * interface, as well as the underlying MAC.
1069 				 */
1070 				vnic_bcast_send(flow_cookie, (vnic_t *)-1, mp);
1071 			} else {
1072 				/*
1073 				 * loopback the packet to a local VNIC or
1074 				 * an active MAC client.
1075 				 */
1076 				vnic_local_tx(vnic_mac, flow_ent, mp);
1077 			}
1078 			VNIC_FLOW_REFRELE(flow_ent);
1079 			mp_chain = NULL;
1080 		} else {
1081 			/*
1082 			 * Non-VNIC destination, send via the underlying
1083 			 * NIC. In order to avoid a recursive call
1084 			 * to this function, we ensured that mtp points
1085 			 * to the unerlying NIC transmit function
1086 			 * by inilizating through mac_vnic_tx_get().
1087 			 */
1088 			mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
1089 			if (mp_chain != NULL)
1090 				break;
1091 		}
1092 	}
1093 
1094 	if ((mp_chain != NULL) && (extra_mp != NULL)) {
1095 		ASSERT(mp_chain->b_next == NULL);
1096 		mp_chain->b_next = extra_mp;
1097 	}
1098 	return (mp_chain);
1099 }
1100 
1101 /*
1102  * VNIC transmit function.
1103  */
1104 mblk_t *
1105 vnic_m_tx(void *arg, mblk_t *mp_chain)
1106 {
1107 	vnic_t *vnic = arg;
1108 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1109 	mblk_t *mp, *extra_mp = NULL;
1110 	vnic_flow_t *flow_ent;
1111 	void *flow_cookie;
1112 
1113 	/*
1114 	 * Update stats.
1115 	 */
1116 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
1117 		vnic->vn_stat_opackets++;
1118 		vnic->vn_stat_obytes += msgdsize(mp);
1119 	}
1120 
1121 	for (mp = mp_chain; mp != NULL; mp = extra_mp) {
1122 		mblk_t *next;
1123 
1124 		next = mp->b_next;
1125 		mp->b_next = NULL;
1126 
1127 		vnic_promisc_rx(vnic->vn_vnic_mac, vnic, mp);
1128 
1129 		flow_ent = vnic_classify(vnic->vn_vnic_mac, mp, &extra_mp);
1130 		ASSERT(extra_mp == NULL);
1131 		extra_mp = next;
1132 
1133 		if (flow_ent != NULL) {
1134 			flow_cookie = vnic_classifier_get_client_cookie(
1135 			    flow_ent);
1136 			if (flow_cookie != NULL) {
1137 				/*
1138 				 * The vnic_bcast_send function expects
1139 				 * to receive the sender VNIC as value
1140 				 * for arg2.
1141 				 */
1142 				vnic_bcast_send(flow_cookie, vnic, mp);
1143 			} else {
1144 				/*
1145 				 * loopback the packet to a local VNIC or
1146 				 * an active MAC client.
1147 				 */
1148 				vnic_local_tx(vnic_mac, flow_ent, mp);
1149 			}
1150 			VNIC_FLOW_REFRELE(flow_ent);
1151 			mp_chain = NULL;
1152 		} else {
1153 			/*
1154 			 * Non-local destination, send via the underlying
1155 			 * NIC.
1156 			 */
1157 			const mac_txinfo_t *mtp = vnic->vn_txinfo;
1158 			mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
1159 			if (mp_chain != NULL)
1160 				break;
1161 		}
1162 	}
1163 
1164 	/* update stats to account for unsent packets */
1165 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
1166 		vnic->vn_stat_opackets--;
1167 		vnic->vn_stat_obytes -= msgdsize(mp);
1168 		vnic->vn_stat_oerrors++;
1169 		/*
1170 		 * link back in the last portion not counted due to bandwidth
1171 		 * control.
1172 		 */
1173 		if (mp->b_next == NULL) {
1174 			mp->b_next = extra_mp;
1175 			break;
1176 		}
1177 	}
1178 
1179 	return (mp_chain);
1180 }
1181 
/*
 * Resource advertisement callback.  A VNIC has no resources to
 * advertise, so this is intentionally a no-op.
 */
/* ARGSUSED */
static void
vnic_m_resources(void *arg)
{
}
1188 
1189 static int
1190 vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
1191 {
1192 	vnic_t *vnic = arg;
1193 	int rval = 0;
1194 
1195 	rw_enter(&vnic_lock, RW_READER);
1196 
1197 	switch (stat) {
1198 	case ETHER_STAT_LINK_DUPLEX:
1199 		*val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
1200 		    ETHER_STAT_LINK_DUPLEX);
1201 		break;
1202 	case MAC_STAT_IFSPEED:
1203 		*val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
1204 		    MAC_STAT_IFSPEED);
1205 		break;
1206 	case MAC_STAT_MULTIRCV:
1207 		*val = vnic->vn_stat_multircv;
1208 		break;
1209 	case MAC_STAT_BRDCSTRCV:
1210 		*val = vnic->vn_stat_brdcstrcv;
1211 		break;
1212 	case MAC_STAT_MULTIXMT:
1213 		*val = vnic->vn_stat_multixmt;
1214 		break;
1215 	case MAC_STAT_BRDCSTXMT:
1216 		*val = vnic->vn_stat_brdcstxmt;
1217 		break;
1218 	case MAC_STAT_IERRORS:
1219 		*val = vnic->vn_stat_ierrors;
1220 		break;
1221 	case MAC_STAT_OERRORS:
1222 		*val = vnic->vn_stat_oerrors;
1223 		break;
1224 	case MAC_STAT_RBYTES:
1225 		*val = vnic->vn_stat_rbytes;
1226 		break;
1227 	case MAC_STAT_IPACKETS:
1228 		*val = vnic->vn_stat_ipackets;
1229 		break;
1230 	case MAC_STAT_OBYTES:
1231 		*val = vnic->vn_stat_obytes;
1232 		break;
1233 	case MAC_STAT_OPACKETS:
1234 		*val = vnic->vn_stat_opackets;
1235 		break;
1236 	default:
1237 		rval = ENOTSUP;
1238 	}
1239 
1240 	rw_exit(&vnic_lock);
1241 	return (rval);
1242 }
1243 
1244 /*
1245  * Return information about the specified capability.
1246  */
1247 /* ARGSUSED */
1248 static boolean_t
1249 vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
1250 {
1251 	vnic_t *vnic = arg;
1252 
1253 	switch (cap) {
1254 	case MAC_CAPAB_POLL:
1255 		return (B_TRUE);
1256 	case MAC_CAPAB_HCKSUM: {
1257 		uint32_t *hcksum_txflags = cap_data;
1258 
1259 		*hcksum_txflags = vnic->vn_hcksum_txflags &
1260 		    (HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM |
1261 		    HCKSUM_INET_PARTIAL);
1262 		break;
1263 	}
1264 	default:
1265 		return (B_FALSE);
1266 	}
1267 	return (B_TRUE);
1268 }
1269 
1270 static int
1271 vnic_m_start(void *arg)
1272 {
1273 	vnic_t *vnic = arg;
1274 	mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
1275 	int rc;
1276 
1277 	rc = mac_start(lower_mh);
1278 	if (rc != 0)
1279 		return (rc);
1280 
1281 	vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx, vnic, vnic);
1282 	return (0);
1283 }
1284 
1285 static void
1286 vnic_m_stop(void *arg)
1287 {
1288 	vnic_t *vnic = arg;
1289 	mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
1290 
1291 	vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx_initial,
1292 	    vnic, vnic);
1293 	mac_stop(lower_mh);
1294 }
1295 
1296 /* ARGSUSED */
1297 static int
1298 vnic_m_promisc(void *arg, boolean_t on)
1299 {
1300 	vnic_t *vnic = arg;
1301 
1302 	return (vnic_promisc_set(vnic, on));
1303 }
1304 
1305 static int
1306 vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
1307 {
1308 	vnic_t *vnic = arg;
1309 	int rc = 0;
1310 
1311 	if (add)
1312 		rc = vnic_bcast_add(vnic, addrp, MAC_ADDRTYPE_MULTICAST);
1313 	else
1314 		vnic_bcast_delete(vnic, addrp);
1315 
1316 	return (rc);
1317 }
1318 
1319 static int
1320 vnic_m_unicst(void *arg, const uint8_t *mac_addr)
1321 {
1322 	vnic_t *vnic = arg;
1323 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1324 	int rv;
1325 
1326 	rw_enter(&vnic_lock, RW_WRITER);
1327 	rv = vnic_modify_mac_addr(vnic, vnic_mac->va_addr_len,
1328 	    (uchar_t *)mac_addr);
1329 	rw_exit(&vnic_lock);
1330 
1331 	if (rv == 0)
1332 		mac_unicst_update(vnic->vn_mh, mac_addr);
1333 	return (0);
1334 }
1335 
1336 int
1337 vnic_info(uint_t *nvnics, uint32_t vnic_id, char *dev_name, void *fn_arg,
1338     vnic_info_new_vnic_fn_t new_vnic_fn)
1339 {
1340 	vnic_info_state_t state;
1341 	int rc = 0;
1342 
1343 	rw_enter(&vnic_lock, RW_READER);
1344 
1345 	*nvnics = vnic_count;
1346 
1347 	bzero(&state, sizeof (state));
1348 	state.vs_vnic_id = vnic_id;
1349 	bcopy(state.vs_dev_name, dev_name, MAXNAMELEN);
1350 	state.vs_new_vnic_fn = new_vnic_fn;
1351 	state.vs_fn_arg = fn_arg;
1352 
1353 	mod_hash_walk(vnic_hash, vnic_info_walker, &state);
1354 
1355 	if ((rc = state.vs_rc) == 0 && vnic_id != 0 &&
1356 	    state.vs_vnic_found)
1357 		rc = ENOENT;
1358 
1359 	rw_exit(&vnic_lock);
1360 	return (rc);
1361 }
1362 
1363 /*
1364  * Walker invoked when building a list of vnics that must be passed
1365  * up to user space.
1366  */
1367 /*ARGSUSED*/
1368 static uint_t
1369 vnic_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1370 {
1371 	vnic_t *vnic;
1372 	vnic_info_state_t *state = arg;
1373 
1374 	if (state->vs_rc != 0)
1375 		return (MH_WALK_TERMINATE);	/* terminate walk */
1376 
1377 	vnic = (vnic_t *)val;
1378 
1379 	if (state->vs_vnic_id != 0 && vnic->vn_id != state->vs_vnic_id)
1380 		goto bail;
1381 
1382 	state->vs_vnic_found = B_TRUE;
1383 
1384 	state->vs_rc = state->vs_new_vnic_fn(state->vs_fn_arg,
1385 	    vnic->vn_id, vnic->vn_addr_type, vnic->vn_vnic_mac->va_addr_len,
1386 	    vnic->vn_addr, vnic->vn_vnic_mac->va_dev_name);
1387 bail:
1388 	return ((state->vs_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
1389 }
1390 
1391 /*
1392  * vnic_notify_cb() and vnic_notify_walker() below are used to
1393  * process events received from an underlying NIC and, if needed,
1394  * forward these events to the VNICs defined on top of that NIC.
1395  */
1396 
typedef struct vnic_notify_state {
	/* notification type being forwarded to the VNICs */
	mac_notify_type_t	vo_type;
	/* only VNICs defined on top of this underlying MAC are notified */
	vnic_mac_t		*vo_vnic_mac;
} vnic_notify_state_t;
1401 
1402 /* ARGSUSED */
1403 static uint_t
1404 vnic_notify_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1405 {
1406 	vnic_t *vnic = (vnic_t *)val;
1407 	vnic_notify_state_t *state = arg;
1408 
1409 	/* ignore VNICs that don't use the specified underlying MAC */
1410 	if (vnic->vn_vnic_mac != state->vo_vnic_mac)
1411 		return (MH_WALK_CONTINUE);
1412 
1413 	switch (state->vo_type) {
1414 	case MAC_NOTE_TX:
1415 		mac_tx_update(vnic->vn_mh);
1416 		break;
1417 	case MAC_NOTE_LINK:
1418 		/*
1419 		 * The VNIC link state must be up regardless of
1420 		 * the link state of the underlying NIC to maintain
1421 		 * connectivity between VNICs on the same host.
1422 		 */
1423 		mac_link_update(vnic->vn_mh, LINK_STATE_UP);
1424 		break;
1425 	case MAC_NOTE_UNICST:
1426 		vnic_update_active_rx(vnic->vn_vnic_mac);
1427 		break;
1428 	case MAC_NOTE_VNIC:
1429 		/* only for clients which share a NIC with a VNIC */
1430 		break;
1431 	case MAC_NOTE_PROMISC:
1432 		mutex_enter(&vnic_mac_lock);
1433 		vnic->vn_vnic_mac->va_txinfo = mac_vnic_tx_get(
1434 		    vnic->vn_vnic_mac->va_mh);
1435 		mutex_exit(&vnic_mac_lock);
1436 		break;
1437 	}
1438 
1439 	return (MH_WALK_CONTINUE);
1440 }
1441 
1442 static void
1443 vnic_notify_cb(void *arg, mac_notify_type_t type)
1444 {
1445 	vnic_mac_t *vnic = arg;
1446 	vnic_notify_state_t state;
1447 
1448 	state.vo_type = type;
1449 	state.vo_vnic_mac = vnic;
1450 
1451 	rw_enter(&vnic_lock, RW_READER);
1452 	mod_hash_walk(vnic_hash, vnic_notify_walker, &state);
1453 	rw_exit(&vnic_lock);
1454 }
1455 
1456 static int
1457 vnic_modify_mac_addr(vnic_t *vnic, uint_t mac_len, uchar_t *mac_addr)
1458 {
1459 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1460 	vnic_flow_t *vnic_flow = vnic->vn_flow_ent;
1461 
1462 	ASSERT(RW_WRITE_HELD(&vnic_lock));
1463 
1464 	if (mac_len != vnic_mac->va_addr_len)
1465 		return (EINVAL);
1466 
1467 	vnic_classifier_flow_update_addr(vnic_flow, mac_addr);
1468 	return (0);
1469 }
1470 
1471 static int
1472 vnic_promisc_set(vnic_t *vnic, boolean_t on)
1473 {
1474 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1475 	int r = -1;
1476 
1477 	if (vnic->vn_promisc == on)
1478 		return (0);
1479 
1480 	if (on) {
1481 		r = mac_promisc_set(vnic_mac->va_mh, B_TRUE, MAC_DEVPROMISC);
1482 		if (r != 0)
1483 			return (r);
1484 
1485 		rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
1486 		vnic->vn_promisc_next = vnic_mac->va_promisc;
1487 		vnic_mac->va_promisc = vnic;
1488 		vnic_mac->va_promisc_gen++;
1489 
1490 		vnic->vn_promisc = B_TRUE;
1491 		rw_exit(&vnic_mac->va_promisc_lock);
1492 
1493 		return (0);
1494 	} else {
1495 		vnic_t *loop, *prev = NULL;
1496 
1497 		rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
1498 		loop = vnic_mac->va_promisc;
1499 
1500 		while ((loop != NULL) && (loop != vnic)) {
1501 			prev = loop;
1502 			loop = loop->vn_promisc_next;
1503 		}
1504 
1505 		if ((loop != NULL) &&
1506 		    ((r = mac_promisc_set(vnic_mac->va_mh, B_FALSE,
1507 		    MAC_DEVPROMISC)) == 0)) {
1508 			if (prev != NULL)
1509 				prev->vn_promisc_next = loop->vn_promisc_next;
1510 			else
1511 				vnic_mac->va_promisc = loop->vn_promisc_next;
1512 			vnic_mac->va_promisc_gen++;
1513 
1514 			vnic->vn_promisc = B_FALSE;
1515 		}
1516 		rw_exit(&vnic_mac->va_promisc_lock);
1517 
1518 		return (r);
1519 	}
1520 }
1521 
1522 void
1523 vnic_promisc_rx(vnic_mac_t *vnic_mac, vnic_t *sender, mblk_t *mp)
1524 {
1525 	vnic_t *loop;
1526 	vnic_flow_t *flow;
1527 	const vnic_flow_fn_info_t *fn_info;
1528 	mac_header_info_t hdr_info;
1529 	boolean_t dst_must_match = B_TRUE;
1530 
1531 	ASSERT(mp->b_next == NULL);
1532 
1533 	rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
1534 	if (vnic_mac->va_promisc == NULL)
1535 		goto done;
1536 
1537 	if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0)
1538 		goto done;
1539 
1540 	/*
1541 	 * If this is broadcast or multicast then the destination
1542 	 * address need not match for us to deliver it.
1543 	 */
1544 	if ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
1545 	    (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST))
1546 		dst_must_match = B_FALSE;
1547 
1548 	for (loop = vnic_mac->va_promisc;
1549 	    loop != NULL;
1550 	    loop = loop->vn_promisc_next) {
1551 		if (loop == sender)
1552 			continue;
1553 
1554 		if (dst_must_match &&
1555 		    (bcmp(hdr_info.mhi_daddr, loop->vn_addr,
1556 		    sizeof (loop->vn_addr)) != 0))
1557 			continue;
1558 
1559 		flow = loop->vn_flow_ent;
1560 		ASSERT(flow != NULL);
1561 
1562 		if (!flow->vf_is_active) {
1563 			mblk_t *copy;
1564 			uint64_t gen;
1565 
1566 			if ((copy = vnic_copymsg_cksum(mp)) == NULL)
1567 				break;
1568 			if ((sender != NULL) &&
1569 			    ((copy = vnic_fix_cksum(copy)) == NULL))
1570 				break;
1571 
1572 			VNIC_FLOW_REFHOLD(flow);
1573 			gen = vnic_mac->va_promisc_gen;
1574 			rw_exit(&vnic_mac->va_promisc_lock);
1575 
1576 			fn_info = vnic_classifier_get_fn_info(flow);
1577 			(fn_info->ff_fn)(fn_info->ff_arg1,
1578 			    fn_info->ff_arg2, copy);
1579 
1580 			VNIC_FLOW_REFRELE(flow);
1581 			rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
1582 			if (vnic_mac->va_promisc_gen != gen)
1583 				break;
1584 		}
1585 	}
1586 done:
1587 	rw_exit(&vnic_mac->va_promisc_lock);
1588 }
1589