xref: /titanic_41/usr/src/uts/common/io/vnic/vnic_dev.c (revision d62bc4badc1c1f1549c961cfb8b420e650e1272b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/conf.h>
31 #include <sys/cmn_err.h>
32 #include <sys/list.h>
33 #include <sys/ksynch.h>
34 #include <sys/kmem.h>
35 #include <sys/stream.h>
36 #include <sys/modctl.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/atomic.h>
40 #include <sys/stat.h>
41 #include <sys/modhash.h>
42 #include <sys/strsubr.h>
43 #include <sys/strsun.h>
44 #include <sys/dlpi.h>
45 #include <sys/mac.h>
46 #include <sys/mac_ether.h>
47 #include <sys/dls.h>
48 #include <sys/pattr.h>
49 #include <sys/vnic.h>
50 #include <sys/vnic_impl.h>
51 #include <sys/gld.h>
52 #include <inet/ip.h>
53 #include <inet/ip_impl.h>
54 
55 static int vnic_m_start(void *);
56 static void vnic_m_stop(void *);
57 static int vnic_m_promisc(void *, boolean_t);
58 static int vnic_m_multicst(void *, boolean_t, const uint8_t *);
59 static int vnic_m_unicst(void *, const uint8_t *);
60 static int vnic_m_stat(void *, uint_t, uint64_t *);
61 static void vnic_m_resources(void *);
62 static mblk_t *vnic_m_tx(void *, mblk_t *);
63 static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *);
64 static void vnic_mac_free(vnic_mac_t *);
65 static uint_t vnic_info_walker(mod_hash_key_t, mod_hash_val_t *, void *);
66 static void vnic_notify_cb(void *, mac_notify_type_t);
67 static int vnic_modify_mac_addr(vnic_t *, uint_t, uchar_t *);
68 static mblk_t *vnic_active_tx(void *, mblk_t *);
69 static int vnic_promisc_set(vnic_t *, boolean_t);
70 
71 static kmem_cache_t	*vnic_cache;
72 static kmem_cache_t	*vnic_mac_cache;
73 static krwlock_t	vnic_lock;
74 static kmutex_t		vnic_mac_lock;
75 static uint_t		vnic_count;
76 
77 /* hash of VNICs (vnic_t's), keyed by VNIC id */
78 static mod_hash_t	*vnic_hash;
79 #define	VNIC_HASHSZ	64
80 #define	VNIC_HASH_KEY(vnic_id)	((mod_hash_key_t)(uintptr_t)vnic_id)
81 
82 /*
83  * Hash of underlying open MACs (vnic_mac_t's), keyed by the datalink
84  * id (linkid) of the underlying link.
85  */
86 static mod_hash_t	*vnic_mac_hash;
87 #define	VNIC_MAC_HASHSZ	64
88 
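/*
 * Reference count helpers for vnic_mac_t entries. Both macros require
 * vnic_mac_lock to be held; dropping the last reference frees the entry
 * via vnic_mac_free().
 */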
89 #define	VNIC_MAC_REFHOLD(va) {			\
90 	ASSERT(MUTEX_HELD(&vnic_mac_lock));	\
91 	(va)->va_refs++;			\
92 	ASSERT((va)->va_refs != 0);		\
93 }
94 
95 #define	VNIC_MAC_REFRELE(va) {			\
96 	ASSERT(MUTEX_HELD(&vnic_mac_lock));	\
97 	ASSERT((va)->va_refs != 0);		\
98 	if (--((va)->va_refs) == 0)		\
99 		vnic_mac_free(va);		\
100 }
101 
102 static uchar_t vnic_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
103 
104 /* used by vnic_walker */
105 typedef struct vnic_info_state {
106 	datalink_id_t	vs_vnic_id;
107 	datalink_id_t	vs_linkid;
108 	boolean_t	vs_vnic_found;
109 	vnic_info_new_vnic_fn_t	vs_new_vnic_fn;
110 	void		*vs_fn_arg;
111 	int		vs_rc;
112 } vnic_info_state_t;
113 
114 #define	VNIC_M_CALLBACK_FLAGS	(MC_RESOURCES | MC_GETCAPAB)
115 
116 static mac_callbacks_t vnic_m_callbacks = {
117 	VNIC_M_CALLBACK_FLAGS,
118 	vnic_m_stat,
119 	vnic_m_start,
120 	vnic_m_stop,
121 	vnic_m_promisc,
122 	vnic_m_multicst,
123 	vnic_m_unicst,
124 	vnic_m_tx,
125 	vnic_m_resources,
126 	NULL,			/* m_ioctl */
127 	vnic_m_capab_get
128 };
129 
130 /* ARGSUSED */
131 static int
132 vnic_mac_ctor(void *buf, void *arg, int kmflag)
133 {
134 	vnic_mac_t *vnic_mac = buf;
135 
136 	bzero(vnic_mac, sizeof (vnic_mac_t));
137 	rw_init(&vnic_mac->va_bcast_grp_lock, NULL, RW_DRIVER, NULL);
138 	rw_init(&vnic_mac->va_promisc_lock, NULL, RW_DRIVER, NULL);
139 
140 	return (0);
141 }
142 
143 /* ARGSUSED */
144 static void
145 vnic_mac_dtor(void *buf, void *arg)
146 {
147 	vnic_mac_t *vnic_mac = buf;
148 
149 	rw_destroy(&vnic_mac->va_promisc_lock);
150 	rw_destroy(&vnic_mac->va_bcast_grp_lock);
151 }
152 
153 void
154 vnic_dev_init(void)
155 {
156 	vnic_cache = kmem_cache_create("vnic_cache",
157 	    sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
158 
159 	vnic_mac_cache = kmem_cache_create("vnic_mac_cache",
160 	    sizeof (vnic_mac_t), 0, vnic_mac_ctor, vnic_mac_dtor,
161 	    NULL, NULL, NULL, 0);
162 
163 	vnic_hash = mod_hash_create_idhash("vnic_hash",
164 	    VNIC_HASHSZ, mod_hash_null_valdtor);
165 
166 	vnic_mac_hash = mod_hash_create_idhash("vnic_mac_hash",
167 	    VNIC_MAC_HASHSZ, mod_hash_null_valdtor);
168 
169 	rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL);
170 
171 	mutex_init(&vnic_mac_lock, NULL, MUTEX_DEFAULT, NULL);
172 
173 	vnic_count = 0;
174 }
175 
176 void
177 vnic_dev_fini(void)
178 {
179 	ASSERT(vnic_count == 0);
180 
181 	mutex_destroy(&vnic_mac_lock);
182 	rw_destroy(&vnic_lock);
183 	mod_hash_destroy_idhash(vnic_mac_hash);
184 	mod_hash_destroy_idhash(vnic_hash);
185 	kmem_cache_destroy(vnic_mac_cache);
186 	kmem_cache_destroy(vnic_cache);
187 }
188 
189 uint_t
190 vnic_dev_count(void)
191 {
192 	return (vnic_count);
193 }
194 
195 static int
196 vnic_mac_open(datalink_id_t linkid, vnic_mac_t **vmp)
197 {
198 	int err;
199 	vnic_mac_t *vnic_mac = NULL;
200 	const mac_info_t *mip;
201 
202 	*vmp = NULL;
203 
204 	mutex_enter(&vnic_mac_lock);
205 
206 	err = mod_hash_find(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid,
207 	    (mod_hash_val_t *)&vnic_mac);
208 	if (err == 0) {
209 		/* this MAC is already opened, increment reference count */
210 		VNIC_MAC_REFHOLD(vnic_mac);
211 		mutex_exit(&vnic_mac_lock);
212 		*vmp = vnic_mac;
213 		return (0);
214 	}
215 
216 	vnic_mac = kmem_cache_alloc(vnic_mac_cache, KM_SLEEP);
217 	if ((err = mac_open_by_linkid(linkid, &vnic_mac->va_mh)) != 0) {
218 		vnic_mac->va_mh = NULL;
219 		goto bail;
220 	}
221 
222 	/*
223 	 * For now, we do not support VNICs over legacy drivers.  This will
224 	 * soon be changed.
225 	 */
226 	if (mac_is_legacy(vnic_mac->va_mh)) {
227 		err = ENOTSUP;
228 		goto bail;
229 	}
230 
231 	/* only ethernet support, for now */
232 	mip = mac_info(vnic_mac->va_mh);
233 	if (mip->mi_media != DL_ETHER) {
234 		err = ENOTSUP;
235 		goto bail;
236 	}
237 	if (mip->mi_media != mip->mi_nativemedia) {
238 		err = ENOTSUP;
239 		goto bail;
240 	}
241 
242 	vnic_mac->va_linkid = linkid;
243 
244 	/* add entry to hash table */
245 	err = mod_hash_insert(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid,
246 	    (mod_hash_val_t)vnic_mac);
247 	ASSERT(err == 0);
248 
249 	/* initialize the flow table associated with lower MAC */
250 	vnic_mac->va_addr_len = ETHERADDRL;
251 	(void) vnic_classifier_flow_tab_init(vnic_mac, vnic_mac->va_addr_len,
252 	    KM_SLEEP);
253 
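	/*
	 * Cache the transmit entry point of the underlying MAC and
	 * register vnic_notify_cb() to receive notifications from it.
	 */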
254 	vnic_mac->va_txinfo = mac_vnic_tx_get(vnic_mac->va_mh);
255 	vnic_mac->va_notify_hdl = mac_notify_add(vnic_mac->va_mh,
256 	    vnic_notify_cb, vnic_mac);
257 
258 	VNIC_MAC_REFHOLD(vnic_mac);
259 	*vmp = vnic_mac;
260 	mutex_exit(&vnic_mac_lock);
261 	return (0);
262 
263 bail:
264 	if (vnic_mac != NULL) {
265 		if (vnic_mac->va_mh != NULL)
266 			mac_close(vnic_mac->va_mh);
267 		kmem_cache_free(vnic_mac_cache, vnic_mac);
268 	}
269 	mutex_exit(&vnic_mac_lock);
270 	return (err);
271 }
272 
273 /*
274  * Create a new flow for the active MAC client sharing the NIC
275  * with the VNICs. This allows the unicast packets for that NIC
276  * to be classified and passed up to the active MAC client. It
277  * also allows packets sent from a VNIC to the active link to
278  * be classified by the VNIC transmit function and delivered via
279  * the MAC module locally. Returns B_TRUE on success, B_FALSE on
280  * failure.
281  */
282 static int
283 vnic_init_active_rx(vnic_mac_t *vnic_mac)
284 {
285 	uchar_t nic_mac_addr[MAXMACADDRLEN];
286 
287 	if (vnic_mac->va_active_flow != NULL)
288 		return (B_TRUE);
289 
290 	mac_unicst_get(vnic_mac->va_mh, nic_mac_addr);
291 
292 	vnic_mac->va_active_flow = vnic_classifier_flow_create(
293 	    vnic_mac->va_addr_len, nic_mac_addr, NULL, B_TRUE, KM_SLEEP);
294 
295 	vnic_classifier_flow_add(vnic_mac, vnic_mac->va_active_flow,
296 	    (vnic_rx_fn_t)mac_active_rx, vnic_mac->va_mh, NULL);
297 	return (B_TRUE);
298 }
299 
300 static void
301 vnic_fini_active_rx(vnic_mac_t *vnic_mac)
302 {
303 	if (vnic_mac->va_active_flow == NULL)
304 		return;
305 
306 	vnic_classifier_flow_remove(vnic_mac, vnic_mac->va_active_flow);
307 	vnic_classifier_flow_destroy(vnic_mac->va_active_flow);
308 	vnic_mac->va_active_flow = NULL;
309 }
310 
311 static void
312 vnic_update_active_rx(vnic_mac_t *vnic_mac)
313 {
314 	if (vnic_mac->va_active_flow == NULL)
315 		return;
316 
317 	vnic_fini_active_rx(vnic_mac);
318 	(void) vnic_init_active_rx(vnic_mac);
319 }
320 
321 /*
322  * Copy an mblk, preserving its hardware checksum flags.
323  */
324 mblk_t *
325 vnic_copymsg_cksum(mblk_t *mp)
326 {
327 	mblk_t *mp1;
328 	uint32_t start, stuff, end, value, flags;
329 
330 	mp1 = copymsg(mp);
331 	if (mp1 == NULL)
332 		return (NULL);
333 
334 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
335 	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
336 	    flags, KM_NOSLEEP);
337 
338 	return (mp1);
339 }
340 
341 /*
342  * Copy an mblk chain, preserving the hardware checksum flags of the
343  * individual mblks.
344  */
345 mblk_t *
346 vnic_copymsgchain_cksum(mblk_t *mp)
347 {
348 	mblk_t *nmp = NULL;
349 	mblk_t **nmpp = &nmp;
350 
351 	for (; mp != NULL; mp = mp->b_next) {
352 		if ((*nmpp = vnic_copymsg_cksum(mp)) == NULL) {
353 			freemsgchain(nmp);
354 			return (NULL);
355 		}
356 
357 		nmpp = &((*nmpp)->b_next);
358 	}
359 
360 	return (nmp);
361 }
362 
363 
364 /*
365  * Process the specified mblk chain for proper handling of hardware
366  * checksum offload. This routine is invoked for loopback VNIC traffic.
367  * The function handles a NULL mblk chain passed as argument.
368  */
369 mblk_t *
370 vnic_fix_cksum(mblk_t *mp_chain)
371 {
372 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
373 	uint32_t flags, start, stuff, end, value;
374 
375 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
376 		uint16_t len;
377 		uint32_t offset;
378 		struct ether_header *ehp;
379 		uint16_t sap;
380 
381 		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
382 		    &flags);
383 		if (flags == 0)
384 			continue;
385 
386 		/*
387 		 * Since the processing of checksum offload for loopback
388 		 * traffic requires modification of the packet contents,
389 		 * ensure that we are always modifying our own copy.
390 		 */
391 		if (DB_REF(mp) > 1) {
392 			mp1 = copymsg(mp);
393 			if (mp1 == NULL)
394 				continue;
395 			mp1->b_next = mp->b_next;
396 			mp->b_next = NULL;
397 			freemsg(mp);
398 			if (prev != NULL)
399 				prev->b_next = mp1;
400 			else
401 				new_chain = mp1;
402 			mp = mp1;
403 		}
404 
405 		/*
406 		 * Ethernet, and optionally VLAN header.
407 		 */
408 		/*LINTED*/
409 		ehp = (struct ether_header *)mp->b_rptr;
410 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
411 			struct ether_vlan_header *evhp;
412 
413 			ASSERT(MBLKL(mp) >=
414 			    sizeof (struct ether_vlan_header));
415 			/*LINTED*/
416 			evhp = (struct ether_vlan_header *)mp->b_rptr;
417 			sap = ntohs(evhp->ether_type);
418 			offset = sizeof (struct ether_vlan_header);
419 		} else {
420 			sap = ntohs(ehp->ether_type);
421 			offset = sizeof (struct ether_header);
422 		}
423 
424 		if (MBLKL(mp) <= offset) {
425 			offset -= MBLKL(mp);
426 			if (mp->b_cont == NULL) {
427 				/* corrupted packet, skip it */
428 				if (prev != NULL)
429 					prev->b_next = mp->b_next;
430 				else
431 					new_chain = mp->b_next;
432 				mp1 = mp->b_next;
433 				mp->b_next = NULL;
434 				freemsg(mp);
435 				mp = mp1;
436 				continue;
437 			}
438 			mp = mp->b_cont;
439 		}
440 
441 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
442 			ipha_t *ipha = NULL;
443 
444 			/*
445 			 * In order to compute the full and header
446 			 * checksums, we need to find and parse
447 			 * the IP and/or ULP headers.
448 			 */
449 
450 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
451 
452 			/*
453 			 * IP header.
454 			 */
455 			if (sap != ETHERTYPE_IP)
456 				continue;
457 
458 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
459 			/*LINTED*/
460 			ipha = (ipha_t *)(mp->b_rptr + offset);
461 
462 			if (flags & HCK_FULLCKSUM) {
463 				ipaddr_t src, dst;
464 				uint32_t cksum;
465 				uint16_t *up;
466 				uint8_t proto;
467 
468 				/*
469 				 * Pointer to checksum field in ULP header.
470 				 */
471 				proto = ipha->ipha_protocol;
472 				ASSERT(ipha->ipha_version_and_hdr_length ==
473 				    IP_SIMPLE_HDR_VERSION);
474 				if (proto == IPPROTO_TCP) {
475 					/*LINTED*/
476 					up = IPH_TCPH_CHECKSUMP(ipha,
477 					    IP_SIMPLE_HDR_LENGTH);
478 				} else {
479 					ASSERT(proto == IPPROTO_UDP);
480 					/*LINTED*/
481 					up = IPH_UDPH_CHECKSUMP(ipha,
482 					    IP_SIMPLE_HDR_LENGTH);
483 				}
484 
485 				/*
486 				 * Pseudo-header checksum.
487 				 */
488 				src = ipha->ipha_src;
489 				dst = ipha->ipha_dst;
490 				len = ntohs(ipha->ipha_length) -
491 				    IP_SIMPLE_HDR_LENGTH;
492 
493 				cksum = (dst >> 16) + (dst & 0xFFFF) +
494 				    (src >> 16) + (src & 0xFFFF);
495 				cksum += htons(len);
496 
497 				/*
498 				 * The checksum value stored in the packet needs
499 				 * to be correct. Compute it here.
500 				 */
501 				*up = 0;
502 				cksum += (((proto) == IPPROTO_UDP) ?
503 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
504 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
505 				    offset, cksum);
506 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
507 
508 				flags |= HCK_FULLCKSUM_OK;
509 				value = 0xffff;
510 			}
511 
512 			if (flags & HCK_IPV4_HDRCKSUM) {
513 				ASSERT(ipha != NULL);
514 				ipha->ipha_hdr_checksum =
515 				    (uint16_t)ip_csum_hdr(ipha);
516 			}
517 		}
518 
519 		if (flags & HCK_PARTIALCKSUM) {
520 			uint16_t *up, partial, cksum;
521 			uchar_t *ipp; /* ptr to beginning of IP header */
522 
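			/*
			 * The partial checksum offsets (start, stuff, end)
			 * are relative to the beginning of the IP header;
			 * pull the message up so that the region to be
			 * checksummed is contiguous in a single mblk.
			 */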
523 			if (mp->b_cont != NULL) {
524 				mblk_t *mp1;
525 
526 				mp1 = msgpullup(mp, offset + end);
527 				if (mp1 == NULL)
528 					continue;
529 				mp1->b_next = mp->b_next;
530 				mp->b_next = NULL;
531 				freemsg(mp);
532 				if (prev != NULL)
533 					prev->b_next = mp1;
534 				else
535 					new_chain = mp1;
536 				mp = mp1;
537 			}
538 
539 			ipp = mp->b_rptr + offset;
540 			/*LINTED*/
541 			up = (uint16_t *)((uchar_t *)ipp + stuff);
542 			partial = *up;
543 			*up = 0;
544 
545 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
546 			    end - start, partial);
547 			cksum = ~cksum;
548 			*up = cksum ? cksum : ~cksum;
549 
550 			/*
551 			 * Since we already computed the whole checksum,
552 			 * indicate to the stack that it has already
553 			 * been verified by the hardware.
554 			 */
555 			flags &= ~HCK_PARTIALCKSUM;
556 			flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
557 			value = 0xffff;
558 		}
559 
560 		(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
561 		    value, flags, KM_NOSLEEP);
562 	}
563 
564 	return (new_chain);
565 }
566 
567 static void
568 vnic_mac_close(vnic_mac_t *vnic_mac)
569 {
570 	mutex_enter(&vnic_mac_lock);
571 	VNIC_MAC_REFRELE(vnic_mac);
572 	mutex_exit(&vnic_mac_lock);
573 }
574 
575 static void
576 vnic_mac_free(vnic_mac_t *vnic_mac)
577 {
578 	mod_hash_val_t val;
579 
580 	ASSERT(MUTEX_HELD(&vnic_mac_lock));
581 	vnic_fini_active_rx(vnic_mac);
582 	mac_notify_remove(vnic_mac->va_mh, vnic_mac->va_notify_hdl);
583 	if (vnic_mac->va_mac_set) {
584 		vnic_mac->va_mac_set = B_FALSE;
585 		mac_vnic_clear(vnic_mac->va_mh);
586 	}
587 	vnic_classifier_flow_tab_fini(vnic_mac);
588 	mac_close(vnic_mac->va_mh);
589 
590 	(void) mod_hash_remove(vnic_mac_hash,
591 	    (mod_hash_key_t)(uintptr_t)vnic_mac->va_linkid, &val);
592 	ASSERT(vnic_mac == (vnic_mac_t *)val);
593 
594 	kmem_cache_free(vnic_mac_cache, vnic_mac);
595 }
596 
597 /*
598  * Initial VNIC receive routine. Invoked for packets that are steered
599  * to a VNIC but the VNIC has not been started yet.
600  */
601 /* ARGSUSED */
602 static void
603 vnic_rx_initial(void *arg1, void *arg2, mblk_t *mp_chain)
604 {
605 	vnic_t *vnic = arg1;
606 	mblk_t *mp;
607 
608 	/* update stats */
609 	for (mp = mp_chain; mp != NULL; mp = mp->b_next)
610 		vnic->vn_stat_ierrors++;
611 	freemsgchain(mp_chain);
612 }
613 
614 /*
615  * VNIC receive routine invoked after the classifier for the VNIC
616  * has been initialized and the VNIC has been started.
617  */
618 /* ARGSUSED */
619 void
620 vnic_rx(void *arg1, void *arg2, mblk_t *mp_chain)
621 {
622 	vnic_t *vnic = arg1;
623 	mblk_t *mp;
624 
625 	/* update stats */
626 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
627 		vnic->vn_stat_ipackets++;
628 		vnic->vn_stat_rbytes += msgdsize(mp);
629 	}
630 
631 	/* pass packet up */
632 	mac_rx(vnic->vn_mh, NULL, mp_chain);
633 }
634 
635 /*
636  * Add the unicast MAC address of a MAC-based VNIC to the underlying
637  * NIC. The address is added to an unused address slot of the NIC if
638  * one is available. Otherwise the NIC is set in promiscuous mode and
639  * the address is classified in software by the VNIC flow table.
640  */
641 static int
642 vnic_add_unicstaddr(vnic_t *vnic, mac_multi_addr_t *maddr)
643 {
644 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
645 	int err;
646 
647 	if (mac_unicst_verify(vnic_mac->va_mh, maddr->mma_addr,
648 	    maddr->mma_addrlen) == B_FALSE)
649 		return (EINVAL);
650 
651 	if (mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_MULTIADDRESS,
652 	    &(vnic->vn_mma_capab))) {
653 		if (vnic->vn_maddr_naddrfree == 0) {
654 			/*
655 			 * No free address slots available.
656 			 * Enable promiscuous mode.
657 			 */
658 			goto set_promisc;
659 		}
660 
661 		err = vnic->vn_maddr_add(vnic->vn_maddr_handle, maddr);
662 		if (err != 0) {
663 			if (err == ENOSPC) {
664 				/*
665 				 * There was a race to add addresses
666 				 * with other multiple address consumers,
667 				 * and we lost out. Use promisc mode.
668 				 */
669 				goto set_promisc;
670 			}
671 
672 			return (err);
673 		}
674 
675 		vnic->vn_slot_id = maddr->mma_slot;
676 		vnic->vn_multi_mac = B_TRUE;
677 	} else {
678 		/*
679 		 * Either multiple MAC address support is not
680 		 * available or all available addresses have
681 		 * been used up.
682 		 */
683 	set_promisc:
684 		if ((err = mac_promisc_set(vnic_mac->va_mh, B_TRUE,
685 		    MAC_DEVPROMISC)) != 0) {
686 			return (err);
687 		}
688 
689 		vnic->vn_promisc_mac = B_TRUE;
690 	}
691 	return (err);
692 }
693 
694 /*
695  * VNIC is getting deleted. Remove the MAC address from the slot.
696  * If promiscuous mode was being used, then unset the promiscuous mode.
697  */
698 static int
699 vnic_remove_unicstaddr(vnic_t *vnic)
700 {
701 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
702 	int err = 0;
703 
704 	if (vnic->vn_multi_mac) {
705 		ASSERT(vnic->vn_promisc_mac == B_FALSE);
706 		err = vnic->vn_maddr_remove(vnic->vn_maddr_handle,
707 		    vnic->vn_slot_id);
708 		vnic->vn_multi_mac = B_FALSE;
709 	}
710 
711 	if (vnic->vn_promisc_mac) {
712 		ASSERT(vnic->vn_multi_mac == B_FALSE);
713 		err = mac_promisc_set(vnic_mac->va_mh, B_FALSE, MAC_DEVPROMISC);
714 		vnic->vn_promisc_mac = B_FALSE;
715 	}
716 
717 	return (err);
718 }
719 
720 /*
721  * Create a new VNIC upon request from the administrator.
722  * Returns 0 on success, an errno on failure.
723  */
724 int
725 vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len,
726     uchar_t *mac_addr)
727 {
728 	vnic_t *vnic = NULL;
729 	mac_register_t *mac;
730 	int err;
731 	vnic_mac_t *vnic_mac;
732 	const mac_info_t *lower_mac_info;
733 	mac_multi_addr_t maddr;
734 	mac_txinfo_t tx_info;
735 
736 	if (mac_len != ETHERADDRL) {
737 		/* currently only ethernet NICs are supported */
738 		return (EINVAL);
739 	}
740 
741 	rw_enter(&vnic_lock, RW_WRITER);
742 
743 	/* does a VNIC with the same id already exist? */
744 	err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
745 	    (mod_hash_val_t *)&vnic);
746 	if (err == 0) {
747 		rw_exit(&vnic_lock);
748 		return (EEXIST);
749 	}
750 
751 	vnic = kmem_cache_alloc(vnic_cache, KM_NOSLEEP);
752 	if (vnic == NULL) {
753 		rw_exit(&vnic_lock);
754 		return (ENOMEM);
755 	}
756 
757 	/* open underlying MAC */
758 	err = vnic_mac_open(linkid, &vnic_mac);
759 	if (err != 0) {
760 		kmem_cache_free(vnic_cache, vnic);
761 		rw_exit(&vnic_lock);
762 		return (err);
763 	}
764 
765 	bzero(vnic, sizeof (*vnic));
766 	vnic->vn_id = vnic_id;
767 	vnic->vn_vnic_mac = vnic_mac;
768 
769 	vnic->vn_started = B_FALSE;
770 	vnic->vn_promisc = B_FALSE;
771 	vnic->vn_multi_mac = B_FALSE;
772 	vnic->vn_bcast_grp = B_FALSE;
773 
774 	/* set the VNIC MAC address */
775 	maddr.mma_addrlen = mac_len;
776 	maddr.mma_slot = 0;
777 	maddr.mma_flags = 0;
778 	bcopy(mac_addr, maddr.mma_addr, mac_len);
779 	if ((err = vnic_add_unicstaddr(vnic, &maddr)) != 0)
780 		goto bail;
781 	bcopy(mac_addr, vnic->vn_addr, mac_len);
782 
783 	/* set the initial VNIC capabilities */
784 	if (!mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_HCKSUM,
785 	    &vnic->vn_hcksum_txflags))
786 		vnic->vn_hcksum_txflags = 0;
787 
788 	/* register with the MAC module */
789 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
790 		goto bail;
791 
792 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
793 	mac->m_driver = vnic;
794 	mac->m_dip = vnic_get_dip();
795 	mac->m_instance = (uint_t)-1;
796 	mac->m_src_addr = vnic->vn_addr;
797 	mac->m_callbacks = &vnic_m_callbacks;
798 
799 	lower_mac_info = mac_info(vnic_mac->va_mh);
800 	mac->m_min_sdu = lower_mac_info->mi_sdu_min;
801 	mac->m_max_sdu = lower_mac_info->mi_sdu_max;
802 
803 	/*
804 	 * As the current margin size of the underlying mac is used to
805 	 * determine the margin size of the VNIC itself, request the
806 	 * underlying mac not to change to a smaller margin size.
807 	 */
808 	err = mac_margin_add(vnic_mac->va_mh, &(vnic->vn_margin), B_TRUE);
809 	if (err != 0)
810 		goto bail;
811 	mac->m_margin = vnic->vn_margin;
812 	err = mac_register(mac, &vnic->vn_mh);
813 	mac_free(mac);
814 	if (err != 0) {
815 		VERIFY(mac_margin_remove(vnic_mac->va_mh,
816 		    vnic->vn_margin) == 0);
817 		goto bail;
818 	}
819 
820 	if ((err = dls_devnet_create(vnic->vn_mh, vnic->vn_id)) != 0) {
821 		VERIFY(mac_margin_remove(vnic_mac->va_mh,
822 		    vnic->vn_margin) == 0);
823 		(void) mac_unregister(vnic->vn_mh);
824 		goto bail;
825 	}
826 
827 	/* add new VNIC to hash table */
828 	err = mod_hash_insert(vnic_hash, VNIC_HASH_KEY(vnic_id),
829 	    (mod_hash_val_t)vnic);
830 	ASSERT(err == 0);
831 	vnic_count++;
832 
833 	rw_exit(&vnic_lock);
834 
835 	/* Create a flow, initialized with the MAC address of the VNIC */
836 	if ((vnic->vn_flow_ent = vnic_classifier_flow_create(mac_len, mac_addr,
837 	    NULL, B_FALSE, KM_SLEEP)) == NULL) {
838 		(void) vnic_dev_delete(vnic_id);
839 		vnic = NULL;
840 		err = ENOMEM;
841 		goto bail_unlocked;
842 	}
843 
844 	vnic_classifier_flow_add(vnic_mac, vnic->vn_flow_ent, vnic_rx_initial,
845 	    vnic, vnic);
846 
847 	/* setup VNIC to receive broadcast packets */
848 	err = vnic_bcast_add(vnic, vnic_brdcst_mac, MAC_ADDRTYPE_BROADCAST);
849 	if (err != 0) {
850 		(void) vnic_dev_delete(vnic_id);
851 		vnic = NULL;
852 		goto bail_unlocked;
853 	}
854 	vnic->vn_bcast_grp = B_TRUE;
855 
856 	mutex_enter(&vnic_mac_lock);
857 	if (!vnic_mac->va_mac_set) {
858 		/*
859 		 * We want the MAC layer to call the VNIC outbound tx
860 		 * routine, so that local broadcast packets sent by
861 		 * the active interface sharing the underlying NIC (if
862 		 * any) can be broadcast to every VNIC.
863 		 */
864 		tx_info.mt_fn = vnic_active_tx;
865 		tx_info.mt_arg = vnic_mac;
866 		if (!mac_vnic_set(vnic_mac->va_mh, &tx_info,
867 		    vnic_m_capab_get, vnic)) {
868 			mutex_exit(&vnic_mac_lock);
869 			(void) vnic_dev_delete(vnic_id);
870 			vnic = NULL;
871 			err = EBUSY;
872 			goto bail_unlocked;
873 		}
874 		vnic_mac->va_mac_set = B_TRUE;
875 	}
876 	mutex_exit(&vnic_mac_lock);
877 
878 	/* allow passing packets to NIC's active MAC client */
879 	if (!vnic_init_active_rx(vnic_mac)) {
880 		(void) vnic_dev_delete(vnic_id);
881 		vnic = NULL;
882 		err = ENOMEM;
883 		goto bail_unlocked;
884 	}
885 
886 	return (0);
887 
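/*
 * bail is reached with vnic_lock held; it releases the lock and falls
 * through to bail_unlocked.
 */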
888 bail:
889 	(void) vnic_remove_unicstaddr(vnic);
890 	vnic_mac_close(vnic_mac);
891 	rw_exit(&vnic_lock);
892 
893 bail_unlocked:
894 	if (vnic != NULL) {
895 		kmem_cache_free(vnic_cache, vnic);
896 	}
897 
898 	return (err);
899 }
900 
901 /*
902  * Modify the properties of an existing VNIC.
903  */
904 /* ARGSUSED */
905 int
906 vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask,
907     vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr)
908 {
909 	vnic_t *vnic = NULL;
910 	int rv = 0;
911 	boolean_t notify_mac_addr = B_FALSE;
912 
913 	rw_enter(&vnic_lock, RW_WRITER);
914 
915 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
916 	    (mod_hash_val_t *)&vnic) != 0) {
917 		rw_exit(&vnic_lock);
918 		return (ENOENT);
919 	}
920 
921 	if (modify_mask & VNIC_IOC_MODIFY_ADDR) {
922 		rv = vnic_modify_mac_addr(vnic, mac_len, mac_addr);
923 		if (rv == 0)
924 			notify_mac_addr = B_TRUE;
925 	}
926 
927 	rw_exit(&vnic_lock);
928 
929 	if (notify_mac_addr)
930 		mac_unicst_update(vnic->vn_mh, mac_addr);
931 
932 	return (rv);
933 }
934 
935 int
936 vnic_dev_delete(datalink_id_t vnic_id)
937 {
938 	vnic_t *vnic = NULL;
939 	mod_hash_val_t val;
940 	vnic_flow_t *flent;
941 	datalink_id_t tmpid;
942 	int rc;
943 	vnic_mac_t *vnic_mac;
944 
945 	rw_enter(&vnic_lock, RW_WRITER);
946 
947 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
948 	    (mod_hash_val_t *)&vnic) != 0) {
949 		rw_exit(&vnic_lock);
950 		return (ENOENT);
951 	}
952 
953 	if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid)) != 0) {
954 		rw_exit(&vnic_lock);
955 		return (rc);
956 	}
957 
958 	ASSERT(vnic_id == tmpid);
959 
960 	/*
961 	 * We cannot unregister the MAC yet. Unregistering would
962 	 * free the mac_impl_t, which must not happen at this time:
963 	 * packets could still be entering vnic_rx() through the
964 	 * flow entry, so the mac_impl_t must remain valid. Instead,
965 	 * disable it by calling mac_disable(), which prevents any
966 	 * new claims on the mac_impl_t.
967 	 */
968 	if (mac_disable(vnic->vn_mh) != 0) {
969 		(void) dls_devnet_create(vnic->vn_mh, vnic_id);
970 		rw_exit(&vnic_lock);
971 		return (EBUSY);
972 	}
973 
974 	(void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val);
975 	ASSERT(vnic == (vnic_t *)val);
976 
977 	if (vnic->vn_bcast_grp)
978 		(void) vnic_bcast_delete(vnic, vnic_brdcst_mac);
979 
980 	flent = vnic->vn_flow_ent;
981 	if (flent != NULL) {
982 		/*
983 		 * vnic_classifier_flow_destroy() ensures that the
984 		 * flow is no longer used.
985 		 */
986 		vnic_classifier_flow_remove(vnic->vn_vnic_mac, flent);
987 		vnic_classifier_flow_destroy(flent);
988 	}
989 
990 	rc = mac_margin_remove(vnic->vn_vnic_mac->va_mh, vnic->vn_margin);
991 	ASSERT(rc == 0);
992 	rc = mac_unregister(vnic->vn_mh);
993 	ASSERT(rc == 0);
994 	(void) vnic_remove_unicstaddr(vnic);
995 	vnic_mac = vnic->vn_vnic_mac;
996 	kmem_cache_free(vnic_cache, vnic);
997 	vnic_count--;
998 	rw_exit(&vnic_lock);
999 	vnic_mac_close(vnic_mac);
1000 	return (0);
1001 }
1002 
1003 /*
1004  * Classify the first packet of the specified chain and return the
1005  * flow entry it maps to, if any. The remainder of the chain is
1006  * returned through mp_chain_rest so that it can be classified
1007  * separately. If the function returns NULL, the packet should be
1008  * sent using the underlying NIC.
1009  */
1010 static vnic_flow_t *
1011 vnic_classify(vnic_mac_t *vnic_mac, mblk_t *mp, mblk_t **mp_chain_rest)
1012 {
1013 	vnic_flow_t *flow_ent;
1014 
1015 	/* one packet at a time */
1016 	*mp_chain_rest = mp->b_next;
1017 	mp->b_next = NULL;
1018 
1019 	/* do classification on the packet */
1020 	flow_ent = vnic_classifier_get_flow(vnic_mac, mp);
1021 
1022 	return (flow_ent);
1023 }
1024 
1025 /*
1026  * Send a packet chain to a local VNIC or an active MAC client.
1027  */
1028 static void
1029 vnic_local_tx(vnic_mac_t *vnic_mac, vnic_flow_t *flow_ent, mblk_t *mp_chain)
1030 {
1031 	mblk_t *mp1;
1032 	const vnic_flow_fn_info_t *fn_info;
1033 	vnic_t *vnic;
1034 
1035 	if (!vnic_classifier_is_active(flow_ent) &&
1036 	    mac_promisc_get(vnic_mac->va_mh, MAC_PROMISC)) {
1037 		/*
1038 		 * If the MAC is in promiscuous mode,
1039 		 * send a copy to the active client.
1040 		 */
1041 		if ((mp1 = vnic_copymsgchain_cksum(mp_chain)) == NULL)
1042 			goto sendit;
1043 		if ((mp1 = vnic_fix_cksum(mp1)) == NULL)
1044 			goto sendit;
1045 		mac_active_rx(vnic_mac->va_mh, NULL, mp1);
1046 	}
1047 sendit:
1048 	fn_info = vnic_classifier_get_fn_info(flow_ent);
1049 	/*
1050 	 * If the vnic to which we would deliver this packet is in
1051 	 * promiscuous mode then it already received the packet via
1052 	 * vnic_promisc_rx().
1053 	 *
1054 	 * XXX assumes that ff_arg2 is a vnic_t pointer if it is
1055 	 * non-NULL (currently always true).
1056 	 */
1057 	vnic = (vnic_t *)fn_info->ff_arg2;
1058 	if ((vnic != NULL) && vnic->vn_promisc)
1059 		freemsg(mp_chain);
1060 	else if ((mp1 = vnic_fix_cksum(mp_chain)) != NULL)
1061 		(fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp1);
1062 }
1063 
1064 /*
1065  * This function is invoked when a MAC client needs to send a packet
1066  * to a NIC which is shared by VNICs. It is passed to the MAC layer
1067  * by a call to mac_vnic_set() when the NIC is opened, and is returned
1068  * to MAC clients by mac_tx_get() when VNICs are present.
1069  */
1070 mblk_t *
1071 vnic_active_tx(void *arg, mblk_t *mp_chain)
1072 {
1073 	vnic_mac_t *vnic_mac = arg;
1074 	mblk_t *mp, *extra_mp = NULL;
1075 	vnic_flow_t *flow_ent;
1076 	void *flow_cookie;
1077 	const mac_txinfo_t *mtp = vnic_mac->va_txinfo;
1078 
1079 	for (mp = mp_chain; mp != NULL; mp = extra_mp) {
1080 		mblk_t *next;
1081 
1082 		next = mp->b_next;
1083 		mp->b_next = NULL;
1084 
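		/*
		 * The sender is the active MAC client rather than a VNIC;
		 * pass -1 so that every VNIC in promiscuous mode gets a copy.
		 */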
1085 		vnic_promisc_rx(vnic_mac, (vnic_t *)-1, mp);
1086 
1087 		flow_ent = vnic_classify(vnic_mac, mp, &extra_mp);
1088 		ASSERT(extra_mp == NULL);
1089 		extra_mp = next;
1090 
1091 		if (flow_ent != NULL) {
1092 			flow_cookie = vnic_classifier_get_client_cookie(
1093 			    flow_ent);
1094 			if (flow_cookie != NULL) {
1095 				/*
1096 				 * Send a copy to every VNIC defined on the
1097 				 * interface, as well as the underlying MAC.
1098 				 */
1099 				vnic_bcast_send(flow_cookie, (vnic_t *)-1, mp);
1100 			} else {
1101 				/*
1102 				 * loopback the packet to a local VNIC or
1103 				 * an active MAC client.
1104 				 */
1105 				vnic_local_tx(vnic_mac, flow_ent, mp);
1106 			}
1107 			VNIC_FLOW_REFRELE(flow_ent);
1108 			mp_chain = NULL;
1109 		} else {
1110 			/*
1111 			 * Non-VNIC destination, send via the underlying
1112 			 * NIC. In order to avoid a recursive call
1113 			 * to this function, we ensured that mtp points
1114 			 * to the underlying NIC transmit function
1115 			 * by initializing it through mac_vnic_tx_get().
1116 			 */
1117 			mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
1118 			if (mp_chain != NULL)
1119 				break;
1120 		}
1121 	}
1122 
1123 	if ((mp_chain != NULL) && (extra_mp != NULL)) {
1124 		ASSERT(mp_chain->b_next == NULL);
1125 		mp_chain->b_next = extra_mp;
1126 	}
1127 	return (mp_chain);
1128 }
1129 
1130 /*
1131  * VNIC transmit function.
1132  */
1133 mblk_t *
1134 vnic_m_tx(void *arg, mblk_t *mp_chain)
1135 {
1136 	vnic_t *vnic = arg;
1137 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1138 	mblk_t *mp, *extra_mp = NULL;
1139 	vnic_flow_t *flow_ent;
1140 	void *flow_cookie;
1141 
1142 	/*
1143 	 * Update stats.
1144 	 */
1145 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
1146 		vnic->vn_stat_opackets++;
1147 		vnic->vn_stat_obytes += msgdsize(mp);
1148 	}
1149 
1150 	for (mp = mp_chain; mp != NULL; mp = extra_mp) {
1151 		mblk_t *next;
1152 
1153 		next = mp->b_next;
1154 		mp->b_next = NULL;
1155 
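		/* let VNICs in promiscuous mode see a copy of this packet */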
1156 		vnic_promisc_rx(vnic->vn_vnic_mac, vnic, mp);
1157 
1158 		flow_ent = vnic_classify(vnic->vn_vnic_mac, mp, &extra_mp);
1159 		ASSERT(extra_mp == NULL);
1160 		extra_mp = next;
1161 
1162 		if (flow_ent != NULL) {
1163 			flow_cookie = vnic_classifier_get_client_cookie(
1164 			    flow_ent);
1165 			if (flow_cookie != NULL) {
1166 				/*
1167 				 * The vnic_bcast_send function expects
1168 				 * to receive the sender VNIC as value
1169 				 * for arg2.
1170 				 */
1171 				vnic_bcast_send(flow_cookie, vnic, mp);
1172 			} else {
1173 				/*
1174 				 * loopback the packet to a local VNIC or
1175 				 * an active MAC client.
1176 				 */
1177 				vnic_local_tx(vnic_mac, flow_ent, mp);
1178 			}
1179 			VNIC_FLOW_REFRELE(flow_ent);
1180 			mp_chain = NULL;
1181 		} else {
1182 			/*
1183 			 * Non-local destination, send via the underlying
1184 			 * NIC.
1185 			 */
1186 			const mac_txinfo_t *mtp = vnic->vn_txinfo;
1187 			mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
1188 			if (mp_chain != NULL)
1189 				break;
1190 		}
1191 	}
1192 
1193 	/* update stats to account for unsent packets */
1194 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
1195 		vnic->vn_stat_opackets--;
1196 		vnic->vn_stat_obytes -= msgdsize(mp);
1197 		vnic->vn_stat_oerrors++;
1198 		/*
1199 		 * Link the unprocessed remainder of the chain back onto
1200 		 * the last unsent packet.
1201 		 */
1202 		if (mp->b_next == NULL) {
1203 			mp->b_next = extra_mp;
1204 			break;
1205 		}
1206 	}
1207 
1208 	return (mp_chain);
1209 }
1210 
1211 /* ARGSUSED */
1212 static void
1213 vnic_m_resources(void *arg)
1214 {
1215 	/* no resources to advertise */
1216 }
1217 
1218 static int
1219 vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
1220 {
1221 	vnic_t *vnic = arg;
1222 	int rval = 0;
1223 
1224 	rw_enter(&vnic_lock, RW_READER);
1225 
1226 	switch (stat) {
1227 	case ETHER_STAT_LINK_DUPLEX:
1228 		*val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
1229 		    ETHER_STAT_LINK_DUPLEX);
1230 		break;
1231 	case MAC_STAT_IFSPEED:
1232 		*val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
1233 		    MAC_STAT_IFSPEED);
1234 		break;
1235 	case MAC_STAT_MULTIRCV:
1236 		*val = vnic->vn_stat_multircv;
1237 		break;
1238 	case MAC_STAT_BRDCSTRCV:
1239 		*val = vnic->vn_stat_brdcstrcv;
1240 		break;
1241 	case MAC_STAT_MULTIXMT:
1242 		*val = vnic->vn_stat_multixmt;
1243 		break;
1244 	case MAC_STAT_BRDCSTXMT:
1245 		*val = vnic->vn_stat_brdcstxmt;
1246 		break;
1247 	case MAC_STAT_IERRORS:
1248 		*val = vnic->vn_stat_ierrors;
1249 		break;
1250 	case MAC_STAT_OERRORS:
1251 		*val = vnic->vn_stat_oerrors;
1252 		break;
1253 	case MAC_STAT_RBYTES:
1254 		*val = vnic->vn_stat_rbytes;
1255 		break;
1256 	case MAC_STAT_IPACKETS:
1257 		*val = vnic->vn_stat_ipackets;
1258 		break;
1259 	case MAC_STAT_OBYTES:
1260 		*val = vnic->vn_stat_obytes;
1261 		break;
1262 	case MAC_STAT_OPACKETS:
1263 		*val = vnic->vn_stat_opackets;
1264 		break;
1265 	default:
1266 		rval = ENOTSUP;
1267 	}
1268 
1269 	rw_exit(&vnic_lock);
1270 	return (rval);
1271 }
1272 
1273 /*
1274  * Return information about the specified capability.
1275  */
1276 /* ARGSUSED */
1277 static boolean_t
1278 vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
1279 {
1280 	vnic_t *vnic = arg;
1281 
1282 	switch (cap) {
1283 	case MAC_CAPAB_POLL:
1284 		return (B_TRUE);
1285 	case MAC_CAPAB_HCKSUM: {
1286 		uint32_t *hcksum_txflags = cap_data;
1287 
1288 		*hcksum_txflags = vnic->vn_hcksum_txflags &
1289 		    (HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM |
1290 		    HCKSUM_INET_PARTIAL);
1291 		break;
1292 	}
1293 	default:
1294 		return (B_FALSE);
1295 	}
1296 	return (B_TRUE);
1297 }
1298 
1299 static int
1300 vnic_m_start(void *arg)
1301 {
1302 	vnic_t *vnic = arg;
1303 	mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
1304 	int rc;
1305 
1306 	rc = mac_start(lower_mh);
1307 	if (rc != 0)
1308 		return (rc);
1309 
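	/* switch the VNIC's flow to the started receive routine */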
1310 	vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx, vnic, vnic);
1311 	return (0);
1312 }
1313 
1314 static void
1315 vnic_m_stop(void *arg)
1316 {
1317 	vnic_t *vnic = arg;
1318 	mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
1319 
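	/* revert the VNIC's flow to the initial (drop and count) routine */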
1320 	vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx_initial,
1321 	    vnic, vnic);
1322 	mac_stop(lower_mh);
1323 }
1324 
1325 /* ARGSUSED */
1326 static int
1327 vnic_m_promisc(void *arg, boolean_t on)
1328 {
1329 	vnic_t *vnic = arg;
1330 
1331 	return (vnic_promisc_set(vnic, on));
1332 }
1333 
1334 static int
1335 vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
1336 {
1337 	vnic_t *vnic = arg;
1338 	int rc = 0;
1339 
1340 	if (add)
1341 		rc = vnic_bcast_add(vnic, addrp, MAC_ADDRTYPE_MULTICAST);
1342 	else
1343 		vnic_bcast_delete(vnic, addrp);
1344 
1345 	return (rc);
1346 }
1347 
1348 static int
1349 vnic_m_unicst(void *arg, const uint8_t *mac_addr)
1350 {
1351 	vnic_t *vnic = arg;
1352 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1353 	int rv;
1354 
1355 	rw_enter(&vnic_lock, RW_WRITER);
1356 	rv = vnic_modify_mac_addr(vnic, vnic_mac->va_addr_len,
1357 	    (uchar_t *)mac_addr);
1358 	rw_exit(&vnic_lock);
1359 
1360 	if (rv == 0)
1361 		mac_unicst_update(vnic->vn_mh, mac_addr);
1362 	return (0);
1363 }
1364 
1365 int
1366 vnic_info(uint_t *nvnics, datalink_id_t vnic_id, datalink_id_t linkid,
1367     void *fn_arg, vnic_info_new_vnic_fn_t new_vnic_fn)
1368 {
1369 	vnic_info_state_t state;
1370 	int rc = 0;
1371 
1372 	rw_enter(&vnic_lock, RW_READER);
1373 
1374 	*nvnics = vnic_count;
1375 
1376 	bzero(&state, sizeof (state));
1377 	state.vs_vnic_id = vnic_id;
1378 	state.vs_linkid = linkid;
1379 	state.vs_new_vnic_fn = new_vnic_fn;
1380 	state.vs_fn_arg = fn_arg;
1381 
1382 	mod_hash_walk(vnic_hash, vnic_info_walker, &state);
1383 
1384 	if ((rc = state.vs_rc) == 0 && vnic_id != DATALINK_ALL_LINKID &&
1385 	    !state.vs_vnic_found)
1386 		rc = ENOENT;
1387 
1388 	rw_exit(&vnic_lock);
1389 	return (rc);
1390 }
1391 
1392 /*
1393  * Walker invoked when building a list of vnics that must be passed
1394  * up to user space.
1395  */
1396 /*ARGSUSED*/
1397 static uint_t
1398 vnic_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1399 {
1400 	vnic_t *vnic;
1401 	vnic_info_state_t *state = arg;
1402 
1403 	if (state->vs_rc != 0)
1404 		return (MH_WALK_TERMINATE);	/* terminate walk */
1405 
1406 	vnic = (vnic_t *)val;
1407 
1408 	if (state->vs_vnic_id != DATALINK_ALL_LINKID &&
1409 	    vnic->vn_id != state->vs_vnic_id) {
1410 		goto bail;
1411 	}
1412 
1413 	state->vs_vnic_found = B_TRUE;
1414 
1415 	state->vs_rc = state->vs_new_vnic_fn(state->vs_fn_arg,
1416 	    vnic->vn_id, vnic->vn_addr_type, vnic->vn_vnic_mac->va_addr_len,
1417 	    vnic->vn_addr, vnic->vn_vnic_mac->va_linkid);
1418 bail:
1419 	return ((state->vs_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
1420 }
1421 
1422 /*
1423  * vnic_notify_cb() and vnic_notify_walker() below are used to
1424  * process events received from an underlying NIC and, if needed,
1425  * forward these events to the VNICs defined on top of that NIC.
1426  */
1427 
1428 typedef struct vnic_notify_state {
1429 	mac_notify_type_t	vo_type;
1430 	vnic_mac_t		*vo_vnic_mac;
1431 } vnic_notify_state_t;
1432 
1433 /* ARGSUSED */
1434 static uint_t
1435 vnic_notify_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1436 {
1437 	vnic_t *vnic = (vnic_t *)val;
1438 	vnic_notify_state_t *state = arg;
1439 
1440 	/* ignore VNICs that don't use the specified underlying MAC */
1441 	if (vnic->vn_vnic_mac != state->vo_vnic_mac)
1442 		return (MH_WALK_CONTINUE);
1443 
1444 	switch (state->vo_type) {
1445 	case MAC_NOTE_TX:
1446 		mac_tx_update(vnic->vn_mh);
1447 		break;
1448 	case MAC_NOTE_LINK:
1449 		/*
1450 		 * The VNIC link state must be up regardless of
1451 		 * the link state of the underlying NIC to maintain
1452 		 * connectivity between VNICs on the same host.
1453 		 */
1454 		mac_link_update(vnic->vn_mh, LINK_STATE_UP);
1455 		break;
1456 	case MAC_NOTE_UNICST:
1457 		vnic_update_active_rx(vnic->vn_vnic_mac);
1458 		break;
1459 	case MAC_NOTE_VNIC:
1460 		/* only for clients which share a NIC with a VNIC */
1461 		break;
1462 	case MAC_NOTE_PROMISC:
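		/*
		 * The transmit entry point of the underlying MAC may have
		 * changed along with its promiscuity; refresh the cached
		 * txinfo.
		 */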
1463 		mutex_enter(&vnic_mac_lock);
1464 		vnic->vn_vnic_mac->va_txinfo = mac_vnic_tx_get(
1465 		    vnic->vn_vnic_mac->va_mh);
1466 		mutex_exit(&vnic_mac_lock);
1467 		break;
1468 	}
1469 
1470 	return (MH_WALK_CONTINUE);
1471 }
1472 
1473 static void
1474 vnic_notify_cb(void *arg, mac_notify_type_t type)
1475 {
1476 	vnic_mac_t *vnic = arg;
1477 	vnic_notify_state_t state;
1478 
1479 	state.vo_type = type;
1480 	state.vo_vnic_mac = vnic;
1481 
1482 	rw_enter(&vnic_lock, RW_READER);
1483 	mod_hash_walk(vnic_hash, vnic_notify_walker, &state);
1484 	rw_exit(&vnic_lock);
1485 }
1486 
1487 static int
1488 vnic_modify_mac_addr(vnic_t *vnic, uint_t mac_len, uchar_t *mac_addr)
1489 {
1490 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1491 	vnic_flow_t *vnic_flow = vnic->vn_flow_ent;
1492 
1493 	ASSERT(RW_WRITE_HELD(&vnic_lock));
1494 
1495 	if (mac_len != vnic_mac->va_addr_len)
1496 		return (EINVAL);
1497 
1498 	vnic_classifier_flow_update_addr(vnic_flow, mac_addr);
1499 	return (0);
1500 }
1501 
1502 static int
1503 vnic_promisc_set(vnic_t *vnic, boolean_t on)
1504 {
1505 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1506 	int r = -1;
1507 
1508 	if (vnic->vn_promisc == on)
1509 		return (0);
1510 
1511 	if (on) {
1512 		if ((r = mac_promisc_set(vnic_mac->va_mh, B_TRUE,
1513 		    MAC_DEVPROMISC)) != 0) {
1514 			return (r);
1515 		}
1516 
1517 		rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
1518 		vnic->vn_promisc_next = vnic_mac->va_promisc;
1519 		vnic_mac->va_promisc = vnic;
1520 		vnic_mac->va_promisc_gen++;
1521 
1522 		vnic->vn_promisc = B_TRUE;
1523 		rw_exit(&vnic_mac->va_promisc_lock);
1524 
1525 		return (0);
1526 	} else {
1527 		vnic_t *loop, *prev = NULL;
1528 
1529 		rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
1530 		loop = vnic_mac->va_promisc;
1531 
1532 		while ((loop != NULL) && (loop != vnic)) {
1533 			prev = loop;
1534 			loop = loop->vn_promisc_next;
1535 		}
1536 
1537 		if ((loop != NULL) &&
1538 		    ((r = mac_promisc_set(vnic_mac->va_mh, B_FALSE,
1539 		    MAC_DEVPROMISC)) == 0)) {
1540 			if (prev != NULL)
1541 				prev->vn_promisc_next = loop->vn_promisc_next;
1542 			else
1543 				vnic_mac->va_promisc = loop->vn_promisc_next;
1544 			vnic_mac->va_promisc_gen++;
1545 
1546 			vnic->vn_promisc = B_FALSE;
1547 		}
1548 		rw_exit(&vnic_mac->va_promisc_lock);
1549 
1550 		return (r);
1551 	}
1552 }
1553 
1554 void
1555 vnic_promisc_rx(vnic_mac_t *vnic_mac, vnic_t *sender, mblk_t *mp)
1556 {
1557 	vnic_t *loop;
1558 	vnic_flow_t *flow;
1559 	const vnic_flow_fn_info_t *fn_info;
1560 	mac_header_info_t hdr_info;
1561 	boolean_t dst_must_match = B_TRUE;
1562 
1563 	ASSERT(mp->b_next == NULL);
1564 
1565 	rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
1566 	if (vnic_mac->va_promisc == NULL)
1567 		goto done;
1568 
1569 	if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0)
1570 		goto done;
1571 
1572 	/*
1573 	 * If this is broadcast or multicast then the destination
1574 	 * address need not match for us to deliver it.
1575 	 */
1576 	if ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
1577 	    (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST))
1578 		dst_must_match = B_FALSE;
1579 
1580 	for (loop = vnic_mac->va_promisc;
1581 	    loop != NULL;
1582 	    loop = loop->vn_promisc_next) {
1583 		if (loop == sender)
1584 			continue;
1585 
1586 		if (dst_must_match &&
1587 		    (bcmp(hdr_info.mhi_daddr, loop->vn_addr,
1588 		    sizeof (loop->vn_addr)) != 0))
1589 			continue;
1590 
1591 		flow = loop->vn_flow_ent;
1592 		ASSERT(flow != NULL);
1593 
1594 		if (!flow->vf_is_active) {
1595 			mblk_t *copy;
1596 			uint64_t gen;
1597 
1598 			if ((copy = vnic_copymsg_cksum(mp)) == NULL)
1599 				break;
1600 			if ((sender != NULL) &&
1601 			    ((copy = vnic_fix_cksum(copy)) == NULL))
1602 				break;
1603 
1604 			VNIC_FLOW_REFHOLD(flow);
1605 			gen = vnic_mac->va_promisc_gen;
1606 			rw_exit(&vnic_mac->va_promisc_lock);
1607 
1608 			fn_info = vnic_classifier_get_fn_info(flow);
1609 			(fn_info->ff_fn)(fn_info->ff_arg1,
1610 			    fn_info->ff_arg2, copy);
1611 
1612 			VNIC_FLOW_REFRELE(flow);
1613 			rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
1614 			if (vnic_mac->va_promisc_gen != gen)
1615 				break;
1616 		}
1617 	}
1618 done:
1619 	rw_exit(&vnic_mac->va_promisc_lock);
1620 }
1621