xref: /titanic_51/usr/src/uts/common/io/vnic/vnic_dev.c (revision a3c4695861e3f0a8d3706f77ccd53683cca48d67)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/sysmacros.h>
30 #include <sys/conf.h>
31 #include <sys/cmn_err.h>
32 #include <sys/list.h>
33 #include <sys/ksynch.h>
34 #include <sys/kmem.h>
35 #include <sys/stream.h>
36 #include <sys/modctl.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/atomic.h>
40 #include <sys/stat.h>
41 #include <sys/modhash.h>
42 #include <sys/strsubr.h>
43 #include <sys/strsun.h>
44 #include <sys/dlpi.h>
45 #include <sys/mac.h>
46 #include <sys/mac_ether.h>
47 #include <sys/dls.h>
48 #include <sys/pattr.h>
49 #include <sys/vnic.h>
50 #include <sys/vnic_impl.h>
51 #include <sys/gld.h>
52 #include <inet/ip.h>
53 #include <inet/ip_impl.h>
54 
55 static int vnic_m_start(void *);
56 static void vnic_m_stop(void *);
57 static int vnic_m_promisc(void *, boolean_t);
58 static int vnic_m_multicst(void *, boolean_t, const uint8_t *);
59 static int vnic_m_unicst(void *, const uint8_t *);
60 static int vnic_m_stat(void *, uint_t, uint64_t *);
61 static void vnic_m_resources(void *);
62 static mblk_t *vnic_m_tx(void *, mblk_t *);
63 static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *);
64 static void vnic_mac_free(vnic_mac_t *);
65 static uint_t vnic_info_walker(mod_hash_key_t, mod_hash_val_t *, void *);
66 static void vnic_notify_cb(void *, mac_notify_type_t);
67 static int vnic_modify_mac_addr(vnic_t *, uint_t, uchar_t *);
68 static mblk_t *vnic_active_tx(void *, mblk_t *);
69 static int vnic_promisc_set(vnic_t *, boolean_t);
70 
71 static kmem_cache_t	*vnic_cache;
72 static kmem_cache_t	*vnic_mac_cache;
73 static krwlock_t	vnic_lock;
74 static kmutex_t		vnic_mac_lock;
75 static uint_t		vnic_count;
76 
77 /* hash of VNICs (vnic_t's), keyed by VNIC id */
78 static mod_hash_t	*vnic_hash;
/* number of buckets requested for vnic_hash */
#define	VNIC_HASHSZ	64

/*
 * Convert a VNIC id into a mod_hash key.  The argument is parenthesized
 * so that the cast applies to the whole expression passed by the caller
 * and not only to its first operand.
 */
#define	VNIC_HASH_KEY(vnic_id)	((mod_hash_key_t)(uintptr_t)(vnic_id))
81 
82 /*
83  * Hash of underlying open MACs (vnic_mac_t's), keyed by the string
84  * "<device name><instance number>/<port number>".
85  */
86 static mod_hash_t	*vnic_mac_hash;
87 #define	VNIC_MAC_HASHSZ	64
88 
/*
 * Reference counting of vnic_mac_t's.  Both macros require the caller
 * to hold vnic_mac_lock.  Dropping the last reference frees the
 * vnic_mac_t through vnic_mac_free().
 *
 * The bodies are wrapped in do { } while (0) so that each macro expands
 * to exactly one statement and can be used safely in an unbraced
 * if/else.
 */
#define	VNIC_MAC_REFHOLD(va) do {		\
	ASSERT(MUTEX_HELD(&vnic_mac_lock));	\
	(va)->va_refs++;			\
	ASSERT((va)->va_refs != 0);		\
} while (0)

#define	VNIC_MAC_REFRELE(va) do {		\
	ASSERT(MUTEX_HELD(&vnic_mac_lock));	\
	ASSERT((va)->va_refs != 0);		\
	if (--((va)->va_refs) == 0)		\
		vnic_mac_free(va);		\
} while (0)
101 
102 static uchar_t vnic_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
103 
104 /* used by vnic_walker */
105 typedef struct vnic_info_state {
106 	datalink_id_t	vs_vnic_id;
107 	datalink_id_t	vs_linkid;
108 	boolean_t	vs_vnic_found;
109 	vnic_info_new_vnic_fn_t	vs_new_vnic_fn;
110 	void		*vs_fn_arg;
111 	int		vs_rc;
112 } vnic_info_state_t;
113 
114 #define	VNIC_M_CALLBACK_FLAGS	(MC_RESOURCES | MC_GETCAPAB)
115 
116 static mac_callbacks_t vnic_m_callbacks = {
117 	VNIC_M_CALLBACK_FLAGS,
118 	vnic_m_stat,
119 	vnic_m_start,
120 	vnic_m_stop,
121 	vnic_m_promisc,
122 	vnic_m_multicst,
123 	vnic_m_unicst,
124 	vnic_m_tx,
125 	vnic_m_resources,
126 	NULL,			/* m_ioctl */
127 	vnic_m_capab_get
128 };
129 
130 /* ARGSUSED */
131 static int
132 vnic_mac_ctor(void *buf, void *arg, int kmflag)
133 {
134 	vnic_mac_t *vnic_mac = buf;
135 
136 	bzero(vnic_mac, sizeof (vnic_mac_t));
137 	rw_init(&vnic_mac->va_bcast_grp_lock, NULL, RW_DRIVER, NULL);
138 	rw_init(&vnic_mac->va_promisc_lock, NULL, RW_DRIVER, NULL);
139 
140 	return (0);
141 }
142 
143 /* ARGSUSED */
144 static void
145 vnic_mac_dtor(void *buf, void *arg)
146 {
147 	vnic_mac_t *vnic_mac = buf;
148 
149 	rw_destroy(&vnic_mac->va_promisc_lock);
150 	rw_destroy(&vnic_mac->va_bcast_grp_lock);
151 }
152 
153 void
154 vnic_dev_init(void)
155 {
156 	vnic_cache = kmem_cache_create("vnic_cache",
157 	    sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
158 
159 	vnic_mac_cache = kmem_cache_create("vnic_mac_cache",
160 	    sizeof (vnic_mac_t), 0, vnic_mac_ctor, vnic_mac_dtor,
161 	    NULL, NULL, NULL, 0);
162 
163 	vnic_hash = mod_hash_create_idhash("vnic_hash",
164 	    VNIC_HASHSZ, mod_hash_null_valdtor);
165 
166 	vnic_mac_hash = mod_hash_create_idhash("vnic_mac_hash",
167 	    VNIC_MAC_HASHSZ, mod_hash_null_valdtor);
168 
169 	rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL);
170 
171 	mutex_init(&vnic_mac_lock, NULL, MUTEX_DEFAULT, NULL);
172 
173 	vnic_count = 0;
174 }
175 
176 void
177 vnic_dev_fini(void)
178 {
179 	ASSERT(vnic_count == 0);
180 
181 	mutex_destroy(&vnic_mac_lock);
182 	rw_destroy(&vnic_lock);
183 	mod_hash_destroy_idhash(vnic_mac_hash);
184 	mod_hash_destroy_idhash(vnic_hash);
185 	kmem_cache_destroy(vnic_mac_cache);
186 	kmem_cache_destroy(vnic_cache);
187 }
188 
189 uint_t
190 vnic_dev_count(void)
191 {
192 	return (vnic_count);
193 }
194 
195 static int
196 vnic_mac_open(datalink_id_t linkid, vnic_mac_t **vmp)
197 {
198 	int err;
199 	vnic_mac_t *vnic_mac = NULL;
200 	const mac_info_t *mip;
201 
202 	*vmp = NULL;
203 
204 	mutex_enter(&vnic_mac_lock);
205 
206 	err = mod_hash_find(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid,
207 	    (mod_hash_val_t *)&vnic_mac);
208 	if (err == 0) {
209 		/* this MAC is already opened, increment reference count */
210 		VNIC_MAC_REFHOLD(vnic_mac);
211 		mutex_exit(&vnic_mac_lock);
212 		*vmp = vnic_mac;
213 		return (0);
214 	}
215 
216 	vnic_mac = kmem_cache_alloc(vnic_mac_cache, KM_SLEEP);
217 	if ((err = mac_open_by_linkid(linkid, &vnic_mac->va_mh)) != 0) {
218 		vnic_mac->va_mh = NULL;
219 		goto bail;
220 	}
221 
222 	/*
223 	 * For now, we do not support VNICs over legacy drivers.  This will
224 	 * soon be changed.
225 	 */
226 	if (mac_is_legacy(vnic_mac->va_mh)) {
227 		err = ENOTSUP;
228 		goto bail;
229 	}
230 
231 	/* only ethernet support, for now */
232 	mip = mac_info(vnic_mac->va_mh);
233 	if (mip->mi_media != DL_ETHER) {
234 		err = ENOTSUP;
235 		goto bail;
236 	}
237 	if (mip->mi_media != mip->mi_nativemedia) {
238 		err = ENOTSUP;
239 		goto bail;
240 	}
241 
242 	vnic_mac->va_linkid = linkid;
243 
244 	/* add entry to hash table */
245 	err = mod_hash_insert(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid,
246 	    (mod_hash_val_t)vnic_mac);
247 	ASSERT(err == 0);
248 
249 	/* initialize the flow table associated with lower MAC */
250 	vnic_mac->va_addr_len = ETHERADDRL;
251 	(void) vnic_classifier_flow_tab_init(vnic_mac, vnic_mac->va_addr_len,
252 	    KM_SLEEP);
253 
254 	vnic_mac->va_txinfo = mac_vnic_tx_get(vnic_mac->va_mh);
255 	vnic_mac->va_notify_hdl = mac_notify_add(vnic_mac->va_mh,
256 	    vnic_notify_cb, vnic_mac);
257 
258 	VNIC_MAC_REFHOLD(vnic_mac);
259 	*vmp = vnic_mac;
260 	mutex_exit(&vnic_mac_lock);
261 	return (0);
262 
263 bail:
264 	if (vnic_mac != NULL) {
265 		if (vnic_mac->va_mh != NULL)
266 			mac_close(vnic_mac->va_mh);
267 		kmem_cache_free(vnic_mac_cache, vnic_mac);
268 	}
269 	mutex_exit(&vnic_mac_lock);
270 	return (err);
271 }
272 
273 /*
274  * Create a new flow for the active MAC client sharing the NIC
275  * with the VNICs. This allows the unicast packets for that NIC
276  * to be classified and passed up to the active MAC client. It
277  * also allows packets sent from a VNIC to the active link to
278  * be classified by the VNIC transmit function and delivered via
279  * the MAC module locally. Returns B_TRUE on success, B_FALSE on
280  * failure.
281  */
282 static int
283 vnic_init_active_rx(vnic_mac_t *vnic_mac)
284 {
285 	uchar_t nic_mac_addr[MAXMACADDRLEN];
286 
287 	if (vnic_mac->va_active_flow != NULL)
288 		return (B_TRUE);
289 
290 	mac_unicst_get(vnic_mac->va_mh, nic_mac_addr);
291 
292 	vnic_mac->va_active_flow = vnic_classifier_flow_create(
293 	    vnic_mac->va_addr_len, nic_mac_addr, NULL, B_TRUE, KM_SLEEP);
294 
295 	vnic_classifier_flow_add(vnic_mac, vnic_mac->va_active_flow,
296 	    (vnic_rx_fn_t)mac_active_rx, vnic_mac->va_mh, NULL);
297 	return (B_TRUE);
298 }
299 
300 static void
301 vnic_fini_active_rx(vnic_mac_t *vnic_mac)
302 {
303 	if (vnic_mac->va_active_flow == NULL)
304 		return;
305 
306 	vnic_classifier_flow_remove(vnic_mac, vnic_mac->va_active_flow);
307 	vnic_classifier_flow_destroy(vnic_mac->va_active_flow);
308 	vnic_mac->va_active_flow = NULL;
309 }
310 
311 static void
312 vnic_update_active_rx(vnic_mac_t *vnic_mac)
313 {
314 	if (vnic_mac->va_active_flow == NULL)
315 		return;
316 
317 	vnic_fini_active_rx(vnic_mac);
318 	(void) vnic_init_active_rx(vnic_mac);
319 }
320 
321 /*
322  * Copy an mblk, preserving its hardware checksum flags.
323  */
324 mblk_t *
325 vnic_copymsg_cksum(mblk_t *mp)
326 {
327 	mblk_t *mp1;
328 	uint32_t start, stuff, end, value, flags;
329 
330 	mp1 = copymsg(mp);
331 	if (mp1 == NULL)
332 		return (NULL);
333 
334 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
335 	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
336 	    flags, KM_NOSLEEP);
337 
338 	return (mp1);
339 }
340 
341 /*
342  * Copy an mblk chain, presenting the hardware checksum flags of the
343  * individual mblks.
344  */
345 mblk_t *
346 vnic_copymsgchain_cksum(mblk_t *mp)
347 {
348 	mblk_t *nmp = NULL;
349 	mblk_t **nmpp = &nmp;
350 
351 	for (; mp != NULL; mp = mp->b_next) {
352 		if ((*nmpp = vnic_copymsg_cksum(mp)) == NULL) {
353 			freemsgchain(nmp);
354 			return (NULL);
355 		}
356 
357 		nmpp = &((*nmpp)->b_next);
358 	}
359 
360 	return (nmp);
361 }
362 
363 
364 /*
365  * Process the specified mblk chain for proper handling of hardware
366  * checksum offload. This routine is invoked for loopback VNIC traffic.
367  * The function handles a NULL mblk chain passed as argument.
368  */
369 mblk_t *
370 vnic_fix_cksum(mblk_t *mp_chain)
371 {
372 	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
373 	uint32_t flags, start, stuff, end, value;
374 
375 	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
376 		uint16_t len;
377 		uint32_t offset;
378 		struct ether_header *ehp;
379 		uint16_t sap;
380 
381 		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
382 		    &flags);
383 		if (flags == 0)
384 			continue;
385 
386 		/*
387 		 * Since the processing of checksum offload for loopback
388 		 * traffic requires modification of the packet contents,
389 		 * ensure sure that we are always modifying our own copy.
390 		 */
391 		if (DB_REF(mp) > 1) {
392 			mp1 = copymsg(mp);
393 			if (mp1 == NULL)
394 				continue;
395 			mp1->b_next = mp->b_next;
396 			mp->b_next = NULL;
397 			freemsg(mp);
398 			if (prev != NULL)
399 				prev->b_next = mp1;
400 			else
401 				new_chain = mp1;
402 			mp = mp1;
403 		}
404 
405 		/*
406 		 * Ethernet, and optionally VLAN header.
407 		 */
408 		/*LINTED*/
409 		ehp = (struct ether_header *)mp->b_rptr;
410 		if (ntohs(ehp->ether_type) == VLAN_TPID) {
411 			struct ether_vlan_header *evhp;
412 
413 			ASSERT(MBLKL(mp) >=
414 			    sizeof (struct ether_vlan_header));
415 			/*LINTED*/
416 			evhp = (struct ether_vlan_header *)mp->b_rptr;
417 			sap = ntohs(evhp->ether_type);
418 			offset = sizeof (struct ether_vlan_header);
419 		} else {
420 			sap = ntohs(ehp->ether_type);
421 			offset = sizeof (struct ether_header);
422 		}
423 
424 		if (MBLKL(mp) <= offset) {
425 			offset -= MBLKL(mp);
426 			if (mp->b_cont == NULL) {
427 				/* corrupted packet, skip it */
428 				if (prev != NULL)
429 					prev->b_next = mp->b_next;
430 				else
431 					new_chain = mp->b_next;
432 				mp1 = mp->b_next;
433 				mp->b_next = NULL;
434 				freemsg(mp);
435 				mp = mp1;
436 				continue;
437 			}
438 			mp = mp->b_cont;
439 		}
440 
441 		if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
442 			ipha_t *ipha = NULL;
443 
444 			/*
445 			 * In order to compute the full and header
446 			 * checksums, we need to find and parse
447 			 * the IP and/or ULP headers.
448 			 */
449 
450 			sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
451 
452 			/*
453 			 * IP header.
454 			 */
455 			if (sap != ETHERTYPE_IP)
456 				continue;
457 
458 			ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
459 			/*LINTED*/
460 			ipha = (ipha_t *)(mp->b_rptr + offset);
461 
462 			if (flags & HCK_FULLCKSUM) {
463 				ipaddr_t src, dst;
464 				uint32_t cksum;
465 				uint16_t *up;
466 				uint8_t proto;
467 
468 				/*
469 				 * Pointer to checksum field in ULP header.
470 				 */
471 				proto = ipha->ipha_protocol;
472 				ASSERT(ipha->ipha_version_and_hdr_length ==
473 				    IP_SIMPLE_HDR_VERSION);
474 				if (proto == IPPROTO_TCP) {
475 					/*LINTED*/
476 					up = IPH_TCPH_CHECKSUMP(ipha,
477 					    IP_SIMPLE_HDR_LENGTH);
478 				} else {
479 					ASSERT(proto == IPPROTO_UDP);
480 					/*LINTED*/
481 					up = IPH_UDPH_CHECKSUMP(ipha,
482 					    IP_SIMPLE_HDR_LENGTH);
483 				}
484 
485 				/*
486 				 * Pseudo-header checksum.
487 				 */
488 				src = ipha->ipha_src;
489 				dst = ipha->ipha_dst;
490 				len = ntohs(ipha->ipha_length) -
491 				    IP_SIMPLE_HDR_LENGTH;
492 
493 				cksum = (dst >> 16) + (dst & 0xFFFF) +
494 				    (src >> 16) + (src & 0xFFFF);
495 				cksum += htons(len);
496 
497 				/*
498 				 * The checksum value stored in the packet needs
499 				 * to be correct. Compute it here.
500 				 */
501 				*up = 0;
502 				cksum += (((proto) == IPPROTO_UDP) ?
503 				    IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
504 				cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
505 				    offset, cksum);
506 				*(up) = (uint16_t)(cksum ? cksum : ~cksum);
507 
508 				flags |= HCK_FULLCKSUM_OK;
509 				value = 0xffff;
510 			}
511 
512 			if (flags & HCK_IPV4_HDRCKSUM) {
513 				ASSERT(ipha != NULL);
514 				ipha->ipha_hdr_checksum =
515 				    (uint16_t)ip_csum_hdr(ipha);
516 			}
517 		}
518 
519 		if (flags & HCK_PARTIALCKSUM) {
520 			uint16_t *up, partial, cksum;
521 			uchar_t *ipp; /* ptr to beginning of IP header */
522 
523 			if (mp->b_cont != NULL) {
524 				mblk_t *mp1;
525 
526 				mp1 = msgpullup(mp, offset + end);
527 				if (mp1 == NULL)
528 					continue;
529 				mp1->b_next = mp->b_next;
530 				mp->b_next = NULL;
531 				freemsg(mp);
532 				if (prev != NULL)
533 					prev->b_next = mp1;
534 				else
535 					new_chain = mp1;
536 				mp = mp1;
537 			}
538 
539 			ipp = mp->b_rptr + offset;
540 			/*LINTED*/
541 			up = (uint16_t *)((uchar_t *)ipp + stuff);
542 			partial = *up;
543 			*up = 0;
544 
545 			cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
546 			    end - start, partial);
547 			cksum = ~cksum;
548 			*up = cksum ? cksum : ~cksum;
549 
550 			/*
551 			 * Since we already computed the whole checksum,
552 			 * indicate to the stack that it has already
553 			 * been verified by the hardware.
554 			 */
555 			flags &= ~HCK_PARTIALCKSUM;
556 			flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK);
557 			value = 0xffff;
558 		}
559 
560 		(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
561 		    value, flags, KM_NOSLEEP);
562 	}
563 
564 	return (new_chain);
565 }
566 
567 static void
568 vnic_mac_close(vnic_mac_t *vnic_mac)
569 {
570 	mutex_enter(&vnic_mac_lock);
571 	VNIC_MAC_REFRELE(vnic_mac);
572 	mutex_exit(&vnic_mac_lock);
573 }
574 
575 static void
576 vnic_mac_free(vnic_mac_t *vnic_mac)
577 {
578 	mod_hash_val_t val;
579 
580 	ASSERT(MUTEX_HELD(&vnic_mac_lock));
581 	vnic_fini_active_rx(vnic_mac);
582 	mac_notify_remove(vnic_mac->va_mh, vnic_mac->va_notify_hdl);
583 	if (vnic_mac->va_mac_set) {
584 		vnic_mac->va_mac_set = B_FALSE;
585 		mac_vnic_clear(vnic_mac->va_mh);
586 	}
587 	vnic_classifier_flow_tab_fini(vnic_mac);
588 	mac_close(vnic_mac->va_mh);
589 
590 	(void) mod_hash_remove(vnic_mac_hash,
591 	    (mod_hash_key_t)(uintptr_t)vnic_mac->va_linkid, &val);
592 	ASSERT(vnic_mac == (vnic_mac_t *)val);
593 
594 	kmem_cache_free(vnic_mac_cache, vnic_mac);
595 }
596 
597 /*
598  * Initial VNIC receive routine. Invoked for packets that are steered
599  * to a VNIC but the VNIC has not been started yet.
600  */
601 /* ARGSUSED */
602 static void
603 vnic_rx_initial(void *arg1, void *arg2, mblk_t *mp_chain)
604 {
605 	vnic_t *vnic = arg1;
606 	mblk_t *mp;
607 
608 	/* update stats */
609 	for (mp = mp_chain; mp != NULL; mp = mp->b_next)
610 		vnic->vn_stat_ierrors++;
611 	freemsgchain(mp_chain);
612 }
613 
614 /*
615  * VNIC receive routine invoked after the classifier for the VNIC
616  * has been initialized and the VNIC has been started.
617  */
618 /* ARGSUSED */
619 void
620 vnic_rx(void *arg1, void *arg2, mblk_t *mp_chain)
621 {
622 	vnic_t *vnic = arg1;
623 	mblk_t *mp;
624 
625 	/* update stats */
626 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
627 		vnic->vn_stat_ipackets++;
628 		vnic->vn_stat_rbytes += msgdsize(mp);
629 	}
630 
631 	/* pass packet up */
632 	mac_rx(vnic->vn_mh, NULL, mp_chain);
633 }
634 
635 /*
636  * Routine to create a MAC-based VNIC. Adds the passed MAC address
637  * to an unused slot in the NIC if one is available. Otherwise it
638  * sets the NIC in promiscuous mode and assigns the MAC address to
639  * a Rx ring if available or a soft ring.
640  */
641 static int
642 vnic_add_unicstaddr(vnic_t *vnic, mac_multi_addr_t *maddr)
643 {
644 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
645 	int err;
646 
647 	if (mac_unicst_verify(vnic_mac->va_mh, maddr->mma_addr,
648 	    maddr->mma_addrlen) == B_FALSE)
649 		return (EINVAL);
650 
651 	if (mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_MULTIADDRESS,
652 	    &(vnic->vn_mma_capab))) {
653 		if (vnic->vn_maddr_naddrfree == 0) {
654 			/*
655 			 * No free address slots available.
656 			 * Enable promiscuous mode.
657 			 */
658 			goto set_promisc;
659 		}
660 
661 		err = vnic->vn_maddr_add(vnic->vn_maddr_handle, maddr);
662 		if (err != 0) {
663 			if (err == ENOSPC) {
664 				/*
665 				 * There was a race to add addresses
666 				 * with other multiple address consumers,
667 				 * and we lost out. Use promisc mode.
668 				 */
669 				goto set_promisc;
670 			}
671 
672 			return (err);
673 		}
674 
675 		vnic->vn_slot_id = maddr->mma_slot;
676 		vnic->vn_multi_mac = B_TRUE;
677 	} else {
678 		/*
679 		 * Either multiple MAC address support is not
680 		 * available or all available addresses have
681 		 * been used up.
682 		 */
683 	set_promisc:
684 		if ((err = mac_promisc_set(vnic_mac->va_mh, B_TRUE,
685 		    MAC_DEVPROMISC)) != 0) {
686 			return (err);
687 		}
688 
689 		vnic->vn_promisc_mac = B_TRUE;
690 	}
691 	return (err);
692 }
693 
694 /*
695  * VNIC is getting deleted. Remove the MAC address from the slot.
696  * If promiscuous mode was being used, then unset the promiscuous mode.
697  */
698 static int
699 vnic_remove_unicstaddr(vnic_t *vnic)
700 {
701 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
702 	int err;
703 
704 	if (vnic->vn_multi_mac) {
705 		ASSERT(vnic->vn_promisc_mac == B_FALSE);
706 		err = vnic->vn_maddr_remove(vnic->vn_maddr_handle,
707 		    vnic->vn_slot_id);
708 		vnic->vn_multi_mac = B_FALSE;
709 	}
710 
711 	if (vnic->vn_promisc_mac) {
712 		ASSERT(vnic->vn_multi_mac == B_FALSE);
713 		err = mac_promisc_set(vnic_mac->va_mh, B_FALSE, MAC_DEVPROMISC);
714 		vnic->vn_promisc_mac = B_FALSE;
715 	}
716 
717 	return (err);
718 }
719 
720 /*
721  * Create a new VNIC upon request from administrator.
722  * Returns 0 on success, an errno on failure.
723  */
724 int
725 vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len,
726     uchar_t *mac_addr)
727 {
728 	vnic_t *vnic = NULL;
729 	mac_register_t *mac;
730 	int err;
731 	vnic_mac_t *vnic_mac;
732 	mac_multi_addr_t maddr;
733 	mac_txinfo_t tx_info;
734 
735 	if (mac_len != ETHERADDRL) {
736 		/* currently only ethernet NICs are supported */
737 		return (EINVAL);
738 	}
739 
740 	rw_enter(&vnic_lock, RW_WRITER);
741 
742 	/* does a VNIC with the same id already exist? */
743 	err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
744 	    (mod_hash_val_t *)&vnic);
745 	if (err == 0) {
746 		rw_exit(&vnic_lock);
747 		return (EEXIST);
748 	}
749 
750 	vnic = kmem_cache_alloc(vnic_cache, KM_NOSLEEP);
751 	if (vnic == NULL) {
752 		rw_exit(&vnic_lock);
753 		return (ENOMEM);
754 	}
755 
756 	/* open underlying MAC */
757 	err = vnic_mac_open(linkid, &vnic_mac);
758 	if (err != 0) {
759 		kmem_cache_free(vnic_cache, vnic);
760 		rw_exit(&vnic_lock);
761 		return (err);
762 	}
763 
764 	bzero(vnic, sizeof (*vnic));
765 	vnic->vn_id = vnic_id;
766 	vnic->vn_vnic_mac = vnic_mac;
767 
768 	vnic->vn_started = B_FALSE;
769 	vnic->vn_promisc = B_FALSE;
770 	vnic->vn_multi_mac = B_FALSE;
771 	vnic->vn_bcast_grp = B_FALSE;
772 
773 	/* set the VNIC MAC address */
774 	maddr.mma_addrlen = mac_len;
775 	maddr.mma_slot = 0;
776 	maddr.mma_flags = 0;
777 	bcopy(mac_addr, maddr.mma_addr, mac_len);
778 	if ((err = vnic_add_unicstaddr(vnic, &maddr)) != 0)
779 		goto bail;
780 	bcopy(mac_addr, vnic->vn_addr, mac_len);
781 
782 	/* set the initial VNIC capabilities */
783 	if (!mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_HCKSUM,
784 	    &vnic->vn_hcksum_txflags))
785 		vnic->vn_hcksum_txflags = 0;
786 
787 	/* register with the MAC module */
788 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
789 		goto bail;
790 
791 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
792 	mac->m_driver = vnic;
793 	mac->m_dip = vnic_get_dip();
794 	mac->m_instance = (uint_t)-1;
795 	mac->m_src_addr = vnic->vn_addr;
796 	mac->m_callbacks = &vnic_m_callbacks;
797 
798 	mac_sdu_get(vnic_mac->va_mh, &mac->m_min_sdu, &mac->m_max_sdu);
799 
800 	/*
801 	 * As the current margin size of the underlying mac is used to
802 	 * determine the margin size of the VNIC itself, request the
803 	 * underlying mac not to change to a smaller margin size.
804 	 */
805 	err = mac_margin_add(vnic_mac->va_mh, &(vnic->vn_margin), B_TRUE);
806 	if (err != 0)
807 		goto bail;
808 	mac->m_margin = vnic->vn_margin;
809 	err = mac_register(mac, &vnic->vn_mh);
810 	mac_free(mac);
811 	if (err != 0) {
812 		VERIFY(mac_margin_remove(vnic_mac->va_mh,
813 		    vnic->vn_margin) == 0);
814 		goto bail;
815 	}
816 
817 	if ((err = dls_devnet_create(vnic->vn_mh, vnic->vn_id)) != 0) {
818 		VERIFY(mac_margin_remove(vnic_mac->va_mh,
819 		    vnic->vn_margin) == 0);
820 		(void) mac_unregister(vnic->vn_mh);
821 		goto bail;
822 	}
823 
824 	/* add new VNIC to hash table */
825 	err = mod_hash_insert(vnic_hash, VNIC_HASH_KEY(vnic_id),
826 	    (mod_hash_val_t)vnic);
827 	ASSERT(err == 0);
828 	vnic_count++;
829 
830 	rw_exit(&vnic_lock);
831 
832 	/* Create a flow, initialized with the MAC address of the VNIC */
833 	if ((vnic->vn_flow_ent = vnic_classifier_flow_create(mac_len, mac_addr,
834 	    NULL, B_FALSE, KM_SLEEP)) == NULL) {
835 		(void) vnic_dev_delete(vnic_id);
836 		vnic = NULL;
837 		err = ENOMEM;
838 		goto bail_unlocked;
839 	}
840 
841 	vnic_classifier_flow_add(vnic_mac, vnic->vn_flow_ent, vnic_rx_initial,
842 	    vnic, vnic);
843 
844 	/* setup VNIC to receive broadcast packets */
845 	err = vnic_bcast_add(vnic, vnic_brdcst_mac, MAC_ADDRTYPE_BROADCAST);
846 	if (err != 0) {
847 		(void) vnic_dev_delete(vnic_id);
848 		vnic = NULL;
849 		goto bail_unlocked;
850 	}
851 	vnic->vn_bcast_grp = B_TRUE;
852 
853 	mutex_enter(&vnic_mac_lock);
854 	if (!vnic_mac->va_mac_set) {
855 		/*
856 		 * We want to MAC layer to call the VNIC tx outbound
857 		 * routine, so that local broadcast packets sent by
858 		 * the active interface sharing the underlying NIC (if
859 		 * any), can be broadcast to every VNIC.
860 		 */
861 		tx_info.mt_fn = vnic_active_tx;
862 		tx_info.mt_arg = vnic_mac;
863 		if (!mac_vnic_set(vnic_mac->va_mh, &tx_info,
864 		    vnic_m_capab_get, vnic)) {
865 			mutex_exit(&vnic_mac_lock);
866 			(void) vnic_dev_delete(vnic_id);
867 			vnic = NULL;
868 			err = EBUSY;
869 			goto bail_unlocked;
870 		}
871 		vnic_mac->va_mac_set = B_TRUE;
872 	}
873 	mutex_exit(&vnic_mac_lock);
874 
875 	/* allow passing packets to NIC's active MAC client */
876 	if (!vnic_init_active_rx(vnic_mac)) {
877 		(void) vnic_dev_delete(vnic_id);
878 		vnic = NULL;
879 		err = ENOMEM;
880 		goto bail_unlocked;
881 	}
882 
883 	return (0);
884 
885 bail:
886 	(void) vnic_remove_unicstaddr(vnic);
887 	vnic_mac_close(vnic_mac);
888 	rw_exit(&vnic_lock);
889 
890 bail_unlocked:
891 	if (vnic != NULL) {
892 		kmem_cache_free(vnic_cache, vnic);
893 	}
894 
895 	return (err);
896 }
897 
898 /*
899  * Modify the properties of an existing VNIC.
900  */
901 /* ARGSUSED */
902 int
903 vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask,
904     vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr)
905 {
906 	vnic_t *vnic = NULL;
907 	int rv = 0;
908 	boolean_t notify_mac_addr = B_FALSE;
909 
910 	rw_enter(&vnic_lock, RW_WRITER);
911 
912 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
913 	    (mod_hash_val_t *)&vnic) != 0) {
914 		rw_exit(&vnic_lock);
915 		return (ENOENT);
916 	}
917 
918 	if (modify_mask & VNIC_IOC_MODIFY_ADDR) {
919 		rv = vnic_modify_mac_addr(vnic, mac_len, mac_addr);
920 		if (rv == 0)
921 			notify_mac_addr = B_TRUE;
922 	}
923 
924 	rw_exit(&vnic_lock);
925 
926 	if (notify_mac_addr)
927 		mac_unicst_update(vnic->vn_mh, mac_addr);
928 
929 	return (rv);
930 }
931 
932 int
933 vnic_dev_delete(datalink_id_t vnic_id)
934 {
935 	vnic_t *vnic = NULL;
936 	mod_hash_val_t val;
937 	vnic_flow_t *flent;
938 	datalink_id_t tmpid;
939 	int rc;
940 	vnic_mac_t *vnic_mac;
941 
942 	rw_enter(&vnic_lock, RW_WRITER);
943 
944 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
945 	    (mod_hash_val_t *)&vnic) != 0) {
946 		rw_exit(&vnic_lock);
947 		return (ENOENT);
948 	}
949 
950 	if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid)) != 0) {
951 		rw_exit(&vnic_lock);
952 		return (rc);
953 	}
954 
955 	ASSERT(vnic_id == tmpid);
956 
957 	/*
958 	 * We cannot unregister the MAC yet. Unregistering would
959 	 * free up mac_impl_t which should not happen at this time.
960 	 * Packets could be entering vnic_rx() through the
961 	 * flow entry and so mac_impl_t cannot be NULL. So disable
962 	 * mac_impl_t by calling mac_disable(). This will prevent any
963 	 * new claims on mac_impl_t.
964 	 */
965 	if (mac_disable(vnic->vn_mh) != 0) {
966 		(void) dls_devnet_create(vnic->vn_mh, vnic_id);
967 		rw_exit(&vnic_lock);
968 		return (EBUSY);
969 	}
970 
971 	(void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val);
972 	ASSERT(vnic == (vnic_t *)val);
973 
974 	if (vnic->vn_bcast_grp)
975 		(void) vnic_bcast_delete(vnic, vnic_brdcst_mac);
976 
977 	flent = vnic->vn_flow_ent;
978 	if (flent != NULL) {
979 		/*
980 		 * vnic_classifier_flow_destroy() ensures that the
981 		 * flow is no longer used.
982 		 */
983 		vnic_classifier_flow_remove(vnic->vn_vnic_mac, flent);
984 		vnic_classifier_flow_destroy(flent);
985 	}
986 
987 	rc = mac_margin_remove(vnic->vn_vnic_mac->va_mh, vnic->vn_margin);
988 	ASSERT(rc == 0);
989 	rc = mac_unregister(vnic->vn_mh);
990 	ASSERT(rc == 0);
991 	(void) vnic_remove_unicstaddr(vnic);
992 	vnic_mac = vnic->vn_vnic_mac;
993 	kmem_cache_free(vnic_cache, vnic);
994 	vnic_count--;
995 	rw_exit(&vnic_lock);
996 	vnic_mac_close(vnic_mac);
997 	return (0);
998 }
999 
1000 /*
1001  * For the specified packet chain, return a sub-chain to be sent
1002  * and the transmit function to be used to send the packet. Also
1003  * return a pointer to the sub-chain of packets that should
1004  * be re-classified. If the function returns NULL, the packet
1005  * should be sent using the underlying NIC.
1006  */
1007 static vnic_flow_t *
1008 vnic_classify(vnic_mac_t *vnic_mac, mblk_t *mp, mblk_t **mp_chain_rest)
1009 {
1010 	vnic_flow_t *flow_ent;
1011 
1012 	/* one packet at a time */
1013 	*mp_chain_rest = mp->b_next;
1014 	mp->b_next = NULL;
1015 
1016 	/* do classification on the packet */
1017 	flow_ent = vnic_classifier_get_flow(vnic_mac, mp);
1018 
1019 	return (flow_ent);
1020 }
1021 
1022 /*
1023  * Send a packet chain to a local VNIC or an active MAC client.
1024  */
1025 static void
1026 vnic_local_tx(vnic_mac_t *vnic_mac, vnic_flow_t *flow_ent, mblk_t *mp_chain)
1027 {
1028 	mblk_t *mp1;
1029 	const vnic_flow_fn_info_t *fn_info;
1030 	vnic_t *vnic;
1031 
1032 	if (!vnic_classifier_is_active(flow_ent) &&
1033 	    mac_promisc_get(vnic_mac->va_mh, MAC_PROMISC)) {
1034 		/*
1035 		 * If the MAC is in promiscous mode,
1036 		 * send a copy of the active client.
1037 		 */
1038 		if ((mp1 = vnic_copymsgchain_cksum(mp_chain)) == NULL)
1039 			goto sendit;
1040 		if ((mp1 = vnic_fix_cksum(mp1)) == NULL)
1041 			goto sendit;
1042 		mac_active_rx(vnic_mac->va_mh, NULL, mp1);
1043 	}
1044 sendit:
1045 	fn_info = vnic_classifier_get_fn_info(flow_ent);
1046 	/*
1047 	 * If the vnic to which we would deliver this packet is in
1048 	 * promiscuous mode then it already received the packet via
1049 	 * vnic_promisc_rx().
1050 	 *
1051 	 * XXX assumes that ff_arg2 is a vnic_t pointer if it is
1052 	 * non-NULL (currently always true).
1053 	 */
1054 	vnic = (vnic_t *)fn_info->ff_arg2;
1055 	if ((vnic != NULL) && vnic->vn_promisc)
1056 		freemsg(mp_chain);
1057 	else if ((mp1 = vnic_fix_cksum(mp_chain)) != NULL)
1058 		(fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp1);
1059 }
1060 
1061 /*
1062  * This function is invoked when a MAC client needs to send a packet
1063  * to a NIC which is shared by VNICs. It is passed to the MAC layer
1064  * by a call to mac_vnic_set() when the NIC is opened, and is returned
1065  * to MAC clients by mac_tx_get() when VNICs are present.
1066  */
1067 mblk_t *
1068 vnic_active_tx(void *arg, mblk_t *mp_chain)
1069 {
1070 	vnic_mac_t *vnic_mac = arg;
1071 	mblk_t *mp, *extra_mp = NULL;
1072 	vnic_flow_t *flow_ent;
1073 	void *flow_cookie;
1074 	const mac_txinfo_t *mtp = vnic_mac->va_txinfo;
1075 
1076 	for (mp = mp_chain; mp != NULL; mp = extra_mp) {
1077 		mblk_t *next;
1078 
1079 		next = mp->b_next;
1080 		mp->b_next = NULL;
1081 
1082 		vnic_promisc_rx(vnic_mac, (vnic_t *)-1, mp);
1083 
1084 		flow_ent = vnic_classify(vnic_mac, mp, &extra_mp);
1085 		ASSERT(extra_mp == NULL);
1086 		extra_mp = next;
1087 
1088 		if (flow_ent != NULL) {
1089 			flow_cookie = vnic_classifier_get_client_cookie(
1090 			    flow_ent);
1091 			if (flow_cookie != NULL) {
1092 				/*
1093 				 * Send a copy to every VNIC defined on the
1094 				 * interface, as well as the underlying MAC.
1095 				 */
1096 				vnic_bcast_send(flow_cookie, (vnic_t *)-1, mp);
1097 			} else {
1098 				/*
1099 				 * loopback the packet to a local VNIC or
1100 				 * an active MAC client.
1101 				 */
1102 				vnic_local_tx(vnic_mac, flow_ent, mp);
1103 			}
1104 			VNIC_FLOW_REFRELE(flow_ent);
1105 			mp_chain = NULL;
1106 		} else {
1107 			/*
1108 			 * Non-VNIC destination, send via the underlying
1109 			 * NIC. In order to avoid a recursive call
1110 			 * to this function, we ensured that mtp points
1111 			 * to the unerlying NIC transmit function
1112 			 * by inilizating through mac_vnic_tx_get().
1113 			 */
1114 			mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
1115 			if (mp_chain != NULL)
1116 				break;
1117 		}
1118 	}
1119 
1120 	if ((mp_chain != NULL) && (extra_mp != NULL)) {
1121 		ASSERT(mp_chain->b_next == NULL);
1122 		mp_chain->b_next = extra_mp;
1123 	}
1124 	return (mp_chain);
1125 }
1126 
1127 /*
1128  * VNIC transmit function.
1129  */
1130 mblk_t *
1131 vnic_m_tx(void *arg, mblk_t *mp_chain)
1132 {
1133 	vnic_t *vnic = arg;
1134 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1135 	mblk_t *mp, *extra_mp = NULL;
1136 	vnic_flow_t *flow_ent;
1137 	void *flow_cookie;
1138 
1139 	/*
1140 	 * Update stats.
1141 	 */
1142 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
1143 		vnic->vn_stat_opackets++;
1144 		vnic->vn_stat_obytes += msgdsize(mp);
1145 	}
1146 
1147 	for (mp = mp_chain; mp != NULL; mp = extra_mp) {
1148 		mblk_t *next;
1149 
1150 		next = mp->b_next;
1151 		mp->b_next = NULL;
1152 
1153 		vnic_promisc_rx(vnic->vn_vnic_mac, vnic, mp);
1154 
1155 		flow_ent = vnic_classify(vnic->vn_vnic_mac, mp, &extra_mp);
1156 		ASSERT(extra_mp == NULL);
1157 		extra_mp = next;
1158 
1159 		if (flow_ent != NULL) {
1160 			flow_cookie = vnic_classifier_get_client_cookie(
1161 			    flow_ent);
1162 			if (flow_cookie != NULL) {
1163 				/*
1164 				 * The vnic_bcast_send function expects
1165 				 * to receive the sender VNIC as value
1166 				 * for arg2.
1167 				 */
1168 				vnic_bcast_send(flow_cookie, vnic, mp);
1169 			} else {
1170 				/*
1171 				 * loopback the packet to a local VNIC or
1172 				 * an active MAC client.
1173 				 */
1174 				vnic_local_tx(vnic_mac, flow_ent, mp);
1175 			}
1176 			VNIC_FLOW_REFRELE(flow_ent);
1177 			mp_chain = NULL;
1178 		} else {
1179 			/*
1180 			 * Non-local destination, send via the underlying
1181 			 * NIC.
1182 			 */
1183 			const mac_txinfo_t *mtp = vnic->vn_txinfo;
1184 			mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
1185 			if (mp_chain != NULL)
1186 				break;
1187 		}
1188 	}
1189 
1190 	/* update stats to account for unsent packets */
1191 	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
1192 		vnic->vn_stat_opackets--;
1193 		vnic->vn_stat_obytes -= msgdsize(mp);
1194 		vnic->vn_stat_oerrors++;
1195 		/*
1196 		 * link back in the last portion not counted due to bandwidth
1197 		 * control.
1198 		 */
1199 		if (mp->b_next == NULL) {
1200 			mp->b_next = extra_mp;
1201 			break;
1202 		}
1203 	}
1204 
1205 	return (mp_chain);
1206 }
1207 
/*
 * Resources entry point.  Intentionally empty: this driver advertises
 * no resources.
 */
/* ARGSUSED */
static void
vnic_m_resources(void *arg)
{
}
1214 
1215 static int
1216 vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
1217 {
1218 	vnic_t *vnic = arg;
1219 	int rval = 0;
1220 
1221 	rw_enter(&vnic_lock, RW_READER);
1222 
1223 	switch (stat) {
1224 	case ETHER_STAT_LINK_DUPLEX:
1225 		*val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
1226 		    ETHER_STAT_LINK_DUPLEX);
1227 		break;
1228 	case MAC_STAT_IFSPEED:
1229 		*val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
1230 		    MAC_STAT_IFSPEED);
1231 		break;
1232 	case MAC_STAT_MULTIRCV:
1233 		*val = vnic->vn_stat_multircv;
1234 		break;
1235 	case MAC_STAT_BRDCSTRCV:
1236 		*val = vnic->vn_stat_brdcstrcv;
1237 		break;
1238 	case MAC_STAT_MULTIXMT:
1239 		*val = vnic->vn_stat_multixmt;
1240 		break;
1241 	case MAC_STAT_BRDCSTXMT:
1242 		*val = vnic->vn_stat_brdcstxmt;
1243 		break;
1244 	case MAC_STAT_IERRORS:
1245 		*val = vnic->vn_stat_ierrors;
1246 		break;
1247 	case MAC_STAT_OERRORS:
1248 		*val = vnic->vn_stat_oerrors;
1249 		break;
1250 	case MAC_STAT_RBYTES:
1251 		*val = vnic->vn_stat_rbytes;
1252 		break;
1253 	case MAC_STAT_IPACKETS:
1254 		*val = vnic->vn_stat_ipackets;
1255 		break;
1256 	case MAC_STAT_OBYTES:
1257 		*val = vnic->vn_stat_obytes;
1258 		break;
1259 	case MAC_STAT_OPACKETS:
1260 		*val = vnic->vn_stat_opackets;
1261 		break;
1262 	default:
1263 		rval = ENOTSUP;
1264 	}
1265 
1266 	rw_exit(&vnic_lock);
1267 	return (rval);
1268 }
1269 
1270 /*
1271  * Return information about the specified capability.
1272  */
1273 /* ARGSUSED */
1274 static boolean_t
1275 vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
1276 {
1277 	vnic_t *vnic = arg;
1278 
1279 	switch (cap) {
1280 	case MAC_CAPAB_POLL:
1281 		return (B_TRUE);
1282 	case MAC_CAPAB_HCKSUM: {
1283 		uint32_t *hcksum_txflags = cap_data;
1284 
1285 		*hcksum_txflags = vnic->vn_hcksum_txflags &
1286 		    (HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM |
1287 		    HCKSUM_INET_PARTIAL);
1288 		break;
1289 	}
1290 	default:
1291 		return (B_FALSE);
1292 	}
1293 	return (B_TRUE);
1294 }
1295 
1296 static int
1297 vnic_m_start(void *arg)
1298 {
1299 	vnic_t *vnic = arg;
1300 	mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
1301 	int rc;
1302 
1303 	rc = mac_start(lower_mh);
1304 	if (rc != 0)
1305 		return (rc);
1306 
1307 	vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx, vnic, vnic);
1308 	return (0);
1309 }
1310 
1311 static void
1312 vnic_m_stop(void *arg)
1313 {
1314 	vnic_t *vnic = arg;
1315 	mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh;
1316 
1317 	vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx_initial,
1318 	    vnic, vnic);
1319 	mac_stop(lower_mh);
1320 }
1321 
1322 /* ARGSUSED */
1323 static int
1324 vnic_m_promisc(void *arg, boolean_t on)
1325 {
1326 	vnic_t *vnic = arg;
1327 
1328 	return (vnic_promisc_set(vnic, on));
1329 }
1330 
1331 static int
1332 vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
1333 {
1334 	vnic_t *vnic = arg;
1335 	int rc = 0;
1336 
1337 	if (add)
1338 		rc = vnic_bcast_add(vnic, addrp, MAC_ADDRTYPE_MULTICAST);
1339 	else
1340 		vnic_bcast_delete(vnic, addrp);
1341 
1342 	return (rc);
1343 }
1344 
1345 static int
1346 vnic_m_unicst(void *arg, const uint8_t *mac_addr)
1347 {
1348 	vnic_t *vnic = arg;
1349 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1350 	int rv;
1351 
1352 	rw_enter(&vnic_lock, RW_WRITER);
1353 	rv = vnic_modify_mac_addr(vnic, vnic_mac->va_addr_len,
1354 	    (uchar_t *)mac_addr);
1355 	rw_exit(&vnic_lock);
1356 
1357 	if (rv == 0)
1358 		mac_unicst_update(vnic->vn_mh, mac_addr);
1359 	return (0);
1360 }
1361 
1362 int
1363 vnic_info(uint_t *nvnics, datalink_id_t vnic_id, datalink_id_t linkid,
1364     void *fn_arg, vnic_info_new_vnic_fn_t new_vnic_fn)
1365 {
1366 	vnic_info_state_t state;
1367 	int rc = 0;
1368 
1369 	rw_enter(&vnic_lock, RW_READER);
1370 
1371 	*nvnics = vnic_count;
1372 
1373 	bzero(&state, sizeof (state));
1374 	state.vs_vnic_id = vnic_id;
1375 	state.vs_linkid = linkid;
1376 	state.vs_new_vnic_fn = new_vnic_fn;
1377 	state.vs_fn_arg = fn_arg;
1378 
1379 	mod_hash_walk(vnic_hash, vnic_info_walker, &state);
1380 
1381 	if ((rc = state.vs_rc) == 0 && vnic_id != DATALINK_ALL_LINKID &&
1382 	    !state.vs_vnic_found)
1383 		rc = ENOENT;
1384 
1385 	rw_exit(&vnic_lock);
1386 	return (rc);
1387 }
1388 
1389 /*
1390  * Walker invoked when building a list of vnics that must be passed
1391  * up to user space.
1392  */
1393 /*ARGSUSED*/
1394 static uint_t
1395 vnic_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1396 {
1397 	vnic_t *vnic;
1398 	vnic_info_state_t *state = arg;
1399 
1400 	if (state->vs_rc != 0)
1401 		return (MH_WALK_TERMINATE);	/* terminate walk */
1402 
1403 	vnic = (vnic_t *)val;
1404 
1405 	if (state->vs_vnic_id != DATALINK_ALL_LINKID &&
1406 	    vnic->vn_id != state->vs_vnic_id) {
1407 		goto bail;
1408 	}
1409 
1410 	state->vs_vnic_found = B_TRUE;
1411 
1412 	state->vs_rc = state->vs_new_vnic_fn(state->vs_fn_arg,
1413 	    vnic->vn_id, vnic->vn_addr_type, vnic->vn_vnic_mac->va_addr_len,
1414 	    vnic->vn_addr, vnic->vn_vnic_mac->va_linkid);
1415 bail:
1416 	return ((state->vs_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
1417 }
1418 
1419 /*
1420  * vnic_notify_cb() and vnic_notify_walker() below are used to
1421  * process events received from an underlying NIC and, if needed,
1422  * forward these events to the VNICs defined on top of that NIC.
1423  */
1424 
1425 typedef struct vnic_notify_state {
1426 	mac_notify_type_t	vo_type;
1427 	vnic_mac_t		*vo_vnic_mac;
1428 } vnic_notify_state_t;
1429 
1430 /* ARGSUSED */
1431 static uint_t
1432 vnic_notify_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1433 {
1434 	vnic_t *vnic = (vnic_t *)val;
1435 	vnic_notify_state_t *state = arg;
1436 
1437 	/* ignore VNICs that don't use the specified underlying MAC */
1438 	if (vnic->vn_vnic_mac != state->vo_vnic_mac)
1439 		return (MH_WALK_CONTINUE);
1440 
1441 	switch (state->vo_type) {
1442 	case MAC_NOTE_TX:
1443 		mac_tx_update(vnic->vn_mh);
1444 		break;
1445 	case MAC_NOTE_LINK:
1446 		/*
1447 		 * The VNIC link state must be up regardless of
1448 		 * the link state of the underlying NIC to maintain
1449 		 * connectivity between VNICs on the same host.
1450 		 */
1451 		mac_link_update(vnic->vn_mh, LINK_STATE_UP);
1452 		break;
1453 	case MAC_NOTE_UNICST:
1454 		vnic_update_active_rx(vnic->vn_vnic_mac);
1455 		break;
1456 	case MAC_NOTE_VNIC:
1457 		/* only for clients which share a NIC with a VNIC */
1458 		break;
1459 	case MAC_NOTE_PROMISC:
1460 		mutex_enter(&vnic_mac_lock);
1461 		vnic->vn_vnic_mac->va_txinfo = mac_vnic_tx_get(
1462 		    vnic->vn_vnic_mac->va_mh);
1463 		mutex_exit(&vnic_mac_lock);
1464 		break;
1465 	}
1466 
1467 	return (MH_WALK_CONTINUE);
1468 }
1469 
1470 static void
1471 vnic_notify_cb(void *arg, mac_notify_type_t type)
1472 {
1473 	vnic_mac_t *vnic = arg;
1474 	vnic_notify_state_t state;
1475 
1476 	state.vo_type = type;
1477 	state.vo_vnic_mac = vnic;
1478 
1479 	rw_enter(&vnic_lock, RW_READER);
1480 	mod_hash_walk(vnic_hash, vnic_notify_walker, &state);
1481 	rw_exit(&vnic_lock);
1482 }
1483 
1484 static int
1485 vnic_modify_mac_addr(vnic_t *vnic, uint_t mac_len, uchar_t *mac_addr)
1486 {
1487 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1488 	vnic_flow_t *vnic_flow = vnic->vn_flow_ent;
1489 
1490 	ASSERT(RW_WRITE_HELD(&vnic_lock));
1491 
1492 	if (mac_len != vnic_mac->va_addr_len)
1493 		return (EINVAL);
1494 
1495 	vnic_classifier_flow_update_addr(vnic_flow, mac_addr);
1496 	return (0);
1497 }
1498 
1499 static int
1500 vnic_promisc_set(vnic_t *vnic, boolean_t on)
1501 {
1502 	vnic_mac_t *vnic_mac = vnic->vn_vnic_mac;
1503 	int r = -1;
1504 
1505 	if (vnic->vn_promisc == on)
1506 		return (0);
1507 
1508 	if (on) {
1509 		if ((r = mac_promisc_set(vnic_mac->va_mh, B_TRUE,
1510 		    MAC_DEVPROMISC)) != 0) {
1511 			return (r);
1512 		}
1513 
1514 		rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
1515 		vnic->vn_promisc_next = vnic_mac->va_promisc;
1516 		vnic_mac->va_promisc = vnic;
1517 		vnic_mac->va_promisc_gen++;
1518 
1519 		vnic->vn_promisc = B_TRUE;
1520 		rw_exit(&vnic_mac->va_promisc_lock);
1521 
1522 		return (0);
1523 	} else {
1524 		vnic_t *loop, *prev = NULL;
1525 
1526 		rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER);
1527 		loop = vnic_mac->va_promisc;
1528 
1529 		while ((loop != NULL) && (loop != vnic)) {
1530 			prev = loop;
1531 			loop = loop->vn_promisc_next;
1532 		}
1533 
1534 		if ((loop != NULL) &&
1535 		    ((r = mac_promisc_set(vnic_mac->va_mh, B_FALSE,
1536 		    MAC_DEVPROMISC)) == 0)) {
1537 			if (prev != NULL)
1538 				prev->vn_promisc_next = loop->vn_promisc_next;
1539 			else
1540 				vnic_mac->va_promisc = loop->vn_promisc_next;
1541 			vnic_mac->va_promisc_gen++;
1542 
1543 			vnic->vn_promisc = B_FALSE;
1544 		}
1545 		rw_exit(&vnic_mac->va_promisc_lock);
1546 
1547 		return (r);
1548 	}
1549 }
1550 
1551 void
1552 vnic_promisc_rx(vnic_mac_t *vnic_mac, vnic_t *sender, mblk_t *mp)
1553 {
1554 	vnic_t *loop;
1555 	vnic_flow_t *flow;
1556 	const vnic_flow_fn_info_t *fn_info;
1557 	mac_header_info_t hdr_info;
1558 	boolean_t dst_must_match = B_TRUE;
1559 
1560 	ASSERT(mp->b_next == NULL);
1561 
1562 	rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
1563 	if (vnic_mac->va_promisc == NULL)
1564 		goto done;
1565 
1566 	if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0)
1567 		goto done;
1568 
1569 	/*
1570 	 * If this is broadcast or multicast then the destination
1571 	 * address need not match for us to deliver it.
1572 	 */
1573 	if ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
1574 	    (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST))
1575 		dst_must_match = B_FALSE;
1576 
1577 	for (loop = vnic_mac->va_promisc;
1578 	    loop != NULL;
1579 	    loop = loop->vn_promisc_next) {
1580 		if (loop == sender)
1581 			continue;
1582 
1583 		if (dst_must_match &&
1584 		    (bcmp(hdr_info.mhi_daddr, loop->vn_addr,
1585 		    sizeof (loop->vn_addr)) != 0))
1586 			continue;
1587 
1588 		flow = loop->vn_flow_ent;
1589 		ASSERT(flow != NULL);
1590 
1591 		if (!flow->vf_is_active) {
1592 			mblk_t *copy;
1593 			uint64_t gen;
1594 
1595 			if ((copy = vnic_copymsg_cksum(mp)) == NULL)
1596 				break;
1597 			if ((sender != NULL) &&
1598 			    ((copy = vnic_fix_cksum(copy)) == NULL))
1599 				break;
1600 
1601 			VNIC_FLOW_REFHOLD(flow);
1602 			gen = vnic_mac->va_promisc_gen;
1603 			rw_exit(&vnic_mac->va_promisc_lock);
1604 
1605 			fn_info = vnic_classifier_get_fn_info(flow);
1606 			(fn_info->ff_fn)(fn_info->ff_arg1,
1607 			    fn_info->ff_arg2, copy);
1608 
1609 			VNIC_FLOW_REFRELE(flow);
1610 			rw_enter(&vnic_mac->va_promisc_lock, RW_READER);
1611 			if (vnic_mac->va_promisc_gen != gen)
1612 				break;
1613 		}
1614 	}
1615 done:
1616 	rw_exit(&vnic_mac->va_promisc_lock);
1617 }
1618