xref: /titanic_44/usr/src/uts/common/io/vnic/vnic_dev.c (revision 45405cce0657d01714b3d014a0facf3bdce45736)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2014, Joyent, Inc.  All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/cred.h>
28 #include <sys/sysmacros.h>
29 #include <sys/conf.h>
30 #include <sys/cmn_err.h>
31 #include <sys/list.h>
32 #include <sys/ksynch.h>
33 #include <sys/kmem.h>
34 #include <sys/stream.h>
35 #include <sys/modctl.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/atomic.h>
39 #include <sys/stat.h>
40 #include <sys/modhash.h>
41 #include <sys/strsubr.h>
42 #include <sys/strsun.h>
43 #include <sys/dlpi.h>
44 #include <sys/mac.h>
45 #include <sys/mac_provider.h>
46 #include <sys/mac_client.h>
47 #include <sys/mac_client_priv.h>
48 #include <sys/mac_ether.h>
49 #include <sys/dls.h>
50 #include <sys/pattr.h>
51 #include <sys/time.h>
52 #include <sys/vlan.h>
53 #include <sys/vnic.h>
54 #include <sys/vnic_impl.h>
55 #include <sys/mac_flow_impl.h>
56 #include <inet/ip_impl.h>
57 
58 /*
59  * Note that for best performance, the VNIC is a passthrough design.
60  * For each VNIC corresponds a MAC client of the underlying MAC (lower MAC).
61  * This MAC client is opened by the VNIC driver at VNIC creation,
62  * and closed when the VNIC is deleted.
63  * When a MAC client of the VNIC itself opens a VNIC, the MAC layer
64  * (upper MAC) detects that the MAC being opened is a VNIC. Instead
65  * of allocating a new MAC client, it asks the VNIC driver to return
66  * the lower MAC client handle associated with the VNIC, and that handle
67  * is returned to the upper MAC client directly. This allows access
68  * by upper MAC clients of the VNIC to have direct access to the lower
69  * MAC client for the control path and data path.
70  *
71  * Due to this passthrough, some of the entry points exported by the
72  * VNIC driver are never directly invoked. These entry points include
73  * vnic_m_start, vnic_m_stop, vnic_m_promisc, vnic_m_multicst, etc.
74  *
75  * VNICs support multiple upper mac clients to enable support for
76  * multiple MAC addresses on the VNIC. When the VNIC is created the
77  * initial mac client is the primary upper mac. Any additional mac
78  * clients are secondary macs.
79  */
80 
81 static int vnic_m_start(void *);
82 static void vnic_m_stop(void *);
83 static int vnic_m_promisc(void *, boolean_t);
84 static int vnic_m_multicst(void *, boolean_t, const uint8_t *);
85 static int vnic_m_unicst(void *, const uint8_t *);
86 static int vnic_m_stat(void *, uint_t, uint64_t *);
87 static void vnic_m_ioctl(void *, queue_t *, mblk_t *);
88 static int vnic_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
89     const void *);
90 static int vnic_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
91 static void vnic_m_propinfo(void *, const char *, mac_prop_id_t,
92     mac_prop_info_handle_t);
93 static mblk_t *vnic_m_tx(void *, mblk_t *);
94 static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *);
95 static void vnic_notify_cb(void *, mac_notify_type_t);
96 static void vnic_cleanup_secondary_macs(vnic_t *, int);
97 
98 static kmem_cache_t	*vnic_cache;
99 static krwlock_t	vnic_lock;
100 static uint_t		vnic_count;
101 
102 #define	ANCHOR_VNIC_MIN_MTU	576
103 #define	ANCHOR_VNIC_MAX_MTU	9000
104 
105 /* hash of VNICs (vnic_t's), keyed by VNIC id */
106 static mod_hash_t	*vnic_hash;
107 #define	VNIC_HASHSZ	64
108 #define	VNIC_HASH_KEY(vnic_id)	((mod_hash_key_t)(uintptr_t)vnic_id)
109 
110 #define	VNIC_M_CALLBACK_FLAGS	\
111 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
112 
113 static mac_callbacks_t vnic_m_callbacks = {
114 	VNIC_M_CALLBACK_FLAGS,
115 	vnic_m_stat,
116 	vnic_m_start,
117 	vnic_m_stop,
118 	vnic_m_promisc,
119 	vnic_m_multicst,
120 	vnic_m_unicst,
121 	vnic_m_tx,
122 	NULL,
123 	vnic_m_ioctl,
124 	vnic_m_capab_get,
125 	NULL,
126 	NULL,
127 	vnic_m_setprop,
128 	vnic_m_getprop,
129 	vnic_m_propinfo
130 };
131 
132 void
133 vnic_dev_init(void)
134 {
135 	vnic_cache = kmem_cache_create("vnic_cache",
136 	    sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
137 
138 	vnic_hash = mod_hash_create_idhash("vnic_hash",
139 	    VNIC_HASHSZ, mod_hash_null_valdtor);
140 
141 	rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL);
142 
143 	vnic_count = 0;
144 }
145 
146 void
147 vnic_dev_fini(void)
148 {
149 	ASSERT(vnic_count == 0);
150 
151 	rw_destroy(&vnic_lock);
152 	mod_hash_destroy_idhash(vnic_hash);
153 	kmem_cache_destroy(vnic_cache);
154 }
155 
156 uint_t
157 vnic_dev_count(void)
158 {
159 	return (vnic_count);
160 }
161 
162 static vnic_ioc_diag_t
163 vnic_mac2vnic_diag(mac_diag_t diag)
164 {
165 	switch (diag) {
166 	case MAC_DIAG_MACADDR_NIC:
167 		return (VNIC_IOC_DIAG_MACADDR_NIC);
168 	case MAC_DIAG_MACADDR_INUSE:
169 		return (VNIC_IOC_DIAG_MACADDR_INUSE);
170 	case MAC_DIAG_MACADDR_INVALID:
171 		return (VNIC_IOC_DIAG_MACADDR_INVALID);
172 	case MAC_DIAG_MACADDRLEN_INVALID:
173 		return (VNIC_IOC_DIAG_MACADDRLEN_INVALID);
174 	case MAC_DIAG_MACFACTORYSLOTINVALID:
175 		return (VNIC_IOC_DIAG_MACFACTORYSLOTINVALID);
176 	case MAC_DIAG_MACFACTORYSLOTUSED:
177 		return (VNIC_IOC_DIAG_MACFACTORYSLOTUSED);
178 	case MAC_DIAG_MACFACTORYSLOTALLUSED:
179 		return (VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED);
180 	case MAC_DIAG_MACFACTORYNOTSUP:
181 		return (VNIC_IOC_DIAG_MACFACTORYNOTSUP);
182 	case MAC_DIAG_MACPREFIX_INVALID:
183 		return (VNIC_IOC_DIAG_MACPREFIX_INVALID);
184 	case MAC_DIAG_MACPREFIXLEN_INVALID:
185 		return (VNIC_IOC_DIAG_MACPREFIXLEN_INVALID);
186 	case MAC_DIAG_MACNO_HWRINGS:
187 		return (VNIC_IOC_DIAG_NO_HWRINGS);
188 	default:
189 		return (VNIC_IOC_DIAG_NONE);
190 	}
191 }
192 
193 static int
194 vnic_unicast_add(vnic_t *vnic, vnic_mac_addr_type_t vnic_addr_type,
195     int *addr_slot, uint_t prefix_len, int *addr_len_ptr_arg,
196     uint8_t *mac_addr_arg, uint16_t flags, vnic_ioc_diag_t *diag,
197     uint16_t vid, boolean_t req_hwgrp_flag)
198 {
199 	mac_diag_t mac_diag;
200 	uint16_t mac_flags = 0;
201 	int err;
202 	uint_t addr_len;
203 
204 	if (flags & VNIC_IOC_CREATE_NODUPCHECK)
205 		mac_flags |= MAC_UNICAST_NODUPCHECK;
206 
207 	switch (vnic_addr_type) {
208 	case VNIC_MAC_ADDR_TYPE_FIXED:
209 	case VNIC_MAC_ADDR_TYPE_VRID:
210 		/*
211 		 * The MAC address value to assign to the VNIC
212 		 * is already provided in mac_addr_arg. addr_len_ptr_arg
213 		 * already contains the MAC address length.
214 		 */
215 		break;
216 
217 	case VNIC_MAC_ADDR_TYPE_RANDOM:
218 		/*
219 		 * Random MAC address. There are two sub-cases:
220 		 *
221 		 * 1 - If mac_len == 0, a new MAC address is generated.
222 		 *	The length of the MAC address to generated depends
223 		 *	on the type of MAC used. The prefix to use for the MAC
224 		 *	address is stored in the most significant bytes
225 		 *	of the mac_addr argument, and its length is specified
226 		 *	by the mac_prefix_len argument. This prefix can
227 		 *	correspond to a IEEE OUI in the case of Ethernet,
228 		 *	for example.
229 		 *
230 		 * 2 - If mac_len > 0, the address was already picked
231 		 *	randomly, and is now passed back during VNIC
232 		 *	re-creation. The mac_addr argument contains the MAC
233 		 *	address that was generated. We distinguish this
234 		 *	case from the fixed MAC address case, since we
235 		 *	want the user consumers to know, when they query
236 		 *	the list of VNICs, that a VNIC was assigned a
237 		 *	random MAC address vs assigned a fixed address
238 		 *	specified by the user.
239 		 */
240 
241 		/*
242 		 * If it's a pre-generated address, we're done. mac_addr_arg
243 		 * and addr_len_ptr_arg already contain the MAC address
244 		 * value and length.
245 		 */
246 		if (*addr_len_ptr_arg > 0)
247 			break;
248 
249 		/* generate a new random MAC address */
250 		if ((err = mac_addr_random(vnic->vn_mch,
251 		    prefix_len, mac_addr_arg, &mac_diag)) != 0) {
252 			*diag = vnic_mac2vnic_diag(mac_diag);
253 			return (err);
254 		}
255 		*addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh);
256 		break;
257 
258 	case VNIC_MAC_ADDR_TYPE_FACTORY:
259 		err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot);
260 		if (err != 0) {
261 			if (err == EINVAL)
262 				*diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID;
263 			if (err == EBUSY)
264 				*diag = VNIC_IOC_DIAG_MACFACTORYSLOTUSED;
265 			if (err == ENOSPC)
266 				*diag = VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED;
267 			return (err);
268 		}
269 
270 		mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot,
271 		    mac_addr_arg, &addr_len, NULL, NULL);
272 		*addr_len_ptr_arg = addr_len;
273 		break;
274 
275 	case VNIC_MAC_ADDR_TYPE_AUTO:
276 		/* first try to allocate a factory MAC address */
277 		err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot);
278 		if (err == 0) {
279 			mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot,
280 			    mac_addr_arg, &addr_len, NULL, NULL);
281 			vnic_addr_type = VNIC_MAC_ADDR_TYPE_FACTORY;
282 			*addr_len_ptr_arg = addr_len;
283 			break;
284 		}
285 
286 		/*
287 		 * Allocating a factory MAC address failed, generate a
288 		 * random MAC address instead.
289 		 */
290 		if ((err = mac_addr_random(vnic->vn_mch,
291 		    prefix_len, mac_addr_arg, &mac_diag)) != 0) {
292 			*diag = vnic_mac2vnic_diag(mac_diag);
293 			return (err);
294 		}
295 		*addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh);
296 		vnic_addr_type = VNIC_MAC_ADDR_TYPE_RANDOM;
297 		break;
298 	case VNIC_MAC_ADDR_TYPE_PRIMARY:
299 		/*
300 		 * We get the address here since we copy it in the
301 		 * vnic's vn_addr.
302 		 * We can't ask for hardware resources since we
303 		 * don't currently support hardware classification
304 		 * for these MAC clients.
305 		 */
306 		if (req_hwgrp_flag) {
307 			*diag = VNIC_IOC_DIAG_NO_HWRINGS;
308 			return (ENOTSUP);
309 		}
310 		mac_unicast_primary_get(vnic->vn_lower_mh, mac_addr_arg);
311 		*addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh);
312 		mac_flags |= MAC_UNICAST_VNIC_PRIMARY;
313 		break;
314 	}
315 
316 	vnic->vn_addr_type = vnic_addr_type;
317 
318 	err = mac_unicast_add(vnic->vn_mch, mac_addr_arg, mac_flags,
319 	    &vnic->vn_muh, vid, &mac_diag);
320 	if (err != 0) {
321 		if (vnic_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) {
322 			/* release factory MAC address */
323 			mac_addr_factory_release(vnic->vn_mch, *addr_slot);
324 		}
325 		*diag = vnic_mac2vnic_diag(mac_diag);
326 	}
327 
328 	return (err);
329 }
330 
331 /*
332  * Create a new VNIC upon request from administrator.
333  * Returns 0 on success, an errno on failure.
334  */
335 /* ARGSUSED */
336 int
337 vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
338     vnic_mac_addr_type_t *vnic_addr_type, int *mac_len, uchar_t *mac_addr,
339     int *mac_slot, uint_t mac_prefix_len, uint16_t vid, vrid_t vrid,
340     int af, mac_resource_props_t *mrp, uint32_t flags, vnic_ioc_diag_t *diag,
341     cred_t *credp)
342 {
343 	vnic_t *vnic;
344 	mac_register_t *mac;
345 	int err;
346 	boolean_t is_anchor = ((flags & VNIC_IOC_CREATE_ANCHOR) != 0);
347 	char vnic_name[MAXNAMELEN];
348 	const mac_info_t *minfop;
349 	uint32_t req_hwgrp_flag = B_FALSE;
350 
351 	*diag = VNIC_IOC_DIAG_NONE;
352 
353 	rw_enter(&vnic_lock, RW_WRITER);
354 
355 	/* does a VNIC with the same id already exist? */
356 	err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
357 	    (mod_hash_val_t *)&vnic);
358 	if (err == 0) {
359 		rw_exit(&vnic_lock);
360 		return (EEXIST);
361 	}
362 
363 	vnic = kmem_cache_alloc(vnic_cache, KM_NOSLEEP);
364 	if (vnic == NULL) {
365 		rw_exit(&vnic_lock);
366 		return (ENOMEM);
367 	}
368 
369 	bzero(vnic, sizeof (*vnic));
370 
371 	vnic->vn_id = vnic_id;
372 	vnic->vn_link_id = linkid;
373 	vnic->vn_vrid = vrid;
374 	vnic->vn_af = af;
375 
376 	if (!is_anchor) {
377 		if (linkid == DATALINK_INVALID_LINKID) {
378 			err = EINVAL;
379 			goto bail;
380 		}
381 
382 		/*
383 		 * Open the lower MAC and assign its initial bandwidth and
384 		 * MAC address. We do this here during VNIC creation and
385 		 * do not wait until the upper MAC client open so that we
386 		 * can validate the VNIC creation parameters (bandwidth,
387 		 * MAC address, etc) and reserve a factory MAC address if
388 		 * one was requested.
389 		 */
390 		err = mac_open_by_linkid(linkid, &vnic->vn_lower_mh);
391 		if (err != 0)
392 			goto bail;
393 
394 		/*
395 		 * VNIC(vlan) over VNICs(vlans) is not supported.
396 		 */
397 		if (mac_is_vnic(vnic->vn_lower_mh)) {
398 			err = EINVAL;
399 			goto bail;
400 		}
401 
402 		/* only ethernet support for now */
403 		minfop = mac_info(vnic->vn_lower_mh);
404 		if (minfop->mi_nativemedia != DL_ETHER) {
405 			err = ENOTSUP;
406 			goto bail;
407 		}
408 
409 		(void) dls_mgmt_get_linkinfo(vnic_id, vnic_name, NULL, NULL,
410 		    NULL);
411 		err = mac_client_open(vnic->vn_lower_mh, &vnic->vn_mch,
412 		    vnic_name, MAC_OPEN_FLAGS_IS_VNIC);
413 		if (err != 0)
414 			goto bail;
415 
416 		/* assign a MAC address to the VNIC */
417 
418 		err = vnic_unicast_add(vnic, *vnic_addr_type, mac_slot,
419 		    mac_prefix_len, mac_len, mac_addr, flags, diag, vid,
420 		    req_hwgrp_flag);
421 		if (err != 0) {
422 			vnic->vn_muh = NULL;
423 			if (diag != NULL && req_hwgrp_flag)
424 				*diag = VNIC_IOC_DIAG_NO_HWRINGS;
425 			goto bail;
426 		}
427 
428 		/* register to receive notification from underlying MAC */
429 		vnic->vn_mnh = mac_notify_add(vnic->vn_lower_mh, vnic_notify_cb,
430 		    vnic);
431 
432 		*vnic_addr_type = vnic->vn_addr_type;
433 		vnic->vn_addr_len = *mac_len;
434 		vnic->vn_vid = vid;
435 
436 		bcopy(mac_addr, vnic->vn_addr, vnic->vn_addr_len);
437 
438 		if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY)
439 			vnic->vn_slot_id = *mac_slot;
440 
441 		/*
442 		 * Set the initial VNIC capabilities. If the VNIC is created
443 		 * over MACs which does not support nactive vlan, disable
444 		 * VNIC's hardware checksum capability if its VID is not 0,
445 		 * since the underlying MAC would get the hardware checksum
446 		 * offset wrong in case of VLAN packets.
447 		 */
448 		if (vid == 0 || !mac_capab_get(vnic->vn_lower_mh,
449 		    MAC_CAPAB_NO_NATIVEVLAN, NULL)) {
450 			if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_HCKSUM,
451 			    &vnic->vn_hcksum_txflags))
452 				vnic->vn_hcksum_txflags = 0;
453 		} else {
454 			vnic->vn_hcksum_txflags = 0;
455 		}
456 	}
457 
458 	/* register with the MAC module */
459 	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
460 		goto bail;
461 
462 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
463 	mac->m_driver = vnic;
464 	mac->m_dip = vnic_get_dip();
465 	mac->m_instance = (uint_t)-1;
466 	mac->m_src_addr = vnic->vn_addr;
467 	mac->m_callbacks = &vnic_m_callbacks;
468 
469 	if (!is_anchor) {
470 		/*
471 		 * If this is a VNIC based VLAN, then we check for the
472 		 * margin unless it has been created with the force
473 		 * flag. If we are configuring a VLAN over an etherstub,
474 		 * we don't check the margin even if force is not set.
475 		 */
476 		if (vid == 0 || (flags & VNIC_IOC_CREATE_FORCE) != 0) {
477 			if (vid != VLAN_ID_NONE)
478 				vnic->vn_force = B_TRUE;
479 			/*
480 			 * As the current margin size of the underlying mac is
481 			 * used to determine the margin size of the VNIC
482 			 * itself, request the underlying mac not to change
483 			 * to a smaller margin size.
484 			 */
485 			err = mac_margin_add(vnic->vn_lower_mh,
486 			    &vnic->vn_margin, B_TRUE);
487 			ASSERT(err == 0);
488 		} else {
489 			vnic->vn_margin = VLAN_TAGSZ;
490 			err = mac_margin_add(vnic->vn_lower_mh,
491 			    &vnic->vn_margin, B_FALSE);
492 			if (err != 0) {
493 				mac_free(mac);
494 				if (diag != NULL)
495 					*diag = VNIC_IOC_DIAG_MACMARGIN_INVALID;
496 				goto bail;
497 			}
498 		}
499 
500 		mac_sdu_get(vnic->vn_lower_mh, &mac->m_min_sdu,
501 		    &mac->m_max_sdu);
502 		err = mac_mtu_add(vnic->vn_lower_mh, &mac->m_max_sdu, B_FALSE);
503 		if (err != 0) {
504 			VERIFY(mac_margin_remove(vnic->vn_lower_mh,
505 			    vnic->vn_margin) == 0);
506 			mac_free(mac);
507 			if (diag != NULL)
508 				*diag = VNIC_IOC_DIAG_MACMTU_INVALID;
509 			goto bail;
510 		}
511 		vnic->vn_mtu = mac->m_max_sdu;
512 	} else {
513 		vnic->vn_margin = VLAN_TAGSZ;
514 		mac->m_min_sdu = 1;
515 		mac->m_max_sdu = ANCHOR_VNIC_MAX_MTU;
516 		vnic->vn_mtu = ANCHOR_VNIC_MAX_MTU;
517 	}
518 
519 	mac->m_margin = vnic->vn_margin;
520 
521 	err = mac_register(mac, &vnic->vn_mh);
522 	mac_free(mac);
523 	if (err != 0) {
524 		if (!is_anchor) {
525 			VERIFY(mac_mtu_remove(vnic->vn_lower_mh,
526 			    vnic->vn_mtu) == 0);
527 			VERIFY(mac_margin_remove(vnic->vn_lower_mh,
528 			    vnic->vn_margin) == 0);
529 		}
530 		goto bail;
531 	}
532 
533 	/* Set the VNIC's MAC in the client */
534 	if (!is_anchor) {
535 		mac_set_upper_mac(vnic->vn_mch, vnic->vn_mh, mrp);
536 
537 		if (mrp != NULL) {
538 			if ((mrp->mrp_mask & MRP_RX_RINGS) != 0 ||
539 			    (mrp->mrp_mask & MRP_TX_RINGS) != 0) {
540 				req_hwgrp_flag = B_TRUE;
541 			}
542 			err = mac_client_set_resources(vnic->vn_mch, mrp);
543 			if (err != 0) {
544 				VERIFY(mac_mtu_remove(vnic->vn_lower_mh,
545 				    vnic->vn_mtu) == 0);
546 				VERIFY(mac_margin_remove(vnic->vn_lower_mh,
547 				    vnic->vn_margin) == 0);
548 				(void) mac_unregister(vnic->vn_mh);
549 				goto bail;
550 			}
551 		}
552 	}
553 
554 	err = dls_devnet_create(vnic->vn_mh, vnic->vn_id, crgetzoneid(credp));
555 	if (err != 0) {
556 		VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh,
557 		    vnic->vn_margin) == 0);
558 		if (!is_anchor) {
559 			VERIFY(mac_mtu_remove(vnic->vn_lower_mh,
560 			    vnic->vn_mtu) == 0);
561 			VERIFY(mac_margin_remove(vnic->vn_lower_mh,
562 			    vnic->vn_margin) == 0);
563 		}
564 		(void) mac_unregister(vnic->vn_mh);
565 		goto bail;
566 	}
567 
568 	/* add new VNIC to hash table */
569 	err = mod_hash_insert(vnic_hash, VNIC_HASH_KEY(vnic_id),
570 	    (mod_hash_val_t)vnic);
571 	ASSERT(err == 0);
572 	vnic_count++;
573 
574 	vnic->vn_enabled = B_TRUE;
575 	rw_exit(&vnic_lock);
576 
577 	return (0);
578 
579 bail:
580 	rw_exit(&vnic_lock);
581 	if (!is_anchor) {
582 		if (vnic->vn_mnh != NULL)
583 			(void) mac_notify_remove(vnic->vn_mnh, B_TRUE);
584 		if (vnic->vn_muh != NULL)
585 			(void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh);
586 		if (vnic->vn_mch != NULL)
587 			mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC);
588 		if (vnic->vn_lower_mh != NULL)
589 			mac_close(vnic->vn_lower_mh);
590 	}
591 
592 	kmem_cache_free(vnic_cache, vnic);
593 	return (err);
594 }
595 
596 /*
597  * Modify the properties of an existing VNIC.
598  */
599 /* ARGSUSED */
600 int
601 vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask,
602     vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr,
603     uint_t mac_slot, mac_resource_props_t *mrp)
604 {
605 	vnic_t *vnic = NULL;
606 
607 	rw_enter(&vnic_lock, RW_WRITER);
608 
609 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
610 	    (mod_hash_val_t *)&vnic) != 0) {
611 		rw_exit(&vnic_lock);
612 		return (ENOENT);
613 	}
614 
615 	rw_exit(&vnic_lock);
616 
617 	return (0);
618 }
619 
620 /* ARGSUSED */
621 int
622 vnic_dev_delete(datalink_id_t vnic_id, uint32_t flags, cred_t *credp)
623 {
624 	vnic_t *vnic = NULL;
625 	mod_hash_val_t val;
626 	datalink_id_t tmpid;
627 	int rc;
628 
629 	rw_enter(&vnic_lock, RW_WRITER);
630 
631 	if (mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id),
632 	    (mod_hash_val_t *)&vnic) != 0) {
633 		rw_exit(&vnic_lock);
634 		return (ENOENT);
635 	}
636 
637 	if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid, B_TRUE)) != 0) {
638 		rw_exit(&vnic_lock);
639 		return (rc);
640 	}
641 
642 	ASSERT(vnic_id == tmpid);
643 
644 	/*
645 	 * We cannot unregister the MAC yet. Unregistering would
646 	 * free up mac_impl_t which should not happen at this time.
647 	 * So disable mac_impl_t by calling mac_disable(). This will prevent
648 	 * any new claims on mac_impl_t.
649 	 */
650 	if ((rc = mac_disable(vnic->vn_mh)) != 0) {
651 		(void) dls_devnet_create(vnic->vn_mh, vnic_id,
652 		    crgetzoneid(credp));
653 		rw_exit(&vnic_lock);
654 		return (rc);
655 	}
656 
657 	vnic_cleanup_secondary_macs(vnic, vnic->vn_nhandles);
658 
659 	vnic->vn_enabled = B_FALSE;
660 	(void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val);
661 	ASSERT(vnic == (vnic_t *)val);
662 	vnic_count--;
663 	rw_exit(&vnic_lock);
664 
665 	/*
666 	 * XXX-nicolas shouldn't have a void cast here, if it's
667 	 * expected that the function will never fail, then we should
668 	 * have an ASSERT().
669 	 */
670 	(void) mac_unregister(vnic->vn_mh);
671 
672 	if (vnic->vn_lower_mh != NULL) {
673 		/*
674 		 * Check if MAC address for the vnic was obtained from the
675 		 * factory MAC addresses. If yes, release it.
676 		 */
677 		if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) {
678 			(void) mac_addr_factory_release(vnic->vn_mch,
679 			    vnic->vn_slot_id);
680 		}
681 		(void) mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin);
682 		(void) mac_mtu_remove(vnic->vn_lower_mh, vnic->vn_mtu);
683 		(void) mac_notify_remove(vnic->vn_mnh, B_TRUE);
684 		(void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh);
685 		mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC);
686 		mac_close(vnic->vn_lower_mh);
687 	}
688 
689 	kmem_cache_free(vnic_cache, vnic);
690 	return (0);
691 }
692 
693 /* ARGSUSED */
694 mblk_t *
695 vnic_m_tx(void *arg, mblk_t *mp_chain)
696 {
697 	/*
698 	 * This function could be invoked for an anchor VNIC when sending
699 	 * broadcast and multicast packets, and unicast packets which did
700 	 * not match any local known destination.
701 	 */
702 	freemsgchain(mp_chain);
703 	return (NULL);
704 }
705 
706 /*ARGSUSED*/
707 static void
708 vnic_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
709 {
710 	miocnak(q, mp, 0, ENOTSUP);
711 }
712 
713 /*
714  * This entry point cannot be passed-through, since it is invoked
715  * for the per-VNIC kstats which must be exported independently
716  * of the existence of VNIC MAC clients.
717  */
718 static int
719 vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
720 {
721 	vnic_t *vnic = arg;
722 	int rval = 0;
723 
724 	if (vnic->vn_lower_mh == NULL) {
725 		/*
726 		 * It's an anchor VNIC, which does not have any
727 		 * statistics in itself.
728 		 */
729 		return (ENOTSUP);
730 	}
731 
732 	/*
733 	 * ENOTSUP must be reported for unsupported stats, the VNIC
734 	 * driver reports a subset of the stats that would
735 	 * be returned by a real piece of hardware.
736 	 */
737 
738 	switch (stat) {
739 	case MAC_STAT_LINK_STATE:
740 	case MAC_STAT_LINK_UP:
741 	case MAC_STAT_PROMISC:
742 	case MAC_STAT_IFSPEED:
743 	case MAC_STAT_MULTIRCV:
744 	case MAC_STAT_MULTIXMT:
745 	case MAC_STAT_BRDCSTRCV:
746 	case MAC_STAT_BRDCSTXMT:
747 	case MAC_STAT_OPACKETS:
748 	case MAC_STAT_OBYTES:
749 	case MAC_STAT_IERRORS:
750 	case MAC_STAT_OERRORS:
751 	case MAC_STAT_RBYTES:
752 	case MAC_STAT_IPACKETS:
753 		*val = mac_client_stat_get(vnic->vn_mch, stat);
754 		break;
755 	default:
756 		rval = ENOTSUP;
757 	}
758 
759 	return (rval);
760 }
761 
762 /*
763  * Invoked by the upper MAC to retrieve the lower MAC client handle
764  * corresponding to a VNIC. A pointer to this function is obtained
765  * by the upper MAC via capability query.
766  *
767  * XXX-nicolas Note: this currently causes all VNIC MAC clients to
768  * receive the same MAC client handle for the same VNIC. This is ok
769  * as long as we have only one VNIC MAC client which sends and
770  * receives data, but we don't currently enforce this at the MAC layer.
771  */
772 static void *
773 vnic_mac_client_handle(void *vnic_arg)
774 {
775 	vnic_t *vnic = vnic_arg;
776 
777 	return (vnic->vn_mch);
778 }
779 
780 /*
781  * Invoked when updating the primary MAC so that the secondary MACs are
782  * kept in sync.
783  */
784 static void
785 vnic_mac_secondary_update(void *vnic_arg)
786 {
787 	vnic_t *vn = vnic_arg;
788 	int i;
789 
790 	for (i = 1; i <= vn->vn_nhandles; i++) {
791 		mac_secondary_dup(vn->vn_mc_handles[0], vn->vn_mc_handles[i]);
792 	}
793 }
794 
795 /*
796  * Return information about the specified capability.
797  */
798 /* ARGSUSED */
799 static boolean_t
800 vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
801 {
802 	vnic_t *vnic = arg;
803 
804 	switch (cap) {
805 	case MAC_CAPAB_HCKSUM: {
806 		uint32_t *hcksum_txflags = cap_data;
807 
808 		*hcksum_txflags = vnic->vn_hcksum_txflags &
809 		    (HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM |
810 		    HCKSUM_INET_PARTIAL);
811 		break;
812 	}
813 	case MAC_CAPAB_VNIC: {
814 		mac_capab_vnic_t *vnic_capab = cap_data;
815 
816 		if (vnic->vn_lower_mh == NULL) {
817 			/*
818 			 * It's an anchor VNIC, we don't have an underlying
819 			 * NIC and MAC client handle.
820 			 */
821 			return (B_FALSE);
822 		}
823 
824 		if (vnic_capab != NULL) {
825 			vnic_capab->mcv_arg = vnic;
826 			vnic_capab->mcv_mac_client_handle =
827 			    vnic_mac_client_handle;
828 			vnic_capab->mcv_mac_secondary_update =
829 			    vnic_mac_secondary_update;
830 		}
831 		break;
832 	}
833 	case MAC_CAPAB_ANCHOR_VNIC: {
834 		/* since it's an anchor VNIC we don't have lower mac handle */
835 		if (vnic->vn_lower_mh == NULL) {
836 			ASSERT(vnic->vn_link_id == 0);
837 			return (B_TRUE);
838 		}
839 		return (B_FALSE);
840 	}
841 	case MAC_CAPAB_NO_NATIVEVLAN:
842 		return (B_FALSE);
843 	case MAC_CAPAB_NO_ZCOPY:
844 		return (B_TRUE);
845 	case MAC_CAPAB_VRRP: {
846 		mac_capab_vrrp_t *vrrp_capab = cap_data;
847 
848 		if (vnic->vn_vrid != 0) {
849 			if (vrrp_capab != NULL)
850 				vrrp_capab->mcv_af = vnic->vn_af;
851 			return (B_TRUE);
852 		}
853 		return (B_FALSE);
854 	}
855 	default:
856 		return (B_FALSE);
857 	}
858 	return (B_TRUE);
859 }
860 
/*
 * Start entry point. There is nothing to do for a VNIC, so success is
 * always reported.
 */
/* ARGSUSED */
static int
vnic_m_start(void *arg)
{
	int rc = 0;

	return (rc);
}
867 
/*
 * Stop entry point. Intentionally empty: there is nothing to do for a
 * VNIC.
 */
/* ARGSUSED */
static void
vnic_m_stop(void *arg)
{
	/* nothing to do */
}
873 
874 /* ARGSUSED */
875 static int
876 vnic_m_promisc(void *arg, boolean_t on)
877 {
878 	return (0);
879 }
880 
881 /* ARGSUSED */
882 static int
883 vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
884 {
885 	return (0);
886 }
887 
888 static int
889 vnic_m_unicst(void *arg, const uint8_t *macaddr)
890 {
891 	vnic_t *vnic = arg;
892 
893 	return (mac_vnic_unicast_set(vnic->vn_mch, macaddr));
894 }
895 
896 static void
897 vnic_cleanup_secondary_macs(vnic_t *vn, int cnt)
898 {
899 	int i;
900 
901 	/* Remove existing secondaries (primary is at 0) */
902 	for (i = 1; i <= cnt; i++) {
903 		mac_rx_clear(vn->vn_mc_handles[i]);
904 
905 		/* unicast handle might not have been set yet */
906 		if (vn->vn_mu_handles[i] != NULL)
907 			(void) mac_unicast_remove(vn->vn_mc_handles[i],
908 			    vn->vn_mu_handles[i]);
909 
910 		mac_secondary_cleanup(vn->vn_mc_handles[i]);
911 
912 		mac_client_close(vn->vn_mc_handles[i], MAC_CLOSE_FLAGS_IS_VNIC);
913 
914 		vn->vn_mu_handles[i] = NULL;
915 		vn->vn_mc_handles[i] = NULL;
916 	}
917 
918 	vn->vn_nhandles = 0;
919 }
920 
921 /*
922  * Setup secondary MAC addresses on the vnic. Due to limitations in the mac
923  * code, each mac address must be associated with a mac_client (and the
924  * flow that goes along with the client) so we need to create those clients
925  * here.
926  */
927 static int
928 vnic_set_secondary_macs(vnic_t *vn, mac_secondary_addr_t *msa)
929 {
930 	int i, err;
931 	char primary_name[MAXNAMELEN];
932 
933 	/* First, remove pre-existing secondaries */
934 	ASSERT(vn->vn_nhandles < MPT_MAXMACADDR);
935 	vnic_cleanup_secondary_macs(vn, vn->vn_nhandles);
936 
937 	if (msa->ms_addrcnt == (uint32_t)-1)
938 		msa->ms_addrcnt = 0;
939 
940 	vn->vn_nhandles = msa->ms_addrcnt;
941 
942 	(void) dls_mgmt_get_linkinfo(vn->vn_id, primary_name, NULL, NULL, NULL);
943 
944 	/*
945 	 * Now add the new secondary MACs
946 	 * Recall that the primary MAC address is the first element.
947 	 * The secondary clients are named after the primary with their
948 	 * index to distinguish them.
949 	 */
950 	for (i = 1; i <= vn->vn_nhandles; i++) {
951 		uint8_t *addr;
952 		mac_diag_t mac_diag;
953 		char secondary_name[MAXNAMELEN];
954 
955 		(void) snprintf(secondary_name, sizeof (secondary_name),
956 		    "%s%02d", primary_name, i);
957 
958 		err = mac_client_open(vn->vn_lower_mh, &vn->vn_mc_handles[i],
959 		    secondary_name, MAC_OPEN_FLAGS_IS_VNIC);
960 		if (err != 0) {
961 			/* Remove any that we successfully added */
962 			vnic_cleanup_secondary_macs(vn, --i);
963 			return (err);
964 		}
965 
966 		/*
967 		 * Assign a MAC address to the VNIC
968 		 *
969 		 * Normally this would be done with vnic_unicast_add but since
970 		 * we know these are fixed adddresses, and since we need to
971 		 * save this in the proper array slot, we bypass that function
972 		 * and go direct.
973 		 */
974 		addr = msa->ms_addrs[i - 1];
975 		err = mac_unicast_add(vn->vn_mc_handles[i], addr, 0,
976 		    &vn->vn_mu_handles[i], vn->vn_vid, &mac_diag);
977 		if (err != 0) {
978 			/* Remove any that we successfully added */
979 			vnic_cleanup_secondary_macs(vn, i);
980 			return (err);
981 		}
982 
983 		/*
984 		 * Setup the secondary the same way as the primary (i.e.
985 		 * receiver function/argument (e.g. i_dls_link_rx, mac_pkt_drop,
986 		 * etc.), the promisc list, and the resource controls).
987 		 */
988 		mac_secondary_dup(vn->vn_mc_handles[0], vn->vn_mc_handles[i]);
989 	}
990 
991 	return (0);
992 }
993 
994 static int
995 vnic_get_secondary_macs(vnic_t *vn, uint_t pr_valsize, void *pr_val)
996 {
997 	int i;
998 	mac_secondary_addr_t msa;
999 
1000 	if (pr_valsize < sizeof (msa))
1001 		return (EINVAL);
1002 
1003 	/* Get existing addresses (primary is at 0) */
1004 	ASSERT(vn->vn_nhandles < MPT_MAXMACADDR);
1005 	for (i = 1; i <= vn->vn_nhandles; i++) {
1006 		ASSERT(vn->vn_mc_handles[i] != NULL);
1007 		mac_unicast_secondary_get(vn->vn_mc_handles[i],
1008 		    msa.ms_addrs[i - 1]);
1009 	}
1010 	msa.ms_addrcnt = vn->vn_nhandles;
1011 
1012 	bcopy(&msa, pr_val, sizeof (msa));
1013 	return (0);
1014 }
1015 
1016 /*
1017  * Callback functions for set/get of properties
1018  */
1019 /*ARGSUSED*/
1020 static int
1021 vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
1022     uint_t pr_valsize, const void *pr_val)
1023 {
1024 	int 		err = 0;
1025 	vnic_t		*vn = m_driver;
1026 
1027 	switch (pr_num) {
1028 	case MAC_PROP_MTU: {
1029 		uint32_t	mtu;
1030 
1031 		if (pr_valsize < sizeof (mtu)) {
1032 			err = EINVAL;
1033 			break;
1034 		}
1035 		bcopy(pr_val, &mtu, sizeof (mtu));
1036 
1037 		if (vn->vn_link_id == DATALINK_INVALID_LINKID) {
1038 			if (mtu < ANCHOR_VNIC_MIN_MTU ||
1039 			    mtu > ANCHOR_VNIC_MAX_MTU) {
1040 				err = EINVAL;
1041 				break;
1042 			}
1043 		} else {
1044 			err = mac_mtu_add(vn->vn_lower_mh, &mtu, B_FALSE);
1045 			/*
1046 			 * If it's not supported to set a value here, translate
1047 			 * that to EINVAL, so user land gets a better idea of
1048 			 * what went wrong. This realistically means that they
1049 			 * violated the output of prop info.
1050 			 */
1051 			if (err == ENOTSUP)
1052 				err = EINVAL;
1053 			if (err != 0)
1054 				break;
1055 			VERIFY(mac_mtu_remove(vn->vn_lower_mh,
1056 			    vn->vn_mtu) == 0);
1057 		}
1058 		vn->vn_mtu = mtu;
1059 		err = mac_maxsdu_update(vn->vn_mh, mtu);
1060 		break;
1061 	}
1062 	case MAC_PROP_SECONDARY_ADDRS: {
1063 		mac_secondary_addr_t msa;
1064 
1065 		bcopy(pr_val, &msa, sizeof (msa));
1066 		err = vnic_set_secondary_macs(vn, &msa);
1067 		break;
1068 	}
1069 	default:
1070 		err = ENOTSUP;
1071 		break;
1072 	}
1073 	return (err);
1074 }
1075 
1076 /* ARGSUSED */
1077 static int
1078 vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1079     uint_t pr_valsize, void *pr_val)
1080 {
1081 	vnic_t		*vn = arg;
1082 	int 		ret = 0;
1083 
1084 	switch (pr_num) {
1085 	case MAC_PROP_SECONDARY_ADDRS:
1086 		ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val);
1087 		break;
1088 	default:
1089 		ret = EINVAL;
1090 		break;
1091 	}
1092 
1093 	return (ret);
1094 }
1095 
1096 /* ARGSUSED */
1097 static void vnic_m_propinfo(void *m_driver, const char *pr_name,
1098     mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
1099 {
1100 	vnic_t		*vn = m_driver;
1101 
1102 	switch (pr_num) {
1103 	case MAC_PROP_MTU:
1104 		if (vn->vn_link_id == DATALINK_INVALID_LINKID) {
1105 			mac_prop_info_set_range_uint32(prh,
1106 			    ANCHOR_VNIC_MIN_MTU, ANCHOR_VNIC_MAX_MTU);
1107 		} else {
1108 			uint32_t		max;
1109 			mac_perim_handle_t	mph;
1110 			mac_propval_range_t	range;
1111 
1112 			/*
1113 			 * The valid range for a VNIC's MTU is the minimum that
1114 			 * the device supports and the current value of the
1115 			 * device. A VNIC cannot increase the current MTU of the
1116 			 * device. Therefore we need to get the range from the
1117 			 * propinfo endpoint and current mtu from the
1118 			 * traditional property endpoint.
1119 			 */
1120 			mac_perim_enter_by_mh(vn->vn_lower_mh, &mph);
1121 			if (mac_get_prop(vn->vn_lower_mh, MAC_PROP_MTU, "mtu",
1122 			    &max, sizeof (uint32_t)) != 0) {
1123 				mac_perim_exit(mph);
1124 				return;
1125 			}
1126 
1127 			range.mpr_count = 1;
1128 			if (mac_prop_info(vn->vn_lower_mh, MAC_PROP_MTU, "mtu",
1129 			    NULL, 0, &range, NULL) != 0) {
1130 				mac_perim_exit(mph);
1131 				return;
1132 			}
1133 
1134 			mac_prop_info_set_default_uint32(prh, max);
1135 			mac_prop_info_set_range_uint32(prh,
1136 			    range.mpr_range_uint32[0].mpur_min, max);
1137 			mac_perim_exit(mph);
1138 		}
1139 		break;
1140 	}
1141 }
1142 
1143 
1144 int
1145 vnic_info(vnic_info_t *info, cred_t *credp)
1146 {
1147 	vnic_t		*vnic;
1148 	int		err;
1149 
1150 	/* Make sure that the VNIC link is visible from the caller's zone. */
1151 	if (!dls_devnet_islinkvisible(info->vn_vnic_id, crgetzoneid(credp)))
1152 		return (ENOENT);
1153 
1154 	rw_enter(&vnic_lock, RW_WRITER);
1155 
1156 	err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(info->vn_vnic_id),
1157 	    (mod_hash_val_t *)&vnic);
1158 	if (err != 0) {
1159 		rw_exit(&vnic_lock);
1160 		return (ENOENT);
1161 	}
1162 
1163 	info->vn_link_id = vnic->vn_link_id;
1164 	info->vn_mac_addr_type = vnic->vn_addr_type;
1165 	info->vn_mac_len = vnic->vn_addr_len;
1166 	bcopy(vnic->vn_addr, info->vn_mac_addr, MAXMACADDRLEN);
1167 	info->vn_mac_slot = vnic->vn_slot_id;
1168 	info->vn_mac_prefix_len = 0;
1169 	info->vn_vid = vnic->vn_vid;
1170 	info->vn_force = vnic->vn_force;
1171 	info->vn_vrid = vnic->vn_vrid;
1172 	info->vn_af = vnic->vn_af;
1173 
1174 	bzero(&info->vn_resource_props, sizeof (mac_resource_props_t));
1175 	if (vnic->vn_mch != NULL)
1176 		mac_client_get_resources(vnic->vn_mch,
1177 		    &info->vn_resource_props);
1178 
1179 	rw_exit(&vnic_lock);
1180 	return (0);
1181 }
1182 
1183 static void
1184 vnic_notify_cb(void *arg, mac_notify_type_t type)
1185 {
1186 	vnic_t *vnic = arg;
1187 
1188 	/*
1189 	 * Do not deliver notifications if the vnic is not fully initialized
1190 	 * or is in process of being torn down.
1191 	 */
1192 	if (!vnic->vn_enabled)
1193 		return;
1194 
1195 	switch (type) {
1196 	case MAC_NOTE_UNICST:
1197 		/*
1198 		 * Only the VLAN VNIC needs to be notified with primary MAC
1199 		 * address change.
1200 		 */
1201 		if (vnic->vn_addr_type != VNIC_MAC_ADDR_TYPE_PRIMARY)
1202 			return;
1203 
1204 		/*  the unicast MAC address value */
1205 		mac_unicast_primary_get(vnic->vn_lower_mh, vnic->vn_addr);
1206 
1207 		/* notify its upper layer MAC about MAC address change */
1208 		mac_unicst_update(vnic->vn_mh, (const uint8_t *)vnic->vn_addr);
1209 		break;
1210 
1211 	case MAC_NOTE_LINK:
1212 		mac_link_update(vnic->vn_mh,
1213 		    mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE));
1214 		break;
1215 
1216 	default:
1217 		break;
1218 	}
1219 }
1220