xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision 4283d10e18fc3904736c7c067fb29de9bb67d25d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  * Copyright 2020 RackTop Systems, Inc.
27  * Copyright 2023 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/id_space.h>
33 #include <sys/esunddi.h>
34 #include <sys/stat.h>
35 #include <sys/mkdev.h>
36 #include <sys/stream.h>
37 #include <sys/strsubr.h>
38 #include <sys/dlpi.h>
39 #include <sys/modhash.h>
40 #include <sys/mac.h>
41 #include <sys/mac_provider.h>
42 #include <sys/mac_impl.h>
43 #include <sys/mac_client_impl.h>
44 #include <sys/mac_client_priv.h>
45 #include <sys/mac_soft_ring.h>
46 #include <sys/mac_stat.h>
47 #include <sys/dld.h>
48 #include <sys/modctl.h>
49 #include <sys/fs/dv_node.h>
50 #include <sys/thread.h>
51 #include <sys/proc.h>
52 #include <sys/callb.h>
53 #include <sys/cpuvar.h>
54 #include <sys/atomic.h>
55 #include <sys/sdt.h>
56 #include <sys/mac_flow.h>
57 #include <sys/ddi_intr_impl.h>
58 #include <sys/disp.h>
59 #include <sys/sdt.h>
60 #include <sys/pattr.h>
61 #include <sys/strsun.h>
62 #include <sys/vlan.h>
63 #include <inet/ip.h>
64 #include <inet/tcp.h>
65 #include <netinet/udp.h>
66 #include <netinet/sctp.h>
67 
68 /*
69  * MAC Provider Interface.
70  *
71  * Interface for GLDv3 compatible NIC drivers.
72  */
73 
74 static void i_mac_notify_thread(void *);
75 
76 typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);
77 
78 static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
79 	mac_fanout_recompute,	/* MAC_NOTE_LINK */
80 	NULL,		/* MAC_NOTE_UNICST */
81 	NULL,		/* MAC_NOTE_TX */
82 	NULL,		/* MAC_NOTE_DEVPROMISC */
83 	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
84 	NULL,		/* MAC_NOTE_SDU_SIZE */
85 	NULL,		/* MAC_NOTE_MARGIN */
86 	NULL,		/* MAC_NOTE_CAPAB_CHG */
87 	NULL		/* MAC_NOTE_LOWLINK */
88 };
89 
90 /*
91  * Driver support functions.
92  */
93 
94 /* REGISTRATION */
95 
96 mac_register_t *
97 mac_alloc(uint_t mac_version)
98 {
99 	mac_register_t *mregp;
100 
101 	/*
102 	 * Make sure there isn't a version mismatch between the driver and
103 	 * the framework.  In the future, if multiple versions are
104 	 * supported, this check could become more sophisticated.
105 	 */
106 	if (mac_version != MAC_VERSION)
107 		return (NULL);
108 
109 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
110 	mregp->m_version = mac_version;
111 	return (mregp);
112 }
113 
114 void
115 mac_free(mac_register_t *mregp)
116 {
117 	kmem_free(mregp, sizeof (mac_register_t));
118 }
119 
120 /*
121  * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
122  * value.
123  */
124 static uint16_t
125 mac_features_to_flags(mac_handle_t mh)
126 {
127 	uint16_t flags = 0;
128 	uint32_t cap_sum = 0;
129 	mac_capab_lso_t cap_lso;
130 
131 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
132 		if (cap_sum & HCKSUM_IPHDRCKSUM)
133 			flags |= HCK_IPV4_HDRCKSUM;
134 
135 		if (cap_sum & HCKSUM_INET_PARTIAL)
136 			flags |= HCK_PARTIALCKSUM;
137 		else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
138 			flags |= HCK_FULLCKSUM;
139 	}
140 
141 	/*
142 	 * We don't need the information stored in 'cap_lso', but we
143 	 * need to pass a non-NULL pointer to appease the driver.
144 	 */
145 	if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
146 		flags |= HW_LSO;
147 
148 	return (flags);
149 }
150 
151 /*
152  * mac_register() is how drivers register new MACs with the GLDv3
153  * framework.  The mregp argument is allocated by drivers using the
154  * mac_alloc() function, and can be freed using mac_free() immediately upon
155  * return from mac_register().  Upon success (0 return value), the mhp
156  * opaque pointer becomes the driver's handle to its MAC interface, and is
157  * the argument to all other mac module entry points.
158  */
159 /* ARGSUSED */
160 int
161 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
162 {
163 	mac_impl_t		*mip;
164 	mactype_t		*mtype;
165 	int			err = EINVAL;
166 	struct devnames		*dnp = NULL;
167 	uint_t			instance;
168 	boolean_t		style1_created = B_FALSE;
169 	boolean_t		style2_created = B_FALSE;
170 	char			*driver;
171 	minor_t			minor = 0;
172 
173 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
174 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
175 		return (EINVAL);
176 
177 	/* Find the required MAC-Type plugin. */
178 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
179 		return (EINVAL);
180 
181 	/* Create a mac_impl_t to represent this MAC. */
182 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
183 
184 	/*
185 	 * The mac is not ready for open yet.
186 	 */
187 	mip->mi_state_flags |= MIS_DISABLED;
188 
189 	/*
190 	 * When a mac is registered, the m_instance field can be set to:
191 	 *
192 	 *  0:	Get the mac's instance number from m_dip.
193 	 *	This is usually used for physical device dips.
194 	 *
195 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
196 	 *	For example, when an aggregation is created with the key option,
197 	 *	"key" will be used as the instance number.
198 	 *
199 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
200 	 *	This is often used when a MAC of a virtual link is registered
201 	 *	(e.g., aggregation when "key" is not specified, or vnic).
202 	 *
203 	 * Note that the instance number is used to derive the mi_minor field
204 	 * of mac_impl_t, which will then be used to derive the name of kstats
205 	 * and the devfs nodes.  The first 2 cases are needed to preserve
206 	 * backward compatibility.
207 	 */
208 	switch (mregp->m_instance) {
209 	case 0:
210 		instance = ddi_get_instance(mregp->m_dip);
211 		break;
212 	case ((uint_t)-1):
213 		minor = mac_minor_hold(B_TRUE);
214 		if (minor == 0) {
215 			err = ENOSPC;
216 			goto fail;
217 		}
218 		instance = minor - 1;
219 		break;
220 	default:
221 		instance = mregp->m_instance;
222 		if (instance >= MAC_MAX_MINOR) {
223 			err = EINVAL;
224 			goto fail;
225 		}
226 		break;
227 	}
228 
229 	mip->mi_minor = (minor_t)(instance + 1);
230 	mip->mi_dip = mregp->m_dip;
231 	mip->mi_clients_list = NULL;
232 	mip->mi_nclients = 0;
233 
234 	/* Set the default IEEE Port VLAN Identifier */
235 	mip->mi_pvid = 1;
236 
237 	/* Default bridge link learning protection values */
238 	mip->mi_llimit = 1000;
239 	mip->mi_ldecay = 200;
240 
241 	driver = (char *)ddi_driver_name(mip->mi_dip);
242 
243 	/* Construct the MAC name as <drvname><instance> */
244 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
245 	    driver, instance);
246 
247 	mip->mi_driver = mregp->m_driver;
248 
249 	mip->mi_type = mtype;
250 	mip->mi_margin = mregp->m_margin;
251 	mip->mi_info.mi_media = mtype->mt_type;
252 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
253 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
254 		goto fail;
255 	if (mregp->m_multicast_sdu == 0)
256 		mregp->m_multicast_sdu = mregp->m_max_sdu;
257 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
258 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
259 		goto fail;
260 	mip->mi_sdu_min = mregp->m_min_sdu;
261 	mip->mi_sdu_max = mregp->m_max_sdu;
262 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
263 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
264 	/*
265 	 * If the media supports a broadcast address, cache a pointer to it
266 	 * in the mac_info_t so that upper layers can use it.
267 	 */
268 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
269 
270 	mip->mi_v12n_level = mregp->m_v12n;
271 
272 	/*
273 	 * Copy the unicast source address into the mac_info_t, but only if
274 	 * the MAC-Type defines a non-zero address length.  We need to
275 	 * handle MAC-Types that have an address length of 0
276 	 * (point-to-point protocol MACs for example).
277 	 */
278 	if (mip->mi_type->mt_addr_length > 0) {
279 		if (mregp->m_src_addr == NULL)
280 			goto fail;
281 		mip->mi_info.mi_unicst_addr =
282 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
283 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
284 		    mip->mi_type->mt_addr_length);
285 
286 		/*
287 		 * Copy the fixed 'factory' MAC address from the immutable
288 		 * info.  This is taken to be the MAC address currently in
289 		 * use.
290 		 */
291 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
292 		    mip->mi_type->mt_addr_length);
293 
294 		/*
295 		 * At this point, we should set up the classification
296 		 * rules etc but we delay it till mac_open() so that
297 		 * the resource discovery has taken place and we
298 		 * know someone wants to use the device. Otherwise
299 		 * memory gets allocated for Rx ring structures even
300 		 * during probe.
301 		 */
302 
303 		/* Copy the destination address if one is provided. */
304 		if (mregp->m_dst_addr != NULL) {
305 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
306 			    mip->mi_type->mt_addr_length);
307 			mip->mi_dstaddr_set = B_TRUE;
308 		}
309 	} else if (mregp->m_src_addr != NULL) {
310 		goto fail;
311 	}
312 
313 	/*
314 	 * The format of the m_pdata is specific to the plugin.  It is
315 	 * passed in as an argument to all of the plugin callbacks.  The
316 	 * driver can update this information by calling
317 	 * mac_pdata_update().
318 	 */
319 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
320 		/*
321 		 * Verify if the supplied plugin data is valid.  Note that
322 		 * even if the caller passed in a NULL pointer as plugin data,
323 		 * we still need to verify if that's valid as the plugin may
324 		 * require plugin data to function.
325 		 */
326 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
327 		    mregp->m_pdata_size)) {
328 			goto fail;
329 		}
330 		if (mregp->m_pdata != NULL) {
331 			mip->mi_pdata =
332 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
333 			bcopy(mregp->m_pdata, mip->mi_pdata,
334 			    mregp->m_pdata_size);
335 			mip->mi_pdata_size = mregp->m_pdata_size;
336 		}
337 	} else if (mregp->m_pdata != NULL) {
338 		/*
339 		 * The caller supplied non-NULL plugin data, but the plugin
340 		 * does not recognize plugin data.
341 		 */
342 		err = EINVAL;
343 		goto fail;
344 	}
345 
346 	/*
347 	 * Register the private properties.
348 	 */
349 	mac_register_priv_prop(mip, mregp->m_priv_props);
350 
351 	/*
352 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
353 	 * check to make sure all mandatory callbacks are set.
354 	 */
355 	if (mregp->m_callbacks->mc_getstat == NULL ||
356 	    mregp->m_callbacks->mc_start == NULL ||
357 	    mregp->m_callbacks->mc_stop == NULL ||
358 	    mregp->m_callbacks->mc_setpromisc == NULL ||
359 	    mregp->m_callbacks->mc_multicst == NULL) {
360 		goto fail;
361 	}
362 	mip->mi_callbacks = mregp->m_callbacks;
363 
364 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
365 	    &mip->mi_capab_legacy)) {
366 		mip->mi_state_flags |= MIS_LEGACY;
367 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
368 	} else {
369 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
370 		    mip->mi_minor);
371 	}
372 
373 	/*
374 	 * Allocate a notification thread. thread_create blocks for memory
375 	 * if needed, it never fails.
376 	 */
377 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
378 	    mip, 0, &p0, TS_RUN, minclsyspri);
379 
380 	/*
381 	 * Cache the DB_CKSUMFLAGS that this MAC supports.
382 	 */
383 	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
384 
385 	/*
386 	 * Initialize the capabilities
387 	 */
388 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
389 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
390 
391 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
392 		mip->mi_state_flags |= MIS_IS_VNIC;
393 
394 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
395 		mip->mi_state_flags |= MIS_IS_AGGR;
396 
397 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
398 		mip->mi_state_flags |= MIS_IS_OVERLAY;
399 
400 	mac_addr_factory_init(mip);
401 
402 	mac_transceiver_init(mip);
403 
404 	mac_led_init(mip);
405 
406 	/*
407 	 * Enforce the virtrualization level registered.
408 	 */
409 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
410 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
411 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
412 			goto fail;
413 
414 		/*
415 		 * The driver needs to register at least rx rings for this
416 		 * virtualization level.
417 		 */
418 		if (mip->mi_rx_groups == NULL)
419 			goto fail;
420 	}
421 
422 	/*
423 	 * The driver must set mc_unicst entry point to NULL when it advertises
424 	 * CAP_RINGS for rx groups.
425 	 */
426 	if (mip->mi_rx_groups != NULL) {
427 		if (mregp->m_callbacks->mc_unicst != NULL)
428 			goto fail;
429 	} else {
430 		if (mregp->m_callbacks->mc_unicst == NULL)
431 			goto fail;
432 	}
433 
434 	/*
435 	 * Initialize MAC addresses. Must be called after mac_init_rings().
436 	 */
437 	mac_init_macaddr(mip);
438 
439 	mip->mi_share_capab.ms_snum = 0;
440 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
441 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
442 		    &mip->mi_share_capab);
443 	}
444 
445 	/*
446 	 * Initialize the kstats for this device.
447 	 */
448 	mac_driver_stat_create(mip);
449 
450 	/* Zero out any properties. */
451 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
452 
453 	if (mip->mi_minor <= MAC_MAX_MINOR) {
454 		/* Create a style-2 DLPI device */
455 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
456 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
457 			goto fail;
458 		style2_created = B_TRUE;
459 
460 		/* Create a style-1 DLPI device */
461 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
462 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
463 			goto fail;
464 		style1_created = B_TRUE;
465 	}
466 
467 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
468 
469 	rw_enter(&i_mac_impl_lock, RW_WRITER);
470 	if (mod_hash_insert(i_mac_impl_hash,
471 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
472 		rw_exit(&i_mac_impl_lock);
473 		err = EEXIST;
474 		goto fail;
475 	}
476 
477 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
478 	    (mac_impl_t *), mip);
479 
480 	/*
481 	 * Mark the MAC to be ready for open.
482 	 */
483 	mip->mi_state_flags &= ~MIS_DISABLED;
484 	rw_exit(&i_mac_impl_lock);
485 
486 	atomic_inc_32(&i_mac_impl_count);
487 
488 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
489 	*mhp = (mac_handle_t)mip;
490 	return (0);
491 
492 fail:
493 	if (style1_created)
494 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
495 
496 	if (style2_created)
497 		ddi_remove_minor_node(mip->mi_dip, driver);
498 
499 	mac_addr_factory_fini(mip);
500 
501 	/* Clean up registered MAC addresses */
502 	mac_fini_macaddr(mip);
503 
504 	/* Clean up registered rings */
505 	mac_free_rings(mip, MAC_RING_TYPE_RX);
506 	mac_free_rings(mip, MAC_RING_TYPE_TX);
507 
508 	/* Clean up notification thread */
509 	if (mip->mi_notify_thread != NULL)
510 		i_mac_notify_exit(mip);
511 
512 	if (mip->mi_info.mi_unicst_addr != NULL) {
513 		kmem_free(mip->mi_info.mi_unicst_addr,
514 		    mip->mi_type->mt_addr_length);
515 		mip->mi_info.mi_unicst_addr = NULL;
516 	}
517 
518 	mac_driver_stat_delete(mip);
519 
520 	if (mip->mi_type != NULL) {
521 		atomic_dec_32(&mip->mi_type->mt_ref);
522 		mip->mi_type = NULL;
523 	}
524 
525 	if (mip->mi_pdata != NULL) {
526 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
527 		mip->mi_pdata = NULL;
528 		mip->mi_pdata_size = 0;
529 	}
530 
531 	if (minor != 0) {
532 		ASSERT(minor > MAC_MAX_MINOR);
533 		mac_minor_rele(minor);
534 	}
535 
536 	mip->mi_state_flags = 0;
537 	mac_unregister_priv_prop(mip);
538 
539 	/*
540 	 * Clear the state before destroying the mac_impl_t
541 	 */
542 	mip->mi_state_flags = 0;
543 
544 	kmem_cache_free(i_mac_impl_cachep, mip);
545 	return (err);
546 }
547 
548 /*
549  * Unregister from the GLDv3 framework
550  */
551 int
552 mac_unregister(mac_handle_t mh)
553 {
554 	int			err;
555 	mac_impl_t		*mip = (mac_impl_t *)mh;
556 	mod_hash_val_t		val;
557 	mac_margin_req_t	*mmr, *nextmmr;
558 
559 	/* Fail the unregister if there are any open references to this mac. */
560 	if ((err = mac_disable_nowait(mh)) != 0)
561 		return (err);
562 
563 	/*
564 	 * Clean up notification thread and wait for it to exit.
565 	 */
566 	i_mac_notify_exit(mip);
567 
568 	/*
569 	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
570 	 * the internal hash table. Such removal means table-walkers that
571 	 * acquire the perimeter will not do so on behalf of what we are
572 	 * unregistering, which prevents a deadlock.
573 	 */
574 	rw_enter(&i_mac_impl_lock, RW_WRITER);
575 	(void) mod_hash_remove(i_mac_impl_hash,
576 	    (mod_hash_key_t)mip->mi_name, &val);
577 	rw_exit(&i_mac_impl_lock);
578 	ASSERT(mip == (mac_impl_t *)val);
579 
580 	i_mac_perim_enter(mip);
581 
582 	/*
583 	 * There is still resource properties configured over this mac.
584 	 */
585 	if (mip->mi_resource_props.mrp_mask != 0)
586 		mac_fastpath_enable((mac_handle_t)mip);
587 
588 	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
589 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
590 		ddi_remove_minor_node(mip->mi_dip,
591 		    (char *)ddi_driver_name(mip->mi_dip));
592 	}
593 
594 	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
595 	    MIS_EXCLUSIVE));
596 
597 	mac_driver_stat_delete(mip);
598 
599 	ASSERT(i_mac_impl_count > 0);
600 	atomic_dec_32(&i_mac_impl_count);
601 
602 	if (mip->mi_pdata != NULL)
603 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
604 	mip->mi_pdata = NULL;
605 	mip->mi_pdata_size = 0;
606 
607 	/*
608 	 * Free the list of margin request.
609 	 */
610 	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
611 		nextmmr = mmr->mmr_nextp;
612 		kmem_free(mmr, sizeof (mac_margin_req_t));
613 	}
614 	mip->mi_mmrp = NULL;
615 
616 	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
617 	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
618 	mip->mi_info.mi_unicst_addr = NULL;
619 
620 	atomic_dec_32(&mip->mi_type->mt_ref);
621 	mip->mi_type = NULL;
622 
623 	/*
624 	 * Free the primary MAC address.
625 	 */
626 	mac_fini_macaddr(mip);
627 
628 	/*
629 	 * free all rings
630 	 */
631 	mac_free_rings(mip, MAC_RING_TYPE_RX);
632 	mac_free_rings(mip, MAC_RING_TYPE_TX);
633 
634 	mac_addr_factory_fini(mip);
635 
636 	bzero(mip->mi_addr, MAXMACADDRLEN);
637 	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
638 	mip->mi_dstaddr_set = B_FALSE;
639 
640 	/* and the flows */
641 	mac_flow_tab_destroy(mip->mi_flow_tab);
642 	mip->mi_flow_tab = NULL;
643 
644 	if (mip->mi_minor > MAC_MAX_MINOR)
645 		mac_minor_rele(mip->mi_minor);
646 
647 	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
648 
649 	/*
650 	 * Reset the perim related fields to default values before
651 	 * kmem_cache_free
652 	 */
653 	i_mac_perim_exit(mip);
654 	mip->mi_state_flags = 0;
655 
656 	mac_unregister_priv_prop(mip);
657 
658 	ASSERT(mip->mi_bridge_link == NULL);
659 	kmem_cache_free(i_mac_impl_cachep, mip);
660 
661 	return (0);
662 }
663 
664 /* DATA RECEPTION */
665 
666 /*
667  * This function is invoked for packets received by the MAC driver in
668  * interrupt context. The ring generation number provided by the driver
669  * is matched with the ring generation number held in MAC. If they do not
670  * match, received packets are considered stale packets coming from an older
671  * assignment of the ring. Drop them.
672  */
673 void
674 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
675     uint64_t mr_gen_num)
676 {
677 	mac_ring_t		*mr = (mac_ring_t *)mrh;
678 
679 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
680 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
681 		    mr->mr_gen_num, uint64_t, mr_gen_num);
682 		freemsgchain(mp_chain);
683 		return;
684 	}
685 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
686 }
687 
688 /*
689  * This function is invoked for each packet received by the underlying driver.
690  */
691 void
692 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
693 {
694 	mac_impl_t *mip = (mac_impl_t *)mh;
695 
696 	/*
697 	 * Check if the link is part of a bridge.  If not, then we don't need
698 	 * to take the lock to remain consistent.  Make this common case
699 	 * lock-free and tail-call optimized.
700 	 */
701 	if (mip->mi_bridge_link == NULL) {
702 		mac_rx_common(mh, mrh, mp_chain);
703 	} else {
704 		/*
705 		 * Once we take a reference on the bridge link, the bridge
706 		 * module itself can't unload, so the callback pointers are
707 		 * stable.
708 		 */
709 		mutex_enter(&mip->mi_bridge_lock);
710 		if ((mh = mip->mi_bridge_link) != NULL)
711 			mac_bridge_ref_cb(mh, B_TRUE);
712 		mutex_exit(&mip->mi_bridge_lock);
713 		if (mh == NULL) {
714 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
715 		} else {
716 			mac_bridge_rx_cb(mh, mrh, mp_chain);
717 			mac_bridge_ref_cb(mh, B_FALSE);
718 		}
719 	}
720 }
721 
722 /*
723  * Special case function: this allows snooping of packets transmitted and
724  * received by TRILL. By design, they go directly into the TRILL module.
725  */
726 void
727 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
728 {
729 	mac_impl_t *mip = (mac_impl_t *)mh;
730 
731 	if (mip->mi_promisc_list != NULL)
732 		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
733 }
734 
735 /*
736  * This is the upward reentry point for packets arriving from the bridging
737  * module and from mac_rx for links not part of a bridge.
738  */
739 void
740 mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
741 {
742 	mac_impl_t		*mip = (mac_impl_t *)mh;
743 	mac_ring_t		*mr = (mac_ring_t *)mrh;
744 	mac_soft_ring_set_t	*mac_srs;
745 	mblk_t			*bp = mp_chain;
746 
747 	/*
748 	 * If there are any promiscuous mode callbacks defined for
749 	 * this MAC, pass them a copy if appropriate.
750 	 */
751 	if (mip->mi_promisc_list != NULL)
752 		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);
753 
754 	if (mr != NULL) {
755 		/*
756 		 * If the SRS teardown has started, just return. The 'mr'
757 		 * continues to be valid until the driver unregisters the MAC.
758 		 * Hardware classified packets will not make their way up
759 		 * beyond this point once the teardown has started. The driver
760 		 * is never passed a pointer to a flow entry or SRS or any
761 		 * structure that can be freed much before mac_unregister.
762 		 */
763 		mutex_enter(&mr->mr_lock);
764 		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
765 		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
766 			mutex_exit(&mr->mr_lock);
767 			freemsgchain(mp_chain);
768 			return;
769 		}
770 
771 		/*
772 		 * The ring is in passthru mode; pass the chain up to
773 		 * the pseudo ring.
774 		 */
775 		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
776 			MR_REFHOLD_LOCKED(mr);
777 			mutex_exit(&mr->mr_lock);
778 			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
779 			    B_FALSE);
780 			MR_REFRELE(mr);
781 			return;
782 		}
783 
784 		/*
785 		 * The passthru callback should only be set when in
786 		 * MAC_PASSTHRU_CLASSIFIER mode.
787 		 */
788 		ASSERT3P(mr->mr_pt_fn, ==, NULL);
789 
790 		/*
791 		 * We check if an SRS is controlling this ring.
792 		 * If so, we can directly call the srs_lower_proc
793 		 * routine otherwise we need to go through mac_rx_classify
794 		 * to reach the right place.
795 		 */
796 		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
797 			MR_REFHOLD_LOCKED(mr);
798 			mutex_exit(&mr->mr_lock);
799 			ASSERT3P(mr->mr_srs, !=, NULL);
800 			mac_srs = mr->mr_srs;
801 
802 			/*
803 			 * This is the fast path. All packets received
804 			 * on this ring are hardware classified and
805 			 * share the same MAC header info.
806 			 */
807 			mac_srs->srs_rx.sr_lower_proc(mh,
808 			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
809 			MR_REFRELE(mr);
810 			return;
811 		}
812 
813 		mutex_exit(&mr->mr_lock);
814 		/* We'll fall through to software classification */
815 	} else {
816 		flow_entry_t *flent;
817 		int err;
818 
819 		rw_enter(&mip->mi_rw_lock, RW_READER);
820 		if (mip->mi_single_active_client != NULL) {
821 			flent = mip->mi_single_active_client->mci_flent_list;
822 			FLOW_TRY_REFHOLD(flent, err);
823 			rw_exit(&mip->mi_rw_lock);
824 			if (err == 0) {
825 				(flent->fe_cb_fn)(flent->fe_cb_arg1,
826 				    flent->fe_cb_arg2, mp_chain, B_FALSE);
827 				FLOW_REFRELE(flent);
828 				return;
829 			}
830 		} else {
831 			rw_exit(&mip->mi_rw_lock);
832 		}
833 	}
834 
835 	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
836 		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
837 			return;
838 	}
839 
840 	freemsgchain(bp);
841 }
842 
843 /* DATA TRANSMISSION */
844 
845 /*
846  * A driver's notification to resume transmission, in case of a provider
847  * without TX rings.
848  */
849 void
850 mac_tx_update(mac_handle_t mh)
851 {
852 	mac_tx_ring_update(mh, NULL);
853 }
854 
855 /*
856  * A driver's notification to resume transmission on the specified TX ring.
857  */
858 void
859 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
860 {
861 	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
862 }
863 
864 /* LINK STATE */
865 /*
866  * Notify the MAC layer about a link state change
867  */
868 void
869 mac_link_update(mac_handle_t mh, link_state_t link)
870 {
871 	mac_impl_t	*mip = (mac_impl_t *)mh;
872 
873 	/*
874 	 * Save the link state.
875 	 */
876 	mip->mi_lowlinkstate = link;
877 
878 	/*
879 	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
880 	 * thread to deliver both lower and upper notifications.
881 	 */
882 	i_mac_notify(mip, MAC_NOTE_LOWLINK);
883 }
884 
885 /*
886  * Notify the MAC layer about a link state change due to bridging.
887  */
888 void
889 mac_link_redo(mac_handle_t mh, link_state_t link)
890 {
891 	mac_impl_t	*mip = (mac_impl_t *)mh;
892 
893 	/*
894 	 * Save the link state.
895 	 */
896 	mip->mi_linkstate = link;
897 
898 	/*
899 	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
900 	 * made.
901 	 */
902 	i_mac_notify(mip, MAC_NOTE_LINK);
903 }
904 
905 /* MINOR NODE HANDLING */
906 
907 /*
908  * Given a dev_t, return the instance number (PPA) associated with it.
909  * Drivers can use this in their getinfo(9e) implementation to lookup
910  * the instance number (i.e. PPA) of the device, to use as an index to
911  * their own array of soft state structures.
912  *
913  * Returns -1 on error.
914  */
915 int
916 mac_devt_to_instance(dev_t devt)
917 {
918 	return (dld_devt_to_instance(devt));
919 }
920 
921 /*
922  * Drivers that make use of the private minor number space are expected to
923  * provide their own getinfo(9e) entry point. This function simply forwards
924  * to the default MAC framework getinfo(9e) implementation as a convenience
925  * if they don't need any special mapping (mac instance != ddi_get_instance())
926  */
927 int
928 mac_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
929 {
930 	return (dld_getinfo(dip, cmd, arg, resp));
931 }
932 
933 /*
934  * This function returns the first minor number that is available for
935  * driver private use.  All minor numbers smaller than this are
936  * reserved for GLDv3 use.
937  */
938 minor_t
939 mac_private_minor(void)
940 {
941 	return (MAC_PRIVATE_MINOR);
942 }
943 
944 /* OTHER CONTROL INFORMATION */
945 
946 /*
947  * A driver notified us that its primary MAC address has changed.
948  */
949 void
950 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
951 {
952 	mac_impl_t	*mip = (mac_impl_t *)mh;
953 
954 	if (mip->mi_type->mt_addr_length == 0)
955 		return;
956 
957 	i_mac_perim_enter(mip);
958 
959 	/*
960 	 * If address changes, freshen the MAC address value and update
961 	 * all MAC clients that share this MAC address.
962 	 */
963 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
964 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
965 		    (uint8_t *)addr);
966 	}
967 
968 	i_mac_perim_exit(mip);
969 
970 	/*
971 	 * Send a MAC_NOTE_UNICST notification.
972 	 */
973 	i_mac_notify(mip, MAC_NOTE_UNICST);
974 }
975 
976 void
977 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
978 {
979 	mac_impl_t	*mip = (mac_impl_t *)mh;
980 
981 	if (mip->mi_type->mt_addr_length == 0)
982 		return;
983 
984 	i_mac_perim_enter(mip);
985 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
986 	i_mac_perim_exit(mip);
987 	i_mac_notify(mip, MAC_NOTE_DEST);
988 }
989 
990 /*
991  * MAC plugin information changed.
992  */
993 int
994 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
995 {
996 	mac_impl_t	*mip = (mac_impl_t *)mh;
997 
998 	/*
999 	 * Verify that the plugin supports MAC plugin data and that the
1000 	 * supplied data is valid.
1001 	 */
1002 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
1003 		return (EINVAL);
1004 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
1005 		return (EINVAL);
1006 
1007 	if (mip->mi_pdata != NULL)
1008 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
1009 
1010 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
1011 	bcopy(mac_pdata, mip->mi_pdata, dsize);
1012 	mip->mi_pdata_size = dsize;
1013 
1014 	/*
1015 	 * Since the MAC plugin data is used to construct MAC headers that
1016 	 * were cached in fast-path headers, we need to flush fast-path
1017 	 * information for links associated with this mac.
1018 	 */
1019 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
1020 	return (0);
1021 }
1022 
1023 /*
1024  * The mac provider or mac frameowrk calls this function when it wants
1025  * to notify upstream consumers that the capabilities have changed and
1026  * that they should modify their own internal state accordingly.
1027  *
1028  * We currently have no regard for the fact that a provider could
1029  * decide to drop capabilities which would invalidate pending traffic.
1030  * For example, if one was to disable the Tx checksum offload while
1031  * TCP/IP traffic was being sent by mac clients relying on that
1032  * feature, then those packets would hit the write with missing or
1033  * partial checksums. A proper solution involves not only providing
1034  * notfication, but also performing client quiescing. That is, a capab
1035  * change should be treated as an atomic transaction that forms a
1036  * barrier between traffic relying on the current capabs and traffic
1037  * relying on the new capabs. In practice, simnet is currently the
1038  * only provider that could hit this, and it's an easily avoidable
1039  * situation (and at worst it should only lead to some dropped
1040  * packets). But if we ever want better on-the-fly capab change to
1041  * actual hardware providers, then we should give this update
1042  * mechanism a proper implementation.
1043  */
1044 void
1045 mac_capab_update(mac_handle_t mh)
1046 {
1047 	/*
1048 	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
1049 	 * clients to renegotiate capabilities.
1050 	 */
1051 	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
1052 }
1053 
1054 /*
1055  * Used by normal drivers to update the max sdu size.
1056  * We need to handle the case of a smaller mi_sdu_multicast
1057  * since this is called by mac_set_mtu() even for drivers that
1058  * have differing unicast and multicast mtu and we don't want to
1059  * increase the multicast mtu by accident in that case.
1060  */
1061 int
1062 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
1063 {
1064 	mac_impl_t	*mip = (mac_impl_t *)mh;
1065 
1066 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1067 		return (EINVAL);
1068 	mip->mi_sdu_max = sdu_max;
1069 	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
1070 		mip->mi_sdu_multicast = mip->mi_sdu_max;
1071 
1072 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1073 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1074 	return (0);
1075 }
1076 
1077 /*
1078  * Version of the above function that is used by drivers that have a different
1079  * max sdu size for multicast/broadcast vs. unicast.
1080  */
1081 int
1082 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
1083 {
1084 	mac_impl_t	*mip = (mac_impl_t *)mh;
1085 
1086 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1087 		return (EINVAL);
1088 	if (sdu_multicast == 0)
1089 		sdu_multicast = sdu_max;
1090 	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
1091 		return (EINVAL);
1092 	mip->mi_sdu_max = sdu_max;
1093 	mip->mi_sdu_multicast = sdu_multicast;
1094 
1095 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1096 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1097 	return (0);
1098 }
1099 
1100 static void
1101 mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
1102 {
1103 	mac_client_impl_t *mcip;
1104 	flow_entry_t *flent;
1105 	mac_soft_ring_set_t *mac_rx_srs;
1106 	mac_cpus_t *srs_cpu;
1107 	int i;
1108 
1109 	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
1110 	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
1111 		/* interrupt can be re-targeted */
1112 		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
1113 		flent = mcip->mci_flent;
1114 		if (ring->mr_type == MAC_RING_TYPE_RX) {
1115 			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1116 				mac_rx_srs = flent->fe_rx_srs[i];
1117 				if (mac_rx_srs->srs_ring != ring)
1118 					continue;
1119 				srs_cpu = &mac_rx_srs->srs_cpu;
1120 				mutex_enter(&cpu_lock);
1121 				mac_rx_srs_retarget_intr(mac_rx_srs,
1122 				    srs_cpu->mc_rx_intr_cpu);
1123 				mutex_exit(&cpu_lock);
1124 				break;
1125 			}
1126 		} else {
1127 			if (flent->fe_tx_srs != NULL) {
1128 				mutex_enter(&cpu_lock);
1129 				mac_tx_srs_retarget_intr(
1130 				    flent->fe_tx_srs);
1131 				mutex_exit(&cpu_lock);
1132 			}
1133 		}
1134 	}
1135 }
1136 
1137 /*
1138  * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
1139  * their clients. There is a 1-1 mapping pseudo ring and the hardware
1140  * ring. ddi interrupt handles are exported from the hardware ring to
1141  * the pseudo ring. Thus when the interrupt handle changes, clients of
1142  * aggr that are using the handle need to use the new handle and
1143  * re-target their interrupts.
1144  */
1145 static void
1146 mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
1147     ddi_intr_handle_t ddh)
1148 {
1149 	mac_ring_t *pring;
1150 	mac_group_t *pgroup;
1151 	mac_impl_t *pmip;
1152 	char macname[MAXNAMELEN];
1153 	mac_perim_handle_t p_mph;
1154 	uint64_t saved_gen_num;
1155 
1156 again:
1157 	pring = (mac_ring_t *)ring->mr_prh;
1158 	pgroup = (mac_group_t *)pring->mr_gh;
1159 	pmip = (mac_impl_t *)pgroup->mrg_mh;
1160 	saved_gen_num = ring->mr_gen_num;
1161 	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
1162 	/*
1163 	 * We need to enter aggr's perimeter. The locking hierarchy
1164 	 * dictates that aggr's perimeter should be entered first
1165 	 * and then the port's perimeter. So drop the port's
1166 	 * perimeter, enter aggr's and then re-enter port's
1167 	 * perimeter.
1168 	 */
1169 	i_mac_perim_exit(mip);
1170 	/*
1171 	 * While we know pmip is the aggr's mip, there is a
1172 	 * possibility that aggr could have unregistered by
1173 	 * the time we exit port's perimeter (mip) and
1174 	 * enter aggr's perimeter (pmip). To avoid that
1175 	 * scenario, enter aggr's perimeter using its name.
1176 	 */
1177 	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
1178 		return;
1179 	i_mac_perim_enter(mip);
1180 	/*
1181 	 * Check if the ring got assigned to another aggregation before
1182 	 * be could enter aggr's and the port's perimeter. When a ring
1183 	 * gets deleted from an aggregation, it calls mac_stop_ring()
1184 	 * which increments the generation number. So checking
1185 	 * generation number will be enough.
1186 	 */
1187 	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
1188 		i_mac_perim_exit(mip);
1189 		mac_perim_exit(p_mph);
1190 		i_mac_perim_enter(mip);
1191 		goto again;
1192 	}
1193 
1194 	/* Check if pseudo ring is still present */
1195 	if (ring->mr_prh != NULL) {
1196 		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
1197 		pring->mr_info.mri_intr.mi_ddi_shared =
1198 		    ring->mr_info.mri_intr.mi_ddi_shared;
1199 		if (ddh != NULL)
1200 			mac_ring_intr_retarget(pgroup, pring);
1201 	}
1202 	i_mac_perim_exit(mip);
1203 	mac_perim_exit(p_mph);
1204 }
1205 /*
1206  * API called by driver to provide new interrupt handle for TX/RX rings.
1207  * This usually happens when IRM (Interrupt Resource Manangement)
1208  * framework either gives the driver more MSI-x interrupts or takes
1209  * away MSI-x interrupts from the driver.
1210  */
1211 void
1212 mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
1213 {
1214 	mac_ring_t	*ring = (mac_ring_t *)mrh;
1215 	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
1216 	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;
1217 
1218 	i_mac_perim_enter(mip);
1219 	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
1220 	if (ddh == NULL) {
1221 		/* Interrupts being reset */
1222 		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
1223 		if (ring->mr_prh != NULL) {
1224 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1225 			return;
1226 		}
1227 	} else {
1228 		/* New interrupt handle */
1229 		mac_compare_ddi_handle(mip->mi_rx_groups,
1230 		    mip->mi_rx_group_count, ring);
1231 		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
1232 			mac_compare_ddi_handle(mip->mi_tx_groups,
1233 			    mip->mi_tx_group_count, ring);
1234 		}
1235 		if (ring->mr_prh != NULL) {
1236 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1237 			return;
1238 		} else {
1239 			mac_ring_intr_retarget(group, ring);
1240 		}
1241 	}
1242 	i_mac_perim_exit(mip);
1243 }
1244 
1245 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1246 
1247 /*
1248  * Updates the mac_impl structure with the current state of the link
1249  */
1250 static void
1251 i_mac_log_link_state(mac_impl_t *mip)
1252 {
1253 	/*
1254 	 * If no change, then it is not interesting.
1255 	 */
1256 	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
1257 		return;
1258 
1259 	switch (mip->mi_lowlinkstate) {
1260 	case LINK_STATE_UP:
1261 		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
1262 			char det[200];
1263 
1264 			mip->mi_type->mt_ops.mtops_link_details(det,
1265 			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
1266 
1267 			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
1268 		} else {
1269 			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
1270 		}
1271 		break;
1272 
1273 	case LINK_STATE_DOWN:
1274 		/*
1275 		 * Only transitions from UP to DOWN are interesting
1276 		 */
1277 		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
1278 			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
1279 		break;
1280 
1281 	case LINK_STATE_UNKNOWN:
1282 		/*
1283 		 * This case is normally not interesting.
1284 		 */
1285 		break;
1286 	}
1287 	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
1288 }
1289 
1290 /*
1291  * Main routine for the callbacks notifications thread
1292  */
1293 static void
1294 i_mac_notify_thread(void *arg)
1295 {
1296 	mac_impl_t	*mip = arg;
1297 	callb_cpr_t	cprinfo;
1298 	mac_cb_t	*mcb;
1299 	mac_cb_info_t	*mcbi;
1300 	mac_notify_cb_t	*mncb;
1301 
1302 	mcbi = &mip->mi_notify_cb_info;
1303 	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
1304 	    "i_mac_notify_thread");
1305 
1306 	mutex_enter(mcbi->mcbi_lockp);
1307 
1308 	for (;;) {
1309 		uint32_t	bits;
1310 		uint32_t	type;
1311 
1312 		bits = mip->mi_notify_bits;
1313 		if (bits == 0) {
1314 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1315 			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1316 			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
1317 			continue;
1318 		}
1319 		mip->mi_notify_bits = 0;
1320 		if ((bits & (1 << MAC_NNOTE)) != 0) {
1321 			/* request to quit */
1322 			ASSERT(mip->mi_state_flags & MIS_DISABLED);
1323 			break;
1324 		}
1325 
1326 		mutex_exit(mcbi->mcbi_lockp);
1327 
1328 		/*
1329 		 * Log link changes on the actual link, but then do reports on
1330 		 * synthetic state (if part of a bridge).
1331 		 */
1332 		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
1333 			link_state_t newstate;
1334 			mac_handle_t mh;
1335 
1336 			i_mac_log_link_state(mip);
1337 			newstate = mip->mi_lowlinkstate;
1338 			if (mip->mi_bridge_link != NULL) {
1339 				mutex_enter(&mip->mi_bridge_lock);
1340 				if ((mh = mip->mi_bridge_link) != NULL) {
1341 					newstate = mac_bridge_ls_cb(mh,
1342 					    newstate);
1343 				}
1344 				mutex_exit(&mip->mi_bridge_lock);
1345 			}
1346 			if (newstate != mip->mi_linkstate) {
1347 				mip->mi_linkstate = newstate;
1348 				bits |= 1 << MAC_NOTE_LINK;
1349 			}
1350 		}
1351 
1352 		/*
1353 		 * Depending on which capabs have changed, the Tx
1354 		 * checksum flags may also need to be updated.
1355 		 */
1356 		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
1357 			mac_perim_handle_t mph;
1358 			mac_handle_t mh = (mac_handle_t)mip;
1359 
1360 			mac_perim_enter_by_mh(mh, &mph);
1361 			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
1362 			mac_perim_exit(mph);
1363 		}
1364 
1365 		/*
1366 		 * Do notification callbacks for each notification type.
1367 		 */
1368 		for (type = 0; type < MAC_NNOTE; type++) {
1369 			if ((bits & (1 << type)) == 0) {
1370 				continue;
1371 			}
1372 
1373 			if (mac_notify_cb_list[type] != NULL)
1374 				(*mac_notify_cb_list[type])(mip);
1375 
1376 			/*
1377 			 * Walk the list of notifications.
1378 			 */
1379 			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
1380 			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
1381 			    mcb = mcb->mcb_nextp) {
1382 				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
1383 				mncb->mncb_fn(mncb->mncb_arg, type);
1384 			}
1385 			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
1386 			    &mip->mi_notify_cb_list);
1387 		}
1388 
1389 		mutex_enter(mcbi->mcbi_lockp);
1390 	}
1391 
1392 	mip->mi_state_flags |= MIS_NOTIFY_DONE;
1393 	cv_broadcast(&mcbi->mcbi_cv);
1394 
1395 	/* CALLB_CPR_EXIT drops the lock */
1396 	CALLB_CPR_EXIT(&cprinfo);
1397 	thread_exit();
1398 }
1399 
1400 /*
1401  * Signal the i_mac_notify_thread asking it to quit.
1402  * Then wait till it is done.
1403  */
1404 void
1405 i_mac_notify_exit(mac_impl_t *mip)
1406 {
1407 	mac_cb_info_t	*mcbi;
1408 
1409 	mcbi = &mip->mi_notify_cb_info;
1410 
1411 	mutex_enter(mcbi->mcbi_lockp);
1412 	mip->mi_notify_bits = (1 << MAC_NNOTE);
1413 	cv_broadcast(&mcbi->mcbi_cv);
1414 
1415 
1416 	while ((mip->mi_notify_thread != NULL) &&
1417 	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
1418 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1419 	}
1420 
1421 	/* Necessary clean up before doing kmem_cache_free */
1422 	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
1423 	mip->mi_notify_bits = 0;
1424 	mip->mi_notify_thread = NULL;
1425 	mutex_exit(mcbi->mcbi_lockp);
1426 }
1427 
1428 /*
1429  * Entry point invoked by drivers to dynamically add a ring to an
1430  * existing group.
1431  */
1432 int
1433 mac_group_add_ring(mac_group_handle_t gh, int index)
1434 {
1435 	mac_group_t *group = (mac_group_t *)gh;
1436 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1437 	int ret;
1438 
1439 	i_mac_perim_enter(mip);
1440 	ret = i_mac_group_add_ring(group, NULL, index);
1441 	i_mac_perim_exit(mip);
1442 	return (ret);
1443 }
1444 
1445 /*
1446  * Entry point invoked by drivers to dynamically remove a ring
1447  * from an existing group. The specified ring handle must no longer
1448  * be used by the driver after a call to this function.
1449  */
1450 void
1451 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1452 {
1453 	mac_group_t *group = (mac_group_t *)gh;
1454 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1455 
1456 	i_mac_perim_enter(mip);
1457 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1458 	i_mac_perim_exit(mip);
1459 }
1460 
1461 /*
1462  * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
1463  * entry points.
1464  */
1465 
1466 void
1467 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
1468 {
1469 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1470 
1471 	/* nothing to do if the caller doesn't want the default value */
1472 	if (pr->pr_default == NULL)
1473 		return;
1474 
1475 	ASSERT(pr->pr_default_size >= sizeof (uint8_t));
1476 
1477 	*(uint8_t *)(pr->pr_default) = val;
1478 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1479 }
1480 
1481 void
1482 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
1483 {
1484 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1485 
1486 	/* nothing to do if the caller doesn't want the default value */
1487 	if (pr->pr_default == NULL)
1488 		return;
1489 
1490 	ASSERT(pr->pr_default_size >= sizeof (uint64_t));
1491 
1492 	bcopy(&val, pr->pr_default, sizeof (val));
1493 
1494 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1495 }
1496 
1497 void
1498 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
1499 {
1500 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1501 
1502 	/* nothing to do if the caller doesn't want the default value */
1503 	if (pr->pr_default == NULL)
1504 		return;
1505 
1506 	ASSERT(pr->pr_default_size >= sizeof (uint32_t));
1507 
1508 	bcopy(&val, pr->pr_default, sizeof (val));
1509 
1510 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1511 }
1512 
1513 void
1514 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
1515 {
1516 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1517 
1518 	/* nothing to do if the caller doesn't want the default value */
1519 	if (pr->pr_default == NULL)
1520 		return;
1521 
1522 	if (strlen(str) >= pr->pr_default_size)
1523 		pr->pr_errno = ENOBUFS;
1524 	else
1525 		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
1526 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1527 }
1528 
1529 void
1530 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
1531     link_flowctrl_t val)
1532 {
1533 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1534 
1535 	/* nothing to do if the caller doesn't want the default value */
1536 	if (pr->pr_default == NULL)
1537 		return;
1538 
1539 	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));
1540 
1541 	bcopy(&val, pr->pr_default, sizeof (val));
1542 
1543 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1544 }
1545 
1546 void
1547 mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val)
1548 {
1549 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1550 
1551 	/* nothing to do if the caller doesn't want the default value */
1552 	if (pr->pr_default == NULL)
1553 		return;
1554 
1555 	ASSERT(pr->pr_default_size >= sizeof (link_fec_t));
1556 
1557 	bcopy(&val, pr->pr_default, sizeof (val));
1558 
1559 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1560 }
1561 
1562 void
1563 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
1564     uint32_t max)
1565 {
1566 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1567 	mac_propval_range_t *range = pr->pr_range;
1568 	mac_propval_uint32_range_t *range32;
1569 
1570 	/* nothing to do if the caller doesn't want the range info */
1571 	if (range == NULL)
1572 		return;
1573 
1574 	if (pr->pr_range_cur_count++ == 0) {
1575 		/* first range */
1576 		pr->pr_flags |= MAC_PROP_INFO_RANGE;
1577 		range->mpr_type = MAC_PROPVAL_UINT32;
1578 	} else {
1579 		/* all ranges of a property should be of the same type */
1580 		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
1581 		if (pr->pr_range_cur_count > range->mpr_count) {
1582 			pr->pr_errno = ENOSPC;
1583 			return;
1584 		}
1585 	}
1586 
1587 	range32 = range->mpr_range_uint32;
1588 	range32[pr->pr_range_cur_count - 1].mpur_min = min;
1589 	range32[pr->pr_range_cur_count - 1].mpur_max = max;
1590 }
1591 
1592 void
1593 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
1594 {
1595 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1596 
1597 	pr->pr_perm = perm;
1598 	pr->pr_flags |= MAC_PROP_INFO_PERM;
1599 }
1600 
1601 void
1602 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
1603     uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
1604 {
1605 	uint32_t flags;
1606 
1607 	ASSERT(DB_TYPE(mp) == M_DATA);
1608 
1609 	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
1610 	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
1611 		if (value != NULL)
1612 			*value = (uint32_t)DB_CKSUM16(mp);
1613 		if ((flags & HCK_PARTIALCKSUM) != 0) {
1614 			if (start != NULL)
1615 				*start = (uint32_t)DB_CKSUMSTART(mp);
1616 			if (stuff != NULL)
1617 				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
1618 			if (end != NULL)
1619 				*end = (uint32_t)DB_CKSUMEND(mp);
1620 		}
1621 	}
1622 
1623 	if (flags_ptr != NULL)
1624 		*flags_ptr = flags;
1625 }
1626 
1627 void
1628 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
1629     uint32_t value, uint32_t flags)
1630 {
1631 	ASSERT(DB_TYPE(mp) == M_DATA);
1632 
1633 	DB_CKSUMSTART(mp) = (intptr_t)start;
1634 	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
1635 	DB_CKSUMEND(mp) = (intptr_t)end;
1636 	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
1637 	DB_CKSUM16(mp) = (uint16_t)value;
1638 }
1639 
1640 void
1641 mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
1642 {
1643 	ASSERT3U(DB_TYPE(src), ==, M_DATA);
1644 	ASSERT3U(DB_TYPE(dst), ==, M_DATA);
1645 
1646 	/*
1647 	 * Do these assignments unconditionally, rather than only when
1648 	 * flags is non-zero. This protects a situation where zeroed
1649 	 * hcksum data does not make the jump onto an mblk_t with
1650 	 * stale data in those fields. It's important to copy all
1651 	 * possible flags (HCK_* as well as HW_*) and not just the
1652 	 * checksum specific flags. Dropping flags during a clone
1653 	 * could result in dropped packets. If the caller has good
1654 	 * reason to drop those flags then it should do it manually,
1655 	 * after the clone.
1656 	 */
1657 	DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
1658 	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
1659 	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
1660 	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
1661 	DB_CKSUM16(dst) = DB_CKSUM16(src);
1662 	DB_LSOMSS(dst) = DB_LSOMSS(src);
1663 }
1664 
1665 void
1666 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1667 {
1668 	ASSERT(DB_TYPE(mp) == M_DATA);
1669 
1670 	if (flags != NULL) {
1671 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1672 		if ((*flags != 0) && (mss != NULL))
1673 			*mss = (uint32_t)DB_LSOMSS(mp);
1674 	}
1675 }
1676 
1677 void
1678 mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
1679     boolean_t present)
1680 {
1681 	infop->mti_present = present;
1682 }
1683 
1684 void
1685 mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
1686     boolean_t usable)
1687 {
1688 	infop->mti_usable = usable;
1689 }
1690 
1691 /*
1692  * We should really keep track of our offset and not walk everything every
1693  * time. I can't imagine that this will be kind to us at high packet rates;
1694  * however, for the moment, let's leave that.
1695  *
1696  * This walks a message block chain without pulling up to fill in the context
1697  * information. Note that the data we care about could be hidden across more
1698  * than one mblk_t.
1699  */
1700 static int
1701 mac_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out)
1702 {
1703 	size_t mpsize;
1704 	uint8_t *bp;
1705 
1706 	mpsize = msgsize(mp);
1707 	/* Check for overflow */
1708 	if (off + sizeof (uint16_t) > mpsize)
1709 		return (-1);
1710 
1711 	mpsize = MBLKL(mp);
1712 	while (off >= mpsize) {
1713 		mp = mp->b_cont;
1714 		off -= mpsize;
1715 		mpsize = MBLKL(mp);
1716 	}
1717 
1718 	bp = mp->b_rptr + off;
1719 	*out = *bp;
1720 	return (0);
1721 
1722 }
1723 
1724 static int
1725 mac_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out)
1726 {
1727 	size_t mpsize;
1728 	uint8_t *bp;
1729 
1730 	mpsize = msgsize(mp);
1731 	/* Check for overflow */
1732 	if (off + sizeof (uint16_t) > mpsize)
1733 		return (-1);
1734 
1735 	mpsize = MBLKL(mp);
1736 	while (off >= mpsize) {
1737 		mp = mp->b_cont;
1738 		off -= mpsize;
1739 		mpsize = MBLKL(mp);
1740 	}
1741 
1742 	/*
1743 	 * Data is in network order. Note the second byte of data might be in
1744 	 * the next mp.
1745 	 */
1746 	bp = mp->b_rptr + off;
1747 	*out = *bp << 8;
1748 	if (off + 1 == mpsize) {
1749 		mp = mp->b_cont;
1750 		bp = mp->b_rptr;
1751 	} else {
1752 		bp++;
1753 	}
1754 
1755 	*out |= *bp;
1756 	return (0);
1757 
1758 }
1759 
1760 
1761 int
1762 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
1763 {
1764 	size_t off;
1765 	uint16_t ether;
1766 	uint8_t ipproto, iplen, l4len, maclen;
1767 
1768 	bzero(meoi, sizeof (mac_ether_offload_info_t));
1769 
1770 	meoi->meoi_len = msgsize(mp);
1771 	off = offsetof(struct ether_header, ether_type);
1772 	if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1773 		return (-1);
1774 
1775 	if (ether == ETHERTYPE_VLAN) {
1776 		off = offsetof(struct ether_vlan_header, ether_type);
1777 		if (mac_meoi_get_uint16(mp, off, &ether) != 0)
1778 			return (-1);
1779 		meoi->meoi_flags |= MEOI_VLAN_TAGGED;
1780 		maclen = sizeof (struct ether_vlan_header);
1781 	} else {
1782 		maclen = sizeof (struct ether_header);
1783 	}
1784 	meoi->meoi_flags |= MEOI_L2INFO_SET;
1785 	meoi->meoi_l2hlen = maclen;
1786 	meoi->meoi_l3proto = ether;
1787 
1788 	switch (ether) {
1789 	case ETHERTYPE_IP:
1790 		/*
1791 		 * For IPv4 we need to get the length of the header, as it can
1792 		 * be variable.
1793 		 */
1794 		off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen;
1795 		if (mac_meoi_get_uint8(mp, off, &iplen) != 0)
1796 			return (-1);
1797 		iplen &= 0x0f;
1798 		if (iplen < 5 || iplen > 0x0f)
1799 			return (-1);
1800 		iplen *= 4;
1801 		off = offsetof(ipha_t, ipha_protocol) + maclen;
1802 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1803 			return (-1);
1804 		break;
1805 	case ETHERTYPE_IPV6:
1806 		iplen = 40;
1807 		off = offsetof(ip6_t, ip6_nxt) + maclen;
1808 		if (mac_meoi_get_uint8(mp, off, &ipproto) == -1)
1809 			return (-1);
1810 		break;
1811 	default:
1812 		return (0);
1813 	}
1814 	meoi->meoi_l3hlen = iplen;
1815 	meoi->meoi_l4proto = ipproto;
1816 	meoi->meoi_flags |= MEOI_L3INFO_SET;
1817 
1818 	switch (ipproto) {
1819 	case IPPROTO_TCP:
1820 		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
1821 		if (mac_meoi_get_uint8(mp, off, &l4len) == -1)
1822 			return (-1);
1823 		l4len = (l4len & 0xf0) >> 4;
1824 		if (l4len < 5 || l4len > 0xf)
1825 			return (-1);
1826 		l4len *= 4;
1827 		break;
1828 	case IPPROTO_UDP:
1829 		l4len = sizeof (struct udphdr);
1830 		break;
1831 	case IPPROTO_SCTP:
1832 		l4len = sizeof (sctp_hdr_t);
1833 		break;
1834 	default:
1835 		return (0);
1836 	}
1837 
1838 	meoi->meoi_l4hlen = l4len;
1839 	meoi->meoi_flags |= MEOI_L4INFO_SET;
1840 	return (0);
1841 }
1842