xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision 3d6ee46b4ddaa0ca6a00cc84d52edf88676f88ce)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  * Copyright 2020 RackTop Systems, Inc.
27  * Copyright 2025 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/id_space.h>
33 #include <sys/esunddi.h>
34 #include <sys/stat.h>
35 #include <sys/mkdev.h>
36 #include <sys/stream.h>
37 #include <sys/strsubr.h>
38 #include <sys/dlpi.h>
39 #include <sys/modhash.h>
40 #include <sys/mac.h>
41 #include <sys/mac_provider.h>
42 #include <sys/mac_impl.h>
43 #include <sys/mac_client_impl.h>
44 #include <sys/mac_client_priv.h>
45 #include <sys/mac_soft_ring.h>
46 #include <sys/mac_stat.h>
47 #include <sys/dld.h>
48 #include <sys/modctl.h>
49 #include <sys/fs/dv_node.h>
50 #include <sys/thread.h>
51 #include <sys/proc.h>
52 #include <sys/callb.h>
53 #include <sys/cpuvar.h>
54 #include <sys/atomic.h>
55 #include <sys/sdt.h>
56 #include <sys/mac_flow.h>
57 #include <sys/ddi_intr_impl.h>
58 #include <sys/disp.h>
59 #include <sys/sdt.h>
60 #include <sys/stdbool.h>
61 #include <sys/pattr.h>
62 #include <sys/strsun.h>
63 #include <sys/vlan.h>
64 #include <inet/ip.h>
65 #include <inet/tcp.h>
66 #include <netinet/udp.h>
67 #include <netinet/sctp.h>
68 #include <netinet/icmp6.h>
69 
70 /*
71  * MAC Provider Interface.
72  *
73  * Interface for GLDv3 compatible NIC drivers.
74  */
75 
76 static void i_mac_notify_thread(void *);
77 
/* Signature of a default notification handler run by the notify thread. */
typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);

/*
 * Default framework-side actions for each notification type, apparently
 * indexed by the MAC_NOTE_* value (the table is sized by MAC_NNOTE).  A
 * NULL entry means the framework takes no default action for that
 * notification and only forwards it to registered clients.
 */
static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
	mac_fanout_recompute,	/* MAC_NOTE_LINK */
	NULL,		/* MAC_NOTE_UNICST */
	NULL,		/* MAC_NOTE_TX */
	NULL,		/* MAC_NOTE_DEVPROMISC */
	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
	NULL,		/* MAC_NOTE_SDU_SIZE */
	NULL,		/* MAC_NOTE_MARGIN */
	NULL,		/* MAC_NOTE_CAPAB_CHG */
	NULL		/* MAC_NOTE_LOWLINK */
};
91 
92 /*
93  * Driver support functions.
94  */
95 
96 /* REGISTRATION */
97 
98 mac_register_t *
mac_alloc(uint_t mac_version)99 mac_alloc(uint_t mac_version)
100 {
101 	mac_register_t *mregp;
102 
103 	/*
104 	 * Make sure there isn't a version mismatch between the driver and
105 	 * the framework.  In the future, if multiple versions are
106 	 * supported, this check could become more sophisticated.
107 	 */
108 	if (mac_version != MAC_VERSION)
109 		return (NULL);
110 
111 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
112 	mregp->m_version = mac_version;
113 	return (mregp);
114 }
115 
116 void
mac_free(mac_register_t * mregp)117 mac_free(mac_register_t *mregp)
118 {
119 	kmem_free(mregp, sizeof (mac_register_t));
120 }
121 
122 /*
123  * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
124  * value.
125  */
126 static uint16_t
mac_features_to_flags(mac_handle_t mh)127 mac_features_to_flags(mac_handle_t mh)
128 {
129 	uint16_t flags = 0;
130 	uint32_t cap_sum = 0;
131 	mac_capab_lso_t cap_lso;
132 
133 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
134 		if (cap_sum & HCKSUM_IPHDRCKSUM)
135 			flags |= HCK_IPV4_HDRCKSUM;
136 
137 		if (cap_sum & HCKSUM_INET_PARTIAL)
138 			flags |= HCK_PARTIALCKSUM;
139 		else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
140 			flags |= HCK_FULLCKSUM;
141 	}
142 
143 	/*
144 	 * We don't need the information stored in 'cap_lso', but we
145 	 * need to pass a non-NULL pointer to appease the driver.
146 	 */
147 	if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
148 		flags |= HW_LSO;
149 
150 	return (flags);
151 }
152 
153 /*
154  * mac_register() is how drivers register new MACs with the GLDv3
155  * framework.  The mregp argument is allocated by drivers using the
156  * mac_alloc() function, and can be freed using mac_free() immediately upon
157  * return from mac_register().  Upon success (0 return value), the mhp
158  * opaque pointer becomes the driver's handle to its MAC interface, and is
159  * the argument to all other mac module entry points.
160  */
161 /* ARGSUSED */
162 int
mac_register(mac_register_t * mregp,mac_handle_t * mhp)163 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
164 {
165 	mac_impl_t		*mip;
166 	mactype_t		*mtype;
167 	int			err = EINVAL;
168 	struct devnames		*dnp = NULL;
169 	uint_t			instance;
170 	boolean_t		style1_created = B_FALSE;
171 	boolean_t		style2_created = B_FALSE;
172 	char			*driver;
173 	minor_t			minor = 0;
174 
175 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
176 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
177 		return (EINVAL);
178 
179 	/* Find the required MAC-Type plugin. */
180 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
181 		return (EINVAL);
182 
183 	/* Create a mac_impl_t to represent this MAC. */
184 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
185 
186 	/*
187 	 * The mac is not ready for open yet.
188 	 */
189 	mip->mi_state_flags |= MIS_DISABLED;
190 
191 	/*
192 	 * When a mac is registered, the m_instance field can be set to:
193 	 *
194 	 *  0:	Get the mac's instance number from m_dip.
195 	 *	This is usually used for physical device dips.
196 	 *
197 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
198 	 *	For example, when an aggregation is created with the key option,
199 	 *	"key" will be used as the instance number.
200 	 *
201 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
202 	 *	This is often used when a MAC of a virtual link is registered
203 	 *	(e.g., aggregation when "key" is not specified, or vnic).
204 	 *
205 	 * Note that the instance number is used to derive the mi_minor field
206 	 * of mac_impl_t, which will then be used to derive the name of kstats
207 	 * and the devfs nodes.  The first 2 cases are needed to preserve
208 	 * backward compatibility.
209 	 */
210 	switch (mregp->m_instance) {
211 	case 0:
212 		instance = ddi_get_instance(mregp->m_dip);
213 		break;
214 	case ((uint_t)-1):
215 		minor = mac_minor_hold(B_TRUE);
216 		if (minor == 0) {
217 			err = ENOSPC;
218 			goto fail;
219 		}
220 		instance = minor - 1;
221 		break;
222 	default:
223 		instance = mregp->m_instance;
224 		if (instance >= MAC_MAX_MINOR) {
225 			err = EINVAL;
226 			goto fail;
227 		}
228 		break;
229 	}
230 
231 	mip->mi_minor = (minor_t)(instance + 1);
232 	mip->mi_dip = mregp->m_dip;
233 	mip->mi_clients_list = NULL;
234 	mip->mi_nclients = 0;
235 
236 	/* Set the default IEEE Port VLAN Identifier */
237 	mip->mi_pvid = 1;
238 
239 	/* Default bridge link learning protection values */
240 	mip->mi_llimit = 1000;
241 	mip->mi_ldecay = 200;
242 
243 	driver = (char *)ddi_driver_name(mip->mi_dip);
244 
245 	/* Construct the MAC name as <drvname><instance> */
246 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
247 	    driver, instance);
248 
249 	mip->mi_driver = mregp->m_driver;
250 
251 	mip->mi_type = mtype;
252 	mip->mi_margin = mregp->m_margin;
253 	mip->mi_info.mi_media = mtype->mt_type;
254 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
255 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
256 		goto fail;
257 	if (mregp->m_multicast_sdu == 0)
258 		mregp->m_multicast_sdu = mregp->m_max_sdu;
259 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
260 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
261 		goto fail;
262 	mip->mi_sdu_min = mregp->m_min_sdu;
263 	mip->mi_sdu_max = mregp->m_max_sdu;
264 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
265 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
266 	/*
267 	 * If the media supports a broadcast address, cache a pointer to it
268 	 * in the mac_info_t so that upper layers can use it.
269 	 */
270 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
271 
272 	mip->mi_v12n_level = mregp->m_v12n;
273 
274 	/*
275 	 * Copy the unicast source address into the mac_info_t, but only if
276 	 * the MAC-Type defines a non-zero address length.  We need to
277 	 * handle MAC-Types that have an address length of 0
278 	 * (point-to-point protocol MACs for example).
279 	 */
280 	if (mip->mi_type->mt_addr_length > 0) {
281 		if (mregp->m_src_addr == NULL)
282 			goto fail;
283 		mip->mi_info.mi_unicst_addr =
284 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
285 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
286 		    mip->mi_type->mt_addr_length);
287 
288 		/*
289 		 * Copy the fixed 'factory' MAC address from the immutable
290 		 * info.  This is taken to be the MAC address currently in
291 		 * use.
292 		 */
293 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
294 		    mip->mi_type->mt_addr_length);
295 
296 		/*
297 		 * At this point, we should set up the classification
298 		 * rules etc but we delay it till mac_open() so that
299 		 * the resource discovery has taken place and we
300 		 * know someone wants to use the device. Otherwise
301 		 * memory gets allocated for Rx ring structures even
302 		 * during probe.
303 		 */
304 
305 		/* Copy the destination address if one is provided. */
306 		if (mregp->m_dst_addr != NULL) {
307 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
308 			    mip->mi_type->mt_addr_length);
309 			mip->mi_dstaddr_set = B_TRUE;
310 		}
311 	} else if (mregp->m_src_addr != NULL) {
312 		goto fail;
313 	}
314 
315 	/*
316 	 * The format of the m_pdata is specific to the plugin.  It is
317 	 * passed in as an argument to all of the plugin callbacks.  The
318 	 * driver can update this information by calling
319 	 * mac_pdata_update().
320 	 */
321 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
322 		/*
323 		 * Verify if the supplied plugin data is valid.  Note that
324 		 * even if the caller passed in a NULL pointer as plugin data,
325 		 * we still need to verify if that's valid as the plugin may
326 		 * require plugin data to function.
327 		 */
328 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
329 		    mregp->m_pdata_size)) {
330 			goto fail;
331 		}
332 		if (mregp->m_pdata != NULL) {
333 			mip->mi_pdata =
334 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
335 			bcopy(mregp->m_pdata, mip->mi_pdata,
336 			    mregp->m_pdata_size);
337 			mip->mi_pdata_size = mregp->m_pdata_size;
338 		}
339 	} else if (mregp->m_pdata != NULL) {
340 		/*
341 		 * The caller supplied non-NULL plugin data, but the plugin
342 		 * does not recognize plugin data.
343 		 */
344 		err = EINVAL;
345 		goto fail;
346 	}
347 
348 	/*
349 	 * Register the private properties.
350 	 */
351 	mac_register_priv_prop(mip, mregp->m_priv_props);
352 
353 	/*
354 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
355 	 * check to make sure all mandatory callbacks are set.
356 	 */
357 	if (mregp->m_callbacks->mc_getstat == NULL ||
358 	    mregp->m_callbacks->mc_start == NULL ||
359 	    mregp->m_callbacks->mc_stop == NULL ||
360 	    mregp->m_callbacks->mc_setpromisc == NULL ||
361 	    mregp->m_callbacks->mc_multicst == NULL) {
362 		goto fail;
363 	}
364 	mip->mi_callbacks = mregp->m_callbacks;
365 
366 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
367 	    &mip->mi_capab_legacy)) {
368 		mip->mi_state_flags |= MIS_LEGACY;
369 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
370 	} else {
371 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
372 		    mip->mi_minor);
373 	}
374 
375 	/*
376 	 * Allocate a notification thread. thread_create blocks for memory
377 	 * if needed, it never fails.
378 	 */
379 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
380 	    mip, 0, &p0, TS_RUN, minclsyspri);
381 
382 	/*
383 	 * Cache the DB_CKSUMFLAGS that this MAC supports.
384 	 */
385 	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
386 
387 	/*
388 	 * Initialize the capabilities
389 	 */
390 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
391 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
392 
393 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
394 		mip->mi_state_flags |= MIS_IS_VNIC;
395 
396 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
397 		mip->mi_state_flags |= MIS_IS_AGGR;
398 
399 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
400 		mip->mi_state_flags |= MIS_IS_OVERLAY;
401 
402 	mac_addr_factory_init(mip);
403 
404 	mac_transceiver_init(mip);
405 
406 	mac_led_init(mip);
407 
408 	/*
409 	 * Enforce the virtrualization level registered.
410 	 */
411 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
412 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
413 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
414 			goto fail;
415 
416 		/*
417 		 * The driver needs to register at least rx rings for this
418 		 * virtualization level.
419 		 */
420 		if (mip->mi_rx_groups == NULL)
421 			goto fail;
422 	}
423 
424 	/*
425 	 * The driver must set mc_unicst entry point to NULL when it advertises
426 	 * CAP_RINGS for rx groups.
427 	 */
428 	if (mip->mi_rx_groups != NULL) {
429 		if (mregp->m_callbacks->mc_unicst != NULL)
430 			goto fail;
431 	} else {
432 		if (mregp->m_callbacks->mc_unicst == NULL)
433 			goto fail;
434 	}
435 
436 	/*
437 	 * Initialize MAC addresses. Must be called after mac_init_rings().
438 	 */
439 	mac_init_macaddr(mip);
440 
441 	mip->mi_share_capab.ms_snum = 0;
442 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
443 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
444 		    &mip->mi_share_capab);
445 	}
446 
447 	/*
448 	 * Initialize the kstats for this device.
449 	 */
450 	mac_driver_stat_create(mip);
451 
452 	/* Zero out any properties. */
453 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
454 
455 	if (mip->mi_minor <= MAC_MAX_MINOR) {
456 		/* Create a style-2 DLPI device */
457 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
458 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
459 			goto fail;
460 		style2_created = B_TRUE;
461 
462 		/* Create a style-1 DLPI device */
463 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
464 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
465 			goto fail;
466 		style1_created = B_TRUE;
467 	}
468 
469 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
470 
471 	rw_enter(&i_mac_impl_lock, RW_WRITER);
472 	if (mod_hash_insert(i_mac_impl_hash,
473 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
474 		rw_exit(&i_mac_impl_lock);
475 		err = EEXIST;
476 		goto fail;
477 	}
478 
479 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
480 	    (mac_impl_t *), mip);
481 
482 	/*
483 	 * Mark the MAC to be ready for open.
484 	 */
485 	mip->mi_state_flags &= ~MIS_DISABLED;
486 	rw_exit(&i_mac_impl_lock);
487 
488 	atomic_inc_32(&i_mac_impl_count);
489 
490 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
491 	*mhp = (mac_handle_t)mip;
492 	return (0);
493 
494 fail:
495 	if (style1_created)
496 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
497 
498 	if (style2_created)
499 		ddi_remove_minor_node(mip->mi_dip, driver);
500 
501 	mac_addr_factory_fini(mip);
502 
503 	/* Clean up registered MAC addresses */
504 	mac_fini_macaddr(mip);
505 
506 	/* Clean up registered rings */
507 	mac_free_rings(mip, MAC_RING_TYPE_RX);
508 	mac_free_rings(mip, MAC_RING_TYPE_TX);
509 
510 	/* Clean up notification thread */
511 	if (mip->mi_notify_thread != NULL)
512 		i_mac_notify_exit(mip);
513 
514 	if (mip->mi_info.mi_unicst_addr != NULL) {
515 		kmem_free(mip->mi_info.mi_unicst_addr,
516 		    mip->mi_type->mt_addr_length);
517 		mip->mi_info.mi_unicst_addr = NULL;
518 	}
519 
520 	mac_driver_stat_delete(mip);
521 
522 	if (mip->mi_type != NULL) {
523 		atomic_dec_32(&mip->mi_type->mt_ref);
524 		mip->mi_type = NULL;
525 	}
526 
527 	if (mip->mi_pdata != NULL) {
528 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
529 		mip->mi_pdata = NULL;
530 		mip->mi_pdata_size = 0;
531 	}
532 
533 	if (minor != 0) {
534 		ASSERT(minor > MAC_MAX_MINOR);
535 		mac_minor_rele(minor);
536 	}
537 
538 	mip->mi_state_flags = 0;
539 	mac_unregister_priv_prop(mip);
540 
541 	/*
542 	 * Clear the state before destroying the mac_impl_t
543 	 */
544 	mip->mi_state_flags = 0;
545 
546 	kmem_cache_free(i_mac_impl_cachep, mip);
547 	return (err);
548 }
549 
/*
 * Unregister from the GLDv3 framework.  Fails with the mac_disable_nowait()
 * error if the MAC still has open references; otherwise tears down, in
 * order, everything mac_register() set up and frees the mac_impl_t.
 */
int
mac_unregister(mac_handle_t mh)
{
	int			err;
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mod_hash_val_t		val;
	mac_margin_req_t	*mmr, *nextmmr;

	/* Fail the unregister if there are any open references to this mac. */
	if ((err = mac_disable_nowait(mh)) != 0)
		return (err);

	/*
	 * Clean up notification thread and wait for it to exit.
	 */
	i_mac_notify_exit(mip);

	/*
	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
	 * the internal hash table. Such removal means table-walkers that
	 * acquire the perimeter will not do so on behalf of what we are
	 * unregistering, which prevents a deadlock.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	(void) mod_hash_remove(i_mac_impl_hash,
	    (mod_hash_key_t)mip->mi_name, &val);
	rw_exit(&i_mac_impl_lock);
	ASSERT(mip == (mac_impl_t *)val);

	i_mac_perim_enter(mip);

	/*
	 * There are still resource properties configured over this mac.
	 */
	if (mip->mi_resource_props.mrp_mask != 0)
		mac_fastpath_enable((mac_handle_t)mip);

	/*
	 * Remove the style-1 and style-2 minor nodes that mac_register()
	 * created for MACs in the shared minor-number range.
	 */
	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
		ddi_remove_minor_node(mip->mi_dip,
		    (char *)ddi_driver_name(mip->mi_dip));
	}

	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
	    MIS_EXCLUSIVE));

	mac_driver_stat_delete(mip);

	ASSERT(i_mac_impl_count > 0);
	atomic_dec_32(&i_mac_impl_count);

	/* Release any plugin data copied in at registration or update time. */
	if (mip->mi_pdata != NULL)
		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
	mip->mi_pdata = NULL;
	mip->mi_pdata_size = 0;

	/*
	 * Free the list of margin requests.
	 */
	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
		nextmmr = mmr->mmr_nextp;
		kmem_free(mmr, sizeof (mac_margin_req_t));
	}
	mip->mi_mmrp = NULL;

	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
	mip->mi_info.mi_unicst_addr = NULL;

	/* Drop our hold on the MAC-Type plugin. */
	atomic_dec_32(&mip->mi_type->mt_ref);
	mip->mi_type = NULL;

	/*
	 * Free the primary MAC address.
	 */
	mac_fini_macaddr(mip);

	/*
	 * free all rings
	 */
	mac_free_rings(mip, MAC_RING_TYPE_RX);
	mac_free_rings(mip, MAC_RING_TYPE_TX);

	mac_addr_factory_fini(mip);

	bzero(mip->mi_addr, MAXMACADDRLEN);
	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
	mip->mi_dstaddr_set = B_FALSE;

	/* and the flows */
	mac_flow_tab_destroy(mip->mi_flow_tab);
	mip->mi_flow_tab = NULL;

	/* Return a privately-held minor number to the pool. */
	if (mip->mi_minor > MAC_MAX_MINOR)
		mac_minor_rele(mip->mi_minor);

	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);

	/*
	 * Reset the perim related fields to default values before
	 * kmem_cache_free
	 */
	i_mac_perim_exit(mip);
	mip->mi_state_flags = 0;

	mac_unregister_priv_prop(mip);

	ASSERT(mip->mi_bridge_link == NULL);
	kmem_cache_free(i_mac_impl_cachep, mip);

	return (0);
}
665 
666 /* DATA RECEPTION */
667 
668 /*
669  * This function is invoked for packets received by the MAC driver in
670  * interrupt context. The ring generation number provided by the driver
671  * is matched with the ring generation number held in MAC. If they do not
672  * match, received packets are considered stale packets coming from an older
673  * assignment of the ring. Drop them.
674  */
675 void
mac_rx_ring(mac_handle_t mh,mac_ring_handle_t mrh,mblk_t * mp_chain,uint64_t mr_gen_num)676 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
677     uint64_t mr_gen_num)
678 {
679 	mac_ring_t		*mr = (mac_ring_t *)mrh;
680 
681 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
682 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
683 		    mr->mr_gen_num, uint64_t, mr_gen_num);
684 		freemsgchain(mp_chain);
685 		return;
686 	}
687 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
688 }
689 
690 /*
691  * This function is invoked for each packet received by the underlying driver.
692  */
693 void
mac_rx(mac_handle_t mh,mac_resource_handle_t mrh,mblk_t * mp_chain)694 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
695 {
696 	mac_impl_t *mip = (mac_impl_t *)mh;
697 
698 	/*
699 	 * Check if the link is part of a bridge.  If not, then we don't need
700 	 * to take the lock to remain consistent.  Make this common case
701 	 * lock-free and tail-call optimized.
702 	 */
703 	if (mip->mi_bridge_link == NULL) {
704 		mac_rx_common(mh, mrh, mp_chain);
705 	} else {
706 		/*
707 		 * Once we take a reference on the bridge link, the bridge
708 		 * module itself can't unload, so the callback pointers are
709 		 * stable.
710 		 */
711 		mutex_enter(&mip->mi_bridge_lock);
712 		if ((mh = mip->mi_bridge_link) != NULL)
713 			mac_bridge_ref_cb(mh, B_TRUE);
714 		mutex_exit(&mip->mi_bridge_lock);
715 		if (mh == NULL) {
716 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
717 		} else {
718 			mac_bridge_rx_cb(mh, mrh, mp_chain);
719 			mac_bridge_ref_cb(mh, B_FALSE);
720 		}
721 	}
722 }
723 
724 /*
725  * Special case function: this allows snooping of packets transmitted and
726  * received by TRILL. By design, they go directly into the TRILL module.
727  */
728 void
mac_trill_snoop(mac_handle_t mh,mblk_t * mp)729 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
730 {
731 	mac_impl_t *mip = (mac_impl_t *)mh;
732 
733 	if (mip->mi_promisc_list != NULL)
734 		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
735 }
736 
/*
 * This is the upward reentry point for packets arriving from the bridging
 * module and from mac_rx for links not part of a bridge.
 */
void
mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_ring_t		*mr = (mac_ring_t *)mrh;
	mac_soft_ring_set_t	*mac_srs;
	mblk_t			*bp = mp_chain;

	/*
	 * If there are any promiscuous mode callbacks defined for
	 * this MAC, pass them a copy if appropriate.
	 */
	if (mip->mi_promisc_list != NULL)
		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);

	if (mr != NULL) {
		/*
		 * If the SRS teardown has started, just return. The 'mr'
		 * continues to be valid until the driver unregisters the MAC.
		 * Hardware classified packets will not make their way up
		 * beyond this point once the teardown has started. The driver
		 * is never passed a pointer to a flow entry or SRS or any
		 * structure that can be freed much before mac_unregister.
		 */
		mutex_enter(&mr->mr_lock);
		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
			mutex_exit(&mr->mr_lock);
			freemsgchain(mp_chain);
			return;
		}

		/*
		 * The ring is in passthru mode; pass the chain up to
		 * the pseudo ring.  The refhold keeps 'mr' alive across
		 * the callback after the lock is dropped.
		 */
		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
			MR_REFHOLD_LOCKED(mr);
			mutex_exit(&mr->mr_lock);
			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
			    B_FALSE);
			MR_REFRELE(mr);
			return;
		}

		/*
		 * The passthru callback should only be set when in
		 * MAC_PASSTHRU_CLASSIFIER mode.
		 */
		ASSERT3P(mr->mr_pt_fn, ==, NULL);

		/*
		 * We check if an SRS is controlling this ring.
		 * If so, we can directly call the srs_lower_proc
		 * routine otherwise we need to go through mac_rx_classify
		 * to reach the right place.
		 */
		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
			MR_REFHOLD_LOCKED(mr);
			mutex_exit(&mr->mr_lock);
			ASSERT3P(mr->mr_srs, !=, NULL);
			mac_srs = mr->mr_srs;

			/*
			 * This is the fast path. All packets received
			 * on this ring are hardware classified and
			 * share the same MAC header info.
			 */
			mac_srs->srs_rx.sr_lower_proc(mh,
			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
			MR_REFRELE(mr);
			return;
		}

		mutex_exit(&mr->mr_lock);
		/* We'll fall through to software classification */
	} else {
		flow_entry_t *flent;
		int err;

		/*
		 * No ring was supplied.  When exactly one client is active,
		 * deliver the chain straight to its first flow entry and
		 * skip the flow-table lookup below.
		 */
		rw_enter(&mip->mi_rw_lock, RW_READER);
		if (mip->mi_single_active_client != NULL) {
			flent = mip->mi_single_active_client->mci_flent_list;
			FLOW_TRY_REFHOLD(flent, err);
			rw_exit(&mip->mi_rw_lock);
			if (err == 0) {
				(flent->fe_cb_fn)(flent->fe_cb_arg1,
				    flent->fe_cb_arg2, mp_chain, B_FALSE);
				FLOW_REFRELE(flent);
				return;
			}
		} else {
			rw_exit(&mip->mi_rw_lock);
		}
	}

	/*
	 * Software classification: hand the chain to the flow table.
	 * mac_rx_flow() returns whatever it could not deliver; anything
	 * left over is freed.
	 */
	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
			return;
	}

	freemsgchain(bp);
}
844 
845 /* DATA TRANSMISSION */
846 
847 /*
848  * A driver's notification to resume transmission, in case of a provider
849  * without TX rings.
850  */
851 void
mac_tx_update(mac_handle_t mh)852 mac_tx_update(mac_handle_t mh)
853 {
854 	mac_tx_ring_update(mh, NULL);
855 }
856 
857 /*
858  * A driver's notification to resume transmission on the specified TX ring.
859  */
860 void
mac_tx_ring_update(mac_handle_t mh,mac_ring_handle_t rh)861 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
862 {
863 	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
864 }
865 
866 /* LINK STATE */
867 /*
868  * Notify the MAC layer about a link state change
869  */
870 void
mac_link_update(mac_handle_t mh,link_state_t link)871 mac_link_update(mac_handle_t mh, link_state_t link)
872 {
873 	mac_impl_t	*mip = (mac_impl_t *)mh;
874 
875 	/*
876 	 * Save the link state.
877 	 */
878 	mip->mi_lowlinkstate = link;
879 
880 	/*
881 	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
882 	 * thread to deliver both lower and upper notifications.
883 	 */
884 	i_mac_notify(mip, MAC_NOTE_LOWLINK);
885 }
886 
887 /*
888  * Notify the MAC layer about a link state change due to bridging.
889  */
890 void
mac_link_redo(mac_handle_t mh,link_state_t link)891 mac_link_redo(mac_handle_t mh, link_state_t link)
892 {
893 	mac_impl_t	*mip = (mac_impl_t *)mh;
894 
895 	/*
896 	 * Save the link state.
897 	 */
898 	mip->mi_linkstate = link;
899 
900 	/*
901 	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
902 	 * made.
903 	 */
904 	i_mac_notify(mip, MAC_NOTE_LINK);
905 }
906 
907 /* MINOR NODE HANDLING */
908 
909 /*
910  * Given a dev_t, return the instance number (PPA) associated with it.
911  * Drivers can use this in their getinfo(9e) implementation to lookup
912  * the instance number (i.e. PPA) of the device, to use as an index to
913  * their own array of soft state structures.
914  *
915  * Returns -1 on error.
916  */
917 int
mac_devt_to_instance(dev_t devt)918 mac_devt_to_instance(dev_t devt)
919 {
920 	return (dld_devt_to_instance(devt));
921 }
922 
923 /*
924  * Drivers that make use of the private minor number space are expected to
925  * provide their own getinfo(9e) entry point. This function simply forwards
926  * to the default MAC framework getinfo(9e) implementation as a convenience
927  * if they don't need any special mapping (mac instance != ddi_get_instance())
928  */
929 int
mac_getinfo(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** resp)930 mac_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
931 {
932 	return (dld_getinfo(dip, cmd, arg, resp));
933 }
934 
935 /*
936  * This function returns the first minor number that is available for
937  * driver private use.  All minor numbers smaller than this are
938  * reserved for GLDv3 use.
939  */
940 minor_t
mac_private_minor(void)941 mac_private_minor(void)
942 {
943 	return (MAC_PRIVATE_MINOR);
944 }
945 
946 /* OTHER CONTROL INFORMATION */
947 
948 /*
949  * A driver notified us that its primary MAC address has changed.
950  */
951 void
mac_unicst_update(mac_handle_t mh,const uint8_t * addr)952 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
953 {
954 	mac_impl_t	*mip = (mac_impl_t *)mh;
955 
956 	if (mip->mi_type->mt_addr_length == 0)
957 		return;
958 
959 	i_mac_perim_enter(mip);
960 
961 	/*
962 	 * If address changes, freshen the MAC address value and update
963 	 * all MAC clients that share this MAC address.
964 	 */
965 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
966 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
967 		    (uint8_t *)addr);
968 	}
969 
970 	i_mac_perim_exit(mip);
971 
972 	/*
973 	 * Send a MAC_NOTE_UNICST notification.
974 	 */
975 	i_mac_notify(mip, MAC_NOTE_UNICST);
976 }
977 
978 void
mac_dst_update(mac_handle_t mh,const uint8_t * addr)979 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
980 {
981 	mac_impl_t	*mip = (mac_impl_t *)mh;
982 
983 	if (mip->mi_type->mt_addr_length == 0)
984 		return;
985 
986 	i_mac_perim_enter(mip);
987 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
988 	i_mac_perim_exit(mip);
989 	i_mac_notify(mip, MAC_NOTE_DEST);
990 }
991 
992 /*
993  * MAC plugin information changed.
994  */
995 int
mac_pdata_update(mac_handle_t mh,void * mac_pdata,size_t dsize)996 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
997 {
998 	mac_impl_t	*mip = (mac_impl_t *)mh;
999 
1000 	/*
1001 	 * Verify that the plugin supports MAC plugin data and that the
1002 	 * supplied data is valid.
1003 	 */
1004 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
1005 		return (EINVAL);
1006 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
1007 		return (EINVAL);
1008 
1009 	if (mip->mi_pdata != NULL)
1010 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
1011 
1012 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
1013 	bcopy(mac_pdata, mip->mi_pdata, dsize);
1014 	mip->mi_pdata_size = dsize;
1015 
1016 	/*
1017 	 * Since the MAC plugin data is used to construct MAC headers that
1018 	 * were cached in fast-path headers, we need to flush fast-path
1019 	 * information for links associated with this mac.
1020 	 */
1021 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
1022 	return (0);
1023 }
1024 
/*
 * The mac provider or mac framework calls this function when it wants
 * to notify upstream consumers that the capabilities have changed and
 * that they should modify their own internal state accordingly.
 *
 * We currently have no regard for the fact that a provider could
 * decide to drop capabilities which would invalidate pending traffic.
 * For example, if one was to disable the Tx checksum offload while
 * TCP/IP traffic was being sent by mac clients relying on that
 * feature, then those packets would hit the wire with missing or
 * partial checksums. A proper solution involves not only providing
 * notification, but also performing client quiescing. That is, a capab
 * change should be treated as an atomic transaction that forms a
 * barrier between traffic relying on the current capabs and traffic
 * relying on the new capabs. In practice, simnet is currently the
 * only provider that could hit this, and it's an easily avoidable
 * situation (and at worst it should only lead to some dropped
 * packets). But if we ever want better on-the-fly capab change to
 * actual hardware providers, then we should give this update
 * mechanism a proper implementation.
 */
void
mac_capab_update(mac_handle_t mh)
{
	/*
	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
	 * clients to renegotiate capabilities.  The notification is
	 * processed asynchronously by the per-mac notify thread, which
	 * also refreshes mi_tx_cksum_flags for this mac.
	 */
	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
}
1055 
/*
 * Used by normal drivers to update the max sdu size.
 * We need to handle the case of a smaller mi_sdu_multicast
 * since this is called by mac_set_mtu() even for drivers that
 * have differing unicast and multicast mtu and we don't want to
 * increase the multicast mtu by accident in that case.
 *
 * Returns EINVAL if sdu_max is zero or below the device minimum;
 * otherwise records the new maximum (clamping mi_sdu_multicast down
 * to it if needed) and returns 0.
 */
int
mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
		return (EINVAL);
	mip->mi_sdu_max = sdu_max;
	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
		mip->mi_sdu_multicast = mip->mi_sdu_max;

	/* Send a MAC_NOTE_SDU_SIZE notification. */
	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
	return (0);
}
1078 
/*
 * Version of the above function that is used by drivers that have a different
 * max sdu size for multicast/broadcast vs. unicast.
 *
 * A sdu_multicast of zero means "same as unicast".  Returns EINVAL when
 * either value is out of range (below mi_sdu_min, or multicast above
 * unicast); otherwise stores both and returns 0.
 */
int
mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
		return (EINVAL);
	if (sdu_multicast == 0)
		sdu_multicast = sdu_max;
	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
		return (EINVAL);
	mip->mi_sdu_max = sdu_max;
	mip->mi_sdu_multicast = sdu_multicast;

	/* Send a MAC_NOTE_SDU_SIZE notification. */
	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
	return (0);
}
1101 
/*
 * Re-target the interrupt of `ring` at the CPU recorded in the SRS of the
 * group's sole client.  This is a no-op unless the group is reserved for
 * exactly one client and the ring's ddi interrupt handle is not shared
 * with another ring.
 */
static void
mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
{
	mac_client_impl_t *mcip;
	flow_entry_t *flent;
	mac_soft_ring_set_t *mac_rx_srs;
	mac_cpus_t *srs_cpu;
	int i;

	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
		/* interrupt can be re-targeted */
		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
		flent = mcip->mci_flent;
		if (ring->mr_type == MAC_RING_TYPE_RX) {
			/* Find the Rx SRS bound to this ring, if any. */
			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
				mac_rx_srs = flent->fe_rx_srs[i];
				if (mac_rx_srs->srs_ring != ring)
					continue;
				srs_cpu = &mac_rx_srs->srs_cpu;
				/* cpu_lock protects the CPU re-binding */
				mutex_enter(&cpu_lock);
				mac_rx_srs_retarget_intr(mac_rx_srs,
				    srs_cpu->mc_rx_intr_cpu);
				mutex_exit(&cpu_lock);
				break;
			}
		} else {
			if (flent->fe_tx_srs != NULL) {
				mutex_enter(&cpu_lock);
				mac_tx_srs_retarget_intr(
				    flent->fe_tx_srs);
				mutex_exit(&cpu_lock);
			}
		}
	}
}
1138 
/*
 * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
 * their clients. There is a 1-1 mapping pseudo ring and the hardware
 * ring. ddi interrupt handles are exported from the hardware ring to
 * the pseudo ring. Thus when the interrupt handle changes, clients of
 * aggr that are using the handle need to use the new handle and
 * re-target their interrupts.
 *
 * Called with the port's perimeter (mip) held; in all paths this
 * function exits that perimeter before returning.
 */
static void
mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
    ddi_intr_handle_t ddh)
{
	mac_ring_t *pring;
	mac_group_t *pgroup;
	mac_impl_t *pmip;
	char macname[MAXNAMELEN];
	mac_perim_handle_t p_mph;
	uint64_t saved_gen_num;

again:
	pring = (mac_ring_t *)ring->mr_prh;
	pgroup = (mac_group_t *)pring->mr_gh;
	pmip = (mac_impl_t *)pgroup->mrg_mh;
	saved_gen_num = ring->mr_gen_num;
	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
	/*
	 * We need to enter aggr's perimeter. The locking hierarchy
	 * dictates that aggr's perimeter should be entered first
	 * and then the port's perimeter. So drop the port's
	 * perimeter, enter aggr's and then re-enter port's
	 * perimeter.
	 */
	i_mac_perim_exit(mip);
	/*
	 * While we know pmip is the aggr's mip, there is a
	 * possibility that aggr could have unregistered by
	 * the time we exit port's perimeter (mip) and
	 * enter aggr's perimeter (pmip). To avoid that
	 * scenario, enter aggr's perimeter using its name.
	 *
	 * On failure we return with the port's perimeter already
	 * dropped (see note above the function).
	 */
	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
		return;
	i_mac_perim_enter(mip);
	/*
	 * Check if the ring got assigned to another aggregation before
	 * we could enter aggr's and the port's perimeter. When a ring
	 * gets deleted from an aggregation, it calls mac_stop_ring()
	 * which increments the generation number. So checking
	 * generation number will be enough.
	 */
	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
		i_mac_perim_exit(mip);
		mac_perim_exit(p_mph);
		i_mac_perim_enter(mip);
		goto again;
	}

	/* Check if pseudo ring is still present */
	if (ring->mr_prh != NULL) {
		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
		pring->mr_info.mri_intr.mi_ddi_shared =
		    ring->mr_info.mri_intr.mi_ddi_shared;
		if (ddh != NULL)
			mac_ring_intr_retarget(pgroup, pring);
	}
	i_mac_perim_exit(mip);
	mac_perim_exit(p_mph);
}
/*
 * API called by driver to provide new interrupt handle for TX/RX rings.
 * This usually happens when IRM (Interrupt Resource Management)
 * framework either gives the driver more MSI-x interrupts or takes
 * away MSI-x interrupts from the driver.
 */
void
mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
{
	mac_ring_t	*ring = (mac_ring_t *)mrh;
	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;

	i_mac_perim_enter(mip);
	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
	if (ddh == NULL) {
		/* Interrupts being reset */
		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
		if (ring->mr_prh != NULL) {
			/*
			 * mac_pseudo_ring_intr_retarget() exits the
			 * perimeter itself, hence the early return.
			 */
			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
			return;
		}
	} else {
		/* New interrupt handle */
		mac_compare_ddi_handle(mip->mi_rx_groups,
		    mip->mi_rx_group_count, ring);
		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
			mac_compare_ddi_handle(mip->mi_tx_groups,
			    mip->mi_tx_group_count, ring);
		}
		if (ring->mr_prh != NULL) {
			/* Perimeter is exited by the callee; see above. */
			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
			return;
		} else {
			mac_ring_intr_retarget(group, ring);
		}
	}
	i_mac_perim_exit(mip);
}
1246 
1247 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1248 
/*
 * Log interesting transitions of the low-level link state to the console
 * and remember the last state seen in mi_lastlowlinkstate.
 */
static void
i_mac_log_link_state(mac_impl_t *mip)
{
	/*
	 * If no change, then it is not interesting.
	 */
	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
		return;

	switch (mip->mi_lowlinkstate) {
	case LINK_STATE_UP:
		/* Let the plugin append media details (speed etc.), if any */
		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
			char det[200];

			mip->mi_type->mt_ops.mtops_link_details(det,
			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);

			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
		} else {
			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
		}
		break;

	case LINK_STATE_DOWN:
		/*
		 * Only transitions from UP to DOWN are interesting
		 */
		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
		break;

	case LINK_STATE_UNKNOWN:
		/*
		 * This case is normally not interesting.
		 */
		break;
	}
	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
}
1291 
/*
 * Main routine for the callbacks notifications thread.
 *
 * One such thread exists per mac_impl_t.  It sleeps (CPR-safe) on
 * mcbi_cv until notification bits are posted in mi_notify_bits, then
 * dispatches each posted notification type to the registered callbacks.
 * Posting the MAC_NNOTE bit (see i_mac_notify_exit()) asks the thread
 * to exit.
 */
static void
i_mac_notify_thread(void *arg)
{
	mac_impl_t	*mip = arg;
	callb_cpr_t	cprinfo;
	mac_cb_t	*mcb;
	mac_cb_info_t	*mcbi;
	mac_notify_cb_t	*mncb;

	mcbi = &mip->mi_notify_cb_info;
	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
	    "i_mac_notify_thread");

	mutex_enter(mcbi->mcbi_lockp);

	for (;;) {
		uint32_t	bits;
		uint32_t	type;

		/* Snapshot and clear pending bits under mcbi_lockp. */
		bits = mip->mi_notify_bits;
		if (bits == 0) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
			continue;
		}
		mip->mi_notify_bits = 0;
		if ((bits & (1 << MAC_NNOTE)) != 0) {
			/* request to quit */
			ASSERT(mip->mi_state_flags & MIS_DISABLED);
			break;
		}

		/* Callbacks run without the lock held. */
		mutex_exit(mcbi->mcbi_lockp);

		/*
		 * Log link changes on the actual link, but then do reports on
		 * synthetic state (if part of a bridge).
		 */
		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
			link_state_t newstate;
			mac_handle_t mh;

			i_mac_log_link_state(mip);
			newstate = mip->mi_lowlinkstate;
			if (mip->mi_bridge_link != NULL) {
				/* Re-check the link under mi_bridge_lock */
				mutex_enter(&mip->mi_bridge_lock);
				if ((mh = mip->mi_bridge_link) != NULL) {
					newstate = mac_bridge_ls_cb(mh,
					    newstate);
				}
				mutex_exit(&mip->mi_bridge_lock);
			}
			if (newstate != mip->mi_linkstate) {
				mip->mi_linkstate = newstate;
				bits |= 1 << MAC_NOTE_LINK;
			}
		}

		/*
		 * Depending on which capabs have changed, the Tx
		 * checksum flags may also need to be updated.
		 */
		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
			mac_perim_handle_t mph;
			mac_handle_t mh = (mac_handle_t)mip;

			mac_perim_enter_by_mh(mh, &mph);
			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
			mac_perim_exit(mph);
		}

		/*
		 * Do notification callbacks for each notification type.
		 */
		for (type = 0; type < MAC_NNOTE; type++) {
			if ((bits & (1 << type)) == 0) {
				continue;
			}

			if (mac_notify_cb_list[type] != NULL)
				(*mac_notify_cb_list[type])(mip);

			/*
			 * Walk the list of notifications.
			 */
			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
				mncb->mncb_fn(mncb->mncb_arg, type);
			}
			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
			    &mip->mi_notify_cb_list);
		}

		mutex_enter(mcbi->mcbi_lockp);
	}

	/* Tell i_mac_notify_exit() that we are done and wake it. */
	mip->mi_state_flags |= MIS_NOTIFY_DONE;
	cv_broadcast(&mcbi->mcbi_cv);

	/* CALLB_CPR_EXIT drops the lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
1401 
/*
 * Signal the i_mac_notify_thread asking it to quit.
 * Then wait till it is done.
 */
void
i_mac_notify_exit(mac_impl_t *mip)
{
	mac_cb_info_t	*mcbi;

	mcbi = &mip->mi_notify_cb_info;

	mutex_enter(mcbi->mcbi_lockp);
	/* The MAC_NNOTE bit is the thread's private "quit" request. */
	mip->mi_notify_bits = (1 << MAC_NNOTE);
	cv_broadcast(&mcbi->mcbi_cv);

	/* Wait for the thread to acknowledge by setting MIS_NOTIFY_DONE. */
	while ((mip->mi_notify_thread != NULL) &&
	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
	}

	/* Necessary clean up before doing kmem_cache_free */
	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
	mip->mi_notify_bits = 0;
	mip->mi_notify_thread = NULL;
	mutex_exit(mcbi->mcbi_lockp);
}
1429 
1430 /*
1431  * Entry point invoked by drivers to dynamically add a ring to an
1432  * existing group.
1433  */
1434 int
mac_group_add_ring(mac_group_handle_t gh,int index)1435 mac_group_add_ring(mac_group_handle_t gh, int index)
1436 {
1437 	mac_group_t *group = (mac_group_t *)gh;
1438 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1439 	int ret;
1440 
1441 	i_mac_perim_enter(mip);
1442 	ret = i_mac_group_add_ring(group, NULL, index);
1443 	i_mac_perim_exit(mip);
1444 	return (ret);
1445 }
1446 
1447 /*
1448  * Entry point invoked by drivers to dynamically remove a ring
1449  * from an existing group. The specified ring handle must no longer
1450  * be used by the driver after a call to this function.
1451  */
1452 void
mac_group_rem_ring(mac_group_handle_t gh,mac_ring_handle_t rh)1453 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1454 {
1455 	mac_group_t *group = (mac_group_t *)gh;
1456 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1457 
1458 	i_mac_perim_enter(mip);
1459 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1460 	i_mac_perim_exit(mip);
1461 }
1462 
/*
 * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
 * entry points.
 */

/* Report the default value of a uint8 property being described. */
void
mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	/* nothing to do if the caller doesn't want the default value */
	if (pr->pr_default == NULL)
		return;

	ASSERT(pr->pr_default_size >= sizeof (uint8_t));

	*(uint8_t *)(pr->pr_default) = val;
	/* Mark that a default has been supplied. */
	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
}
1482 
/* Report the default value of a uint64 property being described. */
void
mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	/* nothing to do if the caller doesn't want the default value */
	if (pr->pr_default == NULL)
		return;

	ASSERT(pr->pr_default_size >= sizeof (uint64_t));

	/* bcopy avoids alignment assumptions about the caller's buffer */
	bcopy(&val, pr->pr_default, sizeof (val));

	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
}
1498 
/* Report the default value of a uint32 property being described. */
void
mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	/* nothing to do if the caller doesn't want the default value */
	if (pr->pr_default == NULL)
		return;

	ASSERT(pr->pr_default_size >= sizeof (uint32_t));

	/* bcopy avoids alignment assumptions about the caller's buffer */
	bcopy(&val, pr->pr_default, sizeof (val));

	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
}
1514 
/*
 * Report the default value of a string property.  ENOBUFS is recorded if
 * the caller's buffer cannot hold the string plus its NUL terminator.
 */
void
mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	/* nothing to do if the caller doesn't want the default value */
	if (pr->pr_default == NULL)
		return;

	if (strlen(str) >= pr->pr_default_size)
		pr->pr_errno = ENOBUFS;
	else
		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
}
1530 
/* Report the default value of a link flow-control property. */
void
mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
    link_flowctrl_t val)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	/* nothing to do if the caller doesn't want the default value */
	if (pr->pr_default == NULL)
		return;

	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));

	/* bcopy avoids alignment assumptions about the caller's buffer */
	bcopy(&val, pr->pr_default, sizeof (val));

	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
}
1547 
/* Report the default value of a forward-error-correction (FEC) property. */
void
mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	/* nothing to do if the caller doesn't want the default value */
	if (pr->pr_default == NULL)
		return;

	ASSERT(pr->pr_default_size >= sizeof (link_fec_t));

	/* bcopy avoids alignment assumptions about the caller's buffer */
	bcopy(&val, pr->pr_default, sizeof (val));

	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
}
1563 
1564 void
mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph,uint32_t min,uint32_t max)1565 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
1566     uint32_t max)
1567 {
1568 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1569 	mac_propval_range_t *range = pr->pr_range;
1570 	mac_propval_uint32_range_t *range32;
1571 
1572 	/* nothing to do if the caller doesn't want the range info */
1573 	if (range == NULL)
1574 		return;
1575 
1576 	if (pr->pr_range_cur_count++ == 0) {
1577 		/* first range */
1578 		pr->pr_flags |= MAC_PROP_INFO_RANGE;
1579 		range->mpr_type = MAC_PROPVAL_UINT32;
1580 	} else {
1581 		/* all ranges of a property should be of the same type */
1582 		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
1583 		if (pr->pr_range_cur_count > range->mpr_count) {
1584 			pr->pr_errno = ENOSPC;
1585 			return;
1586 		}
1587 	}
1588 
1589 	range32 = range->mpr_range_uint32;
1590 	range32[pr->pr_range_cur_count - 1].mpur_min = min;
1591 	range32[pr->pr_range_cur_count - 1].mpur_max = max;
1592 }
1593 
/* Report the read/write permission of the property being described. */
void
mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;

	pr->pr_perm = perm;
	pr->pr_flags |= MAC_PROP_INFO_PERM;
}
1602 
/*
 * Retrieve hardware checksum metadata from `mp`.  The checksum value is
 * returned for both partial and full checksums, but the start/stuff/end
 * offsets are only filled in for HCK_PARTIALCKSUM.  Any output pointer
 * may be NULL if the caller is not interested in that field.
 */
void
mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
    uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
{
	uint32_t flags;

	ASSERT(DB_TYPE(mp) == M_DATA);

	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
		if (value != NULL)
			*value = (uint32_t)DB_CKSUM16(mp);
		if ((flags & HCK_PARTIALCKSUM) != 0) {
			if (start != NULL)
				*start = (uint32_t)DB_CKSUMSTART(mp);
			if (stuff != NULL)
				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
			if (end != NULL)
				*end = (uint32_t)DB_CKSUMEND(mp);
		}
	}

	if (flags_ptr != NULL)
		*flags_ptr = flags;
}
1628 
1629 void
mac_hcksum_set(mblk_t * mp,uint32_t start,uint32_t stuff,uint32_t end,uint32_t value,uint32_t flags)1630 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
1631     uint32_t value, uint32_t flags)
1632 {
1633 	ASSERT(DB_TYPE(mp) == M_DATA);
1634 
1635 	DB_CKSUMSTART(mp) = (intptr_t)start;
1636 	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
1637 	DB_CKSUMEND(mp) = (intptr_t)end;
1638 	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
1639 	DB_CKSUM16(mp) = (uint16_t)value;
1640 }
1641 
/* Copy all checksum- and LSO-related dblk metadata from `src` to `dst`. */
void
mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
{
	ASSERT3U(DB_TYPE(src), ==, M_DATA);
	ASSERT3U(DB_TYPE(dst), ==, M_DATA);

	/*
	 * Do these assignments unconditionally, rather than only when
	 * flags is non-zero. This protects a situation where zeroed
	 * hcksum data does not make the jump onto an mblk_t with
	 * stale data in those fields. It's important to copy all
	 * possible flags (HCK_* as well as HW_*) and not just the
	 * checksum specific flags. Dropping flags during a clone
	 * could result in dropped packets. If the caller has good
	 * reason to drop those flags then it should do it manually,
	 * after the clone.
	 */
	DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
	DB_CKSUM16(dst) = DB_CKSUM16(src);
	DB_LSOMSS(dst) = DB_LSOMSS(src);
}
1666 
1667 void
mac_lso_get(mblk_t * mp,uint32_t * mss,uint32_t * flags)1668 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1669 {
1670 	ASSERT(DB_TYPE(mp) == M_DATA);
1671 
1672 	if (flags != NULL) {
1673 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1674 		if ((*flags != 0) && (mss != NULL))
1675 			*mss = (uint32_t)DB_LSOMSS(mp);
1676 	}
1677 }
1678 
/* Record whether a transceiver module is physically present. */
void
mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
    boolean_t present)
{
	infop->mti_present = present;
}
1685 
/* Record whether the transceiver module is usable by the device. */
void
mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
    boolean_t usable)
{
	infop->mti_usable = usable;
}
1692 
1693 static bool
mac_parse_is_ipv6eh(uint8_t id)1694 mac_parse_is_ipv6eh(uint8_t id)
1695 {
1696 	switch (id) {
1697 	case IPPROTO_HOPOPTS:
1698 	case IPPROTO_ROUTING:
1699 	case IPPROTO_FRAGMENT:
1700 	case IPPROTO_AH:
1701 	case IPPROTO_DSTOPTS:
1702 	case IPPROTO_MH:
1703 	case IPPROTO_HIP:
1704 	case IPPROTO_SHIM6:
1705 		/* Currently known extension headers */
1706 		return (true);
1707 	case IPPROTO_ESP:
1708 		/*
1709 		 * While the IANA protocol numbers listing notes ESP as an IPv6
1710 		 * extension header, we cannot effectively parse it like one.
1711 		 *
1712 		 * For now, mac_ether_offload_info() will report it as the L4
1713 		 * protocol for a parsed packet containing this EH.
1714 		 */
1715 	default:
1716 		return (false);
1717 	}
1718 }
1719 
/*
 * Cursor for reading byte ranges out of an mblk chain without flattening
 * it.  Tracks both the overall offset into the chain and the offset
 * within the current mblk.
 */
typedef struct mac_mblk_cursor {
	mblk_t	*mmc_head;	/* first mblk in the chain */
	mblk_t	*mmc_cur;	/* mblk holding the current position */
	size_t	mmc_off_total;	/* offset from start of chain */
	size_t	mmc_off_mp;	/* offset into mmc_cur */
} mac_mblk_cursor_t;

static void mac_mmc_advance(mac_mblk_cursor_t *, size_t);
static void mac_mmc_reset(mac_mblk_cursor_t *);
1729 
/*
 * Initialize `cursor` to walk the mblk chain starting at `mp`, positioned
 * at overall offset zero.
 */
static void
mac_mmc_init(mac_mblk_cursor_t *cursor, mblk_t *mp)
{
	cursor->mmc_head = mp;
	mac_mmc_reset(cursor);
}
1736 
/*
 * Rewind `cursor` to offset zero of its chain, skipping any zero-length
 * mblks at the head.
 */
static void
mac_mmc_reset(mac_mblk_cursor_t *cursor)
{
	ASSERT(cursor->mmc_head != NULL);

	cursor->mmc_cur = cursor->mmc_head;
	cursor->mmc_off_total = cursor->mmc_off_mp = 0;

	/* Advance past any zero-length mblks at head */
	mac_mmc_advance(cursor, 0);
}
1748 
1749 static inline size_t
mac_mmc_mp_left(const mac_mblk_cursor_t * cursor)1750 mac_mmc_mp_left(const mac_mblk_cursor_t *cursor)
1751 {
1752 	if (cursor->mmc_cur != NULL) {
1753 		const size_t mp_len = MBLKL(cursor->mmc_cur);
1754 
1755 		ASSERT3U(mp_len, >=, cursor->mmc_off_mp);
1756 
1757 		return (mp_len - cursor->mmc_off_mp);
1758 	} else {
1759 		return (0);
1760 	}
1761 }
1762 
/*
 * Pointer to the byte at the cursor's current position.  Only valid while
 * the cursor rests within an mblk (mmc_cur != NULL).
 */
static inline uint8_t *
mac_mmc_mp_ptr(const mac_mblk_cursor_t *cursor)
{
	return (cursor->mmc_cur->b_rptr + cursor->mmc_off_mp);
}
1768 
/* Current byte offset of the cursor from the start of the mblk chain. */
static inline size_t
mac_mmc_offset(const mac_mblk_cursor_t *cursor)
{
	return (cursor->mmc_off_total);
}
1774 
/*
 * Advance cursor forward `len` bytes.
 *
 * The length to advance must be no greater than the number of bytes remaining
 * in the current mblk.  If the position reaches (exactly) the end of the
 * current mblk, the cursor will be pushed forward to the next non-zero-length
 * mblk in the chain (or to NULL at the end of the chain), so that it always
 * rests on readable data.
 */
static inline void
mac_mmc_advance(mac_mblk_cursor_t *cursor, size_t len)
{
	ASSERT(cursor->mmc_cur != NULL);

	const size_t mp_len = MBLKL(cursor->mmc_cur);

	ASSERT3U(cursor->mmc_off_mp + len, <=, mp_len);

	cursor->mmc_off_total += len;
	cursor->mmc_off_mp += len;

	if (cursor->mmc_off_mp == mp_len) {
		cursor->mmc_off_mp = 0;
		cursor->mmc_cur = cursor->mmc_cur->b_cont;
	}

	/* Skip over any 0-length mblks */
	while (cursor->mmc_cur != NULL && MBLKL(cursor->mmc_cur) == 0) {
		cursor->mmc_cur = cursor->mmc_cur->b_cont;
	}
}
1805 
/*
 * Attempt to seek to byte offset `off` in mblk chain.
 *
 * Returns true if the offset is <= the total chain length; on success the
 * cursor is left positioned at `off`.
 */
static bool
mac_mmc_seek(mac_mblk_cursor_t *cursor, const size_t off)
{
	ASSERT(cursor->mmc_head != NULL);

	if (off == cursor->mmc_off_total) {
		/*
		 * Any prior init, reset, or seek operation will have advanced
		 * past any zero-length mblks, making this short-circuit safe.
		 */
		return (true);
	} else if (off < cursor->mmc_off_total) {
		/* Rewind to beginning if offset precedes current position */
		mac_mmc_reset(cursor);
	}

	size_t seek_left = off - cursor->mmc_off_total;
	while (cursor->mmc_cur != NULL) {
		const size_t mp_left = mac_mmc_mp_left(cursor);

		if (mp_left > seek_left) {
			/* Target position is within current mblk */
			cursor->mmc_off_mp += seek_left;
			cursor->mmc_off_total += seek_left;
			return (true);
		}

		/* Move on to the next mblk... */
		mac_mmc_advance(cursor, mp_left);
		seek_left -= mp_left;
	}

	/*
	 * We have reached the end of the mblk chain, but there is a chance that
	 * it corresponds to the target seek position.
	 */
	return (cursor->mmc_off_total == off);
}
1849 
1850 /*
1851  * Attempt to read uint8_t at offset `pos` in mblk chain.
1852  *
1853  * Returns true (and sets value in `out`) if the offset is within the chain.
1854  */
1855 static bool
mac_mmc_get_uint8(mac_mblk_cursor_t * cursor,size_t pos,uint8_t * out)1856 mac_mmc_get_uint8(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out)
1857 {
1858 	if (!mac_mmc_seek(cursor, pos)) {
1859 		return (false);
1860 	}
1861 
1862 	if (mac_mmc_mp_left(cursor) != 0) {
1863 		*out = *(mac_mmc_mp_ptr(cursor));
1864 		mac_mmc_advance(cursor, 1);
1865 		return (true);
1866 	}
1867 
1868 	return (false);
1869 }
1870 
/*
 * Attempt to read uint16_t at offset `pos` in mblk chain.  The two
 * network-order bytes are converted into a host-order value.
 *
 * Returns true (and sets value in `out`) if the 16-bit region specified by the
 * offset is within the chain.
 */
static bool
mac_mmc_get_uint16(mac_mblk_cursor_t *cursor, size_t pos, uint16_t *out)
{
	if (!mac_mmc_seek(cursor, pos)) {
		return (false);
	}

	const size_t mp_left = mac_mmc_mp_left(cursor);
	uint16_t result = 0;

	if (mp_left >= 2) {
		/* Both bytes reside in the current mblk. */
		uint8_t *bp = mac_mmc_mp_ptr(cursor);

		result = (uint16_t)bp[0] << 8;
		result |= bp[1];
		mac_mmc_advance(cursor, 2);
		*out = result;
		return (true);
	} else if (mp_left == 1) {
		/* The value straddles an mblk boundary: read byte-by-byte. */
		result = (uint16_t)*(mac_mmc_mp_ptr(cursor));
		mac_mmc_advance(cursor, 1);

		if (mac_mmc_mp_left(cursor) == 0) {
			return (false);
		}

		result = result << 8;
		result |= (uint16_t)*(mac_mmc_mp_ptr(cursor));
		mac_mmc_advance(cursor, 1);
		*out = result;
		return (true);
	}

	return (false);
}
1913 
1914 /*
1915  * Attempt to read `count` bytes at offset `pos` in mblk chain.
1916  *
1917  * Returns true (and copies data to `out`) if `count` length region is available
1918  * at offset within the chain.
1919  */
1920 static bool
mac_mmc_get_bytes(mac_mblk_cursor_t * cursor,size_t pos,uint8_t * out,size_t count)1921 mac_mmc_get_bytes(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out,
1922     size_t count)
1923 {
1924 	if (!mac_mmc_seek(cursor, pos)) {
1925 		return (false);
1926 	}
1927 
1928 	while (count > 0) {
1929 		const size_t mp_left = mac_mmc_mp_left(cursor);
1930 
1931 		if (mp_left == 0) {
1932 			return (false);
1933 		}
1934 		const size_t to_copy = MIN(mp_left, count);
1935 
1936 		bcopy(mac_mmc_mp_ptr(cursor), out, to_copy);
1937 		out += to_copy;
1938 		mac_mmc_advance(cursor, to_copy);
1939 		count -= to_copy;
1940 	}
1941 	return (true);
1942 }
1943 
/*
 * Attempt to parse ethernet header (VLAN or not) from mblk chain.
 *
 * Returns true if header was successfully parsed.  Parsing will begin at
 * current offset of `cursor`.  Any non-NULL arguments for VLAN, SAP, and header
 * size will be populated on success.  A value of MEOI_VLAN_TCI_INVALID will be
 * reported for the TCI if the header does not bear VLAN information.
 */
static bool
mac_mmc_parse_ether(mac_mblk_cursor_t *cursor, uint8_t *dst_addrp,
    uint32_t *vlan_tcip, uint16_t *ethertypep, uint16_t *hdr_sizep)
{
	const size_t l2_off = mac_mmc_offset(cursor);

	if (dst_addrp != NULL) {
		if (!mac_mmc_get_bytes(cursor, l2_off, dst_addrp, ETHERADDRL)) {
			return (false);
		}
	}

	uint16_t ethertype = 0;
	if (!mac_mmc_get_uint16(cursor,
	    l2_off + offsetof(struct ether_header, ether_type), &ethertype)) {
		return (false);
	}

	uint32_t tci = MEOI_VLAN_TCI_INVALID;
	uint16_t hdrsize = sizeof (struct ether_header);

	/* A VLAN tag pushes the real ethertype further into the header. */
	if (ethertype == ETHERTYPE_VLAN) {
		uint16_t tci_val;

		if (!mac_mmc_get_uint16(cursor,
		    l2_off + offsetof(struct ether_vlan_header, ether_tci),
		    &tci_val)) {
			return (false);
		}
		if (!mac_mmc_get_uint16(cursor,
		    l2_off + offsetof(struct ether_vlan_header, ether_type),
		    &ethertype)) {
			return (false);
		}
		hdrsize = sizeof (struct ether_vlan_header);
		tci = (uint32_t)tci_val;
	}

	if (vlan_tcip != NULL) {
		*vlan_tcip = tci;
	}
	if (ethertypep != NULL) {
		*ethertypep = ethertype;
	}
	if (hdr_sizep != NULL) {
		*hdr_sizep = hdrsize;
	}
	return (true);
}
2001 
2002 /*
2003  * Attempt to parse L3 protocol header from mblk chain.
2004  *
2005  * The SAP/ethertype of the containing header must be specified by the caller.
2006  *
2007  * Returns true if header was successfully parsed.  Parsing will begin at
2008  * current offset of `cursor`.  Any non-NULL arguments for IP protocol and
2009  * header size will be populated on success.
2010  */
2011 static bool
mac_mmc_parse_l3(mac_mblk_cursor_t * cursor,uint16_t l3_sap,uint8_t * ipprotop,bool * is_fragp,uint16_t * hdr_sizep)2012 mac_mmc_parse_l3(mac_mblk_cursor_t *cursor, uint16_t l3_sap, uint8_t *ipprotop,
2013     bool *is_fragp, uint16_t *hdr_sizep)
2014 {
2015 	const size_t l3_off = mac_mmc_offset(cursor);
2016 
2017 	if (l3_sap == ETHERTYPE_IP) {
2018 		uint8_t verlen, ipproto;
2019 		uint16_t frag_off;
2020 
2021 		if (!mac_mmc_get_uint8(cursor, l3_off, &verlen)) {
2022 			return (false);
2023 		}
2024 		verlen &= 0x0f;
2025 		if (verlen < 5 || verlen > 0x0f) {
2026 			return (false);
2027 		}
2028 
2029 		if (!mac_mmc_get_uint16(cursor,
2030 		    l3_off + offsetof(ipha_t, ipha_fragment_offset_and_flags),
2031 		    &frag_off)) {
2032 			return (false);
2033 		}
2034 
2035 		if (!mac_mmc_get_uint8(cursor,
2036 		    l3_off + offsetof(ipha_t, ipha_protocol), &ipproto)) {
2037 			return (false);
2038 		}
2039 
2040 		if (ipprotop != NULL) {
2041 			*ipprotop = ipproto;
2042 		}
2043 		if (is_fragp != NULL) {
2044 			*is_fragp = ((frag_off & (IPH_MF | IPH_OFFSET)) != 0);
2045 		}
2046 		if (hdr_sizep != NULL) {
2047 			*hdr_sizep = verlen * 4;
2048 		}
2049 		return (true);
2050 	}
2051 	if (l3_sap == ETHERTYPE_IPV6) {
2052 		uint16_t ip_len = sizeof (ip6_t);
2053 		uint8_t ipproto;
2054 		bool found_frag_eh = false;
2055 
2056 		if (!mac_mmc_get_uint8(cursor,
2057 		    l3_off + offsetof(ip6_t, ip6_nxt), &ipproto)) {
2058 			return (false);
2059 		}
2060 
2061 		/* Chase any extension headers present in packet */
2062 		while (mac_parse_is_ipv6eh(ipproto)) {
2063 			uint8_t len_val, next_hdr;
2064 			uint16_t eh_len;
2065 
2066 			const size_t hdr_off = l3_off + ip_len;
2067 			if (!mac_mmc_get_uint8(cursor, hdr_off, &next_hdr)) {
2068 				return (false);
2069 			}
2070 
2071 			if (ipproto == IPPROTO_FRAGMENT) {
2072 				/*
2073 				 * The Fragment extension header bears a
2074 				 * predefined fixed length, rather than
2075 				 * communicating it through the EH itself.
2076 				 */
2077 				eh_len = 8;
2078 				found_frag_eh = true;
2079 			} else if (ipproto == IPPROTO_AH) {
2080 				/*
2081 				 * The length of the IP Authentication EH is
2082 				 * stored as (n + 2) * 32-bits, where 'n' is the
2083 				 * recorded EH length field
2084 				 */
2085 				if (!mac_mmc_get_uint8(cursor, hdr_off + 1,
2086 				    &len_val)) {
2087 					return (false);
2088 				}
2089 				eh_len = ((uint16_t)len_val + 2) * 4;
2090 			} else {
2091 				/*
2092 				 * All other EHs should follow the sizing
2093 				 * formula of (n + 1) * 64-bits, where 'n' is
2094 				 * the recorded EH length field.
2095 				 */
2096 				if (!mac_mmc_get_uint8(cursor, hdr_off + 1,
2097 				    &len_val)) {
2098 					return (false);
2099 				}
2100 				eh_len = ((uint16_t)len_val + 1) * 8;
2101 			}
2102 			/*
2103 			 * Protect against overflow in the case of a very
2104 			 * contrived packet.
2105 			 */
2106 			if ((ip_len + eh_len) < ip_len) {
2107 				return (-1);
2108 			}
2109 
2110 			ipproto = next_hdr;
2111 			ip_len += eh_len;
2112 		}
2113 
2114 		if (ipprotop != NULL) {
2115 			*ipprotop = ipproto;
2116 		}
2117 		if (is_fragp != NULL) {
2118 			*is_fragp = found_frag_eh;
2119 		}
2120 		if (hdr_sizep != NULL) {
2121 			*hdr_sizep = ip_len;
2122 		}
2123 		return (true);
2124 	}
2125 
2126 	return (false);
2127 }
2128 
2129 /*
2130  * Attempt to parse L4 protocol header from mblk chain.
2131  *
2132  * The IP protocol of the containing header must be specified by the caller.
2133  *
2134  * Returns true if header was successfully parsed.  Parsing will begin at
2135  * current offset of `cursor`.  A non-NULL argument for header size will be
2136  * populated on success.
2137  */
2138 static bool
mac_mmc_parse_l4(mac_mblk_cursor_t * cursor,uint8_t ipproto,uint8_t * hdr_sizep)2139 mac_mmc_parse_l4(mac_mblk_cursor_t *cursor, uint8_t ipproto, uint8_t *hdr_sizep)
2140 {
2141 	ASSERT(hdr_sizep != NULL);
2142 
2143 	const size_t l4_off = mac_mmc_offset(cursor);
2144 	uint8_t tcp_doff;
2145 
2146 	switch (ipproto) {
2147 	case IPPROTO_TCP:
2148 		if (!mac_mmc_get_uint8(cursor,
2149 		    l4_off + offsetof(tcph_t, th_offset_and_rsrvd),
2150 		    &tcp_doff)) {
2151 			return (false);
2152 		}
2153 		tcp_doff = (tcp_doff & 0xf0) >> 4;
2154 		if (tcp_doff < 5 || tcp_doff > 0xf) {
2155 			return (false);
2156 		}
2157 		*hdr_sizep = tcp_doff * 4;
2158 		return (true);
2159 	case IPPROTO_UDP:
2160 		*hdr_sizep = sizeof (struct udphdr);
2161 		return (true);
2162 	case IPPROTO_ICMPV6:
2163 		*hdr_sizep = sizeof (icmp6_t);
2164 		return (true);
2165 	case IPPROTO_SCTP:
2166 		*hdr_sizep = sizeof (sctp_hdr_t);
2167 		return (true);
2168 	default:
2169 		return (false);
2170 	}
2171 }
2172 
2173 /*
2174  * Parse destination MAC address and VLAN TCI (if any) from mblk chain.
2175  *
2176  * If packet ethertype does not indicate that a VLAN is present,
2177  * MEOI_VLAN_TCI_INVALID will be returned for the TCI.
2178  *
2179  * Returns B_TRUE if header could be parsed for destination MAC address and VLAN
2180  * TCI, otherwise B_FALSE.
2181  */
2182 boolean_t
mac_ether_l2_info(mblk_t * mp,uint8_t * dst_addrp,uint32_t * vlan_tcip)2183 mac_ether_l2_info(mblk_t *mp, uint8_t *dst_addrp, uint32_t *vlan_tcip)
2184 {
2185 	mac_mblk_cursor_t cursor;
2186 
2187 	mac_mmc_init(&cursor, mp);
2188 	if (!mac_mmc_parse_ether(&cursor, dst_addrp, vlan_tcip, NULL, NULL)) {
2189 		return (B_FALSE);
2190 	}
2191 
2192 	return (B_TRUE);
2193 }
2194 
2195 /*
2196  * Perform a partial parsing of offload info from a frame and/or packet.
2197  *
2198  * Beginning at the provided byte offset (`off`) in the mblk, attempt to parse
2199  * any offload info which has not yet been populated in `meoi`.  The contents of
2200  * `meoi_flags` upon entry will be considered as "already parsed", their
2201  * corresponding data fields will be considered valid.
2202  *
2203  * A motivating example: A non-Ethernet packet could be parsed for L3/L4 offload
2204  * information by setting MEOI_L2INFO_SET in `meoi_flags`, and the L3 SAP in
2205  * `meoi_l3proto`. With a value in `meoi_l2hlen` that, when combined with the
2206  * provided `off`, will direct the parser to the start of the L3 header in the
2207  * mblk, the rest of the logic will be free to run.
2208  *
2209  * Alternatively, this could be used to parse the headers in an encapsulated
2210  * Ethernet packet by simply specifying the start of its header in `off`.
2211  *
2212  * The degree to which parsing was able to proceed is stored in `meoi_flags`.
2213  */
void
mac_partial_offload_info(mblk_t *mp, size_t off, mac_ether_offload_info_t *meoi)
{
	mac_mblk_cursor_t cursor;

	mac_mmc_init(&cursor, mp);

	/* If the chain cannot even reach `off`, there is nothing to parse */
	if (!mac_mmc_seek(&cursor, off)) {
		return;
	}

	/* L2: Ethernet (VLAN or not), unless already supplied by the caller */
	if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
		uint32_t vlan_tci;
		uint16_t l2_sz, ethertype;
		if (!mac_mmc_parse_ether(&cursor, NULL, &vlan_tci, &ethertype,
		    &l2_sz)) {
			return;
		}

		meoi->meoi_flags |= MEOI_L2INFO_SET;
		meoi->meoi_l2hlen = l2_sz;
		meoi->meoi_l3proto = ethertype;
		if (vlan_tci != MEOI_VLAN_TCI_INVALID) {
			/* A valid TCI implies the larger VLAN header layout */
			ASSERT3U(meoi->meoi_l2hlen, ==,
			    sizeof (struct ether_vlan_header));
			meoi->meoi_flags |= MEOI_VLAN_TAGGED;
		}
	}
	/*
	 * MEOI_L2INFO_SET promises that the chain covers the whole L2 header;
	 * withdraw the flag if seeking past that header fails.
	 */
	const size_t l2_end = off + (size_t)meoi->meoi_l2hlen;
	if (!mac_mmc_seek(&cursor, l2_end)) {
		meoi->meoi_flags &= ~MEOI_L2INFO_SET;
		return;
	}

	/* L3: IPv4/IPv6, keyed off the SAP recorded (or provided) above */
	if ((meoi->meoi_flags & MEOI_L3INFO_SET) == 0) {
		uint8_t ipproto;
		uint16_t l3_sz;
		bool is_frag;
		if (!mac_mmc_parse_l3(&cursor, meoi->meoi_l3proto, &ipproto,
		    &is_frag, &l3_sz)) {
			return;
		}

		meoi->meoi_l3hlen = l3_sz;
		meoi->meoi_l4proto = ipproto;
		meoi->meoi_flags |= MEOI_L3INFO_SET;
		if (is_frag) {
			meoi->meoi_flags |= MEOI_L3_FRAGMENT;
		}
	}
	/* As with L2 above: the flag promises the chain covers the header */
	const size_t l3_end = l2_end + (size_t)meoi->meoi_l3hlen;
	if (!mac_mmc_seek(&cursor, l3_end)) {
		meoi->meoi_flags &= ~MEOI_L3INFO_SET;
		return;
	}

	/* L4: TCP/UDP/ICMPv6/SCTP, keyed off the IP protocol recorded above */
	if ((meoi->meoi_flags & MEOI_L4INFO_SET) == 0) {
		uint8_t l4_sz;
		if (!mac_mmc_parse_l4(&cursor, meoi->meoi_l4proto, &l4_sz)) {
			return;
		}

		meoi->meoi_l4hlen = l4_sz;
		meoi->meoi_flags |= MEOI_L4INFO_SET;
	}
	const size_t l4_end = l3_end + (size_t)meoi->meoi_l4hlen;
	if (!mac_mmc_seek(&cursor, l4_end)) {
		meoi->meoi_flags &= ~MEOI_L4INFO_SET;
	}
}
2284 
2285 /*
2286  * Attempt to parse packet headers to extract information useful for various
2287  * offloads.  This includes header protocols and lengths.
2288  *
2289  * The meoi_flags field will indicate the extent to which parsing was able to
2290  * complete.  Each in turn promises that subsequent fields are populated, and
2291  * that the mblk chain is large enough to contain the parsed header(s):
2292  *
2293  * - MEOI_L2INFO_SET: meoi_l3proto and meoi_l2hlen
2294  * - MEOI_L3INFO_SET: meoi_l4proto and meoi_l3hlen
2295  * - MEOI_L4INFO_SET: meoi_l4hlen
2296  *
2297  * When any of those flags are absent, their corresponding data fields will be
2298  * zeroed.
2299  *
2300  * These additional flags are set when certain conditions are met during
2301  * parsing:
2302  *
2303  * - MEOI_VLAN_TAGGED: Ethernet header is tagged with a VLAN
2304  * - MEOI_L3_FRAGMENT: L3 header indicated fragmentation
2305  */
2306 void
mac_ether_offload_info(mblk_t * mp,mac_ether_offload_info_t * meoi)2307 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
2308 {
2309 	bzero(meoi, sizeof (mac_ether_offload_info_t));
2310 	meoi->meoi_len = msgdsize(mp);
2311 
2312 	mac_partial_offload_info(mp, 0, meoi);
2313 }
2314