xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision 8da72dbfb6b03df1a6817e55358ad6ba21c88e74)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  * Copyright 2020 RackTop Systems, Inc.
27  * Copyright 2025 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/id_space.h>
33 #include <sys/esunddi.h>
34 #include <sys/stat.h>
35 #include <sys/mkdev.h>
36 #include <sys/stream.h>
37 #include <sys/strsubr.h>
38 #include <sys/dlpi.h>
39 #include <sys/modhash.h>
40 #include <sys/mac.h>
41 #include <sys/mac_provider.h>
42 #include <sys/mac_impl.h>
43 #include <sys/mac_client_impl.h>
44 #include <sys/mac_client_priv.h>
45 #include <sys/mac_soft_ring.h>
46 #include <sys/mac_stat.h>
47 #include <sys/dld.h>
48 #include <sys/modctl.h>
49 #include <sys/fs/dv_node.h>
50 #include <sys/thread.h>
51 #include <sys/proc.h>
52 #include <sys/callb.h>
53 #include <sys/cpuvar.h>
54 #include <sys/atomic.h>
55 #include <sys/sdt.h>
56 #include <sys/mac_flow.h>
57 #include <sys/ddi_intr_impl.h>
58 #include <sys/disp.h>
59 #include <sys/sdt.h>
60 #include <sys/stdbool.h>
61 #include <sys/pattr.h>
62 #include <sys/strsun.h>
63 #include <sys/vlan.h>
64 #include <inet/ip.h>
65 #include <inet/tcp.h>
66 #include <netinet/udp.h>
67 #include <netinet/sctp.h>
68 
69 /*
70  * MAC Provider Interface.
71  *
72  * Interface for GLDv3 compatible NIC drivers.
73  */
74 
static void i_mac_notify_thread(void *);

typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);

/*
 * Framework-level default actions run by the notify thread, one slot per
 * notification type (presumably indexed by the MAC_NOTE_* enumeration --
 * the trailing comments name the intended slot).  Only MAC_NOTE_LINK has
 * a default action today: recompute client fanout on link changes.
 */
static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
	mac_fanout_recompute,	/* MAC_NOTE_LINK */
	NULL,		/* MAC_NOTE_UNICST */
	NULL,		/* MAC_NOTE_TX */
	NULL,		/* MAC_NOTE_DEVPROMISC */
	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
	NULL,		/* MAC_NOTE_SDU_SIZE */
	NULL,		/* MAC_NOTE_MARGIN */
	NULL,		/* MAC_NOTE_CAPAB_CHG */
	NULL		/* MAC_NOTE_LOWLINK */
};
90 
91 /*
92  * Driver support functions.
93  */
94 
95 /* REGISTRATION */
96 
97 mac_register_t *
mac_alloc(uint_t mac_version)98 mac_alloc(uint_t mac_version)
99 {
100 	mac_register_t *mregp;
101 
102 	/*
103 	 * Make sure there isn't a version mismatch between the driver and
104 	 * the framework.  In the future, if multiple versions are
105 	 * supported, this check could become more sophisticated.
106 	 */
107 	if (mac_version != MAC_VERSION)
108 		return (NULL);
109 
110 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
111 	mregp->m_version = mac_version;
112 	return (mregp);
113 }
114 
115 void
mac_free(mac_register_t * mregp)116 mac_free(mac_register_t *mregp)
117 {
118 	kmem_free(mregp, sizeof (mac_register_t));
119 }
120 
121 /*
122  * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
123  * value.
124  */
125 static uint16_t
mac_features_to_flags(mac_handle_t mh)126 mac_features_to_flags(mac_handle_t mh)
127 {
128 	uint16_t flags = 0;
129 	uint32_t cap_sum = 0;
130 	mac_capab_lso_t cap_lso;
131 
132 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
133 		if (cap_sum & HCKSUM_IPHDRCKSUM)
134 			flags |= HCK_IPV4_HDRCKSUM;
135 
136 		if (cap_sum & HCKSUM_INET_PARTIAL)
137 			flags |= HCK_PARTIALCKSUM;
138 		else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
139 			flags |= HCK_FULLCKSUM;
140 	}
141 
142 	/*
143 	 * We don't need the information stored in 'cap_lso', but we
144 	 * need to pass a non-NULL pointer to appease the driver.
145 	 */
146 	if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
147 		flags |= HW_LSO;
148 
149 	return (flags);
150 }
151 
152 /*
153  * mac_register() is how drivers register new MACs with the GLDv3
154  * framework.  The mregp argument is allocated by drivers using the
155  * mac_alloc() function, and can be freed using mac_free() immediately upon
156  * return from mac_register().  Upon success (0 return value), the mhp
157  * opaque pointer becomes the driver's handle to its MAC interface, and is
158  * the argument to all other mac module entry points.
159  */
160 /* ARGSUSED */
161 int
mac_register(mac_register_t * mregp,mac_handle_t * mhp)162 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
163 {
164 	mac_impl_t		*mip;
165 	mactype_t		*mtype;
166 	int			err = EINVAL;
167 	struct devnames		*dnp = NULL;
168 	uint_t			instance;
169 	boolean_t		style1_created = B_FALSE;
170 	boolean_t		style2_created = B_FALSE;
171 	char			*driver;
172 	minor_t			minor = 0;
173 
174 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
175 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
176 		return (EINVAL);
177 
178 	/* Find the required MAC-Type plugin. */
179 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
180 		return (EINVAL);
181 
182 	/* Create a mac_impl_t to represent this MAC. */
183 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
184 
185 	/*
186 	 * The mac is not ready for open yet.
187 	 */
188 	mip->mi_state_flags |= MIS_DISABLED;
189 
190 	/*
191 	 * When a mac is registered, the m_instance field can be set to:
192 	 *
193 	 *  0:	Get the mac's instance number from m_dip.
194 	 *	This is usually used for physical device dips.
195 	 *
196 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
197 	 *	For example, when an aggregation is created with the key option,
198 	 *	"key" will be used as the instance number.
199 	 *
200 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
201 	 *	This is often used when a MAC of a virtual link is registered
202 	 *	(e.g., aggregation when "key" is not specified, or vnic).
203 	 *
204 	 * Note that the instance number is used to derive the mi_minor field
205 	 * of mac_impl_t, which will then be used to derive the name of kstats
206 	 * and the devfs nodes.  The first 2 cases are needed to preserve
207 	 * backward compatibility.
208 	 */
209 	switch (mregp->m_instance) {
210 	case 0:
211 		instance = ddi_get_instance(mregp->m_dip);
212 		break;
213 	case ((uint_t)-1):
214 		minor = mac_minor_hold(B_TRUE);
215 		if (minor == 0) {
216 			err = ENOSPC;
217 			goto fail;
218 		}
219 		instance = minor - 1;
220 		break;
221 	default:
222 		instance = mregp->m_instance;
223 		if (instance >= MAC_MAX_MINOR) {
224 			err = EINVAL;
225 			goto fail;
226 		}
227 		break;
228 	}
229 
230 	mip->mi_minor = (minor_t)(instance + 1);
231 	mip->mi_dip = mregp->m_dip;
232 	mip->mi_clients_list = NULL;
233 	mip->mi_nclients = 0;
234 
235 	/* Set the default IEEE Port VLAN Identifier */
236 	mip->mi_pvid = 1;
237 
238 	/* Default bridge link learning protection values */
239 	mip->mi_llimit = 1000;
240 	mip->mi_ldecay = 200;
241 
242 	driver = (char *)ddi_driver_name(mip->mi_dip);
243 
244 	/* Construct the MAC name as <drvname><instance> */
245 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
246 	    driver, instance);
247 
248 	mip->mi_driver = mregp->m_driver;
249 
250 	mip->mi_type = mtype;
251 	mip->mi_margin = mregp->m_margin;
252 	mip->mi_info.mi_media = mtype->mt_type;
253 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
254 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
255 		goto fail;
256 	if (mregp->m_multicast_sdu == 0)
257 		mregp->m_multicast_sdu = mregp->m_max_sdu;
258 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
259 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
260 		goto fail;
261 	mip->mi_sdu_min = mregp->m_min_sdu;
262 	mip->mi_sdu_max = mregp->m_max_sdu;
263 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
264 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
265 	/*
266 	 * If the media supports a broadcast address, cache a pointer to it
267 	 * in the mac_info_t so that upper layers can use it.
268 	 */
269 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
270 
271 	mip->mi_v12n_level = mregp->m_v12n;
272 
273 	/*
274 	 * Copy the unicast source address into the mac_info_t, but only if
275 	 * the MAC-Type defines a non-zero address length.  We need to
276 	 * handle MAC-Types that have an address length of 0
277 	 * (point-to-point protocol MACs for example).
278 	 */
279 	if (mip->mi_type->mt_addr_length > 0) {
280 		if (mregp->m_src_addr == NULL)
281 			goto fail;
282 		mip->mi_info.mi_unicst_addr =
283 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
284 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
285 		    mip->mi_type->mt_addr_length);
286 
287 		/*
288 		 * Copy the fixed 'factory' MAC address from the immutable
289 		 * info.  This is taken to be the MAC address currently in
290 		 * use.
291 		 */
292 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
293 		    mip->mi_type->mt_addr_length);
294 
295 		/*
296 		 * At this point, we should set up the classification
297 		 * rules etc but we delay it till mac_open() so that
298 		 * the resource discovery has taken place and we
299 		 * know someone wants to use the device. Otherwise
300 		 * memory gets allocated for Rx ring structures even
301 		 * during probe.
302 		 */
303 
304 		/* Copy the destination address if one is provided. */
305 		if (mregp->m_dst_addr != NULL) {
306 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
307 			    mip->mi_type->mt_addr_length);
308 			mip->mi_dstaddr_set = B_TRUE;
309 		}
310 	} else if (mregp->m_src_addr != NULL) {
311 		goto fail;
312 	}
313 
314 	/*
315 	 * The format of the m_pdata is specific to the plugin.  It is
316 	 * passed in as an argument to all of the plugin callbacks.  The
317 	 * driver can update this information by calling
318 	 * mac_pdata_update().
319 	 */
320 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
321 		/*
322 		 * Verify if the supplied plugin data is valid.  Note that
323 		 * even if the caller passed in a NULL pointer as plugin data,
324 		 * we still need to verify if that's valid as the plugin may
325 		 * require plugin data to function.
326 		 */
327 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
328 		    mregp->m_pdata_size)) {
329 			goto fail;
330 		}
331 		if (mregp->m_pdata != NULL) {
332 			mip->mi_pdata =
333 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
334 			bcopy(mregp->m_pdata, mip->mi_pdata,
335 			    mregp->m_pdata_size);
336 			mip->mi_pdata_size = mregp->m_pdata_size;
337 		}
338 	} else if (mregp->m_pdata != NULL) {
339 		/*
340 		 * The caller supplied non-NULL plugin data, but the plugin
341 		 * does not recognize plugin data.
342 		 */
343 		err = EINVAL;
344 		goto fail;
345 	}
346 
347 	/*
348 	 * Register the private properties.
349 	 */
350 	mac_register_priv_prop(mip, mregp->m_priv_props);
351 
352 	/*
353 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
354 	 * check to make sure all mandatory callbacks are set.
355 	 */
356 	if (mregp->m_callbacks->mc_getstat == NULL ||
357 	    mregp->m_callbacks->mc_start == NULL ||
358 	    mregp->m_callbacks->mc_stop == NULL ||
359 	    mregp->m_callbacks->mc_setpromisc == NULL ||
360 	    mregp->m_callbacks->mc_multicst == NULL) {
361 		goto fail;
362 	}
363 	mip->mi_callbacks = mregp->m_callbacks;
364 
365 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
366 	    &mip->mi_capab_legacy)) {
367 		mip->mi_state_flags |= MIS_LEGACY;
368 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
369 	} else {
370 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
371 		    mip->mi_minor);
372 	}
373 
374 	/*
375 	 * Allocate a notification thread. thread_create blocks for memory
376 	 * if needed, it never fails.
377 	 */
378 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
379 	    mip, 0, &p0, TS_RUN, minclsyspri);
380 
381 	/*
382 	 * Cache the DB_CKSUMFLAGS that this MAC supports.
383 	 */
384 	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
385 
386 	/*
387 	 * Initialize the capabilities
388 	 */
389 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
390 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
391 
392 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
393 		mip->mi_state_flags |= MIS_IS_VNIC;
394 
395 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
396 		mip->mi_state_flags |= MIS_IS_AGGR;
397 
398 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
399 		mip->mi_state_flags |= MIS_IS_OVERLAY;
400 
401 	mac_addr_factory_init(mip);
402 
403 	mac_transceiver_init(mip);
404 
405 	mac_led_init(mip);
406 
407 	/*
408 	 * Enforce the virtrualization level registered.
409 	 */
410 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
411 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
412 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
413 			goto fail;
414 
415 		/*
416 		 * The driver needs to register at least rx rings for this
417 		 * virtualization level.
418 		 */
419 		if (mip->mi_rx_groups == NULL)
420 			goto fail;
421 	}
422 
423 	/*
424 	 * The driver must set mc_unicst entry point to NULL when it advertises
425 	 * CAP_RINGS for rx groups.
426 	 */
427 	if (mip->mi_rx_groups != NULL) {
428 		if (mregp->m_callbacks->mc_unicst != NULL)
429 			goto fail;
430 	} else {
431 		if (mregp->m_callbacks->mc_unicst == NULL)
432 			goto fail;
433 	}
434 
435 	/*
436 	 * Initialize MAC addresses. Must be called after mac_init_rings().
437 	 */
438 	mac_init_macaddr(mip);
439 
440 	mip->mi_share_capab.ms_snum = 0;
441 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
442 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
443 		    &mip->mi_share_capab);
444 	}
445 
446 	/*
447 	 * Initialize the kstats for this device.
448 	 */
449 	mac_driver_stat_create(mip);
450 
451 	/* Zero out any properties. */
452 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
453 
454 	if (mip->mi_minor <= MAC_MAX_MINOR) {
455 		/* Create a style-2 DLPI device */
456 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
457 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
458 			goto fail;
459 		style2_created = B_TRUE;
460 
461 		/* Create a style-1 DLPI device */
462 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
463 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
464 			goto fail;
465 		style1_created = B_TRUE;
466 	}
467 
468 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
469 
470 	rw_enter(&i_mac_impl_lock, RW_WRITER);
471 	if (mod_hash_insert(i_mac_impl_hash,
472 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
473 		rw_exit(&i_mac_impl_lock);
474 		err = EEXIST;
475 		goto fail;
476 	}
477 
478 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
479 	    (mac_impl_t *), mip);
480 
481 	/*
482 	 * Mark the MAC to be ready for open.
483 	 */
484 	mip->mi_state_flags &= ~MIS_DISABLED;
485 	rw_exit(&i_mac_impl_lock);
486 
487 	atomic_inc_32(&i_mac_impl_count);
488 
489 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
490 	*mhp = (mac_handle_t)mip;
491 	return (0);
492 
493 fail:
494 	if (style1_created)
495 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
496 
497 	if (style2_created)
498 		ddi_remove_minor_node(mip->mi_dip, driver);
499 
500 	mac_addr_factory_fini(mip);
501 
502 	/* Clean up registered MAC addresses */
503 	mac_fini_macaddr(mip);
504 
505 	/* Clean up registered rings */
506 	mac_free_rings(mip, MAC_RING_TYPE_RX);
507 	mac_free_rings(mip, MAC_RING_TYPE_TX);
508 
509 	/* Clean up notification thread */
510 	if (mip->mi_notify_thread != NULL)
511 		i_mac_notify_exit(mip);
512 
513 	if (mip->mi_info.mi_unicst_addr != NULL) {
514 		kmem_free(mip->mi_info.mi_unicst_addr,
515 		    mip->mi_type->mt_addr_length);
516 		mip->mi_info.mi_unicst_addr = NULL;
517 	}
518 
519 	mac_driver_stat_delete(mip);
520 
521 	if (mip->mi_type != NULL) {
522 		atomic_dec_32(&mip->mi_type->mt_ref);
523 		mip->mi_type = NULL;
524 	}
525 
526 	if (mip->mi_pdata != NULL) {
527 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
528 		mip->mi_pdata = NULL;
529 		mip->mi_pdata_size = 0;
530 	}
531 
532 	if (minor != 0) {
533 		ASSERT(minor > MAC_MAX_MINOR);
534 		mac_minor_rele(minor);
535 	}
536 
537 	mip->mi_state_flags = 0;
538 	mac_unregister_priv_prop(mip);
539 
540 	/*
541 	 * Clear the state before destroying the mac_impl_t
542 	 */
543 	mip->mi_state_flags = 0;
544 
545 	kmem_cache_free(i_mac_impl_cachep, mip);
546 	return (err);
547 }
548 
549 /*
550  * Unregister from the GLDv3 framework
551  */
int
mac_unregister(mac_handle_t mh)
{
	int			err;
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mod_hash_val_t		val;
	mac_margin_req_t	*mmr, *nextmmr;

	/* Fail the unregister if there are any open references to this mac. */
	if ((err = mac_disable_nowait(mh)) != 0)
		return (err);

	/*
	 * Clean up notification thread and wait for it to exit.
	 */
	i_mac_notify_exit(mip);

	/*
	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
	 * the internal hash table. Such removal means table-walkers that
	 * acquire the perimeter will not do so on behalf of what we are
	 * unregistering, which prevents a deadlock.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	(void) mod_hash_remove(i_mac_impl_hash,
	    (mod_hash_key_t)mip->mi_name, &val);
	rw_exit(&i_mac_impl_lock);
	ASSERT(mip == (mac_impl_t *)val);

	i_mac_perim_enter(mip);

	/*
	 * There is still resource properties configured over this mac.
	 */
	if (mip->mi_resource_props.mrp_mask != 0)
		mac_fastpath_enable((mac_handle_t)mip);

	/*
	 * Remove the devfs nodes created by mac_register() for MACs in
	 * the shared (non-private) minor range.
	 */
	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
		ddi_remove_minor_node(mip->mi_dip,
		    (char *)ddi_driver_name(mip->mi_dip));
	}

	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
	    MIS_EXCLUSIVE));

	mac_driver_stat_delete(mip);

	ASSERT(i_mac_impl_count > 0);
	atomic_dec_32(&i_mac_impl_count);

	/* Release the copy of the plugin data made at registration time. */
	if (mip->mi_pdata != NULL)
		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
	mip->mi_pdata = NULL;
	mip->mi_pdata_size = 0;

	/*
	 * Free the list of margin request.
	 */
	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
		nextmmr = mmr->mmr_nextp;
		kmem_free(mmr, sizeof (mac_margin_req_t));
	}
	mip->mi_mmrp = NULL;

	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
	mip->mi_info.mi_unicst_addr = NULL;

	/* Drop the reference taken on the MAC-Type plugin. */
	atomic_dec_32(&mip->mi_type->mt_ref);
	mip->mi_type = NULL;

	/*
	 * Free the primary MAC address.
	 */
	mac_fini_macaddr(mip);

	/*
	 * free all rings
	 */
	mac_free_rings(mip, MAC_RING_TYPE_RX);
	mac_free_rings(mip, MAC_RING_TYPE_TX);

	mac_addr_factory_fini(mip);

	bzero(mip->mi_addr, MAXMACADDRLEN);
	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
	mip->mi_dstaddr_set = B_FALSE;

	/* and the flows */
	mac_flow_tab_destroy(mip->mi_flow_tab);
	mip->mi_flow_tab = NULL;

	/* Return a dynamically assigned minor to the pool. */
	if (mip->mi_minor > MAC_MAX_MINOR)
		mac_minor_rele(mip->mi_minor);

	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);

	/*
	 * Reset the perim related fields to default values before
	 * kmem_cache_free
	 */
	i_mac_perim_exit(mip);
	mip->mi_state_flags = 0;

	mac_unregister_priv_prop(mip);

	ASSERT(mip->mi_bridge_link == NULL);
	kmem_cache_free(i_mac_impl_cachep, mip);

	return (0);
}
664 
665 /* DATA RECEPTION */
666 
667 /*
668  * This function is invoked for packets received by the MAC driver in
669  * interrupt context. The ring generation number provided by the driver
670  * is matched with the ring generation number held in MAC. If they do not
671  * match, received packets are considered stale packets coming from an older
672  * assignment of the ring. Drop them.
673  */
674 void
mac_rx_ring(mac_handle_t mh,mac_ring_handle_t mrh,mblk_t * mp_chain,uint64_t mr_gen_num)675 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
676     uint64_t mr_gen_num)
677 {
678 	mac_ring_t		*mr = (mac_ring_t *)mrh;
679 
680 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
681 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
682 		    mr->mr_gen_num, uint64_t, mr_gen_num);
683 		freemsgchain(mp_chain);
684 		return;
685 	}
686 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
687 }
688 
689 /*
690  * This function is invoked for each packet received by the underlying driver.
691  */
692 void
mac_rx(mac_handle_t mh,mac_resource_handle_t mrh,mblk_t * mp_chain)693 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
694 {
695 	mac_impl_t *mip = (mac_impl_t *)mh;
696 
697 	/*
698 	 * Check if the link is part of a bridge.  If not, then we don't need
699 	 * to take the lock to remain consistent.  Make this common case
700 	 * lock-free and tail-call optimized.
701 	 */
702 	if (mip->mi_bridge_link == NULL) {
703 		mac_rx_common(mh, mrh, mp_chain);
704 	} else {
705 		/*
706 		 * Once we take a reference on the bridge link, the bridge
707 		 * module itself can't unload, so the callback pointers are
708 		 * stable.
709 		 */
710 		mutex_enter(&mip->mi_bridge_lock);
711 		if ((mh = mip->mi_bridge_link) != NULL)
712 			mac_bridge_ref_cb(mh, B_TRUE);
713 		mutex_exit(&mip->mi_bridge_lock);
714 		if (mh == NULL) {
715 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
716 		} else {
717 			mac_bridge_rx_cb(mh, mrh, mp_chain);
718 			mac_bridge_ref_cb(mh, B_FALSE);
719 		}
720 	}
721 }
722 
723 /*
724  * Special case function: this allows snooping of packets transmitted and
725  * received by TRILL. By design, they go directly into the TRILL module.
726  */
727 void
mac_trill_snoop(mac_handle_t mh,mblk_t * mp)728 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
729 {
730 	mac_impl_t *mip = (mac_impl_t *)mh;
731 
732 	if (mip->mi_promisc_list != NULL)
733 		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
734 }
735 
736 /*
737  * This is the upward reentry point for packets arriving from the bridging
738  * module and from mac_rx for links not part of a bridge.
739  */
void
mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_ring_t		*mr = (mac_ring_t *)mrh;
	mac_soft_ring_set_t	*mac_srs;
	mblk_t			*bp = mp_chain;

	/*
	 * If there are any promiscuous mode callbacks defined for
	 * this MAC, pass them a copy if appropriate.
	 */
	if (mip->mi_promisc_list != NULL)
		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);

	if (mr != NULL) {
		/*
		 * If the SRS teardown has started, just return. The 'mr'
		 * continues to be valid until the driver unregisters the MAC.
		 * Hardware classified packets will not make their way up
		 * beyond this point once the teardown has started. The driver
		 * is never passed a pointer to a flow entry or SRS or any
		 * structure that can be freed much before mac_unregister.
		 */
		mutex_enter(&mr->mr_lock);
		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
			mutex_exit(&mr->mr_lock);
			freemsgchain(mp_chain);
			return;
		}

		/*
		 * The ring is in passthru mode; pass the chain up to
		 * the pseudo ring.  The refhold is taken while mr_lock is
		 * still held so the ring cannot be torn down underneath us.
		 */
		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
			MR_REFHOLD_LOCKED(mr);
			mutex_exit(&mr->mr_lock);
			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
			    B_FALSE);
			MR_REFRELE(mr);
			return;
		}

		/*
		 * The passthru callback should only be set when in
		 * MAC_PASSTHRU_CLASSIFIER mode.
		 */
		ASSERT3P(mr->mr_pt_fn, ==, NULL);

		/*
		 * We check if an SRS is controlling this ring.
		 * If so, we can directly call the srs_lower_proc
		 * routine otherwise we need to go through mac_rx_classify
		 * to reach the right place.
		 */
		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
			MR_REFHOLD_LOCKED(mr);
			mutex_exit(&mr->mr_lock);
			ASSERT3P(mr->mr_srs, !=, NULL);
			mac_srs = mr->mr_srs;

			/*
			 * This is the fast path. All packets received
			 * on this ring are hardware classified and
			 * share the same MAC header info.
			 */
			mac_srs->srs_rx.sr_lower_proc(mh,
			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
			MR_REFRELE(mr);
			return;
		}

		mutex_exit(&mr->mr_lock);
		/* We'll fall through to software classification */
	} else {
		flow_entry_t *flent;
		int err;

		/*
		 * No ring: if a single active client owns the MAC, deliver
		 * directly to its flow entry, avoiding flow-table lookup.
		 */
		rw_enter(&mip->mi_rw_lock, RW_READER);
		if (mip->mi_single_active_client != NULL) {
			flent = mip->mi_single_active_client->mci_flent_list;
			FLOW_TRY_REFHOLD(flent, err);
			rw_exit(&mip->mi_rw_lock);
			if (err == 0) {
				(flent->fe_cb_fn)(flent->fe_cb_arg1,
				    flent->fe_cb_arg2, mp_chain, B_FALSE);
				FLOW_REFRELE(flent);
				return;
			}
		} else {
			rw_exit(&mip->mi_rw_lock);
		}
	}

	/*
	 * Software classification.  mac_rx_flow() consumes what it can
	 * deliver and returns any remainder, which is dropped below.
	 */
	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
			return;
	}

	freemsgchain(bp);
}
843 
844 /* DATA TRANSMISSION */
845 
846 /*
847  * A driver's notification to resume transmission, in case of a provider
848  * without TX rings.
849  */
void
mac_tx_update(mac_handle_t mh)
{
	/* Treated as an update on the "default" (NULL) TX ring. */
	mac_tx_ring_update(mh, NULL);
}
855 
856 /*
857  * A driver's notification to resume transmission on the specified TX ring.
858  */
void
mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
{
	/* Wake any TX SRS blocked on this ring (or all, when rh is NULL). */
	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
}
864 
865 /* LINK STATE */
866 /*
867  * Notify the MAC layer about a link state change
868  */
void
mac_link_update(mac_handle_t mh, link_state_t link)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	/*
	 * Save the driver-reported (lower) link state.
	 */
	mip->mi_lowlinkstate = link;

	/*
	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
	 * thread to deliver both lower and upper notifications.
	 */
	i_mac_notify(mip, MAC_NOTE_LOWLINK);
}
885 
886 /*
887  * Notify the MAC layer about a link state change due to bridging.
888  */
void
mac_link_redo(mac_handle_t mh, link_state_t link)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	/*
	 * Save the (upper) link state directly.
	 */
	mip->mi_linkstate = link;

	/*
	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
	 * made.
	 */
	i_mac_notify(mip, MAC_NOTE_LINK);
}
905 
906 /* MINOR NODE HANDLING */
907 
908 /*
909  * Given a dev_t, return the instance number (PPA) associated with it.
910  * Drivers can use this in their getinfo(9e) implementation to lookup
911  * the instance number (i.e. PPA) of the device, to use as an index to
912  * their own array of soft state structures.
913  *
914  * Returns -1 on error.
915  */
int
mac_devt_to_instance(dev_t devt)
{
	/* Delegate to DLD, which owns the devt-to-instance mapping. */
	return (dld_devt_to_instance(devt));
}
921 
922 /*
923  * Drivers that make use of the private minor number space are expected to
924  * provide their own getinfo(9e) entry point. This function simply forwards
925  * to the default MAC framework getinfo(9e) implementation as a convenience
926  * if they don't need any special mapping (mac instance != ddi_get_instance())
927  */
int
mac_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
{
	/* Forward to the default DLD getinfo(9e) implementation. */
	return (dld_getinfo(dip, cmd, arg, resp));
}
933 
934 /*
935  * This function returns the first minor number that is available for
936  * driver private use.  All minor numbers smaller than this are
937  * reserved for GLDv3 use.
938  */
minor_t
mac_private_minor(void)
{
	/* Minors below MAC_PRIVATE_MINOR are reserved for GLDv3 use. */
	return (MAC_PRIVATE_MINOR);
}
944 
945 /* OTHER CONTROL INFORMATION */
946 
947 /*
948  * A driver notified us that its primary MAC address has changed.
949  */
950 void
mac_unicst_update(mac_handle_t mh,const uint8_t * addr)951 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
952 {
953 	mac_impl_t	*mip = (mac_impl_t *)mh;
954 
955 	if (mip->mi_type->mt_addr_length == 0)
956 		return;
957 
958 	i_mac_perim_enter(mip);
959 
960 	/*
961 	 * If address changes, freshen the MAC address value and update
962 	 * all MAC clients that share this MAC address.
963 	 */
964 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
965 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
966 		    (uint8_t *)addr);
967 	}
968 
969 	i_mac_perim_exit(mip);
970 
971 	/*
972 	 * Send a MAC_NOTE_UNICST notification.
973 	 */
974 	i_mac_notify(mip, MAC_NOTE_UNICST);
975 }
976 
977 void
mac_dst_update(mac_handle_t mh,const uint8_t * addr)978 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
979 {
980 	mac_impl_t	*mip = (mac_impl_t *)mh;
981 
982 	if (mip->mi_type->mt_addr_length == 0)
983 		return;
984 
985 	i_mac_perim_enter(mip);
986 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
987 	i_mac_perim_exit(mip);
988 	i_mac_notify(mip, MAC_NOTE_DEST);
989 }
990 
991 /*
992  * MAC plugin information changed.
993  */
994 int
mac_pdata_update(mac_handle_t mh,void * mac_pdata,size_t dsize)995 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
996 {
997 	mac_impl_t	*mip = (mac_impl_t *)mh;
998 
999 	/*
1000 	 * Verify that the plugin supports MAC plugin data and that the
1001 	 * supplied data is valid.
1002 	 */
1003 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
1004 		return (EINVAL);
1005 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
1006 		return (EINVAL);
1007 
1008 	if (mip->mi_pdata != NULL)
1009 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
1010 
1011 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
1012 	bcopy(mac_pdata, mip->mi_pdata, dsize);
1013 	mip->mi_pdata_size = dsize;
1014 
1015 	/*
1016 	 * Since the MAC plugin data is used to construct MAC headers that
1017 	 * were cached in fast-path headers, we need to flush fast-path
1018 	 * information for links associated with this mac.
1019 	 */
1020 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
1021 	return (0);
1022 }
1023 
/*
 * The mac provider or mac framework calls this function when it wants
 * to notify upstream consumers that the capabilities have changed and
 * that they should modify their own internal state accordingly.
 *
 * We currently have no regard for the fact that a provider could
 * decide to drop capabilities which would invalidate pending traffic.
 * For example, if one was to disable the Tx checksum offload while
 * TCP/IP traffic was being sent by mac clients relying on that
 * feature, then those packets would hit the wire with missing or
 * partial checksums. A proper solution involves not only providing
 * notification, but also performing client quiescing. That is, a capab
 * change should be treated as an atomic transaction that forms a
 * barrier between traffic relying on the current capabs and traffic
 * relying on the new capabs. In practice, simnet is currently the
 * only provider that could hit this, and it's an easily avoidable
 * situation (and at worst it should only lead to some dropped
 * packets). But if we ever want better on-the-fly capab change to
 * actual hardware providers, then we should give this update
 * mechanism a proper implementation.
 */
1045 void
mac_capab_update(mac_handle_t mh)1046 mac_capab_update(mac_handle_t mh)
1047 {
1048 	/*
1049 	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
1050 	 * clients to renegotiate capabilities.
1051 	 */
1052 	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
1053 }
1054 
1055 /*
1056  * Used by normal drivers to update the max sdu size.
1057  * We need to handle the case of a smaller mi_sdu_multicast
1058  * since this is called by mac_set_mtu() even for drivers that
1059  * have differing unicast and multicast mtu and we don't want to
1060  * increase the multicast mtu by accident in that case.
1061  */
1062 int
mac_maxsdu_update(mac_handle_t mh,uint_t sdu_max)1063 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
1064 {
1065 	mac_impl_t	*mip = (mac_impl_t *)mh;
1066 
1067 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1068 		return (EINVAL);
1069 	mip->mi_sdu_max = sdu_max;
1070 	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
1071 		mip->mi_sdu_multicast = mip->mi_sdu_max;
1072 
1073 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1074 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1075 	return (0);
1076 }
1077 
1078 /*
1079  * Version of the above function that is used by drivers that have a different
1080  * max sdu size for multicast/broadcast vs. unicast.
1081  */
1082 int
mac_maxsdu_update2(mac_handle_t mh,uint_t sdu_max,uint_t sdu_multicast)1083 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
1084 {
1085 	mac_impl_t	*mip = (mac_impl_t *)mh;
1086 
1087 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1088 		return (EINVAL);
1089 	if (sdu_multicast == 0)
1090 		sdu_multicast = sdu_max;
1091 	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
1092 		return (EINVAL);
1093 	mip->mi_sdu_max = sdu_max;
1094 	mip->mi_sdu_multicast = sdu_multicast;
1095 
1096 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1097 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1098 	return (0);
1099 }
1100 
/*
 * Re-target the interrupt of `ring` to the CPU already chosen for its
 * SRS.  This only applies when the group has a single exclusive client
 * and the ring's DDI interrupt is not shared with another ring.
 */
static void
mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
{
	mac_client_impl_t *mcip;
	flow_entry_t *flent;
	mac_soft_ring_set_t *mac_rx_srs;
	mac_cpus_t *srs_cpu;
	int i;

	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
		/* interrupt can be re-targeted */
		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
		flent = mcip->mci_flent;
		if (ring->mr_type == MAC_RING_TYPE_RX) {
			/* Find the Rx SRS bound to this ring, if any. */
			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
				mac_rx_srs = flent->fe_rx_srs[i];
				if (mac_rx_srs->srs_ring != ring)
					continue;
				srs_cpu = &mac_rx_srs->srs_cpu;
				/* cpu_lock protects the CPU re-binding */
				mutex_enter(&cpu_lock);
				mac_rx_srs_retarget_intr(mac_rx_srs,
				    srs_cpu->mc_rx_intr_cpu);
				mutex_exit(&cpu_lock);
				break;
			}
		} else {
			/* Tx ring: retarget via the flow's Tx SRS. */
			if (flent->fe_tx_srs != NULL) {
				mutex_enter(&cpu_lock);
				mac_tx_srs_retarget_intr(
				    flent->fe_tx_srs);
				mutex_exit(&cpu_lock);
			}
		}
	}
}
1137 
/*
 * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
 * their clients. There is a 1-1 mapping between each pseudo ring and its
 * underlying hardware ring. DDI interrupt handles are exported from the
 * hardware ring to the pseudo ring. Thus when the interrupt handle
 * changes, clients of aggr that are using the handle need to use the
 * new handle and re-target their interrupts.
 */
/*
 * Propagate a new DDI interrupt handle from hardware ring `ring` to the
 * pseudo ring (aggr) stacked on top of it, then retarget the pseudo
 * ring's interrupt.
 *
 * NOTE: this function is entered with mip's perimeter held and exits
 * that perimeter on every return path; mac_ring_intr_set() relies on
 * this when it returns immediately after calling us.
 */
static void
mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
    ddi_intr_handle_t ddh)
{
	mac_ring_t *pring;
	mac_group_t *pgroup;
	mac_impl_t *pmip;
	char macname[MAXNAMELEN];
	mac_perim_handle_t p_mph;
	uint64_t saved_gen_num;

again:
	pring = (mac_ring_t *)ring->mr_prh;
	pgroup = (mac_group_t *)pring->mr_gh;
	pmip = (mac_impl_t *)pgroup->mrg_mh;
	saved_gen_num = ring->mr_gen_num;
	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
	/*
	 * We need to enter aggr's perimeter. The locking hierarchy
	 * dictates that aggr's perimeter should be entered first
	 * and then the port's perimeter. So drop the port's
	 * perimeter, enter aggr's and then re-enter port's
	 * perimeter.
	 */
	i_mac_perim_exit(mip);
	/*
	 * While we know pmip is the aggr's mip, there is a
	 * possibility that aggr could have unregistered by
	 * the time we exit port's perimeter (mip) and
	 * enter aggr's perimeter (pmip). To avoid that
	 * scenario, enter aggr's perimeter using its name.
	 */
	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
		return;
	i_mac_perim_enter(mip);
	/*
	 * Check if the ring got assigned to another aggregation before
	 * we could enter aggr's and the port's perimeter. When a ring
	 * gets deleted from an aggregation, it calls mac_stop_ring()
	 * which increments the generation number. So checking
	 * generation number will be enough.
	 */
	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
		/* Ring moved; drop both perimeters and start over. */
		i_mac_perim_exit(mip);
		mac_perim_exit(p_mph);
		i_mac_perim_enter(mip);
		goto again;
	}

	/* Check if pseudo ring is still present */
	if (ring->mr_prh != NULL) {
		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
		pring->mr_info.mri_intr.mi_ddi_shared =
		    ring->mr_info.mri_intr.mi_ddi_shared;
		if (ddh != NULL)
			mac_ring_intr_retarget(pgroup, pring);
	}
	i_mac_perim_exit(mip);
	mac_perim_exit(p_mph);
}
/*
 * API called by driver to provide new interrupt handle for TX/RX rings.
 * This usually happens when IRM (Interrupt Resource Management)
 * framework either gives the driver more MSI-x interrupts or takes
 * away MSI-x interrupts from the driver.
 */
void
mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
{
	mac_ring_t	*ring = (mac_ring_t *)mrh;
	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;

	i_mac_perim_enter(mip);
	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
	if (ddh == NULL) {
		/* Interrupts being reset */
		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
		if (ring->mr_prh != NULL) {
			/*
			 * mac_pseudo_ring_intr_retarget() exits mip's
			 * perimeter itself, hence the early return.
			 */
			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
			return;
		}
	} else {
		/* New interrupt handle */
		mac_compare_ddi_handle(mip->mi_rx_groups,
		    mip->mi_rx_group_count, ring);
		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
			mac_compare_ddi_handle(mip->mi_tx_groups,
			    mip->mi_tx_group_count, ring);
		}
		if (ring->mr_prh != NULL) {
			/* See note above: the callee exits the perimeter. */
			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
			return;
		} else {
			mac_ring_intr_retarget(group, ring);
		}
	}
	i_mac_perim_exit(mip);
}
1245 
1246 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1247 
/*
 * Log any change in the low-level link state and remember the state
 * that was last logged in mi_lastlowlinkstate.
 */
1251 static void
i_mac_log_link_state(mac_impl_t * mip)1252 i_mac_log_link_state(mac_impl_t *mip)
1253 {
1254 	/*
1255 	 * If no change, then it is not interesting.
1256 	 */
1257 	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
1258 		return;
1259 
1260 	switch (mip->mi_lowlinkstate) {
1261 	case LINK_STATE_UP:
1262 		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
1263 			char det[200];
1264 
1265 			mip->mi_type->mt_ops.mtops_link_details(det,
1266 			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
1267 
1268 			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
1269 		} else {
1270 			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
1271 		}
1272 		break;
1273 
1274 	case LINK_STATE_DOWN:
1275 		/*
1276 		 * Only transitions from UP to DOWN are interesting
1277 		 */
1278 		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
1279 			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
1280 		break;
1281 
1282 	case LINK_STATE_UNKNOWN:
1283 		/*
1284 		 * This case is normally not interesting.
1285 		 */
1286 		break;
1287 	}
1288 	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
1289 }
1290 
1291 /*
1292  * Main routine for the callbacks notifications thread
1293  */
static void
i_mac_notify_thread(void *arg)
{
	mac_impl_t	*mip = arg;
	callb_cpr_t	cprinfo;
	mac_cb_t	*mcb;
	mac_cb_info_t	*mcbi;
	mac_notify_cb_t	*mncb;

	mcbi = &mip->mi_notify_cb_info;
	/* Register with CPR so suspend/resume can account for this thread. */
	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
	    "i_mac_notify_thread");

	mutex_enter(mcbi->mcbi_lockp);

	for (;;) {
		uint32_t	bits;
		uint32_t	type;

		/* Snapshot and clear the pending notification bits. */
		bits = mip->mi_notify_bits;
		if (bits == 0) {
			/* Nothing pending; sleep until i_mac_notify(). */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
			continue;
		}
		mip->mi_notify_bits = 0;
		if ((bits & (1 << MAC_NNOTE)) != 0) {
			/* request to quit */
			ASSERT(mip->mi_state_flags & MIS_DISABLED);
			break;
		}

		/* Drop the lock while running callbacks. */
		mutex_exit(mcbi->mcbi_lockp);

		/*
		 * Log link changes on the actual link, but then do reports on
		 * synthetic state (if part of a bridge).
		 */
		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
			link_state_t newstate;
			mac_handle_t mh;

			i_mac_log_link_state(mip);
			newstate = mip->mi_lowlinkstate;
			if (mip->mi_bridge_link != NULL) {
				/* Re-check the link under mi_bridge_lock. */
				mutex_enter(&mip->mi_bridge_lock);
				if ((mh = mip->mi_bridge_link) != NULL) {
					newstate = mac_bridge_ls_cb(mh,
					    newstate);
				}
				mutex_exit(&mip->mi_bridge_lock);
			}
			if (newstate != mip->mi_linkstate) {
				/* Synthetic state changed; notify clients. */
				mip->mi_linkstate = newstate;
				bits |= 1 << MAC_NOTE_LINK;
			}
		}

		/*
		 * Depending on which capabs have changed, the Tx
		 * checksum flags may also need to be updated.
		 */
		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
			mac_perim_handle_t mph;
			mac_handle_t mh = (mac_handle_t)mip;

			mac_perim_enter_by_mh(mh, &mph);
			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
			mac_perim_exit(mph);
		}

		/*
		 * Do notification callbacks for each notification type.
		 */
		for (type = 0; type < MAC_NNOTE; type++) {
			if ((bits & (1 << type)) == 0) {
				continue;
			}

			/* Framework-internal per-type hook, if any. */
			if (mac_notify_cb_list[type] != NULL)
				(*mac_notify_cb_list[type])(mip);

			/*
			 * Walk the list of notifications.
			 */
			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
				mncb->mncb_fn(mncb->mncb_arg, type);
			}
			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
			    &mip->mi_notify_cb_list);
		}

		mutex_enter(mcbi->mcbi_lockp);
	}

	/* Tell i_mac_notify_exit() we are done, then tear down. */
	mip->mi_state_flags |= MIS_NOTIFY_DONE;
	cv_broadcast(&mcbi->mcbi_cv);

	/* CALLB_CPR_EXIT drops the lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
1400 
1401 /*
1402  * Signal the i_mac_notify_thread asking it to quit.
1403  * Then wait till it is done.
1404  */
void
i_mac_notify_exit(mac_impl_t *mip)
{
	mac_cb_info_t	*mcbi;

	mcbi = &mip->mi_notify_cb_info;

	mutex_enter(mcbi->mcbi_lockp);
	/* The MAC_NNOTE bit is the notify thread's "quit" request. */
	mip->mi_notify_bits = (1 << MAC_NNOTE);
	cv_broadcast(&mcbi->mcbi_cv);


	/* Wait until the thread acknowledges by setting MIS_NOTIFY_DONE. */
	while ((mip->mi_notify_thread != NULL) &&
	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
	}

	/* Necessary clean up before doing kmem_cache_free */
	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
	mip->mi_notify_bits = 0;
	mip->mi_notify_thread = NULL;
	mutex_exit(mcbi->mcbi_lockp);
}
1428 
1429 /*
1430  * Entry point invoked by drivers to dynamically add a ring to an
1431  * existing group.
1432  */
1433 int
mac_group_add_ring(mac_group_handle_t gh,int index)1434 mac_group_add_ring(mac_group_handle_t gh, int index)
1435 {
1436 	mac_group_t *group = (mac_group_t *)gh;
1437 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1438 	int ret;
1439 
1440 	i_mac_perim_enter(mip);
1441 	ret = i_mac_group_add_ring(group, NULL, index);
1442 	i_mac_perim_exit(mip);
1443 	return (ret);
1444 }
1445 
1446 /*
1447  * Entry point invoked by drivers to dynamically remove a ring
1448  * from an existing group. The specified ring handle must no longer
1449  * be used by the driver after a call to this function.
1450  */
1451 void
mac_group_rem_ring(mac_group_handle_t gh,mac_ring_handle_t rh)1452 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1453 {
1454 	mac_group_t *group = (mac_group_t *)gh;
1455 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1456 
1457 	i_mac_perim_enter(mip);
1458 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1459 	i_mac_perim_exit(mip);
1460 }
1461 
1462 /*
1463  * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
1464  * entry points.
1465  */
1466 
1467 void
mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph,uint8_t val)1468 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
1469 {
1470 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1471 
1472 	/* nothing to do if the caller doesn't want the default value */
1473 	if (pr->pr_default == NULL)
1474 		return;
1475 
1476 	ASSERT(pr->pr_default_size >= sizeof (uint8_t));
1477 
1478 	*(uint8_t *)(pr->pr_default) = val;
1479 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1480 }
1481 
1482 void
mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph,uint64_t val)1483 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
1484 {
1485 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1486 
1487 	/* nothing to do if the caller doesn't want the default value */
1488 	if (pr->pr_default == NULL)
1489 		return;
1490 
1491 	ASSERT(pr->pr_default_size >= sizeof (uint64_t));
1492 
1493 	bcopy(&val, pr->pr_default, sizeof (val));
1494 
1495 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1496 }
1497 
1498 void
mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph,uint32_t val)1499 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
1500 {
1501 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1502 
1503 	/* nothing to do if the caller doesn't want the default value */
1504 	if (pr->pr_default == NULL)
1505 		return;
1506 
1507 	ASSERT(pr->pr_default_size >= sizeof (uint32_t));
1508 
1509 	bcopy(&val, pr->pr_default, sizeof (val));
1510 
1511 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1512 }
1513 
1514 void
mac_prop_info_set_default_str(mac_prop_info_handle_t ph,const char * str)1515 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
1516 {
1517 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1518 
1519 	/* nothing to do if the caller doesn't want the default value */
1520 	if (pr->pr_default == NULL)
1521 		return;
1522 
1523 	if (strlen(str) >= pr->pr_default_size)
1524 		pr->pr_errno = ENOBUFS;
1525 	else
1526 		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
1527 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1528 }
1529 
1530 void
mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,link_flowctrl_t val)1531 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
1532     link_flowctrl_t val)
1533 {
1534 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1535 
1536 	/* nothing to do if the caller doesn't want the default value */
1537 	if (pr->pr_default == NULL)
1538 		return;
1539 
1540 	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));
1541 
1542 	bcopy(&val, pr->pr_default, sizeof (val));
1543 
1544 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1545 }
1546 
1547 void
mac_prop_info_set_default_fec(mac_prop_info_handle_t ph,link_fec_t val)1548 mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val)
1549 {
1550 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1551 
1552 	/* nothing to do if the caller doesn't want the default value */
1553 	if (pr->pr_default == NULL)
1554 		return;
1555 
1556 	ASSERT(pr->pr_default_size >= sizeof (link_fec_t));
1557 
1558 	bcopy(&val, pr->pr_default, sizeof (val));
1559 
1560 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1561 }
1562 
/*
 * Append one [min, max] pair to the uint32 range set being built for a
 * property.  May be called repeatedly; pr_errno is set to ENOSPC once
 * the range array supplied via pr_range is full.
 */
void
mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
    uint32_t max)
{
	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
	mac_propval_range_t *range = pr->pr_range;
	mac_propval_uint32_range_t *range32;

	/* nothing to do if the caller doesn't want the range info */
	if (range == NULL)
		return;

	if (pr->pr_range_cur_count++ == 0) {
		/* first range */
		pr->pr_flags |= MAC_PROP_INFO_RANGE;
		range->mpr_type = MAC_PROPVAL_UINT32;
	} else {
		/* all ranges of a property should be of the same type */
		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
		if (pr->pr_range_cur_count > range->mpr_count) {
			/* no room left in the supplied range array */
			pr->pr_errno = ENOSPC;
			return;
		}
	}

	/* Record this pair in the next free slot. */
	range32 = range->mpr_range_uint32;
	range32[pr->pr_range_cur_count - 1].mpur_min = min;
	range32[pr->pr_range_cur_count - 1].mpur_max = max;
}
1592 
1593 void
mac_prop_info_set_perm(mac_prop_info_handle_t ph,uint8_t perm)1594 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
1595 {
1596 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1597 
1598 	pr->pr_perm = perm;
1599 	pr->pr_flags |= MAC_PROP_INFO_PERM;
1600 }
1601 
1602 void
mac_hcksum_get(const mblk_t * mp,uint32_t * start,uint32_t * stuff,uint32_t * end,uint32_t * value,uint32_t * flags_ptr)1603 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
1604     uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
1605 {
1606 	uint32_t flags;
1607 
1608 	ASSERT(DB_TYPE(mp) == M_DATA);
1609 
1610 	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
1611 	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
1612 		if (value != NULL)
1613 			*value = (uint32_t)DB_CKSUM16(mp);
1614 		if ((flags & HCK_PARTIALCKSUM) != 0) {
1615 			if (start != NULL)
1616 				*start = (uint32_t)DB_CKSUMSTART(mp);
1617 			if (stuff != NULL)
1618 				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
1619 			if (end != NULL)
1620 				*end = (uint32_t)DB_CKSUMEND(mp);
1621 		}
1622 	}
1623 
1624 	if (flags_ptr != NULL)
1625 		*flags_ptr = flags;
1626 }
1627 
1628 void
mac_hcksum_set(mblk_t * mp,uint32_t start,uint32_t stuff,uint32_t end,uint32_t value,uint32_t flags)1629 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
1630     uint32_t value, uint32_t flags)
1631 {
1632 	ASSERT(DB_TYPE(mp) == M_DATA);
1633 
1634 	DB_CKSUMSTART(mp) = (intptr_t)start;
1635 	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
1636 	DB_CKSUMEND(mp) = (intptr_t)end;
1637 	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
1638 	DB_CKSUM16(mp) = (uint16_t)value;
1639 }
1640 
1641 void
mac_hcksum_clone(const mblk_t * src,mblk_t * dst)1642 mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
1643 {
1644 	ASSERT3U(DB_TYPE(src), ==, M_DATA);
1645 	ASSERT3U(DB_TYPE(dst), ==, M_DATA);
1646 
1647 	/*
1648 	 * Do these assignments unconditionally, rather than only when
1649 	 * flags is non-zero. This protects a situation where zeroed
1650 	 * hcksum data does not make the jump onto an mblk_t with
1651 	 * stale data in those fields. It's important to copy all
1652 	 * possible flags (HCK_* as well as HW_*) and not just the
1653 	 * checksum specific flags. Dropping flags during a clone
1654 	 * could result in dropped packets. If the caller has good
1655 	 * reason to drop those flags then it should do it manually,
1656 	 * after the clone.
1657 	 */
1658 	DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
1659 	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
1660 	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
1661 	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
1662 	DB_CKSUM16(dst) = DB_CKSUM16(src);
1663 	DB_LSOMSS(dst) = DB_LSOMSS(src);
1664 }
1665 
1666 void
mac_lso_get(mblk_t * mp,uint32_t * mss,uint32_t * flags)1667 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1668 {
1669 	ASSERT(DB_TYPE(mp) == M_DATA);
1670 
1671 	if (flags != NULL) {
1672 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1673 		if ((*flags != 0) && (mss != NULL))
1674 			*mss = (uint32_t)DB_LSOMSS(mp);
1675 	}
1676 }
1677 
/*
 * Record whether a transceiver module is physically present.
 */
void
mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
    boolean_t present)
{
	infop->mti_present = present;
}
1684 
/*
 * Record whether the transceiver module is usable by the driver.
 */
void
mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
    boolean_t usable)
{
	infop->mti_usable = usable;
}
1691 
1692 static bool
mac_parse_is_ipv6eh(uint8_t id)1693 mac_parse_is_ipv6eh(uint8_t id)
1694 {
1695 	switch (id) {
1696 	case IPPROTO_HOPOPTS:
1697 	case IPPROTO_ROUTING:
1698 	case IPPROTO_FRAGMENT:
1699 	case IPPROTO_AH:
1700 	case IPPROTO_DSTOPTS:
1701 	case IPPROTO_MH:
1702 	case IPPROTO_HIP:
1703 	case IPPROTO_SHIM6:
1704 		/* Currently known extension headers */
1705 		return (true);
1706 	case IPPROTO_ESP:
1707 		/*
1708 		 * While the IANA protocol numbers listing notes ESP as an IPv6
1709 		 * extension header, we cannot effectively parse it like one.
1710 		 *
1711 		 * For now, mac_ether_offload_info() will report it as the L4
1712 		 * protocol for a parsed packet containing this EH.
1713 		 */
1714 	default:
1715 		return (false);
1716 	}
1717 }
1718 
/*
 * Cursor for reading byte ranges out of an mblk chain without
 * flattening it first.
 */
typedef struct mac_mblk_cursor {
	mblk_t	*mmc_head;	/* first mblk in the chain */
	mblk_t	*mmc_cur;	/* mblk holding the current position */
	size_t	mmc_off_total;	/* offset from the start of the chain */
	size_t	mmc_off_mp;	/* offset into mmc_cur */
} mac_mblk_cursor_t;
1725 
1726 static void mac_mmc_advance(mac_mblk_cursor_t *, size_t);
1727 static void mac_mmc_reset(mac_mblk_cursor_t *);
1728 
/*
 * Initialize `cursor` to walk mblk chain `mp`, positioned at offset 0.
 */
static void
mac_mmc_init(mac_mblk_cursor_t *cursor, mblk_t *mp)
{
	cursor->mmc_head = mp;
	mac_mmc_reset(cursor);
}
1735 
/*
 * Rewind `cursor` to offset 0 of its chain.
 */
static void
mac_mmc_reset(mac_mblk_cursor_t *cursor)
{
	ASSERT(cursor->mmc_head != NULL);

	cursor->mmc_cur = cursor->mmc_head;
	cursor->mmc_off_total = cursor->mmc_off_mp = 0;

	/* Advance past any zero-length mblks at head */
	mac_mmc_advance(cursor, 0);
}
1747 
1748 static inline size_t
mac_mmc_mp_left(const mac_mblk_cursor_t * cursor)1749 mac_mmc_mp_left(const mac_mblk_cursor_t *cursor)
1750 {
1751 	if (cursor->mmc_cur != NULL) {
1752 		const size_t mp_len = MBLKL(cursor->mmc_cur);
1753 
1754 		ASSERT3U(mp_len, >=, cursor->mmc_off_mp);
1755 
1756 		return (mp_len - cursor->mmc_off_mp);
1757 	} else {
1758 		return (0);
1759 	}
1760 }
1761 
/*
 * Pointer to the byte at the cursor's current position.  The caller
 * must ensure the cursor has not run off the end of the chain
 * (mmc_cur != NULL).
 */
static inline uint8_t *
mac_mmc_mp_ptr(const mac_mblk_cursor_t *cursor)
{
	return (cursor->mmc_cur->b_rptr + cursor->mmc_off_mp);
}
1767 
/*
 * Current byte offset of the cursor from the start of the chain.
 */
static inline size_t
mac_mmc_offset(const mac_mblk_cursor_t *cursor)
{
	return (cursor->mmc_off_total);
}
1773 
1774 /*
1775  * Advance cursor forward `len` bytes.
1776  *
1777  * The length to advance must be no greater than the number of bytes remaining
1778  * in the current mblk.  If the position reaches (exactly) the end of the
1779  * current mblk, the cursor will be pushed forward to the next non-zero-length
1780  * mblk in the chain.
1781  */
static inline void
mac_mmc_advance(mac_mblk_cursor_t *cursor, size_t len)
{
	ASSERT(cursor->mmc_cur != NULL);

	const size_t mp_len = MBLKL(cursor->mmc_cur);

	/* Caller must not advance past the end of the current mblk. */
	ASSERT3U(cursor->mmc_off_mp + len, <=, mp_len);

	cursor->mmc_off_total += len;
	cursor->mmc_off_mp += len;

	/* Landed exactly on the end of this mblk: move to the next one. */
	if (cursor->mmc_off_mp == mp_len) {
		cursor->mmc_off_mp = 0;
		cursor->mmc_cur = cursor->mmc_cur->b_cont;
	}

	/* Skip over any 0-length mblks */
	while (cursor->mmc_cur != NULL && MBLKL(cursor->mmc_cur) == 0) {
		cursor->mmc_cur = cursor->mmc_cur->b_cont;
	}
}
1804 
1805 /*
1806  * Attempt to seek to byte offset `off` in mblk chain.
1807  *
1808  * Returns true if the offset is <= the total chain length.
1809  */
static bool
mac_mmc_seek(mac_mblk_cursor_t *cursor, const size_t off)
{
	ASSERT(cursor->mmc_head != NULL);

	if (off == cursor->mmc_off_total) {
		/*
		 * Any prior init, reset, or seek operation will have advanced
		 * past any zero-length mblks, making this short-circuit safe.
		 */
		return (true);
	} else if (off < cursor->mmc_off_total) {
		/* Rewind to beginning if offset precedes current position */
		mac_mmc_reset(cursor);
	}

	/* Walk forward mblk by mblk until we reach the target offset. */
	size_t seek_left = off - cursor->mmc_off_total;
	while (cursor->mmc_cur != NULL) {
		const size_t mp_left = mac_mmc_mp_left(cursor);

		if (mp_left > seek_left) {
			/* Target position is within current mblk */
			cursor->mmc_off_mp += seek_left;
			cursor->mmc_off_total += seek_left;
			return (true);
		}

		/* Move on to the next mblk... */
		mac_mmc_advance(cursor, mp_left);
		seek_left -= mp_left;
	}

	/*
	 * We have reached the end of the mblk chain, but there is a chance that
	 * it corresponds to the target seek position.
	 */
	return (cursor->mmc_off_total == off);
}
1848 
1849 /*
1850  * Attempt to read uint8_t at offset `pos` in mblk chain.
1851  *
1852  * Returns true (and sets value in `out`) if the offset is within the chain.
1853  */
1854 static bool
mac_mmc_get_uint8(mac_mblk_cursor_t * cursor,size_t pos,uint8_t * out)1855 mac_mmc_get_uint8(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out)
1856 {
1857 	if (!mac_mmc_seek(cursor, pos)) {
1858 		return (false);
1859 	}
1860 
1861 	if (mac_mmc_mp_left(cursor) != 0) {
1862 		*out = *(mac_mmc_mp_ptr(cursor));
1863 		mac_mmc_advance(cursor, 1);
1864 		return (true);
1865 	}
1866 
1867 	return (false);
1868 }
1869 
1870 /*
1871  * Attempt to read uint16_t at offset `pos` in mblk chain.  The two
1872  * network-order bytes are converted into a host-order value.
1873  *
1874  * Returns true (and sets value in `out`) if the 16-bit region specified by the
1875  * offset is within the chain.
1876  */
static bool
mac_mmc_get_uint16(mac_mblk_cursor_t *cursor, size_t pos, uint16_t *out)
{
	if (!mac_mmc_seek(cursor, pos)) {
		return (false);
	}

	const size_t mp_left = mac_mmc_mp_left(cursor);
	uint16_t result = 0;

	if (mp_left >= 2) {
		/* Both bytes live in the current mblk. */
		uint8_t *bp = mac_mmc_mp_ptr(cursor);

		result = (uint16_t)bp[0] << 8;
		result |= bp[1];
		mac_mmc_advance(cursor, 2);
		*out = result;
		return (true);
	} else if (mp_left == 1) {
		/* The value straddles an mblk boundary: read byte by byte. */
		result = (uint16_t)*(mac_mmc_mp_ptr(cursor));
		mac_mmc_advance(cursor, 1);

		/* The chain ended between the two bytes. */
		if (mac_mmc_mp_left(cursor) == 0) {
			return (false);
		}

		result = result << 8;
		result |= (uint16_t)*(mac_mmc_mp_ptr(cursor));
		mac_mmc_advance(cursor, 1);
		*out = result;
		return (true);
	}

	return (false);
}
1912 
1913 /*
1914  * Attempt to read `count` bytes at offset `pos` in mblk chain.
1915  *
1916  * Returns true (and copies data to `out`) if `count` length region is available
1917  * at offset within the chain.
1918  */
1919 static bool
mac_mmc_get_bytes(mac_mblk_cursor_t * cursor,size_t pos,uint8_t * out,size_t count)1920 mac_mmc_get_bytes(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out,
1921     size_t count)
1922 {
1923 	if (!mac_mmc_seek(cursor, pos)) {
1924 		return (false);
1925 	}
1926 
1927 	while (count > 0) {
1928 		const size_t mp_left = mac_mmc_mp_left(cursor);
1929 
1930 		if (mp_left == 0) {
1931 			return (false);
1932 		}
1933 		const size_t to_copy = MIN(mp_left, count);
1934 
1935 		bcopy(mac_mmc_mp_ptr(cursor), out, to_copy);
1936 		out += to_copy;
1937 		mac_mmc_advance(cursor, to_copy);
1938 		count -= to_copy;
1939 	}
1940 	return (true);
1941 }
1942 
/*
 * Attempt to parse ethernet header (VLAN or not) from mblk chain.
 *
 * Returns true if header was successfully parsed.  Parsing will begin at
 * current offset of `cursor`.  Any non-NULL arguments for VLAN, SAP, and header
 * size will be populated on success.  A value of MEOI_VLAN_TCI_INVALID will be
 * reported for the TCI if the header does not bear VLAN information.
 */
static bool
mac_mmc_parse_ether(mac_mblk_cursor_t *cursor, uint8_t *dst_addrp,
    uint32_t *vlan_tcip, uint16_t *ethertypep, uint16_t *hdr_sizep)
{
	const size_t l2_off = mac_mmc_offset(cursor);

	/* The destination MAC is only extracted when requested. */
	if (dst_addrp != NULL) {
		if (!mac_mmc_get_bytes(cursor, l2_off, dst_addrp, ETHERADDRL)) {
			return (false);
		}
	}

	uint16_t ethertype = 0;
	if (!mac_mmc_get_uint16(cursor,
	    l2_off + offsetof(struct ether_header, ether_type), &ethertype)) {
		return (false);
	}

	uint32_t tci = MEOI_VLAN_TCI_INVALID;
	uint16_t hdrsize = sizeof (struct ether_header);

	if (ethertype == ETHERTYPE_VLAN) {
		/* VLAN-tagged: read the TCI and the inner ethertype. */
		uint16_t tci_val;

		if (!mac_mmc_get_uint16(cursor,
		    l2_off + offsetof(struct ether_vlan_header, ether_tci),
		    &tci_val)) {
			return (false);
		}
		if (!mac_mmc_get_uint16(cursor,
		    l2_off + offsetof(struct ether_vlan_header, ether_type),
		    &ethertype)) {
			return (false);
		}
		hdrsize = sizeof (struct ether_vlan_header);
		tci = (uint32_t)tci_val;
	}

	if (vlan_tcip != NULL) {
		*vlan_tcip = tci;
	}
	if (ethertypep != NULL) {
		*ethertypep = ethertype;
	}
	if (hdr_sizep != NULL) {
		*hdr_sizep = hdrsize;
	}
	return (true);
}
1999 }
2000 
2001 /*
2002  * Attempt to parse L3 protocol header from mblk chain.
2003  *
2004  * The SAP/ethertype of the containing header must be specified by the caller.
2005  *
2006  * Returns true if header was successfully parsed.  Parsing will begin at
2007  * current offset of `cursor`.  Any non-NULL arguments for IP protocol and
2008  * header size will be populated on success.
2009  */
2010 static bool
mac_mmc_parse_l3(mac_mblk_cursor_t * cursor,uint16_t l3_sap,uint8_t * ipprotop,bool * is_fragp,uint16_t * hdr_sizep)2011 mac_mmc_parse_l3(mac_mblk_cursor_t *cursor, uint16_t l3_sap, uint8_t *ipprotop,
2012     bool *is_fragp, uint16_t *hdr_sizep)
2013 {
2014 	const size_t l3_off = mac_mmc_offset(cursor);
2015 
2016 	if (l3_sap == ETHERTYPE_IP) {
2017 		uint8_t verlen, ipproto;
2018 		uint16_t frag_off;
2019 
2020 		if (!mac_mmc_get_uint8(cursor, l3_off, &verlen)) {
2021 			return (false);
2022 		}
2023 		verlen &= 0x0f;
2024 		if (verlen < 5 || verlen > 0x0f) {
2025 			return (false);
2026 		}
2027 
2028 		if (!mac_mmc_get_uint16(cursor,
2029 		    l3_off + offsetof(ipha_t, ipha_fragment_offset_and_flags),
2030 		    &frag_off)) {
2031 			return (false);
2032 		}
2033 
2034 		if (!mac_mmc_get_uint8(cursor,
2035 		    l3_off + offsetof(ipha_t, ipha_protocol), &ipproto)) {
2036 			return (false);
2037 		}
2038 
2039 		if (ipprotop != NULL) {
2040 			*ipprotop = ipproto;
2041 		}
2042 		if (is_fragp != NULL) {
2043 			*is_fragp = ((frag_off & (IPH_MF | IPH_OFFSET)) != 0);
2044 		}
2045 		if (hdr_sizep != NULL) {
2046 			*hdr_sizep = verlen * 4;
2047 		}
2048 		return (true);
2049 	}
2050 	if (l3_sap == ETHERTYPE_IPV6) {
2051 		uint16_t ip_len = sizeof (ip6_t);
2052 		uint8_t ipproto;
2053 		bool found_frag_eh = false;
2054 
2055 		if (!mac_mmc_get_uint8(cursor,
2056 		    l3_off + offsetof(ip6_t, ip6_nxt), &ipproto)) {
2057 			return (false);
2058 		}
2059 
2060 		/* Chase any extension headers present in packet */
2061 		while (mac_parse_is_ipv6eh(ipproto)) {
2062 			uint8_t len_val, next_hdr;
2063 			uint16_t eh_len;
2064 
2065 			const size_t hdr_off = l3_off + ip_len;
2066 			if (!mac_mmc_get_uint8(cursor, hdr_off, &next_hdr)) {
2067 				return (false);
2068 			}
2069 
2070 			if (ipproto == IPPROTO_FRAGMENT) {
2071 				/*
2072 				 * The Fragment extension header bears a
2073 				 * predefined fixed length, rather than
2074 				 * communicating it through the EH itself.
2075 				 */
2076 				eh_len = 8;
2077 				found_frag_eh = true;
2078 			} else if (ipproto == IPPROTO_AH) {
2079 				/*
2080 				 * The length of the IP Authentication EH is
2081 				 * stored as (n + 2) * 32-bits, where 'n' is the
2082 				 * recorded EH length field
2083 				 */
2084 				if (!mac_mmc_get_uint8(cursor, hdr_off + 1,
2085 				    &len_val)) {
2086 					return (false);
2087 				}
2088 				eh_len = ((uint16_t)len_val + 2) * 4;
2089 			} else {
2090 				/*
2091 				 * All other EHs should follow the sizing
2092 				 * formula of (n + 1) * 64-bits, where 'n' is
2093 				 * the recorded EH length field.
2094 				 */
2095 				if (!mac_mmc_get_uint8(cursor, hdr_off + 1,
2096 				    &len_val)) {
2097 					return (false);
2098 				}
2099 				eh_len = ((uint16_t)len_val + 1) * 8;
2100 			}
2101 			/*
2102 			 * Protect against overflow in the case of a very
2103 			 * contrived packet.
2104 			 */
2105 			if ((ip_len + eh_len) < ip_len) {
2106 				return (-1);
2107 			}
2108 
2109 			ipproto = next_hdr;
2110 			ip_len += eh_len;
2111 		}
2112 
2113 		if (ipprotop != NULL) {
2114 			*ipprotop = ipproto;
2115 		}
2116 		if (is_fragp != NULL) {
2117 			*is_fragp = found_frag_eh;
2118 		}
2119 		if (hdr_sizep != NULL) {
2120 			*hdr_sizep = ip_len;
2121 		}
2122 		return (true);
2123 	}
2124 
2125 	return (false);
2126 }
2127 
2128 /*
2129  * Attempt to parse L4 protocol header from mblk chain.
2130  *
2131  * The IP protocol of the containing header must be specified by the caller.
2132  *
2133  * Returns true if header was successfully parsed.  Parsing will begin at
2134  * current offset of `cursor`.  A non-NULL argument for header size will be
2135  * populated on success.
2136  */
2137 static bool
mac_mmc_parse_l4(mac_mblk_cursor_t * cursor,uint8_t ipproto,uint8_t * hdr_sizep)2138 mac_mmc_parse_l4(mac_mblk_cursor_t *cursor, uint8_t ipproto, uint8_t *hdr_sizep)
2139 {
2140 	ASSERT(hdr_sizep != NULL);
2141 
2142 	const size_t l4_off = mac_mmc_offset(cursor);
2143 	uint8_t tcp_doff;
2144 
2145 	switch (ipproto) {
2146 	case IPPROTO_TCP:
2147 		if (!mac_mmc_get_uint8(cursor,
2148 		    l4_off + offsetof(tcph_t, th_offset_and_rsrvd),
2149 		    &tcp_doff)) {
2150 			return (false);
2151 		}
2152 		tcp_doff = (tcp_doff & 0xf0) >> 4;
2153 		if (tcp_doff < 5 || tcp_doff > 0xf) {
2154 			return (false);
2155 		}
2156 		*hdr_sizep = tcp_doff * 4;
2157 		return (true);
2158 	case IPPROTO_UDP:
2159 		*hdr_sizep = sizeof (struct udphdr);
2160 		return (true);
2161 	case IPPROTO_SCTP:
2162 		*hdr_sizep = sizeof (sctp_hdr_t);
2163 		return (true);
2164 	default:
2165 		return (false);
2166 	}
2167 }
2168 
2169 /*
2170  * Parse destination MAC address and VLAN TCI (if any) from mblk chain.
2171  *
2172  * If packet ethertype does not indicate that a VLAN is present,
2173  * MEOI_VLAN_TCI_INVALID will be returned for the TCI.
2174  */
2175 int
mac_ether_l2_info(mblk_t * mp,uint8_t * dst_addrp,uint32_t * vlan_tcip)2176 mac_ether_l2_info(mblk_t *mp, uint8_t *dst_addrp, uint32_t *vlan_tcip)
2177 {
2178 	mac_mblk_cursor_t cursor;
2179 
2180 	mac_mmc_init(&cursor, mp);
2181 	if (!mac_mmc_parse_ether(&cursor, dst_addrp, vlan_tcip, NULL, NULL)) {
2182 		return (-1);
2183 	}
2184 
2185 	return (0);
2186 }
2187 
2188 /*
2189  * Perform a partial parsing of offload info from a frame and/or packet.
2190  *
2191  * Beginning at the provided byte offset (`off`) in the mblk, attempt to parse
2192  * any offload info which has not yet been populated in `meoi`.  The contents of
2193  * `meoi_flags` upon entry will be considered as "already parsed", their
2194  * corresponding data fields will be considered valid.
2195  *
2196  * A motivating example: A non-Ethernet packet could be parsed for L3/L4 offload
2197  * information by setting MEOI_L2INFO_SET in `meoi_flags`, and the L3 SAP in
2198  * `meoi_l3_proto`. With a value in `meoi_l2hlen` that, when combined with the
2199  * provided `off`, will direct the parser to the start of the L3 header in the
2200  * mblk, the rest of the logic will be free to run.
2201  *
2202  * Alternatively, this could be used to parse the headers in an encapsulated
2203  * Ethernet packet by simply specifying the start of its header in `off`.
2204  *
2205  * Returns 0 if parsing was able to proceed all the way through the L4 header.
2206  * The meoi_flags field will be updated regardless for any partial (L2/L3)
2207  * parsing which was successful.
2208  */
2209 int
mac_partial_offload_info(mblk_t * mp,size_t off,mac_ether_offload_info_t * meoi)2210 mac_partial_offload_info(mblk_t *mp, size_t off, mac_ether_offload_info_t *meoi)
2211 {
2212 	mac_mblk_cursor_t cursor;
2213 
2214 	mac_mmc_init(&cursor, mp);
2215 
2216 	if (!mac_mmc_seek(&cursor, off)) {
2217 		return (-1);
2218 	}
2219 
2220 	if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
2221 		uint32_t vlan_tci;
2222 		uint16_t l2_sz, ethertype;
2223 		if (!mac_mmc_parse_ether(&cursor, NULL, &vlan_tci, &ethertype,
2224 		    &l2_sz)) {
2225 			return (-1);
2226 		}
2227 
2228 		meoi->meoi_flags |= MEOI_L2INFO_SET;
2229 		meoi->meoi_l2hlen = l2_sz;
2230 		meoi->meoi_l3proto = ethertype;
2231 		if (vlan_tci != MEOI_VLAN_TCI_INVALID) {
2232 			ASSERT3U(meoi->meoi_l2hlen, ==,
2233 			    sizeof (struct ether_vlan_header));
2234 			meoi->meoi_flags |= MEOI_VLAN_TAGGED;
2235 		}
2236 	}
2237 	const size_t l2_end = off + (size_t)meoi->meoi_l2hlen;
2238 	if (!mac_mmc_seek(&cursor, l2_end)) {
2239 		meoi->meoi_flags &= ~MEOI_L2INFO_SET;
2240 		return (-1);
2241 	}
2242 
2243 	if ((meoi->meoi_flags & MEOI_L3INFO_SET) == 0) {
2244 		uint8_t ipproto;
2245 		uint16_t l3_sz;
2246 		bool is_frag;
2247 		if (!mac_mmc_parse_l3(&cursor, meoi->meoi_l3proto, &ipproto,
2248 		    &is_frag, &l3_sz)) {
2249 			return (-1);
2250 		}
2251 
2252 		meoi->meoi_l3hlen = l3_sz;
2253 		meoi->meoi_l4proto = ipproto;
2254 		meoi->meoi_flags |= MEOI_L3INFO_SET;
2255 		if (is_frag) {
2256 			meoi->meoi_flags |= MEOI_L3_FRAGMENT;
2257 		}
2258 	}
2259 	const size_t l3_end = l2_end + (size_t)meoi->meoi_l3hlen;
2260 	if (!mac_mmc_seek(&cursor, l3_end)) {
2261 		meoi->meoi_flags &= ~MEOI_L3INFO_SET;
2262 		return (-1);
2263 	}
2264 
2265 	if ((meoi->meoi_flags & MEOI_L4INFO_SET) == 0) {
2266 		uint8_t l4_sz;
2267 		if (!mac_mmc_parse_l4(&cursor, meoi->meoi_l4proto, &l4_sz)) {
2268 			return (-1);
2269 		}
2270 
2271 		meoi->meoi_l4hlen = l4_sz;
2272 		meoi->meoi_flags |= MEOI_L4INFO_SET;
2273 	}
2274 	const size_t l4_end = l3_end + (size_t)meoi->meoi_l4hlen;
2275 	if (!mac_mmc_seek(&cursor, l4_end)) {
2276 		meoi->meoi_flags &= ~MEOI_L4INFO_SET;
2277 		return (-1);
2278 	}
2279 
2280 	return (0);
2281 }
2282 
2283 int
mac_ether_offload_info(mblk_t * mp,mac_ether_offload_info_t * meoi)2284 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
2285 {
2286 	bzero(meoi, sizeof (mac_ether_offload_info_t));
2287 	meoi->meoi_len = msgdsize(mp);
2288 
2289 	return (mac_partial_offload_info(mp, 0, meoi));
2290 }
2291