xref: /illumos-gate/usr/src/uts/common/io/mac/mac_provider.c (revision 1f8b8a0145321ca42ee324565958ceb82a14ee7a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
26  * Copyright 2020 RackTop Systems, Inc.
27  * Copyright 2025 Oxide Computer Company
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/id_space.h>
33 #include <sys/esunddi.h>
34 #include <sys/stat.h>
35 #include <sys/mkdev.h>
36 #include <sys/stream.h>
37 #include <sys/strsubr.h>
38 #include <sys/dlpi.h>
39 #include <sys/modhash.h>
40 #include <sys/mac.h>
41 #include <sys/mac_provider.h>
42 #include <sys/mac_impl.h>
43 #include <sys/mac_client_impl.h>
44 #include <sys/mac_client_priv.h>
45 #include <sys/mac_soft_ring.h>
46 #include <sys/mac_stat.h>
47 #include <sys/dld.h>
48 #include <sys/modctl.h>
49 #include <sys/fs/dv_node.h>
50 #include <sys/thread.h>
51 #include <sys/proc.h>
52 #include <sys/callb.h>
53 #include <sys/cpuvar.h>
54 #include <sys/atomic.h>
55 #include <sys/sdt.h>
56 #include <sys/mac_flow.h>
57 #include <sys/ddi_intr_impl.h>
58 #include <sys/disp.h>
59 #include <sys/sdt.h>
60 #include <sys/stdbool.h>
61 #include <sys/pattr.h>
62 #include <sys/strsun.h>
63 #include <sys/vlan.h>
64 #include <inet/ip.h>
65 #include <inet/tcp.h>
66 #include <netinet/udp.h>
67 #include <netinet/sctp.h>
68 #include <netinet/ip_icmp.h>
69 #include <netinet/icmp6.h>
70 
71 /*
72  * MAC Provider Interface.
73  *
74  * Interface for GLDv3 compatible NIC drivers.
75  */
76 
77 static void i_mac_notify_thread(void *);
78 
79 typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);
80 
81 static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
82 	mac_fanout_recompute,	/* MAC_NOTE_LINK */
83 	NULL,		/* MAC_NOTE_UNICST */
84 	NULL,		/* MAC_NOTE_TX */
85 	NULL,		/* MAC_NOTE_DEVPROMISC */
86 	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
87 	NULL,		/* MAC_NOTE_SDU_SIZE */
88 	NULL,		/* MAC_NOTE_MARGIN */
89 	NULL,		/* MAC_NOTE_CAPAB_CHG */
90 	NULL		/* MAC_NOTE_LOWLINK */
91 };
92 
93 /*
94  * Driver support functions.
95  */
96 
97 /* REGISTRATION */
98 
99 mac_register_t *
mac_alloc(uint_t mac_version)100 mac_alloc(uint_t mac_version)
101 {
102 	mac_register_t *mregp;
103 
104 	/*
105 	 * Make sure there isn't a version mismatch between the driver and
106 	 * the framework.  In the future, if multiple versions are
107 	 * supported, this check could become more sophisticated.
108 	 */
109 	if (mac_version != MAC_VERSION)
110 		return (NULL);
111 
112 	mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
113 	mregp->m_version = mac_version;
114 	return (mregp);
115 }
116 
117 void
mac_free(mac_register_t * mregp)118 mac_free(mac_register_t *mregp)
119 {
120 	kmem_free(mregp, sizeof (mac_register_t));
121 }
122 
123 /*
124  * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
125  * value.
126  */
127 static uint16_t
mac_features_to_flags(mac_handle_t mh)128 mac_features_to_flags(mac_handle_t mh)
129 {
130 	uint16_t flags = 0;
131 	uint32_t cap_sum = 0;
132 	mac_capab_lso_t cap_lso;
133 
134 	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
135 		if (cap_sum & HCKSUM_IPHDRCKSUM)
136 			flags |= HCK_IPV4_HDRCKSUM;
137 
138 		if (cap_sum & HCKSUM_INET_PARTIAL)
139 			flags |= HCK_PARTIALCKSUM;
140 		else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
141 			flags |= HCK_FULLCKSUM;
142 	}
143 
144 	/*
145 	 * We don't need the information stored in 'cap_lso', but we
146 	 * need to pass a non-NULL pointer to appease the driver.
147 	 */
148 	if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
149 		flags |= HW_LSO;
150 
151 	return (flags);
152 }
153 
154 /*
155  * mac_register() is how drivers register new MACs with the GLDv3
156  * framework.  The mregp argument is allocated by drivers using the
157  * mac_alloc() function, and can be freed using mac_free() immediately upon
158  * return from mac_register().  Upon success (0 return value), the mhp
159  * opaque pointer becomes the driver's handle to its MAC interface, and is
160  * the argument to all other mac module entry points.
161  */
162 /* ARGSUSED */
163 int
mac_register(mac_register_t * mregp,mac_handle_t * mhp)164 mac_register(mac_register_t *mregp, mac_handle_t *mhp)
165 {
166 	mac_impl_t		*mip;
167 	mactype_t		*mtype;
168 	int			err = EINVAL;
169 	struct devnames		*dnp = NULL;
170 	uint_t			instance;
171 	boolean_t		style1_created = B_FALSE;
172 	boolean_t		style2_created = B_FALSE;
173 	char			*driver;
174 	minor_t			minor = 0;
175 
176 	/* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */
177 	if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip)))
178 		return (EINVAL);
179 
180 	/* Find the required MAC-Type plugin. */
181 	if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL)
182 		return (EINVAL);
183 
184 	/* Create a mac_impl_t to represent this MAC. */
185 	mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
186 
187 	/*
188 	 * The mac is not ready for open yet.
189 	 */
190 	mip->mi_state_flags |= MIS_DISABLED;
191 
192 	/*
193 	 * When a mac is registered, the m_instance field can be set to:
194 	 *
195 	 *  0:	Get the mac's instance number from m_dip.
196 	 *	This is usually used for physical device dips.
197 	 *
198 	 *  [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
199 	 *	For example, when an aggregation is created with the key option,
200 	 *	"key" will be used as the instance number.
201 	 *
202 	 *  -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
203 	 *	This is often used when a MAC of a virtual link is registered
204 	 *	(e.g., aggregation when "key" is not specified, or vnic).
205 	 *
206 	 * Note that the instance number is used to derive the mi_minor field
207 	 * of mac_impl_t, which will then be used to derive the name of kstats
208 	 * and the devfs nodes.  The first 2 cases are needed to preserve
209 	 * backward compatibility.
210 	 */
211 	switch (mregp->m_instance) {
212 	case 0:
213 		instance = ddi_get_instance(mregp->m_dip);
214 		break;
215 	case ((uint_t)-1):
216 		minor = mac_minor_hold(B_TRUE);
217 		if (minor == 0) {
218 			err = ENOSPC;
219 			goto fail;
220 		}
221 		instance = minor - 1;
222 		break;
223 	default:
224 		instance = mregp->m_instance;
225 		if (instance >= MAC_MAX_MINOR) {
226 			err = EINVAL;
227 			goto fail;
228 		}
229 		break;
230 	}
231 
232 	mip->mi_minor = (minor_t)(instance + 1);
233 	mip->mi_dip = mregp->m_dip;
234 	mip->mi_clients_list = NULL;
235 	mip->mi_nclients = 0;
236 
237 	/* Set the default IEEE Port VLAN Identifier */
238 	mip->mi_pvid = 1;
239 
240 	/* Default bridge link learning protection values */
241 	mip->mi_llimit = 1000;
242 	mip->mi_ldecay = 200;
243 
244 	driver = (char *)ddi_driver_name(mip->mi_dip);
245 
246 	/* Construct the MAC name as <drvname><instance> */
247 	(void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d",
248 	    driver, instance);
249 
250 	mip->mi_driver = mregp->m_driver;
251 
252 	mip->mi_type = mtype;
253 	mip->mi_margin = mregp->m_margin;
254 	mip->mi_info.mi_media = mtype->mt_type;
255 	mip->mi_info.mi_nativemedia = mtype->mt_nativetype;
256 	if (mregp->m_max_sdu <= mregp->m_min_sdu)
257 		goto fail;
258 	if (mregp->m_multicast_sdu == 0)
259 		mregp->m_multicast_sdu = mregp->m_max_sdu;
260 	if (mregp->m_multicast_sdu < mregp->m_min_sdu ||
261 	    mregp->m_multicast_sdu > mregp->m_max_sdu)
262 		goto fail;
263 	mip->mi_sdu_min = mregp->m_min_sdu;
264 	mip->mi_sdu_max = mregp->m_max_sdu;
265 	mip->mi_sdu_multicast = mregp->m_multicast_sdu;
266 	mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length;
267 	/*
268 	 * If the media supports a broadcast address, cache a pointer to it
269 	 * in the mac_info_t so that upper layers can use it.
270 	 */
271 	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
272 
273 	mip->mi_v12n_level = mregp->m_v12n;
274 
275 	/*
276 	 * Copy the unicast source address into the mac_info_t, but only if
277 	 * the MAC-Type defines a non-zero address length.  We need to
278 	 * handle MAC-Types that have an address length of 0
279 	 * (point-to-point protocol MACs for example).
280 	 */
281 	if (mip->mi_type->mt_addr_length > 0) {
282 		if (mregp->m_src_addr == NULL)
283 			goto fail;
284 		mip->mi_info.mi_unicst_addr =
285 		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
286 		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
287 		    mip->mi_type->mt_addr_length);
288 
289 		/*
290 		 * Copy the fixed 'factory' MAC address from the immutable
291 		 * info.  This is taken to be the MAC address currently in
292 		 * use.
293 		 */
294 		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
295 		    mip->mi_type->mt_addr_length);
296 
297 		/*
298 		 * At this point, we should set up the classification
299 		 * rules etc but we delay it till mac_open() so that
300 		 * the resource discovery has taken place and we
301 		 * know someone wants to use the device. Otherwise
302 		 * memory gets allocated for Rx ring structures even
303 		 * during probe.
304 		 */
305 
306 		/* Copy the destination address if one is provided. */
307 		if (mregp->m_dst_addr != NULL) {
308 			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
309 			    mip->mi_type->mt_addr_length);
310 			mip->mi_dstaddr_set = B_TRUE;
311 		}
312 	} else if (mregp->m_src_addr != NULL) {
313 		goto fail;
314 	}
315 
316 	/*
317 	 * The format of the m_pdata is specific to the plugin.  It is
318 	 * passed in as an argument to all of the plugin callbacks.  The
319 	 * driver can update this information by calling
320 	 * mac_pdata_update().
321 	 */
322 	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
323 		/*
324 		 * Verify if the supplied plugin data is valid.  Note that
325 		 * even if the caller passed in a NULL pointer as plugin data,
326 		 * we still need to verify if that's valid as the plugin may
327 		 * require plugin data to function.
328 		 */
329 		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
330 		    mregp->m_pdata_size)) {
331 			goto fail;
332 		}
333 		if (mregp->m_pdata != NULL) {
334 			mip->mi_pdata =
335 			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
336 			bcopy(mregp->m_pdata, mip->mi_pdata,
337 			    mregp->m_pdata_size);
338 			mip->mi_pdata_size = mregp->m_pdata_size;
339 		}
340 	} else if (mregp->m_pdata != NULL) {
341 		/*
342 		 * The caller supplied non-NULL plugin data, but the plugin
343 		 * does not recognize plugin data.
344 		 */
345 		err = EINVAL;
346 		goto fail;
347 	}
348 
349 	/*
350 	 * Register the private properties.
351 	 */
352 	mac_register_priv_prop(mip, mregp->m_priv_props);
353 
354 	/*
355 	 * Stash the driver callbacks into the mac_impl_t, but first sanity
356 	 * check to make sure all mandatory callbacks are set.
357 	 */
358 	if (mregp->m_callbacks->mc_getstat == NULL ||
359 	    mregp->m_callbacks->mc_start == NULL ||
360 	    mregp->m_callbacks->mc_stop == NULL ||
361 	    mregp->m_callbacks->mc_setpromisc == NULL ||
362 	    mregp->m_callbacks->mc_multicst == NULL) {
363 		goto fail;
364 	}
365 	mip->mi_callbacks = mregp->m_callbacks;
366 
367 	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
368 	    &mip->mi_capab_legacy)) {
369 		mip->mi_state_flags |= MIS_LEGACY;
370 		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
371 	} else {
372 		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
373 		    mip->mi_minor);
374 	}
375 
376 	/*
377 	 * Allocate a notification thread. thread_create blocks for memory
378 	 * if needed, it never fails.
379 	 */
380 	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
381 	    mip, 0, &p0, TS_RUN, minclsyspri);
382 
383 	/*
384 	 * Cache the DB_CKSUMFLAGS that this MAC supports.
385 	 */
386 	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
387 
388 	/*
389 	 * Initialize the capabilities
390 	 */
391 	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
392 	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
393 
394 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
395 		mip->mi_state_flags |= MIS_IS_VNIC;
396 
397 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
398 		mip->mi_state_flags |= MIS_IS_AGGR;
399 
400 	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
401 		mip->mi_state_flags |= MIS_IS_OVERLAY;
402 
403 	mac_addr_factory_init(mip);
404 
405 	mac_transceiver_init(mip);
406 
407 	mac_led_init(mip);
408 
409 	/*
410 	 * Enforce the virtrualization level registered.
411 	 */
412 	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
413 		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
414 		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
415 			goto fail;
416 
417 		/*
418 		 * The driver needs to register at least rx rings for this
419 		 * virtualization level.
420 		 */
421 		if (mip->mi_rx_groups == NULL)
422 			goto fail;
423 	}
424 
425 	/*
426 	 * The driver must set mc_unicst entry point to NULL when it advertises
427 	 * CAP_RINGS for rx groups.
428 	 */
429 	if (mip->mi_rx_groups != NULL) {
430 		if (mregp->m_callbacks->mc_unicst != NULL)
431 			goto fail;
432 	} else {
433 		if (mregp->m_callbacks->mc_unicst == NULL)
434 			goto fail;
435 	}
436 
437 	/*
438 	 * Initialize MAC addresses. Must be called after mac_init_rings().
439 	 */
440 	mac_init_macaddr(mip);
441 
442 	mip->mi_share_capab.ms_snum = 0;
443 	if (mip->mi_v12n_level & MAC_VIRT_HIO) {
444 		(void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES,
445 		    &mip->mi_share_capab);
446 	}
447 
448 	/*
449 	 * Initialize the kstats for this device.
450 	 */
451 	mac_driver_stat_create(mip);
452 
453 	/* Zero out any properties. */
454 	bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t));
455 
456 	if (mip->mi_minor <= MAC_MAX_MINOR) {
457 		/* Create a style-2 DLPI device */
458 		if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0,
459 		    DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS)
460 			goto fail;
461 		style2_created = B_TRUE;
462 
463 		/* Create a style-1 DLPI device */
464 		if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR,
465 		    mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS)
466 			goto fail;
467 		style1_created = B_TRUE;
468 	}
469 
470 	mac_flow_l2tab_create(mip, &mip->mi_flow_tab);
471 
472 	rw_enter(&i_mac_impl_lock, RW_WRITER);
473 	if (mod_hash_insert(i_mac_impl_hash,
474 	    (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) {
475 		rw_exit(&i_mac_impl_lock);
476 		err = EEXIST;
477 		goto fail;
478 	}
479 
480 	DTRACE_PROBE2(mac__register, struct devnames *, dnp,
481 	    (mac_impl_t *), mip);
482 
483 	/*
484 	 * Mark the MAC to be ready for open.
485 	 */
486 	mip->mi_state_flags &= ~MIS_DISABLED;
487 	rw_exit(&i_mac_impl_lock);
488 
489 	atomic_inc_32(&i_mac_impl_count);
490 
491 	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
492 	*mhp = (mac_handle_t)mip;
493 	return (0);
494 
495 fail:
496 	if (style1_created)
497 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
498 
499 	if (style2_created)
500 		ddi_remove_minor_node(mip->mi_dip, driver);
501 
502 	mac_addr_factory_fini(mip);
503 
504 	/* Clean up registered MAC addresses */
505 	mac_fini_macaddr(mip);
506 
507 	/* Clean up registered rings */
508 	mac_free_rings(mip, MAC_RING_TYPE_RX);
509 	mac_free_rings(mip, MAC_RING_TYPE_TX);
510 
511 	/* Clean up notification thread */
512 	if (mip->mi_notify_thread != NULL)
513 		i_mac_notify_exit(mip);
514 
515 	if (mip->mi_info.mi_unicst_addr != NULL) {
516 		kmem_free(mip->mi_info.mi_unicst_addr,
517 		    mip->mi_type->mt_addr_length);
518 		mip->mi_info.mi_unicst_addr = NULL;
519 	}
520 
521 	mac_driver_stat_delete(mip);
522 
523 	if (mip->mi_type != NULL) {
524 		atomic_dec_32(&mip->mi_type->mt_ref);
525 		mip->mi_type = NULL;
526 	}
527 
528 	if (mip->mi_pdata != NULL) {
529 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
530 		mip->mi_pdata = NULL;
531 		mip->mi_pdata_size = 0;
532 	}
533 
534 	if (minor != 0) {
535 		ASSERT(minor > MAC_MAX_MINOR);
536 		mac_minor_rele(minor);
537 	}
538 
539 	mip->mi_state_flags = 0;
540 	mac_unregister_priv_prop(mip);
541 
542 	/*
543 	 * Clear the state before destroying the mac_impl_t
544 	 */
545 	mip->mi_state_flags = 0;
546 
547 	kmem_cache_free(i_mac_impl_cachep, mip);
548 	return (err);
549 }
550 
551 /*
552  * Unregister from the GLDv3 framework
553  */
554 int
mac_unregister(mac_handle_t mh)555 mac_unregister(mac_handle_t mh)
556 {
557 	int			err;
558 	mac_impl_t		*mip = (mac_impl_t *)mh;
559 	mod_hash_val_t		val;
560 	mac_margin_req_t	*mmr, *nextmmr;
561 
562 	/* Fail the unregister if there are any open references to this mac. */
563 	if ((err = mac_disable_nowait(mh)) != 0)
564 		return (err);
565 
566 	/*
567 	 * Clean up notification thread and wait for it to exit.
568 	 */
569 	i_mac_notify_exit(mip);
570 
571 	/*
572 	 * Prior to acquiring the MAC perimeter, remove the MAC instance from
573 	 * the internal hash table. Such removal means table-walkers that
574 	 * acquire the perimeter will not do so on behalf of what we are
575 	 * unregistering, which prevents a deadlock.
576 	 */
577 	rw_enter(&i_mac_impl_lock, RW_WRITER);
578 	(void) mod_hash_remove(i_mac_impl_hash,
579 	    (mod_hash_key_t)mip->mi_name, &val);
580 	rw_exit(&i_mac_impl_lock);
581 	ASSERT(mip == (mac_impl_t *)val);
582 
583 	i_mac_perim_enter(mip);
584 
585 	/*
586 	 * There is still resource properties configured over this mac.
587 	 */
588 	if (mip->mi_resource_props.mrp_mask != 0)
589 		mac_fastpath_enable((mac_handle_t)mip);
590 
591 	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
592 		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
593 		ddi_remove_minor_node(mip->mi_dip,
594 		    (char *)ddi_driver_name(mip->mi_dip));
595 	}
596 
597 	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
598 	    MIS_EXCLUSIVE));
599 
600 	mac_driver_stat_delete(mip);
601 
602 	ASSERT(i_mac_impl_count > 0);
603 	atomic_dec_32(&i_mac_impl_count);
604 
605 	if (mip->mi_pdata != NULL)
606 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
607 	mip->mi_pdata = NULL;
608 	mip->mi_pdata_size = 0;
609 
610 	/*
611 	 * Free the list of margin request.
612 	 */
613 	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
614 		nextmmr = mmr->mmr_nextp;
615 		kmem_free(mmr, sizeof (mac_margin_req_t));
616 	}
617 	mip->mi_mmrp = NULL;
618 
619 	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
620 	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
621 	mip->mi_info.mi_unicst_addr = NULL;
622 
623 	atomic_dec_32(&mip->mi_type->mt_ref);
624 	mip->mi_type = NULL;
625 
626 	/*
627 	 * Free the primary MAC address.
628 	 */
629 	mac_fini_macaddr(mip);
630 
631 	/*
632 	 * free all rings
633 	 */
634 	mac_free_rings(mip, MAC_RING_TYPE_RX);
635 	mac_free_rings(mip, MAC_RING_TYPE_TX);
636 
637 	mac_addr_factory_fini(mip);
638 
639 	bzero(mip->mi_addr, MAXMACADDRLEN);
640 	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
641 	mip->mi_dstaddr_set = B_FALSE;
642 
643 	/* and the flows */
644 	mac_flow_tab_destroy(mip->mi_flow_tab);
645 	mip->mi_flow_tab = NULL;
646 
647 	if (mip->mi_minor > MAC_MAX_MINOR)
648 		mac_minor_rele(mip->mi_minor);
649 
650 	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
651 
652 	/*
653 	 * Reset the perim related fields to default values before
654 	 * kmem_cache_free
655 	 */
656 	i_mac_perim_exit(mip);
657 	mip->mi_state_flags = 0;
658 
659 	mac_unregister_priv_prop(mip);
660 
661 	ASSERT(mip->mi_bridge_link == NULL);
662 	kmem_cache_free(i_mac_impl_cachep, mip);
663 
664 	return (0);
665 }
666 
667 /* DATA RECEPTION */
668 
669 /*
670  * This function is invoked for packets received by the MAC driver in
671  * interrupt context. The ring generation number provided by the driver
672  * is matched with the ring generation number held in MAC. If they do not
673  * match, received packets are considered stale packets coming from an older
674  * assignment of the ring. Drop them.
675  */
676 void
mac_rx_ring(mac_handle_t mh,mac_ring_handle_t mrh,mblk_t * mp_chain,uint64_t mr_gen_num)677 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
678     uint64_t mr_gen_num)
679 {
680 	mac_ring_t		*mr = (mac_ring_t *)mrh;
681 
682 	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
683 		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
684 		    mr->mr_gen_num, uint64_t, mr_gen_num);
685 		freemsgchain(mp_chain);
686 		return;
687 	}
688 	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
689 }
690 
691 /*
692  * This function is invoked for each packet received by the underlying driver.
693  */
694 void
mac_rx(mac_handle_t mh,mac_resource_handle_t mrh,mblk_t * mp_chain)695 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
696 {
697 	mac_impl_t *mip = (mac_impl_t *)mh;
698 
699 	/*
700 	 * Check if the link is part of a bridge.  If not, then we don't need
701 	 * to take the lock to remain consistent.  Make this common case
702 	 * lock-free and tail-call optimized.
703 	 */
704 	if (mip->mi_bridge_link == NULL) {
705 		mac_rx_common(mh, mrh, mp_chain);
706 	} else {
707 		/*
708 		 * Once we take a reference on the bridge link, the bridge
709 		 * module itself can't unload, so the callback pointers are
710 		 * stable.
711 		 */
712 		mutex_enter(&mip->mi_bridge_lock);
713 		if ((mh = mip->mi_bridge_link) != NULL)
714 			mac_bridge_ref_cb(mh, B_TRUE);
715 		mutex_exit(&mip->mi_bridge_lock);
716 		if (mh == NULL) {
717 			mac_rx_common((mac_handle_t)mip, mrh, mp_chain);
718 		} else {
719 			mac_bridge_rx_cb(mh, mrh, mp_chain);
720 			mac_bridge_ref_cb(mh, B_FALSE);
721 		}
722 	}
723 }
724 
725 /*
726  * Special case function: this allows snooping of packets transmitted and
727  * received by TRILL. By design, they go directly into the TRILL module.
728  */
729 void
mac_trill_snoop(mac_handle_t mh,mblk_t * mp)730 mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
731 {
732 	mac_impl_t *mip = (mac_impl_t *)mh;
733 
734 	if (mip->mi_promisc_list != NULL)
735 		mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
736 }
737 
738 /*
739  * This is the upward reentry point for packets arriving from the bridging
740  * module and from mac_rx for links not part of a bridge.
741  */
742 void
mac_rx_common(mac_handle_t mh,mac_resource_handle_t mrh,mblk_t * mp_chain)743 mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
744 {
745 	mac_impl_t		*mip = (mac_impl_t *)mh;
746 	mac_ring_t		*mr = (mac_ring_t *)mrh;
747 	mac_soft_ring_set_t	*mac_srs;
748 	mblk_t			*bp = mp_chain;
749 
750 	/*
751 	 * If there are any promiscuous mode callbacks defined for
752 	 * this MAC, pass them a copy if appropriate.
753 	 */
754 	if (mip->mi_promisc_list != NULL)
755 		mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);
756 
757 	if (mr != NULL) {
758 		/*
759 		 * If the SRS teardown has started, just return. The 'mr'
760 		 * continues to be valid until the driver unregisters the MAC.
761 		 * Hardware classified packets will not make their way up
762 		 * beyond this point once the teardown has started. The driver
763 		 * is never passed a pointer to a flow entry or SRS or any
764 		 * structure that can be freed much before mac_unregister.
765 		 */
766 		mutex_enter(&mr->mr_lock);
767 		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
768 		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
769 			mutex_exit(&mr->mr_lock);
770 			freemsgchain(mp_chain);
771 			return;
772 		}
773 
774 		/*
775 		 * The ring is in passthru mode; pass the chain up to
776 		 * the pseudo ring.
777 		 */
778 		if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) {
779 			MR_REFHOLD_LOCKED(mr);
780 			mutex_exit(&mr->mr_lock);
781 			mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain,
782 			    B_FALSE);
783 			MR_REFRELE(mr);
784 			return;
785 		}
786 
787 		/*
788 		 * The passthru callback should only be set when in
789 		 * MAC_PASSTHRU_CLASSIFIER mode.
790 		 */
791 		ASSERT3P(mr->mr_pt_fn, ==, NULL);
792 
793 		/*
794 		 * We check if an SRS is controlling this ring.
795 		 * If so, we can directly call the srs_lower_proc
796 		 * routine otherwise we need to go through mac_rx_classify
797 		 * to reach the right place.
798 		 */
799 		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
800 			MR_REFHOLD_LOCKED(mr);
801 			mutex_exit(&mr->mr_lock);
802 			ASSERT3P(mr->mr_srs, !=, NULL);
803 			mac_srs = mr->mr_srs;
804 
805 			/*
806 			 * This is the fast path. All packets received
807 			 * on this ring are hardware classified and
808 			 * share the same MAC header info.
809 			 */
810 			mac_srs->srs_rx.sr_lower_proc(mh,
811 			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
812 			MR_REFRELE(mr);
813 			return;
814 		}
815 
816 		mutex_exit(&mr->mr_lock);
817 		/* We'll fall through to software classification */
818 	} else {
819 		flow_entry_t *flent;
820 		int err;
821 
822 		rw_enter(&mip->mi_rw_lock, RW_READER);
823 		if (mip->mi_single_active_client != NULL) {
824 			flent = mip->mi_single_active_client->mci_flent_list;
825 			FLOW_TRY_REFHOLD(flent, err);
826 			rw_exit(&mip->mi_rw_lock);
827 			if (err == 0) {
828 				(flent->fe_cb_fn)(flent->fe_cb_arg1,
829 				    flent->fe_cb_arg2, mp_chain, B_FALSE);
830 				FLOW_REFRELE(flent);
831 				return;
832 			}
833 		} else {
834 			rw_exit(&mip->mi_rw_lock);
835 		}
836 	}
837 
838 	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
839 		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
840 			return;
841 	}
842 
843 	freemsgchain(bp);
844 }
845 
846 /* DATA TRANSMISSION */
847 
848 /*
849  * A driver's notification to resume transmission, in case of a provider
850  * without TX rings.
851  */
852 void
mac_tx_update(mac_handle_t mh)853 mac_tx_update(mac_handle_t mh)
854 {
855 	mac_tx_ring_update(mh, NULL);
856 }
857 
858 /*
859  * A driver's notification to resume transmission on the specified TX ring.
860  */
861 void
mac_tx_ring_update(mac_handle_t mh,mac_ring_handle_t rh)862 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
863 {
864 	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
865 }
866 
867 /* LINK STATE */
868 /*
869  * Notify the MAC layer about a link state change
870  */
871 void
mac_link_update(mac_handle_t mh,link_state_t link)872 mac_link_update(mac_handle_t mh, link_state_t link)
873 {
874 	mac_impl_t	*mip = (mac_impl_t *)mh;
875 
876 	/*
877 	 * Save the link state.
878 	 */
879 	mip->mi_lowlinkstate = link;
880 
881 	/*
882 	 * Send a MAC_NOTE_LOWLINK notification.  This tells the notification
883 	 * thread to deliver both lower and upper notifications.
884 	 */
885 	i_mac_notify(mip, MAC_NOTE_LOWLINK);
886 }
887 
888 /*
889  * Notify the MAC layer about a link state change due to bridging.
890  */
891 void
mac_link_redo(mac_handle_t mh,link_state_t link)892 mac_link_redo(mac_handle_t mh, link_state_t link)
893 {
894 	mac_impl_t	*mip = (mac_impl_t *)mh;
895 
896 	/*
897 	 * Save the link state.
898 	 */
899 	mip->mi_linkstate = link;
900 
901 	/*
902 	 * Send a MAC_NOTE_LINK notification.  Only upper notifications are
903 	 * made.
904 	 */
905 	i_mac_notify(mip, MAC_NOTE_LINK);
906 }
907 
908 /* MINOR NODE HANDLING */
909 
910 /*
911  * Given a dev_t, return the instance number (PPA) associated with it.
912  * Drivers can use this in their getinfo(9e) implementation to lookup
913  * the instance number (i.e. PPA) of the device, to use as an index to
914  * their own array of soft state structures.
915  *
916  * Returns -1 on error.
917  */
918 int
mac_devt_to_instance(dev_t devt)919 mac_devt_to_instance(dev_t devt)
920 {
921 	return (dld_devt_to_instance(devt));
922 }
923 
924 /*
925  * Drivers that make use of the private minor number space are expected to
926  * provide their own getinfo(9e) entry point. This function simply forwards
927  * to the default MAC framework getinfo(9e) implementation as a convenience
928  * if they don't need any special mapping (mac instance != ddi_get_instance())
929  */
930 int
mac_getinfo(dev_info_t * dip,ddi_info_cmd_t cmd,void * arg,void ** resp)931 mac_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
932 {
933 	return (dld_getinfo(dip, cmd, arg, resp));
934 }
935 
936 /*
937  * This function returns the first minor number that is available for
938  * driver private use.  All minor numbers smaller than this are
939  * reserved for GLDv3 use.
940  */
941 minor_t
mac_private_minor(void)942 mac_private_minor(void)
943 {
944 	return (MAC_PRIVATE_MINOR);
945 }
946 
947 /* OTHER CONTROL INFORMATION */
948 
949 /*
950  * A driver notified us that its primary MAC address has changed.
951  */
952 void
mac_unicst_update(mac_handle_t mh,const uint8_t * addr)953 mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
954 {
955 	mac_impl_t	*mip = (mac_impl_t *)mh;
956 
957 	if (mip->mi_type->mt_addr_length == 0)
958 		return;
959 
960 	i_mac_perim_enter(mip);
961 
962 	/*
963 	 * If address changes, freshen the MAC address value and update
964 	 * all MAC clients that share this MAC address.
965 	 */
966 	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
967 		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
968 		    (uint8_t *)addr);
969 	}
970 
971 	i_mac_perim_exit(mip);
972 
973 	/*
974 	 * Send a MAC_NOTE_UNICST notification.
975 	 */
976 	i_mac_notify(mip, MAC_NOTE_UNICST);
977 }
978 
979 void
mac_dst_update(mac_handle_t mh,const uint8_t * addr)980 mac_dst_update(mac_handle_t mh, const uint8_t *addr)
981 {
982 	mac_impl_t	*mip = (mac_impl_t *)mh;
983 
984 	if (mip->mi_type->mt_addr_length == 0)
985 		return;
986 
987 	i_mac_perim_enter(mip);
988 	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
989 	i_mac_perim_exit(mip);
990 	i_mac_notify(mip, MAC_NOTE_DEST);
991 }
992 
993 /*
994  * MAC plugin information changed.
995  */
996 int
mac_pdata_update(mac_handle_t mh,void * mac_pdata,size_t dsize)997 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
998 {
999 	mac_impl_t	*mip = (mac_impl_t *)mh;
1000 
1001 	/*
1002 	 * Verify that the plugin supports MAC plugin data and that the
1003 	 * supplied data is valid.
1004 	 */
1005 	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
1006 		return (EINVAL);
1007 	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
1008 		return (EINVAL);
1009 
1010 	if (mip->mi_pdata != NULL)
1011 		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
1012 
1013 	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
1014 	bcopy(mac_pdata, mip->mi_pdata, dsize);
1015 	mip->mi_pdata_size = dsize;
1016 
1017 	/*
1018 	 * Since the MAC plugin data is used to construct MAC headers that
1019 	 * were cached in fast-path headers, we need to flush fast-path
1020 	 * information for links associated with this mac.
1021 	 */
1022 	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
1023 	return (0);
1024 }
1025 
1026 /*
1027  * The mac provider or mac frameowrk calls this function when it wants
1028  * to notify upstream consumers that the capabilities have changed and
1029  * that they should modify their own internal state accordingly.
1030  *
1031  * We currently have no regard for the fact that a provider could
1032  * decide to drop capabilities which would invalidate pending traffic.
1033  * For example, if one was to disable the Tx checksum offload while
1034  * TCP/IP traffic was being sent by mac clients relying on that
1035  * feature, then those packets would hit the write with missing or
1036  * partial checksums. A proper solution involves not only providing
1037  * notfication, but also performing client quiescing. That is, a capab
1038  * change should be treated as an atomic transaction that forms a
1039  * barrier between traffic relying on the current capabs and traffic
1040  * relying on the new capabs. In practice, simnet is currently the
1041  * only provider that could hit this, and it's an easily avoidable
1042  * situation (and at worst it should only lead to some dropped
1043  * packets). But if we ever want better on-the-fly capab change to
1044  * actual hardware providers, then we should give this update
1045  * mechanism a proper implementation.
1046  */
1047 void
mac_capab_update(mac_handle_t mh)1048 mac_capab_update(mac_handle_t mh)
1049 {
1050 	/*
1051 	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
1052 	 * clients to renegotiate capabilities.
1053 	 */
1054 	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
1055 }
1056 
1057 /*
1058  * Used by normal drivers to update the max sdu size.
1059  * We need to handle the case of a smaller mi_sdu_multicast
1060  * since this is called by mac_set_mtu() even for drivers that
1061  * have differing unicast and multicast mtu and we don't want to
1062  * increase the multicast mtu by accident in that case.
1063  */
1064 int
mac_maxsdu_update(mac_handle_t mh,uint_t sdu_max)1065 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
1066 {
1067 	mac_impl_t	*mip = (mac_impl_t *)mh;
1068 
1069 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1070 		return (EINVAL);
1071 	mip->mi_sdu_max = sdu_max;
1072 	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
1073 		mip->mi_sdu_multicast = mip->mi_sdu_max;
1074 
1075 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1076 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1077 	return (0);
1078 }
1079 
1080 /*
1081  * Version of the above function that is used by drivers that have a different
1082  * max sdu size for multicast/broadcast vs. unicast.
1083  */
1084 int
mac_maxsdu_update2(mac_handle_t mh,uint_t sdu_max,uint_t sdu_multicast)1085 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
1086 {
1087 	mac_impl_t	*mip = (mac_impl_t *)mh;
1088 
1089 	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
1090 		return (EINVAL);
1091 	if (sdu_multicast == 0)
1092 		sdu_multicast = sdu_max;
1093 	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
1094 		return (EINVAL);
1095 	mip->mi_sdu_max = sdu_max;
1096 	mip->mi_sdu_multicast = sdu_multicast;
1097 
1098 	/* Send a MAC_NOTE_SDU_SIZE notification. */
1099 	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
1100 	return (0);
1101 }
1102 
1103 static void
mac_ring_intr_retarget(mac_group_t * group,mac_ring_t * ring)1104 mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
1105 {
1106 	mac_client_impl_t *mcip;
1107 	flow_entry_t *flent;
1108 	mac_soft_ring_set_t *mac_rx_srs;
1109 	mac_cpus_t *srs_cpu;
1110 	int i;
1111 
1112 	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
1113 	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
1114 		/* interrupt can be re-targeted */
1115 		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
1116 		flent = mcip->mci_flent;
1117 		if (ring->mr_type == MAC_RING_TYPE_RX) {
1118 			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1119 				mac_rx_srs = flent->fe_rx_srs[i];
1120 				if (mac_rx_srs->srs_ring != ring)
1121 					continue;
1122 				srs_cpu = &mac_rx_srs->srs_cpu;
1123 				mutex_enter(&cpu_lock);
1124 				mac_rx_srs_retarget_intr(mac_rx_srs,
1125 				    srs_cpu->mc_rx_intr_cpu);
1126 				mutex_exit(&cpu_lock);
1127 				break;
1128 			}
1129 		} else {
1130 			if (flent->fe_tx_srs != NULL) {
1131 				mutex_enter(&cpu_lock);
1132 				mac_tx_srs_retarget_intr(
1133 				    flent->fe_tx_srs);
1134 				mutex_exit(&cpu_lock);
1135 			}
1136 		}
1137 	}
1138 }
1139 
1140 /*
1141  * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
1142  * their clients. There is a 1-1 mapping pseudo ring and the hardware
1143  * ring. ddi interrupt handles are exported from the hardware ring to
1144  * the pseudo ring. Thus when the interrupt handle changes, clients of
1145  * aggr that are using the handle need to use the new handle and
1146  * re-target their interrupts.
1147  */
1148 static void
mac_pseudo_ring_intr_retarget(mac_impl_t * mip,mac_ring_t * ring,ddi_intr_handle_t ddh)1149 mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
1150     ddi_intr_handle_t ddh)
1151 {
1152 	mac_ring_t *pring;
1153 	mac_group_t *pgroup;
1154 	mac_impl_t *pmip;
1155 	char macname[MAXNAMELEN];
1156 	mac_perim_handle_t p_mph;
1157 	uint64_t saved_gen_num;
1158 
1159 again:
1160 	pring = (mac_ring_t *)ring->mr_prh;
1161 	pgroup = (mac_group_t *)pring->mr_gh;
1162 	pmip = (mac_impl_t *)pgroup->mrg_mh;
1163 	saved_gen_num = ring->mr_gen_num;
1164 	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
1165 	/*
1166 	 * We need to enter aggr's perimeter. The locking hierarchy
1167 	 * dictates that aggr's perimeter should be entered first
1168 	 * and then the port's perimeter. So drop the port's
1169 	 * perimeter, enter aggr's and then re-enter port's
1170 	 * perimeter.
1171 	 */
1172 	i_mac_perim_exit(mip);
1173 	/*
1174 	 * While we know pmip is the aggr's mip, there is a
1175 	 * possibility that aggr could have unregistered by
1176 	 * the time we exit port's perimeter (mip) and
1177 	 * enter aggr's perimeter (pmip). To avoid that
1178 	 * scenario, enter aggr's perimeter using its name.
1179 	 */
1180 	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
1181 		return;
1182 	i_mac_perim_enter(mip);
1183 	/*
1184 	 * Check if the ring got assigned to another aggregation before
1185 	 * be could enter aggr's and the port's perimeter. When a ring
1186 	 * gets deleted from an aggregation, it calls mac_stop_ring()
1187 	 * which increments the generation number. So checking
1188 	 * generation number will be enough.
1189 	 */
1190 	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
1191 		i_mac_perim_exit(mip);
1192 		mac_perim_exit(p_mph);
1193 		i_mac_perim_enter(mip);
1194 		goto again;
1195 	}
1196 
1197 	/* Check if pseudo ring is still present */
1198 	if (ring->mr_prh != NULL) {
1199 		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
1200 		pring->mr_info.mri_intr.mi_ddi_shared =
1201 		    ring->mr_info.mri_intr.mi_ddi_shared;
1202 		if (ddh != NULL)
1203 			mac_ring_intr_retarget(pgroup, pring);
1204 	}
1205 	i_mac_perim_exit(mip);
1206 	mac_perim_exit(p_mph);
1207 }
1208 /*
1209  * API called by driver to provide new interrupt handle for TX/RX rings.
1210  * This usually happens when IRM (Interrupt Resource Manangement)
1211  * framework either gives the driver more MSI-x interrupts or takes
1212  * away MSI-x interrupts from the driver.
1213  */
1214 void
mac_ring_intr_set(mac_ring_handle_t mrh,ddi_intr_handle_t ddh)1215 mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
1216 {
1217 	mac_ring_t	*ring = (mac_ring_t *)mrh;
1218 	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
1219 	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;
1220 
1221 	i_mac_perim_enter(mip);
1222 	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
1223 	if (ddh == NULL) {
1224 		/* Interrupts being reset */
1225 		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
1226 		if (ring->mr_prh != NULL) {
1227 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1228 			return;
1229 		}
1230 	} else {
1231 		/* New interrupt handle */
1232 		mac_compare_ddi_handle(mip->mi_rx_groups,
1233 		    mip->mi_rx_group_count, ring);
1234 		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
1235 			mac_compare_ddi_handle(mip->mi_tx_groups,
1236 			    mip->mi_tx_group_count, ring);
1237 		}
1238 		if (ring->mr_prh != NULL) {
1239 			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
1240 			return;
1241 		} else {
1242 			mac_ring_intr_retarget(group, ring);
1243 		}
1244 	}
1245 	i_mac_perim_exit(mip);
1246 }
1247 
1248 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */
1249 
1250 /*
1251  * Updates the mac_impl structure with the current state of the link
1252  */
1253 static void
i_mac_log_link_state(mac_impl_t * mip)1254 i_mac_log_link_state(mac_impl_t *mip)
1255 {
1256 	/*
1257 	 * If no change, then it is not interesting.
1258 	 */
1259 	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
1260 		return;
1261 
1262 	switch (mip->mi_lowlinkstate) {
1263 	case LINK_STATE_UP:
1264 		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
1265 			char det[200];
1266 
1267 			mip->mi_type->mt_ops.mtops_link_details(det,
1268 			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);
1269 
1270 			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
1271 		} else {
1272 			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
1273 		}
1274 		break;
1275 
1276 	case LINK_STATE_DOWN:
1277 		/*
1278 		 * Only transitions from UP to DOWN are interesting
1279 		 */
1280 		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
1281 			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
1282 		break;
1283 
1284 	case LINK_STATE_UNKNOWN:
1285 		/*
1286 		 * This case is normally not interesting.
1287 		 */
1288 		break;
1289 	}
1290 	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
1291 }
1292 
1293 /*
1294  * Main routine for the callbacks notifications thread
1295  */
1296 static void
i_mac_notify_thread(void * arg)1297 i_mac_notify_thread(void *arg)
1298 {
1299 	mac_impl_t	*mip = arg;
1300 	callb_cpr_t	cprinfo;
1301 	mac_cb_t	*mcb;
1302 	mac_cb_info_t	*mcbi;
1303 	mac_notify_cb_t	*mncb;
1304 
1305 	mcbi = &mip->mi_notify_cb_info;
1306 	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
1307 	    "i_mac_notify_thread");
1308 
1309 	mutex_enter(mcbi->mcbi_lockp);
1310 
1311 	for (;;) {
1312 		uint32_t	bits;
1313 		uint32_t	type;
1314 
1315 		bits = mip->mi_notify_bits;
1316 		if (bits == 0) {
1317 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1318 			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1319 			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
1320 			continue;
1321 		}
1322 		mip->mi_notify_bits = 0;
1323 		if ((bits & (1 << MAC_NNOTE)) != 0) {
1324 			/* request to quit */
1325 			ASSERT(mip->mi_state_flags & MIS_DISABLED);
1326 			break;
1327 		}
1328 
1329 		mutex_exit(mcbi->mcbi_lockp);
1330 
1331 		/*
1332 		 * Log link changes on the actual link, but then do reports on
1333 		 * synthetic state (if part of a bridge).
1334 		 */
1335 		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
1336 			link_state_t newstate;
1337 			mac_handle_t mh;
1338 
1339 			i_mac_log_link_state(mip);
1340 			newstate = mip->mi_lowlinkstate;
1341 			if (mip->mi_bridge_link != NULL) {
1342 				mutex_enter(&mip->mi_bridge_lock);
1343 				if ((mh = mip->mi_bridge_link) != NULL) {
1344 					newstate = mac_bridge_ls_cb(mh,
1345 					    newstate);
1346 				}
1347 				mutex_exit(&mip->mi_bridge_lock);
1348 			}
1349 			if (newstate != mip->mi_linkstate) {
1350 				mip->mi_linkstate = newstate;
1351 				bits |= 1 << MAC_NOTE_LINK;
1352 			}
1353 		}
1354 
1355 		/*
1356 		 * Depending on which capabs have changed, the Tx
1357 		 * checksum flags may also need to be updated.
1358 		 */
1359 		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
1360 			mac_perim_handle_t mph;
1361 			mac_handle_t mh = (mac_handle_t)mip;
1362 
1363 			mac_perim_enter_by_mh(mh, &mph);
1364 			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
1365 			mac_perim_exit(mph);
1366 		}
1367 
1368 		/*
1369 		 * Do notification callbacks for each notification type.
1370 		 */
1371 		for (type = 0; type < MAC_NNOTE; type++) {
1372 			if ((bits & (1 << type)) == 0) {
1373 				continue;
1374 			}
1375 
1376 			if (mac_notify_cb_list[type] != NULL)
1377 				(*mac_notify_cb_list[type])(mip);
1378 
1379 			/*
1380 			 * Walk the list of notifications.
1381 			 */
1382 			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
1383 			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
1384 			    mcb = mcb->mcb_nextp) {
1385 				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
1386 				mncb->mncb_fn(mncb->mncb_arg, type);
1387 			}
1388 			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
1389 			    &mip->mi_notify_cb_list);
1390 		}
1391 
1392 		mutex_enter(mcbi->mcbi_lockp);
1393 	}
1394 
1395 	mip->mi_state_flags |= MIS_NOTIFY_DONE;
1396 	cv_broadcast(&mcbi->mcbi_cv);
1397 
1398 	/* CALLB_CPR_EXIT drops the lock */
1399 	CALLB_CPR_EXIT(&cprinfo);
1400 	thread_exit();
1401 }
1402 
1403 /*
1404  * Signal the i_mac_notify_thread asking it to quit.
1405  * Then wait till it is done.
1406  */
1407 void
i_mac_notify_exit(mac_impl_t * mip)1408 i_mac_notify_exit(mac_impl_t *mip)
1409 {
1410 	mac_cb_info_t	*mcbi;
1411 
1412 	mcbi = &mip->mi_notify_cb_info;
1413 
1414 	mutex_enter(mcbi->mcbi_lockp);
1415 	mip->mi_notify_bits = (1 << MAC_NNOTE);
1416 	cv_broadcast(&mcbi->mcbi_cv);
1417 
1418 
1419 	while ((mip->mi_notify_thread != NULL) &&
1420 	    !(mip->mi_state_flags & MIS_NOTIFY_DONE)) {
1421 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
1422 	}
1423 
1424 	/* Necessary clean up before doing kmem_cache_free */
1425 	mip->mi_state_flags &= ~MIS_NOTIFY_DONE;
1426 	mip->mi_notify_bits = 0;
1427 	mip->mi_notify_thread = NULL;
1428 	mutex_exit(mcbi->mcbi_lockp);
1429 }
1430 
1431 /*
1432  * Entry point invoked by drivers to dynamically add a ring to an
1433  * existing group.
1434  */
1435 int
mac_group_add_ring(mac_group_handle_t gh,int index)1436 mac_group_add_ring(mac_group_handle_t gh, int index)
1437 {
1438 	mac_group_t *group = (mac_group_t *)gh;
1439 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1440 	int ret;
1441 
1442 	i_mac_perim_enter(mip);
1443 	ret = i_mac_group_add_ring(group, NULL, index);
1444 	i_mac_perim_exit(mip);
1445 	return (ret);
1446 }
1447 
1448 /*
1449  * Entry point invoked by drivers to dynamically remove a ring
1450  * from an existing group. The specified ring handle must no longer
1451  * be used by the driver after a call to this function.
1452  */
1453 void
mac_group_rem_ring(mac_group_handle_t gh,mac_ring_handle_t rh)1454 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh)
1455 {
1456 	mac_group_t *group = (mac_group_t *)gh;
1457 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
1458 
1459 	i_mac_perim_enter(mip);
1460 	i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE);
1461 	i_mac_perim_exit(mip);
1462 }
1463 
1464 /*
1465  * mac_prop_info_*() callbacks called from the driver's prefix_propinfo()
1466  * entry points.
1467  */
1468 
1469 void
mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph,uint8_t val)1470 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val)
1471 {
1472 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1473 
1474 	/* nothing to do if the caller doesn't want the default value */
1475 	if (pr->pr_default == NULL)
1476 		return;
1477 
1478 	ASSERT(pr->pr_default_size >= sizeof (uint8_t));
1479 
1480 	*(uint8_t *)(pr->pr_default) = val;
1481 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1482 }
1483 
1484 void
mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph,uint64_t val)1485 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val)
1486 {
1487 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1488 
1489 	/* nothing to do if the caller doesn't want the default value */
1490 	if (pr->pr_default == NULL)
1491 		return;
1492 
1493 	ASSERT(pr->pr_default_size >= sizeof (uint64_t));
1494 
1495 	bcopy(&val, pr->pr_default, sizeof (val));
1496 
1497 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1498 }
1499 
1500 void
mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph,uint32_t val)1501 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val)
1502 {
1503 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1504 
1505 	/* nothing to do if the caller doesn't want the default value */
1506 	if (pr->pr_default == NULL)
1507 		return;
1508 
1509 	ASSERT(pr->pr_default_size >= sizeof (uint32_t));
1510 
1511 	bcopy(&val, pr->pr_default, sizeof (val));
1512 
1513 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1514 }
1515 
1516 void
mac_prop_info_set_default_str(mac_prop_info_handle_t ph,const char * str)1517 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str)
1518 {
1519 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1520 
1521 	/* nothing to do if the caller doesn't want the default value */
1522 	if (pr->pr_default == NULL)
1523 		return;
1524 
1525 	if (strlen(str) >= pr->pr_default_size)
1526 		pr->pr_errno = ENOBUFS;
1527 	else
1528 		(void) strlcpy(pr->pr_default, str, pr->pr_default_size);
1529 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1530 }
1531 
1532 void
mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,link_flowctrl_t val)1533 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph,
1534     link_flowctrl_t val)
1535 {
1536 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1537 
1538 	/* nothing to do if the caller doesn't want the default value */
1539 	if (pr->pr_default == NULL)
1540 		return;
1541 
1542 	ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t));
1543 
1544 	bcopy(&val, pr->pr_default, sizeof (val));
1545 
1546 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1547 }
1548 
1549 void
mac_prop_info_set_default_fec(mac_prop_info_handle_t ph,link_fec_t val)1550 mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val)
1551 {
1552 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1553 
1554 	/* nothing to do if the caller doesn't want the default value */
1555 	if (pr->pr_default == NULL)
1556 		return;
1557 
1558 	ASSERT(pr->pr_default_size >= sizeof (link_fec_t));
1559 
1560 	bcopy(&val, pr->pr_default, sizeof (val));
1561 
1562 	pr->pr_flags |= MAC_PROP_INFO_DEFAULT;
1563 }
1564 
1565 void
mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph,uint32_t min,uint32_t max)1566 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min,
1567     uint32_t max)
1568 {
1569 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1570 	mac_propval_range_t *range = pr->pr_range;
1571 	mac_propval_uint32_range_t *range32;
1572 
1573 	/* nothing to do if the caller doesn't want the range info */
1574 	if (range == NULL)
1575 		return;
1576 
1577 	if (pr->pr_range_cur_count++ == 0) {
1578 		/* first range */
1579 		pr->pr_flags |= MAC_PROP_INFO_RANGE;
1580 		range->mpr_type = MAC_PROPVAL_UINT32;
1581 	} else {
1582 		/* all ranges of a property should be of the same type */
1583 		ASSERT(range->mpr_type == MAC_PROPVAL_UINT32);
1584 		if (pr->pr_range_cur_count > range->mpr_count) {
1585 			pr->pr_errno = ENOSPC;
1586 			return;
1587 		}
1588 	}
1589 
1590 	range32 = range->mpr_range_uint32;
1591 	range32[pr->pr_range_cur_count - 1].mpur_min = min;
1592 	range32[pr->pr_range_cur_count - 1].mpur_max = max;
1593 }
1594 
1595 void
mac_prop_info_set_perm(mac_prop_info_handle_t ph,uint8_t perm)1596 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm)
1597 {
1598 	mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph;
1599 
1600 	pr->pr_perm = perm;
1601 	pr->pr_flags |= MAC_PROP_INFO_PERM;
1602 }
1603 
1604 void
mac_hcksum_get(const mblk_t * mp,uint32_t * start,uint32_t * stuff,uint32_t * end,uint32_t * value,uint32_t * flags_ptr)1605 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff,
1606     uint32_t *end, uint32_t *value, uint32_t *flags_ptr)
1607 {
1608 	uint32_t flags;
1609 
1610 	ASSERT(DB_TYPE(mp) == M_DATA);
1611 
1612 	flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
1613 	if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) {
1614 		if (value != NULL)
1615 			*value = (uint32_t)DB_CKSUM16(mp);
1616 		if ((flags & HCK_PARTIALCKSUM) != 0) {
1617 			if (start != NULL)
1618 				*start = (uint32_t)DB_CKSUMSTART(mp);
1619 			if (stuff != NULL)
1620 				*stuff = (uint32_t)DB_CKSUMSTUFF(mp);
1621 			if (end != NULL)
1622 				*end = (uint32_t)DB_CKSUMEND(mp);
1623 		}
1624 	}
1625 
1626 	if (flags_ptr != NULL)
1627 		*flags_ptr = flags;
1628 }
1629 
1630 void
mac_hcksum_set(mblk_t * mp,uint32_t start,uint32_t stuff,uint32_t end,uint32_t value,uint32_t flags)1631 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end,
1632     uint32_t value, uint32_t flags)
1633 {
1634 	ASSERT(DB_TYPE(mp) == M_DATA);
1635 
1636 	DB_CKSUMSTART(mp) = (intptr_t)start;
1637 	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
1638 	DB_CKSUMEND(mp) = (intptr_t)end;
1639 	DB_CKSUMFLAGS(mp) = (uint16_t)flags;
1640 	DB_CKSUM16(mp) = (uint16_t)value;
1641 }
1642 
1643 void
mac_hcksum_clone(const mblk_t * src,mblk_t * dst)1644 mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
1645 {
1646 	ASSERT3U(DB_TYPE(src), ==, M_DATA);
1647 	ASSERT3U(DB_TYPE(dst), ==, M_DATA);
1648 
1649 	/*
1650 	 * Do these assignments unconditionally, rather than only when
1651 	 * flags is non-zero. This protects a situation where zeroed
1652 	 * hcksum data does not make the jump onto an mblk_t with
1653 	 * stale data in those fields. It's important to copy all
1654 	 * possible flags (HCK_* as well as HW_*) and not just the
1655 	 * checksum specific flags. Dropping flags during a clone
1656 	 * could result in dropped packets. If the caller has good
1657 	 * reason to drop those flags then it should do it manually,
1658 	 * after the clone.
1659 	 */
1660 	DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
1661 	DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
1662 	DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
1663 	DB_CKSUMEND(dst) = DB_CKSUMEND(src);
1664 	DB_CKSUM16(dst) = DB_CKSUM16(src);
1665 	DB_LSOMSS(dst) = DB_LSOMSS(src);
1666 }
1667 
1668 void
mac_lso_get(mblk_t * mp,uint32_t * mss,uint32_t * flags)1669 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
1670 {
1671 	ASSERT(DB_TYPE(mp) == M_DATA);
1672 
1673 	if (flags != NULL) {
1674 		*flags = DB_CKSUMFLAGS(mp) & HW_LSO;
1675 		if ((*flags != 0) && (mss != NULL))
1676 			*mss = (uint32_t)DB_LSOMSS(mp);
1677 	}
1678 }
1679 
1680 void
mac_transceiver_info_set_present(mac_transceiver_info_t * infop,boolean_t present)1681 mac_transceiver_info_set_present(mac_transceiver_info_t *infop,
1682     boolean_t present)
1683 {
1684 	infop->mti_present = present;
1685 }
1686 
1687 void
mac_transceiver_info_set_usable(mac_transceiver_info_t * infop,boolean_t usable)1688 mac_transceiver_info_set_usable(mac_transceiver_info_t *infop,
1689     boolean_t usable)
1690 {
1691 	infop->mti_usable = usable;
1692 }
1693 
1694 static bool
mac_parse_is_ipv6eh(uint8_t id)1695 mac_parse_is_ipv6eh(uint8_t id)
1696 {
1697 	switch (id) {
1698 	case IPPROTO_HOPOPTS:
1699 	case IPPROTO_ROUTING:
1700 	case IPPROTO_FRAGMENT:
1701 	case IPPROTO_AH:
1702 	case IPPROTO_DSTOPTS:
1703 	case IPPROTO_MH:
1704 	case IPPROTO_HIP:
1705 	case IPPROTO_SHIM6:
1706 		/* Currently known extension headers */
1707 		return (true);
1708 	case IPPROTO_ESP:
1709 		/*
1710 		 * While the IANA protocol numbers listing notes ESP as an IPv6
1711 		 * extension header, we cannot effectively parse it like one.
1712 		 *
1713 		 * For now, mac_ether_offload_info() will report it as the L4
1714 		 * protocol for a parsed packet containing this EH.
1715 		 */
1716 	default:
1717 		return (false);
1718 	}
1719 }
1720 
1721 typedef struct mac_mblk_cursor {
1722 	mblk_t	*mmc_head;
1723 	mblk_t	*mmc_cur;
1724 	size_t	mmc_off_total;
1725 	size_t	mmc_off_mp;
1726 } mac_mblk_cursor_t;
1727 
1728 static void mac_mmc_advance(mac_mblk_cursor_t *, size_t);
1729 static void mac_mmc_reset(mac_mblk_cursor_t *);
1730 
1731 static void
mac_mmc_init(mac_mblk_cursor_t * cursor,mblk_t * mp)1732 mac_mmc_init(mac_mblk_cursor_t *cursor, mblk_t *mp)
1733 {
1734 	cursor->mmc_head = mp;
1735 	mac_mmc_reset(cursor);
1736 }
1737 
1738 static void
mac_mmc_reset(mac_mblk_cursor_t * cursor)1739 mac_mmc_reset(mac_mblk_cursor_t *cursor)
1740 {
1741 	ASSERT(cursor->mmc_head != NULL);
1742 
1743 	cursor->mmc_cur = cursor->mmc_head;
1744 	cursor->mmc_off_total = cursor->mmc_off_mp = 0;
1745 
1746 	/* Advance past any zero-length mblks at head */
1747 	mac_mmc_advance(cursor, 0);
1748 }
1749 
1750 static inline size_t
mac_mmc_mp_left(const mac_mblk_cursor_t * cursor)1751 mac_mmc_mp_left(const mac_mblk_cursor_t *cursor)
1752 {
1753 	if (cursor->mmc_cur != NULL) {
1754 		const size_t mp_len = MBLKL(cursor->mmc_cur);
1755 
1756 		ASSERT3U(mp_len, >=, cursor->mmc_off_mp);
1757 
1758 		return (mp_len - cursor->mmc_off_mp);
1759 	} else {
1760 		return (0);
1761 	}
1762 }
1763 
1764 static inline uint8_t *
mac_mmc_mp_ptr(const mac_mblk_cursor_t * cursor)1765 mac_mmc_mp_ptr(const mac_mblk_cursor_t *cursor)
1766 {
1767 	return (cursor->mmc_cur->b_rptr + cursor->mmc_off_mp);
1768 }
1769 
1770 static inline size_t
mac_mmc_offset(const mac_mblk_cursor_t * cursor)1771 mac_mmc_offset(const mac_mblk_cursor_t *cursor)
1772 {
1773 	return (cursor->mmc_off_total);
1774 }
1775 
1776 /*
1777  * Advance cursor forward `len` bytes.
1778  *
1779  * The length to advance must be no greater than the number of bytes remaining
1780  * in the current mblk.  If the position reaches (exactly) the end of the
1781  * current mblk, the cursor will be pushed forward to the next non-zero-length
1782  * mblk in the chain.
1783  */
1784 static inline void
mac_mmc_advance(mac_mblk_cursor_t * cursor,size_t len)1785 mac_mmc_advance(mac_mblk_cursor_t *cursor, size_t len)
1786 {
1787 	ASSERT(cursor->mmc_cur != NULL);
1788 
1789 	const size_t mp_len = MBLKL(cursor->mmc_cur);
1790 
1791 	ASSERT3U(cursor->mmc_off_mp + len, <=, mp_len);
1792 
1793 	cursor->mmc_off_total += len;
1794 	cursor->mmc_off_mp += len;
1795 
1796 	if (cursor->mmc_off_mp == mp_len) {
1797 		cursor->mmc_off_mp = 0;
1798 		cursor->mmc_cur = cursor->mmc_cur->b_cont;
1799 	}
1800 
1801 	/* Skip over any 0-length mblks */
1802 	while (cursor->mmc_cur != NULL && MBLKL(cursor->mmc_cur) == 0) {
1803 		cursor->mmc_cur = cursor->mmc_cur->b_cont;
1804 	}
1805 }
1806 
1807 /*
1808  * Attempt to seek to byte offset `off` in mblk chain.
1809  *
1810  * Returns true if the offset is <= the total chain length.
1811  */
1812 static bool
mac_mmc_seek(mac_mblk_cursor_t * cursor,const size_t off)1813 mac_mmc_seek(mac_mblk_cursor_t *cursor, const size_t off)
1814 {
1815 	ASSERT(cursor->mmc_head != NULL);
1816 
1817 	if (off == cursor->mmc_off_total) {
1818 		/*
1819 		 * Any prior init, reset, or seek operation will have advanced
1820 		 * past any zero-length mblks, making this short-circuit safe.
1821 		 */
1822 		return (true);
1823 	} else if (off < cursor->mmc_off_total) {
1824 		/* Rewind to beginning if offset precedes current position */
1825 		mac_mmc_reset(cursor);
1826 	}
1827 
1828 	size_t seek_left = off - cursor->mmc_off_total;
1829 	while (cursor->mmc_cur != NULL) {
1830 		const size_t mp_left = mac_mmc_mp_left(cursor);
1831 
1832 		if (mp_left > seek_left) {
1833 			/* Target position is within current mblk */
1834 			cursor->mmc_off_mp += seek_left;
1835 			cursor->mmc_off_total += seek_left;
1836 			return (true);
1837 		}
1838 
1839 		/* Move on to the next mblk... */
1840 		mac_mmc_advance(cursor, mp_left);
1841 		seek_left -= mp_left;
1842 	}
1843 
1844 	/*
1845 	 * We have reached the end of the mblk chain, but there is a chance that
1846 	 * it corresponds to the target seek position.
1847 	 */
1848 	return (cursor->mmc_off_total == off);
1849 }
1850 
1851 /*
1852  * Attempt to read uint8_t at offset `pos` in mblk chain.
1853  *
1854  * Returns true (and sets value in `out`) if the offset is within the chain.
1855  */
1856 static bool
mac_mmc_get_uint8(mac_mblk_cursor_t * cursor,size_t pos,uint8_t * out)1857 mac_mmc_get_uint8(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out)
1858 {
1859 	if (!mac_mmc_seek(cursor, pos)) {
1860 		return (false);
1861 	}
1862 
1863 	if (mac_mmc_mp_left(cursor) != 0) {
1864 		*out = *(mac_mmc_mp_ptr(cursor));
1865 		mac_mmc_advance(cursor, 1);
1866 		return (true);
1867 	}
1868 
1869 	return (false);
1870 }
1871 
1872 /*
1873  * Attempt to read uint16_t at offset `pos` in mblk chain.  The two
1874  * network-order bytes are converted into a host-order value.
1875  *
1876  * Returns true (and sets value in `out`) if the 16-bit region specified by the
1877  * offset is within the chain.
1878  */
1879 static bool
mac_mmc_get_uint16(mac_mblk_cursor_t * cursor,size_t pos,uint16_t * out)1880 mac_mmc_get_uint16(mac_mblk_cursor_t *cursor, size_t pos, uint16_t *out)
1881 {
1882 	if (!mac_mmc_seek(cursor, pos)) {
1883 		return (false);
1884 	}
1885 
1886 	const size_t mp_left = mac_mmc_mp_left(cursor);
1887 	uint16_t result = 0;
1888 
1889 	if (mp_left >= 2) {
1890 		uint8_t *bp = mac_mmc_mp_ptr(cursor);
1891 
1892 		result = (uint16_t)bp[0] << 8;
1893 		result |= bp[1];
1894 		mac_mmc_advance(cursor, 2);
1895 		*out = result;
1896 		return (true);
1897 	} else if (mp_left == 1) {
1898 		result = (uint16_t)*(mac_mmc_mp_ptr(cursor));
1899 		mac_mmc_advance(cursor, 1);
1900 
1901 		if (mac_mmc_mp_left(cursor) == 0) {
1902 			return (false);
1903 		}
1904 
1905 		result = result << 8;
1906 		result |= (uint16_t)*(mac_mmc_mp_ptr(cursor));
1907 		mac_mmc_advance(cursor, 1);
1908 		*out = result;
1909 		return (true);
1910 	}
1911 
1912 	return (false);
1913 }
1914 
1915 /*
1916  * Attempt to read `count` bytes at offset `pos` in mblk chain.
1917  *
1918  * Returns true (and copies data to `out`) if `count` length region is available
1919  * at offset within the chain.
1920  */
1921 static bool
mac_mmc_get_bytes(mac_mblk_cursor_t * cursor,size_t pos,uint8_t * out,size_t count)1922 mac_mmc_get_bytes(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out,
1923     size_t count)
1924 {
1925 	if (!mac_mmc_seek(cursor, pos)) {
1926 		return (false);
1927 	}
1928 
1929 	while (count > 0) {
1930 		const size_t mp_left = mac_mmc_mp_left(cursor);
1931 
1932 		if (mp_left == 0) {
1933 			return (false);
1934 		}
1935 		const size_t to_copy = MIN(mp_left, count);
1936 
1937 		bcopy(mac_mmc_mp_ptr(cursor), out, to_copy);
1938 		out += to_copy;
1939 		mac_mmc_advance(cursor, to_copy);
1940 		count -= to_copy;
1941 	}
1942 	return (true);
1943 }
1944 
1945 /*
1946  * Attempt to parse ethernet header (VLAN or not) from mblk chain.
1947  *
1948  * Returns true if header was successfully parsed.  Parsing will begin at
1949  * current offset of `cursor`.  Any non-NULL arguments for VLAN, SAP, and header
1950  * size will be populated on success.  A value of MEOI_VLAN_TCI_INVALID will be
1951  * reported for the TCI if the header does not bear VLAN infomation.
1952  */
1953 static bool
mac_mmc_parse_ether(mac_mblk_cursor_t * cursor,uint8_t * dst_addrp,uint32_t * vlan_tcip,uint16_t * ethertypep,uint16_t * hdr_sizep)1954 mac_mmc_parse_ether(mac_mblk_cursor_t *cursor, uint8_t *dst_addrp,
1955     uint32_t *vlan_tcip, uint16_t *ethertypep, uint16_t *hdr_sizep)
1956 {
1957 	const size_t l2_off = mac_mmc_offset(cursor);
1958 
1959 	if (dst_addrp != NULL) {
1960 		if (!mac_mmc_get_bytes(cursor, l2_off, dst_addrp, ETHERADDRL)) {
1961 			return (false);
1962 		}
1963 	}
1964 
1965 	uint16_t ethertype = 0;
1966 	if (!mac_mmc_get_uint16(cursor,
1967 	    l2_off + offsetof(struct ether_header, ether_type), &ethertype)) {
1968 		return (false);
1969 	}
1970 
1971 	uint32_t tci = MEOI_VLAN_TCI_INVALID;
1972 	uint16_t hdrsize = sizeof (struct ether_header);
1973 
1974 	if (ethertype == ETHERTYPE_VLAN) {
1975 		uint16_t tci_val;
1976 
1977 		if (!mac_mmc_get_uint16(cursor,
1978 		    l2_off + offsetof(struct ether_vlan_header, ether_tci),
1979 		    &tci_val)) {
1980 			return (false);
1981 		}
1982 		if (!mac_mmc_get_uint16(cursor,
1983 		    l2_off + offsetof(struct ether_vlan_header, ether_type),
1984 		    &ethertype)) {
1985 			return (false);
1986 		}
1987 		hdrsize = sizeof (struct ether_vlan_header);
1988 		tci = (uint32_t)tci_val;
1989 	}
1990 
1991 	if (vlan_tcip != NULL) {
1992 		*vlan_tcip = tci;
1993 	}
1994 	if (ethertypep != NULL) {
1995 		*ethertypep = ethertype;
1996 	}
1997 	if (hdr_sizep != NULL) {
1998 		*hdr_sizep = hdrsize;
1999 	}
2000 	return (true);
2001 }
2002 
2003 /*
2004  * Attempt to parse L3 protocol header from mblk chain.
2005  *
2006  * The SAP/ethertype of the containing header must be specified by the caller.
2007  *
2008  * Returns true if header was successfully parsed.  Parsing will begin at
2009  * current offset of `cursor`.  Any non-NULL arguments for IP protocol and
2010  * header size will be populated on success.
2011  */
2012 static bool
mac_mmc_parse_l3(mac_mblk_cursor_t * cursor,uint16_t l3_sap,uint8_t * ipprotop,bool * is_fragp,uint16_t * hdr_sizep)2013 mac_mmc_parse_l3(mac_mblk_cursor_t *cursor, uint16_t l3_sap, uint8_t *ipprotop,
2014     bool *is_fragp, uint16_t *hdr_sizep)
2015 {
2016 	const size_t l3_off = mac_mmc_offset(cursor);
2017 
2018 	if (l3_sap == ETHERTYPE_IP) {
2019 		uint8_t verlen, ipproto;
2020 		uint16_t frag_off;
2021 
2022 		if (!mac_mmc_get_uint8(cursor, l3_off, &verlen)) {
2023 			return (false);
2024 		}
2025 		verlen &= 0x0f;
2026 		if (verlen < 5 || verlen > 0x0f) {
2027 			return (false);
2028 		}
2029 
2030 		if (!mac_mmc_get_uint16(cursor,
2031 		    l3_off + offsetof(ipha_t, ipha_fragment_offset_and_flags),
2032 		    &frag_off)) {
2033 			return (false);
2034 		}
2035 
2036 		if (!mac_mmc_get_uint8(cursor,
2037 		    l3_off + offsetof(ipha_t, ipha_protocol), &ipproto)) {
2038 			return (false);
2039 		}
2040 
2041 		if (ipprotop != NULL) {
2042 			*ipprotop = ipproto;
2043 		}
2044 		if (is_fragp != NULL) {
2045 			*is_fragp = ((frag_off & (IPH_MF | IPH_OFFSET)) != 0);
2046 		}
2047 		if (hdr_sizep != NULL) {
2048 			*hdr_sizep = verlen * 4;
2049 		}
2050 		return (true);
2051 	}
2052 	if (l3_sap == ETHERTYPE_IPV6) {
2053 		uint16_t ip_len = sizeof (ip6_t);
2054 		uint8_t ipproto;
2055 		bool found_frag_eh = false;
2056 
2057 		if (!mac_mmc_get_uint8(cursor,
2058 		    l3_off + offsetof(ip6_t, ip6_nxt), &ipproto)) {
2059 			return (false);
2060 		}
2061 
2062 		/* Chase any extension headers present in packet */
2063 		while (mac_parse_is_ipv6eh(ipproto)) {
2064 			uint8_t len_val, next_hdr;
2065 			uint16_t eh_len;
2066 
2067 			const size_t hdr_off = l3_off + ip_len;
2068 			if (!mac_mmc_get_uint8(cursor, hdr_off, &next_hdr)) {
2069 				return (false);
2070 			}
2071 
2072 			if (ipproto == IPPROTO_FRAGMENT) {
2073 				/*
2074 				 * The Fragment extension header bears a
2075 				 * predefined fixed length, rather than
2076 				 * communicating it through the EH itself.
2077 				 */
2078 				eh_len = 8;
2079 				found_frag_eh = true;
2080 			} else if (ipproto == IPPROTO_AH) {
2081 				/*
2082 				 * The length of the IP Authentication EH is
2083 				 * stored as (n + 2) * 32-bits, where 'n' is the
2084 				 * recorded EH length field
2085 				 */
2086 				if (!mac_mmc_get_uint8(cursor, hdr_off + 1,
2087 				    &len_val)) {
2088 					return (false);
2089 				}
2090 				eh_len = ((uint16_t)len_val + 2) * 4;
2091 			} else {
2092 				/*
2093 				 * All other EHs should follow the sizing
2094 				 * formula of (n + 1) * 64-bits, where 'n' is
2095 				 * the recorded EH length field.
2096 				 */
2097 				if (!mac_mmc_get_uint8(cursor, hdr_off + 1,
2098 				    &len_val)) {
2099 					return (false);
2100 				}
2101 				eh_len = ((uint16_t)len_val + 1) * 8;
2102 			}
2103 			/*
2104 			 * Protect against overflow in the case of a very
2105 			 * contrived packet.
2106 			 */
2107 			if ((ip_len + eh_len) < ip_len) {
2108 				return (-1);
2109 			}
2110 
2111 			ipproto = next_hdr;
2112 			ip_len += eh_len;
2113 		}
2114 
2115 		if (ipprotop != NULL) {
2116 			*ipprotop = ipproto;
2117 		}
2118 		if (is_fragp != NULL) {
2119 			*is_fragp = found_frag_eh;
2120 		}
2121 		if (hdr_sizep != NULL) {
2122 			*hdr_sizep = ip_len;
2123 		}
2124 		return (true);
2125 	}
2126 
2127 	return (false);
2128 }
2129 
2130 /*
2131  * Attempt to parse L4 protocol header from mblk chain.
2132  *
2133  * The IP protocol of the containing header must be specified by the caller.
2134  *
2135  * Returns true if header was successfully parsed.  Parsing will begin at
2136  * current offset of `cursor`.  A non-NULL argument for header size will be
2137  * populated on success.
2138  */
2139 static bool
mac_mmc_parse_l4(mac_mblk_cursor_t * cursor,uint8_t ipproto,uint8_t * hdr_sizep)2140 mac_mmc_parse_l4(mac_mblk_cursor_t *cursor, uint8_t ipproto, uint8_t *hdr_sizep)
2141 {
2142 	ASSERT(hdr_sizep != NULL);
2143 
2144 	const size_t l4_off = mac_mmc_offset(cursor);
2145 	uint8_t tcp_doff;
2146 
2147 	switch (ipproto) {
2148 	case IPPROTO_TCP:
2149 		if (!mac_mmc_get_uint8(cursor,
2150 		    l4_off + offsetof(tcph_t, th_offset_and_rsrvd),
2151 		    &tcp_doff)) {
2152 			return (false);
2153 		}
2154 		tcp_doff = (tcp_doff & 0xf0) >> 4;
2155 		if (tcp_doff < 5 || tcp_doff > 0xf) {
2156 			return (false);
2157 		}
2158 		*hdr_sizep = tcp_doff * 4;
2159 		return (true);
2160 	case IPPROTO_UDP:
2161 		*hdr_sizep = sizeof (struct udphdr);
2162 		return (true);
2163 	case IPPROTO_ICMP:
2164 		/*
2165 		 * Only count the parts of the header which are common to
2166 		 * message types.
2167 		 */
2168 		*hdr_sizep = offsetof(struct icmp, icmp_hun);
2169 		return (true);
2170 	case IPPROTO_ICMPV6:
2171 		*hdr_sizep = sizeof (icmp6_t);
2172 		return (true);
2173 	case IPPROTO_SCTP:
2174 		*hdr_sizep = sizeof (sctp_hdr_t);
2175 		return (true);
2176 	default:
2177 		return (false);
2178 	}
2179 }
2180 
2181 /*
2182  * Parse destination MAC address and VLAN TCI (if any) from mblk chain.
2183  *
2184  * If packet ethertype does not indicate that a VLAN is present,
2185  * MEOI_VLAN_TCI_INVALID will be returned for the TCI.
2186  *
2187  * Returns B_TRUE if header could be parsed for destination MAC address and VLAN
2188  * TCI, otherwise B_FALSE.
2189  */
2190 boolean_t
mac_ether_l2_info(mblk_t * mp,uint8_t * dst_addrp,uint32_t * vlan_tcip)2191 mac_ether_l2_info(mblk_t *mp, uint8_t *dst_addrp, uint32_t *vlan_tcip)
2192 {
2193 	mac_mblk_cursor_t cursor;
2194 
2195 	mac_mmc_init(&cursor, mp);
2196 	if (!mac_mmc_parse_ether(&cursor, dst_addrp, vlan_tcip, NULL, NULL)) {
2197 		return (B_FALSE);
2198 	}
2199 
2200 	return (B_TRUE);
2201 }
2202 
2203 /*
2204  * Perform a partial parsing of offload info from a frame and/or packet.
2205  *
2206  * Beginning at the provided byte offset (`off`) in the mblk, attempt to parse
2207  * any offload info which has not yet been populated in `meoi`.  The contents of
2208  * `meoi_flags` upon entry will be considered as "already parsed", their
2209  * corresponding data fields will be considered valid.
2210  *
2211  * A motivating example: A non-Ethernet packet could be parsed for L3/L4 offload
2212  * information by setting MEOI_L2INFO_SET in `meoi_flags`, and the L3 SAP in
2213  * `meoi_l3_proto`. With a value in `meoi_l2hlen` that, when combined with the
2214  * provided `off`, will direct the parser to the start of the L3 header in the
2215  * mblk, the rest of the logic will be free to run.
2216  *
2217  * Alternatively, this could be used to parse the headers in an encapsulated
2218  * Ethernet packet by simply specifying the start of its header in `off`.
2219  *
2220  * The degree to which parsing was able to proceed is stored in `meoi_flags`.
2221  */
2222 void
mac_partial_offload_info(mblk_t * mp,size_t off,mac_ether_offload_info_t * meoi)2223 mac_partial_offload_info(mblk_t *mp, size_t off, mac_ether_offload_info_t *meoi)
2224 {
2225 	mac_mblk_cursor_t cursor;
2226 
2227 	mac_mmc_init(&cursor, mp);
2228 
2229 	if (!mac_mmc_seek(&cursor, off)) {
2230 		return;
2231 	}
2232 
2233 	if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
2234 		uint32_t vlan_tci;
2235 		uint16_t l2_sz, ethertype;
2236 		if (!mac_mmc_parse_ether(&cursor, NULL, &vlan_tci, &ethertype,
2237 		    &l2_sz)) {
2238 			return;
2239 		}
2240 
2241 		meoi->meoi_flags |= MEOI_L2INFO_SET;
2242 		meoi->meoi_l2hlen = l2_sz;
2243 		meoi->meoi_l3proto = ethertype;
2244 		if (vlan_tci != MEOI_VLAN_TCI_INVALID) {
2245 			ASSERT3U(meoi->meoi_l2hlen, ==,
2246 			    sizeof (struct ether_vlan_header));
2247 			meoi->meoi_flags |= MEOI_VLAN_TAGGED;
2248 		}
2249 	}
2250 	const size_t l2_end = off + (size_t)meoi->meoi_l2hlen;
2251 	if (!mac_mmc_seek(&cursor, l2_end)) {
2252 		meoi->meoi_flags &= ~MEOI_L2INFO_SET;
2253 		return;
2254 	}
2255 
2256 	if ((meoi->meoi_flags & MEOI_L3INFO_SET) == 0) {
2257 		uint8_t ipproto;
2258 		uint16_t l3_sz;
2259 		bool is_frag;
2260 		if (!mac_mmc_parse_l3(&cursor, meoi->meoi_l3proto, &ipproto,
2261 		    &is_frag, &l3_sz)) {
2262 			return;
2263 		}
2264 
2265 		meoi->meoi_l3hlen = l3_sz;
2266 		meoi->meoi_l4proto = ipproto;
2267 		meoi->meoi_flags |= MEOI_L3INFO_SET;
2268 		if (is_frag) {
2269 			meoi->meoi_flags |= MEOI_L3_FRAGMENT;
2270 		}
2271 	}
2272 	const size_t l3_end = l2_end + (size_t)meoi->meoi_l3hlen;
2273 	if (!mac_mmc_seek(&cursor, l3_end)) {
2274 		meoi->meoi_flags &= ~MEOI_L3INFO_SET;
2275 		return;
2276 	}
2277 
2278 	if ((meoi->meoi_flags & MEOI_L4INFO_SET) == 0) {
2279 		uint8_t l4_sz;
2280 		if (!mac_mmc_parse_l4(&cursor, meoi->meoi_l4proto, &l4_sz)) {
2281 			return;
2282 		}
2283 
2284 		meoi->meoi_l4hlen = l4_sz;
2285 		meoi->meoi_flags |= MEOI_L4INFO_SET;
2286 	}
2287 	const size_t l4_end = l3_end + (size_t)meoi->meoi_l4hlen;
2288 	if (!mac_mmc_seek(&cursor, l4_end)) {
2289 		meoi->meoi_flags &= ~MEOI_L4INFO_SET;
2290 	}
2291 }
2292 
2293 /*
2294  * Attempt to parse packet headers to extract information useful for various
2295  * offloads.  This includes header protocols and lengths.
2296  *
2297  * The meoi_flags field will indicate the extent to which parsing was able to
2298  * complete.  Each in turn promises that subsequent fields are populated, and
2299  * that the mblk chain is large enough to contain the parsed header(s):
2300  *
2301  * - MEOI_L2INFO_SET: meoi_l3_proto and meoi_l2hlen
2302  * - MEOI_L3INFO_SET: meoi_l4_proto and meoi_l3hlen
2303  * - MEOI_L4INFO_SET: meoi_l4hlen
2304  *
2305  * When any of those flags are absent, their corresponding data fields will be
2306  * zeroed.
2307  *
2308  * These additional flags are set when certain conditions are met during
2309  * parsing:
2310  *
2311  * - MEOI_VLAN_TAGGED: Ethernet header is tagged with a VLAN
2312  * - MEOI_L3_FRAGMENT: L3 header indicated fragmentation
2313  */
2314 void
mac_ether_offload_info(mblk_t * mp,mac_ether_offload_info_t * meoi)2315 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi)
2316 {
2317 	bzero(meoi, sizeof (mac_ether_offload_info_t));
2318 	meoi->meoi_len = msgdsize(mp);
2319 
2320 	mac_partial_offload_info(mp, 0, meoi);
2321 }
2322