xref: /illumos-gate/usr/src/uts/common/io/dls/dls_link.c (revision fec047081731fd77caf46ec0471c501b2cb33894)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2019 Joyent, Inc.
25  */
26 
27 /*
28  * Data-Link Services Module
29  */
30 
31 #include	<sys/sysmacros.h>
32 #include	<sys/strsubr.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/sdt.h>
37 #include	<sys/atomic.h>
38 #include	<sys/sysevent.h>
39 #include	<sys/sysevent/eventdefs.h>
40 #include	<sys/sysevent/datalink.h>
41 
42 static kmem_cache_t	*i_dls_link_cachep;
43 mod_hash_t		*i_dls_link_hash;
44 static uint_t		i_dls_link_count;
45 
46 #define		LINK_HASHSZ	67	/* prime */
47 #define		IMPL_HASHSZ	67	/* prime */
48 
/*
 * Construct a hash key from the DLSAP value: the SAP is shifted left by
 * VLAN_ID_SIZE bits and the result is used directly as the key.
 */
#define	MAKE_KEY(_sap)						\
	((mod_hash_key_t)(uintptr_t)((_sap) << VLAN_ID_SIZE))

/*
 * If a packet size is known (pktsize != 0) and the message p carries more
 * data than that, trim the excess bytes from the tail of the message.
 *
 * Both arguments are parenthesized wherever they are expanded so that an
 * arbitrary expression may be passed safely, and the scratch variable has
 * a macro-specific name so it cannot capture an identifier that appears in
 * the caller's argument expressions. Note that both arguments are still
 * evaluated more than once. The plain brace block (rather than
 * do/while (0)) is kept to match the other statement macros in this file.
 */
#define	DLS_STRIP_PADDING(pktsize, p) {					\
	if ((pktsize) != 0) {						\
		ssize_t dsp_delta = (pktsize) - msgdsize((p));		\
									\
		if (dsp_delta < 0)					\
			(void) adjmsg((p), dsp_delta);			\
	}								\
}
63 
64 /*
65  * Private functions.
66  */
67 
68 /*ARGSUSED*/
69 static int
70 i_dls_link_constructor(void *buf, void *arg, int kmflag)
71 {
72 	dls_link_t	*dlp = buf;
73 	char		name[MAXNAMELEN];
74 
75 	bzero(buf, sizeof (dls_link_t));
76 
77 	(void) snprintf(name, MAXNAMELEN, "dls_link_t_%p_hash", buf);
78 	dlp->dl_str_hash = mod_hash_create_idhash(name, IMPL_HASHSZ,
79 	    mod_hash_null_valdtor);
80 
81 	return (0);
82 }
83 
84 /*ARGSUSED*/
85 static void
86 i_dls_link_destructor(void *buf, void *arg)
87 {
88 	dls_link_t	*dlp = buf;
89 
90 	ASSERT(dlp->dl_ref == 0);
91 	ASSERT(dlp->dl_mh == NULL);
92 	ASSERT(dlp->dl_mah == NULL);
93 	ASSERT(dlp->dl_unknowns == 0);
94 
95 	mod_hash_destroy_idhash(dlp->dl_str_hash);
96 	dlp->dl_str_hash = NULL;
97 
98 }
99 
/*
 * Prepare a single packet for delivery:
 *
 * - Parse the MAC header of mp into mhip with mac_vlan_header_info(); a
 *   non-zero result is stored in err and the packet is left untouched.
 * - Strip any padding beyond the packet size reported in the header info.
 * - If the first mblk already holds the whole MAC header, advance b_rptr
 *   past it so that it points at the payload.
 * - Otherwise pull the message up into a single mblk. The original
 *   message is freed and mp is replaced by the new one, so the header is
 *   parsed a second time to refresh the pointers kept in mhip, the saved
 *   b_next linkage is restored, and b_rptr is advanced past the header.
 *   If the pullup fails err is set to EINVAL and mp is still the original
 *   message.
 *
 * mp must be an lvalue since it may be reassigned. This is a macro rather
 * than a function for performance reasons.
 */
#define	DLS_PREPARE_PKT(mh, mp, mhip, err) {				\
	mblk_t *dpp_next = (mp)->b_next;				\
	(err) = mac_vlan_header_info((mh), (mp), (mhip));		\
	if ((err) == 0) {						\
		DLS_STRIP_PADDING((mhip)->mhi_pktsize, (mp));		\
		if (MBLKL((mp)) >= (mhip)->mhi_hdrsize) {		\
			(mp)->b_rptr += (mhip)->mhi_hdrsize;		\
		} else {						\
			mblk_t *dpp_pulled = msgpullup((mp), -1);	\
			if (dpp_pulled != NULL) {			\
				(mp)->b_next = NULL;			\
				freemsg((mp));				\
				(mp) = dpp_pulled;			\
				VERIFY(mac_vlan_header_info((mh),	\
				    (mp), (mhip)) == 0);		\
				(mp)->b_next = dpp_next;		\
				(mp)->b_rptr += (mhip)->mhi_hdrsize;	\
			} else {					\
				(err) = EINVAL;				\
			}						\
		}							\
	}								\
}
135 
136 /*
137  * Truncate the chain starting at mp such that all packets in the chain
138  * have identical source and destination addresses, saps, and tag types
139  * (see below).  It returns a pointer to the mblk following the chain,
140  * NULL if there is no further packet following the processed chain.
141  * The countp argument is set to the number of valid packets in the chain.
142  * Note that the whole MAC header (including the VLAN tag if any) in each
143  * packet will be stripped.
144  */
145 static mblk_t *
146 i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
147     uint_t *countp)
148 {
149 	mblk_t		*prevp;
150 	uint_t		npacket = 1;
151 	size_t		addr_size = dlp->dl_mip->mi_addr_length;
152 	uint16_t	vid = VLAN_ID(mhip->mhi_tci);
153 	uint16_t	pri = VLAN_PRI(mhip->mhi_tci);
154 
155 	/*
156 	 * Compare with subsequent headers until we find one that has
157 	 * differing header information. After checking each packet
158 	 * strip padding and skip over the header.
159 	 */
160 	for (prevp = mp; (mp = mp->b_next) != NULL; prevp = mp) {
161 		mac_header_info_t cmhi;
162 		uint16_t cvid, cpri;
163 		int err;
164 
165 		DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err);
166 		if (err != 0)
167 			break;
168 
169 		prevp->b_next = mp;
170 
171 		/*
172 		 * The source, destination, sap, vlan tag must all match in
173 		 * a given subchain.
174 		 */
175 		if (mhip->mhi_saddr == NULL || cmhi.mhi_saddr == NULL ||
176 		    memcmp(mhip->mhi_daddr, cmhi.mhi_daddr, addr_size) != 0 ||
177 		    memcmp(mhip->mhi_saddr, cmhi.mhi_saddr, addr_size) != 0 ||
178 		    mhip->mhi_bindsap != cmhi.mhi_bindsap) {
179 			/*
180 			 * Note that we don't need to restore the padding.
181 			 */
182 			mp->b_rptr -= cmhi.mhi_hdrsize;
183 			break;
184 		}
185 
186 		cvid = VLAN_ID(cmhi.mhi_tci);
187 		cpri = VLAN_PRI(cmhi.mhi_tci);
188 
189 		/*
190 		 * There are several types of packets. Packets don't match
191 		 * if they are classified to different type or if they are
192 		 * VLAN packets but belong to different VLANs:
193 		 *
194 		 * packet type		tagged		vid		pri
195 		 * ---------------------------------------------------------
196 		 * untagged		No		zero		zero
197 		 * VLAN packets		Yes		non-zero	-
198 		 * priority tagged	Yes		zero		non-zero
199 		 * 0 tagged		Yes		zero		zero
200 		 */
201 		if ((mhip->mhi_istagged != cmhi.mhi_istagged) ||
202 		    (vid != cvid) || ((vid == VLAN_ID_NONE) &&
203 		    (((pri == 0) && (cpri != 0)) ||
204 		    ((pri != 0) && (cpri == 0))))) {
205 			mp->b_rptr -= cmhi.mhi_hdrsize;
206 			break;
207 		}
208 
209 		npacket++;
210 	}
211 
212 	/*
213 	 * Break the chain at this point and return a pointer to the next
214 	 * sub-chain.
215 	 */
216 	prevp->b_next = NULL;
217 	*countp = npacket;
218 	return (mp);
219 }
220 
221 /* ARGSUSED */
222 static int
223 i_dls_head_hold(mod_hash_key_t key, mod_hash_val_t val)
224 {
225 	dls_head_t *dhp = (dls_head_t *)val;
226 
227 	/*
228 	 * The lock order is  mod_hash's internal lock -> dh_lock as in the
229 	 * call to i_dls_link_rx -> mod_hash_find_cb_rval -> i_dls_head_hold
230 	 */
231 	mutex_enter(&dhp->dh_lock);
232 	if (dhp->dh_removing) {
233 		mutex_exit(&dhp->dh_lock);
234 		return (-1);
235 	}
236 	dhp->dh_ref++;
237 	mutex_exit(&dhp->dh_lock);
238 	return (0);
239 }
240 
241 void
242 i_dls_head_rele(dls_head_t *dhp)
243 {
244 	mutex_enter(&dhp->dh_lock);
245 	dhp->dh_ref--;
246 	if (dhp->dh_ref == 0 && dhp->dh_removing != 0)
247 		cv_broadcast(&dhp->dh_cv);
248 	mutex_exit(&dhp->dh_lock);
249 }
250 
251 static dls_head_t *
252 i_dls_head_alloc(mod_hash_key_t key)
253 {
254 	dls_head_t	*dhp;
255 
256 	dhp = kmem_zalloc(sizeof (dls_head_t), KM_SLEEP);
257 	dhp->dh_key = key;
258 	return (dhp);
259 }
260 
261 static void
262 i_dls_head_free(dls_head_t *dhp)
263 {
264 	ASSERT(dhp->dh_ref == 0);
265 	kmem_free(dhp, sizeof (dls_head_t));
266 }
267 
268 /*
269  * Try to send mp up to the streams of the given sap. Return the
270  * number of streams which accepted this message, or 0 if no streams
271  * accepted the message.
272  *
273  * Note that this function copies the message chain and the original
274  * mp remains valid after this function returns.
275  */
276 static uint_t
277 i_dls_link_rx_func(dls_link_t *dlp, mac_resource_handle_t mrh,
278     mac_header_info_t *mhip, mblk_t *mp, uint32_t sap,
279     boolean_t (*acceptfunc)())
280 {
281 	mod_hash_t	*hash = dlp->dl_str_hash;
282 	mod_hash_key_t	key;
283 	dls_head_t	*dhp;
284 	dld_str_t	*dsp;
285 	mblk_t		*nmp;
286 	dls_rx_t	ds_rx;
287 	void		*ds_rx_arg;
288 	uint_t		naccepted = 0;
289 	int		rval;
290 
291 	/*
292 	 * Construct a hash key from the DLSAP.
293 	 */
294 	key = MAKE_KEY(sap);
295 
296 	/*
297 	 * Search the hash table for a dld_str_t eligible to receive a
298 	 * packet chain for this DLSAP. The mod hash's internal lock
299 	 * serializes find/insert/remove from the mod hash list.
300 	 * Incrementing the dh_ref (while holding the mod hash lock)
301 	 * ensures dls_link_remove will wait for the upcall to finish.
302 	 */
303 	if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
304 	    i_dls_head_hold, &rval) != 0 || (rval != 0)) {
305 		return (0);
306 	}
307 
308 	/*
309 	 * Find all dld_str_t that will accept the sub-chain.
310 	 */
311 	for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) {
312 		if (!acceptfunc(dsp, mhip, &ds_rx, &ds_rx_arg))
313 			continue;
314 
315 		/*
316 		 * We have at least one acceptor.
317 		 */
318 		naccepted++;
319 
320 		/*
321 		 * There will normally be at least one more dld_str_t
322 		 * (since we've yet to check for non-promiscuous
323 		 * dld_str_t) so dup the sub-chain.
324 		 */
325 		if ((nmp = copymsgchain(mp)) != NULL)
326 			ds_rx(ds_rx_arg, mrh, nmp, mhip);
327 	}
328 
329 	/*
330 	 * Release the hold on the dld_str_t chain now that we have
331 	 * finished walking it.
332 	 */
333 	i_dls_head_rele(dhp);
334 	return (naccepted);
335 }
336 
337 /* ARGSUSED */
338 void
339 i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
340     boolean_t loopback)
341 {
342 	dls_link_t			*dlp = arg;
343 	mod_hash_t			*hash = dlp->dl_str_hash;
344 	mblk_t				*nextp;
345 	mac_header_info_t		mhi;
346 	dls_head_t			*dhp;
347 	dld_str_t			*dsp;
348 	dld_str_t			*ndsp;
349 	mblk_t				*nmp;
350 	mod_hash_key_t			key;
351 	uint_t				npacket;
352 	boolean_t			accepted;
353 	dls_rx_t			ds_rx, nds_rx;
354 	void				*ds_rx_arg, *nds_rx_arg;
355 	uint16_t			vid;
356 	int				err, rval;
357 
358 	/*
359 	 * Walk the packet chain.
360 	 */
361 	for (; mp != NULL; mp = nextp) {
362 		/*
363 		 * Wipe the accepted state.
364 		 */
365 		accepted = B_FALSE;
366 
367 		DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
368 		if (err != 0) {
369 			atomic_inc_32(&(dlp->dl_unknowns));
370 			nextp = mp->b_next;
371 			mp->b_next = NULL;
372 			freemsg(mp);
373 			continue;
374 		}
375 
376 		/*
377 		 * Grab the longest sub-chain we can process as a single
378 		 * unit.
379 		 */
380 		nextp = i_dls_link_subchain(dlp, mp, &mhi, &npacket);
381 		ASSERT(npacket != 0);
382 
383 		vid = VLAN_ID(mhi.mhi_tci);
384 
385 		/*
386 		 * This condition is true only when a sun4v vsw client
387 		 * is on the scene; as it is the only type of client
388 		 * that multiplexes VLANs on a single client instance.
389 		 * All other types of clients have one VLAN per client
390 		 * instance. In that case, MAC strips the VLAN tag
391 		 * before delivering it to DLS (see mac_rx_deliver()).
392 		 */
393 		if (mhi.mhi_istagged) {
394 
395 			/*
396 			 * If it is tagged traffic, send it upstream to
397 			 * all dld_str_t which are attached to the physical
398 			 * link and bound to SAP 0x8100.
399 			 */
400 			if (i_dls_link_rx_func(dlp, mrh, &mhi, mp,
401 			    ETHERTYPE_VLAN, dls_accept) > 0) {
402 				accepted = B_TRUE;
403 			}
404 
405 			/*
406 			 * Don't pass the packets up if they are tagged
407 			 * packets and:
408 			 *  - their VID and priority are both zero and the
409 			 *    original packet isn't using the PVID (invalid
410 			 *    packets).
411 			 *  - their sap is ETHERTYPE_VLAN and their VID is
412 			 *    zero as they have already been sent upstreams.
413 			 */
414 			if ((vid == VLAN_ID_NONE && !mhi.mhi_ispvid &&
415 			    VLAN_PRI(mhi.mhi_tci) == 0) ||
416 			    (mhi.mhi_bindsap == ETHERTYPE_VLAN &&
417 			    vid == VLAN_ID_NONE)) {
418 				freemsgchain(mp);
419 				goto loop;
420 			}
421 		}
422 
423 		/*
424 		 * Construct a hash key from the DLSAP.
425 		 */
426 		key = MAKE_KEY(mhi.mhi_bindsap);
427 
428 		/*
429 		 * Search the hash table for dld_str_t eligible to receive
430 		 * a packet chain for this DLSAP.
431 		 */
432 		if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
433 		    i_dls_head_hold, &rval) != 0 || (rval != 0)) {
434 			freemsgchain(mp);
435 			goto loop;
436 		}
437 
438 		/*
439 		 * Find the first dld_str_t that will accept the sub-chain.
440 		 */
441 		for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next)
442 			if (dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
443 				break;
444 
445 		/*
446 		 * If we did not find any dld_str_t willing to accept the
447 		 * sub-chain then throw it away.
448 		 */
449 		if (dsp == NULL) {
450 			i_dls_head_rele(dhp);
451 			freemsgchain(mp);
452 			goto loop;
453 		}
454 
455 		/*
456 		 * We have at least one acceptor.
457 		 */
458 		accepted = B_TRUE;
459 		for (;;) {
460 			/*
461 			 * Find the next dld_str_t that will accept the
462 			 * sub-chain.
463 			 */
464 			for (ndsp = dsp->ds_next; ndsp != NULL;
465 			    ndsp = ndsp->ds_next)
466 				if (dls_accept(ndsp, &mhi, &nds_rx,
467 				    &nds_rx_arg))
468 					break;
469 
470 			/*
471 			 * If there are no more dld_str_t that are willing
472 			 * to accept the sub-chain then we don't need to dup
473 			 * it before handing it to the current one.
474 			 */
475 			if (ndsp == NULL) {
476 				ds_rx(ds_rx_arg, mrh, mp, &mhi);
477 
478 				/*
479 				 * Since there are no more dld_str_t, we're
480 				 * done.
481 				 */
482 				break;
483 			}
484 
485 			/*
486 			 * There are more dld_str_t so dup the sub-chain.
487 			 */
488 			if ((nmp = copymsgchain(mp)) != NULL)
489 				ds_rx(ds_rx_arg, mrh, nmp, &mhi);
490 
491 			dsp = ndsp;
492 			ds_rx = nds_rx;
493 			ds_rx_arg = nds_rx_arg;
494 		}
495 
496 		/*
497 		 * Release the hold on the dld_str_t chain now that we have
498 		 * finished walking it.
499 		 */
500 		i_dls_head_rele(dhp);
501 
502 loop:
503 		/*
504 		 * If there were no acceptors then add the packet count to the
505 		 * 'unknown' count.
506 		 */
507 		if (!accepted)
508 			atomic_add_32(&(dlp->dl_unknowns), npacket);
509 	}
510 }
511 
512 /* ARGSUSED */
513 void
514 dls_rx_vlan_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
515     boolean_t loopback)
516 {
517 	dld_str_t			*dsp = arg;
518 	dls_link_t			*dlp = dsp->ds_dlp;
519 	mac_header_info_t		mhi;
520 	dls_rx_t			ds_rx;
521 	void				*ds_rx_arg;
522 	int				err;
523 
524 	DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
525 	if (err != 0)
526 		goto drop;
527 
528 	/*
529 	 * If there is promiscuous handle for vlan, we filter out the untagged
530 	 * pkts and pkts that are not for the primary unicast address.
531 	 */
532 	if (dsp->ds_vlan_mph != NULL) {
533 		uint8_t prim_addr[MAXMACADDRLEN];
534 		size_t	addr_length = dsp->ds_mip->mi_addr_length;
535 
536 		if (!(mhi.mhi_istagged))
537 			goto drop;
538 		ASSERT(dsp->ds_mh != NULL);
539 		mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)prim_addr);
540 		if (memcmp(mhi.mhi_daddr, prim_addr, addr_length) != 0)
541 			goto drop;
542 
543 		if (!dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
544 			goto drop;
545 
546 		ds_rx(ds_rx_arg, NULL, mp, &mhi);
547 		return;
548 	}
549 
550 drop:
551 	atomic_inc_32(&dlp->dl_unknowns);
552 	freemsg(mp);
553 }
554 
555 /* ARGSUSED */
556 void
557 dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
558     boolean_t loopback)
559 {
560 	dld_str_t			*dsp = arg;
561 	dls_link_t			*dlp = dsp->ds_dlp;
562 	mac_header_info_t		mhi;
563 	dls_rx_t			ds_rx;
564 	void				*ds_rx_arg;
565 	int				err;
566 	dls_head_t			*dhp;
567 	mod_hash_key_t			key;
568 
569 	/*
570 	 * We expect to deal with only a single packet.
571 	 */
572 	ASSERT3P(mp->b_next, ==, NULL);
573 
574 	DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
575 
576 	if (err != 0)
577 		goto drop;
578 
579 	/*
580 	 * In order to filter out sap pkt that no dls channel listens, search
581 	 * the hash table trying to find a dld_str_t eligible to receive the pkt
582 	 */
583 	if ((dsp->ds_promisc & DLS_PROMISC_SAP) == 0) {
584 		key = MAKE_KEY(mhi.mhi_bindsap);
585 		if (mod_hash_find(dsp->ds_dlp->dl_str_hash, key,
586 		    (mod_hash_val_t *)&dhp) != 0)
587 			goto drop;
588 	}
589 
590 	if (!dls_accept_promisc(dsp, &mhi, &ds_rx, &ds_rx_arg, loopback))
591 		goto drop;
592 
593 	ds_rx(ds_rx_arg, NULL, mp, &mhi);
594 	return;
595 
596 drop:
597 	atomic_inc_32(&dlp->dl_unknowns);
598 	freemsg(mp);
599 }
600 
601 /*
602  * We'd like to notify via sysevents that a link state change has occurred.
603  * There are a couple of challenges associated with this. The first is that if
604  * the link is flapping a lot, we may not see an accurate state when we launch
605  * the notification, we're told it changed, not what it changed to.
606  *
607  * The next problem is that all of the information that a user has associated
608  * with this device is the exact opposite of what we have on the dls_link_t. We
609  * have the name of the mac device, which has no bearing on what users see.
610  * Likewise, we don't have the datalink id either. So we're going to have to get
611  * this from dls.
612  *
613  * This is all further complicated by the fact that this could be going on in
614  * another thread at the same time as someone is tearing down the dls_link_t
615  * that we're associated with. We need to be careful not to grab the mac
616  * perimeter, otherwise we stand a good chance of deadlock.
617  */
618 static void
619 dls_link_notify(void *arg, mac_notify_type_t type)
620 {
621 	dls_link_t	*dlp = arg;
622 	dls_dl_handle_t	dhp;
623 	nvlist_t	*nvp;
624 	sysevent_t	*event;
625 	sysevent_id_t	eid;
626 
627 	if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK)
628 		return;
629 
630 	/*
631 	 * If we can't find a devnet handle for this link, then there is no user
632 	 * knowable device for this at the moment and there's nothing we can
633 	 * really share with them that will make sense.
634 	 */
635 	if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0)
636 		return;
637 
638 	/*
639 	 * Because we're attaching this nvlist_t to the sysevent, it'll get
640 	 * cleaned up when we call sysevent_free.
641 	 */
642 	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
643 	VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID,
644 	    dls_devnet_linkid(dhp)) == 0);
645 	VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME,
646 	    dls_devnet_link(dhp)) == 0);
647 	VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID,
648 	    dls_devnet_getzid(dhp)) == 0);
649 
650 	dls_devnet_rele_tmp(dhp);
651 
652 	event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE,
653 	    ILLUMOS_KERN_PUB"dls", SE_SLEEP);
654 	VERIFY(event != NULL);
655 	(void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp);
656 
657 	(void) log_sysevent(event, SE_SLEEP, &eid);
658 	sysevent_free(event);
659 
660 }
661 
662 static void
663 i_dls_link_destroy(dls_link_t *dlp)
664 {
665 	ASSERT(dlp->dl_nactive == 0);
666 	ASSERT(dlp->dl_impl_count == 0);
667 	ASSERT(dlp->dl_zone_ref == 0);
668 
669 	/*
670 	 * Free the structure back to the cache.
671 	 */
672 	if (dlp->dl_mnh != NULL)
673 		mac_notify_remove(dlp->dl_mnh, B_TRUE);
674 
675 	if (dlp->dl_mch != NULL)
676 		mac_client_close(dlp->dl_mch, 0);
677 
678 	if (dlp->dl_mh != NULL) {
679 		ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
680 		mac_close(dlp->dl_mh);
681 	}
682 
683 	dlp->dl_mh = NULL;
684 	dlp->dl_mch = NULL;
685 	dlp->dl_mip = NULL;
686 	dlp->dl_mnh = NULL;
687 	dlp->dl_unknowns = 0;
688 	dlp->dl_nonip_cnt = 0;
689 	kmem_cache_free(i_dls_link_cachep, dlp);
690 }
691 
692 static int
693 i_dls_link_create(const char *name, dls_link_t **dlpp)
694 {
695 	dls_link_t		*dlp;
696 	int			err;
697 
698 	/*
699 	 * Allocate a new dls_link_t structure.
700 	 */
701 	dlp = kmem_cache_alloc(i_dls_link_cachep, KM_SLEEP);
702 
703 	/*
704 	 * Name the dls_link_t after the MAC interface it represents.
705 	 */
706 	(void) strlcpy(dlp->dl_name, name, sizeof (dlp->dl_name));
707 
708 	/*
709 	 * First reference; hold open the MAC interface.
710 	 */
711 	ASSERT(dlp->dl_mh == NULL);
712 	err = mac_open(dlp->dl_name, &dlp->dl_mh);
713 	if (err != 0)
714 		goto bail;
715 
716 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
717 	dlp->dl_mip = mac_info(dlp->dl_mh);
718 
719 	/* DLS is the "primary" MAC client */
720 	ASSERT(dlp->dl_mch == NULL);
721 
722 	err = mac_client_open(dlp->dl_mh, &dlp->dl_mch, NULL,
723 	    MAC_OPEN_FLAGS_USE_DATALINK_NAME);
724 	if (err != 0)
725 		goto bail;
726 
727 	dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp);
728 
729 	DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *,
730 	    dlp->dl_mch);
731 
732 	*dlpp = dlp;
733 	return (0);
734 
735 bail:
736 	i_dls_link_destroy(dlp);
737 	return (err);
738 }
739 
740 /*
741  * Module initialization functions.
742  */
743 
744 void
745 dls_link_init(void)
746 {
747 	/*
748 	 * Create a kmem_cache of dls_link_t structures.
749 	 */
750 	i_dls_link_cachep = kmem_cache_create("dls_link_cache",
751 	    sizeof (dls_link_t), 0, i_dls_link_constructor,
752 	    i_dls_link_destructor, NULL, NULL, NULL, 0);
753 	ASSERT(i_dls_link_cachep != NULL);
754 
755 	/*
756 	 * Create a dls_link_t hash table and associated lock.
757 	 */
758 	i_dls_link_hash = mod_hash_create_extended("dls_link_hash",
759 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
760 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
761 	i_dls_link_count = 0;
762 }
763 
764 int
765 dls_link_fini(void)
766 {
767 	if (i_dls_link_count > 0)
768 		return (EBUSY);
769 
770 	/*
771 	 * Destroy the kmem_cache.
772 	 */
773 	kmem_cache_destroy(i_dls_link_cachep);
774 
775 	/*
776 	 * Destroy the hash table and associated lock.
777 	 */
778 	mod_hash_destroy_hash(i_dls_link_hash);
779 	return (0);
780 }
781 
782 /*
783  * Exported functions.
784  */
785 
786 static int
787 dls_link_hold_common(const char *name, dls_link_t **dlpp, boolean_t create)
788 {
789 	dls_link_t		*dlp;
790 	int			err;
791 
792 	/*
793 	 * Look up a dls_link_t corresponding to the given macname in the
794 	 * global hash table. The i_dls_link_hash itself is protected by the
795 	 * mod_hash package's internal lock which synchronizes
796 	 * find/insert/remove into the global mod_hash list. Assumes that
797 	 * inserts and removes are single threaded on a per mac end point
798 	 * by the mac perimeter.
799 	 */
800 	if ((err = mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
801 	    (mod_hash_val_t *)&dlp)) == 0)
802 		goto done;
803 
804 	ASSERT(err == MH_ERR_NOTFOUND);
805 	if (!create)
806 		return (ENOENT);
807 
808 	/*
809 	 * We didn't find anything so we need to create one.
810 	 */
811 	if ((err = i_dls_link_create(name, &dlp)) != 0)
812 		return (err);
813 
814 	/*
815 	 * Insert the dls_link_t.
816 	 */
817 	err = mod_hash_insert(i_dls_link_hash, (mod_hash_key_t)dlp->dl_name,
818 	    (mod_hash_val_t)dlp);
819 	ASSERT(err == 0);
820 
821 	atomic_inc_32(&i_dls_link_count);
822 	ASSERT(i_dls_link_count != 0);
823 
824 done:
825 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
826 	/*
827 	 * Bump the reference count and hand back the reference.
828 	 */
829 	dlp->dl_ref++;
830 	*dlpp = dlp;
831 	return (0);
832 }
833 
834 int
835 dls_link_hold_create(const char *name, dls_link_t **dlpp)
836 {
837 	return (dls_link_hold_common(name, dlpp, B_TRUE));
838 }
839 
840 int
841 dls_link_hold(const char *name, dls_link_t **dlpp)
842 {
843 	return (dls_link_hold_common(name, dlpp, B_FALSE));
844 }
845 
846 dev_info_t *
847 dls_link_devinfo(dev_t dev)
848 {
849 	dls_link_t	*dlp;
850 	dev_info_t	*dip;
851 	char	macname[MAXNAMELEN];
852 	char	*drv;
853 	mac_perim_handle_t	mph;
854 
855 	if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
856 		return (NULL);
857 	(void) snprintf(macname, MAXNAMELEN, "%s%d", drv,
858 	    DLS_MINOR2INST(getminor(dev)));
859 
860 	/*
861 	 * The code below assumes that the name constructed above is the
862 	 * macname. This is not the case for legacy devices. Currently this
863 	 * is ok because this function is only called in the getinfo(9e) path,
864 	 * which for a legacy device would directly end up in the driver's
865 	 * getinfo, rather than here
866 	 */
867 	if (mac_perim_enter_by_macname(macname, &mph) != 0)
868 		return (NULL);
869 
870 	if (dls_link_hold(macname, &dlp) != 0) {
871 		mac_perim_exit(mph);
872 		return (NULL);
873 	}
874 
875 	dip = mac_devinfo_get(dlp->dl_mh);
876 	dls_link_rele(dlp);
877 	mac_perim_exit(mph);
878 
879 	return (dip);
880 }
881 
882 dev_t
883 dls_link_dev(dls_link_t *dlp)
884 {
885 	return (makedevice(ddi_driver_major(mac_devinfo_get(dlp->dl_mh)),
886 	    mac_minor(dlp->dl_mh)));
887 }
888 
889 void
890 dls_link_rele(dls_link_t *dlp)
891 {
892 	mod_hash_val_t	val;
893 
894 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
895 	/*
896 	 * Check if there are any more references.
897 	 */
898 	if (--dlp->dl_ref == 0) {
899 		(void) mod_hash_remove(i_dls_link_hash,
900 		    (mod_hash_key_t)dlp->dl_name, &val);
901 		ASSERT(dlp == (dls_link_t *)val);
902 
903 		/*
904 		 * Destroy the dls_link_t.
905 		 */
906 		i_dls_link_destroy(dlp);
907 		ASSERT(i_dls_link_count > 0);
908 		atomic_dec_32(&i_dls_link_count);
909 	}
910 }
911 
912 int
913 dls_link_rele_by_name(const char *name)
914 {
915 	dls_link_t		*dlp;
916 
917 	if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
918 	    (mod_hash_val_t *)&dlp) != 0)
919 		return (ENOENT);
920 
921 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
922 
923 	/*
924 	 * Must fail detach if mac client is busy.
925 	 */
926 	ASSERT(dlp->dl_ref > 0 && dlp->dl_mch != NULL);
927 	if (mac_link_has_flows(dlp->dl_mch))
928 		return (ENOTEMPTY);
929 
930 	dls_link_rele(dlp);
931 	return (0);
932 }
933 
934 int
935 dls_link_setzid(const char *name, zoneid_t zid)
936 {
937 	dls_link_t	*dlp;
938 	int		err = 0;
939 	zoneid_t	old_zid;
940 
941 	if ((err = dls_link_hold_create(name, &dlp)) != 0)
942 		return (err);
943 
944 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
945 
946 	if ((old_zid = dlp->dl_zid) == zid)
947 		goto done;
948 
949 	/*
950 	 * Check whether this dlp is used by its own zone.  If yes, we cannot
951 	 * change its zoneid.
952 	 */
953 	if (dlp->dl_zone_ref != 0) {
954 		err = EBUSY;
955 		goto done;
956 	}
957 
958 	dlp->dl_zid = zid;
959 
960 	if (zid == GLOBAL_ZONEID) {
961 		/*
962 		 * The link is moving from a non-global zone to the global
963 		 * zone, so we need to release the reference that was held
964 		 * when the link was originally assigned to the non-global
965 		 * zone.
966 		 */
967 		dls_link_rele(dlp);
968 	}
969 
970 done:
971 	/*
972 	 * We only keep the reference to this link open if the link has
973 	 * successfully moved from the global zone to a non-global zone.
974 	 */
975 	if (err != 0 || old_zid != GLOBAL_ZONEID)
976 		dls_link_rele(dlp);
977 	return (err);
978 }
979 
980 int
981 dls_link_getzid(const char *name, zoneid_t *zidp)
982 {
983 	dls_link_t	*dlp;
984 	int		err = 0;
985 
986 	if ((err = dls_link_hold(name, &dlp)) != 0)
987 		return (err);
988 
989 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
990 
991 	*zidp = dlp->dl_zid;
992 
993 	dls_link_rele(dlp);
994 	return (0);
995 }
996 
997 void
998 dls_link_add(dls_link_t *dlp, uint32_t sap, dld_str_t *dsp)
999 {
1000 	mod_hash_t	*hash = dlp->dl_str_hash;
1001 	mod_hash_key_t	key;
1002 	dls_head_t	*dhp;
1003 	dld_str_t	*p;
1004 	int		err;
1005 
1006 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
1007 
1008 	/*
1009 	 * Generate a hash key based on the sap.
1010 	 */
1011 	key = MAKE_KEY(sap);
1012 
1013 	/*
1014 	 * Search the table for a list head with this key.
1015 	 */
1016 	if ((err = mod_hash_find(hash, key, (mod_hash_val_t *)&dhp)) != 0) {
1017 		ASSERT(err == MH_ERR_NOTFOUND);
1018 
1019 		dhp = i_dls_head_alloc(key);
1020 		err = mod_hash_insert(hash, key, (mod_hash_val_t)dhp);
1021 		ASSERT(err == 0);
1022 	}
1023 
1024 	/*
1025 	 * Add the dld_str_t to the head of the list. List walkers in
1026 	 * i_dls_link_rx_* bump up dh_ref to ensure the list does not change
1027 	 * while they walk the list. The membar below ensures that list walkers
1028 	 * see exactly the old list or the new list.
1029 	 */
1030 	ASSERT(dsp->ds_next == NULL);
1031 	p = dhp->dh_list;
1032 	dsp->ds_next = p;
1033 
1034 	membar_producer();
1035 
1036 	dhp->dh_list = dsp;
1037 
1038 	/*
1039 	 * Save a pointer to the list head.
1040 	 */
1041 	dsp->ds_head = dhp;
1042 	dlp->dl_impl_count++;
1043 }
1044 
1045 void
1046 dls_link_remove(dls_link_t *dlp, dld_str_t *dsp)
1047 {
1048 	mod_hash_t	*hash = dlp->dl_str_hash;
1049 	dld_str_t	**pp;
1050 	dld_str_t	*p;
1051 	dls_head_t	*dhp;
1052 
1053 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
1054 
1055 	/*
1056 	 * We set dh_removing here to tell the receive callbacks not to pass
1057 	 * up packets anymore. Then wait till the current callbacks are done.
1058 	 * This happens either in the close path or in processing the
1059 	 * DL_UNBIND_REQ via a taskq thread, and it is ok to cv_wait in either.
1060 	 * The dh_ref ensures there aren't and there won't be any upcalls
1061 	 * walking or using the dh_list. The mod hash internal lock ensures
1062 	 * that the insert/remove of the dls_head_t itself synchronizes with
1063 	 * any i_dls_link_rx trying to locate it. The perimeter ensures that
1064 	 * there isn't another simultaneous dls_link_add/remove.
1065 	 */
1066 	dhp = dsp->ds_head;
1067 
1068 	mutex_enter(&dhp->dh_lock);
1069 	dhp->dh_removing = B_TRUE;
1070 	while (dhp->dh_ref != 0)
1071 		cv_wait(&dhp->dh_cv, &dhp->dh_lock);
1072 	mutex_exit(&dhp->dh_lock);
1073 
1074 	/*
1075 	 * Walk the list and remove the dld_str_t.
1076 	 */
1077 	for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->ds_next)) {
1078 		if (p == dsp)
1079 			break;
1080 	}
1081 	ASSERT(p != NULL);
1082 	*pp = p->ds_next;
1083 	p->ds_next = NULL;
1084 	p->ds_head = NULL;
1085 
1086 	ASSERT(dlp->dl_impl_count != 0);
1087 	dlp->dl_impl_count--;
1088 
1089 	if (dhp->dh_list == NULL) {
1090 		mod_hash_val_t	val = NULL;
1091 
1092 		/*
1093 		 * The list is empty so remove the hash table entry.
1094 		 */
1095 		(void) mod_hash_remove(hash, dhp->dh_key, &val);
1096 		ASSERT(dhp == (dls_head_t *)val);
1097 		i_dls_head_free(dhp);
1098 	} else {
1099 		mutex_enter(&dhp->dh_lock);
1100 		dhp->dh_removing = B_FALSE;
1101 		mutex_exit(&dhp->dh_lock);
1102 	}
1103 }
1104