xref: /titanic_50/usr/src/uts/common/io/dls/dls_link.c (revision 8d4cf8d8d2965ea43bccdc838f15c18634fee02d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Services Module
28  */
29 
30 #include	<sys/sysmacros.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/strsun.h>
33 #include	<sys/vlan.h>
34 #include	<sys/dld_impl.h>
35 #include	<sys/sdt.h>
36 #include	<sys/atomic.h>
37 
38 static void		dls_bpf_newzone(dls_link_t *dlp, zoneid_t zid);
39 
40 static kmem_cache_t	*i_dls_link_cachep;
41 mod_hash_t		*i_dls_link_hash;
42 static uint_t		i_dls_link_count;
43 
44 #define		LINK_HASHSZ	67	/* prime */
45 #define		IMPL_HASHSZ	67	/* prime */
46 
/*
 * Construct the hash key under which the dls_head_t for a DLSAP value is
 * stored in a link's dl_str_hash: the SAP shifted left by VLAN_ID_SIZE
 * bits. The macro itself does not fold a VLAN identifier into the key.
 */
#define	MAKE_KEY(sap_)							\
	((mod_hash_key_t)(uintptr_t)((sap_) << VLAN_ID_SIZE))
52 
/*
 * Trim excess data from message p. When pktsize (the packet size taken
 * from the parsed MAC header) is non-zero and the message carries more
 * data than that, the difference is handed to adjmsg() as a negative
 * length, which removes it from the end of the message. A pktsize of zero
 * means no trimming is attempted.
 *
 * The macro expands to a single statement, so it is safe in an unbraced
 * if/else. Both arguments are evaluated more than once; do not pass
 * expressions with side effects.
 */
#define	DLS_STRIP_PADDING(pktsize, p) do {				\
	if ((pktsize) != 0) {						\
		ssize_t _dsp_delta = (pktsize) - msgdsize((p));		\
									\
		if (_dsp_delta < 0)					\
			(void) adjmsg((p), _dsp_delta);			\
	}								\
} while (0)
61 
62 /*
63  * Private functions.
64  */
65 
66 /*ARGSUSED*/
67 static int
68 i_dls_link_constructor(void *buf, void *arg, int kmflag)
69 {
70 	dls_link_t	*dlp = buf;
71 	char		name[MAXNAMELEN];
72 
73 	bzero(buf, sizeof (dls_link_t));
74 
75 	(void) snprintf(name, MAXNAMELEN, "dls_link_t_%p_hash", buf);
76 	dlp->dl_str_hash = mod_hash_create_idhash(name, IMPL_HASHSZ,
77 	    mod_hash_null_valdtor);
78 
79 	return (0);
80 }
81 
82 /*ARGSUSED*/
83 static void
84 i_dls_link_destructor(void *buf, void *arg)
85 {
86 	dls_link_t	*dlp = buf;
87 
88 	ASSERT(dlp->dl_ref == 0);
89 	ASSERT(dlp->dl_mh == NULL);
90 	ASSERT(dlp->dl_mah == NULL);
91 	ASSERT(dlp->dl_unknowns == 0);
92 
93 	mod_hash_destroy_idhash(dlp->dl_str_hash);
94 	dlp->dl_str_hash = NULL;
95 
96 }
97 
/*
 * - Parse the mac header information of the given packet.
 * - Strip the padding and skip over the header. Note that because some
 *   DLS consumers only check the db_ref count of the first mblk, we
 *   pullup the message into a single mblk. Because the original message
 *   is freed as the result of message pulling up, mac_vlan_header_info()
 *   is called again to update the mhi_saddr and mhi_daddr pointers in the
 *   mhip. Further, the mac_vlan_header_info() function ensures that the
 *   size of the pulled message is greater than the MAC header size,
 *   therefore we can directly advance b_rptr to point at the payload.
 *
 * Arguments:
 *   mh   - MAC handle passed to mac_vlan_header_info().
 *   mp   - packet to prepare; must be a modifiable lvalue, because it is
 *          reassigned when the message is pulled up. Its b_next linkage
 *          is carried over to the replacement message.
 *   mhip - mac_header_info_t to fill in.
 *   err  - lvalue receiving 0 on success, the mac_vlan_header_info()
 *          error if header parsing fails, or EINVAL if msgpullup() fails.
 *          On failure b_rptr is not advanced.
 *
 * We choose to use a macro for performance reasons. The macro expands to
 * a single statement and its internal variables carry a _dpp_ prefix so
 * they cannot capture identifiers used in the caller's arguments. The
 * arguments are evaluated more than once.
 */
#define	DLS_PREPARE_PKT(mh, mp, mhip, err) do {				\
	mblk_t *_dpp_nextp = (mp)->b_next;				\
	if (((err) = mac_vlan_header_info((mh), (mp), (mhip))) == 0) {	\
		DLS_STRIP_PADDING((mhip)->mhi_pktsize, (mp));		\
		if (MBLKL((mp)) < (mhip)->mhi_hdrsize) {		\
			mblk_t *_dpp_newmp;				\
			if ((_dpp_newmp = msgpullup((mp), -1)) == NULL) { \
				(err) = EINVAL;				\
			} else {					\
				(mp)->b_next = NULL;			\
				freemsg((mp));				\
				(mp) = _dpp_newmp;			\
				VERIFY(mac_vlan_header_info((mh),	\
				    (mp), (mhip)) == 0);		\
				(mp)->b_next = _dpp_nextp;		\
				(mp)->b_rptr += (mhip)->mhi_hdrsize;	\
			}						\
		} else {						\
			(mp)->b_rptr += (mhip)->mhi_hdrsize;		\
		}							\
	}								\
} while (0)
133 
134 /*
135  * Truncate the chain starting at mp such that all packets in the chain
136  * have identical source and destination addresses, saps, and tag types
137  * (see below).  It returns a pointer to the mblk following the chain,
138  * NULL if there is no further packet following the processed chain.
139  * The countp argument is set to the number of valid packets in the chain.
140  * Note that the whole MAC header (including the VLAN tag if any) in each
141  * packet will be stripped.
142  */
143 static mblk_t *
144 i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
145     uint_t *countp)
146 {
147 	mblk_t		*prevp;
148 	uint_t		npacket = 1;
149 	size_t		addr_size = dlp->dl_mip->mi_addr_length;
150 	uint16_t	vid = VLAN_ID(mhip->mhi_tci);
151 	uint16_t	pri = VLAN_PRI(mhip->mhi_tci);
152 
153 	/*
154 	 * Compare with subsequent headers until we find one that has
155 	 * differing header information. After checking each packet
156 	 * strip padding and skip over the header.
157 	 */
158 	for (prevp = mp; (mp = mp->b_next) != NULL; prevp = mp) {
159 		mac_header_info_t cmhi;
160 		uint16_t cvid, cpri;
161 		int err;
162 
163 		DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err);
164 		if (err != 0)
165 			break;
166 
167 		prevp->b_next = mp;
168 
169 		/*
170 		 * The source, destination, sap, vlan tag must all match in
171 		 * a given subchain.
172 		 */
173 		if (mhip->mhi_saddr == NULL || cmhi.mhi_saddr == NULL ||
174 		    memcmp(mhip->mhi_daddr, cmhi.mhi_daddr, addr_size) != 0 ||
175 		    memcmp(mhip->mhi_saddr, cmhi.mhi_saddr, addr_size) != 0 ||
176 		    mhip->mhi_bindsap != cmhi.mhi_bindsap) {
177 			/*
178 			 * Note that we don't need to restore the padding.
179 			 */
180 			mp->b_rptr -= cmhi.mhi_hdrsize;
181 			break;
182 		}
183 
184 		cvid = VLAN_ID(cmhi.mhi_tci);
185 		cpri = VLAN_PRI(cmhi.mhi_tci);
186 
187 		/*
188 		 * There are several types of packets. Packets don't match
189 		 * if they are classified to different type or if they are
190 		 * VLAN packets but belong to different VLANs:
191 		 *
192 		 * packet type		tagged		vid		pri
193 		 * ---------------------------------------------------------
194 		 * untagged		No		zero		zero
195 		 * VLAN packets		Yes		non-zero	-
196 		 * priority tagged	Yes		zero		non-zero
197 		 * 0 tagged		Yes		zero		zero
198 		 */
199 		if ((mhip->mhi_istagged != cmhi.mhi_istagged) ||
200 		    (vid != cvid) || ((vid == VLAN_ID_NONE) &&
201 		    (((pri == 0) && (cpri != 0)) ||
202 		    ((pri != 0) && (cpri == 0))))) {
203 			mp->b_rptr -= cmhi.mhi_hdrsize;
204 			break;
205 		}
206 
207 		npacket++;
208 	}
209 
210 	/*
211 	 * Break the chain at this point and return a pointer to the next
212 	 * sub-chain.
213 	 */
214 	prevp->b_next = NULL;
215 	*countp = npacket;
216 	return (mp);
217 }
218 
219 /* ARGSUSED */
220 static int
221 i_dls_head_hold(mod_hash_key_t key, mod_hash_val_t val)
222 {
223 	dls_head_t *dhp = (dls_head_t *)val;
224 
225 	/*
226 	 * The lock order is  mod_hash's internal lock -> dh_lock as in the
227 	 * call to i_dls_link_rx -> mod_hash_find_cb_rval -> i_dls_head_hold
228 	 */
229 	mutex_enter(&dhp->dh_lock);
230 	if (dhp->dh_removing) {
231 		mutex_exit(&dhp->dh_lock);
232 		return (-1);
233 	}
234 	dhp->dh_ref++;
235 	mutex_exit(&dhp->dh_lock);
236 	return (0);
237 }
238 
239 void
240 i_dls_head_rele(dls_head_t *dhp)
241 {
242 	mutex_enter(&dhp->dh_lock);
243 	dhp->dh_ref--;
244 	if (dhp->dh_ref == 0 && dhp->dh_removing != 0)
245 		cv_broadcast(&dhp->dh_cv);
246 	mutex_exit(&dhp->dh_lock);
247 }
248 
249 static dls_head_t *
250 i_dls_head_alloc(mod_hash_key_t key)
251 {
252 	dls_head_t	*dhp;
253 
254 	dhp = kmem_zalloc(sizeof (dls_head_t), KM_SLEEP);
255 	dhp->dh_key = key;
256 	return (dhp);
257 }
258 
259 static void
260 i_dls_head_free(dls_head_t *dhp)
261 {
262 	ASSERT(dhp->dh_ref == 0);
263 	kmem_free(dhp, sizeof (dls_head_t));
264 }
265 
266 /*
267  * Try to send mp up to the streams of the given sap and vid. Return B_TRUE
268  * if this message is sent to any streams.
269  * Note that this function will copy the message chain and the original
270  * mp will remain valid after this function
271  */
272 static uint_t
273 i_dls_link_rx_func(dls_link_t *dlp, mac_resource_handle_t mrh,
274     mac_header_info_t *mhip, mblk_t *mp, uint32_t sap,
275     boolean_t (*acceptfunc)())
276 {
277 	mod_hash_t	*hash = dlp->dl_str_hash;
278 	mod_hash_key_t	key;
279 	dls_head_t	*dhp;
280 	dld_str_t	*dsp;
281 	mblk_t		*nmp;
282 	dls_rx_t	ds_rx;
283 	void		*ds_rx_arg;
284 	uint_t		naccepted = 0;
285 	int		rval;
286 
287 	/*
288 	 * Construct a hash key from the VLAN identifier and the
289 	 * DLSAP that represents dld_str_t in promiscuous mode.
290 	 */
291 	key = MAKE_KEY(sap);
292 
293 	/*
294 	 * Search the hash table for dld_str_t eligible to receive
295 	 * a packet chain for this DLSAP/VLAN combination. The mod hash's
296 	 * internal lock serializes find/insert/remove from the mod hash list.
297 	 * Incrementing the dh_ref (while holding the mod hash lock) ensures
298 	 * dls_link_remove will wait for the upcall to finish.
299 	 */
300 	if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
301 	    i_dls_head_hold, &rval) != 0 || (rval != 0)) {
302 		return (B_FALSE);
303 	}
304 
305 	/*
306 	 * Find dld_str_t that will accept the sub-chain.
307 	 */
308 	for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) {
309 		if (!acceptfunc(dsp, mhip, &ds_rx, &ds_rx_arg))
310 			continue;
311 
312 		/*
313 		 * We have at least one acceptor.
314 		 */
315 		naccepted++;
316 
317 		/*
318 		 * There will normally be at least more dld_str_t
319 		 * (since we've yet to check for non-promiscuous
320 		 * dld_str_t) so dup the sub-chain.
321 		 */
322 		if ((nmp = copymsgchain(mp)) != NULL)
323 			ds_rx(ds_rx_arg, mrh, nmp, mhip);
324 	}
325 
326 	/*
327 	 * Release the hold on the dld_str_t chain now that we have
328 	 * finished walking it.
329 	 */
330 	i_dls_head_rele(dhp);
331 	return (naccepted);
332 }
333 
334 /* ARGSUSED */
335 void
336 i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
337     boolean_t loopback)
338 {
339 	dls_link_t			*dlp = arg;
340 	mod_hash_t			*hash = dlp->dl_str_hash;
341 	mblk_t				*nextp;
342 	mac_header_info_t		mhi;
343 	dls_head_t			*dhp;
344 	dld_str_t			*dsp;
345 	dld_str_t			*ndsp;
346 	mblk_t				*nmp;
347 	mod_hash_key_t			key;
348 	uint_t				npacket;
349 	boolean_t			accepted;
350 	dls_rx_t			ds_rx, nds_rx;
351 	void				*ds_rx_arg, *nds_rx_arg;
352 	uint16_t			vid;
353 	int				err, rval;
354 
355 	/*
356 	 * Walk the packet chain.
357 	 */
358 	for (; mp != NULL; mp = nextp) {
359 		/*
360 		 * Wipe the accepted state.
361 		 */
362 		accepted = B_FALSE;
363 
364 		DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
365 		if (err != 0) {
366 			atomic_add_32(&(dlp->dl_unknowns), 1);
367 			nextp = mp->b_next;
368 			mp->b_next = NULL;
369 			freemsg(mp);
370 			continue;
371 		}
372 
373 		/*
374 		 * Grab the longest sub-chain we can process as a single
375 		 * unit.
376 		 */
377 		nextp = i_dls_link_subchain(dlp, mp, &mhi, &npacket);
378 		ASSERT(npacket != 0);
379 
380 		vid = VLAN_ID(mhi.mhi_tci);
381 
382 		if (mhi.mhi_istagged) {
383 			/*
384 			 * If it is tagged traffic, send it upstream to
385 			 * all dld_str_t which are attached to the physical
386 			 * link and bound to SAP 0x8100.
387 			 */
388 			if (i_dls_link_rx_func(dlp, mrh, &mhi, mp,
389 			    ETHERTYPE_VLAN, dls_accept) > 0) {
390 				accepted = B_TRUE;
391 			}
392 
393 			/*
394 			 * Don't pass the packets up if they are tagged
395 			 * packets and:
396 			 *  - their VID and priority are both zero and the
397 			 *    original packet isn't using the PVID (invalid
398 			 *    packets).
399 			 *  - their sap is ETHERTYPE_VLAN and their VID is
400 			 *    zero as they have already been sent upstreams.
401 			 */
402 			if ((vid == VLAN_ID_NONE && !mhi.mhi_ispvid &&
403 			    VLAN_PRI(mhi.mhi_tci) == 0) ||
404 			    (mhi.mhi_bindsap == ETHERTYPE_VLAN &&
405 			    vid == VLAN_ID_NONE)) {
406 				freemsgchain(mp);
407 				goto loop;
408 			}
409 		}
410 
411 		/*
412 		 * Construct a hash key from the VLAN identifier and the
413 		 * DLSAP.
414 		 */
415 		key = MAKE_KEY(mhi.mhi_bindsap);
416 
417 		/*
418 		 * Search the has table for dld_str_t eligible to receive
419 		 * a packet chain for this DLSAP/VLAN combination.
420 		 */
421 		if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
422 		    i_dls_head_hold, &rval) != 0 || (rval != 0)) {
423 			freemsgchain(mp);
424 			goto loop;
425 		}
426 
427 		/*
428 		 * Find the first dld_str_t that will accept the sub-chain.
429 		 */
430 		for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next)
431 			if (dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
432 				break;
433 
434 		/*
435 		 * If we did not find any dld_str_t willing to accept the
436 		 * sub-chain then throw it away.
437 		 */
438 		if (dsp == NULL) {
439 			i_dls_head_rele(dhp);
440 			freemsgchain(mp);
441 			goto loop;
442 		}
443 
444 		/*
445 		 * We have at least one acceptor.
446 		 */
447 		accepted = B_TRUE;
448 		for (;;) {
449 			/*
450 			 * Find the next dld_str_t that will accept the
451 			 * sub-chain.
452 			 */
453 			for (ndsp = dsp->ds_next; ndsp != NULL;
454 			    ndsp = ndsp->ds_next)
455 				if (dls_accept(ndsp, &mhi, &nds_rx,
456 				    &nds_rx_arg))
457 					break;
458 
459 			/*
460 			 * If there are no more dld_str_t that are willing
461 			 * to accept the sub-chain then we don't need to dup
462 			 * it before handing it to the current one.
463 			 */
464 			if (ndsp == NULL) {
465 				ds_rx(ds_rx_arg, mrh, mp, &mhi);
466 
467 				/*
468 				 * Since there are no more dld_str_t, we're
469 				 * done.
470 				 */
471 				break;
472 			}
473 
474 			/*
475 			 * There are more dld_str_t so dup the sub-chain.
476 			 */
477 			if ((nmp = copymsgchain(mp)) != NULL)
478 				ds_rx(ds_rx_arg, mrh, nmp, &mhi);
479 
480 			dsp = ndsp;
481 			ds_rx = nds_rx;
482 			ds_rx_arg = nds_rx_arg;
483 		}
484 
485 		/*
486 		 * Release the hold on the dld_str_t chain now that we have
487 		 * finished walking it.
488 		 */
489 		i_dls_head_rele(dhp);
490 
491 loop:
492 		/*
493 		 * If there were no acceptors then add the packet count to the
494 		 * 'unknown' count.
495 		 */
496 		if (!accepted)
497 			atomic_add_32(&(dlp->dl_unknowns), npacket);
498 	}
499 }
500 
501 /* ARGSUSED */
502 void
503 dls_rx_vlan_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
504     boolean_t loopback)
505 {
506 	dld_str_t			*dsp = arg;
507 	dls_link_t			*dlp = dsp->ds_dlp;
508 	mac_header_info_t		mhi;
509 	dls_rx_t			ds_rx;
510 	void				*ds_rx_arg;
511 	int				err;
512 
513 	DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
514 	if (err != 0)
515 		goto drop;
516 
517 	/*
518 	 * If there is promiscuous handle for vlan, we filter out the untagged
519 	 * pkts and pkts that are not for the primary unicast address.
520 	 */
521 	if (dsp->ds_vlan_mph != NULL) {
522 		uint8_t prim_addr[MAXMACADDRLEN];
523 		size_t	addr_length = dsp->ds_mip->mi_addr_length;
524 
525 		if (!(mhi.mhi_istagged))
526 			goto drop;
527 		ASSERT(dsp->ds_mh != NULL);
528 		mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)prim_addr);
529 		if (memcmp(mhi.mhi_daddr, prim_addr, addr_length) != 0)
530 			goto drop;
531 
532 		if (!dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
533 			goto drop;
534 
535 		ds_rx(ds_rx_arg, NULL, mp, &mhi);
536 		return;
537 	}
538 
539 drop:
540 	atomic_add_32(&dlp->dl_unknowns, 1);
541 	freemsg(mp);
542 }
543 
544 /* ARGSUSED */
545 void
546 dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
547     boolean_t loopback)
548 {
549 	dld_str_t			*dsp = arg;
550 	dls_link_t			*dlp = dsp->ds_dlp;
551 	mac_header_info_t		mhi;
552 	dls_rx_t			ds_rx;
553 	void				*ds_rx_arg;
554 	int				err;
555 	dls_head_t			*dhp;
556 	mod_hash_key_t			key;
557 
558 	DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
559 	if (err != 0)
560 		goto drop;
561 
562 	/*
563 	 * In order to filter out sap pkt that no dls channel listens, search
564 	 * the hash table trying to find a dld_str_t eligible to receive the pkt
565 	 */
566 	if ((dsp->ds_promisc & DLS_PROMISC_SAP) == 0) {
567 		key = MAKE_KEY(mhi.mhi_bindsap);
568 		if (mod_hash_find(dsp->ds_dlp->dl_str_hash, key,
569 		    (mod_hash_val_t *)&dhp) != 0)
570 			goto drop;
571 	}
572 
573 	if (!dls_accept_promisc(dsp, &mhi, &ds_rx, &ds_rx_arg, loopback))
574 		goto drop;
575 
576 	ds_rx(ds_rx_arg, NULL, mp, &mhi);
577 	return;
578 
579 drop:
580 	atomic_add_32(&dlp->dl_unknowns, 1);
581 	freemsg(mp);
582 }
583 
584 static void
585 i_dls_link_destroy(dls_link_t *dlp)
586 {
587 	ASSERT(dlp->dl_nactive == 0);
588 	ASSERT(dlp->dl_impl_count == 0);
589 	ASSERT(dlp->dl_zone_ref == 0);
590 
591 	/*
592 	 * Free the structure back to the cache.
593 	 */
594 	if (dlp->dl_mch != NULL)
595 		mac_client_close(dlp->dl_mch, 0);
596 
597 	if (dlp->dl_mh != NULL) {
598 		ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
599 		mac_close(dlp->dl_mh);
600 	}
601 
602 	dlp->dl_mh = NULL;
603 	dlp->dl_mch = NULL;
604 	dlp->dl_mip = NULL;
605 	dlp->dl_unknowns = 0;
606 	dlp->dl_nonip_cnt = 0;
607 	kmem_cache_free(i_dls_link_cachep, dlp);
608 }
609 
610 static int
611 i_dls_link_create(const char *name, dls_link_t **dlpp)
612 {
613 	dls_link_t		*dlp;
614 	int			err;
615 
616 	/*
617 	 * Allocate a new dls_link_t structure.
618 	 */
619 	dlp = kmem_cache_alloc(i_dls_link_cachep, KM_SLEEP);
620 
621 	/*
622 	 * Name the dls_link_t after the MAC interface it represents.
623 	 */
624 	(void) strlcpy(dlp->dl_name, name, sizeof (dlp->dl_name));
625 
626 	/*
627 	 * First reference; hold open the MAC interface.
628 	 */
629 	ASSERT(dlp->dl_mh == NULL);
630 	err = mac_open(dlp->dl_name, &dlp->dl_mh);
631 	if (err != 0)
632 		goto bail;
633 
634 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
635 	dlp->dl_mip = mac_info(dlp->dl_mh);
636 
637 	/* DLS is the "primary" MAC client */
638 	ASSERT(dlp->dl_mch == NULL);
639 
640 	err = mac_client_open(dlp->dl_mh, &dlp->dl_mch, NULL,
641 	    MAC_OPEN_FLAGS_USE_DATALINK_NAME);
642 	if (err != 0)
643 		goto bail;
644 
645 	DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *,
646 	    dlp->dl_mch);
647 
648 	*dlpp = dlp;
649 	return (0);
650 
651 bail:
652 	i_dls_link_destroy(dlp);
653 	return (err);
654 }
655 
656 /*
657  * Module initialization functions.
658  */
659 
660 void
661 dls_link_init(void)
662 {
663 	/*
664 	 * Create a kmem_cache of dls_link_t structures.
665 	 */
666 	i_dls_link_cachep = kmem_cache_create("dls_link_cache",
667 	    sizeof (dls_link_t), 0, i_dls_link_constructor,
668 	    i_dls_link_destructor, NULL, NULL, NULL, 0);
669 	ASSERT(i_dls_link_cachep != NULL);
670 
671 	/*
672 	 * Create a dls_link_t hash table and associated lock.
673 	 */
674 	i_dls_link_hash = mod_hash_create_extended("dls_link_hash",
675 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
676 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
677 	i_dls_link_count = 0;
678 }
679 
680 int
681 dls_link_fini(void)
682 {
683 	if (i_dls_link_count > 0)
684 		return (EBUSY);
685 
686 	/*
687 	 * Destroy the kmem_cache.
688 	 */
689 	kmem_cache_destroy(i_dls_link_cachep);
690 
691 	/*
692 	 * Destroy the hash table and associated lock.
693 	 */
694 	mod_hash_destroy_hash(i_dls_link_hash);
695 	return (0);
696 }
697 
698 /*
699  * Exported functions.
700  */
701 
702 static int
703 dls_link_hold_common(const char *name, dls_link_t **dlpp, boolean_t create)
704 {
705 	dls_link_t		*dlp;
706 	int			err;
707 
708 	/*
709 	 * Look up a dls_link_t corresponding to the given macname in the
710 	 * global hash table. The i_dls_link_hash itself is protected by the
711 	 * mod_hash package's internal lock which synchronizes
712 	 * find/insert/remove into the global mod_hash list. Assumes that
713 	 * inserts and removes are single threaded on a per mac end point
714 	 * by the mac perimeter.
715 	 */
716 	if ((err = mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
717 	    (mod_hash_val_t *)&dlp)) == 0)
718 		goto done;
719 
720 	ASSERT(err == MH_ERR_NOTFOUND);
721 	if (!create)
722 		return (ENOENT);
723 
724 	/*
725 	 * We didn't find anything so we need to create one.
726 	 */
727 	if ((err = i_dls_link_create(name, &dlp)) != 0)
728 		return (err);
729 
730 	/*
731 	 * Insert the dls_link_t.
732 	 */
733 	err = mod_hash_insert(i_dls_link_hash, (mod_hash_key_t)dlp->dl_name,
734 	    (mod_hash_val_t)dlp);
735 	ASSERT(err == 0);
736 
737 	atomic_add_32(&i_dls_link_count, 1);
738 	ASSERT(i_dls_link_count != 0);
739 
740 done:
741 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
742 	/*
743 	 * Bump the reference count and hand back the reference.
744 	 */
745 	dlp->dl_ref++;
746 	*dlpp = dlp;
747 	return (0);
748 }
749 
750 int
751 dls_link_hold_create(const char *name, dls_link_t **dlpp)
752 {
753 	return (dls_link_hold_common(name, dlpp, B_TRUE));
754 }
755 
756 int
757 dls_link_hold(const char *name, dls_link_t **dlpp)
758 {
759 	return (dls_link_hold_common(name, dlpp, B_FALSE));
760 }
761 
762 dev_info_t *
763 dls_link_devinfo(dev_t dev)
764 {
765 	dls_link_t	*dlp;
766 	dev_info_t	*dip;
767 	char	macname[MAXNAMELEN];
768 	char	*drv;
769 	mac_perim_handle_t	mph;
770 
771 	if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
772 		return (NULL);
773 	(void) snprintf(macname, MAXNAMELEN, "%s%d", drv,
774 	    DLS_MINOR2INST(getminor(dev)));
775 
776 	/*
777 	 * The code below assumes that the name constructed above is the
778 	 * macname. This is not the case for legacy devices. Currently this
779 	 * is ok because this function is only called in the getinfo(9e) path,
780 	 * which for a legacy device would directly end up in the driver's
781 	 * getinfo, rather than here
782 	 */
783 	if (mac_perim_enter_by_macname(macname, &mph) != 0)
784 		return (NULL);
785 
786 	if (dls_link_hold(macname, &dlp) != 0) {
787 		mac_perim_exit(mph);
788 		return (NULL);
789 	}
790 
791 	dip = mac_devinfo_get(dlp->dl_mh);
792 	dls_link_rele(dlp);
793 	mac_perim_exit(mph);
794 
795 	return (dip);
796 }
797 
798 dev_t
799 dls_link_dev(dls_link_t *dlp)
800 {
801 	return (makedevice(ddi_driver_major(mac_devinfo_get(dlp->dl_mh)),
802 	    mac_minor(dlp->dl_mh)));
803 }
804 
805 void
806 dls_link_rele(dls_link_t *dlp)
807 {
808 	mod_hash_val_t	val;
809 
810 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
811 	/*
812 	 * Check if there are any more references.
813 	 */
814 	if (--dlp->dl_ref == 0) {
815 		(void) mod_hash_remove(i_dls_link_hash,
816 		    (mod_hash_key_t)dlp->dl_name, &val);
817 		ASSERT(dlp == (dls_link_t *)val);
818 
819 		/*
820 		 * Destroy the dls_link_t.
821 		 */
822 		i_dls_link_destroy(dlp);
823 		ASSERT(i_dls_link_count > 0);
824 		atomic_add_32(&i_dls_link_count, -1);
825 	}
826 }
827 
828 int
829 dls_link_rele_by_name(const char *name)
830 {
831 	dls_link_t		*dlp;
832 
833 	if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
834 	    (mod_hash_val_t *)&dlp) != 0)
835 		return (ENOENT);
836 
837 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
838 
839 	/*
840 	 * Must fail detach if mac client is busy.
841 	 */
842 	ASSERT(dlp->dl_ref > 0 && dlp->dl_mch != NULL);
843 	if (mac_link_has_flows(dlp->dl_mch))
844 		return (ENOTEMPTY);
845 
846 	dls_link_rele(dlp);
847 	return (0);
848 }
849 
850 int
851 dls_link_setzid(const char *name, zoneid_t zid)
852 {
853 	dls_link_t	*dlp;
854 	int		err = 0;
855 	zoneid_t	old_zid;
856 
857 	if ((err = dls_link_hold_create(name, &dlp)) != 0)
858 		return (err);
859 
860 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
861 
862 	if ((old_zid = dlp->dl_zid) == zid)
863 		goto done;
864 
865 	/*
866 	 * Check whether this dlp is used by its own zone.  If yes, we cannot
867 	 * change its zoneid.
868 	 */
869 	if (dlp->dl_zone_ref != 0) {
870 		err = EBUSY;
871 		goto done;
872 	}
873 
874 	dls_bpf_newzone(dlp, zid);
875 	dlp->dl_zid = zid;
876 
877 	if (zid == GLOBAL_ZONEID) {
878 		/*
879 		 * The link is moving from a non-global zone to the global
880 		 * zone, so we need to release the reference that was held
881 		 * when the link was originally assigned to the non-global
882 		 * zone.
883 		 */
884 		dls_link_rele(dlp);
885 	}
886 
887 done:
888 	/*
889 	 * We only keep the reference to this link open if the link has
890 	 * successfully moved from the global zone to a non-global zone.
891 	 */
892 	if (err != 0 || old_zid != GLOBAL_ZONEID)
893 		dls_link_rele(dlp);
894 	return (err);
895 }
896 
897 
898 /*
899  * When a NIC changes zone, that change needs to be communicated to BPF
900  * so that it can correctly enforce access rights on it via BPF. In the
901  * absence of a function from BPF to just change the zoneid, this is
902  * done with a detach followed by an attach.
903  */
904 static void
905 dls_bpf_newzone(dls_link_t *dlp, zoneid_t zid)
906 {
907 	if (dls_bpfdetach_fn != NULL)
908 		dls_bpfdetach_fn((uintptr_t)dlp->dl_mh);
909 
910 	if (dls_bpfattach_fn != NULL)
911 		dls_bpfattach_fn((uintptr_t)dlp->dl_mh, mac_type(dlp->dl_mh),
912 		    zid, BPR_MAC);
913 }
914 
915 int
916 dls_link_getzid(const char *name, zoneid_t *zidp)
917 {
918 	dls_link_t	*dlp;
919 	int		err = 0;
920 
921 	if ((err = dls_link_hold(name, &dlp)) != 0)
922 		return (err);
923 
924 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
925 
926 	*zidp = dlp->dl_zid;
927 
928 	dls_link_rele(dlp);
929 	return (0);
930 }
931 
932 void
933 dls_link_add(dls_link_t *dlp, uint32_t sap, dld_str_t *dsp)
934 {
935 	mod_hash_t	*hash = dlp->dl_str_hash;
936 	mod_hash_key_t	key;
937 	dls_head_t	*dhp;
938 	dld_str_t	*p;
939 	int		err;
940 
941 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
942 
943 	/*
944 	 * Generate a hash key based on the sap.
945 	 */
946 	key = MAKE_KEY(sap);
947 
948 	/*
949 	 * Search the table for a list head with this key.
950 	 */
951 	if ((err = mod_hash_find(hash, key, (mod_hash_val_t *)&dhp)) != 0) {
952 		ASSERT(err == MH_ERR_NOTFOUND);
953 
954 		dhp = i_dls_head_alloc(key);
955 		err = mod_hash_insert(hash, key, (mod_hash_val_t)dhp);
956 		ASSERT(err == 0);
957 	}
958 
959 	/*
960 	 * Add the dld_str_t to the head of the list. List walkers in
961 	 * i_dls_link_rx_* bump up dh_ref to ensure the list does not change
962 	 * while they walk the list. The membar below ensures that list walkers
963 	 * see exactly the old list or the new list.
964 	 */
965 	ASSERT(dsp->ds_next == NULL);
966 	p = dhp->dh_list;
967 	dsp->ds_next = p;
968 
969 	membar_producer();
970 
971 	dhp->dh_list = dsp;
972 
973 	/*
974 	 * Save a pointer to the list head.
975 	 */
976 	dsp->ds_head = dhp;
977 	dlp->dl_impl_count++;
978 }
979 
980 void
981 dls_link_remove(dls_link_t *dlp, dld_str_t *dsp)
982 {
983 	mod_hash_t	*hash = dlp->dl_str_hash;
984 	dld_str_t	**pp;
985 	dld_str_t	*p;
986 	dls_head_t	*dhp;
987 
988 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
989 
990 	/*
991 	 * We set dh_removing here to tell the receive callbacks not to pass
992 	 * up packets anymore. Then wait till the current callbacks are done.
993 	 * This happens either in the close path or in processing the
994 	 * DL_UNBIND_REQ via a taskq thread, and it is ok to cv_wait in either.
995 	 * The dh_ref ensures there aren't and there won't be any upcalls
996 	 * walking or using the dh_list. The mod hash internal lock ensures
997 	 * that the insert/remove of the dls_head_t itself synchronizes with
998 	 * any i_dls_link_rx trying to locate it. The perimeter ensures that
999 	 * there isn't another simultaneous dls_link_add/remove.
1000 	 */
1001 	dhp = dsp->ds_head;
1002 
1003 	mutex_enter(&dhp->dh_lock);
1004 	dhp->dh_removing = B_TRUE;
1005 	while (dhp->dh_ref != 0)
1006 		cv_wait(&dhp->dh_cv, &dhp->dh_lock);
1007 	mutex_exit(&dhp->dh_lock);
1008 
1009 	/*
1010 	 * Walk the list and remove the dld_str_t.
1011 	 */
1012 	for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->ds_next)) {
1013 		if (p == dsp)
1014 			break;
1015 	}
1016 	ASSERT(p != NULL);
1017 	*pp = p->ds_next;
1018 	p->ds_next = NULL;
1019 	p->ds_head = NULL;
1020 
1021 	ASSERT(dlp->dl_impl_count != 0);
1022 	dlp->dl_impl_count--;
1023 
1024 	if (dhp->dh_list == NULL) {
1025 		mod_hash_val_t	val = NULL;
1026 
1027 		/*
1028 		 * The list is empty so remove the hash table entry.
1029 		 */
1030 		(void) mod_hash_remove(hash, dhp->dh_key, &val);
1031 		ASSERT(dhp == (dls_head_t *)val);
1032 		i_dls_head_free(dhp);
1033 	} else {
1034 		mutex_enter(&dhp->dh_lock);
1035 		dhp->dh_removing = B_FALSE;
1036 		mutex_exit(&dhp->dh_lock);
1037 	}
1038 }
1039