xref: /illumos-gate/usr/src/uts/common/io/dls/dls_link.c (revision 6915124bb75bc67ae012532a3a7727adc043a7cb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Services Module
28  */
29 
30 #include	<sys/sysmacros.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/strsun.h>
33 #include	<sys/vlan.h>
34 #include	<sys/dld_impl.h>
35 #include	<sys/sdt.h>
36 #include	<sys/atomic.h>
37 
/*
 * kmem cache from which every dls_link_t is allocated; created in
 * dls_link_init() and destroyed in dls_link_fini().
 */
static kmem_cache_t	*i_dls_link_cachep;
/*
 * Global table of dls_link_t structures.  Entries are inserted and looked
 * up using the link name string (dl_name) as the key.
 */
mod_hash_t		*i_dls_link_hash;
/*
 * Number of dls_link_t structures currently inserted in i_dls_link_hash;
 * dls_link_fini() refuses to run while this is non-zero.
 */
static uint_t		i_dls_link_count;

/*
 * Sizes handed to the mod_hash creation routines for the tables built in
 * this file.
 */
#define		LINK_HASHSZ	67	/* prime */
#define		IMPL_HASHSZ	67	/* prime */
44 
/*
 * Construct a mod_hash key from a DLSAP value by shifting it left by
 * VLAN_ID_SIZE bits.  Only the SAP contributes to the key: no VLAN
 * identifier is folded in, so the low VLAN_ID_SIZE bits are always zero.
 */
#define	MAKE_KEY(_sap)						\
	((mod_hash_key_t)(uintptr_t)((_sap) << VLAN_ID_SIZE))
50 
/*
 * DLS_STRIP_PADDING(pktsize, p)
 *
 * If pktsize is non-zero and the message p currently holds more than
 * pktsize bytes of data (as reported by msgdsize()), trim the surplus
 * bytes from the tail of the message with adjmsg().  A pktsize of zero
 * means "size unknown" and leaves the message untouched.
 *
 * Both arguments are fully parenthesized and the body is wrapped in
 * do { } while (0) so the macro behaves like a single statement, e.g. in
 * an unbraced if/else.  Note that pktsize and p are each evaluated more
 * than once, so they must be free of side effects.
 */
#define	DLS_STRIP_PADDING(pktsize, p) do {				\
	if ((pktsize) != 0) {						\
		ssize_t _dsp_delta = (pktsize) - msgdsize(p);		\
									\
		/* A negative delta tells adjmsg() to trim the tail. */	\
		if (_dsp_delta < 0)					\
			(void) adjmsg((p), _dsp_delta);			\
	}								\
} while (0)
59 
60 /*
61  * Private functions.
62  */
63 
64 /*ARGSUSED*/
65 static int
66 i_dls_link_constructor(void *buf, void *arg, int kmflag)
67 {
68 	dls_link_t	*dlp = buf;
69 	char		name[MAXNAMELEN];
70 
71 	bzero(buf, sizeof (dls_link_t));
72 
73 	(void) snprintf(name, MAXNAMELEN, "dls_link_t_%p_hash", buf);
74 	dlp->dl_str_hash = mod_hash_create_idhash(name, IMPL_HASHSZ,
75 	    mod_hash_null_valdtor);
76 
77 	return (0);
78 }
79 
80 /*ARGSUSED*/
81 static void
82 i_dls_link_destructor(void *buf, void *arg)
83 {
84 	dls_link_t	*dlp = buf;
85 
86 	ASSERT(dlp->dl_ref == 0);
87 	ASSERT(dlp->dl_mh == NULL);
88 	ASSERT(dlp->dl_mah == NULL);
89 	ASSERT(dlp->dl_unknowns == 0);
90 
91 	mod_hash_destroy_idhash(dlp->dl_str_hash);
92 	dlp->dl_str_hash = NULL;
93 
94 }
95 
/*
 * DLS_PREPARE_PKT(dlp, mp, mhip, err)
 *
 * - Parse the MAC header of packet mp into *mhip using
 *   dls_link_header_info(); its return value is stored in err.
 * - On success, strip trailing padding and advance b_rptr past the MAC
 *   header so that it points at the payload.
 * - If the first mblk is shorter than the MAC header, the whole message is
 *   pulled up into a single mblk first (some DLS consumers only check the
 *   db_ref count of the first mblk).  The original message is freed by
 *   this step, so dls_link_header_info() is run again to refresh the
 *   pointers held in *mhip, and the saved b_next link is re-attached to
 *   the new message.  If the pullup fails, err is set to EINVAL and mp is
 *   left as it was (apart from the padding strip).
 *
 * mp and err must be modifiable lvalues: mp may be replaced by the
 * pulled-up message.  All arguments are evaluated several times and must
 * therefore be free of side effects.
 *
 * This is a macro for performance reasons.  It is wrapped in
 * do { } while (0) so it acts as one statement, and its temporaries carry
 * a _dpp_ prefix so they cannot shadow a caller's own nextp/newmp.
 */
#define	DLS_PREPARE_PKT(dlp, mp, mhip, err) do {			\
	mblk_t *_dpp_nextp = (mp)->b_next;				\
									\
	if (((err) = dls_link_header_info((dlp), (mp), (mhip))) == 0) {	\
		DLS_STRIP_PADDING((mhip)->mhi_pktsize, (mp));		\
		if (MBLKL((mp)) < (mhip)->mhi_hdrsize) {		\
			mblk_t *_dpp_newmp = msgpullup((mp), -1);	\
									\
			if (_dpp_newmp == NULL) {			\
				(err) = EINVAL;				\
			} else {					\
				(mp)->b_next = NULL;			\
				freemsg((mp));				\
				(mp) = _dpp_newmp;			\
				VERIFY(dls_link_header_info((dlp),	\
				    (mp), (mhip)) == 0);		\
				(mp)->b_next = _dpp_nextp;		\
				(mp)->b_rptr += (mhip)->mhi_hdrsize;	\
			}						\
		} else {						\
			(mp)->b_rptr += (mhip)->mhi_hdrsize;		\
		}							\
	}								\
} while (0)
131 
132 /*
133  * Truncate the chain starting at mp such that all packets in the chain
134  * have identical source and destination addresses, saps, and tag types
135  * (see below).  It returns a pointer to the mblk following the chain,
136  * NULL if there is no further packet following the processed chain.
137  * The countp argument is set to the number of valid packets in the chain.
138  * Note that the whole MAC header (including the VLAN tag if any) in each
139  * packet will be stripped.
140  */
141 static mblk_t *
142 i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip,
143     uint_t *countp)
144 {
145 	mblk_t		*prevp;
146 	uint_t		npacket = 1;
147 	size_t		addr_size = dlp->dl_mip->mi_addr_length;
148 	uint16_t	vid = VLAN_ID(mhip->mhi_tci);
149 	uint16_t	pri = VLAN_PRI(mhip->mhi_tci);
150 
151 	/*
152 	 * Compare with subsequent headers until we find one that has
153 	 * differing header information. After checking each packet
154 	 * strip padding and skip over the header.
155 	 */
156 	for (prevp = mp; (mp = mp->b_next) != NULL; prevp = mp) {
157 		mac_header_info_t cmhi;
158 		uint16_t cvid, cpri;
159 		int err;
160 
161 		DLS_PREPARE_PKT(dlp, mp, &cmhi, err);
162 		if (err != 0)
163 			break;
164 
165 		prevp->b_next = mp;
166 
167 		/*
168 		 * The source, destination, sap, vlan id and the MSGNOLOOP
169 		 * flag must all match in a given subchain.
170 		 */
171 		if (memcmp(mhip->mhi_daddr, cmhi.mhi_daddr, addr_size) != 0 ||
172 		    memcmp(mhip->mhi_saddr, cmhi.mhi_saddr, addr_size) != 0 ||
173 		    mhip->mhi_bindsap != cmhi.mhi_bindsap) {
174 			/*
175 			 * Note that we don't need to restore the padding.
176 			 */
177 			mp->b_rptr -= cmhi.mhi_hdrsize;
178 			break;
179 		}
180 
181 		cvid = VLAN_ID(cmhi.mhi_tci);
182 		cpri = VLAN_PRI(cmhi.mhi_tci);
183 
184 		/*
185 		 * There are several types of packets. Packets don't match
186 		 * if they are classified to different type or if they are
187 		 * VLAN packets but belong to different VLANs:
188 		 *
189 		 * packet type		tagged		vid		pri
190 		 * ---------------------------------------------------------
191 		 * untagged		No		zero		zero
192 		 * VLAN packets		Yes		non-zero	-
193 		 * priority tagged	Yes		zero		non-zero
194 		 * 0 tagged		Yes		zero		zero
195 		 */
196 		if ((mhip->mhi_istagged != cmhi.mhi_istagged) ||
197 		    (vid != cvid) || ((vid == VLAN_ID_NONE) &&
198 		    (((pri == 0) && (cpri != 0)) ||
199 		    ((pri != 0) && (cpri == 0))))) {
200 			mp->b_rptr -= cmhi.mhi_hdrsize;
201 			break;
202 		}
203 
204 		npacket++;
205 	}
206 
207 	/*
208 	 * Break the chain at this point and return a pointer to the next
209 	 * sub-chain.
210 	 */
211 	prevp->b_next = NULL;
212 	*countp = npacket;
213 	return (mp);
214 }
215 
216 /* ARGSUSED */
217 static int
218 i_dls_head_hold(mod_hash_key_t key, mod_hash_val_t val)
219 {
220 	dls_head_t *dhp = (dls_head_t *)val;
221 
222 	/*
223 	 * The lock order is  mod_hash's internal lock -> dh_lock as in the
224 	 * call to i_dls_link_rx -> mod_hash_find_cb_rval -> i_dls_head_hold
225 	 */
226 	mutex_enter(&dhp->dh_lock);
227 	if (dhp->dh_removing) {
228 		mutex_exit(&dhp->dh_lock);
229 		return (-1);
230 	}
231 	dhp->dh_ref++;
232 	mutex_exit(&dhp->dh_lock);
233 	return (0);
234 }
235 
236 void
237 i_dls_head_rele(dls_head_t *dhp)
238 {
239 	mutex_enter(&dhp->dh_lock);
240 	dhp->dh_ref--;
241 	if (dhp->dh_ref == 0 && dhp->dh_removing != 0)
242 		cv_broadcast(&dhp->dh_cv);
243 	mutex_exit(&dhp->dh_lock);
244 }
245 
246 static dls_head_t *
247 i_dls_head_alloc(mod_hash_key_t key)
248 {
249 	dls_head_t	*dhp;
250 
251 	dhp = kmem_zalloc(sizeof (dls_head_t), KM_SLEEP);
252 	dhp->dh_key = key;
253 	return (dhp);
254 }
255 
256 static void
257 i_dls_head_free(dls_head_t *dhp)
258 {
259 	ASSERT(dhp->dh_ref == 0);
260 	kmem_free(dhp, sizeof (dls_head_t));
261 }
262 
263 /*
264  * Try to send mp up to the streams of the given sap and vid. Return B_TRUE
265  * if this message is sent to any streams.
266  * Note that this function will copy the message chain and the original
267  * mp will remain valid after this function
268  */
269 static uint_t
270 i_dls_link_rx_func(dls_link_t *dlp, mac_resource_handle_t mrh,
271     mac_header_info_t *mhip, mblk_t *mp, uint32_t sap,
272     boolean_t (*acceptfunc)())
273 {
274 	mod_hash_t	*hash = dlp->dl_str_hash;
275 	mod_hash_key_t	key;
276 	dls_head_t	*dhp;
277 	dld_str_t	*dsp;
278 	mblk_t		*nmp;
279 	dls_rx_t	ds_rx;
280 	void		*ds_rx_arg;
281 	uint_t		naccepted = 0;
282 	int		rval;
283 
284 	/*
285 	 * Construct a hash key from the VLAN identifier and the
286 	 * DLSAP that represents dld_str_t in promiscuous mode.
287 	 */
288 	key = MAKE_KEY(sap);
289 
290 	/*
291 	 * Search the hash table for dld_str_t eligible to receive
292 	 * a packet chain for this DLSAP/VLAN combination. The mod hash's
293 	 * internal lock serializes find/insert/remove from the mod hash list.
294 	 * Incrementing the dh_ref (while holding the mod hash lock) ensures
295 	 * dls_link_remove will wait for the upcall to finish.
296 	 */
297 	if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
298 	    i_dls_head_hold, &rval) != 0 || (rval != 0)) {
299 		return (B_FALSE);
300 	}
301 
302 	/*
303 	 * Find dld_str_t that will accept the sub-chain.
304 	 */
305 	for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) {
306 		if (!acceptfunc(dsp, mhip, &ds_rx, &ds_rx_arg))
307 			continue;
308 
309 		/*
310 		 * We have at least one acceptor.
311 		 */
312 		naccepted++;
313 
314 		/*
315 		 * There will normally be at least more dld_str_t
316 		 * (since we've yet to check for non-promiscuous
317 		 * dld_str_t) so dup the sub-chain.
318 		 */
319 		if ((nmp = copymsgchain(mp)) != NULL)
320 			ds_rx(ds_rx_arg, mrh, nmp, mhip);
321 	}
322 
323 	/*
324 	 * Release the hold on the dld_str_t chain now that we have
325 	 * finished walking it.
326 	 */
327 	i_dls_head_rele(dhp);
328 	return (naccepted);
329 }
330 
331 /* ARGSUSED */
332 void
333 i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
334     boolean_t loopback)
335 {
336 	dls_link_t			*dlp = arg;
337 	mod_hash_t			*hash = dlp->dl_str_hash;
338 	mblk_t				*nextp;
339 	mac_header_info_t		mhi;
340 	dls_head_t			*dhp;
341 	dld_str_t			*dsp;
342 	dld_str_t			*ndsp;
343 	mblk_t				*nmp;
344 	mod_hash_key_t			key;
345 	uint_t				npacket;
346 	boolean_t			accepted;
347 	dls_rx_t			ds_rx, nds_rx;
348 	void				*ds_rx_arg, *nds_rx_arg;
349 	uint16_t			vid;
350 	int				err, rval;
351 
352 	/*
353 	 * Walk the packet chain.
354 	 */
355 	for (; mp != NULL; mp = nextp) {
356 		/*
357 		 * Wipe the accepted state.
358 		 */
359 		accepted = B_FALSE;
360 
361 		DLS_PREPARE_PKT(dlp, mp, &mhi, err);
362 		if (err != 0) {
363 			atomic_add_32(&(dlp->dl_unknowns), 1);
364 			nextp = mp->b_next;
365 			mp->b_next = NULL;
366 			freemsg(mp);
367 			continue;
368 		}
369 
370 		/*
371 		 * Grab the longest sub-chain we can process as a single
372 		 * unit.
373 		 */
374 		nextp = i_dls_link_subchain(dlp, mp, &mhi, &npacket);
375 		ASSERT(npacket != 0);
376 
377 		vid = VLAN_ID(mhi.mhi_tci);
378 
379 		if (mhi.mhi_istagged) {
380 			/*
381 			 * If it is tagged traffic, send it upstream to
382 			 * all dld_str_t which are attached to the physical
383 			 * link and bound to SAP 0x8100.
384 			 */
385 			if (i_dls_link_rx_func(dlp, mrh, &mhi, mp,
386 			    ETHERTYPE_VLAN, dls_accept) > 0) {
387 				accepted = B_TRUE;
388 			}
389 
390 			/*
391 			 * Don't pass the packets up if they are tagged
392 			 * packets and:
393 			 *  - their VID and priority are both zero and the
394 			 *    original packet isn't using the PVID (invalid
395 			 *    packets).
396 			 *  - their sap is ETHERTYPE_VLAN and their VID is
397 			 *    zero as they have already been sent upstreams.
398 			 */
399 			if ((vid == VLAN_ID_NONE && !mhi.mhi_ispvid &&
400 			    VLAN_PRI(mhi.mhi_tci) == 0) ||
401 			    (mhi.mhi_bindsap == ETHERTYPE_VLAN &&
402 			    vid == VLAN_ID_NONE)) {
403 				freemsgchain(mp);
404 				goto loop;
405 			}
406 		}
407 
408 		/*
409 		 * Construct a hash key from the VLAN identifier and the
410 		 * DLSAP.
411 		 */
412 		key = MAKE_KEY(mhi.mhi_bindsap);
413 
414 		/*
415 		 * Search the has table for dld_str_t eligible to receive
416 		 * a packet chain for this DLSAP/VLAN combination.
417 		 */
418 		if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp,
419 		    i_dls_head_hold, &rval) != 0 || (rval != 0)) {
420 			freemsgchain(mp);
421 			goto loop;
422 		}
423 
424 		/*
425 		 * Find the first dld_str_t that will accept the sub-chain.
426 		 */
427 		for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next)
428 			if (dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
429 				break;
430 
431 		/*
432 		 * If we did not find any dld_str_t willing to accept the
433 		 * sub-chain then throw it away.
434 		 */
435 		if (dsp == NULL) {
436 			i_dls_head_rele(dhp);
437 			freemsgchain(mp);
438 			goto loop;
439 		}
440 
441 		/*
442 		 * We have at least one acceptor.
443 		 */
444 		accepted = B_TRUE;
445 		for (;;) {
446 			/*
447 			 * Find the next dld_str_t that will accept the
448 			 * sub-chain.
449 			 */
450 			for (ndsp = dsp->ds_next; ndsp != NULL;
451 			    ndsp = ndsp->ds_next)
452 				if (dls_accept(ndsp, &mhi, &nds_rx,
453 				    &nds_rx_arg))
454 					break;
455 
456 			/*
457 			 * If there are no more dld_str_t that are willing
458 			 * to accept the sub-chain then we don't need to dup
459 			 * it before handing it to the current one.
460 			 */
461 			if (ndsp == NULL) {
462 				ds_rx(ds_rx_arg, mrh, mp, &mhi);
463 
464 				/*
465 				 * Since there are no more dld_str_t, we're
466 				 * done.
467 				 */
468 				break;
469 			}
470 
471 			/*
472 			 * There are more dld_str_t so dup the sub-chain.
473 			 */
474 			if ((nmp = copymsgchain(mp)) != NULL)
475 				ds_rx(ds_rx_arg, mrh, nmp, &mhi);
476 
477 			dsp = ndsp;
478 			ds_rx = nds_rx;
479 			ds_rx_arg = nds_rx_arg;
480 		}
481 
482 		/*
483 		 * Release the hold on the dld_str_t chain now that we have
484 		 * finished walking it.
485 		 */
486 		i_dls_head_rele(dhp);
487 
488 loop:
489 		/*
490 		 * If there were no acceptors then add the packet count to the
491 		 * 'unknown' count.
492 		 */
493 		if (!accepted)
494 			atomic_add_32(&(dlp->dl_unknowns), npacket);
495 	}
496 }
497 
498 /* ARGSUSED */
499 void
500 dls_rx_vlan_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
501     boolean_t loopback)
502 {
503 	dld_str_t			*dsp = arg;
504 	dls_link_t			*dlp = dsp->ds_dlp;
505 	mac_header_info_t		mhi;
506 	dls_rx_t			ds_rx;
507 	void				*ds_rx_arg;
508 	int				err;
509 
510 	DLS_PREPARE_PKT(dlp, mp, &mhi, err);
511 	if (err != 0)
512 		goto drop;
513 
514 	/*
515 	 * If there is promiscuous handle for vlan, we filter out the untagged
516 	 * pkts and pkts that are not for the primary unicast address.
517 	 */
518 	if (dsp->ds_vlan_mph != NULL) {
519 		uint8_t prim_addr[MAXMACADDRLEN];
520 		size_t	addr_length = dsp->ds_mip->mi_addr_length;
521 
522 		if (!(mhi.mhi_istagged))
523 			goto drop;
524 		ASSERT(dsp->ds_mh != NULL);
525 		mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)prim_addr);
526 		if (memcmp(mhi.mhi_daddr, prim_addr, addr_length) != 0)
527 			goto drop;
528 
529 		if (!dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg))
530 			goto drop;
531 
532 		ds_rx(ds_rx_arg, NULL, mp, &mhi);
533 		return;
534 	}
535 
536 drop:
537 	atomic_add_32(&dlp->dl_unknowns, 1);
538 	freemsg(mp);
539 }
540 
541 /* ARGSUSED */
542 void
543 dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
544     boolean_t loopback)
545 {
546 	dld_str_t			*dsp = arg;
547 	dls_link_t			*dlp = dsp->ds_dlp;
548 	mac_header_info_t		mhi;
549 	dls_rx_t			ds_rx;
550 	void				*ds_rx_arg;
551 	int				err;
552 	dls_head_t			*dhp;
553 	mod_hash_key_t			key;
554 
555 	DLS_PREPARE_PKT(dlp, mp, &mhi, err);
556 	if (err != 0)
557 		goto drop;
558 
559 	/*
560 	 * In order to filter out sap pkt that no dls channel listens, search
561 	 * the hash table trying to find a dld_str_t eligible to receive the pkt
562 	 */
563 	if ((dsp->ds_promisc & DLS_PROMISC_SAP) == 0) {
564 		key = MAKE_KEY(mhi.mhi_bindsap);
565 		if (mod_hash_find(dsp->ds_dlp->dl_str_hash, key,
566 		    (mod_hash_val_t *)&dhp) != 0)
567 			goto drop;
568 	}
569 
570 	if (!dls_accept_promisc(dsp, &mhi, &ds_rx, &ds_rx_arg, loopback))
571 		goto drop;
572 
573 	ds_rx(ds_rx_arg, NULL, mp, &mhi);
574 	return;
575 
576 drop:
577 	atomic_add_32(&dlp->dl_unknowns, 1);
578 	freemsg(mp);
579 }
580 
581 static void
582 i_dls_link_destroy(dls_link_t *dlp)
583 {
584 	ASSERT(dlp->dl_nactive == 0);
585 	ASSERT(dlp->dl_impl_count == 0);
586 	ASSERT(dlp->dl_zone_ref == 0);
587 
588 	/*
589 	 * Free the structure back to the cache.
590 	 */
591 	if (dlp->dl_mch != NULL)
592 		mac_client_close(dlp->dl_mch, 0);
593 
594 	if (dlp->dl_mh != NULL) {
595 		ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
596 		mac_close(dlp->dl_mh);
597 	}
598 
599 	dlp->dl_mh = NULL;
600 	dlp->dl_mch = NULL;
601 	dlp->dl_mip = NULL;
602 	dlp->dl_unknowns = 0;
603 	kmem_cache_free(i_dls_link_cachep, dlp);
604 }
605 
606 static int
607 i_dls_link_create(const char *name, dls_link_t **dlpp)
608 {
609 	dls_link_t		*dlp;
610 	int			err;
611 
612 	/*
613 	 * Allocate a new dls_link_t structure.
614 	 */
615 	dlp = kmem_cache_alloc(i_dls_link_cachep, KM_SLEEP);
616 
617 	/*
618 	 * Name the dls_link_t after the MAC interface it represents.
619 	 */
620 	(void) strlcpy(dlp->dl_name, name, sizeof (dlp->dl_name));
621 
622 	/*
623 	 * First reference; hold open the MAC interface.
624 	 */
625 	ASSERT(dlp->dl_mh == NULL);
626 	err = mac_open(dlp->dl_name, &dlp->dl_mh);
627 	if (err != 0)
628 		goto bail;
629 
630 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
631 	dlp->dl_mip = mac_info(dlp->dl_mh);
632 
633 	/* DLS is the "primary" MAC client */
634 	ASSERT(dlp->dl_mch == NULL);
635 
636 	err = mac_client_open(dlp->dl_mh, &dlp->dl_mch, NULL,
637 	    MAC_OPEN_FLAGS_USE_DATALINK_NAME);
638 	if (err != 0)
639 		goto bail;
640 
641 	DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *,
642 	    dlp->dl_mch);
643 
644 	*dlpp = dlp;
645 	return (0);
646 
647 bail:
648 	i_dls_link_destroy(dlp);
649 	return (err);
650 }
651 
652 /*
653  * Module initialization functions.
654  */
655 
656 void
657 dls_link_init(void)
658 {
659 	/*
660 	 * Create a kmem_cache of dls_link_t structures.
661 	 */
662 	i_dls_link_cachep = kmem_cache_create("dls_link_cache",
663 	    sizeof (dls_link_t), 0, i_dls_link_constructor,
664 	    i_dls_link_destructor, NULL, NULL, NULL, 0);
665 	ASSERT(i_dls_link_cachep != NULL);
666 
667 	/*
668 	 * Create a dls_link_t hash table and associated lock.
669 	 */
670 	i_dls_link_hash = mod_hash_create_extended("dls_link_hash",
671 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
672 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
673 	i_dls_link_count = 0;
674 }
675 
676 int
677 dls_link_fini(void)
678 {
679 	if (i_dls_link_count > 0)
680 		return (EBUSY);
681 
682 	/*
683 	 * Destroy the kmem_cache.
684 	 */
685 	kmem_cache_destroy(i_dls_link_cachep);
686 
687 	/*
688 	 * Destroy the hash table and associated lock.
689 	 */
690 	mod_hash_destroy_hash(i_dls_link_hash);
691 	return (0);
692 }
693 
694 /*
695  * Exported functions.
696  */
697 
698 static int
699 dls_link_hold_common(const char *name, dls_link_t **dlpp, boolean_t create)
700 {
701 	dls_link_t		*dlp;
702 	int			err;
703 
704 	/*
705 	 * Look up a dls_link_t corresponding to the given macname in the
706 	 * global hash table. The i_dls_link_hash itself is protected by the
707 	 * mod_hash package's internal lock which synchronizes
708 	 * find/insert/remove into the global mod_hash list. Assumes that
709 	 * inserts and removes are single threaded on a per mac end point
710 	 * by the mac perimeter.
711 	 */
712 	if ((err = mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
713 	    (mod_hash_val_t *)&dlp)) == 0)
714 		goto done;
715 
716 	ASSERT(err == MH_ERR_NOTFOUND);
717 	if (!create)
718 		return (ENOENT);
719 
720 	/*
721 	 * We didn't find anything so we need to create one.
722 	 */
723 	if ((err = i_dls_link_create(name, &dlp)) != 0)
724 		return (err);
725 
726 	/*
727 	 * Insert the dls_link_t.
728 	 */
729 	err = mod_hash_insert(i_dls_link_hash, (mod_hash_key_t)dlp->dl_name,
730 	    (mod_hash_val_t)dlp);
731 	ASSERT(err == 0);
732 
733 	atomic_add_32(&i_dls_link_count, 1);
734 	ASSERT(i_dls_link_count != 0);
735 
736 done:
737 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
738 	/*
739 	 * Bump the reference count and hand back the reference.
740 	 */
741 	dlp->dl_ref++;
742 	*dlpp = dlp;
743 	return (0);
744 }
745 
/*
 * Take a reference on the dls_link_t called `name', creating the link if
 * it does not exist yet.  Returns 0 with the link stored in *dlpp, or the
 * error reported by dls_link_hold_common().
 */
int
dls_link_hold_create(const char *name, dls_link_t **dlpp)
{
	return (dls_link_hold_common(name, dlpp, B_TRUE));
}
751 
/*
 * Take a reference on an existing dls_link_t called `name'.  The link is
 * never created here: ENOENT is returned when it is not in the hash.
 * Returns 0 with the link stored in *dlpp on success.
 */
int
dls_link_hold(const char *name, dls_link_t **dlpp)
{
	return (dls_link_hold_common(name, dlpp, B_FALSE));
}
757 
758 dev_info_t *
759 dls_link_devinfo(dev_t dev)
760 {
761 	dls_link_t	*dlp;
762 	dev_info_t	*dip;
763 	char	macname[MAXNAMELEN];
764 	char	*drv;
765 	mac_perim_handle_t	mph;
766 
767 	if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
768 		return (NULL);
769 	(void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1);
770 
771 	/*
772 	 * The code below assumes that the name constructed above is the
773 	 * macname. This is not the case for legacy devices. Currently this
774 	 * is ok because this function is only called in the getinfo(9e) path,
775 	 * which for a legacy device would directly end up in the driver's
776 	 * getinfo, rather than here
777 	 */
778 	if (mac_perim_enter_by_macname(macname, &mph) != 0)
779 		return (NULL);
780 
781 	if (dls_link_hold(macname, &dlp) != 0) {
782 		mac_perim_exit(mph);
783 		return (NULL);
784 	}
785 
786 	dip = mac_devinfo_get(dlp->dl_mh);
787 	dls_link_rele(dlp);
788 	mac_perim_exit(mph);
789 
790 	return (dip);
791 }
792 
/*
 * Build the dev_t for a link: the major number is that of the driver
 * owning the MAC's dev_info_t, the minor number is the one reported by
 * mac_minor() for the MAC handle.
 */
dev_t
dls_link_dev(dls_link_t *dlp)
{
	return (makedevice(ddi_driver_major(mac_devinfo_get(dlp->dl_mh)),
	    mac_minor(dlp->dl_mh)));
}
799 
800 void
801 dls_link_rele(dls_link_t *dlp)
802 {
803 	mod_hash_val_t	val;
804 
805 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
806 	/*
807 	 * Check if there are any more references.
808 	 */
809 	if (--dlp->dl_ref == 0) {
810 		(void) mod_hash_remove(i_dls_link_hash,
811 		    (mod_hash_key_t)dlp->dl_name, &val);
812 		ASSERT(dlp == (dls_link_t *)val);
813 
814 		/*
815 		 * Destroy the dls_link_t.
816 		 */
817 		i_dls_link_destroy(dlp);
818 		ASSERT(i_dls_link_count > 0);
819 		atomic_add_32(&i_dls_link_count, -1);
820 	}
821 }
822 
823 int
824 dls_link_rele_by_name(const char *name)
825 {
826 	dls_link_t		*dlp;
827 
828 	if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name,
829 	    (mod_hash_val_t *)&dlp) != 0)
830 		return (ENOENT);
831 
832 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
833 
834 	/*
835 	 * Must fail detach if mac client is busy.
836 	 */
837 	ASSERT(dlp->dl_ref > 0 && dlp->dl_mch != NULL);
838 	if (mac_link_has_flows(dlp->dl_mch))
839 		return (ENOTEMPTY);
840 
841 	dls_link_rele(dlp);
842 	return (0);
843 }
844 
845 int
846 dls_link_setzid(const char *name, zoneid_t zid)
847 {
848 	dls_link_t	*dlp;
849 	int		err = 0;
850 	zoneid_t	old_zid;
851 
852 	if ((err = dls_link_hold_create(name, &dlp)) != 0)
853 		return (err);
854 
855 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
856 
857 	if ((old_zid = dlp->dl_zid) == zid)
858 		goto done;
859 
860 	/*
861 	 * Check whether this dlp is used by its own zones, if yes,
862 	 * we cannot change its zoneid.
863 	 */
864 	if (dlp->dl_zone_ref != 0) {
865 		err = EBUSY;
866 		goto done;
867 	}
868 
869 	if (zid == GLOBAL_ZONEID) {
870 		/*
871 		 * Move the link from the local zone to the global zone,
872 		 * and release the reference to this link.  At the same time
873 		 * reset the link's active state so that an aggregation is
874 		 * allowed to be created over it.
875 		 */
876 		dlp->dl_zid = zid;
877 		dls_mac_active_clear(dlp);
878 		dls_link_rele(dlp);
879 		goto done;
880 	} else if (old_zid == GLOBAL_ZONEID) {
881 		/*
882 		 * Move the link from the global zone to the local zone,
883 		 * and hold a reference to this link.  Also, set the link
884 		 * to the "active" state so that the global zone is
885 		 * not able to create an aggregation over this link.
886 		 * TODO: revisit once we allow creating aggregations
887 		 * within a local zone.
888 		 */
889 		if ((err = dls_mac_active_set(dlp)) != 0) {
890 			if (err != ENXIO)
891 				err = EBUSY;
892 			goto done;
893 		}
894 		dlp->dl_zid = zid;
895 		return (0);
896 	} else {
897 		/*
898 		 * Move the link from a local zone to another local zone.
899 		 */
900 		dlp->dl_zid = zid;
901 	}
902 
903 done:
904 	dls_link_rele(dlp);
905 	return (err);
906 }
907 
908 void
909 dls_link_add(dls_link_t *dlp, uint32_t sap, dld_str_t *dsp)
910 {
911 	mod_hash_t	*hash = dlp->dl_str_hash;
912 	mod_hash_key_t	key;
913 	dls_head_t	*dhp;
914 	dld_str_t	*p;
915 	int		err;
916 
917 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
918 
919 	/*
920 	 * Generate a hash key based on the sap.
921 	 */
922 	key = MAKE_KEY(sap);
923 
924 	/*
925 	 * Search the table for a list head with this key.
926 	 */
927 	if ((err = mod_hash_find(hash, key, (mod_hash_val_t *)&dhp)) != 0) {
928 		ASSERT(err == MH_ERR_NOTFOUND);
929 
930 		dhp = i_dls_head_alloc(key);
931 		err = mod_hash_insert(hash, key, (mod_hash_val_t)dhp);
932 		ASSERT(err == 0);
933 	}
934 
935 	/*
936 	 * Add the dld_str_t to the head of the list. List walkers in
937 	 * i_dls_link_rx_* bump up dh_ref to ensure the list does not change
938 	 * while they walk the list. The membar below ensures that list walkers
939 	 * see exactly the old list or the new list.
940 	 */
941 	ASSERT(dsp->ds_next == NULL);
942 	p = dhp->dh_list;
943 	dsp->ds_next = p;
944 
945 	membar_producer();
946 
947 	dhp->dh_list = dsp;
948 
949 	/*
950 	 * Save a pointer to the list head.
951 	 */
952 	dsp->ds_head = dhp;
953 	dlp->dl_impl_count++;
954 }
955 
956 void
957 dls_link_remove(dls_link_t *dlp, dld_str_t *dsp)
958 {
959 	mod_hash_t	*hash = dlp->dl_str_hash;
960 	dld_str_t	**pp;
961 	dld_str_t	*p;
962 	dls_head_t	*dhp;
963 
964 	ASSERT(MAC_PERIM_HELD(dlp->dl_mh));
965 
966 	/*
967 	 * We set dh_removing here to tell the receive callbacks not to pass
968 	 * up packets anymore. Then wait till the current callbacks are done.
969 	 * This happens either in the close path or in processing the
970 	 * DL_UNBIND_REQ via a taskq thread, and it is ok to cv_wait in either.
971 	 * The dh_ref ensures there aren't and there won't be any upcalls
972 	 * walking or using the dh_list. The mod hash internal lock ensures
973 	 * that the insert/remove of the dls_head_t itself synchronizes with
974 	 * any i_dls_link_rx trying to locate it. The perimeter ensures that
975 	 * there isn't another simultaneous dls_link_add/remove.
976 	 */
977 	dhp = dsp->ds_head;
978 
979 	mutex_enter(&dhp->dh_lock);
980 	dhp->dh_removing = B_TRUE;
981 	while (dhp->dh_ref != 0)
982 		cv_wait(&dhp->dh_cv, &dhp->dh_lock);
983 	mutex_exit(&dhp->dh_lock);
984 
985 	/*
986 	 * Walk the list and remove the dld_str_t.
987 	 */
988 	for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->ds_next)) {
989 		if (p == dsp)
990 			break;
991 	}
992 	ASSERT(p != NULL);
993 	*pp = p->ds_next;
994 	p->ds_next = NULL;
995 	p->ds_head = NULL;
996 
997 	ASSERT(dlp->dl_impl_count != 0);
998 	dlp->dl_impl_count--;
999 
1000 	if (dhp->dh_list == NULL) {
1001 		mod_hash_val_t	val = NULL;
1002 
1003 		/*
1004 		 * The list is empty so remove the hash table entry.
1005 		 */
1006 		(void) mod_hash_remove(hash, dhp->dh_key, &val);
1007 		ASSERT(dhp == (dls_head_t *)val);
1008 		i_dls_head_free(dhp);
1009 	} else {
1010 		mutex_enter(&dhp->dh_lock);
1011 		dhp->dh_removing = B_FALSE;
1012 		mutex_exit(&dhp->dh_lock);
1013 	}
1014 }
1015 
1016 int
1017 dls_link_header_info(dls_link_t *dlp, mblk_t *mp, mac_header_info_t *mhip)
1018 {
1019 	boolean_t	is_ethernet = (dlp->dl_mip->mi_media == DL_ETHER);
1020 	uint16_t	pvid = mac_get_pvid(dlp->dl_mh);
1021 	int		err = 0;
1022 
1023 	/*
1024 	 * Packets should always be at least 16 bit aligned.
1025 	 */
1026 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
1027 
1028 	if ((err = mac_header_info(dlp->dl_mh, mp, mhip)) != 0)
1029 		return (err);
1030 
1031 	/*
1032 	 * If this is a VLAN-tagged Ethernet packet, then the SAP in the
1033 	 * mac_header_info_t as returned by mac_header_info() is
1034 	 * ETHERTYPE_VLAN. We need to grab the ethertype from the VLAN header.
1035 	 */
1036 	mhip->mhi_ispvid = B_FALSE;
1037 	if (is_ethernet && (mhip->mhi_bindsap == ETHERTYPE_VLAN)) {
1038 		struct ether_vlan_header *evhp;
1039 		uint16_t sap;
1040 		mblk_t *tmp = NULL;
1041 		size_t size;
1042 
1043 		size = sizeof (struct ether_vlan_header);
1044 		if (MBLKL(mp) < size) {
1045 			/*
1046 			 * Pullup the message in order to get the MAC header
1047 			 * infomation. Note that this is a read-only function,
1048 			 * we keep the input packet intact.
1049 			 */
1050 			if ((tmp = msgpullup(mp, size)) == NULL)
1051 				return (EINVAL);
1052 
1053 			mp = tmp;
1054 		}
1055 		evhp = (struct ether_vlan_header *)mp->b_rptr;
1056 		sap = ntohs(evhp->ether_type);
1057 		(void) mac_sap_verify(dlp->dl_mh, sap, &mhip->mhi_bindsap);
1058 		mhip->mhi_hdrsize = sizeof (struct ether_vlan_header);
1059 		mhip->mhi_tci = ntohs(evhp->ether_tci);
1060 		mhip->mhi_istagged = B_TRUE;
1061 		freemsg(tmp);
1062 
1063 		/*
1064 		 * If this port has a non-zero PVID, then we have to lie to the
1065 		 * caller about the VLAN ID.  It's always zero on receive for
1066 		 * that VLAN.
1067 		 */
1068 		if (pvid != VLAN_ID_NONE && VLAN_ID(mhip->mhi_tci) == pvid) {
1069 			mhip->mhi_tci &= ~(VLAN_ID_MASK << VLAN_ID_SHIFT);
1070 			mhip->mhi_ispvid = B_TRUE;
1071 		}
1072 
1073 		if (VLAN_CFI(mhip->mhi_tci) != ETHER_CFI)
1074 			return (EINVAL);
1075 	} else {
1076 		mhip->mhi_istagged = B_FALSE;
1077 		mhip->mhi_tci = 0;
1078 	}
1079 
1080 	return (0);
1081 }
1082