xref: /titanic_51/usr/src/uts/common/io/dld/dld_str.c (revision 631e5d7f05cf5843041dce683121c6c6e20a9b75)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<inet/common.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/cpuvar.h>
37 #include	<sys/callb.h>
38 #include	<sys/list.h>
39 #include	<sys/mac_client.h>
40 #include	<sys/mac_client_priv.h>
41 
42 static int	str_constructor(void *, void *, int);
43 static void	str_destructor(void *, void *);
44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
45 static void	str_notify_promisc_on_phys(dld_str_t *);
46 static void	str_notify_promisc_off_phys(dld_str_t *);
47 static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
48 static void	str_notify_link_up(dld_str_t *);
49 static void	str_notify_link_down(dld_str_t *);
50 static void	str_notify_capab_reneg(dld_str_t *);
51 static void	str_notify_speed(dld_str_t *, uint32_t);
52 
53 static void	ioc_native(dld_str_t *,  mblk_t *);
54 static void	ioc_margin(dld_str_t *, mblk_t *);
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc_lowlink(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
61 
62 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
63 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
64     link_tagmode_t);
65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
66 
67 static uint32_t		str_count;
68 static kmem_cache_t	*str_cachep;
69 static mod_hash_t	*str_hashp;
70 
71 #define	STR_HASHSZ		64
72 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
73 
74 #define	dld_taskq	system_taskq
75 
76 static kmutex_t		dld_taskq_lock;
77 static kcondvar_t	dld_taskq_cv;
78 static list_t		dld_taskq_list;		/* List of dld_str_t */
79 boolean_t		dld_taskq_quit;
80 boolean_t		dld_taskq_done;
81 
82 static void		dld_taskq_dispatch(void);
83 
84 /*
85  * Some notes on entry points, flow-control, queueing.
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
93  *
94  * The transmit logic operates in the following way: All packets coming
95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
96  * happens when the MAC layer indicates the packets couldn't be
97  * transmitted due to 1) lack of resources (e.g. running out of
98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
99  * particular flow. The indication comes in the form of a Tx cookie that
100  * identifies the blocked ring. In such case, DLD will place a
101  * dummy message on its write-side STREAMS queue so that the queue is
102  * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
106  * When the write service procedure runs, it will remove the dummy
107  * message from the write-side STREAMS queue; in effect this will trigger
108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
109  * respectively, due to the above reasons.
110  *
111  * All non-data operations, both DLPI and ioctls are single threaded on a per
112  * dld_str_t endpoint. This is done using a taskq so that the control operation
113  * has kernel context and can cv_wait for resources. In addition all set type
114  * operations that involve mac level state modification are serialized on a
115  * per mac end point using the perimeter mechanism provided by the mac layer.
116  * This serializes all mac clients trying to modify a single mac end point over
117  * the entire sequence of mac calls made by that client as an atomic unit. The
118  * mac framework locking is described in mac.c. A critical element is that
119  * DLD/DLS does not hold any locks across the mac perimeter.
120  *
121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
124  * is returned. If the mac handle is non-null, it can be safely accessed
125  * below. The mac handle won't be freed until the mac_unregister which
126  * won't happen until the driver detaches. The DDI framework ensures that
127  * the detach won't happen while a getinfo is in progress.
128  */
129 typedef struct i_dld_str_state_s {
130 	major_t		ds_major;
131 	minor_t		ds_minor;
132 	int		ds_instance;
133 	dev_info_t	*ds_dip;
134 } i_dld_str_state_t;
135 
136 /* ARGSUSED */
137 static uint_t
138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
139 {
140 	i_dld_str_state_t	*statep = arg;
141 	dld_str_t		*dsp = (dld_str_t *)val;
142 	mac_handle_t		mh;
143 
144 	if (statep->ds_major != dsp->ds_major)
145 		return (MH_WALK_CONTINUE);
146 
147 	ASSERT(statep->ds_minor != 0);
148 	mh = dsp->ds_mh;
149 
150 	if (statep->ds_minor == dsp->ds_minor) {
151 		/*
152 		 * Clone: a clone minor is unique. we can terminate the
153 		 * walk if we find a matching stream -- even if we fail
154 		 * to obtain the devinfo.
155 		 */
156 		if (mh != NULL) {
157 			statep->ds_dip = mac_devinfo_get(mh);
158 			statep->ds_instance = mac_minor(mh) - 1;
159 		}
160 		return (MH_WALK_TERMINATE);
161 	}
162 	return (MH_WALK_CONTINUE);
163 }
164 
165 static dev_info_t *
166 dld_finddevinfo(dev_t dev)
167 {
168 	dev_info_t		*dip;
169 	i_dld_str_state_t	state;
170 
171 	if (getminor(dev) == 0)
172 		return (NULL);
173 
174 	/*
175 	 * See if it's a minor node of a link
176 	 */
177 	if ((dip = dls_link_devinfo(dev)) != NULL)
178 		return (dip);
179 
180 	state.ds_minor = getminor(dev);
181 	state.ds_major = getmajor(dev);
182 	state.ds_dip = NULL;
183 	state.ds_instance = -1;
184 
185 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
186 	return (state.ds_dip);
187 }
188 
189 int
190 dld_devt_to_instance(dev_t dev)
191 {
192 	minor_t			minor;
193 	i_dld_str_state_t	state;
194 
195 	/*
196 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
197 	 * Minor number 0 is reserved for the DLPI style 2 unattached
198 	 * node.
199 	 */
200 
201 	if ((minor = getminor(dev)) == 0)
202 		return (-1);
203 
204 	/*
205 	 * Check for style 2 unassociated node, this is quick and
206 	 * easy.  Note that this doesn't *necessarily* work for legacy
207 	 * devices, but this code is only called within the
208 	 * getinfo(9e) implementation for true GLDv3 devices, so it
209 	 * doesn't matter.
210 	 */
211 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
212 		return (DLS_MINOR2INST(minor));
213 	}
214 
215 	state.ds_minor = getminor(dev);
216 	state.ds_major = getmajor(dev);
217 	state.ds_dip = NULL;
218 	state.ds_instance = -1;
219 
220 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
221 	return (state.ds_instance);
222 }
223 
224 /*
225  * devo_getinfo: getinfo(9e)
226  *
227  * NB: This may be called for a provider before the provider's
228  * instances are attached.  Hence, if a particular provider needs a
229  * special mapping (the mac instance != ddi_get_instance()), then it
230  * may need to provide its own implmentation using the
231  * MAC_MINOR_TO_INSTANCE() function, and translating the returned mac
232  * instance to a devinfo instance.  For dev_t's where the minor number
233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
234  * function indirectly via the mac_getinfo() function.
235  */
236 /*ARGSUSED*/
237 int
238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
239 {
240 	dev_info_t	*devinfo;
241 	minor_t		minor = getminor((dev_t)arg);
242 	int		rc = DDI_FAILURE;
243 
244 	switch (cmd) {
245 	case DDI_INFO_DEVT2DEVINFO:
246 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
247 			*(dev_info_t **)resp = devinfo;
248 			rc = DDI_SUCCESS;
249 		}
250 		break;
251 	case DDI_INFO_DEVT2INSTANCE:
252 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
253 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
254 			rc = DDI_SUCCESS;
255 		} else if (minor > DLS_MAX_MINOR &&
256 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
257 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
258 			rc = DDI_SUCCESS;
259 		}
260 		break;
261 	}
262 	return (rc);
263 }
264 
265 void *
266 dld_str_private(queue_t *q)
267 {
268 	return (((dld_str_t *)(q->q_ptr))->ds_private);
269 }
270 
271 int
272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
273 {
274 	dld_str_t	*dsp;
275 	major_t		major;
276 	minor_t		minor;
277 	int		err;
278 
279 	major = getmajor(*devp);
280 	minor = getminor(*devp);
281 
282 	/*
283 	 * Create a new dld_str_t for the stream. This will grab a new minor
284 	 * number that will be handed back in the cloned dev_t.  Creation may
285 	 * fail if we can't allocate the dummy mblk used for flow-control.
286 	 */
287 	dsp = dld_str_create(rq, DLD_DLPI, major,
288 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
289 	if (dsp == NULL)
290 		return (ENOSR);
291 
292 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
293 	dsp->ds_private = private;
294 	if (minor != 0) {
295 		/*
296 		 * Style 1 open
297 		 */
298 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
299 			goto failed;
300 
301 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
302 	} else {
303 		(void) qassociate(rq, -1);
304 	}
305 
306 	/*
307 	 * Enable the queue srv(9e) routine.
308 	 */
309 	qprocson(rq);
310 
311 	/*
312 	 * Construct a cloned dev_t to hand back.
313 	 */
314 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
315 	return (0);
316 
317 failed:
318 	dld_str_destroy(dsp);
319 	return (err);
320 }
321 
322 int
323 dld_str_close(queue_t *rq)
324 {
325 	dld_str_t	*dsp = rq->q_ptr;
326 
327 	/*
328 	 * All modules on top have been popped off. So there can't be any
329 	 * threads from the top.
330 	 */
331 	ASSERT(dsp->ds_datathr_cnt == 0);
332 
333 	/*
334 	 * Wait until pending DLPI requests are processed.
335 	 */
336 	mutex_enter(&dsp->ds_lock);
337 	while (dsp->ds_dlpi_pending)
338 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
339 	mutex_exit(&dsp->ds_lock);
340 
341 
342 	/*
343 	 * This stream was open to a provider node. Check to see
344 	 * if it has been cleanly shut down.
345 	 */
346 	if (dsp->ds_dlstate != DL_UNATTACHED) {
347 		/*
348 		 * The stream is either open to a style 1 provider or
349 		 * this is not clean shutdown. Detach from the PPA.
350 		 * (This is still ok even in the style 1 case).
351 		 */
352 		dld_str_detach(dsp);
353 	}
354 
355 	dld_str_destroy(dsp);
356 	return (0);
357 }
358 
359 /*
360  * qi_qopen: open(9e)
361  */
362 /*ARGSUSED*/
363 int
364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
365 {
366 	if (sflag == MODOPEN)
367 		return (ENOTSUP);
368 
369 	/*
370 	 * This is a cloning driver and therefore each queue should only
371 	 * ever get opened once.
372 	 */
373 	if (rq->q_ptr != NULL)
374 		return (EBUSY);
375 
376 	return (dld_str_open(rq, devp, NULL));
377 }
378 
379 /*
380  * qi_qclose: close(9e)
381  */
382 int
383 dld_close(queue_t *rq)
384 {
385 	/*
386 	 * Disable the queue srv(9e) routine.
387 	 */
388 	qprocsoff(rq);
389 
390 	return (dld_str_close(rq));
391 }
392 
393 /*
394  * qi_qputp: put(9e)
395  */
396 void
397 dld_wput(queue_t *wq, mblk_t *mp)
398 {
399 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
400 	dld_str_mode_t	mode;
401 
402 	switch (DB_TYPE(mp)) {
403 	case M_DATA:
404 		mutex_enter(&dsp->ds_lock);
405 		mode = dsp->ds_mode;
406 		if ((dsp->ds_dlstate != DL_IDLE) ||
407 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
408 			mutex_exit(&dsp->ds_lock);
409 			freemsg(mp);
410 			break;
411 		}
412 
413 		DLD_DATATHR_INC(dsp);
414 		mutex_exit(&dsp->ds_lock);
415 		if (mode == DLD_FASTPATH) {
416 			if (dsp->ds_mip->mi_media == DL_ETHER &&
417 			    (MBLKL(mp) < sizeof (struct ether_header))) {
418 				freemsg(mp);
419 			} else {
420 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
421 			}
422 		} else {
423 			str_mdata_raw_put(dsp, mp);
424 		}
425 		DLD_DATATHR_DCR(dsp);
426 		break;
427 	case M_PROTO:
428 	case M_PCPROTO: {
429 		t_uscalar_t	prim;
430 
431 		if (MBLKL(mp) < sizeof (t_uscalar_t))
432 			break;
433 
434 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
435 
436 		if (prim == DL_UNITDATA_REQ) {
437 			proto_unitdata_req(dsp, mp);
438 		} else {
439 			dld_wput_nondata(dsp, mp);
440 		}
441 		break;
442 	}
443 
444 	case M_IOCTL:
445 		dld_wput_nondata(dsp, mp);
446 		break;
447 
448 	case M_FLUSH:
449 		if (*mp->b_rptr & FLUSHW) {
450 			DLD_CLRQFULL(dsp);
451 			*mp->b_rptr &= ~FLUSHW;
452 		}
453 
454 		if (*mp->b_rptr & FLUSHR) {
455 			qreply(wq, mp);
456 		} else {
457 			freemsg(mp);
458 		}
459 		break;
460 
461 	default:
462 		freemsg(mp);
463 		break;
464 	}
465 }
466 
467 /*
468  * qi_srvp: srv(9e)
469  */
470 void
471 dld_wsrv(queue_t *wq)
472 {
473 	dld_str_t	*dsp = wq->q_ptr;
474 
475 	DLD_CLRQFULL(dsp);
476 }
477 
478 void
479 dld_init_ops(struct dev_ops *ops, const char *name)
480 {
481 	struct streamtab *stream;
482 	struct qinit *rq, *wq;
483 	struct module_info *modinfo;
484 
485 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
486 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
487 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
488 	modinfo->mi_minpsz = 0;
489 	modinfo->mi_maxpsz = 64*1024;
490 	modinfo->mi_hiwat  = 1;
491 	modinfo->mi_lowat = 0;
492 
493 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
494 	rq->qi_qopen = dld_open;
495 	rq->qi_qclose = dld_close;
496 	rq->qi_minfo = modinfo;
497 
498 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
499 	wq->qi_putp = (pfi_t)dld_wput;
500 	wq->qi_srvp = (pfi_t)dld_wsrv;
501 	wq->qi_minfo = modinfo;
502 
503 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
504 	stream->st_rdinit = rq;
505 	stream->st_wrinit = wq;
506 	ops->devo_cb_ops->cb_str = stream;
507 
508 	if (ops->devo_getinfo == NULL)
509 		ops->devo_getinfo = &dld_getinfo;
510 }
511 
512 void
513 dld_fini_ops(struct dev_ops *ops)
514 {
515 	struct streamtab *stream;
516 	struct qinit *rq, *wq;
517 	struct module_info *modinfo;
518 
519 	stream = ops->devo_cb_ops->cb_str;
520 	rq = stream->st_rdinit;
521 	wq = stream->st_wrinit;
522 	modinfo = rq->qi_minfo;
523 	ASSERT(wq->qi_minfo == modinfo);
524 
525 	kmem_free(stream, sizeof (struct streamtab));
526 	kmem_free(wq, sizeof (struct qinit));
527 	kmem_free(rq, sizeof (struct qinit));
528 	kmem_free(modinfo->mi_idname, FMNAMESZ);
529 	kmem_free(modinfo, sizeof (struct module_info));
530 }
531 
532 /*
533  * Initialize this module's data structures.
534  */
535 void
536 dld_str_init(void)
537 {
538 	/*
539 	 * Create dld_str_t object cache.
540 	 */
541 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
542 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
543 	ASSERT(str_cachep != NULL);
544 
545 	/*
546 	 * Create a hash table for maintaining dld_str_t's.
547 	 * The ds_minor field (the clone minor number) of a dld_str_t
548 	 * is used as a key for this hash table because this number is
549 	 * globally unique (allocated from "dls_minor_arena").
550 	 */
551 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
552 	    mod_hash_null_valdtor);
553 
554 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
555 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
556 
557 	dld_taskq_quit = B_FALSE;
558 	dld_taskq_done = B_FALSE;
559 	list_create(&dld_taskq_list, sizeof (dld_str_t),
560 	    offsetof(dld_str_t, ds_tqlist));
561 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
562 	    &p0, TS_RUN, minclsyspri);
563 }
564 
565 /*
566  * Tear down this module's data structures.
567  */
568 int
569 dld_str_fini(void)
570 {
571 	/*
572 	 * Make sure that there are no objects in use.
573 	 */
574 	if (str_count != 0)
575 		return (EBUSY);
576 
577 	/*
578 	 * Ask the dld_taskq thread to quit and wait for it to be done
579 	 */
580 	mutex_enter(&dld_taskq_lock);
581 	dld_taskq_quit = B_TRUE;
582 	cv_signal(&dld_taskq_cv);
583 	while (!dld_taskq_done)
584 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
585 	mutex_exit(&dld_taskq_lock);
586 	list_destroy(&dld_taskq_list);
587 	/*
588 	 * Destroy object cache.
589 	 */
590 	kmem_cache_destroy(str_cachep);
591 	mod_hash_destroy_idhash(str_hashp);
592 	return (0);
593 }
594 
595 /*
596  * Create a new dld_str_t object.
597  */
598 dld_str_t *
599 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
600 {
601 	dld_str_t	*dsp;
602 	int		err;
603 
604 	/*
605 	 * Allocate an object from the cache.
606 	 */
607 	atomic_add_32(&str_count, 1);
608 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
609 
610 	/*
611 	 * Allocate the dummy mblk for flow-control.
612 	 */
613 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
614 	if (dsp->ds_tx_flow_mp == NULL) {
615 		kmem_cache_free(str_cachep, dsp);
616 		atomic_add_32(&str_count, -1);
617 		return (NULL);
618 	}
619 	dsp->ds_type = type;
620 	dsp->ds_major = major;
621 	dsp->ds_style = style;
622 
623 	/*
624 	 * Initialize the queue pointers.
625 	 */
626 	ASSERT(RD(rq) == rq);
627 	dsp->ds_rq = rq;
628 	dsp->ds_wq = WR(rq);
629 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
630 
631 	/*
632 	 * We want explicit control over our write-side STREAMS queue
633 	 * where the dummy mblk gets added/removed for flow-control.
634 	 */
635 	noenable(WR(rq));
636 
637 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
638 	    (mod_hash_val_t)dsp);
639 	ASSERT(err == 0);
640 	return (dsp);
641 }
642 
643 /*
644  * Destroy a dld_str_t object.
645  */
646 void
647 dld_str_destroy(dld_str_t *dsp)
648 {
649 	queue_t		*rq;
650 	queue_t		*wq;
651 	mod_hash_val_t	val;
652 
653 	/*
654 	 * Clear the queue pointers.
655 	 */
656 	rq = dsp->ds_rq;
657 	wq = dsp->ds_wq;
658 	ASSERT(wq == WR(rq));
659 	rq->q_ptr = wq->q_ptr = NULL;
660 	dsp->ds_rq = dsp->ds_wq = NULL;
661 
662 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
663 	ASSERT(dsp->ds_sap == 0);
664 	ASSERT(dsp->ds_mh == NULL);
665 	ASSERT(dsp->ds_mch == NULL);
666 	ASSERT(dsp->ds_promisc == 0);
667 	ASSERT(dsp->ds_mph == NULL);
668 	ASSERT(dsp->ds_mip == NULL);
669 	ASSERT(dsp->ds_mnh == NULL);
670 
671 	ASSERT(dsp->ds_polling == B_FALSE);
672 	ASSERT(dsp->ds_direct == B_FALSE);
673 	ASSERT(dsp->ds_lso == B_FALSE);
674 	ASSERT(dsp->ds_lso_max == 0);
675 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
676 
677 	/*
678 	 * Reinitialize all the flags.
679 	 */
680 	dsp->ds_notifications = 0;
681 	dsp->ds_passivestate = DLD_UNINITIALIZED;
682 	dsp->ds_mode = DLD_UNITDATA;
683 	dsp->ds_native = B_FALSE;
684 
685 	ASSERT(dsp->ds_datathr_cnt == 0);
686 	ASSERT(dsp->ds_pending_head == NULL);
687 	ASSERT(dsp->ds_pending_tail == NULL);
688 	ASSERT(!dsp->ds_dlpi_pending);
689 
690 	ASSERT(dsp->ds_dlp == NULL);
691 	ASSERT(dsp->ds_dmap == NULL);
692 	ASSERT(dsp->ds_rx == NULL);
693 	ASSERT(dsp->ds_rx_arg == NULL);
694 	ASSERT(dsp->ds_next == NULL);
695 	ASSERT(dsp->ds_head == NULL);
696 
697 	/*
698 	 * Free the dummy mblk if exists.
699 	 */
700 	if (dsp->ds_tx_flow_mp != NULL) {
701 		freeb(dsp->ds_tx_flow_mp);
702 		dsp->ds_tx_flow_mp = NULL;
703 	}
704 
705 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
706 	ASSERT(dsp == (dld_str_t *)val);
707 
708 	/*
709 	 * Free the object back to the cache.
710 	 */
711 	kmem_cache_free(str_cachep, dsp);
712 	atomic_add_32(&str_count, -1);
713 }
714 
715 /*
716  * kmem_cache contructor function: see kmem_cache_create(9f).
717  */
718 /*ARGSUSED*/
719 static int
720 str_constructor(void *buf, void *cdrarg, int kmflags)
721 {
722 	dld_str_t	*dsp = buf;
723 
724 	bzero(buf, sizeof (dld_str_t));
725 
726 	/*
727 	 * Allocate a new minor number.
728 	 */
729 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
730 		return (-1);
731 
732 	/*
733 	 * Initialize the DLPI state machine.
734 	 */
735 	dsp->ds_dlstate = DL_UNATTACHED;
736 
737 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
738 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
739 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
740 
741 	return (0);
742 }
743 
744 /*
745  * kmem_cache destructor function.
746  */
747 /*ARGSUSED*/
748 static void
749 str_destructor(void *buf, void *cdrarg)
750 {
751 	dld_str_t	*dsp = buf;
752 
753 	/*
754 	 * Release the minor number.
755 	 */
756 	mac_minor_rele(dsp->ds_minor);
757 
758 	ASSERT(dsp->ds_tx_flow_mp == NULL);
759 
760 	mutex_destroy(&dsp->ds_lock);
761 	cv_destroy(&dsp->ds_datathr_cv);
762 	cv_destroy(&dsp->ds_dlpi_pending_cv);
763 }
764 
765 /*
766  * Update the priority bits and VID (may need to insert tag if mp points
767  * to an untagged packet.
768  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
769  */
770 static mblk_t *
771 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
772     link_tagmode_t tagmode)
773 {
774 	mblk_t *hmp;
775 	struct ether_vlan_header *evhp;
776 	struct ether_header *ehp;
777 	uint16_t old_tci = 0;
778 	size_t len;
779 
780 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
781 
782 	evhp = (struct ether_vlan_header *)mp->b_rptr;
783 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
784 		/*
785 		 * Tagged packet, update the priority bits.
786 		 */
787 		len = sizeof (struct ether_vlan_header);
788 
789 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
790 			/*
791 			 * In case some drivers only check the db_ref
792 			 * count of the first mblk, we pullup the
793 			 * message into a single mblk.
794 			 */
795 			hmp = msgpullup(mp, -1);
796 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
797 				freemsg(hmp);
798 				return (NULL);
799 			} else {
800 				freemsg(mp);
801 				mp = hmp;
802 			}
803 		}
804 
805 		evhp = (struct ether_vlan_header *)mp->b_rptr;
806 		old_tci = ntohs(evhp->ether_tci);
807 	} else {
808 		/*
809 		 * Untagged packet.  Two factors will cause us to insert a
810 		 * VLAN header:
811 		 * - This is a VLAN link (vid is specified)
812 		 * - The link supports user priority tagging and the priority
813 		 *   is non-zero.
814 		 */
815 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
816 			return (mp);
817 
818 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
819 		if (hmp == NULL)
820 			return (NULL);
821 
822 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
823 		ehp = (struct ether_header *)mp->b_rptr;
824 
825 		/*
826 		 * Copy the MAC addresses and typelen
827 		 */
828 		bcopy(ehp, evhp, (ETHERADDRL * 2));
829 		evhp->ether_type = ehp->ether_type;
830 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
831 
832 		hmp->b_wptr += sizeof (struct ether_vlan_header);
833 		mp->b_rptr += sizeof (struct ether_header);
834 
835 		/*
836 		 * Free the original message if it's now empty. Link the
837 		 * rest of the messages to the header message.
838 		 */
839 		if (MBLKL(mp) == 0) {
840 			hmp->b_cont = mp->b_cont;
841 			freeb(mp);
842 		} else {
843 			hmp->b_cont = mp;
844 		}
845 		mp = hmp;
846 	}
847 
848 	if (pri == 0)
849 		pri = VLAN_PRI(old_tci);
850 	if (vid == VLAN_ID_NONE)
851 		vid = VLAN_ID(old_tci);
852 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
853 	return (mp);
854 }
855 
856 /*
857  * M_DATA put (IP fast-path mode)
858  */
859 mac_tx_cookie_t
860 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
861     uint16_t flag)
862 {
863 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
864 	mblk_t *newmp;
865 	uint_t pri;
866 	mac_tx_cookie_t cookie;
867 
868 	if (is_ethernet) {
869 		/*
870 		 * Update the priority bits to the assigned priority.
871 		 */
872 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
873 
874 		if (pri != 0) {
875 			newmp = i_dld_ether_header_update_tag(mp, pri,
876 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
877 			if (newmp == NULL)
878 				goto discard;
879 			mp = newmp;
880 		}
881 	}
882 
883 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
884 		DLD_SETQFULL(dsp);
885 	}
886 	return (cookie);
887 
888 discard:
889 	/* TODO: bump kstat? */
890 	freemsg(mp);
891 	return (NULL);
892 }
893 
894 /*
895  * M_DATA put (DLIOCRAW mode)
896  */
897 static void
898 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
899 {
900 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
901 	mblk_t *bp, *newmp;
902 	size_t size;
903 	mac_header_info_t mhi;
904 	uint_t pri, vid, dvid;
905 	uint_t max_sdu;
906 
907 	/*
908 	 * Certain MAC type plugins provide an illusion for raw DLPI
909 	 * consumers.  They pretend that the MAC layer is something that
910 	 * it's not for the benefit of observability tools.  For example,
911 	 * mac_wifi pretends that it's Ethernet for such consumers.
912 	 * Here, unless native mode is enabled, we call into the MAC layer so
913 	 * that this illusion can be maintained.  The plugin will optionally
914 	 * transform the MAC header here into something that can be passed
915 	 * down.  The header goes from raw mode to "cooked" mode.
916 	 */
917 	if (!dsp->ds_native) {
918 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
919 			goto discard;
920 		mp = newmp;
921 	}
922 
923 	size = MBLKL(mp);
924 
925 	/*
926 	 * Check the packet is not too big and that any remaining
927 	 * fragment list is composed entirely of M_DATA messages. (We
928 	 * know the first fragment was M_DATA otherwise we could not
929 	 * have got here).
930 	 */
931 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
932 		if (DB_TYPE(bp) != M_DATA)
933 			goto discard;
934 		size += MBLKL(bp);
935 	}
936 
937 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
938 		goto discard;
939 
940 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
941 	/*
942 	 * If LSO is enabled, check the size against lso_max. Otherwise,
943 	 * compare the packet size with max_sdu.
944 	 */
945 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
946 	if (size > max_sdu + mhi.mhi_hdrsize)
947 		goto discard;
948 
949 	if (is_ethernet) {
950 		dvid = mac_client_vid(dsp->ds_mch);
951 
952 		/*
953 		 * Discard the packet if this is a VLAN stream but the VID in
954 		 * the packet is not correct.
955 		 */
956 		vid = VLAN_ID(mhi.mhi_tci);
957 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
958 			goto discard;
959 
960 		/*
961 		 * Discard the packet if this packet is a tagged packet
962 		 * but both pri and VID are 0.
963 		 */
964 		pri = VLAN_PRI(mhi.mhi_tci);
965 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
966 		    vid == VLAN_ID_NONE)
967 			goto discard;
968 
969 		/*
970 		 * Update the priority bits to the per-stream priority if
971 		 * priority is not set in the packet. Update the VID for
972 		 * packets on a VLAN stream.
973 		 */
974 		pri = (pri == 0) ? dsp->ds_pri : 0;
975 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
976 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
977 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
978 				goto discard;
979 			}
980 			mp = newmp;
981 		}
982 	}
983 
984 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
985 		/* Turn on flow-control for dld */
986 		DLD_SETQFULL(dsp);
987 	}
988 	return;
989 
990 discard:
991 	/* TODO: bump kstat? */
992 	freemsg(mp);
993 }
994 
995 /*
996  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
997  */
998 int
999 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1000 {
1001 	dev_t			dev;
1002 	int			err;
1003 	const char		*drvname;
1004 	mac_perim_handle_t	mph = NULL;
1005 	boolean_t		qassociated = B_FALSE;
1006 	dls_link_t		*dlp = NULL;
1007 	dls_dl_handle_t		ddp = NULL;
1008 
1009 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1010 		return (EINVAL);
1011 
1012 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
1013 		return (ENOTSUP);
1014 
1015 	/*
1016 	 * /dev node access. This will still be supported for backward
1017 	 * compatibility reason.
1018 	 */
1019 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1020 	    (strcmp(drvname, "vnic") != 0)) {
1021 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1022 			return (EINVAL);
1023 		qassociated = B_TRUE;
1024 	}
1025 
1026 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1027 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
1028 		goto failed;
1029 
1030 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
1031 		goto failed;
1032 
1033 	/*
1034 	 * Open a channel.
1035 	 */
1036 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
1037 		goto failed;
1038 
1039 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
1040 		goto failed;
1041 
1042 	/*
1043 	 * Set the default packet priority.
1044 	 */
1045 	dsp->ds_pri = 0;
1046 
1047 	/*
1048 	 * Add a notify function so that the we get updates from the MAC.
1049 	 */
1050 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1051 	dsp->ds_dlstate = DL_UNBOUND;
1052 	mac_perim_exit(mph);
1053 	return (0);
1054 
1055 failed:
1056 	if (dlp != NULL)
1057 		dls_link_rele(dlp);
1058 	if (mph != NULL)
1059 		mac_perim_exit(mph);
1060 	if (ddp != NULL)
1061 		dls_devnet_rele(ddp);
1062 	if (qassociated)
1063 		(void) qassociate(dsp->ds_wq, -1);
1064 
1065 	return (err);
1066 }
1067 
1068 /*
1069  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1070  * from close(2) for style 2.
1071  */
1072 void
1073 dld_str_detach(dld_str_t *dsp)
1074 {
1075 	mac_perim_handle_t	mph;
1076 	int			err;
1077 
1078 	ASSERT(dsp->ds_datathr_cnt == 0);
1079 
1080 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1081 	/*
1082 	 * Remove the notify function.
1083 	 *
1084 	 * Note that we cannot wait for the notification callback to be removed
1085 	 * since it could cause the deadlock with str_notify() since they both
1086 	 * need the mac perimeter. Continue if we cannot remove the
1087 	 * notification callback right now and wait after we leave the
1088 	 * perimeter.
1089 	 */
1090 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1091 	dsp->ds_mnh = NULL;
1092 
1093 	/*
1094 	 * Disable the capabilities
1095 	 */
1096 	dld_capabilities_disable(dsp);
1097 
1098 	/*
1099 	 * Clear LSO flags.
1100 	 */
1101 	dsp->ds_lso = B_FALSE;
1102 	dsp->ds_lso_max = 0;
1103 
1104 	dls_close(dsp);
1105 	mac_perim_exit(mph);
1106 
1107 	/*
1108 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
1109 	 * because the notification callback was in progress, wait for
1110 	 * it to finish before we proceed.
1111 	 */
1112 	if (err != 0)
1113 		mac_notify_remove_wait(dsp->ds_mh);
1114 
1115 	/*
1116 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
1117 	 * automatically in the call to dls_devnet_rele.
1118 	 */
1119 	dls_devnet_rele(dsp->ds_ddh);
1120 
1121 	dsp->ds_sap = 0;
1122 	dsp->ds_mh = NULL;
1123 	dsp->ds_mch = NULL;
1124 	dsp->ds_mip = NULL;
1125 
1126 	if (dsp->ds_style == DL_STYLE2)
1127 		(void) qassociate(dsp->ds_wq, -1);
1128 
1129 	/*
1130 	 * Re-initialize the DLPI state machine.
1131 	 */
1132 	dsp->ds_dlstate = DL_UNATTACHED;
1133 }
1134 
1135 /*
1136  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1137  * tags before sending packets up to the DLS clients, with the exception of
1138  * special priority tagged packets, in that case, we set the VID to 0.
1139  * mp must be a VLAN tagged packet.
1140  */
1141 static mblk_t *
1142 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1143 {
1144 	mblk_t *newmp;
1145 	struct ether_vlan_header *evhp;
1146 	uint16_t tci, new_tci;
1147 
1148 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1149 	if (DB_REF(mp) > 1) {
1150 		newmp = copymsg(mp);
1151 		if (newmp == NULL)
1152 			return (NULL);
1153 		freemsg(mp);
1154 		mp = newmp;
1155 	}
1156 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1157 
1158 	tci = ntohs(evhp->ether_tci);
1159 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1160 		/*
1161 		 * Priority is 0, strip the tag.
1162 		 */
1163 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1164 		mp->b_rptr += VLAN_TAGSZ;
1165 	} else {
1166 		/*
1167 		 * Priority is not 0, update the VID to 0.
1168 		 */
1169 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1170 		evhp->ether_tci = htons(new_tci);
1171 	}
1172 	return (mp);
1173 }
1174 
1175 /*
1176  * Raw mode receive function.
1177  */
1178 /*ARGSUSED*/
1179 void
1180 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1181     mac_header_info_t *mhip)
1182 {
1183 	dld_str_t *dsp = (dld_str_t *)arg;
1184 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1185 	mblk_t *next, *newmp;
1186 
1187 	ASSERT(mp != NULL);
1188 	do {
1189 		/*
1190 		 * Get the pointer to the next packet in the chain and then
1191 		 * clear b_next before the packet gets passed on.
1192 		 */
1193 		next = mp->b_next;
1194 		mp->b_next = NULL;
1195 
1196 		/*
1197 		 * Wind back b_rptr to point at the MAC header.
1198 		 */
1199 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1200 		mp->b_rptr -= mhip->mhi_hdrsize;
1201 
1202 		/*
1203 		 * Certain MAC type plugins provide an illusion for raw
1204 		 * DLPI consumers.  They pretend that the MAC layer is
1205 		 * something that it's not for the benefit of observability
1206 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1207 		 * for such consumers.	Here, unless native mode is enabled,
1208 		 * we call into the MAC layer so that this illusion can be
1209 		 * maintained.	The plugin will optionally transform the MAC
1210 		 * header here into something that can be passed up to raw
1211 		 * consumers.  The header goes from "cooked" mode to raw mode.
1212 		 */
1213 		if (!dsp->ds_native) {
1214 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1215 			if (newmp == NULL) {
1216 				freemsg(mp);
1217 				goto next;
1218 			}
1219 			mp = newmp;
1220 		}
1221 
1222 		/*
1223 		 * Strip the VLAN tag for VLAN streams.
1224 		 */
1225 		if (is_ethernet &&
1226 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1227 			/*
1228 			 * The priority should be kept only for VLAN
1229 			 * data-links.
1230 			 */
1231 			newmp = i_dld_ether_header_strip_tag(mp,
1232 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1233 			if (newmp == NULL) {
1234 				freemsg(mp);
1235 				goto next;
1236 			}
1237 			mp = newmp;
1238 		}
1239 
1240 		/*
1241 		 * Pass the packet on.
1242 		 */
1243 		if (canputnext(dsp->ds_rq))
1244 			putnext(dsp->ds_rq, mp);
1245 		else
1246 			freemsg(mp);
1247 
1248 next:
1249 		/*
1250 		 * Move on to the next packet in the chain.
1251 		 */
1252 		mp = next;
1253 	} while (mp != NULL);
1254 }
1255 
1256 /*
1257  * Fast-path receive function.
1258  */
1259 /*ARGSUSED*/
1260 void
1261 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1262     mac_header_info_t *mhip)
1263 {
1264 	dld_str_t *dsp = (dld_str_t *)arg;
1265 	mblk_t *next;
1266 	size_t offset = 0;
1267 
1268 	/*
1269 	 * MAC header stripping rules:
1270 	 *    - Tagged packets:
1271 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1272 	 *	b. Physical streams
1273 	 *	- VLAN packets (non-zero VID). The stream must be either a
1274 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1275 	 *	  Strip the Ethernet header but keep the VLAN header.
1276 	 *	- Special tagged packets (zero VID)
1277 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1278 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1279 	 *	    keep the VLAN header.
1280 	 *	  * Otherwise, strip the whole VLAN header.
1281 	 *    - Untagged packets. Strip the whole MAC header.
1282 	 */
1283 	if (mhip->mhi_istagged &&
1284 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1285 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1286 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1287 		offset = VLAN_TAGSZ;
1288 	}
1289 
1290 	ASSERT(mp != NULL);
1291 	do {
1292 		/*
1293 		 * Get the pointer to the next packet in the chain and then
1294 		 * clear b_next before the packet gets passed on.
1295 		 */
1296 		next = mp->b_next;
1297 		mp->b_next = NULL;
1298 
1299 		/*
1300 		 * Wind back b_rptr to point at the VLAN header.
1301 		 */
1302 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1303 		mp->b_rptr -= offset;
1304 
1305 		/*
1306 		 * Pass the packet on.
1307 		 */
1308 		if (canputnext(dsp->ds_rq))
1309 			putnext(dsp->ds_rq, mp);
1310 		else
1311 			freemsg(mp);
1312 		/*
1313 		 * Move on to the next packet in the chain.
1314 		 */
1315 		mp = next;
1316 	} while (mp != NULL);
1317 }
1318 
1319 /*
1320  * Default receive function (send DL_UNITDATA_IND messages).
1321  */
1322 /*ARGSUSED*/
1323 void
1324 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1325     mac_header_info_t *mhip)
1326 {
1327 	dld_str_t		*dsp = (dld_str_t *)arg;
1328 	mblk_t			*ud_mp;
1329 	mblk_t			*next;
1330 	size_t			offset = 0;
1331 	boolean_t		strip_vlan = B_TRUE;
1332 
1333 	/*
1334 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1335 	 */
1336 	if (mhip->mhi_istagged &&
1337 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1338 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1339 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1340 		offset = VLAN_TAGSZ;
1341 		strip_vlan = B_FALSE;
1342 	}
1343 
1344 	ASSERT(mp != NULL);
1345 	do {
1346 		/*
1347 		 * Get the pointer to the next packet in the chain and then
1348 		 * clear b_next before the packet gets passed on.
1349 		 */
1350 		next = mp->b_next;
1351 		mp->b_next = NULL;
1352 
1353 		/*
1354 		 * Wind back b_rptr to point at the MAC header.
1355 		 */
1356 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1357 		mp->b_rptr -= mhip->mhi_hdrsize;
1358 
1359 		/*
1360 		 * Create the DL_UNITDATA_IND M_PROTO.
1361 		 */
1362 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1363 			freemsgchain(mp);
1364 			return;
1365 		}
1366 
1367 		/*
1368 		 * Advance b_rptr to point at the payload (or the VLAN header).
1369 		 */
1370 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1371 
1372 		/*
1373 		 * Prepend the DL_UNITDATA_IND.
1374 		 */
1375 		ud_mp->b_cont = mp;
1376 
1377 		/*
1378 		 * Send the message.
1379 		 */
1380 		if (canputnext(dsp->ds_rq))
1381 			putnext(dsp->ds_rq, ud_mp);
1382 		else
1383 			freemsg(ud_mp);
1384 
1385 		/*
1386 		 * Move on to the next packet in the chain.
1387 		 */
1388 		mp = next;
1389 	} while (mp != NULL);
1390 }
1391 
1392 /*
1393  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1394  */
1395 static void
1396 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1397 {
1398 	mblk_t		*mp;
1399 	dl_notify_ind_t *dlip;
1400 
1401 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1402 		return;
1403 
1404 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1405 	    M_PROTO, 0)) == NULL)
1406 		return;
1407 
1408 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1409 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1410 	dlip->dl_primitive = DL_NOTIFY_IND;
1411 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1412 	dlip->dl_data = max_sdu;
1413 
1414 	qreply(dsp->ds_wq, mp);
1415 }
1416 
1417 /*
1418  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1419  * current state of the interface.
1420  */
1421 void
1422 dld_str_notify_ind(dld_str_t *dsp)
1423 {
1424 	mac_notify_type_t	type;
1425 
1426 	for (type = 0; type < MAC_NNOTE; type++)
1427 		str_notify(dsp, type);
1428 }
1429 
1430 typedef struct dl_unitdata_ind_wrapper {
1431 	dl_unitdata_ind_t	dl_unitdata;
1432 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1433 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1434 } dl_unitdata_ind_wrapper_t;
1435 
1436 /*
1437  * Create a DL_UNITDATA_IND M_PROTO message.
1438  */
1439 static mblk_t *
1440 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1441 {
1442 	mblk_t				*nmp;
1443 	dl_unitdata_ind_wrapper_t	*dlwp;
1444 	dl_unitdata_ind_t		*dlp;
1445 	mac_header_info_t		mhi;
1446 	uint_t				addr_length;
1447 	uint8_t				*daddr;
1448 	uint8_t				*saddr;
1449 
1450 	/*
1451 	 * Get the packet header information.
1452 	 */
1453 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
1454 		return (NULL);
1455 
1456 	/*
1457 	 * Allocate a message large enough to contain the wrapper structure
1458 	 * defined above.
1459 	 */
1460 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1461 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1462 	    DL_UNITDATA_IND)) == NULL)
1463 		return (NULL);
1464 
1465 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1466 
1467 	dlp = &(dlwp->dl_unitdata);
1468 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1469 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1470 
1471 	/*
1472 	 * Copy in the destination address.
1473 	 */
1474 	addr_length = dsp->ds_mip->mi_addr_length;
1475 	daddr = dlwp->dl_dest_addr;
1476 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1477 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1478 
1479 	/*
1480 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1481 	 */
1482 	if (mhi.mhi_istagged && !strip_vlan)
1483 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1484 	else
1485 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1486 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1487 
1488 	/*
1489 	 * If the destination address was multicast or broadcast then the
1490 	 * dl_group_address field should be non-zero.
1491 	 */
1492 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1493 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1494 
1495 	/*
1496 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1497 	 * for example) may not have access to source information.
1498 	 */
1499 	if (mhi.mhi_saddr == NULL) {
1500 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1501 	} else {
1502 		saddr = dlwp->dl_src_addr;
1503 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1504 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1505 
1506 		/*
1507 		 * Set the source DLSAP to the packet ethertype.
1508 		 */
1509 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1510 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1511 	}
1512 
1513 	return (nmp);
1514 }
1515 
1516 /*
1517  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1518  */
1519 static void
1520 str_notify_promisc_on_phys(dld_str_t *dsp)
1521 {
1522 	mblk_t		*mp;
1523 	dl_notify_ind_t	*dlip;
1524 
1525 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1526 		return;
1527 
1528 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1529 	    M_PROTO, 0)) == NULL)
1530 		return;
1531 
1532 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1533 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1534 	dlip->dl_primitive = DL_NOTIFY_IND;
1535 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1536 
1537 	qreply(dsp->ds_wq, mp);
1538 }
1539 
1540 /*
1541  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1542  */
1543 static void
1544 str_notify_promisc_off_phys(dld_str_t *dsp)
1545 {
1546 	mblk_t		*mp;
1547 	dl_notify_ind_t	*dlip;
1548 
1549 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1550 		return;
1551 
1552 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1553 	    M_PROTO, 0)) == NULL)
1554 		return;
1555 
1556 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1557 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1558 	dlip->dl_primitive = DL_NOTIFY_IND;
1559 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1560 
1561 	qreply(dsp->ds_wq, mp);
1562 }
1563 
1564 /*
1565  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1566  */
1567 static void
1568 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1569 {
1570 	mblk_t		*mp;
1571 	dl_notify_ind_t	*dlip;
1572 	uint_t		addr_length;
1573 	uint16_t	ethertype;
1574 
1575 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1576 		return;
1577 
1578 	addr_length = dsp->ds_mip->mi_addr_length;
1579 	if ((mp = mexchange(dsp->ds_wq, NULL,
1580 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1581 	    M_PROTO, 0)) == NULL)
1582 		return;
1583 
1584 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1585 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1586 	dlip->dl_primitive = DL_NOTIFY_IND;
1587 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1588 	dlip->dl_data = addr_type;
1589 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1590 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1591 
1592 	bcopy(addr, &dlip[1], addr_length);
1593 
1594 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1595 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1596 
1597 	qreply(dsp->ds_wq, mp);
1598 }
1599 
1600 /*
1601  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1602  */
1603 static void
1604 str_notify_link_up(dld_str_t *dsp)
1605 {
1606 	mblk_t		*mp;
1607 	dl_notify_ind_t	*dlip;
1608 
1609 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1610 		return;
1611 
1612 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1613 	    M_PROTO, 0)) == NULL)
1614 		return;
1615 
1616 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1617 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1618 	dlip->dl_primitive = DL_NOTIFY_IND;
1619 	dlip->dl_notification = DL_NOTE_LINK_UP;
1620 
1621 	qreply(dsp->ds_wq, mp);
1622 }
1623 
1624 /*
1625  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1626  */
1627 static void
1628 str_notify_link_down(dld_str_t *dsp)
1629 {
1630 	mblk_t		*mp;
1631 	dl_notify_ind_t	*dlip;
1632 
1633 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1634 		return;
1635 
1636 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1637 	    M_PROTO, 0)) == NULL)
1638 		return;
1639 
1640 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1641 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1642 	dlip->dl_primitive = DL_NOTIFY_IND;
1643 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1644 
1645 	qreply(dsp->ds_wq, mp);
1646 }
1647 
1648 /*
1649  * DL_NOTIFY_IND: DL_NOTE_SPEED
1650  */
1651 static void
1652 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1653 {
1654 	mblk_t		*mp;
1655 	dl_notify_ind_t	*dlip;
1656 
1657 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1658 		return;
1659 
1660 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1661 	    M_PROTO, 0)) == NULL)
1662 		return;
1663 
1664 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1665 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1666 	dlip->dl_primitive = DL_NOTIFY_IND;
1667 	dlip->dl_notification = DL_NOTE_SPEED;
1668 	dlip->dl_data = speed;
1669 
1670 	qreply(dsp->ds_wq, mp);
1671 }
1672 
1673 /*
1674  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1675  */
1676 static void
1677 str_notify_capab_reneg(dld_str_t *dsp)
1678 {
1679 	mblk_t		*mp;
1680 	dl_notify_ind_t	*dlip;
1681 
1682 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1683 		return;
1684 
1685 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1686 	    M_PROTO, 0)) == NULL)
1687 		return;
1688 
1689 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1690 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1691 	dlip->dl_primitive = DL_NOTIFY_IND;
1692 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1693 
1694 	qreply(dsp->ds_wq, mp);
1695 }
1696 
1697 /*
1698  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1699  */
1700 static void
1701 str_notify_fastpath_flush(dld_str_t *dsp)
1702 {
1703 	mblk_t		*mp;
1704 	dl_notify_ind_t	*dlip;
1705 
1706 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1707 		return;
1708 
1709 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1710 	    M_PROTO, 0)) == NULL)
1711 		return;
1712 
1713 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1714 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1715 	dlip->dl_primitive = DL_NOTIFY_IND;
1716 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1717 
1718 	qreply(dsp->ds_wq, mp);
1719 }
1720 
1721 /*
1722  * MAC notification callback.
1723  */
1724 void
1725 str_notify(void *arg, mac_notify_type_t type)
1726 {
1727 	dld_str_t		*dsp = (dld_str_t *)arg;
1728 	queue_t			*q = dsp->ds_wq;
1729 	mac_handle_t		mh = dsp->ds_mh;
1730 	mac_client_handle_t	mch = dsp->ds_mch;
1731 	uint8_t			addr[MAXMACADDRLEN];
1732 
1733 	switch (type) {
1734 	case MAC_NOTE_TX:
1735 		qenable(q);
1736 		break;
1737 
1738 	case MAC_NOTE_DEVPROMISC:
1739 		/*
1740 		 * Send the appropriate DL_NOTIFY_IND.
1741 		 */
1742 		if (mac_promisc_get(mh))
1743 			str_notify_promisc_on_phys(dsp);
1744 		else
1745 			str_notify_promisc_off_phys(dsp);
1746 		break;
1747 
1748 	case MAC_NOTE_UNICST:
1749 		/*
1750 		 * This notification is sent whenever the MAC unicast
1751 		 * address changes.
1752 		 */
1753 		mac_unicast_primary_get(mh, addr);
1754 
1755 		/*
1756 		 * Send the appropriate DL_NOTIFY_IND.
1757 		 */
1758 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1759 		break;
1760 
1761 	case MAC_NOTE_DEST:
1762 		/*
1763 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1764 		 * destination address.
1765 		 */
1766 		if (mac_dst_get(dsp->ds_mh, addr))
1767 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1768 		break;
1769 
1770 	case MAC_NOTE_LOWLINK:
1771 	case MAC_NOTE_LINK:
1772 		/*
1773 		 * LOWLINK refers to the actual link status. For links that
1774 		 * are not part of a bridge instance LOWLINK and LINK state
1775 		 * are the same. But for a link part of a bridge instance
1776 		 * LINK state refers to the aggregate link status: "up" when
1777 		 * at least one link part of the bridge is up and is "down"
1778 		 * when all links part of the bridge are down.
1779 		 *
1780 		 * Clients can request to be notified of the LOWLINK state
1781 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1782 		 * daemon request lowlink state changes and upper layer clients
1783 		 * receive notifications of the aggregate link state changes
1784 		 * which is the default when requesting LINK UP/DOWN state
1785 		 * notifications.
1786 		 */
1787 
1788 		/*
1789 		 * Check that the notification type matches the one that we
1790 		 * want.  If we want lower-level link notifications, and this
1791 		 * is upper, or if we want upper and this is lower, then
1792 		 * ignore.
1793 		 */
1794 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1795 			break;
1796 		/*
1797 		 * This notification is sent every time the MAC driver
1798 		 * updates the link state.
1799 		 */
1800 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1801 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1802 		case LINK_STATE_UP: {
1803 			uint64_t speed;
1804 			/*
1805 			 * The link is up so send the appropriate
1806 			 * DL_NOTIFY_IND.
1807 			 */
1808 			str_notify_link_up(dsp);
1809 
1810 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1811 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1812 			break;
1813 		}
1814 		case LINK_STATE_DOWN:
1815 			/*
1816 			 * The link is down so send the appropriate
1817 			 * DL_NOTIFY_IND.
1818 			 */
1819 			str_notify_link_down(dsp);
1820 			break;
1821 
1822 		default:
1823 			break;
1824 		}
1825 		break;
1826 
1827 	case MAC_NOTE_CAPAB_CHG:
1828 		/*
1829 		 * This notification is sent whenever the MAC resources
1830 		 * change or capabilities change. We need to renegotiate
1831 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1832 		 */
1833 		str_notify_capab_reneg(dsp);
1834 		break;
1835 
1836 	case MAC_NOTE_SDU_SIZE: {
1837 		uint_t  max_sdu;
1838 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1839 		str_notify_sdu_size(dsp, max_sdu);
1840 		break;
1841 	}
1842 
1843 	case MAC_NOTE_FASTPATH_FLUSH:
1844 		str_notify_fastpath_flush(dsp);
1845 		break;
1846 
1847 	/* Unused notifications */
1848 	case MAC_NOTE_MARGIN:
1849 		break;
1850 
1851 	default:
1852 		ASSERT(B_FALSE);
1853 		break;
1854 	}
1855 }
1856 
1857 /*
1858  * This function is called via a taskq mechansim to process all control
1859  * messages on a per 'dsp' end point.
1860  */
1861 static void
1862 dld_wput_nondata_task(void *arg)
1863 {
1864 	dld_str_t	*dsp = arg;
1865 	mblk_t		*mp;
1866 
1867 	mutex_enter(&dsp->ds_lock);
1868 	while (dsp->ds_pending_head != NULL) {
1869 		mp = dsp->ds_pending_head;
1870 		dsp->ds_pending_head = mp->b_next;
1871 		mp->b_next = NULL;
1872 		if (dsp->ds_pending_head == NULL)
1873 			dsp->ds_pending_tail = NULL;
1874 		mutex_exit(&dsp->ds_lock);
1875 
1876 		switch (DB_TYPE(mp)) {
1877 		case M_PROTO:
1878 		case M_PCPROTO:
1879 			dld_proto(dsp, mp);
1880 			break;
1881 		case M_IOCTL:
1882 			dld_ioc(dsp, mp);
1883 			break;
1884 		default:
1885 			ASSERT(0);
1886 		}
1887 
1888 		mutex_enter(&dsp->ds_lock);
1889 	}
1890 	ASSERT(dsp->ds_pending_tail == NULL);
1891 	dsp->ds_dlpi_pending = 0;
1892 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1893 	mutex_exit(&dsp->ds_lock);
1894 }
1895 
1896 /*
1897  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1898  * thread is started at boot time.
1899  */
1900 static void
1901 dld_taskq_dispatch(void)
1902 {
1903 	callb_cpr_t	cprinfo;
1904 	dld_str_t	*dsp;
1905 
1906 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1907 	    "dld_taskq_dispatch");
1908 	mutex_enter(&dld_taskq_lock);
1909 
1910 	while (!dld_taskq_quit) {
1911 		dsp = list_head(&dld_taskq_list);
1912 		while (dsp != NULL) {
1913 			list_remove(&dld_taskq_list, dsp);
1914 			mutex_exit(&dld_taskq_lock);
1915 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1916 			    dsp, TQ_SLEEP) != 0);
1917 			mutex_enter(&dld_taskq_lock);
1918 			dsp = list_head(&dld_taskq_list);
1919 		}
1920 
1921 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1922 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1923 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1924 	}
1925 
1926 	dld_taskq_done = B_TRUE;
1927 	cv_signal(&dld_taskq_cv);
1928 	CALLB_CPR_EXIT(&cprinfo);
1929 	thread_exit();
1930 }
1931 
1932 /*
1933  * All control operations are serialized on the 'dsp' and are also funneled
1934  * through a taskq mechanism to ensure that subsequent processing has kernel
1935  * context and can safely use cv_wait.
1936  *
1937  * Mechanisms to handle taskq dispatch failures
1938  *
1939  * The only way to be sure that taskq dispatch does not fail is to either
1940  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1941  * some number of entries and make sure that the number of outstanding requests
1942  * are less than that number. We can't use TQ_SLEEP since we don't know the
1943  * context. Nor can we bound the total number of 'dsp' end points. So we are
1944  * unable to use either of the above schemes, and are forced to deal with
1945  * taskq dispatch failures. Note that even dynamic taskq could fail in
1946  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1947  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1948  * framework.
1949  *
1950  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1951  * We also have a single global thread to retry the taskq dispatch. This
1952  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1953  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1954  */
1955 static void
1956 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1957 {
1958 	ASSERT(mp->b_next == NULL);
1959 	mutex_enter(&dsp->ds_lock);
1960 	if (dsp->ds_pending_head != NULL) {
1961 		ASSERT(dsp->ds_dlpi_pending);
1962 		dsp->ds_pending_tail->b_next = mp;
1963 		dsp->ds_pending_tail = mp;
1964 		mutex_exit(&dsp->ds_lock);
1965 		return;
1966 	}
1967 	ASSERT(dsp->ds_pending_tail == NULL);
1968 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
1969 	/*
1970 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
1971 	 * thread is still active and is processing the last message, though
1972 	 * the pending queue has been emptied.
1973 	 */
1974 	if (dsp->ds_dlpi_pending) {
1975 		mutex_exit(&dsp->ds_lock);
1976 		return;
1977 	}
1978 
1979 	dsp->ds_dlpi_pending = 1;
1980 	mutex_exit(&dsp->ds_lock);
1981 
1982 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
1983 	    TQ_NOSLEEP) != 0)
1984 		return;
1985 
1986 	mutex_enter(&dld_taskq_lock);
1987 	list_insert_tail(&dld_taskq_list, dsp);
1988 	cv_signal(&dld_taskq_cv);
1989 	mutex_exit(&dld_taskq_lock);
1990 }
1991 
1992 /*
1993  * Process an M_IOCTL message.
1994  */
1995 static void
1996 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1997 {
1998 	uint_t			cmd;
1999 
2000 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2001 	ASSERT(dsp->ds_type == DLD_DLPI);
2002 
2003 	switch (cmd) {
2004 	case DLIOCNATIVE:
2005 		ioc_native(dsp, mp);
2006 		break;
2007 	case DLIOCMARGININFO:
2008 		ioc_margin(dsp, mp);
2009 		break;
2010 	case DLIOCRAW:
2011 		ioc_raw(dsp, mp);
2012 		break;
2013 	case DLIOCHDRINFO:
2014 		ioc_fast(dsp, mp);
2015 		break;
2016 	case DLIOCLOWLINK:
2017 		ioc_lowlink(dsp, mp);
2018 		break;
2019 	default:
2020 		ioc(dsp, mp);
2021 	}
2022 }
2023 
2024 /*
2025  * DLIOCNATIVE
2026  */
2027 static void
2028 ioc_native(dld_str_t *dsp, mblk_t *mp)
2029 {
2030 	queue_t *q = dsp->ds_wq;
2031 	const mac_info_t *mip = dsp->ds_mip;
2032 
2033 	/*
2034 	 * Native mode can be enabled if it's disabled and if the
2035 	 * native media type is different.
2036 	 */
2037 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2038 		dsp->ds_native = B_TRUE;
2039 
2040 	if (dsp->ds_native)
2041 		miocack(q, mp, 0, mip->mi_nativemedia);
2042 	else
2043 		miocnak(q, mp, 0, ENOTSUP);
2044 }
2045 
2046 /*
2047  * DLIOCMARGININFO
2048  */
2049 static void
2050 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2051 {
2052 	queue_t *q = dsp->ds_wq;
2053 	uint32_t margin;
2054 	int err;
2055 
2056 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2057 		err = EINVAL;
2058 		goto failed;
2059 	}
2060 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2061 		goto failed;
2062 
2063 	mac_margin_get(dsp->ds_mh, &margin);
2064 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2065 	miocack(q, mp, sizeof (uint32_t), 0);
2066 	return;
2067 
2068 failed:
2069 	miocnak(q, mp, 0, err);
2070 }
2071 
2072 /*
2073  * DLIOCRAW
2074  */
2075 static void
2076 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2077 {
2078 	queue_t *q = dsp->ds_wq;
2079 	mac_perim_handle_t	mph;
2080 
2081 	if (dsp->ds_mh == NULL) {
2082 		dsp->ds_mode = DLD_RAW;
2083 		miocack(q, mp, 0, 0);
2084 		return;
2085 	}
2086 
2087 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2088 	if (dsp->ds_polling || dsp->ds_direct) {
2089 		mac_perim_exit(mph);
2090 		miocnak(q, mp, 0, EPROTO);
2091 		return;
2092 	}
2093 
2094 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2095 		/*
2096 		 * Set the receive callback.
2097 		 */
2098 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2099 	}
2100 
2101 	/*
2102 	 * Note that raw mode is enabled.
2103 	 */
2104 	dsp->ds_mode = DLD_RAW;
2105 	mac_perim_exit(mph);
2106 
2107 	miocack(q, mp, 0, 0);
2108 }
2109 
2110 /*
2111  * DLIOCHDRINFO
2112  */
2113 static void
2114 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2115 {
2116 	dl_unitdata_req_t *dlp;
2117 	off_t		off;
2118 	size_t		len;
2119 	const uint8_t	*addr;
2120 	uint16_t	sap;
2121 	mblk_t		*nmp;
2122 	mblk_t		*hmp;
2123 	uint_t		addr_length;
2124 	queue_t		*q = dsp->ds_wq;
2125 	int		err;
2126 	mac_perim_handle_t	mph;
2127 
2128 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2129 		err = ENOTSUP;
2130 		goto failed;
2131 	}
2132 
2133 	/*
2134 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2135 	 * user-land should not be allowed.
2136 	 */
2137 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2138 		err = EINVAL;
2139 		goto failed;
2140 	}
2141 
2142 	nmp = mp->b_cont;
2143 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2144 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2145 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2146 		err = EINVAL;
2147 		goto failed;
2148 	}
2149 
2150 	off = dlp->dl_dest_addr_offset;
2151 	len = dlp->dl_dest_addr_length;
2152 
2153 	if (!MBLKIN(nmp, off, len)) {
2154 		err = EINVAL;
2155 		goto failed;
2156 	}
2157 
2158 	if (dsp->ds_dlstate != DL_IDLE) {
2159 		err = ENOTSUP;
2160 		goto failed;
2161 	}
2162 
2163 	addr_length = dsp->ds_mip->mi_addr_length;
2164 	if (len != addr_length + sizeof (uint16_t)) {
2165 		err = EINVAL;
2166 		goto failed;
2167 	}
2168 
2169 	addr = nmp->b_rptr + off;
2170 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2171 
2172 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2173 		err = ENOMEM;
2174 		goto failed;
2175 	}
2176 
2177 	/*
2178 	 * This ioctl might happen concurrently with a direct call to dld_capab
2179 	 * that tries to enable direct and/or poll capabilities. Since the
2180 	 * stack does not serialize them, we do so here to avoid mixing
2181 	 * the callbacks.
2182 	 */
2183 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2184 	if (dsp->ds_mode != DLD_FASTPATH) {
2185 		/*
2186 		 * Set the receive callback (unless polling is enabled).
2187 		 */
2188 		if (!dsp->ds_polling && !dsp->ds_direct)
2189 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2190 
2191 		/*
2192 		 * Note that fast-path mode is enabled.
2193 		 */
2194 		dsp->ds_mode = DLD_FASTPATH;
2195 	}
2196 	mac_perim_exit(mph);
2197 
2198 	freemsg(nmp->b_cont);
2199 	nmp->b_cont = hmp;
2200 
2201 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2202 	return;
2203 failed:
2204 	miocnak(q, mp, 0, err);
2205 }
2206 
2207 /*
2208  * DLIOCLOWLINK: request actual link state changes. When the
2209  * link is part of a bridge instance the client receives actual
2210  * link state changes and not the aggregate link status. Used by
2211  * the bridging daemon (bridged) for proper RSTP operation.
2212  */
2213 static void
2214 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2215 {
2216 	queue_t *q = dsp->ds_wq;
2217 	int err;
2218 
2219 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2220 		miocnak(q, mp, 0, err);
2221 	} else {
2222 		/* LINTED: alignment */
2223 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2224 		miocack(q, mp, 0, 0);
2225 	}
2226 }
2227 
2228 /*
2229  * Catch-all handler.
2230  */
2231 static void
2232 ioc(dld_str_t *dsp, mblk_t *mp)
2233 {
2234 	queue_t	*q = dsp->ds_wq;
2235 
2236 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2237 		miocnak(q, mp, 0, EINVAL);
2238 		return;
2239 	}
2240 	mac_ioctl(dsp->ds_mh, q, mp);
2241 }
2242