xref: /titanic_50/usr/src/uts/common/io/dld/dld_str.c (revision f899e5733f35e45012ad40c8325b2622dcc2b673)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<inet/common.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/cpuvar.h>
37 #include	<sys/callb.h>
38 #include	<sys/list.h>
39 #include	<sys/mac_client.h>
40 #include	<sys/mac_client_priv.h>
41 
42 static int	str_constructor(void *, void *, int);
43 static void	str_destructor(void *, void *);
44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
45 static void	str_notify_promisc_on_phys(dld_str_t *);
46 static void	str_notify_promisc_off_phys(dld_str_t *);
47 static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
48 static void	str_notify_link_up(dld_str_t *);
49 static void	str_notify_link_down(dld_str_t *);
50 static void	str_notify_capab_reneg(dld_str_t *);
51 static void	str_notify_speed(dld_str_t *, uint32_t);
52 
53 static void	ioc_native(dld_str_t *,  mblk_t *);
54 static void	ioc_margin(dld_str_t *, mblk_t *);
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc_lowlink(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
61 
62 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
63 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
64     link_tagmode_t);
65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
66 
/*
 * str_count:	number of dld_str_t objects currently handed out by
 *		dld_str_create(); dld_str_fini() fails with EBUSY while it
 *		is non-zero.
 * str_cachep:	kmem cache of dld_str_t objects (see dld_str_init()).
 * str_hashp:	hash of live dld_str_t's, keyed by ds_minor.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static mod_hash_t	*str_hashp;

/* Size passed to mod_hash_create_idhash() for str_hashp. */
#define	STR_HASHSZ		64
/* Convert an integral key (a minor number) into a mod_hash key. */
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* dld_taskq is simply an alias for the system taskq. */
#define	dld_taskq	system_taskq

/*
 * State shared with the dld_taskq_dispatch() thread created in
 * dld_str_init().  dld_str_fini() sets dld_taskq_quit and then waits on
 * dld_taskq_cv for dld_taskq_done to become true.
 * NOTE(review): dld_taskq_done is presumably set by dld_taskq_dispatch(),
 * which is not visible from here -- confirm.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

static void		dld_taskq_dispatch(void);
83 
84 /*
85  * Some notes on entry points, flow-control, queueing.
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
93  *
94  * The transmit logic operates in the following way: All packets coming
95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
96  * happens when the MAC layer indicates the packets couldn't be
97  * transmitted due to 1) lack of resources (e.g. running out of
98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
99  * particular flow. The indication comes in the form of a Tx cookie that
100  * identifies the blocked ring. In such case, DLD will place a
101  * dummy message on its write-side STREAMS queue so that the queue is
102  * marked as "full". Any subsequent packets arriving at the driver will
103  * still be sent to the MAC layer, where they are either queued in the Tx
104  * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
106  * When the write service procedure runs, it will remove the dummy
107  * message from the write-side STREAMS queue; in effect this will trigger
108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
109  * respectively, due to the above reasons.
110  *
111  * All non-data operations, both DLPI and ioctls are single threaded on a per
112  * dld_str_t endpoint. This is done using a taskq so that the control operation
113  * has kernel context and can cv_wait for resources. In addition all set type
114  * operations that involve mac level state modification are serialized on a
115  * per mac end point using the perimeter mechanism provided by the mac layer.
116  * This serializes all mac clients trying to modify a single mac end point over
117  * the entire sequence of mac calls made by that client as an atomic unit. The
118  * mac framework locking is described in mac.c. A critical element is that
119  * DLD/DLS does not hold any locks across the mac perimeter.
120  *
121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
124  * is returned. If the mac handle is non-null, it can be safely accessed
125  * below. The mac handle won't be freed until the mac_unregister which
126  * won't happen until the driver detaches. The DDI framework ensures that
127  * the detach won't happen while a getinfo is in progress.
128  */
/*
 * Lookup state handed to i_dld_str_walker() by dld_finddevinfo().
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* in: major number of the dev_t */
	minor_t		ds_minor;	/* in: minor number of the dev_t */
	dev_info_t	*ds_dip;	/* out: devinfo found, else NULL */
} i_dld_str_state_t;
134 
135 /* ARGSUSED */
136 static uint_t
137 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
138 {
139 	i_dld_str_state_t	*statep = arg;
140 	dld_str_t		*dsp = (dld_str_t *)val;
141 	mac_handle_t		mh;
142 
143 	if (statep->ds_major != dsp->ds_major)
144 		return (MH_WALK_CONTINUE);
145 
146 	ASSERT(statep->ds_minor != 0);
147 	mh = dsp->ds_mh;
148 
149 	if (statep->ds_minor == dsp->ds_minor) {
150 		/*
151 		 * Clone: a clone minor is unique. we can terminate the
152 		 * walk if we find a matching stream -- even if we fail
153 		 * to obtain the devinfo.
154 		 */
155 		if (mh != NULL)
156 			statep->ds_dip = mac_devinfo_get(mh);
157 		return (MH_WALK_TERMINATE);
158 	}
159 	return (MH_WALK_CONTINUE);
160 }
161 
162 static dev_info_t *
163 dld_finddevinfo(dev_t dev)
164 {
165 	dev_info_t		*dip;
166 	i_dld_str_state_t	state;
167 
168 	if (getminor(dev) == 0)
169 		return (NULL);
170 
171 	/*
172 	 * See if it's a minor node of a link
173 	 */
174 	if ((dip = dls_link_devinfo(dev)) != NULL)
175 		return (dip);
176 
177 	state.ds_minor = getminor(dev);
178 	state.ds_major = getmajor(dev);
179 	state.ds_dip = NULL;
180 
181 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
182 	return (state.ds_dip);
183 }
184 
185 /*
186  * devo_getinfo: getinfo(9e)
187  */
188 /*ARGSUSED*/
189 int
190 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
191 {
192 	dev_info_t	*devinfo;
193 	minor_t		minor = getminor((dev_t)arg);
194 	int		rc = DDI_FAILURE;
195 
196 	switch (cmd) {
197 	case DDI_INFO_DEVT2DEVINFO:
198 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
199 			*(dev_info_t **)resp = devinfo;
200 			rc = DDI_SUCCESS;
201 		}
202 		break;
203 	case DDI_INFO_DEVT2INSTANCE:
204 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
205 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
206 			rc = DDI_SUCCESS;
207 		} else if (minor > DLS_MAX_MINOR &&
208 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
209 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
210 			rc = DDI_SUCCESS;
211 		}
212 		break;
213 	}
214 	return (rc);
215 }
216 
217 void *
218 dld_str_private(queue_t *q)
219 {
220 	return (((dld_str_t *)(q->q_ptr))->ds_private);
221 }
222 
223 int
224 dld_str_open(queue_t *rq, dev_t *devp, void *private)
225 {
226 	dld_str_t	*dsp;
227 	major_t		major;
228 	minor_t		minor;
229 	int		err;
230 
231 	major = getmajor(*devp);
232 	minor = getminor(*devp);
233 
234 	/*
235 	 * Create a new dld_str_t for the stream. This will grab a new minor
236 	 * number that will be handed back in the cloned dev_t.  Creation may
237 	 * fail if we can't allocate the dummy mblk used for flow-control.
238 	 */
239 	dsp = dld_str_create(rq, DLD_DLPI, major,
240 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
241 	if (dsp == NULL)
242 		return (ENOSR);
243 
244 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
245 	dsp->ds_private = private;
246 	if (minor != 0) {
247 		/*
248 		 * Style 1 open
249 		 */
250 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
251 			goto failed;
252 
253 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
254 	} else {
255 		(void) qassociate(rq, -1);
256 	}
257 
258 	/*
259 	 * Enable the queue srv(9e) routine.
260 	 */
261 	qprocson(rq);
262 
263 	/*
264 	 * Construct a cloned dev_t to hand back.
265 	 */
266 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
267 	return (0);
268 
269 failed:
270 	dld_str_destroy(dsp);
271 	return (err);
272 }
273 
274 int
275 dld_str_close(queue_t *rq)
276 {
277 	dld_str_t	*dsp = rq->q_ptr;
278 
279 	/*
280 	 * All modules on top have been popped off. So there can't be any
281 	 * threads from the top.
282 	 */
283 	ASSERT(dsp->ds_datathr_cnt == 0);
284 
285 	/*
286 	 * Wait until pending DLPI requests are processed.
287 	 */
288 	mutex_enter(&dsp->ds_lock);
289 	while (dsp->ds_dlpi_pending)
290 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
291 	mutex_exit(&dsp->ds_lock);
292 
293 
294 	/*
295 	 * This stream was open to a provider node. Check to see
296 	 * if it has been cleanly shut down.
297 	 */
298 	if (dsp->ds_dlstate != DL_UNATTACHED) {
299 		/*
300 		 * The stream is either open to a style 1 provider or
301 		 * this is not clean shutdown. Detach from the PPA.
302 		 * (This is still ok even in the style 1 case).
303 		 */
304 		dld_str_detach(dsp);
305 	}
306 
307 	dld_str_destroy(dsp);
308 	return (0);
309 }
310 
311 /*
312  * qi_qopen: open(9e)
313  */
314 /*ARGSUSED*/
315 int
316 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
317 {
318 	if (sflag == MODOPEN)
319 		return (ENOTSUP);
320 
321 	/*
322 	 * This is a cloning driver and therefore each queue should only
323 	 * ever get opened once.
324 	 */
325 	if (rq->q_ptr != NULL)
326 		return (EBUSY);
327 
328 	return (dld_str_open(rq, devp, NULL));
329 }
330 
331 /*
332  * qi_qclose: close(9e)
333  */
334 int
335 dld_close(queue_t *rq)
336 {
337 	/*
338 	 * Disable the queue srv(9e) routine.
339 	 */
340 	qprocsoff(rq);
341 
342 	return (dld_str_close(rq));
343 }
344 
345 /*
346  * qi_qputp: put(9e)
347  */
348 void
349 dld_wput(queue_t *wq, mblk_t *mp)
350 {
351 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
352 	dld_str_mode_t	mode;
353 
354 	switch (DB_TYPE(mp)) {
355 	case M_DATA:
356 		mutex_enter(&dsp->ds_lock);
357 		mode = dsp->ds_mode;
358 		if ((dsp->ds_dlstate != DL_IDLE) ||
359 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
360 			mutex_exit(&dsp->ds_lock);
361 			freemsg(mp);
362 			break;
363 		}
364 
365 		DLD_DATATHR_INC(dsp);
366 		mutex_exit(&dsp->ds_lock);
367 		if (mode == DLD_FASTPATH) {
368 			if (dsp->ds_mip->mi_media == DL_ETHER &&
369 			    (MBLKL(mp) < sizeof (struct ether_header))) {
370 				freemsg(mp);
371 			} else {
372 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
373 			}
374 		} else {
375 			str_mdata_raw_put(dsp, mp);
376 		}
377 		DLD_DATATHR_DCR(dsp);
378 		break;
379 	case M_PROTO:
380 	case M_PCPROTO: {
381 		t_uscalar_t	prim;
382 
383 		if (MBLKL(mp) < sizeof (t_uscalar_t))
384 			break;
385 
386 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
387 
388 		if (prim == DL_UNITDATA_REQ) {
389 			proto_unitdata_req(dsp, mp);
390 		} else {
391 			dld_wput_nondata(dsp, mp);
392 		}
393 		break;
394 	}
395 
396 	case M_IOCTL:
397 		dld_wput_nondata(dsp, mp);
398 		break;
399 
400 	case M_FLUSH:
401 		if (*mp->b_rptr & FLUSHW) {
402 			DLD_CLRQFULL(dsp);
403 			*mp->b_rptr &= ~FLUSHW;
404 		}
405 
406 		if (*mp->b_rptr & FLUSHR) {
407 			qreply(wq, mp);
408 		} else {
409 			freemsg(mp);
410 		}
411 		break;
412 
413 	default:
414 		freemsg(mp);
415 		break;
416 	}
417 }
418 
419 /*
420  * qi_srvp: srv(9e)
421  */
422 void
423 dld_wsrv(queue_t *wq)
424 {
425 	dld_str_t	*dsp = wq->q_ptr;
426 
427 	DLD_CLRQFULL(dsp);
428 }
429 
430 void
431 dld_init_ops(struct dev_ops *ops, const char *name)
432 {
433 	struct streamtab *stream;
434 	struct qinit *rq, *wq;
435 	struct module_info *modinfo;
436 
437 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
438 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
439 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
440 	modinfo->mi_minpsz = 0;
441 	modinfo->mi_maxpsz = 64*1024;
442 	modinfo->mi_hiwat  = 1;
443 	modinfo->mi_lowat = 0;
444 
445 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
446 	rq->qi_qopen = dld_open;
447 	rq->qi_qclose = dld_close;
448 	rq->qi_minfo = modinfo;
449 
450 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
451 	wq->qi_putp = (pfi_t)dld_wput;
452 	wq->qi_srvp = (pfi_t)dld_wsrv;
453 	wq->qi_minfo = modinfo;
454 
455 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
456 	stream->st_rdinit = rq;
457 	stream->st_wrinit = wq;
458 	ops->devo_cb_ops->cb_str = stream;
459 
460 	if (ops->devo_getinfo == NULL)
461 		ops->devo_getinfo = &dld_getinfo;
462 }
463 
464 void
465 dld_fini_ops(struct dev_ops *ops)
466 {
467 	struct streamtab *stream;
468 	struct qinit *rq, *wq;
469 	struct module_info *modinfo;
470 
471 	stream = ops->devo_cb_ops->cb_str;
472 	rq = stream->st_rdinit;
473 	wq = stream->st_wrinit;
474 	modinfo = rq->qi_minfo;
475 	ASSERT(wq->qi_minfo == modinfo);
476 
477 	kmem_free(stream, sizeof (struct streamtab));
478 	kmem_free(wq, sizeof (struct qinit));
479 	kmem_free(rq, sizeof (struct qinit));
480 	kmem_free(modinfo->mi_idname, FMNAMESZ);
481 	kmem_free(modinfo, sizeof (struct module_info));
482 }
483 
484 /*
485  * Initialize this module's data structures.
486  */
487 void
488 dld_str_init(void)
489 {
490 	/*
491 	 * Create dld_str_t object cache.
492 	 */
493 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
494 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
495 	ASSERT(str_cachep != NULL);
496 
497 	/*
498 	 * Create a hash table for maintaining dld_str_t's.
499 	 * The ds_minor field (the clone minor number) of a dld_str_t
500 	 * is used as a key for this hash table because this number is
501 	 * globally unique (allocated from "dls_minor_arena").
502 	 */
503 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
504 	    mod_hash_null_valdtor);
505 
506 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
507 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
508 
509 	dld_taskq_quit = B_FALSE;
510 	dld_taskq_done = B_FALSE;
511 	list_create(&dld_taskq_list, sizeof (dld_str_t),
512 	    offsetof(dld_str_t, ds_tqlist));
513 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
514 	    &p0, TS_RUN, minclsyspri);
515 }
516 
517 /*
518  * Tear down this module's data structures.
519  */
520 int
521 dld_str_fini(void)
522 {
523 	/*
524 	 * Make sure that there are no objects in use.
525 	 */
526 	if (str_count != 0)
527 		return (EBUSY);
528 
529 	/*
530 	 * Ask the dld_taskq thread to quit and wait for it to be done
531 	 */
532 	mutex_enter(&dld_taskq_lock);
533 	dld_taskq_quit = B_TRUE;
534 	cv_signal(&dld_taskq_cv);
535 	while (!dld_taskq_done)
536 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
537 	mutex_exit(&dld_taskq_lock);
538 	list_destroy(&dld_taskq_list);
539 	/*
540 	 * Destroy object cache.
541 	 */
542 	kmem_cache_destroy(str_cachep);
543 	mod_hash_destroy_idhash(str_hashp);
544 	return (0);
545 }
546 
547 /*
548  * Create a new dld_str_t object.
549  */
550 dld_str_t *
551 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
552 {
553 	dld_str_t	*dsp;
554 	int		err;
555 
556 	/*
557 	 * Allocate an object from the cache.
558 	 */
559 	atomic_add_32(&str_count, 1);
560 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
561 
562 	/*
563 	 * Allocate the dummy mblk for flow-control.
564 	 */
565 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
566 	if (dsp->ds_tx_flow_mp == NULL) {
567 		kmem_cache_free(str_cachep, dsp);
568 		atomic_add_32(&str_count, -1);
569 		return (NULL);
570 	}
571 	dsp->ds_type = type;
572 	dsp->ds_major = major;
573 	dsp->ds_style = style;
574 
575 	/*
576 	 * Initialize the queue pointers.
577 	 */
578 	ASSERT(RD(rq) == rq);
579 	dsp->ds_rq = rq;
580 	dsp->ds_wq = WR(rq);
581 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
582 
583 	/*
584 	 * We want explicit control over our write-side STREAMS queue
585 	 * where the dummy mblk gets added/removed for flow-control.
586 	 */
587 	noenable(WR(rq));
588 
589 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
590 	    (mod_hash_val_t)dsp);
591 	ASSERT(err == 0);
592 	return (dsp);
593 }
594 
595 /*
596  * Destroy a dld_str_t object.
597  */
598 void
599 dld_str_destroy(dld_str_t *dsp)
600 {
601 	queue_t		*rq;
602 	queue_t		*wq;
603 	mod_hash_val_t	val;
604 
605 	/*
606 	 * Clear the queue pointers.
607 	 */
608 	rq = dsp->ds_rq;
609 	wq = dsp->ds_wq;
610 	ASSERT(wq == WR(rq));
611 	rq->q_ptr = wq->q_ptr = NULL;
612 	dsp->ds_rq = dsp->ds_wq = NULL;
613 
614 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
615 	ASSERT(dsp->ds_sap == 0);
616 	ASSERT(dsp->ds_mh == NULL);
617 	ASSERT(dsp->ds_mch == NULL);
618 	ASSERT(dsp->ds_promisc == 0);
619 	ASSERT(dsp->ds_mph == NULL);
620 	ASSERT(dsp->ds_mip == NULL);
621 	ASSERT(dsp->ds_mnh == NULL);
622 
623 	ASSERT(dsp->ds_polling == B_FALSE);
624 	ASSERT(dsp->ds_direct == B_FALSE);
625 	ASSERT(dsp->ds_lso == B_FALSE);
626 	ASSERT(dsp->ds_lso_max == 0);
627 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
628 
629 	/*
630 	 * Reinitialize all the flags.
631 	 */
632 	dsp->ds_notifications = 0;
633 	dsp->ds_passivestate = DLD_UNINITIALIZED;
634 	dsp->ds_mode = DLD_UNITDATA;
635 	dsp->ds_native = B_FALSE;
636 
637 	ASSERT(dsp->ds_datathr_cnt == 0);
638 	ASSERT(dsp->ds_pending_head == NULL);
639 	ASSERT(dsp->ds_pending_tail == NULL);
640 	ASSERT(!dsp->ds_dlpi_pending);
641 
642 	ASSERT(dsp->ds_dlp == NULL);
643 	ASSERT(dsp->ds_dmap == NULL);
644 	ASSERT(dsp->ds_rx == NULL);
645 	ASSERT(dsp->ds_rx_arg == NULL);
646 	ASSERT(dsp->ds_next == NULL);
647 	ASSERT(dsp->ds_head == NULL);
648 
649 	/*
650 	 * Free the dummy mblk if exists.
651 	 */
652 	if (dsp->ds_tx_flow_mp != NULL) {
653 		freeb(dsp->ds_tx_flow_mp);
654 		dsp->ds_tx_flow_mp = NULL;
655 	}
656 
657 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
658 	ASSERT(dsp == (dld_str_t *)val);
659 
660 	/*
661 	 * Free the object back to the cache.
662 	 */
663 	kmem_cache_free(str_cachep, dsp);
664 	atomic_add_32(&str_count, -1);
665 }
666 
667 /*
668  * kmem_cache contructor function: see kmem_cache_create(9f).
669  */
670 /*ARGSUSED*/
671 static int
672 str_constructor(void *buf, void *cdrarg, int kmflags)
673 {
674 	dld_str_t	*dsp = buf;
675 
676 	bzero(buf, sizeof (dld_str_t));
677 
678 	/*
679 	 * Allocate a new minor number.
680 	 */
681 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
682 		return (-1);
683 
684 	/*
685 	 * Initialize the DLPI state machine.
686 	 */
687 	dsp->ds_dlstate = DL_UNATTACHED;
688 
689 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
690 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
691 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
692 
693 	return (0);
694 }
695 
696 /*
697  * kmem_cache destructor function.
698  */
699 /*ARGSUSED*/
700 static void
701 str_destructor(void *buf, void *cdrarg)
702 {
703 	dld_str_t	*dsp = buf;
704 
705 	/*
706 	 * Release the minor number.
707 	 */
708 	mac_minor_rele(dsp->ds_minor);
709 
710 	ASSERT(dsp->ds_tx_flow_mp == NULL);
711 
712 	mutex_destroy(&dsp->ds_lock);
713 	cv_destroy(&dsp->ds_datathr_cv);
714 	cv_destroy(&dsp->ds_dlpi_pending_cv);
715 }
716 
717 /*
718  * Update the priority bits and VID (may need to insert tag if mp points
719  * to an untagged packet.
720  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
721  */
722 static mblk_t *
723 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
724     link_tagmode_t tagmode)
725 {
726 	mblk_t *hmp;
727 	struct ether_vlan_header *evhp;
728 	struct ether_header *ehp;
729 	uint16_t old_tci = 0;
730 	size_t len;
731 
732 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
733 
734 	evhp = (struct ether_vlan_header *)mp->b_rptr;
735 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
736 		/*
737 		 * Tagged packet, update the priority bits.
738 		 */
739 		len = sizeof (struct ether_vlan_header);
740 
741 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
742 			/*
743 			 * In case some drivers only check the db_ref
744 			 * count of the first mblk, we pullup the
745 			 * message into a single mblk.
746 			 */
747 			hmp = msgpullup(mp, -1);
748 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
749 				freemsg(hmp);
750 				return (NULL);
751 			} else {
752 				freemsg(mp);
753 				mp = hmp;
754 			}
755 		}
756 
757 		evhp = (struct ether_vlan_header *)mp->b_rptr;
758 		old_tci = ntohs(evhp->ether_tci);
759 	} else {
760 		/*
761 		 * Untagged packet.  Two factors will cause us to insert a
762 		 * VLAN header:
763 		 * - This is a VLAN link (vid is specified)
764 		 * - The link supports user priority tagging and the priority
765 		 *   is non-zero.
766 		 */
767 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
768 			return (mp);
769 
770 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
771 		if (hmp == NULL)
772 			return (NULL);
773 
774 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
775 		ehp = (struct ether_header *)mp->b_rptr;
776 
777 		/*
778 		 * Copy the MAC addresses and typelen
779 		 */
780 		bcopy(ehp, evhp, (ETHERADDRL * 2));
781 		evhp->ether_type = ehp->ether_type;
782 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
783 
784 		hmp->b_wptr += sizeof (struct ether_vlan_header);
785 		mp->b_rptr += sizeof (struct ether_header);
786 
787 		/*
788 		 * Free the original message if it's now empty. Link the
789 		 * rest of the messages to the header message.
790 		 */
791 		if (MBLKL(mp) == 0) {
792 			hmp->b_cont = mp->b_cont;
793 			freeb(mp);
794 		} else {
795 			hmp->b_cont = mp;
796 		}
797 		mp = hmp;
798 	}
799 
800 	if (pri == 0)
801 		pri = VLAN_PRI(old_tci);
802 	if (vid == VLAN_ID_NONE)
803 		vid = VLAN_ID(old_tci);
804 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
805 	return (mp);
806 }
807 
808 /*
809  * M_DATA put (IP fast-path mode)
810  */
811 mac_tx_cookie_t
812 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
813     uint16_t flag)
814 {
815 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
816 	mblk_t *newmp;
817 	uint_t pri;
818 	mac_tx_cookie_t cookie;
819 
820 	if (is_ethernet) {
821 		/*
822 		 * Update the priority bits to the assigned priority.
823 		 */
824 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
825 
826 		if (pri != 0) {
827 			newmp = i_dld_ether_header_update_tag(mp, pri,
828 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
829 			if (newmp == NULL)
830 				goto discard;
831 			mp = newmp;
832 		}
833 	}
834 
835 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
836 		DLD_SETQFULL(dsp);
837 	}
838 	return (cookie);
839 
840 discard:
841 	/* TODO: bump kstat? */
842 	freemsg(mp);
843 	return (NULL);
844 }
845 
846 /*
847  * M_DATA put (DLIOCRAW mode)
848  */
849 static void
850 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
851 {
852 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
853 	mblk_t *bp, *newmp;
854 	size_t size;
855 	mac_header_info_t mhi;
856 	uint_t pri, vid, dvid;
857 	uint_t max_sdu;
858 
859 	/*
860 	 * Certain MAC type plugins provide an illusion for raw DLPI
861 	 * consumers.  They pretend that the MAC layer is something that
862 	 * it's not for the benefit of observability tools.  For example,
863 	 * mac_wifi pretends that it's Ethernet for such consumers.
864 	 * Here, unless native mode is enabled, we call into the MAC layer so
865 	 * that this illusion can be maintained.  The plugin will optionally
866 	 * transform the MAC header here into something that can be passed
867 	 * down.  The header goes from raw mode to "cooked" mode.
868 	 */
869 	if (!dsp->ds_native) {
870 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
871 			goto discard;
872 		mp = newmp;
873 	}
874 
875 	size = MBLKL(mp);
876 
877 	/*
878 	 * Check the packet is not too big and that any remaining
879 	 * fragment list is composed entirely of M_DATA messages. (We
880 	 * know the first fragment was M_DATA otherwise we could not
881 	 * have got here).
882 	 */
883 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
884 		if (DB_TYPE(bp) != M_DATA)
885 			goto discard;
886 		size += MBLKL(bp);
887 	}
888 
889 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
890 		goto discard;
891 
892 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
893 	/*
894 	 * If LSO is enabled, check the size against lso_max. Otherwise,
895 	 * compare the packet size with max_sdu.
896 	 */
897 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
898 	if (size > max_sdu + mhi.mhi_hdrsize)
899 		goto discard;
900 
901 	if (is_ethernet) {
902 		dvid = mac_client_vid(dsp->ds_mch);
903 
904 		/*
905 		 * Discard the packet if this is a VLAN stream but the VID in
906 		 * the packet is not correct.
907 		 */
908 		vid = VLAN_ID(mhi.mhi_tci);
909 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
910 			goto discard;
911 
912 		/*
913 		 * Discard the packet if this packet is a tagged packet
914 		 * but both pri and VID are 0.
915 		 */
916 		pri = VLAN_PRI(mhi.mhi_tci);
917 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
918 		    vid == VLAN_ID_NONE)
919 			goto discard;
920 
921 		/*
922 		 * Update the priority bits to the per-stream priority if
923 		 * priority is not set in the packet. Update the VID for
924 		 * packets on a VLAN stream.
925 		 */
926 		pri = (pri == 0) ? dsp->ds_pri : 0;
927 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
928 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
929 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
930 				goto discard;
931 			}
932 			mp = newmp;
933 		}
934 	}
935 
936 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
937 		/* Turn on flow-control for dld */
938 		DLD_SETQFULL(dsp);
939 	}
940 	return;
941 
942 discard:
943 	/* TODO: bump kstat? */
944 	freemsg(mp);
945 }
946 
947 /*
948  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
949  */
950 int
951 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
952 {
953 	dev_t			dev;
954 	int			err;
955 	const char		*drvname;
956 	mac_perim_handle_t	mph = NULL;
957 	boolean_t		qassociated = B_FALSE;
958 	dls_link_t		*dlp = NULL;
959 	dls_dl_handle_t		ddp = NULL;
960 
961 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
962 		return (EINVAL);
963 
964 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
965 		return (ENOTSUP);
966 
967 	/*
968 	 * /dev node access. This will still be supported for backward
969 	 * compatibility reason.
970 	 */
971 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
972 	    (strcmp(drvname, "vnic") != 0)) {
973 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
974 			return (EINVAL);
975 		qassociated = B_TRUE;
976 	}
977 
978 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
979 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
980 		goto failed;
981 
982 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
983 		goto failed;
984 
985 	/*
986 	 * Open a channel.
987 	 */
988 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
989 		goto failed;
990 
991 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
992 		goto failed;
993 
994 	/*
995 	 * Set the default packet priority.
996 	 */
997 	dsp->ds_pri = 0;
998 
999 	/*
1000 	 * Add a notify function so that the we get updates from the MAC.
1001 	 */
1002 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1003 	dsp->ds_dlstate = DL_UNBOUND;
1004 	mac_perim_exit(mph);
1005 	return (0);
1006 
1007 failed:
1008 	if (dlp != NULL)
1009 		dls_link_rele(dlp);
1010 	if (mph != NULL)
1011 		mac_perim_exit(mph);
1012 	if (ddp != NULL)
1013 		dls_devnet_rele(ddp);
1014 	if (qassociated)
1015 		(void) qassociate(dsp->ds_wq, -1);
1016 
1017 	return (err);
1018 }
1019 
1020 /*
1021  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1022  * from close(2) for style 2.
1023  */
1024 void
1025 dld_str_detach(dld_str_t *dsp)
1026 {
1027 	mac_perim_handle_t	mph;
1028 	int			err;
1029 
1030 	ASSERT(dsp->ds_datathr_cnt == 0);
1031 
1032 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1033 	/*
1034 	 * Remove the notify function.
1035 	 *
1036 	 * Note that we cannot wait for the notification callback to be removed
1037 	 * since it could cause the deadlock with str_notify() since they both
1038 	 * need the mac perimeter. Continue if we cannot remove the
1039 	 * notification callback right now and wait after we leave the
1040 	 * perimeter.
1041 	 */
1042 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1043 	dsp->ds_mnh = NULL;
1044 
1045 	/*
1046 	 * Disable the capabilities
1047 	 */
1048 	dld_capabilities_disable(dsp);
1049 
1050 	/*
1051 	 * Clear LSO flags.
1052 	 */
1053 	dsp->ds_lso = B_FALSE;
1054 	dsp->ds_lso_max = 0;
1055 
1056 	dls_close(dsp);
1057 	mac_perim_exit(mph);
1058 
1059 	/*
1060 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
1061 	 * because the notification callback was in progress, wait for
1062 	 * it to finish before we proceed.
1063 	 */
1064 	if (err != 0)
1065 		mac_notify_remove_wait(dsp->ds_mh);
1066 
1067 	/*
1068 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
1069 	 * automatically in the call to dls_devnet_rele.
1070 	 */
1071 	dls_devnet_rele(dsp->ds_ddh);
1072 
1073 	dsp->ds_sap = 0;
1074 	dsp->ds_mh = NULL;
1075 	dsp->ds_mch = NULL;
1076 	dsp->ds_mip = NULL;
1077 
1078 	if (dsp->ds_style == DL_STYLE2)
1079 		(void) qassociate(dsp->ds_wq, -1);
1080 
1081 	/*
1082 	 * Re-initialize the DLPI state machine.
1083 	 */
1084 	dsp->ds_dlstate = DL_UNATTACHED;
1085 }
1086 
1087 /*
1088  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1089  * tags before sending packets up to the DLS clients, with the exception of
1090  * special priority tagged packets, in that case, we set the VID to 0.
1091  * mp must be a VLAN tagged packet.
1092  */
1093 static mblk_t *
1094 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1095 {
1096 	mblk_t *newmp;
1097 	struct ether_vlan_header *evhp;
1098 	uint16_t tci, new_tci;
1099 
1100 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1101 	if (DB_REF(mp) > 1) {
1102 		newmp = copymsg(mp);
1103 		if (newmp == NULL)
1104 			return (NULL);
1105 		freemsg(mp);
1106 		mp = newmp;
1107 	}
1108 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1109 
1110 	tci = ntohs(evhp->ether_tci);
1111 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1112 		/*
1113 		 * Priority is 0, strip the tag.
1114 		 */
1115 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1116 		mp->b_rptr += VLAN_TAGSZ;
1117 	} else {
1118 		/*
1119 		 * Priority is not 0, update the VID to 0.
1120 		 */
1121 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1122 		evhp->ether_tci = htons(new_tci);
1123 	}
1124 	return (mp);
1125 }
1126 
1127 /*
1128  * Raw mode receive function.
1129  */
1130 /*ARGSUSED*/
1131 void
1132 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1133     mac_header_info_t *mhip)
1134 {
1135 	dld_str_t *dsp = (dld_str_t *)arg;
1136 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1137 	mblk_t *next, *newmp;
1138 
1139 	ASSERT(mp != NULL);
1140 	do {
1141 		/*
1142 		 * Get the pointer to the next packet in the chain and then
1143 		 * clear b_next before the packet gets passed on.
1144 		 */
1145 		next = mp->b_next;
1146 		mp->b_next = NULL;
1147 
1148 		/*
1149 		 * Wind back b_rptr to point at the MAC header.
1150 		 */
1151 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1152 		mp->b_rptr -= mhip->mhi_hdrsize;
1153 
1154 		/*
1155 		 * Certain MAC type plugins provide an illusion for raw
1156 		 * DLPI consumers.  They pretend that the MAC layer is
1157 		 * something that it's not for the benefit of observability
1158 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1159 		 * for such consumers.	Here, unless native mode is enabled,
1160 		 * we call into the MAC layer so that this illusion can be
1161 		 * maintained.	The plugin will optionally transform the MAC
1162 		 * header here into something that can be passed up to raw
1163 		 * consumers.  The header goes from "cooked" mode to raw mode.
1164 		 */
1165 		if (!dsp->ds_native) {
1166 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1167 			if (newmp == NULL) {
1168 				freemsg(mp);
1169 				goto next;
1170 			}
1171 			mp = newmp;
1172 		}
1173 
1174 		/*
1175 		 * Strip the VLAN tag for VLAN streams.
1176 		 */
1177 		if (is_ethernet &&
1178 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1179 			/*
1180 			 * The priority should be kept only for VLAN
1181 			 * data-links.
1182 			 */
1183 			newmp = i_dld_ether_header_strip_tag(mp,
1184 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1185 			if (newmp == NULL) {
1186 				freemsg(mp);
1187 				goto next;
1188 			}
1189 			mp = newmp;
1190 		}
1191 
1192 		/*
1193 		 * Pass the packet on.
1194 		 */
1195 		if (canputnext(dsp->ds_rq))
1196 			putnext(dsp->ds_rq, mp);
1197 		else
1198 			freemsg(mp);
1199 
1200 next:
1201 		/*
1202 		 * Move on to the next packet in the chain.
1203 		 */
1204 		mp = next;
1205 	} while (mp != NULL);
1206 }
1207 
1208 /*
1209  * Fast-path receive function.
1210  */
1211 /*ARGSUSED*/
1212 void
1213 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1214     mac_header_info_t *mhip)
1215 {
1216 	dld_str_t *dsp = (dld_str_t *)arg;
1217 	mblk_t *next;
1218 	size_t offset = 0;
1219 
1220 	/*
1221 	 * MAC header stripping rules:
1222 	 *    - Tagged packets:
1223 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1224 	 *	b. Physical streams
1225 	 *	- VLAN packets (non-zero VID). The stream must be either a
1226 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1227 	 *	  Strip the Ethernet header but keep the VLAN header.
1228 	 *	- Special tagged packets (zero VID)
1229 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1230 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1231 	 *	    keep the VLAN header.
1232 	 *	  * Otherwise, strip the whole VLAN header.
1233 	 *    - Untagged packets. Strip the whole MAC header.
1234 	 */
1235 	if (mhip->mhi_istagged &&
1236 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1237 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1238 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1239 		offset = VLAN_TAGSZ;
1240 	}
1241 
1242 	ASSERT(mp != NULL);
1243 	do {
1244 		/*
1245 		 * Get the pointer to the next packet in the chain and then
1246 		 * clear b_next before the packet gets passed on.
1247 		 */
1248 		next = mp->b_next;
1249 		mp->b_next = NULL;
1250 
1251 		/*
1252 		 * Wind back b_rptr to point at the VLAN header.
1253 		 */
1254 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1255 		mp->b_rptr -= offset;
1256 
1257 		/*
1258 		 * Pass the packet on.
1259 		 */
1260 		if (canputnext(dsp->ds_rq))
1261 			putnext(dsp->ds_rq, mp);
1262 		else
1263 			freemsg(mp);
1264 		/*
1265 		 * Move on to the next packet in the chain.
1266 		 */
1267 		mp = next;
1268 	} while (mp != NULL);
1269 }
1270 
1271 /*
1272  * Default receive function (send DL_UNITDATA_IND messages).
1273  */
1274 /*ARGSUSED*/
1275 void
1276 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1277     mac_header_info_t *mhip)
1278 {
1279 	dld_str_t		*dsp = (dld_str_t *)arg;
1280 	mblk_t			*ud_mp;
1281 	mblk_t			*next;
1282 	size_t			offset = 0;
1283 	boolean_t		strip_vlan = B_TRUE;
1284 
1285 	/*
1286 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1287 	 */
1288 	if (mhip->mhi_istagged &&
1289 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1290 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1291 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1292 		offset = VLAN_TAGSZ;
1293 		strip_vlan = B_FALSE;
1294 	}
1295 
1296 	ASSERT(mp != NULL);
1297 	do {
1298 		/*
1299 		 * Get the pointer to the next packet in the chain and then
1300 		 * clear b_next before the packet gets passed on.
1301 		 */
1302 		next = mp->b_next;
1303 		mp->b_next = NULL;
1304 
1305 		/*
1306 		 * Wind back b_rptr to point at the MAC header.
1307 		 */
1308 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1309 		mp->b_rptr -= mhip->mhi_hdrsize;
1310 
1311 		/*
1312 		 * Create the DL_UNITDATA_IND M_PROTO.
1313 		 */
1314 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1315 			freemsgchain(mp);
1316 			return;
1317 		}
1318 
1319 		/*
1320 		 * Advance b_rptr to point at the payload (or the VLAN header).
1321 		 */
1322 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1323 
1324 		/*
1325 		 * Prepend the DL_UNITDATA_IND.
1326 		 */
1327 		ud_mp->b_cont = mp;
1328 
1329 		/*
1330 		 * Send the message.
1331 		 */
1332 		if (canputnext(dsp->ds_rq))
1333 			putnext(dsp->ds_rq, ud_mp);
1334 		else
1335 			freemsg(ud_mp);
1336 
1337 		/*
1338 		 * Move on to the next packet in the chain.
1339 		 */
1340 		mp = next;
1341 	} while (mp != NULL);
1342 }
1343 
1344 /*
1345  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1346  */
1347 static void
1348 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1349 {
1350 	mblk_t		*mp;
1351 	dl_notify_ind_t *dlip;
1352 
1353 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1354 		return;
1355 
1356 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1357 	    M_PROTO, 0)) == NULL)
1358 		return;
1359 
1360 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1361 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1362 	dlip->dl_primitive = DL_NOTIFY_IND;
1363 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1364 	dlip->dl_data = max_sdu;
1365 
1366 	qreply(dsp->ds_wq, mp);
1367 }
1368 
1369 /*
1370  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1371  * current state of the interface.
1372  */
1373 void
1374 dld_str_notify_ind(dld_str_t *dsp)
1375 {
1376 	mac_notify_type_t	type;
1377 
1378 	for (type = 0; type < MAC_NNOTE; type++)
1379 		str_notify(dsp, type);
1380 }
1381 
/*
 * Layout of the DL_UNITDATA_IND message built by str_unitdata_ind(): the
 * fixed DLPI primitive followed by room for the destination and source
 * DLSAP addresses.  Each address buffer holds a MAC address of up to
 * MAXMACADDRLEN bytes immediately followed by a 16-bit SAP value.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;	/* must be the first member */
	/* destination MAC address + SAP */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	/* source MAC address + SAP */
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1387 
1388 /*
1389  * Create a DL_UNITDATA_IND M_PROTO message.
1390  */
1391 static mblk_t *
1392 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1393 {
1394 	mblk_t				*nmp;
1395 	dl_unitdata_ind_wrapper_t	*dlwp;
1396 	dl_unitdata_ind_t		*dlp;
1397 	mac_header_info_t		mhi;
1398 	uint_t				addr_length;
1399 	uint8_t				*daddr;
1400 	uint8_t				*saddr;
1401 
1402 	/*
1403 	 * Get the packet header information.
1404 	 */
1405 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
1406 		return (NULL);
1407 
1408 	/*
1409 	 * Allocate a message large enough to contain the wrapper structure
1410 	 * defined above.
1411 	 */
1412 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1413 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1414 	    DL_UNITDATA_IND)) == NULL)
1415 		return (NULL);
1416 
1417 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1418 
1419 	dlp = &(dlwp->dl_unitdata);
1420 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1421 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1422 
1423 	/*
1424 	 * Copy in the destination address.
1425 	 */
1426 	addr_length = dsp->ds_mip->mi_addr_length;
1427 	daddr = dlwp->dl_dest_addr;
1428 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1429 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1430 
1431 	/*
1432 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1433 	 */
1434 	if (mhi.mhi_istagged && !strip_vlan)
1435 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1436 	else
1437 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1438 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1439 
1440 	/*
1441 	 * If the destination address was multicast or broadcast then the
1442 	 * dl_group_address field should be non-zero.
1443 	 */
1444 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1445 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1446 
1447 	/*
1448 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1449 	 * for example) may not have access to source information.
1450 	 */
1451 	if (mhi.mhi_saddr == NULL) {
1452 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1453 	} else {
1454 		saddr = dlwp->dl_src_addr;
1455 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1456 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1457 
1458 		/*
1459 		 * Set the source DLSAP to the packet ethertype.
1460 		 */
1461 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1462 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1463 	}
1464 
1465 	return (nmp);
1466 }
1467 
1468 /*
1469  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1470  */
1471 static void
1472 str_notify_promisc_on_phys(dld_str_t *dsp)
1473 {
1474 	mblk_t		*mp;
1475 	dl_notify_ind_t	*dlip;
1476 
1477 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1478 		return;
1479 
1480 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1481 	    M_PROTO, 0)) == NULL)
1482 		return;
1483 
1484 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1485 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1486 	dlip->dl_primitive = DL_NOTIFY_IND;
1487 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1488 
1489 	qreply(dsp->ds_wq, mp);
1490 }
1491 
1492 /*
1493  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1494  */
1495 static void
1496 str_notify_promisc_off_phys(dld_str_t *dsp)
1497 {
1498 	mblk_t		*mp;
1499 	dl_notify_ind_t	*dlip;
1500 
1501 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1502 		return;
1503 
1504 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1505 	    M_PROTO, 0)) == NULL)
1506 		return;
1507 
1508 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1509 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1510 	dlip->dl_primitive = DL_NOTIFY_IND;
1511 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1512 
1513 	qreply(dsp->ds_wq, mp);
1514 }
1515 
1516 /*
1517  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1518  */
1519 static void
1520 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1521 {
1522 	mblk_t		*mp;
1523 	dl_notify_ind_t	*dlip;
1524 	uint_t		addr_length;
1525 	uint16_t	ethertype;
1526 
1527 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1528 		return;
1529 
1530 	addr_length = dsp->ds_mip->mi_addr_length;
1531 	if ((mp = mexchange(dsp->ds_wq, NULL,
1532 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1533 	    M_PROTO, 0)) == NULL)
1534 		return;
1535 
1536 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1537 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1538 	dlip->dl_primitive = DL_NOTIFY_IND;
1539 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1540 	dlip->dl_data = addr_type;
1541 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1542 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1543 
1544 	bcopy(addr, &dlip[1], addr_length);
1545 
1546 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1547 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1548 
1549 	qreply(dsp->ds_wq, mp);
1550 }
1551 
1552 /*
1553  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1554  */
1555 static void
1556 str_notify_link_up(dld_str_t *dsp)
1557 {
1558 	mblk_t		*mp;
1559 	dl_notify_ind_t	*dlip;
1560 
1561 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1562 		return;
1563 
1564 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1565 	    M_PROTO, 0)) == NULL)
1566 		return;
1567 
1568 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1569 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1570 	dlip->dl_primitive = DL_NOTIFY_IND;
1571 	dlip->dl_notification = DL_NOTE_LINK_UP;
1572 
1573 	qreply(dsp->ds_wq, mp);
1574 }
1575 
1576 /*
1577  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1578  */
1579 static void
1580 str_notify_link_down(dld_str_t *dsp)
1581 {
1582 	mblk_t		*mp;
1583 	dl_notify_ind_t	*dlip;
1584 
1585 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1586 		return;
1587 
1588 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1589 	    M_PROTO, 0)) == NULL)
1590 		return;
1591 
1592 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1593 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1594 	dlip->dl_primitive = DL_NOTIFY_IND;
1595 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1596 
1597 	qreply(dsp->ds_wq, mp);
1598 }
1599 
1600 /*
1601  * DL_NOTIFY_IND: DL_NOTE_SPEED
1602  */
1603 static void
1604 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1605 {
1606 	mblk_t		*mp;
1607 	dl_notify_ind_t	*dlip;
1608 
1609 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1610 		return;
1611 
1612 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1613 	    M_PROTO, 0)) == NULL)
1614 		return;
1615 
1616 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1617 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1618 	dlip->dl_primitive = DL_NOTIFY_IND;
1619 	dlip->dl_notification = DL_NOTE_SPEED;
1620 	dlip->dl_data = speed;
1621 
1622 	qreply(dsp->ds_wq, mp);
1623 }
1624 
1625 /*
1626  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1627  */
1628 static void
1629 str_notify_capab_reneg(dld_str_t *dsp)
1630 {
1631 	mblk_t		*mp;
1632 	dl_notify_ind_t	*dlip;
1633 
1634 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1635 		return;
1636 
1637 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1638 	    M_PROTO, 0)) == NULL)
1639 		return;
1640 
1641 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1642 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1643 	dlip->dl_primitive = DL_NOTIFY_IND;
1644 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1645 
1646 	qreply(dsp->ds_wq, mp);
1647 }
1648 
1649 /*
1650  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1651  */
1652 static void
1653 str_notify_fastpath_flush(dld_str_t *dsp)
1654 {
1655 	mblk_t		*mp;
1656 	dl_notify_ind_t	*dlip;
1657 
1658 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1659 		return;
1660 
1661 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1662 	    M_PROTO, 0)) == NULL)
1663 		return;
1664 
1665 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1666 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1667 	dlip->dl_primitive = DL_NOTIFY_IND;
1668 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1669 
1670 	qreply(dsp->ds_wq, mp);
1671 }
1672 
1673 /*
1674  * MAC notification callback.
1675  */
1676 void
1677 str_notify(void *arg, mac_notify_type_t type)
1678 {
1679 	dld_str_t		*dsp = (dld_str_t *)arg;
1680 	queue_t			*q = dsp->ds_wq;
1681 	mac_handle_t		mh = dsp->ds_mh;
1682 	mac_client_handle_t	mch = dsp->ds_mch;
1683 	uint8_t			addr[MAXMACADDRLEN];
1684 
1685 	switch (type) {
1686 	case MAC_NOTE_TX:
1687 		qenable(q);
1688 		break;
1689 
1690 	case MAC_NOTE_DEVPROMISC:
1691 		/*
1692 		 * Send the appropriate DL_NOTIFY_IND.
1693 		 */
1694 		if (mac_promisc_get(mh))
1695 			str_notify_promisc_on_phys(dsp);
1696 		else
1697 			str_notify_promisc_off_phys(dsp);
1698 		break;
1699 
1700 	case MAC_NOTE_UNICST:
1701 		/*
1702 		 * This notification is sent whenever the MAC unicast
1703 		 * address changes.
1704 		 */
1705 		mac_unicast_primary_get(mh, addr);
1706 
1707 		/*
1708 		 * Send the appropriate DL_NOTIFY_IND.
1709 		 */
1710 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1711 		break;
1712 
1713 	case MAC_NOTE_DEST:
1714 		/*
1715 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1716 		 * destination address.
1717 		 */
1718 		if (mac_dst_get(dsp->ds_mh, addr))
1719 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1720 		break;
1721 
1722 	case MAC_NOTE_LOWLINK:
1723 	case MAC_NOTE_LINK:
1724 		/*
1725 		 * LOWLINK refers to the actual link status. For links that
1726 		 * are not part of a bridge instance LOWLINK and LINK state
1727 		 * are the same. But for a link part of a bridge instance
1728 		 * LINK state refers to the aggregate link status: "up" when
1729 		 * at least one link part of the bridge is up and is "down"
1730 		 * when all links part of the bridge are down.
1731 		 *
1732 		 * Clients can request to be notified of the LOWLINK state
1733 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1734 		 * daemon request lowlink state changes and upper layer clients
1735 		 * receive notifications of the aggregate link state changes
1736 		 * which is the default when requesting LINK UP/DOWN state
1737 		 * notifications.
1738 		 */
1739 
1740 		/*
1741 		 * Check that the notification type matches the one that we
1742 		 * want.  If we want lower-level link notifications, and this
1743 		 * is upper, or if we want upper and this is lower, then
1744 		 * ignore.
1745 		 */
1746 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1747 			break;
1748 		/*
1749 		 * This notification is sent every time the MAC driver
1750 		 * updates the link state.
1751 		 */
1752 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1753 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1754 		case LINK_STATE_UP: {
1755 			uint64_t speed;
1756 			/*
1757 			 * The link is up so send the appropriate
1758 			 * DL_NOTIFY_IND.
1759 			 */
1760 			str_notify_link_up(dsp);
1761 
1762 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1763 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1764 			break;
1765 		}
1766 		case LINK_STATE_DOWN:
1767 			/*
1768 			 * The link is down so send the appropriate
1769 			 * DL_NOTIFY_IND.
1770 			 */
1771 			str_notify_link_down(dsp);
1772 			break;
1773 
1774 		default:
1775 			break;
1776 		}
1777 		break;
1778 
1779 	case MAC_NOTE_CAPAB_CHG:
1780 		/*
1781 		 * This notification is sent whenever the MAC resources
1782 		 * change or capabilities change. We need to renegotiate
1783 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1784 		 */
1785 		str_notify_capab_reneg(dsp);
1786 		break;
1787 
1788 	case MAC_NOTE_SDU_SIZE: {
1789 		uint_t  max_sdu;
1790 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1791 		str_notify_sdu_size(dsp, max_sdu);
1792 		break;
1793 	}
1794 
1795 	case MAC_NOTE_FASTPATH_FLUSH:
1796 		str_notify_fastpath_flush(dsp);
1797 		break;
1798 
1799 	/* Unused notifications */
1800 	case MAC_NOTE_MARGIN:
1801 		break;
1802 
1803 	default:
1804 		ASSERT(B_FALSE);
1805 		break;
1806 	}
1807 }
1808 
1809 /*
1810  * This function is called via a taskq mechansim to process all control
1811  * messages on a per 'dsp' end point.
1812  */
1813 static void
1814 dld_wput_nondata_task(void *arg)
1815 {
1816 	dld_str_t	*dsp = arg;
1817 	mblk_t		*mp;
1818 
1819 	mutex_enter(&dsp->ds_lock);
1820 	while (dsp->ds_pending_head != NULL) {
1821 		mp = dsp->ds_pending_head;
1822 		dsp->ds_pending_head = mp->b_next;
1823 		mp->b_next = NULL;
1824 		if (dsp->ds_pending_head == NULL)
1825 			dsp->ds_pending_tail = NULL;
1826 		mutex_exit(&dsp->ds_lock);
1827 
1828 		switch (DB_TYPE(mp)) {
1829 		case M_PROTO:
1830 		case M_PCPROTO:
1831 			dld_proto(dsp, mp);
1832 			break;
1833 		case M_IOCTL:
1834 			dld_ioc(dsp, mp);
1835 			break;
1836 		default:
1837 			ASSERT(0);
1838 		}
1839 
1840 		mutex_enter(&dsp->ds_lock);
1841 	}
1842 	ASSERT(dsp->ds_pending_tail == NULL);
1843 	dsp->ds_dlpi_pending = 0;
1844 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1845 	mutex_exit(&dsp->ds_lock);
1846 }
1847 
1848 /*
1849  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1850  * thread is started at boot time.
1851  */
1852 static void
1853 dld_taskq_dispatch(void)
1854 {
1855 	callb_cpr_t	cprinfo;
1856 	dld_str_t	*dsp;
1857 
1858 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1859 	    "dld_taskq_dispatch");
1860 	mutex_enter(&dld_taskq_lock);
1861 
1862 	while (!dld_taskq_quit) {
1863 		dsp = list_head(&dld_taskq_list);
1864 		while (dsp != NULL) {
1865 			list_remove(&dld_taskq_list, dsp);
1866 			mutex_exit(&dld_taskq_lock);
1867 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1868 			    dsp, TQ_SLEEP) != 0);
1869 			mutex_enter(&dld_taskq_lock);
1870 			dsp = list_head(&dld_taskq_list);
1871 		}
1872 
1873 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1874 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1875 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1876 	}
1877 
1878 	dld_taskq_done = B_TRUE;
1879 	cv_signal(&dld_taskq_cv);
1880 	CALLB_CPR_EXIT(&cprinfo);
1881 	thread_exit();
1882 }
1883 
1884 /*
1885  * All control operations are serialized on the 'dsp' and are also funneled
1886  * through a taskq mechanism to ensure that subsequent processing has kernel
1887  * context and can safely use cv_wait.
1888  *
1889  * Mechanisms to handle taskq dispatch failures
1890  *
1891  * The only way to be sure that taskq dispatch does not fail is to either
1892  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1893  * some number of entries and make sure that the number of outstanding requests
1894  * are less than that number. We can't use TQ_SLEEP since we don't know the
1895  * context. Nor can we bound the total number of 'dsp' end points. So we are
1896  * unable to use either of the above schemes, and are forced to deal with
1897  * taskq dispatch failures. Note that even dynamic taskq could fail in
1898  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1899  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1900  * framework.
1901  *
1902  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1903  * We also have a single global thread to retry the taskq dispatch. This
1904  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1905  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1906  */
1907 static void
1908 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1909 {
1910 	ASSERT(mp->b_next == NULL);
1911 	mutex_enter(&dsp->ds_lock);
1912 	if (dsp->ds_pending_head != NULL) {
1913 		ASSERT(dsp->ds_dlpi_pending);
1914 		dsp->ds_pending_tail->b_next = mp;
1915 		dsp->ds_pending_tail = mp;
1916 		mutex_exit(&dsp->ds_lock);
1917 		return;
1918 	}
1919 	ASSERT(dsp->ds_pending_tail == NULL);
1920 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
1921 	/*
1922 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
1923 	 * thread is still active and is processing the last message, though
1924 	 * the pending queue has been emptied.
1925 	 */
1926 	if (dsp->ds_dlpi_pending) {
1927 		mutex_exit(&dsp->ds_lock);
1928 		return;
1929 	}
1930 
1931 	dsp->ds_dlpi_pending = 1;
1932 	mutex_exit(&dsp->ds_lock);
1933 
1934 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
1935 	    TQ_NOSLEEP) != 0)
1936 		return;
1937 
1938 	mutex_enter(&dld_taskq_lock);
1939 	list_insert_tail(&dld_taskq_list, dsp);
1940 	cv_signal(&dld_taskq_cv);
1941 	mutex_exit(&dld_taskq_lock);
1942 }
1943 
1944 /*
1945  * Process an M_IOCTL message.
1946  */
1947 static void
1948 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1949 {
1950 	uint_t			cmd;
1951 
1952 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1953 	ASSERT(dsp->ds_type == DLD_DLPI);
1954 
1955 	switch (cmd) {
1956 	case DLIOCNATIVE:
1957 		ioc_native(dsp, mp);
1958 		break;
1959 	case DLIOCMARGININFO:
1960 		ioc_margin(dsp, mp);
1961 		break;
1962 	case DLIOCRAW:
1963 		ioc_raw(dsp, mp);
1964 		break;
1965 	case DLIOCHDRINFO:
1966 		ioc_fast(dsp, mp);
1967 		break;
1968 	case DLIOCLOWLINK:
1969 		ioc_lowlink(dsp, mp);
1970 		break;
1971 	default:
1972 		ioc(dsp, mp);
1973 	}
1974 }
1975 
1976 /*
1977  * DLIOCNATIVE
1978  */
1979 static void
1980 ioc_native(dld_str_t *dsp, mblk_t *mp)
1981 {
1982 	queue_t *q = dsp->ds_wq;
1983 	const mac_info_t *mip = dsp->ds_mip;
1984 
1985 	/*
1986 	 * Native mode can be enabled if it's disabled and if the
1987 	 * native media type is different.
1988 	 */
1989 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
1990 		dsp->ds_native = B_TRUE;
1991 
1992 	if (dsp->ds_native)
1993 		miocack(q, mp, 0, mip->mi_nativemedia);
1994 	else
1995 		miocnak(q, mp, 0, ENOTSUP);
1996 }
1997 
1998 /*
1999  * DLIOCMARGININFO
2000  */
2001 static void
2002 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2003 {
2004 	queue_t *q = dsp->ds_wq;
2005 	uint32_t margin;
2006 	int err;
2007 
2008 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2009 		err = EINVAL;
2010 		goto failed;
2011 	}
2012 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2013 		goto failed;
2014 
2015 	mac_margin_get(dsp->ds_mh, &margin);
2016 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2017 	miocack(q, mp, sizeof (uint32_t), 0);
2018 	return;
2019 
2020 failed:
2021 	miocnak(q, mp, 0, err);
2022 }
2023 
2024 /*
2025  * DLIOCRAW
2026  */
2027 static void
2028 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2029 {
2030 	queue_t *q = dsp->ds_wq;
2031 	mac_perim_handle_t	mph;
2032 
2033 	if (dsp->ds_mh == NULL) {
2034 		dsp->ds_mode = DLD_RAW;
2035 		miocack(q, mp, 0, 0);
2036 		return;
2037 	}
2038 
2039 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2040 	if (dsp->ds_polling || dsp->ds_direct) {
2041 		mac_perim_exit(mph);
2042 		miocnak(q, mp, 0, EPROTO);
2043 		return;
2044 	}
2045 
2046 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2047 		/*
2048 		 * Set the receive callback.
2049 		 */
2050 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2051 	}
2052 
2053 	/*
2054 	 * Note that raw mode is enabled.
2055 	 */
2056 	dsp->ds_mode = DLD_RAW;
2057 	mac_perim_exit(mph);
2058 
2059 	miocack(q, mp, 0, 0);
2060 }
2061 
2062 /*
2063  * DLIOCHDRINFO
2064  */
2065 static void
2066 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2067 {
2068 	dl_unitdata_req_t *dlp;
2069 	off_t		off;
2070 	size_t		len;
2071 	const uint8_t	*addr;
2072 	uint16_t	sap;
2073 	mblk_t		*nmp;
2074 	mblk_t		*hmp;
2075 	uint_t		addr_length;
2076 	queue_t		*q = dsp->ds_wq;
2077 	int		err;
2078 	mac_perim_handle_t	mph;
2079 
2080 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2081 		err = ENOTSUP;
2082 		goto failed;
2083 	}
2084 
2085 	/*
2086 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2087 	 * user-land should not be allowed.
2088 	 */
2089 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2090 		err = EINVAL;
2091 		goto failed;
2092 	}
2093 
2094 	nmp = mp->b_cont;
2095 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2096 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2097 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2098 		err = EINVAL;
2099 		goto failed;
2100 	}
2101 
2102 	off = dlp->dl_dest_addr_offset;
2103 	len = dlp->dl_dest_addr_length;
2104 
2105 	if (!MBLKIN(nmp, off, len)) {
2106 		err = EINVAL;
2107 		goto failed;
2108 	}
2109 
2110 	if (dsp->ds_dlstate != DL_IDLE) {
2111 		err = ENOTSUP;
2112 		goto failed;
2113 	}
2114 
2115 	addr_length = dsp->ds_mip->mi_addr_length;
2116 	if (len != addr_length + sizeof (uint16_t)) {
2117 		err = EINVAL;
2118 		goto failed;
2119 	}
2120 
2121 	addr = nmp->b_rptr + off;
2122 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2123 
2124 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2125 		err = ENOMEM;
2126 		goto failed;
2127 	}
2128 
2129 	/*
2130 	 * This ioctl might happen concurrently with a direct call to dld_capab
2131 	 * that tries to enable direct and/or poll capabilities. Since the
2132 	 * stack does not serialize them, we do so here to avoid mixing
2133 	 * the callbacks.
2134 	 */
2135 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2136 	if (dsp->ds_mode != DLD_FASTPATH) {
2137 		/*
2138 		 * Set the receive callback (unless polling is enabled).
2139 		 */
2140 		if (!dsp->ds_polling && !dsp->ds_direct)
2141 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2142 
2143 		/*
2144 		 * Note that fast-path mode is enabled.
2145 		 */
2146 		dsp->ds_mode = DLD_FASTPATH;
2147 	}
2148 	mac_perim_exit(mph);
2149 
2150 	freemsg(nmp->b_cont);
2151 	nmp->b_cont = hmp;
2152 
2153 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2154 	return;
2155 failed:
2156 	miocnak(q, mp, 0, err);
2157 }
2158 
2159 /*
2160  * DLIOCLOWLINK: request actual link state changes. When the
2161  * link is part of a bridge instance the client receives actual
2162  * link state changes and not the aggregate link status. Used by
2163  * the bridging daemon (bridged) for proper RSTP operation.
2164  */
2165 static void
2166 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2167 {
2168 	queue_t *q = dsp->ds_wq;
2169 	int err;
2170 
2171 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2172 		miocnak(q, mp, 0, err);
2173 	} else {
2174 		/* LINTED: alignment */
2175 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2176 		miocack(q, mp, 0, 0);
2177 	}
2178 }
2179 
2180 /*
2181  * Catch-all handler.
2182  */
2183 static void
2184 ioc(dld_str_t *dsp, mblk_t *mp)
2185 {
2186 	queue_t	*q = dsp->ds_wq;
2187 
2188 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2189 		miocnak(q, mp, 0, EINVAL);
2190 		return;
2191 	}
2192 	mac_ioctl(dsp->ds_mh, q, mp);
2193 }
2194