xref: /titanic_50/usr/src/uts/common/io/dld/dld_str.c (revision 5cf8276bb0140df747bc83e173b7c851dec6eda2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Data-Link Driver
27  */
28 
29 #include	<inet/common.h>
30 #include	<sys/strsubr.h>
31 #include	<sys/stropts.h>
32 #include	<sys/strsun.h>
33 #include	<sys/vlan.h>
34 #include	<sys/dld_impl.h>
35 #include	<sys/cpuvar.h>
36 #include	<sys/callb.h>
37 #include	<sys/list.h>
38 #include	<sys/mac_client.h>
39 #include	<sys/mac_client_priv.h>
40 #include	<sys/mac_flow.h>
41 
/*
 * Forward declarations of file-local routines: the kmem cache
 * constructor/destructor, the receive/notification helpers (str_*),
 * the ioctl and non-data write-side handlers (ioc_*, dld_*), and the
 * raw-mode transmit and Ethernet VLAN header helpers.
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);

static void	ioc_native(dld_str_t *,  mblk_t *);
static void	ioc_margin(dld_str_t *, mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc_lowlink(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static void	dld_wput_nondata(dld_str_t *, mblk_t *);

static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
    link_tagmode_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);

/*
 * str_count:  number of dld_str_t objects outstanding; bumped in
 *             dld_str_create(), dropped in dld_str_destroy(), and checked
 *             by dld_str_fini() before tearing anything down.
 * str_cachep: kmem cache of dld_str_t, created in dld_str_init().
 * str_hashp:  id-hash of live dld_str_t objects keyed by ds_minor.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static mod_hash_t	*str_hashp;

/* Size handed to mod_hash_create_idhash() for str_hashp. */
#define	STR_HASHSZ		64
/* Convert a minor number into the key type used by str_hashp. */
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* DLD uses the system-wide taskq rather than a private one. */
#define	dld_taskq	system_taskq

/*
 * State shared with the dld_taskq_dispatch() thread that dld_str_init()
 * starts.  dld_taskq_lock and dld_taskq_cv are initialized in
 * dld_str_init(); dld_str_fini() sets dld_taskq_quit and then waits on
 * dld_taskq_cv until dld_taskq_done becomes true.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

static void		dld_taskq_dispatch(void);
83 
84 /*
85  * Some notes on entry points, flow-control, queueing.
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
93  *
94  * The transmit logic operates in the following way: All packets coming
95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
96  * happens when the MAC layer indicates the packets couldn't be
97  * transmitted due to 1) lack of resources (e.g. running out of
98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
99  * particular flow. The indication comes in the form of a Tx cookie that
100  * identifies the blocked ring. In such case, DLD will place a
101  * dummy message on its write-side STREAMS queue so that the queue is
102  * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
106  * When the write service procedure runs, it will remove the dummy
107  * message from the write-side STREAMS queue; in effect this will trigger
108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
109  * respectively, due to the above reasons.
110  *
111  * All non-data operations, both DLPI and ioctls are single threaded on a per
112  * dld_str_t endpoint. This is done using a taskq so that the control operation
113  * has kernel context and can cv_wait for resources. In addition all set type
114  * operations that involve mac level state modification are serialized on a
115  * per mac end point using the perimeter mechanism provided by the mac layer.
116  * This serializes all mac clients trying to modify a single mac end point over
117  * the entire sequence of mac calls made by that client as an atomic unit. The
118  * mac framework locking is described in mac.c. A critical element is that
119  * DLD/DLS does not hold any locks across the mac perimeter.
120  *
121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
124  * is returned. If the mac handle is non-null, it can be safely accessed
125  * below. The mac handle won't be freed until the mac_unregister which
126  * won't happen until the driver detaches. The DDI framework ensures that
127  * the detach won't happen while a getinfo is in progress.
128  */
/*
 * Search state passed to i_dld_str_walker() via mod_hash_walk().  The
 * caller fills in ds_major/ds_minor as the lookup key and presets
 * ds_instance to -1 and ds_dip to NULL; the walker overwrites the latter
 * two only when it finds a matching stream that has a mac handle.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* in: major number to match */
	minor_t		ds_minor;	/* in: (clone) minor number to match */
	int		ds_instance;	/* out: instance of matching stream */
	dev_info_t	*ds_dip;	/* out: devinfo of matching stream */
} i_dld_str_state_t;
135 
136 /* ARGSUSED */
137 static uint_t
138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
139 {
140 	i_dld_str_state_t	*statep = arg;
141 	dld_str_t		*dsp = (dld_str_t *)val;
142 	mac_handle_t		mh;
143 
144 	if (statep->ds_major != dsp->ds_major)
145 		return (MH_WALK_CONTINUE);
146 
147 	ASSERT(statep->ds_minor != 0);
148 	mh = dsp->ds_mh;
149 
150 	if (statep->ds_minor == dsp->ds_minor) {
151 		/*
152 		 * Clone: a clone minor is unique. we can terminate the
153 		 * walk if we find a matching stream -- even if we fail
154 		 * to obtain the devinfo.
155 		 */
156 		if (mh != NULL) {
157 			statep->ds_dip = mac_devinfo_get(mh);
158 			statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
159 		}
160 		return (MH_WALK_TERMINATE);
161 	}
162 	return (MH_WALK_CONTINUE);
163 }
164 
165 static dev_info_t *
166 dld_finddevinfo(dev_t dev)
167 {
168 	dev_info_t		*dip;
169 	i_dld_str_state_t	state;
170 
171 	if (getminor(dev) == 0)
172 		return (NULL);
173 
174 	/*
175 	 * See if it's a minor node of a link
176 	 */
177 	if ((dip = dls_link_devinfo(dev)) != NULL)
178 		return (dip);
179 
180 	state.ds_minor = getminor(dev);
181 	state.ds_major = getmajor(dev);
182 	state.ds_dip = NULL;
183 	state.ds_instance = -1;
184 
185 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
186 	return (state.ds_dip);
187 }
188 
189 int
190 dld_devt_to_instance(dev_t dev)
191 {
192 	minor_t			minor;
193 	i_dld_str_state_t	state;
194 
195 	/*
196 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
197 	 * Minor number 0 is reserved for the DLPI style 2 unattached
198 	 * node.
199 	 */
200 
201 	if ((minor = getminor(dev)) == 0)
202 		return (-1);
203 
204 	/*
205 	 * Check for unopened style 1 node.
206 	 * Note that this doesn't *necessarily* work for legacy
207 	 * devices, but this code is only called within the
208 	 * getinfo(9e) implementation for true GLDv3 devices, so it
209 	 * doesn't matter.
210 	 */
211 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
212 		return (DLS_MINOR2INST(minor));
213 	}
214 
215 	state.ds_minor = getminor(dev);
216 	state.ds_major = getmajor(dev);
217 	state.ds_dip = NULL;
218 	state.ds_instance = -1;
219 
220 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
221 	return (state.ds_instance);
222 }
223 
224 /*
225  * devo_getinfo: getinfo(9e)
226  *
227  * NB: This may be called for a provider before the provider's
228  * instances are attached.  Hence, if a particular provider needs a
229  * special mapping (the mac instance != ddi_get_instance()), then it
230  * may need to provide its own implmentation using the
231  * mac_devt_to_instance() function, and translating the returned mac
232  * instance to a devinfo instance.  For dev_t's where the minor number
233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
234  * function indirectly via the mac_getinfo() function.
235  */
236 /*ARGSUSED*/
237 int
238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
239 {
240 	dev_info_t	*devinfo;
241 	minor_t		minor = getminor((dev_t)arg);
242 	int		rc = DDI_FAILURE;
243 
244 	switch (cmd) {
245 	case DDI_INFO_DEVT2DEVINFO:
246 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
247 			*(dev_info_t **)resp = devinfo;
248 			rc = DDI_SUCCESS;
249 		}
250 		break;
251 	case DDI_INFO_DEVT2INSTANCE:
252 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
253 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
254 			rc = DDI_SUCCESS;
255 		} else if (minor > DLS_MAX_MINOR &&
256 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
257 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
258 			rc = DDI_SUCCESS;
259 		}
260 		break;
261 	}
262 	return (rc);
263 }
264 
265 void *
266 dld_str_private(queue_t *q)
267 {
268 	return (((dld_str_t *)(q->q_ptr))->ds_private);
269 }
270 
271 int
272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
273 {
274 	dld_str_t	*dsp;
275 	major_t		major;
276 	minor_t		minor;
277 	int		err;
278 
279 	major = getmajor(*devp);
280 	minor = getminor(*devp);
281 
282 	/*
283 	 * Create a new dld_str_t for the stream. This will grab a new minor
284 	 * number that will be handed back in the cloned dev_t.  Creation may
285 	 * fail if we can't allocate the dummy mblk used for flow-control.
286 	 */
287 	dsp = dld_str_create(rq, DLD_DLPI, major,
288 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
289 	if (dsp == NULL)
290 		return (ENOSR);
291 
292 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
293 	dsp->ds_private = private;
294 	if (minor != 0) {
295 		/*
296 		 * Style 1 open
297 		 */
298 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
299 			goto failed;
300 
301 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
302 	} else {
303 		(void) qassociate(rq, -1);
304 	}
305 
306 	/*
307 	 * Enable the queue srv(9e) routine.
308 	 */
309 	qprocson(rq);
310 
311 	/*
312 	 * Construct a cloned dev_t to hand back.
313 	 */
314 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
315 	return (0);
316 
317 failed:
318 	dld_str_destroy(dsp);
319 	return (err);
320 }
321 
322 int
323 dld_str_close(queue_t *rq)
324 {
325 	dld_str_t	*dsp = rq->q_ptr;
326 
327 	/*
328 	 * All modules on top have been popped off. So there can't be any
329 	 * threads from the top.
330 	 */
331 	ASSERT(dsp->ds_datathr_cnt == 0);
332 
333 	/*
334 	 * Wait until pending DLPI requests are processed.
335 	 */
336 	mutex_enter(&dsp->ds_lock);
337 	while (dsp->ds_dlpi_pending)
338 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
339 	mutex_exit(&dsp->ds_lock);
340 
341 
342 	/*
343 	 * This stream was open to a provider node. Check to see
344 	 * if it has been cleanly shut down.
345 	 */
346 	if (dsp->ds_dlstate != DL_UNATTACHED) {
347 		/*
348 		 * The stream is either open to a style 1 provider or
349 		 * this is not clean shutdown. Detach from the PPA.
350 		 * (This is still ok even in the style 1 case).
351 		 */
352 		dld_str_detach(dsp);
353 	}
354 
355 	dld_str_destroy(dsp);
356 	return (0);
357 }
358 
359 /*
360  * qi_qopen: open(9e)
361  */
362 /*ARGSUSED*/
363 int
364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
365 {
366 	if (sflag == MODOPEN)
367 		return (ENOTSUP);
368 
369 	/*
370 	 * This is a cloning driver and therefore each queue should only
371 	 * ever get opened once.
372 	 */
373 	if (rq->q_ptr != NULL)
374 		return (EBUSY);
375 
376 	return (dld_str_open(rq, devp, NULL));
377 }
378 
379 /*
380  * qi_qclose: close(9e)
381  */
382 int
383 dld_close(queue_t *rq)
384 {
385 	/*
386 	 * Disable the queue srv(9e) routine.
387 	 */
388 	qprocsoff(rq);
389 
390 	return (dld_str_close(rq));
391 }
392 
393 /*
394  * qi_qputp: put(9e)
395  */
396 void
397 dld_wput(queue_t *wq, mblk_t *mp)
398 {
399 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
400 	dld_str_mode_t	mode;
401 
402 	switch (DB_TYPE(mp)) {
403 	case M_DATA:
404 		mutex_enter(&dsp->ds_lock);
405 		mode = dsp->ds_mode;
406 		if ((dsp->ds_dlstate != DL_IDLE) ||
407 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
408 			mutex_exit(&dsp->ds_lock);
409 			freemsg(mp);
410 			break;
411 		}
412 
413 		DLD_DATATHR_INC(dsp);
414 		mutex_exit(&dsp->ds_lock);
415 		if (mode == DLD_FASTPATH) {
416 			if (dsp->ds_mip->mi_media == DL_ETHER &&
417 			    (MBLKL(mp) < sizeof (struct ether_header))) {
418 				freemsg(mp);
419 			} else {
420 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
421 			}
422 		} else {
423 			str_mdata_raw_put(dsp, mp);
424 		}
425 		DLD_DATATHR_DCR(dsp);
426 		break;
427 	case M_PROTO:
428 	case M_PCPROTO: {
429 		t_uscalar_t	prim;
430 
431 		if (MBLKL(mp) < sizeof (t_uscalar_t))
432 			break;
433 
434 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
435 
436 		if (prim == DL_UNITDATA_REQ) {
437 			proto_unitdata_req(dsp, mp);
438 		} else {
439 			dld_wput_nondata(dsp, mp);
440 		}
441 		break;
442 	}
443 
444 	case M_IOCTL:
445 		dld_wput_nondata(dsp, mp);
446 		break;
447 
448 	case M_FLUSH:
449 		if (*mp->b_rptr & FLUSHW) {
450 			DLD_CLRQFULL(dsp);
451 			*mp->b_rptr &= ~FLUSHW;
452 		}
453 
454 		if (*mp->b_rptr & FLUSHR) {
455 			qreply(wq, mp);
456 		} else {
457 			freemsg(mp);
458 		}
459 		break;
460 
461 	default:
462 		freemsg(mp);
463 		break;
464 	}
465 }
466 
467 /*
468  * qi_srvp: srv(9e)
469  */
470 void
471 dld_wsrv(queue_t *wq)
472 {
473 	dld_str_t	*dsp = wq->q_ptr;
474 
475 	DLD_CLRQFULL(dsp);
476 }
477 
478 void
479 dld_init_ops(struct dev_ops *ops, const char *name)
480 {
481 	struct streamtab *stream;
482 	struct qinit *rq, *wq;
483 	struct module_info *modinfo;
484 
485 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
486 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
487 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
488 	modinfo->mi_minpsz = 0;
489 	modinfo->mi_maxpsz = 64*1024;
490 	modinfo->mi_hiwat  = 1;
491 	modinfo->mi_lowat = 0;
492 
493 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
494 	rq->qi_qopen = dld_open;
495 	rq->qi_qclose = dld_close;
496 	rq->qi_minfo = modinfo;
497 
498 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
499 	wq->qi_putp = (pfi_t)dld_wput;
500 	wq->qi_srvp = (pfi_t)dld_wsrv;
501 	wq->qi_minfo = modinfo;
502 
503 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
504 	stream->st_rdinit = rq;
505 	stream->st_wrinit = wq;
506 	ops->devo_cb_ops->cb_str = stream;
507 
508 	if (ops->devo_getinfo == NULL)
509 		ops->devo_getinfo = &dld_getinfo;
510 }
511 
512 void
513 dld_fini_ops(struct dev_ops *ops)
514 {
515 	struct streamtab *stream;
516 	struct qinit *rq, *wq;
517 	struct module_info *modinfo;
518 
519 	stream = ops->devo_cb_ops->cb_str;
520 	rq = stream->st_rdinit;
521 	wq = stream->st_wrinit;
522 	modinfo = rq->qi_minfo;
523 	ASSERT(wq->qi_minfo == modinfo);
524 
525 	kmem_free(stream, sizeof (struct streamtab));
526 	kmem_free(wq, sizeof (struct qinit));
527 	kmem_free(rq, sizeof (struct qinit));
528 	kmem_free(modinfo->mi_idname, FMNAMESZ);
529 	kmem_free(modinfo, sizeof (struct module_info));
530 }
531 
532 /*
533  * Initialize this module's data structures.
534  */
535 void
536 dld_str_init(void)
537 {
538 	/*
539 	 * Create dld_str_t object cache.
540 	 */
541 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
542 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
543 	ASSERT(str_cachep != NULL);
544 
545 	/*
546 	 * Create a hash table for maintaining dld_str_t's.
547 	 * The ds_minor field (the clone minor number) of a dld_str_t
548 	 * is used as a key for this hash table because this number is
549 	 * globally unique (allocated from "dls_minor_arena").
550 	 */
551 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
552 	    mod_hash_null_valdtor);
553 
554 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
555 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
556 
557 	dld_taskq_quit = B_FALSE;
558 	dld_taskq_done = B_FALSE;
559 	list_create(&dld_taskq_list, sizeof (dld_str_t),
560 	    offsetof(dld_str_t, ds_tqlist));
561 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
562 	    &p0, TS_RUN, minclsyspri);
563 }
564 
565 /*
566  * Tear down this module's data structures.
567  */
568 int
569 dld_str_fini(void)
570 {
571 	/*
572 	 * Make sure that there are no objects in use.
573 	 */
574 	if (str_count != 0)
575 		return (EBUSY);
576 
577 	/*
578 	 * Ask the dld_taskq thread to quit and wait for it to be done
579 	 */
580 	mutex_enter(&dld_taskq_lock);
581 	dld_taskq_quit = B_TRUE;
582 	cv_signal(&dld_taskq_cv);
583 	while (!dld_taskq_done)
584 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
585 	mutex_exit(&dld_taskq_lock);
586 	list_destroy(&dld_taskq_list);
587 	/*
588 	 * Destroy object cache.
589 	 */
590 	kmem_cache_destroy(str_cachep);
591 	mod_hash_destroy_idhash(str_hashp);
592 	return (0);
593 }
594 
595 /*
596  * Create a new dld_str_t object.
597  */
598 dld_str_t *
599 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
600 {
601 	dld_str_t	*dsp;
602 	int		err;
603 
604 	/*
605 	 * Allocate an object from the cache.
606 	 */
607 	atomic_add_32(&str_count, 1);
608 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
609 
610 	/*
611 	 * Allocate the dummy mblk for flow-control.
612 	 */
613 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
614 	if (dsp->ds_tx_flow_mp == NULL) {
615 		kmem_cache_free(str_cachep, dsp);
616 		atomic_add_32(&str_count, -1);
617 		return (NULL);
618 	}
619 	dsp->ds_type = type;
620 	dsp->ds_major = major;
621 	dsp->ds_style = style;
622 
623 	/*
624 	 * Initialize the queue pointers.
625 	 */
626 	ASSERT(RD(rq) == rq);
627 	dsp->ds_rq = rq;
628 	dsp->ds_wq = WR(rq);
629 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
630 
631 	/*
632 	 * We want explicit control over our write-side STREAMS queue
633 	 * where the dummy mblk gets added/removed for flow-control.
634 	 */
635 	noenable(WR(rq));
636 
637 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
638 	    (mod_hash_val_t)dsp);
639 	ASSERT(err == 0);
640 	return (dsp);
641 }
642 
643 /*
644  * Destroy a dld_str_t object.
645  */
646 void
647 dld_str_destroy(dld_str_t *dsp)
648 {
649 	queue_t		*rq;
650 	queue_t		*wq;
651 	mod_hash_val_t	val;
652 
653 	/*
654 	 * Clear the queue pointers.
655 	 */
656 	rq = dsp->ds_rq;
657 	wq = dsp->ds_wq;
658 	ASSERT(wq == WR(rq));
659 	rq->q_ptr = wq->q_ptr = NULL;
660 	dsp->ds_rq = dsp->ds_wq = NULL;
661 
662 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
663 	ASSERT(dsp->ds_sap == 0);
664 	ASSERT(dsp->ds_mh == NULL);
665 	ASSERT(dsp->ds_mch == NULL);
666 	ASSERT(dsp->ds_promisc == 0);
667 	ASSERT(dsp->ds_mph == NULL);
668 	ASSERT(dsp->ds_mip == NULL);
669 	ASSERT(dsp->ds_mnh == NULL);
670 
671 	ASSERT(dsp->ds_polling == B_FALSE);
672 	ASSERT(dsp->ds_direct == B_FALSE);
673 	ASSERT(dsp->ds_lso == B_FALSE);
674 	ASSERT(dsp->ds_lso_max == 0);
675 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
676 
677 	/*
678 	 * Reinitialize all the flags.
679 	 */
680 	dsp->ds_notifications = 0;
681 	dsp->ds_passivestate = DLD_UNINITIALIZED;
682 	dsp->ds_mode = DLD_UNITDATA;
683 	dsp->ds_native = B_FALSE;
684 	dsp->ds_nonip = B_FALSE;
685 
686 	ASSERT(dsp->ds_datathr_cnt == 0);
687 	ASSERT(dsp->ds_pending_head == NULL);
688 	ASSERT(dsp->ds_pending_tail == NULL);
689 	ASSERT(!dsp->ds_dlpi_pending);
690 
691 	ASSERT(dsp->ds_dlp == NULL);
692 	ASSERT(dsp->ds_dmap == NULL);
693 	ASSERT(dsp->ds_rx == NULL);
694 	ASSERT(dsp->ds_rx_arg == NULL);
695 	ASSERT(dsp->ds_next == NULL);
696 	ASSERT(dsp->ds_head == NULL);
697 
698 	/*
699 	 * Free the dummy mblk if exists.
700 	 */
701 	if (dsp->ds_tx_flow_mp != NULL) {
702 		freeb(dsp->ds_tx_flow_mp);
703 		dsp->ds_tx_flow_mp = NULL;
704 	}
705 
706 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
707 	ASSERT(dsp == (dld_str_t *)val);
708 
709 	/*
710 	 * Free the object back to the cache.
711 	 */
712 	kmem_cache_free(str_cachep, dsp);
713 	atomic_add_32(&str_count, -1);
714 }
715 
716 /*
717  * kmem_cache contructor function: see kmem_cache_create(9f).
718  */
719 /*ARGSUSED*/
720 static int
721 str_constructor(void *buf, void *cdrarg, int kmflags)
722 {
723 	dld_str_t	*dsp = buf;
724 
725 	bzero(buf, sizeof (dld_str_t));
726 
727 	/*
728 	 * Allocate a new minor number.
729 	 */
730 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
731 		return (-1);
732 
733 	/*
734 	 * Initialize the DLPI state machine.
735 	 */
736 	dsp->ds_dlstate = DL_UNATTACHED;
737 
738 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
739 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
740 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
741 
742 	return (0);
743 }
744 
745 /*
746  * kmem_cache destructor function.
747  */
748 /*ARGSUSED*/
749 static void
750 str_destructor(void *buf, void *cdrarg)
751 {
752 	dld_str_t	*dsp = buf;
753 
754 	/*
755 	 * Release the minor number.
756 	 */
757 	mac_minor_rele(dsp->ds_minor);
758 
759 	ASSERT(dsp->ds_tx_flow_mp == NULL);
760 
761 	mutex_destroy(&dsp->ds_lock);
762 	cv_destroy(&dsp->ds_datathr_cv);
763 	cv_destroy(&dsp->ds_dlpi_pending_cv);
764 }
765 
/*
 * Update the priority bits and VID (may need to insert a tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
 * If pri is 0, use the priority encoded in the packet.
 *
 * mp must start with an Ethernet header.  At least one of pri and vid
 * must be set (asserted below).
 *
 * Returns the message to transmit, which may be mp itself, a pulled-up
 * copy of it (mp has then been freed), or a new header mblk chained in
 * front of the remaining data.  Returns NULL if an allocation failed; in
 * that case mp has NOT been freed and remains the caller's to dispose of.
 */
static mblk_t *
i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
    link_tagmode_t tagmode)
{
	mblk_t *hmp;
	struct ether_vlan_header *evhp;
	struct ether_header *ehp;
	uint16_t old_tci = 0;	/* stays 0 when a new tag is inserted */
	size_t len;

	ASSERT(pri != 0 || vid != VLAN_ID_NONE);

	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		/*
		 * Tagged packet, update the priority bits.
		 */
		len = sizeof (struct ether_vlan_header);

		/*
		 * The header is modified in place below, so it must sit
		 * in a data block we do not share and must be complete in
		 * the first mblk.
		 */
		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
			/*
			 * In case some drivers only check the db_ref
			 * count of the first mblk, we pullup the
			 * message into a single mblk.
			 */
			hmp = msgpullup(mp, -1);
			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
				/* Only the copy is freed; mp is untouched. */
				freemsg(hmp);
				return (NULL);
			} else {
				freemsg(mp);
				mp = hmp;
			}
		}

		/* Re-read the header pointer: mp may have been replaced. */
		evhp = (struct ether_vlan_header *)mp->b_rptr;
		old_tci = ntohs(evhp->ether_tci);
	} else {
		/*
		 * Untagged packet.  Two factors will cause us to insert a
		 * VLAN header:
		 * - This is a VLAN link (vid is specified)
		 * - The link supports user priority tagging and the priority
		 *   is non-zero.
		 */
		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
			return (mp);

		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
		if (hmp == NULL)
			return (NULL);

		evhp = (struct ether_vlan_header *)hmp->b_rptr;
		ehp = (struct ether_header *)mp->b_rptr;

		/*
		 * Copy the MAC addresses and typelen
		 */
		bcopy(ehp, evhp, (ETHERADDRL * 2));
		evhp->ether_type = ehp->ether_type;
		evhp->ether_tpid = htons(ETHERTYPE_VLAN);

		/*
		 * The new mblk now holds the complete VLAN header; skip
		 * over the old untagged header in the original.
		 */
		hmp->b_wptr += sizeof (struct ether_vlan_header);
		mp->b_rptr += sizeof (struct ether_header);

		/*
		 * Free the original message if it's now empty. Link the
		 * rest of the messages to the header message.
		 */
		if (MBLKL(mp) == 0) {
			hmp->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			hmp->b_cont = mp;
		}
		mp = hmp;
	}

	/*
	 * Fall back to the values from the existing tag for whichever of
	 * pri and vid was not supplied; the CFI bit is always preserved.
	 */
	if (pri == 0)
		pri = VLAN_PRI(old_tci);
	if (vid == VLAN_ID_NONE)
		vid = VLAN_ID(old_tci);
	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
	return (mp);
}
856 
857 /*
858  * M_DATA put (IP fast-path mode)
859  */
860 mac_tx_cookie_t
861 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
862     uint16_t flag)
863 {
864 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
865 	mblk_t *newmp;
866 	uint_t pri;
867 	mac_tx_cookie_t cookie;
868 
869 	if (is_ethernet) {
870 		/*
871 		 * Update the priority bits to the assigned priority.
872 		 */
873 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
874 
875 		if (pri != 0) {
876 			newmp = i_dld_ether_header_update_tag(mp, pri,
877 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
878 			if (newmp == NULL)
879 				goto discard;
880 			mp = newmp;
881 		}
882 	}
883 
884 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
885 		DLD_SETQFULL(dsp);
886 	}
887 	return (cookie);
888 
889 discard:
890 	/* TODO: bump kstat? */
891 	freemsg(mp);
892 	return (NULL);
893 }
894 
895 /*
896  * M_DATA put (DLIOCRAW mode)
897  */
898 static void
899 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
900 {
901 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
902 	mblk_t *bp, *newmp;
903 	size_t size;
904 	mac_header_info_t mhi;
905 	uint_t pri, vid, dvid;
906 	uint_t max_sdu;
907 
908 	/*
909 	 * Certain MAC type plugins provide an illusion for raw DLPI
910 	 * consumers.  They pretend that the MAC layer is something that
911 	 * it's not for the benefit of observability tools.  For example,
912 	 * mac_wifi pretends that it's Ethernet for such consumers.
913 	 * Here, unless native mode is enabled, we call into the MAC layer so
914 	 * that this illusion can be maintained.  The plugin will optionally
915 	 * transform the MAC header here into something that can be passed
916 	 * down.  The header goes from raw mode to "cooked" mode.
917 	 */
918 	if (!dsp->ds_native) {
919 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
920 			goto discard;
921 		mp = newmp;
922 	}
923 
924 	size = MBLKL(mp);
925 
926 	/*
927 	 * Check the packet is not too big and that any remaining
928 	 * fragment list is composed entirely of M_DATA messages. (We
929 	 * know the first fragment was M_DATA otherwise we could not
930 	 * have got here).
931 	 */
932 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
933 		if (DB_TYPE(bp) != M_DATA)
934 			goto discard;
935 		size += MBLKL(bp);
936 	}
937 
938 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
939 		goto discard;
940 
941 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
942 	/*
943 	 * If LSO is enabled, check the size against lso_max. Otherwise,
944 	 * compare the packet size with max_sdu.
945 	 */
946 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
947 	if (size > max_sdu + mhi.mhi_hdrsize)
948 		goto discard;
949 
950 	if (is_ethernet) {
951 		dvid = mac_client_vid(dsp->ds_mch);
952 
953 		/*
954 		 * Discard the packet if this is a VLAN stream but the VID in
955 		 * the packet is not correct.
956 		 */
957 		vid = VLAN_ID(mhi.mhi_tci);
958 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
959 			goto discard;
960 
961 		/*
962 		 * Discard the packet if this packet is a tagged packet
963 		 * but both pri and VID are 0.
964 		 */
965 		pri = VLAN_PRI(mhi.mhi_tci);
966 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
967 		    vid == VLAN_ID_NONE)
968 			goto discard;
969 
970 		/*
971 		 * Update the priority bits to the per-stream priority if
972 		 * priority is not set in the packet. Update the VID for
973 		 * packets on a VLAN stream.
974 		 */
975 		pri = (pri == 0) ? dsp->ds_pri : 0;
976 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
977 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
978 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
979 				goto discard;
980 			}
981 			mp = newmp;
982 		}
983 	}
984 
985 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
986 		/* Turn on flow-control for dld */
987 		DLD_SETQFULL(dsp);
988 	}
989 	return;
990 
991 discard:
992 	/* TODO: bump kstat? */
993 	freemsg(mp);
994 }
995 
/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 *
 * Binds the stream 'dsp' to the link identified by 'ppa': takes a hold
 * on the devnet entry and on the dls link, opens the link inside the mac
 * perimeter, registers for MAC notifications and moves the stream to
 * DL_UNBOUND.
 *
 * Returns 0 on success.  Returns EINVAL if the major number has no
 * driver name or qassociate() fails, ENOTSUP if a style 2 PPA exceeds
 * DLS_MAX_PPA, or the error from dls_devnet_hold_by_dev(),
 * mac_perim_enter_by_macname(), dls_link_hold() or dls_open().  On
 * failure everything acquired here is released again.
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t			dev;
	int			err;
	const char		*drvname;
	/*
	 * These start out NULL/B_FALSE so the failure path can tell which
	 * resources have actually been acquired.
	 */
	mac_perim_handle_t	mph = NULL;
	boolean_t		qassociated = B_FALSE;
	dls_link_t		*dlp = NULL;
	dls_dl_handle_t		ddp = NULL;

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
		return (ENOTSUP);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	/* The minor number of a link's node is its PPA + 1. */
	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
		goto failed;

	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
		goto failed;

	/*
	 * Open a channel.
	 */
	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
		goto failed;

	if ((err = dls_open(dlp, ddp, dsp)) != 0)
		goto failed;

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
	dsp->ds_dlstate = DL_UNBOUND;
	/* Success: the devnet and link holds are kept; only the perimeter is dropped. */
	mac_perim_exit(mph);
	return (0);

failed:
	/* Release only what was actually acquired before the failure. */
	if (dlp != NULL)
		dls_link_rele(dlp);
	if (mph != NULL)
		mac_perim_exit(mph);
	if (ddp != NULL)
		dls_devnet_rele(ddp);
	if (qassociated)
		(void) qassociate(dsp->ds_wq, -1);

	return (err);
}
1068 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Reverses dld_str_attach(): removes the MAC notification callback,
 * disables capabilities, closes the link, drops the devnet hold and
 * returns the stream to DL_UNATTACHED.  No data thread may be active on
 * the stream when this is called.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	mac_perim_handle_t	mph;
	int			err;

	ASSERT(dsp->ds_datathr_cnt == 0);

	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
	/*
	 * Remove the notify function.
	 *
	 * Note that we cannot wait for the notification callback to be removed
	 * since it could cause the deadlock with str_notify() since they both
	 * need the mac perimeter. Continue if we cannot remove the
	 * notification callback right now and wait after we leave the
	 * perimeter.
	 */
	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
	dsp->ds_mnh = NULL;

	/*
	 * Disable the capabilities
	 */
	dld_capabilities_disable(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp);
	mac_perim_exit(mph);

	/*
	 * Now we leave the mac perimeter. If mac_notify_remove() failed
	 * because the notification callback was in progress, wait for
	 * it to finish before we proceed.
	 */
	if (err != 0)
		mac_notify_remove_wait(dsp->ds_mh);

	/*
	 * An unreferenced tagged (non-persistent) vlan gets destroyed
	 * automatically in the call to dls_devnet_rele.
	 */
	dls_devnet_rele(dsp->ds_ddh);

	/* Forget the link; dld_str_destroy() asserts these are clear. */
	dsp->ds_sap = 0;
	dsp->ds_mh = NULL;
	dsp->ds_mch = NULL;
	dsp->ds_mip = NULL;

	/* Undo the instance association for a style 2 stream. */
	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;
}
1135 
1136 /*
1137  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1138  * tags before sending packets up to the DLS clients, with the exception of
1139  * special priority tagged packets, in that case, we set the VID to 0.
1140  * mp must be a VLAN tagged packet.
1141  */
1142 static mblk_t *
1143 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1144 {
1145 	mblk_t *newmp;
1146 	struct ether_vlan_header *evhp;
1147 	uint16_t tci, new_tci;
1148 
1149 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1150 	if (DB_REF(mp) > 1) {
1151 		newmp = copymsg(mp);
1152 		if (newmp == NULL)
1153 			return (NULL);
1154 		freemsg(mp);
1155 		mp = newmp;
1156 	}
1157 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1158 
1159 	tci = ntohs(evhp->ether_tci);
1160 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1161 		/*
1162 		 * Priority is 0, strip the tag.
1163 		 */
1164 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1165 		mp->b_rptr += VLAN_TAGSZ;
1166 	} else {
1167 		/*
1168 		 * Priority is not 0, update the VID to 0.
1169 		 */
1170 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1171 		evhp->ether_tci = htons(new_tci);
1172 	}
1173 	return (mp);
1174 }
1175 
1176 /*
1177  * Raw mode receive function.
1178  */
1179 /*ARGSUSED*/
1180 void
1181 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1182     mac_header_info_t *mhip)
1183 {
1184 	dld_str_t *dsp = (dld_str_t *)arg;
1185 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1186 	mblk_t *next, *newmp;
1187 
1188 	ASSERT(mp != NULL);
1189 	do {
1190 		/*
1191 		 * Get the pointer to the next packet in the chain and then
1192 		 * clear b_next before the packet gets passed on.
1193 		 */
1194 		next = mp->b_next;
1195 		mp->b_next = NULL;
1196 
1197 		/*
1198 		 * Wind back b_rptr to point at the MAC header.
1199 		 */
1200 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1201 		mp->b_rptr -= mhip->mhi_hdrsize;
1202 
1203 		/*
1204 		 * Certain MAC type plugins provide an illusion for raw
1205 		 * DLPI consumers.  They pretend that the MAC layer is
1206 		 * something that it's not for the benefit of observability
1207 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1208 		 * for such consumers.	Here, unless native mode is enabled,
1209 		 * we call into the MAC layer so that this illusion can be
1210 		 * maintained.	The plugin will optionally transform the MAC
1211 		 * header here into something that can be passed up to raw
1212 		 * consumers.  The header goes from "cooked" mode to raw mode.
1213 		 */
1214 		if (!dsp->ds_native) {
1215 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1216 			if (newmp == NULL) {
1217 				freemsg(mp);
1218 				goto next;
1219 			}
1220 			mp = newmp;
1221 		}
1222 
1223 		/*
1224 		 * Strip the VLAN tag for VLAN streams.
1225 		 */
1226 		if (is_ethernet &&
1227 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1228 			/*
1229 			 * The priority should be kept only for VLAN
1230 			 * data-links.
1231 			 */
1232 			newmp = i_dld_ether_header_strip_tag(mp,
1233 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1234 			if (newmp == NULL) {
1235 				freemsg(mp);
1236 				goto next;
1237 			}
1238 			mp = newmp;
1239 		}
1240 
1241 		/*
1242 		 * Pass the packet on.
1243 		 */
1244 		if (canputnext(dsp->ds_rq))
1245 			putnext(dsp->ds_rq, mp);
1246 		else
1247 			freemsg(mp);
1248 
1249 next:
1250 		/*
1251 		 * Move on to the next packet in the chain.
1252 		 */
1253 		mp = next;
1254 	} while (mp != NULL);
1255 }
1256 
1257 /*
1258  * Fast-path receive function.
1259  */
1260 /*ARGSUSED*/
1261 void
1262 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1263     mac_header_info_t *mhip)
1264 {
1265 	dld_str_t *dsp = (dld_str_t *)arg;
1266 	mblk_t *next;
1267 	size_t offset = 0;
1268 
1269 	/*
1270 	 * MAC header stripping rules:
1271 	 *    - Tagged packets:
1272 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1273 	 *	b. Physical streams
1274 	 *	- VLAN packets (non-zero VID). The stream must be either a
1275 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1276 	 *	  Strip the Ethernet header but keep the VLAN header.
1277 	 *	- Special tagged packets (zero VID)
1278 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1279 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1280 	 *	    keep the VLAN header.
1281 	 *	  * Otherwise, strip the whole VLAN header.
1282 	 *    - Untagged packets. Strip the whole MAC header.
1283 	 */
1284 	if (mhip->mhi_istagged &&
1285 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1286 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1287 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1288 		offset = VLAN_TAGSZ;
1289 	}
1290 
1291 	ASSERT(mp != NULL);
1292 	do {
1293 		/*
1294 		 * Get the pointer to the next packet in the chain and then
1295 		 * clear b_next before the packet gets passed on.
1296 		 */
1297 		next = mp->b_next;
1298 		mp->b_next = NULL;
1299 
1300 		/*
1301 		 * Wind back b_rptr to point at the VLAN header.
1302 		 */
1303 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1304 		mp->b_rptr -= offset;
1305 
1306 		/*
1307 		 * Pass the packet on.
1308 		 */
1309 		if (canputnext(dsp->ds_rq))
1310 			putnext(dsp->ds_rq, mp);
1311 		else
1312 			freemsg(mp);
1313 		/*
1314 		 * Move on to the next packet in the chain.
1315 		 */
1316 		mp = next;
1317 	} while (mp != NULL);
1318 }
1319 
1320 /*
1321  * Default receive function (send DL_UNITDATA_IND messages).
1322  */
1323 /*ARGSUSED*/
1324 void
1325 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1326     mac_header_info_t *mhip)
1327 {
1328 	dld_str_t		*dsp = (dld_str_t *)arg;
1329 	mblk_t			*ud_mp;
1330 	mblk_t			*next;
1331 	size_t			offset = 0;
1332 	boolean_t		strip_vlan = B_TRUE;
1333 
1334 	/*
1335 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1336 	 */
1337 	if (mhip->mhi_istagged &&
1338 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1339 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1340 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1341 		offset = VLAN_TAGSZ;
1342 		strip_vlan = B_FALSE;
1343 	}
1344 
1345 	ASSERT(mp != NULL);
1346 	do {
1347 		/*
1348 		 * Get the pointer to the next packet in the chain and then
1349 		 * clear b_next before the packet gets passed on.
1350 		 */
1351 		next = mp->b_next;
1352 		mp->b_next = NULL;
1353 
1354 		/*
1355 		 * Wind back b_rptr to point at the MAC header.
1356 		 */
1357 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1358 		mp->b_rptr -= mhip->mhi_hdrsize;
1359 
1360 		/*
1361 		 * Create the DL_UNITDATA_IND M_PROTO.
1362 		 */
1363 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1364 			freemsgchain(mp);
1365 			return;
1366 		}
1367 
1368 		/*
1369 		 * Advance b_rptr to point at the payload (or the VLAN header).
1370 		 */
1371 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1372 
1373 		/*
1374 		 * Prepend the DL_UNITDATA_IND.
1375 		 */
1376 		ud_mp->b_cont = mp;
1377 
1378 		/*
1379 		 * Send the message.
1380 		 */
1381 		if (canputnext(dsp->ds_rq))
1382 			putnext(dsp->ds_rq, ud_mp);
1383 		else
1384 			freemsg(ud_mp);
1385 
1386 		/*
1387 		 * Move on to the next packet in the chain.
1388 		 */
1389 		mp = next;
1390 	} while (mp != NULL);
1391 }
1392 
1393 /*
1394  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1395  */
1396 static void
1397 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1398 {
1399 	mblk_t		*mp;
1400 	dl_notify_ind_t *dlip;
1401 
1402 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1403 		return;
1404 
1405 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1406 	    M_PROTO, 0)) == NULL)
1407 		return;
1408 
1409 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1410 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1411 	dlip->dl_primitive = DL_NOTIFY_IND;
1412 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1413 	dlip->dl_data = max_sdu;
1414 
1415 	qreply(dsp->ds_wq, mp);
1416 }
1417 
1418 /*
1419  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1420  * current state of the interface.
1421  */
1422 void
1423 dld_str_notify_ind(dld_str_t *dsp)
1424 {
1425 	mac_notify_type_t	type;
1426 
1427 	for (type = 0; type < MAC_NNOTE; type++)
1428 		str_notify(dsp, type);
1429 }
1430 
1431 typedef struct dl_unitdata_ind_wrapper {
1432 	dl_unitdata_ind_t	dl_unitdata;
1433 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1434 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1435 } dl_unitdata_ind_wrapper_t;
1436 
1437 /*
1438  * Create a DL_UNITDATA_IND M_PROTO message.
1439  */
1440 static mblk_t *
1441 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1442 {
1443 	mblk_t				*nmp;
1444 	dl_unitdata_ind_wrapper_t	*dlwp;
1445 	dl_unitdata_ind_t		*dlp;
1446 	mac_header_info_t		mhi;
1447 	uint_t				addr_length;
1448 	uint8_t				*daddr;
1449 	uint8_t				*saddr;
1450 
1451 	/*
1452 	 * Get the packet header information.
1453 	 */
1454 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1455 		return (NULL);
1456 
1457 	/*
1458 	 * Allocate a message large enough to contain the wrapper structure
1459 	 * defined above.
1460 	 */
1461 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1462 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1463 	    DL_UNITDATA_IND)) == NULL)
1464 		return (NULL);
1465 
1466 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1467 
1468 	dlp = &(dlwp->dl_unitdata);
1469 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1470 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1471 
1472 	/*
1473 	 * Copy in the destination address.
1474 	 */
1475 	addr_length = dsp->ds_mip->mi_addr_length;
1476 	daddr = dlwp->dl_dest_addr;
1477 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1478 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1479 
1480 	/*
1481 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1482 	 */
1483 	if (mhi.mhi_istagged && !strip_vlan)
1484 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1485 	else
1486 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1487 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1488 
1489 	/*
1490 	 * If the destination address was multicast or broadcast then the
1491 	 * dl_group_address field should be non-zero.
1492 	 */
1493 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1494 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1495 
1496 	/*
1497 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1498 	 * for example) may not have access to source information.
1499 	 */
1500 	if (mhi.mhi_saddr == NULL) {
1501 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1502 	} else {
1503 		saddr = dlwp->dl_src_addr;
1504 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1505 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1506 
1507 		/*
1508 		 * Set the source DLSAP to the packet ethertype.
1509 		 */
1510 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1511 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1512 	}
1513 
1514 	return (nmp);
1515 }
1516 
1517 /*
1518  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1519  */
1520 static void
1521 str_notify_promisc_on_phys(dld_str_t *dsp)
1522 {
1523 	mblk_t		*mp;
1524 	dl_notify_ind_t	*dlip;
1525 
1526 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1527 		return;
1528 
1529 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1530 	    M_PROTO, 0)) == NULL)
1531 		return;
1532 
1533 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1534 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1535 	dlip->dl_primitive = DL_NOTIFY_IND;
1536 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1537 
1538 	qreply(dsp->ds_wq, mp);
1539 }
1540 
1541 /*
1542  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1543  */
1544 static void
1545 str_notify_promisc_off_phys(dld_str_t *dsp)
1546 {
1547 	mblk_t		*mp;
1548 	dl_notify_ind_t	*dlip;
1549 
1550 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1551 		return;
1552 
1553 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1554 	    M_PROTO, 0)) == NULL)
1555 		return;
1556 
1557 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1558 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1559 	dlip->dl_primitive = DL_NOTIFY_IND;
1560 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1561 
1562 	qreply(dsp->ds_wq, mp);
1563 }
1564 
1565 /*
1566  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1567  */
1568 static void
1569 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1570 {
1571 	mblk_t		*mp;
1572 	dl_notify_ind_t	*dlip;
1573 	uint_t		addr_length;
1574 	uint16_t	ethertype;
1575 
1576 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1577 		return;
1578 
1579 	addr_length = dsp->ds_mip->mi_addr_length;
1580 	if ((mp = mexchange(dsp->ds_wq, NULL,
1581 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1582 	    M_PROTO, 0)) == NULL)
1583 		return;
1584 
1585 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1586 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1587 	dlip->dl_primitive = DL_NOTIFY_IND;
1588 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1589 	dlip->dl_data = addr_type;
1590 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1591 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1592 
1593 	bcopy(addr, &dlip[1], addr_length);
1594 
1595 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1596 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1597 
1598 	qreply(dsp->ds_wq, mp);
1599 }
1600 
1601 /*
1602  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1603  */
1604 static void
1605 str_notify_link_up(dld_str_t *dsp)
1606 {
1607 	mblk_t		*mp;
1608 	dl_notify_ind_t	*dlip;
1609 
1610 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1611 		return;
1612 
1613 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1614 	    M_PROTO, 0)) == NULL)
1615 		return;
1616 
1617 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1618 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1619 	dlip->dl_primitive = DL_NOTIFY_IND;
1620 	dlip->dl_notification = DL_NOTE_LINK_UP;
1621 
1622 	qreply(dsp->ds_wq, mp);
1623 }
1624 
1625 /*
1626  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1627  */
1628 static void
1629 str_notify_link_down(dld_str_t *dsp)
1630 {
1631 	mblk_t		*mp;
1632 	dl_notify_ind_t	*dlip;
1633 
1634 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1635 		return;
1636 
1637 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1638 	    M_PROTO, 0)) == NULL)
1639 		return;
1640 
1641 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1642 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1643 	dlip->dl_primitive = DL_NOTIFY_IND;
1644 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1645 
1646 	qreply(dsp->ds_wq, mp);
1647 }
1648 
1649 /*
1650  * DL_NOTIFY_IND: DL_NOTE_SPEED
1651  */
1652 static void
1653 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1654 {
1655 	mblk_t		*mp;
1656 	dl_notify_ind_t	*dlip;
1657 
1658 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1659 		return;
1660 
1661 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1662 	    M_PROTO, 0)) == NULL)
1663 		return;
1664 
1665 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1666 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1667 	dlip->dl_primitive = DL_NOTIFY_IND;
1668 	dlip->dl_notification = DL_NOTE_SPEED;
1669 	dlip->dl_data = speed;
1670 
1671 	qreply(dsp->ds_wq, mp);
1672 }
1673 
1674 /*
1675  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1676  */
1677 static void
1678 str_notify_capab_reneg(dld_str_t *dsp)
1679 {
1680 	mblk_t		*mp;
1681 	dl_notify_ind_t	*dlip;
1682 
1683 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1684 		return;
1685 
1686 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1687 	    M_PROTO, 0)) == NULL)
1688 		return;
1689 
1690 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1691 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1692 	dlip->dl_primitive = DL_NOTIFY_IND;
1693 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1694 
1695 	qreply(dsp->ds_wq, mp);
1696 }
1697 
1698 /*
1699  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1700  */
1701 static void
1702 str_notify_fastpath_flush(dld_str_t *dsp)
1703 {
1704 	mblk_t		*mp;
1705 	dl_notify_ind_t	*dlip;
1706 
1707 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1708 		return;
1709 
1710 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1711 	    M_PROTO, 0)) == NULL)
1712 		return;
1713 
1714 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1715 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1716 	dlip->dl_primitive = DL_NOTIFY_IND;
1717 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1718 
1719 	qreply(dsp->ds_wq, mp);
1720 }
1721 
1722 static void
1723 str_notify_allowed_ips(dld_str_t *dsp)
1724 {
1725 	mblk_t		*mp;
1726 	dl_notify_ind_t	*dlip;
1727 	size_t		mp_size;
1728 	mac_protect_t	*mrp;
1729 
1730 	if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1731 		return;
1732 
1733 	mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1734 	if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1735 		return;
1736 
1737 	mrp = mac_protect_get(dsp->ds_mh);
1738 	bzero(mp->b_rptr, mp_size);
1739 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1740 	dlip->dl_primitive = DL_NOTIFY_IND;
1741 	dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1742 	dlip->dl_data = 0;
1743 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1744 	dlip->dl_addr_length = sizeof (mac_protect_t);
1745 	bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1746 	    sizeof (mac_protect_t));
1747 
1748 	qreply(dsp->ds_wq, mp);
1749 }
1750 
1751 /*
1752  * MAC notification callback.
1753  */
1754 void
1755 str_notify(void *arg, mac_notify_type_t type)
1756 {
1757 	dld_str_t		*dsp = (dld_str_t *)arg;
1758 	queue_t			*q = dsp->ds_wq;
1759 	mac_handle_t		mh = dsp->ds_mh;
1760 	mac_client_handle_t	mch = dsp->ds_mch;
1761 	uint8_t			addr[MAXMACADDRLEN];
1762 
1763 	switch (type) {
1764 	case MAC_NOTE_TX:
1765 		qenable(q);
1766 		break;
1767 
1768 	case MAC_NOTE_DEVPROMISC:
1769 		/*
1770 		 * Send the appropriate DL_NOTIFY_IND.
1771 		 */
1772 		if (mac_promisc_get(mh))
1773 			str_notify_promisc_on_phys(dsp);
1774 		else
1775 			str_notify_promisc_off_phys(dsp);
1776 		break;
1777 
1778 	case MAC_NOTE_UNICST:
1779 		/*
1780 		 * This notification is sent whenever the MAC unicast
1781 		 * address changes.
1782 		 */
1783 		mac_unicast_primary_get(mh, addr);
1784 
1785 		/*
1786 		 * Send the appropriate DL_NOTIFY_IND.
1787 		 */
1788 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1789 		break;
1790 
1791 	case MAC_NOTE_DEST:
1792 		/*
1793 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1794 		 * destination address.
1795 		 */
1796 		if (mac_dst_get(dsp->ds_mh, addr))
1797 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1798 		break;
1799 
1800 	case MAC_NOTE_LOWLINK:
1801 	case MAC_NOTE_LINK:
1802 		/*
1803 		 * LOWLINK refers to the actual link status. For links that
1804 		 * are not part of a bridge instance LOWLINK and LINK state
1805 		 * are the same. But for a link part of a bridge instance
1806 		 * LINK state refers to the aggregate link status: "up" when
1807 		 * at least one link part of the bridge is up and is "down"
1808 		 * when all links part of the bridge are down.
1809 		 *
1810 		 * Clients can request to be notified of the LOWLINK state
1811 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1812 		 * daemon request lowlink state changes and upper layer clients
1813 		 * receive notifications of the aggregate link state changes
1814 		 * which is the default when requesting LINK UP/DOWN state
1815 		 * notifications.
1816 		 */
1817 
1818 		/*
1819 		 * Check that the notification type matches the one that we
1820 		 * want.  If we want lower-level link notifications, and this
1821 		 * is upper, or if we want upper and this is lower, then
1822 		 * ignore.
1823 		 */
1824 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1825 			break;
1826 		/*
1827 		 * This notification is sent every time the MAC driver
1828 		 * updates the link state.
1829 		 */
1830 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1831 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1832 		case LINK_STATE_UP: {
1833 			uint64_t speed;
1834 			/*
1835 			 * The link is up so send the appropriate
1836 			 * DL_NOTIFY_IND.
1837 			 */
1838 			str_notify_link_up(dsp);
1839 
1840 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1841 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1842 			break;
1843 		}
1844 		case LINK_STATE_DOWN:
1845 			/*
1846 			 * The link is down so send the appropriate
1847 			 * DL_NOTIFY_IND.
1848 			 */
1849 			str_notify_link_down(dsp);
1850 			break;
1851 
1852 		default:
1853 			break;
1854 		}
1855 		break;
1856 
1857 	case MAC_NOTE_CAPAB_CHG:
1858 		/*
1859 		 * This notification is sent whenever the MAC resources
1860 		 * change or capabilities change. We need to renegotiate
1861 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1862 		 */
1863 		str_notify_capab_reneg(dsp);
1864 		break;
1865 
1866 	case MAC_NOTE_SDU_SIZE: {
1867 		uint_t  max_sdu;
1868 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1869 		str_notify_sdu_size(dsp, max_sdu);
1870 		break;
1871 	}
1872 
1873 	case MAC_NOTE_FASTPATH_FLUSH:
1874 		str_notify_fastpath_flush(dsp);
1875 		break;
1876 
1877 	/* Unused notifications */
1878 	case MAC_NOTE_MARGIN:
1879 		break;
1880 
1881 	case MAC_NOTE_ALLOWED_IPS:
1882 		str_notify_allowed_ips(dsp);
1883 		break;
1884 
1885 	default:
1886 		ASSERT(B_FALSE);
1887 		break;
1888 	}
1889 }
1890 
1891 /*
1892  * This function is called via a taskq mechansim to process all control
1893  * messages on a per 'dsp' end point.
1894  */
1895 static void
1896 dld_wput_nondata_task(void *arg)
1897 {
1898 	dld_str_t	*dsp = arg;
1899 	mblk_t		*mp;
1900 
1901 	mutex_enter(&dsp->ds_lock);
1902 	while (dsp->ds_pending_head != NULL) {
1903 		mp = dsp->ds_pending_head;
1904 		dsp->ds_pending_head = mp->b_next;
1905 		mp->b_next = NULL;
1906 		if (dsp->ds_pending_head == NULL)
1907 			dsp->ds_pending_tail = NULL;
1908 		mutex_exit(&dsp->ds_lock);
1909 
1910 		switch (DB_TYPE(mp)) {
1911 		case M_PROTO:
1912 		case M_PCPROTO:
1913 			dld_proto(dsp, mp);
1914 			break;
1915 		case M_IOCTL:
1916 			dld_ioc(dsp, mp);
1917 			break;
1918 		default:
1919 			ASSERT(0);
1920 		}
1921 
1922 		mutex_enter(&dsp->ds_lock);
1923 	}
1924 	ASSERT(dsp->ds_pending_tail == NULL);
1925 	dsp->ds_dlpi_pending = 0;
1926 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1927 	mutex_exit(&dsp->ds_lock);
1928 }
1929 
1930 /*
1931  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1932  * thread is started at boot time.
1933  */
1934 static void
1935 dld_taskq_dispatch(void)
1936 {
1937 	callb_cpr_t	cprinfo;
1938 	dld_str_t	*dsp;
1939 
1940 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1941 	    "dld_taskq_dispatch");
1942 	mutex_enter(&dld_taskq_lock);
1943 
1944 	while (!dld_taskq_quit) {
1945 		dsp = list_head(&dld_taskq_list);
1946 		while (dsp != NULL) {
1947 			list_remove(&dld_taskq_list, dsp);
1948 			mutex_exit(&dld_taskq_lock);
1949 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1950 			    dsp, TQ_SLEEP) != 0);
1951 			mutex_enter(&dld_taskq_lock);
1952 			dsp = list_head(&dld_taskq_list);
1953 		}
1954 
1955 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1956 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1957 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1958 	}
1959 
1960 	dld_taskq_done = B_TRUE;
1961 	cv_signal(&dld_taskq_cv);
1962 	CALLB_CPR_EXIT(&cprinfo);
1963 	thread_exit();
1964 }
1965 
1966 /*
1967  * All control operations are serialized on the 'dsp' and are also funneled
1968  * through a taskq mechanism to ensure that subsequent processing has kernel
1969  * context and can safely use cv_wait.
1970  *
1971  * Mechanisms to handle taskq dispatch failures
1972  *
1973  * The only way to be sure that taskq dispatch does not fail is to either
1974  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1975  * some number of entries and make sure that the number of outstanding requests
1976  * are less than that number. We can't use TQ_SLEEP since we don't know the
1977  * context. Nor can we bound the total number of 'dsp' end points. So we are
1978  * unable to use either of the above schemes, and are forced to deal with
1979  * taskq dispatch failures. Note that even dynamic taskq could fail in
1980  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1981  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1982  * framework.
1983  *
1984  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1985  * We also have a single global thread to retry the taskq dispatch. This
1986  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1987  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1988  */
1989 static void
1990 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1991 {
1992 	ASSERT(mp->b_next == NULL);
1993 	mutex_enter(&dsp->ds_lock);
1994 	if (dsp->ds_pending_head != NULL) {
1995 		ASSERT(dsp->ds_dlpi_pending);
1996 		dsp->ds_pending_tail->b_next = mp;
1997 		dsp->ds_pending_tail = mp;
1998 		mutex_exit(&dsp->ds_lock);
1999 		return;
2000 	}
2001 	ASSERT(dsp->ds_pending_tail == NULL);
2002 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2003 	/*
2004 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
2005 	 * thread is still active and is processing the last message, though
2006 	 * the pending queue has been emptied.
2007 	 */
2008 	if (dsp->ds_dlpi_pending) {
2009 		mutex_exit(&dsp->ds_lock);
2010 		return;
2011 	}
2012 
2013 	dsp->ds_dlpi_pending = 1;
2014 	mutex_exit(&dsp->ds_lock);
2015 
2016 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
2017 	    TQ_NOSLEEP) != 0)
2018 		return;
2019 
2020 	mutex_enter(&dld_taskq_lock);
2021 	list_insert_tail(&dld_taskq_list, dsp);
2022 	cv_signal(&dld_taskq_cv);
2023 	mutex_exit(&dld_taskq_lock);
2024 }
2025 
2026 /*
2027  * Process an M_IOCTL message.
2028  */
2029 static void
2030 dld_ioc(dld_str_t *dsp, mblk_t *mp)
2031 {
2032 	uint_t			cmd;
2033 
2034 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2035 	ASSERT(dsp->ds_type == DLD_DLPI);
2036 
2037 	switch (cmd) {
2038 	case DLIOCNATIVE:
2039 		ioc_native(dsp, mp);
2040 		break;
2041 	case DLIOCMARGININFO:
2042 		ioc_margin(dsp, mp);
2043 		break;
2044 	case DLIOCRAW:
2045 		ioc_raw(dsp, mp);
2046 		break;
2047 	case DLIOCHDRINFO:
2048 		ioc_fast(dsp, mp);
2049 		break;
2050 	case DLIOCLOWLINK:
2051 		ioc_lowlink(dsp, mp);
2052 		break;
2053 	default:
2054 		ioc(dsp, mp);
2055 	}
2056 }
2057 
2058 /*
2059  * DLIOCNATIVE
2060  */
2061 static void
2062 ioc_native(dld_str_t *dsp, mblk_t *mp)
2063 {
2064 	queue_t *q = dsp->ds_wq;
2065 	const mac_info_t *mip = dsp->ds_mip;
2066 
2067 	/*
2068 	 * Native mode can be enabled if it's disabled and if the
2069 	 * native media type is different.
2070 	 */
2071 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2072 		dsp->ds_native = B_TRUE;
2073 
2074 	if (dsp->ds_native)
2075 		miocack(q, mp, 0, mip->mi_nativemedia);
2076 	else
2077 		miocnak(q, mp, 0, ENOTSUP);
2078 }
2079 
2080 /*
2081  * DLIOCMARGININFO
2082  */
2083 static void
2084 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2085 {
2086 	queue_t *q = dsp->ds_wq;
2087 	uint32_t margin;
2088 	int err;
2089 
2090 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2091 		err = EINVAL;
2092 		goto failed;
2093 	}
2094 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2095 		goto failed;
2096 
2097 	mac_margin_get(dsp->ds_mh, &margin);
2098 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2099 	miocack(q, mp, sizeof (uint32_t), 0);
2100 	return;
2101 
2102 failed:
2103 	miocnak(q, mp, 0, err);
2104 }
2105 
2106 /*
2107  * DLIOCRAW
2108  */
2109 static void
2110 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2111 {
2112 	queue_t *q = dsp->ds_wq;
2113 	mac_perim_handle_t	mph;
2114 
2115 	if (dsp->ds_mh == NULL) {
2116 		dsp->ds_mode = DLD_RAW;
2117 		miocack(q, mp, 0, 0);
2118 		return;
2119 	}
2120 
2121 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2122 	if (dsp->ds_polling || dsp->ds_direct) {
2123 		mac_perim_exit(mph);
2124 		miocnak(q, mp, 0, EPROTO);
2125 		return;
2126 	}
2127 
2128 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2129 		/*
2130 		 * Set the receive callback.
2131 		 */
2132 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2133 	}
2134 
2135 	/*
2136 	 * Note that raw mode is enabled.
2137 	 */
2138 	dsp->ds_mode = DLD_RAW;
2139 	mac_perim_exit(mph);
2140 
2141 	miocack(q, mp, 0, 0);
2142 }
2143 
2144 /*
2145  * DLIOCHDRINFO
2146  */
2147 static void
2148 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2149 {
2150 	dl_unitdata_req_t *dlp;
2151 	off_t		off;
2152 	size_t		len;
2153 	const uint8_t	*addr;
2154 	uint16_t	sap;
2155 	mblk_t		*nmp;
2156 	mblk_t		*hmp;
2157 	uint_t		addr_length;
2158 	queue_t		*q = dsp->ds_wq;
2159 	int		err;
2160 	mac_perim_handle_t	mph;
2161 
2162 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2163 		err = ENOTSUP;
2164 		goto failed;
2165 	}
2166 
2167 	/*
2168 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2169 	 * user-land should not be allowed.
2170 	 */
2171 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2172 		err = EINVAL;
2173 		goto failed;
2174 	}
2175 
2176 	nmp = mp->b_cont;
2177 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2178 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2179 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2180 		err = EINVAL;
2181 		goto failed;
2182 	}
2183 
2184 	off = dlp->dl_dest_addr_offset;
2185 	len = dlp->dl_dest_addr_length;
2186 
2187 	if (!MBLKIN(nmp, off, len)) {
2188 		err = EINVAL;
2189 		goto failed;
2190 	}
2191 
2192 	if (dsp->ds_dlstate != DL_IDLE) {
2193 		err = ENOTSUP;
2194 		goto failed;
2195 	}
2196 
2197 	addr_length = dsp->ds_mip->mi_addr_length;
2198 	if (len != addr_length + sizeof (uint16_t)) {
2199 		err = EINVAL;
2200 		goto failed;
2201 	}
2202 
2203 	addr = nmp->b_rptr + off;
2204 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2205 
2206 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2207 		err = ENOMEM;
2208 		goto failed;
2209 	}
2210 
2211 	/*
2212 	 * This ioctl might happen concurrently with a direct call to dld_capab
2213 	 * that tries to enable direct and/or poll capabilities. Since the
2214 	 * stack does not serialize them, we do so here to avoid mixing
2215 	 * the callbacks.
2216 	 */
2217 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2218 	if (dsp->ds_mode != DLD_FASTPATH) {
2219 		/*
2220 		 * Set the receive callback (unless polling is enabled).
2221 		 */
2222 		if (!dsp->ds_polling && !dsp->ds_direct)
2223 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2224 
2225 		/*
2226 		 * Note that fast-path mode is enabled.
2227 		 */
2228 		dsp->ds_mode = DLD_FASTPATH;
2229 	}
2230 	mac_perim_exit(mph);
2231 
2232 	freemsg(nmp->b_cont);
2233 	nmp->b_cont = hmp;
2234 
2235 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2236 	return;
2237 failed:
2238 	miocnak(q, mp, 0, err);
2239 }
2240 
2241 /*
2242  * DLIOCLOWLINK: request actual link state changes. When the
2243  * link is part of a bridge instance the client receives actual
2244  * link state changes and not the aggregate link status. Used by
2245  * the bridging daemon (bridged) for proper RSTP operation.
2246  */
2247 static void
2248 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2249 {
2250 	queue_t *q = dsp->ds_wq;
2251 	int err;
2252 
2253 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2254 		miocnak(q, mp, 0, err);
2255 	} else {
2256 		/* LINTED: alignment */
2257 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2258 		miocack(q, mp, 0, 0);
2259 	}
2260 }
2261 
2262 /*
2263  * Catch-all handler.
2264  */
2265 static void
2266 ioc(dld_str_t *dsp, mblk_t *mp)
2267 {
2268 	queue_t	*q = dsp->ds_wq;
2269 
2270 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2271 		miocnak(q, mp, 0, EINVAL);
2272 		return;
2273 	}
2274 	mac_ioctl(dsp->ds_mh, q, mp);
2275 }
2276