xref: /titanic_41/usr/src/uts/common/io/dld/dld_str.c (revision 56f9a274cc7ca7f2d6f19959b2db143d94a4e7e0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<inet/common.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/cpuvar.h>
37 #include	<sys/callb.h>
38 #include	<sys/list.h>
39 #include	<sys/mac_client.h>
40 #include	<sys/mac_client_priv.h>
41 
/*
 * Forward declarations of file-local (static) functions.
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);

static void	ioc_native(dld_str_t *,  mblk_t *);
static void	ioc_margin(dld_str_t *, mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static void	dld_wput_nondata(dld_str_t *, mblk_t *);

static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
    link_tagmode_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);

/*
 * str_count:  number of dld_str_t objects outstanding; incremented in
 *             dld_str_create() and decremented in dld_str_destroy().
 *             dld_str_fini() returns EBUSY while it is non-zero.
 * str_cachep: kmem cache of dld_str_t objects, created in dld_str_init().
 * str_hashp:  id hash of all dld_str_t objects, keyed by ds_minor.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static mod_hash_t	*str_hashp;

/* Size passed to mod_hash_create_idhash() when creating str_hashp. */
#define	STR_HASHSZ		64
/* Convert an integral key (a minor number) to a mod_hash_key_t. */
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* dld has no taskq of its own; the name is an alias for system_taskq. */
#define	dld_taskq	system_taskq

/*
 * State shared with the thread started in dld_str_init() to run
 * dld_taskq_dispatch().  dld_str_fini() sets dld_taskq_quit and signals
 * dld_taskq_cv, then waits on the same cv, under dld_taskq_lock, until
 * dld_taskq_done becomes true.
 * NOTE(review): dld_taskq_done is presumably set by dld_taskq_dispatch()
 * on exit; its definition is outside this part of the file -- confirm.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

static void		dld_taskq_dispatch(void);
82 
83 /*
84  * Some notes on entry points, flow-control, queueing.
85  *
86  * This driver exports the traditional STREAMS put entry point as well as
87  * the non-STREAMS fast-path transmit routine which is provided to IP via
88  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
89  * and data operations, while the fast-path routine deals only with M_DATA
90  * fast-path packets.  Regardless of the entry point, all outbound packets
91  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
92  *
93  * The transmit logic operates in the following way: All packets coming
94  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
95  * happens when the MAC layer indicates the packets couldn't be
96  * transmitted due to 1) lack of resources (e.g. running out of
97  * descriptors),  or 2) reaching the allowed bandwidth limit for this
98  * particular flow. The indication comes in the form of a Tx cookie that
99  * identifies the blocked ring. In such case, DLD will place a
100  * dummy message on its write-side STREAMS queue so that the queue is
 * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
104  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
105  * When the write service procedure runs, it will remove the dummy
106  * message from the write-side STREAMS queue; in effect this will trigger
107  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
108  * respectively, due to the above reasons.
109  *
110  * All non-data operations, both DLPI and ioctls are single threaded on a per
111  * dld_str_t endpoint. This is done using a taskq so that the control operation
112  * has kernel context and can cv_wait for resources. In addition all set type
113  * operations that involve mac level state modification are serialized on a
114  * per mac end point using the perimeter mechanism provided by the mac layer.
115  * This serializes all mac clients trying to modify a single mac end point over
116  * the entire sequence of mac calls made by that client as an atomic unit. The
117  * mac framework locking is described in mac.c. A critical element is that
118  * DLD/DLS does not hold any locks across the mac perimeter.
119  *
120  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
121  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
122  * match dev_t. If a stream is found and it is attached, its dev_info_t *
123  * is returned. If the mac handle is non-null, it can be safely accessed
124  * below. The mac handle won't be freed until the mac_unregister which
125  * won't happen until the driver detaches. The DDI framework ensures that
126  * the detach won't happen while a getinfo is in progress.
127  */
/*
 * Walker state for dld_finddevinfo(): the (major, minor) pair being looked
 * up in str_hashp and the devinfo found for it, if any.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* major number to match */
	minor_t		ds_minor;	/* clone minor to match (non-zero) */
	dev_info_t	*ds_dip;	/* result; stays NULL if not found */
} i_dld_str_state_t;
133 
134 /* ARGSUSED */
135 static uint_t
136 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
137 {
138 	i_dld_str_state_t	*statep = arg;
139 	dld_str_t		*dsp = (dld_str_t *)val;
140 	mac_handle_t		mh;
141 
142 	if (statep->ds_major != dsp->ds_major)
143 		return (MH_WALK_CONTINUE);
144 
145 	ASSERT(statep->ds_minor != 0);
146 	mh = dsp->ds_mh;
147 
148 	if (statep->ds_minor == dsp->ds_minor) {
149 		/*
150 		 * Clone: a clone minor is unique. we can terminate the
151 		 * walk if we find a matching stream -- even if we fail
152 		 * to obtain the devinfo.
153 		 */
154 		if (mh != NULL)
155 			statep->ds_dip = mac_devinfo_get(mh);
156 		return (MH_WALK_TERMINATE);
157 	}
158 	return (MH_WALK_CONTINUE);
159 }
160 
161 static dev_info_t *
162 dld_finddevinfo(dev_t dev)
163 {
164 	dev_info_t		*dip;
165 	i_dld_str_state_t	state;
166 
167 	if (getminor(dev) == 0)
168 		return (NULL);
169 
170 	/*
171 	 * See if it's a minor node of a link
172 	 */
173 	if ((dip = dls_link_devinfo(dev)) != NULL)
174 		return (dip);
175 
176 	state.ds_minor = getminor(dev);
177 	state.ds_major = getmajor(dev);
178 	state.ds_dip = NULL;
179 
180 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
181 	return (state.ds_dip);
182 }
183 
184 /*
185  * devo_getinfo: getinfo(9e)
186  */
187 /*ARGSUSED*/
188 int
189 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
190 {
191 	dev_info_t	*devinfo;
192 	minor_t		minor = getminor((dev_t)arg);
193 	int		rc = DDI_FAILURE;
194 
195 	switch (cmd) {
196 	case DDI_INFO_DEVT2DEVINFO:
197 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
198 			*(dev_info_t **)resp = devinfo;
199 			rc = DDI_SUCCESS;
200 		}
201 		break;
202 	case DDI_INFO_DEVT2INSTANCE:
203 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
204 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
205 			rc = DDI_SUCCESS;
206 		} else if (minor > DLS_MAX_MINOR &&
207 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
208 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
209 			rc = DDI_SUCCESS;
210 		}
211 		break;
212 	}
213 	return (rc);
214 }
215 
216 void *
217 dld_str_private(queue_t *q)
218 {
219 	return (((dld_str_t *)(q->q_ptr))->ds_private);
220 }
221 
222 int
223 dld_str_open(queue_t *rq, dev_t *devp, void *private)
224 {
225 	dld_str_t	*dsp;
226 	major_t		major;
227 	minor_t		minor;
228 	int		err;
229 
230 	major = getmajor(*devp);
231 	minor = getminor(*devp);
232 
233 	/*
234 	 * Create a new dld_str_t for the stream. This will grab a new minor
235 	 * number that will be handed back in the cloned dev_t.  Creation may
236 	 * fail if we can't allocate the dummy mblk used for flow-control.
237 	 */
238 	dsp = dld_str_create(rq, DLD_DLPI, major,
239 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
240 	if (dsp == NULL)
241 		return (ENOSR);
242 
243 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
244 	dsp->ds_private = private;
245 	if (minor != 0) {
246 		/*
247 		 * Style 1 open
248 		 */
249 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
250 			goto failed;
251 
252 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
253 	} else {
254 		(void) qassociate(rq, -1);
255 	}
256 
257 	/*
258 	 * Enable the queue srv(9e) routine.
259 	 */
260 	qprocson(rq);
261 
262 	/*
263 	 * Construct a cloned dev_t to hand back.
264 	 */
265 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
266 	return (0);
267 
268 failed:
269 	dld_str_destroy(dsp);
270 	return (err);
271 }
272 
273 int
274 dld_str_close(queue_t *rq)
275 {
276 	dld_str_t	*dsp = rq->q_ptr;
277 
278 	/*
279 	 * All modules on top have been popped off. So there can't be any
280 	 * threads from the top.
281 	 */
282 	ASSERT(dsp->ds_datathr_cnt == 0);
283 
284 	/*
285 	 * Wait until pending DLPI requests are processed.
286 	 */
287 	mutex_enter(&dsp->ds_lock);
288 	while (dsp->ds_dlpi_pending)
289 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
290 	mutex_exit(&dsp->ds_lock);
291 
292 
293 	/*
294 	 * This stream was open to a provider node. Check to see
295 	 * if it has been cleanly shut down.
296 	 */
297 	if (dsp->ds_dlstate != DL_UNATTACHED) {
298 		/*
299 		 * The stream is either open to a style 1 provider or
300 		 * this is not clean shutdown. Detach from the PPA.
301 		 * (This is still ok even in the style 1 case).
302 		 */
303 		dld_str_detach(dsp);
304 	}
305 
306 	dld_str_destroy(dsp);
307 	return (0);
308 }
309 
310 /*
311  * qi_qopen: open(9e)
312  */
313 /*ARGSUSED*/
314 int
315 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
316 {
317 	if (sflag == MODOPEN)
318 		return (ENOTSUP);
319 
320 	/*
321 	 * This is a cloning driver and therefore each queue should only
322 	 * ever get opened once.
323 	 */
324 	if (rq->q_ptr != NULL)
325 		return (EBUSY);
326 
327 	return (dld_str_open(rq, devp, NULL));
328 }
329 
330 /*
331  * qi_qclose: close(9e)
332  */
333 int
334 dld_close(queue_t *rq)
335 {
336 	/*
337 	 * Disable the queue srv(9e) routine.
338 	 */
339 	qprocsoff(rq);
340 
341 	return (dld_str_close(rq));
342 }
343 
344 /*
345  * qi_qputp: put(9e)
346  */
347 void
348 dld_wput(queue_t *wq, mblk_t *mp)
349 {
350 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
351 	dld_str_mode_t	mode;
352 
353 	switch (DB_TYPE(mp)) {
354 	case M_DATA:
355 		mutex_enter(&dsp->ds_lock);
356 		mode = dsp->ds_mode;
357 		if ((dsp->ds_dlstate != DL_IDLE) ||
358 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
359 			mutex_exit(&dsp->ds_lock);
360 			freemsg(mp);
361 			break;
362 		}
363 
364 		DLD_DATATHR_INC(dsp);
365 		mutex_exit(&dsp->ds_lock);
366 		if (mode == DLD_FASTPATH) {
367 			if (dsp->ds_mip->mi_media == DL_ETHER &&
368 			    (MBLKL(mp) < sizeof (struct ether_header))) {
369 				freemsg(mp);
370 			} else {
371 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
372 			}
373 		} else {
374 			str_mdata_raw_put(dsp, mp);
375 		}
376 		DLD_DATATHR_DCR(dsp);
377 		break;
378 	case M_PROTO:
379 	case M_PCPROTO: {
380 		t_uscalar_t	prim;
381 
382 		if (MBLKL(mp) < sizeof (t_uscalar_t))
383 			break;
384 
385 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
386 
387 		if (prim == DL_UNITDATA_REQ) {
388 			proto_unitdata_req(dsp, mp);
389 		} else {
390 			dld_wput_nondata(dsp, mp);
391 		}
392 		break;
393 	}
394 
395 	case M_IOCTL:
396 		dld_wput_nondata(dsp, mp);
397 		break;
398 
399 	case M_FLUSH:
400 		if (*mp->b_rptr & FLUSHW) {
401 			DLD_CLRQFULL(dsp);
402 			*mp->b_rptr &= ~FLUSHW;
403 		}
404 
405 		if (*mp->b_rptr & FLUSHR) {
406 			qreply(wq, mp);
407 		} else {
408 			freemsg(mp);
409 		}
410 		break;
411 
412 	default:
413 		freemsg(mp);
414 		break;
415 	}
416 }
417 
418 /*
419  * qi_srvp: srv(9e)
420  */
421 void
422 dld_wsrv(queue_t *wq)
423 {
424 	dld_str_t	*dsp = wq->q_ptr;
425 
426 	DLD_CLRQFULL(dsp);
427 }
428 
429 void
430 dld_init_ops(struct dev_ops *ops, const char *name)
431 {
432 	struct streamtab *stream;
433 	struct qinit *rq, *wq;
434 	struct module_info *modinfo;
435 
436 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
437 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
438 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
439 	modinfo->mi_minpsz = 0;
440 	modinfo->mi_maxpsz = 64*1024;
441 	modinfo->mi_hiwat  = 1;
442 	modinfo->mi_lowat = 0;
443 
444 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
445 	rq->qi_qopen = dld_open;
446 	rq->qi_qclose = dld_close;
447 	rq->qi_minfo = modinfo;
448 
449 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
450 	wq->qi_putp = (pfi_t)dld_wput;
451 	wq->qi_srvp = (pfi_t)dld_wsrv;
452 	wq->qi_minfo = modinfo;
453 
454 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
455 	stream->st_rdinit = rq;
456 	stream->st_wrinit = wq;
457 	ops->devo_cb_ops->cb_str = stream;
458 
459 	if (ops->devo_getinfo == NULL)
460 		ops->devo_getinfo = &dld_getinfo;
461 }
462 
463 void
464 dld_fini_ops(struct dev_ops *ops)
465 {
466 	struct streamtab *stream;
467 	struct qinit *rq, *wq;
468 	struct module_info *modinfo;
469 
470 	stream = ops->devo_cb_ops->cb_str;
471 	rq = stream->st_rdinit;
472 	wq = stream->st_wrinit;
473 	modinfo = rq->qi_minfo;
474 	ASSERT(wq->qi_minfo == modinfo);
475 
476 	kmem_free(stream, sizeof (struct streamtab));
477 	kmem_free(wq, sizeof (struct qinit));
478 	kmem_free(rq, sizeof (struct qinit));
479 	kmem_free(modinfo->mi_idname, FMNAMESZ);
480 	kmem_free(modinfo, sizeof (struct module_info));
481 }
482 
483 /*
484  * Initialize this module's data structures.
485  */
486 void
487 dld_str_init(void)
488 {
489 	/*
490 	 * Create dld_str_t object cache.
491 	 */
492 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
493 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
494 	ASSERT(str_cachep != NULL);
495 
496 	/*
497 	 * Create a hash table for maintaining dld_str_t's.
498 	 * The ds_minor field (the clone minor number) of a dld_str_t
499 	 * is used as a key for this hash table because this number is
500 	 * globally unique (allocated from "dls_minor_arena").
501 	 */
502 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
503 	    mod_hash_null_valdtor);
504 
505 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
506 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
507 
508 	dld_taskq_quit = B_FALSE;
509 	dld_taskq_done = B_FALSE;
510 	list_create(&dld_taskq_list, sizeof (dld_str_t),
511 	    offsetof(dld_str_t, ds_tqlist));
512 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
513 	    &p0, TS_RUN, minclsyspri);
514 }
515 
/*
 * Tear down this module's data structures.
 *
 * Returns EBUSY, leaving everything intact, if any dld_str_t objects are
 * still outstanding (str_count != 0).  Returns 0 once the dld_taskq thread
 * has finished and the list, object cache and hash table are destroyed.
 */
int
dld_str_fini(void)
{
	/*
	 * Make sure that there are no objects in use.
	 */
	if (str_count != 0)
		return (EBUSY);

	/*
	 * Ask the dld_taskq thread to quit and wait for it to be done.
	 * The request is posted by setting dld_taskq_quit and signalling
	 * dld_taskq_cv; completion is observed through dld_taskq_done,
	 * which is re-checked under dld_taskq_lock after every wakeup.
	 */
	mutex_enter(&dld_taskq_lock);
	dld_taskq_quit = B_TRUE;
	cv_signal(&dld_taskq_cv);
	while (!dld_taskq_done)
		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
	mutex_exit(&dld_taskq_lock);
	list_destroy(&dld_taskq_list);
	/*
	 * Destroy object cache, then the hash table of dld_str_t's.
	 */
	kmem_cache_destroy(str_cachep);
	mod_hash_destroy_idhash(str_hashp);
	return (0);
}
545 
546 /*
547  * Create a new dld_str_t object.
548  */
549 dld_str_t *
550 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
551 {
552 	dld_str_t	*dsp;
553 	int		err;
554 
555 	/*
556 	 * Allocate an object from the cache.
557 	 */
558 	atomic_add_32(&str_count, 1);
559 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
560 
561 	/*
562 	 * Allocate the dummy mblk for flow-control.
563 	 */
564 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
565 	if (dsp->ds_tx_flow_mp == NULL) {
566 		kmem_cache_free(str_cachep, dsp);
567 		atomic_add_32(&str_count, -1);
568 		return (NULL);
569 	}
570 	dsp->ds_type = type;
571 	dsp->ds_major = major;
572 	dsp->ds_style = style;
573 
574 	/*
575 	 * Initialize the queue pointers.
576 	 */
577 	ASSERT(RD(rq) == rq);
578 	dsp->ds_rq = rq;
579 	dsp->ds_wq = WR(rq);
580 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
581 
582 	/*
583 	 * We want explicit control over our write-side STREAMS queue
584 	 * where the dummy mblk gets added/removed for flow-control.
585 	 */
586 	noenable(WR(rq));
587 
588 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
589 	    (mod_hash_val_t)dsp);
590 	ASSERT(err == 0);
591 	return (dsp);
592 }
593 
/*
 * Destroy a dld_str_t object.
 *
 * The caller must already have brought the stream back to its quiescent,
 * unattached state; that is only asserted here, not enforced.  The object
 * is unlinked from its queue pair, its per-open flags are reset so that
 * it is clean when the cache hands it out again, the flow-control mblk is
 * freed, and the object is removed from str_hashp and returned to the
 * cache.
 */
void
dld_str_destroy(dld_str_t *dsp)
{
	queue_t		*rq;
	queue_t		*wq;
	mod_hash_val_t	val;

	/*
	 * Clear the queue pointers.
	 */
	rq = dsp->ds_rq;
	wq = dsp->ds_wq;
	ASSERT(wq == WR(rq));
	rq->q_ptr = wq->q_ptr = NULL;
	dsp->ds_rq = dsp->ds_wq = NULL;

	/* The stream must be detached: no MAC/link state may remain. */
	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
	ASSERT(dsp->ds_sap == 0);
	ASSERT(dsp->ds_mh == NULL);
	ASSERT(dsp->ds_mch == NULL);
	ASSERT(dsp->ds_promisc == 0);
	ASSERT(dsp->ds_mph == NULL);
	ASSERT(dsp->ds_mip == NULL);
	ASSERT(dsp->ds_mnh == NULL);

	/* No capability may still be enabled. */
	ASSERT(dsp->ds_polling == B_FALSE);
	ASSERT(dsp->ds_direct == B_FALSE);
	ASSERT(dsp->ds_lso == B_FALSE);
	ASSERT(dsp->ds_lso_max == 0);
	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);

	/*
	 * Reinitialize all the flags.
	 */
	dsp->ds_notifications = 0;
	dsp->ds_passivestate = DLD_UNINITIALIZED;
	dsp->ds_mode = DLD_UNITDATA;
	dsp->ds_native = B_FALSE;

	/* No data thread and no DLPI request may be outstanding. */
	ASSERT(dsp->ds_datathr_cnt == 0);
	ASSERT(dsp->ds_pending_head == NULL);
	ASSERT(dsp->ds_pending_tail == NULL);
	ASSERT(!dsp->ds_dlpi_pending);

	ASSERT(dsp->ds_dlp == NULL);
	ASSERT(dsp->ds_dmap == NULL);
	ASSERT(dsp->ds_rx == NULL);
	ASSERT(dsp->ds_rx_arg == NULL);
	ASSERT(dsp->ds_next == NULL);
	ASSERT(dsp->ds_head == NULL);

	/*
	 * Free the dummy mblk if exists.  str_destructor() asserts that
	 * ds_tx_flow_mp is NULL, so the pointer is cleared as well.
	 */
	if (dsp->ds_tx_flow_mp != NULL) {
		freeb(dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	/* Undo the mod_hash_insert() done in dld_str_create(). */
	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
	ASSERT(dsp == (dld_str_t *)val);

	/*
	 * Free the object back to the cache.
	 */
	kmem_cache_free(str_cachep, dsp);
	atomic_add_32(&str_count, -1);
}
665 
666 /*
667  * kmem_cache contructor function: see kmem_cache_create(9f).
668  */
669 /*ARGSUSED*/
670 static int
671 str_constructor(void *buf, void *cdrarg, int kmflags)
672 {
673 	dld_str_t	*dsp = buf;
674 
675 	bzero(buf, sizeof (dld_str_t));
676 
677 	/*
678 	 * Allocate a new minor number.
679 	 */
680 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
681 		return (-1);
682 
683 	/*
684 	 * Initialize the DLPI state machine.
685 	 */
686 	dsp->ds_dlstate = DL_UNATTACHED;
687 
688 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
689 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
690 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
691 
692 	return (0);
693 }
694 
695 /*
696  * kmem_cache destructor function.
697  */
698 /*ARGSUSED*/
699 static void
700 str_destructor(void *buf, void *cdrarg)
701 {
702 	dld_str_t	*dsp = buf;
703 
704 	/*
705 	 * Release the minor number.
706 	 */
707 	mac_minor_rele(dsp->ds_minor);
708 
709 	ASSERT(dsp->ds_tx_flow_mp == NULL);
710 
711 	mutex_destroy(&dsp->ds_lock);
712 	cv_destroy(&dsp->ds_datathr_cv);
713 	cv_destroy(&dsp->ds_dlpi_pending_cv);
714 }
715 
/*
 * Update the priority bits and VID (may need to insert a tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
 * If pri is 0, keep the priority encoded in the packet.
 *
 * At least one of pri and vid must be set (asserted below).
 *
 * Returns the message to transmit, which may be a different mblk than
 * the one passed in.  Returns NULL if a needed allocation fails or the
 * tagged header cannot be pulled up into the first mblk; in that case
 * mp has NOT been freed and remains the caller's responsibility.
 */
static mblk_t *
i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
    link_tagmode_t tagmode)
{
	mblk_t *hmp;
	struct ether_vlan_header *evhp;
	struct ether_header *ehp;
	uint16_t old_tci = 0;
	size_t len;

	ASSERT(pri != 0 || vid != VLAN_ID_NONE);

	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		/*
		 * Tagged packet, update the priority bits.
		 */
		len = sizeof (struct ether_vlan_header);

		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
			/*
			 * In case some drivers only check the db_ref
			 * count of the first mblk, we pullup the
			 * message into a single mblk.
			 */
			hmp = msgpullup(mp, -1);
			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
				/* Only the copy is freed; mp is untouched. */
				freemsg(hmp);
				return (NULL);
			} else {
				freemsg(mp);
				mp = hmp;
			}
		}

		evhp = (struct ether_vlan_header *)mp->b_rptr;
		old_tci = ntohs(evhp->ether_tci);
	} else {
		/*
		 * Untagged packet.  Two factors will cause us to insert a
		 * VLAN header:
		 * - This is a VLAN link (vid is specified)
		 * - The link supports user priority tagging and the priority
		 *   is non-zero.
		 */
		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
			return (mp);

		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
		if (hmp == NULL)
			return (NULL);

		evhp = (struct ether_vlan_header *)hmp->b_rptr;
		ehp = (struct ether_header *)mp->b_rptr;

		/*
		 * Copy the MAC addresses and typelen
		 */
		bcopy(ehp, evhp, (ETHERADDRL * 2));
		evhp->ether_type = ehp->ether_type;
		evhp->ether_tpid = htons(ETHERTYPE_VLAN);

		/*
		 * The new mblk now holds the complete VLAN header; step
		 * over the original Ethernet header in the old one.
		 */
		hmp->b_wptr += sizeof (struct ether_vlan_header);
		mp->b_rptr += sizeof (struct ether_header);

		/*
		 * Free the original message if it's now empty. Link the
		 * rest of the messages to the header message.
		 */
		if (MBLKL(mp) == 0) {
			hmp->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			hmp->b_cont = mp;
		}
		mp = hmp;
	}

	/*
	 * Fall back to the values from the original tag.  old_tci is 0 for
	 * a packet that was untagged on entry.
	 */
	if (pri == 0)
		pri = VLAN_PRI(old_tci);
	if (vid == VLAN_ID_NONE)
		vid = VLAN_ID(old_tci);
	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
	return (mp);
}
806 
807 /*
808  * M_DATA put (IP fast-path mode)
809  */
810 mac_tx_cookie_t
811 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
812     uint16_t flag)
813 {
814 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
815 	mblk_t *newmp;
816 	uint_t pri;
817 	mac_tx_cookie_t cookie;
818 
819 	if (is_ethernet) {
820 		/*
821 		 * Update the priority bits to the assigned priority.
822 		 */
823 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
824 
825 		if (pri != 0) {
826 			newmp = i_dld_ether_header_update_tag(mp, pri,
827 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
828 			if (newmp == NULL)
829 				goto discard;
830 			mp = newmp;
831 		}
832 	}
833 
834 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
835 		DLD_SETQFULL(dsp);
836 	}
837 	return (cookie);
838 
839 discard:
840 	/* TODO: bump kstat? */
841 	freemsg(mp);
842 	return (NULL);
843 }
844 
/*
 * M_DATA put (DLIOCRAW mode)
 *
 * Validates a raw-mode packet and passes it to DLD_TX().  The packet is
 * silently dropped (freed) if the MAC header cannot be cooked or parsed,
 * if any continuation block is not M_DATA, if the packet is too large,
 * if its VLAN tag is not acceptable for this stream, or if the tag
 * cannot be updated.
 */
static void
str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
{
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *bp, *newmp;
	size_t size;
	mac_header_info_t mhi;
	uint_t pri, vid, dvid;
	uint_t max_sdu;

	/*
	 * Certain MAC type plugins provide an illusion for raw DLPI
	 * consumers.  They pretend that the MAC layer is something that
	 * it's not for the benefit of observability tools.  For example,
	 * mac_wifi pretends that it's Ethernet for such consumers.
	 * Here, unless native mode is enabled, we call into the MAC layer so
	 * that this illusion can be maintained.  The plugin will optionally
	 * transform the MAC header here into something that can be passed
	 * down.  The header goes from raw mode to "cooked" mode.
	 */
	if (!dsp->ds_native) {
		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
			goto discard;
		mp = newmp;
	}

	size = MBLKL(mp);

	/*
	 * Check the packet is not too big and that any remaining
	 * fragment list is composed entirely of M_DATA messages. (We
	 * know the first fragment was M_DATA otherwise we could not
	 * have got here).
	 */
	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
		if (DB_TYPE(bp) != M_DATA)
			goto discard;
		size += MBLKL(bp);
	}

	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
		goto discard;

	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
	/*
	 * If LSO is enabled, check the size against lso_max. Otherwise,
	 * compare the packet size with max_sdu.  Either way the limit
	 * applies to the payload, so the header size is added to it.
	 */
	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
	if (size > max_sdu + mhi.mhi_hdrsize)
		goto discard;

	if (is_ethernet) {
		dvid = mac_client_vid(dsp->ds_mch);

		/*
		 * Discard the packet if this is a VLAN stream (dvid is set)
		 * and the packet carries a VID of its own.
		 */
		vid = VLAN_ID(mhi.mhi_tci);
		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
			goto discard;

		/*
		 * Discard the packet if this packet is a tagged packet
		 * but both pri and VID are 0.
		 */
		pri = VLAN_PRI(mhi.mhi_tci);
		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
			goto discard;

		/*
		 * Update the priority bits to the per-stream priority if
		 * priority is not set in the packet. Update the VID for
		 * packets on a VLAN stream.
		 *
		 * A packet that already carries a priority gets pri = 0
		 * here, which makes i_dld_ether_header_update_tag() keep
		 * the priority found in the packet.
		 */
		pri = (pri == 0) ? dsp->ds_pri : 0;
		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
				goto discard;
			}
			mp = newmp;
		}
	}

	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
		/* Turn on flow-control for dld */
		DLD_SETQFULL(dsp);
	}
	return;

discard:
	/* TODO: bump kstat? */
	freemsg(mp);
}
944 
/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 *
 * Attaches the stream to the link identified by ppa: holds the devnet
 * entry for the dev_t (ds_major, ppa + 1), enters the MAC perimeter,
 * holds the dls link, opens it on behalf of dsp and registers
 * str_notify() for MAC notifications.
 *
 * Returns 0 on success, with the stream in DL_UNBOUND state.  On failure
 * an errno value is returned and everything acquired up to the point of
 * failure is released again.
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t			dev;
	int			err;
	const char		*drvname;
	mac_perim_handle_t	mph = NULL;
	boolean_t		qassociated = B_FALSE;
	dls_link_t		*dlp = NULL;
	dls_dl_handle_t		ddp = NULL;

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
		return (ENOTSUP);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.  Style 2 streams of the "aggr" and "vnic"
	 * drivers are not associated with an instance here.
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
		goto failed;

	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
		goto failed;

	/*
	 * Open a channel.
	 */
	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
		goto failed;

	if ((err = dls_open(dlp, ddp, dsp)) != 0)
		goto failed;

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
	dsp->ds_dlstate = DL_UNBOUND;
	mac_perim_exit(mph);
	return (0);

failed:
	/*
	 * dlp, mph and ddp are still NULL for any step that was never
	 * reached, so only what was actually acquired is released.
	 */
	if (dlp != NULL)
		dls_link_rele(dlp);
	if (mph != NULL)
		mac_perim_exit(mph);
	if (ddp != NULL)
		dls_devnet_rele(ddp);
	if (qassociated)
		(void) qassociate(dsp->ds_wq, -1);

	return (err);
}
1017 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Undoes dld_str_attach(): removes the MAC notification callback,
 * disables the capabilities, closes the dls channel, drops the devnet
 * hold and returns the stream to DL_UNATTACHED.  No data thread may be
 * active on the stream when this is called (asserted below).
 */
void
dld_str_detach(dld_str_t *dsp)
{
	mac_perim_handle_t	mph;
	int			err;

	ASSERT(dsp->ds_datathr_cnt == 0);

	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
	/*
	 * Remove the notify function.
	 *
	 * Note that we cannot wait for the notification callback to be removed
	 * since it could cause the deadlock with str_notify() since they both
	 * need the mac perimeter. Continue if we cannot remove the
	 * notification callback right now and wait after we leave the
	 * perimeter.
	 */
	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
	dsp->ds_mnh = NULL;

	/*
	 * Disable the capabilities
	 */
	dld_capabilities_disable(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp);
	mac_perim_exit(mph);

	/*
	 * Now we leave the mac perimeter. If mac_notify_remove() failed
	 * because the notification callback was in progress, wait for
	 * it to finish before we proceed.
	 */
	if (err != 0)
		mac_notify_remove_wait(dsp->ds_mh);

	/*
	 * An unreferenced tagged (non-persistent) vlan gets destroyed
	 * automatically in the call to dls_devnet_rele.
	 */
	dls_devnet_rele(dsp->ds_ddh);

	/* Clear the per-attachment state that dld_str_destroy() asserts on. */
	dsp->ds_sap = 0;
	dsp->ds_mh = NULL;
	dsp->ds_mch = NULL;
	dsp->ds_mip = NULL;

	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;
}
1084 
1085 /*
1086  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1087  * tags before sending packets up to the DLS clients, with the exception of
1088  * special priority tagged packets, in that case, we set the VID to 0.
1089  * mp must be a VLAN tagged packet.
1090  */
1091 static mblk_t *
1092 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1093 {
1094 	mblk_t *newmp;
1095 	struct ether_vlan_header *evhp;
1096 	uint16_t tci, new_tci;
1097 
1098 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1099 	if (DB_REF(mp) > 1) {
1100 		newmp = copymsg(mp);
1101 		if (newmp == NULL)
1102 			return (NULL);
1103 		freemsg(mp);
1104 		mp = newmp;
1105 	}
1106 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1107 
1108 	tci = ntohs(evhp->ether_tci);
1109 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1110 		/*
1111 		 * Priority is 0, strip the tag.
1112 		 */
1113 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1114 		mp->b_rptr += VLAN_TAGSZ;
1115 	} else {
1116 		/*
1117 		 * Priority is not 0, update the VID to 0.
1118 		 */
1119 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1120 		evhp->ether_tci = htons(new_tci);
1121 	}
1122 	return (mp);
1123 }
1124 
1125 /*
1126  * Raw mode receive function.
1127  */
1128 /*ARGSUSED*/
1129 void
1130 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1131     mac_header_info_t *mhip)
1132 {
1133 	dld_str_t *dsp = (dld_str_t *)arg;
1134 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1135 	mblk_t *next, *newmp;
1136 
1137 	ASSERT(mp != NULL);
1138 	do {
1139 		/*
1140 		 * Get the pointer to the next packet in the chain and then
1141 		 * clear b_next before the packet gets passed on.
1142 		 */
1143 		next = mp->b_next;
1144 		mp->b_next = NULL;
1145 
1146 		/*
1147 		 * Wind back b_rptr to point at the MAC header.
1148 		 */
1149 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1150 		mp->b_rptr -= mhip->mhi_hdrsize;
1151 
1152 		/*
1153 		 * Certain MAC type plugins provide an illusion for raw
1154 		 * DLPI consumers.  They pretend that the MAC layer is
1155 		 * something that it's not for the benefit of observability
1156 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1157 		 * for such consumers.	Here, unless native mode is enabled,
1158 		 * we call into the MAC layer so that this illusion can be
1159 		 * maintained.	The plugin will optionally transform the MAC
1160 		 * header here into something that can be passed up to raw
1161 		 * consumers.  The header goes from "cooked" mode to raw mode.
1162 		 */
1163 		if (!dsp->ds_native) {
1164 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1165 			if (newmp == NULL) {
1166 				freemsg(mp);
1167 				goto next;
1168 			}
1169 			mp = newmp;
1170 		}
1171 
1172 		/*
1173 		 * Strip the VLAN tag for VLAN streams.
1174 		 */
1175 		if (is_ethernet &&
1176 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1177 			/*
1178 			 * The priority should be kept only for VLAN
1179 			 * data-links.
1180 			 */
1181 			newmp = i_dld_ether_header_strip_tag(mp,
1182 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1183 			if (newmp == NULL) {
1184 				freemsg(mp);
1185 				goto next;
1186 			}
1187 			mp = newmp;
1188 		}
1189 
1190 		/*
1191 		 * Pass the packet on.
1192 		 */
1193 		if (canputnext(dsp->ds_rq))
1194 			putnext(dsp->ds_rq, mp);
1195 		else
1196 			freemsg(mp);
1197 
1198 next:
1199 		/*
1200 		 * Move on to the next packet in the chain.
1201 		 */
1202 		mp = next;
1203 	} while (mp != NULL);
1204 }
1205 
1206 /*
1207  * Fast-path receive function.
1208  */
1209 /*ARGSUSED*/
1210 void
1211 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1212     mac_header_info_t *mhip)
1213 {
1214 	dld_str_t *dsp = (dld_str_t *)arg;
1215 	mblk_t *next;
1216 	size_t offset = 0;
1217 
1218 	/*
1219 	 * MAC header stripping rules:
1220 	 *    - Tagged packets:
1221 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1222 	 *	b. Physical streams
1223 	 *	- VLAN packets (non-zero VID). The stream must be either a
1224 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1225 	 *	  Strip the Ethernet header but keep the VLAN header.
1226 	 *	- Special tagged packets (zero VID)
1227 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1228 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1229 	 *	    keep the VLAN header.
1230 	 *	  * Otherwise, strip the whole VLAN header.
1231 	 *    - Untagged packets. Strip the whole MAC header.
1232 	 */
1233 	if (mhip->mhi_istagged &&
1234 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1235 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1236 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1237 		offset = VLAN_TAGSZ;
1238 	}
1239 
1240 	ASSERT(mp != NULL);
1241 	do {
1242 		/*
1243 		 * Get the pointer to the next packet in the chain and then
1244 		 * clear b_next before the packet gets passed on.
1245 		 */
1246 		next = mp->b_next;
1247 		mp->b_next = NULL;
1248 
1249 		/*
1250 		 * Wind back b_rptr to point at the VLAN header.
1251 		 */
1252 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1253 		mp->b_rptr -= offset;
1254 
1255 		/*
1256 		 * Pass the packet on.
1257 		 */
1258 		if (canputnext(dsp->ds_rq))
1259 			putnext(dsp->ds_rq, mp);
1260 		else
1261 			freemsg(mp);
1262 		/*
1263 		 * Move on to the next packet in the chain.
1264 		 */
1265 		mp = next;
1266 	} while (mp != NULL);
1267 }
1268 
1269 /*
1270  * Default receive function (send DL_UNITDATA_IND messages).
1271  */
1272 /*ARGSUSED*/
1273 void
1274 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1275     mac_header_info_t *mhip)
1276 {
1277 	dld_str_t		*dsp = (dld_str_t *)arg;
1278 	mblk_t			*ud_mp;
1279 	mblk_t			*next;
1280 	size_t			offset = 0;
1281 	boolean_t		strip_vlan = B_TRUE;
1282 
1283 	/*
1284 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1285 	 */
1286 	if (mhip->mhi_istagged &&
1287 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1288 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1289 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1290 		offset = VLAN_TAGSZ;
1291 		strip_vlan = B_FALSE;
1292 	}
1293 
1294 	ASSERT(mp != NULL);
1295 	do {
1296 		/*
1297 		 * Get the pointer to the next packet in the chain and then
1298 		 * clear b_next before the packet gets passed on.
1299 		 */
1300 		next = mp->b_next;
1301 		mp->b_next = NULL;
1302 
1303 		/*
1304 		 * Wind back b_rptr to point at the MAC header.
1305 		 */
1306 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1307 		mp->b_rptr -= mhip->mhi_hdrsize;
1308 
1309 		/*
1310 		 * Create the DL_UNITDATA_IND M_PROTO.
1311 		 */
1312 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1313 			freemsgchain(mp);
1314 			return;
1315 		}
1316 
1317 		/*
1318 		 * Advance b_rptr to point at the payload (or the VLAN header).
1319 		 */
1320 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1321 
1322 		/*
1323 		 * Prepend the DL_UNITDATA_IND.
1324 		 */
1325 		ud_mp->b_cont = mp;
1326 
1327 		/*
1328 		 * Send the message.
1329 		 */
1330 		if (canputnext(dsp->ds_rq))
1331 			putnext(dsp->ds_rq, ud_mp);
1332 		else
1333 			freemsg(ud_mp);
1334 
1335 		/*
1336 		 * Move on to the next packet in the chain.
1337 		 */
1338 		mp = next;
1339 	} while (mp != NULL);
1340 }
1341 
1342 /*
1343  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1344  */
1345 static void
1346 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1347 {
1348 	mblk_t		*mp;
1349 	dl_notify_ind_t *dlip;
1350 
1351 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1352 		return;
1353 
1354 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1355 	    M_PROTO, 0)) == NULL)
1356 		return;
1357 
1358 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1359 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1360 	dlip->dl_primitive = DL_NOTIFY_IND;
1361 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1362 	dlip->dl_data = max_sdu;
1363 
1364 	qreply(dsp->ds_wq, mp);
1365 }
1366 
1367 /*
1368  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1369  * current state of the interface.
1370  */
1371 void
1372 dld_str_notify_ind(dld_str_t *dsp)
1373 {
1374 	mac_notify_type_t	type;
1375 
1376 	for (type = 0; type < MAC_NNOTE; type++)
1377 		str_notify(dsp, type);
1378 }
1379 
1380 typedef struct dl_unitdata_ind_wrapper {
1381 	dl_unitdata_ind_t	dl_unitdata;
1382 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1383 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1384 } dl_unitdata_ind_wrapper_t;
1385 
1386 /*
1387  * Create a DL_UNITDATA_IND M_PROTO message.
1388  */
1389 static mblk_t *
1390 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1391 {
1392 	mblk_t				*nmp;
1393 	dl_unitdata_ind_wrapper_t	*dlwp;
1394 	dl_unitdata_ind_t		*dlp;
1395 	mac_header_info_t		mhi;
1396 	uint_t				addr_length;
1397 	uint8_t				*daddr;
1398 	uint8_t				*saddr;
1399 
1400 	/*
1401 	 * Get the packet header information.
1402 	 */
1403 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
1404 		return (NULL);
1405 
1406 	/*
1407 	 * Allocate a message large enough to contain the wrapper structure
1408 	 * defined above.
1409 	 */
1410 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1411 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1412 	    DL_UNITDATA_IND)) == NULL)
1413 		return (NULL);
1414 
1415 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1416 
1417 	dlp = &(dlwp->dl_unitdata);
1418 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1419 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1420 
1421 	/*
1422 	 * Copy in the destination address.
1423 	 */
1424 	addr_length = dsp->ds_mip->mi_addr_length;
1425 	daddr = dlwp->dl_dest_addr;
1426 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1427 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1428 
1429 	/*
1430 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1431 	 */
1432 	if (mhi.mhi_istagged && !strip_vlan)
1433 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1434 	else
1435 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1436 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1437 
1438 	/*
1439 	 * If the destination address was multicast or broadcast then the
1440 	 * dl_group_address field should be non-zero.
1441 	 */
1442 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1443 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1444 
1445 	/*
1446 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1447 	 * for example) may not have access to source information.
1448 	 */
1449 	if (mhi.mhi_saddr == NULL) {
1450 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1451 	} else {
1452 		saddr = dlwp->dl_src_addr;
1453 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1454 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1455 
1456 		/*
1457 		 * Set the source DLSAP to the packet ethertype.
1458 		 */
1459 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1460 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1461 	}
1462 
1463 	return (nmp);
1464 }
1465 
1466 /*
1467  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1468  */
1469 static void
1470 str_notify_promisc_on_phys(dld_str_t *dsp)
1471 {
1472 	mblk_t		*mp;
1473 	dl_notify_ind_t	*dlip;
1474 
1475 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1476 		return;
1477 
1478 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1479 	    M_PROTO, 0)) == NULL)
1480 		return;
1481 
1482 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1483 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1484 	dlip->dl_primitive = DL_NOTIFY_IND;
1485 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1486 
1487 	qreply(dsp->ds_wq, mp);
1488 }
1489 
1490 /*
1491  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1492  */
1493 static void
1494 str_notify_promisc_off_phys(dld_str_t *dsp)
1495 {
1496 	mblk_t		*mp;
1497 	dl_notify_ind_t	*dlip;
1498 
1499 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1500 		return;
1501 
1502 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1503 	    M_PROTO, 0)) == NULL)
1504 		return;
1505 
1506 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1507 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1508 	dlip->dl_primitive = DL_NOTIFY_IND;
1509 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1510 
1511 	qreply(dsp->ds_wq, mp);
1512 }
1513 
1514 /*
1515  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1516  */
1517 static void
1518 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1519 {
1520 	mblk_t		*mp;
1521 	dl_notify_ind_t	*dlip;
1522 	uint_t		addr_length;
1523 	uint16_t	ethertype;
1524 
1525 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1526 		return;
1527 
1528 	addr_length = dsp->ds_mip->mi_addr_length;
1529 	if ((mp = mexchange(dsp->ds_wq, NULL,
1530 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1531 	    M_PROTO, 0)) == NULL)
1532 		return;
1533 
1534 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1535 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1536 	dlip->dl_primitive = DL_NOTIFY_IND;
1537 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1538 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1539 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1540 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1541 
1542 	bcopy(addr, &dlip[1], addr_length);
1543 
1544 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1545 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1546 
1547 	qreply(dsp->ds_wq, mp);
1548 }
1549 
1550 /*
1551  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1552  */
1553 static void
1554 str_notify_link_up(dld_str_t *dsp)
1555 {
1556 	mblk_t		*mp;
1557 	dl_notify_ind_t	*dlip;
1558 
1559 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1560 		return;
1561 
1562 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1563 	    M_PROTO, 0)) == NULL)
1564 		return;
1565 
1566 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1567 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1568 	dlip->dl_primitive = DL_NOTIFY_IND;
1569 	dlip->dl_notification = DL_NOTE_LINK_UP;
1570 
1571 	qreply(dsp->ds_wq, mp);
1572 }
1573 
1574 /*
1575  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1576  */
1577 static void
1578 str_notify_link_down(dld_str_t *dsp)
1579 {
1580 	mblk_t		*mp;
1581 	dl_notify_ind_t	*dlip;
1582 
1583 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1584 		return;
1585 
1586 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1587 	    M_PROTO, 0)) == NULL)
1588 		return;
1589 
1590 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1591 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1592 	dlip->dl_primitive = DL_NOTIFY_IND;
1593 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1594 
1595 	qreply(dsp->ds_wq, mp);
1596 }
1597 
1598 /*
1599  * DL_NOTIFY_IND: DL_NOTE_SPEED
1600  */
1601 static void
1602 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1603 {
1604 	mblk_t		*mp;
1605 	dl_notify_ind_t	*dlip;
1606 
1607 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1608 		return;
1609 
1610 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1611 	    M_PROTO, 0)) == NULL)
1612 		return;
1613 
1614 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1615 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1616 	dlip->dl_primitive = DL_NOTIFY_IND;
1617 	dlip->dl_notification = DL_NOTE_SPEED;
1618 	dlip->dl_data = speed;
1619 
1620 	qreply(dsp->ds_wq, mp);
1621 }
1622 
1623 /*
1624  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1625  */
1626 static void
1627 str_notify_capab_reneg(dld_str_t *dsp)
1628 {
1629 	mblk_t		*mp;
1630 	dl_notify_ind_t	*dlip;
1631 
1632 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1633 		return;
1634 
1635 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1636 	    M_PROTO, 0)) == NULL)
1637 		return;
1638 
1639 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1640 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1641 	dlip->dl_primitive = DL_NOTIFY_IND;
1642 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1643 
1644 	qreply(dsp->ds_wq, mp);
1645 }
1646 
1647 /*
1648  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1649  */
1650 static void
1651 str_notify_fastpath_flush(dld_str_t *dsp)
1652 {
1653 	mblk_t		*mp;
1654 	dl_notify_ind_t	*dlip;
1655 
1656 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1657 		return;
1658 
1659 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1660 	    M_PROTO, 0)) == NULL)
1661 		return;
1662 
1663 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1664 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1665 	dlip->dl_primitive = DL_NOTIFY_IND;
1666 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1667 
1668 	qreply(dsp->ds_wq, mp);
1669 }
1670 
1671 /*
1672  * MAC notification callback.
1673  */
1674 void
1675 str_notify(void *arg, mac_notify_type_t type)
1676 {
1677 	dld_str_t		*dsp = (dld_str_t *)arg;
1678 	queue_t			*q = dsp->ds_wq;
1679 	mac_handle_t		mh = dsp->ds_mh;
1680 	mac_client_handle_t	mch = dsp->ds_mch;
1681 	uint8_t			addr[MAXMACADDRLEN];
1682 
1683 	switch (type) {
1684 	case MAC_NOTE_TX:
1685 		qenable(q);
1686 		break;
1687 
1688 	case MAC_NOTE_DEVPROMISC:
1689 		/*
1690 		 * Send the appropriate DL_NOTIFY_IND.
1691 		 */
1692 		if (mac_promisc_get(mh))
1693 			str_notify_promisc_on_phys(dsp);
1694 		else
1695 			str_notify_promisc_off_phys(dsp);
1696 		break;
1697 
1698 	case MAC_NOTE_UNICST:
1699 		/*
1700 		 * This notification is sent whenever the MAC unicast
1701 		 * address changes.
1702 		 */
1703 		mac_unicast_primary_get(mh, addr);
1704 
1705 		/*
1706 		 * Send the appropriate DL_NOTIFY_IND.
1707 		 */
1708 		str_notify_phys_addr(dsp, addr);
1709 		break;
1710 
1711 	case MAC_NOTE_LINK:
1712 		/*
1713 		 * This notification is sent every time the MAC driver
1714 		 * updates the link state.
1715 		 */
1716 		switch (mac_client_stat_get(mch, MAC_STAT_LINK_STATE)) {
1717 		case LINK_STATE_UP: {
1718 			uint64_t speed;
1719 			/*
1720 			 * The link is up so send the appropriate
1721 			 * DL_NOTIFY_IND.
1722 			 */
1723 			str_notify_link_up(dsp);
1724 
1725 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1726 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1727 			break;
1728 		}
1729 		case LINK_STATE_DOWN:
1730 			/*
1731 			 * The link is down so send the appropriate
1732 			 * DL_NOTIFY_IND.
1733 			 */
1734 			str_notify_link_down(dsp);
1735 			break;
1736 
1737 		default:
1738 			break;
1739 		}
1740 		break;
1741 
1742 	case MAC_NOTE_CAPAB_CHG:
1743 		/*
1744 		 * This notification is sent whenever the MAC resources
1745 		 * change or capabilities change. We need to renegotiate
1746 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1747 		 */
1748 		str_notify_capab_reneg(dsp);
1749 		break;
1750 
1751 	case MAC_NOTE_SDU_SIZE: {
1752 		uint_t  max_sdu;
1753 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1754 		str_notify_sdu_size(dsp, max_sdu);
1755 		break;
1756 	}
1757 
1758 	case MAC_NOTE_FASTPATH_FLUSH:
1759 		str_notify_fastpath_flush(dsp);
1760 		break;
1761 
1762 	case MAC_NOTE_MARGIN:
1763 		break;
1764 
1765 	default:
1766 		ASSERT(B_FALSE);
1767 		break;
1768 	}
1769 }
1770 
1771 /*
1772  * This function is called via a taskq mechansim to process all control
1773  * messages on a per 'dsp' end point.
1774  */
1775 static void
1776 dld_wput_nondata_task(void *arg)
1777 {
1778 	dld_str_t	*dsp = arg;
1779 	mblk_t		*mp;
1780 
1781 	mutex_enter(&dsp->ds_lock);
1782 	while (dsp->ds_pending_head != NULL) {
1783 		mp = dsp->ds_pending_head;
1784 		dsp->ds_pending_head = mp->b_next;
1785 		mp->b_next = NULL;
1786 		if (dsp->ds_pending_head == NULL)
1787 			dsp->ds_pending_tail = NULL;
1788 		mutex_exit(&dsp->ds_lock);
1789 
1790 		switch (DB_TYPE(mp)) {
1791 		case M_PROTO:
1792 		case M_PCPROTO:
1793 			dld_proto(dsp, mp);
1794 			break;
1795 		case M_IOCTL:
1796 			dld_ioc(dsp, mp);
1797 			break;
1798 		default:
1799 			ASSERT(0);
1800 		}
1801 
1802 		mutex_enter(&dsp->ds_lock);
1803 	}
1804 	ASSERT(dsp->ds_pending_tail == NULL);
1805 	dsp->ds_dlpi_pending = 0;
1806 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1807 	mutex_exit(&dsp->ds_lock);
1808 }
1809 
1810 /*
1811  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1812  * thread is started at boot time.
1813  */
1814 static void
1815 dld_taskq_dispatch(void)
1816 {
1817 	callb_cpr_t	cprinfo;
1818 	dld_str_t	*dsp;
1819 
1820 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1821 	    "dld_taskq_dispatch");
1822 	mutex_enter(&dld_taskq_lock);
1823 
1824 	while (!dld_taskq_quit) {
1825 		dsp = list_head(&dld_taskq_list);
1826 		while (dsp != NULL) {
1827 			list_remove(&dld_taskq_list, dsp);
1828 			mutex_exit(&dld_taskq_lock);
1829 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1830 			    dsp, TQ_SLEEP) != 0);
1831 			mutex_enter(&dld_taskq_lock);
1832 			dsp = list_head(&dld_taskq_list);
1833 		}
1834 
1835 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1836 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1837 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1838 	}
1839 
1840 	dld_taskq_done = B_TRUE;
1841 	cv_signal(&dld_taskq_cv);
1842 	CALLB_CPR_EXIT(&cprinfo);
1843 	thread_exit();
1844 }
1845 
1846 /*
1847  * All control operations are serialized on the 'dsp' and are also funneled
1848  * through a taskq mechanism to ensure that subsequent processing has kernel
1849  * context and can safely use cv_wait.
1850  *
1851  * Mechanisms to handle taskq dispatch failures
1852  *
1853  * The only way to be sure that taskq dispatch does not fail is to either
1854  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1855  * some number of entries and make sure that the number of outstanding requests
1856  * are less than that number. We can't use TQ_SLEEP since we don't know the
1857  * context. Nor can we bound the total number of 'dsp' end points. So we are
1858  * unable to use either of the above schemes, and are forced to deal with
1859  * taskq dispatch failures. Note that even dynamic taskq could fail in
1860  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1861  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1862  * framework.
1863  *
1864  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1865  * We also have a single global thread to retry the taskq dispatch. This
1866  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1867  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1868  */
1869 static void
1870 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1871 {
1872 	ASSERT(mp->b_next == NULL);
1873 	mutex_enter(&dsp->ds_lock);
1874 	if (dsp->ds_pending_head != NULL) {
1875 		ASSERT(dsp->ds_dlpi_pending);
1876 		dsp->ds_pending_tail->b_next = mp;
1877 		dsp->ds_pending_tail = mp;
1878 		mutex_exit(&dsp->ds_lock);
1879 		return;
1880 	}
1881 	ASSERT(dsp->ds_pending_tail == NULL);
1882 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
1883 	/*
1884 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
1885 	 * thread is still active and is processing the last message, though
1886 	 * the pending queue has been emptied.
1887 	 */
1888 	if (dsp->ds_dlpi_pending) {
1889 		mutex_exit(&dsp->ds_lock);
1890 		return;
1891 	}
1892 
1893 	dsp->ds_dlpi_pending = 1;
1894 	mutex_exit(&dsp->ds_lock);
1895 
1896 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
1897 	    TQ_NOSLEEP) != 0)
1898 		return;
1899 
1900 	mutex_enter(&dld_taskq_lock);
1901 	list_insert_tail(&dld_taskq_list, dsp);
1902 	cv_signal(&dld_taskq_cv);
1903 	mutex_exit(&dld_taskq_lock);
1904 }
1905 
1906 /*
1907  * Process an M_IOCTL message.
1908  */
1909 static void
1910 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1911 {
1912 	uint_t			cmd;
1913 
1914 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1915 	ASSERT(dsp->ds_type == DLD_DLPI);
1916 
1917 	switch (cmd) {
1918 	case DLIOCNATIVE:
1919 		ioc_native(dsp, mp);
1920 		break;
1921 	case DLIOCMARGININFO:
1922 		ioc_margin(dsp, mp);
1923 		break;
1924 	case DLIOCRAW:
1925 		ioc_raw(dsp, mp);
1926 		break;
1927 	case DLIOCHDRINFO:
1928 		ioc_fast(dsp, mp);
1929 		break;
1930 	default:
1931 		ioc(dsp, mp);
1932 	}
1933 }
1934 
1935 /*
1936  * DLIOCNATIVE
1937  */
1938 static void
1939 ioc_native(dld_str_t *dsp, mblk_t *mp)
1940 {
1941 	queue_t *q = dsp->ds_wq;
1942 	const mac_info_t *mip = dsp->ds_mip;
1943 
1944 	/*
1945 	 * Native mode can be enabled if it's disabled and if the
1946 	 * native media type is different.
1947 	 */
1948 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
1949 		dsp->ds_native = B_TRUE;
1950 
1951 	if (dsp->ds_native)
1952 		miocack(q, mp, 0, mip->mi_nativemedia);
1953 	else
1954 		miocnak(q, mp, 0, ENOTSUP);
1955 }
1956 
1957 /*
1958  * DLIOCMARGININFO
1959  */
1960 static void
1961 ioc_margin(dld_str_t *dsp, mblk_t *mp)
1962 {
1963 	queue_t *q = dsp->ds_wq;
1964 	uint32_t margin;
1965 	int err;
1966 
1967 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1968 		err = EINVAL;
1969 		goto failed;
1970 	}
1971 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
1972 		goto failed;
1973 
1974 	mac_margin_get(dsp->ds_mh, &margin);
1975 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
1976 	miocack(q, mp, sizeof (uint32_t), 0);
1977 	return;
1978 
1979 failed:
1980 	miocnak(q, mp, 0, err);
1981 }
1982 
1983 /*
1984  * DLIOCRAW
1985  */
1986 static void
1987 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1988 {
1989 	queue_t *q = dsp->ds_wq;
1990 	mac_perim_handle_t	mph;
1991 
1992 	if (dsp->ds_mh == NULL) {
1993 		dsp->ds_mode = DLD_RAW;
1994 		miocack(q, mp, 0, 0);
1995 		return;
1996 	}
1997 
1998 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1999 	if (dsp->ds_polling || dsp->ds_direct) {
2000 		mac_perim_exit(mph);
2001 		miocnak(q, mp, 0, EPROTO);
2002 		return;
2003 	}
2004 
2005 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2006 		/*
2007 		 * Set the receive callback.
2008 		 */
2009 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2010 	}
2011 
2012 	/*
2013 	 * Note that raw mode is enabled.
2014 	 */
2015 	dsp->ds_mode = DLD_RAW;
2016 	mac_perim_exit(mph);
2017 
2018 	miocack(q, mp, 0, 0);
2019 }
2020 
2021 /*
2022  * DLIOCHDRINFO
2023  */
2024 static void
2025 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2026 {
2027 	dl_unitdata_req_t *dlp;
2028 	off_t		off;
2029 	size_t		len;
2030 	const uint8_t	*addr;
2031 	uint16_t	sap;
2032 	mblk_t		*nmp;
2033 	mblk_t		*hmp;
2034 	uint_t		addr_length;
2035 	queue_t		*q = dsp->ds_wq;
2036 	int		err;
2037 	mac_perim_handle_t	mph;
2038 
2039 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2040 		err = ENOTSUP;
2041 		goto failed;
2042 	}
2043 
2044 	/*
2045 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2046 	 * user-land should not be allowed.
2047 	 */
2048 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2049 		err = EINVAL;
2050 		goto failed;
2051 	}
2052 
2053 	nmp = mp->b_cont;
2054 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2055 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2056 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2057 		err = EINVAL;
2058 		goto failed;
2059 	}
2060 
2061 	off = dlp->dl_dest_addr_offset;
2062 	len = dlp->dl_dest_addr_length;
2063 
2064 	if (!MBLKIN(nmp, off, len)) {
2065 		err = EINVAL;
2066 		goto failed;
2067 	}
2068 
2069 	if (dsp->ds_dlstate != DL_IDLE) {
2070 		err = ENOTSUP;
2071 		goto failed;
2072 	}
2073 
2074 	addr_length = dsp->ds_mip->mi_addr_length;
2075 	if (len != addr_length + sizeof (uint16_t)) {
2076 		err = EINVAL;
2077 		goto failed;
2078 	}
2079 
2080 	addr = nmp->b_rptr + off;
2081 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2082 
2083 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2084 		err = ENOMEM;
2085 		goto failed;
2086 	}
2087 
2088 	/*
2089 	 * This ioctl might happen concurrently with a direct call to dld_capab
2090 	 * that tries to enable direct and/or poll capabilities. Since the
2091 	 * stack does not serialize them, we do so here to avoid mixing
2092 	 * the callbacks.
2093 	 */
2094 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2095 	if (dsp->ds_mode != DLD_FASTPATH) {
2096 		/*
2097 		 * Set the receive callback (unless polling is enabled).
2098 		 */
2099 		if (!dsp->ds_polling && !dsp->ds_direct)
2100 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2101 
2102 		/*
2103 		 * Note that fast-path mode is enabled.
2104 		 */
2105 		dsp->ds_mode = DLD_FASTPATH;
2106 	}
2107 	mac_perim_exit(mph);
2108 
2109 	freemsg(nmp->b_cont);
2110 	nmp->b_cont = hmp;
2111 
2112 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2113 	return;
2114 failed:
2115 	miocnak(q, mp, 0, err);
2116 }
2117 
2118 /*
2119  * Catch-all handler.
2120  */
2121 static void
2122 ioc(dld_str_t *dsp, mblk_t *mp)
2123 {
2124 	queue_t	*q = dsp->ds_wq;
2125 
2126 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2127 		miocnak(q, mp, 0, EINVAL);
2128 		return;
2129 	}
2130 	mac_ioctl(dsp->ds_mh, q, mp);
2131 }
2132