xref: /titanic_50/usr/src/uts/common/io/dld/dld_str.c (revision 6e375c8351497b82ffa4f33cbf61d712999b4605)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<inet/common.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/cpuvar.h>
37 #include	<sys/callb.h>
38 #include	<sys/list.h>
39 #include	<sys/mac_client.h>
40 #include	<sys/mac_client_priv.h>
41 
/*
 * Forward declarations for functions private to this file.
 */

/* Constructor and destructor registered with the dld_str_t kmem cache. */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);

static void	ioc_native(dld_str_t *,  mblk_t *);
static void	ioc_margin(dld_str_t *, mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static void	dld_wput_nondata(dld_str_t *, mblk_t *);

static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);

/*
 * str_count is the number of dld_str_t objects currently in existence; it is
 * bumped in dld_str_create(), dropped in dld_str_destroy() and must be zero
 * for dld_str_fini() to proceed.  str_cachep is the kmem cache those objects
 * come from and str_hashp is the hash table they are entered into, keyed by
 * ds_minor.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static mod_hash_t	*str_hashp;

/* Size handed to mod_hash_create_idhash() when str_hashp is created. */
#define	STR_HASHSZ		64
/* Convert an integral key (a minor number here) into a mod_hash key. */
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* The dld taskq is simply the system-wide taskq. */
#define	dld_taskq	system_taskq

/*
 * State shared with the dld_taskq_dispatch() thread started by
 * dld_str_init().  dld_str_fini() sets dld_taskq_quit and signals
 * dld_taskq_cv under dld_taskq_lock, then waits on the same cv until
 * dld_taskq_done becomes true.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

static void		dld_taskq_dispatch(void);
81 
82 /*
83  * Some notes on entry points, flow-control, queueing.
84  *
85  * This driver exports the traditional STREAMS put entry point as well as
86  * the non-STREAMS fast-path transmit routine which is provided to IP via
87  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
88  * and data operations, while the fast-path routine deals only with M_DATA
89  * fast-path packets.  Regardless of the entry point, all outbound packets
90  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
91  *
92  * The transmit logic operates in the following way: All packets coming
93  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
94  * happens when the MAC layer indicates the packets couldn't be
95  * transmitted due to 1) lack of resources (e.g. running out of
96  * descriptors),  or 2) reaching the allowed bandwidth limit for this
97  * particular flow. The indication comes in the form of a Tx cookie that
98  * identifies the blocked ring. In such case, DLD will place a
99  * dummy message on its write-side STREAMS queue so that the queue is
 * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
103  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
104  * When the write service procedure runs, it will remove the dummy
105  * message from the write-side STREAMS queue; in effect this will trigger
106  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
107  * respectively, due to the above reasons.
108  *
109  * All non-data operations, both DLPI and ioctls are single threaded on a per
110  * dld_str_t endpoint. This is done using a taskq so that the control operation
111  * has kernel context and can cv_wait for resources. In addition all set type
112  * operations that involve mac level state modification are serialized on a
113  * per mac end point using the perimeter mechanism provided by the mac layer.
114  * This serializes all mac clients trying to modify a single mac end point over
115  * the entire sequence of mac calls made by that client as an atomic unit. The
116  * mac framework locking is described in mac.c. A critical element is that
117  * DLD/DLS does not hold any locks across the mac perimeter.
118  *
119  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
120  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
121  * match dev_t. If a stream is found and it is attached, its dev_info_t *
122  * is returned. If the mac handle is non-null, it can be safely accessed
123  * below. The mac handle won't be freed until the mac_unregister which
124  * won't happen until the driver detaches. The DDI framework ensures that
125  * the detach won't happen while a getinfo is in progress.
126  */
/*
 * Search state passed by dld_finddevinfo() to i_dld_str_walker() while
 * walking str_hashp.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* in: major number to match */
	minor_t		ds_minor;	/* in: minor number to match */
	dev_info_t	*ds_dip;	/* out: devinfo found, else NULL */
} i_dld_str_state_t;
132 
133 /* ARGSUSED */
134 static uint_t
135 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
136 {
137 	i_dld_str_state_t	*statep = arg;
138 	dld_str_t		*dsp = (dld_str_t *)val;
139 	mac_handle_t		mh;
140 
141 	if (statep->ds_major != dsp->ds_major)
142 		return (MH_WALK_CONTINUE);
143 
144 	ASSERT(statep->ds_minor != 0);
145 	mh = dsp->ds_mh;
146 
147 	if (statep->ds_minor == dsp->ds_minor) {
148 		/*
149 		 * Clone: a clone minor is unique. we can terminate the
150 		 * walk if we find a matching stream -- even if we fail
151 		 * to obtain the devinfo.
152 		 */
153 		if (mh != NULL)
154 			statep->ds_dip = mac_devinfo_get(mh);
155 		return (MH_WALK_TERMINATE);
156 	}
157 	return (MH_WALK_CONTINUE);
158 }
159 
160 static dev_info_t *
161 dld_finddevinfo(dev_t dev)
162 {
163 	dev_info_t		*dip;
164 	i_dld_str_state_t	state;
165 
166 	if (getminor(dev) == 0)
167 		return (NULL);
168 
169 	/*
170 	 * See if it's a minor node of a link
171 	 */
172 	if ((dip = dls_link_devinfo(dev)) != NULL)
173 		return (dip);
174 
175 	state.ds_minor = getminor(dev);
176 	state.ds_major = getmajor(dev);
177 	state.ds_dip = NULL;
178 
179 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
180 	return (state.ds_dip);
181 }
182 
183 /*
184  * devo_getinfo: getinfo(9e)
185  */
186 /*ARGSUSED*/
187 int
188 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
189 {
190 	dev_info_t	*devinfo;
191 	minor_t		minor = getminor((dev_t)arg);
192 	int		rc = DDI_FAILURE;
193 
194 	switch (cmd) {
195 	case DDI_INFO_DEVT2DEVINFO:
196 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
197 			*(dev_info_t **)resp = devinfo;
198 			rc = DDI_SUCCESS;
199 		}
200 		break;
201 	case DDI_INFO_DEVT2INSTANCE:
202 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
203 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
204 			rc = DDI_SUCCESS;
205 		} else if (minor > DLS_MAX_MINOR &&
206 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
207 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
208 			rc = DDI_SUCCESS;
209 		}
210 		break;
211 	}
212 	return (rc);
213 }
214 
215 /*
216  * qi_qopen: open(9e)
217  */
218 /*ARGSUSED*/
219 int
220 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
221 {
222 	dld_str_t	*dsp;
223 	major_t		major;
224 	minor_t		minor;
225 	int		err;
226 
227 	if (sflag == MODOPEN)
228 		return (ENOTSUP);
229 
230 	/*
231 	 * This is a cloning driver and therefore each queue should only
232 	 * ever get opened once.
233 	 */
234 	if (rq->q_ptr != NULL)
235 		return (EBUSY);
236 
237 	major = getmajor(*devp);
238 	minor = getminor(*devp);
239 
240 	/*
241 	 * Create a new dld_str_t for the stream. This will grab a new minor
242 	 * number that will be handed back in the cloned dev_t.  Creation may
243 	 * fail if we can't allocate the dummy mblk used for flow-control.
244 	 */
245 	dsp = dld_str_create(rq, DLD_DLPI, major,
246 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
247 	if (dsp == NULL)
248 		return (ENOSR);
249 
250 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
251 	if (minor != 0) {
252 		/*
253 		 * Style 1 open
254 		 */
255 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
256 			goto failed;
257 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
258 	} else {
259 		(void) qassociate(rq, -1);
260 	}
261 
262 	/*
263 	 * Enable the queue srv(9e) routine.
264 	 */
265 	qprocson(rq);
266 
267 	/*
268 	 * Construct a cloned dev_t to hand back.
269 	 */
270 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
271 	return (0);
272 
273 failed:
274 	dld_str_destroy(dsp);
275 	return (err);
276 }
277 
278 /*
279  * qi_qclose: close(9e)
280  */
281 int
282 dld_close(queue_t *rq)
283 {
284 	dld_str_t	*dsp = rq->q_ptr;
285 
286 	/*
287 	 * All modules on top have been popped off. So there can't be any
288 	 * threads from the top.
289 	 */
290 	ASSERT(dsp->ds_datathr_cnt == 0);
291 
292 	/*
293 	 * Wait until pending DLPI requests are processed.
294 	 */
295 	mutex_enter(&dsp->ds_lock);
296 	while (dsp->ds_dlpi_pending)
297 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
298 	mutex_exit(&dsp->ds_lock);
299 
300 	/*
301 	 * Disable the queue srv(9e) routine.
302 	 */
303 	qprocsoff(rq);
304 
305 
306 	/*
307 	 * This stream was open to a provider node. Check to see
308 	 * if it has been cleanly shut down.
309 	 */
310 	if (dsp->ds_dlstate != DL_UNATTACHED) {
311 		/*
312 		 * The stream is either open to a style 1 provider or
313 		 * this is not clean shutdown. Detach from the PPA.
314 		 * (This is still ok even in the style 1 case).
315 		 */
316 		dld_str_detach(dsp);
317 	}
318 
319 	dld_str_destroy(dsp);
320 	return (0);
321 }
322 
323 /*
324  * qi_qputp: put(9e)
325  */
326 void
327 dld_wput(queue_t *wq, mblk_t *mp)
328 {
329 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
330 	dld_str_mode_t	mode;
331 
332 	switch (DB_TYPE(mp)) {
333 	case M_DATA:
334 		mutex_enter(&dsp->ds_lock);
335 		if (dsp->ds_dlstate == DL_IDLE) {
336 			mode = dsp->ds_mode;
337 			if (mode == DLD_FASTPATH || mode == DLD_RAW) {
338 				DLD_DATATHR_INC(dsp);
339 				mutex_exit(&dsp->ds_lock);
340 				if (mode == DLD_FASTPATH) {
341 					(void) str_mdata_fastpath_put(dsp, mp,
342 					    0, 0);
343 				} else {
344 					str_mdata_raw_put(dsp, mp);
345 				}
346 				DLD_DATATHR_DCR(dsp);
347 				break;
348 			}
349 		}
350 		mutex_exit(&dsp->ds_lock);
351 		freemsg(mp);
352 		break;
353 
354 	case M_PROTO:
355 	case M_PCPROTO: {
356 		t_uscalar_t	prim;
357 
358 		if (MBLKL(mp) < sizeof (t_uscalar_t))
359 			break;
360 
361 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
362 
363 		if (prim == DL_UNITDATA_REQ) {
364 			proto_unitdata_req(dsp, mp);
365 		} else {
366 			dld_wput_nondata(dsp, mp);
367 		}
368 		break;
369 	}
370 
371 	case M_IOCTL:
372 		dld_wput_nondata(dsp, mp);
373 		break;
374 
375 	case M_FLUSH:
376 		if (*mp->b_rptr & FLUSHW) {
377 			DLD_CLRQFULL(dsp);
378 			*mp->b_rptr &= ~FLUSHW;
379 		}
380 
381 		if (*mp->b_rptr & FLUSHR) {
382 			qreply(wq, mp);
383 		} else {
384 			freemsg(mp);
385 		}
386 		break;
387 
388 	default:
389 		freemsg(mp);
390 		break;
391 	}
392 }
393 
394 /*
395  * qi_srvp: srv(9e)
396  */
397 void
398 dld_wsrv(queue_t *wq)
399 {
400 	dld_str_t	*dsp = wq->q_ptr;
401 
402 	DLD_CLRQFULL(dsp);
403 }
404 
405 void
406 dld_init_ops(struct dev_ops *ops, const char *name)
407 {
408 	struct streamtab *stream;
409 	struct qinit *rq, *wq;
410 	struct module_info *modinfo;
411 
412 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
413 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
414 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
415 	modinfo->mi_minpsz = 0;
416 	modinfo->mi_maxpsz = 64*1024;
417 	modinfo->mi_hiwat  = 1;
418 	modinfo->mi_lowat = 0;
419 
420 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
421 	rq->qi_qopen = dld_open;
422 	rq->qi_qclose = dld_close;
423 	rq->qi_minfo = modinfo;
424 
425 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
426 	wq->qi_putp = (pfi_t)dld_wput;
427 	wq->qi_srvp = (pfi_t)dld_wsrv;
428 	wq->qi_minfo = modinfo;
429 
430 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
431 	stream->st_rdinit = rq;
432 	stream->st_wrinit = wq;
433 	ops->devo_cb_ops->cb_str = stream;
434 
435 	if (ops->devo_getinfo == NULL)
436 		ops->devo_getinfo = &dld_getinfo;
437 }
438 
439 void
440 dld_fini_ops(struct dev_ops *ops)
441 {
442 	struct streamtab *stream;
443 	struct qinit *rq, *wq;
444 	struct module_info *modinfo;
445 
446 	stream = ops->devo_cb_ops->cb_str;
447 	rq = stream->st_rdinit;
448 	wq = stream->st_wrinit;
449 	modinfo = rq->qi_minfo;
450 	ASSERT(wq->qi_minfo == modinfo);
451 
452 	kmem_free(stream, sizeof (struct streamtab));
453 	kmem_free(wq, sizeof (struct qinit));
454 	kmem_free(rq, sizeof (struct qinit));
455 	kmem_free(modinfo->mi_idname, FMNAMESZ);
456 	kmem_free(modinfo, sizeof (struct module_info));
457 }
458 
459 /*
460  * Initialize this module's data structures.
461  */
462 void
463 dld_str_init(void)
464 {
465 	/*
466 	 * Create dld_str_t object cache.
467 	 */
468 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
469 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
470 	ASSERT(str_cachep != NULL);
471 
472 	/*
473 	 * Create a hash table for maintaining dld_str_t's.
474 	 * The ds_minor field (the clone minor number) of a dld_str_t
475 	 * is used as a key for this hash table because this number is
476 	 * globally unique (allocated from "dls_minor_arena").
477 	 */
478 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
479 	    mod_hash_null_valdtor);
480 
481 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
482 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
483 
484 	dld_taskq_quit = B_FALSE;
485 	dld_taskq_done = B_FALSE;
486 	list_create(&dld_taskq_list, sizeof (dld_str_t),
487 	    offsetof(dld_str_t, ds_tqlist));
488 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
489 	    &p0, TS_RUN, minclsyspri);
490 }
491 
492 /*
493  * Tear down this module's data structures.
494  */
495 int
496 dld_str_fini(void)
497 {
498 	/*
499 	 * Make sure that there are no objects in use.
500 	 */
501 	if (str_count != 0)
502 		return (EBUSY);
503 
504 	/*
505 	 * Ask the dld_taskq thread to quit and wait for it to be done
506 	 */
507 	mutex_enter(&dld_taskq_lock);
508 	dld_taskq_quit = B_TRUE;
509 	cv_signal(&dld_taskq_cv);
510 	while (!dld_taskq_done)
511 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
512 	mutex_exit(&dld_taskq_lock);
513 	list_destroy(&dld_taskq_list);
514 	/*
515 	 * Destroy object cache.
516 	 */
517 	kmem_cache_destroy(str_cachep);
518 	mod_hash_destroy_idhash(str_hashp);
519 	return (0);
520 }
521 
522 /*
523  * Create a new dld_str_t object.
524  */
525 dld_str_t *
526 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
527 {
528 	dld_str_t	*dsp;
529 	int		err;
530 
531 	/*
532 	 * Allocate an object from the cache.
533 	 */
534 	atomic_add_32(&str_count, 1);
535 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
536 
537 	/*
538 	 * Allocate the dummy mblk for flow-control.
539 	 */
540 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
541 	if (dsp->ds_tx_flow_mp == NULL) {
542 		kmem_cache_free(str_cachep, dsp);
543 		atomic_add_32(&str_count, -1);
544 		return (NULL);
545 	}
546 	dsp->ds_type = type;
547 	dsp->ds_major = major;
548 	dsp->ds_style = style;
549 
550 	/*
551 	 * Initialize the queue pointers.
552 	 */
553 	ASSERT(RD(rq) == rq);
554 	dsp->ds_rq = rq;
555 	dsp->ds_wq = WR(rq);
556 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
557 
558 	/*
559 	 * We want explicit control over our write-side STREAMS queue
560 	 * where the dummy mblk gets added/removed for flow-control.
561 	 */
562 	noenable(WR(rq));
563 
564 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
565 	    (mod_hash_val_t)dsp);
566 	ASSERT(err == 0);
567 	return (dsp);
568 }
569 
570 /*
571  * Destroy a dld_str_t object.
572  */
573 void
574 dld_str_destroy(dld_str_t *dsp)
575 {
576 	queue_t		*rq;
577 	queue_t		*wq;
578 	mod_hash_val_t	val;
579 
580 	/*
581 	 * Clear the queue pointers.
582 	 */
583 	rq = dsp->ds_rq;
584 	wq = dsp->ds_wq;
585 	ASSERT(wq == WR(rq));
586 	rq->q_ptr = wq->q_ptr = NULL;
587 	dsp->ds_rq = dsp->ds_wq = NULL;
588 
589 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
590 	ASSERT(dsp->ds_sap == 0);
591 	ASSERT(dsp->ds_mh == NULL);
592 	ASSERT(dsp->ds_mch == NULL);
593 	ASSERT(dsp->ds_promisc == 0);
594 	ASSERT(dsp->ds_mph == NULL);
595 	ASSERT(dsp->ds_mip == NULL);
596 	ASSERT(dsp->ds_mnh == NULL);
597 
598 	ASSERT(dsp->ds_polling == B_FALSE);
599 	ASSERT(dsp->ds_direct == B_FALSE);
600 	ASSERT(dsp->ds_lso == B_FALSE);
601 	ASSERT(dsp->ds_lso_max == 0);
602 
603 	/*
604 	 * Reinitialize all the flags.
605 	 */
606 	dsp->ds_notifications = 0;
607 	dsp->ds_passivestate = DLD_UNINITIALIZED;
608 	dsp->ds_mode = DLD_UNITDATA;
609 	dsp->ds_native = B_FALSE;
610 
611 	ASSERT(dsp->ds_datathr_cnt == 0);
612 	ASSERT(dsp->ds_pending_head == NULL);
613 	ASSERT(dsp->ds_pending_tail == NULL);
614 	ASSERT(!dsp->ds_dlpi_pending);
615 
616 	ASSERT(dsp->ds_dlp == NULL);
617 	ASSERT(dsp->ds_dmap == NULL);
618 	ASSERT(dsp->ds_rx == NULL);
619 	ASSERT(dsp->ds_rx_arg == NULL);
620 	ASSERT(dsp->ds_next == NULL);
621 	ASSERT(dsp->ds_head == NULL);
622 
623 	/*
624 	 * Free the dummy mblk if exists.
625 	 */
626 	if (dsp->ds_tx_flow_mp != NULL) {
627 		freeb(dsp->ds_tx_flow_mp);
628 		dsp->ds_tx_flow_mp = NULL;
629 	}
630 
631 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
632 	ASSERT(dsp == (dld_str_t *)val);
633 
634 	/*
635 	 * Free the object back to the cache.
636 	 */
637 	kmem_cache_free(str_cachep, dsp);
638 	atomic_add_32(&str_count, -1);
639 }
640 
641 /*
642  * kmem_cache contructor function: see kmem_cache_create(9f).
643  */
644 /*ARGSUSED*/
645 static int
646 str_constructor(void *buf, void *cdrarg, int kmflags)
647 {
648 	dld_str_t	*dsp = buf;
649 
650 	bzero(buf, sizeof (dld_str_t));
651 
652 	/*
653 	 * Allocate a new minor number.
654 	 */
655 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
656 		return (-1);
657 
658 	/*
659 	 * Initialize the DLPI state machine.
660 	 */
661 	dsp->ds_dlstate = DL_UNATTACHED;
662 
663 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
664 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
665 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
666 
667 	return (0);
668 }
669 
670 /*
671  * kmem_cache destructor function.
672  */
673 /*ARGSUSED*/
674 static void
675 str_destructor(void *buf, void *cdrarg)
676 {
677 	dld_str_t	*dsp = buf;
678 
679 	/*
680 	 * Release the minor number.
681 	 */
682 	mac_minor_rele(dsp->ds_minor);
683 
684 	ASSERT(dsp->ds_tx_flow_mp == NULL);
685 
686 	mutex_destroy(&dsp->ds_lock);
687 	cv_destroy(&dsp->ds_datathr_cv);
688 	cv_destroy(&dsp->ds_dlpi_pending_cv);
689 }
690 
691 /*
692  * Update the priority bits and VID (may need to insert tag if mp points
693  * to an untagged packet.
694  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
695  */
696 static mblk_t *
697 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
698 {
699 	mblk_t *hmp;
700 	struct ether_vlan_header *evhp;
701 	struct ether_header *ehp;
702 	uint16_t old_tci = 0;
703 	size_t len;
704 
705 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
706 
707 	evhp = (struct ether_vlan_header *)mp->b_rptr;
708 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
709 		/*
710 		 * Tagged packet, update the priority bits.
711 		 */
712 		old_tci = ntohs(evhp->ether_tci);
713 		len = sizeof (struct ether_vlan_header);
714 
715 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
716 			/*
717 			 * In case some drivers only check the db_ref
718 			 * count of the first mblk, we pullup the
719 			 * message into a single mblk.
720 			 */
721 			hmp = msgpullup(mp, -1);
722 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
723 				freemsg(hmp);
724 				return (NULL);
725 			} else {
726 				freemsg(mp);
727 				mp = hmp;
728 			}
729 		}
730 
731 		evhp = (struct ether_vlan_header *)mp->b_rptr;
732 	} else {
733 		/*
734 		 * Untagged packet. Insert the special priority tag.
735 		 * First allocate a header mblk.
736 		 */
737 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
738 		if (hmp == NULL)
739 			return (NULL);
740 
741 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
742 		ehp = (struct ether_header *)mp->b_rptr;
743 
744 		/*
745 		 * Copy the MAC addresses and typelen
746 		 */
747 		bcopy(ehp, evhp, (ETHERADDRL * 2));
748 		evhp->ether_type = ehp->ether_type;
749 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
750 
751 		hmp->b_wptr += sizeof (struct ether_vlan_header);
752 		mp->b_rptr += sizeof (struct ether_header);
753 
754 		/*
755 		 * Free the original message if it's now empty. Link the
756 		 * rest of the messages to the header message.
757 		 */
758 		if (MBLKL(mp) == 0) {
759 			hmp->b_cont = mp->b_cont;
760 			freeb(mp);
761 		} else {
762 			hmp->b_cont = mp;
763 		}
764 		mp = hmp;
765 	}
766 
767 	if (pri == 0)
768 		pri = VLAN_PRI(old_tci);
769 	if (vid == VLAN_ID_NONE)
770 		vid = VLAN_ID(old_tci);
771 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
772 	return (mp);
773 }
774 
775 /*
776  * M_DATA put (IP fast-path mode)
777  */
778 mac_tx_cookie_t
779 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
780     uint16_t flag)
781 {
782 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
783 	mblk_t *newmp;
784 	uint_t pri;
785 	mac_tx_cookie_t cookie;
786 
787 	if (is_ethernet) {
788 		/*
789 		 * Update the priority bits to the assigned priority.
790 		 */
791 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
792 
793 		if (pri != 0) {
794 			newmp = i_dld_ether_header_update_tag(mp, pri,
795 			    VLAN_ID_NONE);
796 			if (newmp == NULL)
797 				goto discard;
798 			mp = newmp;
799 		}
800 	}
801 
802 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
803 		DLD_SETQFULL(dsp);
804 	}
805 	return (cookie);
806 
807 discard:
808 	/* TODO: bump kstat? */
809 	freemsg(mp);
810 	return (NULL);
811 }
812 
813 /*
814  * M_DATA put (DLIOCRAW mode)
815  */
816 static void
817 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
818 {
819 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
820 	mblk_t *bp, *newmp;
821 	size_t size;
822 	mac_header_info_t mhi;
823 	uint_t pri, vid, dvid;
824 	uint_t max_sdu;
825 
826 	/*
827 	 * Certain MAC type plugins provide an illusion for raw DLPI
828 	 * consumers.  They pretend that the MAC layer is something that
829 	 * it's not for the benefit of observability tools.  For example,
830 	 * mac_wifi pretends that it's Ethernet for such consumers.
831 	 * Here, unless native mode is enabled, we call into the MAC layer so
832 	 * that this illusion can be maintained.  The plugin will optionally
833 	 * transform the MAC header here into something that can be passed
834 	 * down.  The header goes from raw mode to "cooked" mode.
835 	 */
836 	if (!dsp->ds_native) {
837 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
838 			goto discard;
839 		mp = newmp;
840 	}
841 
842 	size = MBLKL(mp);
843 
844 	/*
845 	 * Check the packet is not too big and that any remaining
846 	 * fragment list is composed entirely of M_DATA messages. (We
847 	 * know the first fragment was M_DATA otherwise we could not
848 	 * have got here).
849 	 */
850 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
851 		if (DB_TYPE(bp) != M_DATA)
852 			goto discard;
853 		size += MBLKL(bp);
854 	}
855 
856 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
857 		goto discard;
858 
859 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
860 	/*
861 	 * If LSO is enabled, check the size against lso_max. Otherwise,
862 	 * compare the packet size with max_sdu.
863 	 */
864 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
865 	if (size > max_sdu + mhi.mhi_hdrsize)
866 		goto discard;
867 
868 	if (is_ethernet) {
869 		dvid = mac_client_vid(dsp->ds_mch);
870 
871 		/*
872 		 * Discard the packet if this is a VLAN stream but the VID in
873 		 * the packet is not correct.
874 		 */
875 		vid = VLAN_ID(mhi.mhi_tci);
876 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
877 			goto discard;
878 
879 		/*
880 		 * Discard the packet if this packet is a tagged packet
881 		 * but both pri and VID are 0.
882 		 */
883 		pri = VLAN_PRI(mhi.mhi_tci);
884 		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
885 			goto discard;
886 
887 		/*
888 		 * Update the priority bits to the per-stream priority if
889 		 * priority is not set in the packet. Update the VID for
890 		 * packets on a VLAN stream.
891 		 */
892 		pri = (pri == 0) ? dsp->ds_pri : 0;
893 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
894 			if ((newmp = i_dld_ether_header_update_tag(mp,
895 			    pri, dvid)) == NULL) {
896 				goto discard;
897 			}
898 			mp = newmp;
899 		}
900 	}
901 
902 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
903 		/* Turn on flow-control for dld */
904 		DLD_SETQFULL(dsp);
905 	}
906 	return;
907 
908 discard:
909 	/* TODO: bump kstat? */
910 	freemsg(mp);
911 }
912 
/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 *
 * Attaches the stream to the link identified by 'ppa' under the driver whose
 * major number is dsp->ds_major.  On success the stream has been opened on
 * the link with dls_open(), a MAC notify callback is registered, ds_dlstate
 * is DL_UNBOUND and 0 is returned.
 *
 * Returns EINVAL if the major number has no driver name or qassociate()
 * fails, ENOTSUP if a style 2 ppa exceeds DLS_MAX_PPA, and otherwise the
 * error from whichever of dls_devnet_hold_by_dev(),
 * mac_perim_enter_by_macname(), dls_link_hold() or dls_open() failed.
 * Whatever was acquired before a failure is released again.
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t			dev;
	int			err;
	const char		*drvname;
	mac_perim_handle_t	mph;
	boolean_t		qassociated = B_FALSE;
	dls_link_t		*dlp = NULL;
	dls_dl_handle_t		ddp = NULL;
	boolean_t		entered_perim = B_FALSE;

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
		return (ENOTSUP);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.  The "aggr" and "vnic" drivers are excluded
	 * from the qassociate() step.
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	/*
	 * The link's minor number is ppa + 1 (dld_open() passes minor - 1 as
	 * the ppa for style 1 opens).
	 */
	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
		goto failed;

	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
		goto failed;
	entered_perim = B_TRUE;

	/*
	 * Open a channel.
	 */
	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
		goto failed;

	if ((err = dls_open(dlp, ddp, dsp)) != 0)
		goto failed;

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
	dsp->ds_dlstate = DL_UNBOUND;
	mac_perim_exit(mph);
	return (0);

failed:
	/*
	 * Release only what was actually acquired: the link hold and the
	 * perimeter first, then the devnet hold, then the queue association.
	 */
	if (dlp != NULL)
		dls_link_rele(dlp);
	if (entered_perim)
		mac_perim_exit(mph);
	if (ddp != NULL)
		dls_devnet_rele(ddp);
	if (qassociated)
		(void) qassociate(dsp->ds_wq, -1);

	return (err);
}
987 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Undoes dld_str_attach(): removes the MAC notify callback, disables the
 * capabilities, closes the stream inside the mac perimeter, releases the
 * devnet hold and returns the stream to DL_UNATTACHED.  No data thread may
 * be active on the stream when this is called.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	mac_perim_handle_t	mph;
	int			err;

	ASSERT(dsp->ds_datathr_cnt == 0);

	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
	/*
	 * Remove the notify function.
	 *
	 * Note that we cannot wait for the notification callback to be removed
	 * since it could cause the deadlock with str_notify() since they both
	 * need the mac perimeter. Continue if we cannot remove the
	 * notification callback right now and wait after we leave the
	 * perimeter.
	 */
	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
	dsp->ds_mnh = NULL;

	/*
	 * Disable the capabilities
	 */
	dld_capabilities_disable(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp);
	mac_perim_exit(mph);

	/*
	 * Now we leave the mac perimeter. If mac_notify_remove() failed
	 * because the notification callback was in progress, wait for
	 * it to finish before we proceed.
	 */
	if (err != 0)
		mac_notify_remove_wait(dsp->ds_mh);

	/*
	 * An unreferenced tagged (non-persistent) vlan gets destroyed
	 * automatically in the call to dls_devnet_rele.
	 */
	dls_devnet_rele(dsp->ds_ddh);

	/*
	 * Reset the per-attachment fields; dld_str_destroy() asserts that
	 * these are clear.
	 */
	dsp->ds_sap = 0;
	dsp->ds_mh = NULL;
	dsp->ds_mch = NULL;
	dsp->ds_mip = NULL;

	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;
}
1054 
1055 /*
1056  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1057  * tags before sending packets up to the DLS clients, with the exception of
1058  * special priority tagged packets, in that case, we set the VID to 0.
1059  * mp must be a VLAN tagged packet.
1060  */
1061 static mblk_t *
1062 i_dld_ether_header_strip_tag(mblk_t *mp)
1063 {
1064 	mblk_t *newmp;
1065 	struct ether_vlan_header *evhp;
1066 	uint16_t tci, new_tci;
1067 
1068 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1069 	if (DB_REF(mp) > 1) {
1070 		newmp = copymsg(mp);
1071 		if (newmp == NULL)
1072 			return (NULL);
1073 		freemsg(mp);
1074 		mp = newmp;
1075 	}
1076 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1077 
1078 	tci = ntohs(evhp->ether_tci);
1079 	if (VLAN_PRI(tci) == 0) {
1080 		/*
1081 		 * Priority is 0, strip the tag.
1082 		 */
1083 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1084 		mp->b_rptr += VLAN_TAGSZ;
1085 	} else {
1086 		/*
1087 		 * Priority is not 0, update the VID to 0.
1088 		 */
1089 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1090 		evhp->ether_tci = htons(new_tci);
1091 	}
1092 	return (mp);
1093 }
1094 
1095 /*
1096  * Raw mode receive function.
1097  */
1098 /*ARGSUSED*/
1099 void
1100 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1101     mac_header_info_t *mhip)
1102 {
1103 	dld_str_t *dsp = (dld_str_t *)arg;
1104 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1105 	mblk_t *next, *newmp;
1106 
1107 	ASSERT(mp != NULL);
1108 	do {
1109 		/*
1110 		 * Get the pointer to the next packet in the chain and then
1111 		 * clear b_next before the packet gets passed on.
1112 		 */
1113 		next = mp->b_next;
1114 		mp->b_next = NULL;
1115 
1116 		/*
1117 		 * Wind back b_rptr to point at the MAC header.
1118 		 */
1119 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1120 		mp->b_rptr -= mhip->mhi_hdrsize;
1121 
1122 		/*
1123 		 * Certain MAC type plugins provide an illusion for raw
1124 		 * DLPI consumers.  They pretend that the MAC layer is
1125 		 * something that it's not for the benefit of observability
1126 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1127 		 * for such consumers.	Here, unless native mode is enabled,
1128 		 * we call into the MAC layer so that this illusion can be
1129 		 * maintained.	The plugin will optionally transform the MAC
1130 		 * header here into something that can be passed up to raw
1131 		 * consumers.  The header goes from "cooked" mode to raw mode.
1132 		 */
1133 		if (!dsp->ds_native) {
1134 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1135 			if (newmp == NULL) {
1136 				freemsg(mp);
1137 				goto next;
1138 			}
1139 			mp = newmp;
1140 		}
1141 
1142 		/*
1143 		 * Strip the VLAN tag for VLAN streams.
1144 		 */
1145 		if (is_ethernet &&
1146 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1147 			newmp = i_dld_ether_header_strip_tag(mp);
1148 			if (newmp == NULL) {
1149 				freemsg(mp);
1150 				goto next;
1151 			}
1152 			mp = newmp;
1153 		}
1154 
1155 		/*
1156 		 * Pass the packet on.
1157 		 */
1158 		if (canputnext(dsp->ds_rq))
1159 			putnext(dsp->ds_rq, mp);
1160 		else
1161 			freemsg(mp);
1162 
1163 next:
1164 		/*
1165 		 * Move on to the next packet in the chain.
1166 		 */
1167 		mp = next;
1168 	} while (mp != NULL);
1169 }
1170 
1171 /*
1172  * Fast-path receive function.
1173  */
1174 /*ARGSUSED*/
1175 void
1176 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1177     mac_header_info_t *mhip)
1178 {
1179 	dld_str_t *dsp = (dld_str_t *)arg;
1180 	mblk_t *next;
1181 	size_t offset = 0;
1182 
1183 	/*
1184 	 * MAC header stripping rules:
1185 	 *    - Tagged packets:
1186 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1187 	 *	b. Physical streams
1188 	 *	- VLAN packets (non-zero VID). The stream must be either a
1189 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1190 	 *	  Strip the Ethernet header but keep the VLAN header.
1191 	 *	- Special tagged packets (zero VID)
1192 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1193 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1194 	 *	    keep the VLAN header.
1195 	 *	  * Otherwise, strip the whole VLAN header.
1196 	 *    - Untagged packets. Strip the whole MAC header.
1197 	 */
1198 	if (mhip->mhi_istagged &&
1199 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1200 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1201 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1202 		offset = VLAN_TAGSZ;
1203 	}
1204 
1205 	ASSERT(mp != NULL);
1206 	do {
1207 		/*
1208 		 * Get the pointer to the next packet in the chain and then
1209 		 * clear b_next before the packet gets passed on.
1210 		 */
1211 		next = mp->b_next;
1212 		mp->b_next = NULL;
1213 
1214 		/*
1215 		 * Wind back b_rptr to point at the VLAN header.
1216 		 */
1217 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1218 		mp->b_rptr -= offset;
1219 
1220 		/*
1221 		 * Pass the packet on.
1222 		 */
1223 		if (canputnext(dsp->ds_rq))
1224 			putnext(dsp->ds_rq, mp);
1225 		else
1226 			freemsg(mp);
1227 		/*
1228 		 * Move on to the next packet in the chain.
1229 		 */
1230 		mp = next;
1231 	} while (mp != NULL);
1232 }
1233 
1234 /*
1235  * Default receive function (send DL_UNITDATA_IND messages).
1236  */
1237 /*ARGSUSED*/
1238 void
1239 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1240     mac_header_info_t *mhip)
1241 {
1242 	dld_str_t		*dsp = (dld_str_t *)arg;
1243 	mblk_t			*ud_mp;
1244 	mblk_t			*next;
1245 	size_t			offset = 0;
1246 	boolean_t		strip_vlan = B_TRUE;
1247 
1248 	/*
1249 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1250 	 */
1251 	if (mhip->mhi_istagged &&
1252 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1253 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1254 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1255 		offset = VLAN_TAGSZ;
1256 		strip_vlan = B_FALSE;
1257 	}
1258 
1259 	ASSERT(mp != NULL);
1260 	do {
1261 		/*
1262 		 * Get the pointer to the next packet in the chain and then
1263 		 * clear b_next before the packet gets passed on.
1264 		 */
1265 		next = mp->b_next;
1266 		mp->b_next = NULL;
1267 
1268 		/*
1269 		 * Wind back b_rptr to point at the MAC header.
1270 		 */
1271 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1272 		mp->b_rptr -= mhip->mhi_hdrsize;
1273 
1274 		/*
1275 		 * Create the DL_UNITDATA_IND M_PROTO.
1276 		 */
1277 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1278 			freemsgchain(mp);
1279 			return;
1280 		}
1281 
1282 		/*
1283 		 * Advance b_rptr to point at the payload (or the VLAN header).
1284 		 */
1285 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1286 
1287 		/*
1288 		 * Prepend the DL_UNITDATA_IND.
1289 		 */
1290 		ud_mp->b_cont = mp;
1291 
1292 		/*
1293 		 * Send the message.
1294 		 */
1295 		if (canputnext(dsp->ds_rq))
1296 			putnext(dsp->ds_rq, ud_mp);
1297 		else
1298 			freemsg(ud_mp);
1299 
1300 		/*
1301 		 * Move on to the next packet in the chain.
1302 		 */
1303 		mp = next;
1304 	} while (mp != NULL);
1305 }
1306 
1307 /*
1308  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1309  */
1310 static void
1311 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1312 {
1313 	mblk_t		*mp;
1314 	dl_notify_ind_t *dlip;
1315 
1316 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1317 		return;
1318 
1319 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1320 	    M_PROTO, 0)) == NULL)
1321 		return;
1322 
1323 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1324 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1325 	dlip->dl_primitive = DL_NOTIFY_IND;
1326 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1327 	dlip->dl_data = max_sdu;
1328 
1329 	qreply(dsp->ds_wq, mp);
1330 }
1331 
1332 /*
1333  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1334  * current state of the interface.
1335  */
1336 void
1337 dld_str_notify_ind(dld_str_t *dsp)
1338 {
1339 	mac_notify_type_t	type;
1340 
1341 	for (type = 0; type < MAC_NNOTE; type++)
1342 		str_notify(dsp, type);
1343 }
1344 
1345 typedef struct dl_unitdata_ind_wrapper {
1346 	dl_unitdata_ind_t	dl_unitdata;
1347 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1348 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1349 } dl_unitdata_ind_wrapper_t;
1350 
1351 /*
1352  * Create a DL_UNITDATA_IND M_PROTO message.
1353  */
1354 static mblk_t *
1355 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1356 {
1357 	mblk_t				*nmp;
1358 	dl_unitdata_ind_wrapper_t	*dlwp;
1359 	dl_unitdata_ind_t		*dlp;
1360 	mac_header_info_t		mhi;
1361 	uint_t				addr_length;
1362 	uint8_t				*daddr;
1363 	uint8_t				*saddr;
1364 
1365 	/*
1366 	 * Get the packet header information.
1367 	 */
1368 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
1369 		return (NULL);
1370 
1371 	/*
1372 	 * Allocate a message large enough to contain the wrapper structure
1373 	 * defined above.
1374 	 */
1375 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1376 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1377 	    DL_UNITDATA_IND)) == NULL)
1378 		return (NULL);
1379 
1380 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1381 
1382 	dlp = &(dlwp->dl_unitdata);
1383 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1384 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1385 
1386 	/*
1387 	 * Copy in the destination address.
1388 	 */
1389 	addr_length = dsp->ds_mip->mi_addr_length;
1390 	daddr = dlwp->dl_dest_addr;
1391 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1392 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1393 
1394 	/*
1395 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1396 	 */
1397 	if (mhi.mhi_istagged && !strip_vlan)
1398 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1399 	else
1400 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1401 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1402 
1403 	/*
1404 	 * If the destination address was multicast or broadcast then the
1405 	 * dl_group_address field should be non-zero.
1406 	 */
1407 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1408 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1409 
1410 	/*
1411 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1412 	 * for example) may not have access to source information.
1413 	 */
1414 	if (mhi.mhi_saddr == NULL) {
1415 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1416 	} else {
1417 		saddr = dlwp->dl_src_addr;
1418 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1419 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1420 
1421 		/*
1422 		 * Set the source DLSAP to the packet ethertype.
1423 		 */
1424 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1425 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1426 	}
1427 
1428 	return (nmp);
1429 }
1430 
1431 /*
1432  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1433  */
1434 static void
1435 str_notify_promisc_on_phys(dld_str_t *dsp)
1436 {
1437 	mblk_t		*mp;
1438 	dl_notify_ind_t	*dlip;
1439 
1440 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1441 		return;
1442 
1443 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1444 	    M_PROTO, 0)) == NULL)
1445 		return;
1446 
1447 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1448 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1449 	dlip->dl_primitive = DL_NOTIFY_IND;
1450 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1451 
1452 	qreply(dsp->ds_wq, mp);
1453 }
1454 
1455 /*
1456  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1457  */
1458 static void
1459 str_notify_promisc_off_phys(dld_str_t *dsp)
1460 {
1461 	mblk_t		*mp;
1462 	dl_notify_ind_t	*dlip;
1463 
1464 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1465 		return;
1466 
1467 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1468 	    M_PROTO, 0)) == NULL)
1469 		return;
1470 
1471 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1472 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1473 	dlip->dl_primitive = DL_NOTIFY_IND;
1474 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1475 
1476 	qreply(dsp->ds_wq, mp);
1477 }
1478 
1479 /*
1480  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1481  */
1482 static void
1483 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1484 {
1485 	mblk_t		*mp;
1486 	dl_notify_ind_t	*dlip;
1487 	uint_t		addr_length;
1488 	uint16_t	ethertype;
1489 
1490 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1491 		return;
1492 
1493 	addr_length = dsp->ds_mip->mi_addr_length;
1494 	if ((mp = mexchange(dsp->ds_wq, NULL,
1495 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1496 	    M_PROTO, 0)) == NULL)
1497 		return;
1498 
1499 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1500 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1501 	dlip->dl_primitive = DL_NOTIFY_IND;
1502 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1503 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1504 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1505 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1506 
1507 	bcopy(addr, &dlip[1], addr_length);
1508 
1509 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1510 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1511 
1512 	qreply(dsp->ds_wq, mp);
1513 }
1514 
1515 /*
1516  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1517  */
1518 static void
1519 str_notify_link_up(dld_str_t *dsp)
1520 {
1521 	mblk_t		*mp;
1522 	dl_notify_ind_t	*dlip;
1523 
1524 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1525 		return;
1526 
1527 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1528 	    M_PROTO, 0)) == NULL)
1529 		return;
1530 
1531 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1532 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1533 	dlip->dl_primitive = DL_NOTIFY_IND;
1534 	dlip->dl_notification = DL_NOTE_LINK_UP;
1535 
1536 	qreply(dsp->ds_wq, mp);
1537 }
1538 
1539 /*
1540  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1541  */
1542 static void
1543 str_notify_link_down(dld_str_t *dsp)
1544 {
1545 	mblk_t		*mp;
1546 	dl_notify_ind_t	*dlip;
1547 
1548 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1549 		return;
1550 
1551 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1552 	    M_PROTO, 0)) == NULL)
1553 		return;
1554 
1555 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1556 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1557 	dlip->dl_primitive = DL_NOTIFY_IND;
1558 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1559 
1560 	qreply(dsp->ds_wq, mp);
1561 }
1562 
1563 /*
1564  * DL_NOTIFY_IND: DL_NOTE_SPEED
1565  */
1566 static void
1567 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1568 {
1569 	mblk_t		*mp;
1570 	dl_notify_ind_t	*dlip;
1571 
1572 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1573 		return;
1574 
1575 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1576 	    M_PROTO, 0)) == NULL)
1577 		return;
1578 
1579 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1580 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1581 	dlip->dl_primitive = DL_NOTIFY_IND;
1582 	dlip->dl_notification = DL_NOTE_SPEED;
1583 	dlip->dl_data = speed;
1584 
1585 	qreply(dsp->ds_wq, mp);
1586 }
1587 
1588 /*
1589  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1590  */
1591 static void
1592 str_notify_capab_reneg(dld_str_t *dsp)
1593 {
1594 	mblk_t		*mp;
1595 	dl_notify_ind_t	*dlip;
1596 
1597 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1598 		return;
1599 
1600 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1601 	    M_PROTO, 0)) == NULL)
1602 		return;
1603 
1604 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1605 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1606 	dlip->dl_primitive = DL_NOTIFY_IND;
1607 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1608 
1609 	qreply(dsp->ds_wq, mp);
1610 }
1611 
1612 /*
1613  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1614  */
1615 static void
1616 str_notify_fastpath_flush(dld_str_t *dsp)
1617 {
1618 	mblk_t		*mp;
1619 	dl_notify_ind_t	*dlip;
1620 
1621 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1622 		return;
1623 
1624 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1625 	    M_PROTO, 0)) == NULL)
1626 		return;
1627 
1628 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1629 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1630 	dlip->dl_primitive = DL_NOTIFY_IND;
1631 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1632 
1633 	qreply(dsp->ds_wq, mp);
1634 }
1635 
1636 /*
1637  * MAC notification callback.
1638  */
1639 void
1640 str_notify(void *arg, mac_notify_type_t type)
1641 {
1642 	dld_str_t		*dsp = (dld_str_t *)arg;
1643 	queue_t			*q = dsp->ds_wq;
1644 	mac_handle_t		mh = dsp->ds_mh;
1645 	mac_client_handle_t	mch = dsp->ds_mch;
1646 	uint8_t			addr[MAXMACADDRLEN];
1647 
1648 	switch (type) {
1649 	case MAC_NOTE_TX:
1650 		qenable(q);
1651 		break;
1652 
1653 	case MAC_NOTE_DEVPROMISC:
1654 		/*
1655 		 * Send the appropriate DL_NOTIFY_IND.
1656 		 */
1657 		if (mac_promisc_get(mh, MAC_DEVPROMISC))
1658 			str_notify_promisc_on_phys(dsp);
1659 		else
1660 			str_notify_promisc_off_phys(dsp);
1661 		break;
1662 
1663 	case MAC_NOTE_UNICST:
1664 		/*
1665 		 * This notification is sent whenever the MAC unicast
1666 		 * address changes.
1667 		 */
1668 		mac_unicast_primary_get(mh, addr);
1669 
1670 		/*
1671 		 * Send the appropriate DL_NOTIFY_IND.
1672 		 */
1673 		str_notify_phys_addr(dsp, addr);
1674 		break;
1675 
1676 	case MAC_NOTE_LINK:
1677 		/*
1678 		 * This notification is sent every time the MAC driver
1679 		 * updates the link state.
1680 		 */
1681 		switch (mac_client_stat_get(mch, MAC_STAT_LINK_STATE)) {
1682 		case LINK_STATE_UP: {
1683 			uint64_t speed;
1684 			/*
1685 			 * The link is up so send the appropriate
1686 			 * DL_NOTIFY_IND.
1687 			 */
1688 			str_notify_link_up(dsp);
1689 
1690 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1691 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1692 			break;
1693 		}
1694 		case LINK_STATE_DOWN:
1695 			/*
1696 			 * The link is down so send the appropriate
1697 			 * DL_NOTIFY_IND.
1698 			 */
1699 			str_notify_link_down(dsp);
1700 			break;
1701 
1702 		default:
1703 			break;
1704 		}
1705 		break;
1706 
1707 	case MAC_NOTE_RESOURCE:
1708 	case MAC_NOTE_CAPAB_CHG:
1709 		/*
1710 		 * This notification is sent whenever the MAC resources
1711 		 * change or capabilities change. We need to renegotiate
1712 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1713 		 */
1714 		str_notify_capab_reneg(dsp);
1715 		break;
1716 
1717 	case MAC_NOTE_SDU_SIZE: {
1718 		uint_t  max_sdu;
1719 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1720 		str_notify_sdu_size(dsp, max_sdu);
1721 		break;
1722 	}
1723 
1724 	case MAC_NOTE_FASTPATH_FLUSH:
1725 		str_notify_fastpath_flush(dsp);
1726 		break;
1727 
1728 	case MAC_NOTE_MARGIN:
1729 		break;
1730 
1731 	case MAC_NOTE_PROMISC:
1732 		break;
1733 
1734 	default:
1735 		ASSERT(B_FALSE);
1736 		break;
1737 	}
1738 }
1739 
1740 /*
1741  * This function is called via a taskq mechansim to process all control
1742  * messages on a per 'dsp' end point.
1743  */
1744 static void
1745 dld_wput_nondata_task(void *arg)
1746 {
1747 	dld_str_t	*dsp = arg;
1748 	mblk_t		*mp;
1749 
1750 	mutex_enter(&dsp->ds_lock);
1751 	while (dsp->ds_pending_head != NULL) {
1752 		mp = dsp->ds_pending_head;
1753 		dsp->ds_pending_head = mp->b_next;
1754 		mp->b_next = NULL;
1755 		if (dsp->ds_pending_head == NULL)
1756 			dsp->ds_pending_tail = NULL;
1757 		mutex_exit(&dsp->ds_lock);
1758 
1759 		switch (DB_TYPE(mp)) {
1760 		case M_PROTO:
1761 		case M_PCPROTO:
1762 			dld_proto(dsp, mp);
1763 			break;
1764 		case M_IOCTL:
1765 			dld_ioc(dsp, mp);
1766 			break;
1767 		default:
1768 			ASSERT(0);
1769 		}
1770 
1771 		mutex_enter(&dsp->ds_lock);
1772 	}
1773 	ASSERT(dsp->ds_pending_tail == NULL);
1774 	dsp->ds_dlpi_pending = 0;
1775 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1776 	mutex_exit(&dsp->ds_lock);
1777 }
1778 
1779 /*
1780  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1781  * thread is started at boot time.
1782  */
1783 static void
1784 dld_taskq_dispatch(void)
1785 {
1786 	callb_cpr_t	cprinfo;
1787 	dld_str_t	*dsp;
1788 
1789 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1790 	    "dld_taskq_dispatch");
1791 	mutex_enter(&dld_taskq_lock);
1792 
1793 	while (!dld_taskq_quit) {
1794 		dsp = list_head(&dld_taskq_list);
1795 		while (dsp != NULL) {
1796 			list_remove(&dld_taskq_list, dsp);
1797 			mutex_exit(&dld_taskq_lock);
1798 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1799 			    dsp, TQ_SLEEP) != 0);
1800 			mutex_enter(&dld_taskq_lock);
1801 			dsp = list_head(&dld_taskq_list);
1802 		}
1803 
1804 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1805 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1806 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1807 	}
1808 
1809 	dld_taskq_done = B_TRUE;
1810 	cv_signal(&dld_taskq_cv);
1811 	CALLB_CPR_EXIT(&cprinfo);
1812 	thread_exit();
1813 }
1814 
1815 /*
1816  * All control operations are serialized on the 'dsp' and are also funneled
1817  * through a taskq mechanism to ensure that subsequent processing has kernel
1818  * context and can safely use cv_wait.
1819  *
1820  * Mechanisms to handle taskq dispatch failures
1821  *
1822  * The only way to be sure that taskq dispatch does not fail is to either
1823  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1824  * some number of entries and make sure that the number of outstanding requests
1825  * are less than that number. We can't use TQ_SLEEP since we don't know the
1826  * context. Nor can we bound the total number of 'dsp' end points. So we are
1827  * unable to use either of the above schemes, and are forced to deal with
1828  * taskq dispatch failures. Note that even dynamic taskq could fail in
1829  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1830  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1831  * framework.
1832  *
1833  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1834  * We also have a single global thread to retry the taskq dispatch. This
1835  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1836  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1837  */
1838 static void
1839 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1840 {
1841 	ASSERT(mp->b_next == NULL);
1842 	mutex_enter(&dsp->ds_lock);
1843 	if (dsp->ds_pending_head != NULL) {
1844 		ASSERT(dsp->ds_dlpi_pending);
1845 		dsp->ds_pending_tail->b_next = mp;
1846 		dsp->ds_pending_tail = mp;
1847 		mutex_exit(&dsp->ds_lock);
1848 		return;
1849 	}
1850 	ASSERT(dsp->ds_pending_tail == NULL);
1851 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
1852 	/*
1853 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
1854 	 * thread is still active and is processing the last message, though
1855 	 * the pending queue has been emptied.
1856 	 */
1857 	if (dsp->ds_dlpi_pending) {
1858 		mutex_exit(&dsp->ds_lock);
1859 		return;
1860 	}
1861 
1862 	dsp->ds_dlpi_pending = 1;
1863 	mutex_exit(&dsp->ds_lock);
1864 
1865 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
1866 	    TQ_NOSLEEP) != 0)
1867 		return;
1868 
1869 	mutex_enter(&dld_taskq_lock);
1870 	list_insert_tail(&dld_taskq_list, dsp);
1871 	cv_signal(&dld_taskq_cv);
1872 	mutex_exit(&dld_taskq_lock);
1873 }
1874 
1875 /*
1876  * Process an M_IOCTL message.
1877  */
1878 static void
1879 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1880 {
1881 	uint_t			cmd;
1882 
1883 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1884 	ASSERT(dsp->ds_type == DLD_DLPI);
1885 
1886 	switch (cmd) {
1887 	case DLIOCNATIVE:
1888 		ioc_native(dsp, mp);
1889 		break;
1890 	case DLIOCMARGININFO:
1891 		ioc_margin(dsp, mp);
1892 		break;
1893 	case DLIOCRAW:
1894 		ioc_raw(dsp, mp);
1895 		break;
1896 	case DLIOCHDRINFO:
1897 		ioc_fast(dsp, mp);
1898 		break;
1899 	default:
1900 		ioc(dsp, mp);
1901 	}
1902 }
1903 
1904 /*
1905  * DLIOCNATIVE
1906  */
1907 static void
1908 ioc_native(dld_str_t *dsp, mblk_t *mp)
1909 {
1910 	queue_t *q = dsp->ds_wq;
1911 	const mac_info_t *mip = dsp->ds_mip;
1912 
1913 	/*
1914 	 * Native mode can be enabled if it's disabled and if the
1915 	 * native media type is different.
1916 	 */
1917 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
1918 		dsp->ds_native = B_TRUE;
1919 
1920 	if (dsp->ds_native)
1921 		miocack(q, mp, 0, mip->mi_nativemedia);
1922 	else
1923 		miocnak(q, mp, 0, ENOTSUP);
1924 }
1925 
1926 /*
1927  * DLIOCMARGININFO
1928  */
1929 static void
1930 ioc_margin(dld_str_t *dsp, mblk_t *mp)
1931 {
1932 	queue_t *q = dsp->ds_wq;
1933 	uint32_t margin;
1934 	int err;
1935 
1936 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1937 		err = EINVAL;
1938 		goto failed;
1939 	}
1940 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
1941 		goto failed;
1942 
1943 	mac_margin_get(dsp->ds_mh, &margin);
1944 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
1945 	miocack(q, mp, sizeof (uint32_t), 0);
1946 	return;
1947 
1948 failed:
1949 	miocnak(q, mp, 0, err);
1950 }
1951 
1952 /*
1953  * DLIOCRAW
1954  */
1955 static void
1956 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1957 {
1958 	queue_t *q = dsp->ds_wq;
1959 	mac_perim_handle_t	mph;
1960 
1961 	if (dsp->ds_mh == NULL) {
1962 		dsp->ds_mode = DLD_RAW;
1963 		miocack(q, mp, 0, 0);
1964 		return;
1965 	}
1966 
1967 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1968 	if (dsp->ds_polling || dsp->ds_direct) {
1969 		mac_perim_exit(mph);
1970 		miocnak(q, mp, 0, EPROTO);
1971 		return;
1972 	}
1973 
1974 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1975 		/*
1976 		 * Set the receive callback.
1977 		 */
1978 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
1979 	}
1980 
1981 	/*
1982 	 * Note that raw mode is enabled.
1983 	 */
1984 	dsp->ds_mode = DLD_RAW;
1985 	mac_perim_exit(mph);
1986 
1987 	miocack(q, mp, 0, 0);
1988 }
1989 
1990 /*
1991  * DLIOCHDRINFO
1992  */
1993 static void
1994 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1995 {
1996 	dl_unitdata_req_t *dlp;
1997 	off_t		off;
1998 	size_t		len;
1999 	const uint8_t	*addr;
2000 	uint16_t	sap;
2001 	mblk_t		*nmp;
2002 	mblk_t		*hmp;
2003 	uint_t		addr_length;
2004 	queue_t		*q = dsp->ds_wq;
2005 	int		err;
2006 	mac_perim_handle_t	mph;
2007 
2008 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2009 		err = ENOTSUP;
2010 		goto failed;
2011 	}
2012 
2013 	/*
2014 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2015 	 * user-land should not be allowed.
2016 	 */
2017 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2018 		err = EINVAL;
2019 		goto failed;
2020 	}
2021 
2022 	nmp = mp->b_cont;
2023 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2024 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2025 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2026 		err = EINVAL;
2027 		goto failed;
2028 	}
2029 
2030 	off = dlp->dl_dest_addr_offset;
2031 	len = dlp->dl_dest_addr_length;
2032 
2033 	if (!MBLKIN(nmp, off, len)) {
2034 		err = EINVAL;
2035 		goto failed;
2036 	}
2037 
2038 	if (dsp->ds_dlstate != DL_IDLE) {
2039 		err = ENOTSUP;
2040 		goto failed;
2041 	}
2042 
2043 	addr_length = dsp->ds_mip->mi_addr_length;
2044 	if (len != addr_length + sizeof (uint16_t)) {
2045 		err = EINVAL;
2046 		goto failed;
2047 	}
2048 
2049 	addr = nmp->b_rptr + off;
2050 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2051 
2052 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2053 		err = ENOMEM;
2054 		goto failed;
2055 	}
2056 
2057 	/*
2058 	 * This ioctl might happen concurrently with a direct call to dld_capab
2059 	 * that tries to enable direct and/or poll capabilities. Since the
2060 	 * stack does not serialize them, we do so here to avoid mixing
2061 	 * the callbacks.
2062 	 */
2063 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2064 	if (dsp->ds_mode != DLD_FASTPATH) {
2065 		/*
2066 		 * Set the receive callback (unless polling is enabled).
2067 		 */
2068 		if (!dsp->ds_polling && !dsp->ds_direct)
2069 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2070 
2071 		/*
2072 		 * Note that fast-path mode is enabled.
2073 		 */
2074 		dsp->ds_mode = DLD_FASTPATH;
2075 	}
2076 	mac_perim_exit(mph);
2077 
2078 	freemsg(nmp->b_cont);
2079 	nmp->b_cont = hmp;
2080 
2081 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2082 	return;
2083 failed:
2084 	miocnak(q, mp, 0, err);
2085 }
2086 
2087 /*
2088  * Catch-all handler.
2089  */
2090 static void
2091 ioc(dld_str_t *dsp, mblk_t *mp)
2092 {
2093 	queue_t	*q = dsp->ds_wq;
2094 
2095 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2096 		miocnak(q, mp, 0, EINVAL);
2097 		return;
2098 	}
2099 	mac_ioctl(dsp->ds_mh, q, mp);
2100 }
2101