xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision 3589c4f01c20349ca65899d209cdc0c17a641433)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<inet/common.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/cpuvar.h>
37 #include	<sys/callb.h>
38 #include	<sys/list.h>
39 #include	<sys/mac_client.h>
40 #include	<sys/mac_client_priv.h>
41 
/*
 * Forward declarations for functions that are private to this file.
 */

/* kmem cache callbacks registered in dld_str_init() */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);

/* dld_wput_nondata() is what dld_wput() hands M_IOCTL and non-data DLPI to */
static void	ioc_native(dld_str_t *,  mblk_t *);
static void	ioc_margin(dld_str_t *, mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static void	dld_wput_nondata(dld_str_t *, mblk_t *);

/* M_DATA transmit helper and Ethernet VLAN header manipulation */
static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
    link_tagmode_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);
65 
/*
 * str_count:  number of dld_str_t objects currently handed out; bumped in
 *             dld_str_create(), dropped in dld_str_destroy(), and checked
 *             by dld_str_fini() before tearing anything down.
 * str_cachep: kmem cache of dld_str_t objects (created in dld_str_init()).
 * str_hashp:  id hash of live dld_str_t's, keyed by ds_minor.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static mod_hash_t	*str_hashp;

/* Size passed to mod_hash_create_idhash() and the key conversion helper. */
#define	STR_HASHSZ		64
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* dld has no private taskq; dld_taskq is an alias for system_taskq. */
#define	dld_taskq	system_taskq

/*
 * State shared with the thread started in dld_str_init() (entry point
 * dld_taskq_dispatch()).  dld_str_fini() sets dld_taskq_quit under
 * dld_taskq_lock, signals dld_taskq_cv, and then waits on the same cv
 * until dld_taskq_done becomes true.
 * NOTE(review): dld_taskq_done is presumably set by dld_taskq_dispatch(),
 * which is outside this view -- confirm.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

static void		dld_taskq_dispatch(void);
82 
83 /*
84  * Some notes on entry points, flow-control, queueing.
85  *
86  * This driver exports the traditional STREAMS put entry point as well as
87  * the non-STREAMS fast-path transmit routine which is provided to IP via
88  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
89  * and data operations, while the fast-path routine deals only with M_DATA
90  * fast-path packets.  Regardless of the entry point, all outbound packets
91  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
92  *
93  * The transmit logic operates in the following way: All packets coming
94  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
95  * happens when the MAC layer indicates the packets couldn't be
96  * transmitted due to 1) lack of resources (e.g. running out of
97  * descriptors),  or 2) reaching the allowed bandwidth limit for this
98  * particular flow. The indication comes in the form of a Tx cookie that
99  * identifies the blocked ring. In such case, DLD will place a
100  * dummy message on its write-side STREAMS queue so that the queue is
101  * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer, where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
104  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
105  * When the write service procedure runs, it will remove the dummy
106  * message from the write-side STREAMS queue; in effect this will trigger
107  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
108  * respectively, due to the above reasons.
109  *
110  * All non-data operations, both DLPI and ioctls are single threaded on a per
111  * dld_str_t endpoint. This is done using a taskq so that the control operation
112  * has kernel context and can cv_wait for resources. In addition all set type
113  * operations that involve mac level state modification are serialized on a
114  * per mac end point using the perimeter mechanism provided by the mac layer.
115  * This serializes all mac clients trying to modify a single mac end point over
116  * the entire sequence of mac calls made by that client as an atomic unit. The
117  * mac framework locking is described in mac.c. A critical element is that
118  * DLD/DLS does not hold any locks across the mac perimeter.
119  *
120  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
121  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
122  * match dev_t. If a stream is found and it is attached, its dev_info_t *
123  * is returned. If the mac handle is non-null, it can be safely accessed
124  * below. The mac handle won't be freed until the mac_unregister which
125  * won't happen until the driver detaches. The DDI framework ensures that
126  * the detach won't happen while a getinfo is in progress.
127  */
/*
 * Search state handed to i_dld_str_walker() by dld_finddevinfo(): the
 * major/minor pair being looked for and the slot the result is stored in.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* major number to match */
	minor_t		ds_minor;	/* minor number to match (non-zero) */
	dev_info_t	*ds_dip;	/* result; left NULL if nothing found */
} i_dld_str_state_t;
133 
134 /* ARGSUSED */
135 static uint_t
136 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
137 {
138 	i_dld_str_state_t	*statep = arg;
139 	dld_str_t		*dsp = (dld_str_t *)val;
140 	mac_handle_t		mh;
141 
142 	if (statep->ds_major != dsp->ds_major)
143 		return (MH_WALK_CONTINUE);
144 
145 	ASSERT(statep->ds_minor != 0);
146 	mh = dsp->ds_mh;
147 
148 	if (statep->ds_minor == dsp->ds_minor) {
149 		/*
150 		 * Clone: a clone minor is unique. we can terminate the
151 		 * walk if we find a matching stream -- even if we fail
152 		 * to obtain the devinfo.
153 		 */
154 		if (mh != NULL)
155 			statep->ds_dip = mac_devinfo_get(mh);
156 		return (MH_WALK_TERMINATE);
157 	}
158 	return (MH_WALK_CONTINUE);
159 }
160 
161 static dev_info_t *
162 dld_finddevinfo(dev_t dev)
163 {
164 	dev_info_t		*dip;
165 	i_dld_str_state_t	state;
166 
167 	if (getminor(dev) == 0)
168 		return (NULL);
169 
170 	/*
171 	 * See if it's a minor node of a link
172 	 */
173 	if ((dip = dls_link_devinfo(dev)) != NULL)
174 		return (dip);
175 
176 	state.ds_minor = getminor(dev);
177 	state.ds_major = getmajor(dev);
178 	state.ds_dip = NULL;
179 
180 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
181 	return (state.ds_dip);
182 }
183 
184 /*
185  * devo_getinfo: getinfo(9e)
186  */
187 /*ARGSUSED*/
188 int
189 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
190 {
191 	dev_info_t	*devinfo;
192 	minor_t		minor = getminor((dev_t)arg);
193 	int		rc = DDI_FAILURE;
194 
195 	switch (cmd) {
196 	case DDI_INFO_DEVT2DEVINFO:
197 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
198 			*(dev_info_t **)resp = devinfo;
199 			rc = DDI_SUCCESS;
200 		}
201 		break;
202 	case DDI_INFO_DEVT2INSTANCE:
203 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
204 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
205 			rc = DDI_SUCCESS;
206 		} else if (minor > DLS_MAX_MINOR &&
207 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
208 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
209 			rc = DDI_SUCCESS;
210 		}
211 		break;
212 	}
213 	return (rc);
214 }
215 
216 /*
217  * qi_qopen: open(9e)
218  */
219 /*ARGSUSED*/
220 int
221 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
222 {
223 	dld_str_t	*dsp;
224 	major_t		major;
225 	minor_t		minor;
226 	int		err;
227 
228 	if (sflag == MODOPEN)
229 		return (ENOTSUP);
230 
231 	/*
232 	 * This is a cloning driver and therefore each queue should only
233 	 * ever get opened once.
234 	 */
235 	if (rq->q_ptr != NULL)
236 		return (EBUSY);
237 
238 	major = getmajor(*devp);
239 	minor = getminor(*devp);
240 
241 	/*
242 	 * Create a new dld_str_t for the stream. This will grab a new minor
243 	 * number that will be handed back in the cloned dev_t.  Creation may
244 	 * fail if we can't allocate the dummy mblk used for flow-control.
245 	 */
246 	dsp = dld_str_create(rq, DLD_DLPI, major,
247 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
248 	if (dsp == NULL)
249 		return (ENOSR);
250 
251 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
252 	if (minor != 0) {
253 		/*
254 		 * Style 1 open
255 		 */
256 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
257 			goto failed;
258 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
259 	} else {
260 		(void) qassociate(rq, -1);
261 	}
262 
263 	/*
264 	 * Enable the queue srv(9e) routine.
265 	 */
266 	qprocson(rq);
267 
268 	/*
269 	 * Construct a cloned dev_t to hand back.
270 	 */
271 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
272 	return (0);
273 
274 failed:
275 	dld_str_destroy(dsp);
276 	return (err);
277 }
278 
279 /*
280  * qi_qclose: close(9e)
281  */
282 int
283 dld_close(queue_t *rq)
284 {
285 	dld_str_t	*dsp = rq->q_ptr;
286 
287 	/*
288 	 * All modules on top have been popped off. So there can't be any
289 	 * threads from the top.
290 	 */
291 	ASSERT(dsp->ds_datathr_cnt == 0);
292 
293 	/*
294 	 * Wait until pending DLPI requests are processed.
295 	 */
296 	mutex_enter(&dsp->ds_lock);
297 	while (dsp->ds_dlpi_pending)
298 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
299 	mutex_exit(&dsp->ds_lock);
300 
301 	/*
302 	 * Disable the queue srv(9e) routine.
303 	 */
304 	qprocsoff(rq);
305 
306 
307 	/*
308 	 * This stream was open to a provider node. Check to see
309 	 * if it has been cleanly shut down.
310 	 */
311 	if (dsp->ds_dlstate != DL_UNATTACHED) {
312 		/*
313 		 * The stream is either open to a style 1 provider or
314 		 * this is not clean shutdown. Detach from the PPA.
315 		 * (This is still ok even in the style 1 case).
316 		 */
317 		dld_str_detach(dsp);
318 	}
319 
320 	dld_str_destroy(dsp);
321 	return (0);
322 }
323 
324 /*
325  * qi_qputp: put(9e)
326  */
327 void
328 dld_wput(queue_t *wq, mblk_t *mp)
329 {
330 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
331 	dld_str_mode_t	mode;
332 
333 	switch (DB_TYPE(mp)) {
334 	case M_DATA:
335 		mutex_enter(&dsp->ds_lock);
336 		mode = dsp->ds_mode;
337 		if ((dsp->ds_dlstate != DL_IDLE) ||
338 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
339 			mutex_exit(&dsp->ds_lock);
340 			freemsg(mp);
341 			break;
342 		}
343 
344 		DLD_DATATHR_INC(dsp);
345 		mutex_exit(&dsp->ds_lock);
346 		if (mode == DLD_FASTPATH) {
347 			if (dsp->ds_mip->mi_media == DL_ETHER &&
348 			    (MBLKL(mp) < sizeof (struct ether_header))) {
349 				freemsg(mp);
350 			} else {
351 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
352 			}
353 		} else {
354 			str_mdata_raw_put(dsp, mp);
355 		}
356 		DLD_DATATHR_DCR(dsp);
357 		break;
358 	case M_PROTO:
359 	case M_PCPROTO: {
360 		t_uscalar_t	prim;
361 
362 		if (MBLKL(mp) < sizeof (t_uscalar_t))
363 			break;
364 
365 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
366 
367 		if (prim == DL_UNITDATA_REQ) {
368 			proto_unitdata_req(dsp, mp);
369 		} else {
370 			dld_wput_nondata(dsp, mp);
371 		}
372 		break;
373 	}
374 
375 	case M_IOCTL:
376 		dld_wput_nondata(dsp, mp);
377 		break;
378 
379 	case M_FLUSH:
380 		if (*mp->b_rptr & FLUSHW) {
381 			DLD_CLRQFULL(dsp);
382 			*mp->b_rptr &= ~FLUSHW;
383 		}
384 
385 		if (*mp->b_rptr & FLUSHR) {
386 			qreply(wq, mp);
387 		} else {
388 			freemsg(mp);
389 		}
390 		break;
391 
392 	default:
393 		freemsg(mp);
394 		break;
395 	}
396 }
397 
398 /*
399  * qi_srvp: srv(9e)
400  */
401 void
402 dld_wsrv(queue_t *wq)
403 {
404 	dld_str_t	*dsp = wq->q_ptr;
405 
406 	DLD_CLRQFULL(dsp);
407 }
408 
409 void
410 dld_init_ops(struct dev_ops *ops, const char *name)
411 {
412 	struct streamtab *stream;
413 	struct qinit *rq, *wq;
414 	struct module_info *modinfo;
415 
416 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
417 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
418 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
419 	modinfo->mi_minpsz = 0;
420 	modinfo->mi_maxpsz = 64*1024;
421 	modinfo->mi_hiwat  = 1;
422 	modinfo->mi_lowat = 0;
423 
424 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
425 	rq->qi_qopen = dld_open;
426 	rq->qi_qclose = dld_close;
427 	rq->qi_minfo = modinfo;
428 
429 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
430 	wq->qi_putp = (pfi_t)dld_wput;
431 	wq->qi_srvp = (pfi_t)dld_wsrv;
432 	wq->qi_minfo = modinfo;
433 
434 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
435 	stream->st_rdinit = rq;
436 	stream->st_wrinit = wq;
437 	ops->devo_cb_ops->cb_str = stream;
438 
439 	if (ops->devo_getinfo == NULL)
440 		ops->devo_getinfo = &dld_getinfo;
441 }
442 
443 void
444 dld_fini_ops(struct dev_ops *ops)
445 {
446 	struct streamtab *stream;
447 	struct qinit *rq, *wq;
448 	struct module_info *modinfo;
449 
450 	stream = ops->devo_cb_ops->cb_str;
451 	rq = stream->st_rdinit;
452 	wq = stream->st_wrinit;
453 	modinfo = rq->qi_minfo;
454 	ASSERT(wq->qi_minfo == modinfo);
455 
456 	kmem_free(stream, sizeof (struct streamtab));
457 	kmem_free(wq, sizeof (struct qinit));
458 	kmem_free(rq, sizeof (struct qinit));
459 	kmem_free(modinfo->mi_idname, FMNAMESZ);
460 	kmem_free(modinfo, sizeof (struct module_info));
461 }
462 
463 /*
464  * Initialize this module's data structures.
465  */
466 void
467 dld_str_init(void)
468 {
469 	/*
470 	 * Create dld_str_t object cache.
471 	 */
472 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
473 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
474 	ASSERT(str_cachep != NULL);
475 
476 	/*
477 	 * Create a hash table for maintaining dld_str_t's.
478 	 * The ds_minor field (the clone minor number) of a dld_str_t
479 	 * is used as a key for this hash table because this number is
480 	 * globally unique (allocated from "dls_minor_arena").
481 	 */
482 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
483 	    mod_hash_null_valdtor);
484 
485 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
486 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
487 
488 	dld_taskq_quit = B_FALSE;
489 	dld_taskq_done = B_FALSE;
490 	list_create(&dld_taskq_list, sizeof (dld_str_t),
491 	    offsetof(dld_str_t, ds_tqlist));
492 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
493 	    &p0, TS_RUN, minclsyspri);
494 }
495 
496 /*
497  * Tear down this module's data structures.
498  */
499 int
500 dld_str_fini(void)
501 {
502 	/*
503 	 * Make sure that there are no objects in use.
504 	 */
505 	if (str_count != 0)
506 		return (EBUSY);
507 
508 	/*
509 	 * Ask the dld_taskq thread to quit and wait for it to be done
510 	 */
511 	mutex_enter(&dld_taskq_lock);
512 	dld_taskq_quit = B_TRUE;
513 	cv_signal(&dld_taskq_cv);
514 	while (!dld_taskq_done)
515 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
516 	mutex_exit(&dld_taskq_lock);
517 	list_destroy(&dld_taskq_list);
518 	/*
519 	 * Destroy object cache.
520 	 */
521 	kmem_cache_destroy(str_cachep);
522 	mod_hash_destroy_idhash(str_hashp);
523 	return (0);
524 }
525 
526 /*
527  * Create a new dld_str_t object.
528  */
529 dld_str_t *
530 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
531 {
532 	dld_str_t	*dsp;
533 	int		err;
534 
535 	/*
536 	 * Allocate an object from the cache.
537 	 */
538 	atomic_add_32(&str_count, 1);
539 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
540 
541 	/*
542 	 * Allocate the dummy mblk for flow-control.
543 	 */
544 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
545 	if (dsp->ds_tx_flow_mp == NULL) {
546 		kmem_cache_free(str_cachep, dsp);
547 		atomic_add_32(&str_count, -1);
548 		return (NULL);
549 	}
550 	dsp->ds_type = type;
551 	dsp->ds_major = major;
552 	dsp->ds_style = style;
553 
554 	/*
555 	 * Initialize the queue pointers.
556 	 */
557 	ASSERT(RD(rq) == rq);
558 	dsp->ds_rq = rq;
559 	dsp->ds_wq = WR(rq);
560 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
561 
562 	/*
563 	 * We want explicit control over our write-side STREAMS queue
564 	 * where the dummy mblk gets added/removed for flow-control.
565 	 */
566 	noenable(WR(rq));
567 
568 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
569 	    (mod_hash_val_t)dsp);
570 	ASSERT(err == 0);
571 	return (dsp);
572 }
573 
574 /*
575  * Destroy a dld_str_t object.
576  */
577 void
578 dld_str_destroy(dld_str_t *dsp)
579 {
580 	queue_t		*rq;
581 	queue_t		*wq;
582 	mod_hash_val_t	val;
583 
584 	/*
585 	 * Clear the queue pointers.
586 	 */
587 	rq = dsp->ds_rq;
588 	wq = dsp->ds_wq;
589 	ASSERT(wq == WR(rq));
590 	rq->q_ptr = wq->q_ptr = NULL;
591 	dsp->ds_rq = dsp->ds_wq = NULL;
592 
593 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
594 	ASSERT(dsp->ds_sap == 0);
595 	ASSERT(dsp->ds_mh == NULL);
596 	ASSERT(dsp->ds_mch == NULL);
597 	ASSERT(dsp->ds_promisc == 0);
598 	ASSERT(dsp->ds_mph == NULL);
599 	ASSERT(dsp->ds_mip == NULL);
600 	ASSERT(dsp->ds_mnh == NULL);
601 
602 	ASSERT(dsp->ds_polling == B_FALSE);
603 	ASSERT(dsp->ds_direct == B_FALSE);
604 	ASSERT(dsp->ds_lso == B_FALSE);
605 	ASSERT(dsp->ds_lso_max == 0);
606 
607 	/*
608 	 * Reinitialize all the flags.
609 	 */
610 	dsp->ds_notifications = 0;
611 	dsp->ds_passivestate = DLD_UNINITIALIZED;
612 	dsp->ds_mode = DLD_UNITDATA;
613 	dsp->ds_native = B_FALSE;
614 
615 	ASSERT(dsp->ds_datathr_cnt == 0);
616 	ASSERT(dsp->ds_pending_head == NULL);
617 	ASSERT(dsp->ds_pending_tail == NULL);
618 	ASSERT(!dsp->ds_dlpi_pending);
619 
620 	ASSERT(dsp->ds_dlp == NULL);
621 	ASSERT(dsp->ds_dmap == NULL);
622 	ASSERT(dsp->ds_rx == NULL);
623 	ASSERT(dsp->ds_rx_arg == NULL);
624 	ASSERT(dsp->ds_next == NULL);
625 	ASSERT(dsp->ds_head == NULL);
626 
627 	/*
628 	 * Free the dummy mblk if exists.
629 	 */
630 	if (dsp->ds_tx_flow_mp != NULL) {
631 		freeb(dsp->ds_tx_flow_mp);
632 		dsp->ds_tx_flow_mp = NULL;
633 	}
634 
635 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
636 	ASSERT(dsp == (dld_str_t *)val);
637 
638 	/*
639 	 * Free the object back to the cache.
640 	 */
641 	kmem_cache_free(str_cachep, dsp);
642 	atomic_add_32(&str_count, -1);
643 }
644 
645 /*
646  * kmem_cache contructor function: see kmem_cache_create(9f).
647  */
648 /*ARGSUSED*/
649 static int
650 str_constructor(void *buf, void *cdrarg, int kmflags)
651 {
652 	dld_str_t	*dsp = buf;
653 
654 	bzero(buf, sizeof (dld_str_t));
655 
656 	/*
657 	 * Allocate a new minor number.
658 	 */
659 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
660 		return (-1);
661 
662 	/*
663 	 * Initialize the DLPI state machine.
664 	 */
665 	dsp->ds_dlstate = DL_UNATTACHED;
666 
667 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
668 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
669 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
670 
671 	return (0);
672 }
673 
674 /*
675  * kmem_cache destructor function.
676  */
677 /*ARGSUSED*/
678 static void
679 str_destructor(void *buf, void *cdrarg)
680 {
681 	dld_str_t	*dsp = buf;
682 
683 	/*
684 	 * Release the minor number.
685 	 */
686 	mac_minor_rele(dsp->ds_minor);
687 
688 	ASSERT(dsp->ds_tx_flow_mp == NULL);
689 
690 	mutex_destroy(&dsp->ds_lock);
691 	cv_destroy(&dsp->ds_datathr_cv);
692 	cv_destroy(&dsp->ds_dlpi_pending_cv);
693 }
694 
695 /*
696  * Update the priority bits and VID (may need to insert tag if mp points
697  * to an untagged packet.
698  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
699  */
700 static mblk_t *
701 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
702     link_tagmode_t tagmode)
703 {
704 	mblk_t *hmp;
705 	struct ether_vlan_header *evhp;
706 	struct ether_header *ehp;
707 	uint16_t old_tci = 0;
708 	size_t len;
709 
710 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
711 
712 	evhp = (struct ether_vlan_header *)mp->b_rptr;
713 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
714 		/*
715 		 * Tagged packet, update the priority bits.
716 		 */
717 		len = sizeof (struct ether_vlan_header);
718 
719 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
720 			/*
721 			 * In case some drivers only check the db_ref
722 			 * count of the first mblk, we pullup the
723 			 * message into a single mblk.
724 			 */
725 			hmp = msgpullup(mp, -1);
726 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
727 				freemsg(hmp);
728 				return (NULL);
729 			} else {
730 				freemsg(mp);
731 				mp = hmp;
732 			}
733 		}
734 
735 		evhp = (struct ether_vlan_header *)mp->b_rptr;
736 		old_tci = ntohs(evhp->ether_tci);
737 	} else {
738 		/*
739 		 * Untagged packet.  Two factors will cause us to insert a
740 		 * VLAN header:
741 		 * - This is a VLAN link (vid is specified)
742 		 * - The link supports user priority tagging and the priority
743 		 *   is non-zero.
744 		 */
745 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
746 			return (mp);
747 
748 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
749 		if (hmp == NULL)
750 			return (NULL);
751 
752 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
753 		ehp = (struct ether_header *)mp->b_rptr;
754 
755 		/*
756 		 * Copy the MAC addresses and typelen
757 		 */
758 		bcopy(ehp, evhp, (ETHERADDRL * 2));
759 		evhp->ether_type = ehp->ether_type;
760 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
761 
762 		hmp->b_wptr += sizeof (struct ether_vlan_header);
763 		mp->b_rptr += sizeof (struct ether_header);
764 
765 		/*
766 		 * Free the original message if it's now empty. Link the
767 		 * rest of the messages to the header message.
768 		 */
769 		if (MBLKL(mp) == 0) {
770 			hmp->b_cont = mp->b_cont;
771 			freeb(mp);
772 		} else {
773 			hmp->b_cont = mp;
774 		}
775 		mp = hmp;
776 	}
777 
778 	if (pri == 0)
779 		pri = VLAN_PRI(old_tci);
780 	if (vid == VLAN_ID_NONE)
781 		vid = VLAN_ID(old_tci);
782 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
783 	return (mp);
784 }
785 
786 /*
787  * M_DATA put (IP fast-path mode)
788  */
789 mac_tx_cookie_t
790 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
791     uint16_t flag)
792 {
793 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
794 	mblk_t *newmp;
795 	uint_t pri;
796 	mac_tx_cookie_t cookie;
797 
798 	if (is_ethernet) {
799 		/*
800 		 * Update the priority bits to the assigned priority.
801 		 */
802 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
803 
804 		if (pri != 0) {
805 			newmp = i_dld_ether_header_update_tag(mp, pri,
806 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
807 			if (newmp == NULL)
808 				goto discard;
809 			mp = newmp;
810 		}
811 	}
812 
813 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
814 		DLD_SETQFULL(dsp);
815 	}
816 	return (cookie);
817 
818 discard:
819 	/* TODO: bump kstat? */
820 	freemsg(mp);
821 	return (NULL);
822 }
823 
824 /*
825  * M_DATA put (DLIOCRAW mode)
826  */
827 static void
828 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
829 {
830 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
831 	mblk_t *bp, *newmp;
832 	size_t size;
833 	mac_header_info_t mhi;
834 	uint_t pri, vid, dvid;
835 	uint_t max_sdu;
836 
837 	/*
838 	 * Certain MAC type plugins provide an illusion for raw DLPI
839 	 * consumers.  They pretend that the MAC layer is something that
840 	 * it's not for the benefit of observability tools.  For example,
841 	 * mac_wifi pretends that it's Ethernet for such consumers.
842 	 * Here, unless native mode is enabled, we call into the MAC layer so
843 	 * that this illusion can be maintained.  The plugin will optionally
844 	 * transform the MAC header here into something that can be passed
845 	 * down.  The header goes from raw mode to "cooked" mode.
846 	 */
847 	if (!dsp->ds_native) {
848 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
849 			goto discard;
850 		mp = newmp;
851 	}
852 
853 	size = MBLKL(mp);
854 
855 	/*
856 	 * Check the packet is not too big and that any remaining
857 	 * fragment list is composed entirely of M_DATA messages. (We
858 	 * know the first fragment was M_DATA otherwise we could not
859 	 * have got here).
860 	 */
861 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
862 		if (DB_TYPE(bp) != M_DATA)
863 			goto discard;
864 		size += MBLKL(bp);
865 	}
866 
867 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
868 		goto discard;
869 
870 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
871 	/*
872 	 * If LSO is enabled, check the size against lso_max. Otherwise,
873 	 * compare the packet size with max_sdu.
874 	 */
875 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
876 	if (size > max_sdu + mhi.mhi_hdrsize)
877 		goto discard;
878 
879 	if (is_ethernet) {
880 		dvid = mac_client_vid(dsp->ds_mch);
881 
882 		/*
883 		 * Discard the packet if this is a VLAN stream but the VID in
884 		 * the packet is not correct.
885 		 */
886 		vid = VLAN_ID(mhi.mhi_tci);
887 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
888 			goto discard;
889 
890 		/*
891 		 * Discard the packet if this packet is a tagged packet
892 		 * but both pri and VID are 0.
893 		 */
894 		pri = VLAN_PRI(mhi.mhi_tci);
895 		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
896 			goto discard;
897 
898 		/*
899 		 * Update the priority bits to the per-stream priority if
900 		 * priority is not set in the packet. Update the VID for
901 		 * packets on a VLAN stream.
902 		 */
903 		pri = (pri == 0) ? dsp->ds_pri : 0;
904 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
905 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
906 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
907 				goto discard;
908 			}
909 			mp = newmp;
910 		}
911 	}
912 
913 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
914 		/* Turn on flow-control for dld */
915 		DLD_SETQFULL(dsp);
916 	}
917 	return;
918 
919 discard:
920 	/* TODO: bump kstat? */
921 	freemsg(mp);
922 }
923 
924 /*
925  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
926  */
927 int
928 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
929 {
930 	dev_t			dev;
931 	int			err;
932 	const char		*drvname;
933 	mac_perim_handle_t	mph;
934 	boolean_t		qassociated = B_FALSE;
935 	dls_link_t		*dlp = NULL;
936 	dls_dl_handle_t		ddp = NULL;
937 	boolean_t		entered_perim = B_FALSE;
938 
939 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
940 		return (EINVAL);
941 
942 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
943 		return (ENOTSUP);
944 
945 	/*
946 	 * /dev node access. This will still be supported for backward
947 	 * compatibility reason.
948 	 */
949 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
950 	    (strcmp(drvname, "vnic") != 0)) {
951 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
952 			return (EINVAL);
953 		qassociated = B_TRUE;
954 	}
955 
956 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
957 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
958 		goto failed;
959 
960 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
961 		goto failed;
962 	entered_perim = B_TRUE;
963 
964 	/*
965 	 * Open a channel.
966 	 */
967 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
968 		goto failed;
969 
970 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
971 		goto failed;
972 
973 	/*
974 	 * Set the default packet priority.
975 	 */
976 	dsp->ds_pri = 0;
977 
978 	/*
979 	 * Add a notify function so that the we get updates from the MAC.
980 	 */
981 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
982 	dsp->ds_dlstate = DL_UNBOUND;
983 	mac_perim_exit(mph);
984 	return (0);
985 
986 failed:
987 	if (dlp != NULL)
988 		dls_link_rele(dlp);
989 	if (entered_perim)
990 		mac_perim_exit(mph);
991 	if (ddp != NULL)
992 		dls_devnet_rele(ddp);
993 	if (qassociated)
994 		(void) qassociate(dsp->ds_wq, -1);
995 
996 	return (err);
997 }
998 
999 /*
1000  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1001  * from close(2) for style 2.
1002  */
1003 void
1004 dld_str_detach(dld_str_t *dsp)
1005 {
1006 	mac_perim_handle_t	mph;
1007 	int			err;
1008 
1009 	ASSERT(dsp->ds_datathr_cnt == 0);
1010 
1011 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1012 	/*
1013 	 * Remove the notify function.
1014 	 *
1015 	 * Note that we cannot wait for the notification callback to be removed
1016 	 * since it could cause the deadlock with str_notify() since they both
1017 	 * need the mac perimeter. Continue if we cannot remove the
1018 	 * notification callback right now and wait after we leave the
1019 	 * perimeter.
1020 	 */
1021 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1022 	dsp->ds_mnh = NULL;
1023 
1024 	/*
1025 	 * Disable the capabilities
1026 	 */
1027 	dld_capabilities_disable(dsp);
1028 
1029 	/*
1030 	 * Clear LSO flags.
1031 	 */
1032 	dsp->ds_lso = B_FALSE;
1033 	dsp->ds_lso_max = 0;
1034 
1035 	dls_close(dsp);
1036 	mac_perim_exit(mph);
1037 
1038 	/*
1039 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
1040 	 * because the notification callback was in progress, wait for
1041 	 * it to finish before we proceed.
1042 	 */
1043 	if (err != 0)
1044 		mac_notify_remove_wait(dsp->ds_mh);
1045 
1046 	/*
1047 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
1048 	 * automatically in the call to dls_devnet_rele.
1049 	 */
1050 	dls_devnet_rele(dsp->ds_ddh);
1051 
1052 	dsp->ds_sap = 0;
1053 	dsp->ds_mh = NULL;
1054 	dsp->ds_mch = NULL;
1055 	dsp->ds_mip = NULL;
1056 
1057 	if (dsp->ds_style == DL_STYLE2)
1058 		(void) qassociate(dsp->ds_wq, -1);
1059 
1060 	/*
1061 	 * Re-initialize the DLPI state machine.
1062 	 */
1063 	dsp->ds_dlstate = DL_UNATTACHED;
1064 }
1065 
1066 /*
1067  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1068  * tags before sending packets up to the DLS clients, with the exception of
1069  * special priority tagged packets, in that case, we set the VID to 0.
1070  * mp must be a VLAN tagged packet.
1071  */
1072 static mblk_t *
1073 i_dld_ether_header_strip_tag(mblk_t *mp)
1074 {
1075 	mblk_t *newmp;
1076 	struct ether_vlan_header *evhp;
1077 	uint16_t tci, new_tci;
1078 
1079 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1080 	if (DB_REF(mp) > 1) {
1081 		newmp = copymsg(mp);
1082 		if (newmp == NULL)
1083 			return (NULL);
1084 		freemsg(mp);
1085 		mp = newmp;
1086 	}
1087 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1088 
1089 	tci = ntohs(evhp->ether_tci);
1090 	if (VLAN_PRI(tci) == 0) {
1091 		/*
1092 		 * Priority is 0, strip the tag.
1093 		 */
1094 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1095 		mp->b_rptr += VLAN_TAGSZ;
1096 	} else {
1097 		/*
1098 		 * Priority is not 0, update the VID to 0.
1099 		 */
1100 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1101 		evhp->ether_tci = htons(new_tci);
1102 	}
1103 	return (mp);
1104 }
1105 
1106 /*
1107  * Raw mode receive function.
1108  */
1109 /*ARGSUSED*/
1110 void
1111 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1112     mac_header_info_t *mhip)
1113 {
1114 	dld_str_t *dsp = (dld_str_t *)arg;
1115 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1116 	mblk_t *next, *newmp;
1117 
1118 	ASSERT(mp != NULL);
1119 	do {
1120 		/*
1121 		 * Get the pointer to the next packet in the chain and then
1122 		 * clear b_next before the packet gets passed on.
1123 		 */
1124 		next = mp->b_next;
1125 		mp->b_next = NULL;
1126 
1127 		/*
1128 		 * Wind back b_rptr to point at the MAC header.
1129 		 */
1130 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1131 		mp->b_rptr -= mhip->mhi_hdrsize;
1132 
1133 		/*
1134 		 * Certain MAC type plugins provide an illusion for raw
1135 		 * DLPI consumers.  They pretend that the MAC layer is
1136 		 * something that it's not for the benefit of observability
1137 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1138 		 * for such consumers.	Here, unless native mode is enabled,
1139 		 * we call into the MAC layer so that this illusion can be
1140 		 * maintained.	The plugin will optionally transform the MAC
1141 		 * header here into something that can be passed up to raw
1142 		 * consumers.  The header goes from "cooked" mode to raw mode.
1143 		 */
1144 		if (!dsp->ds_native) {
1145 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1146 			if (newmp == NULL) {
1147 				freemsg(mp);
1148 				goto next;
1149 			}
1150 			mp = newmp;
1151 		}
1152 
1153 		/*
1154 		 * Strip the VLAN tag for VLAN streams.
1155 		 */
1156 		if (is_ethernet &&
1157 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1158 			newmp = i_dld_ether_header_strip_tag(mp);
1159 			if (newmp == NULL) {
1160 				freemsg(mp);
1161 				goto next;
1162 			}
1163 			mp = newmp;
1164 		}
1165 
1166 		/*
1167 		 * Pass the packet on.
1168 		 */
1169 		if (canputnext(dsp->ds_rq))
1170 			putnext(dsp->ds_rq, mp);
1171 		else
1172 			freemsg(mp);
1173 
1174 next:
1175 		/*
1176 		 * Move on to the next packet in the chain.
1177 		 */
1178 		mp = next;
1179 	} while (mp != NULL);
1180 }
1181 
1182 /*
1183  * Fast-path receive function.
1184  */
1185 /*ARGSUSED*/
1186 void
1187 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1188     mac_header_info_t *mhip)
1189 {
1190 	dld_str_t *dsp = (dld_str_t *)arg;
1191 	mblk_t *next;
1192 	size_t offset = 0;
1193 
1194 	/*
1195 	 * MAC header stripping rules:
1196 	 *    - Tagged packets:
1197 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1198 	 *	b. Physical streams
1199 	 *	- VLAN packets (non-zero VID). The stream must be either a
1200 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1201 	 *	  Strip the Ethernet header but keep the VLAN header.
1202 	 *	- Special tagged packets (zero VID)
1203 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1204 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1205 	 *	    keep the VLAN header.
1206 	 *	  * Otherwise, strip the whole VLAN header.
1207 	 *    - Untagged packets. Strip the whole MAC header.
1208 	 */
1209 	if (mhip->mhi_istagged &&
1210 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1211 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1212 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1213 		offset = VLAN_TAGSZ;
1214 	}
1215 
1216 	ASSERT(mp != NULL);
1217 	do {
1218 		/*
1219 		 * Get the pointer to the next packet in the chain and then
1220 		 * clear b_next before the packet gets passed on.
1221 		 */
1222 		next = mp->b_next;
1223 		mp->b_next = NULL;
1224 
1225 		/*
1226 		 * Wind back b_rptr to point at the VLAN header.
1227 		 */
1228 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1229 		mp->b_rptr -= offset;
1230 
1231 		/*
1232 		 * Pass the packet on.
1233 		 */
1234 		if (canputnext(dsp->ds_rq))
1235 			putnext(dsp->ds_rq, mp);
1236 		else
1237 			freemsg(mp);
1238 		/*
1239 		 * Move on to the next packet in the chain.
1240 		 */
1241 		mp = next;
1242 	} while (mp != NULL);
1243 }
1244 
1245 /*
1246  * Default receive function (send DL_UNITDATA_IND messages).
1247  */
1248 /*ARGSUSED*/
1249 void
1250 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1251     mac_header_info_t *mhip)
1252 {
1253 	dld_str_t		*dsp = (dld_str_t *)arg;
1254 	mblk_t			*ud_mp;
1255 	mblk_t			*next;
1256 	size_t			offset = 0;
1257 	boolean_t		strip_vlan = B_TRUE;
1258 
1259 	/*
1260 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1261 	 */
1262 	if (mhip->mhi_istagged &&
1263 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1264 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1265 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1266 		offset = VLAN_TAGSZ;
1267 		strip_vlan = B_FALSE;
1268 	}
1269 
1270 	ASSERT(mp != NULL);
1271 	do {
1272 		/*
1273 		 * Get the pointer to the next packet in the chain and then
1274 		 * clear b_next before the packet gets passed on.
1275 		 */
1276 		next = mp->b_next;
1277 		mp->b_next = NULL;
1278 
1279 		/*
1280 		 * Wind back b_rptr to point at the MAC header.
1281 		 */
1282 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1283 		mp->b_rptr -= mhip->mhi_hdrsize;
1284 
1285 		/*
1286 		 * Create the DL_UNITDATA_IND M_PROTO.
1287 		 */
1288 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1289 			freemsgchain(mp);
1290 			return;
1291 		}
1292 
1293 		/*
1294 		 * Advance b_rptr to point at the payload (or the VLAN header).
1295 		 */
1296 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1297 
1298 		/*
1299 		 * Prepend the DL_UNITDATA_IND.
1300 		 */
1301 		ud_mp->b_cont = mp;
1302 
1303 		/*
1304 		 * Send the message.
1305 		 */
1306 		if (canputnext(dsp->ds_rq))
1307 			putnext(dsp->ds_rq, ud_mp);
1308 		else
1309 			freemsg(ud_mp);
1310 
1311 		/*
1312 		 * Move on to the next packet in the chain.
1313 		 */
1314 		mp = next;
1315 	} while (mp != NULL);
1316 }
1317 
1318 /*
1319  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1320  */
1321 static void
1322 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
1323 {
1324 	mblk_t		*mp;
1325 	dl_notify_ind_t *dlip;
1326 
1327 	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
1328 		return;
1329 
1330 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1331 	    M_PROTO, 0)) == NULL)
1332 		return;
1333 
1334 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1335 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1336 	dlip->dl_primitive = DL_NOTIFY_IND;
1337 	dlip->dl_notification = DL_NOTE_SDU_SIZE;
1338 	dlip->dl_data = max_sdu;
1339 
1340 	qreply(dsp->ds_wq, mp);
1341 }
1342 
1343 /*
1344  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1345  * current state of the interface.
1346  */
1347 void
1348 dld_str_notify_ind(dld_str_t *dsp)
1349 {
1350 	mac_notify_type_t	type;
1351 
1352 	for (type = 0; type < MAC_NNOTE; type++)
1353 		str_notify(dsp, type);
1354 }
1355 
/*
 * Layout of the DL_UNITDATA_IND M_PROTO message built by str_unitdata_ind():
 * the DLPI primitive itself followed by storage for the destination and
 * source DLSAP addresses.  Each address buffer has room for a MAC address of
 * up to MAXMACADDRLEN bytes followed by a 16-bit SAP value.
 */
typedef struct dl_unitdata_ind_wrapper {
	/* The primitive; str_unitdata_ind() asserts it starts the message. */
	dl_unitdata_ind_t	dl_unitdata;
	/* Destination MAC address followed by the destination SAP. */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	/* Source MAC address followed by the source SAP. */
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1361 
1362 /*
1363  * Create a DL_UNITDATA_IND M_PROTO message.
1364  */
1365 static mblk_t *
1366 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1367 {
1368 	mblk_t				*nmp;
1369 	dl_unitdata_ind_wrapper_t	*dlwp;
1370 	dl_unitdata_ind_t		*dlp;
1371 	mac_header_info_t		mhi;
1372 	uint_t				addr_length;
1373 	uint8_t				*daddr;
1374 	uint8_t				*saddr;
1375 
1376 	/*
1377 	 * Get the packet header information.
1378 	 */
1379 	if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0)
1380 		return (NULL);
1381 
1382 	/*
1383 	 * Allocate a message large enough to contain the wrapper structure
1384 	 * defined above.
1385 	 */
1386 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1387 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1388 	    DL_UNITDATA_IND)) == NULL)
1389 		return (NULL);
1390 
1391 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1392 
1393 	dlp = &(dlwp->dl_unitdata);
1394 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1395 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1396 
1397 	/*
1398 	 * Copy in the destination address.
1399 	 */
1400 	addr_length = dsp->ds_mip->mi_addr_length;
1401 	daddr = dlwp->dl_dest_addr;
1402 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1403 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1404 
1405 	/*
1406 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1407 	 */
1408 	if (mhi.mhi_istagged && !strip_vlan)
1409 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1410 	else
1411 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1412 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1413 
1414 	/*
1415 	 * If the destination address was multicast or broadcast then the
1416 	 * dl_group_address field should be non-zero.
1417 	 */
1418 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1419 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1420 
1421 	/*
1422 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1423 	 * for example) may not have access to source information.
1424 	 */
1425 	if (mhi.mhi_saddr == NULL) {
1426 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1427 	} else {
1428 		saddr = dlwp->dl_src_addr;
1429 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1430 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1431 
1432 		/*
1433 		 * Set the source DLSAP to the packet ethertype.
1434 		 */
1435 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1436 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1437 	}
1438 
1439 	return (nmp);
1440 }
1441 
1442 /*
1443  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1444  */
1445 static void
1446 str_notify_promisc_on_phys(dld_str_t *dsp)
1447 {
1448 	mblk_t		*mp;
1449 	dl_notify_ind_t	*dlip;
1450 
1451 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1452 		return;
1453 
1454 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1455 	    M_PROTO, 0)) == NULL)
1456 		return;
1457 
1458 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1459 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1460 	dlip->dl_primitive = DL_NOTIFY_IND;
1461 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1462 
1463 	qreply(dsp->ds_wq, mp);
1464 }
1465 
1466 /*
1467  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1468  */
1469 static void
1470 str_notify_promisc_off_phys(dld_str_t *dsp)
1471 {
1472 	mblk_t		*mp;
1473 	dl_notify_ind_t	*dlip;
1474 
1475 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1476 		return;
1477 
1478 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1479 	    M_PROTO, 0)) == NULL)
1480 		return;
1481 
1482 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1483 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1484 	dlip->dl_primitive = DL_NOTIFY_IND;
1485 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1486 
1487 	qreply(dsp->ds_wq, mp);
1488 }
1489 
1490 /*
1491  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1492  */
1493 static void
1494 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1495 {
1496 	mblk_t		*mp;
1497 	dl_notify_ind_t	*dlip;
1498 	uint_t		addr_length;
1499 	uint16_t	ethertype;
1500 
1501 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1502 		return;
1503 
1504 	addr_length = dsp->ds_mip->mi_addr_length;
1505 	if ((mp = mexchange(dsp->ds_wq, NULL,
1506 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1507 	    M_PROTO, 0)) == NULL)
1508 		return;
1509 
1510 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1511 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1512 	dlip->dl_primitive = DL_NOTIFY_IND;
1513 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1514 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1515 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1516 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1517 
1518 	bcopy(addr, &dlip[1], addr_length);
1519 
1520 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1521 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1522 
1523 	qreply(dsp->ds_wq, mp);
1524 }
1525 
1526 /*
1527  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1528  */
1529 static void
1530 str_notify_link_up(dld_str_t *dsp)
1531 {
1532 	mblk_t		*mp;
1533 	dl_notify_ind_t	*dlip;
1534 
1535 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1536 		return;
1537 
1538 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1539 	    M_PROTO, 0)) == NULL)
1540 		return;
1541 
1542 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1543 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1544 	dlip->dl_primitive = DL_NOTIFY_IND;
1545 	dlip->dl_notification = DL_NOTE_LINK_UP;
1546 
1547 	qreply(dsp->ds_wq, mp);
1548 }
1549 
1550 /*
1551  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1552  */
1553 static void
1554 str_notify_link_down(dld_str_t *dsp)
1555 {
1556 	mblk_t		*mp;
1557 	dl_notify_ind_t	*dlip;
1558 
1559 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1560 		return;
1561 
1562 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1563 	    M_PROTO, 0)) == NULL)
1564 		return;
1565 
1566 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1567 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1568 	dlip->dl_primitive = DL_NOTIFY_IND;
1569 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1570 
1571 	qreply(dsp->ds_wq, mp);
1572 }
1573 
1574 /*
1575  * DL_NOTIFY_IND: DL_NOTE_SPEED
1576  */
1577 static void
1578 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1579 {
1580 	mblk_t		*mp;
1581 	dl_notify_ind_t	*dlip;
1582 
1583 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1584 		return;
1585 
1586 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1587 	    M_PROTO, 0)) == NULL)
1588 		return;
1589 
1590 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1591 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1592 	dlip->dl_primitive = DL_NOTIFY_IND;
1593 	dlip->dl_notification = DL_NOTE_SPEED;
1594 	dlip->dl_data = speed;
1595 
1596 	qreply(dsp->ds_wq, mp);
1597 }
1598 
1599 /*
1600  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1601  */
1602 static void
1603 str_notify_capab_reneg(dld_str_t *dsp)
1604 {
1605 	mblk_t		*mp;
1606 	dl_notify_ind_t	*dlip;
1607 
1608 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1609 		return;
1610 
1611 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1612 	    M_PROTO, 0)) == NULL)
1613 		return;
1614 
1615 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1616 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1617 	dlip->dl_primitive = DL_NOTIFY_IND;
1618 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1619 
1620 	qreply(dsp->ds_wq, mp);
1621 }
1622 
1623 /*
1624  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1625  */
1626 static void
1627 str_notify_fastpath_flush(dld_str_t *dsp)
1628 {
1629 	mblk_t		*mp;
1630 	dl_notify_ind_t	*dlip;
1631 
1632 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1633 		return;
1634 
1635 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1636 	    M_PROTO, 0)) == NULL)
1637 		return;
1638 
1639 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1640 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1641 	dlip->dl_primitive = DL_NOTIFY_IND;
1642 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1643 
1644 	qreply(dsp->ds_wq, mp);
1645 }
1646 
1647 /*
1648  * MAC notification callback.
1649  */
1650 void
1651 str_notify(void *arg, mac_notify_type_t type)
1652 {
1653 	dld_str_t		*dsp = (dld_str_t *)arg;
1654 	queue_t			*q = dsp->ds_wq;
1655 	mac_handle_t		mh = dsp->ds_mh;
1656 	mac_client_handle_t	mch = dsp->ds_mch;
1657 	uint8_t			addr[MAXMACADDRLEN];
1658 
1659 	switch (type) {
1660 	case MAC_NOTE_TX:
1661 		qenable(q);
1662 		break;
1663 
1664 	case MAC_NOTE_DEVPROMISC:
1665 		/*
1666 		 * Send the appropriate DL_NOTIFY_IND.
1667 		 */
1668 		if (mac_promisc_get(mh, MAC_DEVPROMISC))
1669 			str_notify_promisc_on_phys(dsp);
1670 		else
1671 			str_notify_promisc_off_phys(dsp);
1672 		break;
1673 
1674 	case MAC_NOTE_UNICST:
1675 		/*
1676 		 * This notification is sent whenever the MAC unicast
1677 		 * address changes.
1678 		 */
1679 		mac_unicast_primary_get(mh, addr);
1680 
1681 		/*
1682 		 * Send the appropriate DL_NOTIFY_IND.
1683 		 */
1684 		str_notify_phys_addr(dsp, addr);
1685 		break;
1686 
1687 	case MAC_NOTE_LINK:
1688 		/*
1689 		 * This notification is sent every time the MAC driver
1690 		 * updates the link state.
1691 		 */
1692 		switch (mac_client_stat_get(mch, MAC_STAT_LINK_STATE)) {
1693 		case LINK_STATE_UP: {
1694 			uint64_t speed;
1695 			/*
1696 			 * The link is up so send the appropriate
1697 			 * DL_NOTIFY_IND.
1698 			 */
1699 			str_notify_link_up(dsp);
1700 
1701 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1702 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1703 			break;
1704 		}
1705 		case LINK_STATE_DOWN:
1706 			/*
1707 			 * The link is down so send the appropriate
1708 			 * DL_NOTIFY_IND.
1709 			 */
1710 			str_notify_link_down(dsp);
1711 			break;
1712 
1713 		default:
1714 			break;
1715 		}
1716 		break;
1717 
1718 	case MAC_NOTE_RESOURCE:
1719 	case MAC_NOTE_CAPAB_CHG:
1720 		/*
1721 		 * This notification is sent whenever the MAC resources
1722 		 * change or capabilities change. We need to renegotiate
1723 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1724 		 */
1725 		str_notify_capab_reneg(dsp);
1726 		break;
1727 
1728 	case MAC_NOTE_SDU_SIZE: {
1729 		uint_t  max_sdu;
1730 		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
1731 		str_notify_sdu_size(dsp, max_sdu);
1732 		break;
1733 	}
1734 
1735 	case MAC_NOTE_FASTPATH_FLUSH:
1736 		str_notify_fastpath_flush(dsp);
1737 		break;
1738 
1739 	case MAC_NOTE_MARGIN:
1740 		break;
1741 
1742 	case MAC_NOTE_PROMISC:
1743 		break;
1744 
1745 	default:
1746 		ASSERT(B_FALSE);
1747 		break;
1748 	}
1749 }
1750 
1751 /*
1752  * This function is called via a taskq mechansim to process all control
1753  * messages on a per 'dsp' end point.
1754  */
1755 static void
1756 dld_wput_nondata_task(void *arg)
1757 {
1758 	dld_str_t	*dsp = arg;
1759 	mblk_t		*mp;
1760 
1761 	mutex_enter(&dsp->ds_lock);
1762 	while (dsp->ds_pending_head != NULL) {
1763 		mp = dsp->ds_pending_head;
1764 		dsp->ds_pending_head = mp->b_next;
1765 		mp->b_next = NULL;
1766 		if (dsp->ds_pending_head == NULL)
1767 			dsp->ds_pending_tail = NULL;
1768 		mutex_exit(&dsp->ds_lock);
1769 
1770 		switch (DB_TYPE(mp)) {
1771 		case M_PROTO:
1772 		case M_PCPROTO:
1773 			dld_proto(dsp, mp);
1774 			break;
1775 		case M_IOCTL:
1776 			dld_ioc(dsp, mp);
1777 			break;
1778 		default:
1779 			ASSERT(0);
1780 		}
1781 
1782 		mutex_enter(&dsp->ds_lock);
1783 	}
1784 	ASSERT(dsp->ds_pending_tail == NULL);
1785 	dsp->ds_dlpi_pending = 0;
1786 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1787 	mutex_exit(&dsp->ds_lock);
1788 }
1789 
1790 /*
1791  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1792  * thread is started at boot time.
1793  */
1794 static void
1795 dld_taskq_dispatch(void)
1796 {
1797 	callb_cpr_t	cprinfo;
1798 	dld_str_t	*dsp;
1799 
1800 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1801 	    "dld_taskq_dispatch");
1802 	mutex_enter(&dld_taskq_lock);
1803 
1804 	while (!dld_taskq_quit) {
1805 		dsp = list_head(&dld_taskq_list);
1806 		while (dsp != NULL) {
1807 			list_remove(&dld_taskq_list, dsp);
1808 			mutex_exit(&dld_taskq_lock);
1809 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1810 			    dsp, TQ_SLEEP) != 0);
1811 			mutex_enter(&dld_taskq_lock);
1812 			dsp = list_head(&dld_taskq_list);
1813 		}
1814 
1815 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1816 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1817 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1818 	}
1819 
1820 	dld_taskq_done = B_TRUE;
1821 	cv_signal(&dld_taskq_cv);
1822 	CALLB_CPR_EXIT(&cprinfo);
1823 	thread_exit();
1824 }
1825 
1826 /*
1827  * All control operations are serialized on the 'dsp' and are also funneled
1828  * through a taskq mechanism to ensure that subsequent processing has kernel
1829  * context and can safely use cv_wait.
1830  *
1831  * Mechanisms to handle taskq dispatch failures
1832  *
1833  * The only way to be sure that taskq dispatch does not fail is to either
1834  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1835  * some number of entries and make sure that the number of outstanding requests
1836  * are less than that number. We can't use TQ_SLEEP since we don't know the
1837  * context. Nor can we bound the total number of 'dsp' end points. So we are
1838  * unable to use either of the above schemes, and are forced to deal with
1839  * taskq dispatch failures. Note that even dynamic taskq could fail in
1840  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1841  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1842  * framework.
1843  *
1844  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1845  * We also have a single global thread to retry the taskq dispatch. This
1846  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1847  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1848  */
1849 static void
1850 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1851 {
1852 	ASSERT(mp->b_next == NULL);
1853 	mutex_enter(&dsp->ds_lock);
1854 	if (dsp->ds_pending_head != NULL) {
1855 		ASSERT(dsp->ds_dlpi_pending);
1856 		dsp->ds_pending_tail->b_next = mp;
1857 		dsp->ds_pending_tail = mp;
1858 		mutex_exit(&dsp->ds_lock);
1859 		return;
1860 	}
1861 	ASSERT(dsp->ds_pending_tail == NULL);
1862 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
1863 	/*
1864 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
1865 	 * thread is still active and is processing the last message, though
1866 	 * the pending queue has been emptied.
1867 	 */
1868 	if (dsp->ds_dlpi_pending) {
1869 		mutex_exit(&dsp->ds_lock);
1870 		return;
1871 	}
1872 
1873 	dsp->ds_dlpi_pending = 1;
1874 	mutex_exit(&dsp->ds_lock);
1875 
1876 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
1877 	    TQ_NOSLEEP) != 0)
1878 		return;
1879 
1880 	mutex_enter(&dld_taskq_lock);
1881 	list_insert_tail(&dld_taskq_list, dsp);
1882 	cv_signal(&dld_taskq_cv);
1883 	mutex_exit(&dld_taskq_lock);
1884 }
1885 
1886 /*
1887  * Process an M_IOCTL message.
1888  */
1889 static void
1890 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1891 {
1892 	uint_t			cmd;
1893 
1894 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1895 	ASSERT(dsp->ds_type == DLD_DLPI);
1896 
1897 	switch (cmd) {
1898 	case DLIOCNATIVE:
1899 		ioc_native(dsp, mp);
1900 		break;
1901 	case DLIOCMARGININFO:
1902 		ioc_margin(dsp, mp);
1903 		break;
1904 	case DLIOCRAW:
1905 		ioc_raw(dsp, mp);
1906 		break;
1907 	case DLIOCHDRINFO:
1908 		ioc_fast(dsp, mp);
1909 		break;
1910 	default:
1911 		ioc(dsp, mp);
1912 	}
1913 }
1914 
1915 /*
1916  * DLIOCNATIVE
1917  */
1918 static void
1919 ioc_native(dld_str_t *dsp, mblk_t *mp)
1920 {
1921 	queue_t *q = dsp->ds_wq;
1922 	const mac_info_t *mip = dsp->ds_mip;
1923 
1924 	/*
1925 	 * Native mode can be enabled if it's disabled and if the
1926 	 * native media type is different.
1927 	 */
1928 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
1929 		dsp->ds_native = B_TRUE;
1930 
1931 	if (dsp->ds_native)
1932 		miocack(q, mp, 0, mip->mi_nativemedia);
1933 	else
1934 		miocnak(q, mp, 0, ENOTSUP);
1935 }
1936 
1937 /*
1938  * DLIOCMARGININFO
1939  */
1940 static void
1941 ioc_margin(dld_str_t *dsp, mblk_t *mp)
1942 {
1943 	queue_t *q = dsp->ds_wq;
1944 	uint32_t margin;
1945 	int err;
1946 
1947 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1948 		err = EINVAL;
1949 		goto failed;
1950 	}
1951 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
1952 		goto failed;
1953 
1954 	mac_margin_get(dsp->ds_mh, &margin);
1955 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
1956 	miocack(q, mp, sizeof (uint32_t), 0);
1957 	return;
1958 
1959 failed:
1960 	miocnak(q, mp, 0, err);
1961 }
1962 
1963 /*
1964  * DLIOCRAW
1965  */
1966 static void
1967 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1968 {
1969 	queue_t *q = dsp->ds_wq;
1970 	mac_perim_handle_t	mph;
1971 
1972 	if (dsp->ds_mh == NULL) {
1973 		dsp->ds_mode = DLD_RAW;
1974 		miocack(q, mp, 0, 0);
1975 		return;
1976 	}
1977 
1978 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1979 	if (dsp->ds_polling || dsp->ds_direct) {
1980 		mac_perim_exit(mph);
1981 		miocnak(q, mp, 0, EPROTO);
1982 		return;
1983 	}
1984 
1985 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1986 		/*
1987 		 * Set the receive callback.
1988 		 */
1989 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
1990 	}
1991 
1992 	/*
1993 	 * Note that raw mode is enabled.
1994 	 */
1995 	dsp->ds_mode = DLD_RAW;
1996 	mac_perim_exit(mph);
1997 
1998 	miocack(q, mp, 0, 0);
1999 }
2000 
2001 /*
2002  * DLIOCHDRINFO
2003  */
2004 static void
2005 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2006 {
2007 	dl_unitdata_req_t *dlp;
2008 	off_t		off;
2009 	size_t		len;
2010 	const uint8_t	*addr;
2011 	uint16_t	sap;
2012 	mblk_t		*nmp;
2013 	mblk_t		*hmp;
2014 	uint_t		addr_length;
2015 	queue_t		*q = dsp->ds_wq;
2016 	int		err;
2017 	mac_perim_handle_t	mph;
2018 
2019 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2020 		err = ENOTSUP;
2021 		goto failed;
2022 	}
2023 
2024 	/*
2025 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2026 	 * user-land should not be allowed.
2027 	 */
2028 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2029 		err = EINVAL;
2030 		goto failed;
2031 	}
2032 
2033 	nmp = mp->b_cont;
2034 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2035 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2036 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2037 		err = EINVAL;
2038 		goto failed;
2039 	}
2040 
2041 	off = dlp->dl_dest_addr_offset;
2042 	len = dlp->dl_dest_addr_length;
2043 
2044 	if (!MBLKIN(nmp, off, len)) {
2045 		err = EINVAL;
2046 		goto failed;
2047 	}
2048 
2049 	if (dsp->ds_dlstate != DL_IDLE) {
2050 		err = ENOTSUP;
2051 		goto failed;
2052 	}
2053 
2054 	addr_length = dsp->ds_mip->mi_addr_length;
2055 	if (len != addr_length + sizeof (uint16_t)) {
2056 		err = EINVAL;
2057 		goto failed;
2058 	}
2059 
2060 	addr = nmp->b_rptr + off;
2061 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2062 
2063 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2064 		err = ENOMEM;
2065 		goto failed;
2066 	}
2067 
2068 	/*
2069 	 * This ioctl might happen concurrently with a direct call to dld_capab
2070 	 * that tries to enable direct and/or poll capabilities. Since the
2071 	 * stack does not serialize them, we do so here to avoid mixing
2072 	 * the callbacks.
2073 	 */
2074 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2075 	if (dsp->ds_mode != DLD_FASTPATH) {
2076 		/*
2077 		 * Set the receive callback (unless polling is enabled).
2078 		 */
2079 		if (!dsp->ds_polling && !dsp->ds_direct)
2080 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2081 
2082 		/*
2083 		 * Note that fast-path mode is enabled.
2084 		 */
2085 		dsp->ds_mode = DLD_FASTPATH;
2086 	}
2087 	mac_perim_exit(mph);
2088 
2089 	freemsg(nmp->b_cont);
2090 	nmp->b_cont = hmp;
2091 
2092 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2093 	return;
2094 failed:
2095 	miocnak(q, mp, 0, err);
2096 }
2097 
2098 /*
2099  * Catch-all handler.
2100  */
2101 static void
2102 ioc(dld_str_t *dsp, mblk_t *mp)
2103 {
2104 	queue_t	*q = dsp->ds_wq;
2105 
2106 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2107 		miocnak(q, mp, 0, EINVAL);
2108 		return;
2109 	}
2110 	mac_ioctl(dsp->ds_mh, q, mp);
2111 }
2112