xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision ab017dba278352f85f904f92ba32ab12cee76cb2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2023 Oxide Computer Company
24  */
25 
26 /*
27  * Data-Link Driver
28  */
29 
30 #include	<inet/common.h>
31 #include	<sys/strsubr.h>
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/vlan.h>
35 #include	<sys/dld_impl.h>
36 #include	<sys/cpuvar.h>
37 #include	<sys/callb.h>
38 #include	<sys/list.h>
39 #include	<sys/mac_client.h>
40 #include	<sys/mac_client_priv.h>
41 #include	<sys/mac_flow.h>
42 
/*
 * Forward declarations of file-local helpers.
 */

/* kmem cache constructor/destructor for dld_str_t objects. */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);

static void	ioc_native(dld_str_t *,  mblk_t *);
static void	ioc_margin(dld_str_t *, mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc_lowlink(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
/* Handles the M_IOCTL and non-DL_UNITDATA_REQ messages seen by dld_wput(). */
static void	dld_wput_nondata(dld_str_t *, mblk_t *);

static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
    link_tagmode_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);

/*
 * str_count:  number of dld_str_t objects currently outstanding; incremented
 *             in dld_str_create(), decremented in dld_str_destroy() and
 *             checked by dld_str_fini() before tearing anything down.
 * str_cachep: kmem cache the dld_str_t objects are allocated from; created in
 *             dld_str_init().
 * str_hashp:  id-hash of live dld_str_t's keyed by ds_minor; walked by
 *             dld_finddevinfo() and dld_devt_to_instance().
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static mod_hash_t	*str_hashp;

/* Size passed to mod_hash_create_idhash() when str_hashp is created. */
#define	STR_HASHSZ		64
/* Convert an integral key (a minor number) into a mod_hash key. */
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

/* This file's task queue is simply an alias for the system task queue. */
#define	dld_taskq	system_taskq

/*
 * State shared with the dld_taskq_dispatch() thread that dld_str_init()
 * creates.  dld_str_fini() sets dld_taskq_quit under dld_taskq_lock, signals
 * dld_taskq_cv and then waits on the same condition variable until
 * dld_taskq_done becomes true.
 */
static kmutex_t		dld_taskq_lock;
static kcondvar_t	dld_taskq_cv;
static list_t		dld_taskq_list;		/* List of dld_str_t */
boolean_t		dld_taskq_quit;
boolean_t		dld_taskq_done;

/* Thread entry point handed to thread_create() in dld_str_init(). */
static void		dld_taskq_dispatch(void);
84 
85 /*
86  * Some notes on entry points, flow-control, queueing.
87  *
88  * This driver exports the traditional STREAMS put entry point as well as
89  * the non-STREAMS fast-path transmit routine which is provided to IP via
90  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
91  * and data operations, while the fast-path routine deals only with M_DATA
92  * fast-path packets.  Regardless of the entry point, all outbound packets
93  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
94  *
95  * The transmit logic operates in the following way: All packets coming
96  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
97  * happens when the MAC layer indicates the packets couldn't be
98  * transmitted due to 1) lack of resources (e.g. running out of
99  * descriptors),  or 2) reaching the allowed bandwidth limit for this
100  * particular flow. The indication comes in the form of a Tx cookie that
101  * identifies the blocked ring. In such case, DLD will place a
102  * dummy message on its write-side STREAMS queue so that the queue is
103  * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer where they are either queued in the Tx
 * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
106  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
107  * When the write service procedure runs, it will remove the dummy
108  * message from the write-side STREAMS queue; in effect this will trigger
109  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
110  * respectively, due to the above reasons.
111  *
112  * All non-data operations, both DLPI and ioctls are single threaded on a per
113  * dld_str_t endpoint. This is done using a taskq so that the control operation
114  * has kernel context and can cv_wait for resources. In addition all set type
115  * operations that involve mac level state modification are serialized on a
116  * per mac end point using the perimeter mechanism provided by the mac layer.
117  * This serializes all mac clients trying to modify a single mac end point over
118  * the entire sequence of mac calls made by that client as an atomic unit. The
119  * mac framework locking is described in mac.c. A critical element is that
120  * DLD/DLS does not hold any locks across the mac perimeter.
121  *
122  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
123  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
124  * match dev_t. If a stream is found and it is attached, its dev_info_t *
125  * is returned. If the mac handle is non-null, it can be safely accessed
126  * below. The mac handle won't be freed until the mac_unregister which
127  * won't happen until the driver detaches. The DDI framework ensures that
128  * the detach won't happen while a getinfo is in progress.
129  */
/*
 * Search state passed to i_dld_str_walker() through mod_hash_walk().  The
 * caller fills in the (major, minor) pair it is looking for and presets the
 * result fields; the walker overwrites the result fields only when it finds
 * the matching stream and that stream has a MAC handle.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* in: major number to match */
	minor_t		ds_minor;	/* in: minor number to match */
	int		ds_instance;	/* out: instance; callers preset -1 */
	dev_info_t	*ds_dip;	/* out: devinfo; callers preset NULL */
} i_dld_str_state_t;
136 
137 /* ARGSUSED */
138 static uint_t
139 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
140 {
141 	i_dld_str_state_t	*statep = arg;
142 	dld_str_t		*dsp = (dld_str_t *)val;
143 	mac_handle_t		mh;
144 
145 	if (statep->ds_major != dsp->ds_major)
146 		return (MH_WALK_CONTINUE);
147 
148 	ASSERT(statep->ds_minor != 0);
149 	mh = dsp->ds_mh;
150 
151 	if (statep->ds_minor == dsp->ds_minor) {
152 		/*
153 		 * Clone: a clone minor is unique. we can terminate the
154 		 * walk if we find a matching stream -- even if we fail
155 		 * to obtain the devinfo.
156 		 */
157 		if (mh != NULL) {
158 			statep->ds_dip = mac_devinfo_get(mh);
159 			statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
160 		}
161 		return (MH_WALK_TERMINATE);
162 	}
163 	return (MH_WALK_CONTINUE);
164 }
165 
166 static dev_info_t *
167 dld_finddevinfo(dev_t dev)
168 {
169 	dev_info_t		*dip;
170 	i_dld_str_state_t	state;
171 
172 	if (getminor(dev) == 0)
173 		return (NULL);
174 
175 	/*
176 	 * See if it's a minor node of a link
177 	 */
178 	if ((dip = dls_link_devinfo(dev)) != NULL)
179 		return (dip);
180 
181 	state.ds_minor = getminor(dev);
182 	state.ds_major = getmajor(dev);
183 	state.ds_dip = NULL;
184 	state.ds_instance = -1;
185 
186 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
187 	return (state.ds_dip);
188 }
189 
190 int
191 dld_devt_to_instance(dev_t dev)
192 {
193 	minor_t			minor;
194 	i_dld_str_state_t	state;
195 
196 	/*
197 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
198 	 * Minor number 0 is reserved for the DLPI style 2 unattached
199 	 * node.
200 	 */
201 
202 	if ((minor = getminor(dev)) == 0)
203 		return (-1);
204 
205 	/*
206 	 * Check for unopened style 1 node.
207 	 * Note that this doesn't *necessarily* work for legacy
208 	 * devices, but this code is only called within the
209 	 * getinfo(9e) implementation for true GLDv3 devices, so it
210 	 * doesn't matter.
211 	 */
212 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
213 		return (DLS_MINOR2INST(minor));
214 	}
215 
216 	state.ds_minor = getminor(dev);
217 	state.ds_major = getmajor(dev);
218 	state.ds_dip = NULL;
219 	state.ds_instance = -1;
220 
221 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
222 	return (state.ds_instance);
223 }
224 
225 /*
226  * devo_getinfo: getinfo(9e)
227  *
228  * NB: This may be called for a provider before the provider's
229  * instances are attached.  Hence, if a particular provider needs a
230  * special mapping (the mac instance != ddi_get_instance()), then it
231  * may need to provide its own implementation using the
232  * mac_devt_to_instance() function, and translating the returned mac
233  * instance to a devinfo instance.  For dev_t's where the minor number
234  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
235  * function indirectly via the mac_getinfo() function.
236  */
237 /*ARGSUSED*/
238 int
239 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
240 {
241 	dev_info_t	*devinfo;
242 	minor_t		minor = getminor((dev_t)arg);
243 	int		rc = DDI_FAILURE;
244 
245 	switch (cmd) {
246 	case DDI_INFO_DEVT2DEVINFO:
247 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
248 			*(dev_info_t **)resp = devinfo;
249 			rc = DDI_SUCCESS;
250 		}
251 		break;
252 	case DDI_INFO_DEVT2INSTANCE:
253 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
254 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
255 			rc = DDI_SUCCESS;
256 		} else if (minor > DLS_MAX_MINOR &&
257 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
258 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
259 			rc = DDI_SUCCESS;
260 		}
261 		break;
262 	}
263 	return (rc);
264 }
265 
266 void *
267 dld_str_private(queue_t *q)
268 {
269 	return (((dld_str_t *)(q->q_ptr))->ds_private);
270 }
271 
272 int
273 dld_str_open(queue_t *rq, dev_t *devp, void *private)
274 {
275 	dld_str_t	*dsp;
276 	major_t		major;
277 	minor_t		minor;
278 	int		err;
279 
280 	major = getmajor(*devp);
281 	minor = getminor(*devp);
282 
283 	/*
284 	 * Half the 32-bit minor space is reserved for private use by the driver
285 	 * so we bail out here with `ENOSTR` to indicate specfs should retry the
286 	 * open with the driver's character based `open(9E)`. For a typical
287 	 * STREAMS driver, that would just be `nodev` which would simply return
288 	 * `ENODEV`. But a dual-personality device can choose to implement the
289 	 * character based `open(9E)` for some minor nodes. A driver wanting a
290 	 * separate STREAMS interface altogether would presumably have already
291 	 * provided its own `streamtab`.
292 	 */
293 	if (minor >= mac_private_minor())
294 		return (ENOSTR);
295 
296 	/*
297 	 * Create a new dld_str_t for the stream. This will grab a new minor
298 	 * number that will be handed back in the cloned dev_t.  Creation may
299 	 * fail if we can't allocate the dummy mblk used for flow-control.
300 	 */
301 	dsp = dld_str_create(rq, DLD_DLPI, major,
302 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
303 	if (dsp == NULL)
304 		return (ENOSR);
305 
306 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
307 	dsp->ds_private = private;
308 	if (minor != 0) {
309 		/*
310 		 * Style 1 open
311 		 */
312 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
313 			goto failed;
314 
315 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
316 	} else {
317 		(void) qassociate(rq, -1);
318 	}
319 
320 	/*
321 	 * Enable the queue srv(9e) routine.
322 	 */
323 	qprocson(rq);
324 
325 	/*
326 	 * Construct a cloned dev_t to hand back.
327 	 */
328 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
329 	return (0);
330 
331 failed:
332 	dld_str_destroy(dsp);
333 	return (err);
334 }
335 
336 int
337 dld_str_close(queue_t *rq)
338 {
339 	dld_str_t	*dsp = rq->q_ptr;
340 
341 	/*
342 	 * All modules on top have been popped off. So there can't be any
343 	 * threads from the top.
344 	 */
345 	ASSERT(dsp->ds_datathr_cnt == 0);
346 
347 	/*
348 	 * Wait until pending DLPI requests are processed.
349 	 */
350 	mutex_enter(&dsp->ds_lock);
351 	while (dsp->ds_dlpi_pending)
352 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
353 	mutex_exit(&dsp->ds_lock);
354 
355 
356 	/*
357 	 * This stream was open to a provider node. Check to see
358 	 * if it has been cleanly shut down.
359 	 */
360 	if (dsp->ds_dlstate != DL_UNATTACHED) {
361 		/*
362 		 * The stream is either open to a style 1 provider or
363 		 * this is not clean shutdown. Detach from the PPA.
364 		 * (This is still ok even in the style 1 case).
365 		 */
366 		dld_str_detach(dsp);
367 	}
368 
369 	dld_str_destroy(dsp);
370 	return (0);
371 }
372 
373 /*
374  * qi_qopen: open(9e)
375  */
376 /*ARGSUSED*/
377 int
378 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
379 {
380 	if (sflag == MODOPEN)
381 		return (ENOTSUP);
382 
383 	/*
384 	 * This is a cloning driver and therefore each queue should only
385 	 * ever get opened once.
386 	 */
387 	if (rq->q_ptr != NULL)
388 		return (EBUSY);
389 
390 	return (dld_str_open(rq, devp, NULL));
391 }
392 
393 /*
394  * qi_qclose: close(9e)
395  */
396 /* ARGSUSED */
397 int
398 dld_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
399 {
400 	/*
401 	 * Disable the queue srv(9e) routine.
402 	 */
403 	qprocsoff(rq);
404 
405 	return (dld_str_close(rq));
406 }
407 
408 /*
409  * qi_qputp: put(9e)
410  */
411 int
412 dld_wput(queue_t *wq, mblk_t *mp)
413 {
414 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
415 	dld_str_mode_t	mode;
416 
417 	switch (DB_TYPE(mp)) {
418 	case M_DATA:
419 		mutex_enter(&dsp->ds_lock);
420 		mode = dsp->ds_mode;
421 		if ((dsp->ds_dlstate != DL_IDLE) ||
422 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
423 			mutex_exit(&dsp->ds_lock);
424 			freemsg(mp);
425 			break;
426 		}
427 
428 		DLD_DATATHR_INC(dsp);
429 		mutex_exit(&dsp->ds_lock);
430 		if (mode == DLD_FASTPATH) {
431 			if (dsp->ds_mip->mi_media == DL_ETHER &&
432 			    (MBLKL(mp) < sizeof (struct ether_header))) {
433 				freemsg(mp);
434 			} else {
435 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
436 			}
437 		} else {
438 			str_mdata_raw_put(dsp, mp);
439 		}
440 		DLD_DATATHR_DCR(dsp);
441 		break;
442 	case M_PROTO:
443 	case M_PCPROTO: {
444 		t_uscalar_t	prim;
445 
446 		if (MBLKL(mp) < sizeof (t_uscalar_t))
447 			break;
448 
449 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
450 
451 		if (prim == DL_UNITDATA_REQ) {
452 			proto_unitdata_req(dsp, mp);
453 		} else {
454 			dld_wput_nondata(dsp, mp);
455 		}
456 		break;
457 	}
458 
459 	case M_IOCTL:
460 		dld_wput_nondata(dsp, mp);
461 		break;
462 
463 	case M_FLUSH:
464 		if (*mp->b_rptr & FLUSHW) {
465 			DLD_CLRQFULL(dsp);
466 			*mp->b_rptr &= ~FLUSHW;
467 		}
468 
469 		if (*mp->b_rptr & FLUSHR) {
470 			qreply(wq, mp);
471 		} else {
472 			freemsg(mp);
473 		}
474 		break;
475 
476 	default:
477 		freemsg(mp);
478 		break;
479 	}
480 	return (0);
481 }
482 
483 /*
484  * qi_srvp: srv(9e)
485  */
486 int
487 dld_wsrv(queue_t *wq)
488 {
489 	dld_str_t	*dsp = wq->q_ptr;
490 
491 	DLD_CLRQFULL(dsp);
492 	return (0);
493 }
494 
495 void
496 dld_init_ops(struct dev_ops *ops, const char *name)
497 {
498 	struct streamtab *stream;
499 	struct qinit *rq, *wq;
500 	struct module_info *modinfo;
501 
502 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
503 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
504 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
505 	modinfo->mi_minpsz = 0;
506 	modinfo->mi_maxpsz = 64*1024;
507 	modinfo->mi_hiwat  = 1;
508 	modinfo->mi_lowat = 0;
509 
510 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
511 	rq->qi_qopen = dld_open;
512 	rq->qi_qclose = dld_close;
513 	rq->qi_minfo = modinfo;
514 
515 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
516 	wq->qi_putp = (pfi_t)dld_wput;
517 	wq->qi_srvp = (pfi_t)dld_wsrv;
518 	wq->qi_minfo = modinfo;
519 
520 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
521 	stream->st_rdinit = rq;
522 	stream->st_wrinit = wq;
523 	ops->devo_cb_ops->cb_str = stream;
524 
525 	if (ops->devo_getinfo == NULL)
526 		ops->devo_getinfo = &dld_getinfo;
527 }
528 
529 void
530 dld_fini_ops(struct dev_ops *ops)
531 {
532 	struct streamtab *stream;
533 	struct qinit *rq, *wq;
534 	struct module_info *modinfo;
535 
536 	stream = ops->devo_cb_ops->cb_str;
537 	rq = stream->st_rdinit;
538 	wq = stream->st_wrinit;
539 	modinfo = rq->qi_minfo;
540 	ASSERT(wq->qi_minfo == modinfo);
541 
542 	kmem_free(stream, sizeof (struct streamtab));
543 	kmem_free(wq, sizeof (struct qinit));
544 	kmem_free(rq, sizeof (struct qinit));
545 	kmem_free(modinfo->mi_idname, FMNAMESZ);
546 	kmem_free(modinfo, sizeof (struct module_info));
547 }
548 
549 /*
550  * Initialize this module's data structures.
551  */
552 void
553 dld_str_init(void)
554 {
555 	/*
556 	 * Create dld_str_t object cache.
557 	 */
558 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
559 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
560 	ASSERT(str_cachep != NULL);
561 
562 	/*
563 	 * Create a hash table for maintaining dld_str_t's.
564 	 * The ds_minor field (the clone minor number) of a dld_str_t
565 	 * is used as a key for this hash table because this number is
566 	 * globally unique (allocated from "dls_minor_arena").
567 	 */
568 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
569 	    mod_hash_null_valdtor);
570 
571 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
572 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
573 
574 	dld_taskq_quit = B_FALSE;
575 	dld_taskq_done = B_FALSE;
576 	list_create(&dld_taskq_list, sizeof (dld_str_t),
577 	    offsetof(dld_str_t, ds_tqlist));
578 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
579 	    &p0, TS_RUN, minclsyspri);
580 }
581 
582 /*
583  * Tear down this module's data structures.
584  */
585 int
586 dld_str_fini(void)
587 {
588 	/*
589 	 * Make sure that there are no objects in use.
590 	 */
591 	if (str_count != 0)
592 		return (EBUSY);
593 
594 	/*
595 	 * Ask the dld_taskq thread to quit and wait for it to be done
596 	 */
597 	mutex_enter(&dld_taskq_lock);
598 	dld_taskq_quit = B_TRUE;
599 	cv_signal(&dld_taskq_cv);
600 	while (!dld_taskq_done)
601 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
602 	mutex_exit(&dld_taskq_lock);
603 	list_destroy(&dld_taskq_list);
604 	/*
605 	 * Destroy object cache.
606 	 */
607 	kmem_cache_destroy(str_cachep);
608 	mod_hash_destroy_idhash(str_hashp);
609 	return (0);
610 }
611 
612 /*
613  * Create a new dld_str_t object.
614  */
615 dld_str_t *
616 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
617 {
618 	dld_str_t	*dsp;
619 	int		err;
620 
621 	/*
622 	 * Allocate an object from the cache.
623 	 */
624 	atomic_inc_32(&str_count);
625 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
626 
627 	/*
628 	 * Allocate the dummy mblk for flow-control.
629 	 */
630 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
631 	if (dsp->ds_tx_flow_mp == NULL) {
632 		kmem_cache_free(str_cachep, dsp);
633 		atomic_dec_32(&str_count);
634 		return (NULL);
635 	}
636 	dsp->ds_type = type;
637 	dsp->ds_major = major;
638 	dsp->ds_style = style;
639 
640 	/*
641 	 * Initialize the queue pointers.
642 	 */
643 	ASSERT(RD(rq) == rq);
644 	dsp->ds_rq = rq;
645 	dsp->ds_wq = WR(rq);
646 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
647 
648 	/*
649 	 * We want explicit control over our write-side STREAMS queue
650 	 * where the dummy mblk gets added/removed for flow-control.
651 	 */
652 	noenable(WR(rq));
653 
654 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
655 	    (mod_hash_val_t)dsp);
656 	ASSERT(err == 0);
657 	return (dsp);
658 }
659 
660 /*
661  * Destroy a dld_str_t object.
662  */
663 void
664 dld_str_destroy(dld_str_t *dsp)
665 {
666 	queue_t		*rq;
667 	queue_t		*wq;
668 	mod_hash_val_t	val;
669 
670 	/*
671 	 * Clear the queue pointers.
672 	 */
673 	rq = dsp->ds_rq;
674 	wq = dsp->ds_wq;
675 	ASSERT(wq == WR(rq));
676 	rq->q_ptr = wq->q_ptr = NULL;
677 	dsp->ds_rq = dsp->ds_wq = NULL;
678 
679 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
680 	ASSERT(dsp->ds_sap == 0);
681 	ASSERT(dsp->ds_mh == NULL);
682 	ASSERT(dsp->ds_mch == NULL);
683 	ASSERT(dsp->ds_promisc == 0);
684 	ASSERT(dsp->ds_mph == NULL);
685 	ASSERT(dsp->ds_mip == NULL);
686 	ASSERT(dsp->ds_mnh == NULL);
687 
688 	ASSERT(dsp->ds_polling == B_FALSE);
689 	ASSERT(dsp->ds_direct == B_FALSE);
690 	ASSERT(dsp->ds_lso == B_FALSE);
691 	ASSERT(dsp->ds_lso_max == 0);
692 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
693 
694 	/*
695 	 * Reinitialize all the flags.
696 	 */
697 	dsp->ds_notifications = 0;
698 	dsp->ds_passivestate = DLD_UNINITIALIZED;
699 	dsp->ds_mode = DLD_UNITDATA;
700 	dsp->ds_native = B_FALSE;
701 	dsp->ds_nonip = B_FALSE;
702 
703 	ASSERT(dsp->ds_datathr_cnt == 0);
704 	ASSERT(dsp->ds_pending_head == NULL);
705 	ASSERT(dsp->ds_pending_tail == NULL);
706 	ASSERT(!dsp->ds_dlpi_pending);
707 
708 	ASSERT(dsp->ds_dlp == NULL);
709 	ASSERT(dsp->ds_dmap == NULL);
710 	ASSERT(dsp->ds_rx == NULL);
711 	ASSERT(dsp->ds_rx_arg == NULL);
712 	ASSERT(dsp->ds_next == NULL);
713 	ASSERT(dsp->ds_head == NULL);
714 
715 	/*
716 	 * Free the dummy mblk if exists.
717 	 */
718 	if (dsp->ds_tx_flow_mp != NULL) {
719 		freeb(dsp->ds_tx_flow_mp);
720 		dsp->ds_tx_flow_mp = NULL;
721 	}
722 
723 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
724 	ASSERT(dsp == (dld_str_t *)val);
725 
726 	/*
727 	 * Free the object back to the cache.
728 	 */
729 	kmem_cache_free(str_cachep, dsp);
730 	atomic_dec_32(&str_count);
731 }
732 
733 /*
734  * kmem_cache contructor function: see kmem_cache_create(9f).
735  */
736 /*ARGSUSED*/
737 static int
738 str_constructor(void *buf, void *cdrarg, int kmflags)
739 {
740 	dld_str_t	*dsp = buf;
741 
742 	bzero(buf, sizeof (dld_str_t));
743 
744 	/*
745 	 * Allocate a new minor number.
746 	 */
747 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
748 		return (-1);
749 
750 	/*
751 	 * Initialize the DLPI state machine.
752 	 */
753 	dsp->ds_dlstate = DL_UNATTACHED;
754 
755 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
756 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
757 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
758 
759 	return (0);
760 }
761 
762 /*
763  * kmem_cache destructor function.
764  */
765 /*ARGSUSED*/
766 static void
767 str_destructor(void *buf, void *cdrarg)
768 {
769 	dld_str_t	*dsp = buf;
770 
771 	/*
772 	 * Release the minor number.
773 	 */
774 	mac_minor_rele(dsp->ds_minor);
775 
776 	ASSERT(dsp->ds_tx_flow_mp == NULL);
777 
778 	mutex_destroy(&dsp->ds_lock);
779 	cv_destroy(&dsp->ds_datathr_cv);
780 	cv_destroy(&dsp->ds_dlpi_pending_cv);
781 }
782 
783 /*
784  * Update the priority bits and VID (may need to insert tag if mp points
785  * to an untagged packet.
786  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
787  */
788 static mblk_t *
789 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
790     link_tagmode_t tagmode)
791 {
792 	mblk_t *hmp;
793 	struct ether_vlan_header *evhp;
794 	struct ether_header *ehp;
795 	uint16_t old_tci = 0;
796 	size_t len;
797 
798 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
799 
800 	evhp = (struct ether_vlan_header *)mp->b_rptr;
801 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
802 		/*
803 		 * Tagged packet, update the priority bits.
804 		 */
805 		len = sizeof (struct ether_vlan_header);
806 
807 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
808 			/*
809 			 * In case some drivers only check the db_ref
810 			 * count of the first mblk, we pullup the
811 			 * message into a single mblk.
812 			 */
813 			hmp = msgpullup(mp, -1);
814 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
815 				freemsg(hmp);
816 				return (NULL);
817 			} else {
818 				freemsg(mp);
819 				mp = hmp;
820 			}
821 		}
822 
823 		evhp = (struct ether_vlan_header *)mp->b_rptr;
824 		old_tci = ntohs(evhp->ether_tci);
825 	} else {
826 		/*
827 		 * Untagged packet.  Two factors will cause us to insert a
828 		 * VLAN header:
829 		 * - This is a VLAN link (vid is specified)
830 		 * - The link supports user priority tagging and the priority
831 		 *   is non-zero.
832 		 */
833 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
834 			return (mp);
835 
836 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
837 		if (hmp == NULL)
838 			return (NULL);
839 
840 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
841 		ehp = (struct ether_header *)mp->b_rptr;
842 
843 		/*
844 		 * Copy the MAC addresses and typelen
845 		 */
846 		bcopy(ehp, evhp, (ETHERADDRL * 2));
847 		evhp->ether_type = ehp->ether_type;
848 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
849 
850 		hmp->b_wptr += sizeof (struct ether_vlan_header);
851 		mp->b_rptr += sizeof (struct ether_header);
852 
853 		/*
854 		 * Free the original message if it's now empty. Link the
855 		 * rest of the messages to the header message.
856 		 */
857 		if (MBLKL(mp) == 0) {
858 			hmp->b_cont = mp->b_cont;
859 			freeb(mp);
860 		} else {
861 			hmp->b_cont = mp;
862 		}
863 		mp = hmp;
864 	}
865 
866 	if (pri == 0)
867 		pri = VLAN_PRI(old_tci);
868 	if (vid == VLAN_ID_NONE)
869 		vid = VLAN_ID(old_tci);
870 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
871 	return (mp);
872 }
873 
874 /*
875  * M_DATA put (IP fast-path mode)
876  */
877 mac_tx_cookie_t
878 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
879     uint16_t flag)
880 {
881 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
882 	mblk_t *newmp;
883 	uint_t pri;
884 	mac_tx_cookie_t cookie;
885 
886 	if (is_ethernet) {
887 		/*
888 		 * Update the priority bits to the assigned priority.
889 		 */
890 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
891 
892 		if (pri != 0) {
893 			newmp = i_dld_ether_header_update_tag(mp, pri,
894 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
895 			if (newmp == NULL)
896 				goto discard;
897 			mp = newmp;
898 		}
899 	}
900 
901 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != 0) {
902 		DLD_SETQFULL(dsp);
903 	}
904 	return (cookie);
905 
906 discard:
907 	/* TODO: bump kstat? */
908 	freemsg(mp);
909 	return (0);
910 }
911 
912 /*
913  * M_DATA put (DLIOCRAW mode)
914  */
915 static void
916 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
917 {
918 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
919 	mblk_t *bp, *newmp;
920 	size_t size;
921 	mac_header_info_t mhi;
922 	uint_t pri, vid, dvid;
923 	uint_t max_sdu;
924 
925 	/*
926 	 * Certain MAC type plugins provide an illusion for raw DLPI
927 	 * consumers.  They pretend that the MAC layer is something that
928 	 * it's not for the benefit of observability tools.  For example,
929 	 * mac_wifi pretends that it's Ethernet for such consumers.
930 	 * Here, unless native mode is enabled, we call into the MAC layer so
931 	 * that this illusion can be maintained.  The plugin will optionally
932 	 * transform the MAC header here into something that can be passed
933 	 * down.  The header goes from raw mode to "cooked" mode.
934 	 */
935 	if (!dsp->ds_native) {
936 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
937 			goto discard;
938 		mp = newmp;
939 	}
940 
941 	size = MBLKL(mp);
942 
943 	/*
944 	 * Check the packet is not too big and that any remaining
945 	 * fragment list is composed entirely of M_DATA messages. (We
946 	 * know the first fragment was M_DATA otherwise we could not
947 	 * have got here).
948 	 */
949 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
950 		if (DB_TYPE(bp) != M_DATA)
951 			goto discard;
952 		size += MBLKL(bp);
953 	}
954 
955 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
956 		goto discard;
957 
958 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
959 	/*
960 	 * If LSO is enabled, check the size against lso_max. Otherwise,
961 	 * compare the packet size with max_sdu.
962 	 */
963 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
964 	if (size > max_sdu + mhi.mhi_hdrsize)
965 		goto discard;
966 
967 	if (is_ethernet) {
968 		dvid = mac_client_vid(dsp->ds_mch);
969 
970 		/*
971 		 * Discard the packet if this is a VLAN stream but the VID in
972 		 * the packet is not correct.
973 		 */
974 		vid = VLAN_ID(mhi.mhi_tci);
975 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
976 			goto discard;
977 
978 		/*
979 		 * Discard the packet if this packet is a tagged packet
980 		 * but both pri and VID are 0.
981 		 */
982 		pri = VLAN_PRI(mhi.mhi_tci);
983 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
984 		    vid == VLAN_ID_NONE)
985 			goto discard;
986 
987 		/*
988 		 * Update the priority bits to the per-stream priority if
989 		 * priority is not set in the packet. Update the VID for
990 		 * packets on a VLAN stream.
991 		 */
992 		pri = (pri == 0) ? dsp->ds_pri : 0;
993 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
994 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
995 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
996 				goto discard;
997 			}
998 			mp = newmp;
999 		}
1000 	}
1001 
1002 	if (DLD_TX(dsp, mp, 0, 0) != 0) {
1003 		/* Turn on flow-control for dld */
1004 		DLD_SETQFULL(dsp);
1005 	}
1006 	return;
1007 
1008 discard:
1009 	/* TODO: bump kstat? */
1010 	freemsg(mp);
1011 }
1012 
1013 /*
1014  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
1015  */
1016 int
1017 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1018 {
1019 	dev_t			dev;
1020 	int			err;
1021 	const char		*drvname;
1022 	mac_perim_handle_t	mph = NULL;
1023 	boolean_t		qassociated = B_FALSE;
1024 	dls_link_t		*dlp = NULL;
1025 	dls_dl_handle_t		ddp = NULL;
1026 
1027 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1028 		return (EINVAL);
1029 
1030 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
1031 		return (ENOTSUP);
1032 
1033 	/*
1034 	 * /dev node access. This will still be supported for backward
1035 	 * compatibility reason.
1036 	 */
1037 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1038 	    (strcmp(drvname, "vnic") != 0)) {
1039 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1040 			return (EINVAL);
1041 		qassociated = B_TRUE;
1042 	}
1043 
1044 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1045 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
1046 		goto failed;
1047 
1048 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
1049 		goto failed;
1050 
1051 	/*
1052 	 * Open a channel.
1053 	 */
1054 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
1055 		goto failed;
1056 
1057 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
1058 		goto failed;
1059 
1060 	/*
1061 	 * Set the default packet priority.
1062 	 */
1063 	dsp->ds_pri = 0;
1064 
1065 	/*
1066 	 * Add a notify function so that the we get updates from the MAC.
1067 	 */
1068 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1069 	dsp->ds_dlstate = DL_UNBOUND;
1070 	mac_perim_exit(mph);
1071 	return (0);
1072 
1073 failed:
1074 	if (dlp != NULL)
1075 		dls_link_rele(dlp);
1076 	if (mph != NULL)
1077 		mac_perim_exit(mph);
1078 	if (ddp != NULL)
1079 		dls_devnet_rele(ddp);
1080 	if (qassociated)
1081 		(void) qassociate(dsp->ds_wq, -1);
1082 
1083 	return (err);
1084 }
1085 
1086 /*
1087  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1088  * from close(2) for style 2.
1089  */
1090 void
1091 dld_str_detach(dld_str_t *dsp)
1092 {
1093 	mac_perim_handle_t	mph;
1094 	int			err;
1095 
1096 	ASSERT(dsp->ds_datathr_cnt == 0);
1097 
1098 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1099 	/*
1100 	 * Remove the notify function.
1101 	 *
1102 	 * Note that we cannot wait for the notification callback to be removed
1103 	 * since it could cause the deadlock with str_notify() since they both
1104 	 * need the mac perimeter. Continue if we cannot remove the
1105 	 * notification callback right now and wait after we leave the
1106 	 * perimeter.
1107 	 */
1108 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1109 	dsp->ds_mnh = NULL;
1110 
1111 	/*
1112 	 * Disable the capabilities
1113 	 */
1114 	dld_capabilities_disable(dsp);
1115 
1116 	/*
1117 	 * Clear LSO flags.
1118 	 */
1119 	dsp->ds_lso = B_FALSE;
1120 	dsp->ds_lso_max = 0;
1121 
1122 	dls_close(dsp);
1123 	mac_perim_exit(mph);
1124 
1125 	/*
1126 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
1127 	 * because the notification callback was in progress, wait for
1128 	 * it to finish before we proceed.
1129 	 */
1130 	if (err != 0)
1131 		mac_notify_remove_wait(dsp->ds_mh);
1132 
1133 	/*
1134 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
1135 	 * automatically in the call to dls_devnet_rele.
1136 	 */
1137 	dls_devnet_rele(dsp->ds_ddh);
1138 
1139 	dsp->ds_sap = 0;
1140 	dsp->ds_mh = NULL;
1141 	dsp->ds_mch = NULL;
1142 	dsp->ds_mip = NULL;
1143 
1144 	if (dsp->ds_style == DL_STYLE2)
1145 		(void) qassociate(dsp->ds_wq, -1);
1146 
1147 	/*
1148 	 * Re-initialize the DLPI state machine.
1149 	 */
1150 	dsp->ds_dlstate = DL_UNATTACHED;
1151 }
1152 
1153 /*
1154  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1155  * tags before sending packets up to the DLS clients, with the exception of
1156  * special priority tagged packets, in that case, we set the VID to 0.
1157  * mp must be a VLAN tagged packet.
1158  */
1159 static mblk_t *
1160 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1161 {
1162 	mblk_t *newmp;
1163 	struct ether_vlan_header *evhp;
1164 	uint16_t tci, new_tci;
1165 
1166 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1167 	if (DB_REF(mp) > 1) {
1168 		newmp = copymsg(mp);
1169 		if (newmp == NULL)
1170 			return (NULL);
1171 		freemsg(mp);
1172 		mp = newmp;
1173 	}
1174 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1175 
1176 	tci = ntohs(evhp->ether_tci);
1177 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1178 		/*
1179 		 * Priority is 0, strip the tag.
1180 		 */
1181 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1182 		mp->b_rptr += VLAN_TAGSZ;
1183 	} else {
1184 		/*
1185 		 * Priority is not 0, update the VID to 0.
1186 		 */
1187 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1188 		evhp->ether_tci = htons(new_tci);
1189 	}
1190 	return (mp);
1191 }
1192 
1193 /*
1194  * Raw mode receive function.
1195  */
1196 /*ARGSUSED*/
1197 void
1198 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1199     mac_header_info_t *mhip)
1200 {
1201 	dld_str_t *dsp = (dld_str_t *)arg;
1202 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1203 	mblk_t *next, *newmp;
1204 
1205 	ASSERT(mp != NULL);
1206 	do {
1207 		/*
1208 		 * Get the pointer to the next packet in the chain and then
1209 		 * clear b_next before the packet gets passed on.
1210 		 */
1211 		next = mp->b_next;
1212 		mp->b_next = NULL;
1213 
1214 		/*
1215 		 * Wind back b_rptr to point at the MAC header.
1216 		 */
1217 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1218 		mp->b_rptr -= mhip->mhi_hdrsize;
1219 
1220 		/*
1221 		 * Certain MAC type plugins provide an illusion for raw
1222 		 * DLPI consumers.  They pretend that the MAC layer is
1223 		 * something that it's not for the benefit of observability
1224 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1225 		 * for such consumers.	Here, unless native mode is enabled,
1226 		 * we call into the MAC layer so that this illusion can be
1227 		 * maintained.	The plugin will optionally transform the MAC
1228 		 * header here into something that can be passed up to raw
1229 		 * consumers.  The header goes from "cooked" mode to raw mode.
1230 		 */
1231 		if (!dsp->ds_native) {
1232 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1233 			if (newmp == NULL) {
1234 				freemsg(mp);
1235 				goto next;
1236 			}
1237 			mp = newmp;
1238 		}
1239 
1240 		/*
1241 		 * Strip the VLAN tag for VLAN streams.
1242 		 */
1243 		if (is_ethernet &&
1244 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1245 			/*
1246 			 * The priority should be kept only for VLAN
1247 			 * data-links.
1248 			 */
1249 			newmp = i_dld_ether_header_strip_tag(mp,
1250 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1251 			if (newmp == NULL) {
1252 				freemsg(mp);
1253 				goto next;
1254 			}
1255 			mp = newmp;
1256 		}
1257 
1258 		/*
1259 		 * Pass the packet on.
1260 		 */
1261 		if (canputnext(dsp->ds_rq))
1262 			putnext(dsp->ds_rq, mp);
1263 		else
1264 			freemsg(mp);
1265 
1266 next:
1267 		/*
1268 		 * Move on to the next packet in the chain.
1269 		 */
1270 		mp = next;
1271 	} while (mp != NULL);
1272 }
1273 
1274 /*
1275  * Fast-path receive function.
1276  */
1277 /*ARGSUSED*/
1278 void
1279 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1280     mac_header_info_t *mhip)
1281 {
1282 	dld_str_t *dsp = (dld_str_t *)arg;
1283 	mblk_t *next;
1284 	size_t offset = 0;
1285 
1286 	/*
1287 	 * MAC header stripping rules:
1288 	 *    - Tagged packets:
1289 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1290 	 *	b. Physical streams
1291 	 *	- VLAN packets (non-zero VID). The stream must be either a
1292 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1293 	 *	  Strip the Ethernet header but keep the VLAN header.
1294 	 *	- Special tagged packets (zero VID)
1295 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1296 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1297 	 *	    keep the VLAN header.
1298 	 *	  * Otherwise, strip the whole VLAN header.
1299 	 *    - Untagged packets. Strip the whole MAC header.
1300 	 */
1301 	if (mhip->mhi_istagged &&
1302 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1303 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1304 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1305 		offset = VLAN_TAGSZ;
1306 	}
1307 
1308 	ASSERT(mp != NULL);
1309 	do {
1310 		/*
1311 		 * Get the pointer to the next packet in the chain and then
1312 		 * clear b_next before the packet gets passed on.
1313 		 */
1314 		next = mp->b_next;
1315 		mp->b_next = NULL;
1316 
1317 		/*
1318 		 * Wind back b_rptr to point at the VLAN header.
1319 		 */
1320 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1321 		mp->b_rptr -= offset;
1322 
1323 		/*
1324 		 * Pass the packet on.
1325 		 */
1326 		if (canputnext(dsp->ds_rq))
1327 			putnext(dsp->ds_rq, mp);
1328 		else
1329 			freemsg(mp);
1330 		/*
1331 		 * Move on to the next packet in the chain.
1332 		 */
1333 		mp = next;
1334 	} while (mp != NULL);
1335 }
1336 
1337 /*
1338  * Default receive function (send DL_UNITDATA_IND messages).
1339  */
1340 /*ARGSUSED*/
1341 void
1342 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1343     mac_header_info_t *mhip)
1344 {
1345 	dld_str_t		*dsp = (dld_str_t *)arg;
1346 	mblk_t			*ud_mp;
1347 	mblk_t			*next;
1348 	size_t			offset = 0;
1349 	boolean_t		strip_vlan = B_TRUE;
1350 
1351 	/*
1352 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1353 	 */
1354 	if (mhip->mhi_istagged &&
1355 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1356 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1357 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1358 		offset = VLAN_TAGSZ;
1359 		strip_vlan = B_FALSE;
1360 	}
1361 
1362 	ASSERT(mp != NULL);
1363 	do {
1364 		/*
1365 		 * Get the pointer to the next packet in the chain and then
1366 		 * clear b_next before the packet gets passed on.
1367 		 */
1368 		next = mp->b_next;
1369 		mp->b_next = NULL;
1370 
1371 		/*
1372 		 * Wind back b_rptr to point at the MAC header.
1373 		 */
1374 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1375 		mp->b_rptr -= mhip->mhi_hdrsize;
1376 
1377 		/*
1378 		 * Create the DL_UNITDATA_IND M_PROTO.
1379 		 */
1380 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1381 			freemsgchain(mp);
1382 			return;
1383 		}
1384 
1385 		/*
1386 		 * Advance b_rptr to point at the payload (or the VLAN header).
1387 		 */
1388 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1389 
1390 		/*
1391 		 * Prepend the DL_UNITDATA_IND.
1392 		 */
1393 		ud_mp->b_cont = mp;
1394 
1395 		/*
1396 		 * Send the message.
1397 		 */
1398 		if (canputnext(dsp->ds_rq))
1399 			putnext(dsp->ds_rq, ud_mp);
1400 		else
1401 			freemsg(ud_mp);
1402 
1403 		/*
1404 		 * Move on to the next packet in the chain.
1405 		 */
1406 		mp = next;
1407 	} while (mp != NULL);
1408 }
1409 
1410 /*
1411  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1412  */
1413 static void
1414 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
1415 {
1416 	mblk_t		*mp;
1417 	dl_notify_ind_t *dlip;
1418 
1419 	if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
1420 		return;
1421 
1422 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1423 	    M_PROTO, 0)) == NULL)
1424 		return;
1425 
1426 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1427 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1428 	dlip->dl_primitive = DL_NOTIFY_IND;
1429 	if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
1430 		dlip->dl_notification = DL_NOTE_SDU_SIZE2;
1431 		dlip->dl_data1 = max_sdu;
1432 		dlip->dl_data2 = multicast_sdu;
1433 	} else {
1434 		dlip->dl_notification = DL_NOTE_SDU_SIZE;
1435 		dlip->dl_data = max_sdu;
1436 	}
1437 
1438 	qreply(dsp->ds_wq, mp);
1439 }
1440 
1441 /*
1442  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1443  * current state of the interface.
1444  */
1445 void
1446 dld_str_notify_ind(dld_str_t *dsp)
1447 {
1448 	mac_notify_type_t	type;
1449 
1450 	for (type = 0; type < MAC_NNOTE; type++)
1451 		str_notify(dsp, type);
1452 }
1453 
/*
 * Layout of the M_PROTO message built by str_unitdata_ind(): the
 * DL_UNITDATA_IND primitive followed by room for the destination and source
 * DLSAP addresses.  Each address slot is sized for a MAC address of up to
 * MAXMACADDRLEN bytes plus a 16-bit SAP value.
 */
typedef struct dl_unitdata_ind_wrapper {
	/* The primitive itself; str_unitdata_ind() asserts it comes first. */
	dl_unitdata_ind_t	dl_unitdata;
	/* Destination MAC address immediately followed by the SAP. */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	/* Source MAC address immediately followed by the SAP. */
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1459 
1460 /*
1461  * Create a DL_UNITDATA_IND M_PROTO message.
1462  */
1463 static mblk_t *
1464 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1465 {
1466 	mblk_t				*nmp;
1467 	dl_unitdata_ind_wrapper_t	*dlwp;
1468 	dl_unitdata_ind_t		*dlp;
1469 	mac_header_info_t		mhi;
1470 	uint_t				addr_length;
1471 	uint8_t				*daddr;
1472 	uint8_t				*saddr;
1473 
1474 	/*
1475 	 * Get the packet header information.
1476 	 */
1477 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1478 		return (NULL);
1479 
1480 	/*
1481 	 * Allocate a message large enough to contain the wrapper structure
1482 	 * defined above.
1483 	 */
1484 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1485 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1486 	    DL_UNITDATA_IND)) == NULL)
1487 		return (NULL);
1488 
1489 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1490 
1491 	dlp = &(dlwp->dl_unitdata);
1492 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1493 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1494 
1495 	/*
1496 	 * Copy in the destination address.
1497 	 */
1498 	addr_length = dsp->ds_mip->mi_addr_length;
1499 	daddr = dlwp->dl_dest_addr;
1500 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1501 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1502 
1503 	/*
1504 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1505 	 */
1506 	if (mhi.mhi_istagged && !strip_vlan)
1507 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1508 	else
1509 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1510 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1511 
1512 	/*
1513 	 * If the destination address was multicast or broadcast then the
1514 	 * dl_group_address field should be non-zero.
1515 	 */
1516 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1517 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1518 
1519 	/*
1520 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1521 	 * for example) may not have access to source information.
1522 	 */
1523 	if (mhi.mhi_saddr == NULL) {
1524 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1525 	} else {
1526 		saddr = dlwp->dl_src_addr;
1527 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1528 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1529 
1530 		/*
1531 		 * Set the source DLSAP to the packet ethertype.
1532 		 */
1533 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1534 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1535 	}
1536 
1537 	return (nmp);
1538 }
1539 
1540 /*
1541  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1542  */
1543 static void
1544 str_notify_promisc_on_phys(dld_str_t *dsp)
1545 {
1546 	mblk_t		*mp;
1547 	dl_notify_ind_t	*dlip;
1548 
1549 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1550 		return;
1551 
1552 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1553 	    M_PROTO, 0)) == NULL)
1554 		return;
1555 
1556 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1557 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1558 	dlip->dl_primitive = DL_NOTIFY_IND;
1559 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1560 
1561 	qreply(dsp->ds_wq, mp);
1562 }
1563 
1564 /*
1565  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1566  */
1567 static void
1568 str_notify_promisc_off_phys(dld_str_t *dsp)
1569 {
1570 	mblk_t		*mp;
1571 	dl_notify_ind_t	*dlip;
1572 
1573 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1574 		return;
1575 
1576 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1577 	    M_PROTO, 0)) == NULL)
1578 		return;
1579 
1580 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1581 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1582 	dlip->dl_primitive = DL_NOTIFY_IND;
1583 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1584 
1585 	qreply(dsp->ds_wq, mp);
1586 }
1587 
1588 /*
1589  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1590  */
1591 static void
1592 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1593 {
1594 	mblk_t		*mp;
1595 	dl_notify_ind_t	*dlip;
1596 	uint_t		addr_length;
1597 	uint16_t	ethertype;
1598 
1599 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1600 		return;
1601 
1602 	addr_length = dsp->ds_mip->mi_addr_length;
1603 	if ((mp = mexchange(dsp->ds_wq, NULL,
1604 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1605 	    M_PROTO, 0)) == NULL)
1606 		return;
1607 
1608 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1609 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1610 	dlip->dl_primitive = DL_NOTIFY_IND;
1611 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1612 	dlip->dl_data = addr_type;
1613 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1614 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1615 
1616 	bcopy(addr, &dlip[1], addr_length);
1617 
1618 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1619 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1620 
1621 	qreply(dsp->ds_wq, mp);
1622 }
1623 
1624 /*
1625  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1626  */
1627 static void
1628 str_notify_link_up(dld_str_t *dsp)
1629 {
1630 	mblk_t		*mp;
1631 	dl_notify_ind_t	*dlip;
1632 
1633 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1634 		return;
1635 
1636 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1637 	    M_PROTO, 0)) == NULL)
1638 		return;
1639 
1640 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1641 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1642 	dlip->dl_primitive = DL_NOTIFY_IND;
1643 	dlip->dl_notification = DL_NOTE_LINK_UP;
1644 
1645 	qreply(dsp->ds_wq, mp);
1646 }
1647 
1648 /*
1649  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1650  */
1651 static void
1652 str_notify_link_down(dld_str_t *dsp)
1653 {
1654 	mblk_t		*mp;
1655 	dl_notify_ind_t	*dlip;
1656 
1657 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1658 		return;
1659 
1660 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1661 	    M_PROTO, 0)) == NULL)
1662 		return;
1663 
1664 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1665 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1666 	dlip->dl_primitive = DL_NOTIFY_IND;
1667 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1668 
1669 	qreply(dsp->ds_wq, mp);
1670 }
1671 
1672 /*
1673  * DL_NOTIFY_IND: DL_NOTE_SPEED
1674  */
1675 static void
1676 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1677 {
1678 	mblk_t		*mp;
1679 	dl_notify_ind_t	*dlip;
1680 
1681 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1682 		return;
1683 
1684 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1685 	    M_PROTO, 0)) == NULL)
1686 		return;
1687 
1688 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1689 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1690 	dlip->dl_primitive = DL_NOTIFY_IND;
1691 	dlip->dl_notification = DL_NOTE_SPEED;
1692 	dlip->dl_data = speed;
1693 
1694 	qreply(dsp->ds_wq, mp);
1695 }
1696 
1697 /*
1698  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1699  */
1700 static void
1701 str_notify_capab_reneg(dld_str_t *dsp)
1702 {
1703 	mblk_t		*mp;
1704 	dl_notify_ind_t	*dlip;
1705 
1706 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1707 		return;
1708 
1709 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1710 	    M_PROTO, 0)) == NULL)
1711 		return;
1712 
1713 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1714 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1715 	dlip->dl_primitive = DL_NOTIFY_IND;
1716 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1717 
1718 	qreply(dsp->ds_wq, mp);
1719 }
1720 
1721 /*
1722  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1723  */
1724 static void
1725 str_notify_fastpath_flush(dld_str_t *dsp)
1726 {
1727 	mblk_t		*mp;
1728 	dl_notify_ind_t	*dlip;
1729 
1730 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1731 		return;
1732 
1733 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1734 	    M_PROTO, 0)) == NULL)
1735 		return;
1736 
1737 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1738 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1739 	dlip->dl_primitive = DL_NOTIFY_IND;
1740 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1741 
1742 	qreply(dsp->ds_wq, mp);
1743 }
1744 
1745 static void
1746 str_notify_allowed_ips(dld_str_t *dsp)
1747 {
1748 	mblk_t		*mp;
1749 	dl_notify_ind_t	*dlip;
1750 	size_t		mp_size;
1751 	mac_protect_t	*mrp;
1752 
1753 	if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1754 		return;
1755 
1756 	mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1757 	if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1758 		return;
1759 
1760 	mrp = mac_protect_get(dsp->ds_mh);
1761 	bzero(mp->b_rptr, mp_size);
1762 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1763 	dlip->dl_primitive = DL_NOTIFY_IND;
1764 	dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1765 	dlip->dl_data = 0;
1766 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1767 	dlip->dl_addr_length = sizeof (mac_protect_t);
1768 	bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1769 	    sizeof (mac_protect_t));
1770 
1771 	qreply(dsp->ds_wq, mp);
1772 }
1773 
1774 /*
1775  * MAC notification callback.
1776  */
1777 void
1778 str_notify(void *arg, mac_notify_type_t type)
1779 {
1780 	dld_str_t		*dsp = (dld_str_t *)arg;
1781 	queue_t			*q = dsp->ds_wq;
1782 	mac_handle_t		mh = dsp->ds_mh;
1783 	mac_client_handle_t	mch = dsp->ds_mch;
1784 	uint8_t			addr[MAXMACADDRLEN];
1785 
1786 	switch (type) {
1787 	case MAC_NOTE_TX:
1788 		qenable(q);
1789 		break;
1790 
1791 	case MAC_NOTE_DEVPROMISC:
1792 		/*
1793 		 * Send the appropriate DL_NOTIFY_IND.
1794 		 */
1795 		if (mac_promisc_get(mh))
1796 			str_notify_promisc_on_phys(dsp);
1797 		else
1798 			str_notify_promisc_off_phys(dsp);
1799 		break;
1800 
1801 	case MAC_NOTE_UNICST:
1802 		/*
1803 		 * This notification is sent whenever the MAC unicast
1804 		 * address changes.
1805 		 */
1806 		mac_unicast_primary_get(mh, addr);
1807 
1808 		/*
1809 		 * Send the appropriate DL_NOTIFY_IND.
1810 		 */
1811 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1812 		break;
1813 
1814 	case MAC_NOTE_DEST:
1815 		/*
1816 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1817 		 * destination address.
1818 		 */
1819 		if (mac_dst_get(dsp->ds_mh, addr))
1820 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1821 		break;
1822 
1823 	case MAC_NOTE_LOWLINK:
1824 	case MAC_NOTE_LINK:
1825 		/*
1826 		 * LOWLINK refers to the actual link status. For links that
1827 		 * are not part of a bridge instance LOWLINK and LINK state
1828 		 * are the same. But for a link part of a bridge instance
1829 		 * LINK state refers to the aggregate link status: "up" when
1830 		 * at least one link part of the bridge is up and is "down"
1831 		 * when all links part of the bridge are down.
1832 		 *
1833 		 * Clients can request to be notified of the LOWLINK state
1834 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1835 		 * daemon request lowlink state changes and upper layer clients
1836 		 * receive notifications of the aggregate link state changes
1837 		 * which is the default when requesting LINK UP/DOWN state
1838 		 * notifications.
1839 		 */
1840 
1841 		/*
1842 		 * Check that the notification type matches the one that we
1843 		 * want.  If we want lower-level link notifications, and this
1844 		 * is upper, or if we want upper and this is lower, then
1845 		 * ignore.
1846 		 */
1847 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1848 			break;
1849 		/*
1850 		 * This notification is sent every time the MAC driver
1851 		 * updates the link state.
1852 		 */
1853 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1854 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1855 		case LINK_STATE_UP: {
1856 			uint64_t speed;
1857 			/*
1858 			 * The link is up so send the appropriate
1859 			 * DL_NOTIFY_IND.
1860 			 */
1861 			str_notify_link_up(dsp);
1862 
1863 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1864 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1865 			break;
1866 		}
1867 		case LINK_STATE_DOWN:
1868 			/*
1869 			 * The link is down so send the appropriate
1870 			 * DL_NOTIFY_IND.
1871 			 */
1872 			str_notify_link_down(dsp);
1873 			break;
1874 
1875 		default:
1876 			break;
1877 		}
1878 		break;
1879 
1880 	case MAC_NOTE_CAPAB_CHG:
1881 		/*
1882 		 * This notification is sent whenever the MAC resources
1883 		 * change or capabilities change. We need to renegotiate
1884 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1885 		 */
1886 		str_notify_capab_reneg(dsp);
1887 		break;
1888 
1889 	case MAC_NOTE_SDU_SIZE: {
1890 		uint_t  max_sdu;
1891 		uint_t	multicast_sdu;
1892 		mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
1893 		str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
1894 		break;
1895 	}
1896 
1897 	case MAC_NOTE_FASTPATH_FLUSH:
1898 		str_notify_fastpath_flush(dsp);
1899 		break;
1900 
1901 	/* Unused notifications */
1902 	case MAC_NOTE_MARGIN:
1903 		break;
1904 
1905 	case MAC_NOTE_ALLOWED_IPS:
1906 		str_notify_allowed_ips(dsp);
1907 		break;
1908 
1909 	default:
1910 		ASSERT(B_FALSE);
1911 		break;
1912 	}
1913 }
1914 
1915 /*
1916  * This function is called via a taskq mechansim to process all control
1917  * messages on a per 'dsp' end point.
1918  */
1919 static void
1920 dld_wput_nondata_task(void *arg)
1921 {
1922 	dld_str_t	*dsp = arg;
1923 	mblk_t		*mp;
1924 
1925 	mutex_enter(&dsp->ds_lock);
1926 	while (dsp->ds_pending_head != NULL) {
1927 		mp = dsp->ds_pending_head;
1928 		dsp->ds_pending_head = mp->b_next;
1929 		mp->b_next = NULL;
1930 		if (dsp->ds_pending_head == NULL)
1931 			dsp->ds_pending_tail = NULL;
1932 		mutex_exit(&dsp->ds_lock);
1933 
1934 		switch (DB_TYPE(mp)) {
1935 		case M_PROTO:
1936 		case M_PCPROTO:
1937 			dld_proto(dsp, mp);
1938 			break;
1939 		case M_IOCTL:
1940 			dld_ioc(dsp, mp);
1941 			break;
1942 		default:
1943 			ASSERT(0);
1944 		}
1945 
1946 		mutex_enter(&dsp->ds_lock);
1947 	}
1948 	ASSERT(dsp->ds_pending_tail == NULL);
1949 	dsp->ds_dlpi_pending = 0;
1950 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1951 	mutex_exit(&dsp->ds_lock);
1952 }
1953 
1954 /*
1955  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1956  * thread is started at boot time.
1957  */
1958 static void
1959 dld_taskq_dispatch(void)
1960 {
1961 	callb_cpr_t	cprinfo;
1962 	dld_str_t	*dsp;
1963 
1964 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1965 	    "dld_taskq_dispatch");
1966 	mutex_enter(&dld_taskq_lock);
1967 
1968 	while (!dld_taskq_quit) {
1969 		dsp = list_head(&dld_taskq_list);
1970 		while (dsp != NULL) {
1971 			list_remove(&dld_taskq_list, dsp);
1972 			mutex_exit(&dld_taskq_lock);
1973 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1974 			    dsp, TQ_SLEEP) != TASKQID_INVALID);
1975 			mutex_enter(&dld_taskq_lock);
1976 			dsp = list_head(&dld_taskq_list);
1977 		}
1978 
1979 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1980 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1981 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1982 	}
1983 
1984 	dld_taskq_done = B_TRUE;
1985 	cv_signal(&dld_taskq_cv);
1986 	CALLB_CPR_EXIT(&cprinfo);
1987 	thread_exit();
1988 }
1989 
1990 /*
1991  * All control operations are serialized on the 'dsp' and are also funneled
1992  * through a taskq mechanism to ensure that subsequent processing has kernel
1993  * context and can safely use cv_wait.
1994  *
1995  * Mechanisms to handle taskq dispatch failures
1996  *
1997  * The only way to be sure that taskq dispatch does not fail is to either
1998  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1999  * some number of entries and make sure that the number of outstanding requests
2000  * are less than that number. We can't use TQ_SLEEP since we don't know the
2001  * context. Nor can we bound the total number of 'dsp' end points. So we are
2002  * unable to use either of the above schemes, and are forced to deal with
2003  * taskq dispatch failures. Note that even dynamic taskq could fail in
2004  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
2005  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
2006  * framework.
2007  *
2008  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
2009  * We also have a single global thread to retry the taskq dispatch. This
2010  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
2011  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
2012  */
2013 static void
2014 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
2015 {
2016 	ASSERT(mp->b_next == NULL);
2017 	mutex_enter(&dsp->ds_lock);
2018 	if (dsp->ds_pending_head != NULL) {
2019 		ASSERT(dsp->ds_dlpi_pending);
2020 		dsp->ds_pending_tail->b_next = mp;
2021 		dsp->ds_pending_tail = mp;
2022 		mutex_exit(&dsp->ds_lock);
2023 		return;
2024 	}
2025 	ASSERT(dsp->ds_pending_tail == NULL);
2026 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2027 	/*
2028 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
2029 	 * thread is still active and is processing the last message, though
2030 	 * the pending queue has been emptied.
2031 	 */
2032 	if (dsp->ds_dlpi_pending) {
2033 		mutex_exit(&dsp->ds_lock);
2034 		return;
2035 	}
2036 
2037 	dsp->ds_dlpi_pending = 1;
2038 	mutex_exit(&dsp->ds_lock);
2039 
2040 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
2041 	    TQ_NOSLEEP) != TASKQID_INVALID)
2042 		return;
2043 
2044 	mutex_enter(&dld_taskq_lock);
2045 	list_insert_tail(&dld_taskq_list, dsp);
2046 	cv_signal(&dld_taskq_cv);
2047 	mutex_exit(&dld_taskq_lock);
2048 }
2049 
2050 /*
2051  * Process an M_IOCTL message.
2052  */
2053 static void
2054 dld_ioc(dld_str_t *dsp, mblk_t *mp)
2055 {
2056 	uint_t			cmd;
2057 
2058 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2059 	ASSERT(dsp->ds_type == DLD_DLPI);
2060 
2061 	switch (cmd) {
2062 	case DLIOCNATIVE:
2063 		ioc_native(dsp, mp);
2064 		break;
2065 	case DLIOCMARGININFO:
2066 		ioc_margin(dsp, mp);
2067 		break;
2068 	case DLIOCRAW:
2069 		ioc_raw(dsp, mp);
2070 		break;
2071 	case DLIOCHDRINFO:
2072 		ioc_fast(dsp, mp);
2073 		break;
2074 	case DLIOCLOWLINK:
2075 		ioc_lowlink(dsp, mp);
2076 		break;
2077 	default:
2078 		ioc(dsp, mp);
2079 	}
2080 }
2081 
2082 /*
2083  * DLIOCNATIVE
2084  */
2085 static void
2086 ioc_native(dld_str_t *dsp, mblk_t *mp)
2087 {
2088 	queue_t *q = dsp->ds_wq;
2089 	const mac_info_t *mip = dsp->ds_mip;
2090 
2091 	/*
2092 	 * Native mode can be enabled if it's disabled and if the
2093 	 * native media type is different.
2094 	 */
2095 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2096 		dsp->ds_native = B_TRUE;
2097 
2098 	if (dsp->ds_native)
2099 		miocack(q, mp, 0, mip->mi_nativemedia);
2100 	else
2101 		miocnak(q, mp, 0, ENOTSUP);
2102 }
2103 
2104 /*
2105  * DLIOCMARGININFO
2106  */
2107 static void
2108 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2109 {
2110 	queue_t *q = dsp->ds_wq;
2111 	uint32_t margin;
2112 	int err;
2113 
2114 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2115 		err = EINVAL;
2116 		goto failed;
2117 	}
2118 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2119 		goto failed;
2120 
2121 	mac_margin_get(dsp->ds_mh, &margin);
2122 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2123 	miocack(q, mp, sizeof (uint32_t), 0);
2124 	return;
2125 
2126 failed:
2127 	miocnak(q, mp, 0, err);
2128 }
2129 
2130 /*
2131  * DLIOCRAW
2132  */
2133 static void
2134 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2135 {
2136 	queue_t *q = dsp->ds_wq;
2137 	mac_perim_handle_t	mph;
2138 
2139 	if (dsp->ds_mh == NULL) {
2140 		dsp->ds_mode = DLD_RAW;
2141 		miocack(q, mp, 0, 0);
2142 		return;
2143 	}
2144 
2145 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2146 	if (dsp->ds_polling || dsp->ds_direct) {
2147 		mac_perim_exit(mph);
2148 		miocnak(q, mp, 0, EPROTO);
2149 		return;
2150 	}
2151 
2152 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2153 		/*
2154 		 * Set the receive callback.
2155 		 */
2156 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2157 	}
2158 
2159 	/*
2160 	 * Note that raw mode is enabled.
2161 	 */
2162 	dsp->ds_mode = DLD_RAW;
2163 	mac_perim_exit(mph);
2164 
2165 	miocack(q, mp, 0, 0);
2166 }
2167 
2168 /*
2169  * DLIOCHDRINFO
2170  */
2171 static void
2172 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2173 {
2174 	dl_unitdata_req_t *dlp;
2175 	off_t		off;
2176 	size_t		len;
2177 	const uint8_t	*addr;
2178 	uint16_t	sap;
2179 	mblk_t		*nmp;
2180 	mblk_t		*hmp;
2181 	uint_t		addr_length;
2182 	queue_t		*q = dsp->ds_wq;
2183 	int		err;
2184 	mac_perim_handle_t	mph;
2185 
2186 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2187 		err = ENOTSUP;
2188 		goto failed;
2189 	}
2190 
2191 	/*
2192 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2193 	 * user-land should not be allowed.
2194 	 */
2195 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2196 		err = EINVAL;
2197 		goto failed;
2198 	}
2199 
2200 	nmp = mp->b_cont;
2201 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2202 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2203 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2204 		err = EINVAL;
2205 		goto failed;
2206 	}
2207 
2208 	off = dlp->dl_dest_addr_offset;
2209 	len = dlp->dl_dest_addr_length;
2210 
2211 	if (!MBLKIN(nmp, off, len)) {
2212 		err = EINVAL;
2213 		goto failed;
2214 	}
2215 
2216 	if (dsp->ds_dlstate != DL_IDLE) {
2217 		err = ENOTSUP;
2218 		goto failed;
2219 	}
2220 
2221 	addr_length = dsp->ds_mip->mi_addr_length;
2222 	if (len != addr_length + sizeof (uint16_t)) {
2223 		err = EINVAL;
2224 		goto failed;
2225 	}
2226 
2227 	addr = nmp->b_rptr + off;
2228 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2229 
2230 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2231 		err = ENOMEM;
2232 		goto failed;
2233 	}
2234 
2235 	/*
2236 	 * This ioctl might happen concurrently with a direct call to dld_capab
2237 	 * that tries to enable direct and/or poll capabilities. Since the
2238 	 * stack does not serialize them, we do so here to avoid mixing
2239 	 * the callbacks.
2240 	 */
2241 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2242 	if (dsp->ds_mode != DLD_FASTPATH) {
2243 		/*
2244 		 * Set the receive callback (unless polling is enabled).
2245 		 */
2246 		if (!dsp->ds_polling && !dsp->ds_direct)
2247 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2248 
2249 		/*
2250 		 * Note that fast-path mode is enabled.
2251 		 */
2252 		dsp->ds_mode = DLD_FASTPATH;
2253 	}
2254 	mac_perim_exit(mph);
2255 
2256 	freemsg(nmp->b_cont);
2257 	nmp->b_cont = hmp;
2258 
2259 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2260 	return;
2261 failed:
2262 	miocnak(q, mp, 0, err);
2263 }
2264 
2265 /*
2266  * DLIOCLOWLINK: request actual link state changes. When the
2267  * link is part of a bridge instance the client receives actual
2268  * link state changes and not the aggregate link status. Used by
2269  * the bridging daemon (bridged) for proper RSTP operation.
2270  */
2271 static void
2272 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2273 {
2274 	queue_t *q = dsp->ds_wq;
2275 	int err;
2276 
2277 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2278 		miocnak(q, mp, 0, err);
2279 	} else {
2280 		/* LINTED: alignment */
2281 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2282 		miocack(q, mp, 0, 0);
2283 	}
2284 }
2285 
2286 /*
2287  * Catch-all handler.
2288  */
2289 static void
2290 ioc(dld_str_t *dsp, mblk_t *mp)
2291 {
2292 	queue_t	*q = dsp->ds_wq;
2293 
2294 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2295 		miocnak(q, mp, 0, EINVAL);
2296 		return;
2297 	}
2298 	mac_ioctl(dsp->ds_mh, q, mp);
2299 }
2300