xref: /titanic_50/usr/src/uts/common/io/dld/dld_str.c (revision 506ea5b80cb420d5edeb6aff19fe035c0a283bb3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Data-Link Driver
27  */
28 
29 #include	<inet/common.h>
30 #include	<sys/strsubr.h>
31 #include	<sys/stropts.h>
32 #include	<sys/strsun.h>
33 #include	<sys/vlan.h>
34 #include	<sys/dld_impl.h>
35 #include	<sys/cpuvar.h>
36 #include	<sys/callb.h>
37 #include	<sys/list.h>
38 #include	<sys/mac_client.h>
39 #include	<sys/mac_client_priv.h>
40 #include	<sys/mac_flow.h>
41 
42 static int	str_constructor(void *, void *, int);
43 static void	str_destructor(void *, void *);
44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
45 static void	str_notify_promisc_on_phys(dld_str_t *);
46 static void	str_notify_promisc_off_phys(dld_str_t *);
47 static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
48 static void	str_notify_link_up(dld_str_t *);
49 static void	str_notify_link_down(dld_str_t *);
50 static void	str_notify_capab_reneg(dld_str_t *);
51 static void	str_notify_speed(dld_str_t *, uint32_t);
52 
53 static void	ioc_native(dld_str_t *,  mblk_t *);
54 static void	ioc_margin(dld_str_t *, mblk_t *);
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc_lowlink(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
61 
62 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
63 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
64     link_tagmode_t);
65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
66 
67 static uint32_t		str_count;
68 static kmem_cache_t	*str_cachep;
69 static mod_hash_t	*str_hashp;
70 
71 #define	STR_HASHSZ		64
72 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
73 
74 #define	dld_taskq	system_taskq
75 
76 static kmutex_t		dld_taskq_lock;
77 static kcondvar_t	dld_taskq_cv;
78 static list_t		dld_taskq_list;		/* List of dld_str_t */
79 boolean_t		dld_taskq_quit;
80 boolean_t		dld_taskq_done;
81 
82 static void		dld_taskq_dispatch(void);
83 
84 /*
85  * Some notes on entry points, flow-control, queueing.
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
93  *
94  * The transmit logic operates in the following way: All packets coming
95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
96  * happens when the MAC layer indicates the packets couldn't be
97  * transmitted due to 1) lack of resources (e.g. running out of
98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
99  * particular flow. The indication comes in the form of a Tx cookie that
100  * identifies the blocked ring. In such case, DLD will place a
101  * dummy message on its write-side STREAMS queue so that the queue is
102  * marked as "full". Any subsequent packets arriving at the driver will
 * still be sent to the MAC layer where they either get queued in the Tx
 * SRS or are discarded if the queue limit is exceeded. The write-side STREAMS
105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
106  * When the write service procedure runs, it will remove the dummy
107  * message from the write-side STREAMS queue; in effect this will trigger
108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
109  * respectively, due to the above reasons.
110  *
111  * All non-data operations, both DLPI and ioctls are single threaded on a per
112  * dld_str_t endpoint. This is done using a taskq so that the control operation
113  * has kernel context and can cv_wait for resources. In addition all set type
114  * operations that involve mac level state modification are serialized on a
115  * per mac end point using the perimeter mechanism provided by the mac layer.
116  * This serializes all mac clients trying to modify a single mac end point over
117  * the entire sequence of mac calls made by that client as an atomic unit. The
118  * mac framework locking is described in mac.c. A critical element is that
119  * DLD/DLS does not hold any locks across the mac perimeter.
120  *
121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
124  * is returned. If the mac handle is non-null, it can be safely accessed
125  * below. The mac handle won't be freed until the mac_unregister which
126  * won't happen until the driver detaches. The DDI framework ensures that
127  * the detach won't happen while a getinfo is in progress.
128  */
129 typedef struct i_dld_str_state_s {
130 	major_t		ds_major;
131 	minor_t		ds_minor;
132 	int		ds_instance;
133 	dev_info_t	*ds_dip;
134 } i_dld_str_state_t;
135 
136 /* ARGSUSED */
137 static uint_t
138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
139 {
140 	i_dld_str_state_t	*statep = arg;
141 	dld_str_t		*dsp = (dld_str_t *)val;
142 	mac_handle_t		mh;
143 
144 	if (statep->ds_major != dsp->ds_major)
145 		return (MH_WALK_CONTINUE);
146 
147 	ASSERT(statep->ds_minor != 0);
148 	mh = dsp->ds_mh;
149 
150 	if (statep->ds_minor == dsp->ds_minor) {
151 		/*
152 		 * Clone: a clone minor is unique. we can terminate the
153 		 * walk if we find a matching stream -- even if we fail
154 		 * to obtain the devinfo.
155 		 */
156 		if (mh != NULL) {
157 			statep->ds_dip = mac_devinfo_get(mh);
158 			statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
159 		}
160 		return (MH_WALK_TERMINATE);
161 	}
162 	return (MH_WALK_CONTINUE);
163 }
164 
165 static dev_info_t *
166 dld_finddevinfo(dev_t dev)
167 {
168 	dev_info_t		*dip;
169 	i_dld_str_state_t	state;
170 
171 	if (getminor(dev) == 0)
172 		return (NULL);
173 
174 	/*
175 	 * See if it's a minor node of a link
176 	 */
177 	if ((dip = dls_link_devinfo(dev)) != NULL)
178 		return (dip);
179 
180 	state.ds_minor = getminor(dev);
181 	state.ds_major = getmajor(dev);
182 	state.ds_dip = NULL;
183 	state.ds_instance = -1;
184 
185 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
186 	return (state.ds_dip);
187 }
188 
189 int
190 dld_devt_to_instance(dev_t dev)
191 {
192 	minor_t			minor;
193 	i_dld_str_state_t	state;
194 
195 	/*
196 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
197 	 * Minor number 0 is reserved for the DLPI style 2 unattached
198 	 * node.
199 	 */
200 
201 	if ((minor = getminor(dev)) == 0)
202 		return (-1);
203 
204 	/*
205 	 * Check for unopened style 1 node.
206 	 * Note that this doesn't *necessarily* work for legacy
207 	 * devices, but this code is only called within the
208 	 * getinfo(9e) implementation for true GLDv3 devices, so it
209 	 * doesn't matter.
210 	 */
211 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
212 		return (DLS_MINOR2INST(minor));
213 	}
214 
215 	state.ds_minor = getminor(dev);
216 	state.ds_major = getmajor(dev);
217 	state.ds_dip = NULL;
218 	state.ds_instance = -1;
219 
220 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
221 	return (state.ds_instance);
222 }
223 
224 /*
225  * devo_getinfo: getinfo(9e)
226  *
227  * NB: This may be called for a provider before the provider's
228  * instances are attached.  Hence, if a particular provider needs a
229  * special mapping (the mac instance != ddi_get_instance()), then it
230  * may need to provide its own implmentation using the
231  * mac_devt_to_instance() function, and translating the returned mac
232  * instance to a devinfo instance.  For dev_t's where the minor number
233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
234  * function indirectly via the mac_getinfo() function.
235  */
236 /*ARGSUSED*/
237 int
238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
239 {
240 	dev_info_t	*devinfo;
241 	minor_t		minor = getminor((dev_t)arg);
242 	int		rc = DDI_FAILURE;
243 
244 	switch (cmd) {
245 	case DDI_INFO_DEVT2DEVINFO:
246 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
247 			*(dev_info_t **)resp = devinfo;
248 			rc = DDI_SUCCESS;
249 		}
250 		break;
251 	case DDI_INFO_DEVT2INSTANCE:
252 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
253 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
254 			rc = DDI_SUCCESS;
255 		} else if (minor > DLS_MAX_MINOR &&
256 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
257 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
258 			rc = DDI_SUCCESS;
259 		}
260 		break;
261 	}
262 	return (rc);
263 }
264 
265 void *
266 dld_str_private(queue_t *q)
267 {
268 	return (((dld_str_t *)(q->q_ptr))->ds_private);
269 }
270 
271 int
272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
273 {
274 	dld_str_t	*dsp;
275 	major_t		major;
276 	minor_t		minor;
277 	int		err;
278 
279 	major = getmajor(*devp);
280 	minor = getminor(*devp);
281 
282 	/*
283 	 * Create a new dld_str_t for the stream. This will grab a new minor
284 	 * number that will be handed back in the cloned dev_t.  Creation may
285 	 * fail if we can't allocate the dummy mblk used for flow-control.
286 	 */
287 	dsp = dld_str_create(rq, DLD_DLPI, major,
288 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
289 	if (dsp == NULL)
290 		return (ENOSR);
291 
292 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
293 	dsp->ds_private = private;
294 	if (minor != 0) {
295 		/*
296 		 * Style 1 open
297 		 */
298 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
299 			goto failed;
300 
301 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
302 	} else {
303 		(void) qassociate(rq, -1);
304 	}
305 
306 	/*
307 	 * Enable the queue srv(9e) routine.
308 	 */
309 	qprocson(rq);
310 
311 	/*
312 	 * Construct a cloned dev_t to hand back.
313 	 */
314 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
315 	return (0);
316 
317 failed:
318 	dld_str_destroy(dsp);
319 	return (err);
320 }
321 
322 int
323 dld_str_close(queue_t *rq)
324 {
325 	dld_str_t	*dsp = rq->q_ptr;
326 
327 	/*
328 	 * All modules on top have been popped off. So there can't be any
329 	 * threads from the top.
330 	 */
331 	ASSERT(dsp->ds_datathr_cnt == 0);
332 
333 	/*
334 	 * Wait until pending DLPI requests are processed.
335 	 */
336 	mutex_enter(&dsp->ds_lock);
337 	while (dsp->ds_dlpi_pending)
338 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
339 	mutex_exit(&dsp->ds_lock);
340 
341 
342 	/*
343 	 * This stream was open to a provider node. Check to see
344 	 * if it has been cleanly shut down.
345 	 */
346 	if (dsp->ds_dlstate != DL_UNATTACHED) {
347 		/*
348 		 * The stream is either open to a style 1 provider or
349 		 * this is not clean shutdown. Detach from the PPA.
350 		 * (This is still ok even in the style 1 case).
351 		 */
352 		dld_str_detach(dsp);
353 	}
354 
355 	dld_str_destroy(dsp);
356 	return (0);
357 }
358 
359 /*
360  * qi_qopen: open(9e)
361  */
362 /*ARGSUSED*/
363 int
364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
365 {
366 	if (sflag == MODOPEN)
367 		return (ENOTSUP);
368 
369 	/*
370 	 * This is a cloning driver and therefore each queue should only
371 	 * ever get opened once.
372 	 */
373 	if (rq->q_ptr != NULL)
374 		return (EBUSY);
375 
376 	return (dld_str_open(rq, devp, NULL));
377 }
378 
379 /*
380  * qi_qclose: close(9e)
381  */
382 int
383 dld_close(queue_t *rq)
384 {
385 	/*
386 	 * Disable the queue srv(9e) routine.
387 	 */
388 	qprocsoff(rq);
389 
390 	return (dld_str_close(rq));
391 }
392 
393 /*
394  * qi_qputp: put(9e)
395  */
396 void
397 dld_wput(queue_t *wq, mblk_t *mp)
398 {
399 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
400 	dld_str_mode_t	mode;
401 
402 	switch (DB_TYPE(mp)) {
403 	case M_DATA:
404 		mutex_enter(&dsp->ds_lock);
405 		mode = dsp->ds_mode;
406 		if ((dsp->ds_dlstate != DL_IDLE) ||
407 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
408 			mutex_exit(&dsp->ds_lock);
409 			freemsg(mp);
410 			break;
411 		}
412 
413 		DLD_DATATHR_INC(dsp);
414 		mutex_exit(&dsp->ds_lock);
415 		if (mode == DLD_FASTPATH) {
416 			if (dsp->ds_mip->mi_media == DL_ETHER &&
417 			    (MBLKL(mp) < sizeof (struct ether_header))) {
418 				freemsg(mp);
419 			} else {
420 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
421 			}
422 		} else {
423 			str_mdata_raw_put(dsp, mp);
424 		}
425 		DLD_DATATHR_DCR(dsp);
426 		break;
427 	case M_PROTO:
428 	case M_PCPROTO: {
429 		t_uscalar_t	prim;
430 
431 		if (MBLKL(mp) < sizeof (t_uscalar_t))
432 			break;
433 
434 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
435 
436 		if (prim == DL_UNITDATA_REQ) {
437 			proto_unitdata_req(dsp, mp);
438 		} else {
439 			dld_wput_nondata(dsp, mp);
440 		}
441 		break;
442 	}
443 
444 	case M_IOCTL:
445 		dld_wput_nondata(dsp, mp);
446 		break;
447 
448 	case M_FLUSH:
449 		if (*mp->b_rptr & FLUSHW) {
450 			DLD_CLRQFULL(dsp);
451 			*mp->b_rptr &= ~FLUSHW;
452 		}
453 
454 		if (*mp->b_rptr & FLUSHR) {
455 			qreply(wq, mp);
456 		} else {
457 			freemsg(mp);
458 		}
459 		break;
460 
461 	default:
462 		freemsg(mp);
463 		break;
464 	}
465 }
466 
467 /*
468  * qi_srvp: srv(9e)
469  */
470 void
471 dld_wsrv(queue_t *wq)
472 {
473 	dld_str_t	*dsp = wq->q_ptr;
474 
475 	DLD_CLRQFULL(dsp);
476 }
477 
478 void
479 dld_init_ops(struct dev_ops *ops, const char *name)
480 {
481 	struct streamtab *stream;
482 	struct qinit *rq, *wq;
483 	struct module_info *modinfo;
484 
485 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
486 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
487 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
488 	modinfo->mi_minpsz = 0;
489 	modinfo->mi_maxpsz = 64*1024;
490 	modinfo->mi_hiwat  = 1;
491 	modinfo->mi_lowat = 0;
492 
493 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
494 	rq->qi_qopen = dld_open;
495 	rq->qi_qclose = dld_close;
496 	rq->qi_minfo = modinfo;
497 
498 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
499 	wq->qi_putp = (pfi_t)dld_wput;
500 	wq->qi_srvp = (pfi_t)dld_wsrv;
501 	wq->qi_minfo = modinfo;
502 
503 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
504 	stream->st_rdinit = rq;
505 	stream->st_wrinit = wq;
506 	ops->devo_cb_ops->cb_str = stream;
507 
508 	if (ops->devo_getinfo == NULL)
509 		ops->devo_getinfo = &dld_getinfo;
510 }
511 
512 void
513 dld_fini_ops(struct dev_ops *ops)
514 {
515 	struct streamtab *stream;
516 	struct qinit *rq, *wq;
517 	struct module_info *modinfo;
518 
519 	stream = ops->devo_cb_ops->cb_str;
520 	rq = stream->st_rdinit;
521 	wq = stream->st_wrinit;
522 	modinfo = rq->qi_minfo;
523 	ASSERT(wq->qi_minfo == modinfo);
524 
525 	kmem_free(stream, sizeof (struct streamtab));
526 	kmem_free(wq, sizeof (struct qinit));
527 	kmem_free(rq, sizeof (struct qinit));
528 	kmem_free(modinfo->mi_idname, FMNAMESZ);
529 	kmem_free(modinfo, sizeof (struct module_info));
530 }
531 
532 /*
533  * Initialize this module's data structures.
534  */
535 void
536 dld_str_init(void)
537 {
538 	/*
539 	 * Create dld_str_t object cache.
540 	 */
541 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
542 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
543 	ASSERT(str_cachep != NULL);
544 
545 	/*
546 	 * Create a hash table for maintaining dld_str_t's.
547 	 * The ds_minor field (the clone minor number) of a dld_str_t
548 	 * is used as a key for this hash table because this number is
549 	 * globally unique (allocated from "dls_minor_arena").
550 	 */
551 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
552 	    mod_hash_null_valdtor);
553 
554 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
555 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
556 
557 	dld_taskq_quit = B_FALSE;
558 	dld_taskq_done = B_FALSE;
559 	list_create(&dld_taskq_list, sizeof (dld_str_t),
560 	    offsetof(dld_str_t, ds_tqlist));
561 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
562 	    &p0, TS_RUN, minclsyspri);
563 }
564 
565 /*
566  * Tear down this module's data structures.
567  */
568 int
569 dld_str_fini(void)
570 {
571 	/*
572 	 * Make sure that there are no objects in use.
573 	 */
574 	if (str_count != 0)
575 		return (EBUSY);
576 
577 	/*
578 	 * Ask the dld_taskq thread to quit and wait for it to be done
579 	 */
580 	mutex_enter(&dld_taskq_lock);
581 	dld_taskq_quit = B_TRUE;
582 	cv_signal(&dld_taskq_cv);
583 	while (!dld_taskq_done)
584 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
585 	mutex_exit(&dld_taskq_lock);
586 	list_destroy(&dld_taskq_list);
587 	/*
588 	 * Destroy object cache.
589 	 */
590 	kmem_cache_destroy(str_cachep);
591 	mod_hash_destroy_idhash(str_hashp);
592 	return (0);
593 }
594 
595 /*
596  * Create a new dld_str_t object.
597  */
598 dld_str_t *
599 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
600 {
601 	dld_str_t	*dsp;
602 	int		err;
603 
604 	/*
605 	 * Allocate an object from the cache.
606 	 */
607 	atomic_add_32(&str_count, 1);
608 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
609 
610 	/*
611 	 * Allocate the dummy mblk for flow-control.
612 	 */
613 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
614 	if (dsp->ds_tx_flow_mp == NULL) {
615 		kmem_cache_free(str_cachep, dsp);
616 		atomic_add_32(&str_count, -1);
617 		return (NULL);
618 	}
619 	dsp->ds_type = type;
620 	dsp->ds_major = major;
621 	dsp->ds_style = style;
622 
623 	/*
624 	 * Initialize the queue pointers.
625 	 */
626 	ASSERT(RD(rq) == rq);
627 	dsp->ds_rq = rq;
628 	dsp->ds_wq = WR(rq);
629 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
630 
631 	/*
632 	 * We want explicit control over our write-side STREAMS queue
633 	 * where the dummy mblk gets added/removed for flow-control.
634 	 */
635 	noenable(WR(rq));
636 
637 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
638 	    (mod_hash_val_t)dsp);
639 	ASSERT(err == 0);
640 	return (dsp);
641 }
642 
643 /*
644  * Destroy a dld_str_t object.
645  */
646 void
647 dld_str_destroy(dld_str_t *dsp)
648 {
649 	queue_t		*rq;
650 	queue_t		*wq;
651 	mod_hash_val_t	val;
652 
653 	/*
654 	 * Clear the queue pointers.
655 	 */
656 	rq = dsp->ds_rq;
657 	wq = dsp->ds_wq;
658 	ASSERT(wq == WR(rq));
659 	rq->q_ptr = wq->q_ptr = NULL;
660 	dsp->ds_rq = dsp->ds_wq = NULL;
661 
662 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
663 	ASSERT(dsp->ds_sap == 0);
664 	ASSERT(dsp->ds_mh == NULL);
665 	ASSERT(dsp->ds_mch == NULL);
666 	ASSERT(dsp->ds_promisc == 0);
667 	ASSERT(dsp->ds_mph == NULL);
668 	ASSERT(dsp->ds_mip == NULL);
669 	ASSERT(dsp->ds_mnh == NULL);
670 
671 	ASSERT(dsp->ds_polling == B_FALSE);
672 	ASSERT(dsp->ds_direct == B_FALSE);
673 	ASSERT(dsp->ds_lso == B_FALSE);
674 	ASSERT(dsp->ds_lso_max == 0);
675 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
676 
677 	/*
678 	 * Reinitialize all the flags.
679 	 */
680 	dsp->ds_notifications = 0;
681 	dsp->ds_passivestate = DLD_UNINITIALIZED;
682 	dsp->ds_mode = DLD_UNITDATA;
683 	dsp->ds_native = B_FALSE;
684 	dsp->ds_nonip = B_FALSE;
685 
686 	ASSERT(dsp->ds_datathr_cnt == 0);
687 	ASSERT(dsp->ds_pending_head == NULL);
688 	ASSERT(dsp->ds_pending_tail == NULL);
689 	ASSERT(!dsp->ds_dlpi_pending);
690 
691 	ASSERT(dsp->ds_dlp == NULL);
692 	ASSERT(dsp->ds_dmap == NULL);
693 	ASSERT(dsp->ds_rx == NULL);
694 	ASSERT(dsp->ds_rx_arg == NULL);
695 	ASSERT(dsp->ds_next == NULL);
696 	ASSERT(dsp->ds_head == NULL);
697 
698 	/*
699 	 * Free the dummy mblk if exists.
700 	 */
701 	if (dsp->ds_tx_flow_mp != NULL) {
702 		freeb(dsp->ds_tx_flow_mp);
703 		dsp->ds_tx_flow_mp = NULL;
704 	}
705 
706 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
707 	ASSERT(dsp == (dld_str_t *)val);
708 
709 	/*
710 	 * Free the object back to the cache.
711 	 */
712 	kmem_cache_free(str_cachep, dsp);
713 	atomic_add_32(&str_count, -1);
714 }
715 
716 /*
717  * kmem_cache contructor function: see kmem_cache_create(9f).
718  */
719 /*ARGSUSED*/
720 static int
721 str_constructor(void *buf, void *cdrarg, int kmflags)
722 {
723 	dld_str_t	*dsp = buf;
724 
725 	bzero(buf, sizeof (dld_str_t));
726 
727 	/*
728 	 * Allocate a new minor number.
729 	 */
730 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
731 		return (-1);
732 
733 	/*
734 	 * Initialize the DLPI state machine.
735 	 */
736 	dsp->ds_dlstate = DL_UNATTACHED;
737 
738 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
739 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
740 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
741 
742 	return (0);
743 }
744 
745 /*
746  * kmem_cache destructor function.
747  */
748 /*ARGSUSED*/
749 static void
750 str_destructor(void *buf, void *cdrarg)
751 {
752 	dld_str_t	*dsp = buf;
753 
754 	/*
755 	 * Release the minor number.
756 	 */
757 	mac_minor_rele(dsp->ds_minor);
758 
759 	ASSERT(dsp->ds_tx_flow_mp == NULL);
760 
761 	mutex_destroy(&dsp->ds_lock);
762 	cv_destroy(&dsp->ds_datathr_cv);
763 	cv_destroy(&dsp->ds_dlpi_pending_cv);
764 }
765 
766 /*
767  * Update the priority bits and VID (may need to insert tag if mp points
768  * to an untagged packet.
769  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
770  */
771 static mblk_t *
772 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
773     link_tagmode_t tagmode)
774 {
775 	mblk_t *hmp;
776 	struct ether_vlan_header *evhp;
777 	struct ether_header *ehp;
778 	uint16_t old_tci = 0;
779 	size_t len;
780 
781 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
782 
783 	evhp = (struct ether_vlan_header *)mp->b_rptr;
784 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
785 		/*
786 		 * Tagged packet, update the priority bits.
787 		 */
788 		len = sizeof (struct ether_vlan_header);
789 
790 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
791 			/*
792 			 * In case some drivers only check the db_ref
793 			 * count of the first mblk, we pullup the
794 			 * message into a single mblk.
795 			 */
796 			hmp = msgpullup(mp, -1);
797 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
798 				freemsg(hmp);
799 				return (NULL);
800 			} else {
801 				freemsg(mp);
802 				mp = hmp;
803 			}
804 		}
805 
806 		evhp = (struct ether_vlan_header *)mp->b_rptr;
807 		old_tci = ntohs(evhp->ether_tci);
808 	} else {
809 		/*
810 		 * Untagged packet.  Two factors will cause us to insert a
811 		 * VLAN header:
812 		 * - This is a VLAN link (vid is specified)
813 		 * - The link supports user priority tagging and the priority
814 		 *   is non-zero.
815 		 */
816 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
817 			return (mp);
818 
819 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
820 		if (hmp == NULL)
821 			return (NULL);
822 
823 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
824 		ehp = (struct ether_header *)mp->b_rptr;
825 
826 		/*
827 		 * Copy the MAC addresses and typelen
828 		 */
829 		bcopy(ehp, evhp, (ETHERADDRL * 2));
830 		evhp->ether_type = ehp->ether_type;
831 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
832 
833 		hmp->b_wptr += sizeof (struct ether_vlan_header);
834 		mp->b_rptr += sizeof (struct ether_header);
835 
836 		/*
837 		 * Free the original message if it's now empty. Link the
838 		 * rest of the messages to the header message.
839 		 */
840 		if (MBLKL(mp) == 0) {
841 			hmp->b_cont = mp->b_cont;
842 			freeb(mp);
843 		} else {
844 			hmp->b_cont = mp;
845 		}
846 		mp = hmp;
847 	}
848 
849 	if (pri == 0)
850 		pri = VLAN_PRI(old_tci);
851 	if (vid == VLAN_ID_NONE)
852 		vid = VLAN_ID(old_tci);
853 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
854 	return (mp);
855 }
856 
857 /*
858  * M_DATA put (IP fast-path mode)
859  */
860 mac_tx_cookie_t
861 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
862     uint16_t flag)
863 {
864 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
865 	mblk_t *newmp;
866 	uint_t pri;
867 	mac_tx_cookie_t cookie;
868 
869 	if (is_ethernet) {
870 		/*
871 		 * Update the priority bits to the assigned priority.
872 		 */
873 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
874 
875 		if (pri != 0) {
876 			newmp = i_dld_ether_header_update_tag(mp, pri,
877 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
878 			if (newmp == NULL)
879 				goto discard;
880 			mp = newmp;
881 		}
882 	}
883 
884 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
885 		DLD_SETQFULL(dsp);
886 	}
887 	return (cookie);
888 
889 discard:
890 	/* TODO: bump kstat? */
891 	freemsg(mp);
892 	return (NULL);
893 }
894 
895 /*
896  * M_DATA put (DLIOCRAW mode)
897  */
898 static void
899 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
900 {
901 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
902 	mblk_t *bp, *newmp;
903 	size_t size;
904 	mac_header_info_t mhi;
905 	uint_t pri, vid, dvid;
906 	uint_t max_sdu;
907 
908 	/*
909 	 * Certain MAC type plugins provide an illusion for raw DLPI
910 	 * consumers.  They pretend that the MAC layer is something that
911 	 * it's not for the benefit of observability tools.  For example,
912 	 * mac_wifi pretends that it's Ethernet for such consumers.
913 	 * Here, unless native mode is enabled, we call into the MAC layer so
914 	 * that this illusion can be maintained.  The plugin will optionally
915 	 * transform the MAC header here into something that can be passed
916 	 * down.  The header goes from raw mode to "cooked" mode.
917 	 */
918 	if (!dsp->ds_native) {
919 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
920 			goto discard;
921 		mp = newmp;
922 	}
923 
924 	size = MBLKL(mp);
925 
926 	/*
927 	 * Check the packet is not too big and that any remaining
928 	 * fragment list is composed entirely of M_DATA messages. (We
929 	 * know the first fragment was M_DATA otherwise we could not
930 	 * have got here).
931 	 */
932 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
933 		if (DB_TYPE(bp) != M_DATA)
934 			goto discard;
935 		size += MBLKL(bp);
936 	}
937 
938 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
939 		goto discard;
940 
941 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
942 	/*
943 	 * If LSO is enabled, check the size against lso_max. Otherwise,
944 	 * compare the packet size with max_sdu.
945 	 */
946 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
947 	if (size > max_sdu + mhi.mhi_hdrsize)
948 		goto discard;
949 
950 	if (is_ethernet) {
951 		dvid = mac_client_vid(dsp->ds_mch);
952 
953 		/*
954 		 * Discard the packet if this is a VLAN stream but the VID in
955 		 * the packet is not correct.
956 		 */
957 		vid = VLAN_ID(mhi.mhi_tci);
958 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
959 			goto discard;
960 
961 		/*
962 		 * Discard the packet if this packet is a tagged packet
963 		 * but both pri and VID are 0.
964 		 */
965 		pri = VLAN_PRI(mhi.mhi_tci);
966 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
967 		    vid == VLAN_ID_NONE)
968 			goto discard;
969 
970 		/*
971 		 * Update the priority bits to the per-stream priority if
972 		 * priority is not set in the packet. Update the VID for
973 		 * packets on a VLAN stream.
974 		 */
975 		pri = (pri == 0) ? dsp->ds_pri : 0;
976 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
977 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
978 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
979 				goto discard;
980 			}
981 			mp = newmp;
982 		}
983 	}
984 
985 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
986 		/* Turn on flow-control for dld */
987 		DLD_SETQFULL(dsp);
988 	}
989 	return;
990 
991 discard:
992 	/* TODO: bump kstat? */
993 	freemsg(mp);
994 }
995 
996 /*
997  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
998  */
999 int
1000 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1001 {
1002 	dev_t			dev;
1003 	int			err;
1004 	const char		*drvname;
1005 	mac_perim_handle_t	mph = NULL;
1006 	boolean_t		qassociated = B_FALSE;
1007 	dls_link_t		*dlp = NULL;
1008 	dls_dl_handle_t		ddp = NULL;
1009 
1010 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1011 		return (EINVAL);
1012 
1013 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
1014 		return (ENOTSUP);
1015 
1016 	/*
1017 	 * /dev node access. This will still be supported for backward
1018 	 * compatibility reason.
1019 	 */
1020 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1021 	    (strcmp(drvname, "vnic") != 0)) {
1022 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1023 			return (EINVAL);
1024 		qassociated = B_TRUE;
1025 	}
1026 
1027 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1028 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
1029 		goto failed;
1030 
1031 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
1032 		goto failed;
1033 
1034 	/*
1035 	 * Open a channel.
1036 	 */
1037 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
1038 		goto failed;
1039 
1040 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
1041 		goto failed;
1042 
1043 	/*
1044 	 * Set the default packet priority.
1045 	 */
1046 	dsp->ds_pri = 0;
1047 
1048 	/*
1049 	 * Add a notify function so that the we get updates from the MAC.
1050 	 */
1051 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1052 	dsp->ds_dlstate = DL_UNBOUND;
1053 	mac_perim_exit(mph);
1054 	return (0);
1055 
1056 failed:
1057 	if (dlp != NULL)
1058 		dls_link_rele(dlp);
1059 	if (mph != NULL)
1060 		mac_perim_exit(mph);
1061 	if (ddp != NULL)
1062 		dls_devnet_rele(ddp);
1063 	if (qassociated)
1064 		(void) qassociate(dsp->ds_wq, -1);
1065 
1066 	return (err);
1067 }
1068 
1069 /*
1070  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1071  * from close(2) for style 2.
1072  */
1073 void
1074 dld_str_detach(dld_str_t *dsp)
1075 {
1076 	mac_perim_handle_t	mph;
1077 	int			err;
1078 
1079 	ASSERT(dsp->ds_datathr_cnt == 0);
1080 
1081 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1082 	/*
1083 	 * Remove the notify function.
1084 	 *
1085 	 * Note that we cannot wait for the notification callback to be removed
1086 	 * since it could cause the deadlock with str_notify() since they both
1087 	 * need the mac perimeter. Continue if we cannot remove the
1088 	 * notification callback right now and wait after we leave the
1089 	 * perimeter.
1090 	 */
1091 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1092 	dsp->ds_mnh = NULL;
1093 
1094 	/*
1095 	 * Disable the capabilities
1096 	 */
1097 	dld_capabilities_disable(dsp);
1098 
1099 	/*
1100 	 * Clear LSO flags.
1101 	 */
1102 	dsp->ds_lso = B_FALSE;
1103 	dsp->ds_lso_max = 0;
1104 
1105 	dls_close(dsp);
1106 	mac_perim_exit(mph);
1107 
1108 	/*
1109 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
1110 	 * because the notification callback was in progress, wait for
1111 	 * it to finish before we proceed.
1112 	 */
1113 	if (err != 0)
1114 		mac_notify_remove_wait(dsp->ds_mh);
1115 
1116 	/*
1117 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
1118 	 * automatically in the call to dls_devnet_rele.
1119 	 */
1120 	dls_devnet_rele(dsp->ds_ddh);
1121 
1122 	dsp->ds_sap = 0;
1123 	dsp->ds_mh = NULL;
1124 	dsp->ds_mch = NULL;
1125 	dsp->ds_mip = NULL;
1126 
1127 	if (dsp->ds_style == DL_STYLE2)
1128 		(void) qassociate(dsp->ds_wq, -1);
1129 
1130 	/*
1131 	 * Re-initialize the DLPI state machine.
1132 	 */
1133 	dsp->ds_dlstate = DL_UNATTACHED;
1134 }
1135 
1136 /*
1137  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1138  * tags before sending packets up to the DLS clients, with the exception of
1139  * special priority tagged packets, in that case, we set the VID to 0.
1140  * mp must be a VLAN tagged packet.
1141  */
1142 static mblk_t *
1143 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1144 {
1145 	mblk_t *newmp;
1146 	struct ether_vlan_header *evhp;
1147 	uint16_t tci, new_tci;
1148 
1149 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1150 	if (DB_REF(mp) > 1) {
1151 		newmp = copymsg(mp);
1152 		if (newmp == NULL)
1153 			return (NULL);
1154 		freemsg(mp);
1155 		mp = newmp;
1156 	}
1157 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1158 
1159 	tci = ntohs(evhp->ether_tci);
1160 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1161 		/*
1162 		 * Priority is 0, strip the tag.
1163 		 */
1164 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1165 		mp->b_rptr += VLAN_TAGSZ;
1166 	} else {
1167 		/*
1168 		 * Priority is not 0, update the VID to 0.
1169 		 */
1170 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1171 		evhp->ether_tci = htons(new_tci);
1172 	}
1173 	return (mp);
1174 }
1175 
1176 /*
1177  * Raw mode receive function.
1178  */
1179 /*ARGSUSED*/
1180 void
1181 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1182     mac_header_info_t *mhip)
1183 {
1184 	dld_str_t *dsp = (dld_str_t *)arg;
1185 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1186 	mblk_t *next, *newmp;
1187 
1188 	ASSERT(mp != NULL);
1189 	do {
1190 		/*
1191 		 * Get the pointer to the next packet in the chain and then
1192 		 * clear b_next before the packet gets passed on.
1193 		 */
1194 		next = mp->b_next;
1195 		mp->b_next = NULL;
1196 
1197 		/*
1198 		 * Wind back b_rptr to point at the MAC header.
1199 		 */
1200 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1201 		mp->b_rptr -= mhip->mhi_hdrsize;
1202 
1203 		/*
1204 		 * Certain MAC type plugins provide an illusion for raw
1205 		 * DLPI consumers.  They pretend that the MAC layer is
1206 		 * something that it's not for the benefit of observability
1207 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1208 		 * for such consumers.	Here, unless native mode is enabled,
1209 		 * we call into the MAC layer so that this illusion can be
1210 		 * maintained.	The plugin will optionally transform the MAC
1211 		 * header here into something that can be passed up to raw
1212 		 * consumers.  The header goes from "cooked" mode to raw mode.
1213 		 */
1214 		if (!dsp->ds_native) {
1215 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1216 			if (newmp == NULL) {
1217 				freemsg(mp);
1218 				goto next;
1219 			}
1220 			mp = newmp;
1221 		}
1222 
1223 		/*
1224 		 * Strip the VLAN tag for VLAN streams.
1225 		 */
1226 		if (is_ethernet &&
1227 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1228 			/*
1229 			 * The priority should be kept only for VLAN
1230 			 * data-links.
1231 			 */
1232 			newmp = i_dld_ether_header_strip_tag(mp,
1233 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1234 			if (newmp == NULL) {
1235 				freemsg(mp);
1236 				goto next;
1237 			}
1238 			mp = newmp;
1239 		}
1240 
1241 		/*
1242 		 * Pass the packet on.
1243 		 */
1244 		if (canputnext(dsp->ds_rq))
1245 			putnext(dsp->ds_rq, mp);
1246 		else
1247 			freemsg(mp);
1248 
1249 next:
1250 		/*
1251 		 * Move on to the next packet in the chain.
1252 		 */
1253 		mp = next;
1254 	} while (mp != NULL);
1255 }
1256 
1257 /*
1258  * Fast-path receive function.
1259  */
1260 /*ARGSUSED*/
1261 void
1262 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1263     mac_header_info_t *mhip)
1264 {
1265 	dld_str_t *dsp = (dld_str_t *)arg;
1266 	mblk_t *next;
1267 	size_t offset = 0;
1268 
1269 	/*
1270 	 * MAC header stripping rules:
1271 	 *    - Tagged packets:
1272 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1273 	 *	b. Physical streams
1274 	 *	- VLAN packets (non-zero VID). The stream must be either a
1275 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1276 	 *	  Strip the Ethernet header but keep the VLAN header.
1277 	 *	- Special tagged packets (zero VID)
1278 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1279 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1280 	 *	    keep the VLAN header.
1281 	 *	  * Otherwise, strip the whole VLAN header.
1282 	 *    - Untagged packets. Strip the whole MAC header.
1283 	 */
1284 	if (mhip->mhi_istagged &&
1285 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1286 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1287 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1288 		offset = VLAN_TAGSZ;
1289 	}
1290 
1291 	ASSERT(mp != NULL);
1292 	do {
1293 		/*
1294 		 * Get the pointer to the next packet in the chain and then
1295 		 * clear b_next before the packet gets passed on.
1296 		 */
1297 		next = mp->b_next;
1298 		mp->b_next = NULL;
1299 
1300 		/*
1301 		 * Wind back b_rptr to point at the VLAN header.
1302 		 */
1303 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1304 		mp->b_rptr -= offset;
1305 
1306 		/*
1307 		 * Pass the packet on.
1308 		 */
1309 		if (canputnext(dsp->ds_rq))
1310 			putnext(dsp->ds_rq, mp);
1311 		else
1312 			freemsg(mp);
1313 		/*
1314 		 * Move on to the next packet in the chain.
1315 		 */
1316 		mp = next;
1317 	} while (mp != NULL);
1318 }
1319 
1320 /*
1321  * Default receive function (send DL_UNITDATA_IND messages).
1322  */
1323 /*ARGSUSED*/
1324 void
1325 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1326     mac_header_info_t *mhip)
1327 {
1328 	dld_str_t		*dsp = (dld_str_t *)arg;
1329 	mblk_t			*ud_mp;
1330 	mblk_t			*next;
1331 	size_t			offset = 0;
1332 	boolean_t		strip_vlan = B_TRUE;
1333 
1334 	/*
1335 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1336 	 */
1337 	if (mhip->mhi_istagged &&
1338 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1339 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1340 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1341 		offset = VLAN_TAGSZ;
1342 		strip_vlan = B_FALSE;
1343 	}
1344 
1345 	ASSERT(mp != NULL);
1346 	do {
1347 		/*
1348 		 * Get the pointer to the next packet in the chain and then
1349 		 * clear b_next before the packet gets passed on.
1350 		 */
1351 		next = mp->b_next;
1352 		mp->b_next = NULL;
1353 
1354 		/*
1355 		 * Wind back b_rptr to point at the MAC header.
1356 		 */
1357 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1358 		mp->b_rptr -= mhip->mhi_hdrsize;
1359 
1360 		/*
1361 		 * Create the DL_UNITDATA_IND M_PROTO.
1362 		 */
1363 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1364 			freemsgchain(mp);
1365 			return;
1366 		}
1367 
1368 		/*
1369 		 * Advance b_rptr to point at the payload (or the VLAN header).
1370 		 */
1371 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1372 
1373 		/*
1374 		 * Prepend the DL_UNITDATA_IND.
1375 		 */
1376 		ud_mp->b_cont = mp;
1377 
1378 		/*
1379 		 * Send the message.
1380 		 */
1381 		if (canputnext(dsp->ds_rq))
1382 			putnext(dsp->ds_rq, ud_mp);
1383 		else
1384 			freemsg(ud_mp);
1385 
1386 		/*
1387 		 * Move on to the next packet in the chain.
1388 		 */
1389 		mp = next;
1390 	} while (mp != NULL);
1391 }
1392 
1393 /*
1394  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1395  */
1396 static void
1397 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
1398 {
1399 	mblk_t		*mp;
1400 	dl_notify_ind_t *dlip;
1401 
1402 	if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
1403 		return;
1404 
1405 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1406 	    M_PROTO, 0)) == NULL)
1407 		return;
1408 
1409 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1410 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1411 	dlip->dl_primitive = DL_NOTIFY_IND;
1412 	if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
1413 		dlip->dl_notification = DL_NOTE_SDU_SIZE2;
1414 		dlip->dl_data1 = max_sdu;
1415 		dlip->dl_data2 = multicast_sdu;
1416 	} else {
1417 		dlip->dl_notification = DL_NOTE_SDU_SIZE;
1418 		dlip->dl_data = max_sdu;
1419 	}
1420 
1421 	qreply(dsp->ds_wq, mp);
1422 }
1423 
1424 /*
1425  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1426  * current state of the interface.
1427  */
1428 void
1429 dld_str_notify_ind(dld_str_t *dsp)
1430 {
1431 	mac_notify_type_t	type;
1432 
1433 	for (type = 0; type < MAC_NNOTE; type++)
1434 		str_notify(dsp, type);
1435 }
1436 
1437 typedef struct dl_unitdata_ind_wrapper {
1438 	dl_unitdata_ind_t	dl_unitdata;
1439 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1440 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1441 } dl_unitdata_ind_wrapper_t;
1442 
1443 /*
1444  * Create a DL_UNITDATA_IND M_PROTO message.
1445  */
1446 static mblk_t *
1447 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1448 {
1449 	mblk_t				*nmp;
1450 	dl_unitdata_ind_wrapper_t	*dlwp;
1451 	dl_unitdata_ind_t		*dlp;
1452 	mac_header_info_t		mhi;
1453 	uint_t				addr_length;
1454 	uint8_t				*daddr;
1455 	uint8_t				*saddr;
1456 
1457 	/*
1458 	 * Get the packet header information.
1459 	 */
1460 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1461 		return (NULL);
1462 
1463 	/*
1464 	 * Allocate a message large enough to contain the wrapper structure
1465 	 * defined above.
1466 	 */
1467 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1468 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1469 	    DL_UNITDATA_IND)) == NULL)
1470 		return (NULL);
1471 
1472 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1473 
1474 	dlp = &(dlwp->dl_unitdata);
1475 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1476 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1477 
1478 	/*
1479 	 * Copy in the destination address.
1480 	 */
1481 	addr_length = dsp->ds_mip->mi_addr_length;
1482 	daddr = dlwp->dl_dest_addr;
1483 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1484 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1485 
1486 	/*
1487 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1488 	 */
1489 	if (mhi.mhi_istagged && !strip_vlan)
1490 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1491 	else
1492 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1493 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1494 
1495 	/*
1496 	 * If the destination address was multicast or broadcast then the
1497 	 * dl_group_address field should be non-zero.
1498 	 */
1499 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1500 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1501 
1502 	/*
1503 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1504 	 * for example) may not have access to source information.
1505 	 */
1506 	if (mhi.mhi_saddr == NULL) {
1507 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1508 	} else {
1509 		saddr = dlwp->dl_src_addr;
1510 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1511 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1512 
1513 		/*
1514 		 * Set the source DLSAP to the packet ethertype.
1515 		 */
1516 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1517 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1518 	}
1519 
1520 	return (nmp);
1521 }
1522 
1523 /*
1524  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1525  */
1526 static void
1527 str_notify_promisc_on_phys(dld_str_t *dsp)
1528 {
1529 	mblk_t		*mp;
1530 	dl_notify_ind_t	*dlip;
1531 
1532 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1533 		return;
1534 
1535 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1536 	    M_PROTO, 0)) == NULL)
1537 		return;
1538 
1539 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1540 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1541 	dlip->dl_primitive = DL_NOTIFY_IND;
1542 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1543 
1544 	qreply(dsp->ds_wq, mp);
1545 }
1546 
1547 /*
1548  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1549  */
1550 static void
1551 str_notify_promisc_off_phys(dld_str_t *dsp)
1552 {
1553 	mblk_t		*mp;
1554 	dl_notify_ind_t	*dlip;
1555 
1556 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1557 		return;
1558 
1559 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1560 	    M_PROTO, 0)) == NULL)
1561 		return;
1562 
1563 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1564 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1565 	dlip->dl_primitive = DL_NOTIFY_IND;
1566 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1567 
1568 	qreply(dsp->ds_wq, mp);
1569 }
1570 
1571 /*
1572  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1573  */
1574 static void
1575 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1576 {
1577 	mblk_t		*mp;
1578 	dl_notify_ind_t	*dlip;
1579 	uint_t		addr_length;
1580 	uint16_t	ethertype;
1581 
1582 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1583 		return;
1584 
1585 	addr_length = dsp->ds_mip->mi_addr_length;
1586 	if ((mp = mexchange(dsp->ds_wq, NULL,
1587 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1588 	    M_PROTO, 0)) == NULL)
1589 		return;
1590 
1591 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1592 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1593 	dlip->dl_primitive = DL_NOTIFY_IND;
1594 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1595 	dlip->dl_data = addr_type;
1596 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1597 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1598 
1599 	bcopy(addr, &dlip[1], addr_length);
1600 
1601 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1602 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1603 
1604 	qreply(dsp->ds_wq, mp);
1605 }
1606 
1607 /*
1608  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1609  */
1610 static void
1611 str_notify_link_up(dld_str_t *dsp)
1612 {
1613 	mblk_t		*mp;
1614 	dl_notify_ind_t	*dlip;
1615 
1616 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1617 		return;
1618 
1619 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1620 	    M_PROTO, 0)) == NULL)
1621 		return;
1622 
1623 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1624 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1625 	dlip->dl_primitive = DL_NOTIFY_IND;
1626 	dlip->dl_notification = DL_NOTE_LINK_UP;
1627 
1628 	qreply(dsp->ds_wq, mp);
1629 }
1630 
1631 /*
1632  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1633  */
1634 static void
1635 str_notify_link_down(dld_str_t *dsp)
1636 {
1637 	mblk_t		*mp;
1638 	dl_notify_ind_t	*dlip;
1639 
1640 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1641 		return;
1642 
1643 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1644 	    M_PROTO, 0)) == NULL)
1645 		return;
1646 
1647 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1648 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1649 	dlip->dl_primitive = DL_NOTIFY_IND;
1650 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1651 
1652 	qreply(dsp->ds_wq, mp);
1653 }
1654 
1655 /*
1656  * DL_NOTIFY_IND: DL_NOTE_SPEED
1657  */
1658 static void
1659 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1660 {
1661 	mblk_t		*mp;
1662 	dl_notify_ind_t	*dlip;
1663 
1664 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1665 		return;
1666 
1667 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1668 	    M_PROTO, 0)) == NULL)
1669 		return;
1670 
1671 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1672 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1673 	dlip->dl_primitive = DL_NOTIFY_IND;
1674 	dlip->dl_notification = DL_NOTE_SPEED;
1675 	dlip->dl_data = speed;
1676 
1677 	qreply(dsp->ds_wq, mp);
1678 }
1679 
1680 /*
1681  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1682  */
1683 static void
1684 str_notify_capab_reneg(dld_str_t *dsp)
1685 {
1686 	mblk_t		*mp;
1687 	dl_notify_ind_t	*dlip;
1688 
1689 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1690 		return;
1691 
1692 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1693 	    M_PROTO, 0)) == NULL)
1694 		return;
1695 
1696 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1697 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1698 	dlip->dl_primitive = DL_NOTIFY_IND;
1699 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1700 
1701 	qreply(dsp->ds_wq, mp);
1702 }
1703 
1704 /*
1705  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1706  */
1707 static void
1708 str_notify_fastpath_flush(dld_str_t *dsp)
1709 {
1710 	mblk_t		*mp;
1711 	dl_notify_ind_t	*dlip;
1712 
1713 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1714 		return;
1715 
1716 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1717 	    M_PROTO, 0)) == NULL)
1718 		return;
1719 
1720 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1721 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1722 	dlip->dl_primitive = DL_NOTIFY_IND;
1723 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1724 
1725 	qreply(dsp->ds_wq, mp);
1726 }
1727 
1728 static void
1729 str_notify_allowed_ips(dld_str_t *dsp)
1730 {
1731 	mblk_t		*mp;
1732 	dl_notify_ind_t	*dlip;
1733 	size_t		mp_size;
1734 	mac_protect_t	*mrp;
1735 
1736 	if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1737 		return;
1738 
1739 	mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1740 	if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1741 		return;
1742 
1743 	mrp = mac_protect_get(dsp->ds_mh);
1744 	bzero(mp->b_rptr, mp_size);
1745 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1746 	dlip->dl_primitive = DL_NOTIFY_IND;
1747 	dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1748 	dlip->dl_data = 0;
1749 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1750 	dlip->dl_addr_length = sizeof (mac_protect_t);
1751 	bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1752 	    sizeof (mac_protect_t));
1753 
1754 	qreply(dsp->ds_wq, mp);
1755 }
1756 
1757 /*
1758  * MAC notification callback.
1759  */
1760 void
1761 str_notify(void *arg, mac_notify_type_t type)
1762 {
1763 	dld_str_t		*dsp = (dld_str_t *)arg;
1764 	queue_t			*q = dsp->ds_wq;
1765 	mac_handle_t		mh = dsp->ds_mh;
1766 	mac_client_handle_t	mch = dsp->ds_mch;
1767 	uint8_t			addr[MAXMACADDRLEN];
1768 
1769 	switch (type) {
1770 	case MAC_NOTE_TX:
1771 		qenable(q);
1772 		break;
1773 
1774 	case MAC_NOTE_DEVPROMISC:
1775 		/*
1776 		 * Send the appropriate DL_NOTIFY_IND.
1777 		 */
1778 		if (mac_promisc_get(mh))
1779 			str_notify_promisc_on_phys(dsp);
1780 		else
1781 			str_notify_promisc_off_phys(dsp);
1782 		break;
1783 
1784 	case MAC_NOTE_UNICST:
1785 		/*
1786 		 * This notification is sent whenever the MAC unicast
1787 		 * address changes.
1788 		 */
1789 		mac_unicast_primary_get(mh, addr);
1790 
1791 		/*
1792 		 * Send the appropriate DL_NOTIFY_IND.
1793 		 */
1794 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1795 		break;
1796 
1797 	case MAC_NOTE_DEST:
1798 		/*
1799 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1800 		 * destination address.
1801 		 */
1802 		if (mac_dst_get(dsp->ds_mh, addr))
1803 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1804 		break;
1805 
1806 	case MAC_NOTE_LOWLINK:
1807 	case MAC_NOTE_LINK:
1808 		/*
1809 		 * LOWLINK refers to the actual link status. For links that
1810 		 * are not part of a bridge instance LOWLINK and LINK state
1811 		 * are the same. But for a link part of a bridge instance
1812 		 * LINK state refers to the aggregate link status: "up" when
1813 		 * at least one link part of the bridge is up and is "down"
1814 		 * when all links part of the bridge are down.
1815 		 *
1816 		 * Clients can request to be notified of the LOWLINK state
1817 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1818 		 * daemon request lowlink state changes and upper layer clients
1819 		 * receive notifications of the aggregate link state changes
1820 		 * which is the default when requesting LINK UP/DOWN state
1821 		 * notifications.
1822 		 */
1823 
1824 		/*
1825 		 * Check that the notification type matches the one that we
1826 		 * want.  If we want lower-level link notifications, and this
1827 		 * is upper, or if we want upper and this is lower, then
1828 		 * ignore.
1829 		 */
1830 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1831 			break;
1832 		/*
1833 		 * This notification is sent every time the MAC driver
1834 		 * updates the link state.
1835 		 */
1836 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1837 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1838 		case LINK_STATE_UP: {
1839 			uint64_t speed;
1840 			/*
1841 			 * The link is up so send the appropriate
1842 			 * DL_NOTIFY_IND.
1843 			 */
1844 			str_notify_link_up(dsp);
1845 
1846 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1847 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1848 			break;
1849 		}
1850 		case LINK_STATE_DOWN:
1851 			/*
1852 			 * The link is down so send the appropriate
1853 			 * DL_NOTIFY_IND.
1854 			 */
1855 			str_notify_link_down(dsp);
1856 			break;
1857 
1858 		default:
1859 			break;
1860 		}
1861 		break;
1862 
1863 	case MAC_NOTE_CAPAB_CHG:
1864 		/*
1865 		 * This notification is sent whenever the MAC resources
1866 		 * change or capabilities change. We need to renegotiate
1867 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1868 		 */
1869 		str_notify_capab_reneg(dsp);
1870 		break;
1871 
1872 	case MAC_NOTE_SDU_SIZE: {
1873 		uint_t  max_sdu;
1874 		uint_t	multicast_sdu;
1875 		mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
1876 		str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
1877 		break;
1878 	}
1879 
1880 	case MAC_NOTE_FASTPATH_FLUSH:
1881 		str_notify_fastpath_flush(dsp);
1882 		break;
1883 
1884 	/* Unused notifications */
1885 	case MAC_NOTE_MARGIN:
1886 		break;
1887 
1888 	case MAC_NOTE_ALLOWED_IPS:
1889 		str_notify_allowed_ips(dsp);
1890 		break;
1891 
1892 	default:
1893 		ASSERT(B_FALSE);
1894 		break;
1895 	}
1896 }
1897 
1898 /*
1899  * This function is called via a taskq mechansim to process all control
1900  * messages on a per 'dsp' end point.
1901  */
1902 static void
1903 dld_wput_nondata_task(void *arg)
1904 {
1905 	dld_str_t	*dsp = arg;
1906 	mblk_t		*mp;
1907 
1908 	mutex_enter(&dsp->ds_lock);
1909 	while (dsp->ds_pending_head != NULL) {
1910 		mp = dsp->ds_pending_head;
1911 		dsp->ds_pending_head = mp->b_next;
1912 		mp->b_next = NULL;
1913 		if (dsp->ds_pending_head == NULL)
1914 			dsp->ds_pending_tail = NULL;
1915 		mutex_exit(&dsp->ds_lock);
1916 
1917 		switch (DB_TYPE(mp)) {
1918 		case M_PROTO:
1919 		case M_PCPROTO:
1920 			dld_proto(dsp, mp);
1921 			break;
1922 		case M_IOCTL:
1923 			dld_ioc(dsp, mp);
1924 			break;
1925 		default:
1926 			ASSERT(0);
1927 		}
1928 
1929 		mutex_enter(&dsp->ds_lock);
1930 	}
1931 	ASSERT(dsp->ds_pending_tail == NULL);
1932 	dsp->ds_dlpi_pending = 0;
1933 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1934 	mutex_exit(&dsp->ds_lock);
1935 }
1936 
1937 /*
1938  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1939  * thread is started at boot time.
1940  */
1941 static void
1942 dld_taskq_dispatch(void)
1943 {
1944 	callb_cpr_t	cprinfo;
1945 	dld_str_t	*dsp;
1946 
1947 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1948 	    "dld_taskq_dispatch");
1949 	mutex_enter(&dld_taskq_lock);
1950 
1951 	while (!dld_taskq_quit) {
1952 		dsp = list_head(&dld_taskq_list);
1953 		while (dsp != NULL) {
1954 			list_remove(&dld_taskq_list, dsp);
1955 			mutex_exit(&dld_taskq_lock);
1956 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1957 			    dsp, TQ_SLEEP) != 0);
1958 			mutex_enter(&dld_taskq_lock);
1959 			dsp = list_head(&dld_taskq_list);
1960 		}
1961 
1962 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1963 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1964 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1965 	}
1966 
1967 	dld_taskq_done = B_TRUE;
1968 	cv_signal(&dld_taskq_cv);
1969 	CALLB_CPR_EXIT(&cprinfo);
1970 	thread_exit();
1971 }
1972 
1973 /*
1974  * All control operations are serialized on the 'dsp' and are also funneled
1975  * through a taskq mechanism to ensure that subsequent processing has kernel
1976  * context and can safely use cv_wait.
1977  *
1978  * Mechanisms to handle taskq dispatch failures
1979  *
1980  * The only way to be sure that taskq dispatch does not fail is to either
1981  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1982  * some number of entries and make sure that the number of outstanding requests
1983  * are less than that number. We can't use TQ_SLEEP since we don't know the
1984  * context. Nor can we bound the total number of 'dsp' end points. So we are
1985  * unable to use either of the above schemes, and are forced to deal with
1986  * taskq dispatch failures. Note that even dynamic taskq could fail in
1987  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1988  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1989  * framework.
1990  *
1991  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1992  * We also have a single global thread to retry the taskq dispatch. This
1993  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1994  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1995  */
1996 static void
1997 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1998 {
1999 	ASSERT(mp->b_next == NULL);
2000 	mutex_enter(&dsp->ds_lock);
2001 	if (dsp->ds_pending_head != NULL) {
2002 		ASSERT(dsp->ds_dlpi_pending);
2003 		dsp->ds_pending_tail->b_next = mp;
2004 		dsp->ds_pending_tail = mp;
2005 		mutex_exit(&dsp->ds_lock);
2006 		return;
2007 	}
2008 	ASSERT(dsp->ds_pending_tail == NULL);
2009 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2010 	/*
2011 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
2012 	 * thread is still active and is processing the last message, though
2013 	 * the pending queue has been emptied.
2014 	 */
2015 	if (dsp->ds_dlpi_pending) {
2016 		mutex_exit(&dsp->ds_lock);
2017 		return;
2018 	}
2019 
2020 	dsp->ds_dlpi_pending = 1;
2021 	mutex_exit(&dsp->ds_lock);
2022 
2023 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
2024 	    TQ_NOSLEEP) != 0)
2025 		return;
2026 
2027 	mutex_enter(&dld_taskq_lock);
2028 	list_insert_tail(&dld_taskq_list, dsp);
2029 	cv_signal(&dld_taskq_cv);
2030 	mutex_exit(&dld_taskq_lock);
2031 }
2032 
2033 /*
2034  * Process an M_IOCTL message.
2035  */
2036 static void
2037 dld_ioc(dld_str_t *dsp, mblk_t *mp)
2038 {
2039 	uint_t			cmd;
2040 
2041 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2042 	ASSERT(dsp->ds_type == DLD_DLPI);
2043 
2044 	switch (cmd) {
2045 	case DLIOCNATIVE:
2046 		ioc_native(dsp, mp);
2047 		break;
2048 	case DLIOCMARGININFO:
2049 		ioc_margin(dsp, mp);
2050 		break;
2051 	case DLIOCRAW:
2052 		ioc_raw(dsp, mp);
2053 		break;
2054 	case DLIOCHDRINFO:
2055 		ioc_fast(dsp, mp);
2056 		break;
2057 	case DLIOCLOWLINK:
2058 		ioc_lowlink(dsp, mp);
2059 		break;
2060 	default:
2061 		ioc(dsp, mp);
2062 	}
2063 }
2064 
2065 /*
2066  * DLIOCNATIVE
2067  */
2068 static void
2069 ioc_native(dld_str_t *dsp, mblk_t *mp)
2070 {
2071 	queue_t *q = dsp->ds_wq;
2072 	const mac_info_t *mip = dsp->ds_mip;
2073 
2074 	/*
2075 	 * Native mode can be enabled if it's disabled and if the
2076 	 * native media type is different.
2077 	 */
2078 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2079 		dsp->ds_native = B_TRUE;
2080 
2081 	if (dsp->ds_native)
2082 		miocack(q, mp, 0, mip->mi_nativemedia);
2083 	else
2084 		miocnak(q, mp, 0, ENOTSUP);
2085 }
2086 
2087 /*
2088  * DLIOCMARGININFO
2089  */
2090 static void
2091 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2092 {
2093 	queue_t *q = dsp->ds_wq;
2094 	uint32_t margin;
2095 	int err;
2096 
2097 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2098 		err = EINVAL;
2099 		goto failed;
2100 	}
2101 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2102 		goto failed;
2103 
2104 	mac_margin_get(dsp->ds_mh, &margin);
2105 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2106 	miocack(q, mp, sizeof (uint32_t), 0);
2107 	return;
2108 
2109 failed:
2110 	miocnak(q, mp, 0, err);
2111 }
2112 
2113 /*
2114  * DLIOCRAW
2115  */
2116 static void
2117 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2118 {
2119 	queue_t *q = dsp->ds_wq;
2120 	mac_perim_handle_t	mph;
2121 
2122 	if (dsp->ds_mh == NULL) {
2123 		dsp->ds_mode = DLD_RAW;
2124 		miocack(q, mp, 0, 0);
2125 		return;
2126 	}
2127 
2128 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2129 	if (dsp->ds_polling || dsp->ds_direct) {
2130 		mac_perim_exit(mph);
2131 		miocnak(q, mp, 0, EPROTO);
2132 		return;
2133 	}
2134 
2135 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2136 		/*
2137 		 * Set the receive callback.
2138 		 */
2139 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2140 	}
2141 
2142 	/*
2143 	 * Note that raw mode is enabled.
2144 	 */
2145 	dsp->ds_mode = DLD_RAW;
2146 	mac_perim_exit(mph);
2147 
2148 	miocack(q, mp, 0, 0);
2149 }
2150 
2151 /*
2152  * DLIOCHDRINFO
2153  */
2154 static void
2155 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2156 {
2157 	dl_unitdata_req_t *dlp;
2158 	off_t		off;
2159 	size_t		len;
2160 	const uint8_t	*addr;
2161 	uint16_t	sap;
2162 	mblk_t		*nmp;
2163 	mblk_t		*hmp;
2164 	uint_t		addr_length;
2165 	queue_t		*q = dsp->ds_wq;
2166 	int		err;
2167 	mac_perim_handle_t	mph;
2168 
2169 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2170 		err = ENOTSUP;
2171 		goto failed;
2172 	}
2173 
2174 	/*
2175 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2176 	 * user-land should not be allowed.
2177 	 */
2178 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2179 		err = EINVAL;
2180 		goto failed;
2181 	}
2182 
2183 	nmp = mp->b_cont;
2184 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2185 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2186 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2187 		err = EINVAL;
2188 		goto failed;
2189 	}
2190 
2191 	off = dlp->dl_dest_addr_offset;
2192 	len = dlp->dl_dest_addr_length;
2193 
2194 	if (!MBLKIN(nmp, off, len)) {
2195 		err = EINVAL;
2196 		goto failed;
2197 	}
2198 
2199 	if (dsp->ds_dlstate != DL_IDLE) {
2200 		err = ENOTSUP;
2201 		goto failed;
2202 	}
2203 
2204 	addr_length = dsp->ds_mip->mi_addr_length;
2205 	if (len != addr_length + sizeof (uint16_t)) {
2206 		err = EINVAL;
2207 		goto failed;
2208 	}
2209 
2210 	addr = nmp->b_rptr + off;
2211 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2212 
2213 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2214 		err = ENOMEM;
2215 		goto failed;
2216 	}
2217 
2218 	/*
2219 	 * This ioctl might happen concurrently with a direct call to dld_capab
2220 	 * that tries to enable direct and/or poll capabilities. Since the
2221 	 * stack does not serialize them, we do so here to avoid mixing
2222 	 * the callbacks.
2223 	 */
2224 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2225 	if (dsp->ds_mode != DLD_FASTPATH) {
2226 		/*
2227 		 * Set the receive callback (unless polling is enabled).
2228 		 */
2229 		if (!dsp->ds_polling && !dsp->ds_direct)
2230 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2231 
2232 		/*
2233 		 * Note that fast-path mode is enabled.
2234 		 */
2235 		dsp->ds_mode = DLD_FASTPATH;
2236 	}
2237 	mac_perim_exit(mph);
2238 
2239 	freemsg(nmp->b_cont);
2240 	nmp->b_cont = hmp;
2241 
2242 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2243 	return;
2244 failed:
2245 	miocnak(q, mp, 0, err);
2246 }
2247 
2248 /*
2249  * DLIOCLOWLINK: request actual link state changes. When the
2250  * link is part of a bridge instance the client receives actual
2251  * link state changes and not the aggregate link status. Used by
2252  * the bridging daemon (bridged) for proper RSTP operation.
2253  */
2254 static void
2255 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2256 {
2257 	queue_t *q = dsp->ds_wq;
2258 	int err;
2259 
2260 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2261 		miocnak(q, mp, 0, err);
2262 	} else {
2263 		/* LINTED: alignment */
2264 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2265 		miocack(q, mp, 0, 0);
2266 	}
2267 }
2268 
2269 /*
2270  * Catch-all handler.
2271  */
2272 static void
2273 ioc(dld_str_t *dsp, mblk_t *mp)
2274 {
2275 	queue_t	*q = dsp->ds_wq;
2276 
2277 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2278 		miocnak(q, mp, 0, EINVAL);
2279 		return;
2280 	}
2281 	mac_ioctl(dsp->ds_mh, q, mp);
2282 }
2283