xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision bc1f688b4872ace323eaddbb1a6365d054e7bf56)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Data-Link Driver
27  */
28 
29 #include	<inet/common.h>
30 #include	<sys/strsubr.h>
31 #include	<sys/stropts.h>
32 #include	<sys/strsun.h>
33 #include	<sys/vlan.h>
34 #include	<sys/dld_impl.h>
35 #include	<sys/cpuvar.h>
36 #include	<sys/callb.h>
37 #include	<sys/list.h>
38 #include	<sys/mac_client.h>
39 #include	<sys/mac_client_priv.h>
40 #include	<sys/mac_flow.h>
41 
/*
 * Forward declarations for file-local routines.
 *
 * str_constructor()/str_destructor() are the kmem cache callbacks that
 * dld_str_init() registers for the dld_str_t object cache.
 * NOTE(review): the str_notify_*() and str_unitdata_ind() routines are
 * defined later in the file -- presumably the per-event MAC notification
 * handlers and the receive-side unitdata builder; confirm there.
 */
42 static int	str_constructor(void *, void *, int);
43 static void	str_destructor(void *, void *);
44 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
45 static void	str_notify_promisc_on_phys(dld_str_t *);
46 static void	str_notify_promisc_off_phys(dld_str_t *);
47 static void	str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
48 static void	str_notify_link_up(dld_str_t *);
49 static void	str_notify_link_down(dld_str_t *);
50 static void	str_notify_capab_reneg(dld_str_t *);
51 static void	str_notify_speed(dld_str_t *, uint32_t);
52 
/*
 * Non-data message handling.  dld_wput_nondata() is where dld_wput()
 * sends M_IOCTL messages and every DLPI primitive other than
 * DL_UNITDATA_REQ.
 * NOTE(review): the ioc_*() / dld_ioc() handlers are defined later in the
 * file; confirm their individual roles there.
 */
53 static void	ioc_native(dld_str_t *,  mblk_t *);
54 static void	ioc_margin(dld_str_t *, mblk_t *);
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc_lowlink(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static void	dld_wput_nondata(dld_str_t *, mblk_t *);
61 
/*
 * Raw-mode M_DATA transmit routine and the Ethernet VLAN tag helpers used
 * on the transmit (update) and receive (strip) sides.
 */
62 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
63 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
64     link_tagmode_t);
65 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
66 
/*
 * Module-wide bookkeeping for dld_str_t objects:
 *  str_count  - number of live dld_str_t objects; bumped in
 *               dld_str_create(), dropped in dld_str_destroy() and
 *               checked by dld_str_fini() before tearing down.
 *  str_cachep - kmem cache the objects are allocated from.
 *  str_hashp  - id hash of all live objects, keyed by ds_minor.
 */
67 static uint32_t		str_count;
68 static kmem_cache_t	*str_cachep;
69 static mod_hash_t	*str_hashp;
70 
/* Size passed to mod_hash_create_idhash(), and the key conversion helper. */
71 #define	STR_HASHSZ		64
72 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
73 
/* dld_taskq is simply an alias for the system-wide taskq. */
74 #define	dld_taskq	system_taskq
75 
/*
 * State shared with the dld_taskq_dispatch() thread created by
 * dld_str_init().  dld_str_fini() sets dld_taskq_quit, signals
 * dld_taskq_cv and then waits on it, under dld_taskq_lock, until
 * dld_taskq_done becomes true.
 */
76 static kmutex_t		dld_taskq_lock;
77 static kcondvar_t	dld_taskq_cv;
78 static list_t		dld_taskq_list;		/* List of dld_str_t */
79 boolean_t		dld_taskq_quit;
80 boolean_t		dld_taskq_done;
81 
82 static void		dld_taskq_dispatch(void);
83 
84 /*
85  * Some notes on entry points, flow-control, queueing.
86  *
87  * This driver exports the traditional STREAMS put entry point as well as
88  * the non-STREAMS fast-path transmit routine which is provided to IP via
89  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
90  * and data operations, while the fast-path routine deals only with M_DATA
91  * fast-path packets.  Regardless of the entry point, all outbound packets
92  * will end up in DLD_TX(), where they will be delivered to the MAC layer.
93  *
94  * The transmit logic operates in the following way: All packets coming
95  * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
96  * happens when the MAC layer indicates the packets couldn't be
97  * transmitted due to 1) lack of resources (e.g. running out of
98  * descriptors),  or 2) reaching the allowed bandwidth limit for this
99  * particular flow. The indication comes in the form of a Tx cookie that
100  * identifies the blocked ring. In such case, DLD will place a
101  * dummy message on its write-side STREAMS queue so that the queue is
102  * marked as "full". Any subsequent packets arriving at the driver will
103  * still be sent to the MAC layer where they are either queued in the Tx
104  * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
105  * queue gets enabled when MAC layer notifies DLD through MAC_NOTE_TX.
106  * When the write service procedure runs, it will remove the dummy
107  * message from the write-side STREAMS queue; in effect this will trigger
108  * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
109  * respectively, due to the above reasons.
110  *
111  * All non-data operations, both DLPI and ioctls are single threaded on a per
112  * dld_str_t endpoint. This is done using a taskq so that the control operation
113  * has kernel context and can cv_wait for resources. In addition all set type
114  * operations that involve mac level state modification are serialized on a
115  * per mac end point using the perimeter mechanism provided by the mac layer.
116  * This serializes all mac clients trying to modify a single mac end point over
117  * the entire sequence of mac calls made by that client as an atomic unit. The
118  * mac framework locking is described in mac.c. A critical element is that
119  * DLD/DLS does not hold any locks across the mac perimeter.
120  *
121  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
122  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
123  * match dev_t. If a stream is found and it is attached, its dev_info_t *
124  * is returned. If the mac handle is non-null, it can be safely accessed
125  * below. The mac handle won't be freed until the mac_unregister which
126  * won't happen until the driver detaches. The DDI framework ensures that
127  * the detach won't happen while a getinfo is in progress.
128  */
/*
 * Search state handed to i_dld_str_walker() through mod_hash_walk().
 * The caller fills in the major/minor to look for and presets the result
 * fields (-1 / NULL); the walker overwrites the results only when it finds
 * the matching stream and that stream has a mac handle.
 */
129 typedef struct i_dld_str_state_s {
130 	major_t		ds_major;	/* in: major number to match */
131 	minor_t		ds_minor;	/* in: clone minor to match */
132 	int		ds_instance;	/* out: instance of the match */
133 	dev_info_t	*ds_dip;	/* out: devinfo of the match */
134 } i_dld_str_state_t;
135 
136 /* ARGSUSED */
137 static uint_t
138 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
139 {
140 	i_dld_str_state_t	*statep = arg;
141 	dld_str_t		*dsp = (dld_str_t *)val;
142 	mac_handle_t		mh;
143 
144 	if (statep->ds_major != dsp->ds_major)
145 		return (MH_WALK_CONTINUE);
146 
147 	ASSERT(statep->ds_minor != 0);
148 	mh = dsp->ds_mh;
149 
150 	if (statep->ds_minor == dsp->ds_minor) {
151 		/*
152 		 * Clone: a clone minor is unique. we can terminate the
153 		 * walk if we find a matching stream -- even if we fail
154 		 * to obtain the devinfo.
155 		 */
156 		if (mh != NULL) {
157 			statep->ds_dip = mac_devinfo_get(mh);
158 			statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
159 		}
160 		return (MH_WALK_TERMINATE);
161 	}
162 	return (MH_WALK_CONTINUE);
163 }
164 
165 static dev_info_t *
166 dld_finddevinfo(dev_t dev)
167 {
168 	dev_info_t		*dip;
169 	i_dld_str_state_t	state;
170 
171 	if (getminor(dev) == 0)
172 		return (NULL);
173 
174 	/*
175 	 * See if it's a minor node of a link
176 	 */
177 	if ((dip = dls_link_devinfo(dev)) != NULL)
178 		return (dip);
179 
180 	state.ds_minor = getminor(dev);
181 	state.ds_major = getmajor(dev);
182 	state.ds_dip = NULL;
183 	state.ds_instance = -1;
184 
185 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
186 	return (state.ds_dip);
187 }
188 
189 int
190 dld_devt_to_instance(dev_t dev)
191 {
192 	minor_t			minor;
193 	i_dld_str_state_t	state;
194 
195 	/*
196 	 * GLDv3 numbers DLPI style 1 node as the instance number + 1.
197 	 * Minor number 0 is reserved for the DLPI style 2 unattached
198 	 * node.
199 	 */
200 
201 	if ((minor = getminor(dev)) == 0)
202 		return (-1);
203 
204 	/*
205 	 * Check for unopened style 1 node.
206 	 * Note that this doesn't *necessarily* work for legacy
207 	 * devices, but this code is only called within the
208 	 * getinfo(9e) implementation for true GLDv3 devices, so it
209 	 * doesn't matter.
210 	 */
211 	if (minor > 0 && minor <= DLS_MAX_MINOR) {
212 		return (DLS_MINOR2INST(minor));
213 	}
214 
215 	state.ds_minor = getminor(dev);
216 	state.ds_major = getmajor(dev);
217 	state.ds_dip = NULL;
218 	state.ds_instance = -1;
219 
220 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
221 	return (state.ds_instance);
222 }
223 
224 /*
225  * devo_getinfo: getinfo(9e)
226  *
227  * NB: This may be called for a provider before the provider's
228  * instances are attached.  Hence, if a particular provider needs a
229  * special mapping (the mac instance != ddi_get_instance()), then it
230  * may need to provide its own implmentation using the
231  * mac_devt_to_instance() function, and translating the returned mac
232  * instance to a devinfo instance.  For dev_t's where the minor number
233  * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
234  * function indirectly via the mac_getinfo() function.
235  */
236 /*ARGSUSED*/
237 int
238 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
239 {
240 	dev_info_t	*devinfo;
241 	minor_t		minor = getminor((dev_t)arg);
242 	int		rc = DDI_FAILURE;
243 
244 	switch (cmd) {
245 	case DDI_INFO_DEVT2DEVINFO:
246 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
247 			*(dev_info_t **)resp = devinfo;
248 			rc = DDI_SUCCESS;
249 		}
250 		break;
251 	case DDI_INFO_DEVT2INSTANCE:
252 		if (minor > 0 && minor <= DLS_MAX_MINOR) {
253 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
254 			rc = DDI_SUCCESS;
255 		} else if (minor > DLS_MAX_MINOR &&
256 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
257 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
258 			rc = DDI_SUCCESS;
259 		}
260 		break;
261 	}
262 	return (rc);
263 }
264 
265 void *
266 dld_str_private(queue_t *q)
267 {
268 	return (((dld_str_t *)(q->q_ptr))->ds_private);
269 }
270 
271 int
272 dld_str_open(queue_t *rq, dev_t *devp, void *private)
273 {
274 	dld_str_t	*dsp;
275 	major_t		major;
276 	minor_t		minor;
277 	int		err;
278 
279 	major = getmajor(*devp);
280 	minor = getminor(*devp);
281 
282 	/*
283 	 * Create a new dld_str_t for the stream. This will grab a new minor
284 	 * number that will be handed back in the cloned dev_t.  Creation may
285 	 * fail if we can't allocate the dummy mblk used for flow-control.
286 	 */
287 	dsp = dld_str_create(rq, DLD_DLPI, major,
288 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
289 	if (dsp == NULL)
290 		return (ENOSR);
291 
292 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
293 	dsp->ds_private = private;
294 	if (minor != 0) {
295 		/*
296 		 * Style 1 open
297 		 */
298 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
299 			goto failed;
300 
301 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
302 	} else {
303 		(void) qassociate(rq, -1);
304 	}
305 
306 	/*
307 	 * Enable the queue srv(9e) routine.
308 	 */
309 	qprocson(rq);
310 
311 	/*
312 	 * Construct a cloned dev_t to hand back.
313 	 */
314 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
315 	return (0);
316 
317 failed:
318 	dld_str_destroy(dsp);
319 	return (err);
320 }
321 
322 int
323 dld_str_close(queue_t *rq)
324 {
325 	dld_str_t	*dsp = rq->q_ptr;
326 
327 	/*
328 	 * All modules on top have been popped off. So there can't be any
329 	 * threads from the top.
330 	 */
331 	ASSERT(dsp->ds_datathr_cnt == 0);
332 
333 	/*
334 	 * Wait until pending DLPI requests are processed.
335 	 */
336 	mutex_enter(&dsp->ds_lock);
337 	while (dsp->ds_dlpi_pending)
338 		cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
339 	mutex_exit(&dsp->ds_lock);
340 
341 
342 	/*
343 	 * This stream was open to a provider node. Check to see
344 	 * if it has been cleanly shut down.
345 	 */
346 	if (dsp->ds_dlstate != DL_UNATTACHED) {
347 		/*
348 		 * The stream is either open to a style 1 provider or
349 		 * this is not clean shutdown. Detach from the PPA.
350 		 * (This is still ok even in the style 1 case).
351 		 */
352 		dld_str_detach(dsp);
353 	}
354 
355 	dld_str_destroy(dsp);
356 	return (0);
357 }
358 
359 /*
360  * qi_qopen: open(9e)
361  */
362 /*ARGSUSED*/
363 int
364 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
365 {
366 	if (sflag == MODOPEN)
367 		return (ENOTSUP);
368 
369 	/*
370 	 * This is a cloning driver and therefore each queue should only
371 	 * ever get opened once.
372 	 */
373 	if (rq->q_ptr != NULL)
374 		return (EBUSY);
375 
376 	return (dld_str_open(rq, devp, NULL));
377 }
378 
379 /*
380  * qi_qclose: close(9e)
381  */
382 /* ARGSUSED */
383 int
384 dld_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
385 {
386 	/*
387 	 * Disable the queue srv(9e) routine.
388 	 */
389 	qprocsoff(rq);
390 
391 	return (dld_str_close(rq));
392 }
393 
394 /*
395  * qi_qputp: put(9e)
396  */
397 void
398 dld_wput(queue_t *wq, mblk_t *mp)
399 {
400 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
401 	dld_str_mode_t	mode;
402 
403 	switch (DB_TYPE(mp)) {
404 	case M_DATA:
405 		mutex_enter(&dsp->ds_lock);
406 		mode = dsp->ds_mode;
407 		if ((dsp->ds_dlstate != DL_IDLE) ||
408 		    (mode != DLD_FASTPATH && mode != DLD_RAW)) {
409 			mutex_exit(&dsp->ds_lock);
410 			freemsg(mp);
411 			break;
412 		}
413 
414 		DLD_DATATHR_INC(dsp);
415 		mutex_exit(&dsp->ds_lock);
416 		if (mode == DLD_FASTPATH) {
417 			if (dsp->ds_mip->mi_media == DL_ETHER &&
418 			    (MBLKL(mp) < sizeof (struct ether_header))) {
419 				freemsg(mp);
420 			} else {
421 				(void) str_mdata_fastpath_put(dsp, mp, 0, 0);
422 			}
423 		} else {
424 			str_mdata_raw_put(dsp, mp);
425 		}
426 		DLD_DATATHR_DCR(dsp);
427 		break;
428 	case M_PROTO:
429 	case M_PCPROTO: {
430 		t_uscalar_t	prim;
431 
432 		if (MBLKL(mp) < sizeof (t_uscalar_t))
433 			break;
434 
435 		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
436 
437 		if (prim == DL_UNITDATA_REQ) {
438 			proto_unitdata_req(dsp, mp);
439 		} else {
440 			dld_wput_nondata(dsp, mp);
441 		}
442 		break;
443 	}
444 
445 	case M_IOCTL:
446 		dld_wput_nondata(dsp, mp);
447 		break;
448 
449 	case M_FLUSH:
450 		if (*mp->b_rptr & FLUSHW) {
451 			DLD_CLRQFULL(dsp);
452 			*mp->b_rptr &= ~FLUSHW;
453 		}
454 
455 		if (*mp->b_rptr & FLUSHR) {
456 			qreply(wq, mp);
457 		} else {
458 			freemsg(mp);
459 		}
460 		break;
461 
462 	default:
463 		freemsg(mp);
464 		break;
465 	}
466 }
467 
468 /*
469  * qi_srvp: srv(9e)
470  */
471 void
472 dld_wsrv(queue_t *wq)
473 {
474 	dld_str_t	*dsp = wq->q_ptr;
475 
476 	DLD_CLRQFULL(dsp);
477 }
478 
479 void
480 dld_init_ops(struct dev_ops *ops, const char *name)
481 {
482 	struct streamtab *stream;
483 	struct qinit *rq, *wq;
484 	struct module_info *modinfo;
485 
486 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
487 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
488 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
489 	modinfo->mi_minpsz = 0;
490 	modinfo->mi_maxpsz = 64*1024;
491 	modinfo->mi_hiwat  = 1;
492 	modinfo->mi_lowat = 0;
493 
494 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
495 	rq->qi_qopen = dld_open;
496 	rq->qi_qclose = dld_close;
497 	rq->qi_minfo = modinfo;
498 
499 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
500 	wq->qi_putp = (pfi_t)dld_wput;
501 	wq->qi_srvp = (pfi_t)dld_wsrv;
502 	wq->qi_minfo = modinfo;
503 
504 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
505 	stream->st_rdinit = rq;
506 	stream->st_wrinit = wq;
507 	ops->devo_cb_ops->cb_str = stream;
508 
509 	if (ops->devo_getinfo == NULL)
510 		ops->devo_getinfo = &dld_getinfo;
511 }
512 
513 void
514 dld_fini_ops(struct dev_ops *ops)
515 {
516 	struct streamtab *stream;
517 	struct qinit *rq, *wq;
518 	struct module_info *modinfo;
519 
520 	stream = ops->devo_cb_ops->cb_str;
521 	rq = stream->st_rdinit;
522 	wq = stream->st_wrinit;
523 	modinfo = rq->qi_minfo;
524 	ASSERT(wq->qi_minfo == modinfo);
525 
526 	kmem_free(stream, sizeof (struct streamtab));
527 	kmem_free(wq, sizeof (struct qinit));
528 	kmem_free(rq, sizeof (struct qinit));
529 	kmem_free(modinfo->mi_idname, FMNAMESZ);
530 	kmem_free(modinfo, sizeof (struct module_info));
531 }
532 
533 /*
534  * Initialize this module's data structures.
535  */
536 void
537 dld_str_init(void)
538 {
539 	/*
540 	 * Create dld_str_t object cache.
541 	 */
542 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
543 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
544 	ASSERT(str_cachep != NULL);
545 
546 	/*
547 	 * Create a hash table for maintaining dld_str_t's.
548 	 * The ds_minor field (the clone minor number) of a dld_str_t
549 	 * is used as a key for this hash table because this number is
550 	 * globally unique (allocated from "dls_minor_arena").
551 	 */
552 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
553 	    mod_hash_null_valdtor);
554 
555 	mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
556 	cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
557 
558 	dld_taskq_quit = B_FALSE;
559 	dld_taskq_done = B_FALSE;
560 	list_create(&dld_taskq_list, sizeof (dld_str_t),
561 	    offsetof(dld_str_t, ds_tqlist));
562 	(void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
563 	    &p0, TS_RUN, minclsyspri);
564 }
565 
566 /*
567  * Tear down this module's data structures.
568  */
569 int
570 dld_str_fini(void)
571 {
572 	/*
573 	 * Make sure that there are no objects in use.
574 	 */
575 	if (str_count != 0)
576 		return (EBUSY);
577 
578 	/*
579 	 * Ask the dld_taskq thread to quit and wait for it to be done
580 	 */
581 	mutex_enter(&dld_taskq_lock);
582 	dld_taskq_quit = B_TRUE;
583 	cv_signal(&dld_taskq_cv);
584 	while (!dld_taskq_done)
585 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
586 	mutex_exit(&dld_taskq_lock);
587 	list_destroy(&dld_taskq_list);
588 	/*
589 	 * Destroy object cache.
590 	 */
591 	kmem_cache_destroy(str_cachep);
592 	mod_hash_destroy_idhash(str_hashp);
593 	return (0);
594 }
595 
596 /*
597  * Create a new dld_str_t object.
598  */
599 dld_str_t *
600 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
601 {
602 	dld_str_t	*dsp;
603 	int		err;
604 
605 	/*
606 	 * Allocate an object from the cache.
607 	 */
608 	atomic_inc_32(&str_count);
609 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
610 
611 	/*
612 	 * Allocate the dummy mblk for flow-control.
613 	 */
614 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
615 	if (dsp->ds_tx_flow_mp == NULL) {
616 		kmem_cache_free(str_cachep, dsp);
617 		atomic_dec_32(&str_count);
618 		return (NULL);
619 	}
620 	dsp->ds_type = type;
621 	dsp->ds_major = major;
622 	dsp->ds_style = style;
623 
624 	/*
625 	 * Initialize the queue pointers.
626 	 */
627 	ASSERT(RD(rq) == rq);
628 	dsp->ds_rq = rq;
629 	dsp->ds_wq = WR(rq);
630 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
631 
632 	/*
633 	 * We want explicit control over our write-side STREAMS queue
634 	 * where the dummy mblk gets added/removed for flow-control.
635 	 */
636 	noenable(WR(rq));
637 
638 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
639 	    (mod_hash_val_t)dsp);
640 	ASSERT(err == 0);
641 	return (dsp);
642 }
643 
644 /*
645  * Destroy a dld_str_t object.
646  */
647 void
648 dld_str_destroy(dld_str_t *dsp)
649 {
650 	queue_t		*rq;
651 	queue_t		*wq;
652 	mod_hash_val_t	val;
653 
654 	/*
655 	 * Clear the queue pointers.
656 	 */
657 	rq = dsp->ds_rq;
658 	wq = dsp->ds_wq;
659 	ASSERT(wq == WR(rq));
660 	rq->q_ptr = wq->q_ptr = NULL;
661 	dsp->ds_rq = dsp->ds_wq = NULL;
662 
663 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
664 	ASSERT(dsp->ds_sap == 0);
665 	ASSERT(dsp->ds_mh == NULL);
666 	ASSERT(dsp->ds_mch == NULL);
667 	ASSERT(dsp->ds_promisc == 0);
668 	ASSERT(dsp->ds_mph == NULL);
669 	ASSERT(dsp->ds_mip == NULL);
670 	ASSERT(dsp->ds_mnh == NULL);
671 
672 	ASSERT(dsp->ds_polling == B_FALSE);
673 	ASSERT(dsp->ds_direct == B_FALSE);
674 	ASSERT(dsp->ds_lso == B_FALSE);
675 	ASSERT(dsp->ds_lso_max == 0);
676 	ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
677 
678 	/*
679 	 * Reinitialize all the flags.
680 	 */
681 	dsp->ds_notifications = 0;
682 	dsp->ds_passivestate = DLD_UNINITIALIZED;
683 	dsp->ds_mode = DLD_UNITDATA;
684 	dsp->ds_native = B_FALSE;
685 	dsp->ds_nonip = B_FALSE;
686 
687 	ASSERT(dsp->ds_datathr_cnt == 0);
688 	ASSERT(dsp->ds_pending_head == NULL);
689 	ASSERT(dsp->ds_pending_tail == NULL);
690 	ASSERT(!dsp->ds_dlpi_pending);
691 
692 	ASSERT(dsp->ds_dlp == NULL);
693 	ASSERT(dsp->ds_dmap == NULL);
694 	ASSERT(dsp->ds_rx == NULL);
695 	ASSERT(dsp->ds_rx_arg == NULL);
696 	ASSERT(dsp->ds_next == NULL);
697 	ASSERT(dsp->ds_head == NULL);
698 
699 	/*
700 	 * Free the dummy mblk if exists.
701 	 */
702 	if (dsp->ds_tx_flow_mp != NULL) {
703 		freeb(dsp->ds_tx_flow_mp);
704 		dsp->ds_tx_flow_mp = NULL;
705 	}
706 
707 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
708 	ASSERT(dsp == (dld_str_t *)val);
709 
710 	/*
711 	 * Free the object back to the cache.
712 	 */
713 	kmem_cache_free(str_cachep, dsp);
714 	atomic_dec_32(&str_count);
715 }
716 
717 /*
718  * kmem_cache contructor function: see kmem_cache_create(9f).
719  */
720 /*ARGSUSED*/
721 static int
722 str_constructor(void *buf, void *cdrarg, int kmflags)
723 {
724 	dld_str_t	*dsp = buf;
725 
726 	bzero(buf, sizeof (dld_str_t));
727 
728 	/*
729 	 * Allocate a new minor number.
730 	 */
731 	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
732 		return (-1);
733 
734 	/*
735 	 * Initialize the DLPI state machine.
736 	 */
737 	dsp->ds_dlstate = DL_UNATTACHED;
738 
739 	mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
740 	cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
741 	cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
742 
743 	return (0);
744 }
745 
746 /*
747  * kmem_cache destructor function.
748  */
749 /*ARGSUSED*/
750 static void
751 str_destructor(void *buf, void *cdrarg)
752 {
753 	dld_str_t	*dsp = buf;
754 
755 	/*
756 	 * Release the minor number.
757 	 */
758 	mac_minor_rele(dsp->ds_minor);
759 
760 	ASSERT(dsp->ds_tx_flow_mp == NULL);
761 
762 	mutex_destroy(&dsp->ds_lock);
763 	cv_destroy(&dsp->ds_datathr_cv);
764 	cv_destroy(&dsp->ds_dlpi_pending_cv);
765 }
766 
767 /*
768  * Update the priority bits and VID (may need to insert tag if mp points
769  * to an untagged packet.
770  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
771  */
772 static mblk_t *
773 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
774     link_tagmode_t tagmode)
775 {
776 	mblk_t *hmp;
777 	struct ether_vlan_header *evhp;
778 	struct ether_header *ehp;
779 	uint16_t old_tci = 0;
780 	size_t len;
781 
782 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
783 
784 	evhp = (struct ether_vlan_header *)mp->b_rptr;
785 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
786 		/*
787 		 * Tagged packet, update the priority bits.
788 		 */
789 		len = sizeof (struct ether_vlan_header);
790 
791 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
792 			/*
793 			 * In case some drivers only check the db_ref
794 			 * count of the first mblk, we pullup the
795 			 * message into a single mblk.
796 			 */
797 			hmp = msgpullup(mp, -1);
798 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
799 				freemsg(hmp);
800 				return (NULL);
801 			} else {
802 				freemsg(mp);
803 				mp = hmp;
804 			}
805 		}
806 
807 		evhp = (struct ether_vlan_header *)mp->b_rptr;
808 		old_tci = ntohs(evhp->ether_tci);
809 	} else {
810 		/*
811 		 * Untagged packet.  Two factors will cause us to insert a
812 		 * VLAN header:
813 		 * - This is a VLAN link (vid is specified)
814 		 * - The link supports user priority tagging and the priority
815 		 *   is non-zero.
816 		 */
817 		if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
818 			return (mp);
819 
820 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
821 		if (hmp == NULL)
822 			return (NULL);
823 
824 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
825 		ehp = (struct ether_header *)mp->b_rptr;
826 
827 		/*
828 		 * Copy the MAC addresses and typelen
829 		 */
830 		bcopy(ehp, evhp, (ETHERADDRL * 2));
831 		evhp->ether_type = ehp->ether_type;
832 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
833 
834 		hmp->b_wptr += sizeof (struct ether_vlan_header);
835 		mp->b_rptr += sizeof (struct ether_header);
836 
837 		/*
838 		 * Free the original message if it's now empty. Link the
839 		 * rest of the messages to the header message.
840 		 */
841 		if (MBLKL(mp) == 0) {
842 			hmp->b_cont = mp->b_cont;
843 			freeb(mp);
844 		} else {
845 			hmp->b_cont = mp;
846 		}
847 		mp = hmp;
848 	}
849 
850 	if (pri == 0)
851 		pri = VLAN_PRI(old_tci);
852 	if (vid == VLAN_ID_NONE)
853 		vid = VLAN_ID(old_tci);
854 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
855 	return (mp);
856 }
857 
858 /*
859  * M_DATA put (IP fast-path mode)
860  */
861 mac_tx_cookie_t
862 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
863     uint16_t flag)
864 {
865 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
866 	mblk_t *newmp;
867 	uint_t pri;
868 	mac_tx_cookie_t cookie;
869 
870 	if (is_ethernet) {
871 		/*
872 		 * Update the priority bits to the assigned priority.
873 		 */
874 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
875 
876 		if (pri != 0) {
877 			newmp = i_dld_ether_header_update_tag(mp, pri,
878 			    VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
879 			if (newmp == NULL)
880 				goto discard;
881 			mp = newmp;
882 		}
883 	}
884 
885 	if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) {
886 		DLD_SETQFULL(dsp);
887 	}
888 	return (cookie);
889 
890 discard:
891 	/* TODO: bump kstat? */
892 	freemsg(mp);
893 	return (NULL);
894 }
895 
896 /*
897  * M_DATA put (DLIOCRAW mode)
898  */
899 static void
900 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
901 {
902 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
903 	mblk_t *bp, *newmp;
904 	size_t size;
905 	mac_header_info_t mhi;
906 	uint_t pri, vid, dvid;
907 	uint_t max_sdu;
908 
909 	/*
910 	 * Certain MAC type plugins provide an illusion for raw DLPI
911 	 * consumers.  They pretend that the MAC layer is something that
912 	 * it's not for the benefit of observability tools.  For example,
913 	 * mac_wifi pretends that it's Ethernet for such consumers.
914 	 * Here, unless native mode is enabled, we call into the MAC layer so
915 	 * that this illusion can be maintained.  The plugin will optionally
916 	 * transform the MAC header here into something that can be passed
917 	 * down.  The header goes from raw mode to "cooked" mode.
918 	 */
919 	if (!dsp->ds_native) {
920 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
921 			goto discard;
922 		mp = newmp;
923 	}
924 
925 	size = MBLKL(mp);
926 
927 	/*
928 	 * Check the packet is not too big and that any remaining
929 	 * fragment list is composed entirely of M_DATA messages. (We
930 	 * know the first fragment was M_DATA otherwise we could not
931 	 * have got here).
932 	 */
933 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
934 		if (DB_TYPE(bp) != M_DATA)
935 			goto discard;
936 		size += MBLKL(bp);
937 	}
938 
939 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
940 		goto discard;
941 
942 	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
943 	/*
944 	 * If LSO is enabled, check the size against lso_max. Otherwise,
945 	 * compare the packet size with max_sdu.
946 	 */
947 	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
948 	if (size > max_sdu + mhi.mhi_hdrsize)
949 		goto discard;
950 
951 	if (is_ethernet) {
952 		dvid = mac_client_vid(dsp->ds_mch);
953 
954 		/*
955 		 * Discard the packet if this is a VLAN stream but the VID in
956 		 * the packet is not correct.
957 		 */
958 		vid = VLAN_ID(mhi.mhi_tci);
959 		if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
960 			goto discard;
961 
962 		/*
963 		 * Discard the packet if this packet is a tagged packet
964 		 * but both pri and VID are 0.
965 		 */
966 		pri = VLAN_PRI(mhi.mhi_tci);
967 		if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
968 		    vid == VLAN_ID_NONE)
969 			goto discard;
970 
971 		/*
972 		 * Update the priority bits to the per-stream priority if
973 		 * priority is not set in the packet. Update the VID for
974 		 * packets on a VLAN stream.
975 		 */
976 		pri = (pri == 0) ? dsp->ds_pri : 0;
977 		if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
978 			if ((newmp = i_dld_ether_header_update_tag(mp, pri,
979 			    dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
980 				goto discard;
981 			}
982 			mp = newmp;
983 		}
984 	}
985 
986 	if (DLD_TX(dsp, mp, 0, 0) != NULL) {
987 		/* Turn on flow-control for dld */
988 		DLD_SETQFULL(dsp);
989 	}
990 	return;
991 
992 discard:
993 	/* TODO: bump kstat? */
994 	freemsg(mp);
995 }
996 
997 /*
998  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
999  */
1000 int
1001 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1002 {
1003 	dev_t			dev;
1004 	int			err;
1005 	const char		*drvname;
1006 	mac_perim_handle_t	mph = NULL;
1007 	boolean_t		qassociated = B_FALSE;
1008 	dls_link_t		*dlp = NULL;
1009 	dls_dl_handle_t		ddp = NULL;
1010 
1011 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1012 		return (EINVAL);
1013 
1014 	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
1015 		return (ENOTSUP);
1016 
1017 	/*
1018 	 * /dev node access. This will still be supported for backward
1019 	 * compatibility reason.
1020 	 */
1021 	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1022 	    (strcmp(drvname, "vnic") != 0)) {
1023 		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1024 			return (EINVAL);
1025 		qassociated = B_TRUE;
1026 	}
1027 
1028 	dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1029 	if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
1030 		goto failed;
1031 
1032 	if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
1033 		goto failed;
1034 
1035 	/*
1036 	 * Open a channel.
1037 	 */
1038 	if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
1039 		goto failed;
1040 
1041 	if ((err = dls_open(dlp, ddp, dsp)) != 0)
1042 		goto failed;
1043 
1044 	/*
1045 	 * Set the default packet priority.
1046 	 */
1047 	dsp->ds_pri = 0;
1048 
1049 	/*
1050 	 * Add a notify function so that the we get updates from the MAC.
1051 	 */
1052 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1053 	dsp->ds_dlstate = DL_UNBOUND;
1054 	mac_perim_exit(mph);
1055 	return (0);
1056 
1057 failed:
1058 	if (dlp != NULL)
1059 		dls_link_rele(dlp);
1060 	if (mph != NULL)
1061 		mac_perim_exit(mph);
1062 	if (ddp != NULL)
1063 		dls_devnet_rele(ddp);
1064 	if (qassociated)
1065 		(void) qassociate(dsp->ds_wq, -1);
1066 
1067 	return (err);
1068 }
1069 
1070 /*
1071  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1072  * from close(2) for style 2.
1073  */
1074 void
1075 dld_str_detach(dld_str_t *dsp)
1076 {
1077 	mac_perim_handle_t	mph;
1078 	int			err;
1079 
1080 	ASSERT(dsp->ds_datathr_cnt == 0);
1081 
1082 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1083 	/*
1084 	 * Remove the notify function.
1085 	 *
1086 	 * Note that we cannot wait for the notification callback to be removed
1087 	 * since it could cause the deadlock with str_notify() since they both
1088 	 * need the mac perimeter. Continue if we cannot remove the
1089 	 * notification callback right now and wait after we leave the
1090 	 * perimeter.
1091 	 */
1092 	err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1093 	dsp->ds_mnh = NULL;
1094 
1095 	/*
1096 	 * Disable the capabilities
1097 	 */
1098 	dld_capabilities_disable(dsp);
1099 
1100 	/*
1101 	 * Clear LSO flags.
1102 	 */
1103 	dsp->ds_lso = B_FALSE;
1104 	dsp->ds_lso_max = 0;
1105 
1106 	dls_close(dsp);
1107 	mac_perim_exit(mph);
1108 
1109 	/*
1110 	 * Now we leave the mac perimeter. If mac_notify_remove() failed
1111 	 * because the notification callback was in progress, wait for
1112 	 * it to finish before we proceed.
1113 	 */
1114 	if (err != 0)
1115 		mac_notify_remove_wait(dsp->ds_mh);
1116 
1117 	/*
1118 	 * An unreferenced tagged (non-persistent) vlan gets destroyed
1119 	 * automatically in the call to dls_devnet_rele.
1120 	 */
1121 	dls_devnet_rele(dsp->ds_ddh);
1122 
1123 	dsp->ds_sap = 0;
1124 	dsp->ds_mh = NULL;
1125 	dsp->ds_mch = NULL;
1126 	dsp->ds_mip = NULL;
1127 
1128 	if (dsp->ds_style == DL_STYLE2)
1129 		(void) qassociate(dsp->ds_wq, -1);
1130 
1131 	/*
1132 	 * Re-initialize the DLPI state machine.
1133 	 */
1134 	dsp->ds_dlstate = DL_UNATTACHED;
1135 }
1136 
1137 /*
1138  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1139  * tags before sending packets up to the DLS clients, with the exception of
1140  * special priority tagged packets, in that case, we set the VID to 0.
1141  * mp must be a VLAN tagged packet.
1142  */
1143 static mblk_t *
1144 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1145 {
1146 	mblk_t *newmp;
1147 	struct ether_vlan_header *evhp;
1148 	uint16_t tci, new_tci;
1149 
1150 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1151 	if (DB_REF(mp) > 1) {
1152 		newmp = copymsg(mp);
1153 		if (newmp == NULL)
1154 			return (NULL);
1155 		freemsg(mp);
1156 		mp = newmp;
1157 	}
1158 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1159 
1160 	tci = ntohs(evhp->ether_tci);
1161 	if (VLAN_PRI(tci) == 0 || !keep_pri) {
1162 		/*
1163 		 * Priority is 0, strip the tag.
1164 		 */
1165 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1166 		mp->b_rptr += VLAN_TAGSZ;
1167 	} else {
1168 		/*
1169 		 * Priority is not 0, update the VID to 0.
1170 		 */
1171 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1172 		evhp->ether_tci = htons(new_tci);
1173 	}
1174 	return (mp);
1175 }
1176 
1177 /*
1178  * Raw mode receive function.
1179  */
1180 /*ARGSUSED*/
1181 void
1182 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1183     mac_header_info_t *mhip)
1184 {
1185 	dld_str_t *dsp = (dld_str_t *)arg;
1186 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1187 	mblk_t *next, *newmp;
1188 
1189 	ASSERT(mp != NULL);
1190 	do {
1191 		/*
1192 		 * Get the pointer to the next packet in the chain and then
1193 		 * clear b_next before the packet gets passed on.
1194 		 */
1195 		next = mp->b_next;
1196 		mp->b_next = NULL;
1197 
1198 		/*
1199 		 * Wind back b_rptr to point at the MAC header.
1200 		 */
1201 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1202 		mp->b_rptr -= mhip->mhi_hdrsize;
1203 
1204 		/*
1205 		 * Certain MAC type plugins provide an illusion for raw
1206 		 * DLPI consumers.  They pretend that the MAC layer is
1207 		 * something that it's not for the benefit of observability
1208 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1209 		 * for such consumers.	Here, unless native mode is enabled,
1210 		 * we call into the MAC layer so that this illusion can be
1211 		 * maintained.	The plugin will optionally transform the MAC
1212 		 * header here into something that can be passed up to raw
1213 		 * consumers.  The header goes from "cooked" mode to raw mode.
1214 		 */
1215 		if (!dsp->ds_native) {
1216 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1217 			if (newmp == NULL) {
1218 				freemsg(mp);
1219 				goto next;
1220 			}
1221 			mp = newmp;
1222 		}
1223 
1224 		/*
1225 		 * Strip the VLAN tag for VLAN streams.
1226 		 */
1227 		if (is_ethernet &&
1228 		    mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1229 			/*
1230 			 * The priority should be kept only for VLAN
1231 			 * data-links.
1232 			 */
1233 			newmp = i_dld_ether_header_strip_tag(mp,
1234 			    mac_client_is_vlan_vnic(dsp->ds_mch));
1235 			if (newmp == NULL) {
1236 				freemsg(mp);
1237 				goto next;
1238 			}
1239 			mp = newmp;
1240 		}
1241 
1242 		/*
1243 		 * Pass the packet on.
1244 		 */
1245 		if (canputnext(dsp->ds_rq))
1246 			putnext(dsp->ds_rq, mp);
1247 		else
1248 			freemsg(mp);
1249 
1250 next:
1251 		/*
1252 		 * Move on to the next packet in the chain.
1253 		 */
1254 		mp = next;
1255 	} while (mp != NULL);
1256 }
1257 
1258 /*
1259  * Fast-path receive function.
1260  */
1261 /*ARGSUSED*/
1262 void
1263 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1264     mac_header_info_t *mhip)
1265 {
1266 	dld_str_t *dsp = (dld_str_t *)arg;
1267 	mblk_t *next;
1268 	size_t offset = 0;
1269 
1270 	/*
1271 	 * MAC header stripping rules:
1272 	 *    - Tagged packets:
1273 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1274 	 *	b. Physical streams
1275 	 *	- VLAN packets (non-zero VID). The stream must be either a
1276 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1277 	 *	  Strip the Ethernet header but keep the VLAN header.
1278 	 *	- Special tagged packets (zero VID)
1279 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1280 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1281 	 *	    keep the VLAN header.
1282 	 *	  * Otherwise, strip the whole VLAN header.
1283 	 *    - Untagged packets. Strip the whole MAC header.
1284 	 */
1285 	if (mhip->mhi_istagged &&
1286 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1287 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1288 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1289 		offset = VLAN_TAGSZ;
1290 	}
1291 
1292 	ASSERT(mp != NULL);
1293 	do {
1294 		/*
1295 		 * Get the pointer to the next packet in the chain and then
1296 		 * clear b_next before the packet gets passed on.
1297 		 */
1298 		next = mp->b_next;
1299 		mp->b_next = NULL;
1300 
1301 		/*
1302 		 * Wind back b_rptr to point at the VLAN header.
1303 		 */
1304 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1305 		mp->b_rptr -= offset;
1306 
1307 		/*
1308 		 * Pass the packet on.
1309 		 */
1310 		if (canputnext(dsp->ds_rq))
1311 			putnext(dsp->ds_rq, mp);
1312 		else
1313 			freemsg(mp);
1314 		/*
1315 		 * Move on to the next packet in the chain.
1316 		 */
1317 		mp = next;
1318 	} while (mp != NULL);
1319 }
1320 
1321 /*
1322  * Default receive function (send DL_UNITDATA_IND messages).
1323  */
1324 /*ARGSUSED*/
1325 void
1326 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1327     mac_header_info_t *mhip)
1328 {
1329 	dld_str_t		*dsp = (dld_str_t *)arg;
1330 	mblk_t			*ud_mp;
1331 	mblk_t			*next;
1332 	size_t			offset = 0;
1333 	boolean_t		strip_vlan = B_TRUE;
1334 
1335 	/*
1336 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1337 	 */
1338 	if (mhip->mhi_istagged &&
1339 	    (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1340 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1341 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1342 		offset = VLAN_TAGSZ;
1343 		strip_vlan = B_FALSE;
1344 	}
1345 
1346 	ASSERT(mp != NULL);
1347 	do {
1348 		/*
1349 		 * Get the pointer to the next packet in the chain and then
1350 		 * clear b_next before the packet gets passed on.
1351 		 */
1352 		next = mp->b_next;
1353 		mp->b_next = NULL;
1354 
1355 		/*
1356 		 * Wind back b_rptr to point at the MAC header.
1357 		 */
1358 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1359 		mp->b_rptr -= mhip->mhi_hdrsize;
1360 
1361 		/*
1362 		 * Create the DL_UNITDATA_IND M_PROTO.
1363 		 */
1364 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1365 			freemsgchain(mp);
1366 			return;
1367 		}
1368 
1369 		/*
1370 		 * Advance b_rptr to point at the payload (or the VLAN header).
1371 		 */
1372 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1373 
1374 		/*
1375 		 * Prepend the DL_UNITDATA_IND.
1376 		 */
1377 		ud_mp->b_cont = mp;
1378 
1379 		/*
1380 		 * Send the message.
1381 		 */
1382 		if (canputnext(dsp->ds_rq))
1383 			putnext(dsp->ds_rq, ud_mp);
1384 		else
1385 			freemsg(ud_mp);
1386 
1387 		/*
1388 		 * Move on to the next packet in the chain.
1389 		 */
1390 		mp = next;
1391 	} while (mp != NULL);
1392 }
1393 
1394 /*
1395  * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1396  */
1397 static void
1398 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
1399 {
1400 	mblk_t		*mp;
1401 	dl_notify_ind_t *dlip;
1402 
1403 	if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
1404 		return;
1405 
1406 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1407 	    M_PROTO, 0)) == NULL)
1408 		return;
1409 
1410 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1411 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1412 	dlip->dl_primitive = DL_NOTIFY_IND;
1413 	if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
1414 		dlip->dl_notification = DL_NOTE_SDU_SIZE2;
1415 		dlip->dl_data1 = max_sdu;
1416 		dlip->dl_data2 = multicast_sdu;
1417 	} else {
1418 		dlip->dl_notification = DL_NOTE_SDU_SIZE;
1419 		dlip->dl_data = max_sdu;
1420 	}
1421 
1422 	qreply(dsp->ds_wq, mp);
1423 }
1424 
1425 /*
1426  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1427  * current state of the interface.
1428  */
1429 void
1430 dld_str_notify_ind(dld_str_t *dsp)
1431 {
1432 	mac_notify_type_t	type;
1433 
1434 	for (type = 0; type < MAC_NNOTE; type++)
1435 		str_notify(dsp, type);
1436 }
1437 
1438 typedef struct dl_unitdata_ind_wrapper {
1439 	dl_unitdata_ind_t	dl_unitdata;
1440 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1441 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1442 } dl_unitdata_ind_wrapper_t;
1443 
1444 /*
1445  * Create a DL_UNITDATA_IND M_PROTO message.
1446  */
1447 static mblk_t *
1448 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1449 {
1450 	mblk_t				*nmp;
1451 	dl_unitdata_ind_wrapper_t	*dlwp;
1452 	dl_unitdata_ind_t		*dlp;
1453 	mac_header_info_t		mhi;
1454 	uint_t				addr_length;
1455 	uint8_t				*daddr;
1456 	uint8_t				*saddr;
1457 
1458 	/*
1459 	 * Get the packet header information.
1460 	 */
1461 	if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1462 		return (NULL);
1463 
1464 	/*
1465 	 * Allocate a message large enough to contain the wrapper structure
1466 	 * defined above.
1467 	 */
1468 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1469 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1470 	    DL_UNITDATA_IND)) == NULL)
1471 		return (NULL);
1472 
1473 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1474 
1475 	dlp = &(dlwp->dl_unitdata);
1476 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1477 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1478 
1479 	/*
1480 	 * Copy in the destination address.
1481 	 */
1482 	addr_length = dsp->ds_mip->mi_addr_length;
1483 	daddr = dlwp->dl_dest_addr;
1484 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1485 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1486 
1487 	/*
1488 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1489 	 */
1490 	if (mhi.mhi_istagged && !strip_vlan)
1491 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1492 	else
1493 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1494 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1495 
1496 	/*
1497 	 * If the destination address was multicast or broadcast then the
1498 	 * dl_group_address field should be non-zero.
1499 	 */
1500 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1501 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1502 
1503 	/*
1504 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1505 	 * for example) may not have access to source information.
1506 	 */
1507 	if (mhi.mhi_saddr == NULL) {
1508 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1509 	} else {
1510 		saddr = dlwp->dl_src_addr;
1511 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1512 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1513 
1514 		/*
1515 		 * Set the source DLSAP to the packet ethertype.
1516 		 */
1517 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1518 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1519 	}
1520 
1521 	return (nmp);
1522 }
1523 
1524 /*
1525  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1526  */
1527 static void
1528 str_notify_promisc_on_phys(dld_str_t *dsp)
1529 {
1530 	mblk_t		*mp;
1531 	dl_notify_ind_t	*dlip;
1532 
1533 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1534 		return;
1535 
1536 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1537 	    M_PROTO, 0)) == NULL)
1538 		return;
1539 
1540 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1541 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1542 	dlip->dl_primitive = DL_NOTIFY_IND;
1543 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1544 
1545 	qreply(dsp->ds_wq, mp);
1546 }
1547 
1548 /*
1549  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1550  */
1551 static void
1552 str_notify_promisc_off_phys(dld_str_t *dsp)
1553 {
1554 	mblk_t		*mp;
1555 	dl_notify_ind_t	*dlip;
1556 
1557 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1558 		return;
1559 
1560 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1561 	    M_PROTO, 0)) == NULL)
1562 		return;
1563 
1564 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1565 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1566 	dlip->dl_primitive = DL_NOTIFY_IND;
1567 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1568 
1569 	qreply(dsp->ds_wq, mp);
1570 }
1571 
1572 /*
1573  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1574  */
1575 static void
1576 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1577 {
1578 	mblk_t		*mp;
1579 	dl_notify_ind_t	*dlip;
1580 	uint_t		addr_length;
1581 	uint16_t	ethertype;
1582 
1583 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1584 		return;
1585 
1586 	addr_length = dsp->ds_mip->mi_addr_length;
1587 	if ((mp = mexchange(dsp->ds_wq, NULL,
1588 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1589 	    M_PROTO, 0)) == NULL)
1590 		return;
1591 
1592 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1593 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1594 	dlip->dl_primitive = DL_NOTIFY_IND;
1595 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1596 	dlip->dl_data = addr_type;
1597 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1598 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1599 
1600 	bcopy(addr, &dlip[1], addr_length);
1601 
1602 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1603 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1604 
1605 	qreply(dsp->ds_wq, mp);
1606 }
1607 
1608 /*
1609  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1610  */
1611 static void
1612 str_notify_link_up(dld_str_t *dsp)
1613 {
1614 	mblk_t		*mp;
1615 	dl_notify_ind_t	*dlip;
1616 
1617 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1618 		return;
1619 
1620 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1621 	    M_PROTO, 0)) == NULL)
1622 		return;
1623 
1624 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1625 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1626 	dlip->dl_primitive = DL_NOTIFY_IND;
1627 	dlip->dl_notification = DL_NOTE_LINK_UP;
1628 
1629 	qreply(dsp->ds_wq, mp);
1630 }
1631 
1632 /*
1633  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1634  */
1635 static void
1636 str_notify_link_down(dld_str_t *dsp)
1637 {
1638 	mblk_t		*mp;
1639 	dl_notify_ind_t	*dlip;
1640 
1641 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1642 		return;
1643 
1644 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1645 	    M_PROTO, 0)) == NULL)
1646 		return;
1647 
1648 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1649 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1650 	dlip->dl_primitive = DL_NOTIFY_IND;
1651 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1652 
1653 	qreply(dsp->ds_wq, mp);
1654 }
1655 
1656 /*
1657  * DL_NOTIFY_IND: DL_NOTE_SPEED
1658  */
1659 static void
1660 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1661 {
1662 	mblk_t		*mp;
1663 	dl_notify_ind_t	*dlip;
1664 
1665 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1666 		return;
1667 
1668 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1669 	    M_PROTO, 0)) == NULL)
1670 		return;
1671 
1672 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1673 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1674 	dlip->dl_primitive = DL_NOTIFY_IND;
1675 	dlip->dl_notification = DL_NOTE_SPEED;
1676 	dlip->dl_data = speed;
1677 
1678 	qreply(dsp->ds_wq, mp);
1679 }
1680 
1681 /*
1682  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1683  */
1684 static void
1685 str_notify_capab_reneg(dld_str_t *dsp)
1686 {
1687 	mblk_t		*mp;
1688 	dl_notify_ind_t	*dlip;
1689 
1690 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1691 		return;
1692 
1693 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1694 	    M_PROTO, 0)) == NULL)
1695 		return;
1696 
1697 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1698 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1699 	dlip->dl_primitive = DL_NOTIFY_IND;
1700 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1701 
1702 	qreply(dsp->ds_wq, mp);
1703 }
1704 
1705 /*
1706  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1707  */
1708 static void
1709 str_notify_fastpath_flush(dld_str_t *dsp)
1710 {
1711 	mblk_t		*mp;
1712 	dl_notify_ind_t	*dlip;
1713 
1714 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1715 		return;
1716 
1717 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1718 	    M_PROTO, 0)) == NULL)
1719 		return;
1720 
1721 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1722 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1723 	dlip->dl_primitive = DL_NOTIFY_IND;
1724 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1725 
1726 	qreply(dsp->ds_wq, mp);
1727 }
1728 
1729 static void
1730 str_notify_allowed_ips(dld_str_t *dsp)
1731 {
1732 	mblk_t		*mp;
1733 	dl_notify_ind_t	*dlip;
1734 	size_t		mp_size;
1735 	mac_protect_t	*mrp;
1736 
1737 	if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1738 		return;
1739 
1740 	mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1741 	if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1742 		return;
1743 
1744 	mrp = mac_protect_get(dsp->ds_mh);
1745 	bzero(mp->b_rptr, mp_size);
1746 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1747 	dlip->dl_primitive = DL_NOTIFY_IND;
1748 	dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1749 	dlip->dl_data = 0;
1750 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1751 	dlip->dl_addr_length = sizeof (mac_protect_t);
1752 	bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1753 	    sizeof (mac_protect_t));
1754 
1755 	qreply(dsp->ds_wq, mp);
1756 }
1757 
1758 /*
1759  * MAC notification callback.
1760  */
1761 void
1762 str_notify(void *arg, mac_notify_type_t type)
1763 {
1764 	dld_str_t		*dsp = (dld_str_t *)arg;
1765 	queue_t			*q = dsp->ds_wq;
1766 	mac_handle_t		mh = dsp->ds_mh;
1767 	mac_client_handle_t	mch = dsp->ds_mch;
1768 	uint8_t			addr[MAXMACADDRLEN];
1769 
1770 	switch (type) {
1771 	case MAC_NOTE_TX:
1772 		qenable(q);
1773 		break;
1774 
1775 	case MAC_NOTE_DEVPROMISC:
1776 		/*
1777 		 * Send the appropriate DL_NOTIFY_IND.
1778 		 */
1779 		if (mac_promisc_get(mh))
1780 			str_notify_promisc_on_phys(dsp);
1781 		else
1782 			str_notify_promisc_off_phys(dsp);
1783 		break;
1784 
1785 	case MAC_NOTE_UNICST:
1786 		/*
1787 		 * This notification is sent whenever the MAC unicast
1788 		 * address changes.
1789 		 */
1790 		mac_unicast_primary_get(mh, addr);
1791 
1792 		/*
1793 		 * Send the appropriate DL_NOTIFY_IND.
1794 		 */
1795 		str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1796 		break;
1797 
1798 	case MAC_NOTE_DEST:
1799 		/*
1800 		 * Only send up DL_NOTE_DEST_ADDR if the link has a
1801 		 * destination address.
1802 		 */
1803 		if (mac_dst_get(dsp->ds_mh, addr))
1804 			str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1805 		break;
1806 
1807 	case MAC_NOTE_LOWLINK:
1808 	case MAC_NOTE_LINK:
1809 		/*
1810 		 * LOWLINK refers to the actual link status. For links that
1811 		 * are not part of a bridge instance LOWLINK and LINK state
1812 		 * are the same. But for a link part of a bridge instance
1813 		 * LINK state refers to the aggregate link status: "up" when
1814 		 * at least one link part of the bridge is up and is "down"
1815 		 * when all links part of the bridge are down.
1816 		 *
1817 		 * Clients can request to be notified of the LOWLINK state
1818 		 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1819 		 * daemon request lowlink state changes and upper layer clients
1820 		 * receive notifications of the aggregate link state changes
1821 		 * which is the default when requesting LINK UP/DOWN state
1822 		 * notifications.
1823 		 */
1824 
1825 		/*
1826 		 * Check that the notification type matches the one that we
1827 		 * want.  If we want lower-level link notifications, and this
1828 		 * is upper, or if we want upper and this is lower, then
1829 		 * ignore.
1830 		 */
1831 		if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1832 			break;
1833 		/*
1834 		 * This notification is sent every time the MAC driver
1835 		 * updates the link state.
1836 		 */
1837 		switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1838 		    MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1839 		case LINK_STATE_UP: {
1840 			uint64_t speed;
1841 			/*
1842 			 * The link is up so send the appropriate
1843 			 * DL_NOTIFY_IND.
1844 			 */
1845 			str_notify_link_up(dsp);
1846 
1847 			speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1848 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1849 			break;
1850 		}
1851 		case LINK_STATE_DOWN:
1852 			/*
1853 			 * The link is down so send the appropriate
1854 			 * DL_NOTIFY_IND.
1855 			 */
1856 			str_notify_link_down(dsp);
1857 			break;
1858 
1859 		default:
1860 			break;
1861 		}
1862 		break;
1863 
1864 	case MAC_NOTE_CAPAB_CHG:
1865 		/*
1866 		 * This notification is sent whenever the MAC resources
1867 		 * change or capabilities change. We need to renegotiate
1868 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1869 		 */
1870 		str_notify_capab_reneg(dsp);
1871 		break;
1872 
1873 	case MAC_NOTE_SDU_SIZE: {
1874 		uint_t  max_sdu;
1875 		uint_t	multicast_sdu;
1876 		mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
1877 		str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
1878 		break;
1879 	}
1880 
1881 	case MAC_NOTE_FASTPATH_FLUSH:
1882 		str_notify_fastpath_flush(dsp);
1883 		break;
1884 
1885 	/* Unused notifications */
1886 	case MAC_NOTE_MARGIN:
1887 		break;
1888 
1889 	case MAC_NOTE_ALLOWED_IPS:
1890 		str_notify_allowed_ips(dsp);
1891 		break;
1892 
1893 	default:
1894 		ASSERT(B_FALSE);
1895 		break;
1896 	}
1897 }
1898 
1899 /*
1900  * This function is called via a taskq mechansim to process all control
1901  * messages on a per 'dsp' end point.
1902  */
1903 static void
1904 dld_wput_nondata_task(void *arg)
1905 {
1906 	dld_str_t	*dsp = arg;
1907 	mblk_t		*mp;
1908 
1909 	mutex_enter(&dsp->ds_lock);
1910 	while (dsp->ds_pending_head != NULL) {
1911 		mp = dsp->ds_pending_head;
1912 		dsp->ds_pending_head = mp->b_next;
1913 		mp->b_next = NULL;
1914 		if (dsp->ds_pending_head == NULL)
1915 			dsp->ds_pending_tail = NULL;
1916 		mutex_exit(&dsp->ds_lock);
1917 
1918 		switch (DB_TYPE(mp)) {
1919 		case M_PROTO:
1920 		case M_PCPROTO:
1921 			dld_proto(dsp, mp);
1922 			break;
1923 		case M_IOCTL:
1924 			dld_ioc(dsp, mp);
1925 			break;
1926 		default:
1927 			ASSERT(0);
1928 		}
1929 
1930 		mutex_enter(&dsp->ds_lock);
1931 	}
1932 	ASSERT(dsp->ds_pending_tail == NULL);
1933 	dsp->ds_dlpi_pending = 0;
1934 	cv_broadcast(&dsp->ds_dlpi_pending_cv);
1935 	mutex_exit(&dsp->ds_lock);
1936 }
1937 
1938 /*
1939  * Kernel thread to handle taskq dispatch failures in dld_wput_data. This
1940  * thread is started at boot time.
1941  */
1942 static void
1943 dld_taskq_dispatch(void)
1944 {
1945 	callb_cpr_t	cprinfo;
1946 	dld_str_t	*dsp;
1947 
1948 	CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1949 	    "dld_taskq_dispatch");
1950 	mutex_enter(&dld_taskq_lock);
1951 
1952 	while (!dld_taskq_quit) {
1953 		dsp = list_head(&dld_taskq_list);
1954 		while (dsp != NULL) {
1955 			list_remove(&dld_taskq_list, dsp);
1956 			mutex_exit(&dld_taskq_lock);
1957 			VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1958 			    dsp, TQ_SLEEP) != 0);
1959 			mutex_enter(&dld_taskq_lock);
1960 			dsp = list_head(&dld_taskq_list);
1961 		}
1962 
1963 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1964 		cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1965 		CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1966 	}
1967 
1968 	dld_taskq_done = B_TRUE;
1969 	cv_signal(&dld_taskq_cv);
1970 	CALLB_CPR_EXIT(&cprinfo);
1971 	thread_exit();
1972 }
1973 
1974 /*
1975  * All control operations are serialized on the 'dsp' and are also funneled
1976  * through a taskq mechanism to ensure that subsequent processing has kernel
1977  * context and can safely use cv_wait.
1978  *
1979  * Mechanisms to handle taskq dispatch failures
1980  *
1981  * The only way to be sure that taskq dispatch does not fail is to either
1982  * specify TQ_SLEEP or to use a static taskq and prepopulate it with
1983  * some number of entries and make sure that the number of outstanding requests
1984  * are less than that number. We can't use TQ_SLEEP since we don't know the
1985  * context. Nor can we bound the total number of 'dsp' end points. So we are
1986  * unable to use either of the above schemes, and are forced to deal with
1987  * taskq dispatch failures. Note that even dynamic taskq could fail in
1988  * dispatch if TQ_NOSLEEP is specified, since this flag is translated
1989  * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq
1990  * framework.
1991  *
1992  * We maintain a queue of 'dsp's that encountered taskq dispatch failure.
1993  * We also have a single global thread to retry the taskq dispatch. This
1994  * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but
1995  * uses TQ_SLEEP to ensure eventual success of the dispatch operation.
1996  */
1997 static void
1998 dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
1999 {
2000 	ASSERT(mp->b_next == NULL);
2001 	mutex_enter(&dsp->ds_lock);
2002 	if (dsp->ds_pending_head != NULL) {
2003 		ASSERT(dsp->ds_dlpi_pending);
2004 		dsp->ds_pending_tail->b_next = mp;
2005 		dsp->ds_pending_tail = mp;
2006 		mutex_exit(&dsp->ds_lock);
2007 		return;
2008 	}
2009 	ASSERT(dsp->ds_pending_tail == NULL);
2010 	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
2011 	/*
2012 	 * At this point if ds_dlpi_pending is set, it implies that the taskq
2013 	 * thread is still active and is processing the last message, though
2014 	 * the pending queue has been emptied.
2015 	 */
2016 	if (dsp->ds_dlpi_pending) {
2017 		mutex_exit(&dsp->ds_lock);
2018 		return;
2019 	}
2020 
2021 	dsp->ds_dlpi_pending = 1;
2022 	mutex_exit(&dsp->ds_lock);
2023 
2024 	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
2025 	    TQ_NOSLEEP) != 0)
2026 		return;
2027 
2028 	mutex_enter(&dld_taskq_lock);
2029 	list_insert_tail(&dld_taskq_list, dsp);
2030 	cv_signal(&dld_taskq_cv);
2031 	mutex_exit(&dld_taskq_lock);
2032 }
2033 
2034 /*
2035  * Process an M_IOCTL message.
2036  */
2037 static void
2038 dld_ioc(dld_str_t *dsp, mblk_t *mp)
2039 {
2040 	uint_t			cmd;
2041 
2042 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
2043 	ASSERT(dsp->ds_type == DLD_DLPI);
2044 
2045 	switch (cmd) {
2046 	case DLIOCNATIVE:
2047 		ioc_native(dsp, mp);
2048 		break;
2049 	case DLIOCMARGININFO:
2050 		ioc_margin(dsp, mp);
2051 		break;
2052 	case DLIOCRAW:
2053 		ioc_raw(dsp, mp);
2054 		break;
2055 	case DLIOCHDRINFO:
2056 		ioc_fast(dsp, mp);
2057 		break;
2058 	case DLIOCLOWLINK:
2059 		ioc_lowlink(dsp, mp);
2060 		break;
2061 	default:
2062 		ioc(dsp, mp);
2063 	}
2064 }
2065 
2066 /*
2067  * DLIOCNATIVE
2068  */
2069 static void
2070 ioc_native(dld_str_t *dsp, mblk_t *mp)
2071 {
2072 	queue_t *q = dsp->ds_wq;
2073 	const mac_info_t *mip = dsp->ds_mip;
2074 
2075 	/*
2076 	 * Native mode can be enabled if it's disabled and if the
2077 	 * native media type is different.
2078 	 */
2079 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
2080 		dsp->ds_native = B_TRUE;
2081 
2082 	if (dsp->ds_native)
2083 		miocack(q, mp, 0, mip->mi_nativemedia);
2084 	else
2085 		miocnak(q, mp, 0, ENOTSUP);
2086 }
2087 
2088 /*
2089  * DLIOCMARGININFO
2090  */
2091 static void
2092 ioc_margin(dld_str_t *dsp, mblk_t *mp)
2093 {
2094 	queue_t *q = dsp->ds_wq;
2095 	uint32_t margin;
2096 	int err;
2097 
2098 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2099 		err = EINVAL;
2100 		goto failed;
2101 	}
2102 	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
2103 		goto failed;
2104 
2105 	mac_margin_get(dsp->ds_mh, &margin);
2106 	*((uint32_t *)mp->b_cont->b_rptr) = margin;
2107 	miocack(q, mp, sizeof (uint32_t), 0);
2108 	return;
2109 
2110 failed:
2111 	miocnak(q, mp, 0, err);
2112 }
2113 
2114 /*
2115  * DLIOCRAW
2116  */
2117 static void
2118 ioc_raw(dld_str_t *dsp, mblk_t *mp)
2119 {
2120 	queue_t *q = dsp->ds_wq;
2121 	mac_perim_handle_t	mph;
2122 
2123 	if (dsp->ds_mh == NULL) {
2124 		dsp->ds_mode = DLD_RAW;
2125 		miocack(q, mp, 0, 0);
2126 		return;
2127 	}
2128 
2129 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2130 	if (dsp->ds_polling || dsp->ds_direct) {
2131 		mac_perim_exit(mph);
2132 		miocnak(q, mp, 0, EPROTO);
2133 		return;
2134 	}
2135 
2136 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
2137 		/*
2138 		 * Set the receive callback.
2139 		 */
2140 		dls_rx_set(dsp, dld_str_rx_raw, dsp);
2141 	}
2142 
2143 	/*
2144 	 * Note that raw mode is enabled.
2145 	 */
2146 	dsp->ds_mode = DLD_RAW;
2147 	mac_perim_exit(mph);
2148 
2149 	miocack(q, mp, 0, 0);
2150 }
2151 
2152 /*
2153  * DLIOCHDRINFO
2154  */
2155 static void
2156 ioc_fast(dld_str_t *dsp, mblk_t *mp)
2157 {
2158 	dl_unitdata_req_t *dlp;
2159 	off_t		off;
2160 	size_t		len;
2161 	const uint8_t	*addr;
2162 	uint16_t	sap;
2163 	mblk_t		*nmp;
2164 	mblk_t		*hmp;
2165 	uint_t		addr_length;
2166 	queue_t		*q = dsp->ds_wq;
2167 	int		err;
2168 	mac_perim_handle_t	mph;
2169 
2170 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
2171 		err = ENOTSUP;
2172 		goto failed;
2173 	}
2174 
2175 	/*
2176 	 * DLIOCHDRINFO should only come from IP. The one initiated from
2177 	 * user-land should not be allowed.
2178 	 */
2179 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
2180 		err = EINVAL;
2181 		goto failed;
2182 	}
2183 
2184 	nmp = mp->b_cont;
2185 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
2186 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
2187 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
2188 		err = EINVAL;
2189 		goto failed;
2190 	}
2191 
2192 	off = dlp->dl_dest_addr_offset;
2193 	len = dlp->dl_dest_addr_length;
2194 
2195 	if (!MBLKIN(nmp, off, len)) {
2196 		err = EINVAL;
2197 		goto failed;
2198 	}
2199 
2200 	if (dsp->ds_dlstate != DL_IDLE) {
2201 		err = ENOTSUP;
2202 		goto failed;
2203 	}
2204 
2205 	addr_length = dsp->ds_mip->mi_addr_length;
2206 	if (len != addr_length + sizeof (uint16_t)) {
2207 		err = EINVAL;
2208 		goto failed;
2209 	}
2210 
2211 	addr = nmp->b_rptr + off;
2212 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2213 
2214 	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
2215 		err = ENOMEM;
2216 		goto failed;
2217 	}
2218 
2219 	/*
2220 	 * This ioctl might happen concurrently with a direct call to dld_capab
2221 	 * that tries to enable direct and/or poll capabilities. Since the
2222 	 * stack does not serialize them, we do so here to avoid mixing
2223 	 * the callbacks.
2224 	 */
2225 	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
2226 	if (dsp->ds_mode != DLD_FASTPATH) {
2227 		/*
2228 		 * Set the receive callback (unless polling is enabled).
2229 		 */
2230 		if (!dsp->ds_polling && !dsp->ds_direct)
2231 			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);
2232 
2233 		/*
2234 		 * Note that fast-path mode is enabled.
2235 		 */
2236 		dsp->ds_mode = DLD_FASTPATH;
2237 	}
2238 	mac_perim_exit(mph);
2239 
2240 	freemsg(nmp->b_cont);
2241 	nmp->b_cont = hmp;
2242 
2243 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2244 	return;
2245 failed:
2246 	miocnak(q, mp, 0, err);
2247 }
2248 
2249 /*
2250  * DLIOCLOWLINK: request actual link state changes. When the
2251  * link is part of a bridge instance the client receives actual
2252  * link state changes and not the aggregate link status. Used by
2253  * the bridging daemon (bridged) for proper RSTP operation.
2254  */
2255 static void
2256 ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
2257 {
2258 	queue_t *q = dsp->ds_wq;
2259 	int err;
2260 
2261 	if ((err = miocpullup(mp, sizeof (int))) != 0) {
2262 		miocnak(q, mp, 0, err);
2263 	} else {
2264 		/* LINTED: alignment */
2265 		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
2266 		miocack(q, mp, 0, 0);
2267 	}
2268 }
2269 
2270 /*
2271  * Catch-all handler.
2272  */
2273 static void
2274 ioc(dld_str_t *dsp, mblk_t *mp)
2275 {
2276 	queue_t	*q = dsp->ds_wq;
2277 
2278 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2279 		miocnak(q, mp, 0, EINVAL);
2280 		return;
2281 	}
2282 	mac_ioctl(dsp->ds_mh, q, mp);
2283 }
2284