xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision 4a7ceb24cfcc0a97f96d86cfe5852ae445b50e57)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Data-Link Driver
30  */
31 
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/strsubr.h>
35 #include	<sys/atomic.h>
36 #include	<sys/mkdev.h>
37 #include	<sys/vlan.h>
38 #include	<sys/dld.h>
39 #include	<sys/dld_impl.h>
40 #include	<sys/dls_impl.h>
41 #include	<inet/common.h>
42 
43 static int	str_constructor(void *, void *, int);
44 static void	str_destructor(void *, void *);
45 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
46 static void	str_notify_promisc_on_phys(dld_str_t *);
47 static void	str_notify_promisc_off_phys(dld_str_t *);
48 static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
49 static void	str_notify_link_up(dld_str_t *);
50 static void	str_notify_link_down(dld_str_t *);
51 static void	str_notify_capab_reneg(dld_str_t *);
52 static void	str_notify_speed(dld_str_t *, uint32_t);
53 static void	str_notify(void *, mac_notify_type_t);
54 
55 static void	ioc_native(dld_str_t *,  mblk_t *);
56 static void	ioc_raw(dld_str_t *, mblk_t *);
57 static void	ioc_fast(dld_str_t *,  mblk_t *);
58 static void	ioc(dld_str_t *, mblk_t *);
59 static void	dld_ioc(dld_str_t *, mblk_t *);
60 static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
61 static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
62 static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);
63 
64 static uint32_t		str_count;
65 static kmem_cache_t	*str_cachep;
66 static uint32_t		minor_count;
67 static mod_hash_t	*str_hashp;
68 
69 #define	STR_HASHSZ		64
70 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
71 
72 /*
73  * Some notes on entry points, flow-control, queueing and locking:
74  *
75  * This driver exports the traditional STREAMS put entry point as well as
76  * the non-STREAMS fast-path transmit routine which is provided to IP via
77  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
78  * and data operations, while the fast-path routine deals only with M_DATA
79  * fast-path packets.  Regardless of the entry point, all outbound packets
80  * will end up in dld_tx_single(), where they will be delivered to the MAC
81  * driver.
82  *
83  * The transmit logic operates in two modes: a "not busy" mode where the
84  * packets will be delivered to the MAC for a send attempt, or "busy" mode
85  * where they will be enqueued in the internal queue because of flow-control.
86  * Flow-control happens when the MAC driver indicates the packets couldn't
87  * be transmitted due to lack of resources (e.g. running out of descriptors).
88  * In such case, the driver will place a dummy message on its write-side
89  * STREAMS queue so that the queue is marked as "full".  Any subsequent
90  * packets arriving at the driver will be enqueued in the internal queue,
91  * which is drained in the context of the service thread that gets scheduled
92  * whenever the driver is in the "busy" mode.  When all packets have been
93  * successfully delivered by MAC and the internal queue is empty, it will
94  * transition to the "not busy" mode by removing the dummy message from the
95  * write-side STREAMS queue; in effect this will trigger backenabling.
96  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
97  * to the above reasons.
98  *
99  * The driver implements an internal transmit queue independent of STREAMS.
100  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
101  * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
102  * getq() operations done by the driver are those related to placing and
103  * removing the dummy message to/from the write-side STREAMS queue for flow-
104  * control purposes.
105  *
106  * Locking is done independent of STREAMS due to the driver being fully MT.
107  * Threads entering the driver (either from put or service entry points)
108  * will most likely be readers, with the exception of a few writer cases
109  * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
110  * DLD-related ioctl requests.  The DLPI detach case is special, because
111  * it involves freeing resources and therefore must be single-threaded.
112  * Unfortunately the readers/writers lock can't be used to protect against
113  * it, because the lock is dropped prior to the driver calling places where
114  * putnext() may be invoked, and such places may depend on those resources
115  * to exist.  Because of this, the driver always completes the DLPI detach
116  * process when there are no other threads running in the driver.  This is
117  * done by keeping track of the number of threads, such that the last
118  * thread leaving the driver will finish the pending DLPI detach operation.
119  */
120 
121 /*
122  * dld_max_q_count is the queue depth threshold used to limit the number of
123  * outstanding packets or bytes allowed in the queue; once this limit is
124  * reached the driver will free any incoming ones until the queue depth
125  * drops below the threshold.
126  *
127  * This buffering is provided to accomodate clients which do not employ
128  * their own buffering scheme, and to handle occasional packet bursts.
129  * Clients which handle their own buffering will receive positive feedback
130  * from this driver as soon as it transitions into the "busy" state, i.e.
131  * when the queue is initially filled up; they will get backenabled once
132  * the queue is empty.
133  *
134  * The value chosen here is rather arbitrary; in future some intelligent
135  * heuristics may be involved which could take into account the hardware's
136  * transmit ring size, etc.
137  */
138 uint_t dld_max_q_count = (16 * 1024 *1024);
139 
140 /*
141  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
142  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
143  * match dev_t. If a stream is found and it is attached, its dev_info_t *
144  * is returned.
145  */
146 typedef struct i_dld_str_state_s {
147 	major_t		ds_major;
148 	minor_t		ds_minor;
149 	dev_info_t	*ds_dip;
150 } i_dld_str_state_t;
151 
152 /* ARGSUSED */
153 static uint_t
154 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
155 {
156 	i_dld_str_state_t	*statep = arg;
157 	dld_str_t		*dsp = (dld_str_t *)val;
158 
159 	if (statep->ds_major != dsp->ds_major)
160 		return (MH_WALK_CONTINUE);
161 
162 	ASSERT(statep->ds_minor != 0);
163 
164 	/*
165 	 * Access to ds_ppa and ds_mh need to be protected by ds_lock.
166 	 */
167 	rw_enter(&dsp->ds_lock, RW_READER);
168 	if (statep->ds_minor <= DLD_MAX_MINOR) {
169 		/*
170 		 * Style 1: minor can be derived from the ppa. we
171 		 * continue to walk until we find a matching stream
172 		 * in attached state.
173 		 */
174 		if (statep->ds_minor == DLS_PPA2MINOR(dsp->ds_ppa) &&
175 		    dsp->ds_mh != NULL) {
176 			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
177 			rw_exit(&dsp->ds_lock);
178 			return (MH_WALK_TERMINATE);
179 		}
180 	} else {
181 		/*
182 		 * Clone: a clone minor is unique. we can terminate the
183 		 * walk if we find a matching stream -- even if we fail
184 		 * to obtain the devinfo.
185 		 */
186 		if (statep->ds_minor == dsp->ds_minor) {
187 			if (dsp->ds_mh != NULL)
188 				statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
189 			rw_exit(&dsp->ds_lock);
190 			return (MH_WALK_TERMINATE);
191 		}
192 	}
193 	rw_exit(&dsp->ds_lock);
194 	return (MH_WALK_CONTINUE);
195 }
196 
197 static dev_info_t *
198 dld_finddevinfo(dev_t dev)
199 {
200 	i_dld_str_state_t	state;
201 
202 	state.ds_minor = getminor(dev);
203 	state.ds_major = getmajor(dev);
204 	state.ds_dip = NULL;
205 
206 	if (state.ds_minor == 0)
207 		return (NULL);
208 
209 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
210 	if (state.ds_dip != NULL || state.ds_minor <= DLD_MAX_MINOR)
211 		return (state.ds_dip);
212 
213 	/* See if it's a minor node of a VLAN */
214 	return (dls_finddevinfo(dev));
215 }
216 
217 /*
218  * devo_getinfo: getinfo(9e)
219  */
220 /*ARGSUSED*/
221 int
222 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
223 {
224 	dev_info_t	*devinfo;
225 	minor_t		minor = getminor((dev_t)arg);
226 	int		rc = DDI_FAILURE;
227 
228 	switch (cmd) {
229 	case DDI_INFO_DEVT2DEVINFO:
230 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
231 			*(dev_info_t **)resp = devinfo;
232 			rc = DDI_SUCCESS;
233 		}
234 		break;
235 	case DDI_INFO_DEVT2INSTANCE:
236 		if (minor > 0 && minor <= DLD_MAX_MINOR) {
237 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
238 			rc = DDI_SUCCESS;
239 		} else if (minor > DLD_MAX_MINOR &&
240 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
241 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
242 			rc = DDI_SUCCESS;
243 		}
244 		break;
245 	}
246 	return (rc);
247 }
248 
249 /*
250  * qi_qopen: open(9e)
251  */
252 /*ARGSUSED*/
253 int
254 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
255 {
256 	dld_str_t	*dsp;
257 	major_t		major;
258 	minor_t		minor;
259 	int		err;
260 
261 	if (sflag == MODOPEN)
262 		return (ENOTSUP);
263 
264 	/*
265 	 * This is a cloning driver and therefore each queue should only
266 	 * ever get opened once.
267 	 */
268 	if (rq->q_ptr != NULL)
269 		return (EBUSY);
270 
271 	major = getmajor(*devp);
272 	minor = getminor(*devp);
273 
274 	/*
275 	 * Create a new dld_str_t for the stream. This will grab a new minor
276 	 * number that will be handed back in the cloned dev_t.  Creation may
277 	 * fail if we can't allocate the dummy mblk used for flow-control.
278 	 */
279 	dsp = dld_str_create(rq, DLD_DLPI, major,
280 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
281 	if (dsp == NULL)
282 		return (ENOSR);
283 
284 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
285 	if (minor != 0) {
286 		/*
287 		 * Style 1 open
288 		 */
289 		t_uscalar_t ppa;
290 
291 		if ((err = dls_ppa_from_minor(minor, &ppa)) != 0)
292 			goto failed;
293 
294 		if ((err = dld_str_attach(dsp, ppa)) != 0)
295 			goto failed;
296 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
297 	} else {
298 		(void) qassociate(rq, -1);
299 	}
300 
301 	/*
302 	 * Enable the queue srv(9e) routine.
303 	 */
304 	qprocson(rq);
305 
306 	/*
307 	 * Construct a cloned dev_t to hand back.
308 	 */
309 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
310 	return (0);
311 
312 failed:
313 	dld_str_destroy(dsp);
314 	return (err);
315 }
316 
317 /*
318  * qi_qclose: close(9e)
319  */
320 int
321 dld_close(queue_t *rq)
322 {
323 	dld_str_t	*dsp = rq->q_ptr;
324 
325 	/*
326 	 * Wait until pending requests are processed.
327 	 */
328 	mutex_enter(&dsp->ds_thr_lock);
329 	while (dsp->ds_pending_cnt > 0)
330 		cv_wait(&dsp->ds_pending_cv, &dsp->ds_thr_lock);
331 	mutex_exit(&dsp->ds_thr_lock);
332 
333 	/*
334 	 * Disable the queue srv(9e) routine.
335 	 */
336 	qprocsoff(rq);
337 
338 	/*
339 	 * At this point we can not be entered by any threads via STREAMS
340 	 * or the direct call interface, which is available only to IP.
341 	 * After the interface is unplumbed, IP wouldn't have any reference
342 	 * to this instance, and therefore we are now effectively single
343 	 * threaded and don't require any lock protection.  Flush all
344 	 * pending packets which are sitting in the transmit queue.
345 	 */
346 	ASSERT(dsp->ds_thr == 0);
347 	dld_tx_flush(dsp);
348 
349 	/*
350 	 * This stream was open to a provider node. Check to see
351 	 * if it has been cleanly shut down.
352 	 */
353 	if (dsp->ds_dlstate != DL_UNATTACHED) {
354 		/*
355 		 * The stream is either open to a style 1 provider or
356 		 * this is not clean shutdown. Detach from the PPA.
357 		 * (This is still ok even in the style 1 case).
358 		 */
359 		dld_str_detach(dsp);
360 	}
361 
362 	dld_str_destroy(dsp);
363 	return (0);
364 }
365 
366 /*
367  * qi_qputp: put(9e)
368  */
369 void
370 dld_wput(queue_t *wq, mblk_t *mp)
371 {
372 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
373 
374 	DLD_ENTER(dsp);
375 
376 	switch (DB_TYPE(mp)) {
377 	case M_DATA:
378 		/*
379 		 * State is held constant by the DLD_ENTER done above
380 		 * until all sending threads are done. Mode can change
381 		 * due to ioctl, however locks must not be held across
382 		 * calls to putnext(), which can be called from here
383 		 * via dld_tx_single().
384 		 */
385 		rw_enter(&dsp->ds_lock, RW_READER);
386 		if (dsp->ds_dlstate != DL_IDLE ||
387 		    dsp->ds_mode == DLD_UNITDATA) {
388 			rw_exit(&dsp->ds_lock);
389 			freemsg(mp);
390 		} else if (dsp->ds_mode == DLD_FASTPATH) {
391 			rw_exit(&dsp->ds_lock);
392 			str_mdata_fastpath_put(dsp, mp);
393 		} else if (dsp->ds_mode == DLD_RAW) {
394 			rw_exit(&dsp->ds_lock);
395 			str_mdata_raw_put(dsp, mp);
396 		}
397 		break;
398 	case M_PROTO:
399 	case M_PCPROTO:
400 		dld_proto(dsp, mp);
401 		break;
402 	case M_IOCTL:
403 		dld_ioc(dsp, mp);
404 		break;
405 	case M_FLUSH:
406 		if (*mp->b_rptr & FLUSHW) {
407 			dld_tx_flush(dsp);
408 			*mp->b_rptr &= ~FLUSHW;
409 		}
410 
411 		if (*mp->b_rptr & FLUSHR) {
412 			qreply(wq, mp);
413 		} else {
414 			freemsg(mp);
415 		}
416 		break;
417 	default:
418 		freemsg(mp);
419 		break;
420 	}
421 
422 	DLD_EXIT(dsp);
423 }
424 
425 /*
426  * qi_srvp: srv(9e)
427  */
428 void
429 dld_wsrv(queue_t *wq)
430 {
431 	mblk_t		*mp;
432 	dld_str_t	*dsp = wq->q_ptr;
433 
434 	DLD_ENTER(dsp);
435 	rw_enter(&dsp->ds_lock, RW_READER);
436 	/*
437 	 * Grab all packets (chained via b_next) off our transmit queue
438 	 * and try to send them all to the MAC layer.  Since the queue
439 	 * is independent of streams, we are able to dequeue all messages
440 	 * at once without looping through getq() and manually chaining
441 	 * them.  Note that the queue size parameters (byte and message
442 	 * counts) are cleared as well, but we postpone the backenabling
443 	 * until after the MAC transmit since some packets may end up
444 	 * back at our transmit queue.
445 	 */
446 	mutex_enter(&dsp->ds_tx_list_lock);
447 	if ((mp = dsp->ds_tx_list_head) == NULL) {
448 		ASSERT(!dsp->ds_tx_qbusy);
449 		ASSERT(dsp->ds_tx_flow_mp != NULL);
450 		ASSERT(dsp->ds_tx_list_head == NULL);
451 		ASSERT(dsp->ds_tx_list_tail == NULL);
452 		ASSERT(dsp->ds_tx_cnt == 0);
453 		ASSERT(dsp->ds_tx_msgcnt == 0);
454 		mutex_exit(&dsp->ds_tx_list_lock);
455 		rw_exit(&dsp->ds_lock);
456 		DLD_EXIT(dsp);
457 		return;
458 	}
459 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
460 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
461 	mutex_exit(&dsp->ds_tx_list_lock);
462 
463 	/*
464 	 * Discard packets unless we are attached and bound; note that
465 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
466 	 * because regardless of the mode all transmit will end up in
467 	 * dld_tx_single() where the packets may be queued.
468 	 */
469 	ASSERT(DB_TYPE(mp) == M_DATA);
470 	if (dsp->ds_dlstate != DL_IDLE) {
471 		freemsgchain(mp);
472 		goto done;
473 	}
474 
475 	/*
476 	 * Attempt to transmit one or more packets.  If the MAC can't
477 	 * send them all, re-queue the packet(s) at the beginning of
478 	 * the transmit queue to avoid any re-ordering.
479 	 */
480 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
481 		dld_tx_enqueue(dsp, mp, B_TRUE);
482 
483 done:
484 	/*
485 	 * Grab the list lock again and check if the transmit queue is
486 	 * really empty; if so, lift up flow-control and backenable any
487 	 * writer queues.  If the queue is not empty, schedule service
488 	 * thread to drain it.
489 	 */
490 	mutex_enter(&dsp->ds_tx_list_lock);
491 	if (dsp->ds_tx_list_head == NULL) {
492 		dsp->ds_tx_flow_mp = getq(wq);
493 		ASSERT(dsp->ds_tx_flow_mp != NULL);
494 		dsp->ds_tx_qbusy = B_FALSE;
495 	}
496 	mutex_exit(&dsp->ds_tx_list_lock);
497 
498 	rw_exit(&dsp->ds_lock);
499 	DLD_EXIT(dsp);
500 }
501 
502 void
503 dld_init_ops(struct dev_ops *ops, const char *name)
504 {
505 	struct streamtab *stream;
506 	struct qinit *rq, *wq;
507 	struct module_info *modinfo;
508 
509 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
510 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
511 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
512 	modinfo->mi_minpsz = 0;
513 	modinfo->mi_maxpsz = 64*1024;
514 	modinfo->mi_hiwat  = 1;
515 	modinfo->mi_lowat = 0;
516 
517 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
518 	rq->qi_qopen = dld_open;
519 	rq->qi_qclose = dld_close;
520 	rq->qi_minfo = modinfo;
521 
522 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
523 	wq->qi_putp = (pfi_t)dld_wput;
524 	wq->qi_srvp = (pfi_t)dld_wsrv;
525 	wq->qi_minfo = modinfo;
526 
527 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
528 	stream->st_rdinit = rq;
529 	stream->st_wrinit = wq;
530 	ops->devo_cb_ops->cb_str = stream;
531 
532 	ops->devo_getinfo = &dld_getinfo;
533 }
534 
535 void
536 dld_fini_ops(struct dev_ops *ops)
537 {
538 	struct streamtab *stream;
539 	struct qinit *rq, *wq;
540 	struct module_info *modinfo;
541 
542 	stream = ops->devo_cb_ops->cb_str;
543 	rq = stream->st_rdinit;
544 	wq = stream->st_wrinit;
545 	modinfo = rq->qi_minfo;
546 	ASSERT(wq->qi_minfo == modinfo);
547 
548 	kmem_free(stream, sizeof (struct streamtab));
549 	kmem_free(wq, sizeof (struct qinit));
550 	kmem_free(rq, sizeof (struct qinit));
551 	kmem_free(modinfo->mi_idname, FMNAMESZ);
552 	kmem_free(modinfo, sizeof (struct module_info));
553 }
554 
555 /*
556  * Initialize this module's data structures.
557  */
558 void
559 dld_str_init(void)
560 {
561 	/*
562 	 * Create dld_str_t object cache.
563 	 */
564 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
565 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
566 	ASSERT(str_cachep != NULL);
567 
568 	/*
569 	 * Create a hash table for maintaining dld_str_t's.
570 	 * The ds_minor field (the clone minor number) of a dld_str_t
571 	 * is used as a key for this hash table because this number is
572 	 * globally unique (allocated from "dls_minor_arena").
573 	 */
574 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
575 	    mod_hash_null_valdtor);
576 }
577 
578 /*
579  * Tear down this module's data structures.
580  */
581 int
582 dld_str_fini(void)
583 {
584 	/*
585 	 * Make sure that there are no objects in use.
586 	 */
587 	if (str_count != 0)
588 		return (EBUSY);
589 
590 	/*
591 	 * Check to see if there are any minor numbers still in use.
592 	 */
593 	if (minor_count != 0)
594 		return (EBUSY);
595 
596 	/*
597 	 * Destroy object cache.
598 	 */
599 	kmem_cache_destroy(str_cachep);
600 	mod_hash_destroy_idhash(str_hashp);
601 	return (0);
602 }
603 
604 /*
605  * Create a new dld_str_t object.
606  */
607 dld_str_t *
608 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
609 {
610 	dld_str_t	*dsp;
611 	int		err;
612 
613 	/*
614 	 * Allocate an object from the cache.
615 	 */
616 	atomic_add_32(&str_count, 1);
617 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
618 
619 	/*
620 	 * Allocate the dummy mblk for flow-control.
621 	 */
622 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
623 	if (dsp->ds_tx_flow_mp == NULL) {
624 		kmem_cache_free(str_cachep, dsp);
625 		atomic_add_32(&str_count, -1);
626 		return (NULL);
627 	}
628 	dsp->ds_type = type;
629 	dsp->ds_major = major;
630 	dsp->ds_style = style;
631 
632 	/*
633 	 * Initialize the queue pointers.
634 	 */
635 	ASSERT(RD(rq) == rq);
636 	dsp->ds_rq = rq;
637 	dsp->ds_wq = WR(rq);
638 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
639 
640 	/*
641 	 * We want explicit control over our write-side STREAMS queue
642 	 * where the dummy mblk gets added/removed for flow-control.
643 	 */
644 	noenable(WR(rq));
645 
646 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
647 	    (mod_hash_val_t)dsp);
648 	ASSERT(err == 0);
649 	return (dsp);
650 }
651 
652 /*
653  * Destroy a dld_str_t object.
654  */
655 void
656 dld_str_destroy(dld_str_t *dsp)
657 {
658 	queue_t		*rq;
659 	queue_t		*wq;
660 	mod_hash_val_t	val;
661 	/*
662 	 * Clear the queue pointers.
663 	 */
664 	rq = dsp->ds_rq;
665 	wq = dsp->ds_wq;
666 	ASSERT(wq == WR(rq));
667 
668 	rq->q_ptr = wq->q_ptr = NULL;
669 	dsp->ds_rq = dsp->ds_wq = NULL;
670 
671 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
672 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
673 	ASSERT(dsp->ds_tx_list_head == NULL);
674 	ASSERT(dsp->ds_tx_list_tail == NULL);
675 	ASSERT(dsp->ds_tx_cnt == 0);
676 	ASSERT(dsp->ds_tx_msgcnt == 0);
677 	ASSERT(!dsp->ds_tx_qbusy);
678 
679 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
680 	ASSERT(dsp->ds_thr == 0);
681 	ASSERT(dsp->ds_pending_req == NULL);
682 
683 	/*
684 	 * Reinitialize all the flags.
685 	 */
686 	dsp->ds_notifications = 0;
687 	dsp->ds_passivestate = DLD_UNINITIALIZED;
688 	dsp->ds_mode = DLD_UNITDATA;
689 	dsp->ds_native = B_FALSE;
690 
691 	/*
692 	 * Free the dummy mblk if exists.
693 	 */
694 	if (dsp->ds_tx_flow_mp != NULL) {
695 		freeb(dsp->ds_tx_flow_mp);
696 		dsp->ds_tx_flow_mp = NULL;
697 	}
698 
699 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
700 	ASSERT(dsp == (dld_str_t *)val);
701 
702 	/*
703 	 * Free the object back to the cache.
704 	 */
705 	kmem_cache_free(str_cachep, dsp);
706 	atomic_add_32(&str_count, -1);
707 }
708 
709 /*
710  * kmem_cache contructor function: see kmem_cache_create(9f).
711  */
712 /*ARGSUSED*/
713 static int
714 str_constructor(void *buf, void *cdrarg, int kmflags)
715 {
716 	dld_str_t	*dsp = buf;
717 
718 	bzero(buf, sizeof (dld_str_t));
719 
720 	/*
721 	 * Allocate a new minor number.
722 	 */
723 	atomic_add_32(&minor_count, 1);
724 	if ((dsp->ds_minor = dls_minor_hold(kmflags == KM_SLEEP)) == 0) {
725 		atomic_add_32(&minor_count, -1);
726 		return (-1);
727 	}
728 
729 	/*
730 	 * Initialize the DLPI state machine.
731 	 */
732 	dsp->ds_dlstate = DL_UNATTACHED;
733 	dsp->ds_ppa = (t_uscalar_t)-1;
734 
735 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
736 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
737 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
738 	cv_init(&dsp->ds_pending_cv, NULL, CV_DRIVER, NULL);
739 
740 	return (0);
741 }
742 
743 /*
744  * kmem_cache destructor function.
745  */
746 /*ARGSUSED*/
747 static void
748 str_destructor(void *buf, void *cdrarg)
749 {
750 	dld_str_t	*dsp = buf;
751 
752 	/*
753 	 * Make sure the DLPI state machine was reset.
754 	 */
755 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
756 
757 	/*
758 	 * Make sure the data-link interface was closed.
759 	 */
760 	ASSERT(dsp->ds_mh == NULL);
761 	ASSERT(dsp->ds_dc == NULL);
762 
763 	/*
764 	 * Make sure enabled notifications are cleared.
765 	 */
766 	ASSERT(dsp->ds_notifications == 0);
767 
768 	/*
769 	 * Make sure polling is disabled.
770 	 */
771 	ASSERT(!dsp->ds_polling);
772 
773 	/*
774 	 * Release the minor number.
775 	 */
776 	dls_minor_rele(dsp->ds_minor);
777 	atomic_add_32(&minor_count, -1);
778 
779 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
780 	rw_destroy(&dsp->ds_lock);
781 
782 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
783 	mutex_destroy(&dsp->ds_tx_list_lock);
784 	ASSERT(dsp->ds_tx_flow_mp == NULL);
785 
786 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
787 	mutex_destroy(&dsp->ds_thr_lock);
788 	ASSERT(dsp->ds_pending_req == NULL);
789 	ASSERT(dsp->ds_pending_op == NULL);
790 	ASSERT(dsp->ds_pending_cnt == 0);
791 	cv_destroy(&dsp->ds_pending_cv);
792 }
793 
794 /*
795  * M_DATA put. Note that mp is a single message, not a chained message.
796  */
797 void
798 dld_tx_single(dld_str_t *dsp, mblk_t *mp)
799 {
800 	/*
801 	 * This function can be called from within dld or from an upper
802 	 * layer protocol (currently only tcp). If we are in the busy
803 	 * mode enqueue the packet(s) and return.  Otherwise hand them
804 	 * over to the MAC driver for transmission; any remaining one(s)
805 	 * which didn't get sent will be queued.
806 	 *
807 	 * Note here that we don't grab the list lock prior to checking
808 	 * the busy flag.  This is okay, because a missed transition
809 	 * will not cause any packet reordering for any particular TCP
810 	 * connection (which is single-threaded).  The enqueue routine
811 	 * will atomically set the busy flag and schedule the service
812 	 * thread to run; the flag is only cleared by the service thread
813 	 * when there is no more packet to be transmitted.
814 	 */
815 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
816 		dld_tx_enqueue(dsp, mp, B_FALSE);
817 }
818 
819 /*
820  * Update the priority bits and VID (may need to insert tag if mp points
821  * to an untagged packet.
822  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
823  */
824 static mblk_t *
825 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
826 {
827 	mblk_t *hmp;
828 	struct ether_vlan_header *evhp;
829 	struct ether_header *ehp;
830 	uint16_t old_tci = 0;
831 	size_t len;
832 
833 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
834 
835 	evhp = (struct ether_vlan_header *)mp->b_rptr;
836 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
837 		/*
838 		 * Tagged packet, update the priority bits.
839 		 */
840 		old_tci = ntohs(evhp->ether_tci);
841 		len = sizeof (struct ether_vlan_header);
842 
843 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
844 			/*
845 			 * In case some drivers only check the db_ref
846 			 * count of the first mblk, we pullup the
847 			 * message into a single mblk.
848 			 */
849 			hmp = msgpullup(mp, -1);
850 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
851 				freemsg(hmp);
852 				return (NULL);
853 			} else {
854 				freemsg(mp);
855 				mp = hmp;
856 			}
857 		}
858 
859 		evhp = (struct ether_vlan_header *)mp->b_rptr;
860 	} else {
861 		/*
862 		 * Untagged packet. Insert the special priority tag.
863 		 * First allocate a header mblk.
864 		 */
865 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
866 		if (hmp == NULL)
867 			return (NULL);
868 
869 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
870 		ehp = (struct ether_header *)mp->b_rptr;
871 
872 		/*
873 		 * Copy the MAC addresses and typelen
874 		 */
875 		bcopy(ehp, evhp, (ETHERADDRL * 2));
876 		evhp->ether_type = ehp->ether_type;
877 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
878 
879 		hmp->b_wptr += sizeof (struct ether_vlan_header);
880 		mp->b_rptr += sizeof (struct ether_header);
881 
882 		/*
883 		 * Free the original message if it's now empty. Link the
884 		 * rest of messages to the header message.
885 		 */
886 		if (MBLKL(mp) == 0) {
887 			hmp->b_cont = mp->b_cont;
888 			freeb(mp);
889 		} else {
890 			hmp->b_cont = mp;
891 		}
892 		mp = hmp;
893 	}
894 
895 	if (pri == 0)
896 		pri = VLAN_PRI(old_tci);
897 	if (vid == VLAN_ID_NONE)
898 		vid = VLAN_ID(old_tci);
899 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
900 	return (mp);
901 }
902 
903 /*
904  * M_DATA put (IP fast-path mode)
905  */
906 void
907 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
908 {
909 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
910 	mblk_t *newmp;
911 	uint_t pri;
912 
913 	if (is_ethernet) {
914 		/*
915 		 * Update the priority bits to the assigned priority.
916 		 */
917 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
918 
919 		if (pri != 0) {
920 			newmp = i_dld_ether_header_update_tag(mp, pri,
921 			    VLAN_ID_NONE);
922 			if (newmp == NULL)
923 				goto discard;
924 			mp = newmp;
925 		}
926 	}
927 
928 	dld_tx_single(dsp, mp);
929 	return;
930 
931 discard:
932 	/* TODO: bump kstat? */
933 	freemsg(mp);
934 }
935 
936 /*
937  * M_DATA put (DLIOCRAW mode)
938  */
939 static void
940 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
941 {
942 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
943 	mblk_t *bp, *newmp;
944 	size_t size;
945 	mac_header_info_t mhi;
946 	uint_t pri, vid;
947 
948 	/*
949 	 * Certain MAC type plugins provide an illusion for raw DLPI
950 	 * consumers.  They pretend that the MAC layer is something that
951 	 * it's not for the benefit of observability tools.  For example,
952 	 * mac_wifi pretends that it's Ethernet for such consumers.
953 	 * Here, unless native mode is enabled, we call into the MAC layer so
954 	 * that this illusion can be maintained.  The plugin will optionally
955 	 * transform the MAC header here into something that can be passed
956 	 * down.  The header goes from raw mode to "cooked" mode.
957 	 */
958 	if (!dsp->ds_native) {
959 		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
960 			goto discard;
961 		mp = newmp;
962 	}
963 
964 	size = MBLKL(mp);
965 
966 	/*
967 	 * Check the packet is not too big and that any remaining
968 	 * fragment list is composed entirely of M_DATA messages. (We
969 	 * know the first fragment was M_DATA otherwise we could not
970 	 * have got here).
971 	 */
972 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
973 		if (DB_TYPE(bp) != M_DATA)
974 			goto discard;
975 		size += MBLKL(bp);
976 	}
977 
978 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
979 		goto discard;
980 
981 	/*
982 	 * If LSO is enabled, check the size against lso_max. Otherwise,
983 	 * compare the packet size with sdu_max.
984 	 */
985 	if (size > (dsp->ds_lso ? dsp->ds_lso_max : dsp->ds_mip->mi_sdu_max)
986 	    + mhi.mhi_hdrsize)
987 		goto discard;
988 
989 	if (is_ethernet) {
990 		/*
991 		 * Discard the packet if this is a VLAN stream but the VID in
992 		 * the packet is not correct.
993 		 */
994 		vid = VLAN_ID(mhi.mhi_tci);
995 		if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
996 			goto discard;
997 
998 		/*
999 		 * Discard the packet if this packet is a tagged packet
1000 		 * but both pri and VID are 0.
1001 		 */
1002 		pri = VLAN_PRI(mhi.mhi_tci);
1003 		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
1004 			goto discard;
1005 
1006 		/*
1007 		 * Update the priority bits to the per-stream priority if
1008 		 * priority is not set in the packet. Update the VID for
1009 		 * packets on a VLAN stream.
1010 		 */
1011 		pri = (pri == 0) ? dsp->ds_pri : 0;
1012 		if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
1013 			if ((newmp = i_dld_ether_header_update_tag(mp,
1014 			    pri, dsp->ds_vid)) == NULL) {
1015 				goto discard;
1016 			}
1017 			mp = newmp;
1018 		}
1019 	}
1020 
1021 	dld_tx_single(dsp, mp);
1022 	return;
1023 
1024 discard:
1025 	/* TODO: bump kstat? */
1026 	freemsg(mp);
1027 }
1028 
1029 /*
1030  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
1031  */
1032 int
1033 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1034 {
1035 	int			err;
1036 	const char		*drvname;
1037 	char			name[MAXNAMELEN];
1038 	dls_channel_t		dc;
1039 	uint_t			addr_length;
1040 
1041 	ASSERT(dsp->ds_dc == NULL);
1042 
1043 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1044 		return (EINVAL);
1045 
1046 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
1047 
1048 	if (strcmp(drvname, "aggr") != 0 && strcmp(drvname, "vnic") != 0 &&
1049 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1050 		return (EINVAL);
1051 
1052 	/*
1053 	 * Open a channel.
1054 	 */
1055 	if ((err = dls_open(name, &dc)) != 0) {
1056 		(void) qassociate(dsp->ds_wq, -1);
1057 		return (err);
1058 	}
1059 
1060 	/*
1061 	 * Cache the MAC interface handle, a pointer to the immutable MAC
1062 	 * information and the current and 'factory' MAC address.
1063 	 */
1064 	dsp->ds_mh = dls_mac(dc);
1065 	dsp->ds_mip = mac_info(dsp->ds_mh);
1066 
1067 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1068 
1069 	addr_length = dsp->ds_mip->mi_addr_length;
1070 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
1071 
1072 	/*
1073 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
1074 	 * a non-VLAN interface).
1075 	 */
1076 	dsp->ds_vid = dls_vid(dc);
1077 
1078 	/*
1079 	 * Set the default packet priority.
1080 	 */
1081 	dsp->ds_pri = 0;
1082 
1083 	/*
1084 	 * Add a notify function so that the we get updates from the MAC.
1085 	 */
1086 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
1087 
1088 	dsp->ds_ppa = ppa;
1089 	dsp->ds_dc = dc;
1090 	dsp->ds_dlstate = DL_UNBOUND;
1091 
1092 	return (0);
1093 }
1094 
1095 /*
1096  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1097  * from close(2) for style 2.
1098  */
1099 void
1100 dld_str_detach(dld_str_t *dsp)
1101 {
1102 	ASSERT(dsp->ds_thr == 0);
1103 
1104 	/*
1105 	 * Remove the notify function.
1106 	 */
1107 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
1108 
1109 	/*
1110 	 * Disable the capabilities and clear the promisc flag.
1111 	 */
1112 	ASSERT(!dsp->ds_polling);
1113 	ASSERT(!dsp->ds_soft_ring);
1114 	dld_capabilities_disable(dsp);
1115 	dsp->ds_promisc = 0;
1116 
1117 	/*
1118 	 * Clear LSO flags.
1119 	 */
1120 	dsp->ds_lso = B_FALSE;
1121 	dsp->ds_lso_max = 0;
1122 
1123 	/*
1124 	 * Close the channel.
1125 	 */
1126 	dls_close(dsp->ds_dc);
1127 	dsp->ds_ppa = (t_uscalar_t)-1;
1128 	dsp->ds_dc = NULL;
1129 	dsp->ds_mh = NULL;
1130 
1131 	(void) qassociate(dsp->ds_wq, -1);
1132 
1133 	/*
1134 	 * Re-initialize the DLPI state machine.
1135 	 */
1136 	dsp->ds_dlstate = DL_UNATTACHED;
1137 
1138 }
1139 
1140 /*
1141  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1142  * tags before sending packets up to the DLS clients, with the exception of
1143  * special priority tagged packets, in that case, we set the VID to 0.
1144  * mp must be a VLAN tagged packet.
1145  */
1146 static mblk_t *
1147 i_dld_ether_header_strip_tag(mblk_t *mp)
1148 {
1149 	mblk_t *newmp;
1150 	struct ether_vlan_header *evhp;
1151 	uint16_t tci, new_tci;
1152 
1153 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1154 	if (DB_REF(mp) > 1) {
1155 		newmp = copymsg(mp);
1156 		if (newmp == NULL)
1157 			return (NULL);
1158 		freemsg(mp);
1159 		mp = newmp;
1160 	}
1161 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1162 
1163 	tci = ntohs(evhp->ether_tci);
1164 	if (VLAN_PRI(tci) == 0) {
1165 		/*
1166 		 * Priority is 0, strip the tag.
1167 		 */
1168 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1169 		mp->b_rptr += VLAN_TAGSZ;
1170 	} else {
1171 		/*
1172 		 * Priority is not 0, update the VID to 0.
1173 		 */
1174 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1175 		evhp->ether_tci = htons(new_tci);
1176 	}
1177 	return (mp);
1178 }
1179 
1180 /*
1181  * Raw mode receive function.
1182  */
1183 /*ARGSUSED*/
1184 void
1185 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1186     mac_header_info_t *mhip)
1187 {
1188 	dld_str_t *dsp = (dld_str_t *)arg;
1189 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1190 	mblk_t *next, *newmp;
1191 
1192 	ASSERT(mp != NULL);
1193 	do {
1194 		/*
1195 		 * Get the pointer to the next packet in the chain and then
1196 		 * clear b_next before the packet gets passed on.
1197 		 */
1198 		next = mp->b_next;
1199 		mp->b_next = NULL;
1200 
1201 		/*
1202 		 * Wind back b_rptr to point at the MAC header.
1203 		 */
1204 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1205 		mp->b_rptr -= mhip->mhi_hdrsize;
1206 
1207 		/*
1208 		 * Certain MAC type plugins provide an illusion for raw
1209 		 * DLPI consumers.  They pretend that the MAC layer is
1210 		 * something that it's not for the benefit of observability
1211 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1212 		 * for such consumers.	Here, unless native mode is enabled,
1213 		 * we call into the MAC layer so that this illusion can be
1214 		 * maintained.	The plugin will optionally transform the MAC
1215 		 * header here into something that can be passed up to raw
1216 		 * consumers.  The header goes from "cooked" mode to raw mode.
1217 		 */
1218 		if (!dsp->ds_native) {
1219 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1220 			if (newmp == NULL) {
1221 				freemsg(mp);
1222 				goto next;
1223 			}
1224 			mp = newmp;
1225 		}
1226 
1227 		/*
1228 		 * Strip the VLAN tag for VLAN streams.
1229 		 */
1230 		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
1231 			newmp = i_dld_ether_header_strip_tag(mp);
1232 			if (newmp == NULL) {
1233 				freemsg(mp);
1234 				goto next;
1235 			}
1236 			mp = newmp;
1237 		}
1238 
1239 		/*
1240 		 * Pass the packet on.
1241 		 */
1242 		if (canputnext(dsp->ds_rq))
1243 			putnext(dsp->ds_rq, mp);
1244 		else
1245 			freemsg(mp);
1246 
1247 next:
1248 		/*
1249 		 * Move on to the next packet in the chain.
1250 		 */
1251 		mp = next;
1252 	} while (mp != NULL);
1253 }
1254 
1255 /*
1256  * Fast-path receive function.
1257  */
1258 /*ARGSUSED*/
1259 void
1260 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1261     mac_header_info_t *mhip)
1262 {
1263 	dld_str_t *dsp = (dld_str_t *)arg;
1264 	mblk_t *next;
1265 	size_t offset = 0;
1266 
1267 	/*
1268 	 * MAC header stripping rules:
1269 	 *    - Tagged packets:
1270 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1271 	 *	b. Physical streams
1272 	 *	- VLAN packets (non-zero VID). The stream must be either a
1273 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1274 	 *	  Strip the Ethernet header but keep the VLAN header.
1275 	 *	- Special tagged packets (zero VID)
1276 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1277 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1278 	 *	    keep the VLAN header.
1279 	 *	  * Otherwise, strip the whole VLAN header.
1280 	 *    - Untagged packets. Strip the whole MAC header.
1281 	 */
1282 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1283 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1284 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1285 		offset = VLAN_TAGSZ;
1286 	}
1287 
1288 	ASSERT(mp != NULL);
1289 	do {
1290 		/*
1291 		 * Get the pointer to the next packet in the chain and then
1292 		 * clear b_next before the packet gets passed on.
1293 		 */
1294 		next = mp->b_next;
1295 		mp->b_next = NULL;
1296 
1297 		/*
1298 		 * Wind back b_rptr to point at the VLAN header.
1299 		 */
1300 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1301 		mp->b_rptr -= offset;
1302 
1303 		/*
1304 		 * Pass the packet on.
1305 		 */
1306 		if (canputnext(dsp->ds_rq))
1307 			putnext(dsp->ds_rq, mp);
1308 		else
1309 			freemsg(mp);
1310 		/*
1311 		 * Move on to the next packet in the chain.
1312 		 */
1313 		mp = next;
1314 	} while (mp != NULL);
1315 }
1316 
1317 /*
1318  * Default receive function (send DL_UNITDATA_IND messages).
1319  */
1320 /*ARGSUSED*/
1321 void
1322 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1323     mac_header_info_t *mhip)
1324 {
1325 	dld_str_t		*dsp = (dld_str_t *)arg;
1326 	mblk_t			*ud_mp;
1327 	mblk_t			*next;
1328 	size_t			offset = 0;
1329 	boolean_t		strip_vlan = B_TRUE;
1330 
1331 	/*
1332 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1333 	 */
1334 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1335 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1336 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1337 		offset = VLAN_TAGSZ;
1338 		strip_vlan = B_FALSE;
1339 	}
1340 
1341 	ASSERT(mp != NULL);
1342 	do {
1343 		/*
1344 		 * Get the pointer to the next packet in the chain and then
1345 		 * clear b_next before the packet gets passed on.
1346 		 */
1347 		next = mp->b_next;
1348 		mp->b_next = NULL;
1349 
1350 		/*
1351 		 * Wind back b_rptr to point at the MAC header.
1352 		 */
1353 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1354 		mp->b_rptr -= mhip->mhi_hdrsize;
1355 
1356 		/*
1357 		 * Create the DL_UNITDATA_IND M_PROTO.
1358 		 */
1359 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1360 			freemsgchain(mp);
1361 			return;
1362 		}
1363 
1364 		/*
1365 		 * Advance b_rptr to point at the payload (or the VLAN header).
1366 		 */
1367 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1368 
1369 		/*
1370 		 * Prepend the DL_UNITDATA_IND.
1371 		 */
1372 		ud_mp->b_cont = mp;
1373 
1374 		/*
1375 		 * Send the message.
1376 		 */
1377 		if (canputnext(dsp->ds_rq))
1378 			putnext(dsp->ds_rq, ud_mp);
1379 		else
1380 			freemsg(ud_mp);
1381 
1382 		/*
1383 		 * Move on to the next packet in the chain.
1384 		 */
1385 		mp = next;
1386 	} while (mp != NULL);
1387 }
1388 
1389 /*
1390  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1391  * current state of the interface.
1392  */
1393 void
1394 dld_str_notify_ind(dld_str_t *dsp)
1395 {
1396 	mac_notify_type_t	type;
1397 
1398 	for (type = 0; type < MAC_NNOTE; type++)
1399 		str_notify(dsp, type);
1400 }
1401 
/*
 * Fixed-size layout of the DL_UNITDATA_IND message built by
 * str_unitdata_ind(): the DLPI primitive followed by room for the
 * destination and the source DLSAP address.  Each address buffer is sized
 * for the largest MAC address plus a 16-bit SAP, which str_unitdata_ind()
 * stores immediately after the mi_addr_length address bytes.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;	/* must be the first member */
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1407 
1408 /*
1409  * Create a DL_UNITDATA_IND M_PROTO message.
1410  */
1411 static mblk_t *
1412 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1413 {
1414 	mblk_t				*nmp;
1415 	dl_unitdata_ind_wrapper_t	*dlwp;
1416 	dl_unitdata_ind_t		*dlp;
1417 	mac_header_info_t		mhi;
1418 	uint_t				addr_length;
1419 	uint8_t				*daddr;
1420 	uint8_t				*saddr;
1421 
1422 	/*
1423 	 * Get the packet header information.
1424 	 */
1425 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
1426 		return (NULL);
1427 
1428 	/*
1429 	 * Allocate a message large enough to contain the wrapper structure
1430 	 * defined above.
1431 	 */
1432 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1433 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1434 	    DL_UNITDATA_IND)) == NULL)
1435 		return (NULL);
1436 
1437 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1438 
1439 	dlp = &(dlwp->dl_unitdata);
1440 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1441 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1442 
1443 	/*
1444 	 * Copy in the destination address.
1445 	 */
1446 	addr_length = dsp->ds_mip->mi_addr_length;
1447 	daddr = dlwp->dl_dest_addr;
1448 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1449 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1450 
1451 	/*
1452 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1453 	 */
1454 	if (mhi.mhi_istagged && !strip_vlan)
1455 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1456 	else
1457 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1458 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1459 
1460 	/*
1461 	 * If the destination address was multicast or broadcast then the
1462 	 * dl_group_address field should be non-zero.
1463 	 */
1464 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1465 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1466 
1467 	/*
1468 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1469 	 * for example) may not have access to source information.
1470 	 */
1471 	if (mhi.mhi_saddr == NULL) {
1472 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1473 	} else {
1474 		saddr = dlwp->dl_src_addr;
1475 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1476 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1477 
1478 		/*
1479 		 * Set the source DLSAP to the packet ethertype.
1480 		 */
1481 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1482 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1483 	}
1484 
1485 	return (nmp);
1486 }
1487 
1488 /*
1489  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1490  */
1491 static void
1492 str_notify_promisc_on_phys(dld_str_t *dsp)
1493 {
1494 	mblk_t		*mp;
1495 	dl_notify_ind_t	*dlip;
1496 
1497 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1498 		return;
1499 
1500 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1501 	    M_PROTO, 0)) == NULL)
1502 		return;
1503 
1504 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1505 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1506 	dlip->dl_primitive = DL_NOTIFY_IND;
1507 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1508 
1509 	qreply(dsp->ds_wq, mp);
1510 }
1511 
1512 /*
1513  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1514  */
1515 static void
1516 str_notify_promisc_off_phys(dld_str_t *dsp)
1517 {
1518 	mblk_t		*mp;
1519 	dl_notify_ind_t	*dlip;
1520 
1521 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1522 		return;
1523 
1524 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1525 	    M_PROTO, 0)) == NULL)
1526 		return;
1527 
1528 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1529 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1530 	dlip->dl_primitive = DL_NOTIFY_IND;
1531 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1532 
1533 	qreply(dsp->ds_wq, mp);
1534 }
1535 
1536 /*
1537  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1538  */
1539 static void
1540 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1541 {
1542 	mblk_t		*mp;
1543 	dl_notify_ind_t	*dlip;
1544 	uint_t		addr_length;
1545 	uint16_t	ethertype;
1546 
1547 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1548 		return;
1549 
1550 	addr_length = dsp->ds_mip->mi_addr_length;
1551 	if ((mp = mexchange(dsp->ds_wq, NULL,
1552 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1553 	    M_PROTO, 0)) == NULL)
1554 		return;
1555 
1556 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1557 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1558 	dlip->dl_primitive = DL_NOTIFY_IND;
1559 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1560 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1561 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1562 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1563 
1564 	bcopy(addr, &dlip[1], addr_length);
1565 
1566 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1567 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1568 
1569 	qreply(dsp->ds_wq, mp);
1570 }
1571 
1572 /*
1573  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1574  */
1575 static void
1576 str_notify_link_up(dld_str_t *dsp)
1577 {
1578 	mblk_t		*mp;
1579 	dl_notify_ind_t	*dlip;
1580 
1581 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1582 		return;
1583 
1584 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1585 	    M_PROTO, 0)) == NULL)
1586 		return;
1587 
1588 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1589 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1590 	dlip->dl_primitive = DL_NOTIFY_IND;
1591 	dlip->dl_notification = DL_NOTE_LINK_UP;
1592 
1593 	qreply(dsp->ds_wq, mp);
1594 }
1595 
1596 /*
1597  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1598  */
1599 static void
1600 str_notify_link_down(dld_str_t *dsp)
1601 {
1602 	mblk_t		*mp;
1603 	dl_notify_ind_t	*dlip;
1604 
1605 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1606 		return;
1607 
1608 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1609 	    M_PROTO, 0)) == NULL)
1610 		return;
1611 
1612 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1613 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1614 	dlip->dl_primitive = DL_NOTIFY_IND;
1615 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1616 
1617 	qreply(dsp->ds_wq, mp);
1618 }
1619 
1620 /*
1621  * DL_NOTIFY_IND: DL_NOTE_SPEED
1622  */
1623 static void
1624 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1625 {
1626 	mblk_t		*mp;
1627 	dl_notify_ind_t	*dlip;
1628 
1629 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1630 		return;
1631 
1632 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1633 	    M_PROTO, 0)) == NULL)
1634 		return;
1635 
1636 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1637 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1638 	dlip->dl_primitive = DL_NOTIFY_IND;
1639 	dlip->dl_notification = DL_NOTE_SPEED;
1640 	dlip->dl_data = speed;
1641 
1642 	qreply(dsp->ds_wq, mp);
1643 }
1644 
1645 /*
1646  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1647  */
1648 static void
1649 str_notify_capab_reneg(dld_str_t *dsp)
1650 {
1651 	mblk_t		*mp;
1652 	dl_notify_ind_t	*dlip;
1653 
1654 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1655 		return;
1656 
1657 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1658 	    M_PROTO, 0)) == NULL)
1659 		return;
1660 
1661 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1662 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1663 	dlip->dl_primitive = DL_NOTIFY_IND;
1664 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1665 
1666 	qreply(dsp->ds_wq, mp);
1667 }
1668 
1669 /*
1670  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1671  */
1672 static void
1673 str_notify_fastpath_flush(dld_str_t *dsp)
1674 {
1675 	mblk_t		*mp;
1676 	dl_notify_ind_t	*dlip;
1677 
1678 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1679 		return;
1680 
1681 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1682 	    M_PROTO, 0)) == NULL)
1683 		return;
1684 
1685 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1686 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1687 	dlip->dl_primitive = DL_NOTIFY_IND;
1688 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1689 
1690 	qreply(dsp->ds_wq, mp);
1691 }
1692 
1693 /*
1694  * MAC notification callback.
1695  */
1696 static void
1697 str_notify(void *arg, mac_notify_type_t type)
1698 {
1699 	dld_str_t		*dsp = (dld_str_t *)arg;
1700 	queue_t			*q = dsp->ds_wq;
1701 
1702 	switch (type) {
1703 	case MAC_NOTE_TX:
1704 		qenable(q);
1705 		break;
1706 
1707 	case MAC_NOTE_DEVPROMISC:
1708 		/*
1709 		 * Send the appropriate DL_NOTIFY_IND.
1710 		 */
1711 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1712 			str_notify_promisc_on_phys(dsp);
1713 		else
1714 			str_notify_promisc_off_phys(dsp);
1715 		break;
1716 
1717 	case MAC_NOTE_PROMISC:
1718 		break;
1719 
1720 	case MAC_NOTE_UNICST:
1721 		/*
1722 		 * This notification is sent whenever the MAC unicast address
1723 		 * changes. We need to re-cache the address.
1724 		 */
1725 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1726 
1727 		/*
1728 		 * Send the appropriate DL_NOTIFY_IND.
1729 		 */
1730 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1731 		break;
1732 
1733 	case MAC_NOTE_LINK:
1734 		/*
1735 		 * This notification is sent every time the MAC driver
1736 		 * updates the link state.
1737 		 */
1738 		switch (mac_link_get(dsp->ds_mh)) {
1739 		case LINK_STATE_UP: {
1740 			uint64_t speed;
1741 			/*
1742 			 * The link is up so send the appropriate
1743 			 * DL_NOTIFY_IND.
1744 			 */
1745 			str_notify_link_up(dsp);
1746 
1747 			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
1748 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1749 			break;
1750 		}
1751 		case LINK_STATE_DOWN:
1752 			/*
1753 			 * The link is down so send the appropriate
1754 			 * DL_NOTIFY_IND.
1755 			 */
1756 			str_notify_link_down(dsp);
1757 			break;
1758 
1759 		default:
1760 			break;
1761 		}
1762 		break;
1763 
1764 	case MAC_NOTE_RESOURCE:
1765 	case MAC_NOTE_VNIC:
1766 		/*
1767 		 * This notification is sent whenever the MAC resources
1768 		 * change or capabilities change. We need to renegotiate
1769 		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1770 		 */
1771 		str_notify_capab_reneg(dsp);
1772 		break;
1773 
1774 	case MAC_NOTE_FASTPATH_FLUSH:
1775 		str_notify_fastpath_flush(dsp);
1776 		break;
1777 
1778 	default:
1779 		ASSERT(B_FALSE);
1780 		break;
1781 	}
1782 }
1783 
1784 /*
1785  * Enqueue one or more messages to the transmit queue.
1786  * Caller specifies the insertion position (head/tail).
1787  */
1788 void
1789 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1790 {
1791 	mblk_t	*tail;
1792 	queue_t *q = dsp->ds_wq;
1793 	uint_t	cnt, msgcnt;
1794 	uint_t	tot_cnt, tot_msgcnt;
1795 
1796 	ASSERT(DB_TYPE(mp) == M_DATA);
1797 	/* Calculate total size and count of the packet(s) */
1798 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1799 	    tail->b_next != NULL; tail = tail->b_next) {
1800 		ASSERT(DB_TYPE(tail->b_next) == M_DATA);
1801 		cnt += msgdsize(tail->b_next);
1802 		msgcnt++;
1803 	}
1804 
1805 	mutex_enter(&dsp->ds_tx_list_lock);
1806 	/*
1807 	 * If the queue depth would exceed the allowed threshold, drop
1808 	 * new packet(s) and drain those already in the queue.
1809 	 */
1810 	tot_cnt = dsp->ds_tx_cnt + cnt;
1811 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1812 
1813 	if (!head_insert &&
1814 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1815 		ASSERT(dsp->ds_tx_qbusy);
1816 		mutex_exit(&dsp->ds_tx_list_lock);
1817 		freemsgchain(mp);
1818 		goto done;
1819 	}
1820 
1821 	/* Update the queue size parameters */
1822 	dsp->ds_tx_cnt = tot_cnt;
1823 	dsp->ds_tx_msgcnt = tot_msgcnt;
1824 
1825 	/*
1826 	 * If the transmit queue is currently empty and we are
1827 	 * about to deposit the packet(s) there, switch mode to
1828 	 * "busy" and raise flow-control condition.
1829 	 */
1830 	if (!dsp->ds_tx_qbusy) {
1831 		dsp->ds_tx_qbusy = B_TRUE;
1832 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1833 		(void) putq(q, dsp->ds_tx_flow_mp);
1834 		dsp->ds_tx_flow_mp = NULL;
1835 	}
1836 
1837 	if (!head_insert) {
1838 		/* Tail insertion */
1839 		if (dsp->ds_tx_list_head == NULL)
1840 			dsp->ds_tx_list_head = mp;
1841 		else
1842 			dsp->ds_tx_list_tail->b_next = mp;
1843 		dsp->ds_tx_list_tail = tail;
1844 	} else {
1845 		/* Head insertion */
1846 		tail->b_next = dsp->ds_tx_list_head;
1847 		if (dsp->ds_tx_list_head == NULL)
1848 			dsp->ds_tx_list_tail = tail;
1849 		dsp->ds_tx_list_head = mp;
1850 	}
1851 	mutex_exit(&dsp->ds_tx_list_lock);
1852 done:
1853 	/* Schedule service thread to drain the transmit queue */
1854 	if (!head_insert)
1855 		qenable(q);
1856 }
1857 
1858 void
1859 dld_tx_flush(dld_str_t *dsp)
1860 {
1861 	mutex_enter(&dsp->ds_tx_list_lock);
1862 	if (dsp->ds_tx_list_head != NULL) {
1863 		freemsgchain(dsp->ds_tx_list_head);
1864 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1865 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1866 		if (dsp->ds_tx_qbusy) {
1867 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1868 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1869 			dsp->ds_tx_qbusy = B_FALSE;
1870 		}
1871 	}
1872 	mutex_exit(&dsp->ds_tx_list_lock);
1873 }
1874 
1875 /*
1876  * Process an M_IOCTL message.
1877  */
1878 static void
1879 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1880 {
1881 	uint_t			cmd;
1882 
1883 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1884 	ASSERT(dsp->ds_type == DLD_DLPI);
1885 
1886 	switch (cmd) {
1887 	case DLIOCNATIVE:
1888 		ioc_native(dsp, mp);
1889 		break;
1890 	case DLIOCRAW:
1891 		ioc_raw(dsp, mp);
1892 		break;
1893 	case DLIOCHDRINFO:
1894 		ioc_fast(dsp, mp);
1895 		break;
1896 	default:
1897 		ioc(dsp, mp);
1898 	}
1899 }
1900 
1901 /*
1902  * DLIOCNATIVE
1903  */
1904 static void
1905 ioc_native(dld_str_t *dsp, mblk_t *mp)
1906 {
1907 	queue_t *q = dsp->ds_wq;
1908 	const mac_info_t *mip = dsp->ds_mip;
1909 
1910 	rw_enter(&dsp->ds_lock, RW_WRITER);
1911 
1912 	/*
1913 	 * Native mode can be enabled if it's disabled and if the
1914 	 * native media type is different.
1915 	 */
1916 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
1917 		dsp->ds_native = B_TRUE;
1918 
1919 	rw_exit(&dsp->ds_lock);
1920 
1921 	if (dsp->ds_native)
1922 		miocack(q, mp, 0, mip->mi_nativemedia);
1923 	else
1924 		miocnak(q, mp, 0, ENOTSUP);
1925 }
1926 
1927 /*
1928  * DLIOCRAW
1929  */
1930 static void
1931 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1932 {
1933 	queue_t *q = dsp->ds_wq;
1934 
1935 	rw_enter(&dsp->ds_lock, RW_WRITER);
1936 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1937 		rw_exit(&dsp->ds_lock);
1938 		miocnak(q, mp, 0, EPROTO);
1939 		return;
1940 	}
1941 
1942 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1943 		/*
1944 		 * Set the receive callback.
1945 		 */
1946 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1947 	}
1948 
1949 	/*
1950 	 * Note that raw mode is enabled.
1951 	 */
1952 	dsp->ds_mode = DLD_RAW;
1953 
1954 	rw_exit(&dsp->ds_lock);
1955 	miocack(q, mp, 0, 0);
1956 }
1957 
1958 /*
1959  * DLIOCHDRINFO
1960  */
1961 static void
1962 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1963 {
1964 	dl_unitdata_req_t *dlp;
1965 	off_t		off;
1966 	size_t		len;
1967 	const uint8_t	*addr;
1968 	uint16_t	sap;
1969 	mblk_t		*nmp;
1970 	mblk_t		*hmp;
1971 	uint_t		addr_length;
1972 	queue_t		*q = dsp->ds_wq;
1973 	int		err;
1974 	dls_channel_t	dc;
1975 
1976 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1977 		err = ENOTSUP;
1978 		goto failed;
1979 	}
1980 
1981 	/*
1982 	 * DLIOCHDRINFO should only come from IP. The one initiated from
1983 	 * user-land should not be allowed.
1984 	 */
1985 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
1986 		err = EINVAL;
1987 		goto failed;
1988 	}
1989 
1990 	nmp = mp->b_cont;
1991 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1992 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1993 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1994 		err = EINVAL;
1995 		goto failed;
1996 	}
1997 
1998 	off = dlp->dl_dest_addr_offset;
1999 	len = dlp->dl_dest_addr_length;
2000 
2001 	if (!MBLKIN(nmp, off, len)) {
2002 		err = EINVAL;
2003 		goto failed;
2004 	}
2005 
2006 	rw_enter(&dsp->ds_lock, RW_READER);
2007 	if (dsp->ds_dlstate != DL_IDLE) {
2008 		rw_exit(&dsp->ds_lock);
2009 		err = ENOTSUP;
2010 		goto failed;
2011 	}
2012 
2013 	addr_length = dsp->ds_mip->mi_addr_length;
2014 	if (len != addr_length + sizeof (uint16_t)) {
2015 		rw_exit(&dsp->ds_lock);
2016 		err = EINVAL;
2017 		goto failed;
2018 	}
2019 
2020 	addr = nmp->b_rptr + off;
2021 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2022 	dc = dsp->ds_dc;
2023 
2024 	if ((hmp = dls_header(dc, addr, sap, 0, NULL)) == NULL) {
2025 		rw_exit(&dsp->ds_lock);
2026 		err = ENOMEM;
2027 		goto failed;
2028 	}
2029 
2030 	/*
2031 	 * This is a performance optimization.  We originally entered
2032 	 * as reader and only become writer upon transitioning into
2033 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
2034 	 * stay as reader and return the fast-path header to IP.
2035 	 */
2036 	if (dsp->ds_mode != DLD_FASTPATH) {
2037 		if (!rw_tryupgrade(&dsp->ds_lock)) {
2038 			rw_exit(&dsp->ds_lock);
2039 			rw_enter(&dsp->ds_lock, RW_WRITER);
2040 
2041 			/*
2042 			 * State may have changed before we re-acquired
2043 			 * the writer lock in case the upgrade failed.
2044 			 */
2045 			if (dsp->ds_dlstate != DL_IDLE) {
2046 				rw_exit(&dsp->ds_lock);
2047 				err = ENOTSUP;
2048 				goto failed;
2049 			}
2050 		}
2051 
2052 		/*
2053 		 * Set the receive callback (unless polling is enabled).
2054 		 */
2055 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
2056 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
2057 
2058 		/*
2059 		 * Note that fast-path mode is enabled.
2060 		 */
2061 		dsp->ds_mode = DLD_FASTPATH;
2062 	}
2063 	rw_exit(&dsp->ds_lock);
2064 
2065 	freemsg(nmp->b_cont);
2066 	nmp->b_cont = hmp;
2067 
2068 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2069 	return;
2070 failed:
2071 	miocnak(q, mp, 0, err);
2072 }
2073 
2074 /*
2075  * Catch-all handler.
2076  */
2077 static void
2078 ioc(dld_str_t *dsp, mblk_t *mp)
2079 {
2080 	queue_t	*q = dsp->ds_wq;
2081 	mac_handle_t mh;
2082 
2083 	rw_enter(&dsp->ds_lock, RW_READER);
2084 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2085 		rw_exit(&dsp->ds_lock);
2086 		miocnak(q, mp, 0, EINVAL);
2087 		return;
2088 	}
2089 	mh = dsp->ds_mh;
2090 	ASSERT(mh != NULL);
2091 	rw_exit(&dsp->ds_lock);
2092 	mac_ioctl(mh, q, mp);
2093 }
2094