xref: /titanic_51/usr/src/uts/common/io/dld/dld_str.c (revision 99ebb4ca412cb0a19d77a3899a87c055b9c30fa8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Data-Link Driver
30  */
31 
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/strsubr.h>
35 #include	<sys/atomic.h>
36 #include	<sys/mkdev.h>
37 #include	<sys/vlan.h>
38 #include	<sys/dld.h>
39 #include	<sys/dld_impl.h>
40 #include	<sys/dls_impl.h>
41 #include	<inet/common.h>
42 
/*
 * Forward declarations for the file-local (static) helpers.
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);
static void	str_notify(void *, mac_notify_type_t);

static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static minor_t	dld_minor_hold(boolean_t);
static void	dld_minor_rele(minor_t);
static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);

/*
 * Module-wide state.
 *
 * str_count	 bumped by dld_str_create() and dropped by dld_str_destroy();
 *		 dld_str_fini() refuses to tear down while it is non-zero.
 * str_cachep	 kmem cache of dld_str_t objects, created in dld_str_init().
 * minor_arenap	 vmem arena managing minor numbers, created in dld_str_init().
 * minor_count	 checked by dld_str_fini(); presumably the number of minors
 *		 currently held (maintained by dld_minor_hold()/
 *		 dld_minor_rele() -- TODO confirm).
 * str_hashp	 id hash of all dld_str_t objects, keyed by ds_minor.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static vmem_t		*minor_arenap;
static uint32_t		minor_count;
static mod_hash_t	*str_hashp;

/*
 * Convert between a minor number and a pointer-sized value (the arena base
 * passed to vmem_create() is expressed this way).
 */
#define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
#define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))

/*
 * STR_HASHSZ is the size handed to mod_hash_create_idhash() for str_hashp;
 * STR_HASH_KEY() casts a minor number to the hash's key type.
 */
#define	STR_HASHSZ		64
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
76 
77 /*
78  * Some notes on entry points, flow-control, queueing and locking:
79  *
80  * This driver exports the traditional STREAMS put entry point as well as
81  * the non-STREAMS fast-path transmit routine which is provided to IP via
82  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
83  * and data operations, while the fast-path routine deals only with M_DATA
84  * fast-path packets.  Regardless of the entry point, all outbound packets
85  * will end up in dld_tx_single(), where they will be delivered to the MAC
86  * driver.
87  *
88  * The transmit logic operates in two modes: a "not busy" mode where the
89  * packets will be delivered to the MAC for a send attempt, or "busy" mode
90  * where they will be enqueued in the internal queue because of flow-control.
91  * Flow-control happens when the MAC driver indicates the packets couldn't
92  * be transmitted due to lack of resources (e.g. running out of descriptors).
93  * In such case, the driver will place a dummy message on its write-side
94  * STREAMS queue so that the queue is marked as "full".  Any subsequent
95  * packets arriving at the driver will be enqueued in the internal queue,
96  * which is drained in the context of the service thread that gets scheduled
97  * whenever the driver is in the "busy" mode.  When all packets have been
98  * successfully delivered by MAC and the internal queue is empty, it will
99  * transition to the "not busy" mode by removing the dummy message from the
100  * write-side STREAMS queue; in effect this will trigger backenabling.
101  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
102  * to the above reasons.
103  *
 * The driver implements an internal transmit queue independent of STREAMS.
 * This allows for flexibility and provides a fast enqueue/dequeue mechanism
 * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
 * getq() operations done by the driver are those related to placing and
 * removing the dummy message to/from the write-side STREAMS queue for flow-
 * control purposes.
110  *
 * Locking is done independent of STREAMS due to the driver being fully MT.
 * Threads entering the driver (either from put or service entry points)
 * will most likely be readers, with the exception of a few writer cases
 * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
 * DLD-related ioctl requests.  The DLPI detach case is special, because
 * it involves freeing resources and therefore must be single-threaded.
 * Unfortunately the readers/writers lock can't be used to protect against
 * it, because the lock is dropped prior to the driver calling places where
 * putnext() may be invoked, and such places may depend on those resources
 * to exist.  Because of this, the driver always completes the DLPI detach
 * process when there are no other threads running in the driver.  This is
 * done by keeping track of the number of threads, such that the last
 * thread leaving the driver will finish the pending DLPI detach operation.
124  */
125 
/*
 * dld_max_q_count is the queue depth threshold used to limit the number of
 * outstanding packets or bytes allowed in the queue; once this limit is
 * reached the driver will free any incoming ones until the queue depth
 * drops below the threshold.
 *
 * This buffering is provided to accommodate clients which do not employ
 * their own buffering scheme, and to handle occasional packet bursts.
 * Clients which handle their own buffering will receive positive feedback
 * from this driver as soon as it transitions into the "busy" state, i.e.
 * when the queue is initially filled up; they will get backenabled once
 * the queue is empty.
 *
 * The value chosen here is rather arbitrary; in future some intelligent
 * heuristics may be involved which could take into account the hardware's
 * transmit ring size, etc.
 */
uint_t dld_max_q_count = (16 * 1024 *1024);
144 
/*
 * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
 * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
 * match dev_t. If a stream is found and it is attached, its dev_info_t *
 * is returned.
 *
 * i_dld_str_state_t carries the search key into the hash walk and the
 * result back out of it.
 */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* in: major number of the dev_t */
	minor_t		ds_minor;	/* in: minor number of the dev_t */
	dev_info_t	*ds_dip;	/* out: devinfo found, else NULL */
} i_dld_str_state_t;
156 
157 /* ARGSUSED */
158 static uint_t
159 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
160 {
161 	i_dld_str_state_t	*statep = arg;
162 	dld_str_t		*dsp = (dld_str_t *)val;
163 
164 	if (statep->ds_major != dsp->ds_major)
165 		return (MH_WALK_CONTINUE);
166 
167 	ASSERT(statep->ds_minor != 0);
168 
169 	/*
170 	 * Access to ds_ppa and ds_mh need to be protected by ds_lock.
171 	 */
172 	rw_enter(&dsp->ds_lock, RW_READER);
173 	if (statep->ds_minor <= DLD_MAX_MINOR) {
174 		/*
175 		 * Style 1: minor can be derived from the ppa. we
176 		 * continue to walk until we find a matching stream
177 		 * in attached state.
178 		 */
179 		if (statep->ds_minor == DLS_PPA2MINOR(dsp->ds_ppa) &&
180 		    dsp->ds_mh != NULL) {
181 			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
182 			rw_exit(&dsp->ds_lock);
183 			return (MH_WALK_TERMINATE);
184 		}
185 	} else {
186 		/*
187 		 * Clone: a clone minor is unique. we can terminate the
188 		 * walk if we find a matching stream -- even if we fail
189 		 * to obtain the devinfo.
190 		 */
191 		if (statep->ds_minor == dsp->ds_minor) {
192 			if (dsp->ds_mh != NULL)
193 				statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
194 			rw_exit(&dsp->ds_lock);
195 			return (MH_WALK_TERMINATE);
196 		}
197 	}
198 	rw_exit(&dsp->ds_lock);
199 	return (MH_WALK_CONTINUE);
200 }
201 
202 static dev_info_t *
203 dld_finddevinfo(dev_t dev)
204 {
205 	i_dld_str_state_t	state;
206 
207 	state.ds_minor = getminor(dev);
208 	state.ds_major = getmajor(dev);
209 	state.ds_dip = NULL;
210 
211 	if (state.ds_minor == 0)
212 		return (NULL);
213 
214 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
215 	return (state.ds_dip);
216 }
217 
218 
219 /*
220  * devo_getinfo: getinfo(9e)
221  */
222 /*ARGSUSED*/
223 int
224 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
225 {
226 	dev_info_t	*devinfo;
227 	minor_t		minor = getminor((dev_t)arg);
228 	int		rc = DDI_FAILURE;
229 
230 	switch (cmd) {
231 	case DDI_INFO_DEVT2DEVINFO:
232 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
233 			*(dev_info_t **)resp = devinfo;
234 			rc = DDI_SUCCESS;
235 		}
236 		break;
237 	case DDI_INFO_DEVT2INSTANCE:
238 		if (minor > 0 && minor <= DLD_MAX_MINOR) {
239 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
240 			rc = DDI_SUCCESS;
241 		} else if (minor > DLD_MAX_MINOR &&
242 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
243 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
244 			rc = DDI_SUCCESS;
245 		}
246 		break;
247 	}
248 	return (rc);
249 }
250 
251 /*
252  * qi_qopen: open(9e)
253  */
254 /*ARGSUSED*/
255 int
256 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
257 {
258 	dld_str_t	*dsp;
259 	major_t		major;
260 	minor_t		minor;
261 	int		err;
262 
263 	if (sflag == MODOPEN)
264 		return (ENOTSUP);
265 
266 	/*
267 	 * This is a cloning driver and therefore each queue should only
268 	 * ever get opened once.
269 	 */
270 	if (rq->q_ptr != NULL)
271 		return (EBUSY);
272 
273 	major = getmajor(*devp);
274 	minor = getminor(*devp);
275 	if (minor > DLD_MAX_MINOR)
276 		return (ENODEV);
277 
278 	/*
279 	 * Create a new dld_str_t for the stream. This will grab a new minor
280 	 * number that will be handed back in the cloned dev_t.  Creation may
281 	 * fail if we can't allocate the dummy mblk used for flow-control.
282 	 */
283 	dsp = dld_str_create(rq, DLD_DLPI, major,
284 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
285 	if (dsp == NULL)
286 		return (ENOSR);
287 
288 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
289 	if (minor != 0) {
290 		/*
291 		 * Style 1 open
292 		 */
293 
294 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
295 			goto failed;
296 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
297 	} else {
298 		(void) qassociate(rq, -1);
299 	}
300 
301 	/*
302 	 * Enable the queue srv(9e) routine.
303 	 */
304 	qprocson(rq);
305 
306 	/*
307 	 * Construct a cloned dev_t to hand back.
308 	 */
309 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
310 	return (0);
311 
312 failed:
313 	dld_str_destroy(dsp);
314 	return (err);
315 }
316 
317 /*
318  * qi_qclose: close(9e)
319  */
320 int
321 dld_close(queue_t *rq)
322 {
323 	dld_str_t	*dsp = rq->q_ptr;
324 
325 	/*
326 	 * Wait until pending requests are processed.
327 	 */
328 	mutex_enter(&dsp->ds_thr_lock);
329 	while (dsp->ds_pending_cnt > 0)
330 		cv_wait(&dsp->ds_pending_cv, &dsp->ds_thr_lock);
331 	mutex_exit(&dsp->ds_thr_lock);
332 
333 	/*
334 	 * Disable the queue srv(9e) routine.
335 	 */
336 	qprocsoff(rq);
337 
338 	/*
339 	 * At this point we can not be entered by any threads via STREAMS
340 	 * or the direct call interface, which is available only to IP.
341 	 * After the interface is unplumbed, IP wouldn't have any reference
342 	 * to this instance, and therefore we are now effectively single
343 	 * threaded and don't require any lock protection.  Flush all
344 	 * pending packets which are sitting in the transmit queue.
345 	 */
346 	ASSERT(dsp->ds_thr == 0);
347 	dld_tx_flush(dsp);
348 
349 	/*
350 	 * This stream was open to a provider node. Check to see
351 	 * if it has been cleanly shut down.
352 	 */
353 	if (dsp->ds_dlstate != DL_UNATTACHED) {
354 		/*
355 		 * The stream is either open to a style 1 provider or
356 		 * this is not clean shutdown. Detach from the PPA.
357 		 * (This is still ok even in the style 1 case).
358 		 */
359 		dld_str_detach(dsp);
360 	}
361 
362 	dld_str_destroy(dsp);
363 	return (0);
364 }
365 
366 /*
367  * qi_qputp: put(9e)
368  */
369 void
370 dld_wput(queue_t *wq, mblk_t *mp)
371 {
372 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
373 
374 	DLD_ENTER(dsp);
375 
376 	switch (DB_TYPE(mp)) {
377 	case M_DATA:
378 		rw_enter(&dsp->ds_lock, RW_READER);
379 		if (dsp->ds_dlstate != DL_IDLE ||
380 		    dsp->ds_mode == DLD_UNITDATA) {
381 			freemsg(mp);
382 		} else if (dsp->ds_mode == DLD_FASTPATH) {
383 			str_mdata_fastpath_put(dsp, mp);
384 		} else if (dsp->ds_mode == DLD_RAW) {
385 			str_mdata_raw_put(dsp, mp);
386 		}
387 		rw_exit(&dsp->ds_lock);
388 		break;
389 	case M_PROTO:
390 	case M_PCPROTO:
391 		dld_proto(dsp, mp);
392 		break;
393 	case M_IOCTL:
394 		dld_ioc(dsp, mp);
395 		break;
396 	case M_FLUSH:
397 		if (*mp->b_rptr & FLUSHW) {
398 			dld_tx_flush(dsp);
399 			*mp->b_rptr &= ~FLUSHW;
400 		}
401 
402 		if (*mp->b_rptr & FLUSHR) {
403 			qreply(wq, mp);
404 		} else {
405 			freemsg(mp);
406 		}
407 		break;
408 	default:
409 		freemsg(mp);
410 		break;
411 	}
412 
413 	DLD_EXIT(dsp);
414 }
415 
416 /*
417  * qi_srvp: srv(9e)
418  */
419 void
420 dld_wsrv(queue_t *wq)
421 {
422 	mblk_t		*mp;
423 	dld_str_t	*dsp = wq->q_ptr;
424 
425 	DLD_ENTER(dsp);
426 	rw_enter(&dsp->ds_lock, RW_READER);
427 	/*
428 	 * Grab all packets (chained via b_next) off our transmit queue
429 	 * and try to send them all to the MAC layer.  Since the queue
430 	 * is independent of streams, we are able to dequeue all messages
431 	 * at once without looping through getq() and manually chaining
432 	 * them.  Note that the queue size parameters (byte and message
433 	 * counts) are cleared as well, but we postpone the backenabling
434 	 * until after the MAC transmit since some packets may end up
435 	 * back at our transmit queue.
436 	 */
437 	mutex_enter(&dsp->ds_tx_list_lock);
438 	if ((mp = dsp->ds_tx_list_head) == NULL) {
439 		ASSERT(!dsp->ds_tx_qbusy);
440 		ASSERT(dsp->ds_tx_flow_mp != NULL);
441 		ASSERT(dsp->ds_tx_list_head == NULL);
442 		ASSERT(dsp->ds_tx_list_tail == NULL);
443 		ASSERT(dsp->ds_tx_cnt == 0);
444 		ASSERT(dsp->ds_tx_msgcnt == 0);
445 		mutex_exit(&dsp->ds_tx_list_lock);
446 		rw_exit(&dsp->ds_lock);
447 		DLD_EXIT(dsp);
448 		return;
449 	}
450 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
451 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
452 	mutex_exit(&dsp->ds_tx_list_lock);
453 
454 	/*
455 	 * Discard packets unless we are attached and bound; note that
456 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
457 	 * because regardless of the mode all transmit will end up in
458 	 * dld_tx_single() where the packets may be queued.
459 	 */
460 	ASSERT(DB_TYPE(mp) == M_DATA);
461 	if (dsp->ds_dlstate != DL_IDLE) {
462 		freemsgchain(mp);
463 		goto done;
464 	}
465 
466 	/*
467 	 * Attempt to transmit one or more packets.  If the MAC can't
468 	 * send them all, re-queue the packet(s) at the beginning of
469 	 * the transmit queue to avoid any re-ordering.
470 	 */
471 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
472 		dld_tx_enqueue(dsp, mp, B_TRUE);
473 
474 done:
475 	/*
476 	 * Grab the list lock again and check if the transmit queue is
477 	 * really empty; if so, lift up flow-control and backenable any
478 	 * writer queues.  If the queue is not empty, schedule service
479 	 * thread to drain it.
480 	 */
481 	mutex_enter(&dsp->ds_tx_list_lock);
482 	if (dsp->ds_tx_list_head == NULL) {
483 		dsp->ds_tx_flow_mp = getq(wq);
484 		ASSERT(dsp->ds_tx_flow_mp != NULL);
485 		dsp->ds_tx_qbusy = B_FALSE;
486 	}
487 	mutex_exit(&dsp->ds_tx_list_lock);
488 
489 	rw_exit(&dsp->ds_lock);
490 	DLD_EXIT(dsp);
491 }
492 
493 void
494 dld_init_ops(struct dev_ops *ops, const char *name)
495 {
496 	struct streamtab *stream;
497 	struct qinit *rq, *wq;
498 	struct module_info *modinfo;
499 
500 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
501 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
502 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
503 	modinfo->mi_minpsz = 0;
504 	modinfo->mi_maxpsz = 64*1024;
505 	modinfo->mi_hiwat  = 1;
506 	modinfo->mi_lowat = 0;
507 
508 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
509 	rq->qi_qopen = dld_open;
510 	rq->qi_qclose = dld_close;
511 	rq->qi_minfo = modinfo;
512 
513 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
514 	wq->qi_putp = (pfi_t)dld_wput;
515 	wq->qi_srvp = (pfi_t)dld_wsrv;
516 	wq->qi_minfo = modinfo;
517 
518 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
519 	stream->st_rdinit = rq;
520 	stream->st_wrinit = wq;
521 	ops->devo_cb_ops->cb_str = stream;
522 
523 	ops->devo_getinfo = &dld_getinfo;
524 }
525 
526 void
527 dld_fini_ops(struct dev_ops *ops)
528 {
529 	struct streamtab *stream;
530 	struct qinit *rq, *wq;
531 	struct module_info *modinfo;
532 
533 	stream = ops->devo_cb_ops->cb_str;
534 	rq = stream->st_rdinit;
535 	wq = stream->st_wrinit;
536 	modinfo = rq->qi_minfo;
537 	ASSERT(wq->qi_minfo == modinfo);
538 
539 	kmem_free(stream, sizeof (struct streamtab));
540 	kmem_free(wq, sizeof (struct qinit));
541 	kmem_free(rq, sizeof (struct qinit));
542 	kmem_free(modinfo->mi_idname, FMNAMESZ);
543 	kmem_free(modinfo, sizeof (struct module_info));
544 }
545 
546 /*
547  * Initialize this module's data structures.
548  */
549 void
550 dld_str_init(void)
551 {
552 	/*
553 	 * Create dld_str_t object cache.
554 	 */
555 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
556 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
557 	ASSERT(str_cachep != NULL);
558 
559 	/*
560 	 * Allocate a vmem arena to manage minor numbers. The range of the
561 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
562 	 * minor number).
563 	 */
564 	minor_arenap = vmem_create("dld_minor_arena",
565 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
566 	    VM_SLEEP | VMC_IDENTIFIER);
567 	ASSERT(minor_arenap != NULL);
568 
569 	/*
570 	 * Create a hash table for maintaining dld_str_t's.
571 	 * The ds_minor field (the clone minor number) of a dld_str_t
572 	 * is used as a key for this hash table because this number is
573 	 * globally unique (allocated from "dld_minor_arena").
574 	 */
575 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
576 	    mod_hash_null_valdtor);
577 }
578 
579 /*
580  * Tear down this module's data structures.
581  */
582 int
583 dld_str_fini(void)
584 {
585 	/*
586 	 * Make sure that there are no objects in use.
587 	 */
588 	if (str_count != 0)
589 		return (EBUSY);
590 
591 	/*
592 	 * Check to see if there are any minor numbers still in use.
593 	 */
594 	if (minor_count != 0)
595 		return (EBUSY);
596 
597 	/*
598 	 * Destroy object cache.
599 	 */
600 	kmem_cache_destroy(str_cachep);
601 	vmem_destroy(minor_arenap);
602 	mod_hash_destroy_idhash(str_hashp);
603 	return (0);
604 }
605 
606 /*
607  * Create a new dld_str_t object.
608  */
609 dld_str_t *
610 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
611 {
612 	dld_str_t	*dsp;
613 	int		err;
614 
615 	/*
616 	 * Allocate an object from the cache.
617 	 */
618 	atomic_add_32(&str_count, 1);
619 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
620 
621 	/*
622 	 * Allocate the dummy mblk for flow-control.
623 	 */
624 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
625 	if (dsp->ds_tx_flow_mp == NULL) {
626 		kmem_cache_free(str_cachep, dsp);
627 		atomic_add_32(&str_count, -1);
628 		return (NULL);
629 	}
630 	dsp->ds_type = type;
631 	dsp->ds_major = major;
632 	dsp->ds_style = style;
633 
634 	/*
635 	 * Initialize the queue pointers.
636 	 */
637 	ASSERT(RD(rq) == rq);
638 	dsp->ds_rq = rq;
639 	dsp->ds_wq = WR(rq);
640 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
641 
642 	/*
643 	 * We want explicit control over our write-side STREAMS queue
644 	 * where the dummy mblk gets added/removed for flow-control.
645 	 */
646 	noenable(WR(rq));
647 
648 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
649 	    (mod_hash_val_t)dsp);
650 	ASSERT(err == 0);
651 	return (dsp);
652 }
653 
654 /*
655  * Destroy a dld_str_t object.
656  */
657 void
658 dld_str_destroy(dld_str_t *dsp)
659 {
660 	queue_t		*rq;
661 	queue_t		*wq;
662 	mod_hash_val_t	val;
663 	/*
664 	 * Clear the queue pointers.
665 	 */
666 	rq = dsp->ds_rq;
667 	wq = dsp->ds_wq;
668 	ASSERT(wq == WR(rq));
669 
670 	rq->q_ptr = wq->q_ptr = NULL;
671 	dsp->ds_rq = dsp->ds_wq = NULL;
672 
673 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
674 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
675 	ASSERT(dsp->ds_tx_list_head == NULL);
676 	ASSERT(dsp->ds_tx_list_tail == NULL);
677 	ASSERT(dsp->ds_tx_cnt == 0);
678 	ASSERT(dsp->ds_tx_msgcnt == 0);
679 	ASSERT(!dsp->ds_tx_qbusy);
680 
681 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
682 	ASSERT(dsp->ds_thr == 0);
683 	ASSERT(dsp->ds_pending_req == NULL);
684 
685 	/*
686 	 * Reinitialize all the flags.
687 	 */
688 	dsp->ds_notifications = 0;
689 	dsp->ds_passivestate = DLD_UNINITIALIZED;
690 	dsp->ds_mode = DLD_UNITDATA;
691 
692 	/*
693 	 * Free the dummy mblk if exists.
694 	 */
695 	if (dsp->ds_tx_flow_mp != NULL) {
696 		freeb(dsp->ds_tx_flow_mp);
697 		dsp->ds_tx_flow_mp = NULL;
698 	}
699 
700 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
701 	ASSERT(dsp == (dld_str_t *)val);
702 
703 	/*
704 	 * Free the object back to the cache.
705 	 */
706 	kmem_cache_free(str_cachep, dsp);
707 	atomic_add_32(&str_count, -1);
708 }
709 
710 /*
711  * kmem_cache contructor function: see kmem_cache_create(9f).
712  */
713 /*ARGSUSED*/
714 static int
715 str_constructor(void *buf, void *cdrarg, int kmflags)
716 {
717 	dld_str_t	*dsp = buf;
718 
719 	bzero(buf, sizeof (dld_str_t));
720 
721 	/*
722 	 * Allocate a new minor number.
723 	 */
724 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
725 		return (-1);
726 
727 	/*
728 	 * Initialize the DLPI state machine.
729 	 */
730 	dsp->ds_dlstate = DL_UNATTACHED;
731 	dsp->ds_ppa = (t_uscalar_t)-1;
732 
733 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
734 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
735 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
736 	cv_init(&dsp->ds_pending_cv, NULL, CV_DRIVER, NULL);
737 
738 	return (0);
739 }
740 
741 /*
742  * kmem_cache destructor function.
743  */
744 /*ARGSUSED*/
745 static void
746 str_destructor(void *buf, void *cdrarg)
747 {
748 	dld_str_t	*dsp = buf;
749 
750 	/*
751 	 * Make sure the DLPI state machine was reset.
752 	 */
753 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
754 
755 	/*
756 	 * Make sure the data-link interface was closed.
757 	 */
758 	ASSERT(dsp->ds_mh == NULL);
759 	ASSERT(dsp->ds_dc == NULL);
760 
761 	/*
762 	 * Make sure enabled notifications are cleared.
763 	 */
764 	ASSERT(dsp->ds_notifications == 0);
765 
766 	/*
767 	 * Make sure polling is disabled.
768 	 */
769 	ASSERT(!dsp->ds_polling);
770 
771 	/*
772 	 * Release the minor number.
773 	 */
774 	dld_minor_rele(dsp->ds_minor);
775 
776 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
777 	rw_destroy(&dsp->ds_lock);
778 
779 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
780 	mutex_destroy(&dsp->ds_tx_list_lock);
781 	ASSERT(dsp->ds_tx_flow_mp == NULL);
782 
783 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
784 	mutex_destroy(&dsp->ds_thr_lock);
785 	ASSERT(dsp->ds_pending_req == NULL);
786 	ASSERT(dsp->ds_pending_op == NULL);
787 	ASSERT(dsp->ds_pending_cnt == 0);
788 	cv_destroy(&dsp->ds_pending_cv);
789 }
790 
791 /*
792  * M_DATA put. Note that mp is a single message, not a chained message.
793  */
794 void
795 dld_tx_single(dld_str_t *dsp, mblk_t *mp)
796 {
797 	/*
798 	 * This function can be called from within dld or from an upper
799 	 * layer protocol (currently only tcp). If we are in the busy
800 	 * mode enqueue the packet(s) and return.  Otherwise hand them
801 	 * over to the MAC driver for transmission; any remaining one(s)
802 	 * which didn't get sent will be queued.
803 	 *
804 	 * Note here that we don't grab the list lock prior to checking
805 	 * the busy flag.  This is okay, because a missed transition
806 	 * will not cause any packet reordering for any particular TCP
807 	 * connection (which is single-threaded).  The enqueue routine
808 	 * will atomically set the busy flag and schedule the service
809 	 * thread to run; the flag is only cleared by the service thread
810 	 * when there is no more packet to be transmitted.
811 	 */
812 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
813 		dld_tx_enqueue(dsp, mp, B_FALSE);
814 }
815 
816 /*
817  * Update the priority bits and VID (may need to insert tag if mp points
818  * to an untagged packet.
819  * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
820  */
821 static mblk_t *
822 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
823 {
824 	mblk_t *hmp;
825 	struct ether_vlan_header *evhp;
826 	struct ether_header *ehp;
827 	uint16_t old_tci = 0;
828 	size_t len;
829 
830 	ASSERT(pri != 0 || vid != VLAN_ID_NONE);
831 
832 	evhp = (struct ether_vlan_header *)mp->b_rptr;
833 	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
834 		/*
835 		 * Tagged packet, update the priority bits.
836 		 */
837 		old_tci = ntohs(evhp->ether_tci);
838 		len = sizeof (struct ether_vlan_header);
839 
840 		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
841 			/*
842 			 * In case some drivers only check the db_ref
843 			 * count of the first mblk, we pullup the
844 			 * message into a single mblk.
845 			 */
846 			hmp = msgpullup(mp, -1);
847 			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
848 				freemsg(hmp);
849 				return (NULL);
850 			} else {
851 				freemsg(mp);
852 				mp = hmp;
853 			}
854 		}
855 
856 		evhp = (struct ether_vlan_header *)mp->b_rptr;
857 	} else {
858 		/*
859 		 * Untagged packet. Insert the special priority tag.
860 		 * First allocate a header mblk.
861 		 */
862 		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
863 		if (hmp == NULL)
864 			return (NULL);
865 
866 		evhp = (struct ether_vlan_header *)hmp->b_rptr;
867 		ehp = (struct ether_header *)mp->b_rptr;
868 
869 		/*
870 		 * Copy the MAC addresses and typelen
871 		 */
872 		bcopy(ehp, evhp, (ETHERADDRL * 2));
873 		evhp->ether_type = ehp->ether_type;
874 		evhp->ether_tpid = htons(ETHERTYPE_VLAN);
875 
876 		hmp->b_wptr += sizeof (struct ether_vlan_header);
877 		mp->b_rptr += sizeof (struct ether_header);
878 
879 		/*
880 		 * Free the original message if it's now empty. Link the
881 		 * rest of messages to the header message.
882 		 */
883 		if (MBLKL(mp) == 0) {
884 			hmp->b_cont = mp->b_cont;
885 			freeb(mp);
886 		} else {
887 			hmp->b_cont = mp;
888 		}
889 		mp = hmp;
890 	}
891 
892 	if (pri == 0)
893 		pri = VLAN_PRI(old_tci);
894 	if (vid == VLAN_ID_NONE)
895 		vid = VLAN_ID(old_tci);
896 	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
897 	return (mp);
898 }
899 
900 /*
901  * M_DATA put (IP fast-path mode)
902  */
903 void
904 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
905 {
906 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
907 	mblk_t *newmp;
908 	uint_t pri;
909 
910 	if (is_ethernet) {
911 		/*
912 		 * Update the priority bits to the assigned priority.
913 		 */
914 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
915 
916 		if (pri != 0) {
917 			newmp = i_dld_ether_header_update_tag(mp, pri,
918 			    VLAN_ID_NONE);
919 			if (newmp == NULL)
920 				goto discard;
921 			mp = newmp;
922 		}
923 	}
924 
925 	dld_tx_single(dsp, mp);
926 	return;
927 
928 discard:
929 	/* TODO: bump kstat? */
930 	freemsg(mp);
931 }
932 
933 /*
934  * M_DATA put (DLIOCRAW mode)
935  */
936 static void
937 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
938 {
939 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
940 	mblk_t *bp, *newmp;
941 	size_t size;
942 	mac_header_info_t mhi;
943 	uint_t pri, vid;
944 
945 	/*
946 	 * Certain MAC type plugins provide an illusion for raw DLPI
947 	 * consumers.  They pretend that the MAC layer is something that
948 	 * it's not for the benefit of observability tools.  For example, a
949 	 * wifi plugin might pretend that it's Ethernet for such consumers.
950 	 * Here, we call into the MAC layer so that this illusion can be
951 	 * maintained.  The plugin will optionally transform the MAC header
952 	 * here into something that can be passed down.  The header goes
953 	 * from raw mode to "cooked" mode.
954 	 */
955 	if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
956 		goto discard;
957 	mp = newmp;
958 
959 	size = MBLKL(mp);
960 
961 	/*
962 	 * Check the packet is not too big and that any remaining
963 	 * fragment list is composed entirely of M_DATA messages. (We
964 	 * know the first fragment was M_DATA otherwise we could not
965 	 * have got here).
966 	 */
967 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
968 		if (DB_TYPE(bp) != M_DATA)
969 			goto discard;
970 		size += MBLKL(bp);
971 	}
972 
973 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
974 		goto discard;
975 
976 	if (size > dsp->ds_mip->mi_sdu_max + mhi.mhi_hdrsize)
977 		goto discard;
978 
979 	if (is_ethernet) {
980 		/*
981 		 * Discard the packet if this is a VLAN stream but the VID in
982 		 * the packet is not correct.
983 		 */
984 		vid = VLAN_ID(mhi.mhi_tci);
985 		if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
986 			goto discard;
987 
988 		/*
989 		 * Discard the packet if this packet is a tagged packet
990 		 * but both pri and VID are 0.
991 		 */
992 		pri = VLAN_PRI(mhi.mhi_tci);
993 		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
994 			goto discard;
995 
996 		/*
997 		 * Update the priority bits to the per-stream priority if
998 		 * priority is not set in the packet. Update the VID for
999 		 * packets on a VLAN stream.
1000 		 */
1001 		pri = (pri == 0) ? dsp->ds_pri : 0;
1002 		if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
1003 			if ((newmp = i_dld_ether_header_update_tag(mp,
1004 			    pri, dsp->ds_vid)) == NULL) {
1005 				goto discard;
1006 			}
1007 			mp = newmp;
1008 		}
1009 	}
1010 
1011 	dld_tx_single(dsp, mp);
1012 	return;
1013 
1014 discard:
1015 	/* TODO: bump kstat? */
1016 	freemsg(mp);
1017 }
1018 
1019 /*
1020  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
1021  */
1022 int
1023 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1024 {
1025 	int			err;
1026 	const char		*drvname;
1027 	char			name[MAXNAMELEN];
1028 	dls_channel_t		dc;
1029 	uint_t			addr_length;
1030 
1031 	ASSERT(dsp->ds_dc == NULL);
1032 
1033 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1034 		return (EINVAL);
1035 
1036 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
1037 
1038 	if (strcmp(drvname, "aggr") != 0 &&
1039 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1040 		return (EINVAL);
1041 
1042 	/*
1043 	 * Open a channel.
1044 	 */
1045 	if ((err = dls_open(name, &dc)) != 0) {
1046 		(void) qassociate(dsp->ds_wq, -1);
1047 		return (err);
1048 	}
1049 
1050 	/*
1051 	 * Cache the MAC interface handle, a pointer to the immutable MAC
1052 	 * information and the current and 'factory' MAC address.
1053 	 */
1054 	dsp->ds_mh = dls_mac(dc);
1055 	dsp->ds_mip = mac_info(dsp->ds_mh);
1056 
1057 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1058 
1059 	addr_length = dsp->ds_mip->mi_addr_length;
1060 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
1061 
1062 	/*
1063 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
1064 	 * a non-VLAN interface).
1065 	 */
1066 	dsp->ds_vid = dls_vid(dc);
1067 
1068 	/*
1069 	 * Set the default packet priority.
1070 	 */
1071 	dsp->ds_pri = 0;
1072 
1073 	/*
1074 	 * Add a notify function so that the we get updates from the MAC.
1075 	 */
1076 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
1077 
1078 	dsp->ds_ppa = ppa;
1079 	dsp->ds_dc = dc;
1080 	dsp->ds_dlstate = DL_UNBOUND;
1081 
1082 	return (0);
1083 }
1084 
/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 *
 * Undoes the work of dld_str_attach(): the MAC notify callback is removed,
 * the channel is closed, the queue association is dropped and the stream
 * goes back to the DL_UNATTACHED state. The caller must ensure that
 * ds_thr is zero (asserted below).
 */
void
dld_str_detach(dld_str_t *dsp)
{
	ASSERT(dsp->ds_thr == 0);

	/*
	 * Remove the notify function.
	 */
	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);

	/*
	 * Clear the polling, soft ring and promisc flags.
	 */
	dsp->ds_polling = B_FALSE;
	dsp->ds_soft_ring = B_FALSE;
	dsp->ds_promisc = 0;

	/*
	 * Close the channel and forget the handles cached at attach time.
	 */
	dls_close(dsp->ds_dc);
	dsp->ds_ppa = (t_uscalar_t)-1;
	dsp->ds_dc = NULL;
	dsp->ds_mh = NULL;

	/*
	 * Drop the queue association (same call dld_str_attach() uses when
	 * dls_open() fails).
	 */
	(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;

}
1122 
1123 /*
1124  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1125  * tags before sending packets up to the DLS clients, with the exception of
1126  * special priority tagged packets, in that case, we set the VID to 0.
1127  * mp must be a VLAN tagged packet.
1128  */
1129 static mblk_t *
1130 i_dld_ether_header_strip_tag(mblk_t *mp)
1131 {
1132 	mblk_t *newmp;
1133 	struct ether_vlan_header *evhp;
1134 	uint16_t tci, new_tci;
1135 
1136 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1137 	if (DB_REF(mp) > 1) {
1138 		newmp = copymsg(mp);
1139 		if (newmp == NULL)
1140 			return (NULL);
1141 		freemsg(mp);
1142 		mp = newmp;
1143 	}
1144 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1145 
1146 	tci = ntohs(evhp->ether_tci);
1147 	if (VLAN_PRI(tci) == 0) {
1148 		/*
1149 		 * Priority is 0, strip the tag.
1150 		 */
1151 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1152 		mp->b_rptr += VLAN_TAGSZ;
1153 	} else {
1154 		/*
1155 		 * Priority is not 0, update the VID to 0.
1156 		 */
1157 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1158 		evhp->ether_tci = htons(new_tci);
1159 	}
1160 	return (mp);
1161 }
1162 
1163 /*
1164  * Raw mode receive function.
1165  */
1166 /*ARGSUSED*/
1167 void
1168 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1169     mac_header_info_t *mhip)
1170 {
1171 	dld_str_t *dsp = (dld_str_t *)arg;
1172 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1173 	mblk_t *next, *newmp;
1174 
1175 	ASSERT(mp != NULL);
1176 	do {
1177 		/*
1178 		 * Get the pointer to the next packet in the chain and then
1179 		 * clear b_next before the packet gets passed on.
1180 		 */
1181 		next = mp->b_next;
1182 		mp->b_next = NULL;
1183 
1184 		/*
1185 		 * Wind back b_rptr to point at the MAC header.
1186 		 */
1187 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1188 		mp->b_rptr -= mhip->mhi_hdrsize;
1189 
1190 		/*
1191 		 * Certain MAC type plugins provide an illusion for raw
1192 		 * DLPI consumers.  They pretend that the MAC layer is
1193 		 * something that it's not for the benefit of observability
1194 		 * tools.  For example, a wifi plugin might pretend that
1195 		 * it's Ethernet for such consumers.  Here, we call into
1196 		 * the MAC layer so that this illusion can be maintained.
1197 		 * The plugin will optionally transform the MAC header here
1198 		 * into something that can be passed up to raw consumers.
1199 		 * The header goes from "cooked" mode to raw mode.
1200 		 */
1201 		if ((newmp = mac_header_uncook(dsp->ds_mh, mp)) == NULL) {
1202 			freemsg(mp);
1203 			goto next;
1204 		}
1205 		mp = newmp;
1206 
1207 		/*
1208 		 * Strip the VLAN tag for VLAN streams.
1209 		 */
1210 		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
1211 			newmp = i_dld_ether_header_strip_tag(mp);
1212 			if (newmp == NULL) {
1213 				freemsg(mp);
1214 				goto next;
1215 			}
1216 			mp = newmp;
1217 		}
1218 
1219 		/*
1220 		 * Pass the packet on.
1221 		 */
1222 		if (canputnext(dsp->ds_rq))
1223 			putnext(dsp->ds_rq, mp);
1224 		else
1225 			freemsg(mp);
1226 
1227 next:
1228 		/*
1229 		 * Move on to the next packet in the chain.
1230 		 */
1231 		mp = next;
1232 	} while (mp != NULL);
1233 }
1234 
1235 /*
1236  * Fast-path receive function.
1237  */
1238 /*ARGSUSED*/
1239 void
1240 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1241     mac_header_info_t *mhip)
1242 {
1243 	dld_str_t *dsp = (dld_str_t *)arg;
1244 	mblk_t *next;
1245 	size_t offset = 0;
1246 
1247 	/*
1248 	 * MAC header stripping rules:
1249 	 *    - Tagged packets:
1250 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1251 	 *	b. Physical streams
1252 	 *	- VLAN packets (non-zero VID). The stream must be either a
1253 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1254 	 *	  Strip the Ethernet header but keep the VLAN header.
1255 	 *	- Special tagged packets (zero VID)
1256 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1257 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1258 	 *	    keep the VLAN header.
1259 	 *	  * Otherwise, strip the whole VLAN header.
1260 	 *    - Untagged packets. Strip the whole MAC header.
1261 	 */
1262 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1263 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1264 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1265 		offset = VLAN_TAGSZ;
1266 	}
1267 
1268 	ASSERT(mp != NULL);
1269 	do {
1270 		/*
1271 		 * Get the pointer to the next packet in the chain and then
1272 		 * clear b_next before the packet gets passed on.
1273 		 */
1274 		next = mp->b_next;
1275 		mp->b_next = NULL;
1276 
1277 		/*
1278 		 * Wind back b_rptr to point at the VLAN header.
1279 		 */
1280 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1281 		mp->b_rptr -= offset;
1282 
1283 		/*
1284 		 * Pass the packet on.
1285 		 */
1286 		if (canputnext(dsp->ds_rq))
1287 			putnext(dsp->ds_rq, mp);
1288 		else
1289 			freemsg(mp);
1290 		/*
1291 		 * Move on to the next packet in the chain.
1292 		 */
1293 		mp = next;
1294 	} while (mp != NULL);
1295 }
1296 
1297 /*
1298  * Default receive function (send DL_UNITDATA_IND messages).
1299  */
1300 /*ARGSUSED*/
1301 void
1302 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1303     mac_header_info_t *mhip)
1304 {
1305 	dld_str_t		*dsp = (dld_str_t *)arg;
1306 	mblk_t			*ud_mp;
1307 	mblk_t			*next;
1308 	size_t			offset = 0;
1309 	boolean_t		strip_vlan = B_TRUE;
1310 
1311 	/*
1312 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1313 	 */
1314 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1315 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1316 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1317 		offset = VLAN_TAGSZ;
1318 		strip_vlan = B_FALSE;
1319 	}
1320 
1321 	ASSERT(mp != NULL);
1322 	do {
1323 		/*
1324 		 * Get the pointer to the next packet in the chain and then
1325 		 * clear b_next before the packet gets passed on.
1326 		 */
1327 		next = mp->b_next;
1328 		mp->b_next = NULL;
1329 
1330 		/*
1331 		 * Wind back b_rptr to point at the MAC header.
1332 		 */
1333 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1334 		mp->b_rptr -= mhip->mhi_hdrsize;
1335 
1336 		/*
1337 		 * Create the DL_UNITDATA_IND M_PROTO.
1338 		 */
1339 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1340 			freemsgchain(mp);
1341 			return;
1342 		}
1343 
1344 		/*
1345 		 * Advance b_rptr to point at the payload (or the VLAN header).
1346 		 */
1347 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1348 
1349 		/*
1350 		 * Prepend the DL_UNITDATA_IND.
1351 		 */
1352 		ud_mp->b_cont = mp;
1353 
1354 		/*
1355 		 * Send the message.
1356 		 */
1357 		if (canputnext(dsp->ds_rq))
1358 			putnext(dsp->ds_rq, ud_mp);
1359 		else
1360 			freemsg(ud_mp);
1361 
1362 		/*
1363 		 * Move on to the next packet in the chain.
1364 		 */
1365 		mp = next;
1366 	} while (mp != NULL);
1367 }
1368 
1369 /*
1370  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1371  * current state of the interface.
1372  */
1373 void
1374 dld_str_notify_ind(dld_str_t *dsp)
1375 {
1376 	mac_notify_type_t	type;
1377 
1378 	for (type = 0; type < MAC_NNOTE; type++)
1379 		str_notify(dsp, type);
1380 }
1381 
/*
 * Layout of the DL_UNITDATA_IND M_PROTO built by str_unitdata_ind(): the
 * DLPI primitive followed by room for the destination and source DLSAP
 * addresses. Each address buffer holds up to MAXMACADDRLEN bytes of MAC
 * address followed by a 16-bit SAP value.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1387 
1388 /*
1389  * Create a DL_UNITDATA_IND M_PROTO message.
1390  */
1391 static mblk_t *
1392 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1393 {
1394 	mblk_t				*nmp;
1395 	dl_unitdata_ind_wrapper_t	*dlwp;
1396 	dl_unitdata_ind_t		*dlp;
1397 	mac_header_info_t		mhi;
1398 	uint_t				addr_length;
1399 	uint8_t				*daddr;
1400 	uint8_t				*saddr;
1401 
1402 	/*
1403 	 * Get the packet header information.
1404 	 */
1405 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
1406 		return (NULL);
1407 
1408 	/*
1409 	 * Allocate a message large enough to contain the wrapper structure
1410 	 * defined above.
1411 	 */
1412 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1413 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1414 	    DL_UNITDATA_IND)) == NULL)
1415 		return (NULL);
1416 
1417 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1418 
1419 	dlp = &(dlwp->dl_unitdata);
1420 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1421 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1422 
1423 	/*
1424 	 * Copy in the destination address.
1425 	 */
1426 	addr_length = dsp->ds_mip->mi_addr_length;
1427 	daddr = dlwp->dl_dest_addr;
1428 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1429 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1430 
1431 	/*
1432 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1433 	 */
1434 	if (mhi.mhi_istagged && !strip_vlan)
1435 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1436 	else
1437 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1438 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1439 
1440 	/*
1441 	 * If the destination address was multicast or broadcast then the
1442 	 * dl_group_address field should be non-zero.
1443 	 */
1444 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1445 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1446 
1447 	/*
1448 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1449 	 * for example) may not have access to source information.
1450 	 */
1451 	if (mhi.mhi_saddr == NULL) {
1452 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1453 	} else {
1454 		saddr = dlwp->dl_src_addr;
1455 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1456 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1457 
1458 		/*
1459 		 * Set the source DLSAP to the packet ethertype.
1460 		 */
1461 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1462 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1463 	}
1464 
1465 	return (nmp);
1466 }
1467 
1468 /*
1469  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1470  */
1471 static void
1472 str_notify_promisc_on_phys(dld_str_t *dsp)
1473 {
1474 	mblk_t		*mp;
1475 	dl_notify_ind_t	*dlip;
1476 
1477 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1478 		return;
1479 
1480 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1481 	    M_PROTO, 0)) == NULL)
1482 		return;
1483 
1484 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1485 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1486 	dlip->dl_primitive = DL_NOTIFY_IND;
1487 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1488 
1489 	qreply(dsp->ds_wq, mp);
1490 }
1491 
1492 /*
1493  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1494  */
1495 static void
1496 str_notify_promisc_off_phys(dld_str_t *dsp)
1497 {
1498 	mblk_t		*mp;
1499 	dl_notify_ind_t	*dlip;
1500 
1501 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1502 		return;
1503 
1504 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1505 	    M_PROTO, 0)) == NULL)
1506 		return;
1507 
1508 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1509 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1510 	dlip->dl_primitive = DL_NOTIFY_IND;
1511 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1512 
1513 	qreply(dsp->ds_wq, mp);
1514 }
1515 
1516 /*
1517  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1518  */
1519 static void
1520 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1521 {
1522 	mblk_t		*mp;
1523 	dl_notify_ind_t	*dlip;
1524 	uint_t		addr_length;
1525 	uint16_t	ethertype;
1526 
1527 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1528 		return;
1529 
1530 	addr_length = dsp->ds_mip->mi_addr_length;
1531 	if ((mp = mexchange(dsp->ds_wq, NULL,
1532 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1533 	    M_PROTO, 0)) == NULL)
1534 		return;
1535 
1536 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1537 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1538 	dlip->dl_primitive = DL_NOTIFY_IND;
1539 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1540 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1541 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1542 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1543 
1544 	bcopy(addr, &dlip[1], addr_length);
1545 
1546 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1547 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1548 		ethertype;
1549 
1550 	qreply(dsp->ds_wq, mp);
1551 }
1552 
1553 /*
1554  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1555  */
1556 static void
1557 str_notify_link_up(dld_str_t *dsp)
1558 {
1559 	mblk_t		*mp;
1560 	dl_notify_ind_t	*dlip;
1561 
1562 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1563 		return;
1564 
1565 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1566 	    M_PROTO, 0)) == NULL)
1567 		return;
1568 
1569 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1570 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1571 	dlip->dl_primitive = DL_NOTIFY_IND;
1572 	dlip->dl_notification = DL_NOTE_LINK_UP;
1573 
1574 	qreply(dsp->ds_wq, mp);
1575 }
1576 
1577 /*
1578  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1579  */
1580 static void
1581 str_notify_link_down(dld_str_t *dsp)
1582 {
1583 	mblk_t		*mp;
1584 	dl_notify_ind_t	*dlip;
1585 
1586 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1587 		return;
1588 
1589 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1590 	    M_PROTO, 0)) == NULL)
1591 		return;
1592 
1593 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1594 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1595 	dlip->dl_primitive = DL_NOTIFY_IND;
1596 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1597 
1598 	qreply(dsp->ds_wq, mp);
1599 }
1600 
1601 /*
1602  * DL_NOTIFY_IND: DL_NOTE_SPEED
1603  */
1604 static void
1605 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1606 {
1607 	mblk_t		*mp;
1608 	dl_notify_ind_t	*dlip;
1609 
1610 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1611 		return;
1612 
1613 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1614 	    M_PROTO, 0)) == NULL)
1615 		return;
1616 
1617 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1618 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1619 	dlip->dl_primitive = DL_NOTIFY_IND;
1620 	dlip->dl_notification = DL_NOTE_SPEED;
1621 	dlip->dl_data = speed;
1622 
1623 	qreply(dsp->ds_wq, mp);
1624 }
1625 
1626 /*
1627  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1628  */
1629 static void
1630 str_notify_capab_reneg(dld_str_t *dsp)
1631 {
1632 	mblk_t		*mp;
1633 	dl_notify_ind_t	*dlip;
1634 
1635 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1636 		return;
1637 
1638 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1639 	    M_PROTO, 0)) == NULL)
1640 		return;
1641 
1642 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1643 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1644 	dlip->dl_primitive = DL_NOTIFY_IND;
1645 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1646 
1647 	qreply(dsp->ds_wq, mp);
1648 }
1649 
1650 /*
1651  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1652  */
1653 static void
1654 str_notify_fastpath_flush(dld_str_t *dsp)
1655 {
1656 	mblk_t		*mp;
1657 	dl_notify_ind_t	*dlip;
1658 
1659 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1660 		return;
1661 
1662 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1663 	    M_PROTO, 0)) == NULL)
1664 		return;
1665 
1666 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1667 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1668 	dlip->dl_primitive = DL_NOTIFY_IND;
1669 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1670 
1671 	qreply(dsp->ds_wq, mp);
1672 }
1673 
1674 /*
1675  * MAC notification callback.
1676  */
1677 static void
1678 str_notify(void *arg, mac_notify_type_t type)
1679 {
1680 	dld_str_t		*dsp = (dld_str_t *)arg;
1681 	queue_t			*q = dsp->ds_wq;
1682 
1683 	switch (type) {
1684 	case MAC_NOTE_TX:
1685 		qenable(q);
1686 		break;
1687 
1688 	case MAC_NOTE_DEVPROMISC:
1689 		/*
1690 		 * Send the appropriate DL_NOTIFY_IND.
1691 		 */
1692 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1693 			str_notify_promisc_on_phys(dsp);
1694 		else
1695 			str_notify_promisc_off_phys(dsp);
1696 		break;
1697 
1698 	case MAC_NOTE_PROMISC:
1699 		break;
1700 
1701 	case MAC_NOTE_UNICST:
1702 		/*
1703 		 * This notification is sent whenever the MAC unicast address
1704 		 * changes. We need to re-cache the address.
1705 		 */
1706 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1707 
1708 		/*
1709 		 * Send the appropriate DL_NOTIFY_IND.
1710 		 */
1711 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1712 		break;
1713 
1714 	case MAC_NOTE_LINK:
1715 		/*
1716 		 * This notification is sent every time the MAC driver
1717 		 * updates the link state.
1718 		 */
1719 		switch (mac_link_get(dsp->ds_mh)) {
1720 		case LINK_STATE_UP: {
1721 			uint64_t speed;
1722 			/*
1723 			 * The link is up so send the appropriate
1724 			 * DL_NOTIFY_IND.
1725 			 */
1726 			str_notify_link_up(dsp);
1727 
1728 			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
1729 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1730 			break;
1731 		}
1732 		case LINK_STATE_DOWN:
1733 			/*
1734 			 * The link is down so send the appropriate
1735 			 * DL_NOTIFY_IND.
1736 			 */
1737 			str_notify_link_down(dsp);
1738 			break;
1739 
1740 		default:
1741 			break;
1742 		}
1743 		break;
1744 
1745 	case MAC_NOTE_RESOURCE:
1746 		/*
1747 		 * This notification is sent whenever the MAC resources
1748 		 * change. We need to renegotiate the capabilities.
1749 		 * Send the appropriate DL_NOTIFY_IND.
1750 		 */
1751 		str_notify_capab_reneg(dsp);
1752 		break;
1753 
1754 	case MAC_NOTE_FASTPATH_FLUSH:
1755 		str_notify_fastpath_flush(dsp);
1756 		break;
1757 
1758 	default:
1759 		ASSERT(B_FALSE);
1760 		break;
1761 	}
1762 }
1763 
1764 /*
1765  * Enqueue one or more messages to the transmit queue.
1766  * Caller specifies the insertion position (head/tail).
1767  */
1768 void
1769 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1770 {
1771 	mblk_t	*tail;
1772 	queue_t *q = dsp->ds_wq;
1773 	uint_t	cnt, msgcnt;
1774 	uint_t	tot_cnt, tot_msgcnt;
1775 
1776 	ASSERT(DB_TYPE(mp) == M_DATA);
1777 	/* Calculate total size and count of the packet(s) */
1778 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1779 	    tail->b_next != NULL; tail = tail->b_next) {
1780 		ASSERT(DB_TYPE(tail->b_next) == M_DATA);
1781 		cnt += msgdsize(tail->b_next);
1782 		msgcnt++;
1783 	}
1784 
1785 	mutex_enter(&dsp->ds_tx_list_lock);
1786 	/*
1787 	 * If the queue depth would exceed the allowed threshold, drop
1788 	 * new packet(s) and drain those already in the queue.
1789 	 */
1790 	tot_cnt = dsp->ds_tx_cnt + cnt;
1791 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1792 
1793 	if (!head_insert &&
1794 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1795 		ASSERT(dsp->ds_tx_qbusy);
1796 		mutex_exit(&dsp->ds_tx_list_lock);
1797 		freemsgchain(mp);
1798 		goto done;
1799 	}
1800 
1801 	/* Update the queue size parameters */
1802 	dsp->ds_tx_cnt = tot_cnt;
1803 	dsp->ds_tx_msgcnt = tot_msgcnt;
1804 
1805 	/*
1806 	 * If the transmit queue is currently empty and we are
1807 	 * about to deposit the packet(s) there, switch mode to
1808 	 * "busy" and raise flow-control condition.
1809 	 */
1810 	if (!dsp->ds_tx_qbusy) {
1811 		dsp->ds_tx_qbusy = B_TRUE;
1812 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1813 		(void) putq(q, dsp->ds_tx_flow_mp);
1814 		dsp->ds_tx_flow_mp = NULL;
1815 	}
1816 
1817 	if (!head_insert) {
1818 		/* Tail insertion */
1819 		if (dsp->ds_tx_list_head == NULL)
1820 			dsp->ds_tx_list_head = mp;
1821 		else
1822 			dsp->ds_tx_list_tail->b_next = mp;
1823 		dsp->ds_tx_list_tail = tail;
1824 	} else {
1825 		/* Head insertion */
1826 		tail->b_next = dsp->ds_tx_list_head;
1827 		if (dsp->ds_tx_list_head == NULL)
1828 			dsp->ds_tx_list_tail = tail;
1829 		dsp->ds_tx_list_head = mp;
1830 	}
1831 	mutex_exit(&dsp->ds_tx_list_lock);
1832 done:
1833 	/* Schedule service thread to drain the transmit queue */
1834 	if (!head_insert)
1835 		qenable(q);
1836 }
1837 
1838 void
1839 dld_tx_flush(dld_str_t *dsp)
1840 {
1841 	mutex_enter(&dsp->ds_tx_list_lock);
1842 	if (dsp->ds_tx_list_head != NULL) {
1843 		freemsgchain(dsp->ds_tx_list_head);
1844 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1845 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1846 		if (dsp->ds_tx_qbusy) {
1847 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1848 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1849 			dsp->ds_tx_qbusy = B_FALSE;
1850 		}
1851 	}
1852 	mutex_exit(&dsp->ds_tx_list_lock);
1853 }
1854 
1855 /*
1856  * Process an M_IOCTL message.
1857  */
1858 static void
1859 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1860 {
1861 	uint_t			cmd;
1862 
1863 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1864 	ASSERT(dsp->ds_type == DLD_DLPI);
1865 
1866 	switch (cmd) {
1867 	case DLIOCRAW:
1868 		ioc_raw(dsp, mp);
1869 		break;
1870 	case DLIOCHDRINFO:
1871 		ioc_fast(dsp, mp);
1872 		break;
1873 	default:
1874 		ioc(dsp, mp);
1875 	}
1876 }
1877 
1878 /*
1879  * DLIOCRAW
1880  */
1881 static void
1882 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1883 {
1884 	queue_t *q = dsp->ds_wq;
1885 
1886 	rw_enter(&dsp->ds_lock, RW_WRITER);
1887 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1888 		rw_exit(&dsp->ds_lock);
1889 		miocnak(q, mp, 0, EPROTO);
1890 		return;
1891 	}
1892 
1893 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1894 		/*
1895 		 * Set the receive callback.
1896 		 */
1897 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1898 	}
1899 
1900 	/*
1901 	 * Note that raw mode is enabled.
1902 	 */
1903 	dsp->ds_mode = DLD_RAW;
1904 
1905 	rw_exit(&dsp->ds_lock);
1906 	miocack(q, mp, 0, 0);
1907 }
1908 
1909 /*
1910  * DLIOCHDRINFO
1911  */
1912 static void
1913 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1914 {
1915 	dl_unitdata_req_t *dlp;
1916 	off_t		off;
1917 	size_t		len;
1918 	const uint8_t	*addr;
1919 	uint16_t	sap;
1920 	mblk_t		*nmp;
1921 	mblk_t		*hmp;
1922 	uint_t		addr_length;
1923 	queue_t		*q = dsp->ds_wq;
1924 	int		err;
1925 	dls_channel_t	dc;
1926 
1927 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1928 		err = ENOTSUP;
1929 		goto failed;
1930 	}
1931 
1932 	/*
1933 	 * DLIOCHDRINFO should only come from IP. The one initiated from
1934 	 * user-land should not be allowed.
1935 	 */
1936 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
1937 		err = EINVAL;
1938 		goto failed;
1939 	}
1940 
1941 	nmp = mp->b_cont;
1942 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1943 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1944 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1945 		err = EINVAL;
1946 		goto failed;
1947 	}
1948 
1949 	off = dlp->dl_dest_addr_offset;
1950 	len = dlp->dl_dest_addr_length;
1951 
1952 	if (!MBLKIN(nmp, off, len)) {
1953 		err = EINVAL;
1954 		goto failed;
1955 	}
1956 
1957 	rw_enter(&dsp->ds_lock, RW_READER);
1958 	if (dsp->ds_dlstate != DL_IDLE) {
1959 		rw_exit(&dsp->ds_lock);
1960 		err = ENOTSUP;
1961 		goto failed;
1962 	}
1963 
1964 	addr_length = dsp->ds_mip->mi_addr_length;
1965 	if (len != addr_length + sizeof (uint16_t)) {
1966 		rw_exit(&dsp->ds_lock);
1967 		err = EINVAL;
1968 		goto failed;
1969 	}
1970 
1971 	addr = nmp->b_rptr + off;
1972 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
1973 	dc = dsp->ds_dc;
1974 
1975 	if ((hmp = dls_header(dc, addr, sap, 0, NULL)) == NULL) {
1976 		rw_exit(&dsp->ds_lock);
1977 		err = ENOMEM;
1978 		goto failed;
1979 	}
1980 
1981 	/*
1982 	 * This is a performance optimization.  We originally entered
1983 	 * as reader and only become writer upon transitioning into
1984 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
1985 	 * stay as reader and return the fast-path header to IP.
1986 	 */
1987 	if (dsp->ds_mode != DLD_FASTPATH) {
1988 		if (!rw_tryupgrade(&dsp->ds_lock)) {
1989 			rw_exit(&dsp->ds_lock);
1990 			rw_enter(&dsp->ds_lock, RW_WRITER);
1991 
1992 			/*
1993 			 * State may have changed before we re-acquired
1994 			 * the writer lock in case the upgrade failed.
1995 			 */
1996 			if (dsp->ds_dlstate != DL_IDLE) {
1997 				rw_exit(&dsp->ds_lock);
1998 				err = ENOTSUP;
1999 				goto failed;
2000 			}
2001 		}
2002 
2003 		/*
2004 		 * Set the receive callback (unless polling is enabled).
2005 		 */
2006 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
2007 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
2008 
2009 		/*
2010 		 * Note that fast-path mode is enabled.
2011 		 */
2012 		dsp->ds_mode = DLD_FASTPATH;
2013 	}
2014 	rw_exit(&dsp->ds_lock);
2015 
2016 	freemsg(nmp->b_cont);
2017 	nmp->b_cont = hmp;
2018 
2019 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2020 	return;
2021 failed:
2022 	miocnak(q, mp, 0, err);
2023 }
2024 
2025 /*
2026  * Catch-all handler.
2027  */
2028 static void
2029 ioc(dld_str_t *dsp, mblk_t *mp)
2030 {
2031 	queue_t	*q = dsp->ds_wq;
2032 	mac_handle_t mh;
2033 
2034 	rw_enter(&dsp->ds_lock, RW_READER);
2035 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2036 		rw_exit(&dsp->ds_lock);
2037 		miocnak(q, mp, 0, EINVAL);
2038 		return;
2039 	}
2040 	mh = dsp->ds_mh;
2041 	ASSERT(mh != NULL);
2042 	rw_exit(&dsp->ds_lock);
2043 	mac_ioctl(mh, q, mp);
2044 }
2045 
2046 /*
2047  * Allocate a new minor number.
2048  */
2049 static minor_t
2050 dld_minor_hold(boolean_t sleep)
2051 {
2052 	minor_t		minor;
2053 
2054 	/*
2055 	 * Grab a value from the arena.
2056 	 */
2057 	atomic_add_32(&minor_count, 1);
2058 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
2059 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
2060 		atomic_add_32(&minor_count, -1);
2061 		return (0);
2062 	}
2063 
2064 	return (minor);
2065 }
2066 
/*
 * Release a previously allocated minor number: the counterpart of
 * dld_minor_hold().
 */
static void
dld_minor_rele(minor_t minor)
{
	/*
	 * Return the value to the arena.
	 */
	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);

	/*
	 * Drop the count taken by dld_minor_hold().
	 */
	atomic_add_32(&minor_count, -1);
}
2080