xref: /titanic_44/usr/src/uts/common/io/dld/dld_str.c (revision 39c23413b8df94a95f67b34cfd4a4dfc3fd0b48d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Data-Link Driver
30  */
31 
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/strsubr.h>
35 #include	<sys/atomic.h>
36 #include	<sys/mkdev.h>
37 #include	<sys/vlan.h>
38 #include	<sys/dld.h>
39 #include	<sys/dld_impl.h>
40 #include	<sys/dls_impl.h>
41 #include	<inet/common.h>
42 
/*
 * Forward declarations of file-local routines.
 *
 * str_constructor() and str_destructor() are the kmem cache callbacks
 * registered in dld_str_init(); str_notify() is the callback handed to
 * mac_notify_add() in dld_str_attach().
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);
static void	str_notify(void *, mac_notify_type_t);

/*
 * ioctl handling, minor number management and M_DATA helpers.
 */
static void	ioc_native(dld_str_t *,  mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static minor_t	dld_minor_hold(boolean_t);
static void	dld_minor_rele(minor_t);
static void	str_mdata_raw_put(dld_str_t *, mblk_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);
65 
/* Number of dld_str_t objects currently handed out by dld_str_create(). */
static uint32_t		str_count;
/* Object cache for dld_str_t, created in dld_str_init(). */
static kmem_cache_t	*str_cachep;
/* vmem arena used to manage minor numbers, created in dld_str_init(). */
static vmem_t		*minor_arenap;
/*
 * Count of minor numbers in use; dld_str_fini() fails with EBUSY while it
 * is non-zero.  (Presumably maintained by dld_minor_hold()/dld_minor_rele(),
 * which are defined elsewhere in this file -- confirm.)
 */
static uint32_t		minor_count;
/* Hash table of dld_str_t objects, keyed by ds_minor. */
static mod_hash_t	*str_hashp;

/* Convert between a minor number and the pointer-sized value vmem uses. */
#define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
#define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))

/* Size of str_hashp and the cast used to build its keys. */
#define	STR_HASHSZ		64
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
77 
78 /*
79  * Some notes on entry points, flow-control, queueing and locking:
80  *
81  * This driver exports the traditional STREAMS put entry point as well as
82  * the non-STREAMS fast-path transmit routine which is provided to IP via
83  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
84  * and data operations, while the fast-path routine deals only with M_DATA
85  * fast-path packets.  Regardless of the entry point, all outbound packets
86  * will end up in dld_tx_single(), where they will be delivered to the MAC
87  * driver.
88  *
89  * The transmit logic operates in two modes: a "not busy" mode where the
90  * packets will be delivered to the MAC for a send attempt, or "busy" mode
91  * where they will be enqueued in the internal queue because of flow-control.
92  * Flow-control happens when the MAC driver indicates the packets couldn't
93  * be transmitted due to lack of resources (e.g. running out of descriptors).
94  * In such case, the driver will place a dummy message on its write-side
95  * STREAMS queue so that the queue is marked as "full".  Any subsequent
96  * packets arriving at the driver will be enqueued in the internal queue,
97  * which is drained in the context of the service thread that gets scheduled
98  * whenever the driver is in the "busy" mode.  When all packets have been
99  * successfully delivered by MAC and the internal queue is empty, it will
100  * transition to the "not busy" mode by removing the dummy message from the
101  * write-side STREAMS queue; in effect this will trigger backenabling.
102  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
103  * to the above reasons.
104  *
105  * The driver implements an internal transmit queue independent of STREAMS.
106  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
107  * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
108  * getq() operations done by the driver are those related to placing and
109  * removing the dummy message to/from the write-side STREAMS queue for flow-
110  * control purposes.
111  *
112  * Locking is done independent of STREAMS due to the driver being fully MT.
113  * Threads entering the driver (either from put or service entry points)
114  * will most likely be readers, with the exception of a few writer cases
115  * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
116  * DLD-related ioctl requests.  The DLPI detach case is special, because
117  * it involves freeing resources and therefore must be single-threaded.
118  * Unfortunately the readers/writers lock can't be used to protect against
119  * it, because the lock is dropped prior to the driver calling places where
120  * putnext() may be invoked, and such places may depend on those resources
121  * to exist.  Because of this, the driver always completes the DLPI detach
122  * process when there are no other threads running in the driver.  This is
123  * done by keeping track of the number of threads, such that the last
124  * thread leaving the driver will finish the pending DLPI detach operation.
125  */
126 
/*
 * dld_max_q_count is the queue depth threshold used to limit the number of
 * outstanding packets or bytes allowed in the queue; once this limit is
 * reached the driver will free any incoming ones until the queue depth
 * drops below the threshold.
 *
 * This buffering is provided to accommodate clients which do not employ
 * their own buffering scheme, and to handle occasional packet bursts.
 * Clients which handle their own buffering will receive positive feedback
 * from this driver as soon as it transitions into the "busy" state, i.e.
 * when the queue is initially filled up; they will get backenabled once
 * the queue is empty.
 *
 * The value chosen here is rather arbitrary; in the future some intelligent
 * heuristics may be involved which could take into account the hardware's
 * transmit ring size, etc.
 */
uint_t dld_max_q_count = (16 * 1024 *1024);
145 
146 /*
147  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
148  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
149  * match dev_t. If a stream is found and it is attached, its dev_info_t *
150  * is returned.
151  */
typedef struct i_dld_str_state_s {
	major_t		ds_major;	/* major number being looked up */
	minor_t		ds_minor;	/* minor number being looked up */
	dev_info_t	*ds_dip;	/* result; stays NULL if none found */
} i_dld_str_state_t;
157 
158 /* ARGSUSED */
159 static uint_t
160 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
161 {
162 	i_dld_str_state_t	*statep = arg;
163 	dld_str_t		*dsp = (dld_str_t *)val;
164 
165 	if (statep->ds_major != dsp->ds_major)
166 		return (MH_WALK_CONTINUE);
167 
168 	ASSERT(statep->ds_minor != 0);
169 
170 	/*
171 	 * Access to ds_ppa and ds_mh need to be protected by ds_lock.
172 	 */
173 	rw_enter(&dsp->ds_lock, RW_READER);
174 	if (statep->ds_minor <= DLD_MAX_MINOR) {
175 		/*
176 		 * Style 1: minor can be derived from the ppa. we
177 		 * continue to walk until we find a matching stream
178 		 * in attached state.
179 		 */
180 		if (statep->ds_minor == DLS_PPA2MINOR(dsp->ds_ppa) &&
181 		    dsp->ds_mh != NULL) {
182 			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
183 			rw_exit(&dsp->ds_lock);
184 			return (MH_WALK_TERMINATE);
185 		}
186 	} else {
187 		/*
188 		 * Clone: a clone minor is unique. we can terminate the
189 		 * walk if we find a matching stream -- even if we fail
190 		 * to obtain the devinfo.
191 		 */
192 		if (statep->ds_minor == dsp->ds_minor) {
193 			if (dsp->ds_mh != NULL)
194 				statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
195 			rw_exit(&dsp->ds_lock);
196 			return (MH_WALK_TERMINATE);
197 		}
198 	}
199 	rw_exit(&dsp->ds_lock);
200 	return (MH_WALK_CONTINUE);
201 }
202 
203 static dev_info_t *
204 dld_finddevinfo(dev_t dev)
205 {
206 	i_dld_str_state_t	state;
207 
208 	state.ds_minor = getminor(dev);
209 	state.ds_major = getmajor(dev);
210 	state.ds_dip = NULL;
211 
212 	if (state.ds_minor == 0)
213 		return (NULL);
214 
215 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
216 	return (state.ds_dip);
217 }
218 
219 
220 /*
221  * devo_getinfo: getinfo(9e)
222  */
223 /*ARGSUSED*/
224 int
225 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
226 {
227 	dev_info_t	*devinfo;
228 	minor_t		minor = getminor((dev_t)arg);
229 	int		rc = DDI_FAILURE;
230 
231 	switch (cmd) {
232 	case DDI_INFO_DEVT2DEVINFO:
233 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
234 			*(dev_info_t **)resp = devinfo;
235 			rc = DDI_SUCCESS;
236 		}
237 		break;
238 	case DDI_INFO_DEVT2INSTANCE:
239 		if (minor > 0 && minor <= DLD_MAX_MINOR) {
240 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
241 			rc = DDI_SUCCESS;
242 		} else if (minor > DLD_MAX_MINOR &&
243 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
244 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
245 			rc = DDI_SUCCESS;
246 		}
247 		break;
248 	}
249 	return (rc);
250 }
251 
252 /*
253  * qi_qopen: open(9e)
254  */
255 /*ARGSUSED*/
256 int
257 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
258 {
259 	dld_str_t	*dsp;
260 	major_t		major;
261 	minor_t		minor;
262 	int		err;
263 
264 	if (sflag == MODOPEN)
265 		return (ENOTSUP);
266 
267 	/*
268 	 * This is a cloning driver and therefore each queue should only
269 	 * ever get opened once.
270 	 */
271 	if (rq->q_ptr != NULL)
272 		return (EBUSY);
273 
274 	major = getmajor(*devp);
275 	minor = getminor(*devp);
276 	if (minor > DLD_MAX_MINOR)
277 		return (ENODEV);
278 
279 	/*
280 	 * Create a new dld_str_t for the stream. This will grab a new minor
281 	 * number that will be handed back in the cloned dev_t.  Creation may
282 	 * fail if we can't allocate the dummy mblk used for flow-control.
283 	 */
284 	dsp = dld_str_create(rq, DLD_DLPI, major,
285 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
286 	if (dsp == NULL)
287 		return (ENOSR);
288 
289 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
290 	if (minor != 0) {
291 		/*
292 		 * Style 1 open
293 		 */
294 
295 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
296 			goto failed;
297 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
298 	} else {
299 		(void) qassociate(rq, -1);
300 	}
301 
302 	/*
303 	 * Enable the queue srv(9e) routine.
304 	 */
305 	qprocson(rq);
306 
307 	/*
308 	 * Construct a cloned dev_t to hand back.
309 	 */
310 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
311 	return (0);
312 
313 failed:
314 	dld_str_destroy(dsp);
315 	return (err);
316 }
317 
318 /*
319  * qi_qclose: close(9e)
320  */
321 int
322 dld_close(queue_t *rq)
323 {
324 	dld_str_t	*dsp = rq->q_ptr;
325 
326 	/*
327 	 * Wait until pending requests are processed.
328 	 */
329 	mutex_enter(&dsp->ds_thr_lock);
330 	while (dsp->ds_pending_cnt > 0)
331 		cv_wait(&dsp->ds_pending_cv, &dsp->ds_thr_lock);
332 	mutex_exit(&dsp->ds_thr_lock);
333 
334 	/*
335 	 * Disable the queue srv(9e) routine.
336 	 */
337 	qprocsoff(rq);
338 
339 	/*
340 	 * At this point we can not be entered by any threads via STREAMS
341 	 * or the direct call interface, which is available only to IP.
342 	 * After the interface is unplumbed, IP wouldn't have any reference
343 	 * to this instance, and therefore we are now effectively single
344 	 * threaded and don't require any lock protection.  Flush all
345 	 * pending packets which are sitting in the transmit queue.
346 	 */
347 	ASSERT(dsp->ds_thr == 0);
348 	dld_tx_flush(dsp);
349 
350 	/*
351 	 * This stream was open to a provider node. Check to see
352 	 * if it has been cleanly shut down.
353 	 */
354 	if (dsp->ds_dlstate != DL_UNATTACHED) {
355 		/*
356 		 * The stream is either open to a style 1 provider or
357 		 * this is not clean shutdown. Detach from the PPA.
358 		 * (This is still ok even in the style 1 case).
359 		 */
360 		dld_str_detach(dsp);
361 	}
362 
363 	dld_str_destroy(dsp);
364 	return (0);
365 }
366 
367 /*
368  * qi_qputp: put(9e)
369  */
370 void
371 dld_wput(queue_t *wq, mblk_t *mp)
372 {
373 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
374 
375 	DLD_ENTER(dsp);
376 
377 	switch (DB_TYPE(mp)) {
378 	case M_DATA:
379 		rw_enter(&dsp->ds_lock, RW_READER);
380 		if (dsp->ds_dlstate != DL_IDLE ||
381 		    dsp->ds_mode == DLD_UNITDATA) {
382 			freemsg(mp);
383 		} else if (dsp->ds_mode == DLD_FASTPATH) {
384 			str_mdata_fastpath_put(dsp, mp);
385 		} else if (dsp->ds_mode == DLD_RAW) {
386 			str_mdata_raw_put(dsp, mp);
387 		}
388 		rw_exit(&dsp->ds_lock);
389 		break;
390 	case M_PROTO:
391 	case M_PCPROTO:
392 		dld_proto(dsp, mp);
393 		break;
394 	case M_IOCTL:
395 		dld_ioc(dsp, mp);
396 		break;
397 	case M_FLUSH:
398 		if (*mp->b_rptr & FLUSHW) {
399 			dld_tx_flush(dsp);
400 			*mp->b_rptr &= ~FLUSHW;
401 		}
402 
403 		if (*mp->b_rptr & FLUSHR) {
404 			qreply(wq, mp);
405 		} else {
406 			freemsg(mp);
407 		}
408 		break;
409 	default:
410 		freemsg(mp);
411 		break;
412 	}
413 
414 	DLD_EXIT(dsp);
415 }
416 
/*
 * qi_srvp: srv(9e)
 *
 * Write-side service procedure.  Takes the whole internal transmit list,
 * hands it to dls_tx() (or frees it if the stream is no longer DL_IDLE),
 * puts back whatever could not be sent, and lifts flow-control once the
 * list is empty.
 */
void
dld_wsrv(queue_t *wq)
{
	mblk_t		*mp;
	dld_str_t	*dsp = wq->q_ptr;

	DLD_ENTER(dsp);
	rw_enter(&dsp->ds_lock, RW_READER);
	/*
	 * Grab all packets (chained via b_next) off our transmit queue
	 * and try to send them all to the MAC layer.  Since the queue
	 * is independent of streams, we are able to dequeue all messages
	 * at once without looping through getq() and manually chaining
	 * them.  Note that the queue size parameters (byte and message
	 * counts) are cleared as well, but we postpone the backenabling
	 * until after the MAC transmit since some packets may end up
	 * back at our transmit queue.
	 */
	mutex_enter(&dsp->ds_tx_list_lock);
	if ((mp = dsp->ds_tx_list_head) == NULL) {
		/*
		 * Nothing queued: the stream must not be in the busy
		 * state and must still own its dummy flow-control mblk.
		 */
		ASSERT(!dsp->ds_tx_qbusy);
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		ASSERT(dsp->ds_tx_list_head == NULL);
		ASSERT(dsp->ds_tx_list_tail == NULL);
		ASSERT(dsp->ds_tx_cnt == 0);
		ASSERT(dsp->ds_tx_msgcnt == 0);
		mutex_exit(&dsp->ds_tx_list_lock);
		rw_exit(&dsp->ds_lock);
		DLD_EXIT(dsp);
		return;
	}
	/* Detach the whole chain and reset the list accounting. */
	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Discard packets unless we are attached and bound; note that
	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
	 * because regardless of the mode all transmit will end up in
	 * dld_tx_single() where the packets may be queued.
	 */
	ASSERT(DB_TYPE(mp) == M_DATA);
	if (dsp->ds_dlstate != DL_IDLE) {
		freemsgchain(mp);
		goto done;
	}

	/*
	 * Attempt to transmit one or more packets.  If the MAC can't
	 * send them all, re-queue the packet(s) at the beginning of
	 * the transmit queue to avoid any re-ordering.
	 */
	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
		dld_tx_enqueue(dsp, mp, B_TRUE);

done:
	/*
	 * Grab the list lock again and check if the transmit queue is
	 * really empty; if so, lift up flow-control and backenable any
	 * writer queues.  If the queue is not empty, schedule service
	 * thread to drain it.
	 */
	mutex_enter(&dsp->ds_tx_list_lock);
	if (dsp->ds_tx_list_head == NULL) {
		/*
		 * Taking the dummy mblk back off the STREAMS write queue
		 * is what un-fills the queue; keep it for the next time
		 * flow-control has to be asserted.
		 */
		dsp->ds_tx_flow_mp = getq(wq);
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		dsp->ds_tx_qbusy = B_FALSE;
	}
	mutex_exit(&dsp->ds_tx_list_lock);

	rw_exit(&dsp->ds_lock);
	DLD_EXIT(dsp);
}
493 
494 void
495 dld_init_ops(struct dev_ops *ops, const char *name)
496 {
497 	struct streamtab *stream;
498 	struct qinit *rq, *wq;
499 	struct module_info *modinfo;
500 
501 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
502 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
503 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
504 	modinfo->mi_minpsz = 0;
505 	modinfo->mi_maxpsz = 64*1024;
506 	modinfo->mi_hiwat  = 1;
507 	modinfo->mi_lowat = 0;
508 
509 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
510 	rq->qi_qopen = dld_open;
511 	rq->qi_qclose = dld_close;
512 	rq->qi_minfo = modinfo;
513 
514 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
515 	wq->qi_putp = (pfi_t)dld_wput;
516 	wq->qi_srvp = (pfi_t)dld_wsrv;
517 	wq->qi_minfo = modinfo;
518 
519 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
520 	stream->st_rdinit = rq;
521 	stream->st_wrinit = wq;
522 	ops->devo_cb_ops->cb_str = stream;
523 
524 	ops->devo_getinfo = &dld_getinfo;
525 }
526 
527 void
528 dld_fini_ops(struct dev_ops *ops)
529 {
530 	struct streamtab *stream;
531 	struct qinit *rq, *wq;
532 	struct module_info *modinfo;
533 
534 	stream = ops->devo_cb_ops->cb_str;
535 	rq = stream->st_rdinit;
536 	wq = stream->st_wrinit;
537 	modinfo = rq->qi_minfo;
538 	ASSERT(wq->qi_minfo == modinfo);
539 
540 	kmem_free(stream, sizeof (struct streamtab));
541 	kmem_free(wq, sizeof (struct qinit));
542 	kmem_free(rq, sizeof (struct qinit));
543 	kmem_free(modinfo->mi_idname, FMNAMESZ);
544 	kmem_free(modinfo, sizeof (struct module_info));
545 }
546 
547 /*
548  * Initialize this module's data structures.
549  */
550 void
551 dld_str_init(void)
552 {
553 	/*
554 	 * Create dld_str_t object cache.
555 	 */
556 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
557 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
558 	ASSERT(str_cachep != NULL);
559 
560 	/*
561 	 * Allocate a vmem arena to manage minor numbers. The range of the
562 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
563 	 * minor number).
564 	 */
565 	minor_arenap = vmem_create("dld_minor_arena",
566 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
567 	    VM_SLEEP | VMC_IDENTIFIER);
568 	ASSERT(minor_arenap != NULL);
569 
570 	/*
571 	 * Create a hash table for maintaining dld_str_t's.
572 	 * The ds_minor field (the clone minor number) of a dld_str_t
573 	 * is used as a key for this hash table because this number is
574 	 * globally unique (allocated from "dld_minor_arena").
575 	 */
576 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
577 	    mod_hash_null_valdtor);
578 }
579 
580 /*
581  * Tear down this module's data structures.
582  */
583 int
584 dld_str_fini(void)
585 {
586 	/*
587 	 * Make sure that there are no objects in use.
588 	 */
589 	if (str_count != 0)
590 		return (EBUSY);
591 
592 	/*
593 	 * Check to see if there are any minor numbers still in use.
594 	 */
595 	if (minor_count != 0)
596 		return (EBUSY);
597 
598 	/*
599 	 * Destroy object cache.
600 	 */
601 	kmem_cache_destroy(str_cachep);
602 	vmem_destroy(minor_arenap);
603 	mod_hash_destroy_idhash(str_hashp);
604 	return (0);
605 }
606 
607 /*
608  * Create a new dld_str_t object.
609  */
610 dld_str_t *
611 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
612 {
613 	dld_str_t	*dsp;
614 	int		err;
615 
616 	/*
617 	 * Allocate an object from the cache.
618 	 */
619 	atomic_add_32(&str_count, 1);
620 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
621 
622 	/*
623 	 * Allocate the dummy mblk for flow-control.
624 	 */
625 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
626 	if (dsp->ds_tx_flow_mp == NULL) {
627 		kmem_cache_free(str_cachep, dsp);
628 		atomic_add_32(&str_count, -1);
629 		return (NULL);
630 	}
631 	dsp->ds_type = type;
632 	dsp->ds_major = major;
633 	dsp->ds_style = style;
634 
635 	/*
636 	 * Initialize the queue pointers.
637 	 */
638 	ASSERT(RD(rq) == rq);
639 	dsp->ds_rq = rq;
640 	dsp->ds_wq = WR(rq);
641 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
642 
643 	/*
644 	 * We want explicit control over our write-side STREAMS queue
645 	 * where the dummy mblk gets added/removed for flow-control.
646 	 */
647 	noenable(WR(rq));
648 
649 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
650 	    (mod_hash_val_t)dsp);
651 	ASSERT(err == 0);
652 	return (dsp);
653 }
654 
/*
 * Destroy a dld_str_t object: unlink it from its queue pair, reset the
 * flags that dld_str_create()/later processing may have changed, free the
 * dummy flow-control mblk, remove the object from str_hashp and return it
 * to the object cache.  The transmit list must already be empty and no
 * thread may be active in the stream (both are asserted).
 */
void
dld_str_destroy(dld_str_t *dsp)
{
	queue_t		*rq;
	queue_t		*wq;
	mod_hash_val_t	val;
	/*
	 * Clear the queue pointers.
	 */
	rq = dsp->ds_rq;
	wq = dsp->ds_wq;
	ASSERT(wq == WR(rq));

	rq->q_ptr = wq->q_ptr = NULL;
	dsp->ds_rq = dsp->ds_wq = NULL;

	/* The transmit side must be idle and unlocked. */
	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
	ASSERT(dsp->ds_tx_list_head == NULL);
	ASSERT(dsp->ds_tx_list_tail == NULL);
	ASSERT(dsp->ds_tx_cnt == 0);
	ASSERT(dsp->ds_tx_msgcnt == 0);
	ASSERT(!dsp->ds_tx_qbusy);

	/* No thread may be inside the stream, nor any request pending. */
	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
	ASSERT(dsp->ds_thr == 0);
	ASSERT(dsp->ds_pending_req == NULL);

	/*
	 * Reinitialize all the flags.
	 */
	dsp->ds_notifications = 0;
	dsp->ds_passivestate = DLD_UNINITIALIZED;
	dsp->ds_mode = DLD_UNITDATA;
	dsp->ds_native = B_FALSE;

	/*
	 * Free the dummy mblk if exists.
	 */
	if (dsp->ds_tx_flow_mp != NULL) {
		freeb(dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	/* Drop the hash entry inserted by dld_str_create(). */
	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
	ASSERT(dsp == (dld_str_t *)val);

	/*
	 * Free the object back to the cache.
	 */
	kmem_cache_free(str_cachep, dsp);
	atomic_add_32(&str_count, -1);
}
711 
712 /*
713  * kmem_cache contructor function: see kmem_cache_create(9f).
714  */
715 /*ARGSUSED*/
716 static int
717 str_constructor(void *buf, void *cdrarg, int kmflags)
718 {
719 	dld_str_t	*dsp = buf;
720 
721 	bzero(buf, sizeof (dld_str_t));
722 
723 	/*
724 	 * Allocate a new minor number.
725 	 */
726 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
727 		return (-1);
728 
729 	/*
730 	 * Initialize the DLPI state machine.
731 	 */
732 	dsp->ds_dlstate = DL_UNATTACHED;
733 	dsp->ds_ppa = (t_uscalar_t)-1;
734 
735 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
736 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
737 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
738 	cv_init(&dsp->ds_pending_cv, NULL, CV_DRIVER, NULL);
739 
740 	return (0);
741 }
742 
/*
 * kmem_cache destructor function.
 *
 * Checks that the object was returned to its constructed state, releases
 * the minor number reserved by str_constructor() and destroys the locks
 * and condition variable.
 */
/*ARGSUSED*/
static void
str_destructor(void *buf, void *cdrarg)
{
	dld_str_t	*dsp = buf;

	/*
	 * Make sure the DLPI state machine was reset.
	 */
	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);

	/*
	 * Make sure the data-link interface was closed.
	 */
	ASSERT(dsp->ds_mh == NULL);
	ASSERT(dsp->ds_dc == NULL);

	/*
	 * Make sure enabled notifications are cleared.
	 */
	ASSERT(dsp->ds_notifications == 0);

	/*
	 * Make sure polling is disabled.
	 */
	ASSERT(!dsp->ds_polling);

	/*
	 * Release the minor number.
	 */
	dld_minor_rele(dsp->ds_minor);

	/* Each lock must be free before it is destroyed. */
	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
	rw_destroy(&dsp->ds_lock);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
	mutex_destroy(&dsp->ds_tx_list_lock);
	/* dld_str_destroy() frees and clears the dummy mblk. */
	ASSERT(dsp->ds_tx_flow_mp == NULL);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
	mutex_destroy(&dsp->ds_thr_lock);
	ASSERT(dsp->ds_pending_req == NULL);
	ASSERT(dsp->ds_pending_op == NULL);
	ASSERT(dsp->ds_pending_cnt == 0);
	cv_destroy(&dsp->ds_pending_cv);
}
792 
793 /*
794  * M_DATA put. Note that mp is a single message, not a chained message.
795  */
796 void
797 dld_tx_single(dld_str_t *dsp, mblk_t *mp)
798 {
799 	/*
800 	 * This function can be called from within dld or from an upper
801 	 * layer protocol (currently only tcp). If we are in the busy
802 	 * mode enqueue the packet(s) and return.  Otherwise hand them
803 	 * over to the MAC driver for transmission; any remaining one(s)
804 	 * which didn't get sent will be queued.
805 	 *
806 	 * Note here that we don't grab the list lock prior to checking
807 	 * the busy flag.  This is okay, because a missed transition
808 	 * will not cause any packet reordering for any particular TCP
809 	 * connection (which is single-threaded).  The enqueue routine
810 	 * will atomically set the busy flag and schedule the service
811 	 * thread to run; the flag is only cleared by the service thread
812 	 * when there is no more packet to be transmitted.
813 	 */
814 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
815 		dld_tx_enqueue(dsp, mp, B_FALSE);
816 }
817 
/*
 * Update the priority bits and VID of an Ethernet packet (a tag may need
 * to be inserted if mp points to an untagged packet).
 *
 * If pri is 0, the priority already encoded in the packet is used (0 for
 * an untagged packet).  If vid is VLAN_ID_NONE, the VID encoded in the
 * packet is used.  The CFI bit of an existing tag is preserved.
 *
 * Returns the head of the updated message, which may differ from mp.
 * Returns NULL if an mblk could not be allocated or the tagged header
 * could not be pulled up; in that case mp itself is not freed and remains
 * the caller's to dispose of.
 *
 * NOTE(review): ether_tpid and ether_tci are read from the first mblk
 * before MBLKL(mp) is checked against the header size -- this looks like
 * it relies on callers passing a first mblk that holds at least that much
 * data; confirm.
 */
static mblk_t *
i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
{
	mblk_t *hmp;
	struct ether_vlan_header *evhp;
	struct ether_header *ehp;
	uint16_t old_tci = 0;
	size_t len;

	ASSERT(pri != 0 || vid != VLAN_ID_NONE);

	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		/*
		 * Tagged packet, update the priority bits.
		 */
		old_tci = ntohs(evhp->ether_tci);
		len = sizeof (struct ether_vlan_header);

		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
			/*
			 * In case some drivers only check the db_ref
			 * count of the first mblk, we pullup the
			 * message into a single mblk.
			 */
			hmp = msgpullup(mp, -1);
			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
				freemsg(hmp);
				return (NULL);
			} else {
				/* The private copy replaces the original. */
				freemsg(mp);
				mp = hmp;
			}
		}

		/* mp may have changed above; re-derive the header. */
		evhp = (struct ether_vlan_header *)mp->b_rptr;
	} else {
		/*
		 * Untagged packet. Insert the special priority tag.
		 * First allocate a header mblk.
		 */
		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
		if (hmp == NULL)
			return (NULL);

		evhp = (struct ether_vlan_header *)hmp->b_rptr;
		ehp = (struct ether_header *)mp->b_rptr;

		/*
		 * Copy the MAC addresses and typelen
		 */
		bcopy(ehp, evhp, (ETHERADDRL * 2));
		evhp->ether_type = ehp->ether_type;
		evhp->ether_tpid = htons(ETHERTYPE_VLAN);

		/* The new header is complete; skip the old one in mp. */
		hmp->b_wptr += sizeof (struct ether_vlan_header);
		mp->b_rptr += sizeof (struct ether_header);

		/*
		 * Free the original message if it's now empty. Link the
		 * rest of messages to the header message.
		 */
		if (MBLKL(mp) == 0) {
			hmp->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			hmp->b_cont = mp;
		}
		mp = hmp;
	}

	/* Fall back to the packet's own values where none were given. */
	if (pri == 0)
		pri = VLAN_PRI(old_tci);
	if (vid == VLAN_ID_NONE)
		vid = VLAN_ID(old_tci);
	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
	return (mp);
}
901 
902 /*
903  * M_DATA put (IP fast-path mode)
904  */
905 void
906 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
907 {
908 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
909 	mblk_t *newmp;
910 	uint_t pri;
911 
912 	if (is_ethernet) {
913 		/*
914 		 * Update the priority bits to the assigned priority.
915 		 */
916 		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
917 
918 		if (pri != 0) {
919 			newmp = i_dld_ether_header_update_tag(mp, pri,
920 			    VLAN_ID_NONE);
921 			if (newmp == NULL)
922 				goto discard;
923 			mp = newmp;
924 		}
925 	}
926 
927 	dld_tx_single(dsp, mp);
928 	return;
929 
930 discard:
931 	/* TODO: bump kstat? */
932 	freemsg(mp);
933 }
934 
/*
 * M_DATA put (DLIOCRAW mode)
 *
 * Validates a raw packet (message types, total size, VLAN fields), fixes
 * up its VLAN tag where required and passes it to dld_tx_single().  Any
 * packet that fails a check is freed.
 */
static void
str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
{
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *bp, *newmp;
	size_t size;
	mac_header_info_t mhi;
	uint_t pri, vid;

	/*
	 * Certain MAC type plugins provide an illusion for raw DLPI
	 * consumers.  They pretend that the MAC layer is something that
	 * it's not for the benefit of observability tools.  For example,
	 * mac_wifi pretends that it's Ethernet for such consumers.
	 * Here, unless native mode is enabled, we call into the MAC layer so
	 * that this illusion can be maintained.  The plugin will optionally
	 * transform the MAC header here into something that can be passed
	 * down.  The header goes from raw mode to "cooked" mode.
	 */
	if (!dsp->ds_native) {
		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
			goto discard;
		mp = newmp;
	}

	/* Start the running total with the first fragment. */
	size = MBLKL(mp);

	/*
	 * Check the packet is not too big and that any remaining
	 * fragment list is composed entirely of M_DATA messages. (We
	 * know the first fragment was M_DATA otherwise we could not
	 * have got here).
	 */
	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
		if (DB_TYPE(bp) != M_DATA)
			goto discard;
		size += MBLKL(bp);
	}

	/* Parse the MAC header; mhi feeds the size and VLAN checks below. */
	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
		goto discard;

	/*
	 * If LSO is enabled, check the size against lso_max. Otherwise,
	 * compare the packet size with sdu_max.
	 */
	if (size > (dsp->ds_lso ? dsp->ds_lso_max : dsp->ds_mip->mi_sdu_max)
	    + mhi.mhi_hdrsize)
		goto discard;

	if (is_ethernet) {
		/*
		 * Discard the packet if this is a VLAN stream but the VID in
		 * the packet is not correct.
		 */
		/*
		 * As coded, any packet that already carries a VID is
		 * rejected on a VLAN stream; the stream's own VID is
		 * written in below.
		 */
		vid = VLAN_ID(mhi.mhi_tci);
		if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
			goto discard;

		/*
		 * Discard the packet if this packet is a tagged packet
		 * but both pri and VID are 0.
		 */
		pri = VLAN_PRI(mhi.mhi_tci);
		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
			goto discard;

		/*
		 * Update the priority bits to the per-stream priority if
		 * priority is not set in the packet. Update the VID for
		 * packets on a VLAN stream.
		 */
		/*
		 * A pri of 0 passed down means "keep the priority
		 * already in the packet".
		 */
		pri = (pri == 0) ? dsp->ds_pri : 0;
		if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
			if ((newmp = i_dld_ether_header_update_tag(mp,
			    pri, dsp->ds_vid)) == NULL) {
				goto discard;
			}
			mp = newmp;
		}
	}

	dld_tx_single(dsp, mp);
	return;

discard:
	/* TODO: bump kstat? */
	freemsg(mp);
}
1027 
/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 *
 * Opens a data-link channel named "<driver><ppa>", caches the MAC handle,
 * MAC information, addresses and VLAN identifier in dsp, registers
 * str_notify() for MAC notifications and moves the stream to DL_UNBOUND.
 *
 * Returns 0 on success, EINVAL if the driver name cannot be determined or
 * qassociate() fails, or the error returned by dls_open().
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	int			err;
	const char		*drvname;
	char			name[MAXNAMELEN];
	dls_channel_t		dc;
	uint_t			addr_length;

	ASSERT(dsp->ds_dc == NULL);

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);

	/*
	 * Associate the queue with the device instance behind the PPA;
	 * this step is skipped for the "aggr" driver.
	 */
	if (strcmp(drvname, "aggr") != 0 &&
	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
		return (EINVAL);

	/*
	 * Open a channel.
	 */
	if ((err = dls_open(name, &dc)) != 0) {
		/* Undo the instance association made above. */
		(void) qassociate(dsp->ds_wq, -1);
		return (err);
	}

	/*
	 * Cache the MAC interface handle, a pointer to the immutable MAC
	 * information and the current and 'factory' MAC address.
	 */
	dsp->ds_mh = dls_mac(dc);
	dsp->ds_mip = mac_info(dsp->ds_mh);

	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

	addr_length = dsp->ds_mip->mi_addr_length;
	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);

	/*
	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
	 * a non-VLAN interface).
	 */
	dsp->ds_vid = dls_vid(dc);

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);

	dsp->ds_ppa = ppa;
	dsp->ds_dc = dc;
	dsp->ds_dlstate = DL_UNBOUND;

	return (0);
}
1093 
1094 /*
1095  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1096  * from close(2) for style 2.
1097  */
1098 void
1099 dld_str_detach(dld_str_t *dsp)
1100 {
1101 	ASSERT(dsp->ds_thr == 0);
1102 
1103 	/*
1104 	 * Remove the notify function.
1105 	 */
1106 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
1107 
1108 	/*
1109 	 * Clear the polling and promisc flags.
1110 	 */
1111 	dsp->ds_polling = B_FALSE;
1112 	dsp->ds_soft_ring = B_FALSE;
1113 	dsp->ds_promisc = 0;
1114 
1115 	/*
1116 	 * Clear LSO flags.
1117 	 */
1118 	dsp->ds_lso = B_FALSE;
1119 	dsp->ds_lso_max = 0;
1120 
1121 	/*
1122 	 * Close the channel.
1123 	 */
1124 	dls_close(dsp->ds_dc);
1125 	dsp->ds_ppa = (t_uscalar_t)-1;
1126 	dsp->ds_dc = NULL;
1127 	dsp->ds_mh = NULL;
1128 
1129 	(void) qassociate(dsp->ds_wq, -1);
1130 
1131 	/*
1132 	 * Re-initialize the DLPI state machine.
1133 	 */
1134 	dsp->ds_dlstate = DL_UNATTACHED;
1135 
1136 }
1137 
1138 /*
1139  * This function is only called for VLAN streams. In raw mode, we strip VLAN
1140  * tags before sending packets up to the DLS clients, with the exception of
1141  * special priority tagged packets, in that case, we set the VID to 0.
1142  * mp must be a VLAN tagged packet.
1143  */
1144 static mblk_t *
1145 i_dld_ether_header_strip_tag(mblk_t *mp)
1146 {
1147 	mblk_t *newmp;
1148 	struct ether_vlan_header *evhp;
1149 	uint16_t tci, new_tci;
1150 
1151 	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1152 	if (DB_REF(mp) > 1) {
1153 		newmp = copymsg(mp);
1154 		if (newmp == NULL)
1155 			return (NULL);
1156 		freemsg(mp);
1157 		mp = newmp;
1158 	}
1159 	evhp = (struct ether_vlan_header *)mp->b_rptr;
1160 
1161 	tci = ntohs(evhp->ether_tci);
1162 	if (VLAN_PRI(tci) == 0) {
1163 		/*
1164 		 * Priority is 0, strip the tag.
1165 		 */
1166 		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1167 		mp->b_rptr += VLAN_TAGSZ;
1168 	} else {
1169 		/*
1170 		 * Priority is not 0, update the VID to 0.
1171 		 */
1172 		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1173 		evhp->ether_tci = htons(new_tci);
1174 	}
1175 	return (mp);
1176 }
1177 
1178 /*
1179  * Raw mode receive function.
1180  */
1181 /*ARGSUSED*/
1182 void
1183 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1184     mac_header_info_t *mhip)
1185 {
1186 	dld_str_t *dsp = (dld_str_t *)arg;
1187 	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1188 	mblk_t *next, *newmp;
1189 
1190 	ASSERT(mp != NULL);
1191 	do {
1192 		/*
1193 		 * Get the pointer to the next packet in the chain and then
1194 		 * clear b_next before the packet gets passed on.
1195 		 */
1196 		next = mp->b_next;
1197 		mp->b_next = NULL;
1198 
1199 		/*
1200 		 * Wind back b_rptr to point at the MAC header.
1201 		 */
1202 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1203 		mp->b_rptr -= mhip->mhi_hdrsize;
1204 
1205 		/*
1206 		 * Certain MAC type plugins provide an illusion for raw
1207 		 * DLPI consumers.  They pretend that the MAC layer is
1208 		 * something that it's not for the benefit of observability
1209 		 * tools.  For example, mac_wifi pretends that it's Ethernet
1210 		 * for such consumers.	Here, unless native mode is enabled,
1211 		 * we call into the MAC layer so that this illusion can be
1212 		 * maintained.	The plugin will optionally transform the MAC
1213 		 * header here into something that can be passed up to raw
1214 		 * consumers.  The header goes from "cooked" mode to raw mode.
1215 		 */
1216 		if (!dsp->ds_native) {
1217 			newmp = mac_header_uncook(dsp->ds_mh, mp);
1218 			if (newmp == NULL) {
1219 				freemsg(mp);
1220 				goto next;
1221 			}
1222 			mp = newmp;
1223 		}
1224 
1225 		/*
1226 		 * Strip the VLAN tag for VLAN streams.
1227 		 */
1228 		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
1229 			newmp = i_dld_ether_header_strip_tag(mp);
1230 			if (newmp == NULL) {
1231 				freemsg(mp);
1232 				goto next;
1233 			}
1234 			mp = newmp;
1235 		}
1236 
1237 		/*
1238 		 * Pass the packet on.
1239 		 */
1240 		if (canputnext(dsp->ds_rq))
1241 			putnext(dsp->ds_rq, mp);
1242 		else
1243 			freemsg(mp);
1244 
1245 next:
1246 		/*
1247 		 * Move on to the next packet in the chain.
1248 		 */
1249 		mp = next;
1250 	} while (mp != NULL);
1251 }
1252 
1253 /*
1254  * Fast-path receive function.
1255  */
1256 /*ARGSUSED*/
1257 void
1258 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1259     mac_header_info_t *mhip)
1260 {
1261 	dld_str_t *dsp = (dld_str_t *)arg;
1262 	mblk_t *next;
1263 	size_t offset = 0;
1264 
1265 	/*
1266 	 * MAC header stripping rules:
1267 	 *    - Tagged packets:
1268 	 *	a. VLAN streams. Strip the whole VLAN header including the tag.
1269 	 *	b. Physical streams
1270 	 *	- VLAN packets (non-zero VID). The stream must be either a
1271 	 *	  DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
1272 	 *	  Strip the Ethernet header but keep the VLAN header.
1273 	 *	- Special tagged packets (zero VID)
1274 	 *	  * The stream is either a DL_PROMISC_SAP listener or a
1275 	 *	    ETHERTYPE_VLAN listener, strip the Ethernet header but
1276 	 *	    keep the VLAN header.
1277 	 *	  * Otherwise, strip the whole VLAN header.
1278 	 *    - Untagged packets. Strip the whole MAC header.
1279 	 */
1280 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1281 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1282 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1283 		offset = VLAN_TAGSZ;
1284 	}
1285 
1286 	ASSERT(mp != NULL);
1287 	do {
1288 		/*
1289 		 * Get the pointer to the next packet in the chain and then
1290 		 * clear b_next before the packet gets passed on.
1291 		 */
1292 		next = mp->b_next;
1293 		mp->b_next = NULL;
1294 
1295 		/*
1296 		 * Wind back b_rptr to point at the VLAN header.
1297 		 */
1298 		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1299 		mp->b_rptr -= offset;
1300 
1301 		/*
1302 		 * Pass the packet on.
1303 		 */
1304 		if (canputnext(dsp->ds_rq))
1305 			putnext(dsp->ds_rq, mp);
1306 		else
1307 			freemsg(mp);
1308 		/*
1309 		 * Move on to the next packet in the chain.
1310 		 */
1311 		mp = next;
1312 	} while (mp != NULL);
1313 }
1314 
1315 /*
1316  * Default receive function (send DL_UNITDATA_IND messages).
1317  */
1318 /*ARGSUSED*/
1319 void
1320 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1321     mac_header_info_t *mhip)
1322 {
1323 	dld_str_t		*dsp = (dld_str_t *)arg;
1324 	mblk_t			*ud_mp;
1325 	mblk_t			*next;
1326 	size_t			offset = 0;
1327 	boolean_t		strip_vlan = B_TRUE;
1328 
1329 	/*
1330 	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1331 	 */
1332 	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
1333 	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1334 	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1335 		offset = VLAN_TAGSZ;
1336 		strip_vlan = B_FALSE;
1337 	}
1338 
1339 	ASSERT(mp != NULL);
1340 	do {
1341 		/*
1342 		 * Get the pointer to the next packet in the chain and then
1343 		 * clear b_next before the packet gets passed on.
1344 		 */
1345 		next = mp->b_next;
1346 		mp->b_next = NULL;
1347 
1348 		/*
1349 		 * Wind back b_rptr to point at the MAC header.
1350 		 */
1351 		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1352 		mp->b_rptr -= mhip->mhi_hdrsize;
1353 
1354 		/*
1355 		 * Create the DL_UNITDATA_IND M_PROTO.
1356 		 */
1357 		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1358 			freemsgchain(mp);
1359 			return;
1360 		}
1361 
1362 		/*
1363 		 * Advance b_rptr to point at the payload (or the VLAN header).
1364 		 */
1365 		mp->b_rptr += (mhip->mhi_hdrsize - offset);
1366 
1367 		/*
1368 		 * Prepend the DL_UNITDATA_IND.
1369 		 */
1370 		ud_mp->b_cont = mp;
1371 
1372 		/*
1373 		 * Send the message.
1374 		 */
1375 		if (canputnext(dsp->ds_rq))
1376 			putnext(dsp->ds_rq, ud_mp);
1377 		else
1378 			freemsg(ud_mp);
1379 
1380 		/*
1381 		 * Move on to the next packet in the chain.
1382 		 */
1383 		mp = next;
1384 	} while (mp != NULL);
1385 }
1386 
1387 /*
1388  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1389  * current state of the interface.
1390  */
1391 void
1392 dld_str_notify_ind(dld_str_t *dsp)
1393 {
1394 	mac_notify_type_t	type;
1395 
1396 	for (type = 0; type < MAC_NNOTE; type++)
1397 		str_notify(dsp, type);
1398 }
1399 
1400 typedef struct dl_unitdata_ind_wrapper {
1401 	dl_unitdata_ind_t	dl_unitdata;
1402 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1403 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1404 } dl_unitdata_ind_wrapper_t;
1405 
1406 /*
1407  * Create a DL_UNITDATA_IND M_PROTO message.
1408  */
1409 static mblk_t *
1410 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1411 {
1412 	mblk_t				*nmp;
1413 	dl_unitdata_ind_wrapper_t	*dlwp;
1414 	dl_unitdata_ind_t		*dlp;
1415 	mac_header_info_t		mhi;
1416 	uint_t				addr_length;
1417 	uint8_t				*daddr;
1418 	uint8_t				*saddr;
1419 
1420 	/*
1421 	 * Get the packet header information.
1422 	 */
1423 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
1424 		return (NULL);
1425 
1426 	/*
1427 	 * Allocate a message large enough to contain the wrapper structure
1428 	 * defined above.
1429 	 */
1430 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1431 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1432 	    DL_UNITDATA_IND)) == NULL)
1433 		return (NULL);
1434 
1435 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1436 
1437 	dlp = &(dlwp->dl_unitdata);
1438 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1439 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1440 
1441 	/*
1442 	 * Copy in the destination address.
1443 	 */
1444 	addr_length = dsp->ds_mip->mi_addr_length;
1445 	daddr = dlwp->dl_dest_addr;
1446 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1447 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1448 
1449 	/*
1450 	 * Set the destination DLSAP to the SAP value encoded in the packet.
1451 	 */
1452 	if (mhi.mhi_istagged && !strip_vlan)
1453 		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1454 	else
1455 		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1456 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1457 
1458 	/*
1459 	 * If the destination address was multicast or broadcast then the
1460 	 * dl_group_address field should be non-zero.
1461 	 */
1462 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1463 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1464 
1465 	/*
1466 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1467 	 * for example) may not have access to source information.
1468 	 */
1469 	if (mhi.mhi_saddr == NULL) {
1470 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1471 	} else {
1472 		saddr = dlwp->dl_src_addr;
1473 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1474 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1475 
1476 		/*
1477 		 * Set the source DLSAP to the packet ethertype.
1478 		 */
1479 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1480 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1481 	}
1482 
1483 	return (nmp);
1484 }
1485 
1486 /*
1487  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1488  */
1489 static void
1490 str_notify_promisc_on_phys(dld_str_t *dsp)
1491 {
1492 	mblk_t		*mp;
1493 	dl_notify_ind_t	*dlip;
1494 
1495 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1496 		return;
1497 
1498 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1499 	    M_PROTO, 0)) == NULL)
1500 		return;
1501 
1502 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1503 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1504 	dlip->dl_primitive = DL_NOTIFY_IND;
1505 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1506 
1507 	qreply(dsp->ds_wq, mp);
1508 }
1509 
1510 /*
1511  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1512  */
1513 static void
1514 str_notify_promisc_off_phys(dld_str_t *dsp)
1515 {
1516 	mblk_t		*mp;
1517 	dl_notify_ind_t	*dlip;
1518 
1519 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1520 		return;
1521 
1522 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1523 	    M_PROTO, 0)) == NULL)
1524 		return;
1525 
1526 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1527 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1528 	dlip->dl_primitive = DL_NOTIFY_IND;
1529 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1530 
1531 	qreply(dsp->ds_wq, mp);
1532 }
1533 
1534 /*
1535  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1536  */
1537 static void
1538 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1539 {
1540 	mblk_t		*mp;
1541 	dl_notify_ind_t	*dlip;
1542 	uint_t		addr_length;
1543 	uint16_t	ethertype;
1544 
1545 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1546 		return;
1547 
1548 	addr_length = dsp->ds_mip->mi_addr_length;
1549 	if ((mp = mexchange(dsp->ds_wq, NULL,
1550 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1551 	    M_PROTO, 0)) == NULL)
1552 		return;
1553 
1554 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1555 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1556 	dlip->dl_primitive = DL_NOTIFY_IND;
1557 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1558 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1559 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1560 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1561 
1562 	bcopy(addr, &dlip[1], addr_length);
1563 
1564 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1565 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1566 		ethertype;
1567 
1568 	qreply(dsp->ds_wq, mp);
1569 }
1570 
1571 /*
1572  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1573  */
1574 static void
1575 str_notify_link_up(dld_str_t *dsp)
1576 {
1577 	mblk_t		*mp;
1578 	dl_notify_ind_t	*dlip;
1579 
1580 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1581 		return;
1582 
1583 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1584 	    M_PROTO, 0)) == NULL)
1585 		return;
1586 
1587 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1588 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1589 	dlip->dl_primitive = DL_NOTIFY_IND;
1590 	dlip->dl_notification = DL_NOTE_LINK_UP;
1591 
1592 	qreply(dsp->ds_wq, mp);
1593 }
1594 
1595 /*
1596  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1597  */
1598 static void
1599 str_notify_link_down(dld_str_t *dsp)
1600 {
1601 	mblk_t		*mp;
1602 	dl_notify_ind_t	*dlip;
1603 
1604 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1605 		return;
1606 
1607 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1608 	    M_PROTO, 0)) == NULL)
1609 		return;
1610 
1611 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1612 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1613 	dlip->dl_primitive = DL_NOTIFY_IND;
1614 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1615 
1616 	qreply(dsp->ds_wq, mp);
1617 }
1618 
1619 /*
1620  * DL_NOTIFY_IND: DL_NOTE_SPEED
1621  */
1622 static void
1623 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1624 {
1625 	mblk_t		*mp;
1626 	dl_notify_ind_t	*dlip;
1627 
1628 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1629 		return;
1630 
1631 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1632 	    M_PROTO, 0)) == NULL)
1633 		return;
1634 
1635 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1636 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1637 	dlip->dl_primitive = DL_NOTIFY_IND;
1638 	dlip->dl_notification = DL_NOTE_SPEED;
1639 	dlip->dl_data = speed;
1640 
1641 	qreply(dsp->ds_wq, mp);
1642 }
1643 
1644 /*
1645  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1646  */
1647 static void
1648 str_notify_capab_reneg(dld_str_t *dsp)
1649 {
1650 	mblk_t		*mp;
1651 	dl_notify_ind_t	*dlip;
1652 
1653 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1654 		return;
1655 
1656 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1657 	    M_PROTO, 0)) == NULL)
1658 		return;
1659 
1660 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1661 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1662 	dlip->dl_primitive = DL_NOTIFY_IND;
1663 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1664 
1665 	qreply(dsp->ds_wq, mp);
1666 }
1667 
1668 /*
1669  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1670  */
1671 static void
1672 str_notify_fastpath_flush(dld_str_t *dsp)
1673 {
1674 	mblk_t		*mp;
1675 	dl_notify_ind_t	*dlip;
1676 
1677 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1678 		return;
1679 
1680 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1681 	    M_PROTO, 0)) == NULL)
1682 		return;
1683 
1684 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1685 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1686 	dlip->dl_primitive = DL_NOTIFY_IND;
1687 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1688 
1689 	qreply(dsp->ds_wq, mp);
1690 }
1691 
1692 /*
1693  * MAC notification callback.
1694  */
1695 static void
1696 str_notify(void *arg, mac_notify_type_t type)
1697 {
1698 	dld_str_t		*dsp = (dld_str_t *)arg;
1699 	queue_t			*q = dsp->ds_wq;
1700 
1701 	switch (type) {
1702 	case MAC_NOTE_TX:
1703 		qenable(q);
1704 		break;
1705 
1706 	case MAC_NOTE_DEVPROMISC:
1707 		/*
1708 		 * Send the appropriate DL_NOTIFY_IND.
1709 		 */
1710 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1711 			str_notify_promisc_on_phys(dsp);
1712 		else
1713 			str_notify_promisc_off_phys(dsp);
1714 		break;
1715 
1716 	case MAC_NOTE_PROMISC:
1717 		break;
1718 
1719 	case MAC_NOTE_UNICST:
1720 		/*
1721 		 * This notification is sent whenever the MAC unicast address
1722 		 * changes. We need to re-cache the address.
1723 		 */
1724 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1725 
1726 		/*
1727 		 * Send the appropriate DL_NOTIFY_IND.
1728 		 */
1729 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1730 		break;
1731 
1732 	case MAC_NOTE_LINK:
1733 		/*
1734 		 * This notification is sent every time the MAC driver
1735 		 * updates the link state.
1736 		 */
1737 		switch (mac_link_get(dsp->ds_mh)) {
1738 		case LINK_STATE_UP: {
1739 			uint64_t speed;
1740 			/*
1741 			 * The link is up so send the appropriate
1742 			 * DL_NOTIFY_IND.
1743 			 */
1744 			str_notify_link_up(dsp);
1745 
1746 			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
1747 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1748 			break;
1749 		}
1750 		case LINK_STATE_DOWN:
1751 			/*
1752 			 * The link is down so send the appropriate
1753 			 * DL_NOTIFY_IND.
1754 			 */
1755 			str_notify_link_down(dsp);
1756 			break;
1757 
1758 		default:
1759 			break;
1760 		}
1761 		break;
1762 
1763 	case MAC_NOTE_RESOURCE:
1764 		/*
1765 		 * This notification is sent whenever the MAC resources
1766 		 * change. We need to renegotiate the capabilities.
1767 		 * Send the appropriate DL_NOTIFY_IND.
1768 		 */
1769 		str_notify_capab_reneg(dsp);
1770 		break;
1771 
1772 	case MAC_NOTE_FASTPATH_FLUSH:
1773 		str_notify_fastpath_flush(dsp);
1774 		break;
1775 
1776 	default:
1777 		ASSERT(B_FALSE);
1778 		break;
1779 	}
1780 }
1781 
1782 /*
1783  * Enqueue one or more messages to the transmit queue.
1784  * Caller specifies the insertion position (head/tail).
1785  */
1786 void
1787 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1788 {
1789 	mblk_t	*tail;
1790 	queue_t *q = dsp->ds_wq;
1791 	uint_t	cnt, msgcnt;
1792 	uint_t	tot_cnt, tot_msgcnt;
1793 
1794 	ASSERT(DB_TYPE(mp) == M_DATA);
1795 	/* Calculate total size and count of the packet(s) */
1796 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1797 	    tail->b_next != NULL; tail = tail->b_next) {
1798 		ASSERT(DB_TYPE(tail->b_next) == M_DATA);
1799 		cnt += msgdsize(tail->b_next);
1800 		msgcnt++;
1801 	}
1802 
1803 	mutex_enter(&dsp->ds_tx_list_lock);
1804 	/*
1805 	 * If the queue depth would exceed the allowed threshold, drop
1806 	 * new packet(s) and drain those already in the queue.
1807 	 */
1808 	tot_cnt = dsp->ds_tx_cnt + cnt;
1809 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1810 
1811 	if (!head_insert &&
1812 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1813 		ASSERT(dsp->ds_tx_qbusy);
1814 		mutex_exit(&dsp->ds_tx_list_lock);
1815 		freemsgchain(mp);
1816 		goto done;
1817 	}
1818 
1819 	/* Update the queue size parameters */
1820 	dsp->ds_tx_cnt = tot_cnt;
1821 	dsp->ds_tx_msgcnt = tot_msgcnt;
1822 
1823 	/*
1824 	 * If the transmit queue is currently empty and we are
1825 	 * about to deposit the packet(s) there, switch mode to
1826 	 * "busy" and raise flow-control condition.
1827 	 */
1828 	if (!dsp->ds_tx_qbusy) {
1829 		dsp->ds_tx_qbusy = B_TRUE;
1830 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1831 		(void) putq(q, dsp->ds_tx_flow_mp);
1832 		dsp->ds_tx_flow_mp = NULL;
1833 	}
1834 
1835 	if (!head_insert) {
1836 		/* Tail insertion */
1837 		if (dsp->ds_tx_list_head == NULL)
1838 			dsp->ds_tx_list_head = mp;
1839 		else
1840 			dsp->ds_tx_list_tail->b_next = mp;
1841 		dsp->ds_tx_list_tail = tail;
1842 	} else {
1843 		/* Head insertion */
1844 		tail->b_next = dsp->ds_tx_list_head;
1845 		if (dsp->ds_tx_list_head == NULL)
1846 			dsp->ds_tx_list_tail = tail;
1847 		dsp->ds_tx_list_head = mp;
1848 	}
1849 	mutex_exit(&dsp->ds_tx_list_lock);
1850 done:
1851 	/* Schedule service thread to drain the transmit queue */
1852 	if (!head_insert)
1853 		qenable(q);
1854 }
1855 
1856 void
1857 dld_tx_flush(dld_str_t *dsp)
1858 {
1859 	mutex_enter(&dsp->ds_tx_list_lock);
1860 	if (dsp->ds_tx_list_head != NULL) {
1861 		freemsgchain(dsp->ds_tx_list_head);
1862 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1863 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1864 		if (dsp->ds_tx_qbusy) {
1865 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1866 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1867 			dsp->ds_tx_qbusy = B_FALSE;
1868 		}
1869 	}
1870 	mutex_exit(&dsp->ds_tx_list_lock);
1871 }
1872 
1873 /*
1874  * Process an M_IOCTL message.
1875  */
1876 static void
1877 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1878 {
1879 	uint_t			cmd;
1880 
1881 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1882 	ASSERT(dsp->ds_type == DLD_DLPI);
1883 
1884 	switch (cmd) {
1885 	case DLIOCNATIVE:
1886 		ioc_native(dsp, mp);
1887 		break;
1888 	case DLIOCRAW:
1889 		ioc_raw(dsp, mp);
1890 		break;
1891 	case DLIOCHDRINFO:
1892 		ioc_fast(dsp, mp);
1893 		break;
1894 	default:
1895 		ioc(dsp, mp);
1896 	}
1897 }
1898 
1899 /*
1900  * DLIOCNATIVE
1901  */
1902 static void
1903 ioc_native(dld_str_t *dsp, mblk_t *mp)
1904 {
1905 	queue_t *q = dsp->ds_wq;
1906 	const mac_info_t *mip = dsp->ds_mip;
1907 
1908 	rw_enter(&dsp->ds_lock, RW_WRITER);
1909 
1910 	/*
1911 	 * Native mode can be enabled if it's disabled and if the
1912 	 * native media type is different.
1913 	 */
1914 	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
1915 		dsp->ds_native = B_TRUE;
1916 
1917 	rw_exit(&dsp->ds_lock);
1918 
1919 	if (dsp->ds_native)
1920 		miocack(q, mp, 0, mip->mi_nativemedia);
1921 	else
1922 		miocnak(q, mp, 0, ENOTSUP);
1923 }
1924 
1925 /*
1926  * DLIOCRAW
1927  */
1928 static void
1929 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1930 {
1931 	queue_t *q = dsp->ds_wq;
1932 
1933 	rw_enter(&dsp->ds_lock, RW_WRITER);
1934 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1935 		rw_exit(&dsp->ds_lock);
1936 		miocnak(q, mp, 0, EPROTO);
1937 		return;
1938 	}
1939 
1940 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1941 		/*
1942 		 * Set the receive callback.
1943 		 */
1944 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1945 	}
1946 
1947 	/*
1948 	 * Note that raw mode is enabled.
1949 	 */
1950 	dsp->ds_mode = DLD_RAW;
1951 
1952 	rw_exit(&dsp->ds_lock);
1953 	miocack(q, mp, 0, 0);
1954 }
1955 
1956 /*
1957  * DLIOCHDRINFO
1958  */
1959 static void
1960 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1961 {
1962 	dl_unitdata_req_t *dlp;
1963 	off_t		off;
1964 	size_t		len;
1965 	const uint8_t	*addr;
1966 	uint16_t	sap;
1967 	mblk_t		*nmp;
1968 	mblk_t		*hmp;
1969 	uint_t		addr_length;
1970 	queue_t		*q = dsp->ds_wq;
1971 	int		err;
1972 	dls_channel_t	dc;
1973 
1974 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1975 		err = ENOTSUP;
1976 		goto failed;
1977 	}
1978 
1979 	/*
1980 	 * DLIOCHDRINFO should only come from IP. The one initiated from
1981 	 * user-land should not be allowed.
1982 	 */
1983 	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
1984 		err = EINVAL;
1985 		goto failed;
1986 	}
1987 
1988 	nmp = mp->b_cont;
1989 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1990 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1991 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1992 		err = EINVAL;
1993 		goto failed;
1994 	}
1995 
1996 	off = dlp->dl_dest_addr_offset;
1997 	len = dlp->dl_dest_addr_length;
1998 
1999 	if (!MBLKIN(nmp, off, len)) {
2000 		err = EINVAL;
2001 		goto failed;
2002 	}
2003 
2004 	rw_enter(&dsp->ds_lock, RW_READER);
2005 	if (dsp->ds_dlstate != DL_IDLE) {
2006 		rw_exit(&dsp->ds_lock);
2007 		err = ENOTSUP;
2008 		goto failed;
2009 	}
2010 
2011 	addr_length = dsp->ds_mip->mi_addr_length;
2012 	if (len != addr_length + sizeof (uint16_t)) {
2013 		rw_exit(&dsp->ds_lock);
2014 		err = EINVAL;
2015 		goto failed;
2016 	}
2017 
2018 	addr = nmp->b_rptr + off;
2019 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
2020 	dc = dsp->ds_dc;
2021 
2022 	if ((hmp = dls_header(dc, addr, sap, 0, NULL)) == NULL) {
2023 		rw_exit(&dsp->ds_lock);
2024 		err = ENOMEM;
2025 		goto failed;
2026 	}
2027 
2028 	/*
2029 	 * This is a performance optimization.  We originally entered
2030 	 * as reader and only become writer upon transitioning into
2031 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
2032 	 * stay as reader and return the fast-path header to IP.
2033 	 */
2034 	if (dsp->ds_mode != DLD_FASTPATH) {
2035 		if (!rw_tryupgrade(&dsp->ds_lock)) {
2036 			rw_exit(&dsp->ds_lock);
2037 			rw_enter(&dsp->ds_lock, RW_WRITER);
2038 
2039 			/*
2040 			 * State may have changed before we re-acquired
2041 			 * the writer lock in case the upgrade failed.
2042 			 */
2043 			if (dsp->ds_dlstate != DL_IDLE) {
2044 				rw_exit(&dsp->ds_lock);
2045 				err = ENOTSUP;
2046 				goto failed;
2047 			}
2048 		}
2049 
2050 		/*
2051 		 * Set the receive callback (unless polling is enabled).
2052 		 */
2053 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
2054 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
2055 
2056 		/*
2057 		 * Note that fast-path mode is enabled.
2058 		 */
2059 		dsp->ds_mode = DLD_FASTPATH;
2060 	}
2061 	rw_exit(&dsp->ds_lock);
2062 
2063 	freemsg(nmp->b_cont);
2064 	nmp->b_cont = hmp;
2065 
2066 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
2067 	return;
2068 failed:
2069 	miocnak(q, mp, 0, err);
2070 }
2071 
2072 /*
2073  * Catch-all handler.
2074  */
2075 static void
2076 ioc(dld_str_t *dsp, mblk_t *mp)
2077 {
2078 	queue_t	*q = dsp->ds_wq;
2079 	mac_handle_t mh;
2080 
2081 	rw_enter(&dsp->ds_lock, RW_READER);
2082 	if (dsp->ds_dlstate == DL_UNATTACHED) {
2083 		rw_exit(&dsp->ds_lock);
2084 		miocnak(q, mp, 0, EINVAL);
2085 		return;
2086 	}
2087 	mh = dsp->ds_mh;
2088 	ASSERT(mh != NULL);
2089 	rw_exit(&dsp->ds_lock);
2090 	mac_ioctl(mh, q, mp);
2091 }
2092 
2093 /*
2094  * Allocate a new minor number.
2095  */
2096 static minor_t
2097 dld_minor_hold(boolean_t sleep)
2098 {
2099 	minor_t		minor;
2100 
2101 	/*
2102 	 * Grab a value from the arena.
2103 	 */
2104 	atomic_add_32(&minor_count, 1);
2105 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
2106 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
2107 		atomic_add_32(&minor_count, -1);
2108 		return (0);
2109 	}
2110 
2111 	return (minor);
2112 }
2113 
2114 /*
2115  * Release a previously allocated minor number.
2116  */
2117 static void
2118 dld_minor_rele(minor_t minor)
2119 {
2120 	/*
2121 	 * Return the value to the arena.
2122 	 */
2123 	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);
2124 
2125 	atomic_add_32(&minor_count, -1);
2126 }
2127