xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision 65a89a64c60f3061bbe2381edaacc81660af9a95)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Data-Link Driver
31  */
32 
33 #include	<sys/stropts.h>
34 #include	<sys/strsun.h>
35 #include	<sys/strsubr.h>
36 #include	<sys/atomic.h>
37 #include	<sys/mkdev.h>
38 #include	<sys/vlan.h>
39 #include	<sys/dld.h>
40 #include	<sys/dld_impl.h>
41 #include	<sys/dls_impl.h>
42 #include	<inet/common.h>
43 
/*
 * Forward declarations for file-local helpers: the kmem cache constructor
 * and destructor, the DL_UNITDATA_IND builder, and the DL_NOTIFY_IND
 * generators together with the MAC notification callback (str_notify) that
 * is registered through mac_notify_add().
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);
static void	str_notify(void *, mac_notify_type_t);

/*
 * M_IOCTL handling (dld_ioc() is called from dld_wput()) and the minor
 * number allocator used by the kmem cache constructor/destructor.
 */
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static minor_t	dld_minor_hold(boolean_t);
static void	dld_minor_rele(minor_t);

/*
 * str_count:    number of dld_str_t objects handed out by dld_str_create()
 *               and not yet returned through dld_str_destroy().
 * str_cachep:   kmem cache from which dld_str_t objects are allocated.
 * minor_arenap: vmem arena that manages the minor number space.
 * minor_count:  checked by dld_str_fini() to see whether minor numbers are
 *               still in use; presumably maintained by dld_minor_hold() and
 *               dld_minor_rele() -- confirm against their definitions.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static vmem_t		*minor_arenap;
static uint32_t		minor_count;

/*
 * Convert between a minor number and the pointer-sized value used where a
 * void * is required (e.g. the base argument of vmem_create()).
 */
#define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
#define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))
70 
71 /*
72  * Some notes on entry points, flow-control, queueing and locking:
73  *
74  * This driver exports the traditional STREAMS put entry point as well as
75  * the non-STREAMS fast-path transmit routine which is provided to IP via
76  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
77  * and data operations, while the fast-path routine deals only with M_DATA
78  * fast-path packets.  Regardless of the entry point, all outbound packets
79  * will end up in str_mdata_fastpath_put(), where they will be delivered to
80  * the MAC driver.
81  *
82  * The transmit logic operates in two modes: a "not busy" mode where the
83  * packets will be delivered to the MAC for a send attempt, or "busy" mode
84  * where they will be enqueued in the internal queue because of flow-control.
85  * Flow-control happens when the MAC driver indicates the packets couldn't
86  * be transmitted due to lack of resources (e.g. running out of descriptors).
87  * In such case, the driver will place a dummy message on its write-side
88  * STREAMS queue so that the queue is marked as "full".  Any subsequent
89  * packets arriving at the driver will be enqueued in the internal queue,
90  * which is drained in the context of the service thread that gets scheduled
91  * whenever the driver is in the "busy" mode.  When all packets have been
92  * successfully delivered by MAC and the internal queue is empty, it will
93  * transition to the "not busy" mode by removing the dummy message from the
94  * write-side STREAMS queue; in effect this will trigger backenabling.
95  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
96  * to the above reasons.
97  *
 * The driver implements an internal transmit queue independent of STREAMS.
 * This allows for flexibility and provides a fast enqueue/dequeue mechanism
 * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
 * getq() operations done by the driver are those related to placing and
 * removing the dummy message to/from the write-side STREAMS queue for flow-
 * control purposes.
104  *
 * Locking is done independent of STREAMS due to the driver being fully MT.
 * Threads entering the driver (either from put or service entry points)
 * will most likely be readers, with the exception of a few writer cases
 * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
 * DLD-related ioctl requests.  The DLPI detach case is special, because
 * it involves freeing resources and therefore must be single-threaded.
 * Unfortunately the readers/writers lock can't be used to protect against
 * it, because the lock is dropped prior to the driver calling places where
 * putnext() may be invoked, and such places may depend on those resources
 * to exist.  Because of this, the driver always completes the DLPI detach
 * process when there are no other threads running in the driver.  This is
 * done by keeping track of the number of threads, such that the last
 * thread leaving the driver will finish the pending DLPI detach operation.
118  */
119 
120 /*
121  * dld_max_q_count is the queue depth threshold used to limit the number of
122  * outstanding packets or bytes allowed in the queue; once this limit is
123  * reached the driver will free any incoming ones until the queue depth
124  * drops below the threshold.
125  *
 * This buffering is provided to accommodate clients which do not employ
 * their own buffering scheme, and to handle occasional packet bursts.
 * Clients which handle their own buffering will receive positive feedback
 * from this driver as soon as it transitions into the "busy" state, i.e.
 * when the queue is initially filled up; they will get backenabled once
 * the queue is empty.
132  *
133  * The value chosen here is rather arbitrary; in future some intelligent
134  * heuristics may be involved which could take into account the hardware's
135  * transmit ring size, etc.
136  */
137 uint_t dld_max_q_count = (16 * 1024 *1024);
138 
139 static dev_info_t *
140 dld_finddevinfo(dev_t dev)
141 {
142 	minor_t		minor = getminor(dev);
143 	char		*drvname = ddi_major_to_name(getmajor(dev));
144 	char		name[MAXNAMELEN];
145 	dls_vlan_t	*dvp = NULL;
146 	dev_info_t	*dip = NULL;
147 
148 	if (drvname == NULL || minor == 0 || minor > DLD_MAX_PPA + 1)
149 		return (NULL);
150 
151 	(void) snprintf(name, MAXNAMELEN, "%s%d", drvname, (int)minor - 1);
152 	if (dls_vlan_hold(name, &dvp, B_FALSE) != 0)
153 		return (NULL);
154 
155 	dip = mac_devinfo_get(dvp->dv_dlp->dl_mh);
156 	dls_vlan_rele(dvp);
157 	return (dip);
158 }
159 
160 /*
161  * devo_getinfo: getinfo(9e)
162  */
163 /*ARGSUSED*/
164 int
165 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
166 {
167 	dev_info_t	*devinfo;
168 	minor_t		minor = getminor((dev_t)arg);
169 	int		rc = DDI_FAILURE;
170 
171 	switch (cmd) {
172 	case DDI_INFO_DEVT2DEVINFO:
173 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
174 			*(dev_info_t **)resp = devinfo;
175 			rc = DDI_SUCCESS;
176 		}
177 		break;
178 	case DDI_INFO_DEVT2INSTANCE:
179 		if (minor > 0 && minor <= DLD_MAX_PPA + 1) {
180 			*(int *)resp = (int)minor - 1;
181 			rc = DDI_SUCCESS;
182 		}
183 		break;
184 	}
185 	return (rc);
186 }
187 
188 /*
189  * qi_qopen: open(9e)
190  */
191 /*ARGSUSED*/
192 int
193 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
194 {
195 	dld_str_t	*dsp;
196 	major_t		major;
197 	minor_t		minor;
198 	int		err;
199 
200 	if (sflag == MODOPEN)
201 		return (ENOTSUP);
202 
203 	/*
204 	 * This is a cloning driver and therefore each queue should only
205 	 * ever get opened once.
206 	 */
207 	if (rq->q_ptr != NULL)
208 		return (EBUSY);
209 
210 	major = getmajor(*devp);
211 	minor = getminor(*devp);
212 	if (minor > DLD_MAX_MINOR)
213 		return (ENODEV);
214 
215 	/*
216 	 * Create a new dld_str_t for the stream. This will grab a new minor
217 	 * number that will be handed back in the cloned dev_t.  Creation may
218 	 * fail if we can't allocate the dummy mblk used for flow-control.
219 	 */
220 	dsp = dld_str_create(rq, DLD_DLPI, major,
221 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
222 	if (dsp == NULL)
223 		return (ENOSR);
224 
225 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
226 	if (minor != 0) {
227 		/*
228 		 * Style 1 open
229 		 */
230 
231 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
232 			goto failed;
233 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
234 	} else {
235 		(void) qassociate(rq, -1);
236 	}
237 
238 	/*
239 	 * Enable the queue srv(9e) routine.
240 	 */
241 	qprocson(rq);
242 
243 	/*
244 	 * Construct a cloned dev_t to hand back.
245 	 */
246 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
247 	return (0);
248 
249 failed:
250 	dld_str_destroy(dsp);
251 	return (err);
252 }
253 
254 /*
255  * qi_qclose: close(9e)
256  */
257 int
258 dld_close(queue_t *rq)
259 {
260 	dld_str_t	*dsp = rq->q_ptr;
261 
262 	/*
263 	 * Wait until pending requests are processed.
264 	 */
265 	mutex_enter(&dsp->ds_thr_lock);
266 	while (dsp->ds_pending_cnt > 0)
267 		cv_wait(&dsp->ds_pending_cv, &dsp->ds_thr_lock);
268 	mutex_exit(&dsp->ds_thr_lock);
269 
270 	/*
271 	 * Disable the queue srv(9e) routine.
272 	 */
273 	qprocsoff(rq);
274 
275 	/*
276 	 * At this point we can not be entered by any threads via STREAMS
277 	 * or the direct call interface, which is available only to IP.
278 	 * After the interface is unplumbed, IP wouldn't have any reference
279 	 * to this instance, and therefore we are now effectively single
280 	 * threaded and don't require any lock protection.  Flush all
281 	 * pending packets which are sitting in the transmit queue.
282 	 */
283 	ASSERT(dsp->ds_thr == 0);
284 	dld_tx_flush(dsp);
285 
286 	/*
287 	 * This stream was open to a provider node. Check to see
288 	 * if it has been cleanly shut down.
289 	 */
290 	if (dsp->ds_dlstate != DL_UNATTACHED) {
291 		/*
292 		 * The stream is either open to a style 1 provider or
293 		 * this is not clean shutdown. Detach from the PPA.
294 		 * (This is still ok even in the style 1 case).
295 		 */
296 		dld_str_detach(dsp);
297 	}
298 
299 	dld_str_destroy(dsp);
300 	return (0);
301 }
302 
303 /*
304  * qi_qputp: put(9e)
305  */
306 void
307 dld_wput(queue_t *wq, mblk_t *mp)
308 {
309 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
310 
311 	DLD_ENTER(dsp);
312 
313 	switch (DB_TYPE(mp)) {
314 	case M_DATA:
315 		rw_enter(&dsp->ds_lock, RW_READER);
316 		if (dsp->ds_dlstate != DL_IDLE ||
317 		    dsp->ds_mode == DLD_UNITDATA) {
318 			freemsg(mp);
319 		} else if (dsp->ds_mode == DLD_FASTPATH) {
320 			str_mdata_fastpath_put(dsp, mp);
321 		} else if (dsp->ds_mode == DLD_RAW) {
322 			str_mdata_raw_put(dsp, mp);
323 		}
324 		rw_exit(&dsp->ds_lock);
325 		break;
326 	case M_PROTO:
327 	case M_PCPROTO:
328 		dld_proto(dsp, mp);
329 		break;
330 	case M_IOCTL:
331 		dld_ioc(dsp, mp);
332 		break;
333 	case M_FLUSH:
334 		if (*mp->b_rptr & FLUSHW) {
335 			dld_tx_flush(dsp);
336 			*mp->b_rptr &= ~FLUSHW;
337 		}
338 
339 		if (*mp->b_rptr & FLUSHR) {
340 			qreply(wq, mp);
341 		} else {
342 			freemsg(mp);
343 		}
344 		break;
345 	default:
346 		freemsg(mp);
347 		break;
348 	}
349 
350 	DLD_EXIT(dsp);
351 }
352 
353 /*
354  * qi_srvp: srv(9e)
355  */
356 void
357 dld_wsrv(queue_t *wq)
358 {
359 	mblk_t		*mp;
360 	dld_str_t	*dsp = wq->q_ptr;
361 
362 	DLD_ENTER(dsp);
363 	rw_enter(&dsp->ds_lock, RW_READER);
364 	/*
365 	 * Grab all packets (chained via b_next) off our transmit queue
366 	 * and try to send them all to the MAC layer.  Since the queue
367 	 * is independent of streams, we are able to dequeue all messages
368 	 * at once without looping through getq() and manually chaining
369 	 * them.  Note that the queue size parameters (byte and message
370 	 * counts) are cleared as well, but we postpone the backenabling
371 	 * until after the MAC transmit since some packets may end up
372 	 * back at our transmit queue.
373 	 */
374 	mutex_enter(&dsp->ds_tx_list_lock);
375 	if ((mp = dsp->ds_tx_list_head) == NULL) {
376 		ASSERT(!dsp->ds_tx_qbusy);
377 		ASSERT(dsp->ds_tx_flow_mp != NULL);
378 		ASSERT(dsp->ds_tx_list_head == NULL);
379 		ASSERT(dsp->ds_tx_list_tail == NULL);
380 		ASSERT(dsp->ds_tx_cnt == 0);
381 		ASSERT(dsp->ds_tx_msgcnt == 0);
382 		mutex_exit(&dsp->ds_tx_list_lock);
383 		goto done;
384 	}
385 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
386 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
387 	mutex_exit(&dsp->ds_tx_list_lock);
388 
389 	/*
390 	 * Discard packets unless we are attached and bound; note that
391 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
392 	 * because regardless of the mode all transmit will end up in
393 	 * str_mdata_fastpath_put() where the packets may be queued.
394 	 */
395 	ASSERT(DB_TYPE(mp) == M_DATA);
396 	if (dsp->ds_dlstate != DL_IDLE) {
397 		freemsgchain(mp);
398 		goto done;
399 	}
400 
401 	/*
402 	 * Attempt to transmit one or more packets.  If the MAC can't
403 	 * send them all, re-queue the packet(s) at the beginning of
404 	 * the transmit queue to avoid any re-ordering.
405 	 */
406 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
407 		dld_tx_enqueue(dsp, mp, B_TRUE);
408 
409 	/*
410 	 * Grab the list lock again and check if the transmit queue is
411 	 * really empty; if so, lift up flow-control and backenable any
412 	 * writer queues.  If the queue is not empty, schedule service
413 	 * thread to drain it.
414 	 */
415 	mutex_enter(&dsp->ds_tx_list_lock);
416 	if (dsp->ds_tx_list_head == NULL) {
417 		dsp->ds_tx_flow_mp = getq(wq);
418 		ASSERT(dsp->ds_tx_flow_mp != NULL);
419 		dsp->ds_tx_qbusy = B_FALSE;
420 	}
421 	mutex_exit(&dsp->ds_tx_list_lock);
422 done:
423 	rw_exit(&dsp->ds_lock);
424 	DLD_EXIT(dsp);
425 }
426 
427 void
428 dld_init_ops(struct dev_ops *ops, const char *name)
429 {
430 	struct streamtab *stream;
431 	struct qinit *rq, *wq;
432 	struct module_info *modinfo;
433 
434 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
435 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
436 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
437 	modinfo->mi_minpsz = 0;
438 	modinfo->mi_maxpsz = 64*1024;
439 	modinfo->mi_hiwat  = 1;
440 	modinfo->mi_lowat = 0;
441 
442 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
443 	rq->qi_qopen = dld_open;
444 	rq->qi_qclose = dld_close;
445 	rq->qi_minfo = modinfo;
446 
447 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
448 	wq->qi_putp = (pfi_t)dld_wput;
449 	wq->qi_srvp = (pfi_t)dld_wsrv;
450 	wq->qi_minfo = modinfo;
451 
452 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
453 	stream->st_rdinit = rq;
454 	stream->st_wrinit = wq;
455 	ops->devo_cb_ops->cb_str = stream;
456 
457 	ops->devo_getinfo = &dld_getinfo;
458 }
459 
460 void
461 dld_fini_ops(struct dev_ops *ops)
462 {
463 	struct streamtab *stream;
464 	struct qinit *rq, *wq;
465 	struct module_info *modinfo;
466 
467 	stream = ops->devo_cb_ops->cb_str;
468 	rq = stream->st_rdinit;
469 	wq = stream->st_wrinit;
470 	modinfo = rq->qi_minfo;
471 	ASSERT(wq->qi_minfo == modinfo);
472 
473 	kmem_free(stream, sizeof (struct streamtab));
474 	kmem_free(wq, sizeof (struct qinit));
475 	kmem_free(rq, sizeof (struct qinit));
476 	kmem_free(modinfo->mi_idname, FMNAMESZ);
477 	kmem_free(modinfo, sizeof (struct module_info));
478 }
479 
480 /*
481  * Initialize this module's data structures.
482  */
483 void
484 dld_str_init(void)
485 {
486 	/*
487 	 * Create dld_str_t object cache.
488 	 */
489 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
490 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
491 	ASSERT(str_cachep != NULL);
492 
493 	/*
494 	 * Allocate a vmem arena to manage minor numbers. The range of the
495 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
496 	 * minor number).
497 	 */
498 	minor_arenap = vmem_create("dld_minor_arena",
499 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
500 	    VM_SLEEP | VMC_IDENTIFIER);
501 	ASSERT(minor_arenap != NULL);
502 }
503 
504 /*
505  * Tear down this module's data structures.
506  */
507 int
508 dld_str_fini(void)
509 {
510 	/*
511 	 * Make sure that there are no objects in use.
512 	 */
513 	if (str_count != 0)
514 		return (EBUSY);
515 
516 	/*
517 	 * Check to see if there are any minor numbers still in use.
518 	 */
519 	if (minor_count != 0)
520 		return (EBUSY);
521 
522 	/*
523 	 * Destroy object cache.
524 	 */
525 	kmem_cache_destroy(str_cachep);
526 	vmem_destroy(minor_arenap);
527 	return (0);
528 }
529 
530 /*
531  * Create a new dld_str_t object.
532  */
533 dld_str_t *
534 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
535 {
536 	dld_str_t	*dsp;
537 
538 	/*
539 	 * Allocate an object from the cache.
540 	 */
541 	atomic_add_32(&str_count, 1);
542 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
543 
544 	/*
545 	 * Allocate the dummy mblk for flow-control.
546 	 */
547 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
548 	if (dsp->ds_tx_flow_mp == NULL) {
549 		kmem_cache_free(str_cachep, dsp);
550 		atomic_add_32(&str_count, -1);
551 		return (NULL);
552 	}
553 	dsp->ds_type = type;
554 	dsp->ds_major = major;
555 	dsp->ds_style = style;
556 
557 	/*
558 	 * Initialize the queue pointers.
559 	 */
560 	ASSERT(RD(rq) == rq);
561 	dsp->ds_rq = rq;
562 	dsp->ds_wq = WR(rq);
563 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
564 
565 	/*
566 	 * We want explicit control over our write-side STREAMS queue
567 	 * where the dummy mblk gets added/removed for flow-control.
568 	 */
569 	noenable(WR(rq));
570 
571 	return (dsp);
572 }
573 
574 /*
575  * Destroy a dld_str_t object.
576  */
577 void
578 dld_str_destroy(dld_str_t *dsp)
579 {
580 	queue_t		*rq;
581 	queue_t		*wq;
582 
583 	/*
584 	 * Clear the queue pointers.
585 	 */
586 	rq = dsp->ds_rq;
587 	wq = dsp->ds_wq;
588 	ASSERT(wq == WR(rq));
589 
590 	rq->q_ptr = wq->q_ptr = NULL;
591 	dsp->ds_rq = dsp->ds_wq = NULL;
592 
593 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
594 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
595 	ASSERT(dsp->ds_tx_list_head == NULL);
596 	ASSERT(dsp->ds_tx_list_tail == NULL);
597 	ASSERT(dsp->ds_tx_cnt == 0);
598 	ASSERT(dsp->ds_tx_msgcnt == 0);
599 	ASSERT(!dsp->ds_tx_qbusy);
600 
601 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
602 	ASSERT(dsp->ds_thr == 0);
603 	ASSERT(dsp->ds_pending_req == NULL);
604 
605 	/*
606 	 * Reinitialize all the flags.
607 	 */
608 	dsp->ds_notifications = 0;
609 	dsp->ds_passivestate = DLD_UNINITIALIZED;
610 	dsp->ds_mode = DLD_UNITDATA;
611 
612 	/*
613 	 * Free the dummy mblk if exists.
614 	 */
615 	if (dsp->ds_tx_flow_mp != NULL) {
616 		freeb(dsp->ds_tx_flow_mp);
617 		dsp->ds_tx_flow_mp = NULL;
618 	}
619 	/*
620 	 * Free the object back to the cache.
621 	 */
622 	kmem_cache_free(str_cachep, dsp);
623 	atomic_add_32(&str_count, -1);
624 }
625 
626 /*
627  * kmem_cache contructor function: see kmem_cache_create(9f).
628  */
629 /*ARGSUSED*/
630 static int
631 str_constructor(void *buf, void *cdrarg, int kmflags)
632 {
633 	dld_str_t	*dsp = buf;
634 
635 	bzero(buf, sizeof (dld_str_t));
636 
637 	/*
638 	 * Allocate a new minor number.
639 	 */
640 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
641 		return (-1);
642 
643 	/*
644 	 * Initialize the DLPI state machine.
645 	 */
646 	dsp->ds_dlstate = DL_UNATTACHED;
647 
648 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
649 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
650 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
651 	cv_init(&dsp->ds_pending_cv, NULL, CV_DRIVER, NULL);
652 
653 	return (0);
654 }
655 
656 /*
657  * kmem_cache destructor function.
658  */
659 /*ARGSUSED*/
660 static void
661 str_destructor(void *buf, void *cdrarg)
662 {
663 	dld_str_t	*dsp = buf;
664 
665 	/*
666 	 * Make sure the DLPI state machine was reset.
667 	 */
668 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
669 
670 	/*
671 	 * Make sure the data-link interface was closed.
672 	 */
673 	ASSERT(dsp->ds_mh == NULL);
674 	ASSERT(dsp->ds_dc == NULL);
675 
676 	/*
677 	 * Make sure enabled notifications are cleared.
678 	 */
679 	ASSERT(dsp->ds_notifications == 0);
680 
681 	/*
682 	 * Make sure polling is disabled.
683 	 */
684 	ASSERT(!dsp->ds_polling);
685 
686 	/*
687 	 * Release the minor number.
688 	 */
689 	dld_minor_rele(dsp->ds_minor);
690 
691 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
692 	rw_destroy(&dsp->ds_lock);
693 
694 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
695 	mutex_destroy(&dsp->ds_tx_list_lock);
696 	ASSERT(dsp->ds_tx_flow_mp == NULL);
697 
698 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
699 	mutex_destroy(&dsp->ds_thr_lock);
700 	ASSERT(dsp->ds_pending_req == NULL);
701 	ASSERT(dsp->ds_pending_op == NULL);
702 	ASSERT(dsp->ds_pending_cnt == 0);
703 	cv_destroy(&dsp->ds_pending_cv);
704 }
705 
706 /*
707  * M_DATA put (IP fast-path mode)
708  */
709 void
710 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
711 {
712 	/*
713 	 * This function can be called from within dld or from an upper
714 	 * layer protocol (currently only tcp). If we are in the busy
715 	 * mode enqueue the packet(s) and return.  Otherwise hand them
716 	 * over to the MAC driver for transmission; any remaining one(s)
717 	 * which didn't get sent will be queued.
718 	 *
719 	 * Note here that we don't grab the list lock prior to checking
720 	 * the busy flag.  This is okay, because a missed transition
721 	 * will not cause any packet reordering for any particular TCP
722 	 * connection (which is single-threaded).  The enqueue routine
723 	 * will atomically set the busy flag and schedule the service
724 	 * thread to run; the flag is only cleared by the service thread
725 	 * when there is no more packet to be transmitted.
726 	 */
727 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
728 		dld_tx_enqueue(dsp, mp, B_FALSE);
729 }
730 
731 /*
732  * M_DATA put (raw mode)
733  */
734 void
735 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
736 {
737 	struct ether_header	*ehp;
738 	mblk_t			*bp;
739 	size_t			size;
740 	size_t			hdrlen;
741 
742 	size = MBLKL(mp);
743 	if (size < sizeof (struct ether_header))
744 		goto discard;
745 
746 	hdrlen = sizeof (struct ether_header);
747 
748 	ehp = (struct ether_header *)mp->b_rptr;
749 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
750 		struct ether_vlan_header	*evhp;
751 
752 		if (size < sizeof (struct ether_vlan_header))
753 			goto discard;
754 
755 		/*
756 		 * Replace vtag with our own
757 		 */
758 		evhp = (struct ether_vlan_header *)ehp;
759 		evhp->ether_tci = htons(VLAN_TCI(dsp->ds_pri,
760 		    ETHER_CFI, dsp->ds_vid));
761 		hdrlen = sizeof (struct ether_vlan_header);
762 	}
763 
764 	/*
765 	 * Check the packet is not too big and that any remaining
766 	 * fragment list is composed entirely of M_DATA messages. (We
767 	 * know the first fragment was M_DATA otherwise we could not
768 	 * have got here).
769 	 */
770 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
771 		if (DB_TYPE(bp) != M_DATA)
772 			goto discard;
773 		size += MBLKL(bp);
774 	}
775 
776 	if (size > dsp->ds_mip->mi_sdu_max + hdrlen)
777 		goto discard;
778 
779 	str_mdata_fastpath_put(dsp, mp);
780 	return;
781 
782 discard:
783 	freemsg(mp);
784 }
785 
786 /*
787  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
788  */
789 int
790 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
791 {
792 	int			err;
793 	const char		*drvname;
794 	char			name[MAXNAMELEN];
795 	dls_channel_t		dc;
796 	uint_t			addr_length;
797 
798 	ASSERT(dsp->ds_dc == NULL);
799 
800 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
801 		return (EINVAL);
802 
803 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
804 
805 	if (strcmp(drvname, "aggr") != 0 &&
806 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
807 		return (EINVAL);
808 
809 	/*
810 	 * Open a channel.
811 	 */
812 	if ((err = dls_open(name, &dc)) != 0) {
813 		(void) qassociate(dsp->ds_wq, -1);
814 		return (err);
815 	}
816 
817 	/*
818 	 * Cache the MAC interface handle, a pointer to the immutable MAC
819 	 * information and the current and 'factory' MAC address.
820 	 */
821 	dsp->ds_mh = dls_mac(dc);
822 	dsp->ds_mip = mac_info(dsp->ds_mh);
823 
824 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
825 
826 	addr_length = dsp->ds_mip->mi_addr_length;
827 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
828 
829 	/*
830 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
831 	 * a non-VLAN interface).
832 	 */
833 	dsp->ds_vid = dls_vid(dc);
834 
835 	/*
836 	 * Set the default packet priority.
837 	 */
838 	dsp->ds_pri = 0;
839 
840 	/*
841 	 * Add a notify function so that the we get updates from the MAC.
842 	 */
843 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
844 
845 	dsp->ds_dc = dc;
846 	dsp->ds_dlstate = DL_UNBOUND;
847 
848 	return (0);
849 }
850 
851 /*
852  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
853  * from close(2) for style 2.
854  */
855 void
856 dld_str_detach(dld_str_t *dsp)
857 {
858 	ASSERT(dsp->ds_thr == 0);
859 
860 	/*
861 	 * Remove the notify function.
862 	 */
863 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
864 
865 	/*
866 	 * Clear the polling and promisc flags.
867 	 */
868 	dsp->ds_polling = B_FALSE;
869 	dsp->ds_soft_ring = B_FALSE;
870 	dsp->ds_promisc = 0;
871 
872 	/*
873 	 * Close the channel.
874 	 */
875 	dls_close(dsp->ds_dc);
876 	dsp->ds_dc = NULL;
877 	dsp->ds_mh = NULL;
878 
879 	(void) qassociate(dsp->ds_wq, -1);
880 
881 	/*
882 	 * Re-initialize the DLPI state machine.
883 	 */
884 	dsp->ds_dlstate = DL_UNATTACHED;
885 
886 }
887 
888 /*
889  * Raw mode receive function.
890  */
891 /*ARGSUSED*/
892 void
893 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
894     size_t header_length)
895 {
896 	dld_str_t		*dsp = (dld_str_t *)arg;
897 	mblk_t			*next;
898 
899 	ASSERT(mp != NULL);
900 	do {
901 		/*
902 		 * Get the pointer to the next packet in the chain and then
903 		 * clear b_next before the packet gets passed on.
904 		 */
905 		next = mp->b_next;
906 		mp->b_next = NULL;
907 
908 		/*
909 		 * Wind back b_rptr to point at the MAC header.
910 		 */
911 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
912 		mp->b_rptr -= header_length;
913 		if (header_length == sizeof (struct ether_vlan_header)) {
914 			/*
915 			 * Strip off the vtag
916 			 */
917 			ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ,
918 			    2 * ETHERADDRL);
919 			mp->b_rptr += VLAN_TAGSZ;
920 		}
921 
922 		/*
923 		 * Pass the packet on.
924 		 */
925 		putnext(dsp->ds_rq, mp);
926 
927 		/*
928 		 * Move on to the next packet in the chain.
929 		 */
930 		mp = next;
931 	} while (mp != NULL);
932 }
933 
934 /*
935  * Fast-path receive function.
936  */
937 /*ARGSUSED*/
938 void
939 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
940     size_t header_length)
941 {
942 	dld_str_t		*dsp = (dld_str_t *)arg;
943 	mblk_t			*next;
944 
945 	ASSERT(mp != NULL);
946 	do {
947 		/*
948 		 * Get the pointer to the next packet in the chain and then
949 		 * clear b_next before the packet gets passed on.
950 		 */
951 		next = mp->b_next;
952 		mp->b_next = NULL;
953 
954 		/*
955 		 * Pass the packet on.
956 		 */
957 		putnext(dsp->ds_rq, mp);
958 
959 		/*
960 		 * Move on to the next packet in the chain.
961 		 */
962 		mp = next;
963 	} while (mp != NULL);
964 }
965 
966 /*
967  * Default receive function (send DL_UNITDATA_IND messages).
968  */
969 /*ARGSUSED*/
970 void
971 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
972     size_t header_length)
973 {
974 	dld_str_t		*dsp = (dld_str_t *)arg;
975 	mblk_t			*ud_mp;
976 	mblk_t			*next;
977 
978 	ASSERT(mp != NULL);
979 	do {
980 		/*
981 		 * Get the pointer to the next packet in the chain and then
982 		 * clear b_next before the packet gets passed on.
983 		 */
984 		next = mp->b_next;
985 		mp->b_next = NULL;
986 
987 		/*
988 		 * Wind back b_rptr to point at the MAC header.
989 		 */
990 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
991 		mp->b_rptr -= header_length;
992 
993 		/*
994 		 * Create the DL_UNITDATA_IND M_PROTO.
995 		 */
996 		if ((ud_mp = str_unitdata_ind(dsp, mp)) == NULL) {
997 			freemsgchain(mp);
998 			return;
999 		}
1000 
1001 		/*
1002 		 * Advance b_rptr to point at the payload again.
1003 		 */
1004 		mp->b_rptr += header_length;
1005 
1006 		/*
1007 		 * Prepend the DL_UNITDATA_IND.
1008 		 */
1009 		ud_mp->b_cont = mp;
1010 
1011 		/*
1012 		 * Send the message.
1013 		 */
1014 		putnext(dsp->ds_rq, ud_mp);
1015 
1016 		/*
1017 		 * Move on to the next packet in the chain.
1018 		 */
1019 		mp = next;
1020 	} while (mp != NULL);
1021 }
1022 
1023 /*
1024  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1025  * current state of the interface.
1026  */
1027 void
1028 dld_str_notify_ind(dld_str_t *dsp)
1029 {
1030 	mac_notify_type_t	type;
1031 
1032 	for (type = 0; type < MAC_NNOTE; type++)
1033 		str_notify(dsp, type);
1034 }
1035 
/*
 * Layout of the DL_UNITDATA_IND message built by str_unitdata_ind(): the
 * fixed DLPI header followed by room for the destination and source
 * addresses.  Each address buffer holds up to MAXADDRLEN bytes of MAC
 * address plus a 16-bit value that str_unitdata_ind() stores directly
 * after the address.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;
	uint8_t			dl_dest_addr[MAXADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1041 
1042 /*
1043  * Create a DL_UNITDATA_IND M_PROTO message.
1044  */
1045 static mblk_t *
1046 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp)
1047 {
1048 	mblk_t				*nmp;
1049 	dl_unitdata_ind_wrapper_t	*dlwp;
1050 	dl_unitdata_ind_t		*dlp;
1051 	dls_header_info_t		dhi;
1052 	uint_t				addr_length;
1053 	uint8_t				*daddr;
1054 	uint8_t				*saddr;
1055 
1056 	/*
1057 	 * Get the packet header information.
1058 	 */
1059 	dls_header_info(dsp->ds_dc, mp, &dhi);
1060 
1061 	/*
1062 	 * Allocate a message large enough to contain the wrapper structure
1063 	 * defined above.
1064 	 */
1065 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1066 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1067 	    DL_UNITDATA_IND)) == NULL)
1068 		return (NULL);
1069 
1070 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1071 
1072 	dlp = &(dlwp->dl_unitdata);
1073 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1074 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1075 
1076 	/*
1077 	 * Copy in the destination address.
1078 	 */
1079 	addr_length = dsp->ds_mip->mi_addr_length;
1080 	daddr = dlwp->dl_dest_addr;
1081 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1082 	bcopy(dhi.dhi_daddr, daddr, addr_length);
1083 
1084 	/*
1085 	 * Set the destination DLSAP to our bound DLSAP value.
1086 	 */
1087 	*(uint16_t *)(daddr + addr_length) = dsp->ds_sap;
1088 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1089 
1090 	/*
1091 	 * If the destination address was a group address then
1092 	 * dl_group_address field should be non-zero.
1093 	 */
1094 	dlp->dl_group_address = dhi.dhi_isgroup;
1095 
1096 	/*
1097 	 * Copy in the source address.
1098 	 */
1099 	saddr = dlwp->dl_src_addr;
1100 	dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1101 	bcopy(dhi.dhi_saddr, saddr, addr_length);
1102 
1103 	/*
1104 	 * Set the source DLSAP to the packet ethertype.
1105 	 */
1106 	*(uint16_t *)(saddr + addr_length) = dhi.dhi_ethertype;
1107 	dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1108 
1109 	return (nmp);
1110 }
1111 
1112 /*
1113  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1114  */
1115 static void
1116 str_notify_promisc_on_phys(dld_str_t *dsp)
1117 {
1118 	mblk_t		*mp;
1119 	dl_notify_ind_t	*dlip;
1120 
1121 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1122 		return;
1123 
1124 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1125 	    M_PROTO, 0)) == NULL)
1126 		return;
1127 
1128 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1129 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1130 	dlip->dl_primitive = DL_NOTIFY_IND;
1131 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1132 
1133 	qreply(dsp->ds_wq, mp);
1134 }
1135 
1136 /*
1137  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1138  */
1139 static void
1140 str_notify_promisc_off_phys(dld_str_t *dsp)
1141 {
1142 	mblk_t		*mp;
1143 	dl_notify_ind_t	*dlip;
1144 
1145 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1146 		return;
1147 
1148 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1149 	    M_PROTO, 0)) == NULL)
1150 		return;
1151 
1152 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1153 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1154 	dlip->dl_primitive = DL_NOTIFY_IND;
1155 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1156 
1157 	qreply(dsp->ds_wq, mp);
1158 }
1159 
1160 /*
1161  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1162  */
1163 static void
1164 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1165 {
1166 	mblk_t		*mp;
1167 	dl_notify_ind_t	*dlip;
1168 	uint_t		addr_length;
1169 	uint16_t	ethertype;
1170 
1171 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1172 		return;
1173 
1174 	addr_length = dsp->ds_mip->mi_addr_length;
1175 	if ((mp = mexchange(dsp->ds_wq, NULL,
1176 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1177 	    M_PROTO, 0)) == NULL)
1178 		return;
1179 
1180 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1181 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1182 	dlip->dl_primitive = DL_NOTIFY_IND;
1183 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1184 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1185 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1186 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1187 
1188 	bcopy(addr, &dlip[1], addr_length);
1189 
1190 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1191 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1192 		ethertype;
1193 
1194 	qreply(dsp->ds_wq, mp);
1195 }
1196 
1197 /*
1198  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1199  */
1200 static void
1201 str_notify_link_up(dld_str_t *dsp)
1202 {
1203 	mblk_t		*mp;
1204 	dl_notify_ind_t	*dlip;
1205 
1206 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1207 		return;
1208 
1209 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1210 	    M_PROTO, 0)) == NULL)
1211 		return;
1212 
1213 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1214 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1215 	dlip->dl_primitive = DL_NOTIFY_IND;
1216 	dlip->dl_notification = DL_NOTE_LINK_UP;
1217 
1218 	qreply(dsp->ds_wq, mp);
1219 }
1220 
1221 /*
1222  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1223  */
1224 static void
1225 str_notify_link_down(dld_str_t *dsp)
1226 {
1227 	mblk_t		*mp;
1228 	dl_notify_ind_t	*dlip;
1229 
1230 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1231 		return;
1232 
1233 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1234 	    M_PROTO, 0)) == NULL)
1235 		return;
1236 
1237 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1238 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1239 	dlip->dl_primitive = DL_NOTIFY_IND;
1240 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1241 
1242 	qreply(dsp->ds_wq, mp);
1243 }
1244 
1245 /*
1246  * DL_NOTIFY_IND: DL_NOTE_SPEED
1247  */
1248 static void
1249 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1250 {
1251 	mblk_t		*mp;
1252 	dl_notify_ind_t	*dlip;
1253 
1254 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1255 		return;
1256 
1257 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1258 	    M_PROTO, 0)) == NULL)
1259 		return;
1260 
1261 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1262 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1263 	dlip->dl_primitive = DL_NOTIFY_IND;
1264 	dlip->dl_notification = DL_NOTE_SPEED;
1265 	dlip->dl_data = speed;
1266 
1267 	qreply(dsp->ds_wq, mp);
1268 }
1269 
1270 /*
1271  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1272  */
1273 static void
1274 str_notify_capab_reneg(dld_str_t *dsp)
1275 {
1276 	mblk_t		*mp;
1277 	dl_notify_ind_t	*dlip;
1278 
1279 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1280 		return;
1281 
1282 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1283 	    M_PROTO, 0)) == NULL)
1284 		return;
1285 
1286 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1287 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1288 	dlip->dl_primitive = DL_NOTIFY_IND;
1289 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1290 
1291 	qreply(dsp->ds_wq, mp);
1292 }
1293 
1294 /*
1295  * MAC notification callback.
1296  */
1297 static void
1298 str_notify(void *arg, mac_notify_type_t type)
1299 {
1300 	dld_str_t		*dsp = (dld_str_t *)arg;
1301 	queue_t			*q = dsp->ds_wq;
1302 
1303 	switch (type) {
1304 	case MAC_NOTE_TX:
1305 		qenable(q);
1306 		break;
1307 
1308 	case MAC_NOTE_DEVPROMISC:
1309 		/*
1310 		 * Send the appropriate DL_NOTIFY_IND.
1311 		 */
1312 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1313 			str_notify_promisc_on_phys(dsp);
1314 		else
1315 			str_notify_promisc_off_phys(dsp);
1316 		break;
1317 
1318 	case MAC_NOTE_PROMISC:
1319 		break;
1320 
1321 	case MAC_NOTE_UNICST:
1322 		/*
1323 		 * This notification is sent whenever the MAC unicast address
1324 		 * changes. We need to re-cache the address.
1325 		 */
1326 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1327 
1328 		/*
1329 		 * Send the appropriate DL_NOTIFY_IND.
1330 		 */
1331 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1332 		break;
1333 
1334 	case MAC_NOTE_LINK:
1335 		/*
1336 		 * This notification is sent every time the MAC driver
1337 		 * updates the link state.
1338 		 */
1339 		switch (mac_link_get(dsp->ds_mh)) {
1340 		case LINK_STATE_UP:
1341 			/*
1342 			 * The link is up so send the appropriate
1343 			 * DL_NOTIFY_IND.
1344 			 */
1345 			str_notify_link_up(dsp);
1346 
1347 			/*
1348 			 * If we can find the link speed then send a
1349 			 * DL_NOTIFY_IND for that too.
1350 			 */
1351 			if (dsp->ds_mip->mi_stat[MAC_STAT_IFSPEED]) {
1352 				uint64_t	val;
1353 
1354 				val = mac_stat_get(dsp->ds_mh,
1355 				    MAC_STAT_IFSPEED);
1356 				str_notify_speed(dsp,
1357 				    (uint32_t)(val / 1000ull));
1358 			}
1359 			break;
1360 
1361 		case LINK_STATE_DOWN:
1362 			/*
1363 			 * The link is down so send the appropriate
1364 			 * DL_NOTIFY_IND.
1365 			 */
1366 			str_notify_link_down(dsp);
1367 			break;
1368 
1369 		default:
1370 			break;
1371 		}
1372 		break;
1373 
1374 	case MAC_NOTE_RESOURCE:
1375 		/*
1376 		 * This notification is sent whenever the MAC resources
1377 		 * change. We need to renegotiate the capabilities.
1378 		 * Send the appropriate DL_NOTIFY_IND.
1379 		 */
1380 		str_notify_capab_reneg(dsp);
1381 		break;
1382 
1383 	default:
1384 		ASSERT(B_FALSE);
1385 		break;
1386 	}
1387 }
1388 
1389 /*
1390  * Enqueue one or more messages to the transmit queue.
1391  * Caller specifies the insertion position (head/tail).
1392  */
1393 void
1394 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1395 {
1396 	mblk_t	*tail;
1397 	queue_t *q = dsp->ds_wq;
1398 	uint_t	cnt, msgcnt;
1399 	uint_t	tot_cnt, tot_msgcnt;
1400 
1401 	ASSERT(DB_TYPE(mp) == M_DATA);
1402 	/* Calculate total size and count of the packet(s) */
1403 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1404 	    tail->b_next != NULL; tail = tail->b_next) {
1405 		ASSERT(DB_TYPE(tail) == M_DATA);
1406 		cnt += msgdsize(tail);
1407 		msgcnt++;
1408 	}
1409 
1410 	mutex_enter(&dsp->ds_tx_list_lock);
1411 	/*
1412 	 * If the queue depth would exceed the allowed threshold, drop
1413 	 * new packet(s) and drain those already in the queue.
1414 	 */
1415 	tot_cnt = dsp->ds_tx_cnt + cnt;
1416 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1417 
1418 	if (!head_insert &&
1419 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1420 		ASSERT(dsp->ds_tx_qbusy);
1421 		mutex_exit(&dsp->ds_tx_list_lock);
1422 		freemsgchain(mp);
1423 		goto done;
1424 	}
1425 
1426 	/* Update the queue size parameters */
1427 	dsp->ds_tx_cnt = tot_cnt;
1428 	dsp->ds_tx_msgcnt = tot_msgcnt;
1429 
1430 	/*
1431 	 * If the transmit queue is currently empty and we are
1432 	 * about to deposit the packet(s) there, switch mode to
1433 	 * "busy" and raise flow-control condition.
1434 	 */
1435 	if (!dsp->ds_tx_qbusy) {
1436 		dsp->ds_tx_qbusy = B_TRUE;
1437 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1438 		(void) putq(q, dsp->ds_tx_flow_mp);
1439 		dsp->ds_tx_flow_mp = NULL;
1440 	}
1441 
1442 	if (!head_insert) {
1443 		/* Tail insertion */
1444 		if (dsp->ds_tx_list_head == NULL)
1445 			dsp->ds_tx_list_head = mp;
1446 		else
1447 			dsp->ds_tx_list_tail->b_next = mp;
1448 		dsp->ds_tx_list_tail = tail;
1449 	} else {
1450 		/* Head insertion */
1451 		tail->b_next = dsp->ds_tx_list_head;
1452 		if (dsp->ds_tx_list_head == NULL)
1453 			dsp->ds_tx_list_tail = tail;
1454 		dsp->ds_tx_list_head = mp;
1455 	}
1456 	mutex_exit(&dsp->ds_tx_list_lock);
1457 done:
1458 	/* Schedule service thread to drain the transmit queue */
1459 	qenable(q);
1460 }
1461 
1462 void
1463 dld_tx_flush(dld_str_t *dsp)
1464 {
1465 	mutex_enter(&dsp->ds_tx_list_lock);
1466 	if (dsp->ds_tx_list_head != NULL) {
1467 		freemsgchain(dsp->ds_tx_list_head);
1468 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1469 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1470 		if (dsp->ds_tx_qbusy) {
1471 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1472 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1473 			dsp->ds_tx_qbusy = B_FALSE;
1474 		}
1475 	}
1476 	mutex_exit(&dsp->ds_tx_list_lock);
1477 }
1478 
1479 /*
1480  * Process an M_IOCTL message.
1481  */
1482 static void
1483 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1484 {
1485 	uint_t			cmd;
1486 
1487 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1488 	ASSERT(dsp->ds_type == DLD_DLPI);
1489 
1490 	switch (cmd) {
1491 	case DLIOCRAW:
1492 		ioc_raw(dsp, mp);
1493 		break;
1494 	case DLIOCHDRINFO:
1495 		ioc_fast(dsp, mp);
1496 		break;
1497 	default:
1498 		ioc(dsp, mp);
1499 	}
1500 }
1501 
1502 /*
1503  * DLIOCRAW
1504  */
1505 static void
1506 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1507 {
1508 	queue_t *q = dsp->ds_wq;
1509 
1510 	rw_enter(&dsp->ds_lock, RW_WRITER);
1511 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1512 		rw_exit(&dsp->ds_lock);
1513 		miocnak(q, mp, 0, EPROTO);
1514 		return;
1515 	}
1516 
1517 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1518 		/*
1519 		 * Set the receive callback.
1520 		 */
1521 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1522 
1523 		/*
1524 		 * Note that raw mode is enabled.
1525 		 */
1526 		dsp->ds_mode = DLD_RAW;
1527 	}
1528 
1529 	rw_exit(&dsp->ds_lock);
1530 	miocack(q, mp, 0, 0);
1531 }
1532 
1533 /*
1534  * DLIOCHDRINFO
1535  */
1536 static void
1537 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1538 {
1539 	dl_unitdata_req_t *dlp;
1540 	off_t		off;
1541 	size_t		len;
1542 	const uint8_t	*addr;
1543 	uint16_t	sap;
1544 	mblk_t		*nmp;
1545 	mblk_t		*hmp;
1546 	uint_t		addr_length;
1547 	queue_t		*q = dsp->ds_wq;
1548 	int		err;
1549 	dls_channel_t	dc;
1550 
1551 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1552 		err = ENOTSUP;
1553 		goto failed;
1554 	}
1555 
1556 	nmp = mp->b_cont;
1557 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1558 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1559 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1560 		err = EINVAL;
1561 		goto failed;
1562 	}
1563 
1564 	off = dlp->dl_dest_addr_offset;
1565 	len = dlp->dl_dest_addr_length;
1566 
1567 	if (!MBLKIN(nmp, off, len)) {
1568 		err = EINVAL;
1569 		goto failed;
1570 	}
1571 
1572 	rw_enter(&dsp->ds_lock, RW_READER);
1573 	if (dsp->ds_dlstate != DL_IDLE) {
1574 		rw_exit(&dsp->ds_lock);
1575 		err = ENOTSUP;
1576 		goto failed;
1577 	}
1578 
1579 	addr_length = dsp->ds_mip->mi_addr_length;
1580 	if (len != addr_length + sizeof (uint16_t)) {
1581 		rw_exit(&dsp->ds_lock);
1582 		err = EINVAL;
1583 		goto failed;
1584 	}
1585 
1586 	addr = nmp->b_rptr + off;
1587 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
1588 	dc = dsp->ds_dc;
1589 
1590 	if ((hmp = dls_header(dc, addr, sap, dsp->ds_pri)) == NULL) {
1591 		rw_exit(&dsp->ds_lock);
1592 		err = ENOMEM;
1593 		goto failed;
1594 	}
1595 
1596 	/*
1597 	 * This is a performance optimization.  We originally entered
1598 	 * as reader and only become writer upon transitioning into
1599 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
1600 	 * stay as reader and return the fast-path header to IP.
1601 	 */
1602 	if (dsp->ds_mode != DLD_FASTPATH) {
1603 		if (!rw_tryupgrade(&dsp->ds_lock)) {
1604 			rw_exit(&dsp->ds_lock);
1605 			rw_enter(&dsp->ds_lock, RW_WRITER);
1606 
1607 			/*
1608 			 * State may have changed before we re-acquired
1609 			 * the writer lock in case the upgrade failed.
1610 			 */
1611 			if (dsp->ds_dlstate != DL_IDLE) {
1612 				rw_exit(&dsp->ds_lock);
1613 				err = ENOTSUP;
1614 				goto failed;
1615 			}
1616 		}
1617 
1618 		/*
1619 		 * Set the receive callback (unless polling is enabled).
1620 		 */
1621 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
1622 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
1623 
1624 		/*
1625 		 * Note that fast-path mode is enabled.
1626 		 */
1627 		dsp->ds_mode = DLD_FASTPATH;
1628 	}
1629 	rw_exit(&dsp->ds_lock);
1630 
1631 	freemsg(nmp->b_cont);
1632 	nmp->b_cont = hmp;
1633 
1634 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
1635 	return;
1636 failed:
1637 	miocnak(q, mp, 0, err);
1638 }
1639 
1640 /*
1641  * Catch-all handler.
1642  */
1643 static void
1644 ioc(dld_str_t *dsp, mblk_t *mp)
1645 {
1646 	queue_t	*q = dsp->ds_wq;
1647 	mac_handle_t mh;
1648 
1649 	rw_enter(&dsp->ds_lock, RW_READER);
1650 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1651 		rw_exit(&dsp->ds_lock);
1652 		miocnak(q, mp, 0, EINVAL);
1653 		return;
1654 	}
1655 	mh = dsp->ds_mh;
1656 	ASSERT(mh != NULL);
1657 	rw_exit(&dsp->ds_lock);
1658 	mac_ioctl(mh, q, mp);
1659 }
1660 
1661 /*
1662  * Allocate a new minor number.
1663  */
1664 static minor_t
1665 dld_minor_hold(boolean_t sleep)
1666 {
1667 	minor_t		minor;
1668 
1669 	/*
1670 	 * Grab a value from the arena.
1671 	 */
1672 	atomic_add_32(&minor_count, 1);
1673 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
1674 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
1675 		atomic_add_32(&minor_count, -1);
1676 		return (0);
1677 	}
1678 
1679 	return (minor);
1680 }
1681 
1682 /*
1683  * Release a previously allocated minor number.
1684  */
1685 static void
1686 dld_minor_rele(minor_t minor)
1687 {
1688 	/*
1689 	 * Return the value to the arena.
1690 	 */
1691 	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);
1692 
1693 	atomic_add_32(&minor_count, -1);
1694 }
1695