xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision d2ec6b54cfaa9a3bf7fc816dc44fabab46f9ecc4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Data-Link Driver
30  */
31 
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/strsubr.h>
35 #include	<sys/atomic.h>
36 #include	<sys/mkdev.h>
37 #include	<sys/vlan.h>
38 #include	<sys/dld.h>
39 #include	<sys/dld_impl.h>
40 #include	<sys/dls_impl.h>
41 #include	<inet/common.h>
42 
43 static int	str_constructor(void *, void *, int);
44 static void	str_destructor(void *, void *);
45 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *);
46 static void	str_notify_promisc_on_phys(dld_str_t *);
47 static void	str_notify_promisc_off_phys(dld_str_t *);
48 static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
49 static void	str_notify_link_up(dld_str_t *);
50 static void	str_notify_link_down(dld_str_t *);
51 static void	str_notify_capab_reneg(dld_str_t *);
52 static void	str_notify_speed(dld_str_t *, uint32_t);
53 static void	str_notify(void *, mac_notify_type_t);
54 
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc(dld_str_t *, mblk_t *);
58 static void	dld_ioc(dld_str_t *, mblk_t *);
59 static minor_t	dld_minor_hold(boolean_t);
60 static void	dld_minor_rele(minor_t);
61 
62 static uint32_t		str_count;
63 static kmem_cache_t	*str_cachep;
64 static vmem_t		*minor_arenap;
65 static uint32_t		minor_count;
66 
67 #define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
68 #define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))
69 
70 /*
71  * Some notes on entry points, flow-control, queueing and locking:
72  *
73  * This driver exports the traditional STREAMS put entry point as well as
74  * the non-STREAMS fast-path transmit routine which is provided to IP via
75  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
76  * and data operations, while the fast-path routine deals only with M_DATA
77  * fast-path packets.  Regardless of the entry point, all outbound packets
78  * will end up in str_mdata_fastpath_put(), where they will be delivered to
79  * the MAC driver.
80  *
81  * The transmit logic operates in two modes: a "not busy" mode where the
82  * packets will be delivered to the MAC for a send attempt, or "busy" mode
83  * where they will be enqueued in the internal queue because of flow-control.
84  * Flow-control happens when the MAC driver indicates the packets couldn't
85  * be transmitted due to lack of resources (e.g. running out of descriptors).
86  * In such case, the driver will place a dummy message on its write-side
87  * STREAMS queue so that the queue is marked as "full".  Any subsequent
88  * packets arriving at the driver will be enqueued in the internal queue,
89  * which is drained in the context of the service thread that gets scheduled
90  * whenever the driver is in the "busy" mode.  When all packets have been
91  * successfully delivered by MAC and the internal queue is empty, it will
92  * transition to the "not busy" mode by removing the dummy message from the
93  * write-side STREAMS queue; in effect this will trigger backenabling.
94  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
95  * to the above reasons.
96  *
97  * The driver implements an internal transmit queue independent of STREAMS.
98  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
 * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
100  * getq() operations done by the driver are those related to placing and
101  * removing the dummy message to/from the write-side STREAMS queue for flow-
102  * control purposes.
103  *
104  * Locking is done independent of STREAMS due to the driver being fully MT.
105  * Threads entering the driver (either from put or service entry points)
106  * will most likely be readers, with the exception of a few writer cases
 * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
108  * DLD-related ioctl requests.  The DLPI detach case is special, because
109  * it involves freeing resources and therefore must be single-threaded.
110  * Unfortunately the readers/writers lock can't be used to protect against
111  * it, because the lock is dropped prior to the driver calling places where
112  * putnext() may be invoked, and such places may depend on those resources
113  * to exist.  Because of this, the driver always completes the DLPI detach
114  * process when there are no other threads running in the driver.  This is
 * done by keeping track of the number of threads, such that the last
116  * thread leaving the driver will finish the pending DLPI detach operation.
117  */
118 
119 /*
120  * dld_max_q_count is the queue depth threshold used to limit the number of
121  * outstanding packets or bytes allowed in the queue; once this limit is
122  * reached the driver will free any incoming ones until the queue depth
123  * drops below the threshold.
124  *
125  * This buffering is provided to accomodate clients which do not employ
126  * their own buffering scheme, and to handle occasional packet bursts.
127  * Clients which handle their own buffering will receive positive feedback
128  * from this driver as soon as it transitions into the "busy" state, i.e.
129  * when the queue is initially filled up; they will get backenabled once
130  * the queue is empty.
131  *
132  * The value chosen here is rather arbitrary; in future some intelligent
133  * heuristics may be involved which could take into account the hardware's
134  * transmit ring size, etc.
135  */
136 uint_t dld_max_q_count = (16 * 1024 *1024);
137 
138 static dev_info_t *
139 dld_finddevinfo(dev_t dev)
140 {
141 	minor_t		minor = getminor(dev);
142 	char		*drvname = ddi_major_to_name(getmajor(dev));
143 	char		name[MAXNAMELEN];
144 	dls_vlan_t	*dvp = NULL;
145 	dev_info_t	*dip = NULL;
146 
147 	if (drvname == NULL || minor == 0 || minor > DLD_MAX_PPA + 1)
148 		return (NULL);
149 
150 	(void) snprintf(name, MAXNAMELEN, "%s%d", drvname, (int)minor - 1);
151 	if (dls_vlan_hold(name, &dvp, B_FALSE) != 0)
152 		return (NULL);
153 
154 	dip = mac_devinfo_get(dvp->dv_dlp->dl_mh);
155 	dls_vlan_rele(dvp);
156 	return (dip);
157 }
158 
159 /*
160  * devo_getinfo: getinfo(9e)
161  */
162 /*ARGSUSED*/
163 int
164 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
165 {
166 	dev_info_t	*devinfo;
167 	minor_t		minor = getminor((dev_t)arg);
168 	int		rc = DDI_FAILURE;
169 
170 	switch (cmd) {
171 	case DDI_INFO_DEVT2DEVINFO:
172 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
173 			*(dev_info_t **)resp = devinfo;
174 			rc = DDI_SUCCESS;
175 		}
176 		break;
177 	case DDI_INFO_DEVT2INSTANCE:
178 		if (minor > 0 && minor <= DLD_MAX_PPA + 1) {
179 			*resp = (void *)(uintptr_t)(minor - 1);
180 			rc = DDI_SUCCESS;
181 		}
182 		break;
183 	}
184 	return (rc);
185 }
186 
187 /*
188  * qi_qopen: open(9e)
189  */
190 /*ARGSUSED*/
191 int
192 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
193 {
194 	dld_str_t	*dsp;
195 	major_t		major;
196 	minor_t		minor;
197 	int		err;
198 
199 	if (sflag == MODOPEN)
200 		return (ENOTSUP);
201 
202 	/*
203 	 * This is a cloning driver and therefore each queue should only
204 	 * ever get opened once.
205 	 */
206 	if (rq->q_ptr != NULL)
207 		return (EBUSY);
208 
209 	major = getmajor(*devp);
210 	minor = getminor(*devp);
211 	if (minor > DLD_MAX_MINOR)
212 		return (ENODEV);
213 
214 	/*
215 	 * Create a new dld_str_t for the stream. This will grab a new minor
216 	 * number that will be handed back in the cloned dev_t.  Creation may
217 	 * fail if we can't allocate the dummy mblk used for flow-control.
218 	 */
219 	dsp = dld_str_create(rq, DLD_DLPI, major,
220 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
221 	if (dsp == NULL)
222 		return (ENOSR);
223 
224 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
225 	if (minor != 0) {
226 		/*
227 		 * Style 1 open
228 		 */
229 
230 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
231 			goto failed;
232 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
233 	} else {
234 		(void) qassociate(rq, -1);
235 	}
236 
237 	/*
238 	 * Enable the queue srv(9e) routine.
239 	 */
240 	qprocson(rq);
241 
242 	/*
243 	 * Construct a cloned dev_t to hand back.
244 	 */
245 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
246 	return (0);
247 
248 failed:
249 	dld_str_destroy(dsp);
250 	return (err);
251 }
252 
253 /*
254  * qi_qclose: close(9e)
255  */
256 int
257 dld_close(queue_t *rq)
258 {
259 	dld_str_t	*dsp = rq->q_ptr;
260 
261 	/*
262 	 * Wait until pending requests are processed.
263 	 */
264 	mutex_enter(&dsp->ds_thr_lock);
265 	while (dsp->ds_pending_cnt > 0)
266 		cv_wait(&dsp->ds_pending_cv, &dsp->ds_thr_lock);
267 	mutex_exit(&dsp->ds_thr_lock);
268 
269 	/*
270 	 * Disable the queue srv(9e) routine.
271 	 */
272 	qprocsoff(rq);
273 
274 	/*
275 	 * At this point we can not be entered by any threads via STREAMS
276 	 * or the direct call interface, which is available only to IP.
277 	 * After the interface is unplumbed, IP wouldn't have any reference
278 	 * to this instance, and therefore we are now effectively single
279 	 * threaded and don't require any lock protection.  Flush all
280 	 * pending packets which are sitting in the transmit queue.
281 	 */
282 	ASSERT(dsp->ds_thr == 0);
283 	dld_tx_flush(dsp);
284 
285 	/*
286 	 * This stream was open to a provider node. Check to see
287 	 * if it has been cleanly shut down.
288 	 */
289 	if (dsp->ds_dlstate != DL_UNATTACHED) {
290 		/*
291 		 * The stream is either open to a style 1 provider or
292 		 * this is not clean shutdown. Detach from the PPA.
293 		 * (This is still ok even in the style 1 case).
294 		 */
295 		dld_str_detach(dsp);
296 	}
297 
298 	dld_str_destroy(dsp);
299 	return (0);
300 }
301 
302 /*
303  * qi_qputp: put(9e)
304  */
305 void
306 dld_wput(queue_t *wq, mblk_t *mp)
307 {
308 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
309 
310 	DLD_ENTER(dsp);
311 
312 	switch (DB_TYPE(mp)) {
313 	case M_DATA:
314 		rw_enter(&dsp->ds_lock, RW_READER);
315 		if (dsp->ds_dlstate != DL_IDLE ||
316 		    dsp->ds_mode == DLD_UNITDATA) {
317 			freemsg(mp);
318 		} else if (dsp->ds_mode == DLD_FASTPATH) {
319 			str_mdata_fastpath_put(dsp, mp);
320 		} else if (dsp->ds_mode == DLD_RAW) {
321 			str_mdata_raw_put(dsp, mp);
322 		}
323 		rw_exit(&dsp->ds_lock);
324 		break;
325 	case M_PROTO:
326 	case M_PCPROTO:
327 		dld_proto(dsp, mp);
328 		break;
329 	case M_IOCTL:
330 		dld_ioc(dsp, mp);
331 		break;
332 	case M_FLUSH:
333 		if (*mp->b_rptr & FLUSHW) {
334 			dld_tx_flush(dsp);
335 			*mp->b_rptr &= ~FLUSHW;
336 		}
337 
338 		if (*mp->b_rptr & FLUSHR) {
339 			qreply(wq, mp);
340 		} else {
341 			freemsg(mp);
342 		}
343 		break;
344 	default:
345 		freemsg(mp);
346 		break;
347 	}
348 
349 	DLD_EXIT(dsp);
350 }
351 
352 /*
353  * qi_srvp: srv(9e)
354  */
355 void
356 dld_wsrv(queue_t *wq)
357 {
358 	mblk_t		*mp;
359 	dld_str_t	*dsp = wq->q_ptr;
360 
361 	DLD_ENTER(dsp);
362 	rw_enter(&dsp->ds_lock, RW_READER);
363 	/*
364 	 * Grab all packets (chained via b_next) off our transmit queue
365 	 * and try to send them all to the MAC layer.  Since the queue
366 	 * is independent of streams, we are able to dequeue all messages
367 	 * at once without looping through getq() and manually chaining
368 	 * them.  Note that the queue size parameters (byte and message
369 	 * counts) are cleared as well, but we postpone the backenabling
370 	 * until after the MAC transmit since some packets may end up
371 	 * back at our transmit queue.
372 	 */
373 	mutex_enter(&dsp->ds_tx_list_lock);
374 	if ((mp = dsp->ds_tx_list_head) == NULL) {
375 		ASSERT(!dsp->ds_tx_qbusy);
376 		ASSERT(dsp->ds_tx_flow_mp != NULL);
377 		ASSERT(dsp->ds_tx_list_head == NULL);
378 		ASSERT(dsp->ds_tx_list_tail == NULL);
379 		ASSERT(dsp->ds_tx_cnt == 0);
380 		ASSERT(dsp->ds_tx_msgcnt == 0);
381 		mutex_exit(&dsp->ds_tx_list_lock);
382 		goto done;
383 	}
384 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
385 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
386 	mutex_exit(&dsp->ds_tx_list_lock);
387 
388 	/*
389 	 * Discard packets unless we are attached and bound; note that
390 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
391 	 * because regardless of the mode all transmit will end up in
392 	 * str_mdata_fastpath_put() where the packets may be queued.
393 	 */
394 	ASSERT(DB_TYPE(mp) == M_DATA);
395 	if (dsp->ds_dlstate != DL_IDLE) {
396 		freemsgchain(mp);
397 		goto done;
398 	}
399 
400 	/*
401 	 * Attempt to transmit one or more packets.  If the MAC can't
402 	 * send them all, re-queue the packet(s) at the beginning of
403 	 * the transmit queue to avoid any re-ordering.
404 	 */
405 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
406 		dld_tx_enqueue(dsp, mp, B_TRUE);
407 
408 	/*
409 	 * Grab the list lock again and check if the transmit queue is
410 	 * really empty; if so, lift up flow-control and backenable any
411 	 * writer queues.  If the queue is not empty, schedule service
412 	 * thread to drain it.
413 	 */
414 	mutex_enter(&dsp->ds_tx_list_lock);
415 	if (dsp->ds_tx_list_head == NULL) {
416 		dsp->ds_tx_flow_mp = getq(wq);
417 		ASSERT(dsp->ds_tx_flow_mp != NULL);
418 		dsp->ds_tx_qbusy = B_FALSE;
419 	}
420 	mutex_exit(&dsp->ds_tx_list_lock);
421 done:
422 	rw_exit(&dsp->ds_lock);
423 	DLD_EXIT(dsp);
424 }
425 
426 void
427 dld_init_ops(struct dev_ops *ops, const char *name)
428 {
429 	struct streamtab *stream;
430 	struct qinit *rq, *wq;
431 	struct module_info *modinfo;
432 
433 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
434 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
435 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
436 	modinfo->mi_minpsz = 0;
437 	modinfo->mi_maxpsz = 64*1024;
438 	modinfo->mi_hiwat  = 1;
439 	modinfo->mi_lowat = 0;
440 
441 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
442 	rq->qi_qopen = dld_open;
443 	rq->qi_qclose = dld_close;
444 	rq->qi_minfo = modinfo;
445 
446 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
447 	wq->qi_putp = (pfi_t)dld_wput;
448 	wq->qi_srvp = (pfi_t)dld_wsrv;
449 	wq->qi_minfo = modinfo;
450 
451 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
452 	stream->st_rdinit = rq;
453 	stream->st_wrinit = wq;
454 	ops->devo_cb_ops->cb_str = stream;
455 
456 	ops->devo_getinfo = &dld_getinfo;
457 }
458 
459 void
460 dld_fini_ops(struct dev_ops *ops)
461 {
462 	struct streamtab *stream;
463 	struct qinit *rq, *wq;
464 	struct module_info *modinfo;
465 
466 	stream = ops->devo_cb_ops->cb_str;
467 	rq = stream->st_rdinit;
468 	wq = stream->st_wrinit;
469 	modinfo = rq->qi_minfo;
470 	ASSERT(wq->qi_minfo == modinfo);
471 
472 	kmem_free(stream, sizeof (struct streamtab));
473 	kmem_free(wq, sizeof (struct qinit));
474 	kmem_free(rq, sizeof (struct qinit));
475 	kmem_free(modinfo->mi_idname, FMNAMESZ);
476 	kmem_free(modinfo, sizeof (struct module_info));
477 }
478 
479 /*
480  * Initialize this module's data structures.
481  */
482 void
483 dld_str_init(void)
484 {
485 	/*
486 	 * Create dld_str_t object cache.
487 	 */
488 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
489 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
490 	ASSERT(str_cachep != NULL);
491 
492 	/*
493 	 * Allocate a vmem arena to manage minor numbers. The range of the
494 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
495 	 * minor number).
496 	 */
497 	minor_arenap = vmem_create("dld_minor_arena",
498 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
499 	    VM_SLEEP | VMC_IDENTIFIER);
500 	ASSERT(minor_arenap != NULL);
501 }
502 
503 /*
504  * Tear down this module's data structures.
505  */
506 int
507 dld_str_fini(void)
508 {
509 	/*
510 	 * Make sure that there are no objects in use.
511 	 */
512 	if (str_count != 0)
513 		return (EBUSY);
514 
515 	/*
516 	 * Check to see if there are any minor numbers still in use.
517 	 */
518 	if (minor_count != 0)
519 		return (EBUSY);
520 
521 	/*
522 	 * Destroy object cache.
523 	 */
524 	kmem_cache_destroy(str_cachep);
525 	vmem_destroy(minor_arenap);
526 	return (0);
527 }
528 
529 /*
530  * Create a new dld_str_t object.
531  */
532 dld_str_t *
533 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
534 {
535 	dld_str_t	*dsp;
536 
537 	/*
538 	 * Allocate an object from the cache.
539 	 */
540 	atomic_add_32(&str_count, 1);
541 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
542 
543 	/*
544 	 * Allocate the dummy mblk for flow-control.
545 	 */
546 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
547 	if (dsp->ds_tx_flow_mp == NULL) {
548 		kmem_cache_free(str_cachep, dsp);
549 		atomic_add_32(&str_count, -1);
550 		return (NULL);
551 	}
552 	dsp->ds_type = type;
553 	dsp->ds_major = major;
554 	dsp->ds_style = style;
555 
556 	/*
557 	 * Initialize the queue pointers.
558 	 */
559 	ASSERT(RD(rq) == rq);
560 	dsp->ds_rq = rq;
561 	dsp->ds_wq = WR(rq);
562 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
563 
564 	/*
565 	 * We want explicit control over our write-side STREAMS queue
566 	 * where the dummy mblk gets added/removed for flow-control.
567 	 */
568 	noenable(WR(rq));
569 
570 	return (dsp);
571 }
572 
573 /*
574  * Destroy a dld_str_t object.
575  */
576 void
577 dld_str_destroy(dld_str_t *dsp)
578 {
579 	queue_t		*rq;
580 	queue_t		*wq;
581 
582 	/*
583 	 * Clear the queue pointers.
584 	 */
585 	rq = dsp->ds_rq;
586 	wq = dsp->ds_wq;
587 	ASSERT(wq == WR(rq));
588 
589 	rq->q_ptr = wq->q_ptr = NULL;
590 	dsp->ds_rq = dsp->ds_wq = NULL;
591 
592 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
593 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
594 	ASSERT(dsp->ds_tx_list_head == NULL);
595 	ASSERT(dsp->ds_tx_list_tail == NULL);
596 	ASSERT(dsp->ds_tx_cnt == 0);
597 	ASSERT(dsp->ds_tx_msgcnt == 0);
598 	ASSERT(!dsp->ds_tx_qbusy);
599 
600 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
601 	ASSERT(dsp->ds_thr == 0);
602 	ASSERT(dsp->ds_pending_req == NULL);
603 
604 	/*
605 	 * Reinitialize all the flags.
606 	 */
607 	dsp->ds_notifications = 0;
608 	dsp->ds_passivestate = DLD_UNINITIALIZED;
609 	dsp->ds_mode = DLD_UNITDATA;
610 
611 	/*
612 	 * Free the dummy mblk if exists.
613 	 */
614 	if (dsp->ds_tx_flow_mp != NULL) {
615 		freeb(dsp->ds_tx_flow_mp);
616 		dsp->ds_tx_flow_mp = NULL;
617 	}
618 	/*
619 	 * Free the object back to the cache.
620 	 */
621 	kmem_cache_free(str_cachep, dsp);
622 	atomic_add_32(&str_count, -1);
623 }
624 
625 /*
626  * kmem_cache contructor function: see kmem_cache_create(9f).
627  */
628 /*ARGSUSED*/
629 static int
630 str_constructor(void *buf, void *cdrarg, int kmflags)
631 {
632 	dld_str_t	*dsp = buf;
633 
634 	bzero(buf, sizeof (dld_str_t));
635 
636 	/*
637 	 * Allocate a new minor number.
638 	 */
639 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
640 		return (-1);
641 
642 	/*
643 	 * Initialize the DLPI state machine.
644 	 */
645 	dsp->ds_dlstate = DL_UNATTACHED;
646 
647 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
648 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
649 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
650 	cv_init(&dsp->ds_pending_cv, NULL, CV_DRIVER, NULL);
651 
652 	return (0);
653 }
654 
655 /*
656  * kmem_cache destructor function.
657  */
658 /*ARGSUSED*/
659 static void
660 str_destructor(void *buf, void *cdrarg)
661 {
662 	dld_str_t	*dsp = buf;
663 
664 	/*
665 	 * Make sure the DLPI state machine was reset.
666 	 */
667 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
668 
669 	/*
670 	 * Make sure the data-link interface was closed.
671 	 */
672 	ASSERT(dsp->ds_mh == NULL);
673 	ASSERT(dsp->ds_dc == NULL);
674 
675 	/*
676 	 * Make sure enabled notifications are cleared.
677 	 */
678 	ASSERT(dsp->ds_notifications == 0);
679 
680 	/*
681 	 * Make sure polling is disabled.
682 	 */
683 	ASSERT(!dsp->ds_polling);
684 
685 	/*
686 	 * Release the minor number.
687 	 */
688 	dld_minor_rele(dsp->ds_minor);
689 
690 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
691 	rw_destroy(&dsp->ds_lock);
692 
693 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
694 	mutex_destroy(&dsp->ds_tx_list_lock);
695 	ASSERT(dsp->ds_tx_flow_mp == NULL);
696 
697 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
698 	mutex_destroy(&dsp->ds_thr_lock);
699 	ASSERT(dsp->ds_pending_req == NULL);
700 	ASSERT(dsp->ds_pending_op == NULL);
701 	ASSERT(dsp->ds_pending_cnt == 0);
702 	cv_destroy(&dsp->ds_pending_cv);
703 }
704 
705 /*
706  * M_DATA put (IP fast-path mode)
707  */
708 void
709 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
710 {
711 	/*
712 	 * This function can be called from within dld or from an upper
713 	 * layer protocol (currently only tcp). If we are in the busy
714 	 * mode enqueue the packet(s) and return.  Otherwise hand them
715 	 * over to the MAC driver for transmission; any remaining one(s)
716 	 * which didn't get sent will be queued.
717 	 *
718 	 * Note here that we don't grab the list lock prior to checking
719 	 * the busy flag.  This is okay, because a missed transition
720 	 * will not cause any packet reordering for any particular TCP
721 	 * connection (which is single-threaded).  The enqueue routine
722 	 * will atomically set the busy flag and schedule the service
723 	 * thread to run; the flag is only cleared by the service thread
724 	 * when there is no more packet to be transmitted.
725 	 */
726 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
727 		dld_tx_enqueue(dsp, mp, B_FALSE);
728 }
729 
730 /*
731  * M_DATA put (raw mode)
732  */
733 void
734 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
735 {
736 	struct ether_header	*ehp;
737 	mblk_t			*bp;
738 	size_t			size;
739 	size_t			hdrlen;
740 
741 	size = MBLKL(mp);
742 	if (size < sizeof (struct ether_header))
743 		goto discard;
744 
745 	hdrlen = sizeof (struct ether_header);
746 
747 	ehp = (struct ether_header *)mp->b_rptr;
748 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
749 		struct ether_vlan_header	*evhp;
750 
751 		if (size < sizeof (struct ether_vlan_header))
752 			goto discard;
753 
754 		/*
755 		 * Replace vtag with our own
756 		 */
757 		evhp = (struct ether_vlan_header *)ehp;
758 		evhp->ether_tci = htons(VLAN_TCI(dsp->ds_pri,
759 		    ETHER_CFI, dsp->ds_vid));
760 		hdrlen = sizeof (struct ether_vlan_header);
761 	}
762 
763 	/*
764 	 * Check the packet is not too big and that any remaining
765 	 * fragment list is composed entirely of M_DATA messages. (We
766 	 * know the first fragment was M_DATA otherwise we could not
767 	 * have got here).
768 	 */
769 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
770 		if (DB_TYPE(bp) != M_DATA)
771 			goto discard;
772 		size += MBLKL(bp);
773 	}
774 
775 	if (size > dsp->ds_mip->mi_sdu_max + hdrlen)
776 		goto discard;
777 
778 	str_mdata_fastpath_put(dsp, mp);
779 	return;
780 
781 discard:
782 	freemsg(mp);
783 }
784 
785 /*
786  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
787  */
788 int
789 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
790 {
791 	int			err;
792 	const char		*drvname;
793 	char			name[MAXNAMELEN];
794 	dls_channel_t		dc;
795 	uint_t			addr_length;
796 
797 	ASSERT(dsp->ds_dc == NULL);
798 
799 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
800 		return (EINVAL);
801 
802 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
803 
804 	if (strcmp(drvname, "aggr") != 0 &&
805 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
806 		return (EINVAL);
807 
808 	/*
809 	 * Open a channel.
810 	 */
811 	if ((err = dls_open(name, &dc)) != 0) {
812 		(void) qassociate(dsp->ds_wq, -1);
813 		return (err);
814 	}
815 
816 	/*
817 	 * Cache the MAC interface handle, a pointer to the immutable MAC
818 	 * information and the current and 'factory' MAC address.
819 	 */
820 	dsp->ds_mh = dls_mac(dc);
821 	dsp->ds_mip = mac_info(dsp->ds_mh);
822 
823 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
824 
825 	addr_length = dsp->ds_mip->mi_addr_length;
826 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
827 
828 	/*
829 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
830 	 * a non-VLAN interface).
831 	 */
832 	dsp->ds_vid = dls_vid(dc);
833 
834 	/*
835 	 * Set the default packet priority.
836 	 */
837 	dsp->ds_pri = 0;
838 
839 	/*
840 	 * Add a notify function so that the we get updates from the MAC.
841 	 */
842 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
843 
844 	dsp->ds_dc = dc;
845 	dsp->ds_dlstate = DL_UNBOUND;
846 
847 	return (0);
848 }
849 
850 /*
851  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
852  * from close(2) for style 2.
853  */
854 void
855 dld_str_detach(dld_str_t *dsp)
856 {
857 	ASSERT(dsp->ds_thr == 0);
858 
859 	/*
860 	 * Remove the notify function.
861 	 */
862 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
863 
864 	/*
865 	 * Clear the polling and promisc flags.
866 	 */
867 	dsp->ds_polling = B_FALSE;
868 	dsp->ds_soft_ring = B_FALSE;
869 	dsp->ds_promisc = 0;
870 
871 	/*
872 	 * Close the channel.
873 	 */
874 	dls_close(dsp->ds_dc);
875 	dsp->ds_dc = NULL;
876 	dsp->ds_mh = NULL;
877 
878 	(void) qassociate(dsp->ds_wq, -1);
879 
880 	/*
881 	 * Re-initialize the DLPI state machine.
882 	 */
883 	dsp->ds_dlstate = DL_UNATTACHED;
884 
885 }
886 
887 /*
888  * Raw mode receive function.
889  */
890 /*ARGSUSED*/
891 void
892 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
893     size_t header_length)
894 {
895 	dld_str_t		*dsp = (dld_str_t *)arg;
896 	mblk_t			*next;
897 
898 	ASSERT(mp != NULL);
899 	do {
900 		/*
901 		 * Get the pointer to the next packet in the chain and then
902 		 * clear b_next before the packet gets passed on.
903 		 */
904 		next = mp->b_next;
905 		mp->b_next = NULL;
906 
907 		/*
908 		 * Wind back b_rptr to point at the MAC header.
909 		 */
910 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
911 		mp->b_rptr -= header_length;
912 		if (header_length == sizeof (struct ether_vlan_header)) {
913 			/*
914 			 * Strip off the vtag
915 			 */
916 			ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ,
917 			    2 * ETHERADDRL);
918 			mp->b_rptr += VLAN_TAGSZ;
919 		}
920 
921 		/*
922 		 * Pass the packet on.
923 		 */
924 		putnext(dsp->ds_rq, mp);
925 
926 		/*
927 		 * Move on to the next packet in the chain.
928 		 */
929 		mp = next;
930 	} while (mp != NULL);
931 }
932 
933 /*
934  * Fast-path receive function.
935  */
936 /*ARGSUSED*/
937 void
938 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
939     size_t header_length)
940 {
941 	dld_str_t		*dsp = (dld_str_t *)arg;
942 	mblk_t			*next;
943 
944 	ASSERT(mp != NULL);
945 	do {
946 		/*
947 		 * Get the pointer to the next packet in the chain and then
948 		 * clear b_next before the packet gets passed on.
949 		 */
950 		next = mp->b_next;
951 		mp->b_next = NULL;
952 
953 		/*
954 		 * Pass the packet on.
955 		 */
956 		putnext(dsp->ds_rq, mp);
957 
958 		/*
959 		 * Move on to the next packet in the chain.
960 		 */
961 		mp = next;
962 	} while (mp != NULL);
963 }
964 
965 /*
966  * Default receive function (send DL_UNITDATA_IND messages).
967  */
968 /*ARGSUSED*/
969 void
970 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
971     size_t header_length)
972 {
973 	dld_str_t		*dsp = (dld_str_t *)arg;
974 	mblk_t			*ud_mp;
975 	mblk_t			*next;
976 
977 	ASSERT(mp != NULL);
978 	do {
979 		/*
980 		 * Get the pointer to the next packet in the chain and then
981 		 * clear b_next before the packet gets passed on.
982 		 */
983 		next = mp->b_next;
984 		mp->b_next = NULL;
985 
986 		/*
987 		 * Wind back b_rptr to point at the MAC header.
988 		 */
989 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
990 		mp->b_rptr -= header_length;
991 
992 		/*
993 		 * Create the DL_UNITDATA_IND M_PROTO.
994 		 */
995 		if ((ud_mp = str_unitdata_ind(dsp, mp)) == NULL) {
996 			freemsgchain(mp);
997 			return;
998 		}
999 
1000 		/*
1001 		 * Advance b_rptr to point at the payload again.
1002 		 */
1003 		mp->b_rptr += header_length;
1004 
1005 		/*
1006 		 * Prepend the DL_UNITDATA_IND.
1007 		 */
1008 		ud_mp->b_cont = mp;
1009 
1010 		/*
1011 		 * Send the message.
1012 		 */
1013 		putnext(dsp->ds_rq, ud_mp);
1014 
1015 		/*
1016 		 * Move on to the next packet in the chain.
1017 		 */
1018 		mp = next;
1019 	} while (mp != NULL);
1020 }
1021 
1022 /*
1023  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1024  * current state of the interface.
1025  */
1026 void
1027 dld_str_notify_ind(dld_str_t *dsp)
1028 {
1029 	mac_notify_type_t	type;
1030 
1031 	for (type = 0; type < MAC_NNOTE; type++)
1032 		str_notify(dsp, type);
1033 }
1034 
1035 typedef struct dl_unitdata_ind_wrapper {
1036 	dl_unitdata_ind_t	dl_unitdata;
1037 	uint8_t			dl_dest_addr[MAXADDRLEN + sizeof (uint16_t)];
1038 	uint8_t			dl_src_addr[MAXADDRLEN + sizeof (uint16_t)];
1039 } dl_unitdata_ind_wrapper_t;
1040 
1041 /*
1042  * Create a DL_UNITDATA_IND M_PROTO message.
1043  */
1044 static mblk_t *
1045 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp)
1046 {
1047 	mblk_t				*nmp;
1048 	dl_unitdata_ind_wrapper_t	*dlwp;
1049 	dl_unitdata_ind_t		*dlp;
1050 	dls_header_info_t		dhi;
1051 	uint_t				addr_length;
1052 	uint8_t				*daddr;
1053 	uint8_t				*saddr;
1054 
1055 	/*
1056 	 * Get the packet header information.
1057 	 */
1058 	dls_header_info(dsp->ds_dc, mp, &dhi);
1059 
1060 	/*
1061 	 * Allocate a message large enough to contain the wrapper structure
1062 	 * defined above.
1063 	 */
1064 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1065 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1066 	    DL_UNITDATA_IND)) == NULL)
1067 		return (NULL);
1068 
1069 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1070 
1071 	dlp = &(dlwp->dl_unitdata);
1072 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1073 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1074 
1075 	/*
1076 	 * Copy in the destination address.
1077 	 */
1078 	addr_length = dsp->ds_mip->mi_addr_length;
1079 	daddr = dlwp->dl_dest_addr;
1080 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1081 	bcopy(dhi.dhi_daddr, daddr, addr_length);
1082 
1083 	/*
1084 	 * Set the destination DLSAP to our bound DLSAP value.
1085 	 */
1086 	*(uint16_t *)(daddr + addr_length) = dsp->ds_sap;
1087 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1088 
1089 	/*
1090 	 * If the destination address was a group address then
1091 	 * dl_group_address field should be non-zero.
1092 	 */
1093 	dlp->dl_group_address = dhi.dhi_isgroup;
1094 
1095 	/*
1096 	 * Copy in the source address.
1097 	 */
1098 	saddr = dlwp->dl_src_addr;
1099 	dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1100 	bcopy(dhi.dhi_saddr, saddr, addr_length);
1101 
1102 	/*
1103 	 * Set the source DLSAP to the packet ethertype.
1104 	 */
1105 	*(uint16_t *)(saddr + addr_length) = dhi.dhi_ethertype;
1106 	dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1107 
1108 	return (nmp);
1109 }
1110 
1111 /*
1112  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1113  */
1114 static void
1115 str_notify_promisc_on_phys(dld_str_t *dsp)
1116 {
1117 	mblk_t		*mp;
1118 	dl_notify_ind_t	*dlip;
1119 
1120 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1121 		return;
1122 
1123 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1124 	    M_PROTO, 0)) == NULL)
1125 		return;
1126 
1127 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1128 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1129 	dlip->dl_primitive = DL_NOTIFY_IND;
1130 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1131 
1132 	qreply(dsp->ds_wq, mp);
1133 }
1134 
1135 /*
1136  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1137  */
1138 static void
1139 str_notify_promisc_off_phys(dld_str_t *dsp)
1140 {
1141 	mblk_t		*mp;
1142 	dl_notify_ind_t	*dlip;
1143 
1144 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1145 		return;
1146 
1147 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1148 	    M_PROTO, 0)) == NULL)
1149 		return;
1150 
1151 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1152 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1153 	dlip->dl_primitive = DL_NOTIFY_IND;
1154 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1155 
1156 	qreply(dsp->ds_wq, mp);
1157 }
1158 
1159 /*
1160  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1161  */
1162 static void
1163 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1164 {
1165 	mblk_t		*mp;
1166 	dl_notify_ind_t	*dlip;
1167 	uint_t		addr_length;
1168 	uint16_t	ethertype;
1169 
1170 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1171 		return;
1172 
1173 	addr_length = dsp->ds_mip->mi_addr_length;
1174 	if ((mp = mexchange(dsp->ds_wq, NULL,
1175 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1176 	    M_PROTO, 0)) == NULL)
1177 		return;
1178 
1179 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1180 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1181 	dlip->dl_primitive = DL_NOTIFY_IND;
1182 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1183 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1184 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1185 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1186 
1187 	bcopy(addr, &dlip[1], addr_length);
1188 
1189 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1190 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1191 		ethertype;
1192 
1193 	qreply(dsp->ds_wq, mp);
1194 }
1195 
1196 /*
1197  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1198  */
1199 static void
1200 str_notify_link_up(dld_str_t *dsp)
1201 {
1202 	mblk_t		*mp;
1203 	dl_notify_ind_t	*dlip;
1204 
1205 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1206 		return;
1207 
1208 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1209 	    M_PROTO, 0)) == NULL)
1210 		return;
1211 
1212 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1213 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1214 	dlip->dl_primitive = DL_NOTIFY_IND;
1215 	dlip->dl_notification = DL_NOTE_LINK_UP;
1216 
1217 	qreply(dsp->ds_wq, mp);
1218 }
1219 
1220 /*
1221  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1222  */
1223 static void
1224 str_notify_link_down(dld_str_t *dsp)
1225 {
1226 	mblk_t		*mp;
1227 	dl_notify_ind_t	*dlip;
1228 
1229 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1230 		return;
1231 
1232 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1233 	    M_PROTO, 0)) == NULL)
1234 		return;
1235 
1236 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1237 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1238 	dlip->dl_primitive = DL_NOTIFY_IND;
1239 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1240 
1241 	qreply(dsp->ds_wq, mp);
1242 }
1243 
1244 /*
1245  * DL_NOTIFY_IND: DL_NOTE_SPEED
1246  */
1247 static void
1248 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1249 {
1250 	mblk_t		*mp;
1251 	dl_notify_ind_t	*dlip;
1252 
1253 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1254 		return;
1255 
1256 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1257 	    M_PROTO, 0)) == NULL)
1258 		return;
1259 
1260 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1261 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1262 	dlip->dl_primitive = DL_NOTIFY_IND;
1263 	dlip->dl_notification = DL_NOTE_SPEED;
1264 	dlip->dl_data = speed;
1265 
1266 	qreply(dsp->ds_wq, mp);
1267 }
1268 
1269 /*
1270  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1271  */
1272 static void
1273 str_notify_capab_reneg(dld_str_t *dsp)
1274 {
1275 	mblk_t		*mp;
1276 	dl_notify_ind_t	*dlip;
1277 
1278 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1279 		return;
1280 
1281 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1282 	    M_PROTO, 0)) == NULL)
1283 		return;
1284 
1285 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1286 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1287 	dlip->dl_primitive = DL_NOTIFY_IND;
1288 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1289 
1290 	qreply(dsp->ds_wq, mp);
1291 }
1292 
1293 /*
1294  * MAC notification callback.
1295  */
1296 static void
1297 str_notify(void *arg, mac_notify_type_t type)
1298 {
1299 	dld_str_t		*dsp = (dld_str_t *)arg;
1300 	queue_t			*q = dsp->ds_wq;
1301 
1302 	switch (type) {
1303 	case MAC_NOTE_TX:
1304 		qenable(q);
1305 		break;
1306 
1307 	case MAC_NOTE_DEVPROMISC:
1308 		/*
1309 		 * Send the appropriate DL_NOTIFY_IND.
1310 		 */
1311 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1312 			str_notify_promisc_on_phys(dsp);
1313 		else
1314 			str_notify_promisc_off_phys(dsp);
1315 		break;
1316 
1317 	case MAC_NOTE_PROMISC:
1318 		break;
1319 
1320 	case MAC_NOTE_UNICST:
1321 		/*
1322 		 * This notification is sent whenever the MAC unicast address
1323 		 * changes. We need to re-cache the address.
1324 		 */
1325 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1326 
1327 		/*
1328 		 * Send the appropriate DL_NOTIFY_IND.
1329 		 */
1330 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1331 		break;
1332 
1333 	case MAC_NOTE_LINK:
1334 		/*
1335 		 * This notification is sent every time the MAC driver
1336 		 * updates the link state.
1337 		 */
1338 		switch (mac_link_get(dsp->ds_mh)) {
1339 		case LINK_STATE_UP:
1340 			/*
1341 			 * The link is up so send the appropriate
1342 			 * DL_NOTIFY_IND.
1343 			 */
1344 			str_notify_link_up(dsp);
1345 
1346 			/*
1347 			 * If we can find the link speed then send a
1348 			 * DL_NOTIFY_IND for that too.
1349 			 */
1350 			if (dsp->ds_mip->mi_stat[MAC_STAT_IFSPEED]) {
1351 				uint64_t	val;
1352 
1353 				val = mac_stat_get(dsp->ds_mh,
1354 				    MAC_STAT_IFSPEED);
1355 				str_notify_speed(dsp,
1356 				    (uint32_t)(val / 1000ull));
1357 			}
1358 			break;
1359 
1360 		case LINK_STATE_DOWN:
1361 			/*
1362 			 * The link is down so send the appropriate
1363 			 * DL_NOTIFY_IND.
1364 			 */
1365 			str_notify_link_down(dsp);
1366 			break;
1367 
1368 		default:
1369 			break;
1370 		}
1371 		break;
1372 
1373 	case MAC_NOTE_RESOURCE:
1374 		/*
1375 		 * This notification is sent whenever the MAC resources
1376 		 * change. We need to renegotiate the capabilities.
1377 		 * Send the appropriate DL_NOTIFY_IND.
1378 		 */
1379 		str_notify_capab_reneg(dsp);
1380 		break;
1381 
1382 	default:
1383 		ASSERT(B_FALSE);
1384 		break;
1385 	}
1386 }
1387 
1388 /*
1389  * Enqueue one or more messages to the transmit queue.
1390  * Caller specifies the insertion position (head/tail).
1391  */
1392 void
1393 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1394 {
1395 	mblk_t	*tail;
1396 	queue_t *q = dsp->ds_wq;
1397 	uint_t	cnt, msgcnt;
1398 	uint_t	tot_cnt, tot_msgcnt;
1399 
1400 	ASSERT(DB_TYPE(mp) == M_DATA);
1401 	/* Calculate total size and count of the packet(s) */
1402 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1403 	    tail->b_next != NULL; tail = tail->b_next) {
1404 		ASSERT(DB_TYPE(tail) == M_DATA);
1405 		cnt += msgdsize(tail);
1406 		msgcnt++;
1407 	}
1408 
1409 	mutex_enter(&dsp->ds_tx_list_lock);
1410 	/*
1411 	 * If the queue depth would exceed the allowed threshold, drop
1412 	 * new packet(s) and drain those already in the queue.
1413 	 */
1414 	tot_cnt = dsp->ds_tx_cnt + cnt;
1415 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1416 
1417 	if (!head_insert &&
1418 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1419 		ASSERT(dsp->ds_tx_qbusy);
1420 		mutex_exit(&dsp->ds_tx_list_lock);
1421 		freemsgchain(mp);
1422 		goto done;
1423 	}
1424 
1425 	/* Update the queue size parameters */
1426 	dsp->ds_tx_cnt = tot_cnt;
1427 	dsp->ds_tx_msgcnt = tot_msgcnt;
1428 
1429 	/*
1430 	 * If the transmit queue is currently empty and we are
1431 	 * about to deposit the packet(s) there, switch mode to
1432 	 * "busy" and raise flow-control condition.
1433 	 */
1434 	if (!dsp->ds_tx_qbusy) {
1435 		dsp->ds_tx_qbusy = B_TRUE;
1436 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1437 		(void) putq(q, dsp->ds_tx_flow_mp);
1438 		dsp->ds_tx_flow_mp = NULL;
1439 	}
1440 
1441 	if (!head_insert) {
1442 		/* Tail insertion */
1443 		if (dsp->ds_tx_list_head == NULL)
1444 			dsp->ds_tx_list_head = mp;
1445 		else
1446 			dsp->ds_tx_list_tail->b_next = mp;
1447 		dsp->ds_tx_list_tail = tail;
1448 	} else {
1449 		/* Head insertion */
1450 		tail->b_next = dsp->ds_tx_list_head;
1451 		if (dsp->ds_tx_list_head == NULL)
1452 			dsp->ds_tx_list_tail = tail;
1453 		dsp->ds_tx_list_head = mp;
1454 	}
1455 	mutex_exit(&dsp->ds_tx_list_lock);
1456 done:
1457 	/* Schedule service thread to drain the transmit queue */
1458 	qenable(q);
1459 }
1460 
1461 void
1462 dld_tx_flush(dld_str_t *dsp)
1463 {
1464 	mutex_enter(&dsp->ds_tx_list_lock);
1465 	if (dsp->ds_tx_list_head != NULL) {
1466 		freemsgchain(dsp->ds_tx_list_head);
1467 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1468 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1469 		if (dsp->ds_tx_qbusy) {
1470 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1471 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1472 			dsp->ds_tx_qbusy = B_FALSE;
1473 		}
1474 	}
1475 	mutex_exit(&dsp->ds_tx_list_lock);
1476 }
1477 
1478 /*
1479  * Process an M_IOCTL message.
1480  */
1481 static void
1482 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1483 {
1484 	uint_t			cmd;
1485 
1486 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1487 	ASSERT(dsp->ds_type == DLD_DLPI);
1488 
1489 	switch (cmd) {
1490 	case DLIOCRAW:
1491 		ioc_raw(dsp, mp);
1492 		break;
1493 	case DLIOCHDRINFO:
1494 		ioc_fast(dsp, mp);
1495 		break;
1496 	default:
1497 		ioc(dsp, mp);
1498 	}
1499 }
1500 
1501 /*
1502  * DLIOCRAW
1503  */
1504 static void
1505 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1506 {
1507 	queue_t *q = dsp->ds_wq;
1508 
1509 	rw_enter(&dsp->ds_lock, RW_WRITER);
1510 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1511 		rw_exit(&dsp->ds_lock);
1512 		miocnak(q, mp, 0, EPROTO);
1513 		return;
1514 	}
1515 
1516 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1517 		/*
1518 		 * Set the receive callback.
1519 		 */
1520 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1521 
1522 		/*
1523 		 * Note that raw mode is enabled.
1524 		 */
1525 		dsp->ds_mode = DLD_RAW;
1526 	}
1527 
1528 	rw_exit(&dsp->ds_lock);
1529 	miocack(q, mp, 0, 0);
1530 }
1531 
1532 /*
1533  * DLIOCHDRINFO
1534  */
1535 static void
1536 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1537 {
1538 	dl_unitdata_req_t *dlp;
1539 	off_t		off;
1540 	size_t		len;
1541 	const uint8_t	*addr;
1542 	uint16_t	sap;
1543 	mblk_t		*nmp;
1544 	mblk_t		*hmp;
1545 	uint_t		addr_length;
1546 	queue_t		*q = dsp->ds_wq;
1547 	int		err;
1548 	dls_channel_t	dc;
1549 
1550 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1551 		err = ENOTSUP;
1552 		goto failed;
1553 	}
1554 
1555 	nmp = mp->b_cont;
1556 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1557 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1558 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1559 		err = EINVAL;
1560 		goto failed;
1561 	}
1562 
1563 	off = dlp->dl_dest_addr_offset;
1564 	len = dlp->dl_dest_addr_length;
1565 
1566 	if (!MBLKIN(nmp, off, len)) {
1567 		err = EINVAL;
1568 		goto failed;
1569 	}
1570 
1571 	rw_enter(&dsp->ds_lock, RW_READER);
1572 	if (dsp->ds_dlstate != DL_IDLE) {
1573 		rw_exit(&dsp->ds_lock);
1574 		err = ENOTSUP;
1575 		goto failed;
1576 	}
1577 
1578 	addr_length = dsp->ds_mip->mi_addr_length;
1579 	if (len != addr_length + sizeof (uint16_t)) {
1580 		rw_exit(&dsp->ds_lock);
1581 		err = EINVAL;
1582 		goto failed;
1583 	}
1584 
1585 	addr = nmp->b_rptr + off;
1586 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
1587 	dc = dsp->ds_dc;
1588 
1589 	if ((hmp = dls_header(dc, addr, sap, dsp->ds_pri)) == NULL) {
1590 		rw_exit(&dsp->ds_lock);
1591 		err = ENOMEM;
1592 		goto failed;
1593 	}
1594 
1595 	/*
1596 	 * This is a performance optimization.  We originally entered
1597 	 * as reader and only become writer upon transitioning into
1598 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
1599 	 * stay as reader and return the fast-path header to IP.
1600 	 */
1601 	if (dsp->ds_mode != DLD_FASTPATH) {
1602 		if (!rw_tryupgrade(&dsp->ds_lock)) {
1603 			rw_exit(&dsp->ds_lock);
1604 			rw_enter(&dsp->ds_lock, RW_WRITER);
1605 
1606 			/*
1607 			 * State may have changed before we re-acquired
1608 			 * the writer lock in case the upgrade failed.
1609 			 */
1610 			if (dsp->ds_dlstate != DL_IDLE) {
1611 				rw_exit(&dsp->ds_lock);
1612 				err = ENOTSUP;
1613 				goto failed;
1614 			}
1615 		}
1616 
1617 		/*
1618 		 * Set the receive callback (unless polling is enabled).
1619 		 */
1620 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
1621 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
1622 
1623 		/*
1624 		 * Note that fast-path mode is enabled.
1625 		 */
1626 		dsp->ds_mode = DLD_FASTPATH;
1627 	}
1628 	rw_exit(&dsp->ds_lock);
1629 
1630 	freemsg(nmp->b_cont);
1631 	nmp->b_cont = hmp;
1632 
1633 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
1634 	return;
1635 failed:
1636 	miocnak(q, mp, 0, err);
1637 }
1638 
1639 /*
1640  * Catch-all handler.
1641  */
1642 static void
1643 ioc(dld_str_t *dsp, mblk_t *mp)
1644 {
1645 	queue_t	*q = dsp->ds_wq;
1646 	mac_handle_t mh;
1647 
1648 	rw_enter(&dsp->ds_lock, RW_READER);
1649 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1650 		rw_exit(&dsp->ds_lock);
1651 		miocnak(q, mp, 0, EINVAL);
1652 		return;
1653 	}
1654 	mh = dsp->ds_mh;
1655 	ASSERT(mh != NULL);
1656 	rw_exit(&dsp->ds_lock);
1657 	mac_ioctl(mh, q, mp);
1658 }
1659 
1660 /*
1661  * Allocate a new minor number.
1662  */
1663 static minor_t
1664 dld_minor_hold(boolean_t sleep)
1665 {
1666 	minor_t		minor;
1667 
1668 	/*
1669 	 * Grab a value from the arena.
1670 	 */
1671 	atomic_add_32(&minor_count, 1);
1672 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
1673 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
1674 		atomic_add_32(&minor_count, -1);
1675 		return (0);
1676 	}
1677 
1678 	return (minor);
1679 }
1680 
1681 /*
1682  * Release a previously allocated minor number.
1683  */
1684 static void
1685 dld_minor_rele(minor_t minor)
1686 {
1687 	/*
1688 	 * Return the value to the arena.
1689 	 */
1690 	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);
1691 
1692 	atomic_add_32(&minor_count, -1);
1693 }
1694