xref: /illumos-gate/usr/src/uts/common/io/dld/dld_str.c (revision baaf27537df696d26823ed1dd9b5a0f45c988c17)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Data-Link Driver
31  */
32 
33 #include	<sys/stropts.h>
34 #include	<sys/strsun.h>
35 #include	<sys/strsubr.h>
36 #include	<sys/atomic.h>
37 #include	<sys/mkdev.h>
38 #include	<sys/vlan.h>
39 #include	<sys/dld.h>
40 #include	<sys/dld_impl.h>
41 #include	<sys/dls_impl.h>
42 #include	<inet/common.h>
43 
/*
 * Forward declarations for routines private to this file.
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);
static void	str_notify(void *, mac_notify_type_t);

static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *,  mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_ioc(dld_str_t *, mblk_t *);
static minor_t	dld_minor_hold(boolean_t);
static void	dld_minor_rele(minor_t);

/*
 * str_count:    number of dld_str_t objects outstanding; incremented in
 *               dld_str_create(), decremented in dld_str_destroy() and
 *               checked by dld_str_fini().
 * str_cachep:   kmem cache of dld_str_t objects, set up in dld_str_init().
 * minor_arenap: vmem arena used to hand out minor numbers, set up in
 *               dld_str_init().
 * minor_count:  checked by dld_str_fini(); presumably maintained by
 *               dld_minor_hold()/dld_minor_rele() -- TODO confirm.
 */
static uint32_t		str_count;
static kmem_cache_t	*str_cachep;
static vmem_t		*minor_arenap;
static uint32_t		minor_count;

/*
 * Convert between a minor number and the pointer-typed value used with
 * the minor number arena.
 */
#define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
#define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))
70 
71 /*
72  * Some notes on entry points, flow-control, queueing and locking:
73  *
74  * This driver exports the traditional STREAMS put entry point as well as
75  * the non-STREAMS fast-path transmit routine which is provided to IP via
76  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
77  * and data operations, while the fast-path routine deals only with M_DATA
78  * fast-path packets.  Regardless of the entry point, all outbound packets
79  * will end up in str_mdata_fastpath_put(), where they will be delivered to
80  * the MAC driver.
81  *
82  * The transmit logic operates in two modes: a "not busy" mode where the
83  * packets will be delivered to the MAC for a send attempt, or "busy" mode
84  * where they will be enqueued in the internal queue because of flow-control.
85  * Flow-control happens when the MAC driver indicates the packets couldn't
86  * be transmitted due to lack of resources (e.g. running out of descriptors).
87  * In such case, the driver will place a dummy message on its write-side
88  * STREAMS queue so that the queue is marked as "full".  Any subsequent
89  * packets arriving at the driver will be enqueued in the internal queue,
90  * which is drained in the context of the service thread that gets scheduled
91  * whenever the driver is in the "busy" mode.  When all packets have been
92  * successfully delivered by MAC and the internal queue is empty, it will
93  * transition to the "not busy" mode by removing the dummy message from the
94  * write-side STREAMS queue; in effect this will trigger backenabling.
95  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
96  * to the above reasons.
97  *
98  * The driver implements an internal transmit queue independent of STREAMS.
99  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
100  * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
101  * getq() operations done by the driver are those related to placing and
102  * removing the dummy message to/from the write-side STREAMS queue for flow-
103  * control purposes.
104  *
105  * Locking is done independent of STREAMS due to the driver being fully MT.
106  * Threads entering the driver (either from put or service entry points)
107  * will most likely be readers, with the exception of a few writer cases
108  * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
109  * DLD-related ioctl requests.  The DLPI detach case is special, because
110  * it involves freeing resources and therefore must be single-threaded.
111  * Unfortunately the readers/writers lock can't be used to protect against
112  * it, because the lock is dropped prior to the driver calling places where
113  * putnext() may be invoked, and such places may depend on those resources
114  * to exist.  Because of this, the driver always completes the DLPI detach
115  * process when there are no other threads running in the driver.  This is
116  * done by keeping track of the number of threads, such that the last
117  * thread leaving the driver will finish the pending DLPI detach operation.
118  */
119 
120 /*
121  * dld_max_q_count is the queue depth threshold used to limit the number of
122  * outstanding packets or bytes allowed in the queue; once this limit is
123  * reached the driver will free any incoming ones until the queue depth
124  * drops below the threshold.
125  *
126  * This buffering is provided to accomodate clients which do not employ
127  * their own buffering scheme, and to handle occasional packet bursts.
128  * Clients which handle their own buffering will receive positive feedback
129  * from this driver as soon as it transitions into the "busy" state, i.e.
130  * when the queue is initially filled up; they will get backenabled once
131  * the queue is empty.
132  *
133  * The value chosen here is rather arbitrary; in future some intelligent
134  * heuristics may be involved which could take into account the hardware's
135  * transmit ring size, etc.
136  */
137 uint_t dld_max_q_count = (16 * 1024 *1024);
138 
139 static dev_info_t *
140 dld_finddevinfo(dev_t dev)
141 {
142 	minor_t		minor = getminor(dev);
143 	char		*drvname = ddi_major_to_name(getmajor(dev));
144 	char		name[MAXNAMELEN];
145 	dls_vlan_t	*dvp = NULL;
146 	dev_info_t	*dip = NULL;
147 
148 	if (drvname == NULL || minor == 0 || minor > DLD_MAX_PPA + 1)
149 		return (NULL);
150 
151 	(void) snprintf(name, MAXNAMELEN, "%s%d", drvname, (int)minor - 1);
152 	if (dls_vlan_hold(name, &dvp, B_FALSE) != 0)
153 		return (NULL);
154 
155 	dip = mac_devinfo_get(dvp->dv_dlp->dl_mh);
156 	dls_vlan_rele(dvp);
157 	return (dip);
158 }
159 
160 /*
161  * devo_getinfo: getinfo(9e)
162  */
163 /*ARGSUSED*/
164 int
165 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
166 {
167 	dev_info_t	*devinfo;
168 	minor_t		minor = getminor((dev_t)arg);
169 	int		rc = DDI_FAILURE;
170 
171 	switch (cmd) {
172 	case DDI_INFO_DEVT2DEVINFO:
173 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
174 			*(dev_info_t **)resp = devinfo;
175 			rc = DDI_SUCCESS;
176 		}
177 		break;
178 	case DDI_INFO_DEVT2INSTANCE:
179 		if (minor > 0 && minor <= DLD_MAX_PPA + 1) {
180 			*(int *)resp = (int)minor - 1;
181 			rc = DDI_SUCCESS;
182 		}
183 		break;
184 	}
185 	return (rc);
186 }
187 
188 /*
189  * qi_qopen: open(9e)
190  */
191 /*ARGSUSED*/
192 int
193 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
194 {
195 	dld_str_t	*dsp;
196 	major_t		major;
197 	minor_t		minor;
198 	int		err;
199 
200 	if (sflag == MODOPEN)
201 		return (ENOTSUP);
202 
203 	/*
204 	 * This is a cloning driver and therefore each queue should only
205 	 * ever get opened once.
206 	 */
207 	if (rq->q_ptr != NULL)
208 		return (EBUSY);
209 
210 	major = getmajor(*devp);
211 	minor = getminor(*devp);
212 	if (minor > DLD_MAX_MINOR)
213 		return (ENODEV);
214 
215 	/*
216 	 * Create a new dld_str_t for the stream. This will grab a new minor
217 	 * number that will be handed back in the cloned dev_t.  Creation may
218 	 * fail if we can't allocate the dummy mblk used for flow-control.
219 	 */
220 	dsp = dld_str_create(rq, DLD_DLPI, major,
221 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
222 	if (dsp == NULL)
223 		return (ENOSR);
224 
225 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
226 	if (minor != 0) {
227 		/*
228 		 * Style 1 open
229 		 */
230 
231 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
232 			goto failed;
233 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
234 	} else {
235 		(void) qassociate(rq, -1);
236 	}
237 
238 	/*
239 	 * Enable the queue srv(9e) routine.
240 	 */
241 	qprocson(rq);
242 
243 	/*
244 	 * Construct a cloned dev_t to hand back.
245 	 */
246 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
247 	return (0);
248 
249 failed:
250 	dld_str_destroy(dsp);
251 	return (err);
252 }
253 
254 /*
255  * qi_qclose: close(9e)
256  */
257 int
258 dld_close(queue_t *rq)
259 {
260 	dld_str_t	*dsp = rq->q_ptr;
261 
262 	ASSERT(dsp->ds_task_id == NULL);
263 
264 	/*
265 	 * Disable the queue srv(9e) routine.
266 	 */
267 	qprocsoff(rq);
268 
269 	/*
270 	 * At this point we can not be entered by any threads via STREAMS
271 	 * or the direct call interface, which is available only to IP.
272 	 * After the interface is unplumbed, IP wouldn't have any reference
273 	 * to this instance, and therefore we are now effectively single
274 	 * threaded and don't require any lock protection.  Flush all
275 	 * pending packets which are sitting in the transmit queue.
276 	 */
277 	ASSERT(dsp->ds_thr == 0);
278 	dld_tx_flush(dsp);
279 
280 	/*
281 	 * This stream was open to a provider node. Check to see
282 	 * if it has been cleanly shut down.
283 	 */
284 	if (dsp->ds_dlstate != DL_UNATTACHED) {
285 		/*
286 		 * The stream is either open to a style 1 provider or
287 		 * this is not clean shutdown. Detach from the PPA.
288 		 * (This is still ok even in the style 1 case).
289 		 */
290 		dld_str_detach(dsp);
291 	}
292 
293 	dld_str_destroy(dsp);
294 	return (0);
295 }
296 
297 /*
298  * qi_qputp: put(9e)
299  */
300 void
301 dld_wput(queue_t *wq, mblk_t *mp)
302 {
303 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
304 
305 	DLD_ENTER(dsp);
306 
307 	switch (DB_TYPE(mp)) {
308 	case M_DATA:
309 		rw_enter(&dsp->ds_lock, RW_READER);
310 		if (dsp->ds_dlstate != DL_IDLE ||
311 		    dsp->ds_mode == DLD_UNITDATA) {
312 			freemsg(mp);
313 		} else if (dsp->ds_mode == DLD_FASTPATH) {
314 			str_mdata_fastpath_put(dsp, mp);
315 		} else if (dsp->ds_mode == DLD_RAW) {
316 			str_mdata_raw_put(dsp, mp);
317 		}
318 		rw_exit(&dsp->ds_lock);
319 		break;
320 	case M_PROTO:
321 	case M_PCPROTO:
322 		dld_proto(dsp, mp);
323 		break;
324 	case M_IOCTL:
325 		dld_ioc(dsp, mp);
326 		break;
327 	case M_FLUSH:
328 		if (*mp->b_rptr & FLUSHW) {
329 			dld_tx_flush(dsp);
330 			*mp->b_rptr &= ~FLUSHW;
331 		}
332 
333 		if (*mp->b_rptr & FLUSHR) {
334 			qreply(wq, mp);
335 		} else {
336 			freemsg(mp);
337 		}
338 		break;
339 	default:
340 		freemsg(mp);
341 		break;
342 	}
343 
344 	DLD_EXIT(dsp);
345 }
346 
347 /*
348  * qi_srvp: srv(9e)
349  */
350 void
351 dld_wsrv(queue_t *wq)
352 {
353 	mblk_t		*mp;
354 	dld_str_t	*dsp = wq->q_ptr;
355 
356 	DLD_ENTER(dsp);
357 	rw_enter(&dsp->ds_lock, RW_READER);
358 	/*
359 	 * Grab all packets (chained via b_next) off our transmit queue
360 	 * and try to send them all to the MAC layer.  Since the queue
361 	 * is independent of streams, we are able to dequeue all messages
362 	 * at once without looping through getq() and manually chaining
363 	 * them.  Note that the queue size parameters (byte and message
364 	 * counts) are cleared as well, but we postpone the backenabling
365 	 * until after the MAC transmit since some packets may end up
366 	 * back at our transmit queue.
367 	 */
368 	mutex_enter(&dsp->ds_tx_list_lock);
369 	if ((mp = dsp->ds_tx_list_head) == NULL) {
370 		ASSERT(!dsp->ds_tx_qbusy);
371 		ASSERT(dsp->ds_tx_flow_mp != NULL);
372 		ASSERT(dsp->ds_tx_list_head == NULL);
373 		ASSERT(dsp->ds_tx_list_tail == NULL);
374 		ASSERT(dsp->ds_tx_cnt == 0);
375 		ASSERT(dsp->ds_tx_msgcnt == 0);
376 		mutex_exit(&dsp->ds_tx_list_lock);
377 		goto done;
378 	}
379 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
380 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
381 	mutex_exit(&dsp->ds_tx_list_lock);
382 
383 	/*
384 	 * Discard packets unless we are attached and bound; note that
385 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
386 	 * because regardless of the mode all transmit will end up in
387 	 * str_mdata_fastpath_put() where the packets may be queued.
388 	 */
389 	ASSERT(DB_TYPE(mp) == M_DATA);
390 	if (dsp->ds_dlstate != DL_IDLE) {
391 		freemsgchain(mp);
392 		goto done;
393 	}
394 
395 	/*
396 	 * Attempt to transmit one or more packets.  If the MAC can't
397 	 * send them all, re-queue the packet(s) at the beginning of
398 	 * the transmit queue to avoid any re-ordering.
399 	 */
400 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
401 		dld_tx_enqueue(dsp, mp, B_TRUE);
402 
403 	/*
404 	 * Grab the list lock again and check if the transmit queue is
405 	 * really empty; if so, lift up flow-control and backenable any
406 	 * writer queues.  If the queue is not empty, schedule service
407 	 * thread to drain it.
408 	 */
409 	mutex_enter(&dsp->ds_tx_list_lock);
410 	if (dsp->ds_tx_list_head == NULL) {
411 		dsp->ds_tx_flow_mp = getq(wq);
412 		ASSERT(dsp->ds_tx_flow_mp != NULL);
413 		dsp->ds_tx_qbusy = B_FALSE;
414 	}
415 	mutex_exit(&dsp->ds_tx_list_lock);
416 done:
417 	rw_exit(&dsp->ds_lock);
418 	DLD_EXIT(dsp);
419 }
420 
421 void
422 dld_init_ops(struct dev_ops *ops, const char *name)
423 {
424 	struct streamtab *stream;
425 	struct qinit *rq, *wq;
426 	struct module_info *modinfo;
427 
428 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
429 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
430 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
431 	modinfo->mi_minpsz = 0;
432 	modinfo->mi_maxpsz = 64*1024;
433 	modinfo->mi_hiwat  = 1;
434 	modinfo->mi_lowat = 0;
435 
436 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
437 	rq->qi_qopen = dld_open;
438 	rq->qi_qclose = dld_close;
439 	rq->qi_minfo = modinfo;
440 
441 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
442 	wq->qi_putp = (pfi_t)dld_wput;
443 	wq->qi_srvp = (pfi_t)dld_wsrv;
444 	wq->qi_minfo = modinfo;
445 
446 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
447 	stream->st_rdinit = rq;
448 	stream->st_wrinit = wq;
449 	ops->devo_cb_ops->cb_str = stream;
450 
451 	ops->devo_getinfo = &dld_getinfo;
452 }
453 
454 void
455 dld_fini_ops(struct dev_ops *ops)
456 {
457 	struct streamtab *stream;
458 	struct qinit *rq, *wq;
459 	struct module_info *modinfo;
460 
461 	stream = ops->devo_cb_ops->cb_str;
462 	rq = stream->st_rdinit;
463 	wq = stream->st_wrinit;
464 	modinfo = rq->qi_minfo;
465 	ASSERT(wq->qi_minfo == modinfo);
466 
467 	kmem_free(stream, sizeof (struct streamtab));
468 	kmem_free(wq, sizeof (struct qinit));
469 	kmem_free(rq, sizeof (struct qinit));
470 	kmem_free(modinfo->mi_idname, FMNAMESZ);
471 	kmem_free(modinfo, sizeof (struct module_info));
472 }
473 
474 /*
475  * Initialize this module's data structures.
476  */
477 void
478 dld_str_init(void)
479 {
480 	/*
481 	 * Create dld_str_t object cache.
482 	 */
483 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
484 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
485 	ASSERT(str_cachep != NULL);
486 
487 	/*
488 	 * Allocate a vmem arena to manage minor numbers. The range of the
489 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
490 	 * minor number).
491 	 */
492 	minor_arenap = vmem_create("dld_minor_arena",
493 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
494 	    VM_SLEEP | VMC_IDENTIFIER);
495 	ASSERT(minor_arenap != NULL);
496 }
497 
498 /*
499  * Tear down this module's data structures.
500  */
501 int
502 dld_str_fini(void)
503 {
504 	/*
505 	 * Make sure that there are no objects in use.
506 	 */
507 	if (str_count != 0)
508 		return (EBUSY);
509 
510 	/*
511 	 * Check to see if there are any minor numbers still in use.
512 	 */
513 	if (minor_count != 0)
514 		return (EBUSY);
515 
516 	/*
517 	 * Destroy object cache.
518 	 */
519 	kmem_cache_destroy(str_cachep);
520 	vmem_destroy(minor_arenap);
521 	return (0);
522 }
523 
524 /*
525  * Create a new dld_str_t object.
526  */
527 dld_str_t *
528 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
529 {
530 	dld_str_t	*dsp;
531 
532 	/*
533 	 * Allocate an object from the cache.
534 	 */
535 	atomic_add_32(&str_count, 1);
536 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
537 
538 	/*
539 	 * Allocate the dummy mblk for flow-control.
540 	 */
541 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
542 	if (dsp->ds_tx_flow_mp == NULL) {
543 		kmem_cache_free(str_cachep, dsp);
544 		atomic_add_32(&str_count, -1);
545 		return (NULL);
546 	}
547 	dsp->ds_type = type;
548 	dsp->ds_major = major;
549 	dsp->ds_style = style;
550 
551 	/*
552 	 * Initialize the queue pointers.
553 	 */
554 	ASSERT(RD(rq) == rq);
555 	dsp->ds_rq = rq;
556 	dsp->ds_wq = WR(rq);
557 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
558 
559 	/*
560 	 * We want explicit control over our write-side STREAMS queue
561 	 * where the dummy mblk gets added/removed for flow-control.
562 	 */
563 	noenable(WR(rq));
564 
565 	return (dsp);
566 }
567 
568 /*
569  * Destroy a dld_str_t object.
570  */
571 void
572 dld_str_destroy(dld_str_t *dsp)
573 {
574 	queue_t		*rq;
575 	queue_t		*wq;
576 
577 	/*
578 	 * Clear the queue pointers.
579 	 */
580 	rq = dsp->ds_rq;
581 	wq = dsp->ds_wq;
582 	ASSERT(wq == WR(rq));
583 
584 	rq->q_ptr = wq->q_ptr = NULL;
585 	dsp->ds_rq = dsp->ds_wq = NULL;
586 
587 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
588 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
589 	ASSERT(dsp->ds_tx_list_head == NULL);
590 	ASSERT(dsp->ds_tx_list_tail == NULL);
591 	ASSERT(dsp->ds_tx_cnt == 0);
592 	ASSERT(dsp->ds_tx_msgcnt == 0);
593 	ASSERT(!dsp->ds_tx_qbusy);
594 
595 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
596 	ASSERT(dsp->ds_thr == 0);
597 	ASSERT(dsp->ds_detach_req == NULL);
598 
599 	/*
600 	 * Reinitialize all the flags.
601 	 */
602 	dsp->ds_notifications = 0;
603 	dsp->ds_passivestate = DLD_UNINITIALIZED;
604 	dsp->ds_mode = DLD_UNITDATA;
605 
606 	/*
607 	 * Free the dummy mblk if exists.
608 	 */
609 	if (dsp->ds_tx_flow_mp != NULL) {
610 		freeb(dsp->ds_tx_flow_mp);
611 		dsp->ds_tx_flow_mp = NULL;
612 	}
613 	/*
614 	 * Free the object back to the cache.
615 	 */
616 	kmem_cache_free(str_cachep, dsp);
617 	atomic_add_32(&str_count, -1);
618 }
619 
620 /*
621  * kmem_cache contructor function: see kmem_cache_create(9f).
622  */
623 /*ARGSUSED*/
624 static int
625 str_constructor(void *buf, void *cdrarg, int kmflags)
626 {
627 	dld_str_t	*dsp = buf;
628 
629 	bzero(buf, sizeof (dld_str_t));
630 
631 	/*
632 	 * Allocate a new minor number.
633 	 */
634 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
635 		return (-1);
636 
637 	/*
638 	 * Initialize the DLPI state machine.
639 	 */
640 	dsp->ds_dlstate = DL_UNATTACHED;
641 
642 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
643 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
644 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
645 
646 	return (0);
647 }
648 
649 /*
650  * kmem_cache destructor function.
651  */
652 /*ARGSUSED*/
653 static void
654 str_destructor(void *buf, void *cdrarg)
655 {
656 	dld_str_t	*dsp = buf;
657 
658 	/*
659 	 * Make sure the DLPI state machine was reset.
660 	 */
661 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
662 
663 	/*
664 	 * Make sure the data-link interface was closed.
665 	 */
666 	ASSERT(dsp->ds_mh == NULL);
667 	ASSERT(dsp->ds_dc == NULL);
668 
669 	/*
670 	 * Make sure enabled notifications are cleared.
671 	 */
672 	ASSERT(dsp->ds_notifications == 0);
673 
674 	/*
675 	 * Make sure polling is disabled.
676 	 */
677 	ASSERT(!dsp->ds_polling);
678 
679 	/*
680 	 * Release the minor number.
681 	 */
682 	dld_minor_rele(dsp->ds_minor);
683 
684 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
685 	rw_destroy(&dsp->ds_lock);
686 
687 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
688 	mutex_destroy(&dsp->ds_tx_list_lock);
689 	ASSERT(dsp->ds_tx_flow_mp == NULL);
690 
691 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
692 	mutex_destroy(&dsp->ds_thr_lock);
693 	ASSERT(dsp->ds_detach_req == NULL);
694 }
695 
696 /*
697  * M_DATA put (IP fast-path mode)
698  */
699 void
700 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
701 {
702 	/*
703 	 * This function can be called from within dld or from an upper
704 	 * layer protocol (currently only tcp). If we are in the busy
705 	 * mode enqueue the packet(s) and return.  Otherwise hand them
706 	 * over to the MAC driver for transmission; any remaining one(s)
707 	 * which didn't get sent will be queued.
708 	 *
709 	 * Note here that we don't grab the list lock prior to checking
710 	 * the busy flag.  This is okay, because a missed transition
711 	 * will not cause any packet reordering for any particular TCP
712 	 * connection (which is single-threaded).  The enqueue routine
713 	 * will atomically set the busy flag and schedule the service
714 	 * thread to run; the flag is only cleared by the service thread
715 	 * when there is no more packet to be transmitted.
716 	 */
717 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
718 		dld_tx_enqueue(dsp, mp, B_FALSE);
719 }
720 
721 /*
722  * M_DATA put (raw mode)
723  */
724 void
725 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
726 {
727 	struct ether_header	*ehp;
728 	mblk_t			*bp;
729 	size_t			size;
730 	size_t			hdrlen;
731 
732 	size = MBLKL(mp);
733 	if (size < sizeof (struct ether_header))
734 		goto discard;
735 
736 	hdrlen = sizeof (struct ether_header);
737 
738 	ehp = (struct ether_header *)mp->b_rptr;
739 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
740 		struct ether_vlan_header	*evhp;
741 
742 		if (size < sizeof (struct ether_vlan_header))
743 			goto discard;
744 
745 		/*
746 		 * Replace vtag with our own
747 		 */
748 		evhp = (struct ether_vlan_header *)ehp;
749 		evhp->ether_tci = htons(VLAN_TCI(dsp->ds_pri,
750 		    ETHER_CFI, dsp->ds_vid));
751 		hdrlen = sizeof (struct ether_vlan_header);
752 	}
753 
754 	/*
755 	 * Check the packet is not too big and that any remaining
756 	 * fragment list is composed entirely of M_DATA messages. (We
757 	 * know the first fragment was M_DATA otherwise we could not
758 	 * have got here).
759 	 */
760 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
761 		if (DB_TYPE(bp) != M_DATA)
762 			goto discard;
763 		size += MBLKL(bp);
764 	}
765 
766 	if (size > dsp->ds_mip->mi_sdu_max + hdrlen)
767 		goto discard;
768 
769 	str_mdata_fastpath_put(dsp, mp);
770 	return;
771 
772 discard:
773 	freemsg(mp);
774 }
775 
776 /*
777  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
778  */
779 int
780 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
781 {
782 	int			err;
783 	const char		*drvname;
784 	char			name[MAXNAMELEN];
785 	dls_channel_t		dc;
786 	uint_t			addr_length;
787 
788 	ASSERT(dsp->ds_dc == NULL);
789 
790 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
791 		return (EINVAL);
792 
793 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
794 
795 	if (strcmp(drvname, "aggr") != 0 &&
796 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
797 		return (EINVAL);
798 
799 	/*
800 	 * Open a channel.
801 	 */
802 	if ((err = dls_open(name, &dc)) != 0) {
803 		(void) qassociate(dsp->ds_wq, -1);
804 		return (err);
805 	}
806 
807 	/*
808 	 * Cache the MAC interface handle, a pointer to the immutable MAC
809 	 * information and the current and 'factory' MAC address.
810 	 */
811 	dsp->ds_mh = dls_mac(dc);
812 	dsp->ds_mip = mac_info(dsp->ds_mh);
813 
814 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
815 
816 	addr_length = dsp->ds_mip->mi_addr_length;
817 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
818 
819 	/*
820 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
821 	 * a non-VLAN interface).
822 	 */
823 	dsp->ds_vid = dls_vid(dc);
824 
825 	/*
826 	 * Set the default packet priority.
827 	 */
828 	dsp->ds_pri = 0;
829 
830 	/*
831 	 * Add a notify function so that the we get updates from the MAC.
832 	 */
833 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
834 
835 	dsp->ds_dc = dc;
836 	dsp->ds_dlstate = DL_UNBOUND;
837 
838 	return (0);
839 }
840 
841 /*
842  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
843  * from close(2) for style 2.
844  */
845 void
846 dld_str_detach(dld_str_t *dsp)
847 {
848 	ASSERT(dsp->ds_thr == 0);
849 
850 	/*
851 	 * Remove the notify function.
852 	 */
853 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
854 
855 	/*
856 	 * Re-initialize the DLPI state machine.
857 	 */
858 	dsp->ds_dlstate = DL_UNATTACHED;
859 
860 	/*
861 	 * Clear the polling and promisc flags.
862 	 */
863 	dsp->ds_polling = B_FALSE;
864 	dsp->ds_soft_ring = B_FALSE;
865 	dsp->ds_promisc = 0;
866 
867 	/*
868 	 * Close the channel.
869 	 */
870 	dls_close(dsp->ds_dc);
871 	dsp->ds_dc = NULL;
872 	dsp->ds_mh = NULL;
873 
874 	(void) qassociate(dsp->ds_wq, -1);
875 }
876 
877 /*
878  * Raw mode receive function.
879  */
880 /*ARGSUSED*/
881 void
882 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
883     size_t header_length)
884 {
885 	dld_str_t		*dsp = (dld_str_t *)arg;
886 	mblk_t			*next;
887 
888 	ASSERT(mp != NULL);
889 	do {
890 		/*
891 		 * Get the pointer to the next packet in the chain and then
892 		 * clear b_next before the packet gets passed on.
893 		 */
894 		next = mp->b_next;
895 		mp->b_next = NULL;
896 
897 		/*
898 		 * Wind back b_rptr to point at the MAC header.
899 		 */
900 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
901 		mp->b_rptr -= header_length;
902 		if (header_length == sizeof (struct ether_vlan_header)) {
903 			/*
904 			 * Strip off the vtag
905 			 */
906 			ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ,
907 			    2 * ETHERADDRL);
908 			mp->b_rptr += VLAN_TAGSZ;
909 		}
910 
911 		/*
912 		 * Pass the packet on.
913 		 */
914 		putnext(dsp->ds_rq, mp);
915 
916 		/*
917 		 * Move on to the next packet in the chain.
918 		 */
919 		mp = next;
920 	} while (mp != NULL);
921 }
922 
923 /*
924  * Fast-path receive function.
925  */
926 /*ARGSUSED*/
927 void
928 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
929     size_t header_length)
930 {
931 	dld_str_t		*dsp = (dld_str_t *)arg;
932 	mblk_t			*next;
933 
934 	ASSERT(mp != NULL);
935 	do {
936 		/*
937 		 * Get the pointer to the next packet in the chain and then
938 		 * clear b_next before the packet gets passed on.
939 		 */
940 		next = mp->b_next;
941 		mp->b_next = NULL;
942 
943 		/*
944 		 * Pass the packet on.
945 		 */
946 		putnext(dsp->ds_rq, mp);
947 
948 		/*
949 		 * Move on to the next packet in the chain.
950 		 */
951 		mp = next;
952 	} while (mp != NULL);
953 }
954 
955 /*
956  * Default receive function (send DL_UNITDATA_IND messages).
957  */
958 /*ARGSUSED*/
959 void
960 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
961     size_t header_length)
962 {
963 	dld_str_t		*dsp = (dld_str_t *)arg;
964 	mblk_t			*ud_mp;
965 	mblk_t			*next;
966 
967 	ASSERT(mp != NULL);
968 	do {
969 		/*
970 		 * Get the pointer to the next packet in the chain and then
971 		 * clear b_next before the packet gets passed on.
972 		 */
973 		next = mp->b_next;
974 		mp->b_next = NULL;
975 
976 		/*
977 		 * Wind back b_rptr to point at the MAC header.
978 		 */
979 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
980 		mp->b_rptr -= header_length;
981 
982 		/*
983 		 * Create the DL_UNITDATA_IND M_PROTO.
984 		 */
985 		if ((ud_mp = str_unitdata_ind(dsp, mp)) == NULL) {
986 			freemsgchain(mp);
987 			return;
988 		}
989 
990 		/*
991 		 * Advance b_rptr to point at the payload again.
992 		 */
993 		mp->b_rptr += header_length;
994 
995 		/*
996 		 * Prepend the DL_UNITDATA_IND.
997 		 */
998 		ud_mp->b_cont = mp;
999 
1000 		/*
1001 		 * Send the message.
1002 		 */
1003 		putnext(dsp->ds_rq, ud_mp);
1004 
1005 		/*
1006 		 * Move on to the next packet in the chain.
1007 		 */
1008 		mp = next;
1009 	} while (mp != NULL);
1010 }
1011 
1012 /*
1013  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1014  * current state of the interface.
1015  */
1016 void
1017 dld_str_notify_ind(dld_str_t *dsp)
1018 {
1019 	mac_notify_type_t	type;
1020 
1021 	for (type = 0; type < MAC_NNOTE; type++)
1022 		str_notify(dsp, type);
1023 }
1024 
/*
 * Layout of the DL_UNITDATA_IND message built by str_unitdata_ind(): the
 * DLPI primitive itself, followed by buffers for the destination and the
 * source address.  Each buffer has room for an address of up to
 * MAXADDRLEN bytes plus the 16-bit value that str_unitdata_ind() stores
 * right after the address.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;
	uint8_t			dl_dest_addr[MAXADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;
1030 
1031 /*
1032  * Create a DL_UNITDATA_IND M_PROTO message.
1033  */
1034 static mblk_t *
1035 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp)
1036 {
1037 	mblk_t				*nmp;
1038 	dl_unitdata_ind_wrapper_t	*dlwp;
1039 	dl_unitdata_ind_t		*dlp;
1040 	dls_header_info_t		dhi;
1041 	uint_t				addr_length;
1042 	uint8_t				*daddr;
1043 	uint8_t				*saddr;
1044 
1045 	/*
1046 	 * Get the packet header information.
1047 	 */
1048 	dls_header_info(dsp->ds_dc, mp, &dhi);
1049 
1050 	/*
1051 	 * Allocate a message large enough to contain the wrapper structure
1052 	 * defined above.
1053 	 */
1054 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1055 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1056 	    DL_UNITDATA_IND)) == NULL)
1057 		return (NULL);
1058 
1059 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1060 
1061 	dlp = &(dlwp->dl_unitdata);
1062 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1063 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1064 
1065 	/*
1066 	 * Copy in the destination address.
1067 	 */
1068 	addr_length = dsp->ds_mip->mi_addr_length;
1069 	daddr = dlwp->dl_dest_addr;
1070 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1071 	bcopy(dhi.dhi_daddr, daddr, addr_length);
1072 
1073 	/*
1074 	 * Set the destination DLSAP to our bound DLSAP value.
1075 	 */
1076 	*(uint16_t *)(daddr + addr_length) = dsp->ds_sap;
1077 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1078 
1079 	/*
1080 	 * If the destination address was a group address then
1081 	 * dl_group_address field should be non-zero.
1082 	 */
1083 	dlp->dl_group_address = dhi.dhi_isgroup;
1084 
1085 	/*
1086 	 * Copy in the source address.
1087 	 */
1088 	saddr = dlwp->dl_src_addr;
1089 	dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1090 	bcopy(dhi.dhi_saddr, saddr, addr_length);
1091 
1092 	/*
1093 	 * Set the source DLSAP to the packet ethertype.
1094 	 */
1095 	*(uint16_t *)(saddr + addr_length) = dhi.dhi_ethertype;
1096 	dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1097 
1098 	return (nmp);
1099 }
1100 
1101 /*
1102  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1103  */
1104 static void
1105 str_notify_promisc_on_phys(dld_str_t *dsp)
1106 {
1107 	mblk_t		*mp;
1108 	dl_notify_ind_t	*dlip;
1109 
1110 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1111 		return;
1112 
1113 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1114 	    M_PROTO, 0)) == NULL)
1115 		return;
1116 
1117 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1118 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1119 	dlip->dl_primitive = DL_NOTIFY_IND;
1120 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1121 
1122 	qreply(dsp->ds_wq, mp);
1123 }
1124 
1125 /*
1126  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1127  */
1128 static void
1129 str_notify_promisc_off_phys(dld_str_t *dsp)
1130 {
1131 	mblk_t		*mp;
1132 	dl_notify_ind_t	*dlip;
1133 
1134 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1135 		return;
1136 
1137 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1138 	    M_PROTO, 0)) == NULL)
1139 		return;
1140 
1141 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1142 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1143 	dlip->dl_primitive = DL_NOTIFY_IND;
1144 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1145 
1146 	qreply(dsp->ds_wq, mp);
1147 }
1148 
1149 /*
1150  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1151  */
1152 static void
1153 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1154 {
1155 	mblk_t		*mp;
1156 	dl_notify_ind_t	*dlip;
1157 	uint_t		addr_length;
1158 	uint16_t	ethertype;
1159 
1160 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1161 		return;
1162 
1163 	addr_length = dsp->ds_mip->mi_addr_length;
1164 	if ((mp = mexchange(dsp->ds_wq, NULL,
1165 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1166 	    M_PROTO, 0)) == NULL)
1167 		return;
1168 
1169 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1170 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1171 	dlip->dl_primitive = DL_NOTIFY_IND;
1172 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1173 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1174 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1175 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1176 
1177 	bcopy(addr, &dlip[1], addr_length);
1178 
1179 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1180 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1181 		ethertype;
1182 
1183 	qreply(dsp->ds_wq, mp);
1184 }
1185 
1186 /*
1187  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1188  */
1189 static void
1190 str_notify_link_up(dld_str_t *dsp)
1191 {
1192 	mblk_t		*mp;
1193 	dl_notify_ind_t	*dlip;
1194 
1195 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1196 		return;
1197 
1198 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1199 	    M_PROTO, 0)) == NULL)
1200 		return;
1201 
1202 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1203 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1204 	dlip->dl_primitive = DL_NOTIFY_IND;
1205 	dlip->dl_notification = DL_NOTE_LINK_UP;
1206 
1207 	qreply(dsp->ds_wq, mp);
1208 }
1209 
1210 /*
1211  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1212  */
1213 static void
1214 str_notify_link_down(dld_str_t *dsp)
1215 {
1216 	mblk_t		*mp;
1217 	dl_notify_ind_t	*dlip;
1218 
1219 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1220 		return;
1221 
1222 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1223 	    M_PROTO, 0)) == NULL)
1224 		return;
1225 
1226 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1227 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1228 	dlip->dl_primitive = DL_NOTIFY_IND;
1229 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1230 
1231 	qreply(dsp->ds_wq, mp);
1232 }
1233 
1234 /*
1235  * DL_NOTIFY_IND: DL_NOTE_SPEED
1236  */
1237 static void
1238 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1239 {
1240 	mblk_t		*mp;
1241 	dl_notify_ind_t	*dlip;
1242 
1243 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1244 		return;
1245 
1246 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1247 	    M_PROTO, 0)) == NULL)
1248 		return;
1249 
1250 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1251 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1252 	dlip->dl_primitive = DL_NOTIFY_IND;
1253 	dlip->dl_notification = DL_NOTE_SPEED;
1254 	dlip->dl_data = speed;
1255 
1256 	qreply(dsp->ds_wq, mp);
1257 }
1258 
1259 /*
1260  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1261  */
1262 static void
1263 str_notify_capab_reneg(dld_str_t *dsp)
1264 {
1265 	mblk_t		*mp;
1266 	dl_notify_ind_t	*dlip;
1267 
1268 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1269 		return;
1270 
1271 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1272 	    M_PROTO, 0)) == NULL)
1273 		return;
1274 
1275 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1276 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1277 	dlip->dl_primitive = DL_NOTIFY_IND;
1278 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1279 
1280 	qreply(dsp->ds_wq, mp);
1281 }
1282 
1283 /*
1284  * MAC notification callback.
1285  */
1286 static void
1287 str_notify(void *arg, mac_notify_type_t type)
1288 {
1289 	dld_str_t		*dsp = (dld_str_t *)arg;
1290 	queue_t			*q = dsp->ds_wq;
1291 
1292 	switch (type) {
1293 	case MAC_NOTE_TX:
1294 		qenable(q);
1295 		break;
1296 
1297 	case MAC_NOTE_DEVPROMISC:
1298 		/*
1299 		 * Send the appropriate DL_NOTIFY_IND.
1300 		 */
1301 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1302 			str_notify_promisc_on_phys(dsp);
1303 		else
1304 			str_notify_promisc_off_phys(dsp);
1305 		break;
1306 
1307 	case MAC_NOTE_PROMISC:
1308 		break;
1309 
1310 	case MAC_NOTE_UNICST:
1311 		/*
1312 		 * This notification is sent whenever the MAC unicast address
1313 		 * changes. We need to re-cache the address.
1314 		 */
1315 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1316 
1317 		/*
1318 		 * Send the appropriate DL_NOTIFY_IND.
1319 		 */
1320 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1321 		break;
1322 
1323 	case MAC_NOTE_LINK:
1324 		/*
1325 		 * This notification is sent every time the MAC driver
1326 		 * updates the link state.
1327 		 */
1328 		switch (mac_link_get(dsp->ds_mh)) {
1329 		case LINK_STATE_UP:
1330 			/*
1331 			 * The link is up so send the appropriate
1332 			 * DL_NOTIFY_IND.
1333 			 */
1334 			str_notify_link_up(dsp);
1335 
1336 			/*
1337 			 * If we can find the link speed then send a
1338 			 * DL_NOTIFY_IND for that too.
1339 			 */
1340 			if (dsp->ds_mip->mi_stat[MAC_STAT_IFSPEED]) {
1341 				uint64_t	val;
1342 
1343 				val = mac_stat_get(dsp->ds_mh,
1344 				    MAC_STAT_IFSPEED);
1345 				str_notify_speed(dsp,
1346 				    (uint32_t)(val / 1000ull));
1347 			}
1348 			break;
1349 
1350 		case LINK_STATE_DOWN:
1351 			/*
1352 			 * The link is down so send the appropriate
1353 			 * DL_NOTIFY_IND.
1354 			 */
1355 			str_notify_link_down(dsp);
1356 			break;
1357 
1358 		default:
1359 			break;
1360 		}
1361 		break;
1362 
1363 	case MAC_NOTE_RESOURCE:
1364 		/*
1365 		 * This notification is sent whenever the MAC resources
1366 		 * change. We need to renegotiate the capabilities.
1367 		 * Send the appropriate DL_NOTIFY_IND.
1368 		 */
1369 		str_notify_capab_reneg(dsp);
1370 		break;
1371 
1372 	default:
1373 		ASSERT(B_FALSE);
1374 		break;
1375 	}
1376 }
1377 
1378 /*
1379  * Enqueue one or more messages to the transmit queue.
1380  * Caller specifies the insertion position (head/tail).
1381  */
1382 void
1383 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1384 {
1385 	mblk_t	*tail;
1386 	queue_t *q = dsp->ds_wq;
1387 	uint_t	cnt, msgcnt;
1388 	uint_t	tot_cnt, tot_msgcnt;
1389 
1390 	ASSERT(DB_TYPE(mp) == M_DATA);
1391 	/* Calculate total size and count of the packet(s) */
1392 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1393 	    tail->b_next != NULL; tail = tail->b_next) {
1394 		ASSERT(DB_TYPE(tail) == M_DATA);
1395 		cnt += msgdsize(tail);
1396 		msgcnt++;
1397 	}
1398 
1399 	mutex_enter(&dsp->ds_tx_list_lock);
1400 	/*
1401 	 * If the queue depth would exceed the allowed threshold, drop
1402 	 * new packet(s) and drain those already in the queue.
1403 	 */
1404 	tot_cnt = dsp->ds_tx_cnt + cnt;
1405 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1406 
1407 	if (!head_insert &&
1408 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1409 		ASSERT(dsp->ds_tx_qbusy);
1410 		mutex_exit(&dsp->ds_tx_list_lock);
1411 		freemsgchain(mp);
1412 		goto done;
1413 	}
1414 
1415 	/* Update the queue size parameters */
1416 	dsp->ds_tx_cnt = tot_cnt;
1417 	dsp->ds_tx_msgcnt = tot_msgcnt;
1418 
1419 	/*
1420 	 * If the transmit queue is currently empty and we are
1421 	 * about to deposit the packet(s) there, switch mode to
1422 	 * "busy" and raise flow-control condition.
1423 	 */
1424 	if (!dsp->ds_tx_qbusy) {
1425 		dsp->ds_tx_qbusy = B_TRUE;
1426 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1427 		(void) putq(q, dsp->ds_tx_flow_mp);
1428 		dsp->ds_tx_flow_mp = NULL;
1429 	}
1430 
1431 	if (!head_insert) {
1432 		/* Tail insertion */
1433 		if (dsp->ds_tx_list_head == NULL)
1434 			dsp->ds_tx_list_head = mp;
1435 		else
1436 			dsp->ds_tx_list_tail->b_next = mp;
1437 		dsp->ds_tx_list_tail = tail;
1438 	} else {
1439 		/* Head insertion */
1440 		tail->b_next = dsp->ds_tx_list_head;
1441 		if (dsp->ds_tx_list_head == NULL)
1442 			dsp->ds_tx_list_tail = tail;
1443 		dsp->ds_tx_list_head = mp;
1444 	}
1445 	mutex_exit(&dsp->ds_tx_list_lock);
1446 done:
1447 	/* Schedule service thread to drain the transmit queue */
1448 	qenable(q);
1449 }
1450 
1451 void
1452 dld_tx_flush(dld_str_t *dsp)
1453 {
1454 	mutex_enter(&dsp->ds_tx_list_lock);
1455 	if (dsp->ds_tx_list_head != NULL) {
1456 		freemsgchain(dsp->ds_tx_list_head);
1457 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1458 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1459 		if (dsp->ds_tx_qbusy) {
1460 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1461 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1462 			dsp->ds_tx_qbusy = B_FALSE;
1463 		}
1464 	}
1465 	mutex_exit(&dsp->ds_tx_list_lock);
1466 }
1467 
1468 /*
1469  * Process an M_IOCTL message.
1470  */
1471 static void
1472 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1473 {
1474 	uint_t			cmd;
1475 
1476 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1477 	ASSERT(dsp->ds_type == DLD_DLPI);
1478 
1479 	switch (cmd) {
1480 	case DLIOCRAW:
1481 		ioc_raw(dsp, mp);
1482 		break;
1483 	case DLIOCHDRINFO:
1484 		ioc_fast(dsp, mp);
1485 		break;
1486 	default:
1487 		ioc(dsp, mp);
1488 	}
1489 }
1490 
1491 /*
1492  * DLIOCRAW
1493  */
1494 static void
1495 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1496 {
1497 	queue_t *q = dsp->ds_wq;
1498 
1499 	rw_enter(&dsp->ds_lock, RW_WRITER);
1500 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1501 		rw_exit(&dsp->ds_lock);
1502 		miocnak(q, mp, 0, EPROTO);
1503 		return;
1504 	}
1505 
1506 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1507 		/*
1508 		 * Set the receive callback.
1509 		 */
1510 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1511 
1512 		/*
1513 		 * Note that raw mode is enabled.
1514 		 */
1515 		dsp->ds_mode = DLD_RAW;
1516 	}
1517 
1518 	rw_exit(&dsp->ds_lock);
1519 	miocack(q, mp, 0, 0);
1520 }
1521 
1522 /*
1523  * DLIOCHDRINFO
1524  */
1525 static void
1526 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1527 {
1528 	dl_unitdata_req_t *dlp;
1529 	off_t		off;
1530 	size_t		len;
1531 	const uint8_t	*addr;
1532 	uint16_t	sap;
1533 	mblk_t		*nmp;
1534 	mblk_t		*hmp;
1535 	uint_t		addr_length;
1536 	queue_t		*q = dsp->ds_wq;
1537 	int		err;
1538 	dls_channel_t	dc;
1539 
1540 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1541 		err = ENOTSUP;
1542 		goto failed;
1543 	}
1544 
1545 	nmp = mp->b_cont;
1546 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1547 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1548 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1549 		err = EINVAL;
1550 		goto failed;
1551 	}
1552 
1553 	off = dlp->dl_dest_addr_offset;
1554 	len = dlp->dl_dest_addr_length;
1555 
1556 	if (!MBLKIN(nmp, off, len)) {
1557 		err = EINVAL;
1558 		goto failed;
1559 	}
1560 
1561 	rw_enter(&dsp->ds_lock, RW_READER);
1562 	if (dsp->ds_dlstate != DL_IDLE) {
1563 		rw_exit(&dsp->ds_lock);
1564 		err = ENOTSUP;
1565 		goto failed;
1566 	}
1567 
1568 	addr_length = dsp->ds_mip->mi_addr_length;
1569 	if (len != addr_length + sizeof (uint16_t)) {
1570 		rw_exit(&dsp->ds_lock);
1571 		err = EINVAL;
1572 		goto failed;
1573 	}
1574 
1575 	addr = nmp->b_rptr + off;
1576 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
1577 	dc = dsp->ds_dc;
1578 
1579 	if ((hmp = dls_header(dc, addr, sap, dsp->ds_pri)) == NULL) {
1580 		rw_exit(&dsp->ds_lock);
1581 		err = ENOMEM;
1582 		goto failed;
1583 	}
1584 
1585 	/*
1586 	 * This is a performance optimization.  We originally entered
1587 	 * as reader and only become writer upon transitioning into
1588 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
1589 	 * stay as reader and return the fast-path header to IP.
1590 	 */
1591 	if (dsp->ds_mode != DLD_FASTPATH) {
1592 		if (!rw_tryupgrade(&dsp->ds_lock)) {
1593 			rw_exit(&dsp->ds_lock);
1594 			rw_enter(&dsp->ds_lock, RW_WRITER);
1595 
1596 			/*
1597 			 * State may have changed before we re-acquired
1598 			 * the writer lock in case the upgrade failed.
1599 			 */
1600 			if (dsp->ds_dlstate != DL_IDLE) {
1601 				rw_exit(&dsp->ds_lock);
1602 				err = ENOTSUP;
1603 				goto failed;
1604 			}
1605 		}
1606 
1607 		/*
1608 		 * Set the receive callback (unless polling is enabled).
1609 		 */
1610 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
1611 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
1612 
1613 		/*
1614 		 * Note that fast-path mode is enabled.
1615 		 */
1616 		dsp->ds_mode = DLD_FASTPATH;
1617 	}
1618 	rw_exit(&dsp->ds_lock);
1619 
1620 	freemsg(nmp->b_cont);
1621 	nmp->b_cont = hmp;
1622 
1623 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
1624 	return;
1625 failed:
1626 	miocnak(q, mp, 0, err);
1627 }
1628 
1629 /*
1630  * Catch-all handler.
1631  */
1632 static void
1633 ioc(dld_str_t *dsp, mblk_t *mp)
1634 {
1635 	queue_t	*q = dsp->ds_wq;
1636 	mac_handle_t mh;
1637 
1638 	rw_enter(&dsp->ds_lock, RW_READER);
1639 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1640 		rw_exit(&dsp->ds_lock);
1641 		miocnak(q, mp, 0, EINVAL);
1642 		return;
1643 	}
1644 	mh = dsp->ds_mh;
1645 	ASSERT(mh != NULL);
1646 	rw_exit(&dsp->ds_lock);
1647 	mac_ioctl(mh, q, mp);
1648 }
1649 
1650 /*
1651  * Allocate a new minor number.
1652  */
1653 static minor_t
1654 dld_minor_hold(boolean_t sleep)
1655 {
1656 	minor_t		minor;
1657 
1658 	/*
1659 	 * Grab a value from the arena.
1660 	 */
1661 	atomic_add_32(&minor_count, 1);
1662 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
1663 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
1664 		atomic_add_32(&minor_count, -1);
1665 		return (0);
1666 	}
1667 
1668 	return (minor);
1669 }
1670 
1671 /*
1672  * Release a previously allocated minor number.
1673  */
1674 static void
1675 dld_minor_rele(minor_t minor)
1676 {
1677 	/*
1678 	 * Return the value to the arena.
1679 	 */
1680 	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);
1681 
1682 	atomic_add_32(&minor_count, -1);
1683 }
1684