xref: /titanic_51/usr/src/uts/common/io/dld/dld_str.c (revision 0173c38a73f34277e0c97a19fedfd25d81ba8380)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Data-Link Driver
30  */
31 
32 #include	<sys/stropts.h>
33 #include	<sys/strsun.h>
34 #include	<sys/strsubr.h>
35 #include	<sys/atomic.h>
36 #include	<sys/mkdev.h>
37 #include	<sys/vlan.h>
38 #include	<sys/dld.h>
39 #include	<sys/dld_impl.h>
40 #include	<sys/dls_impl.h>
41 #include	<inet/common.h>
42 
43 static int	str_constructor(void *, void *, int);
44 static void	str_destructor(void *, void *);
45 static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *);
46 static void	str_notify_promisc_on_phys(dld_str_t *);
47 static void	str_notify_promisc_off_phys(dld_str_t *);
48 static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
49 static void	str_notify_link_up(dld_str_t *);
50 static void	str_notify_link_down(dld_str_t *);
51 static void	str_notify_capab_reneg(dld_str_t *);
52 static void	str_notify_speed(dld_str_t *, uint32_t);
53 static void	str_notify(void *, mac_notify_type_t);
54 
55 static void	ioc_raw(dld_str_t *, mblk_t *);
56 static void	ioc_fast(dld_str_t *,  mblk_t *);
57 static void	ioc(dld_str_t *, mblk_t *);
58 static void	dld_ioc(dld_str_t *, mblk_t *);
59 static minor_t	dld_minor_hold(boolean_t);
60 static void	dld_minor_rele(minor_t);
61 
62 static uint32_t		str_count;
63 static kmem_cache_t	*str_cachep;
64 static vmem_t		*minor_arenap;
65 static uint32_t		minor_count;
66 static mod_hash_t	*str_hashp;
67 
68 #define	MINOR_TO_PTR(minor)	((void *)(uintptr_t)(minor))
69 #define	PTR_TO_MINOR(ptr)	((minor_t)(uintptr_t)(ptr))
70 
71 #define	STR_HASHSZ		64
72 #define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
73 
74 /*
75  * Some notes on entry points, flow-control, queueing and locking:
76  *
77  * This driver exports the traditional STREAMS put entry point as well as
78  * the non-STREAMS fast-path transmit routine which is provided to IP via
79  * the DL_CAPAB_POLL negotiation.  The put procedure handles all control
80  * and data operations, while the fast-path routine deals only with M_DATA
81  * fast-path packets.  Regardless of the entry point, all outbound packets
82  * will end up in str_mdata_fastpath_put(), where they will be delivered to
83  * the MAC driver.
84  *
85  * The transmit logic operates in two modes: a "not busy" mode where the
86  * packets will be delivered to the MAC for a send attempt, or "busy" mode
87  * where they will be enqueued in the internal queue because of flow-control.
88  * Flow-control happens when the MAC driver indicates the packets couldn't
89  * be transmitted due to lack of resources (e.g. running out of descriptors).
90  * In such case, the driver will place a dummy message on its write-side
91  * STREAMS queue so that the queue is marked as "full".  Any subsequent
92  * packets arriving at the driver will be enqueued in the internal queue,
93  * which is drained in the context of the service thread that gets scheduled
94  * whenever the driver is in the "busy" mode.  When all packets have been
95  * successfully delivered by MAC and the internal queue is empty, it will
96  * transition to the "not busy" mode by removing the dummy message from the
97  * write-side STREAMS queue; in effect this will trigger backenabling.
98  * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
99  * to the above reasons.
100  *
101  * The driver implements an internal transmit queue independent of STREAMS.
102  * This allows for flexibility and provides a fast enqueue/dequeue mechanism
103  * compared to the putq() and getq() STREAMS interfaces.  The only putq() and
104  * getq() operations done by the driver are those related to placing and
105  * removing the dummy message to/from the write-side STREAMS queue for flow-
106  * control purposes.
107  *
108  * Locking is done independent of STREAMS due to the driver being fully MT.
109  * Threads entering the driver (either from put or service entry points)
110  * will most likely be readers, with the exception of a few writer cases
111  * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
112  * DLD-related ioctl requests.  The DLPI detach case is special, because
113  * it involves freeing resources and therefore must be single-threaded.
114  * Unfortunately the readers/writers lock can't be used to protect against
115  * it, because the lock is dropped prior to the driver calling places where
116  * putnext() may be invoked, and such places may depend on those resources
117  * to exist.  Because of this, the driver always completes the DLPI detach
118  * process when there are no other threads running in the driver.  This is
119  * done by keeping track of the number of threads, such that the last
120  * thread leaving the driver will finish the pending DLPI detach operation.
121  */
122 
123 /*
124  * dld_max_q_count is the queue depth threshold used to limit the number of
125  * outstanding packets or bytes allowed in the queue; once this limit is
126  * reached the driver will free any incoming ones until the queue depth
127  * drops below the threshold.
128  *
129  * This buffering is provided to accomodate clients which do not employ
130  * their own buffering scheme, and to handle occasional packet bursts.
131  * Clients which handle their own buffering will receive positive feedback
132  * from this driver as soon as it transitions into the "busy" state, i.e.
133  * when the queue is initially filled up; they will get backenabled once
134  * the queue is empty.
135  *
136  * The value chosen here is rather arbitrary; in future some intelligent
137  * heuristics may be involved which could take into account the hardware's
138  * transmit ring size, etc.
139  */
140 uint_t dld_max_q_count = (16 * 1024 *1024);
141 
142 /*
143  * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
144  * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
145  * match dev_t. If a stream is found and it is attached, its dev_info_t *
146  * is returned.
147  */
148 typedef struct i_dld_str_state_s {
149 	major_t		ds_major;
150 	minor_t		ds_minor;
151 	dev_info_t	*ds_dip;
152 } i_dld_str_state_t;
153 
154 /* ARGSUSED */
155 static uint_t
156 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
157 {
158 	i_dld_str_state_t	*statep = arg;
159 	dld_str_t		*dsp = (dld_str_t *)val;
160 
161 	if (statep->ds_major != dsp->ds_major)
162 		return (MH_WALK_CONTINUE);
163 
164 	ASSERT(statep->ds_minor != 0);
165 
166 	/*
167 	 * Access to ds_ppa and ds_mh need to be protected by ds_lock.
168 	 */
169 	rw_enter(&dsp->ds_lock, RW_READER);
170 	if (statep->ds_minor <= DLD_MAX_MINOR) {
171 		/*
172 		 * Style 1: minor can be derived from the ppa. we
173 		 * continue to walk until we find a matching stream
174 		 * in attached state.
175 		 */
176 		if (statep->ds_minor == DLS_PPA2MINOR(dsp->ds_ppa) &&
177 		    dsp->ds_mh != NULL) {
178 			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
179 			rw_exit(&dsp->ds_lock);
180 			return (MH_WALK_TERMINATE);
181 		}
182 	} else {
183 		/*
184 		 * Clone: a clone minor is unique. we can terminate the
185 		 * walk if we find a matching stream -- even if we fail
186 		 * to obtain the devinfo.
187 		 */
188 		if (statep->ds_minor == dsp->ds_minor) {
189 			if (dsp->ds_mh != NULL)
190 				statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
191 			rw_exit(&dsp->ds_lock);
192 			return (MH_WALK_TERMINATE);
193 		}
194 	}
195 	rw_exit(&dsp->ds_lock);
196 	return (MH_WALK_CONTINUE);
197 }
198 
199 static dev_info_t *
200 dld_finddevinfo(dev_t dev)
201 {
202 	i_dld_str_state_t	state;
203 
204 	state.ds_minor = getminor(dev);
205 	state.ds_major = getmajor(dev);
206 	state.ds_dip = NULL;
207 
208 	if (state.ds_minor == 0)
209 		return (NULL);
210 
211 	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
212 	return (state.ds_dip);
213 }
214 
215 
216 /*
217  * devo_getinfo: getinfo(9e)
218  */
219 /*ARGSUSED*/
220 int
221 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
222 {
223 	dev_info_t	*devinfo;
224 	minor_t		minor = getminor((dev_t)arg);
225 	int		rc = DDI_FAILURE;
226 
227 	switch (cmd) {
228 	case DDI_INFO_DEVT2DEVINFO:
229 		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
230 			*(dev_info_t **)resp = devinfo;
231 			rc = DDI_SUCCESS;
232 		}
233 		break;
234 	case DDI_INFO_DEVT2INSTANCE:
235 		if (minor > 0 && minor <= DLD_MAX_MINOR) {
236 			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
237 			rc = DDI_SUCCESS;
238 		} else if (minor > DLD_MAX_MINOR &&
239 		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
240 			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
241 			rc = DDI_SUCCESS;
242 		}
243 		break;
244 	}
245 	return (rc);
246 }
247 
248 /*
249  * qi_qopen: open(9e)
250  */
251 /*ARGSUSED*/
252 int
253 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
254 {
255 	dld_str_t	*dsp;
256 	major_t		major;
257 	minor_t		minor;
258 	int		err;
259 
260 	if (sflag == MODOPEN)
261 		return (ENOTSUP);
262 
263 	/*
264 	 * This is a cloning driver and therefore each queue should only
265 	 * ever get opened once.
266 	 */
267 	if (rq->q_ptr != NULL)
268 		return (EBUSY);
269 
270 	major = getmajor(*devp);
271 	minor = getminor(*devp);
272 	if (minor > DLD_MAX_MINOR)
273 		return (ENODEV);
274 
275 	/*
276 	 * Create a new dld_str_t for the stream. This will grab a new minor
277 	 * number that will be handed back in the cloned dev_t.  Creation may
278 	 * fail if we can't allocate the dummy mblk used for flow-control.
279 	 */
280 	dsp = dld_str_create(rq, DLD_DLPI, major,
281 	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
282 	if (dsp == NULL)
283 		return (ENOSR);
284 
285 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
286 	if (minor != 0) {
287 		/*
288 		 * Style 1 open
289 		 */
290 
291 		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
292 			goto failed;
293 		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
294 	} else {
295 		(void) qassociate(rq, -1);
296 	}
297 
298 	/*
299 	 * Enable the queue srv(9e) routine.
300 	 */
301 	qprocson(rq);
302 
303 	/*
304 	 * Construct a cloned dev_t to hand back.
305 	 */
306 	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
307 	return (0);
308 
309 failed:
310 	dld_str_destroy(dsp);
311 	return (err);
312 }
313 
314 /*
315  * qi_qclose: close(9e)
316  */
317 int
318 dld_close(queue_t *rq)
319 {
320 	dld_str_t	*dsp = rq->q_ptr;
321 
322 	/*
323 	 * Wait until pending requests are processed.
324 	 */
325 	mutex_enter(&dsp->ds_thr_lock);
326 	while (dsp->ds_pending_cnt > 0)
327 		cv_wait(&dsp->ds_pending_cv, &dsp->ds_thr_lock);
328 	mutex_exit(&dsp->ds_thr_lock);
329 
330 	/*
331 	 * Disable the queue srv(9e) routine.
332 	 */
333 	qprocsoff(rq);
334 
335 	/*
336 	 * At this point we can not be entered by any threads via STREAMS
337 	 * or the direct call interface, which is available only to IP.
338 	 * After the interface is unplumbed, IP wouldn't have any reference
339 	 * to this instance, and therefore we are now effectively single
340 	 * threaded and don't require any lock protection.  Flush all
341 	 * pending packets which are sitting in the transmit queue.
342 	 */
343 	ASSERT(dsp->ds_thr == 0);
344 	dld_tx_flush(dsp);
345 
346 	/*
347 	 * This stream was open to a provider node. Check to see
348 	 * if it has been cleanly shut down.
349 	 */
350 	if (dsp->ds_dlstate != DL_UNATTACHED) {
351 		/*
352 		 * The stream is either open to a style 1 provider or
353 		 * this is not clean shutdown. Detach from the PPA.
354 		 * (This is still ok even in the style 1 case).
355 		 */
356 		dld_str_detach(dsp);
357 	}
358 
359 	dld_str_destroy(dsp);
360 	return (0);
361 }
362 
363 /*
364  * qi_qputp: put(9e)
365  */
366 void
367 dld_wput(queue_t *wq, mblk_t *mp)
368 {
369 	dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
370 
371 	DLD_ENTER(dsp);
372 
373 	switch (DB_TYPE(mp)) {
374 	case M_DATA:
375 		rw_enter(&dsp->ds_lock, RW_READER);
376 		if (dsp->ds_dlstate != DL_IDLE ||
377 		    dsp->ds_mode == DLD_UNITDATA) {
378 			freemsg(mp);
379 		} else if (dsp->ds_mode == DLD_FASTPATH) {
380 			str_mdata_fastpath_put(dsp, mp);
381 		} else if (dsp->ds_mode == DLD_RAW) {
382 			str_mdata_raw_put(dsp, mp);
383 		}
384 		rw_exit(&dsp->ds_lock);
385 		break;
386 	case M_PROTO:
387 	case M_PCPROTO:
388 		dld_proto(dsp, mp);
389 		break;
390 	case M_IOCTL:
391 		dld_ioc(dsp, mp);
392 		break;
393 	case M_FLUSH:
394 		if (*mp->b_rptr & FLUSHW) {
395 			dld_tx_flush(dsp);
396 			*mp->b_rptr &= ~FLUSHW;
397 		}
398 
399 		if (*mp->b_rptr & FLUSHR) {
400 			qreply(wq, mp);
401 		} else {
402 			freemsg(mp);
403 		}
404 		break;
405 	default:
406 		freemsg(mp);
407 		break;
408 	}
409 
410 	DLD_EXIT(dsp);
411 }
412 
413 /*
414  * qi_srvp: srv(9e)
415  */
416 void
417 dld_wsrv(queue_t *wq)
418 {
419 	mblk_t		*mp;
420 	dld_str_t	*dsp = wq->q_ptr;
421 
422 	DLD_ENTER(dsp);
423 	rw_enter(&dsp->ds_lock, RW_READER);
424 	/*
425 	 * Grab all packets (chained via b_next) off our transmit queue
426 	 * and try to send them all to the MAC layer.  Since the queue
427 	 * is independent of streams, we are able to dequeue all messages
428 	 * at once without looping through getq() and manually chaining
429 	 * them.  Note that the queue size parameters (byte and message
430 	 * counts) are cleared as well, but we postpone the backenabling
431 	 * until after the MAC transmit since some packets may end up
432 	 * back at our transmit queue.
433 	 */
434 	mutex_enter(&dsp->ds_tx_list_lock);
435 	if ((mp = dsp->ds_tx_list_head) == NULL) {
436 		ASSERT(!dsp->ds_tx_qbusy);
437 		ASSERT(dsp->ds_tx_flow_mp != NULL);
438 		ASSERT(dsp->ds_tx_list_head == NULL);
439 		ASSERT(dsp->ds_tx_list_tail == NULL);
440 		ASSERT(dsp->ds_tx_cnt == 0);
441 		ASSERT(dsp->ds_tx_msgcnt == 0);
442 		mutex_exit(&dsp->ds_tx_list_lock);
443 		rw_exit(&dsp->ds_lock);
444 		DLD_EXIT(dsp);
445 		return;
446 	}
447 	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
448 	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
449 	mutex_exit(&dsp->ds_tx_list_lock);
450 
451 	/*
452 	 * Discard packets unless we are attached and bound; note that
453 	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
454 	 * because regardless of the mode all transmit will end up in
455 	 * str_mdata_fastpath_put() where the packets may be queued.
456 	 */
457 	ASSERT(DB_TYPE(mp) == M_DATA);
458 	if (dsp->ds_dlstate != DL_IDLE) {
459 		freemsgchain(mp);
460 		goto done;
461 	}
462 
463 	/*
464 	 * Attempt to transmit one or more packets.  If the MAC can't
465 	 * send them all, re-queue the packet(s) at the beginning of
466 	 * the transmit queue to avoid any re-ordering.
467 	 */
468 	if ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)
469 		dld_tx_enqueue(dsp, mp, B_TRUE);
470 
471 done:
472 	/*
473 	 * Grab the list lock again and check if the transmit queue is
474 	 * really empty; if so, lift up flow-control and backenable any
475 	 * writer queues.  If the queue is not empty, schedule service
476 	 * thread to drain it.
477 	 */
478 	mutex_enter(&dsp->ds_tx_list_lock);
479 	if (dsp->ds_tx_list_head == NULL) {
480 		dsp->ds_tx_flow_mp = getq(wq);
481 		ASSERT(dsp->ds_tx_flow_mp != NULL);
482 		dsp->ds_tx_qbusy = B_FALSE;
483 	}
484 	mutex_exit(&dsp->ds_tx_list_lock);
485 
486 	rw_exit(&dsp->ds_lock);
487 	DLD_EXIT(dsp);
488 }
489 
490 void
491 dld_init_ops(struct dev_ops *ops, const char *name)
492 {
493 	struct streamtab *stream;
494 	struct qinit *rq, *wq;
495 	struct module_info *modinfo;
496 
497 	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
498 	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
499 	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
500 	modinfo->mi_minpsz = 0;
501 	modinfo->mi_maxpsz = 64*1024;
502 	modinfo->mi_hiwat  = 1;
503 	modinfo->mi_lowat = 0;
504 
505 	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
506 	rq->qi_qopen = dld_open;
507 	rq->qi_qclose = dld_close;
508 	rq->qi_minfo = modinfo;
509 
510 	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
511 	wq->qi_putp = (pfi_t)dld_wput;
512 	wq->qi_srvp = (pfi_t)dld_wsrv;
513 	wq->qi_minfo = modinfo;
514 
515 	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
516 	stream->st_rdinit = rq;
517 	stream->st_wrinit = wq;
518 	ops->devo_cb_ops->cb_str = stream;
519 
520 	ops->devo_getinfo = &dld_getinfo;
521 }
522 
523 void
524 dld_fini_ops(struct dev_ops *ops)
525 {
526 	struct streamtab *stream;
527 	struct qinit *rq, *wq;
528 	struct module_info *modinfo;
529 
530 	stream = ops->devo_cb_ops->cb_str;
531 	rq = stream->st_rdinit;
532 	wq = stream->st_wrinit;
533 	modinfo = rq->qi_minfo;
534 	ASSERT(wq->qi_minfo == modinfo);
535 
536 	kmem_free(stream, sizeof (struct streamtab));
537 	kmem_free(wq, sizeof (struct qinit));
538 	kmem_free(rq, sizeof (struct qinit));
539 	kmem_free(modinfo->mi_idname, FMNAMESZ);
540 	kmem_free(modinfo, sizeof (struct module_info));
541 }
542 
543 /*
544  * Initialize this module's data structures.
545  */
546 void
547 dld_str_init(void)
548 {
549 	/*
550 	 * Create dld_str_t object cache.
551 	 */
552 	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
553 	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
554 	ASSERT(str_cachep != NULL);
555 
556 	/*
557 	 * Allocate a vmem arena to manage minor numbers. The range of the
558 	 * arena will be from DLD_MAX_MINOR + 1 to MAXMIN (maximum legal
559 	 * minor number).
560 	 */
561 	minor_arenap = vmem_create("dld_minor_arena",
562 	    MINOR_TO_PTR(DLD_MAX_MINOR + 1), MAXMIN, 1, NULL, NULL, NULL, 0,
563 	    VM_SLEEP | VMC_IDENTIFIER);
564 	ASSERT(minor_arenap != NULL);
565 
566 	/*
567 	 * Create a hash table for maintaining dld_str_t's.
568 	 * The ds_minor field (the clone minor number) of a dld_str_t
569 	 * is used as a key for this hash table because this number is
570 	 * globally unique (allocated from "dld_minor_arena").
571 	 */
572 	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
573 	    mod_hash_null_valdtor);
574 }
575 
576 /*
577  * Tear down this module's data structures.
578  */
579 int
580 dld_str_fini(void)
581 {
582 	/*
583 	 * Make sure that there are no objects in use.
584 	 */
585 	if (str_count != 0)
586 		return (EBUSY);
587 
588 	/*
589 	 * Check to see if there are any minor numbers still in use.
590 	 */
591 	if (minor_count != 0)
592 		return (EBUSY);
593 
594 	/*
595 	 * Destroy object cache.
596 	 */
597 	kmem_cache_destroy(str_cachep);
598 	vmem_destroy(minor_arenap);
599 	mod_hash_destroy_idhash(str_hashp);
600 	return (0);
601 }
602 
603 /*
604  * Create a new dld_str_t object.
605  */
606 dld_str_t *
607 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
608 {
609 	dld_str_t	*dsp;
610 	int		err;
611 
612 	/*
613 	 * Allocate an object from the cache.
614 	 */
615 	atomic_add_32(&str_count, 1);
616 	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
617 
618 	/*
619 	 * Allocate the dummy mblk for flow-control.
620 	 */
621 	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
622 	if (dsp->ds_tx_flow_mp == NULL) {
623 		kmem_cache_free(str_cachep, dsp);
624 		atomic_add_32(&str_count, -1);
625 		return (NULL);
626 	}
627 	dsp->ds_type = type;
628 	dsp->ds_major = major;
629 	dsp->ds_style = style;
630 
631 	/*
632 	 * Initialize the queue pointers.
633 	 */
634 	ASSERT(RD(rq) == rq);
635 	dsp->ds_rq = rq;
636 	dsp->ds_wq = WR(rq);
637 	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
638 
639 	/*
640 	 * We want explicit control over our write-side STREAMS queue
641 	 * where the dummy mblk gets added/removed for flow-control.
642 	 */
643 	noenable(WR(rq));
644 
645 	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
646 	    (mod_hash_val_t)dsp);
647 	ASSERT(err == 0);
648 	return (dsp);
649 }
650 
651 /*
652  * Destroy a dld_str_t object.
653  */
654 void
655 dld_str_destroy(dld_str_t *dsp)
656 {
657 	queue_t		*rq;
658 	queue_t		*wq;
659 	mod_hash_val_t	val;
660 	/*
661 	 * Clear the queue pointers.
662 	 */
663 	rq = dsp->ds_rq;
664 	wq = dsp->ds_wq;
665 	ASSERT(wq == WR(rq));
666 
667 	rq->q_ptr = wq->q_ptr = NULL;
668 	dsp->ds_rq = dsp->ds_wq = NULL;
669 
670 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
671 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
672 	ASSERT(dsp->ds_tx_list_head == NULL);
673 	ASSERT(dsp->ds_tx_list_tail == NULL);
674 	ASSERT(dsp->ds_tx_cnt == 0);
675 	ASSERT(dsp->ds_tx_msgcnt == 0);
676 	ASSERT(!dsp->ds_tx_qbusy);
677 
678 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
679 	ASSERT(dsp->ds_thr == 0);
680 	ASSERT(dsp->ds_pending_req == NULL);
681 
682 	/*
683 	 * Reinitialize all the flags.
684 	 */
685 	dsp->ds_notifications = 0;
686 	dsp->ds_passivestate = DLD_UNINITIALIZED;
687 	dsp->ds_mode = DLD_UNITDATA;
688 
689 	/*
690 	 * Free the dummy mblk if exists.
691 	 */
692 	if (dsp->ds_tx_flow_mp != NULL) {
693 		freeb(dsp->ds_tx_flow_mp);
694 		dsp->ds_tx_flow_mp = NULL;
695 	}
696 
697 	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
698 	ASSERT(dsp == (dld_str_t *)val);
699 
700 	/*
701 	 * Free the object back to the cache.
702 	 */
703 	kmem_cache_free(str_cachep, dsp);
704 	atomic_add_32(&str_count, -1);
705 }
706 
707 /*
708  * kmem_cache contructor function: see kmem_cache_create(9f).
709  */
710 /*ARGSUSED*/
711 static int
712 str_constructor(void *buf, void *cdrarg, int kmflags)
713 {
714 	dld_str_t	*dsp = buf;
715 
716 	bzero(buf, sizeof (dld_str_t));
717 
718 	/*
719 	 * Allocate a new minor number.
720 	 */
721 	if ((dsp->ds_minor = dld_minor_hold(kmflags == KM_SLEEP)) == 0)
722 		return (-1);
723 
724 	/*
725 	 * Initialize the DLPI state machine.
726 	 */
727 	dsp->ds_dlstate = DL_UNATTACHED;
728 	dsp->ds_ppa = (t_uscalar_t)-1;
729 
730 	mutex_init(&dsp->ds_thr_lock, NULL, MUTEX_DRIVER, NULL);
731 	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
732 	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
733 	cv_init(&dsp->ds_pending_cv, NULL, CV_DRIVER, NULL);
734 
735 	return (0);
736 }
737 
738 /*
739  * kmem_cache destructor function.
740  */
741 /*ARGSUSED*/
742 static void
743 str_destructor(void *buf, void *cdrarg)
744 {
745 	dld_str_t	*dsp = buf;
746 
747 	/*
748 	 * Make sure the DLPI state machine was reset.
749 	 */
750 	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
751 
752 	/*
753 	 * Make sure the data-link interface was closed.
754 	 */
755 	ASSERT(dsp->ds_mh == NULL);
756 	ASSERT(dsp->ds_dc == NULL);
757 
758 	/*
759 	 * Make sure enabled notifications are cleared.
760 	 */
761 	ASSERT(dsp->ds_notifications == 0);
762 
763 	/*
764 	 * Make sure polling is disabled.
765 	 */
766 	ASSERT(!dsp->ds_polling);
767 
768 	/*
769 	 * Release the minor number.
770 	 */
771 	dld_minor_rele(dsp->ds_minor);
772 
773 	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
774 	rw_destroy(&dsp->ds_lock);
775 
776 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
777 	mutex_destroy(&dsp->ds_tx_list_lock);
778 	ASSERT(dsp->ds_tx_flow_mp == NULL);
779 
780 	ASSERT(MUTEX_NOT_HELD(&dsp->ds_thr_lock));
781 	mutex_destroy(&dsp->ds_thr_lock);
782 	ASSERT(dsp->ds_pending_req == NULL);
783 	ASSERT(dsp->ds_pending_op == NULL);
784 	ASSERT(dsp->ds_pending_cnt == 0);
785 	cv_destroy(&dsp->ds_pending_cv);
786 }
787 
788 /*
789  * M_DATA put (IP fast-path mode)
790  */
791 void
792 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
793 {
794 	/*
795 	 * This function can be called from within dld or from an upper
796 	 * layer protocol (currently only tcp). If we are in the busy
797 	 * mode enqueue the packet(s) and return.  Otherwise hand them
798 	 * over to the MAC driver for transmission; any remaining one(s)
799 	 * which didn't get sent will be queued.
800 	 *
801 	 * Note here that we don't grab the list lock prior to checking
802 	 * the busy flag.  This is okay, because a missed transition
803 	 * will not cause any packet reordering for any particular TCP
804 	 * connection (which is single-threaded).  The enqueue routine
805 	 * will atomically set the busy flag and schedule the service
806 	 * thread to run; the flag is only cleared by the service thread
807 	 * when there is no more packet to be transmitted.
808 	 */
809 	if (dsp->ds_tx_qbusy || (mp = dls_tx(dsp->ds_dc, mp)) != NULL)
810 		dld_tx_enqueue(dsp, mp, B_FALSE);
811 }
812 
813 /*
814  * M_DATA put (raw mode)
815  */
816 void
817 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
818 {
819 	mblk_t			*bp, *newmp;
820 	size_t			size;
821 	mac_header_info_t	mhi;
822 
823 	/*
824 	 * Certain MAC type plugins provide an illusion for raw DLPI
825 	 * consumers.  They pretend that the MAC layer is something that
826 	 * it's not for the benefit of observability tools.  For example, a
827 	 * wifi plugin might pretend that it's Ethernet for such consumers.
828 	 * Here, we call into the MAC layer so that this illusion can be
829 	 * maintained.  The plugin will optionally transform the MAC header
830 	 * here into something that can be passed down.  The header goes
831 	 * from raw mode to "cooked" mode.
832 	 */
833 	if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
834 		goto discard;
835 	mp = newmp;
836 
837 	size = MBLKL(mp);
838 
839 	/*
840 	 * Check the packet is not too big and that any remaining
841 	 * fragment list is composed entirely of M_DATA messages. (We
842 	 * know the first fragment was M_DATA otherwise we could not
843 	 * have got here).
844 	 */
845 	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
846 		if (DB_TYPE(bp) != M_DATA)
847 			goto discard;
848 		size += MBLKL(bp);
849 	}
850 
851 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
852 		goto discard;
853 
854 	if (size > dsp->ds_mip->mi_sdu_max + mhi.mhi_hdrsize)
855 		goto discard;
856 
857 	if (dsp->ds_mip->mi_media == DL_ETHER && mhi.mhi_origsap == VLAN_TPID) {
858 		struct ether_vlan_header	*evhp;
859 
860 		if (size < sizeof (struct ether_vlan_header))
861 			goto discard;
862 		/*
863 		 * Replace vtag with our own
864 		 */
865 		evhp = (struct ether_vlan_header *)mp->b_rptr;
866 		evhp->ether_tci = htons(VLAN_TCI(dsp->ds_pri,
867 		    ETHER_CFI, dsp->ds_vid));
868 	}
869 
870 	str_mdata_fastpath_put(dsp, mp);
871 	return;
872 
873 discard:
874 	freemsg(mp);
875 }
876 
877 /*
878  * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
879  */
880 int
881 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
882 {
883 	int			err;
884 	const char		*drvname;
885 	char			name[MAXNAMELEN];
886 	dls_channel_t		dc;
887 	uint_t			addr_length;
888 
889 	ASSERT(dsp->ds_dc == NULL);
890 
891 	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
892 		return (EINVAL);
893 
894 	(void) snprintf(name, MAXNAMELEN, "%s%u", drvname, ppa);
895 
896 	if (strcmp(drvname, "aggr") != 0 &&
897 	    qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
898 		return (EINVAL);
899 
900 	/*
901 	 * Open a channel.
902 	 */
903 	if ((err = dls_open(name, &dc)) != 0) {
904 		(void) qassociate(dsp->ds_wq, -1);
905 		return (err);
906 	}
907 
908 	/*
909 	 * Cache the MAC interface handle, a pointer to the immutable MAC
910 	 * information and the current and 'factory' MAC address.
911 	 */
912 	dsp->ds_mh = dls_mac(dc);
913 	dsp->ds_mip = mac_info(dsp->ds_mh);
914 
915 	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
916 
917 	addr_length = dsp->ds_mip->mi_addr_length;
918 	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);
919 
920 	/*
921 	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
922 	 * a non-VLAN interface).
923 	 */
924 	dsp->ds_vid = dls_vid(dc);
925 
926 	/*
927 	 * Set the default packet priority.
928 	 */
929 	dsp->ds_pri = 0;
930 
931 	/*
932 	 * Add a notify function so that the we get updates from the MAC.
933 	 */
934 	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);
935 
936 	dsp->ds_ppa = ppa;
937 	dsp->ds_dc = dc;
938 	dsp->ds_dlstate = DL_UNBOUND;
939 
940 	return (0);
941 }
942 
943 /*
944  * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
945  * from close(2) for style 2.
946  */
947 void
948 dld_str_detach(dld_str_t *dsp)
949 {
950 	ASSERT(dsp->ds_thr == 0);
951 
952 	/*
953 	 * Remove the notify function.
954 	 */
955 	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);
956 
957 	/*
958 	 * Clear the polling and promisc flags.
959 	 */
960 	dsp->ds_polling = B_FALSE;
961 	dsp->ds_soft_ring = B_FALSE;
962 	dsp->ds_promisc = 0;
963 
964 	/*
965 	 * Close the channel.
966 	 */
967 	dls_close(dsp->ds_dc);
968 	dsp->ds_ppa = (t_uscalar_t)-1;
969 	dsp->ds_dc = NULL;
970 	dsp->ds_mh = NULL;
971 
972 	(void) qassociate(dsp->ds_wq, -1);
973 
974 	/*
975 	 * Re-initialize the DLPI state machine.
976 	 */
977 	dsp->ds_dlstate = DL_UNATTACHED;
978 
979 }
980 
981 /*
982  * Raw mode receive function.
983  */
984 /*ARGSUSED*/
985 void
986 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
987     size_t header_length)
988 {
989 	dld_str_t		*dsp = (dld_str_t *)arg;
990 	mblk_t			*next, *newmp;
991 
992 	ASSERT(mp != NULL);
993 	do {
994 		/*
995 		 * Get the pointer to the next packet in the chain and then
996 		 * clear b_next before the packet gets passed on.
997 		 */
998 		next = mp->b_next;
999 		mp->b_next = NULL;
1000 
1001 		/*
1002 		 * Wind back b_rptr to point at the MAC header.
1003 		 */
1004 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
1005 		mp->b_rptr -= header_length;
1006 
1007 		/*
1008 		 * Certain MAC type plugins provide an illusion for raw
1009 		 * DLPI consumers.  They pretend that the MAC layer is
1010 		 * something that it's not for the benefit of observability
1011 		 * tools.  For example, a wifi plugin might pretend that
1012 		 * it's Ethernet for such consumers.  Here, we call into
1013 		 * the MAC layer so that this illusion can be maintained.
1014 		 * The plugin will optionally transform the MAC header here
1015 		 * into something that can be passed up to raw consumers.
1016 		 * The header goes from "cooked" mode to raw mode.
1017 		 */
1018 		if ((newmp = mac_header_uncook(dsp->ds_mh, mp)) == NULL) {
1019 			freemsg(mp);
1020 			mp = next;
1021 			continue;
1022 		}
1023 		mp = newmp;
1024 
1025 		if (dsp->ds_mip->mi_media == DL_ETHER) {
1026 			struct ether_header *ehp =
1027 			    (struct ether_header *)mp->b_rptr;
1028 
1029 			if (ntohs(ehp->ether_type) == VLAN_TPID) {
1030 				/*
1031 				 * Strip off the vtag
1032 				 */
1033 				ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ,
1034 				    2 * ETHERADDRL);
1035 				mp->b_rptr += VLAN_TAGSZ;
1036 			}
1037 		}
1038 		/*
1039 		 * Pass the packet on.
1040 		 */
1041 		if (canputnext(dsp->ds_rq))
1042 			putnext(dsp->ds_rq, mp);
1043 		else
1044 			freemsg(mp);
1045 
1046 		/*
1047 		 * Move on to the next packet in the chain.
1048 		 */
1049 		mp = next;
1050 	} while (mp != NULL);
1051 }
1052 
1053 /*
1054  * Fast-path receive function.
1055  */
1056 /*ARGSUSED*/
1057 void
1058 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1059     size_t header_length)
1060 {
1061 	dld_str_t		*dsp = (dld_str_t *)arg;
1062 	mblk_t			*next;
1063 
1064 	ASSERT(mp != NULL);
1065 	do {
1066 		/*
1067 		 * Get the pointer to the next packet in the chain and then
1068 		 * clear b_next before the packet gets passed on.
1069 		 */
1070 		next = mp->b_next;
1071 		mp->b_next = NULL;
1072 
1073 		/*
1074 		 * Pass the packet on.
1075 		 */
1076 		if (canputnext(dsp->ds_rq))
1077 			putnext(dsp->ds_rq, mp);
1078 		else
1079 			freemsg(mp);
1080 		/*
1081 		 * Move on to the next packet in the chain.
1082 		 */
1083 		mp = next;
1084 	} while (mp != NULL);
1085 }
1086 
1087 /*
1088  * Default receive function (send DL_UNITDATA_IND messages).
1089  */
1090 /*ARGSUSED*/
1091 void
1092 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1093     size_t header_length)
1094 {
1095 	dld_str_t		*dsp = (dld_str_t *)arg;
1096 	mblk_t			*ud_mp;
1097 	mblk_t			*next;
1098 
1099 	ASSERT(mp != NULL);
1100 	do {
1101 		/*
1102 		 * Get the pointer to the next packet in the chain and then
1103 		 * clear b_next before the packet gets passed on.
1104 		 */
1105 		next = mp->b_next;
1106 		mp->b_next = NULL;
1107 
1108 		/*
1109 		 * Wind back b_rptr to point at the MAC header.
1110 		 */
1111 		ASSERT(mp->b_rptr >= DB_BASE(mp) + header_length);
1112 		mp->b_rptr -= header_length;
1113 
1114 		/*
1115 		 * Create the DL_UNITDATA_IND M_PROTO.
1116 		 */
1117 		if ((ud_mp = str_unitdata_ind(dsp, mp)) == NULL) {
1118 			freemsgchain(mp);
1119 			return;
1120 		}
1121 
1122 		/*
1123 		 * Advance b_rptr to point at the payload again.
1124 		 */
1125 		mp->b_rptr += header_length;
1126 
1127 		/*
1128 		 * Prepend the DL_UNITDATA_IND.
1129 		 */
1130 		ud_mp->b_cont = mp;
1131 
1132 		/*
1133 		 * Send the message.
1134 		 */
1135 		if (canputnext(dsp->ds_rq))
1136 			putnext(dsp->ds_rq, ud_mp);
1137 		else
1138 			freemsg(ud_mp);
1139 
1140 		/*
1141 		 * Move on to the next packet in the chain.
1142 		 */
1143 		mp = next;
1144 	} while (mp != NULL);
1145 }
1146 
1147 /*
1148  * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1149  * current state of the interface.
1150  */
1151 void
1152 dld_str_notify_ind(dld_str_t *dsp)
1153 {
1154 	mac_notify_type_t	type;
1155 
1156 	for (type = 0; type < MAC_NNOTE; type++)
1157 		str_notify(dsp, type);
1158 }
1159 
1160 typedef struct dl_unitdata_ind_wrapper {
1161 	dl_unitdata_ind_t	dl_unitdata;
1162 	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1163 	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1164 } dl_unitdata_ind_wrapper_t;
1165 
1166 /*
1167  * Create a DL_UNITDATA_IND M_PROTO message.
1168  */
1169 static mblk_t *
1170 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp)
1171 {
1172 	mblk_t				*nmp;
1173 	dl_unitdata_ind_wrapper_t	*dlwp;
1174 	dl_unitdata_ind_t		*dlp;
1175 	mac_header_info_t		mhi;
1176 	uint_t				addr_length;
1177 	uint8_t				*daddr;
1178 	uint8_t				*saddr;
1179 
1180 	/*
1181 	 * Get the packet header information.
1182 	 */
1183 	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
1184 		return (NULL);
1185 
1186 	/*
1187 	 * Allocate a message large enough to contain the wrapper structure
1188 	 * defined above.
1189 	 */
1190 	if ((nmp = mexchange(dsp->ds_wq, NULL,
1191 	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1192 	    DL_UNITDATA_IND)) == NULL)
1193 		return (NULL);
1194 
1195 	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1196 
1197 	dlp = &(dlwp->dl_unitdata);
1198 	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1199 	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1200 
1201 	/*
1202 	 * Copy in the destination address.
1203 	 */
1204 	addr_length = dsp->ds_mip->mi_addr_length;
1205 	daddr = dlwp->dl_dest_addr;
1206 	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1207 	bcopy(mhi.mhi_daddr, daddr, addr_length);
1208 
1209 	/*
1210 	 * Set the destination DLSAP to our bound DLSAP value.
1211 	 */
1212 	*(uint16_t *)(daddr + addr_length) = dsp->ds_sap;
1213 	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1214 
1215 	/*
1216 	 * If the destination address was multicast or broadcast then the
1217 	 * dl_group_address field should be non-zero.
1218 	 */
1219 	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1220 	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1221 
1222 	/*
1223 	 * Copy in the source address if one exists.  Some MAC types (DL_IB
1224 	 * for example) may not have access to source information.
1225 	 */
1226 	if (mhi.mhi_saddr == NULL) {
1227 		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1228 	} else {
1229 		saddr = dlwp->dl_src_addr;
1230 		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1231 		bcopy(mhi.mhi_saddr, saddr, addr_length);
1232 
1233 		/*
1234 		 * Set the source DLSAP to the packet ethertype.
1235 		 */
1236 		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1237 		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1238 	}
1239 
1240 	return (nmp);
1241 }
1242 
1243 /*
1244  * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1245  */
1246 static void
1247 str_notify_promisc_on_phys(dld_str_t *dsp)
1248 {
1249 	mblk_t		*mp;
1250 	dl_notify_ind_t	*dlip;
1251 
1252 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1253 		return;
1254 
1255 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1256 	    M_PROTO, 0)) == NULL)
1257 		return;
1258 
1259 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1260 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1261 	dlip->dl_primitive = DL_NOTIFY_IND;
1262 	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1263 
1264 	qreply(dsp->ds_wq, mp);
1265 }
1266 
1267 /*
1268  * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1269  */
1270 static void
1271 str_notify_promisc_off_phys(dld_str_t *dsp)
1272 {
1273 	mblk_t		*mp;
1274 	dl_notify_ind_t	*dlip;
1275 
1276 	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1277 		return;
1278 
1279 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1280 	    M_PROTO, 0)) == NULL)
1281 		return;
1282 
1283 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1284 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1285 	dlip->dl_primitive = DL_NOTIFY_IND;
1286 	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1287 
1288 	qreply(dsp->ds_wq, mp);
1289 }
1290 
1291 /*
1292  * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1293  */
1294 static void
1295 str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
1296 {
1297 	mblk_t		*mp;
1298 	dl_notify_ind_t	*dlip;
1299 	uint_t		addr_length;
1300 	uint16_t	ethertype;
1301 
1302 	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1303 		return;
1304 
1305 	addr_length = dsp->ds_mip->mi_addr_length;
1306 	if ((mp = mexchange(dsp->ds_wq, NULL,
1307 	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1308 	    M_PROTO, 0)) == NULL)
1309 		return;
1310 
1311 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1312 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1313 	dlip->dl_primitive = DL_NOTIFY_IND;
1314 	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1315 	dlip->dl_data = DL_CURR_PHYS_ADDR;
1316 	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1317 	dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1318 
1319 	bcopy(addr, &dlip[1], addr_length);
1320 
1321 	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1322 	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) =
1323 		ethertype;
1324 
1325 	qreply(dsp->ds_wq, mp);
1326 }
1327 
1328 /*
1329  * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1330  */
1331 static void
1332 str_notify_link_up(dld_str_t *dsp)
1333 {
1334 	mblk_t		*mp;
1335 	dl_notify_ind_t	*dlip;
1336 
1337 	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1338 		return;
1339 
1340 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1341 	    M_PROTO, 0)) == NULL)
1342 		return;
1343 
1344 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1345 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1346 	dlip->dl_primitive = DL_NOTIFY_IND;
1347 	dlip->dl_notification = DL_NOTE_LINK_UP;
1348 
1349 	qreply(dsp->ds_wq, mp);
1350 }
1351 
1352 /*
1353  * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1354  */
1355 static void
1356 str_notify_link_down(dld_str_t *dsp)
1357 {
1358 	mblk_t		*mp;
1359 	dl_notify_ind_t	*dlip;
1360 
1361 	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1362 		return;
1363 
1364 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1365 	    M_PROTO, 0)) == NULL)
1366 		return;
1367 
1368 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1369 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1370 	dlip->dl_primitive = DL_NOTIFY_IND;
1371 	dlip->dl_notification = DL_NOTE_LINK_DOWN;
1372 
1373 	qreply(dsp->ds_wq, mp);
1374 }
1375 
1376 /*
1377  * DL_NOTIFY_IND: DL_NOTE_SPEED
1378  */
1379 static void
1380 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1381 {
1382 	mblk_t		*mp;
1383 	dl_notify_ind_t	*dlip;
1384 
1385 	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1386 		return;
1387 
1388 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1389 	    M_PROTO, 0)) == NULL)
1390 		return;
1391 
1392 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1393 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1394 	dlip->dl_primitive = DL_NOTIFY_IND;
1395 	dlip->dl_notification = DL_NOTE_SPEED;
1396 	dlip->dl_data = speed;
1397 
1398 	qreply(dsp->ds_wq, mp);
1399 }
1400 
1401 /*
1402  * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1403  */
1404 static void
1405 str_notify_capab_reneg(dld_str_t *dsp)
1406 {
1407 	mblk_t		*mp;
1408 	dl_notify_ind_t	*dlip;
1409 
1410 	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1411 		return;
1412 
1413 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1414 	    M_PROTO, 0)) == NULL)
1415 		return;
1416 
1417 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1418 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1419 	dlip->dl_primitive = DL_NOTIFY_IND;
1420 	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1421 
1422 	qreply(dsp->ds_wq, mp);
1423 }
1424 
1425 /*
1426  * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1427  */
1428 static void
1429 str_notify_fastpath_flush(dld_str_t *dsp)
1430 {
1431 	mblk_t		*mp;
1432 	dl_notify_ind_t	*dlip;
1433 
1434 	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1435 		return;
1436 
1437 	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1438 	    M_PROTO, 0)) == NULL)
1439 		return;
1440 
1441 	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1442 	dlip = (dl_notify_ind_t *)mp->b_rptr;
1443 	dlip->dl_primitive = DL_NOTIFY_IND;
1444 	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1445 
1446 	qreply(dsp->ds_wq, mp);
1447 }
1448 
1449 /*
1450  * MAC notification callback.
1451  */
1452 static void
1453 str_notify(void *arg, mac_notify_type_t type)
1454 {
1455 	dld_str_t		*dsp = (dld_str_t *)arg;
1456 	queue_t			*q = dsp->ds_wq;
1457 
1458 	switch (type) {
1459 	case MAC_NOTE_TX:
1460 		qenable(q);
1461 		break;
1462 
1463 	case MAC_NOTE_DEVPROMISC:
1464 		/*
1465 		 * Send the appropriate DL_NOTIFY_IND.
1466 		 */
1467 		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
1468 			str_notify_promisc_on_phys(dsp);
1469 		else
1470 			str_notify_promisc_off_phys(dsp);
1471 		break;
1472 
1473 	case MAC_NOTE_PROMISC:
1474 		break;
1475 
1476 	case MAC_NOTE_UNICST:
1477 		/*
1478 		 * This notification is sent whenever the MAC unicast address
1479 		 * changes. We need to re-cache the address.
1480 		 */
1481 		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);
1482 
1483 		/*
1484 		 * Send the appropriate DL_NOTIFY_IND.
1485 		 */
1486 		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
1487 		break;
1488 
1489 	case MAC_NOTE_LINK:
1490 		/*
1491 		 * This notification is sent every time the MAC driver
1492 		 * updates the link state.
1493 		 */
1494 		switch (mac_link_get(dsp->ds_mh)) {
1495 		case LINK_STATE_UP: {
1496 			uint64_t speed;
1497 			/*
1498 			 * The link is up so send the appropriate
1499 			 * DL_NOTIFY_IND.
1500 			 */
1501 			str_notify_link_up(dsp);
1502 
1503 			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
1504 			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1505 			break;
1506 		}
1507 		case LINK_STATE_DOWN:
1508 			/*
1509 			 * The link is down so send the appropriate
1510 			 * DL_NOTIFY_IND.
1511 			 */
1512 			str_notify_link_down(dsp);
1513 			break;
1514 
1515 		default:
1516 			break;
1517 		}
1518 		break;
1519 
1520 	case MAC_NOTE_RESOURCE:
1521 		/*
1522 		 * This notification is sent whenever the MAC resources
1523 		 * change. We need to renegotiate the capabilities.
1524 		 * Send the appropriate DL_NOTIFY_IND.
1525 		 */
1526 		str_notify_capab_reneg(dsp);
1527 		break;
1528 
1529 	case MAC_NOTE_FASTPATH_FLUSH:
1530 		str_notify_fastpath_flush(dsp);
1531 		break;
1532 
1533 	default:
1534 		ASSERT(B_FALSE);
1535 		break;
1536 	}
1537 }
1538 
1539 /*
1540  * Enqueue one or more messages to the transmit queue.
1541  * Caller specifies the insertion position (head/tail).
1542  */
1543 void
1544 dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, boolean_t head_insert)
1545 {
1546 	mblk_t	*tail;
1547 	queue_t *q = dsp->ds_wq;
1548 	uint_t	cnt, msgcnt;
1549 	uint_t	tot_cnt, tot_msgcnt;
1550 
1551 	ASSERT(DB_TYPE(mp) == M_DATA);
1552 	/* Calculate total size and count of the packet(s) */
1553 	for (tail = mp, cnt = msgdsize(mp), msgcnt = 1;
1554 	    tail->b_next != NULL; tail = tail->b_next) {
1555 		ASSERT(DB_TYPE(tail->b_next) == M_DATA);
1556 		cnt += msgdsize(tail->b_next);
1557 		msgcnt++;
1558 	}
1559 
1560 	mutex_enter(&dsp->ds_tx_list_lock);
1561 	/*
1562 	 * If the queue depth would exceed the allowed threshold, drop
1563 	 * new packet(s) and drain those already in the queue.
1564 	 */
1565 	tot_cnt = dsp->ds_tx_cnt + cnt;
1566 	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;
1567 
1568 	if (!head_insert &&
1569 	    (tot_cnt >= dld_max_q_count || tot_msgcnt >= dld_max_q_count)) {
1570 		ASSERT(dsp->ds_tx_qbusy);
1571 		mutex_exit(&dsp->ds_tx_list_lock);
1572 		freemsgchain(mp);
1573 		goto done;
1574 	}
1575 
1576 	/* Update the queue size parameters */
1577 	dsp->ds_tx_cnt = tot_cnt;
1578 	dsp->ds_tx_msgcnt = tot_msgcnt;
1579 
1580 	/*
1581 	 * If the transmit queue is currently empty and we are
1582 	 * about to deposit the packet(s) there, switch mode to
1583 	 * "busy" and raise flow-control condition.
1584 	 */
1585 	if (!dsp->ds_tx_qbusy) {
1586 		dsp->ds_tx_qbusy = B_TRUE;
1587 		ASSERT(dsp->ds_tx_flow_mp != NULL);
1588 		(void) putq(q, dsp->ds_tx_flow_mp);
1589 		dsp->ds_tx_flow_mp = NULL;
1590 	}
1591 
1592 	if (!head_insert) {
1593 		/* Tail insertion */
1594 		if (dsp->ds_tx_list_head == NULL)
1595 			dsp->ds_tx_list_head = mp;
1596 		else
1597 			dsp->ds_tx_list_tail->b_next = mp;
1598 		dsp->ds_tx_list_tail = tail;
1599 	} else {
1600 		/* Head insertion */
1601 		tail->b_next = dsp->ds_tx_list_head;
1602 		if (dsp->ds_tx_list_head == NULL)
1603 			dsp->ds_tx_list_tail = tail;
1604 		dsp->ds_tx_list_head = mp;
1605 	}
1606 	mutex_exit(&dsp->ds_tx_list_lock);
1607 done:
1608 	/* Schedule service thread to drain the transmit queue */
1609 	qenable(q);
1610 }
1611 
1612 void
1613 dld_tx_flush(dld_str_t *dsp)
1614 {
1615 	mutex_enter(&dsp->ds_tx_list_lock);
1616 	if (dsp->ds_tx_list_head != NULL) {
1617 		freemsgchain(dsp->ds_tx_list_head);
1618 		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
1619 		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
1620 		if (dsp->ds_tx_qbusy) {
1621 			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
1622 			ASSERT(dsp->ds_tx_flow_mp != NULL);
1623 			dsp->ds_tx_qbusy = B_FALSE;
1624 		}
1625 	}
1626 	mutex_exit(&dsp->ds_tx_list_lock);
1627 }
1628 
1629 /*
1630  * Process an M_IOCTL message.
1631  */
1632 static void
1633 dld_ioc(dld_str_t *dsp, mblk_t *mp)
1634 {
1635 	uint_t			cmd;
1636 
1637 	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
1638 	ASSERT(dsp->ds_type == DLD_DLPI);
1639 
1640 	switch (cmd) {
1641 	case DLIOCRAW:
1642 		ioc_raw(dsp, mp);
1643 		break;
1644 	case DLIOCHDRINFO:
1645 		ioc_fast(dsp, mp);
1646 		break;
1647 	default:
1648 		ioc(dsp, mp);
1649 	}
1650 }
1651 
1652 /*
1653  * DLIOCRAW
1654  */
1655 static void
1656 ioc_raw(dld_str_t *dsp, mblk_t *mp)
1657 {
1658 	queue_t *q = dsp->ds_wq;
1659 
1660 	rw_enter(&dsp->ds_lock, RW_WRITER);
1661 	if (dsp->ds_polling || dsp->ds_soft_ring) {
1662 		rw_exit(&dsp->ds_lock);
1663 		miocnak(q, mp, 0, EPROTO);
1664 		return;
1665 	}
1666 
1667 	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
1668 		/*
1669 		 * Set the receive callback.
1670 		 */
1671 		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, (void *)dsp);
1672 	}
1673 
1674 	/*
1675 	 * Note that raw mode is enabled.
1676 	 */
1677 	dsp->ds_mode = DLD_RAW;
1678 
1679 	rw_exit(&dsp->ds_lock);
1680 	miocack(q, mp, 0, 0);
1681 }
1682 
1683 /*
1684  * DLIOCHDRINFO
1685  */
1686 static void
1687 ioc_fast(dld_str_t *dsp, mblk_t *mp)
1688 {
1689 	dl_unitdata_req_t *dlp;
1690 	off_t		off;
1691 	size_t		len;
1692 	const uint8_t	*addr;
1693 	uint16_t	sap;
1694 	mblk_t		*nmp;
1695 	mblk_t		*hmp;
1696 	uint_t		addr_length;
1697 	queue_t		*q = dsp->ds_wq;
1698 	int		err;
1699 	dls_channel_t	dc;
1700 
1701 	if (dld_opt & DLD_OPT_NO_FASTPATH) {
1702 		err = ENOTSUP;
1703 		goto failed;
1704 	}
1705 
1706 	nmp = mp->b_cont;
1707 	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
1708 	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
1709 	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
1710 		err = EINVAL;
1711 		goto failed;
1712 	}
1713 
1714 	off = dlp->dl_dest_addr_offset;
1715 	len = dlp->dl_dest_addr_length;
1716 
1717 	if (!MBLKIN(nmp, off, len)) {
1718 		err = EINVAL;
1719 		goto failed;
1720 	}
1721 
1722 	rw_enter(&dsp->ds_lock, RW_READER);
1723 	if (dsp->ds_dlstate != DL_IDLE) {
1724 		rw_exit(&dsp->ds_lock);
1725 		err = ENOTSUP;
1726 		goto failed;
1727 	}
1728 
1729 	addr_length = dsp->ds_mip->mi_addr_length;
1730 	if (len != addr_length + sizeof (uint16_t)) {
1731 		rw_exit(&dsp->ds_lock);
1732 		err = EINVAL;
1733 		goto failed;
1734 	}
1735 
1736 	addr = nmp->b_rptr + off;
1737 	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);
1738 	dc = dsp->ds_dc;
1739 
1740 	if ((hmp = dls_header(dc, addr, sap, dsp->ds_pri, NULL)) == NULL) {
1741 		rw_exit(&dsp->ds_lock);
1742 		err = ENOMEM;
1743 		goto failed;
1744 	}
1745 
1746 	/*
1747 	 * This is a performance optimization.  We originally entered
1748 	 * as reader and only become writer upon transitioning into
1749 	 * the DLD_FASTPATH mode for the first time.  Otherwise we
1750 	 * stay as reader and return the fast-path header to IP.
1751 	 */
1752 	if (dsp->ds_mode != DLD_FASTPATH) {
1753 		if (!rw_tryupgrade(&dsp->ds_lock)) {
1754 			rw_exit(&dsp->ds_lock);
1755 			rw_enter(&dsp->ds_lock, RW_WRITER);
1756 
1757 			/*
1758 			 * State may have changed before we re-acquired
1759 			 * the writer lock in case the upgrade failed.
1760 			 */
1761 			if (dsp->ds_dlstate != DL_IDLE) {
1762 				rw_exit(&dsp->ds_lock);
1763 				err = ENOTSUP;
1764 				goto failed;
1765 			}
1766 		}
1767 
1768 		/*
1769 		 * Set the receive callback (unless polling is enabled).
1770 		 */
1771 		if (!dsp->ds_polling && !dsp->ds_soft_ring)
1772 			dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
1773 
1774 		/*
1775 		 * Note that fast-path mode is enabled.
1776 		 */
1777 		dsp->ds_mode = DLD_FASTPATH;
1778 	}
1779 	rw_exit(&dsp->ds_lock);
1780 
1781 	freemsg(nmp->b_cont);
1782 	nmp->b_cont = hmp;
1783 
1784 	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
1785 	return;
1786 failed:
1787 	miocnak(q, mp, 0, err);
1788 }
1789 
1790 /*
1791  * Catch-all handler.
1792  */
1793 static void
1794 ioc(dld_str_t *dsp, mblk_t *mp)
1795 {
1796 	queue_t	*q = dsp->ds_wq;
1797 	mac_handle_t mh;
1798 
1799 	rw_enter(&dsp->ds_lock, RW_READER);
1800 	if (dsp->ds_dlstate == DL_UNATTACHED) {
1801 		rw_exit(&dsp->ds_lock);
1802 		miocnak(q, mp, 0, EINVAL);
1803 		return;
1804 	}
1805 	mh = dsp->ds_mh;
1806 	ASSERT(mh != NULL);
1807 	rw_exit(&dsp->ds_lock);
1808 	mac_ioctl(mh, q, mp);
1809 }
1810 
1811 /*
1812  * Allocate a new minor number.
1813  */
1814 static minor_t
1815 dld_minor_hold(boolean_t sleep)
1816 {
1817 	minor_t		minor;
1818 
1819 	/*
1820 	 * Grab a value from the arena.
1821 	 */
1822 	atomic_add_32(&minor_count, 1);
1823 	if ((minor = PTR_TO_MINOR(vmem_alloc(minor_arenap, 1,
1824 	    (sleep) ? VM_SLEEP : VM_NOSLEEP))) == 0) {
1825 		atomic_add_32(&minor_count, -1);
1826 		return (0);
1827 	}
1828 
1829 	return (minor);
1830 }
1831 
1832 /*
1833  * Release a previously allocated minor number.
1834  */
1835 static void
1836 dld_minor_rele(minor_t minor)
1837 {
1838 	/*
1839 	 * Return the value to the arena.
1840 	 */
1841 	vmem_free(minor_arenap, MINOR_TO_PTR(minor), 1);
1842 
1843 	atomic_add_32(&minor_count, -1);
1844 }
1845