1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2023 Oxide Computer Company
24 */
25
26 /*
27 * Data-Link Driver
28 */
29
30 #include <inet/common.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strsun.h>
34 #include <sys/vlan.h>
35 #include <sys/dld_impl.h>
36 #include <sys/cpuvar.h>
37 #include <sys/callb.h>
38 #include <sys/list.h>
39 #include <sys/mac_client.h>
40 #include <sys/mac_client_priv.h>
41 #include <sys/mac_flow.h>
42
43 static int str_constructor(void *, void *, int);
44 static void str_destructor(void *, void *);
45 static mblk_t *str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
46 static void str_notify_promisc_on_phys(dld_str_t *);
47 static void str_notify_promisc_off_phys(dld_str_t *);
48 static void str_notify_phys_addr(dld_str_t *, uint_t, const uint8_t *);
49 static void str_notify_link_up(dld_str_t *);
50 static void str_notify_link_down(dld_str_t *);
51 static void str_notify_capab_reneg(dld_str_t *);
52 static void str_notify_speed(dld_str_t *, uint32_t);
53
54 static void ioc_native(dld_str_t *, mblk_t *);
55 static void ioc_margin(dld_str_t *, mblk_t *);
56 static void ioc_raw(dld_str_t *, mblk_t *);
57 static void ioc_fast(dld_str_t *, mblk_t *);
58 static void ioc_lowlink(dld_str_t *, mblk_t *);
59 static void ioc(dld_str_t *, mblk_t *);
60 static void dld_ioc(dld_str_t *, mblk_t *);
61 static void dld_wput_nondata(dld_str_t *, mblk_t *);
62
63 static void str_mdata_raw_put(dld_str_t *, mblk_t *);
64 static mblk_t *i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t,
65 link_tagmode_t);
66 static mblk_t *i_dld_ether_header_strip_tag(mblk_t *, boolean_t);
67
68 static uint32_t str_count;
69 static kmem_cache_t *str_cachep;
70 static mod_hash_t *str_hashp;
71
72 #define STR_HASHSZ 64
73 #define STR_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key))
74
75 #define dld_taskq system_taskq
76
77 static kmutex_t dld_taskq_lock;
78 static kcondvar_t dld_taskq_cv;
79 static list_t dld_taskq_list; /* List of dld_str_t */
80 boolean_t dld_taskq_quit;
81 boolean_t dld_taskq_done;
82
83 static void dld_taskq_dispatch(void);
84
85 /*
86 * Some notes on entry points, flow-control, queueing.
87 *
88 * This driver exports the traditional STREAMS put entry point as well as
89 * the non-STREAMS fast-path transmit routine which is provided to IP via
90 * the DL_CAPAB_POLL negotiation. The put procedure handles all control
91 * and data operations, while the fast-path routine deals only with M_DATA
92 * fast-path packets. Regardless of the entry point, all outbound packets
93 * will end up in DLD_TX(), where they will be delivered to the MAC layer.
94 *
95 * The transmit logic operates in the following way: All packets coming
96 * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
97 * happens when the MAC layer indicates the packets couldn't be
98 * transmitted due to 1) lack of resources (e.g. running out of
99 * descriptors), or 2) reaching the allowed bandwidth limit for this
100 * particular flow. The indication comes in the form of a Tx cookie that
101 * identifies the blocked ring. In such a case, DLD will place a
102 * dummy message on its write-side STREAMS queue so that the queue is
103 * marked as "full". Any subsequent packets arriving at the driver will
104 * still be sent to the MAC layer, where they are either queued in the
105 * Tx SRS or discarded if the queue limit is exceeded. The write-side
106 * STREAMS queue is re-enabled when the MAC layer notifies DLD through
107 * MAC_NOTE_TX. When the write service procedure runs, it removes the
108 * dummy message from the write-side STREAMS queue; in effect this
109 * triggers backenabling. For these reasons, q_hiwat and q_lowat are
110 * set to 1 and 0, respectively.
111 *
112 * All non-data operations, both DLPI and ioctl, are single-threaded on a
113 * per-dld_str_t-endpoint basis. This is done using a taskq so that the
114 * control operation has kernel context and can cv_wait for resources. In
115 * addition, all set-type operations that involve mac-level state changes are
116 * serialized on a per-mac-endpoint basis using the perimeter mechanism
117 * provided by the mac layer, so that the entire sequence of mac calls made
118 * by a client to modify a single mac endpoint executes as an atomic unit. The
119 * mac framework locking is described in mac.c. A critical element is that
120 * DLD/DLS does not hold any locks across the mac perimeter.
121 *
122 * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
123 * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
124 * match dev_t. If a stream is found and it is attached, its dev_info_t *
125 * is returned. If the mac handle is non-null, it can be safely accessed
126 * below. The mac handle won't be freed until mac_unregister(), which
127 * won't happen until the driver detaches. The DDI framework ensures that
128 * the detach won't happen while a getinfo is in progress.
129 */
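
/*
 * Illustrative sketch only (not part of the driver): the dummy-mblk flow
 * control described above is implemented by the DLD_SETQFULL() and
 * DLD_CLRQFULL() macros (see dld_impl.h for the authoritative definitions).
 * Conceptually, marking the write-side queue full looks roughly like:
 *
 *	mutex_enter(&dsp->ds_lock);
 *	if (dsp->ds_tx_flow_mp != NULL) {
 *		(void) putq(dsp->ds_wq, dsp->ds_tx_flow_mp);
 *		dsp->ds_tx_flow_mp = NULL;
 *	}
 *	mutex_exit(&dsp->ds_lock);
 *
 * and clearing it reclaims the dummy mblk with getq(dsp->ds_wq) once the
 * MAC layer indicates the flow is no longer blocked. Because q_hiwat is 1,
 * the single queued mblk is enough to mark the queue full so that
 * canputnext() fails upstream; dld_wsrv() then backenables the stream.
 */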
130 typedef struct i_dld_str_state_s {
131 major_t ds_major;
132 minor_t ds_minor;
133 int ds_instance;
134 dev_info_t *ds_dip;
135 } i_dld_str_state_t;
136
137 /* ARGSUSED */
138 static uint_t
139 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
140 {
141 i_dld_str_state_t *statep = arg;
142 dld_str_t *dsp = (dld_str_t *)val;
143 mac_handle_t mh;
144
145 if (statep->ds_major != dsp->ds_major)
146 return (MH_WALK_CONTINUE);
147
148 ASSERT(statep->ds_minor != 0);
149 mh = dsp->ds_mh;
150
151 if (statep->ds_minor == dsp->ds_minor) {
152 /*
153 * Clone: a clone minor is unique. We can terminate the
154 * walk if we find a matching stream -- even if we fail
155 * to obtain the devinfo.
156 */
157 if (mh != NULL) {
158 statep->ds_dip = mac_devinfo_get(mh);
159 statep->ds_instance = DLS_MINOR2INST(mac_minor(mh));
160 }
161 return (MH_WALK_TERMINATE);
162 }
163 return (MH_WALK_CONTINUE);
164 }
165
166 static dev_info_t *
167 dld_finddevinfo(dev_t dev)
168 {
169 dev_info_t *dip;
170 i_dld_str_state_t state;
171
172 if (getminor(dev) == 0)
173 return (NULL);
174
175 /*
176 * See if it's a minor node of a link
177 */
178 if ((dip = dls_link_devinfo(dev)) != NULL)
179 return (dip);
180
181 state.ds_minor = getminor(dev);
182 state.ds_major = getmajor(dev);
183 state.ds_dip = NULL;
184 state.ds_instance = -1;
185
186 mod_hash_walk(str_hashp, i_dld_str_walker, &state);
187 return (state.ds_dip);
188 }
189
190 int
191 dld_devt_to_instance(dev_t dev)
192 {
193 minor_t minor;
194 i_dld_str_state_t state;
195
196 /*
197 * GLDv3 numbers DLPI style 1 nodes as the instance number + 1.
198 * Minor number 0 is reserved for the DLPI style 2 unattached
199 * node.
200 */
201
202 if ((minor = getminor(dev)) == 0)
203 return (-1);
204
205 /*
206 * Check for unopened style 1 node.
207 * Note that this doesn't *necessarily* work for legacy
208 * devices, but this code is only called within the
209 * getinfo(9e) implementation for true GLDv3 devices, so it
210 * doesn't matter.
211 */
212 if (minor > 0 && minor <= DLS_MAX_MINOR) {
213 return (DLS_MINOR2INST(minor));
214 }
215
216 state.ds_minor = getminor(dev);
217 state.ds_major = getmajor(dev);
218 state.ds_dip = NULL;
219 state.ds_instance = -1;
220
221 mod_hash_walk(str_hashp, i_dld_str_walker, &state);
222 return (state.ds_instance);
223 }
224
225 /*
226 * devo_getinfo: getinfo(9e)
227 *
228 * NB: This may be called for a provider before the provider's
229 * instances are attached. Hence, if a particular provider needs a
230 * special mapping (the mac instance != ddi_get_instance()), then it
231 * may need to provide its own implementation using the
232 * mac_devt_to_instance() function, and translating the returned mac
233 * instance to a devinfo instance. For dev_t's where the minor number
234 * is too large (i.e. > MAC_MAX_MINOR), the provider can call this
235 * function indirectly via the mac_getinfo() function.
236 */
237 /*ARGSUSED*/
238 int
239 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
240 {
241 dev_info_t *devinfo;
242 minor_t minor = getminor((dev_t)arg);
243 int rc = DDI_FAILURE;
244
245 switch (cmd) {
246 case DDI_INFO_DEVT2DEVINFO:
247 if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
248 *(dev_info_t **)resp = devinfo;
249 rc = DDI_SUCCESS;
250 }
251 break;
252 case DDI_INFO_DEVT2INSTANCE:
253 if (minor > 0 && minor <= DLS_MAX_MINOR) {
254 *resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
255 rc = DDI_SUCCESS;
256 } else if (minor > DLS_MAX_MINOR &&
257 (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
258 *resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
259 rc = DDI_SUCCESS;
260 }
261 break;
262 }
263 return (rc);
264 }
265
266 void *
267 dld_str_private(queue_t *q)
268 {
269 return (((dld_str_t *)(q->q_ptr))->ds_private);
270 }
271
272 int
273 dld_str_open(queue_t *rq, dev_t *devp, void *private)
274 {
275 dld_str_t *dsp;
276 major_t major;
277 minor_t minor;
278 int err;
279
280 major = getmajor(*devp);
281 minor = getminor(*devp);
282
283 /*
284 * Half the 32-bit minor space is reserved for private use by the driver
285 * so we bail out here with `ENOSTR` to indicate specfs should retry the
286 * open with the driver's character based `open(9E)`. For a typical
287 * STREAMS driver, that would just be `nodev` which would simply return
288 * `ENODEV`. But a dual-personality device can choose to implement the
289 * character based `open(9E)` for some minor nodes. A driver wanting a
290 * separate STREAMS interface altogether would presumably have already
291 * provided its own `streamtab`.
292 */
293 if (minor >= mac_private_minor())
294 return (ENOSTR);
295
296 /*
297 * Create a new dld_str_t for the stream. This will grab a new minor
298 * number that will be handed back in the cloned dev_t. Creation may
299 * fail if we can't allocate the dummy mblk used for flow-control.
300 */
301 dsp = dld_str_create(rq, DLD_DLPI, major,
302 ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
303 if (dsp == NULL)
304 return (ENOSR);
305
306 ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
307 dsp->ds_private = private;
308 if (minor != 0) {
309 /*
310 * Style 1 open
311 */
312 if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
313 goto failed;
314
315 ASSERT(dsp->ds_dlstate == DL_UNBOUND);
316 } else {
317 (void) qassociate(rq, -1);
318 }
319
320 /*
321 * Enable the queue srv(9e) routine.
322 */
323 qprocson(rq);
324
325 /*
326 * Construct a cloned dev_t to hand back.
327 */
328 *devp = makedevice(getmajor(*devp), dsp->ds_minor);
329 return (0);
330
331 failed:
332 dld_str_destroy(dsp);
333 return (err);
334 }
335
336 int
337 dld_str_close(queue_t *rq)
338 {
339 dld_str_t *dsp = rq->q_ptr;
340
341 /*
342 * All modules on top have been popped off, so there can't be any
343 * threads entering from above.
344 */
345 ASSERT(dsp->ds_datathr_cnt == 0);
346
347 /*
348 * Wait until pending DLPI requests are processed.
349 */
350 mutex_enter(&dsp->ds_lock);
351 while (dsp->ds_dlpi_pending)
352 cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock);
353 mutex_exit(&dsp->ds_lock);
354
355
356 /*
357 * This stream was open to a provider node. Check to see
358 * if it has been cleanly shut down.
359 */
360 if (dsp->ds_dlstate != DL_UNATTACHED) {
361 /*
362 * The stream is either open to a style 1 provider or
363 * this was not a clean shutdown. Detach from the PPA.
364 * (This is still ok even in the style 1 case).
365 */
366 dld_str_detach(dsp);
367 }
368
369 dld_str_destroy(dsp);
370 return (0);
371 }
372
373 /*
374 * qi_qopen: open(9e)
375 */
376 /*ARGSUSED*/
377 int
378 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
379 {
380 if (sflag == MODOPEN)
381 return (ENOTSUP);
382
383 /*
384 * This is a cloning driver and therefore each queue should only
385 * ever get opened once.
386 */
387 if (rq->q_ptr != NULL)
388 return (EBUSY);
389
390 return (dld_str_open(rq, devp, NULL));
391 }
392
393 /*
394 * qi_qclose: close(9e)
395 */
396 /* ARGSUSED */
397 int
398 dld_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
399 {
400 /*
401 * Disable the queue srv(9e) routine.
402 */
403 qprocsoff(rq);
404
405 return (dld_str_close(rq));
406 }
407
408 /*
409 * qi_qputp: put(9e)
410 */
411 int
412 dld_wput(queue_t *wq, mblk_t *mp)
413 {
414 dld_str_t *dsp = (dld_str_t *)wq->q_ptr;
415 dld_str_mode_t mode;
416
417 switch (DB_TYPE(mp)) {
418 case M_DATA:
419 mutex_enter(&dsp->ds_lock);
420 mode = dsp->ds_mode;
421 if ((dsp->ds_dlstate != DL_IDLE) ||
422 (mode != DLD_FASTPATH && mode != DLD_RAW)) {
423 mutex_exit(&dsp->ds_lock);
424 freemsg(mp);
425 break;
426 }
427
428 DLD_DATATHR_INC(dsp);
429 mutex_exit(&dsp->ds_lock);
430 if (mode == DLD_FASTPATH) {
431 if (dsp->ds_mip->mi_media == DL_ETHER &&
432 (MBLKL(mp) < sizeof (struct ether_header))) {
433 freemsg(mp);
434 } else {
435 (void) str_mdata_fastpath_put(dsp, mp, 0, 0);
436 }
437 } else {
438 str_mdata_raw_put(dsp, mp);
439 }
440 DLD_DATATHR_DCR(dsp);
441 break;
442 case M_PROTO:
443 case M_PCPROTO: {
444 t_uscalar_t prim;
445
446 if (MBLKL(mp) < sizeof (t_uscalar_t))
447 break;
448
449 prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
450
451 if (prim == DL_UNITDATA_REQ) {
452 proto_unitdata_req(dsp, mp);
453 } else {
454 dld_wput_nondata(dsp, mp);
455 }
456 break;
457 }
458
459 case M_IOCTL:
460 dld_wput_nondata(dsp, mp);
461 break;
462
463 case M_FLUSH:
464 if (*mp->b_rptr & FLUSHW) {
465 DLD_CLRQFULL(dsp);
466 *mp->b_rptr &= ~FLUSHW;
467 }
468
469 if (*mp->b_rptr & FLUSHR) {
470 qreply(wq, mp);
471 } else {
472 freemsg(mp);
473 }
474 break;
475
476 default:
477 freemsg(mp);
478 break;
479 }
480 return (0);
481 }
482
483 /*
484 * qi_srvp: srv(9e)
485 */
486 int
487 dld_wsrv(queue_t *wq)
488 {
489 dld_str_t *dsp = wq->q_ptr;
490
491 DLD_CLRQFULL(dsp);
492 return (0);
493 }
494
495 void
496 dld_init_ops(struct dev_ops *ops, const char *name)
497 {
498 struct streamtab *stream;
499 struct qinit *rq, *wq;
500 struct module_info *modinfo;
501
502 modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
503 modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
504 (void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
505 modinfo->mi_minpsz = 0;
506 modinfo->mi_maxpsz = 64*1024;
507 modinfo->mi_hiwat = 1;
508 modinfo->mi_lowat = 0;
509
510 rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
511 rq->qi_qopen = dld_open;
512 rq->qi_qclose = dld_close;
513 rq->qi_minfo = modinfo;
514
515 wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
516 wq->qi_putp = (pfi_t)dld_wput;
517 wq->qi_srvp = (pfi_t)dld_wsrv;
518 wq->qi_minfo = modinfo;
519
520 stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
521 stream->st_rdinit = rq;
522 stream->st_wrinit = wq;
523 ops->devo_cb_ops->cb_str = stream;
524
525 if (ops->devo_getinfo == NULL)
526 ops->devo_getinfo = &dld_getinfo;
527 }
528
529 void
530 dld_fini_ops(struct dev_ops *ops)
531 {
532 struct streamtab *stream;
533 struct qinit *rq, *wq;
534 struct module_info *modinfo;
535
536 stream = ops->devo_cb_ops->cb_str;
537 rq = stream->st_rdinit;
538 wq = stream->st_wrinit;
539 modinfo = rq->qi_minfo;
540 ASSERT(wq->qi_minfo == modinfo);
541
542 kmem_free(stream, sizeof (struct streamtab));
543 kmem_free(wq, sizeof (struct qinit));
544 kmem_free(rq, sizeof (struct qinit));
545 kmem_free(modinfo->mi_idname, FMNAMESZ);
546 kmem_free(modinfo, sizeof (struct module_info));
547 }
548
549 /*
550 * Initialize this module's data structures.
551 */
552 void
553 dld_str_init(void)
554 {
555 /*
556 * Create dld_str_t object cache.
557 */
558 str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
559 0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
560 ASSERT(str_cachep != NULL);
561
562 /*
563 * Create a hash table for maintaining dld_str_t's.
564 * The ds_minor field (the clone minor number) of a dld_str_t
565 * is used as a key for this hash table because this number is
566 * globally unique (allocated from "dls_minor_arena").
567 */
568 str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
569 mod_hash_null_valdtor);
570
571 mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL);
572 cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL);
573
574 dld_taskq_quit = B_FALSE;
575 dld_taskq_done = B_FALSE;
576 list_create(&dld_taskq_list, sizeof (dld_str_t),
577 offsetof(dld_str_t, ds_tqlist));
578 (void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0,
579 &p0, TS_RUN, minclsyspri);
580 }
581
582 /*
583 * Tear down this module's data structures.
584 */
585 int
586 dld_str_fini(void)
587 {
588 /*
589 * Make sure that there are no objects in use.
590 */
591 if (str_count != 0)
592 return (EBUSY);
593
594 /*
595 * Ask the dld_taskq thread to quit and wait for it to be done.
596 */
597 mutex_enter(&dld_taskq_lock);
598 dld_taskq_quit = B_TRUE;
599 cv_signal(&dld_taskq_cv);
600 while (!dld_taskq_done)
601 cv_wait(&dld_taskq_cv, &dld_taskq_lock);
602 mutex_exit(&dld_taskq_lock);
603 list_destroy(&dld_taskq_list);
604 /*
605 * Destroy object cache.
606 */
607 kmem_cache_destroy(str_cachep);
608 mod_hash_destroy_idhash(str_hashp);
609 return (0);
610 }
611
612 /*
613 * Create a new dld_str_t object.
614 */
615 dld_str_t *
616 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
617 {
618 dld_str_t *dsp;
619 int err;
620
621 /*
622 * Allocate an object from the cache.
623 */
624 atomic_inc_32(&str_count);
625 dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);
626
627 /*
628 * Allocate the dummy mblk for flow-control.
629 */
630 dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
631 if (dsp->ds_tx_flow_mp == NULL) {
632 kmem_cache_free(str_cachep, dsp);
633 atomic_dec_32(&str_count);
634 return (NULL);
635 }
636 dsp->ds_type = type;
637 dsp->ds_major = major;
638 dsp->ds_style = style;
639
640 /*
641 * Initialize the queue pointers.
642 */
643 ASSERT(RD(rq) == rq);
644 dsp->ds_rq = rq;
645 dsp->ds_wq = WR(rq);
646 rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;
647
648 /*
649 * We want explicit control over our write-side STREAMS queue
650 * where the dummy mblk gets added/removed for flow-control.
651 */
652 noenable(WR(rq));
653
654 err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
655 (mod_hash_val_t)dsp);
656 ASSERT(err == 0);
657 return (dsp);
658 }
659
660 /*
661 * Destroy a dld_str_t object.
662 */
663 void
664 dld_str_destroy(dld_str_t *dsp)
665 {
666 queue_t *rq;
667 queue_t *wq;
668 mod_hash_val_t val;
669
670 /*
671 * Clear the queue pointers.
672 */
673 rq = dsp->ds_rq;
674 wq = dsp->ds_wq;
675 ASSERT(wq == WR(rq));
676 rq->q_ptr = wq->q_ptr = NULL;
677 dsp->ds_rq = dsp->ds_wq = NULL;
678
679 ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
680 ASSERT(dsp->ds_sap == 0);
681 ASSERT(dsp->ds_mh == NULL);
682 ASSERT(dsp->ds_mch == NULL);
683 ASSERT(dsp->ds_promisc == 0);
684 ASSERT(dsp->ds_mph == NULL);
685 ASSERT(dsp->ds_mip == NULL);
686 ASSERT(dsp->ds_mnh == NULL);
687
688 ASSERT(dsp->ds_polling == B_FALSE);
689 ASSERT(dsp->ds_direct == B_FALSE);
690 ASSERT(dsp->ds_lso == B_FALSE);
691 ASSERT(dsp->ds_lso_max == 0);
692 ASSERT(dsp->ds_passivestate != DLD_ACTIVE);
693
694 /*
695 * Reinitialize all the flags.
696 */
697 dsp->ds_notifications = 0;
698 dsp->ds_passivestate = DLD_UNINITIALIZED;
699 dsp->ds_mode = DLD_UNITDATA;
700 dsp->ds_native = B_FALSE;
701 dsp->ds_nonip = B_FALSE;
702
703 ASSERT(dsp->ds_datathr_cnt == 0);
704 ASSERT(dsp->ds_pending_head == NULL);
705 ASSERT(dsp->ds_pending_tail == NULL);
706 ASSERT(!dsp->ds_dlpi_pending);
707
708 ASSERT(dsp->ds_dlp == NULL);
709 ASSERT(dsp->ds_dmap == NULL);
710 ASSERT(dsp->ds_rx == NULL);
711 ASSERT(dsp->ds_rx_arg == NULL);
712 ASSERT(dsp->ds_next == NULL);
713 ASSERT(dsp->ds_head == NULL);
714
715 /*
716 * Free the dummy mblk if it exists.
717 */
718 if (dsp->ds_tx_flow_mp != NULL) {
719 freeb(dsp->ds_tx_flow_mp);
720 dsp->ds_tx_flow_mp = NULL;
721 }
722
723 (void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
724 ASSERT(dsp == (dld_str_t *)val);
725
726 /*
727 * Free the object back to the cache.
728 */
729 kmem_cache_free(str_cachep, dsp);
730 atomic_dec_32(&str_count);
731 }
732
733 /*
734 * kmem_cache constructor function: see kmem_cache_create(9f).
735 */
736 /*ARGSUSED*/
737 static int
738 str_constructor(void *buf, void *cdrarg, int kmflags)
739 {
740 dld_str_t *dsp = buf;
741
742 bzero(buf, sizeof (dld_str_t));
743
744 /*
745 * Allocate a new minor number.
746 */
747 if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
748 return (-1);
749
750 /*
751 * Initialize the DLPI state machine.
752 */
753 dsp->ds_dlstate = DL_UNATTACHED;
754
755 mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
756 cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
757 cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
758
759 return (0);
760 }
761
762 /*
763 * kmem_cache destructor function.
764 */
765 /*ARGSUSED*/
766 static void
767 str_destructor(void *buf, void *cdrarg)
768 {
769 dld_str_t *dsp = buf;
770
771 /*
772 * Release the minor number.
773 */
774 mac_minor_rele(dsp->ds_minor);
775
776 ASSERT(dsp->ds_tx_flow_mp == NULL);
777
778 mutex_destroy(&dsp->ds_lock);
779 cv_destroy(&dsp->ds_datathr_cv);
780 cv_destroy(&dsp->ds_dlpi_pending_cv);
781 }
782
783 /*
784 * Update the priority bits and VID (we may need to insert a tag if mp
785 * points to an untagged packet).
786 * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
787 */
788 static mblk_t *
789 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid,
790 link_tagmode_t tagmode)
791 {
792 mblk_t *hmp;
793 struct ether_vlan_header *evhp;
794 struct ether_header *ehp;
795 uint16_t old_tci = 0;
796 size_t len;
797
798 ASSERT(pri != 0 || vid != VLAN_ID_NONE);
799
800 evhp = (struct ether_vlan_header *)mp->b_rptr;
801 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
802 /*
803 * Tagged packet, update the priority bits.
804 */
805 len = sizeof (struct ether_vlan_header);
806
807 if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
808 /*
809 * In case some drivers only check the db_ref
810 * count of the first mblk, we pullup the
811 * message into a single mblk.
812 */
813 hmp = msgpullup(mp, -1);
814 if ((hmp == NULL) || (MBLKL(hmp) < len)) {
815 freemsg(hmp);
816 return (NULL);
817 } else {
818 freemsg(mp);
819 mp = hmp;
820 }
821 }
822
823 evhp = (struct ether_vlan_header *)mp->b_rptr;
824 old_tci = ntohs(evhp->ether_tci);
825 } else {
826 /*
827 * Untagged packet. Two factors will cause us to insert a
828 * VLAN header:
829 * - This is a VLAN link (vid is specified)
830 * - The link supports user priority tagging and the priority
831 * is non-zero.
832 */
833 if (vid == VLAN_ID_NONE && tagmode == LINK_TAGMODE_VLANONLY)
834 return (mp);
835
836 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
837 if (hmp == NULL)
838 return (NULL);
839
840 evhp = (struct ether_vlan_header *)hmp->b_rptr;
841 ehp = (struct ether_header *)mp->b_rptr;
842
843 /*
844 * Copy the MAC addresses and typelen
845 */
846 bcopy(ehp, evhp, (ETHERADDRL * 2));
847 evhp->ether_type = ehp->ether_type;
848 evhp->ether_tpid = htons(ETHERTYPE_VLAN);
849
850 hmp->b_wptr += sizeof (struct ether_vlan_header);
851 mp->b_rptr += sizeof (struct ether_header);
852
853 /*
854 * Free the original message if it's now empty. Link the
855 * rest of the messages to the header message.
856 */
857 if (MBLKL(mp) == 0) {
858 hmp->b_cont = mp->b_cont;
859 freeb(mp);
860 } else {
861 hmp->b_cont = mp;
862 }
863 mp = hmp;
864 }
865
866 if (pri == 0)
867 pri = VLAN_PRI(old_tci);
868 if (vid == VLAN_ID_NONE)
869 vid = VLAN_ID(old_tci);
870 evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
871 return (mp);
872 }
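
/*
 * Worked example (illustrative; assumes the standard 802.1Q TCI layout
 * used by the VLAN_TCI() macro in sys/vlan.h, i.e. pri:3 | cfi:1 | vid:12):
 * inserting a tag with pri = 5 on a VLAN link with vid = 100 yields
 *
 *	VLAN_TCI(5, 0, 100) = (5 << 13) | (0 << 12) | 100 = 0xa064
 *
 * which is stored in network byte order via htons() above.
 */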
873
874 /*
875 * M_DATA put (IP fast-path mode)
876 */
877 mac_tx_cookie_t
878 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint,
879 uint16_t flag)
880 {
881 boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
882 mblk_t *newmp;
883 uint_t pri;
884 mac_tx_cookie_t cookie;
885
886 if (is_ethernet) {
887 /*
888 * Update the priority bits to the assigned priority.
889 */
890 pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);
891
892 if (pri != 0) {
893 newmp = i_dld_ether_header_update_tag(mp, pri,
894 VLAN_ID_NONE, dsp->ds_dlp->dl_tagmode);
895 if (newmp == NULL)
896 goto discard;
897 mp = newmp;
898 }
899 }
900
901 if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != 0) {
902 DLD_SETQFULL(dsp);
903 }
904 return (cookie);
905
906 discard:
907 /* TODO: bump kstat? */
908 freemsg(mp);
909 return (0);
910 }
911
912 /*
913 * M_DATA put (DLIOCRAW mode)
914 */
915 static void
916 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
917 {
918 boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
919 mblk_t *bp, *newmp;
920 size_t size;
921 mac_header_info_t mhi;
922 uint_t pri, vid, dvid;
923 uint_t max_sdu;
924
925 /*
926 * Certain MAC type plugins provide an illusion for raw DLPI
927 * consumers. They pretend that the MAC layer is something that
928 * it's not for the benefit of observability tools. For example,
929 * mac_wifi pretends that it's Ethernet for such consumers.
930 * Here, unless native mode is enabled, we call into the MAC layer so
931 * that this illusion can be maintained. The plugin will optionally
932 * transform the MAC header here into something that can be passed
933 * down. The header goes from raw mode to "cooked" mode.
934 */
935 if (!dsp->ds_native) {
936 if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
937 goto discard;
938 mp = newmp;
939 }
940
941 size = MBLKL(mp);
942
943 /*
944 * Check the packet is not too big and that any remaining
945 * fragment list is composed entirely of M_DATA messages. (We
946 * know the first fragment was M_DATA otherwise we could not
947 * have got here).
948 */
949 for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
950 if (DB_TYPE(bp) != M_DATA)
951 goto discard;
952 size += MBLKL(bp);
953 }
954
955 if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
956 goto discard;
957
958 mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
959 /*
960 * If LSO is enabled, check the size against lso_max. Otherwise,
961 * compare the packet size with max_sdu.
962 */
963 max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
964 if (size > max_sdu + mhi.mhi_hdrsize)
965 goto discard;
966
967 if (is_ethernet) {
968 dvid = mac_client_vid(dsp->ds_mch);
969
970 /*
971 * Discard the packet if this is a VLAN stream but the VID in
972 * the packet is not correct.
973 */
974 vid = VLAN_ID(mhi.mhi_tci);
975 if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
976 goto discard;
977
978 /*
979 * Discard the packet if this packet is a tagged packet
980 * but both pri and VID are 0.
981 */
982 pri = VLAN_PRI(mhi.mhi_tci);
983 if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 &&
984 vid == VLAN_ID_NONE)
985 goto discard;
986
987 /*
988 * Update the priority bits to the per-stream priority if
989 * priority is not set in the packet. Update the VID for
990 * packets on a VLAN stream.
991 */
992 pri = (pri == 0) ? dsp->ds_pri : 0;
993 if ((pri != 0) || (dvid != VLAN_ID_NONE)) {
994 if ((newmp = i_dld_ether_header_update_tag(mp, pri,
995 dvid, dsp->ds_dlp->dl_tagmode)) == NULL) {
996 goto discard;
997 }
998 mp = newmp;
999 }
1000 }
1001
1002 if (DLD_TX(dsp, mp, 0, 0) != 0) {
1003 /* Turn on flow-control for dld */
1004 DLD_SETQFULL(dsp);
1005 }
1006 return;
1007
1008 discard:
1009 /* TODO: bump kstat? */
1010 freemsg(mp);
1011 }
1012
1013 /*
1014 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
1015 */
1016 int
1017 dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
1018 {
1019 dev_t dev;
1020 int err;
1021 const char *drvname;
1022 mac_perim_handle_t mph = NULL;
1023 boolean_t qassociated = B_FALSE;
1024 dls_link_t *dlp = NULL;
1025 dls_dl_handle_t ddp = NULL;
1026
1027 if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
1028 return (EINVAL);
1029
1030 if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA)
1031 return (ENOTSUP);
1032
1033 /*
1034 * /dev node access. This will still be supported for backward
1035 * compatibility reasons.
1036 */
1037 if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
1038 (strcmp(drvname, "vnic") != 0)) {
1039 if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
1040 return (EINVAL);
1041 qassociated = B_TRUE;
1042 }
1043
1044 dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
1045 if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0)
1046 goto failed;
1047
1048 if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0)
1049 goto failed;
1050
1051 /*
1052 * Open a channel.
1053 */
1054 if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0)
1055 goto failed;
1056
1057 if ((err = dls_open(dlp, ddp, dsp)) != 0)
1058 goto failed;
1059
1060 /*
1061 * Set the default packet priority.
1062 */
1063 dsp->ds_pri = 0;
1064
1065 /*
1066 * Add a notify function so that we get updates from the MAC.
1067 */
1068 dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp);
1069 dsp->ds_dlstate = DL_UNBOUND;
1070 mac_perim_exit(mph);
1071 return (0);
1072
1073 failed:
1074 if (dlp != NULL)
1075 dls_link_rele(dlp);
1076 if (mph != NULL)
1077 mac_perim_exit(mph);
1078 if (ddp != NULL)
1079 dls_devnet_rele(ddp);
1080 if (qassociated)
1081 (void) qassociate(dsp->ds_wq, -1);
1082
1083 return (err);
1084 }
1085
1086 /*
1087 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
1088 * from close(2) for style 2.
1089 */
1090 void
1091 dld_str_detach(dld_str_t *dsp)
1092 {
1093 mac_perim_handle_t mph;
1094 int err;
1095
1096 ASSERT(dsp->ds_datathr_cnt == 0);
1097
1098 mac_perim_enter_by_mh(dsp->ds_mh, &mph);
1099 /*
1100 * Remove the notify function.
1101 *
1102 * Note that we cannot wait for the notification callback to be removed
1103 * since doing so could deadlock with str_notify(), as they both
1104 * need the mac perimeter. If we cannot remove the notification
1105 * callback right now, continue and wait after we leave the
1106 * perimeter.
1107 */
1108 err = mac_notify_remove(dsp->ds_mnh, B_FALSE);
1109 dsp->ds_mnh = NULL;
1110
1111 /*
1112 * Disable the capabilities
1113 */
1114 dld_capabilities_disable(dsp);
1115
1116 /*
1117 * Clear LSO flags.
1118 */
1119 dsp->ds_lso = B_FALSE;
1120 dsp->ds_lso_max = 0;
1121
1122 dls_close(dsp);
1123 mac_perim_exit(mph);
1124
1125 /*
1126 * Now we leave the mac perimeter. If mac_notify_remove() failed
1127 * because the notification callback was in progress, wait for
1128 * it to finish before we proceed.
1129 */
1130 if (err != 0)
1131 mac_notify_remove_wait(dsp->ds_mh);
1132
1133 /*
1134 * An unreferenced tagged (non-persistent) vlan gets destroyed
1135 * automatically in the call to dls_devnet_rele.
1136 */
1137 dls_devnet_rele(dsp->ds_ddh);
1138
1139 dsp->ds_sap = 0;
1140 dsp->ds_mh = NULL;
1141 dsp->ds_mch = NULL;
1142 dsp->ds_mip = NULL;
1143
1144 if (dsp->ds_style == DL_STYLE2)
1145 (void) qassociate(dsp->ds_wq, -1);
1146
1147 /*
1148 * Re-initialize the DLPI state machine.
1149 */
1150 dsp->ds_dlstate = DL_UNATTACHED;
1151 }
1152
1153 /*
1154 * This function is only called for VLAN streams. In raw mode, we strip VLAN
1155 * tags before sending packets up to the DLS clients, with the exception of
1156 * special priority-tagged packets; in that case, we set the VID to 0.
1157 * mp must be a VLAN tagged packet.
1158 */
1159 static mblk_t *
1160 i_dld_ether_header_strip_tag(mblk_t *mp, boolean_t keep_pri)
1161 {
1162 mblk_t *newmp;
1163 struct ether_vlan_header *evhp;
1164 uint16_t tci, new_tci;
1165
1166 ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
1167 if (DB_REF(mp) > 1) {
1168 newmp = copymsg(mp);
1169 if (newmp == NULL)
1170 return (NULL);
1171 freemsg(mp);
1172 mp = newmp;
1173 }
1174 evhp = (struct ether_vlan_header *)mp->b_rptr;
1175
1176 tci = ntohs(evhp->ether_tci);
1177 if (VLAN_PRI(tci) == 0 || !keep_pri) {
1178 /*
1179 * Priority is 0 (or we are not keeping it), strip the tag.
1180 */
1181 ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
1182 mp->b_rptr += VLAN_TAGSZ;
1183 } else {
1184 /*
1185 * Priority is not 0, update the VID to 0.
1186 */
1187 new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
1188 evhp->ether_tci = htons(new_tci);
1189 }
1190 return (mp);
1191 }
1192
1193 /*
1194 * Raw mode receive function.
1195 */
1196 /*ARGSUSED*/
1197 void
1198 dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1199 mac_header_info_t *mhip)
1200 {
1201 dld_str_t *dsp = (dld_str_t *)arg;
1202 boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
1203 mblk_t *next, *newmp;
1204
1205 ASSERT(mp != NULL);
1206 do {
1207 /*
1208 * Get the pointer to the next packet in the chain and then
1209 * clear b_next before the packet gets passed on.
1210 */
1211 next = mp->b_next;
1212 mp->b_next = NULL;
1213
1214 /*
1215 * Wind back b_rptr to point at the MAC header.
1216 */
1217 ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1218 mp->b_rptr -= mhip->mhi_hdrsize;
1219
1220 /*
1221 * Certain MAC type plugins provide an illusion for raw
1222 * DLPI consumers. They pretend that the MAC layer is
1223 * something that it's not for the benefit of observability
1224 * tools. For example, mac_wifi pretends that it's Ethernet
1225 * for such consumers. Here, unless native mode is enabled,
1226 * we call into the MAC layer so that this illusion can be
1227 * maintained. The plugin will optionally transform the MAC
1228 * header here into something that can be passed up to raw
1229 * consumers. The header goes from "cooked" mode to raw mode.
1230 */
1231 if (!dsp->ds_native) {
1232 newmp = mac_header_uncook(dsp->ds_mh, mp);
1233 if (newmp == NULL) {
1234 freemsg(mp);
1235 goto next;
1236 }
1237 mp = newmp;
1238 }
1239
1240 /*
1241 * Strip the VLAN tag for VLAN streams.
1242 */
1243 if (is_ethernet &&
1244 mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) {
1245 /*
1246 * The priority should be kept only for VLAN
1247 * data-links.
1248 */
1249 newmp = i_dld_ether_header_strip_tag(mp,
1250 mac_client_is_vlan_vnic(dsp->ds_mch));
1251 if (newmp == NULL) {
1252 freemsg(mp);
1253 goto next;
1254 }
1255 mp = newmp;
1256 }
1257
1258 /*
1259 * Pass the packet on.
1260 */
1261 if (canputnext(dsp->ds_rq))
1262 putnext(dsp->ds_rq, mp);
1263 else
1264 freemsg(mp);
1265
1266 next:
1267 /*
1268 * Move on to the next packet in the chain.
1269 */
1270 mp = next;
1271 } while (mp != NULL);
1272 }
1273
1274 /*
1275 * Fast-path receive function.
1276 */
1277 /*ARGSUSED*/
1278 void
1279 dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1280 mac_header_info_t *mhip)
1281 {
1282 dld_str_t *dsp = (dld_str_t *)arg;
1283 mblk_t *next;
1284 size_t offset = 0;
1285
1286 /*
1287 * MAC header stripping rules:
1288 * - Tagged packets:
1289 * a. VLAN streams. Strip the whole VLAN header including the tag.
1290 * b. Physical streams
1291 * - VLAN packets (non-zero VID). The stream must be either a
1292 * DL_PROMISC_SAP listener or an ETHERTYPE_VLAN listener.
1293 * Strip the Ethernet header but keep the VLAN header.
1294 * - Special tagged packets (zero VID)
1295 * * The stream is either a DL_PROMISC_SAP listener or an
1296 * ETHERTYPE_VLAN listener, strip the Ethernet header but
1297 * keep the VLAN header.
1298 * * Otherwise, strip the whole VLAN header.
1299 * - Untagged packets. Strip the whole MAC header.
1300 */
1301 if (mhip->mhi_istagged &&
1302 (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1303 ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1304 (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1305 offset = VLAN_TAGSZ;
1306 }
1307
1308 ASSERT(mp != NULL);
1309 do {
1310 /*
1311 * Get the pointer to the next packet in the chain and then
1312 * clear b_next before the packet gets passed on.
1313 */
1314 next = mp->b_next;
1315 mp->b_next = NULL;
1316
1317 /*
1318 * Wind back b_rptr to point at the VLAN header.
1319 */
1320 ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
1321 mp->b_rptr -= offset;
1322
1323 /*
1324 * Pass the packet on.
1325 */
1326 if (canputnext(dsp->ds_rq))
1327 putnext(dsp->ds_rq, mp);
1328 else
1329 freemsg(mp);
1330 /*
1331 * Move on to the next packet in the chain.
1332 */
1333 mp = next;
1334 } while (mp != NULL);
1335 }
1336
1337 /*
1338 * Default receive function (send DL_UNITDATA_IND messages).
1339 */
1340 /*ARGSUSED*/
1341 void
1342 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
1343 mac_header_info_t *mhip)
1344 {
1345 dld_str_t *dsp = (dld_str_t *)arg;
1346 mblk_t *ud_mp;
1347 mblk_t *next;
1348 size_t offset = 0;
1349 boolean_t strip_vlan = B_TRUE;
1350
1351 /*
1352 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
1353 */
1354 if (mhip->mhi_istagged &&
1355 (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) &&
1356 ((dsp->ds_sap == ETHERTYPE_VLAN) ||
1357 (dsp->ds_promisc & DLS_PROMISC_SAP))) {
1358 offset = VLAN_TAGSZ;
1359 strip_vlan = B_FALSE;
1360 }
1361
1362 ASSERT(mp != NULL);
1363 do {
1364 /*
1365 * Get the pointer to the next packet in the chain and then
1366 * clear b_next before the packet gets passed on.
1367 */
1368 next = mp->b_next;
1369 mp->b_next = NULL;
1370
1371 /*
1372 * Wind back b_rptr to point at the MAC header.
1373 */
1374 ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
1375 mp->b_rptr -= mhip->mhi_hdrsize;
1376
1377 /*
1378 * Create the DL_UNITDATA_IND M_PROTO.
1379 */
1380 if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
1381 freemsgchain(mp);
1382 return;
1383 }
1384
1385 /*
1386 * Advance b_rptr to point at the payload (or the VLAN header).
1387 */
1388 mp->b_rptr += (mhip->mhi_hdrsize - offset);
1389
1390 /*
1391 * Prepend the DL_UNITDATA_IND.
1392 */
1393 ud_mp->b_cont = mp;
1394
1395 /*
1396 * Send the message.
1397 */
1398 if (canputnext(dsp->ds_rq))
1399 putnext(dsp->ds_rq, ud_mp);
1400 else
1401 freemsg(ud_mp);
1402
1403 /*
1404 * Move on to the next packet in the chain.
1405 */
1406 mp = next;
1407 } while (mp != NULL);
1408 }
1409
1410 /*
1411 * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
1412 */
1413 static void
1414 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu, uint_t multicast_sdu)
1415 {
1416 mblk_t *mp;
1417 dl_notify_ind_t *dlip;
1418
1419 if (!(dsp->ds_notifications & (DL_NOTE_SDU_SIZE|DL_NOTE_SDU_SIZE2)))
1420 return;
1421
1422 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1423 M_PROTO, 0)) == NULL)
1424 return;
1425
1426 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1427 dlip = (dl_notify_ind_t *)mp->b_rptr;
1428 dlip->dl_primitive = DL_NOTIFY_IND;
1429 if (dsp->ds_notifications & DL_NOTE_SDU_SIZE2) {
1430 dlip->dl_notification = DL_NOTE_SDU_SIZE2;
1431 dlip->dl_data1 = max_sdu;
1432 dlip->dl_data2 = multicast_sdu;
1433 } else {
1434 dlip->dl_notification = DL_NOTE_SDU_SIZE;
1435 dlip->dl_data = max_sdu;
1436 }
1437
1438 qreply(dsp->ds_wq, mp);
1439 }
1440
1441 /*
1442 * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
1443 * current state of the interface.
1444 */
1445 void
1446 dld_str_notify_ind(dld_str_t *dsp)
1447 {
1448 mac_notify_type_t type;
1449
1450 for (type = 0; type < MAC_NNOTE; type++)
1451 str_notify(dsp, type);
1452 }
1453
1454 typedef struct dl_unitdata_ind_wrapper {
1455 dl_unitdata_ind_t dl_unitdata;
1456 uint8_t dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1457 uint8_t dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
1458 } dl_unitdata_ind_wrapper_t;
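
/*
 * Layout example (illustrative, assuming no structure padding): for
 * Ethernet, where mi_addr_length is 6 (ETHERADDRL), str_unitdata_ind()
 * below sets dl_dest_addr_offset to sizeof (dl_unitdata_ind_t) and
 * dl_dest_addr_length to 6 + sizeof (uint16_t) == 8, i.e. the 6-byte MAC
 * address immediately followed by the 16-bit SAP.
 */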
1459
1460 /*
1461 * Create a DL_UNITDATA_IND M_PROTO message.
1462 */
1463 static mblk_t *
1464 str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
1465 {
1466 mblk_t *nmp;
1467 dl_unitdata_ind_wrapper_t *dlwp;
1468 dl_unitdata_ind_t *dlp;
1469 mac_header_info_t mhi;
1470 uint_t addr_length;
1471 uint8_t *daddr;
1472 uint8_t *saddr;
1473
1474 /*
1475 * Get the packet header information.
1476 */
1477 if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0)
1478 return (NULL);
1479
1480 /*
1481 * Allocate a message large enough to contain the wrapper structure
1482 * defined above.
1483 */
1484 if ((nmp = mexchange(dsp->ds_wq, NULL,
1485 sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
1486 DL_UNITDATA_IND)) == NULL)
1487 return (NULL);
1488
1489 dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;
1490
1491 dlp = &(dlwp->dl_unitdata);
1492 ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
1493 ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);
1494
1495 /*
1496 * Copy in the destination address.
1497 */
1498 addr_length = dsp->ds_mip->mi_addr_length;
1499 daddr = dlwp->dl_dest_addr;
1500 dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
1501 bcopy(mhi.mhi_daddr, daddr, addr_length);
1502
1503 /*
1504 * Set the destination DLSAP to the SAP value encoded in the packet.
1505 */
1506 if (mhi.mhi_istagged && !strip_vlan)
1507 *(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
1508 else
1509 *(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
1510 dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);
1511
1512 /*
1513 * If the destination address was multicast or broadcast then the
1514 * dl_group_address field should be non-zero.
1515 */
1516 dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
1517 (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);
1518
1519 /*
1520 * Copy in the source address if one exists. Some MAC types (DL_IB
1521 * for example) may not have access to source information.
1522 */
1523 if (mhi.mhi_saddr == NULL) {
1524 dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
1525 } else {
1526 saddr = dlwp->dl_src_addr;
1527 dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
1528 bcopy(mhi.mhi_saddr, saddr, addr_length);
1529
1530 /*
1531 * Set the source DLSAP to the packet ethertype.
1532 */
1533 *(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
1534 dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
1535 }
1536
1537 return (nmp);
1538 }
1539
1540 /*
1541 * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
1542 */
1543 static void
1544 str_notify_promisc_on_phys(dld_str_t *dsp)
1545 {
1546 mblk_t *mp;
1547 dl_notify_ind_t *dlip;
1548
1549 if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
1550 return;
1551
1552 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1553 M_PROTO, 0)) == NULL)
1554 return;
1555
1556 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1557 dlip = (dl_notify_ind_t *)mp->b_rptr;
1558 dlip->dl_primitive = DL_NOTIFY_IND;
1559 dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;
1560
1561 qreply(dsp->ds_wq, mp);
1562 }
1563
1564 /*
1565 * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
1566 */
1567 static void
1568 str_notify_promisc_off_phys(dld_str_t *dsp)
1569 {
1570 mblk_t *mp;
1571 dl_notify_ind_t *dlip;
1572
1573 if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
1574 return;
1575
1576 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1577 M_PROTO, 0)) == NULL)
1578 return;
1579
1580 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1581 dlip = (dl_notify_ind_t *)mp->b_rptr;
1582 dlip->dl_primitive = DL_NOTIFY_IND;
1583 dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;
1584
1585 qreply(dsp->ds_wq, mp);
1586 }
1587
1588 /*
1589 * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
1590 */
1591 static void
1592 str_notify_phys_addr(dld_str_t *dsp, uint_t addr_type, const uint8_t *addr)
1593 {
1594 mblk_t *mp;
1595 dl_notify_ind_t *dlip;
1596 uint_t addr_length;
1597 uint16_t ethertype;
1598
1599 if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
1600 return;
1601
1602 addr_length = dsp->ds_mip->mi_addr_length;
1603 if ((mp = mexchange(dsp->ds_wq, NULL,
1604 sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
1605 M_PROTO, 0)) == NULL)
1606 return;
1607
1608 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1609 dlip = (dl_notify_ind_t *)mp->b_rptr;
1610 dlip->dl_primitive = DL_NOTIFY_IND;
1611 dlip->dl_notification = DL_NOTE_PHYS_ADDR;
1612 dlip->dl_data = addr_type;
1613 dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1614 dlip->dl_addr_length = addr_length + sizeof (uint16_t);
1615
1616 bcopy(addr, &dlip[1], addr_length);
1617
1618 ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
1619 *(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;
1620
1621 qreply(dsp->ds_wq, mp);
1622 }
1623
1624 /*
1625 * DL_NOTIFY_IND: DL_NOTE_LINK_UP
1626 */
1627 static void
1628 str_notify_link_up(dld_str_t *dsp)
1629 {
1630 mblk_t *mp;
1631 dl_notify_ind_t *dlip;
1632
1633 if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
1634 return;
1635
1636 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1637 M_PROTO, 0)) == NULL)
1638 return;
1639
1640 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1641 dlip = (dl_notify_ind_t *)mp->b_rptr;
1642 dlip->dl_primitive = DL_NOTIFY_IND;
1643 dlip->dl_notification = DL_NOTE_LINK_UP;
1644
1645 qreply(dsp->ds_wq, mp);
1646 }
1647
1648 /*
1649 * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
1650 */
1651 static void
1652 str_notify_link_down(dld_str_t *dsp)
1653 {
1654 mblk_t *mp;
1655 dl_notify_ind_t *dlip;
1656
1657 if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
1658 return;
1659
1660 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1661 M_PROTO, 0)) == NULL)
1662 return;
1663
1664 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1665 dlip = (dl_notify_ind_t *)mp->b_rptr;
1666 dlip->dl_primitive = DL_NOTIFY_IND;
1667 dlip->dl_notification = DL_NOTE_LINK_DOWN;
1668
1669 qreply(dsp->ds_wq, mp);
1670 }
1671
1672 /*
1673 * DL_NOTIFY_IND: DL_NOTE_SPEED
1674 */
1675 static void
1676 str_notify_speed(dld_str_t *dsp, uint32_t speed)
1677 {
1678 mblk_t *mp;
1679 dl_notify_ind_t *dlip;
1680
1681 if (!(dsp->ds_notifications & DL_NOTE_SPEED))
1682 return;
1683
1684 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1685 M_PROTO, 0)) == NULL)
1686 return;
1687
1688 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1689 dlip = (dl_notify_ind_t *)mp->b_rptr;
1690 dlip->dl_primitive = DL_NOTIFY_IND;
1691 dlip->dl_notification = DL_NOTE_SPEED;
1692 dlip->dl_data = speed;
1693
1694 qreply(dsp->ds_wq, mp);
1695 }
1696
1697 /*
1698 * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
1699 */
1700 static void
1701 str_notify_capab_reneg(dld_str_t *dsp)
1702 {
1703 mblk_t *mp;
1704 dl_notify_ind_t *dlip;
1705
1706 if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
1707 return;
1708
1709 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1710 M_PROTO, 0)) == NULL)
1711 return;
1712
1713 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1714 dlip = (dl_notify_ind_t *)mp->b_rptr;
1715 dlip->dl_primitive = DL_NOTIFY_IND;
1716 dlip->dl_notification = DL_NOTE_CAPAB_RENEG;
1717
1718 qreply(dsp->ds_wq, mp);
1719 }
1720
1721 /*
1722 * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
1723 */
1724 static void
1725 str_notify_fastpath_flush(dld_str_t *dsp)
1726 {
1727 mblk_t *mp;
1728 dl_notify_ind_t *dlip;
1729
1730 if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
1731 return;
1732
1733 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
1734 M_PROTO, 0)) == NULL)
1735 return;
1736
1737 bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
1738 dlip = (dl_notify_ind_t *)mp->b_rptr;
1739 dlip->dl_primitive = DL_NOTIFY_IND;
1740 dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;
1741
1742 qreply(dsp->ds_wq, mp);
1743 }
1744
1745 static void
1746 str_notify_allowed_ips(dld_str_t *dsp)
1747 {
1748 mblk_t *mp;
1749 dl_notify_ind_t *dlip;
1750 size_t mp_size;
1751 mac_protect_t *mrp;
1752
1753 if (!(dsp->ds_notifications & DL_NOTE_ALLOWED_IPS))
1754 return;
1755
1756 mp_size = sizeof (mac_protect_t) + sizeof (dl_notify_ind_t);
1757 if ((mp = mexchange(dsp->ds_wq, NULL, mp_size, M_PROTO, 0)) == NULL)
1758 return;
1759
1760 mrp = mac_protect_get(dsp->ds_mh);
1761 bzero(mp->b_rptr, mp_size);
1762 dlip = (dl_notify_ind_t *)mp->b_rptr;
1763 dlip->dl_primitive = DL_NOTIFY_IND;
1764 dlip->dl_notification = DL_NOTE_ALLOWED_IPS;
1765 dlip->dl_data = 0;
1766 dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
1767 dlip->dl_addr_length = sizeof (mac_protect_t);
1768 bcopy(mrp, mp->b_rptr + sizeof (dl_notify_ind_t),
1769 sizeof (mac_protect_t));
1770
1771 qreply(dsp->ds_wq, mp);
1772 }
1773
1774 /*
1775 * MAC notification callback.
1776 */
1777 void
1778 str_notify(void *arg, mac_notify_type_t type)
1779 {
1780 dld_str_t *dsp = (dld_str_t *)arg;
1781 queue_t *q = dsp->ds_wq;
1782 mac_handle_t mh = dsp->ds_mh;
1783 mac_client_handle_t mch = dsp->ds_mch;
1784 uint8_t addr[MAXMACADDRLEN];
1785
1786 switch (type) {
1787 case MAC_NOTE_TX:
1788 qenable(q);
1789 break;
1790
1791 case MAC_NOTE_DEVPROMISC:
1792 /*
1793 * Send the appropriate DL_NOTIFY_IND.
1794 */
1795 if (mac_promisc_get(mh))
1796 str_notify_promisc_on_phys(dsp);
1797 else
1798 str_notify_promisc_off_phys(dsp);
1799 break;
1800
1801 case MAC_NOTE_UNICST:
1802 /*
1803 * This notification is sent whenever the MAC unicast
1804 * address changes.
1805 */
1806 mac_unicast_primary_get(mh, addr);
1807
1808 /*
1809 * Send the appropriate DL_NOTIFY_IND.
1810 */
1811 str_notify_phys_addr(dsp, DL_CURR_PHYS_ADDR, addr);
1812 break;
1813
1814 case MAC_NOTE_DEST:
1815 /*
1816 * Only send up DL_NOTE_DEST_ADDR if the link has a
1817 * destination address.
1818 */
1819 if (mac_dst_get(dsp->ds_mh, addr))
1820 str_notify_phys_addr(dsp, DL_CURR_DEST_ADDR, addr);
1821 break;
1822
1823 case MAC_NOTE_LOWLINK:
1824 case MAC_NOTE_LINK:
1825 /*
1826 * LOWLINK refers to the actual link status. For links that
1827 * are not part of a bridge instance, the LOWLINK and LINK
1828 * states are the same. But for a link that is part of a
1829 * bridge instance, LINK refers to the aggregate link status:
1830 * "up" when at least one link in the bridge is up and "down"
1831 * when all links in the bridge are down.
1832 *
1833 * Clients can request to be notified of the LOWLINK state
1834 * using the DLIOCLOWLINK ioctl. Clients such as the bridge
1835 * daemon request lowlink state changes, while upper-layer
1836 * clients receive notifications of the aggregate link state
1837 * changes, which is the default when requesting LINK UP/DOWN
1838 * state notifications.
1839 */
1840
1841 /*
1842 * Check that the notification type matches the one that we
1843 * want. If we want lower-level link notifications, and this
1844 * is upper, or if we want upper and this is lower, then
1845 * ignore.
1846 */
1847 if ((type == MAC_NOTE_LOWLINK) != dsp->ds_lowlink)
1848 break;
1849 /*
1850 * This notification is sent every time the MAC driver
1851 * updates the link state.
1852 */
1853 switch (mac_client_stat_get(mch, dsp->ds_lowlink ?
1854 MAC_STAT_LOWLINK_STATE : MAC_STAT_LINK_STATE)) {
1855 case LINK_STATE_UP: {
1856 uint64_t speed;
1857 /*
1858 * The link is up so send the appropriate
1859 * DL_NOTIFY_IND.
1860 */
1861 str_notify_link_up(dsp);
1862
1863 speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
1864 str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
1865 break;
1866 }
1867 case LINK_STATE_DOWN:
1868 /*
1869 * The link is down so send the appropriate
1870 * DL_NOTIFY_IND.
1871 */
1872 str_notify_link_down(dsp);
1873 break;
1874
1875 default:
1876 break;
1877 }
1878 break;
1879
1880 case MAC_NOTE_CAPAB_CHG:
1881 /*
1882 * This notification is sent whenever the MAC resources
1883 * change or capabilities change. We need to renegotiate
1884 * the capabilities. Send the appropriate DL_NOTIFY_IND.
1885 */
1886 str_notify_capab_reneg(dsp);
1887 break;
1888
1889 case MAC_NOTE_SDU_SIZE: {
1890 uint_t max_sdu;
1891 uint_t multicast_sdu;
1892 mac_sdu_get2(dsp->ds_mh, NULL, &max_sdu, &multicast_sdu);
1893 str_notify_sdu_size(dsp, max_sdu, multicast_sdu);
1894 break;
1895 }
1896
1897 case MAC_NOTE_FASTPATH_FLUSH:
1898 str_notify_fastpath_flush(dsp);
1899 break;
1900
1901 /* Unused notifications */
1902 case MAC_NOTE_MARGIN:
1903 break;
1904
1905 case MAC_NOTE_ALLOWED_IPS:
1906 str_notify_allowed_ips(dsp);
1907 break;
1908
1909 default:
1910 ASSERT(B_FALSE);
1911 break;
1912 }
1913 }
1914
1915 /*
1916 * This function is called via a taskq mechanism to process all control
1917 * messages on a per-'dsp' endpoint basis.
1918 */
1919 static void
1920 dld_wput_nondata_task(void *arg)
1921 {
1922 dld_str_t *dsp = arg;
1923 mblk_t *mp;
1924
1925 mutex_enter(&dsp->ds_lock);
1926 while (dsp->ds_pending_head != NULL) {
1927 mp = dsp->ds_pending_head;
1928 dsp->ds_pending_head = mp->b_next;
1929 mp->b_next = NULL;
1930 if (dsp->ds_pending_head == NULL)
1931 dsp->ds_pending_tail = NULL;
1932 mutex_exit(&dsp->ds_lock);
1933
1934 switch (DB_TYPE(mp)) {
1935 case M_PROTO:
1936 case M_PCPROTO:
1937 dld_proto(dsp, mp);
1938 break;
1939 case M_IOCTL:
1940 dld_ioc(dsp, mp);
1941 break;
1942 default:
1943 ASSERT(0);
1944 }
1945
1946 mutex_enter(&dsp->ds_lock);
1947 }
1948 ASSERT(dsp->ds_pending_tail == NULL);
1949 dsp->ds_dlpi_pending = 0;
1950 cv_broadcast(&dsp->ds_dlpi_pending_cv);
1951 mutex_exit(&dsp->ds_lock);
1952 }
1953
1954 /*
1955 * Kernel thread to handle taskq dispatch failures in dld_wput_nondata().
1956 * This thread is started at boot time.
1957 */
1958 static void
1959 dld_taskq_dispatch(void)
1960 {
1961 callb_cpr_t cprinfo;
1962 dld_str_t *dsp;
1963
1964 CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr,
1965 "dld_taskq_dispatch");
1966 mutex_enter(&dld_taskq_lock);
1967
1968 while (!dld_taskq_quit) {
1969 dsp = list_head(&dld_taskq_list);
1970 while (dsp != NULL) {
1971 list_remove(&dld_taskq_list, dsp);
1972 mutex_exit(&dld_taskq_lock);
1973 VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task,
1974 dsp, TQ_SLEEP) != TASKQID_INVALID);
1975 mutex_enter(&dld_taskq_lock);
1976 dsp = list_head(&dld_taskq_list);
1977 }
1978
1979 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1980 cv_wait(&dld_taskq_cv, &dld_taskq_lock);
1981 CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock);
1982 }
1983
1984 dld_taskq_done = B_TRUE;
1985 cv_signal(&dld_taskq_cv);
1986 CALLB_CPR_EXIT(&cprinfo);
1987 thread_exit();
1988 }
1989
/*
 * All control operations are serialized on the 'dsp' and are also funneled
 * through a taskq mechanism to ensure that subsequent processing has kernel
 * context and can safely use cv_wait.
 *
 * Mechanisms to handle taskq dispatch failures
 *
 * The only way to be sure that taskq dispatch does not fail is either to
 * specify TQ_SLEEP or to use a static taskq, prepopulate it with some
 * number of entries, and make sure that the number of outstanding requests
 * is less than that number. We can't use TQ_SLEEP since we don't know the
 * context. Nor can we bound the total number of 'dsp' end points. So we are
 * unable to use either of the above schemes and are forced to deal with
 * taskq dispatch failures. Note that even a dynamic taskq could fail to
 * dispatch if TQ_NOSLEEP is specified, since this flag is eventually
 * translated to KM_NOSLEEP and kmem allocations could fail in the taskq
 * framework.
 *
 * We maintain a queue of 'dsp's that encountered taskq dispatch failure,
 * along with a single global thread that loops in 'dld_taskq_dispatch' and
 * retries the dispatch, using TQ_SLEEP to ensure its eventual success.
 */
static void
dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
{
	ASSERT(mp->b_next == NULL);
	mutex_enter(&dsp->ds_lock);
	if (dsp->ds_pending_head != NULL) {
		ASSERT(dsp->ds_dlpi_pending);
		dsp->ds_pending_tail->b_next = mp;
		dsp->ds_pending_tail = mp;
		mutex_exit(&dsp->ds_lock);
		return;
	}
	ASSERT(dsp->ds_pending_tail == NULL);
	dsp->ds_pending_head = dsp->ds_pending_tail = mp;
	/*
	 * At this point if ds_dlpi_pending is set, it implies that the taskq
	 * thread is still active and is processing the last message, though
	 * the pending queue has been emptied.
	 */
	if (dsp->ds_dlpi_pending) {
		mutex_exit(&dsp->ds_lock);
		return;
	}

	dsp->ds_dlpi_pending = 1;
	mutex_exit(&dsp->ds_lock);

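	/*
	 * Try a non-blocking dispatch first; this code may be running in a
	 * context where sleeping is not allowed.
	 */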
	if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp,
	    TQ_NOSLEEP) != TASKQID_INVALID)
		return;

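	/*
	 * The TQ_NOSLEEP dispatch failed; hand the dsp to the
	 * dld_taskq_dispatch() thread, which retries the dispatch with
	 * TQ_SLEEP.
	 */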
	mutex_enter(&dld_taskq_lock);
	list_insert_tail(&dld_taskq_list, dsp);
	cv_signal(&dld_taskq_cv);
	mutex_exit(&dld_taskq_lock);
}

/*
 * Process an M_IOCTL message.
 */
static void
dld_ioc(dld_str_t *dsp, mblk_t *mp)
{
	uint_t cmd;

	cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
	ASSERT(dsp->ds_type == DLD_DLPI);

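	/*
	 * DLD-private ioctls are handled here; anything unrecognized is
	 * passed on to the MAC layer through the catch-all ioc() handler.
	 */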
	switch (cmd) {
	case DLIOCNATIVE:
		ioc_native(dsp, mp);
		break;
	case DLIOCMARGININFO:
		ioc_margin(dsp, mp);
		break;
	case DLIOCRAW:
		ioc_raw(dsp, mp);
		break;
	case DLIOCHDRINFO:
		ioc_fast(dsp, mp);
		break;
	case DLIOCLOWLINK:
		ioc_lowlink(dsp, mp);
		break;
	default:
		ioc(dsp, mp);
	}
}

/*
 * DLIOCNATIVE
 */
static void
ioc_native(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	const mac_info_t *mip = dsp->ds_mip;

	/*
	 * Native mode can be enabled only if it is not already enabled and
	 * the native media type differs from the media type currently
	 * advertised to the client.
	 */
	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
		dsp->ds_native = B_TRUE;

	if (dsp->ds_native)
		miocack(q, mp, 0, mip->mi_nativemedia);
	else
		miocnak(q, mp, 0, ENOTSUP);
}

/*
 * DLIOCMARGININFO
 */
static void
ioc_margin(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	uint32_t margin;
	int err;

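	/*
	 * The stream must be attached to a MAC before the margin can be
	 * queried.
	 */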
	if (dsp->ds_dlstate == DL_UNATTACHED) {
		err = EINVAL;
		goto failed;
	}
	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
		goto failed;

	mac_margin_get(dsp->ds_mh, &margin);
	*((uint32_t *)mp->b_cont->b_rptr) = margin;
	miocack(q, mp, sizeof (uint32_t), 0);
	return;

failed:
	miocnak(q, mp, 0, err);
}

/*
 * DLIOCRAW
 */
static void
ioc_raw(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	mac_perim_handle_t mph;

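	/*
	 * If the stream is not yet attached to a MAC, simply record that
	 * raw mode has been requested.
	 */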
	if (dsp->ds_mh == NULL) {
		dsp->ds_mode = DLD_RAW;
		miocack(q, mp, 0, 0);
		return;
	}

	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
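	/*
	 * Raw mode cannot be enabled while the IP polling or direct
	 * call-back capabilities are in use.
	 */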
	if (dsp->ds_polling || dsp->ds_direct) {
		mac_perim_exit(mph);
		miocnak(q, mp, 0, EPROTO);
		return;
	}

	if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) {
		/*
		 * Set the receive callback.
		 */
		dls_rx_set(dsp, dld_str_rx_raw, dsp);
	}

	/*
	 * Note that raw mode is enabled.
	 */
	dsp->ds_mode = DLD_RAW;
	mac_perim_exit(mph);

	miocack(q, mp, 0, 0);
}

/*
 * DLIOCHDRINFO
 */
static void
ioc_fast(dld_str_t *dsp, mblk_t *mp)
{
	dl_unitdata_req_t *dlp;
	off_t off;
	size_t len;
	const uint8_t *addr;
	uint16_t sap;
	mblk_t *nmp;
	mblk_t *hmp;
	uint_t addr_length;
	queue_t *q = dsp->ds_wq;
	int err;
	mac_perim_handle_t mph;

	if (dld_opt & DLD_OPT_NO_FASTPATH) {
		err = ENOTSUP;
		goto failed;
	}

	/*
	 * DLIOCHDRINFO should only come from IP; requests initiated from
	 * user-land are not allowed.
	 */
	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
		err = EINVAL;
		goto failed;
	}

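	/*
	 * The payload must be a well-formed DL_UNITDATA_REQ; the
	 * destination address and SAP it carries are used to generate the
	 * MAC header.
	 */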
	nmp = mp->b_cont;
	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
		err = EINVAL;
		goto failed;
	}

	off = dlp->dl_dest_addr_offset;
	len = dlp->dl_dest_addr_length;

	if (!MBLKIN(nmp, off, len)) {
		err = EINVAL;
		goto failed;
	}

	if (dsp->ds_dlstate != DL_IDLE) {
		err = ENOTSUP;
		goto failed;
	}

	addr_length = dsp->ds_mip->mi_addr_length;
	if (len != addr_length + sizeof (uint16_t)) {
		err = EINVAL;
		goto failed;
	}

	addr = nmp->b_rptr + off;
	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);

	if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) {
		err = ENOMEM;
		goto failed;
	}

	/*
	 * This ioctl might happen concurrently with a direct call to dld_capab
	 * that tries to enable direct and/or poll capabilities. Since the
	 * stack does not serialize them, we do so here to avoid mixing
	 * the callbacks.
	 */
	mac_perim_enter_by_mh(dsp->ds_mh, &mph);
	if (dsp->ds_mode != DLD_FASTPATH) {
		/*
		 * Set the receive callback (unless polling is enabled).
		 */
		if (!dsp->ds_polling && !dsp->ds_direct)
			dls_rx_set(dsp, dld_str_rx_fastpath, dsp);

		/*
		 * Note that fast-path mode is enabled.
		 */
		dsp->ds_mode = DLD_FASTPATH;
	}
	mac_perim_exit(mph);

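	/*
	 * Replace whatever followed the DL_UNITDATA_REQ with the generated
	 * MAC header and hand both back to the caller.
	 */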
	freemsg(nmp->b_cont);
	nmp->b_cont = hmp;

	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
	return;
failed:
	miocnak(q, mp, 0, err);
}

/*
 * DLIOCLOWLINK: request actual link state changes. When the
 * link is part of a bridge instance the client receives actual
 * link state changes and not the aggregate link status. Used by
 * the bridging daemon (bridged) for proper RSTP operation.
 */
static void
ioc_lowlink(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	int err;

	if ((err = miocpullup(mp, sizeof (int))) != 0) {
		miocnak(q, mp, 0, err);
	} else {
		/* LINTED: alignment */
		dsp->ds_lowlink = *(boolean_t *)mp->b_cont->b_rptr;
		miocack(q, mp, 0, 0);
	}
}

/*
 * Catch-all handler.
 */
static void
ioc(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;

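	/*
	 * The stream must be attached so that the ioctl can be passed on
	 * to the underlying MAC.
	 */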
	if (dsp->ds_dlstate == DL_UNATTACHED) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}
	mac_ioctl(dsp->ds_mh, q, mp);
}
