/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Data-Link Driver
 */

#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/atomic.h>
#include <sys/disp.h>
#include <sys/callb.h>
#include <sys/vlan.h>
#include <sys/dld.h>
#include <sys/dld_impl.h>
#include <sys/dls_impl.h>
#include <inet/common.h>

/*
 * Forward declarations: dld_str_t kmem cache callbacks, MAC notification
 * handlers, ioctl helpers and the internal transmit-queue machinery, all
 * defined later in this file.
 */
static int	str_constructor(void *, void *, int);
static void	str_destructor(void *, void *);
static mblk_t	*str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t);
static void	str_notify_promisc_on_phys(dld_str_t *);
static void	str_notify_promisc_off_phys(dld_str_t *);
static void	str_notify_phys_addr(dld_str_t *, const uint8_t *);
static void	str_notify_link_up(dld_str_t *);
static void	str_notify_link_down(dld_str_t *);
static void	str_notify_capab_reneg(dld_str_t *);
static void	str_notify_speed(dld_str_t *, uint32_t);
static void	str_notify(void *, mac_notify_type_t);

static void	ioc_native(dld_str_t *, mblk_t *);
static void	ioc_margin(dld_str_t *, mblk_t *);
static void	ioc_raw(dld_str_t *, mblk_t *);
static void	ioc_fast(dld_str_t *, mblk_t *);
static void	ioc(dld_str_t *, mblk_t *);
static void	dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t,
		    uint_t, uint_t);
static void	dld_wput_nondata(dld_str_t *, mblk_t *);
static void	dld_wput_nondata_task(void *);
static void	dld_flush_nondata(dld_str_t *);
static mblk_t	*i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t);
static mblk_t	*i_dld_ether_header_strip_tag(mblk_t *);

/* Number of dld_str_t objects currently allocated (see dld_str_fini()). */
static uint32_t		str_count;
/* kmem cache from which dld_str_t objects are allocated. */
static kmem_cache_t	*str_cachep;
/* Taskq used to process non-data (DLPI control/ioctl) messages. */
static taskq_t		*dld_disp_taskq = NULL;
/* Hash of all dld_str_t's, keyed by clone minor number. */
static mod_hash_t	*str_hashp;

#define	STR_HASHSZ		64
#define	STR_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))

static inline uint_t mp_getsize(mblk_t *);

/*
 * Interval to count the TX queued depth. Default is 1s (1000000us).
 * Count the queue depth immediately (not by timeout) if this is set to 0.
 * See more details above dld_tx_enqueue().
 */
uint_t tx_qdepth_interval = 1000000;

/*
 * Some notes on entry points, flow-control, queueing and locking:
 *
 * This driver exports the traditional STREAMS put entry point as well as
 * the non-STREAMS fast-path transmit routine which is provided to IP via
 * the DL_CAPAB_POLL negotiation. The put procedure handles all control
 * and data operations, while the fast-path routine deals only with M_DATA
 * fast-path packets. Regardless of the entry point, all outbound packets
 * will end up in dld_tx_single(), where they will be delivered to the MAC
 * driver.
 *
 * The transmit logic operates in two modes: a "not busy" mode where the
 * packets will be delivered to the MAC for a send attempt, or "busy" mode
 * where they will be enqueued in the internal queue because of flow-control.
 * Flow-control happens when the MAC driver indicates the packets couldn't
 * be transmitted due to lack of resources (e.g. running out of descriptors).
 * In such case, the driver will place a dummy message on its write-side
 * STREAMS queue so that the queue is marked as "full". Any subsequent
 * packets arriving at the driver will be enqueued in the internal queue,
 * which is drained in the context of the service thread that gets scheduled
 * whenever the driver is in the "busy" mode. When all packets have been
 * successfully delivered by MAC and the internal queue is empty, it will
 * transition to the "not busy" mode by removing the dummy message from the
 * write-side STREAMS queue; in effect this will trigger backenabling.
 * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
 * to the above reasons.
 *
 * The driver implements an internal transmit queue independent of STREAMS.
 * This allows for flexibility and provides a fast enqueue/dequeue mechanism
 * compared to the putq() and getq() STREAMS interfaces. The only putq() and
 * getq() operations done by the driver are those related to placing and
 * removing the dummy message to/from the write-side STREAMS queue for flow-
 * control purposes.
 *
 * Locking is done independent of STREAMS due to the driver being fully MT.
 * Threads entering the driver (either from put or service entry points)
 * will most likely be readers, with the exception of a few writer cases
 * such those handling DLPI attach/detach/bind/unbind/etc. or any of the
 * DLD-related ioctl requests. The DLPI detach case is special, because
 * it involves freeing resources and therefore must be single-threaded.
 * Unfortunately the readers/writers lock can't be used to protect against
 * it, because the lock is dropped prior to the driver calling places where
 * putnext() may be invoked, and such places may depend on those resources
 * to exist. Because of this, the driver always completes the DLPI detach
 * process when there are no other threads running in the driver. This is
 * done by keeping track of the number of threads, such that the last
 * thread leaving the driver will finish the pending DLPI detach operation.
 */

/*
 * dld_max_q_count is the queue depth threshold used to limit the number of
 * outstanding packets or bytes allowed in the queue; once this limit is
 * reached the driver will free any incoming ones until the queue depth
 * drops below the threshold.
 *
 * This buffering is provided to accommodate clients which do not employ
 * their own buffering scheme, and to handle occasional packet bursts.
 * Clients which handle their own buffering will receive positive feedback
 * from this driver as soon as it transitions into the "busy" state, i.e.
 * when the queue is initially filled up; they will get backenabled once
 * the queue is empty.
 *
 * The value chosen here is rather arbitrary; in future some intelligent
 * heuristics may be involved which could take into account the hardware's
 * transmit ring size, etc.
 */
uint_t dld_max_q_count = (16 * 1024 * 1024);

/*
 * dld_finddevinfo() returns the dev_info_t * corresponding to a particular
 * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that
 * match dev_t. If a stream is found and it is attached, its dev_info_t *
 * is returned.
 */
/* Walker state: the dev_t being searched for and the result dip. */
typedef struct i_dld_str_state_s {
	major_t		ds_major;
	minor_t		ds_minor;
	dev_info_t	*ds_dip;
} i_dld_str_state_t;

/*
 * mod_hash walker callback for dld_finddevinfo(): match a stream by
 * major/minor and, if attached, record its devinfo in the walker state.
 */
/* ARGSUSED */
static uint_t
i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	i_dld_str_state_t	*statep = arg;
	dld_str_t		*dsp = (dld_str_t *)val;

	if (statep->ds_major != dsp->ds_major)
		return (MH_WALK_CONTINUE);

	ASSERT(statep->ds_minor != 0);

	/*
	 * Access to ds_mh needs to be protected by ds_lock.
	 */
	rw_enter(&dsp->ds_lock, RW_READER);
	if (statep->ds_minor == dsp->ds_minor) {
		/*
		 * Clone: a clone minor is unique. We can terminate the
		 * walk if we find a matching stream -- even if we fail
		 * to obtain the devinfo.
		 */
		if (dsp->ds_mh != NULL)
			statep->ds_dip = mac_devinfo_get(dsp->ds_mh);
		rw_exit(&dsp->ds_lock);
		return (MH_WALK_TERMINATE);
	}
	rw_exit(&dsp->ds_lock);
	return (MH_WALK_CONTINUE);
}

/*
 * Resolve a dev_t to its dev_info_t; returns NULL if no attached stream
 * (or link minor node) matches. See block comment above.
 */
static dev_info_t *
dld_finddevinfo(dev_t dev)
{
	dev_info_t		*dip;
	i_dld_str_state_t	state;

	if (getminor(dev) == 0)
		return (NULL);

	/*
	 * See if it's a minor node of a link
	 */
	if ((dip = dls_finddevinfo(dev)) != NULL)
		return (dip);

	state.ds_minor = getminor(dev);
	state.ds_major = getmajor(dev);
	state.ds_dip = NULL;

	mod_hash_walk(str_hashp, i_dld_str_walker, &state);
	return (state.ds_dip);
}

/*
 * devo_getinfo: getinfo(9e)
 */
/*ARGSUSED*/
int
dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
{
	dev_info_t	*devinfo;
	minor_t		minor = getminor((dev_t)arg);
	int		rc = DDI_FAILURE;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
			*(dev_info_t **)resp = devinfo;
			rc = DDI_SUCCESS;
		}
		break;
	case DDI_INFO_DEVT2INSTANCE:
		/*
		 * Minors up to DLS_MAX_MINOR encode the instance directly;
		 * larger (clone) minors require the devinfo lookup.
		 */
		if (minor > 0 && minor <= DLS_MAX_MINOR) {
			*resp = (void *)(uintptr_t)DLS_MINOR2INST(minor);
			rc = DDI_SUCCESS;
		} else if (minor > DLS_MAX_MINOR &&
		    (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) {
			*resp = (void *)(uintptr_t)ddi_get_instance(devinfo);
			rc = DDI_SUCCESS;
		}
		break;
	}
	return (rc);
}

/*
 * qi_qopen: open(9e)
 */
/*ARGSUSED*/
int
dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	dld_str_t	*dsp;
	major_t		major;
	minor_t		minor;
	int		err;

	if (sflag == MODOPEN)
		return (ENOTSUP);

	/*
	 * This is a cloning driver and therefore each queue should only
	 * ever get opened once.
	 */
	if (rq->q_ptr != NULL)
		return (EBUSY);

	major = getmajor(*devp);
	minor = getminor(*devp);

	/*
	 * Create a new dld_str_t for the stream. This will grab a new minor
	 * number that will be handed back in the cloned dev_t. Creation may
	 * fail if we can't allocate the dummy mblk used for flow-control.
	 * A zero minor means a style-2 (DL_ATTACH_REQ later) open.
	 */
	dsp = dld_str_create(rq, DLD_DLPI, major,
	    ((minor == 0) ? DL_STYLE2 : DL_STYLE1));
	if (dsp == NULL)
		return (ENOSR);

	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
	if (minor != 0) {
		/*
		 * Style 1 open
		 */
		if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0)
			goto failed;
		ASSERT(dsp->ds_dlstate == DL_UNBOUND);
	} else {
		(void) qassociate(rq, -1);
	}

	/*
	 * Enable the queue srv(9e) routine.
	 */
	qprocson(rq);

	/*
	 * Construct a cloned dev_t to hand back.
	 */
	*devp = makedevice(getmajor(*devp), dsp->ds_minor);
	return (0);

failed:
	dld_str_destroy(dsp);
	return (err);
}

/*
 * qi_qclose: close(9e)
 */
int
dld_close(queue_t *rq)
{
	dld_str_t	*dsp = rq->q_ptr;

	/*
	 * Disable the queue srv(9e) routine.
	 */
	qprocsoff(rq);

	/* Wait for any queued control/ioctl work to drain. */
	dld_finish_pending_task(dsp);

	/*
	 * This stream was open to a provider node. Check to see
	 * if it has been cleanly shut down.
	 */
	if (dsp->ds_dlstate != DL_UNATTACHED) {
		/*
		 * The stream is either open to a style 1 provider or
		 * this is not clean shutdown. Detach from the PPA.
		 * (This is still ok even in the style 1 case).
		 */
		dld_str_detach(dsp);
	}

	dld_str_destroy(dsp);
	return (0);
}

/*
 * qi_qputp: put(9e)
 *
 * Data messages (M_DATA and DL_UNITDATA_REQ) go straight to the transmit
 * path under the DLD_TX perimeter; everything else takes the slow
 * dld_wput_nondata() control path.
 */
void
dld_wput(queue_t *wq, mblk_t *mp)
{
	dld_str_t *dsp = wq->q_ptr;

	switch (DB_TYPE(mp)) {
	case M_DATA: {
		dld_tx_t tx;

		DLD_TX_ENTER(dsp);
		/* ds_tx is NULL when the stream can't transmit; drop. */
		if ((tx = dsp->ds_tx) != NULL)
			tx(dsp, mp);
		else
			freemsg(mp);
		DLD_TX_EXIT(dsp);
		break;
	}
	case M_PROTO:
	case M_PCPROTO: {
		t_uscalar_t	prim;
		dld_tx_t	tx;

		/* Too short to carry a DLPI primitive; drop. */
		if (MBLKL(mp) < sizeof (t_uscalar_t)) {
			freemsg(mp);
			return;
		}

		prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive;
		if (prim != DL_UNITDATA_REQ) {
			/* Control path */
			dld_wput_nondata(dsp, mp);
			break;
		}

		/* Data path */
		DLD_TX_ENTER(dsp);
		if ((tx = dsp->ds_unitdata_tx) != NULL)
			tx(dsp, mp);
		else
			dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
		DLD_TX_EXIT(dsp);
		break;
	}
	case M_IOCTL:
	case M_IOCDATA:
		/* Control path */
		dld_wput_nondata(dsp, mp);
		break;
	case M_FLUSH:
		/*
		 * Flush both the data messages and the control messages.
		 */
		if (*mp->b_rptr & FLUSHW) {
			dld_flush_nondata(dsp);
			dld_tx_flush(dsp);
			*mp->b_rptr &= ~FLUSHW;
		}

		if (*mp->b_rptr & FLUSHR) {
			qreply(wq, mp);
		} else {
			freemsg(mp);
		}
		break;
	default:
		freemsg(mp);
		break;
	}
}

/*
 * Called by GLDv3 control node to process the ioctls. It will start
 * a taskq to allow the ioctl processing to block. This is a temporary
 * solution, and will be replaced by a more graceful approach afterwards.
 */
void
dld_ioctl(queue_t *wq, mblk_t *mp)
{
	dld_wput_nondata(wq->q_ptr, mp);
}

/*
 * qi_srvp: srv(9e)
 *
 * Drains the internal transmit queue built up while flow-controlled and,
 * if it empties, lifts flow-control by pulling the dummy mblk back off
 * the write-side STREAMS queue (which backenables upstream writers).
 */
void
dld_wsrv(queue_t *wq)
{
	mblk_t		*mp, *head, *tail;
	dld_str_t	*dsp = wq->q_ptr;
	uint_t		cnt, msgcnt;
	timeout_id_t	tid = 0;

	rw_enter(&dsp->ds_lock, RW_READER);
	/*
	 * Grab all packets (chained via b_next) off our transmit queue
	 * and try to send them all to the MAC layer. Since the queue
	 * is independent of streams, we are able to dequeue all messages
	 * at once without looping through getq() and manually chaining
	 * them. Note that the queue size parameters (byte and message
	 * counts) are cleared as well, but we postpone the backenabling
	 * until after the MAC transmit since some packets may end up
	 * back at our transmit queue.
	 */
	mutex_enter(&dsp->ds_tx_list_lock);
	if ((mp = dsp->ds_tx_list_head) == NULL) {
		ASSERT(!dsp->ds_tx_qbusy);
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		ASSERT(dsp->ds_tx_list_head == NULL);
		ASSERT(dsp->ds_tx_list_tail == NULL);
		ASSERT(dsp->ds_tx_cnt == 0);
		ASSERT(dsp->ds_tx_msgcnt == 0);
		mutex_exit(&dsp->ds_tx_list_lock);
		rw_exit(&dsp->ds_lock);
		return;
	}
	head = mp;
	tail = dsp->ds_tx_list_tail;
	dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
	cnt = dsp->ds_tx_cnt;
	msgcnt = dsp->ds_tx_msgcnt;
	dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Discard packets unless we are attached and bound; note that
	 * the driver mode (fastpath/raw/unitdata) is irrelevant here,
	 * because regardless of the mode all transmit will end up in
	 * dld_tx_single() where the packets may be queued.
	 */
	ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA));
	if (dsp->ds_dlstate != DL_IDLE) {
		freemsgchain(mp);
		goto done;
	}

	/*
	 * Attempt to transmit one or more packets. If the MAC can't
	 * send them all, re-queue the packet(s) at the beginning of
	 * the transmit queue to avoid any re-ordering.
	 */
	mp = dls_tx(dsp->ds_dc, mp);
	if (mp == head) {
		/*
		 * No message was sent out. Take the saved queue depth
		 * as the input, so that dld_tx_enqueue() need not
		 * calculate it again.
		 */
		dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt);
	} else if (mp != NULL) {
		/*
		 * Some but not all messages were sent out. dld_tx_enqueue()
		 * needs to start the timer to calculate the queue depth if
		 * timer has not been started.
		 *
		 * Note that a timer is used to calculate the queue depth
		 * to improve network performance, especially for TCP, in
		 * which case packets are sent without canput() being checked,
		 * and mostly end up in dld_tx_enqueue() under heavy load.
		 */
		dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0);
	}

done:
	/*
	 * Grab the list lock again and check if the transmit queue is
	 * really empty; if so, lift up flow-control and backenable any
	 * writer queues. If the queue is not empty, schedule service
	 * thread to drain it.
	 */
	mutex_enter(&dsp->ds_tx_list_lock);
	if (dsp->ds_tx_list_head == NULL) {
		dsp->ds_tx_flow_mp = getq(wq);
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		dsp->ds_tx_qbusy = B_FALSE;
		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
			dsp->ds_tx_qdepth_tid = 0;
	}
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Note that ds_tx_list_lock (which is acquired by the timeout
	 * callback routine) cannot be held across the call to untimeout().
	 */
	if (tid != 0)
		(void) untimeout(tid);

	rw_exit(&dsp->ds_lock);
}

/*
 * Build and install the streamtab/qinit/module_info structures for a
 * GLDv3 driver's dev_ops; freed again by dld_fini_ops().
 */
void
dld_init_ops(struct dev_ops *ops, const char *name)
{
	struct streamtab *stream;
	struct qinit *rq, *wq;
	struct module_info *modinfo;

	modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP);
	modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP);
	(void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name);
	modinfo->mi_minpsz = 0;
	modinfo->mi_maxpsz = 64*1024;
	/* hiwat/lowat of 1/0: flow-control is driven by the dummy mblk. */
	modinfo->mi_hiwat = 1;
	modinfo->mi_lowat = 0;

	rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
	rq->qi_qopen = dld_open;
	rq->qi_qclose = dld_close;
	rq->qi_minfo = modinfo;

	wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP);
	wq->qi_putp = (pfi_t)dld_wput;
	wq->qi_srvp = (pfi_t)dld_wsrv;
	wq->qi_minfo = modinfo;

	stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP);
	stream->st_rdinit = rq;
	stream->st_wrinit = wq;
	ops->devo_cb_ops->cb_str = stream;

	ops->devo_getinfo = &dld_getinfo;
}

/*
 * Undo dld_init_ops(): free the structures it allocated.
 */
void
dld_fini_ops(struct dev_ops *ops)
{
	struct streamtab *stream;
	struct qinit *rq, *wq;
	struct module_info *modinfo;

	stream = ops->devo_cb_ops->cb_str;
	rq = stream->st_rdinit;
	wq = stream->st_wrinit;
	modinfo = rq->qi_minfo;
	ASSERT(wq->qi_minfo == modinfo);

	kmem_free(stream, sizeof (struct streamtab));
	kmem_free(wq, sizeof (struct qinit));
	kmem_free(rq, sizeof (struct qinit));
	kmem_free(modinfo->mi_idname, FMNAMESZ);
	kmem_free(modinfo, sizeof (struct module_info));
}

/*
 * Initialize this module's data structures.
 */
void
dld_str_init(void)
{
	/*
	 * Create dld_str_t object cache.
	 */
	str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t),
	    0, str_constructor, str_destructor, NULL, NULL, NULL, 0);
	ASSERT(str_cachep != NULL);

	/*
	 * Create taskq to process DLPI requests.
	 */
	dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2,
	    INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE);

	/*
	 * Create a hash table for maintaining dld_str_t's.
	 * The ds_minor field (the clone minor number) of a dld_str_t
	 * is used as a key for this hash table because this number is
	 * globally unique (allocated from "dls_minor_arena").
	 */
	str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ,
	    mod_hash_null_valdtor);
}

/*
 * Tear down this module's data structures. Returns EBUSY if any
 * dld_str_t objects are still allocated.
 */
int
dld_str_fini(void)
{
	/*
	 * Make sure that there are no objects in use.
	 */
	if (str_count != 0)
		return (EBUSY);

	ASSERT(dld_disp_taskq != NULL);
	taskq_destroy(dld_disp_taskq);
	dld_disp_taskq = NULL;

	/*
	 * Destroy object cache.
	 */
	kmem_cache_destroy(str_cachep);
	mod_hash_destroy_idhash(str_hashp);
	return (0);
}

/*
 * Create a new dld_str_t object. Returns NULL only if the dummy
 * flow-control mblk cannot be allocated.
 */
dld_str_t *
dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style)
{
	dld_str_t	*dsp;
	int		err;

	/*
	 * Allocate an object from the cache.
	 */
	atomic_add_32(&str_count, 1);
	dsp = kmem_cache_alloc(str_cachep, KM_SLEEP);

	/*
	 * Allocate the dummy mblk for flow-control.
	 */
	dsp->ds_tx_flow_mp = allocb(1, BPRI_HI);
	if (dsp->ds_tx_flow_mp == NULL) {
		kmem_cache_free(str_cachep, dsp);
		atomic_add_32(&str_count, -1);
		return (NULL);
	}
	dsp->ds_type = type;
	dsp->ds_major = major;
	dsp->ds_style = style;
	dsp->ds_tx = dsp->ds_unitdata_tx = NULL;

	/*
	 * Initialize the queue pointers.
	 */
	ASSERT(RD(rq) == rq);
	dsp->ds_rq = rq;
	dsp->ds_wq = WR(rq);
	rq->q_ptr = WR(rq)->q_ptr = (void *)dsp;

	/*
	 * We want explicit control over our write-side STREAMS queue
	 * where the dummy mblk gets added/removed for flow-control.
	 */
	noenable(WR(rq));

	/* ds_minor was allocated by str_constructor() and is unique. */
	err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor),
	    (mod_hash_val_t)dsp);
	ASSERT(err == 0);
	return (dsp);
}

/*
 * Block until the taskq worker has processed all of this stream's
 * pending control requests; called from close(9e).
 */
void
dld_finish_pending_task(dld_str_t *dsp)
{
	/*
	 * Wait until the pending requests are processed by the worker thread.
	 */
	mutex_enter(&dsp->ds_disp_lock);
	dsp->ds_closing = B_TRUE;
	while (dsp->ds_tid != NULL)
		cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock);
	dsp->ds_closing = B_FALSE;
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * Destroy a dld_str_t object.
 */
void
dld_str_destroy(dld_str_t *dsp)
{
	queue_t		*rq;
	queue_t		*wq;
	mod_hash_val_t	val;
	/*
	 * Clear the queue pointers.
	 */
	rq = dsp->ds_rq;
	wq = dsp->ds_wq;
	ASSERT(wq == WR(rq));

	rq->q_ptr = wq->q_ptr = NULL;
	dsp->ds_rq = dsp->ds_wq = NULL;

	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
	ASSERT(dsp->ds_tx_list_head == NULL);
	ASSERT(dsp->ds_tx_list_tail == NULL);
	ASSERT(dsp->ds_tx_cnt == 0);
	ASSERT(dsp->ds_tx_msgcnt == 0);
	ASSERT(dsp->ds_tx_qdepth_tid == 0);
	ASSERT(!dsp->ds_tx_qbusy);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
	ASSERT(dsp->ds_pending_head == NULL);
	ASSERT(dsp->ds_pending_tail == NULL);
	ASSERT(dsp->ds_tx == NULL);
	ASSERT(dsp->ds_unitdata_tx == NULL);

	/*
	 * Reinitialize all the flags.
	 */
	dsp->ds_notifications = 0;
	dsp->ds_passivestate = DLD_UNINITIALIZED;
	dsp->ds_mode = DLD_UNITDATA;
	dsp->ds_native = B_FALSE;

	/*
	 * Free the dummy mblk if exists.
	 */
	if (dsp->ds_tx_flow_mp != NULL) {
		freeb(dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	(void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val);
	ASSERT(dsp == (dld_str_t *)val);

	/*
	 * Free the object back to the cache.
	 */
	kmem_cache_free(str_cachep, dsp);
	atomic_add_32(&str_count, -1);
}

/*
 * kmem_cache constructor function: see kmem_cache_create(9f).
 * Returns -1 (allocation failure) if no minor number is available.
 */
/*ARGSUSED*/
static int
str_constructor(void *buf, void *cdrarg, int kmflags)
{
	dld_str_t	*dsp = buf;

	bzero(buf, sizeof (dld_str_t));

	/*
	 * Allocate a new minor number.
	 */
	if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0)
		return (-1);

	/*
	 * Initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;

	rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL);

	return (0);
}

/*
 * kmem_cache destructor function.
 */
/*ARGSUSED*/
static void
str_destructor(void *buf, void *cdrarg)
{
	dld_str_t	*dsp = buf;

	/*
	 * Make sure the DLPI state machine was reset.
	 */
	ASSERT(dsp->ds_dlstate == DL_UNATTACHED);

	/*
	 * Make sure the data-link interface was closed.
	 */
	ASSERT(dsp->ds_mh == NULL);
	ASSERT(dsp->ds_dc == NULL);
	ASSERT(dsp->ds_tx == NULL);
	ASSERT(dsp->ds_unitdata_tx == NULL);
	ASSERT(dsp->ds_intx_cnt == 0);
	ASSERT(dsp->ds_detaching == B_FALSE);

	/*
	 * Make sure enabled notifications are cleared.
	 */
	ASSERT(dsp->ds_notifications == 0);

	/*
	 * Make sure polling is disabled.
	 */
	ASSERT(!dsp->ds_polling);

	/*
	 * Release the minor number.
	 */
	mac_minor_rele(dsp->ds_minor);

	ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
	rw_destroy(&dsp->ds_lock);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
	mutex_destroy(&dsp->ds_tx_list_lock);
	ASSERT(dsp->ds_tx_flow_mp == NULL);
	ASSERT(dsp->ds_pending_head == NULL);
	ASSERT(dsp->ds_pending_tail == NULL);
	ASSERT(!dsp->ds_closing);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
	mutex_destroy(&dsp->ds_disp_lock);
	cv_destroy(&dsp->ds_disp_cv);

	ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock));
	mutex_destroy(&dsp->ds_tx_lock);
	cv_destroy(&dsp->ds_tx_cv);
}

/*
 * Deliver one packet (or chain) to the MAC layer, enqueueing it on the
 * internal transmit queue if we are flow-controlled or the MAC couldn't
 * take it.
 */
void
dld_tx_single(dld_str_t *dsp, mblk_t *mp)
{
	/*
	 * If we are busy enqueue the packet and return.
	 * Otherwise hand them over to the MAC driver for transmission.
	 * If the message didn't get sent it will be queued.
	 *
	 * Note here that we don't grab the list lock prior to checking
	 * the busy flag. This is okay, because a missed transition
	 * will not cause any packet reordering for any particular TCP
	 * connection (which is single-threaded). The enqueue routine
	 * will atomically set the busy flag and schedule the service
	 * thread to run; the flag is only cleared by the service thread
	 * when there is no more packet to be transmitted.
	 */

	if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL))
		dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp));
}

/*
 * Update the priority bits and VID (may need to insert tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet.
 */
static mblk_t *
i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid)
{
	mblk_t *hmp;
	struct ether_vlan_header *evhp;
	struct ether_header *ehp;
	uint16_t old_tci = 0;
	size_t len;

	ASSERT(pri != 0 || vid != VLAN_ID_NONE);

	evhp = (struct ether_vlan_header *)mp->b_rptr;
	if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) {
		/*
		 * Tagged packet, update the priority bits.
		 */
		old_tci = ntohs(evhp->ether_tci);
		len = sizeof (struct ether_vlan_header);

		if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) {
			/*
			 * In case some drivers only check the db_ref
			 * count of the first mblk, we pullup the
			 * message into a single mblk.
			 */
			hmp = msgpullup(mp, -1);
			if ((hmp == NULL) || (MBLKL(hmp) < len)) {
				freemsg(hmp);
				return (NULL);
			} else {
				freemsg(mp);
				mp = hmp;
			}
		}

		evhp = (struct ether_vlan_header *)mp->b_rptr;
	} else {
		/*
		 * Untagged packet. Insert the special priority tag.
		 * First allocate a header mblk.
		 */
		hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED);
		if (hmp == NULL)
			return (NULL);

		evhp = (struct ether_vlan_header *)hmp->b_rptr;
		ehp = (struct ether_header *)mp->b_rptr;

		/*
		 * Copy the MAC addresses and typelen
		 */
		bcopy(ehp, evhp, (ETHERADDRL * 2));
		evhp->ether_type = ehp->ether_type;
		evhp->ether_tpid = htons(ETHERTYPE_VLAN);

		hmp->b_wptr += sizeof (struct ether_vlan_header);
		mp->b_rptr += sizeof (struct ether_header);

		/*
		 * Free the original message if it's now empty. Link the
		 * rest of the messages to the header message.
		 */
		if (MBLKL(mp) == 0) {
			hmp->b_cont = mp->b_cont;
			freeb(mp);
		} else {
			hmp->b_cont = mp;
		}
		mp = hmp;
	}

	/* Preserve the original priority/VID where no override was given. */
	if (pri == 0)
		pri = VLAN_PRI(old_tci);
	if (vid == VLAN_ID_NONE)
		vid = VLAN_ID(old_tci);
	evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid));
	return (mp);
}

/*
 * M_DATA put
 *
 * The poll callback function for DLS clients which are not in the per-stream
 * mode. This function is called from an upper layer protocol (currently only
 * tcp and udp).
 */
void
str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp)
{
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *newmp;
	uint_t pri;

	if (is_ethernet) {
		/*
		 * Update the priority bits to the assigned priority.
		 */
		pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp);

		if (pri != 0) {
			newmp = i_dld_ether_header_update_tag(mp, pri,
			    VLAN_ID_NONE);
			if (newmp == NULL)
				goto discard;
			mp = newmp;
		}
	}

	dld_tx_single(dsp, mp);
	return;

discard:
	/* TODO: bump kstat? */
	freemsg(mp);
}

/*
 * M_DATA put (DLIOCRAW mode).
 *
 * The client supplies the full MAC header; validate it (size, VID) and
 * fix up the VLAN tag before handing the packet to the MAC.
 */
void
str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp)
{
	boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t *bp, *newmp;
	size_t size;
	mac_header_info_t mhi;
	uint_t pri, vid;
	uint_t max_sdu;

	/*
	 * Certain MAC type plugins provide an illusion for raw DLPI
	 * consumers. They pretend that the MAC layer is something that
	 * it's not for the benefit of observability tools. For example,
	 * mac_wifi pretends that it's Ethernet for such consumers.
	 * Here, unless native mode is enabled, we call into the MAC layer so
	 * that this illusion can be maintained. The plugin will optionally
	 * transform the MAC header here into something that can be passed
	 * down. The header goes from raw mode to "cooked" mode.
	 */
	if (!dsp->ds_native) {
		if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL)
			goto discard;
		mp = newmp;
	}

	size = MBLKL(mp);

	/*
	 * Check the packet is not too big and that any remaining
	 * fragment list is composed entirely of M_DATA messages. (We
	 * know the first fragment was M_DATA otherwise we could not
	 * have got here).
	 */
	for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) {
		if (DB_TYPE(bp) != M_DATA)
			goto discard;
		size += MBLKL(bp);
	}

	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
		goto discard;

	mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
	/*
	 * If LSO is enabled, check the size against lso_max. Otherwise,
	 * compare the packet size with max_sdu.
	 */
	max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu;
	if (size > max_sdu + mhi.mhi_hdrsize)
		goto discard;

	if (is_ethernet) {
		/*
		 * Discard the packet if this is a VLAN stream but the VID in
		 * the packet is not correct.
		 */
		vid = VLAN_ID(mhi.mhi_tci);
		if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE))
			goto discard;

		/*
		 * Discard the packet if this packet is a tagged packet
		 * but both pri and VID are 0.
		 */
		pri = VLAN_PRI(mhi.mhi_tci);
		if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE))
			goto discard;

		/*
		 * Update the priority bits to the per-stream priority if
		 * priority is not set in the packet. Update the VID for
		 * packets on a VLAN stream.
		 */
		pri = (pri == 0) ? dsp->ds_pri : 0;
		if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) {
			if ((newmp = i_dld_ether_header_update_tag(mp,
			    pri, dsp->ds_vid)) == NULL) {
				goto discard;
			}
			mp = newmp;
		}
	}

	dld_tx_single(dsp, mp);
	return;

discard:
	/* TODO: bump kstat? */
	freemsg(mp);
}

/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1).
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t			dev;
	int			err;
	const char		*drvname;
	dls_channel_t		dc;
	uint_t			addr_length;
	boolean_t		qassociated = B_FALSE;

	ASSERT(dsp->ds_dc == NULL);

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	/*
	 * Open a channel.
	 */
	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) {
		/*
		 * style-2 VLAN open, this is a /dev VLAN ppa open
		 * which might result in a newly created dls_vlan_t.
		 */
		err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc);
		if (err != 0) {
			if (qassociated)
				(void) qassociate(dsp->ds_wq, -1);
			return (err);
		}
	} else {
		dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
		if ((err = dls_open_by_dev(dev, &dc)) != 0) {
			if (qassociated)
				(void) qassociate(dsp->ds_wq, -1);
			return (err);
		}
	}

	/*
	 * Cache the MAC interface handle, a pointer to the immutable MAC
	 * information and the current and 'factory' MAC address.
	 */
	dsp->ds_mh = dls_mac(dc);
	dsp->ds_mip = mac_info(dsp->ds_mh);

	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

	addr_length = dsp->ds_mip->mi_addr_length;
	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);

	/*
	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
	 * a non-VLAN interface).
	 */
	dsp->ds_vid = dls_vid(dc);

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);

	dsp->ds_dc = dc;
	dsp->ds_dlstate = DL_UNBOUND;

	return (0);
}

/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	/*
	 * Remove the notify function.
	 */
	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);

	/*
	 * Disable the capabilities and clear the promisc flag.
	 */
	ASSERT(!dsp->ds_polling);
	ASSERT(!dsp->ds_soft_ring);
	dld_capabilities_disable(dsp);
	dsp->ds_promisc = 0;

	/* Wait for in-flight transmit threads to drain (see dld_impl.h). */
	DLD_TX_QUIESCE(dsp);

	/*
	 * Flush all pending packets which are sitting in the transmit queue.
	 */
	dld_tx_flush(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp->ds_dc);
	dsp->ds_dc = NULL;
	dsp->ds_mh = NULL;

	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;
}

/*
 * This function is only called for VLAN streams. In raw mode, we strip VLAN
 * tags before sending packets up to the DLS clients, with the exception of
 * special priority tagged packets, in that case, we set the VID to 0.
 * mp must be a VLAN tagged packet.
 */
static mblk_t *
i_dld_ether_header_strip_tag(mblk_t *mp)
{
	mblk_t *newmp;
	struct ether_vlan_header *evhp;
	uint16_t tci, new_tci;

	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
	/*
	 * The header is modified in place below, so take a private copy
	 * if anyone else holds a reference to this message.
	 */
	if (DB_REF(mp) > 1) {
		newmp = copymsg(mp);
		if (newmp == NULL)
			return (NULL);
		freemsg(mp);
		mp = newmp;
	}
	evhp = (struct ether_vlan_header *)mp->b_rptr;

	tci = ntohs(evhp->ether_tci);
	if (VLAN_PRI(tci) == 0) {
		/*
		 * Priority is 0, strip the tag.
		 */
		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
		mp->b_rptr += VLAN_TAGSZ;
	} else {
		/*
		 * Priority is not 0, update the VID to 0.
		 */
		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
		evhp->ether_tci = htons(new_tci);
	}
	return (mp);
}

/*
 * Raw mode receive function. Walks the b_next chain, rewinds each message
 * to expose the MAC header, optionally "uncooks" the header and strips
 * VLAN tags, then passes it upstream. Packets are dropped (not queued)
 * when the read queue is flow-controlled.
 */
/*ARGSUSED*/
void
dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	boolean_t		is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t			*next, *newmp;

	ASSERT(mp != NULL);
	do {
		/*
		 * Get the pointer to the next packet in the chain and then
		 * clear b_next before the packet gets passed on.
		 */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Wind back b_rptr to point at the MAC header.
		 */
		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
		mp->b_rptr -= mhip->mhi_hdrsize;

		/*
		 * Certain MAC type plugins provide an illusion for raw
		 * DLPI consumers. They pretend that the MAC layer is
		 * something that it's not for the benefit of observability
		 * tools. For example, mac_wifi pretends that it's Ethernet
		 * for such consumers. Here, unless native mode is enabled,
		 * we call into the MAC layer so that this illusion can be
		 * maintained. The plugin will optionally transform the MAC
		 * header here into something that can be passed up to raw
		 * consumers. The header goes from "cooked" mode to raw mode.
		 */
		if (!dsp->ds_native) {
			newmp = mac_header_uncook(dsp->ds_mh, mp);
			if (newmp == NULL) {
				freemsg(mp);
				goto next;
			}
			mp = newmp;
		}

		/*
		 * Strip the VLAN tag for VLAN streams.
		 */
		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
			newmp = i_dld_ether_header_strip_tag(mp);
			if (newmp == NULL) {
				freemsg(mp);
				goto next;
			}
			mp = newmp;
		}

		/*
		 * Pass the packet on.
		 */
		if (canputnext(dsp->ds_rq))
			putnext(dsp->ds_rq, mp);
		else
			freemsg(mp);

	next:
		/*
		 * Move on to the next packet in the chain.
		 */
		mp = next;
	} while (mp != NULL);
}

/*
 * Fast-path receive function. Like dld_str_rx_raw() but the MAC header has
 * already been stripped by the caller; only the VLAN tag may need to be
 * re-exposed (see the stripping rules below).
 */
/*ARGSUSED*/
void
dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t	*dsp = (dld_str_t *)arg;
	mblk_t		*next;
	size_t		offset = 0;

	/*
	 * MAC header stripping rules:
	 * - Tagged packets:
	 *   a. VLAN streams. Strip the whole VLAN header including the tag.
	 *   b. Physical streams
	 *   - VLAN packets (non-zero VID). The stream must be either a
	 *     DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
	 *     Strip the Ethernet header but keep the VLAN header.
	 *   - Special tagged packets (zero VID)
	 *     * The stream is either a DL_PROMISC_SAP listener or a
	 *       ETHERTYPE_VLAN listener, strip the Ethernet header but
	 *       keep the VLAN header.
	 *     * Otherwise, strip the whole VLAN header.
	 * - Untagged packets. Strip the whole MAC header.
	 */
	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
		offset = VLAN_TAGSZ;
	}

	ASSERT(mp != NULL);
	do {
		/*
		 * Get the pointer to the next packet in the chain and then
		 * clear b_next before the packet gets passed on.
		 */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Wind back b_rptr to point at the VLAN header.
		 */
		ASSERT(mp->b_rptr >= DB_BASE(mp) + offset);
		mp->b_rptr -= offset;

		/*
		 * Pass the packet on.
		 */
		if (canputnext(dsp->ds_rq))
			putnext(dsp->ds_rq, mp);
		else
			freemsg(mp);
		/*
		 * Move on to the next packet in the chain.
		 */
		mp = next;
	} while (mp != NULL);
}

/*
 * Default receive function (send DL_UNITDATA_IND messages). Each data
 * message is prefixed with a DL_UNITDATA_IND M_PROTO block built by
 * str_unitdata_ind(); if that allocation fails the remainder of the
 * chain is dropped.
 */
/*ARGSUSED*/
void
dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	mblk_t			*ud_mp;
	mblk_t			*next;
	size_t			offset = 0;
	boolean_t		strip_vlan = B_TRUE;

	/*
	 * See MAC header stripping rules in the dld_str_rx_fastpath() function.
	 */
	if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) &&
	    ((dsp->ds_sap == ETHERTYPE_VLAN) ||
	    (dsp->ds_promisc & DLS_PROMISC_SAP))) {
		offset = VLAN_TAGSZ;
		strip_vlan = B_FALSE;
	}

	ASSERT(mp != NULL);
	do {
		/*
		 * Get the pointer to the next packet in the chain and then
		 * clear b_next before the packet gets passed on.
		 */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Wind back b_rptr to point at the MAC header.
		 */
		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
		mp->b_rptr -= mhip->mhi_hdrsize;

		/*
		 * Create the DL_UNITDATA_IND M_PROTO.
		 */
		if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) {
			freemsgchain(mp);
			return;
		}

		/*
		 * Advance b_rptr to point at the payload (or the VLAN header).
		 */
		mp->b_rptr += (mhip->mhi_hdrsize - offset);

		/*
		 * Prepend the DL_UNITDATA_IND.
		 */
		ud_mp->b_cont = mp;

		/*
		 * Send the message.
		 */
		if (canputnext(dsp->ds_rq))
			putnext(dsp->ds_rq, ud_mp);
		else
			freemsg(ud_mp);

		/*
		 * Move on to the next packet in the chain.
		 */
		mp = next;
	} while (mp != NULL);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE
 */
static void
str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu)
{
	mblk_t		*mp;
	dl_notify_ind_t *dlip;

	/* Only notify consumers that asked for this notification. */
	if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_SDU_SIZE;
	dlip->dl_data = max_sdu;

	qreply(dsp->ds_wq, mp);
}

/*
 * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the
 * current state of the interface.
 */
void
dld_str_notify_ind(dld_str_t *dsp)
{
	mac_notify_type_t	type;

	for (type = 0; type < MAC_NNOTE; type++)
		str_notify(dsp, type);
}

/*
 * Wrapper that reserves space behind a dl_unitdata_ind_t for the
 * destination and source addresses, each followed by a 16-bit SAP.
 */
typedef struct dl_unitdata_ind_wrapper {
	dl_unitdata_ind_t	dl_unitdata;
	uint8_t			dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)];
	uint8_t			dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)];
} dl_unitdata_ind_wrapper_t;

/*
 * Create a DL_UNITDATA_IND M_PROTO message.
 */
static mblk_t *
str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
{
	mblk_t				*nmp;
	dl_unitdata_ind_wrapper_t	*dlwp;
	dl_unitdata_ind_t		*dlp;
	mac_header_info_t		mhi;
	uint_t				addr_length;
	uint8_t				*daddr;
	uint8_t				*saddr;

	/*
	 * Get the packet header information.
	 */
	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
		return (NULL);

	/*
	 * Allocate a message large enough to contain the wrapper structure
	 * defined above. Returns NULL on allocation failure; the caller
	 * owns the disposal of 'mp' in that case.
	 */
	if ((nmp = mexchange(dsp->ds_wq, NULL,
	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
	    DL_UNITDATA_IND)) == NULL)
		return (NULL);

	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;

	dlp = &(dlwp->dl_unitdata);
	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);

	/*
	 * Copy in the destination address.
	 */
	addr_length = dsp->ds_mip->mi_addr_length;
	daddr = dlwp->dl_dest_addr;
	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
	bcopy(mhi.mhi_daddr, daddr, addr_length);

	/*
	 * Set the destination DLSAP to the SAP value encoded in the packet.
	 * The SAP is stored as a 16-bit value immediately after the address.
	 */
	if (mhi.mhi_istagged && !strip_vlan)
		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
	else
		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);

	/*
	 * If the destination address was multicast or broadcast then the
	 * dl_group_address field should be non-zero.
	 */
	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);

	/*
	 * Copy in the source address if one exists. Some MAC types (DL_IB
	 * for example) may not have access to source information.
	 */
	if (mhi.mhi_saddr == NULL) {
		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
	} else {
		saddr = dlwp->dl_src_addr;
		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
		bcopy(mhi.mhi_saddr, saddr, addr_length);

		/*
		 * Set the source DLSAP to the packet ethertype.
		 */
		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
	}

	return (nmp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
 */
static void
str_notify_promisc_on_phys(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
 */
static void
str_notify_promisc_off_phys(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
 *
 * Unlike the other notifications, this one carries a payload: the new
 * physical address followed by a 16-bit ethertype.
 */
static void
str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;
	uint_t		addr_length;
	uint16_t	ethertype;

	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
		return;

	addr_length = dsp->ds_mip->mi_addr_length;
	if ((mp = mexchange(dsp->ds_wq, NULL,
	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
	dlip->dl_data = DL_CURR_PHYS_ADDR;
	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
	dlip->dl_addr_length = addr_length + sizeof (uint16_t);

	bcopy(addr, &dlip[1], addr_length);

	/* SAPs below the 802 minimum are not ethertypes; report 0. */
	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_LINK_UP
 */
static void
str_notify_link_up(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_LINK_UP;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
 */
static void
str_notify_link_down(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_LINK_DOWN;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_SPEED
 */
static void
str_notify_speed(dld_str_t *dsp, uint32_t speed)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_SPEED))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_SPEED;
	dlip->dl_data = speed;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG
 */
static void
str_notify_capab_reneg(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_CAPAB_RENEG;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH
 */
static void
str_notify_fastpath_flush(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH;

	qreply(dsp->ds_wq, mp);
}

/*
 * MAC notification callback. Translates each MAC-layer event into the
 * corresponding per-stream action (qenable, address re-cache, or a
 * DL_NOTIFY_IND generated by one of the str_notify_*() helpers above).
 */
static void
str_notify(void *arg, mac_notify_type_t type)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	queue_t			*q = dsp->ds_wq;

	switch (type) {
	case MAC_NOTE_TX:
		/* Transmit resources available again: restart the wsrv. */
		qenable(q);
		break;

	case MAC_NOTE_DEVPROMISC:
		/*
		 * Send the appropriate DL_NOTIFY_IND.
		 */
		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
			str_notify_promisc_on_phys(dsp);
		else
			str_notify_promisc_off_phys(dsp);
		break;

	case MAC_NOTE_PROMISC:
		break;

	case MAC_NOTE_UNICST:
		/*
		 * This notification is sent whenever the MAC unicast address
		 * changes. We need to re-cache the address.
		 */
		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

		/*
		 * Send the appropriate DL_NOTIFY_IND.
		 */
		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
		break;

	case MAC_NOTE_LINK:
		/*
		 * This notification is sent every time the MAC driver
		 * updates the link state.
		 */
		switch (mac_link_get(dsp->ds_mh)) {
		case LINK_STATE_UP: {
			uint64_t speed;
			/*
			 * The link is up so send the appropriate
			 * DL_NOTIFY_IND.
			 */
			str_notify_link_up(dsp);

			/* DL_NOTE_SPEED reports kilobits per second. */
			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
			break;
		}
		case LINK_STATE_DOWN:
			/*
			 * The link is down so send the appropriate
			 * DL_NOTIFY_IND.
			 */
			str_notify_link_down(dsp);
			break;

		default:
			break;
		}
		break;

	case MAC_NOTE_RESOURCE:
	case MAC_NOTE_VNIC:
		/*
		 * This notification is sent whenever the MAC resources
		 * change or capabilities change. We need to renegotiate
		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
		 */
		str_notify_capab_reneg(dsp);
		break;

	case MAC_NOTE_SDU_SIZE: {
		uint_t  max_sdu;
		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
		str_notify_sdu_size(dsp, max_sdu);
		break;
	}

	case MAC_NOTE_FASTPATH_FLUSH:
		str_notify_fastpath_flush(dsp);
		break;

	case MAC_NOTE_MARGIN:
		break;

	default:
		/* Unknown notification types indicate a programming error. */
		ASSERT(B_FALSE);
		break;
	}
}

/*
 * Return the data size of an M_DATA message: fast single-fragment case
 * avoids the full msgdsize() walk.
 */
static inline uint_t
mp_getsize(mblk_t *mp)
{
	ASSERT(DB_TYPE(mp) == M_DATA);
	return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
}

/*
 * Calculate the dld queue depth, free the messages that exceed the threshold.
 */
static void
dld_tx_qdepth_timer(void *arg)
{
	dld_str_t *dsp = (dld_str_t *)arg;
	mblk_t *prev, *mp;
	uint_t cnt, msgcnt, size;

	mutex_enter(&dsp->ds_tx_list_lock);

	/* Calculate total size and count of the packet(s) */
	cnt = msgcnt = 0;
	for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
	    prev = mp, mp = mp->b_next) {
		size = mp_getsize(mp);
		cnt += size;
		msgcnt++;
		/*
		 * Truncate the queue at the first message that pushes it
		 * over the threshold and free everything from there on.
		 */
		if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
			ASSERT(dsp->ds_tx_qbusy);
			dsp->ds_tx_list_tail = prev;
			if (prev == NULL)
				dsp->ds_tx_list_head = NULL;
			else
				prev->b_next = NULL;
			freemsgchain(mp);
			cnt -= size;
			msgcnt--;
			break;
		}
	}
	dsp->ds_tx_cnt = cnt;
	dsp->ds_tx_msgcnt = msgcnt;
	dsp->ds_tx_qdepth_tid = 0;
	mutex_exit(&dsp->ds_tx_list_lock);
}

/*
 * Enqueue one or more messages on the transmit queue. Caller specifies:
 *  - the insertion position (head/tail).
 *  - the message count and the total message size of messages to be queued
 *    if they are known to the caller; or 0 if they are not known.
 *
 * If the caller does not know the message size information, this usually
 * means that dld_wsrv() managed to send some but not all of the queued
 * messages. For performance reasons, we do not calculate the queue depth
 * every time. Instead, a timer is started to calculate the queue depth
 * every 1 second (can be changed by tx_qdepth_interval).
 */
static void
dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert,
    uint_t msgcnt, uint_t cnt)
{
	queue_t *q = dsp->ds_wq;
	uint_t tot_cnt, tot_msgcnt;
	mblk_t *next;

	mutex_enter(&dsp->ds_tx_list_lock);

	/*
	 * Simply enqueue the message and calculate the queue depth via
	 * timer if:
	 *
	 * - the current queue depth is incorrect, and the timer is already
	 *   started; or
	 *
	 * - the given message size is unknown and it is allowed to start the
	 *   timer;
	 */
	if ((dsp->ds_tx_qdepth_tid != 0) ||
	    (msgcnt == 0 && tx_qdepth_interval != 0)) {
		goto enqueue;
	}

	/*
	 * The timer is not allowed, so calculate the message size now.
	 */
	if (msgcnt == 0) {
		for (next = mp; next != NULL; next = next->b_next) {
			cnt += mp_getsize(next);
			msgcnt++;
		}
	}

	/*
	 * Grow the queue depth using the input message size.
	 *
	 * If the queue depth would exceed the allowed threshold, drop
	 * new packet(s) and drain those already in the queue.
	 */
	tot_cnt = dsp->ds_tx_cnt + cnt;
	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;

	if (!head_insert && (tot_cnt >= dld_max_q_count ||
	    tot_msgcnt >= dld_max_q_count)) {
		ASSERT(dsp->ds_tx_qbusy);
		mutex_exit(&dsp->ds_tx_list_lock);
		freemsgchain(mp);
		goto done;
	}
	/* Update the queue size parameters */
	dsp->ds_tx_cnt = tot_cnt;
	dsp->ds_tx_msgcnt = tot_msgcnt;

enqueue:
	/*
	 * If the transmit queue is currently empty and we are
	 * about to deposit the packet(s) there, switch mode to
	 * "busy" and raise flow-control condition.
	 */
	if (!dsp->ds_tx_qbusy) {
		dsp->ds_tx_qbusy = B_TRUE;
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		(void) putq(q, dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	if (!head_insert) {
		/* Tail insertion */
		if (dsp->ds_tx_list_head == NULL)
			dsp->ds_tx_list_head = mp;
		else
			dsp->ds_tx_list_tail->b_next = mp;
		dsp->ds_tx_list_tail = tail;
	} else {
		/* Head insertion */
		tail->b_next = dsp->ds_tx_list_head;
		if (dsp->ds_tx_list_head == NULL)
			dsp->ds_tx_list_tail = tail;
		dsp->ds_tx_list_head = mp;
	}

	if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 &&
	    tx_qdepth_interval != 0) {
		/*
		 * The message size is not given so that we need to start
		 * the timer to calculate the queue depth.
		 */
		dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp,
		    drv_usectohz(tx_qdepth_interval));
		ASSERT(dsp->ds_tx_qdepth_tid != NULL);
	}
	mutex_exit(&dsp->ds_tx_list_lock);
done:
	/* Schedule service thread to drain the transmit queue */
	if (!head_insert)
		qenable(q);
}

/*
 * Discard every message on the transmit queue, drop the flow-control
 * condition if it was raised, and cancel any pending queue-depth timer.
 */
void
dld_tx_flush(dld_str_t *dsp)
{
	timeout_id_t	tid = 0;

	mutex_enter(&dsp->ds_tx_list_lock);
	if (dsp->ds_tx_list_head != NULL) {
		freemsgchain(dsp->ds_tx_list_head);
		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
		if (dsp->ds_tx_qbusy) {
			/* Retrieve the flow-control mblk to drop "busy". */
			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
			ASSERT(dsp->ds_tx_flow_mp != NULL);
			dsp->ds_tx_qbusy = B_FALSE;
		}
		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
			dsp->ds_tx_qdepth_tid = 0;
	}
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Note that ds_tx_list_lock (which is acquired by the timeout
	 * callback routine) cannot be held across the call to untimeout().
	 */
	if (tid != 0)
		(void) untimeout(tid);
}

/*
 * Process a non-data message. The message is appended to the per-stream
 * pending list and handled asynchronously by dld_wput_nondata_task(),
 * since processing may block.
 */
static void
dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
{
	ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) ||
	    (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL));

	mutex_enter(&dsp->ds_disp_lock);

	/*
	 * The processing of the message might block. Enqueue the
	 * message for later processing.
	 */
	if (dsp->ds_pending_head == NULL) {
		dsp->ds_pending_head = dsp->ds_pending_tail = mp;
	} else {
		dsp->ds_pending_tail->b_next = mp;
		dsp->ds_pending_tail = mp;
	}

	/*
	 * If there is no task pending, kick off the task.
	 */
	if (dsp->ds_tid == NULL) {
		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
		    dld_wput_nondata_task, dsp, TQ_SLEEP);
		ASSERT(dsp->ds_tid != NULL);
	}
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * The worker thread which processes non-data messages. Note we only process
 * one message at one time in order to be able to "flush" the queued message
 * and serialize the processing.
 */
static void
dld_wput_nondata_task(void *arg)
{
	dld_str_t	*dsp = (dld_str_t *)arg;
	mblk_t		*mp;

	mutex_enter(&dsp->ds_disp_lock);
	ASSERT(dsp->ds_pending_head != NULL);
	ASSERT(dsp->ds_tid != NULL);

	if (dsp->ds_closing)
		goto closing;

	/* Dequeue exactly one message. */
	mp = dsp->ds_pending_head;
	if ((dsp->ds_pending_head = mp->b_next) == NULL)
		dsp->ds_pending_tail = NULL;
	mp->b_next = NULL;

	/* Drop the lock while the (possibly blocking) processing runs. */
	mutex_exit(&dsp->ds_disp_lock);

	switch (DB_TYPE(mp)) {
	case M_PROTO:
	case M_PCPROTO:
		ASSERT(dsp->ds_type == DLD_DLPI);
		dld_wput_proto_nondata(dsp, mp);
		break;
	case M_IOCTL: {
		uint_t cmd;

		if (dsp->ds_type == DLD_CONTROL) {
			ASSERT(dsp->ds_ioctl != NULL);
			dsp->ds_ioctl(dsp->ds_wq, mp);
			break;
		}

		cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;

		switch (cmd) {
		case DLIOCNATIVE:
			ioc_native(dsp, mp);
			break;
		case DLIOCMARGININFO:
			ioc_margin(dsp, mp);
			break;
		case DLIOCRAW:
			ioc_raw(dsp, mp);
			break;
		case DLIOCHDRINFO:
			ioc_fast(dsp, mp);
			break;
		default:
			ioc(dsp, mp);
			break;
		}
		break;
	}
	case M_IOCDATA:
		ASSERT(dsp->ds_type == DLD_DLPI);
		ioc(dsp, mp);
		break;
	}

	mutex_enter(&dsp->ds_disp_lock);

	if (dsp->ds_closing)
		goto closing;

	/* Re-dispatch ourselves if more messages arrived meanwhile. */
	if (dsp->ds_pending_head != NULL) {
		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
		    dld_wput_nondata_task, dsp, TQ_SLEEP);
		ASSERT(dsp->ds_tid != NULL);
	} else {
		dsp->ds_tid = NULL;
	}
	mutex_exit(&dsp->ds_disp_lock);
	return;

	/*
	 * If the stream is closing, flush all queued messages and inform
	 * the stream once it is done.
	 */
closing:
	freemsgchain(dsp->ds_pending_head);
	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
	dsp->ds_tid = NULL;
	cv_signal(&dsp->ds_disp_cv);
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * Flush queued non-data messages.
 */
static void
dld_flush_nondata(dld_str_t *dsp)
{
	mutex_enter(&dsp->ds_disp_lock);
	freemsgchain(dsp->ds_pending_head);
	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * DLIOCNATIVE
 */
static void
ioc_native(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	const mac_info_t *mip = dsp->ds_mip;

	rw_enter(&dsp->ds_lock, RW_WRITER);

	/*
	 * Native mode can be enabled if it's disabled and if the
	 * native media type is different.
	 */
	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
		dsp->ds_native = B_TRUE;

	rw_exit(&dsp->ds_lock);

	/* Ack with the native media type so the caller learns what it is. */
	if (dsp->ds_native)
		miocack(q, mp, 0, mip->mi_nativemedia);
	else
		miocnak(q, mp, 0, ENOTSUP);
}

/*
 * DLIOCMARGININFO
 */
static void
ioc_margin(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	uint32_t margin;
	int err;

	if (dsp->ds_dlstate == DL_UNATTACHED) {
		err = EINVAL;
		goto failed;
	}
	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
		goto failed;

	mac_margin_get(dsp->ds_mh, &margin);
	*((uint32_t *)mp->b_cont->b_rptr) = margin;
	miocack(q, mp, sizeof (uint32_t), 0);
	return;

failed:
	miocnak(q, mp, 0, err);
}

/*
 * DLIOCRAW
 */
static void
ioc_raw(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;

	/* Raw mode is incompatible with polling/soft-ring capabilities. */
	if (dsp->ds_polling || dsp->ds_soft_ring) {
		miocnak(q, mp, 0, EPROTO);
		return;
	}

	rw_enter(&dsp->ds_lock, RW_WRITER);
	if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) {
		/*
		 * Set the receive callback.
		 */
		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp);
		dsp->ds_tx = str_mdata_raw_put;
	}
	dsp->ds_mode = DLD_RAW;
	rw_exit(&dsp->ds_lock);
	miocack(q, mp, 0, 0);
}

/*
 * DLIOCHDRINFO
 */
static void
ioc_fast(dld_str_t *dsp, mblk_t *mp)
{
	dl_unitdata_req_t *dlp;
	off_t		off;
	size_t		len;
	const uint8_t	*addr;
	uint16_t	sap;
	mblk_t		*nmp;
	mblk_t		*hmp;
	uint_t		addr_length;
	queue_t		*q = dsp->ds_wq;
	int		err;

	if (dld_opt & DLD_OPT_NO_FASTPATH) {
		err = ENOTSUP;
		goto failed;
	}

	/*
	 * DLIOCHDRINFO should only come from IP. The one initiated from
	 * user-land should not be allowed.
	 */
	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
		err = EINVAL;
		goto failed;
	}

	nmp = mp->b_cont;
	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
		err = EINVAL;
		goto failed;
	}

	off = dlp->dl_dest_addr_offset;
	len = dlp->dl_dest_addr_length;

	if (!MBLKIN(nmp, off, len)) {
		err = EINVAL;
		goto failed;
	}

	/*
	 * We don't need to hold any locks to access ds_dlstate, because
	 * control message processing (which updates this field) is
	 * serialized.
	 */
	if (dsp->ds_dlstate != DL_IDLE) {
		err = ENOTSUP;
		goto failed;
	}

	addr_length = dsp->ds_mip->mi_addr_length;
	if (len != addr_length + sizeof (uint16_t)) {
		err = EINVAL;
		goto failed;
	}

	addr = nmp->b_rptr + off;
	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);

	if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) {
		err = ENOMEM;
		goto failed;
	}

	rw_enter(&dsp->ds_lock, RW_WRITER);
	ASSERT(dsp->ds_dlstate == DL_IDLE);
	if (dsp->ds_mode != DLD_FASTPATH) {
		/*
		 * Set the receive callback (unless polling or
		 * soft-ring is enabled).
		 */
		dsp->ds_mode = DLD_FASTPATH;
		if (!dsp->ds_polling && !dsp->ds_soft_ring)
			dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp);
		dsp->ds_tx = str_mdata_fastpath_put;
	}
	rw_exit(&dsp->ds_lock);

	/* Replace any previous header template with the new one. */
	freemsg(nmp->b_cont);
	nmp->b_cont = hmp;

	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
	return;
failed:
	miocnak(q, mp, 0, err);
}

/*
 * Generic ioctl handler: forward anything we do not handle ourselves to
 * the MAC layer. Requires the stream to be attached.
 */
static void
ioc(dld_str_t *dsp, mblk_t *mp)
{
	queue_t	*q = dsp->ds_wq;
	mac_handle_t mh;

	if (dsp->ds_dlstate == DL_UNATTACHED) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}
	mh = dsp->ds_mh;
	ASSERT(mh != NULL);
	mac_ioctl(mh, q, mp);
}