1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Data-Link Driver 28 */ 29 30 #include <sys/stropts.h> 31 #include <sys/strsun.h> 32 #include <sys/strsubr.h> 33 #include <sys/atomic.h> 34 #include <sys/disp.h> 35 #include <sys/callb.h> 36 #include <sys/vlan.h> 37 #include <sys/dld.h> 38 #include <sys/dld_impl.h> 39 #include <sys/dls_impl.h> 40 #include <inet/common.h> 41 42 static int str_constructor(void *, void *, int); 43 static void str_destructor(void *, void *); 44 static mblk_t *str_unitdata_ind(dld_str_t *, mblk_t *, boolean_t); 45 static void str_notify_promisc_on_phys(dld_str_t *); 46 static void str_notify_promisc_off_phys(dld_str_t *); 47 static void str_notify_phys_addr(dld_str_t *, const uint8_t *); 48 static void str_notify_link_up(dld_str_t *); 49 static void str_notify_link_down(dld_str_t *); 50 static void str_notify_capab_reneg(dld_str_t *); 51 static void str_notify_speed(dld_str_t *, uint32_t); 52 static void str_notify(void *, mac_notify_type_t); 53 54 static void ioc_native(dld_str_t *, mblk_t *); 55 static void 
ioc_margin(dld_str_t *, mblk_t *); 56 static void ioc_raw(dld_str_t *, mblk_t *); 57 static void ioc_fast(dld_str_t *, mblk_t *); 58 static void ioc(dld_str_t *, mblk_t *); 59 static void dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t, 60 uint_t, uint_t); 61 static void dld_wput_nondata(dld_str_t *, mblk_t *); 62 static void dld_wput_nondata_task(void *); 63 static void dld_flush_nondata(dld_str_t *); 64 static mblk_t *i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t); 65 static mblk_t *i_dld_ether_header_strip_tag(mblk_t *); 66 67 static uint32_t str_count; 68 static kmem_cache_t *str_cachep; 69 static taskq_t *dld_disp_taskq = NULL; 70 static mod_hash_t *str_hashp; 71 72 #define STR_HASHSZ 64 73 #define STR_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key)) 74 75 static inline uint_t mp_getsize(mblk_t *); 76 77 /* 78 * Interval to count the TX queued depth. Default is 1s (1000000us). 79 * Count the queue depth immediately (not by timeout) if this is set to 0. 80 * See more details above dld_tx_enqueue(). 81 */ 82 uint_t tx_qdepth_interval = 1000000; 83 84 /* 85 * Some notes on entry points, flow-control, queueing and locking: 86 * 87 * This driver exports the traditional STREAMS put entry point as well as 88 * the non-STREAMS fast-path transmit routine which is provided to IP via 89 * the DL_CAPAB_POLL negotiation. The put procedure handles all control 90 * and data operations, while the fast-path routine deals only with M_DATA 91 * fast-path packets. Regardless of the entry point, all outbound packets 92 * will end up in dld_tx_single(), where they will be delivered to the MAC 93 * driver. 94 * 95 * The transmit logic operates in two modes: a "not busy" mode where the 96 * packets will be delivered to the MAC for a send attempt, or "busy" mode 97 * where they will be enqueued in the internal queue because of flow-control. 
 * Flow-control happens when the MAC driver indicates the packets couldn't
 * be transmitted due to lack of resources (e.g. running out of descriptors).
 * In such case, the driver will place a dummy message on its write-side
 * STREAMS queue so that the queue is marked as "full". Any subsequent
 * packets arriving at the driver will be enqueued in the internal queue,
 * which is drained in the context of the service thread that gets scheduled
 * whenever the driver is in the "busy" mode. When all packets have been
 * successfully delivered by MAC and the internal queue is empty, it will
 * transition to the "not busy" mode by removing the dummy message from the
 * write-side STREAMS queue; in effect this will trigger backenabling.
 * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
 * to the above reasons.
 *
 * The driver implements an internal transmit queue independent of STREAMS.
 * This allows for flexibility and provides a fast enqueue/dequeue mechanism
 * compared to the putq() and getq() STREAMS interfaces. The only putq() and
 * getq() operations done by the driver are those related to placing and
 * removing the dummy message to/from the write-side STREAMS queue for flow-
 * control purposes.
 *
 * Locking is done independent of STREAMS due to the driver being fully MT.
 * Threads entering the driver (either from put or service entry points)
 * will most likely be readers, with the exception of a few writer cases
 * such as those handling DLPI attach/detach/bind/unbind/etc. or any of the
 * DLD-related ioctl requests. The DLPI detach case is special, because
 * it involves freeing resources and therefore must be single-threaded.
124 * Unfortunately the readers/writers lock can't be used to protect against 125 * it, because the lock is dropped prior to the driver calling places where 126 * putnext() may be invoked, and such places may depend on those resources 127 * to exist. Because of this, the driver always completes the DLPI detach 128 * process when there are no other threads running in the driver. This is 129 * done by keeping track of the number of threads, such that the the last 130 * thread leaving the driver will finish the pending DLPI detach operation. 131 */ 132 133 /* 134 * dld_max_q_count is the queue depth threshold used to limit the number of 135 * outstanding packets or bytes allowed in the queue; once this limit is 136 * reached the driver will free any incoming ones until the queue depth 137 * drops below the threshold. 138 * 139 * This buffering is provided to accomodate clients which do not employ 140 * their own buffering scheme, and to handle occasional packet bursts. 141 * Clients which handle their own buffering will receive positive feedback 142 * from this driver as soon as it transitions into the "busy" state, i.e. 143 * when the queue is initially filled up; they will get backenabled once 144 * the queue is empty. 145 * 146 * The value chosen here is rather arbitrary; in future some intelligent 147 * heuristics may be involved which could take into account the hardware's 148 * transmit ring size, etc. 149 */ 150 uint_t dld_max_q_count = (16 * 1024 *1024); 151 152 /* 153 * dld_finddevinfo() returns the dev_info_t * corresponding to a particular 154 * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that 155 * match dev_t. If a stream is found and it is attached, its dev_info_t * 156 * is returned. 
157 */ 158 typedef struct i_dld_str_state_s { 159 major_t ds_major; 160 minor_t ds_minor; 161 dev_info_t *ds_dip; 162 } i_dld_str_state_t; 163 164 /* ARGSUSED */ 165 static uint_t 166 i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) 167 { 168 i_dld_str_state_t *statep = arg; 169 dld_str_t *dsp = (dld_str_t *)val; 170 171 if (statep->ds_major != dsp->ds_major) 172 return (MH_WALK_CONTINUE); 173 174 ASSERT(statep->ds_minor != 0); 175 176 /* 177 * Access to ds_mh needs to be protected by ds_lock. 178 */ 179 rw_enter(&dsp->ds_lock, RW_READER); 180 if (statep->ds_minor == dsp->ds_minor) { 181 /* 182 * Clone: a clone minor is unique. we can terminate the 183 * walk if we find a matching stream -- even if we fail 184 * to obtain the devinfo. 185 */ 186 if (dsp->ds_mh != NULL) 187 statep->ds_dip = mac_devinfo_get(dsp->ds_mh); 188 rw_exit(&dsp->ds_lock); 189 return (MH_WALK_TERMINATE); 190 } 191 rw_exit(&dsp->ds_lock); 192 return (MH_WALK_CONTINUE); 193 } 194 195 static dev_info_t * 196 dld_finddevinfo(dev_t dev) 197 { 198 dev_info_t *dip; 199 i_dld_str_state_t state; 200 201 if (getminor(dev) == 0) 202 return (NULL); 203 204 /* 205 * See if it's a minor node of a link 206 */ 207 if ((dip = dls_finddevinfo(dev)) != NULL) 208 return (dip); 209 210 state.ds_minor = getminor(dev); 211 state.ds_major = getmajor(dev); 212 state.ds_dip = NULL; 213 214 mod_hash_walk(str_hashp, i_dld_str_walker, &state); 215 return (state.ds_dip); 216 } 217 218 /* 219 * devo_getinfo: getinfo(9e) 220 */ 221 /*ARGSUSED*/ 222 int 223 dld_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) 224 { 225 dev_info_t *devinfo; 226 minor_t minor = getminor((dev_t)arg); 227 int rc = DDI_FAILURE; 228 229 switch (cmd) { 230 case DDI_INFO_DEVT2DEVINFO: 231 if ((devinfo = dld_finddevinfo((dev_t)arg)) != NULL) { 232 *(dev_info_t **)resp = devinfo; 233 rc = DDI_SUCCESS; 234 } 235 break; 236 case DDI_INFO_DEVT2INSTANCE: 237 if (minor > 0 && minor <= DLS_MAX_MINOR) { 238 *resp = 
(void *)(uintptr_t)DLS_MINOR2INST(minor); 239 rc = DDI_SUCCESS; 240 } else if (minor > DLS_MAX_MINOR && 241 (devinfo = dld_finddevinfo((dev_t)arg)) != NULL) { 242 *resp = (void *)(uintptr_t)ddi_get_instance(devinfo); 243 rc = DDI_SUCCESS; 244 } 245 break; 246 } 247 return (rc); 248 } 249 250 /* 251 * qi_qopen: open(9e) 252 */ 253 /*ARGSUSED*/ 254 int 255 dld_open(queue_t *rq, dev_t *devp, int flag, int sflag, cred_t *credp) 256 { 257 dld_str_t *dsp; 258 major_t major; 259 minor_t minor; 260 int err; 261 262 if (sflag == MODOPEN) 263 return (ENOTSUP); 264 265 /* 266 * This is a cloning driver and therefore each queue should only 267 * ever get opened once. 268 */ 269 if (rq->q_ptr != NULL) 270 return (EBUSY); 271 272 major = getmajor(*devp); 273 minor = getminor(*devp); 274 275 /* 276 * Create a new dld_str_t for the stream. This will grab a new minor 277 * number that will be handed back in the cloned dev_t. Creation may 278 * fail if we can't allocate the dummy mblk used for flow-control. 279 */ 280 dsp = dld_str_create(rq, DLD_DLPI, major, 281 ((minor == 0) ? DL_STYLE2 : DL_STYLE1)); 282 if (dsp == NULL) 283 return (ENOSR); 284 285 ASSERT(dsp->ds_dlstate == DL_UNATTACHED); 286 if (minor != 0) { 287 /* 288 * Style 1 open 289 */ 290 if ((err = dld_str_attach(dsp, (t_uscalar_t)minor - 1)) != 0) 291 goto failed; 292 ASSERT(dsp->ds_dlstate == DL_UNBOUND); 293 } else { 294 (void) qassociate(rq, -1); 295 } 296 297 /* 298 * Enable the queue srv(9e) routine. 299 */ 300 qprocson(rq); 301 302 /* 303 * Construct a cloned dev_t to hand back. 304 */ 305 *devp = makedevice(getmajor(*devp), dsp->ds_minor); 306 return (0); 307 308 failed: 309 dld_str_destroy(dsp); 310 return (err); 311 } 312 313 /* 314 * qi_qclose: close(9e) 315 */ 316 int 317 dld_close(queue_t *rq) 318 { 319 dld_str_t *dsp = rq->q_ptr; 320 321 /* 322 * Disable the queue srv(9e) routine. 323 */ 324 qprocsoff(rq); 325 326 dld_finish_pending_task(dsp); 327 328 /* 329 * This stream was open to a provider node. 
Check to see 330 * if it has been cleanly shut down. 331 */ 332 if (dsp->ds_dlstate != DL_UNATTACHED) { 333 /* 334 * The stream is either open to a style 1 provider or 335 * this is not clean shutdown. Detach from the PPA. 336 * (This is still ok even in the style 1 case). 337 */ 338 dld_str_detach(dsp); 339 } 340 341 dld_str_destroy(dsp); 342 return (0); 343 } 344 345 /* 346 * qi_qputp: put(9e) 347 */ 348 void 349 dld_wput(queue_t *wq, mblk_t *mp) 350 { 351 dld_str_t *dsp = wq->q_ptr; 352 353 switch (DB_TYPE(mp)) { 354 case M_DATA: { 355 dld_tx_t tx; 356 357 DLD_TX_ENTER(dsp); 358 if ((tx = dsp->ds_tx) != NULL) 359 tx(dsp, mp); 360 else 361 freemsg(mp); 362 DLD_TX_EXIT(dsp); 363 break; 364 } 365 case M_PROTO: 366 case M_PCPROTO: { 367 t_uscalar_t prim; 368 dld_tx_t tx; 369 370 if (MBLKL(mp) < sizeof (t_uscalar_t)) { 371 freemsg(mp); 372 return; 373 } 374 375 prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; 376 if (prim != DL_UNITDATA_REQ) { 377 /* Control path */ 378 dld_wput_nondata(dsp, mp); 379 break; 380 } 381 382 /* Data path */ 383 DLD_TX_ENTER(dsp); 384 if ((tx = dsp->ds_unitdata_tx) != NULL) 385 tx(dsp, mp); 386 else 387 dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0); 388 DLD_TX_EXIT(dsp); 389 break; 390 } 391 case M_IOCTL: 392 case M_IOCDATA: 393 /* Control path */ 394 dld_wput_nondata(dsp, mp); 395 break; 396 case M_FLUSH: 397 /* 398 * Flush both the data messages and the control messages. 399 */ 400 if (*mp->b_rptr & FLUSHW) { 401 dld_flush_nondata(dsp); 402 dld_tx_flush(dsp); 403 *mp->b_rptr &= ~FLUSHW; 404 } 405 406 if (*mp->b_rptr & FLUSHR) { 407 qreply(wq, mp); 408 } else { 409 freemsg(mp); 410 } 411 break; 412 default: 413 freemsg(mp); 414 break; 415 } 416 } 417 418 /* 419 * Called by GLDv3 control node to process the ioctls. It will start 420 * a taskq to allow the ioctl processing to block. This is a temporary 421 * solution, and will be replaced by a more graceful approach afterwards. 
422 */ 423 void 424 dld_ioctl(queue_t *wq, mblk_t *mp) 425 { 426 dld_wput_nondata(wq->q_ptr, mp); 427 } 428 429 /* 430 * qi_srvp: srv(9e) 431 */ 432 void 433 dld_wsrv(queue_t *wq) 434 { 435 mblk_t *mp, *head, *tail; 436 dld_str_t *dsp = wq->q_ptr; 437 uint_t cnt, msgcnt; 438 timeout_id_t tid = 0; 439 440 rw_enter(&dsp->ds_lock, RW_READER); 441 /* 442 * Grab all packets (chained via b_next) off our transmit queue 443 * and try to send them all to the MAC layer. Since the queue 444 * is independent of streams, we are able to dequeue all messages 445 * at once without looping through getq() and manually chaining 446 * them. Note that the queue size parameters (byte and message 447 * counts) are cleared as well, but we postpone the backenabling 448 * until after the MAC transmit since some packets may end up 449 * back at our transmit queue. 450 */ 451 mutex_enter(&dsp->ds_tx_list_lock); 452 if ((mp = dsp->ds_tx_list_head) == NULL) { 453 ASSERT(!dsp->ds_tx_qbusy); 454 ASSERT(dsp->ds_tx_flow_mp != NULL); 455 ASSERT(dsp->ds_tx_list_head == NULL); 456 ASSERT(dsp->ds_tx_list_tail == NULL); 457 ASSERT(dsp->ds_tx_cnt == 0); 458 ASSERT(dsp->ds_tx_msgcnt == 0); 459 mutex_exit(&dsp->ds_tx_list_lock); 460 rw_exit(&dsp->ds_lock); 461 return; 462 } 463 head = mp; 464 tail = dsp->ds_tx_list_tail; 465 dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL; 466 cnt = dsp->ds_tx_cnt; 467 msgcnt = dsp->ds_tx_msgcnt; 468 dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0; 469 mutex_exit(&dsp->ds_tx_list_lock); 470 471 /* 472 * Discard packets unless we are attached and bound; note that 473 * the driver mode (fastpath/raw/unitdata) is irrelevant here, 474 * because regardless of the mode all transmit will end up in 475 * dld_tx_single() where the packets may be queued. 476 */ 477 ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA)); 478 if (dsp->ds_dlstate != DL_IDLE) { 479 freemsgchain(mp); 480 goto done; 481 } 482 483 /* 484 * Attempt to transmit one or more packets. 
If the MAC can't 485 * send them all, re-queue the packet(s) at the beginning of 486 * the transmit queue to avoid any re-ordering. 487 */ 488 mp = dls_tx(dsp->ds_dc, mp); 489 if (mp == head) { 490 /* 491 * No message was sent out. Take the saved the queue depth 492 * as the input, so that dld_tx_enqueue() need not to 493 * calculate it again. 494 */ 495 dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt); 496 } else if (mp != NULL) { 497 /* 498 * Some but not all messages were sent out. dld_tx_enqueue() 499 * needs to start the timer to calculate the queue depth if 500 * timer has not been started. 501 * 502 * Note that a timer is used to calculate the queue depth 503 * to improve network performance, especially for TCP, in 504 * which case packets are sent without canput() being checked, 505 * and mostly end up in dld_tx_enqueue() under heavy load. 506 */ 507 dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0); 508 } 509 510 done: 511 /* 512 * Grab the list lock again and check if the transmit queue is 513 * really empty; if so, lift up flow-control and backenable any 514 * writer queues. If the queue is not empty, schedule service 515 * thread to drain it. 516 */ 517 mutex_enter(&dsp->ds_tx_list_lock); 518 if (dsp->ds_tx_list_head == NULL) { 519 dsp->ds_tx_flow_mp = getq(wq); 520 ASSERT(dsp->ds_tx_flow_mp != NULL); 521 dsp->ds_tx_qbusy = B_FALSE; 522 if ((tid = dsp->ds_tx_qdepth_tid) != 0) 523 dsp->ds_tx_qdepth_tid = 0; 524 } 525 mutex_exit(&dsp->ds_tx_list_lock); 526 527 /* 528 * Note that ds_tx_list_lock (which is acquired by the timeout 529 * callback routine) cannot be held across the call to untimeout(). 
530 */ 531 if (tid != 0) 532 (void) untimeout(tid); 533 534 rw_exit(&dsp->ds_lock); 535 } 536 537 void 538 dld_init_ops(struct dev_ops *ops, const char *name) 539 { 540 struct streamtab *stream; 541 struct qinit *rq, *wq; 542 struct module_info *modinfo; 543 544 modinfo = kmem_zalloc(sizeof (struct module_info), KM_SLEEP); 545 modinfo->mi_idname = kmem_zalloc(FMNAMESZ, KM_SLEEP); 546 (void) snprintf(modinfo->mi_idname, FMNAMESZ, "%s", name); 547 modinfo->mi_minpsz = 0; 548 modinfo->mi_maxpsz = 64*1024; 549 modinfo->mi_hiwat = 1; 550 modinfo->mi_lowat = 0; 551 552 rq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP); 553 rq->qi_qopen = dld_open; 554 rq->qi_qclose = dld_close; 555 rq->qi_minfo = modinfo; 556 557 wq = kmem_zalloc(sizeof (struct qinit), KM_SLEEP); 558 wq->qi_putp = (pfi_t)dld_wput; 559 wq->qi_srvp = (pfi_t)dld_wsrv; 560 wq->qi_minfo = modinfo; 561 562 stream = kmem_zalloc(sizeof (struct streamtab), KM_SLEEP); 563 stream->st_rdinit = rq; 564 stream->st_wrinit = wq; 565 ops->devo_cb_ops->cb_str = stream; 566 567 if (ops->devo_getinfo == NULL) 568 ops->devo_getinfo = &dld_getinfo; 569 } 570 571 void 572 dld_fini_ops(struct dev_ops *ops) 573 { 574 struct streamtab *stream; 575 struct qinit *rq, *wq; 576 struct module_info *modinfo; 577 578 stream = ops->devo_cb_ops->cb_str; 579 rq = stream->st_rdinit; 580 wq = stream->st_wrinit; 581 modinfo = rq->qi_minfo; 582 ASSERT(wq->qi_minfo == modinfo); 583 584 kmem_free(stream, sizeof (struct streamtab)); 585 kmem_free(wq, sizeof (struct qinit)); 586 kmem_free(rq, sizeof (struct qinit)); 587 kmem_free(modinfo->mi_idname, FMNAMESZ); 588 kmem_free(modinfo, sizeof (struct module_info)); 589 } 590 591 /* 592 * Initialize this module's data structures. 593 */ 594 void 595 dld_str_init(void) 596 { 597 /* 598 * Create dld_str_t object cache. 
599 */ 600 str_cachep = kmem_cache_create("dld_str_cache", sizeof (dld_str_t), 601 0, str_constructor, str_destructor, NULL, NULL, NULL, 0); 602 ASSERT(str_cachep != NULL); 603 604 /* 605 * Create taskq to process DLPI requests. 606 */ 607 dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2, 608 INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); 609 610 /* 611 * Create a hash table for maintaining dld_str_t's. 612 * The ds_minor field (the clone minor number) of a dld_str_t 613 * is used as a key for this hash table because this number is 614 * globally unique (allocated from "dls_minor_arena"). 615 */ 616 str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ, 617 mod_hash_null_valdtor); 618 } 619 620 /* 621 * Tear down this module's data structures. 622 */ 623 int 624 dld_str_fini(void) 625 { 626 /* 627 * Make sure that there are no objects in use. 628 */ 629 if (str_count != 0) 630 return (EBUSY); 631 632 ASSERT(dld_disp_taskq != NULL); 633 taskq_destroy(dld_disp_taskq); 634 dld_disp_taskq = NULL; 635 636 /* 637 * Destroy object cache. 638 */ 639 kmem_cache_destroy(str_cachep); 640 mod_hash_destroy_idhash(str_hashp); 641 return (0); 642 } 643 644 /* 645 * Create a new dld_str_t object. 646 */ 647 dld_str_t * 648 dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style) 649 { 650 dld_str_t *dsp; 651 int err; 652 653 /* 654 * Allocate an object from the cache. 655 */ 656 atomic_add_32(&str_count, 1); 657 dsp = kmem_cache_alloc(str_cachep, KM_SLEEP); 658 659 /* 660 * Allocate the dummy mblk for flow-control. 661 */ 662 dsp->ds_tx_flow_mp = allocb(1, BPRI_HI); 663 if (dsp->ds_tx_flow_mp == NULL) { 664 kmem_cache_free(str_cachep, dsp); 665 atomic_add_32(&str_count, -1); 666 return (NULL); 667 } 668 dsp->ds_type = type; 669 dsp->ds_major = major; 670 dsp->ds_style = style; 671 dsp->ds_tx = dsp->ds_unitdata_tx = NULL; 672 673 /* 674 * Initialize the queue pointers. 
675 */ 676 ASSERT(RD(rq) == rq); 677 dsp->ds_rq = rq; 678 dsp->ds_wq = WR(rq); 679 rq->q_ptr = WR(rq)->q_ptr = (void *)dsp; 680 681 /* 682 * We want explicit control over our write-side STREAMS queue 683 * where the dummy mblk gets added/removed for flow-control. 684 */ 685 noenable(WR(rq)); 686 687 err = mod_hash_insert(str_hashp, STR_HASH_KEY(dsp->ds_minor), 688 (mod_hash_val_t)dsp); 689 ASSERT(err == 0); 690 return (dsp); 691 } 692 693 void 694 dld_finish_pending_task(dld_str_t *dsp) 695 { 696 /* 697 * Wait until the pending requests are processed by the worker thread. 698 */ 699 mutex_enter(&dsp->ds_disp_lock); 700 dsp->ds_closing = B_TRUE; 701 while (dsp->ds_tid != NULL) 702 cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock); 703 dsp->ds_closing = B_FALSE; 704 mutex_exit(&dsp->ds_disp_lock); 705 } 706 707 /* 708 * Destroy a dld_str_t object. 709 */ 710 void 711 dld_str_destroy(dld_str_t *dsp) 712 { 713 queue_t *rq; 714 queue_t *wq; 715 mod_hash_val_t val; 716 /* 717 * Clear the queue pointers. 718 */ 719 rq = dsp->ds_rq; 720 wq = dsp->ds_wq; 721 ASSERT(wq == WR(rq)); 722 723 rq->q_ptr = wq->q_ptr = NULL; 724 dsp->ds_rq = dsp->ds_wq = NULL; 725 726 ASSERT(!RW_LOCK_HELD(&dsp->ds_lock)); 727 ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock)); 728 ASSERT(dsp->ds_tx_list_head == NULL); 729 ASSERT(dsp->ds_tx_list_tail == NULL); 730 ASSERT(dsp->ds_tx_cnt == 0); 731 ASSERT(dsp->ds_tx_msgcnt == 0); 732 ASSERT(dsp->ds_tx_qdepth_tid == 0); 733 ASSERT(!dsp->ds_tx_qbusy); 734 735 ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock)); 736 ASSERT(dsp->ds_pending_head == NULL); 737 ASSERT(dsp->ds_pending_tail == NULL); 738 ASSERT(dsp->ds_tx == NULL); 739 ASSERT(dsp->ds_unitdata_tx == NULL); 740 741 /* 742 * Reinitialize all the flags. 743 */ 744 dsp->ds_notifications = 0; 745 dsp->ds_passivestate = DLD_UNINITIALIZED; 746 dsp->ds_mode = DLD_UNITDATA; 747 dsp->ds_native = B_FALSE; 748 749 /* 750 * Free the dummy mblk if exists. 
751 */ 752 if (dsp->ds_tx_flow_mp != NULL) { 753 freeb(dsp->ds_tx_flow_mp); 754 dsp->ds_tx_flow_mp = NULL; 755 } 756 757 (void) mod_hash_remove(str_hashp, STR_HASH_KEY(dsp->ds_minor), &val); 758 ASSERT(dsp == (dld_str_t *)val); 759 760 /* 761 * Free the object back to the cache. 762 */ 763 kmem_cache_free(str_cachep, dsp); 764 atomic_add_32(&str_count, -1); 765 } 766 767 /* 768 * kmem_cache contructor function: see kmem_cache_create(9f). 769 */ 770 /*ARGSUSED*/ 771 static int 772 str_constructor(void *buf, void *cdrarg, int kmflags) 773 { 774 dld_str_t *dsp = buf; 775 776 bzero(buf, sizeof (dld_str_t)); 777 778 /* 779 * Allocate a new minor number. 780 */ 781 if ((dsp->ds_minor = mac_minor_hold(kmflags == KM_SLEEP)) == 0) 782 return (-1); 783 784 /* 785 * Initialize the DLPI state machine. 786 */ 787 dsp->ds_dlstate = DL_UNATTACHED; 788 789 rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL); 790 mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL); 791 mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL); 792 cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL); 793 mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL); 794 cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL); 795 796 return (0); 797 } 798 799 /* 800 * kmem_cache destructor function. 801 */ 802 /*ARGSUSED*/ 803 static void 804 str_destructor(void *buf, void *cdrarg) 805 { 806 dld_str_t *dsp = buf; 807 808 /* 809 * Make sure the DLPI state machine was reset. 810 */ 811 ASSERT(dsp->ds_dlstate == DL_UNATTACHED); 812 813 /* 814 * Make sure the data-link interface was closed. 815 */ 816 ASSERT(dsp->ds_mh == NULL); 817 ASSERT(dsp->ds_dc == NULL); 818 ASSERT(dsp->ds_tx == NULL); 819 ASSERT(dsp->ds_unitdata_tx == NULL); 820 ASSERT(dsp->ds_intx_cnt == 0); 821 ASSERT(dsp->ds_detaching == B_FALSE); 822 823 /* 824 * Make sure enabled notifications are cleared. 825 */ 826 ASSERT(dsp->ds_notifications == 0); 827 828 /* 829 * Make sure polling is disabled. 
830 */ 831 ASSERT(!dsp->ds_polling); 832 833 /* 834 * Release the minor number. 835 */ 836 mac_minor_rele(dsp->ds_minor); 837 838 ASSERT(!RW_LOCK_HELD(&dsp->ds_lock)); 839 rw_destroy(&dsp->ds_lock); 840 841 ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock)); 842 mutex_destroy(&dsp->ds_tx_list_lock); 843 ASSERT(dsp->ds_tx_flow_mp == NULL); 844 ASSERT(dsp->ds_pending_head == NULL); 845 ASSERT(dsp->ds_pending_tail == NULL); 846 ASSERT(!dsp->ds_closing); 847 848 ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock)); 849 mutex_destroy(&dsp->ds_disp_lock); 850 cv_destroy(&dsp->ds_disp_cv); 851 852 ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock)); 853 mutex_destroy(&dsp->ds_tx_lock); 854 cv_destroy(&dsp->ds_tx_cv); 855 } 856 857 void 858 dld_tx_single(dld_str_t *dsp, mblk_t *mp) 859 { 860 /* 861 * If we are busy enqueue the packet and return. 862 * Otherwise hand them over to the MAC driver for transmission. 863 * If the message didn't get sent it will be queued. 864 * 865 * Note here that we don't grab the list lock prior to checking 866 * the busy flag. This is okay, because a missed transition 867 * will not cause any packet reordering for any particular TCP 868 * connection (which is single-threaded). The enqueue routine 869 * will atomically set the busy flag and schedule the service 870 * thread to run; the flag is only cleared by the service thread 871 * when there is no more packet to be transmitted. 872 */ 873 874 if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL)) 875 dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp)); 876 } 877 878 /* 879 * Update the priority bits and VID (may need to insert tag if mp points 880 * to an untagged packet). 881 * If vid is VLAN_ID_NONE, use the VID encoded in the packet. 
882 */ 883 static mblk_t * 884 i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid) 885 { 886 mblk_t *hmp; 887 struct ether_vlan_header *evhp; 888 struct ether_header *ehp; 889 uint16_t old_tci = 0; 890 size_t len; 891 892 ASSERT(pri != 0 || vid != VLAN_ID_NONE); 893 894 evhp = (struct ether_vlan_header *)mp->b_rptr; 895 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { 896 /* 897 * Tagged packet, update the priority bits. 898 */ 899 old_tci = ntohs(evhp->ether_tci); 900 len = sizeof (struct ether_vlan_header); 901 902 if ((DB_REF(mp) > 1) || (MBLKL(mp) < len)) { 903 /* 904 * In case some drivers only check the db_ref 905 * count of the first mblk, we pullup the 906 * message into a single mblk. 907 */ 908 hmp = msgpullup(mp, -1); 909 if ((hmp == NULL) || (MBLKL(hmp) < len)) { 910 freemsg(hmp); 911 return (NULL); 912 } else { 913 freemsg(mp); 914 mp = hmp; 915 } 916 } 917 918 evhp = (struct ether_vlan_header *)mp->b_rptr; 919 } else { 920 /* 921 * Untagged packet. Insert the special priority tag. 922 * First allocate a header mblk. 923 */ 924 hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); 925 if (hmp == NULL) 926 return (NULL); 927 928 evhp = (struct ether_vlan_header *)hmp->b_rptr; 929 ehp = (struct ether_header *)mp->b_rptr; 930 931 /* 932 * Copy the MAC addresses and typelen 933 */ 934 bcopy(ehp, evhp, (ETHERADDRL * 2)); 935 evhp->ether_type = ehp->ether_type; 936 evhp->ether_tpid = htons(ETHERTYPE_VLAN); 937 938 hmp->b_wptr += sizeof (struct ether_vlan_header); 939 mp->b_rptr += sizeof (struct ether_header); 940 941 /* 942 * Free the original message if it's now empty. Link the 943 * rest of the messages to the header message. 
944 */ 945 if (MBLKL(mp) == 0) { 946 hmp->b_cont = mp->b_cont; 947 freeb(mp); 948 } else { 949 hmp->b_cont = mp; 950 } 951 mp = hmp; 952 } 953 954 if (pri == 0) 955 pri = VLAN_PRI(old_tci); 956 if (vid == VLAN_ID_NONE) 957 vid = VLAN_ID(old_tci); 958 evhp->ether_tci = htons(VLAN_TCI(pri, VLAN_CFI(old_tci), vid)); 959 return (mp); 960 } 961 962 /* 963 * M_DATA put 964 * 965 * The poll callback function for DLS clients which are not in the per-stream 966 * mode. This function is called from an upper layer protocol (currently only 967 * tcp and udp). 968 */ 969 void 970 str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp) 971 { 972 boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); 973 mblk_t *newmp; 974 uint_t pri; 975 976 if (is_ethernet) { 977 /* 978 * Update the priority bits to the assigned priority. 979 */ 980 pri = (VLAN_MBLKPRI(mp) == 0) ? dsp->ds_pri : VLAN_MBLKPRI(mp); 981 982 if (pri != 0) { 983 newmp = i_dld_ether_header_update_tag(mp, pri, 984 VLAN_ID_NONE); 985 if (newmp == NULL) 986 goto discard; 987 mp = newmp; 988 } 989 } 990 991 dld_tx_single(dsp, mp); 992 return; 993 994 discard: 995 /* TODO: bump kstat? */ 996 freemsg(mp); 997 } 998 999 /* 1000 * M_DATA put (DLIOCRAW mode). 1001 */ 1002 void 1003 str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) 1004 { 1005 boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); 1006 mblk_t *bp, *newmp; 1007 size_t size; 1008 mac_header_info_t mhi; 1009 uint_t pri, vid; 1010 uint_t max_sdu; 1011 1012 /* 1013 * Certain MAC type plugins provide an illusion for raw DLPI 1014 * consumers. They pretend that the MAC layer is something that 1015 * it's not for the benefit of observability tools. For example, 1016 * mac_wifi pretends that it's Ethernet for such consumers. 1017 * Here, unless native mode is enabled, we call into the MAC layer so 1018 * that this illusion can be maintained. The plugin will optionally 1019 * transform the MAC header here into something that can be passed 1020 * down. 
The header goes from raw mode to "cooked" mode. 1021 */ 1022 if (!dsp->ds_native) { 1023 if ((newmp = mac_header_cook(dsp->ds_mh, mp)) == NULL) 1024 goto discard; 1025 mp = newmp; 1026 } 1027 1028 size = MBLKL(mp); 1029 1030 /* 1031 * Check the packet is not too big and that any remaining 1032 * fragment list is composed entirely of M_DATA messages. (We 1033 * know the first fragment was M_DATA otherwise we could not 1034 * have got here). 1035 */ 1036 for (bp = mp->b_cont; bp != NULL; bp = bp->b_cont) { 1037 if (DB_TYPE(bp) != M_DATA) 1038 goto discard; 1039 size += MBLKL(bp); 1040 } 1041 1042 if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0) 1043 goto discard; 1044 1045 mac_sdu_get(dsp->ds_mh, NULL, &max_sdu); 1046 /* 1047 * If LSO is enabled, check the size against lso_max. Otherwise, 1048 * compare the packet size with max_sdu. 1049 */ 1050 max_sdu = dsp->ds_lso ? dsp->ds_lso_max : max_sdu; 1051 if (size > max_sdu + mhi.mhi_hdrsize) 1052 goto discard; 1053 1054 if (is_ethernet) { 1055 /* 1056 * Discard the packet if this is a VLAN stream but the VID in 1057 * the packet is not correct. 1058 */ 1059 vid = VLAN_ID(mhi.mhi_tci); 1060 if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) 1061 goto discard; 1062 1063 /* 1064 * Discard the packet if this packet is a tagged packet 1065 * but both pri and VID are 0. 1066 */ 1067 pri = VLAN_PRI(mhi.mhi_tci); 1068 if (mhi.mhi_istagged && (pri == 0) && (vid == VLAN_ID_NONE)) 1069 goto discard; 1070 1071 /* 1072 * Update the priority bits to the per-stream priority if 1073 * priority is not set in the packet. Update the VID for 1074 * packets on a VLAN stream. 1075 */ 1076 pri = (pri == 0) ? dsp->ds_pri : 0; 1077 if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) { 1078 if ((newmp = i_dld_ether_header_update_tag(mp, 1079 pri, dsp->ds_vid)) == NULL) { 1080 goto discard; 1081 } 1082 mp = newmp; 1083 } 1084 } 1085 1086 dld_tx_single(dsp, mp); 1087 return; 1088 1089 discard: 1090 /* TODO: bump kstat? 
 */
	freemsg(mp);
}

/*
 * Process DL_ATTACH_REQ (style 2) or open(2) (style 1): bind this stream to
 * the MAC instance identified by 'ppa'.  On success the stream has cached
 * the MAC handle, MAC info, current/factory unicast addresses and VLAN id,
 * a MAC notification callback is registered, and the DLPI state moves to
 * DL_UNBOUND.  Returns 0 or an errno value on failure.
 */
int
dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa)
{
	dev_t			dev;
	int			err;
	const char		*drvname;
	dls_channel_t		dc;
	uint_t			addr_length;
	boolean_t		qassociated = B_FALSE;

	/* The stream must not already have an open channel. */
	ASSERT(dsp->ds_dc == NULL);

	if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL)
		return (EINVAL);

	/*
	 * /dev node access. This will still be supported for backward
	 * compatibility reason.  (aggr and vnic are exempt from the
	 * qassociate() step.)
	 */
	if ((dsp->ds_style == DL_STYLE2) && (strcmp(drvname, "aggr") != 0) &&
	    (strcmp(drvname, "vnic") != 0)) {
		if (qassociate(dsp->ds_wq, DLS_PPA2INST(ppa)) != 0)
			return (EINVAL);
		qassociated = B_TRUE;
	}

	/*
	 * Open a channel.  On any failure below, undo the qassociate() so
	 * the queue is left unassociated as it was on entry.
	 */
	if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) {
		/*
		 * style-2 VLAN open, this is a /dev VLAN ppa open
		 * which might result in a newly created dls_vlan_t.
		 */
		err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc);
		if (err != 0) {
			if (qassociated)
				(void) qassociate(dsp->ds_wq, -1);
			return (err);
		}
	} else {
		dev = makedevice(dsp->ds_major, (minor_t)ppa + 1);
		if ((err = dls_open_by_dev(dev, &dc)) != 0) {
			if (qassociated)
				(void) qassociate(dsp->ds_wq, -1);
			return (err);
		}
	}

	/*
	 * Cache the MAC interface handle, a pointer to the immutable MAC
	 * information and the current and 'factory' MAC address.
	 */
	dsp->ds_mh = dls_mac(dc);
	dsp->ds_mip = mac_info(dsp->ds_mh);

	mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

	addr_length = dsp->ds_mip->mi_addr_length;
	bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length);

	/*
	 * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for
	 * a non-VLAN interface).
	 */
	dsp->ds_vid = dls_vid(dc);

	/*
	 * Set the default packet priority.
	 */
	dsp->ds_pri = 0;

	/*
	 * Add a notify function so that the we get updates from the MAC.
	 */
	dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp);

	dsp->ds_dc = dc;
	dsp->ds_dlstate = DL_UNBOUND;

	return (0);
}

/*
 * Process DL_DETACH_REQ (style 2) or close(2) (style 1). Can also be called
 * from close(2) for style 2.  Undoes dld_str_attach(): removes the MAC
 * notification callback, disables capabilities, quiesces and flushes the
 * transmit side, closes the dls channel and returns the DLPI state machine
 * to DL_UNATTACHED.
 */
void
dld_str_detach(dld_str_t *dsp)
{
	/*
	 * Remove the notify function.
	 */
	mac_notify_remove(dsp->ds_mh, dsp->ds_mnh);

	/*
	 * Disable the capabilities and clear the promisc flag.
	 */
	ASSERT(!dsp->ds_polling);
	ASSERT(!dsp->ds_soft_ring);
	dld_capabilities_disable(dsp);
	dsp->ds_promisc = 0;

	/*
	 * NOTE(review): DLD_TX_QUIESCE presumably waits for in-flight
	 * transmits to drain before we tear down below — macro is defined
	 * elsewhere; confirm.
	 */
	DLD_TX_QUIESCE(dsp);

	/*
	 * Flush all pending packets which are sitting in the transmit queue.
	 */
	dld_tx_flush(dsp);

	/*
	 * Clear LSO flags.
	 */
	dsp->ds_lso = B_FALSE;
	dsp->ds_lso_max = 0;

	dls_close(dsp->ds_dc);
	dsp->ds_dc = NULL;
	dsp->ds_mh = NULL;

	/* Style-2 streams were qassociate()d at attach time; undo that. */
	if (dsp->ds_style == DL_STYLE2)
		(void) qassociate(dsp->ds_wq, -1);

	/*
	 * Re-initialize the DLPI state machine.
	 */
	dsp->ds_dlstate = DL_UNATTACHED;

}

/*
 * This function is only called for VLAN streams. In raw mode, we strip VLAN
 * tags before sending packets up to the DLS clients, with the exception of
 * special priority tagged packets, in that case, we set the VID to 0.
 * mp must be a VLAN tagged packet.
 */
static mblk_t *
i_dld_ether_header_strip_tag(mblk_t *mp)
{
	mblk_t			*newmp;
	struct ether_vlan_header *evhp;
	uint16_t		tci, new_tci;

	/* Caller guarantees a full VLAN header in the first mblk. */
	ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
	if (DB_REF(mp) > 1) {
		/*
		 * The dblk is shared; copy before modifying the header in
		 * place so we do not corrupt other references.
		 */
		newmp = copymsg(mp);
		if (newmp == NULL)
			return (NULL);
		freemsg(mp);
		mp = newmp;
	}
	evhp = (struct ether_vlan_header *)mp->b_rptr;

	tci = ntohs(evhp->ether_tci);
	if (VLAN_PRI(tci) == 0) {
		/*
		 * Priority is 0, strip the tag: slide the two MAC addresses
		 * forward over the 4-byte tag and advance b_rptr past it.
		 */
		ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL);
		mp->b_rptr += VLAN_TAGSZ;
	} else {
		/*
		 * Priority is not 0, update the VID to 0 (keep the tag so
		 * the priority bits are preserved).
		 */
		new_tci = VLAN_TCI(VLAN_PRI(tci), VLAN_CFI(tci), VLAN_ID_NONE);
		evhp->ether_tci = htons(new_tci);
	}
	return (mp);
}

/*
 * Raw mode receive function.  Walks a b_next chain of received packets,
 * winds b_rptr back over the MAC header and delivers each packet (header
 * included) to the stream's read queue.
 */
/*ARGSUSED*/
void
dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	boolean_t		is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER);
	mblk_t			*next, *newmp;

	ASSERT(mp != NULL);
	do {
		/*
		 * Get the pointer to the next packet in the chain and then
		 * clear b_next before the packet gets passed on.
		 */
		next = mp->b_next;
		mp->b_next = NULL;

		/*
		 * Wind back b_rptr to point at the MAC header.
		 */
		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
		mp->b_rptr -= mhip->mhi_hdrsize;

		/*
		 * Certain MAC type plugins provide an illusion for raw
		 * DLPI consumers. They pretend that the MAC layer is
		 * something that it's not for the benefit of observability
		 * tools. For example, mac_wifi pretends that it's Ethernet
		 * for such consumers. Here, unless native mode is enabled,
		 * we call into the MAC layer so that this illusion can be
		 * maintained. The plugin will optionally transform the MAC
		 * header here into something that can be passed up to raw
		 * consumers. The header goes from "cooked" mode to raw mode.
		 */
		if (!dsp->ds_native) {
			newmp = mac_header_uncook(dsp->ds_mh, mp);
			if (newmp == NULL) {
				/* Transformation failed; drop this packet. */
				freemsg(mp);
				goto next;
			}
			mp = newmp;
		}

		/*
		 * Strip the VLAN tag for VLAN streams.
		 */
		if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) {
			newmp = i_dld_ether_header_strip_tag(mp);
			if (newmp == NULL) {
				/* copymsg() failure; drop this packet. */
				freemsg(mp);
				goto next;
			}
			mp = newmp;
		}

		/*
		 * Pass the packet on (dropped rather than queued if the
		 * stream is flow-controlled).
		 */
		if (canputnext(dsp->ds_rq))
			putnext(dsp->ds_rq, mp);
		else
			freemsg(mp);

next:
		/*
		 * Move on to the next packet in the chain.
		 */
		mp = next;
	} while (mp != NULL);
}

/*
 * Fast-path receive function.
 */
/*ARGSUSED*/
void
dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    mac_header_info_t *mhip)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	mblk_t			*next;
	size_t			offset = 0;

	/*
	 * MAC header stripping rules:
	 * - Tagged packets:
	 *   a. VLAN streams. Strip the whole VLAN header including the tag.
	 *   b. Physical streams
	 *   - VLAN packets (non-zero VID). The stream must be either a
	 *     DL_PROMISC_SAP listener or a ETHERTYPE_VLAN listener.
	 *     Strip the Ethernet header but keep the VLAN header.
	 *   - Special tagged packets (zero VID)
	 *     * The stream is either a DL_PROMISC_SAP listener or a
	 *       ETHERTYPE_VLAN listener, strip the Ethernet header but
	 *       keep the VLAN header.
	 *     * Otherwise, strip the whole VLAN header.
	 * - Untagged packets. Strip the whole MAC header.
1368 */ 1369 if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) && 1370 ((dsp->ds_sap == ETHERTYPE_VLAN) || 1371 (dsp->ds_promisc & DLS_PROMISC_SAP))) { 1372 offset = VLAN_TAGSZ; 1373 } 1374 1375 ASSERT(mp != NULL); 1376 do { 1377 /* 1378 * Get the pointer to the next packet in the chain and then 1379 * clear b_next before the packet gets passed on. 1380 */ 1381 next = mp->b_next; 1382 mp->b_next = NULL; 1383 1384 /* 1385 * Wind back b_rptr to point at the VLAN header. 1386 */ 1387 ASSERT(mp->b_rptr >= DB_BASE(mp) + offset); 1388 mp->b_rptr -= offset; 1389 1390 /* 1391 * Pass the packet on. 1392 */ 1393 if (canputnext(dsp->ds_rq)) 1394 putnext(dsp->ds_rq, mp); 1395 else 1396 freemsg(mp); 1397 /* 1398 * Move on to the next packet in the chain. 1399 */ 1400 mp = next; 1401 } while (mp != NULL); 1402 } 1403 1404 /* 1405 * Default receive function (send DL_UNITDATA_IND messages). 1406 */ 1407 /*ARGSUSED*/ 1408 void 1409 dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp, 1410 mac_header_info_t *mhip) 1411 { 1412 dld_str_t *dsp = (dld_str_t *)arg; 1413 mblk_t *ud_mp; 1414 mblk_t *next; 1415 size_t offset = 0; 1416 boolean_t strip_vlan = B_TRUE; 1417 1418 /* 1419 * See MAC header stripping rules in the dld_str_rx_fastpath() function. 1420 */ 1421 if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) && 1422 ((dsp->ds_sap == ETHERTYPE_VLAN) || 1423 (dsp->ds_promisc & DLS_PROMISC_SAP))) { 1424 offset = VLAN_TAGSZ; 1425 strip_vlan = B_FALSE; 1426 } 1427 1428 ASSERT(mp != NULL); 1429 do { 1430 /* 1431 * Get the pointer to the next packet in the chain and then 1432 * clear b_next before the packet gets passed on. 1433 */ 1434 next = mp->b_next; 1435 mp->b_next = NULL; 1436 1437 /* 1438 * Wind back b_rptr to point at the MAC header. 1439 */ 1440 ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); 1441 mp->b_rptr -= mhip->mhi_hdrsize; 1442 1443 /* 1444 * Create the DL_UNITDATA_IND M_PROTO. 
1445 */ 1446 if ((ud_mp = str_unitdata_ind(dsp, mp, strip_vlan)) == NULL) { 1447 freemsgchain(mp); 1448 return; 1449 } 1450 1451 /* 1452 * Advance b_rptr to point at the payload (or the VLAN header). 1453 */ 1454 mp->b_rptr += (mhip->mhi_hdrsize - offset); 1455 1456 /* 1457 * Prepend the DL_UNITDATA_IND. 1458 */ 1459 ud_mp->b_cont = mp; 1460 1461 /* 1462 * Send the message. 1463 */ 1464 if (canputnext(dsp->ds_rq)) 1465 putnext(dsp->ds_rq, ud_mp); 1466 else 1467 freemsg(ud_mp); 1468 1469 /* 1470 * Move on to the next packet in the chain. 1471 */ 1472 mp = next; 1473 } while (mp != NULL); 1474 } 1475 1476 /* 1477 * DL_NOTIFY_IND: DL_NOTE_SDU_SIZE 1478 */ 1479 static void 1480 str_notify_sdu_size(dld_str_t *dsp, uint_t max_sdu) 1481 { 1482 mblk_t *mp; 1483 dl_notify_ind_t *dlip; 1484 1485 if (!(dsp->ds_notifications & DL_NOTE_SDU_SIZE)) 1486 return; 1487 1488 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t), 1489 M_PROTO, 0)) == NULL) 1490 return; 1491 1492 bzero(mp->b_rptr, sizeof (dl_notify_ind_t)); 1493 dlip = (dl_notify_ind_t *)mp->b_rptr; 1494 dlip->dl_primitive = DL_NOTIFY_IND; 1495 dlip->dl_notification = DL_NOTE_SDU_SIZE; 1496 dlip->dl_data = max_sdu; 1497 1498 qreply(dsp->ds_wq, mp); 1499 } 1500 1501 /* 1502 * Generate DL_NOTIFY_IND messages to notify the DLPI consumer of the 1503 * current state of the interface. 1504 */ 1505 void 1506 dld_str_notify_ind(dld_str_t *dsp) 1507 { 1508 mac_notify_type_t type; 1509 1510 for (type = 0; type < MAC_NNOTE; type++) 1511 str_notify(dsp, type); 1512 } 1513 1514 typedef struct dl_unitdata_ind_wrapper { 1515 dl_unitdata_ind_t dl_unitdata; 1516 uint8_t dl_dest_addr[MAXMACADDRLEN + sizeof (uint16_t)]; 1517 uint8_t dl_src_addr[MAXMACADDRLEN + sizeof (uint16_t)]; 1518 } dl_unitdata_ind_wrapper_t; 1519 1520 /* 1521 * Create a DL_UNITDATA_IND M_PROTO message. 
 */
static mblk_t *
str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan)
{
	mblk_t				*nmp;
	dl_unitdata_ind_wrapper_t	*dlwp;
	dl_unitdata_ind_t		*dlp;
	mac_header_info_t		mhi;
	uint_t				addr_length;
	uint8_t				*daddr;
	uint8_t				*saddr;

	/*
	 * Get the packet header information.
	 */
	if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0)
		return (NULL);

	/*
	 * Allocate a message large enough to contain the wrapper structure
	 * defined above.
	 */
	if ((nmp = mexchange(dsp->ds_wq, NULL,
	    sizeof (dl_unitdata_ind_wrapper_t), M_PROTO,
	    DL_UNITDATA_IND)) == NULL)
		return (NULL);

	dlwp = (dl_unitdata_ind_wrapper_t *)nmp->b_rptr;

	dlp = &(dlwp->dl_unitdata);
	ASSERT(dlp == (dl_unitdata_ind_t *)nmp->b_rptr);
	ASSERT(dlp->dl_primitive == DL_UNITDATA_IND);

	/*
	 * Copy in the destination address.  Offsets are expressed relative
	 * to the start of the primitive, per DLPI convention.
	 */
	addr_length = dsp->ds_mip->mi_addr_length;
	daddr = dlwp->dl_dest_addr;
	dlp->dl_dest_addr_offset = (uintptr_t)daddr - (uintptr_t)dlp;
	bcopy(mhi.mhi_daddr, daddr, addr_length);

	/*
	 * Set the destination DLSAP to the SAP value encoded in the packet.
	 * Tagged packets whose tag is being kept report ETHERTYPE_VLAN.
	 */
	if (mhi.mhi_istagged && !strip_vlan)
		*(uint16_t *)(daddr + addr_length) = ETHERTYPE_VLAN;
	else
		*(uint16_t *)(daddr + addr_length) = mhi.mhi_bindsap;
	dlp->dl_dest_addr_length = addr_length + sizeof (uint16_t);

	/*
	 * If the destination address was multicast or broadcast then the
	 * dl_group_address field should be non-zero.
	 */
	dlp->dl_group_address = (mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) ||
	    (mhi.mhi_dsttype == MAC_ADDRTYPE_BROADCAST);

	/*
	 * Copy in the source address if one exists. Some MAC types (DL_IB
	 * for example) may not have access to source information.
	 */
	if (mhi.mhi_saddr == NULL) {
		dlp->dl_src_addr_offset = dlp->dl_src_addr_length = 0;
	} else {
		saddr = dlwp->dl_src_addr;
		dlp->dl_src_addr_offset = (uintptr_t)saddr - (uintptr_t)dlp;
		bcopy(mhi.mhi_saddr, saddr, addr_length);

		/*
		 * Set the source DLSAP to the packet ethertype.
		 */
		*(uint16_t *)(saddr + addr_length) = mhi.mhi_origsap;
		dlp->dl_src_addr_length = addr_length + sizeof (uint16_t);
	}

	return (nmp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_PROMISC_ON_PHYS
 * Sent when the device enters physical promiscuous mode.
 */
static void
str_notify_promisc_on_phys(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_ON_PHYS))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PROMISC_ON_PHYS;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_PROMISC_OFF_PHYS
 * Sent when the device leaves physical promiscuous mode.
 */
static void
str_notify_promisc_off_phys(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_PROMISC_OFF_PHYS))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PROMISC_OFF_PHYS;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_PHYS_ADDR
 * Report the (new) current physical address, followed by the bound SAP,
 * in the trailing address area of the notification.
 */
static void
str_notify_phys_addr(dld_str_t *dsp, const uint8_t *addr)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;
	uint_t		addr_length;
	uint16_t	ethertype;

	if (!(dsp->ds_notifications & DL_NOTE_PHYS_ADDR))
		return;

	addr_length = dsp->ds_mip->mi_addr_length;
	if ((mp = mexchange(dsp->ds_wq, NULL,
	    sizeof (dl_notify_ind_t) + addr_length + sizeof (uint16_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t))
;
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_PHYS_ADDR;
	dlip->dl_data = DL_CURR_PHYS_ADDR;
	dlip->dl_addr_offset = sizeof (dl_notify_ind_t);
	dlip->dl_addr_length = addr_length + sizeof (uint16_t);

	/* The address immediately follows the fixed-size primitive. */
	bcopy(addr, &dlip[1], addr_length);

	/* SAPs below the 802 minimum are not ethertypes; report 0. */
	ethertype = (dsp->ds_sap < ETHERTYPE_802_MIN) ? 0 : dsp->ds_sap;
	*(uint16_t *)((uchar_t *)(dlip + 1) + addr_length) = ethertype;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_LINK_UP
 */
static void
str_notify_link_up(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_LINK_UP))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_LINK_UP;

	qreply(dsp->ds_wq, mp);
}

/*
 * DL_NOTIFY_IND: DL_NOTE_LINK_DOWN
 */
static void
str_notify_link_down(dld_str_t *dsp)
{
	mblk_t		*mp;
	dl_notify_ind_t	*dlip;

	if (!(dsp->ds_notifications & DL_NOTE_LINK_DOWN))
		return;

	if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t),
	    M_PROTO, 0)) == NULL)
		return;

	bzero(mp->b_rptr, sizeof (dl_notify_ind_t));
	dlip = (dl_notify_ind_t *)mp->b_rptr;
	dlip->dl_primitive = DL_NOTIFY_IND;
	dlip->dl_notification = DL_NOTE_LINK_DOWN;

	qreply(dsp->ds_wq, mp);
}
1731 1732 /* 1733 * DL_NOTIFY_IND: DL_NOTE_SPEED 1734 */ 1735 static void 1736 str_notify_speed(dld_str_t *dsp, uint32_t speed) 1737 { 1738 mblk_t *mp; 1739 dl_notify_ind_t *dlip; 1740 1741 if (!(dsp->ds_notifications & DL_NOTE_SPEED)) 1742 return; 1743 1744 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t), 1745 M_PROTO, 0)) == NULL) 1746 return; 1747 1748 bzero(mp->b_rptr, sizeof (dl_notify_ind_t)); 1749 dlip = (dl_notify_ind_t *)mp->b_rptr; 1750 dlip->dl_primitive = DL_NOTIFY_IND; 1751 dlip->dl_notification = DL_NOTE_SPEED; 1752 dlip->dl_data = speed; 1753 1754 qreply(dsp->ds_wq, mp); 1755 } 1756 1757 /* 1758 * DL_NOTIFY_IND: DL_NOTE_CAPAB_RENEG 1759 */ 1760 static void 1761 str_notify_capab_reneg(dld_str_t *dsp) 1762 { 1763 mblk_t *mp; 1764 dl_notify_ind_t *dlip; 1765 1766 if (!(dsp->ds_notifications & DL_NOTE_CAPAB_RENEG)) 1767 return; 1768 1769 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t), 1770 M_PROTO, 0)) == NULL) 1771 return; 1772 1773 bzero(mp->b_rptr, sizeof (dl_notify_ind_t)); 1774 dlip = (dl_notify_ind_t *)mp->b_rptr; 1775 dlip->dl_primitive = DL_NOTIFY_IND; 1776 dlip->dl_notification = DL_NOTE_CAPAB_RENEG; 1777 1778 qreply(dsp->ds_wq, mp); 1779 } 1780 1781 /* 1782 * DL_NOTIFY_IND: DL_NOTE_FASTPATH_FLUSH 1783 */ 1784 static void 1785 str_notify_fastpath_flush(dld_str_t *dsp) 1786 { 1787 mblk_t *mp; 1788 dl_notify_ind_t *dlip; 1789 1790 if (!(dsp->ds_notifications & DL_NOTE_FASTPATH_FLUSH)) 1791 return; 1792 1793 if ((mp = mexchange(dsp->ds_wq, NULL, sizeof (dl_notify_ind_t), 1794 M_PROTO, 0)) == NULL) 1795 return; 1796 1797 bzero(mp->b_rptr, sizeof (dl_notify_ind_t)); 1798 dlip = (dl_notify_ind_t *)mp->b_rptr; 1799 dlip->dl_primitive = DL_NOTIFY_IND; 1800 dlip->dl_notification = DL_NOTE_FASTPATH_FLUSH; 1801 1802 qreply(dsp->ds_wq, mp); 1803 } 1804 1805 /* 1806 * MAC notification callback. 
 */
static void
str_notify(void *arg, mac_notify_type_t type)
{
	dld_str_t		*dsp = (dld_str_t *)arg;
	queue_t			*q = dsp->ds_wq;

	switch (type) {
	case MAC_NOTE_TX:
		/* TX resources available again; restart the service thread. */
		qenable(q);
		break;

	case MAC_NOTE_DEVPROMISC:
		/*
		 * Send the appropriate DL_NOTIFY_IND.
		 */
		if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC))
			str_notify_promisc_on_phys(dsp);
		else
			str_notify_promisc_off_phys(dsp);
		break;

	case MAC_NOTE_PROMISC:
		break;

	case MAC_NOTE_UNICST:
		/*
		 * This notification is sent whenever the MAC unicast address
		 * changes. We need to re-cache the address.
		 */
		mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr);

		/*
		 * Send the appropriate DL_NOTIFY_IND.
		 */
		str_notify_phys_addr(dsp, dsp->ds_curr_addr);
		break;

	case MAC_NOTE_LINK:
		/*
		 * This notification is sent every time the MAC driver
		 * updates the link state.
		 */
		switch (mac_link_get(dsp->ds_mh)) {
		case LINK_STATE_UP: {
			uint64_t speed;
			/*
			 * The link is up so send the appropriate
			 * DL_NOTIFY_IND.
			 */
			str_notify_link_up(dsp);

			/* MAC reports bps; DL_NOTE_SPEED wants kbps. */
			speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
			str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
			break;
		}
		case LINK_STATE_DOWN:
			/*
			 * The link is down so send the appropriate
			 * DL_NOTIFY_IND.
			 */
			str_notify_link_down(dsp);
			break;

		default:
			break;
		}
		break;

	case MAC_NOTE_RESOURCE:
	case MAC_NOTE_VNIC:
		/*
		 * This notification is sent whenever the MAC resources
		 * change or capabilities change. We need to renegotiate
		 * the capabilities. Send the appropriate DL_NOTIFY_IND.
		 */
		str_notify_capab_reneg(dsp);
		break;

	case MAC_NOTE_SDU_SIZE: {
		uint_t  max_sdu;
		mac_sdu_get(dsp->ds_mh, NULL, &max_sdu);
		str_notify_sdu_size(dsp, max_sdu);
		break;
	}

	case MAC_NOTE_FASTPATH_FLUSH:
		str_notify_fastpath_flush(dsp);
		break;

	case MAC_NOTE_MARGIN:
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}
}

/*
 * Return the data size of a single M_DATA message; use the cheap single
 * mblk length when there is no continuation chain.
 */
static inline uint_t
mp_getsize(mblk_t *mp)
{
	ASSERT(DB_TYPE(mp) == M_DATA);
	return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
}

/*
 * Calculate the dld queue depth, free the messages that exceed the threshold.
 * Runs as a timeout() callback; re-derives ds_tx_cnt/ds_tx_msgcnt by walking
 * the queue, truncating it at the point where either limit is exceeded.
 */
static void
dld_tx_qdepth_timer(void *arg)
{
	dld_str_t *dsp = (dld_str_t *)arg;
	mblk_t *prev, *mp;
	uint_t cnt, msgcnt, size;

	mutex_enter(&dsp->ds_tx_list_lock);

	/* Calculate total size and count of the packet(s) */
	cnt = msgcnt = 0;
	for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
	    prev = mp, mp = mp->b_next) {
		size = mp_getsize(mp);
		cnt += size;
		msgcnt++;
		if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
			/* Over the limit: drop everything from here on. */
			ASSERT(dsp->ds_tx_qbusy);
			dsp->ds_tx_list_tail = prev;
			if (prev == NULL)
				dsp->ds_tx_list_head = NULL;
			else
				prev->b_next = NULL;
			freemsgchain(mp);
			cnt -= size;
			msgcnt--;
			break;
		}
	}
	dsp->ds_tx_cnt = cnt;
	dsp->ds_tx_msgcnt = msgcnt;
	dsp->ds_tx_qdepth_tid = 0;
	mutex_exit(&dsp->ds_tx_list_lock);
}

/*
 * Enqueue one or more messages on the transmit queue. Caller specifies:
 *  - the insertion position (head/tail).
 *  - the message count and the total message size of messages to be queued
 *    if they are known to the caller; or 0 if they are not known.
 *
 * If the caller does not know the message size information, this usually
 * means that dld_wsrv() managed to send some but not all of the queued
 * messages. For performance reasons, we do not calculate the queue depth
 * every time. Instead, a timer is started to calculate the queue depth
 * every 1 second (can be changed by tx_qdepth_interval).
 */
static void
dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert,
    uint_t msgcnt, uint_t cnt)
{
	queue_t *q = dsp->ds_wq;
	uint_t tot_cnt, tot_msgcnt;
	mblk_t *next;

	mutex_enter(&dsp->ds_tx_list_lock);

	/*
	 * Simply enqueue the message and calculate the queue depth via
	 * timer if:
	 *
	 * - the current queue depth is incorrect, and the timer is already
	 *   started; or
	 *
	 * - the given message size is unknown and it is allowed to start the
	 *   timer;
	 */
	if ((dsp->ds_tx_qdepth_tid != 0) ||
	    (msgcnt == 0 && tx_qdepth_interval != 0)) {
		goto enqueue;
	}

	/*
	 * The timer is not allowed, so calculate the message size now.
	 */
	if (msgcnt == 0) {
		for (next = mp; next != NULL; next = next->b_next) {
			cnt += mp_getsize(next);
			msgcnt++;
		}
	}

	/*
	 * Grow the queue depth using the input messesge size.
	 *
	 * If the queue depth would exceed the allowed threshold, drop
	 * new packet(s) and drain those already in the queue.
	 */
	tot_cnt = dsp->ds_tx_cnt + cnt;
	tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt;

	if (!head_insert && (tot_cnt >= dld_max_q_count ||
	    tot_msgcnt >= dld_max_q_count)) {
		ASSERT(dsp->ds_tx_qbusy);
		mutex_exit(&dsp->ds_tx_list_lock);
		freemsgchain(mp);
		goto done;
	}
	/* Update the queue size parameters */
	dsp->ds_tx_cnt = tot_cnt;
	dsp->ds_tx_msgcnt = tot_msgcnt;

enqueue:
	/*
	 * If the transmit queue is currently empty and we are
	 * about to deposit the packet(s) there, switch mode to
	 * "busy" and raise flow-control condition.
	 */
	if (!dsp->ds_tx_qbusy) {
		dsp->ds_tx_qbusy = B_TRUE;
		ASSERT(dsp->ds_tx_flow_mp != NULL);
		(void) putq(q, dsp->ds_tx_flow_mp);
		dsp->ds_tx_flow_mp = NULL;
	}

	if (!head_insert) {
		/* Tail insertion */
		if (dsp->ds_tx_list_head == NULL)
			dsp->ds_tx_list_head = mp;
		else
			dsp->ds_tx_list_tail->b_next = mp;
		dsp->ds_tx_list_tail = tail;
	} else {
		/* Head insertion */
		tail->b_next = dsp->ds_tx_list_head;
		if (dsp->ds_tx_list_head == NULL)
			dsp->ds_tx_list_tail = tail;
		dsp->ds_tx_list_head = mp;
	}

	if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 &&
	    tx_qdepth_interval != 0) {
		/*
		 * The message size is not given so that we need to start
		 * the timer to calculate the queue depth.
		 */
		dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp,
		    drv_usectohz(tx_qdepth_interval));
		ASSERT(dsp->ds_tx_qdepth_tid != NULL);
	}
	mutex_exit(&dsp->ds_tx_list_lock);
done:
	/* Schedule service thread to drain the transmit queue */
	if (!head_insert)
		qenable(q);
}

/*
 * Discard everything on the transmit queue, clear the flow-control
 * condition if it was raised, and cancel the queue-depth timer.
 */
void
dld_tx_flush(dld_str_t *dsp)
{
	timeout_id_t	tid = 0;

	mutex_enter(&dsp->ds_tx_list_lock);
	if (dsp->ds_tx_list_head != NULL) {
		freemsgchain(dsp->ds_tx_list_head);
		dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL;
		dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0;
		if (dsp->ds_tx_qbusy) {
			/* Reclaim the flow-control mblk from the queue. */
			dsp->ds_tx_flow_mp = getq(dsp->ds_wq);
			ASSERT(dsp->ds_tx_flow_mp != NULL);
			dsp->ds_tx_qbusy = B_FALSE;
		}
		if ((tid = dsp->ds_tx_qdepth_tid) != 0)
			dsp->ds_tx_qdepth_tid = 0;
	}
	mutex_exit(&dsp->ds_tx_list_lock);

	/*
	 * Note that ds_tx_list_lock (which is acquired by the timeout
	 * callback routine) cannot be held across the call to untimeout().
	 */
	if (tid != 0)
		(void) untimeout(tid);
}

/*
 * Process a non-data message: queue it for the dispatch taskq so that
 * potentially-blocking processing happens off the put(9E) path.
 */
static void
dld_wput_nondata(dld_str_t *dsp, mblk_t *mp)
{
	ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) ||
	    (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL));

	mutex_enter(&dsp->ds_disp_lock);

	/*
	 * The processing of the message might block. Enqueue the
	 * message for later processing.
	 */
	if (dsp->ds_pending_head == NULL) {
		dsp->ds_pending_head = dsp->ds_pending_tail = mp;
	} else {
		dsp->ds_pending_tail->b_next = mp;
		dsp->ds_pending_tail = mp;
	}

	/*
	 * If there is no task pending, kick off the task.
	 */
	if (dsp->ds_tid == NULL) {
		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
		    dld_wput_nondata_task, dsp, TQ_SLEEP);
		ASSERT(dsp->ds_tid != NULL);
	}
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * The worker thread which processes non-data messages. Note we only process
 * one message at one time in order to be able to "flush" the queued message
 * and serialize the processing.
 */
static void
dld_wput_nondata_task(void *arg)
{
	dld_str_t	*dsp = (dld_str_t *)arg;
	mblk_t		*mp;

	mutex_enter(&dsp->ds_disp_lock);
	ASSERT(dsp->ds_pending_head != NULL);
	ASSERT(dsp->ds_tid != NULL);

	if (dsp->ds_closing)
		goto closing;

	/* Dequeue exactly one message. */
	mp = dsp->ds_pending_head;
	if ((dsp->ds_pending_head = mp->b_next) == NULL)
		dsp->ds_pending_tail = NULL;
	mp->b_next = NULL;

	/* Drop the lock: the handlers below may block. */
	mutex_exit(&dsp->ds_disp_lock);

	switch (DB_TYPE(mp)) {
	case M_PROTO:
	case M_PCPROTO:
		ASSERT(dsp->ds_type == DLD_DLPI);
		dld_wput_proto_nondata(dsp, mp);
		break;
	case M_IOCTL: {
		uint_t cmd;

		if (dsp->ds_type == DLD_CONTROL) {
			ASSERT(dsp->ds_ioctl != NULL);
			dsp->ds_ioctl(dsp->ds_wq, mp);
			break;
		}

		cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;

		switch (cmd) {
		case DLIOCNATIVE:
			ioc_native(dsp, mp);
			break;
		case DLIOCMARGININFO:
			ioc_margin(dsp, mp);
			break;
		case DLIOCRAW:
			ioc_raw(dsp, mp);
			break;
		case DLIOCHDRINFO:
			ioc_fast(dsp, mp);
			break;
		default:
			ioc(dsp, mp);
			break;
		}
		break;
	}
	case M_IOCDATA:
		ASSERT(dsp->ds_type == DLD_DLPI);
		ioc(dsp, mp);
		break;
	}

	mutex_enter(&dsp->ds_disp_lock);

	if (dsp->ds_closing)
		goto closing;

	/* Re-dispatch if more messages arrived while we were processing. */
	if (dsp->ds_pending_head != NULL) {
		dsp->ds_tid = taskq_dispatch(dld_disp_taskq,
		    dld_wput_nondata_task, dsp, TQ_SLEEP);
		ASSERT(dsp->ds_tid != NULL);
	} else {
		dsp->ds_tid = NULL;
	}
	mutex_exit(&dsp->ds_disp_lock);
	return;

	/*
	 * If the stream is closing, flush all queued messages and inform
	 * the stream once it is done.
	 */
closing:
	freemsgchain(dsp->ds_pending_head);
	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
	dsp->ds_tid = NULL;
	cv_signal(&dsp->ds_disp_cv);
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * Flush queued non-data messages.
 */
static void
dld_flush_nondata(dld_str_t *dsp)
{
	mutex_enter(&dsp->ds_disp_lock);
	freemsgchain(dsp->ds_pending_head);
	dsp->ds_pending_head = dsp->ds_pending_tail = NULL;
	mutex_exit(&dsp->ds_disp_lock);
}

/*
 * DLIOCNATIVE
 * Switch the stream to native mode (stop the MAC-type "illusion" performed
 * by mac_header_cook()/mac_header_uncook()).  Acks with the native media
 * type, or naks with ENOTSUP if the media has no distinct native type.
 */
static void
ioc_native(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	const mac_info_t *mip = dsp->ds_mip;

	rw_enter(&dsp->ds_lock, RW_WRITER);

	/*
	 * Native mode can be enabled if it's disabled and if the
	 * native media type is different.
	 */
	if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia)
		dsp->ds_native = B_TRUE;

	rw_exit(&dsp->ds_lock);

	if (dsp->ds_native)
		miocack(q, mp, 0, mip->mi_nativemedia);
	else
		miocnak(q, mp, 0, ENOTSUP);
}

/*
 * DLIOCMARGININFO
 * Return the MAC's current margin (extra header room beyond the SDU) in the
 * ioctl's data block.  Requires the stream to be attached.
 */
static void
ioc_margin(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;
	uint32_t margin;
	int err;

	if (dsp->ds_dlstate == DL_UNATTACHED) {
		err = EINVAL;
		goto failed;
	}
	if ((err = miocpullup(mp, sizeof (uint32_t))) != 0)
		goto failed;

	mac_margin_get(dsp->ds_mh, &margin);
	*((uint32_t *)mp->b_cont->b_rptr) = margin;
	miocack(q, mp, sizeof (uint32_t), 0);
	return;

failed:
	miocnak(q, mp, 0, err);
}

/*
 * DLIOCRAW
 * Switch the stream to raw mode: packets flow up/down with their MAC
 * headers intact.  Not permitted while polling or soft-ring is active.
 */
static void
ioc_raw(dld_str_t *dsp, mblk_t *mp)
{
	queue_t *q = dsp->ds_wq;

	if (dsp->ds_polling || dsp->ds_soft_ring) {
		miocnak(q, mp, 0, EPROTO);
		return;
	}

	rw_enter(&dsp->ds_lock, RW_WRITER);
	if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) {
		/*
		 * Set the receive callback.
		 */
		dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp);
		dsp->ds_tx = str_mdata_raw_put;
	}
	dsp->ds_mode = DLD_RAW;
	rw_exit(&dsp->ds_lock);
	miocack(q, mp, 0, 0);
}

/*
 * DLIOCHDRINFO
 * Fast-path header request: validate the embedded DL_UNITDATA_REQ, build
 * the MAC header for the given address/SAP, switch the stream into
 * fast-path mode, and return the header to the caller (IP).
 */
static void
ioc_fast(dld_str_t *dsp, mblk_t *mp)
{
	dl_unitdata_req_t *dlp;
	off_t off;
	size_t len;
	const uint8_t *addr;
	uint16_t sap;
	mblk_t *nmp;
	mblk_t *hmp;
	uint_t addr_length;
	queue_t *q = dsp->ds_wq;
	int err;

	if (dld_opt & DLD_OPT_NO_FASTPATH) {
		err = ENOTSUP;
		goto failed;
	}

	/*
	 * DLIOCHDRINFO should only come from IP. The one initiated from
	 * user-land should not be allowed.
	 */
	if (((struct iocblk *)mp->b_rptr)->ioc_cr != kcred) {
		err = EINVAL;
		goto failed;
	}

	nmp = mp->b_cont;
	if (nmp == NULL || MBLKL(nmp) < sizeof (dl_unitdata_req_t) ||
	    (dlp = (dl_unitdata_req_t *)nmp->b_rptr,
	    dlp->dl_primitive != DL_UNITDATA_REQ)) {
		err = EINVAL;
		goto failed;
	}

	off = dlp->dl_dest_addr_offset;
	len = dlp->dl_dest_addr_length;

	/* The destination address must lie entirely within the mblk. */
	if (!MBLKIN(nmp, off, len)) {
		err = EINVAL;
		goto failed;
	}

	/*
	 * We don't need to hold any locks to access ds_dlstate, because
	 * control message prossessing (which updates this field) is
	 * serialized.
	 */
	if (dsp->ds_dlstate != DL_IDLE) {
		err = ENOTSUP;
		goto failed;
	}

	addr_length = dsp->ds_mip->mi_addr_length;
	if (len != addr_length + sizeof (uint16_t)) {
		err = EINVAL;
		goto failed;
	}

	addr = nmp->b_rptr + off;
	sap = *(uint16_t *)(nmp->b_rptr + off + addr_length);

	if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) {
		err = ENOMEM;
		goto failed;
	}

	rw_enter(&dsp->ds_lock, RW_WRITER);
	ASSERT(dsp->ds_dlstate == DL_IDLE);
	if (dsp->ds_mode != DLD_FASTPATH) {
		/*
		 * Set the receive callback (unless polling or
		 * soft-ring is enabled).
		 */
		dsp->ds_mode = DLD_FASTPATH;
		if (!dsp->ds_polling && !dsp->ds_soft_ring)
			dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp);
		dsp->ds_tx = str_mdata_fastpath_put;
	}
	rw_exit(&dsp->ds_lock);

	/* Replace any old payload with the generated header. */
	freemsg(nmp->b_cont);
	nmp->b_cont = hmp;

	miocack(q, mp, MBLKL(nmp) + MBLKL(hmp), 0);
	return;
failed:
	miocnak(q, mp, 0, err);
}

/*
 * Generic M_IOCTL/M_IOCDATA handler: forward to the MAC driver's ioctl
 * entry point.  Requires the stream to be attached.
 */
static void
ioc(dld_str_t *dsp, mblk_t *mp)
{
	queue_t	*q = dsp->ds_wq;
	mac_handle_t mh;

	if (dsp->ds_dlstate == DL_UNATTACHED) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}
	mh = dsp->ds_mh;
	ASSERT(mh != NULL);
	mac_ioctl(mh, q, mp);
}