/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2011 Joyent, Inc. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <sys/stack.h>
#include <sys/archsystm.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
    mac_tx_srs_mode_t mac_tx_mode;
    mac_tx_func_t mac_tx_func;
} mac_tx_mode_t;

/*
 * There are seven modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, or if the link is an aggr, etc.
 *
 * When the Tx SRS is operating in aggr mode (st_mode) or if there are
 * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
 * otherwise) will have a soft ring associated with it. These soft rings
 * are stored in srs_tx_soft_rings[] array.
 *
 * Additionally in the case of aggr, there is the st_soft_rings[] array
 * in the mac_srs_tx_t structure. This array is used to store the same
 * set of soft rings that are present in srs_tx_soft_rings[] array but
 * in a different manner. The soft ring associated with the pseudo Tx
 * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
 * array. This helps in quickly getting the soft ring associated with the
 * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
 * be used for transmit.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
    {SRS_TX_DEFAULT, mac_tx_single_ring_mode},
    {SRS_TX_SERIALIZE, mac_tx_serializer_mode},
    {SRS_TX_FANOUT, mac_tx_fanout_mode},
    {SRS_TX_BW, mac_tx_bw_mode},
    {SRS_TX_BW_FANOUT, mac_tx_bw_mode},
    {SRS_TX_AGGR, mac_tx_aggr_mode},
    {SRS_TX_BW_AGGR, mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The run time code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout etc.
 *
 * We try to use H/W classification on the NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always a SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to the
 * S/W classifier and tie a SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remainder
 * of the tick. The SRS poll thread only polls for bytes that are
 * allowed to come into the SRS. We typically let 4x the configured
 * B/W worth of packets come into the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to the respective SRS.
 * The S/W classification rule is always setup and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRSs are used on both the Tx and Rx sides. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that the Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each Soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only Max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as the B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing plus we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or let the SRS worker thread do the processing. This applies if
 * the SRS was not already being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing. For throughput
 * under load, this should be false.
 *
 * This (and other similar) tunable should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;

/*
 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue an mp or chain in the soft ring set and increment the
 * local count (srs_count) for the SRS and the shared counter
 * (srs_poll_pkt_cnt - shared between the SRS and its soft rings
 * to track the total unprocessed packets for polling to work
 * correctly).
 *
 * The size (total bytes queued) counters are incremented only
 * if we are doing B/W control.
 */
#define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
    ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
    if ((mac_srs)->srs_last != NULL) \
        (mac_srs)->srs_last->b_next = (head); \
    else \
        (mac_srs)->srs_first = (head); \
    (mac_srs)->srs_last = (tail); \
    (mac_srs)->srs_count += count; \
}

#define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
    mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
 \
    MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
    srs_rx->sr_poll_pkt_cnt += count; \
    ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
    if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
        (mac_srs)->srs_size += (sz); \
        mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
        (mac_srs)->srs_bw->mac_bw_sz += (sz); \
        mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
    } \
}

#define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
    mac_srs->srs_state |= SRS_ENQUEUED; \
    MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
    if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
        (mac_srs)->srs_size += (sz); \
        (mac_srs)->srs_bw->mac_bw_sz += (sz); \
    } \
}

/*
 * Turn polling on routines
 */
#define MAC_SRS_POLLING_ON(mac_srs) { \
    ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
    if (((mac_srs)->srs_state & \
        (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
        (mac_srs)->srs_state |= SRS_POLLING; \
        (void) mac_hwring_disable_intr((mac_ring_handle_t) \
            (mac_srs)->srs_ring); \
        (mac_srs)->srs_rx.sr_poll_on++; \
    } \
}

#define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
    ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
    if (((mac_srs)->srs_state & \
        (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
        (SRS_POLLING_CAPAB|SRS_WORKER)) { \
        (mac_srs)->srs_state |= SRS_POLLING; \
        (void) mac_hwring_disable_intr((mac_ring_handle_t) \
            (mac_srs)->srs_ring); \
        (mac_srs)->srs_rx.sr_worker_poll_on++; \
    } \
}

/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * Poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
 */
#define MAC_SRS_POLL_RING(mac_srs) { \
    mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \
 \
    ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
    srs_rx->sr_poll_thr_sig++; \
    if (((mac_srs)->srs_state & \
        (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
        (SRS_WORKER|SRS_POLLING_CAPAB)) { \
        (mac_srs)->srs_state |= SRS_GET_PKTS; \
        cv_signal(&(mac_srs)->srs_cv); \
    } else { \
        srs_rx->sr_poll_thr_busy++; \
    } \
}

/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if the next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come into the
 * system.
 */
#define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
    ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
    ASSERT(((mac_srs)->srs_type & SRST_TX) || \
        MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
    clock_t now = ddi_get_lbolt(); \
    if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \
        (mac_srs)->srs_bw->mac_bw_curr_time = now; \
        (mac_srs)->srs_bw->mac_bw_used = 0; \
        if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
            (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
    } \
}
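/*
 * Illustrative example (not part of the original source, numbers are
 * assumptions): with a bandwidth limit of 100 Mbps and hz = 100, the
 * per-tick quota works out to 100000000 / 8 / 100 = 125000 bytes. Once
 * mac_bw_used crosses that quota within a tick, SRS_BW_ENFORCED is set
 * and stays set until MAC_SRS_CHECK_BW_CONTROL above observes a new
 * lbolt value, at which point the counters reset and packets flow again.
 * The drop threshold (mac_bw_drop_threshold) is kept several times
 * larger than the quota (the block comment at the top of this file
 * mentions roughly 4x) so that short bursts are queued rather than
 * dropped.
 */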
/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing the queue. If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before the worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define MAC_SRS_WORKER_WAKEUP(mac_srs) { \
    ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
    if (!((mac_srs)->srs_state & SRS_PROC) && \
        (mac_srs)->srs_tid == NULL) { \
        if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \
            (mac_srs_worker_wakeup_ticks == 0)) \
            cv_signal(&(mac_srs)->srs_async); \
        else \
            (mac_srs)->srs_tid = \
                timeout(mac_srs_fire, (mac_srs), \
                    mac_srs_worker_wakeup_ticks); \
    } \
}

#define TX_BANDWIDTH_MODE(mac_srs) \
    ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT || \
    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)

#define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
    if (tx_mode == SRS_TX_BW_FANOUT) \
        (void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL); \
    else \
        (void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL); \
}

/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again and not setting SRS_TX_BLOCKED does
 * that.
 */
#define MAC_TX_SRS_BLOCK(srs, mp) { \
    ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
    if ((srs)->srs_tx.st_woken_up) { \
        (srs)->srs_tx.st_woken_up = B_FALSE; \
    } else { \
        ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
        (srs)->srs_state |= SRS_TX_BLOCKED; \
        (srs)->srs_tx.st_stat.mts_blockcnt++; \
    } \
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 */
#define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
    boolean_t enqueue = 1; \
 \
    if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
        /* \
         * flow-controlled. Store srs in cookie so that it \
         * can be returned as mac_tx_cookie_t to client \
         */ \
        (srs)->srs_state |= SRS_TX_HIWAT; \
        cookie = (mac_tx_cookie_t)srs; \
        (srs)->srs_tx.st_hiwat_cnt++; \
        if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
            /* increment freed stats */ \
            (srs)->srs_tx.st_stat.mts_sdrops += cnt; \
            /* \
             * b_prev may be set to the fanout hint \
             * hence can't use freemsg directly \
             */ \
            mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
            DTRACE_PROBE1(tx_queued_hiwat, \
                mac_soft_ring_set_t *, srs); \
            enqueue = 0; \
        } \
    } \
    if (enqueue) \
        MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
}
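/*
 * Illustrative sketch (not from this file; names and flags are
 * assumptions based on the MAC client API) of how a client consumes the
 * cookie set by the macros above: a non-zero mac_tx_cookie_t returned
 * by mac_tx() indicates that the chain was queued or dropped because of
 * flow control, and the client typically backs off on that flow until
 * its Tx-notify callback fires for the same cookie, e.g.:
 *
 *	cookie = mac_tx(mch, mp_chain, fanout_hint,
 *	    MAC_TX_NO_ENQUEUE, &ret_mp);
 *	if (cookie != 0)
 *		stop sending on this flow until notified
 */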
/* Some utility macros */
#define MAC_SRS_BW_LOCK(srs) \
    if (!(srs->srs_type & SRST_TX)) \
        mutex_enter(&srs->srs_bw->mac_bw_lock);

#define MAC_SRS_BW_UNLOCK(srs) \
    if (!(srs->srs_type & SRST_TX)) \
        mutex_exit(&srs->srs_bw->mac_bw_lock);

#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
    mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
    /* increment freed stats */ \
    mac_srs->srs_tx.st_stat.mts_sdrops++; \
    cookie = (mac_tx_cookie_t)srs; \
}

#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
    mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \
    cookie = (mac_tx_cookie_t)srs; \
    *ret_mp = mp_chain; \
}

/*
 * MAC_RX_SRS_TOODEEP
 *
 * Macro called as part of receive-side processing to determine if handling
 * can occur in situ (in the interrupt thread) or if it should be left to a
 * worker thread. Note that the constant used to make this determination is
 * not entirely made-up, and is a result of some empirical validation. That
 * said, the constant is left as a static variable to allow it to be
 * dynamically tuned in the field if and as needed.
 */
static uintptr_t mac_rx_srs_stack_needed = 10240;
static uint_t mac_rx_srs_stack_toodeep;

#ifndef STACK_GROWTH_DOWN
#error Downward stack growth assumed.
#endif

#define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
    (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
    ++mac_rx_srs_stack_toodeep)


/*
 * Drop the rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
    mac_srs_rx_t *srs_rx = &srs->srs_rx;

    ASSERT(mp->b_next == NULL);
    mutex_enter(&srs->srs_lock);
    MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
    MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
    mutex_exit(&srs->srs_lock);

    srs_rx->sr_stat.mrs_sdrops++;
    freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
    mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

    mutex_enter(&mac_srs->srs_lock);
    if (mac_srs->srs_tid == 0) {
        mutex_exit(&mac_srs->srs_lock);
        return;
    }

    mac_srs->srs_tid = 0;
    if (!(mac_srs->srs_state & SRS_PROC))
        cv_signal(&mac_srs->srs_async);

    mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is the fanout_hint (a uint64_t) given by the TCP/IP stack;
 * it is used on the TX path.
 */
#define HASH_HINT(hint) \
    ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
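/*
 * Illustrative sketch (not part of the original source): on the Tx
 * fanout path the hint is folded into a hash so that connections that
 * differ only in their upper bytes still spread across soft rings, and
 * the hash is then reduced modulo the ring count, roughly:
 *
 *	hash = HASH_HINT(fanout_hint);
 *	softring = srs->srs_tx_soft_rings[hash % tx_ring_count];
 *
 * (srs_tx_soft_rings[] is the array described at the top of this file;
 * the exact index computation used by mac_tx_fanout_mode() may differ.)
 */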
/*
 * Hash based on the src address and the port information.
 */
#define HASH_ADDR(src, ports) \
    (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
    ((ports) >> 8) ^ (ports))

#define COMPUTE_INDEX(key, sz) (key % sz)

#define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
    if ((tail) != NULL) { \
        ASSERT((tail)->b_next == NULL); \
        (tail)->b_next = (mp); \
    } else { \
        ASSERT((head) == NULL); \
        (head) = (mp); \
    } \
    (tail) = (mp); \
    (cnt)++; \
    if ((bw_ctl)) \
        (sz) += (sz0); \
}

#define MAC_FANOUT_DEFAULT 0
#define MAC_FANOUT_RND_ROBIN 1
int mac_fanout_type = MAC_FANOUT_DEFAULT;

#define MAX_SR_TYPES 3
/* fanout types for port based hashing */
enum pkt_type {
    V4_TCP = 0,
    V4_UDP,
    OTH,
    UNDEF
};

/*
 * In general we do port based hashing to spread traffic over different
 * softrings. The tunables below allow that behavior to be overridden.
 * Setting one of them (depending on IPv6 or IPv4) to B_TRUE allows a
 * fanout based on the src IPv6 or IPv4 address. This behavior is also
 * applicable to IPv6 packets carrying multiple optional headers and
 * other uncommon packet types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;
boolean_t mac_src_ipv4_fanout = B_FALSE;

/*
 * Pair of local and remote ports in the transport header
 */
#define PORTS_SIZE 4

/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined for an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined for the TCP, UDP or OTH soft ring. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain; otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up, etc.
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
    struct ether_header *ehp;
    struct ether_vlan_header *evhp;
    uint32_t sap;
    ipha_t *ipha;
    uint8_t *dstaddr;
    size_t hdrsize;
    mblk_t *mp;
    mblk_t *headmp[MAX_SR_TYPES];
    mblk_t *tailmp[MAX_SR_TYPES];
    int cnt[MAX_SR_TYPES];
    size_t sz[MAX_SR_TYPES];
    size_t sz1;
    boolean_t bw_ctl;
    boolean_t hw_classified;
    boolean_t dls_bypass;
    boolean_t is_ether;
    boolean_t is_unicast;
    enum pkt_type type;
    mac_client_impl_t *mcip = mac_srs->srs_mcip;

    is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
    bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

    /*
     * If we don't have a Rx ring, S/W classification would have done
     * its job and it's a packet meant for us. If we were polling on
     * the default ring (i.e. there was a ring assigned to this SRS),
     * then we need to make sure that the mac address really belongs
     * to us.
     */
    hw_classified = mac_srs->srs_ring != NULL &&
        mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

    /*
     * Special clients (e.g. VLAN, non-ether, etc) need DLS
     * processing in the Rx path. SRST_DLS_BYPASS will be clear for
     * such SRSs. Another way of disabling bypass is to set the
     * MCIS_RX_BYPASS_DISABLE flag.
     */
    dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
        ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

    bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
    bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
    bzero(cnt, MAX_SR_TYPES * sizeof (int));
    bzero(sz, MAX_SR_TYPES * sizeof (size_t));

    /*
     * We got a chain from SRS that we need to send to the soft rings.
     * Since squeues for TCP & IPv4 sap poll their soft rings (for
     * performance reasons), we need to separate out v4_tcp, v4_udp
     * and the rest goes in other.
     */
    while (head != NULL) {
        mp = head;
        head = head->b_next;
        mp->b_next = NULL;

        type = OTH;
        sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

        if (is_ether) {
            /*
             * At this point we can be sure the packet at least
             * has an ether header.
             */
            if (sz1 < sizeof (struct ether_header)) {
                mac_rx_drop_pkt(mac_srs, mp);
                continue;
            }
            ehp = (struct ether_header *)mp->b_rptr;

            /*
             * Determine if this is a VLAN or non-VLAN packet.
             */
            if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
                evhp = (struct ether_vlan_header *)mp->b_rptr;
                sap = ntohs(evhp->ether_type);
                hdrsize = sizeof (struct ether_vlan_header);
                /*
                 * Check if the VID of the packet, if any,
                 * belongs to this client.
                 */
                if (!mac_client_check_flow_vid(mcip,
                    VLAN_ID(ntohs(evhp->ether_tci)))) {
                    mac_rx_drop_pkt(mac_srs, mp);
                    continue;
                }
            } else {
                hdrsize = sizeof (struct ether_header);
            }
            is_unicast =
                ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
            dstaddr = (uint8_t *)&ehp->ether_dhost;
        } else {
            mac_header_info_t mhi;

            if (mac_header_info((mac_handle_t)mcip->mci_mip,
                mp, &mhi) != 0) {
                mac_rx_drop_pkt(mac_srs, mp);
                continue;
            }
            hdrsize = mhi.mhi_hdrsize;
            sap = mhi.mhi_bindsap;
            is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
            dstaddr = (uint8_t *)mhi.mhi_daddr;
        }

        if (!dls_bypass) {
            FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
                cnt[type], bw_ctl, sz[type], sz1, mp);
            continue;
        }

        if (sap == ETHERTYPE_IP) {
            /*
             * If we are H/W classified, but we have promisc
             * on, then we need to check for the unicast address.
             */
            if (hw_classified && mcip->mci_promisc_list != NULL) {
                mac_address_t *map;

                rw_enter(&mcip->mci_rw_lock, RW_READER);
                map = mcip->mci_unicast;
                if (bcmp(dstaddr, map->ma_addr,
                    map->ma_len) == 0)
                    type = UNDEF;
                rw_exit(&mcip->mci_rw_lock);
            } else if (is_unicast) {
                type = UNDEF;
            }
        }

        /*
         * This needs to become a contract with the driver for
         * the fast path.
         *
         * In the normal case the packet will have at least the L2
         * header and the IP + Transport header in the same mblk.
         * This is usually the case when the NIC driver sends up
         * the packet. This is also true when the stack generates
         * a packet that is looped back and when the stack uses the
         * fastpath mechanism. The normal case is optimized for
         * performance and may bypass DLS. All other cases go through
         * the 'OTH' type path without DLS bypass.
         */

        ipha = (ipha_t *)(mp->b_rptr + hdrsize);
        if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
            type = OTH;

        if (type == OTH) {
            FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
                cnt[type], bw_ctl, sz[type], sz1, mp);
            continue;
        }

        ASSERT(type == UNDEF);
        /*
         * We look for at least 4 bytes past the IP header to get
         * the port information. If we get an IP fragment, we don't
         * have the port information, and we use just the protocol
         * information.
         */
        switch (ipha->ipha_protocol) {
        case IPPROTO_TCP:
            type = V4_TCP;
            mp->b_rptr += hdrsize;
            break;
        case IPPROTO_UDP:
            type = V4_UDP;
            mp->b_rptr += hdrsize;
            break;
        default:
            type = OTH;
            break;
        }

        FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
            bw_ctl, sz[type], sz1, mp);
    }

    for (type = V4_TCP; type < UNDEF; type++) {
        if (headmp[type] != NULL) {
            mac_soft_ring_t *softring;

            ASSERT(tailmp[type]->b_next == NULL);
            switch (type) {
            case V4_TCP:
                softring = mac_srs->srs_tcp_soft_rings[0];
                break;
            case V4_UDP:
                softring = mac_srs->srs_udp_soft_rings[0];
                break;
            case OTH:
                softring = mac_srs->srs_oth_soft_rings[0];
            }
            mac_rx_soft_ring_process(mcip, softring,
                headmp[type], tailmp[type], cnt[type], sz[type]);
        }
    }
}

int fanout_unaligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for VLANs, and for anything else that isn't performing
 * explicit dls bypass. Returns -1 on an error (drop the packet due to a
 * malformed packet), 0 on success, with values written in *indx and *type.
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
    uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
    ip6_t *ip6h;
    ipha_t *ipha;
    uint8_t *whereptr;
    uint_t hash;
    uint16_t remlen;
    uint8_t nexthdr;
    uint16_t hdr_len;
    uint32_t src_val;
    boolean_t modifiable = B_TRUE;
    boolean_t v6;

    ASSERT(MBLKL(mp) >= hdrsize);

    if (sap == ETHERTYPE_IPV6) {
        v6 = B_TRUE;
        hdr_len = IPV6_HDR_LEN;
    } else if (sap == ETHERTYPE_IP) {
        v6 = B_FALSE;
        hdr_len = IP_SIMPLE_HDR_LENGTH;
    } else {
        *indx = 0;
        *type = OTH;
        return (0);
    }

    ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
    ipha = (ipha_t *)ip6h;

    if ((uint8_t *)ip6h == mp->b_wptr) {
        /*
         * The first mblk_t only includes the mac header.
         * Note that it is safe to change the mp pointer here,
         * as the subsequent operation does not assume mp
         * points to the start of the mac header.
         */
        mp = mp->b_cont;

        /*
         * Make sure we have the entire IP header.
         */
        if (mp == NULL)
            return (-1);

        if (MBLKL(mp) < hdr_len) {
            modifiable = (DB_REF(mp) == 1);

            if (modifiable && !pullupmsg(mp, hdr_len))
                return (-1);
        }

        ip6h = (ip6_t *)mp->b_rptr;
        ipha = (ipha_t *)ip6h;
    }

    if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
        ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
        /*
         * If either the IP header is not aligned, or it does not hold
         * the complete simple structure (a pullupmsg() is not an
         * option since it would result in an unaligned IP header),
         * fanout to the default ring.
         *
         * Note that this may cause packet reordering.
         */
        *indx = 0;
        *type = OTH;
        fanout_unaligned++;
        return (0);
    }

    /*
     * Extract next-header, full header length, and source-hash value
     * using v4/v6 specific fields.
     */
    if (v6) {
        remlen = ntohs(ip6h->ip6_plen);
        nexthdr = ip6h->ip6_nxt;
        src_val = V4_PART_OF_V6(ip6h->ip6_src);
        /*
         * Do src based fanout if the tunable below is set to B_TRUE
         * or when mac_ip_hdr_length_v6() fails because of malformed
         * packets or because mblks need to be concatenated using
         * pullupmsg().
         */
        if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
            mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
            goto src_based_fanout;
        }
    } else {
        hdr_len = IPH_HDR_LENGTH(ipha);
        remlen = ntohs(ipha->ipha_length) - hdr_len;
        nexthdr = ipha->ipha_protocol;
        src_val = (uint32_t)ipha->ipha_src;
        /*
         * Catch the IPv4 fragment case here. IPv6 has nexthdr == FRAG
         * for its equivalent case.
         */
        if (mac_src_ipv4_fanout ||
            (ntohs(ipha->ipha_fragment_offset_and_flags) &
            (IPH_MF | IPH_OFFSET)) != 0) {
            goto src_based_fanout;
        }
    }
    if (remlen < MIN_EHDR_LEN)
        return (-1);
    whereptr = (uint8_t *)ip6h + hdr_len;

    /* If the transport is one of the below, we do port/SPI based fanout */
    switch (nexthdr) {
    case IPPROTO_TCP:
    case IPPROTO_UDP:
    case IPPROTO_SCTP:
    case IPPROTO_ESP:
        /*
         * If the ports or SPI in the transport header are not part of
         * the mblk, do src_based_fanout, instead of calling
         * pullupmsg().
         */
        if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
            break;	/* out of switch... */
        /* FALLTHRU */
    default:
        goto src_based_fanout;
    }

    switch (nexthdr) {
    case IPPROTO_TCP:
        hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
        *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
        *type = OTH;
        break;
    case IPPROTO_UDP:
    case IPPROTO_SCTP:
    case IPPROTO_ESP:
        if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
            hash = HASH_ADDR(src_val, *(uint32_t *)whereptr);
            *indx = COMPUTE_INDEX(hash,
                mac_srs->srs_udp_ring_count);
        } else {
            *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
            mac_srs->srs_ind++;
        }
        *type = OTH;
        break;
    }
    return (0);

src_based_fanout:
    hash = HASH_ADDR(src_val, (uint32_t)0);
    *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
    *type = OTH;
    return (0);
}
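/*
 * Illustrative walk-through (not part of the original source): for an
 * unfragmented IPv4 TCP segment whose transport header sits in the same
 * mblk, nexthdr is IPPROTO_TCP, whereptr points at the TCP source/dest
 * ports, and the packet is assigned index
 * COMPUTE_INDEX(HASH_ADDR(src, ports), srs_tcp_ring_count). A fragment,
 * or a packet whose ports lie in a following mblk, takes the
 * src_based_fanout path instead and is hashed on the source address
 * alone into one of the srs_oth_ring_count rings.
 */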
/*
 * mac_rx_srs_fanout
 *
 * This routine delivers packets destined for an SRS into a soft ring member
 * of the set.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain; otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up, etc.
 *
 * Note:
 * Since we know the maximum fanout possible, we create a 2D array
 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 * variables so that we can enter the softrings with a chain. We need the
 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 * for each packet would be expensive). If we ever want to have the
 * ability to have unlimited fanout, we should probably declare a head,
 * tail, cnt, sz with each soft ring (a data struct which contains a softring
 * along with these members) and create an array of this uber struct so we
 * don't have to do kmem_alloc.
 */
int fanout_oth1 = 0;
int fanout_oth2 = 0;
int fanout_oth3 = 0;
int fanout_oth4 = 0;
int fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
    struct ether_header *ehp;
    struct ether_vlan_header *evhp;
    uint32_t sap;
    ipha_t *ipha;
    uint8_t *dstaddr;
    uint_t indx;
    size_t ports_offset;
    size_t ipha_len;
    size_t hdrsize;
    uint_t hash;
    mblk_t *mp;
    mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
    mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
    int cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
    size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT];
    size_t sz1;
    boolean_t bw_ctl;
    boolean_t hw_classified;
    boolean_t dls_bypass;
    boolean_t is_ether;
    boolean_t is_unicast;
    int fanout_cnt;
    enum pkt_type type;
    mac_client_impl_t *mcip = mac_srs->srs_mcip;

    is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
    bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

    /*
     * If we don't have a Rx ring, S/W classification would have done
     * its job and it's a packet meant for us. If we were polling on
     * the default ring (i.e. there was a ring assigned to this SRS),
     * then we need to make sure that the mac address really belongs
     * to us.
     */
    hw_classified = mac_srs->srs_ring != NULL &&
        mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

    /*
     * Special clients (e.g. VLAN, non-ether, etc) need DLS
     * processing in the Rx path. SRST_DLS_BYPASS will be clear for
     * such SRSs. Another way of disabling bypass is to set the
     * MCIS_RX_BYPASS_DISABLE flag.
     */
    dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
        ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

    /*
     * Since the softrings are never destroyed and we always
     * create an equal number of softrings for TCP, UDP and the rest,
     * it's OK to check one of them for count and use it without
     * any lock. In future, if soft rings get destroyed because
     * of reduction in fanout, we will need to ensure that happens
     * behind the SRS_PROC.
     */
    fanout_cnt = mac_srs->srs_tcp_ring_count;

    bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
    bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
    bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
    bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));

    /*
     * We got a chain from SRS that we need to send to the soft rings.
     * Since squeues for TCP & IPv4 sap poll their soft rings (for
     * performance reasons), we need to separate out v4_tcp, v4_udp
     * and the rest goes in other.
     */
    while (head != NULL) {
        mp = head;
        head = head->b_next;
        mp->b_next = NULL;

        type = OTH;
        sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

        if (is_ether) {
            /*
             * At this point we can be sure the packet at least
             * has an ether header.
             */
            if (sz1 < sizeof (struct ether_header)) {
                mac_rx_drop_pkt(mac_srs, mp);
                continue;
            }
            ehp = (struct ether_header *)mp->b_rptr;

            /*
             * Determine if this is a VLAN or non-VLAN packet.
             */
            if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
                evhp = (struct ether_vlan_header *)mp->b_rptr;
                sap = ntohs(evhp->ether_type);
                hdrsize = sizeof (struct ether_vlan_header);
                /*
                 * Check if the VID of the packet, if any,
                 * belongs to this client.
                 */
                if (!mac_client_check_flow_vid(mcip,
                    VLAN_ID(ntohs(evhp->ether_tci)))) {
                    mac_rx_drop_pkt(mac_srs, mp);
                    continue;
                }
            } else {
                hdrsize = sizeof (struct ether_header);
            }
            is_unicast =
                ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
            dstaddr = (uint8_t *)&ehp->ether_dhost;
        } else {
            mac_header_info_t mhi;

            if (mac_header_info((mac_handle_t)mcip->mci_mip,
                mp, &mhi) != 0) {
                mac_rx_drop_pkt(mac_srs, mp);
                continue;
            }
            hdrsize = mhi.mhi_hdrsize;
            sap = mhi.mhi_bindsap;
            is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
            dstaddr = (uint8_t *)mhi.mhi_daddr;
        }

        if (!dls_bypass) {
            if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
                hdrsize, &type, &indx) == -1) {
                mac_rx_drop_pkt(mac_srs, mp);
                continue;
            }

            FANOUT_ENQUEUE_MP(headmp[type][indx],
                tailmp[type][indx], cnt[type][indx], bw_ctl,
                sz[type][indx], sz1, mp);
            continue;
        }


        /*
         * If we are using the default Rx ring where H/W or S/W
         * classification has not happened, we need to verify if
         * this unicast packet really belongs to us.
         */
        if (sap == ETHERTYPE_IP) {
            /*
             * If we are H/W classified, but we have promisc
             * on, then we need to check for the unicast address.
             */
            if (hw_classified && mcip->mci_promisc_list != NULL) {
                mac_address_t *map;

                rw_enter(&mcip->mci_rw_lock, RW_READER);
                map = mcip->mci_unicast;
                if (bcmp(dstaddr, map->ma_addr,
                    map->ma_len) == 0)
                    type = UNDEF;
                rw_exit(&mcip->mci_rw_lock);
            } else if (is_unicast) {
                type = UNDEF;
            }
        }

        /*
         * This needs to become a contract with the driver for
         * the fast path.
         */

        ipha = (ipha_t *)(mp->b_rptr + hdrsize);
        if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
            type = OTH;
            fanout_oth1++;
        }

        if (type != OTH) {
            uint16_t frag_offset_flags;

            switch (ipha->ipha_protocol) {
            case IPPROTO_TCP:
            case IPPROTO_UDP:
            case IPPROTO_SCTP:
            case IPPROTO_ESP:
                ipha_len = IPH_HDR_LENGTH(ipha);
                if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
                    mp->b_wptr) {
                    type = OTH;
                    break;
                }
                frag_offset_flags =
                    ntohs(ipha->ipha_fragment_offset_and_flags);
                if ((frag_offset_flags &
                    (IPH_MF | IPH_OFFSET)) != 0) {
                    type = OTH;
                    fanout_oth3++;
                    break;
                }
                ports_offset = hdrsize + ipha_len;
                break;
            default:
                type = OTH;
                fanout_oth4++;
                break;
            }
        }

        if (type == OTH) {
            if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
                hdrsize, &type, &indx) == -1) {
                mac_rx_drop_pkt(mac_srs, mp);
                continue;
            }

            FANOUT_ENQUEUE_MP(headmp[type][indx],
                tailmp[type][indx], cnt[type][indx], bw_ctl,
                sz[type][indx], sz1, mp);
            continue;
        }

        ASSERT(type == UNDEF);

        /*
         * XXX-Sunay: We should hold srs_lock since ring_count
         * below can change. But if we are always called from
         * mac_rx_srs_drain and SRS_PROC is set, then we can
         * enforce that ring_count can't be changed i.e.
         * to change fanout type or ring count, the calling
         * thread needs to be behind SRS_PROC.
         */
        switch (ipha->ipha_protocol) {
        case IPPROTO_TCP:
            /*
             * Note that for ESP, we fanout on SPI and it is at the
             * same offset as the 2x16-bit ports. So it is clumped
             * along with TCP, UDP and SCTP.
             */
            hash = HASH_ADDR(ipha->ipha_src,
                *(uint32_t *)(mp->b_rptr + ports_offset));
            indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
            type = V4_TCP;
            mp->b_rptr += hdrsize;
            break;
        case IPPROTO_UDP:
        case IPPROTO_SCTP:
        case IPPROTO_ESP:
            if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
                hash = HASH_ADDR(ipha->ipha_src,
                    *(uint32_t *)(mp->b_rptr + ports_offset));
                indx = COMPUTE_INDEX(hash,
                    mac_srs->srs_udp_ring_count);
            } else {
                indx = mac_srs->srs_ind %
                    mac_srs->srs_udp_ring_count;
                mac_srs->srs_ind++;
            }
            type = V4_UDP;
            mp->b_rptr += hdrsize;
            break;
        default:
            indx = 0;
            type = OTH;
        }

        FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
            cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
    }

    for (type = V4_TCP; type < UNDEF; type++) {
        int i;

        for (i = 0; i < fanout_cnt; i++) {
            if (headmp[type][i] != NULL) {
                mac_soft_ring_t *softring;

                ASSERT(tailmp[type][i]->b_next == NULL);
                switch (type) {
                case V4_TCP:
                    softring =
                        mac_srs->srs_tcp_soft_rings[i];
                    break;
                case V4_UDP:
                    softring =
                        mac_srs->srs_udp_soft_rings[i];
                    break;
                case OTH:
                    softring =
                        mac_srs->srs_oth_soft_rings[i];
                    break;
                }
                mac_rx_soft_ring_process(mcip,
                    softring, headmp[type][i], tailmp[type][i],
                    cnt[type][i], sz[type][i]);
            }
        }
    }
}

#define SRS_BYTES_TO_PICKUP 150000
ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;

/*
 * mac_rx_srs_poll_ring
 *
 * This SRS Poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker thread
 * to do the remaining processing.
 *
 * Since packets come into the system via interrupt or poll path, we also
 * update the stats and deal with promiscuous clients here.
 */
void
mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
{
    kmutex_t *lock = &mac_srs->srs_lock;
    kcondvar_t *async = &mac_srs->srs_cv;
    mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
    mblk_t *head, *tail, *mp;
    callb_cpr_t cprinfo;
    ssize_t bytes_to_pickup;
    size_t sz;
    int count;
    mac_client_impl_t *smcip;

    CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
    mutex_enter(lock);

start:
    for (;;) {
        if (mac_srs->srs_state & SRS_PAUSE)
            goto done;

        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        cv_wait(async, lock);
        CALLB_CPR_SAFE_END(&cprinfo, lock);

        if (mac_srs->srs_state & SRS_PAUSE)
            goto done;

check_again:
        if (mac_srs->srs_type & SRST_BW_CONTROL) {
            /*
             * We pick as many bytes as we are allowed to queue.
             * It's possible that we will exceed the total
             * packets queued in case this SRS is part of the
             * Rx ring group since > 1 poll thread can be pulling
             * up to the max allowed packets at the same time
             * but that should be OK.
             */
            mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
            bytes_to_pickup =
                mac_srs->srs_bw->mac_bw_drop_threshold -
                mac_srs->srs_bw->mac_bw_sz;
            /*
             * We shouldn't have been signalled if we
             * have 0 or less bytes to pick but since
             * some of the bytes accounting is driver
             * dependent, we do the safety check.
             */
            if (bytes_to_pickup < 0)
                bytes_to_pickup = 0;
            mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
        } else {
            /*
             * ToDO: Need to change the polling API
             * to add a packet count and a flag which
             * tells the driver whether we want packets
             * based on a count, or bytes, or all the
             * packets queued in the driver/HW. This
             * way, we never have to check the limits
             * on the poll path. We truly let only as many
             * packets enter the system as we are willing
             * to process or queue.
             *
             * Something along the lines of
             * pkts_to_pickup = mac_soft_ring_max_q_cnt -
             *	mac_srs->srs_poll_pkt_cnt
             */

            /*
             * Since we are not doing B/W control, pick
             * as many packets as allowed.
             */
            bytes_to_pickup = max_bytes_to_pickup;
        }

        /* Poll the underlying Hardware */
        mutex_exit(lock);
        head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
        mutex_enter(lock);

        ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
            SRS_POLL_THR_OWNER);

        mp = tail = head;
        count = 0;
        sz = 0;
        while (mp != NULL) {
            tail = mp;
            sz += msgdsize(mp);
            mp = mp->b_next;
            count++;
        }

        if (head != NULL) {
            tail->b_next = NULL;
            smcip = mac_srs->srs_mcip;

            SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
            SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);

            /*
             * If there are any promiscuous mode callbacks
             * defined for this MAC client, pass them a copy
             * if appropriate and also update the counters.
             */
            if (smcip != NULL) {
                if (smcip->mci_mip->mi_promisc_list != NULL) {
                    mutex_exit(lock);
                    mac_promisc_dispatch(smcip->mci_mip,
                        head, NULL);
                    mutex_enter(lock);
                }
            }
            if (mac_srs->srs_type & SRST_BW_CONTROL) {
                mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
                mac_srs->srs_bw->mac_bw_polled += sz;
                mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
            }
            MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
                count, sz);
            if (count <= 10)
                srs_rx->sr_stat.mrs_chaincntundr10++;
            else if (count > 10 && count <= 50)
                srs_rx->sr_stat.mrs_chaincnt10to50++;
            else
                srs_rx->sr_stat.mrs_chaincntover50++;
        }

        /*
         * We are guaranteed that SRS_PROC will be set if we
         * are here. Also, the poll thread gets to run only if
         * the drain was being done by a worker thread, although
         * it's possible that the worker thread is still running
         * and the poll thread was sent down to keep the pipeline
         * going instead of doing a complete drain and then
         * trying to poll the NIC.
         *
         * So we need to check the SRS_WORKER flag to make sure
         * that the worker thread is not processing the queue
         * in parallel to us. The flags and conditions are
         * protected by the srs_lock to prevent any race. We
         * ensure that we don't drop the srs_lock from now
         * till the end and similarly we don't drop the srs_lock
         * in mac_rx_srs_drain() till similar condition checks
         * are complete. The mac_rx_srs_drain() needs to ensure
         * that the SRS_WORKER flag remains set as long as it's
         * processing the queue.
         */
        if (!(mac_srs->srs_state & SRS_WORKER) &&
            (mac_srs->srs_first != NULL)) {
            /*
             * We have packets to process and the worker thread
             * is not running. Check to see if the poll thread is
             * allowed to process.
             */
            if (mac_srs->srs_state & SRS_LATENCY_OPT) {
                mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
                if (!(mac_srs->srs_state & SRS_PAUSE) &&
                    srs_rx->sr_poll_pkt_cnt <=
                    srs_rx->sr_lowat) {
                    srs_rx->sr_poll_again++;
                    goto check_again;
                }
                /*
                 * We are already above the low water mark
                 * so stay in the polling mode but no
                 * need to poll. Once we dip below
                 * the polling threshold, the processing
                 * thread (soft ring) will signal us
                 * to poll again (MAC_UPDATE_SRS_COUNT)
                 */
                srs_rx->sr_poll_drain_no_poll++;
                mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
                /*
                 * In the B/W control case, it's possible
                 * that the backlog built up due to the
                 * B/W limit being reached and packets
                 * are queued only in the SRS. In this case,
                 * we should schedule the worker thread
                 * since no one else will wake us up.
                 */
                if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
                    (mac_srs->srs_tid == NULL)) {
                    mac_srs->srs_tid =
                        timeout(mac_srs_fire, mac_srs, 1);
                    srs_rx->sr_poll_worker_wakeup++;
                }
            } else {
                /*
                 * Wake up the worker thread for more processing.
                 * We optimize for throughput in this case.
                 */
                mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
                MAC_SRS_WORKER_WAKEUP(mac_srs);
                srs_rx->sr_poll_sig_worker++;
            }
        } else if ((mac_srs->srs_first == NULL) &&
            !(mac_srs->srs_state & SRS_WORKER)) {
            /*
             * There is nothing queued in SRS and
             * no worker thread running. Plus we
             * didn't get anything from the H/W
             * as well (head == NULL);
             */
            ASSERT(head == NULL);
            mac_srs->srs_state &=
                ~(SRS_PROC|SRS_GET_PKTS);

            /*
             * If we have packets in the soft ring, don't allow
             * more packets to come into this SRS by keeping the
             * interrupts off but not polling the H/W. The
             * poll thread will get signaled as soon as
             * srs_poll_pkt_cnt dips below the poll threshold.
             */
            if (srs_rx->sr_poll_pkt_cnt == 0) {
                srs_rx->sr_poll_intr_enable++;
                MAC_SRS_POLLING_OFF(mac_srs);
            } else {
                /*
                 * We know nothing is queued in the SRS
                 * since we are here after checking
                 * srs_first is NULL. The backlog
                 * is entirely due to packets queued
                 * in the soft ring which will wake us up
                 * and get the interface out of polling
                 * mode once the backlog dips below
                 * sr_poll_thres.
                 */
                srs_rx->sr_poll_no_poll++;
            }
        } else {
            /*
             * Worker thread is already running.
             * Nothing much to do. If the polling
             * was enabled, the worker thread will deal
             * with that.
             */
            mac_srs->srs_state &= ~SRS_GET_PKTS;
            srs_rx->sr_poll_goto_sleep++;
        }
    }
done:
    mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
    cv_signal(&mac_srs->srs_async);
    /*
     * If this is a temporary quiesce then wait for the restart signal
     * from the srs worker. Then clear the flags and signal the srs worker
     * to ensure a positive handshake and go back to start.
     */
    while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
        cv_wait(async, lock);
    if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
        ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
        mac_srs->srs_state &=
            ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
        cv_signal(&mac_srs->srs_async);
        goto start;
    } else {
        mac_srs->srs_state |= SRS_POLL_THR_EXITED;
        cv_signal(&mac_srs->srs_async);
        CALLB_CPR_EXIT(&cprinfo);
        thread_exit();
    }
}

/*
 * mac_srs_pick_chain
 *
 * In the bandwidth control case, check how many packets can be processed
 * and return them in a sub chain.
 */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
    mblk_t *head = NULL;
    mblk_t *tail = NULL;
    size_t sz;
    size_t tsz = 0;
    int cnt = 0;
    mblk_t *mp;

    ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
    mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
    if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
        mac_srs->srs_bw->mac_bw_limit) ||
        (mac_srs->srs_bw->mac_bw_limit == 0)) {
        mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
        head = mac_srs->srs_first;
        mac_srs->srs_first = NULL;
        *chain_tail = mac_srs->srs_last;
        mac_srs->srs_last = NULL;
        *chain_sz = mac_srs->srs_size;
        *chain_cnt = mac_srs->srs_count;
        mac_srs->srs_count = 0;
        mac_srs->srs_size = 0;
        return (head);
    }

    /*
     * Can't clear the entire backlog.
     * Need to find how many packets to pick.
     */
    ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
    while ((mp = mac_srs->srs_first) != NULL) {
        sz = msgdsize(mp);
        if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
            mac_srs->srs_bw->mac_bw_limit) {
            if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
                mac_srs->srs_bw->mac_bw_state |=
                    SRS_BW_ENFORCED;
            break;
        }

        /*
         * The _size & cnt are decremented from the softrings
         * when they send up the packet for polling to work
         * properly.
         */
        tsz += sz;
        cnt++;
        mac_srs->srs_count--;
        mac_srs->srs_size -= sz;
        if (tail != NULL)
            tail->b_next = mp;
        else
            head = mp;
        tail = mp;
        mac_srs->srs_first = mac_srs->srs_first->b_next;
    }
    mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
    if (mac_srs->srs_first == NULL)
        mac_srs->srs_last = NULL;

    if (tail != NULL)
        tail->b_next = NULL;
    *chain_tail = tail;
    *chain_cnt = cnt;
    *chain_sz = tsz;

    return (head);
}

/*
 * mac_rx_srs_drain
 *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on the processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode,
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
    mblk_t *head;
    mblk_t *tail;
    timeout_id_t tid;
    int cnt = 0;
    mac_client_impl_t *mcip = mac_srs->srs_mcip;
    mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;

    ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
    ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

    /* If we are blanked i.e. can't do upcalls, then we are done */
    if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
        ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
            (mac_srs->srs_state & SRS_PAUSE));
        goto out;
    }

    if (mac_srs->srs_first == NULL)
        goto out;

    if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
        (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
        /*
         * In the normal case, the SRS worker thread does no
         * work and we wait for a backlog to build up before
         * we switch into polling mode. In case we are
         * optimizing for throughput, we use the worker thread
         * as well. The goal is to let the worker thread process
         * the queue and the poll thread feed packets into
         * the queue. As such, we should signal the poll
         * thread to try and get more packets.
         *
         * We could have pulled this check into the POLL_RING
         * macro itself but keeping it explicit here makes
         * the architecture easier to understand.
         */
        MAC_SRS_POLL_RING(mac_srs);
    }

again:
    head = mac_srs->srs_first;
    mac_srs->srs_first = NULL;
    tail = mac_srs->srs_last;
    mac_srs->srs_last = NULL;
    cnt = mac_srs->srs_count;
    mac_srs->srs_count = 0;

    ASSERT(head != NULL);
    ASSERT(tail != NULL);

    if ((tid = mac_srs->srs_tid) != 0)
        mac_srs->srs_tid = 0;

    mac_srs->srs_state |= (SRS_PROC|proc_type);


    /*
     * mcip is NULL for broadcast and multicast flows. The promisc
     * callbacks for broadcast and multicast packets are delivered from
     * mac_rx() and we don't need to worry about that case in this path
     */
    if (mcip != NULL) {
        if (mcip->mci_promisc_list != NULL) {
            mutex_exit(&mac_srs->srs_lock);
            mac_promisc_client_dispatch(mcip, head);
            mutex_enter(&mac_srs->srs_lock);
        }
        if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
            mutex_exit(&mac_srs->srs_lock);
            mac_protect_intercept_dhcp(mcip, head);
            mutex_enter(&mac_srs->srs_lock);
        }
    }

    /*
     * Check if SRS itself is doing the processing.
     * This direct path does not apply when subflows are present. In this
     * case, packets need to be dispatched to a soft ring according to the
     * flow's bandwidth and other resource constraints.
     */
    if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
        mac_direct_rx_t proc;
        void *arg1;
        mac_resource_handle_t arg2;

        /*
         * This is the case when an Rx ring is directly
         * assigned and we have a fully classified
         * protocol chain. We can deal with it in
         * one shot.
         */
        proc = srs_rx->sr_func;
        arg1 = srs_rx->sr_arg1;
        arg2 = srs_rx->sr_arg2;

        mac_srs->srs_state |= SRS_CLIENT_PROC;
        mutex_exit(&mac_srs->srs_lock);
        if (tid != 0) {
            (void) untimeout(tid);
            tid = 0;
        }

        proc(arg1, arg2, head, NULL);
        /*
         * Decrement the size and count here itself
         * since the packet has been processed.
         */
        mutex_enter(&mac_srs->srs_lock);
        MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
        if (mac_srs->srs_state & SRS_CLIENT_WAIT)
            cv_signal(&mac_srs->srs_client_cv);
        mac_srs->srs_state &= ~SRS_CLIENT_PROC;
    } else {
        /* Some kind of softrings based fanout is required */
        mutex_exit(&mac_srs->srs_lock);
        if (tid != 0) {
            (void) untimeout(tid);
            tid = 0;
        }

        /*
         * Since the fanout routines can deal with chains,
         * shoot the entire chain up.
         */
        if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
            mac_rx_srs_fanout(mac_srs, head);
        else
            mac_rx_srs_proto_fanout(mac_srs, head);
        mutex_enter(&mac_srs->srs_lock);
    }

    if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
        (mac_srs->srs_first != NULL)) {
        /*
         * More packets arrived while we were clearing the
         * SRS. This can be possible because of one of the
         * three conditions below:
         * 1) The driver is using multiple worker threads
         *    to send the packets to us.
         * 2) The driver has a race in switching
         *    between interrupt and polling mode or
         * 3) Packets are arriving in this SRS via the
         *    S/W classification as well.
         *
         * We should switch to polling mode and see if we
         * need to send the poll thread down. Also, signal
         * the worker thread to process what's just arrived.
1779 */ 1780 MAC_SRS_POLLING_ON(mac_srs); 1781 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { 1782 srs_rx->sr_drain_poll_sig++; 1783 MAC_SRS_POLL_RING(mac_srs); 1784 } 1785 1786 /* 1787 * If we didn't signal the poll thread, we need 1788 * to deal with the pending packets ourselves. 1789 */ 1790 if (proc_type == SRS_WORKER) { 1791 srs_rx->sr_drain_again++; 1792 goto again; 1793 } else { 1794 srs_rx->sr_drain_worker_sig++; 1795 cv_signal(&mac_srs->srs_async); 1796 } 1797 } 1798 1799 out: 1800 if (mac_srs->srs_state & SRS_GET_PKTS) { 1801 /* 1802 * Poll thread is already running. Leave the 1803 * SRS_RPOC set and hand over the control to 1804 * poll thread. 1805 */ 1806 mac_srs->srs_state &= ~proc_type; 1807 srs_rx->sr_drain_poll_running++; 1808 return; 1809 } 1810 1811 /* 1812 * Even if there are no packets queued in SRS, we 1813 * need to make sure that the shared counter is 1814 * clear and any associated softrings have cleared 1815 * all the backlog. Otherwise, leave the interface 1816 * in polling mode and the poll thread will get 1817 * signalled once the count goes down to zero. 1818 * 1819 * If someone is already draining the queue (SRS_PROC is 1820 * set) when the srs_poll_pkt_cnt goes down to zero, 1821 * then it means that drain is already running and we 1822 * will turn off polling at that time if there is 1823 * no backlog. 1824 * 1825 * As long as there are packets queued either 1826 * in soft ring set or its soft rings, we will leave 1827 * the interface in polling mode (even if the drain 1828 * was done being the interrupt thread). We signal 1829 * the poll thread as well if we have dipped below 1830 * low water mark. 1831 * 1832 * NOTE: We can't use the MAC_SRS_POLLING_ON macro 1833 * since that turn polling on only for worker thread. 1834 * Its not worth turning polling on for interrupt 1835 * thread (since NIC will not issue another interrupt) 1836 * unless a backlog builds up. 1837 */ 1838 if ((srs_rx->sr_poll_pkt_cnt > 0) && 1839 (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 1840 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1841 srs_rx->sr_drain_keep_polling++; 1842 MAC_SRS_POLLING_ON(mac_srs); 1843 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1844 MAC_SRS_POLL_RING(mac_srs); 1845 return; 1846 } 1847 1848 /* Nothing else to do. Get out of poll mode */ 1849 MAC_SRS_POLLING_OFF(mac_srs); 1850 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1851 srs_rx->sr_drain_finish_intr++; 1852 } 1853 1854 /* 1855 * mac_rx_srs_drain_bw 1856 * 1857 * The SRS BW drain routine. Gets to run to clear the queue. Any thread 1858 * (worker, interrupt, poll) can call this based on processing model. 1859 * The first thing we do is disable interrupts if possible and then 1860 * drain the queue. we also try to poll the underlying hardware if 1861 * there is a dedicated hardware Rx ring assigned to this SRS. 1862 * 1863 * There is a equivalent drain routine in non bandwidth control mode 1864 * mac_rx_srs_drain. There is some code duplication between the two 1865 * routines but they are highly performance sensitive and are easier 1866 * to read/debug if they stay separate. Any code changes here might 1867 * also apply to mac_rx_srs_drain as well. 
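 *
 * For a sense of scale of the per-tick accounting done below (the
 * figures are illustrative only; the conversion from the configured
 * bandwidth to a per-tick mac_bw_limit is done elsewhere, when the
 * limit is applied to the SRS): a 100 Mbit/s limit on a system with
 * hz == 1000 corresponds to a budget of roughly
 * 100000000 / 8 / 1000 = 12500 bytes per tick. Once mac_bw_used
 * crosses that budget within a tick, SRS_BW_ENFORCED gets set and
 * further processing waits for the next tick.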
1868 */ 1869 void 1870 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1871 { 1872 mblk_t *head; 1873 mblk_t *tail; 1874 timeout_id_t tid; 1875 size_t sz = 0; 1876 int cnt = 0; 1877 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1878 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1879 clock_t now; 1880 1881 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1882 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 1883 again: 1884 /* Check if we are doing B/W control */ 1885 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1886 now = ddi_get_lbolt(); 1887 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 1888 mac_srs->srs_bw->mac_bw_curr_time = now; 1889 mac_srs->srs_bw->mac_bw_used = 0; 1890 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 1891 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 1892 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 1893 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1894 goto done; 1895 } else if (mac_srs->srs_bw->mac_bw_used > 1896 mac_srs->srs_bw->mac_bw_limit) { 1897 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 1898 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1899 goto done; 1900 } 1901 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1902 1903 /* If we are blanked i.e. can't do upcalls, then we are done */ 1904 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1905 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1906 (mac_srs->srs_state & SRS_PAUSE)); 1907 goto done; 1908 } 1909 1910 sz = 0; 1911 cnt = 0; 1912 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 1913 /* 1914 * We couldn't pick up a single packet. 1915 */ 1916 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1917 if ((mac_srs->srs_bw->mac_bw_used == 0) && 1918 (mac_srs->srs_size != 0) && 1919 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1920 /* 1921 * Seems like configured B/W doesn't 1922 * even allow processing of 1 packet 1923 * per tick. 1924 * 1925 * XXX: raise the limit to processing 1926 * at least 1 packet per tick. 1927 */ 1928 mac_srs->srs_bw->mac_bw_limit += 1929 mac_srs->srs_bw->mac_bw_limit; 1930 mac_srs->srs_bw->mac_bw_drop_threshold += 1931 mac_srs->srs_bw->mac_bw_drop_threshold; 1932 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1933 "raised B/W limit to %d since not even a " 1934 "single packet can be processed per " 1935 "tick %d\n", (void *)mac_srs, 1936 (int)mac_srs->srs_bw->mac_bw_limit, 1937 (int)msgdsize(mac_srs->srs_first)); 1938 } 1939 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1940 goto done; 1941 } 1942 1943 ASSERT(head != NULL); 1944 ASSERT(tail != NULL); 1945 1946 /* zero bandwidth: drop all and return to interrupt mode */ 1947 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1948 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1949 srs_rx->sr_stat.mrs_sdrops += cnt; 1950 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1951 mac_srs->srs_bw->mac_bw_sz -= sz; 1952 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1953 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1954 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1955 goto leave_poll; 1956 } else { 1957 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1958 } 1959 1960 if ((tid = mac_srs->srs_tid) != 0) 1961 mac_srs->srs_tid = 0; 1962 1963 mac_srs->srs_state |= (SRS_PROC|proc_type); 1964 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1965 1966 /* 1967 * mcip is NULL for broadcast and multicast flows. 
The promisc 1968 * callbacks for broadcast and multicast packets are delivered from 1969 * mac_rx() and we don't need to worry about that case in this path 1970 */ 1971 if (mcip != NULL) { 1972 if (mcip->mci_promisc_list != NULL) { 1973 mutex_exit(&mac_srs->srs_lock); 1974 mac_promisc_client_dispatch(mcip, head); 1975 mutex_enter(&mac_srs->srs_lock); 1976 } 1977 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) { 1978 mutex_exit(&mac_srs->srs_lock); 1979 mac_protect_intercept_dhcp(mcip, head); 1980 mutex_enter(&mac_srs->srs_lock); 1981 } 1982 } 1983 1984 /* 1985 * Check if SRS itself is doing the processing 1986 * This direct path does not apply when subflows are present. In this 1987 * case, packets need to be dispatched to a soft ring according to the 1988 * flow's bandwidth and other resources contraints. 1989 */ 1990 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1991 mac_direct_rx_t proc; 1992 void *arg1; 1993 mac_resource_handle_t arg2; 1994 1995 /* 1996 * This is the case when a Rx is directly 1997 * assigned and we have a fully classified 1998 * protocol chain. We can deal with it in 1999 * one shot. 2000 */ 2001 proc = srs_rx->sr_func; 2002 arg1 = srs_rx->sr_arg1; 2003 arg2 = srs_rx->sr_arg2; 2004 2005 mac_srs->srs_state |= SRS_CLIENT_PROC; 2006 mutex_exit(&mac_srs->srs_lock); 2007 if (tid != 0) { 2008 (void) untimeout(tid); 2009 tid = 0; 2010 } 2011 2012 proc(arg1, arg2, head, NULL); 2013 /* 2014 * Decrement the size and count here itelf 2015 * since the packet has been processed. 2016 */ 2017 mutex_enter(&mac_srs->srs_lock); 2018 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 2019 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 2020 2021 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 2022 cv_signal(&mac_srs->srs_client_cv); 2023 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 2024 } else { 2025 /* Some kind of softrings based fanout is required */ 2026 mutex_exit(&mac_srs->srs_lock); 2027 if (tid != 0) { 2028 (void) untimeout(tid); 2029 tid = 0; 2030 } 2031 2032 /* 2033 * Since the fanout routines can deal with chains, 2034 * shoot the entire chain up. 2035 */ 2036 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 2037 mac_rx_srs_fanout(mac_srs, head); 2038 else 2039 mac_rx_srs_proto_fanout(mac_srs, head); 2040 mutex_enter(&mac_srs->srs_lock); 2041 } 2042 2043 /* 2044 * Send the poll thread to pick up any packets arrived 2045 * so far. This also serves as the last check in case 2046 * nothing else is queued in the SRS. The poll thread 2047 * is signalled only in the case the drain was done 2048 * by the worker thread and SRS_WORKER is set. The 2049 * worker thread can run in parallel as long as the 2050 * SRS_WORKER flag is set. We we have nothing else to 2051 * process, we can exit while leaving SRS_PROC set 2052 * which gives the poll thread control to process and 2053 * cleanup once it returns from the NIC. 2054 * 2055 * If we have nothing else to process, we need to 2056 * ensure that we keep holding the srs_lock till 2057 * all the checks below are done and control is 2058 * handed to the poll thread if it was running. 
2059 */ 2060 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2061 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2062 if (mac_srs->srs_first != NULL) { 2063 if (proc_type == SRS_WORKER) { 2064 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2065 if (srs_rx->sr_poll_pkt_cnt <= 2066 srs_rx->sr_lowat) 2067 MAC_SRS_POLL_RING(mac_srs); 2068 goto again; 2069 } else { 2070 cv_signal(&mac_srs->srs_async); 2071 } 2072 } 2073 } 2074 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2075 2076 done: 2077 2078 if (mac_srs->srs_state & SRS_GET_PKTS) { 2079 /* 2080 * Poll thread is already running. Leave the 2081 * SRS_RPOC set and hand over the control to 2082 * poll thread. 2083 */ 2084 mac_srs->srs_state &= ~proc_type; 2085 return; 2086 } 2087 2088 /* 2089 * If we can't process packets because we have exceeded 2090 * B/W limit for this tick, just set the timeout 2091 * and leave. 2092 * 2093 * Even if there are no packets queued in SRS, we 2094 * need to make sure that the shared counter is 2095 * clear and any associated softrings have cleared 2096 * all the backlog. Otherwise, leave the interface 2097 * in polling mode and the poll thread will get 2098 * signalled once the count goes down to zero. 2099 * 2100 * If someone is already draining the queue (SRS_PROC is 2101 * set) when the srs_poll_pkt_cnt goes down to zero, 2102 * then it means that drain is already running and we 2103 * will turn off polling at that time if there is 2104 * no backlog. As long as there are packets queued either 2105 * is soft ring set or its soft rings, we will leave 2106 * the interface in polling mode. 2107 */ 2108 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2109 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 2110 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 2111 (srs_rx->sr_poll_pkt_cnt > 0))) { 2112 MAC_SRS_POLLING_ON(mac_srs); 2113 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2114 if ((mac_srs->srs_first != NULL) && 2115 (mac_srs->srs_tid == NULL)) 2116 mac_srs->srs_tid = timeout(mac_srs_fire, 2117 mac_srs, 1); 2118 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2119 return; 2120 } 2121 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2122 2123 leave_poll: 2124 2125 /* Nothing else to do. Get out of poll mode */ 2126 MAC_SRS_POLLING_OFF(mac_srs); 2127 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2128 } 2129 2130 /* 2131 * mac_srs_worker 2132 * 2133 * The SRS worker routine. Drains the queue when no one else is 2134 * processing it. 2135 */ 2136 void 2137 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2138 { 2139 kmutex_t *lock = &mac_srs->srs_lock; 2140 kcondvar_t *async = &mac_srs->srs_async; 2141 callb_cpr_t cprinfo; 2142 boolean_t bw_ctl_flag; 2143 2144 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2145 mutex_enter(lock); 2146 2147 start: 2148 for (;;) { 2149 bw_ctl_flag = B_FALSE; 2150 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2151 MAC_SRS_BW_LOCK(mac_srs); 2152 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2153 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2154 bw_ctl_flag = B_TRUE; 2155 MAC_SRS_BW_UNLOCK(mac_srs); 2156 } 2157 /* 2158 * The SRS_BW_ENFORCED flag may change since we have dropped 2159 * the mac_bw_lock. However the drain function can handle both 2160 * a drainable SRS or a bandwidth controlled SRS, and the 2161 * effect of scheduling a timeout is to wakeup the worker 2162 * thread which in turn will call the drain function. Since 2163 * we release the srs_lock atomically only in the cv_wait there 2164 * isn't a fear of waiting for ever. 
2165 */ 2166 while (((mac_srs->srs_state & SRS_PROC) || 2167 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2168 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2169 !(mac_srs->srs_state & SRS_PAUSE)) { 2170 /* 2171 * If we have packets queued and we are here 2172 * because B/W control is in place, we better 2173 * schedule the worker wakeup after 1 tick 2174 * to see if bandwidth control can be relaxed. 2175 */ 2176 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2177 /* 2178 * We need to ensure that a timer is already 2179 * scheduled or we force schedule one for 2180 * later so that we can continue processing 2181 * after this quanta is over. 2182 */ 2183 mac_srs->srs_tid = timeout(mac_srs_fire, 2184 mac_srs, 1); 2185 } 2186 wait: 2187 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2188 cv_wait(async, lock); 2189 CALLB_CPR_SAFE_END(&cprinfo, lock); 2190 2191 if (mac_srs->srs_state & SRS_PAUSE) 2192 goto done; 2193 if (mac_srs->srs_state & SRS_PROC) 2194 goto wait; 2195 2196 if (mac_srs->srs_first != NULL && 2197 mac_srs->srs_type & SRST_BW_CONTROL) { 2198 MAC_SRS_BW_LOCK(mac_srs); 2199 if (mac_srs->srs_bw->mac_bw_state & 2200 SRS_BW_ENFORCED) { 2201 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2202 } 2203 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2204 SRS_BW_ENFORCED; 2205 MAC_SRS_BW_UNLOCK(mac_srs); 2206 } 2207 } 2208 2209 if (mac_srs->srs_state & SRS_PAUSE) 2210 goto done; 2211 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2212 } 2213 done: 2214 /* 2215 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2216 * from both hard and soft classifications and waits for such threads 2217 * to finish before signaling the worker. So at this point the only 2218 * thread left that could be competing with the worker is the poll 2219 * thread. In the case of Tx, there shouldn't be any thread holding 2220 * SRS_PROC at this point. 2221 */ 2222 if (!(mac_srs->srs_state & SRS_PROC)) { 2223 mac_srs->srs_state |= SRS_PROC; 2224 } else { 2225 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2226 /* 2227 * Poll thread still owns the SRS and is still running 2228 */ 2229 ASSERT((mac_srs->srs_poll_thr == NULL) || 2230 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2231 SRS_POLL_THR_OWNER)); 2232 } 2233 mac_srs_worker_quiesce(mac_srs); 2234 /* 2235 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2236 * of the quiesce operation 2237 */ 2238 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2239 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2240 2241 if (mac_srs->srs_state & SRS_RESTART) { 2242 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2243 mac_srs_worker_restart(mac_srs); 2244 mac_srs->srs_state &= ~SRS_PROC; 2245 goto start; 2246 } 2247 2248 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2249 mac_srs_worker_quiesce(mac_srs); 2250 2251 mac_srs->srs_state &= ~SRS_PROC; 2252 /* The macro drops the srs_lock */ 2253 CALLB_CPR_EXIT(&cprinfo); 2254 thread_exit(); 2255 } 2256 2257 /* 2258 * mac_rx_srs_subflow_process 2259 * 2260 * Receive side routine called from interrupt path when there are 2261 * sub flows present on this SRS. 
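 *
 * To illustrate how the chain gets split (example only): if the
 * inbound chain is A1->A2->B1->A3, where the letter denotes the
 * subflow each packet classifies to, the loop below makes three
 * deliveries -- {A1, A2} to flow A's callback, {B1} to flow B's
 * callback, and {A3} to flow A again (packets that don't match any
 * subflow are handed to mac_rx_srs_process() for this SRS instead).
 * Only consecutive packets that classify to the same flow are kept
 * together, and the relative order of packets from the original
 * chain is preserved.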
2262 */ 2263 /* ARGSUSED */ 2264 void 2265 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2266 mblk_t *mp_chain, boolean_t loopback) 2267 { 2268 flow_entry_t *flent = NULL; 2269 flow_entry_t *prev_flent = NULL; 2270 mblk_t *mp = NULL; 2271 mblk_t *tail = NULL; 2272 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2273 mac_client_impl_t *mcip; 2274 2275 mcip = mac_srs->srs_mcip; 2276 ASSERT(mcip != NULL); 2277 2278 /* 2279 * We need to determine the SRS for every packet 2280 * by walking the flow table, if we don't get any, 2281 * then we proceed using the SRS we came with. 2282 */ 2283 mp = tail = mp_chain; 2284 while (mp != NULL) { 2285 2286 /* 2287 * We will increment the stats for the mactching subflow. 2288 * when we get the bytes/pkt count for the classified packets 2289 * later in mac_rx_srs_process. 2290 */ 2291 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2292 FLOW_INBOUND, &flent); 2293 2294 if (mp == mp_chain || flent == prev_flent) { 2295 if (prev_flent != NULL) 2296 FLOW_REFRELE(prev_flent); 2297 prev_flent = flent; 2298 flent = NULL; 2299 tail = mp; 2300 mp = mp->b_next; 2301 continue; 2302 } 2303 tail->b_next = NULL; 2304 /* 2305 * A null indicates, this is for the mac_srs itself. 2306 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2307 */ 2308 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2309 mac_rx_srs_process(arg, 2310 (mac_resource_handle_t)mac_srs, mp_chain, 2311 loopback); 2312 } else { 2313 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2314 prev_flent->fe_cb_arg2, mp_chain, loopback); 2315 FLOW_REFRELE(prev_flent); 2316 } 2317 prev_flent = flent; 2318 flent = NULL; 2319 mp_chain = mp; 2320 tail = mp; 2321 mp = mp->b_next; 2322 } 2323 /* Last chain */ 2324 ASSERT(mp_chain != NULL); 2325 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2326 mac_rx_srs_process(arg, 2327 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2328 } else { 2329 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2330 prev_flent->fe_cb_arg2, mp_chain, loopback); 2331 FLOW_REFRELE(prev_flent); 2332 } 2333 } 2334 2335 /* 2336 * mac_rx_srs_process 2337 * 2338 * Receive side routine called from the interrupt path. 2339 * 2340 * loopback is set to force a context switch on the loopback 2341 * path between MAC clients. 2342 */ 2343 /* ARGSUSED */ 2344 void 2345 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2346 boolean_t loopback) 2347 { 2348 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2349 mblk_t *mp, *tail, *head; 2350 int count = 0; 2351 int count1; 2352 size_t sz = 0; 2353 size_t chain_sz, sz1; 2354 mac_bw_ctl_t *mac_bw; 2355 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2356 2357 /* 2358 * Set the tail, count and sz. We set the sz irrespective 2359 * of whether we are doing B/W control or not for the 2360 * purpose of updating the stats. 
2361 */ 2362 mp = tail = mp_chain; 2363 while (mp != NULL) { 2364 tail = mp; 2365 count++; 2366 sz += msgdsize(mp); 2367 mp = mp->b_next; 2368 } 2369 2370 mutex_enter(&mac_srs->srs_lock); 2371 2372 if (loopback) { 2373 SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz); 2374 SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count); 2375 2376 } else { 2377 SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz); 2378 SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count); 2379 } 2380 2381 /* 2382 * If the SRS in already being processed; has been blanked; 2383 * can be processed by worker thread only; or the B/W limit 2384 * has been reached, then queue the chain and check if 2385 * worker thread needs to be awakend. 2386 */ 2387 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2388 mac_bw = mac_srs->srs_bw; 2389 ASSERT(mac_bw != NULL); 2390 mutex_enter(&mac_bw->mac_bw_lock); 2391 mac_bw->mac_bw_intr += sz; 2392 if (mac_bw->mac_bw_limit == 0) { 2393 /* zero bandwidth: drop all */ 2394 srs_rx->sr_stat.mrs_sdrops += count; 2395 mac_bw->mac_bw_drop_bytes += sz; 2396 mutex_exit(&mac_bw->mac_bw_lock); 2397 mutex_exit(&mac_srs->srs_lock); 2398 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2399 return; 2400 } else { 2401 if ((mac_bw->mac_bw_sz + sz) <= 2402 mac_bw->mac_bw_drop_threshold) { 2403 mutex_exit(&mac_bw->mac_bw_lock); 2404 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2405 tail, count, sz); 2406 } else { 2407 mp = mp_chain; 2408 chain_sz = 0; 2409 count1 = 0; 2410 tail = NULL; 2411 head = NULL; 2412 while (mp != NULL) { 2413 sz1 = msgdsize(mp); 2414 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2415 mac_bw->mac_bw_drop_threshold) 2416 break; 2417 chain_sz += sz1; 2418 count1++; 2419 tail = mp; 2420 mp = mp->b_next; 2421 } 2422 mutex_exit(&mac_bw->mac_bw_lock); 2423 if (tail != NULL) { 2424 head = tail->b_next; 2425 tail->b_next = NULL; 2426 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2427 mp_chain, tail, count1, chain_sz); 2428 sz -= chain_sz; 2429 count -= count1; 2430 } else { 2431 /* Can't pick up any */ 2432 head = mp_chain; 2433 } 2434 if (head != NULL) { 2435 /* Drop any packet over the threshold */ 2436 srs_rx->sr_stat.mrs_sdrops += count; 2437 mutex_enter(&mac_bw->mac_bw_lock); 2438 mac_bw->mac_bw_drop_bytes += sz; 2439 mutex_exit(&mac_bw->mac_bw_lock); 2440 freemsgchain(head); 2441 } 2442 } 2443 MAC_SRS_WORKER_WAKEUP(mac_srs); 2444 mutex_exit(&mac_srs->srs_lock); 2445 return; 2446 } 2447 } 2448 2449 /* 2450 * If the total number of packets queued in the SRS and 2451 * its associated soft rings exceeds the max allowed, 2452 * then drop the chain. If we are polling capable, this 2453 * shouldn't be happening. 2454 */ 2455 if (!(mac_srs->srs_type & SRST_BW_CONTROL) && 2456 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) { 2457 mac_bw = mac_srs->srs_bw; 2458 srs_rx->sr_stat.mrs_sdrops += count; 2459 mutex_enter(&mac_bw->mac_bw_lock); 2460 mac_bw->mac_bw_drop_bytes += sz; 2461 mutex_exit(&mac_bw->mac_bw_lock); 2462 freemsgchain(mp_chain); 2463 mutex_exit(&mac_srs->srs_lock); 2464 return; 2465 } 2466 2467 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz); 2468 2469 if (!(mac_srs->srs_state & SRS_PROC)) { 2470 /* 2471 * If we are coming via loopback, if we are not optimizing for 2472 * latency, or if our stack is running deep, we should signal 2473 * the worker thread. 2474 */ 2475 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) || 2476 MAC_RX_SRS_TOODEEP()) { 2477 /* 2478 * For loopback, We need to let the worker take 2479 * over as we don't want to continue in the same 2480 * thread even if we can. 
This could lead to stack 2481 * overflows and may also end up using 2482 * resources (cpu) incorrectly. 2483 */ 2484 cv_signal(&mac_srs->srs_async); 2485 } else { 2486 /* 2487 * Seems like no one is processing the SRS and 2488 * there is no backlog. We also inline process 2489 * our packet if its a single packet in non 2490 * latency optimized case (in latency optimized 2491 * case, we inline process chains of any size). 2492 */ 2493 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST); 2494 } 2495 } 2496 mutex_exit(&mac_srs->srs_lock); 2497 } 2498 2499 /* TX SIDE ROUTINES (RUNTIME) */ 2500 2501 /* 2502 * mac_tx_srs_no_desc 2503 * 2504 * This routine is called by Tx single ring default mode 2505 * when Tx ring runs out of descs. 2506 */ 2507 mac_tx_cookie_t 2508 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2509 uint16_t flag, mblk_t **ret_mp) 2510 { 2511 mac_tx_cookie_t cookie = NULL; 2512 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2513 boolean_t wakeup_worker = B_TRUE; 2514 uint32_t tx_mode = srs_tx->st_mode; 2515 int cnt, sz; 2516 mblk_t *tail; 2517 2518 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2519 if (flag & MAC_DROP_ON_NO_DESC) { 2520 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2521 } else { 2522 if (mac_srs->srs_first != NULL) 2523 wakeup_worker = B_FALSE; 2524 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2525 if (flag & MAC_TX_NO_ENQUEUE) { 2526 /* 2527 * If TX_QUEUED is not set, queue the 2528 * packet and let mac_tx_srs_drain() 2529 * set the TX_BLOCKED bit for the 2530 * reasons explained above. Otherwise, 2531 * return the mblks. 2532 */ 2533 if (wakeup_worker) { 2534 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2535 mp_chain, tail, cnt, sz); 2536 } else { 2537 MAC_TX_SET_NO_ENQUEUE(mac_srs, 2538 mp_chain, ret_mp, cookie); 2539 } 2540 } else { 2541 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2542 tail, cnt, sz, cookie); 2543 } 2544 if (wakeup_worker) 2545 cv_signal(&mac_srs->srs_async); 2546 } 2547 return (cookie); 2548 } 2549 2550 /* 2551 * mac_tx_srs_enqueue 2552 * 2553 * This routine is called when Tx SRS is operating in either serializer 2554 * or bandwidth mode. In serializer mode, a packet will get enqueued 2555 * when a thread cannot enter SRS exclusively. In bandwidth mode, 2556 * packets gets queued if allowed byte-count limit for a tick is 2557 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and 2558 * MAC_TX_NO_ENQUEUE is set is different than when operaing in either 2559 * the default mode or fanout mode. Here packets get dropped or 2560 * returned back to the caller only after hi-watermark worth of data 2561 * is queued. 2562 */ 2563 static mac_tx_cookie_t 2564 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2565 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2566 { 2567 mac_tx_cookie_t cookie = NULL; 2568 int cnt, sz; 2569 mblk_t *tail; 2570 boolean_t wakeup_worker = B_TRUE; 2571 2572 /* 2573 * Ignore fanout hint if we don't have multiple tx rings. 
2574 */ 2575 if (!MAC_TX_SOFT_RINGS(mac_srs)) 2576 fanout_hint = 0; 2577 2578 if (mac_srs->srs_first != NULL) 2579 wakeup_worker = B_FALSE; 2580 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2581 if (flag & MAC_DROP_ON_NO_DESC) { 2582 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2583 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2584 } else { 2585 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2586 mp_chain, tail, cnt, sz); 2587 } 2588 } else if (flag & MAC_TX_NO_ENQUEUE) { 2589 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2590 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2591 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2592 ret_mp, cookie); 2593 } else { 2594 mp_chain->b_prev = (mblk_t *)fanout_hint; 2595 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2596 mp_chain, tail, cnt, sz); 2597 } 2598 } else { 2599 /* 2600 * If you are BW_ENFORCED, just enqueue the 2601 * packet. srs_worker will drain it at the 2602 * prescribed rate. Before enqueueing, save 2603 * the fanout hint. 2604 */ 2605 mp_chain->b_prev = (mblk_t *)fanout_hint; 2606 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2607 tail, cnt, sz, cookie); 2608 } 2609 if (wakeup_worker) 2610 cv_signal(&mac_srs->srs_async); 2611 return (cookie); 2612 } 2613 2614 /* 2615 * There are seven tx modes: 2616 * 2617 * 1) Default mode (SRS_TX_DEFAULT) 2618 * 2) Serialization mode (SRS_TX_SERIALIZE) 2619 * 3) Fanout mode (SRS_TX_FANOUT) 2620 * 4) Bandwdith mode (SRS_TX_BW) 2621 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2622 * 6) aggr Tx mode (SRS_TX_AGGR) 2623 * 7) aggr Tx bw mode (SRS_TX_BW_AGGR) 2624 * 2625 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2626 * based on the number of Tx rings requested for an SRS and whether 2627 * bandwidth control is requested or not. 2628 * 2629 * The default mode (i.e., no fanout/no bandwidth) is used when the 2630 * underlying NIC does not have Tx rings or just one Tx ring. In this mode, 2631 * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send(). 2632 * When the underlying Tx ring runs out of Tx descs, it starts queueing up 2633 * packets in SRS. When flow-control is relieved, the srs_worker drains 2634 * the queued packets and informs blocked clients to restart sending 2635 * packets. 2636 * 2637 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This 2638 * mode is used when the link has no Tx rings or only one Tx ring. 2639 * 2640 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2641 * Tx rings. Each Tx ring will have a soft ring associated with it. 2642 * These soft rings will be hung off the Tx SRS. Queueing if it happens 2643 * due to lack of Tx desc will be in individual soft ring (and not srs) 2644 * associated with Tx ring. 2645 * 2646 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2647 * only if bw is available. Otherwise the packets will be queued in 2648 * SRS. If fanout to multiple Tx rings is configured, the packets will 2649 * be fanned out among the soft rings associated with the Tx rings. 2650 * 2651 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 2652 * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring 2653 * belonging to a port on which the packet has to be sent. Aggr will 2654 * always have a pseudo Tx ring associated with it even when it is an 2655 * aggregation over a single NIC that has no Tx rings. 
Even in such a 2656 * case, the single pseudo Tx ring will have a soft ring associated with 2657 * it and the soft ring will hang off the SRS. 2658 * 2659 * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used. 2660 * In this mode, the bandwidth is first applied on the outgoing packets 2661 * and later mac_tx_addr_mode() function is called to send the packet out 2662 * of one of the pseudo Tx rings. 2663 * 2664 * Four flags are used in srs_state for indicating flow control 2665 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT. 2666 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the 2667 * driver below. 2668 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat 2669 * and flow-control pressure is applied back to clients. The clients expect 2670 * wakeup when flow-control is relieved. 2671 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk 2672 * got returned back to client either due to lack of Tx descs or due to bw 2673 * control reasons. The clients expect a wakeup when condition is relieved. 2674 * 2675 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but 2676 * some clients set the following values too: MAC_DROP_ON_NO_DESC, 2677 * MAC_TX_NO_ENQUEUE 2678 * Mac clients that do not want packets to be enqueued in the mac layer set 2679 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or 2680 * Tx soft rings but instead get dropped when the NIC runs out of desc. The 2681 * behaviour of this flag is different when the Tx is running in serializer 2682 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet 2683 * get dropped when Tx high watermark is reached. 2684 * There are some mac clients like vsw, aggr that want the mblks to be 2685 * returned back to clients instead of being queued in Tx SRS (or Tx soft 2686 * rings) under flow-control (i.e., out of desc or exceeding bw limits) 2687 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set. 2688 * In the default and Tx fanout mode, the un-transmitted mblks will be 2689 * returned back to the clients when the driver runs out of Tx descs. 2690 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or 2691 * soft ring) so that the clients can be woken up when Tx desc become 2692 * available. When running in serializer or bandwidth mode mode, 2693 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached. 2694 */ 2695 2696 mac_tx_func_t 2697 mac_tx_get_func(uint32_t mode) 2698 { 2699 return (mac_tx_mode_list[mode].mac_tx_func); 2700 } 2701 2702 /* ARGSUSED */ 2703 static mac_tx_cookie_t 2704 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2705 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2706 { 2707 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2708 mac_tx_stats_t stats; 2709 mac_tx_cookie_t cookie = NULL; 2710 2711 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 2712 2713 /* Regular case with a single Tx ring */ 2714 /* 2715 * SRS_TX_BLOCKED is set when underlying NIC runs 2716 * out of Tx descs and messages start getting 2717 * queued. It won't get reset until 2718 * tx_srs_drain() completely drains out the 2719 * messages. 
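 *
 * Tying this back to the flag discussion above, a client such as
 * vsw or aggr that passes MAC_TX_NO_ENQUEUE keeps any returned
 * chain itself and retries once it is told that Tx resources are
 * back. Roughly (a hypothetical sketch of the caller side only,
 * not a definition of the client API):
 *
 *	unsent = NULL;
 *	cookie = mac_tx(mch, chain, hint, MAC_TX_NO_ENQUEUE, &unsent);
 *	if (unsent != NULL) {
 *		hold "unsent" and wait for the Tx notify callback
 *		(driven by mac_tx_notify() below), then resubmit it.
 *	}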
2720 */ 2721 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2722 /* Tx descs/resources not available */ 2723 mutex_enter(&mac_srs->srs_lock); 2724 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2725 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2726 flag, ret_mp); 2727 mutex_exit(&mac_srs->srs_lock); 2728 return (cookie); 2729 } 2730 /* 2731 * While we were computing mblk count, the 2732 * flow control condition got relieved. 2733 * Continue with the transmission. 2734 */ 2735 mutex_exit(&mac_srs->srs_lock); 2736 } 2737 2738 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2739 mp_chain, &stats); 2740 2741 /* 2742 * Multiple threads could be here sending packets. 2743 * Under such conditions, it is not possible to 2744 * automically set SRS_TX_BLOCKED bit to indicate 2745 * out of tx desc condition. To atomically set 2746 * this, we queue the returned packet and do 2747 * the setting of SRS_TX_BLOCKED in 2748 * mac_tx_srs_drain(). 2749 */ 2750 if (mp_chain != NULL) { 2751 mutex_enter(&mac_srs->srs_lock); 2752 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2753 mutex_exit(&mac_srs->srs_lock); 2754 return (cookie); 2755 } 2756 SRS_TX_STATS_UPDATE(mac_srs, &stats); 2757 2758 return (NULL); 2759 } 2760 2761 /* 2762 * mac_tx_serialize_mode 2763 * 2764 * This is an experimental mode implemented as per the request of PAE. 2765 * In this mode, all callers attempting to send a packet to the NIC 2766 * will get serialized. Only one thread at any time will access the 2767 * NIC to send the packet out. 2768 */ 2769 /* ARGSUSED */ 2770 static mac_tx_cookie_t 2771 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2772 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2773 { 2774 mac_tx_stats_t stats; 2775 mac_tx_cookie_t cookie = NULL; 2776 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2777 2778 /* Single ring, serialize below */ 2779 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2780 mutex_enter(&mac_srs->srs_lock); 2781 if ((mac_srs->srs_first != NULL) || 2782 (mac_srs->srs_state & SRS_PROC)) { 2783 /* 2784 * In serialization mode, queue all packets until 2785 * TX_HIWAT is set. 2786 * If drop bit is set, drop if TX_HIWAT is set. 2787 * If no_enqueue is set, still enqueue until hiwat 2788 * is set and return mblks after TX_HIWAT is set. 2789 */ 2790 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2791 flag, NULL, ret_mp); 2792 mutex_exit(&mac_srs->srs_lock); 2793 return (cookie); 2794 } 2795 /* 2796 * No packets queued, nothing on proc and no flow 2797 * control condition. Fast-path, ok. Do inline 2798 * processing. 2799 */ 2800 mac_srs->srs_state |= SRS_PROC; 2801 mutex_exit(&mac_srs->srs_lock); 2802 2803 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2804 mp_chain, &stats); 2805 2806 mutex_enter(&mac_srs->srs_lock); 2807 mac_srs->srs_state &= ~SRS_PROC; 2808 if (mp_chain != NULL) { 2809 cookie = mac_tx_srs_enqueue(mac_srs, 2810 mp_chain, flag, NULL, ret_mp); 2811 } 2812 if (mac_srs->srs_first != NULL) { 2813 /* 2814 * We processed inline our packet and a new 2815 * packet/s got queued while we were 2816 * processing. Wakeup srs worker 2817 */ 2818 cv_signal(&mac_srs->srs_async); 2819 } 2820 mutex_exit(&mac_srs->srs_lock); 2821 2822 if (cookie == NULL) 2823 SRS_TX_STATS_UPDATE(mac_srs, &stats); 2824 2825 return (cookie); 2826 } 2827 2828 /* 2829 * mac_tx_fanout_mode 2830 * 2831 * In this mode, the SRS will have access to multiple Tx rings to send 2832 * the packet out. 
The fanout hint that is passed as an argument is 2833 * used to find an appropriate ring to fanout the traffic. Each Tx 2834 * ring, in turn, will have a soft ring associated with it. If a Tx 2835 * ring runs out of Tx desc's the returned packet will be queued in 2836 * the soft ring associated with that Tx ring. The srs itself will not 2837 * queue any packets. 2838 */ 2839 2840 #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2841 index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count), \ 2842 softring = mac_srs->srs_tx_soft_rings[index]; \ 2843 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2844 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2845 } 2846 2847 static mac_tx_cookie_t 2848 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2849 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2850 { 2851 mac_soft_ring_t *softring; 2852 uint64_t hash; 2853 uint_t index; 2854 mac_tx_cookie_t cookie = NULL; 2855 2856 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT || 2857 mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT); 2858 if (fanout_hint != 0) { 2859 /* 2860 * The hint is specified by the caller, simply pass the 2861 * whole chain to the soft ring. 2862 */ 2863 hash = HASH_HINT(fanout_hint); 2864 MAC_TX_SOFT_RING_PROCESS(mp_chain); 2865 } else { 2866 mblk_t *last_mp, *cur_mp, *sub_chain; 2867 uint64_t last_hash = 0; 2868 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2869 2870 /* 2871 * Compute the hash from the contents (headers) of the 2872 * packets of the mblk chain. Split the chains into 2873 * subchains of the same conversation. 2874 * 2875 * Since there may be more than one ring used for 2876 * sub-chains of the same call, and since the caller 2877 * does not maintain per conversation state since it 2878 * passed a zero hint, unsent subchains will be 2879 * dropped. 2880 */ 2881 2882 flag |= MAC_DROP_ON_NO_DESC; 2883 ret_mp = NULL; 2884 2885 ASSERT(ret_mp == NULL); 2886 2887 sub_chain = NULL; 2888 last_mp = NULL; 2889 2890 for (cur_mp = mp_chain; cur_mp != NULL; 2891 cur_mp = cur_mp->b_next) { 2892 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2893 B_TRUE); 2894 if (last_hash != 0 && hash != last_hash) { 2895 /* 2896 * Starting a different subchain, send current 2897 * chain out. 2898 */ 2899 ASSERT(last_mp != NULL); 2900 last_mp->b_next = NULL; 2901 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2902 sub_chain = NULL; 2903 } 2904 2905 /* add packet to subchain */ 2906 if (sub_chain == NULL) 2907 sub_chain = cur_mp; 2908 last_mp = cur_mp; 2909 last_hash = hash; 2910 } 2911 2912 if (sub_chain != NULL) { 2913 /* send last subchain */ 2914 ASSERT(last_mp != NULL); 2915 last_mp->b_next = NULL; 2916 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2917 } 2918 2919 cookie = NULL; 2920 } 2921 2922 return (cookie); 2923 } 2924 2925 /* 2926 * mac_tx_bw_mode 2927 * 2928 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2929 * only if bw is available. Otherwise the packets will be queued in 2930 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2931 * out to a Tx rings. 
2932 */ 2933 static mac_tx_cookie_t 2934 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2935 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2936 { 2937 int cnt, sz; 2938 mblk_t *tail; 2939 mac_tx_cookie_t cookie = NULL; 2940 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2941 clock_t now; 2942 2943 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2944 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2945 mutex_enter(&mac_srs->srs_lock); 2946 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2947 /* 2948 * zero bandwidth, no traffic is sent: drop the packets, 2949 * or return the whole chain if the caller requests all 2950 * unsent packets back. 2951 */ 2952 if (flag & MAC_TX_NO_ENQUEUE) { 2953 cookie = (mac_tx_cookie_t)mac_srs; 2954 *ret_mp = mp_chain; 2955 } else { 2956 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2957 } 2958 mutex_exit(&mac_srs->srs_lock); 2959 return (cookie); 2960 } else if ((mac_srs->srs_first != NULL) || 2961 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2962 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2963 fanout_hint, ret_mp); 2964 mutex_exit(&mac_srs->srs_lock); 2965 return (cookie); 2966 } 2967 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2968 now = ddi_get_lbolt(); 2969 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 2970 mac_srs->srs_bw->mac_bw_curr_time = now; 2971 mac_srs->srs_bw->mac_bw_used = 0; 2972 } else if (mac_srs->srs_bw->mac_bw_used > 2973 mac_srs->srs_bw->mac_bw_limit) { 2974 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2975 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2976 mp_chain, tail, cnt, sz); 2977 /* 2978 * Wakeup worker thread. Note that worker 2979 * thread has to be woken up so that it 2980 * can fire up the timer to be woken up 2981 * on the next tick. Also once 2982 * BW_ENFORCED is set, it can only be 2983 * reset by srs_worker thread. Until then 2984 * all packets will get queued up in SRS 2985 * and hence this this code path won't be 2986 * entered until BW_ENFORCED is reset. 2987 */ 2988 cv_signal(&mac_srs->srs_async); 2989 mutex_exit(&mac_srs->srs_lock); 2990 return (cookie); 2991 } 2992 2993 mac_srs->srs_bw->mac_bw_used += sz; 2994 mutex_exit(&mac_srs->srs_lock); 2995 2996 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2997 mac_soft_ring_t *softring; 2998 uint_t indx, hash; 2999 3000 hash = HASH_HINT(fanout_hint); 3001 indx = COMPUTE_INDEX(hash, 3002 mac_srs->srs_tx_ring_count); 3003 softring = mac_srs->srs_tx_soft_rings[indx]; 3004 return (mac_tx_soft_ring_process(softring, mp_chain, flag, 3005 ret_mp)); 3006 } else if (srs_tx->st_mode == SRS_TX_BW_AGGR) { 3007 return (mac_tx_aggr_mode(mac_srs, mp_chain, 3008 fanout_hint, flag, ret_mp)); 3009 } else { 3010 mac_tx_stats_t stats; 3011 3012 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3013 mp_chain, &stats); 3014 3015 if (mp_chain != NULL) { 3016 mutex_enter(&mac_srs->srs_lock); 3017 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 3018 if (mac_srs->srs_bw->mac_bw_used > sz) 3019 mac_srs->srs_bw->mac_bw_used -= sz; 3020 else 3021 mac_srs->srs_bw->mac_bw_used = 0; 3022 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 3023 fanout_hint, ret_mp); 3024 mutex_exit(&mac_srs->srs_lock); 3025 return (cookie); 3026 } 3027 SRS_TX_STATS_UPDATE(mac_srs, &stats); 3028 3029 return (NULL); 3030 } 3031 } 3032 3033 /* 3034 * mac_tx_aggr_mode 3035 * 3036 * This routine invokes an aggr function, aggr_find_tx_ring(), to find 3037 * a (pseudo) Tx ring belonging to a port on which the packet has to 3038 * be sent. 
aggr_find_tx_ring() first finds the outgoing port based on 3039 * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick 3040 * a Tx ring from the selected port. 3041 * 3042 * Note that a port can be deleted from the aggregation. In such a case, 3043 * the aggregation layer first separates the port from the rest of the 3044 * ports making sure that port (and thus any Tx rings associated with 3045 * it) won't get selected in the call to aggr_find_tx_ring() function. 3046 * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring 3047 * handles one by one which in turn will quiesce the Tx SRS and remove 3048 * the soft ring associated with the pseudo Tx ring. Unlike Rx side 3049 * where a cookie is used to protect against mac_rx_ring() calls on 3050 * rings that have been removed, no such cookie is needed on the Tx 3051 * side as the pseudo Tx ring won't be available anymore to 3052 * aggr_find_tx_ring() once the port has been removed. 3053 */ 3054 static mac_tx_cookie_t 3055 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 3056 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 3057 { 3058 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3059 mac_tx_ring_fn_t find_tx_ring_fn; 3060 mac_ring_handle_t ring = NULL; 3061 void *arg; 3062 mac_soft_ring_t *sringp; 3063 3064 find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn; 3065 arg = srs_tx->st_capab_aggr.mca_arg; 3066 if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL) 3067 return (NULL); 3068 sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index]; 3069 return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp)); 3070 } 3071 3072 void 3073 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie) 3074 { 3075 mac_cb_t *mcb; 3076 mac_tx_notify_cb_t *mtnfp; 3077 3078 /* Wakeup callback registered clients */ 3079 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3080 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3081 mcb = mcb->mcb_nextp) { 3082 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3083 mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie); 3084 } 3085 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3086 &mcip->mci_tx_notify_cb_list); 3087 } 3088 3089 /* ARGSUSED */ 3090 void 3091 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 3092 { 3093 mblk_t *head, *tail; 3094 size_t sz; 3095 uint32_t tx_mode; 3096 uint_t saved_pkt_count; 3097 mac_tx_stats_t stats; 3098 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3099 clock_t now; 3100 3101 saved_pkt_count = 0; 3102 ASSERT(mutex_owned(&mac_srs->srs_lock)); 3103 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 3104 3105 mac_srs->srs_state |= SRS_PROC; 3106 3107 tx_mode = srs_tx->st_mode; 3108 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 3109 if (mac_srs->srs_first != NULL) { 3110 head = mac_srs->srs_first; 3111 tail = mac_srs->srs_last; 3112 saved_pkt_count = mac_srs->srs_count; 3113 mac_srs->srs_first = NULL; 3114 mac_srs->srs_last = NULL; 3115 mac_srs->srs_count = 0; 3116 mutex_exit(&mac_srs->srs_lock); 3117 3118 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3119 head, &stats); 3120 3121 mutex_enter(&mac_srs->srs_lock); 3122 if (head != NULL) { 3123 /* Device out of tx desc, set block */ 3124 if (head->b_next == NULL) 3125 VERIFY(head == tail); 3126 tail->b_next = mac_srs->srs_first; 3127 mac_srs->srs_first = head; 3128 mac_srs->srs_count += 3129 (saved_pkt_count - stats.mts_opackets); 3130 if (mac_srs->srs_last == NULL) 3131 mac_srs->srs_last = tail; 3132 MAC_TX_SRS_BLOCK(mac_srs, 
head); 3133 } else { 3134 srs_tx->st_woken_up = B_FALSE; 3135 SRS_TX_STATS_UPDATE(mac_srs, &stats); 3136 } 3137 } 3138 } else if (tx_mode == SRS_TX_BW) { 3139 /* 3140 * We are here because the timer fired and we have some data 3141 * to tranmit. Also mac_tx_srs_worker should have reset 3142 * SRS_BW_ENFORCED flag 3143 */ 3144 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 3145 head = tail = mac_srs->srs_first; 3146 while (mac_srs->srs_first != NULL) { 3147 tail = mac_srs->srs_first; 3148 tail->b_prev = NULL; 3149 mac_srs->srs_first = tail->b_next; 3150 if (mac_srs->srs_first == NULL) 3151 mac_srs->srs_last = NULL; 3152 mac_srs->srs_count--; 3153 sz = msgdsize(tail); 3154 mac_srs->srs_size -= sz; 3155 saved_pkt_count++; 3156 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 3157 3158 if (mac_srs->srs_bw->mac_bw_used < 3159 mac_srs->srs_bw->mac_bw_limit) 3160 continue; 3161 3162 now = ddi_get_lbolt(); 3163 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3164 mac_srs->srs_bw->mac_bw_curr_time = now; 3165 mac_srs->srs_bw->mac_bw_used = sz; 3166 continue; 3167 } 3168 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3169 break; 3170 } 3171 3172 ASSERT((head == NULL && tail == NULL) || 3173 (head != NULL && tail != NULL)); 3174 if (tail != NULL) { 3175 tail->b_next = NULL; 3176 mutex_exit(&mac_srs->srs_lock); 3177 3178 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3179 head, &stats); 3180 3181 mutex_enter(&mac_srs->srs_lock); 3182 if (head != NULL) { 3183 uint_t size_sent; 3184 3185 /* Device out of tx desc, set block */ 3186 if (head->b_next == NULL) 3187 VERIFY(head == tail); 3188 tail->b_next = mac_srs->srs_first; 3189 mac_srs->srs_first = head; 3190 mac_srs->srs_count += 3191 (saved_pkt_count - stats.mts_opackets); 3192 if (mac_srs->srs_last == NULL) 3193 mac_srs->srs_last = tail; 3194 size_sent = sz - stats.mts_obytes; 3195 mac_srs->srs_size += size_sent; 3196 mac_srs->srs_bw->mac_bw_sz += size_sent; 3197 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 3198 mac_srs->srs_bw->mac_bw_used -= 3199 size_sent; 3200 } else { 3201 mac_srs->srs_bw->mac_bw_used = 0; 3202 } 3203 MAC_TX_SRS_BLOCK(mac_srs, head); 3204 } else { 3205 srs_tx->st_woken_up = B_FALSE; 3206 SRS_TX_STATS_UPDATE(mac_srs, &stats); 3207 } 3208 } 3209 } else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) { 3210 mblk_t *prev; 3211 uint64_t hint; 3212 3213 /* 3214 * We are here because the timer fired and we 3215 * have some quota to tranmit. 
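 *
 * The fanout hint that mac_tx_srs_enqueue() saved in b_prev is used
 * here to regroup the queued packets before they are handed to the
 * soft rings. For example (the hints are illustrative): a queue
 * whose saved hints are H1, H1, H2, H1 results in three
 * TX_SRS_TO_SOFT_RING() calls -- {H1, H1}, then {H2}, then {H1} --
 * so consecutive packets sharing a hint go down together.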
3216 */ 3217 prev = NULL; 3218 head = tail = mac_srs->srs_first; 3219 while (mac_srs->srs_first != NULL) { 3220 tail = mac_srs->srs_first; 3221 mac_srs->srs_first = tail->b_next; 3222 if (mac_srs->srs_first == NULL) 3223 mac_srs->srs_last = NULL; 3224 mac_srs->srs_count--; 3225 sz = msgdsize(tail); 3226 mac_srs->srs_size -= sz; 3227 mac_srs->srs_bw->mac_bw_used += sz; 3228 if (prev == NULL) 3229 hint = (ulong_t)tail->b_prev; 3230 if (hint != (ulong_t)tail->b_prev) { 3231 prev->b_next = NULL; 3232 mutex_exit(&mac_srs->srs_lock); 3233 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3234 head = tail; 3235 hint = (ulong_t)tail->b_prev; 3236 mutex_enter(&mac_srs->srs_lock); 3237 } 3238 3239 prev = tail; 3240 tail->b_prev = NULL; 3241 if (mac_srs->srs_bw->mac_bw_used < 3242 mac_srs->srs_bw->mac_bw_limit) 3243 continue; 3244 3245 now = ddi_get_lbolt(); 3246 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3247 mac_srs->srs_bw->mac_bw_curr_time = now; 3248 mac_srs->srs_bw->mac_bw_used = 0; 3249 continue; 3250 } 3251 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3252 break; 3253 } 3254 ASSERT((head == NULL && tail == NULL) || 3255 (head != NULL && tail != NULL)); 3256 if (tail != NULL) { 3257 tail->b_next = NULL; 3258 mutex_exit(&mac_srs->srs_lock); 3259 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3260 mutex_enter(&mac_srs->srs_lock); 3261 } 3262 } 3263 /* 3264 * SRS_TX_FANOUT case not considered here because packets 3265 * won't be queued in the SRS for this case. Packets will 3266 * be sent directly to soft rings underneath and if there 3267 * is any queueing at all, it would be in Tx side soft 3268 * rings. 3269 */ 3270 3271 /* 3272 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3273 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3274 */ 3275 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3276 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3277 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3278 boolean_t wakeup_required = B_FALSE; 3279 3280 if (mac_srs->srs_state & 3281 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3282 wakeup_required = B_TRUE; 3283 } 3284 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3285 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3286 mutex_exit(&mac_srs->srs_lock); 3287 if (wakeup_required) { 3288 mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs); 3289 /* 3290 * If the client is not the primary MAC client, then we 3291 * need to send the notification to the clients upper 3292 * MAC, i.e. mci_upper_mip. 3293 */ 3294 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3295 mcip->mci_upper_mip : mcip->mci_mip); 3296 } 3297 mutex_enter(&mac_srs->srs_lock); 3298 } 3299 mac_srs->srs_state &= ~SRS_PROC; 3300 } 3301 3302 /* 3303 * Given a packet, get the flow_entry that identifies the flow 3304 * to which that packet belongs. The flow_entry will contain 3305 * the transmit function to be used to send the packet. If the 3306 * function returns NULL, the packet should be sent using the 3307 * underlying NIC. 3308 */ 3309 static flow_entry_t * 3310 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3311 { 3312 flow_entry_t *flent = NULL; 3313 mac_client_impl_t *mcip; 3314 int err; 3315 3316 /* 3317 * Do classification on the packet. 3318 */ 3319 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3320 if (err != 0) 3321 return (NULL); 3322 3323 /* 3324 * This flent might just be an additional one on the MAC client, 3325 * i.e. for classification purposes (different fdesc), however 3326 * the resources, SRS et. 
al., are in the mci_flent, so if 3327 * this isn't the mci_flent, we need to get it. 3328 */ 3329 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3330 FLOW_REFRELE(flent); 3331 flent = mcip->mci_flent; 3332 FLOW_TRY_REFHOLD(flent, err); 3333 if (err != 0) 3334 return (NULL); 3335 } 3336 3337 return (flent); 3338 } 3339 3340 /* 3341 * This macro is only meant to be used by mac_tx_send(). 3342 */ 3343 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3344 if (vid_check) { \ 3345 int err = 0; \ 3346 \ 3347 MAC_VID_CHECK(src_mcip, (mp), err); \ 3348 if (err != 0) { \ 3349 freemsg((mp)); \ 3350 (mp) = next; \ 3351 oerrors++; \ 3352 continue; \ 3353 } \ 3354 } \ 3355 if (add_tag) { \ 3356 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3357 if ((mp) == NULL) { \ 3358 (mp) = next; \ 3359 oerrors++; \ 3360 continue; \ 3361 } \ 3362 } \ 3363 } 3364 3365 mblk_t * 3366 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3367 mac_tx_stats_t *stats) 3368 { 3369 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3370 mac_impl_t *mip = src_mcip->mci_mip; 3371 uint_t obytes = 0, opackets = 0, oerrors = 0; 3372 mblk_t *mp = NULL, *next; 3373 boolean_t vid_check, add_tag; 3374 uint16_t vid = 0; 3375 3376 if (mip->mi_nclients > 1) { 3377 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3378 add_tag = MAC_TAG_NEEDED(src_mcip); 3379 if (add_tag) 3380 vid = mac_client_vid(mch); 3381 } else { 3382 ASSERT(mip->mi_nclients == 1); 3383 vid_check = add_tag = B_FALSE; 3384 } 3385 3386 /* 3387 * Fastpath: if there's only one client, we simply send 3388 * the packet down to the underlying NIC. 3389 */ 3390 if (mip->mi_nactiveclients == 1) { 3391 DTRACE_PROBE2(fastpath, 3392 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3393 3394 mp = mp_chain; 3395 while (mp != NULL) { 3396 next = mp->b_next; 3397 mp->b_next = NULL; 3398 opackets++; 3399 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3400 msgdsize(mp)); 3401 3402 CHECK_VID_AND_ADD_TAG(mp); 3403 MAC_TX(mip, ring, mp, src_mcip); 3404 3405 /* 3406 * If the driver is out of descriptors and does a 3407 * partial send it will return a chain of unsent 3408 * mblks. Adjust the accounting stats. 3409 */ 3410 if (mp != NULL) { 3411 opackets--; 3412 obytes -= msgdsize(mp); 3413 mp->b_next = next; 3414 break; 3415 } 3416 mp = next; 3417 } 3418 goto done; 3419 } 3420 3421 /* 3422 * No fastpath, we either have more than one MAC client 3423 * defined on top of the same MAC, or one or more MAC 3424 * client promiscuous callbacks. 3425 */ 3426 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3427 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3428 3429 mp = mp_chain; 3430 while (mp != NULL) { 3431 flow_entry_t *dst_flow_ent; 3432 void *flow_cookie; 3433 size_t pkt_size; 3434 mblk_t *mp1; 3435 3436 next = mp->b_next; 3437 mp->b_next = NULL; 3438 opackets++; 3439 pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); 3440 obytes += pkt_size; 3441 CHECK_VID_AND_ADD_TAG(mp); 3442 3443 /* 3444 * Find the destination. 
3445 */ 3446 dst_flow_ent = mac_tx_classify(mip, mp); 3447 3448 if (dst_flow_ent != NULL) { 3449 size_t hdrsize; 3450 int err = 0; 3451 3452 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3453 struct ether_vlan_header *evhp = 3454 (struct ether_vlan_header *)mp->b_rptr; 3455 3456 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3457 hdrsize = sizeof (*evhp); 3458 else 3459 hdrsize = sizeof (struct ether_header); 3460 } else { 3461 mac_header_info_t mhi; 3462 3463 err = mac_header_info((mac_handle_t)mip, 3464 mp, &mhi); 3465 if (err == 0) 3466 hdrsize = mhi.mhi_hdrsize; 3467 } 3468 3469 /* 3470 * Got a matching flow. It's either another 3471 * MAC client, or a broadcast/multicast flow. 3472 * Make sure the packet size is within the 3473 * allowed size. If not drop the packet and 3474 * move to next packet. 3475 */ 3476 if (err != 0 || 3477 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3478 oerrors++; 3479 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3480 mblk_t *, mp); 3481 freemsg(mp); 3482 mp = next; 3483 FLOW_REFRELE(dst_flow_ent); 3484 continue; 3485 } 3486 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3487 if (flow_cookie != NULL) { 3488 /* 3489 * The vnic_bcast_send function expects 3490 * to receive the sender MAC client 3491 * as value for arg2. 3492 */ 3493 mac_bcast_send(flow_cookie, src_mcip, mp, 3494 B_TRUE); 3495 } else { 3496 /* 3497 * loopback the packet to a local MAC 3498 * client. We force a context switch 3499 * if both source and destination MAC 3500 * clients are used by IP, i.e. 3501 * bypass is set. 3502 */ 3503 boolean_t do_switch; 3504 mac_client_impl_t *dst_mcip = 3505 dst_flow_ent->fe_mcip; 3506 3507 /* 3508 * Check if there are promiscuous mode 3509 * callbacks defined. This check is 3510 * done here in the 'else' case and 3511 * not in other cases because this 3512 * path is for local loopback 3513 * communication which does not go 3514 * through MAC_TX(). For paths that go 3515 * through MAC_TX(), the promisc_list 3516 * check is done inside the MAC_TX() 3517 * macro. 3518 */ 3519 if (mip->mi_promisc_list != NULL) 3520 mac_promisc_dispatch(mip, mp, src_mcip); 3521 3522 do_switch = ((src_mcip->mci_state_flags & 3523 dst_mcip->mci_state_flags & 3524 MCIS_CLIENT_POLL_CAPABLE) != 0); 3525 3526 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3527 (dst_flow_ent->fe_cb_fn)( 3528 dst_flow_ent->fe_cb_arg1, 3529 dst_flow_ent->fe_cb_arg2, 3530 mp1, do_switch); 3531 } 3532 } 3533 FLOW_REFRELE(dst_flow_ent); 3534 } else { 3535 /* 3536 * Unknown destination, send via the underlying 3537 * NIC. 3538 */ 3539 MAC_TX(mip, ring, mp, src_mcip); 3540 if (mp != NULL) { 3541 /* 3542 * Adjust for the last packet that 3543 * could not be transmitted 3544 */ 3545 opackets--; 3546 obytes -= pkt_size; 3547 mp->b_next = next; 3548 break; 3549 } 3550 } 3551 mp = next; 3552 } 3553 3554 done: 3555 stats->mts_obytes = obytes; 3556 stats->mts_opackets = opackets; 3557 stats->mts_oerrors = oerrors; 3558 return (mp); 3559 } 3560 3561 /* 3562 * mac_tx_srs_ring_present 3563 * 3564 * Returns whether the specified ring is part of the specified SRS. 
/*
 * mac_tx_srs_ring_present
 *
 * Returns whether the specified ring is part of the specified SRS.
 */
boolean_t
mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
{
    int i;
    mac_soft_ring_t *soft_ring;

    if (srs->srs_tx.st_arg2 == tx_ring)
        return (B_TRUE);

    for (i = 0; i < srs->srs_tx_ring_count; i++) {
        soft_ring = srs->srs_tx_soft_rings[i];
        if (soft_ring->s_ring_tx_arg2 == tx_ring)
            return (B_TRUE);
    }

    return (B_FALSE);
}

/*
 * mac_tx_srs_get_soft_ring
 *
 * Returns the TX soft ring associated with the given ring, if present.
 */
mac_soft_ring_t *
mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
{
    int i;
    mac_soft_ring_t *soft_ring;

    if (srs->srs_tx.st_arg2 == tx_ring)
        return (NULL);

    for (i = 0; i < srs->srs_tx_ring_count; i++) {
        soft_ring = srs->srs_tx_soft_rings[i];
        if (soft_ring->s_ring_tx_arg2 == tx_ring)
            return (soft_ring);
    }

    return (NULL);
}

/*
 * mac_tx_srs_wakeup
 *
 * Called when Tx descriptors become available. Wake up the appropriate
 * worker thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
 * state field.
 */
void
mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
{
    int i;
    mac_soft_ring_t *sringp;
    mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;

    mutex_enter(&mac_srs->srs_lock);
    /*
     * srs_tx_ring_count == 0 is the single ring mode case. In
     * this mode, there will not be Tx soft rings associated
     * with the SRS.
     */
    if (!MAC_TX_SOFT_RINGS(mac_srs)) {
        if (srs_tx->st_arg2 == ring &&
            mac_srs->srs_state & SRS_TX_BLOCKED) {
            mac_srs->srs_state &= ~SRS_TX_BLOCKED;
            srs_tx->st_stat.mts_unblockcnt++;
            cv_signal(&mac_srs->srs_async);
        }
        /*
         * A wakeup can come before tx_srs_drain() could
         * grab the srs lock and set SRS_TX_BLOCKED. So
         * always set the woken_up flag when we come here.
         */
        srs_tx->st_woken_up = B_TRUE;
        mutex_exit(&mac_srs->srs_lock);
        return;
    }

    /*
     * If you are here, it is for the FANOUT, BW_FANOUT,
     * AGGR_MODE or AGGR_BW_MODE case.
     */
    for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
        sringp = mac_srs->srs_tx_soft_rings[i];
        mutex_enter(&sringp->s_ring_lock);
        if (sringp->s_ring_tx_arg2 == ring) {
            if (sringp->s_ring_state & S_RING_BLOCK) {
                sringp->s_ring_state &= ~S_RING_BLOCK;
                sringp->s_st_stat.mts_unblockcnt++;
                cv_signal(&sringp->s_ring_async);
            }
            sringp->s_ring_tx_woken_up = B_TRUE;
        }
        mutex_exit(&sringp->s_ring_lock);
    }
    mutex_exit(&mac_srs->srs_lock);
}

/*
 * Once the driver is done draining, send a MAC_NOTE_TX notification to
 * unleash the blocked clients again.
 */
void
mac_tx_notify(mac_impl_t *mip)
{
    i_mac_notify(mip, MAC_NOTE_TX);
}
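/*
 * Illustrative sketch (not part of the build): how this wakeup is typically
 * triggered. A GLDv3 driver that ran out of Tx descriptors calls
 * mac_tx_ring_update() (or mac_tx_update() for a single-ring driver) once it
 * has reclaimed descriptors in its Tx completion path, which eventually
 * lands in mac_tx_srs_wakeup() and mac_tx_notify() above. The driver state
 * names (xx_tx_ring_t, xx_mh, xx_ring_handle, xx_reclaim_descs()) are
 * hypothetical.
 *
 *	static uint_t
 *	xx_tx_intr(caddr_t arg1, caddr_t arg2)
 *	{
 *		xx_tx_ring_t *txr = (xx_tx_ring_t *)arg1;
 *
 *		if (xx_reclaim_descs(txr) > 0 && txr->xx_blocked) {
 *			txr->xx_blocked = B_FALSE;
 *			mac_tx_ring_update(txr->xx_mh, txr->xx_ring_handle);
 *		}
 *		return (DDI_INTR_CLAIMED);
 *	}
 */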
/*
 * RX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are only here for
 * a short period.
 */

#define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
    /*								\
     * Enqueue our mblk chain.					\
     */								\
    ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
								\
    if ((ringp)->s_ring_last != NULL)				\
        (ringp)->s_ring_last->b_next = (mp);			\
    else							\
        (ringp)->s_ring_first = (mp);				\
    (ringp)->s_ring_last = (tail);				\
    (ringp)->s_ring_count += (cnt);				\
    ASSERT((ringp)->s_ring_count > 0);				\
    if ((ringp)->s_ring_type & ST_RING_BW_CTL) {		\
        (ringp)->s_ring_size += sz;				\
    }								\
}

/*
 * Default entry point to deliver a packet chain to a MAC client.
 * If the MAC client has flows, do the classification with these
 * flows as well.
 */
/* ARGSUSED */
void
mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
    mac_header_info_t *arg3)
{
    mac_client_impl_t *mcip = arg1;

    if (mcip->mci_nvids == 1 &&
        !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
        /*
         * If the client has exactly one VID associated with it
         * and stripping of the VLAN header is not disabled,
         * remove the VLAN tag from the packet before
         * passing it on to the client's receive callback.
         * Note that this needs to be done after we dispatch
         * the packet to the promiscuous listeners of the
         * client, since they expect to see the whole
         * frame including the VLAN headers.
         */
        mp_chain = mac_strip_vlan_tag_chain(mp_chain);
    }

    mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
}
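/*
 * Illustrative sketch (not part of the build): mci_rx_fn/mci_rx_arg above are
 * whatever the MAC client registered via mac_rx_set(). A hypothetical "xx"
 * client would typically do something like the following; the xx_* names and
 * the "loopback" parameter name are assumptions for illustration.
 *
 *	static void
 *	xx_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp_chain,
 *	    boolean_t loopback)
 *	{
 *		xx_state_t *xsp = arg;
 *
 *		// Consume the (possibly VLAN-stripped) chain.
 *		xx_input(xsp, mp_chain);
 *	}
 *
 *	...
 *	mac_rx_set(xsp->xx_mch, xx_rx, xsp);
 */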
/*
 * mac_rx_soft_ring_process
 *
 * Process a chain for a given soft ring. If the number of packets queued
 * in the SRS and its associated soft rings (including this one), tracked
 * by srs_poll_pkt_cnt, is very small, then allow the entering thread
 * (interrupt or poll thread) to do inline processing. This helps keep
 * the latency down under low load.
 *
 * The proc and arg for each mblk are already stored in the mblk in the
 * appropriate places.
 */
/* ARGSUSED */
void
mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
    mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
{
    mac_direct_rx_t proc;
    void *arg1;
    mac_resource_handle_t arg2;
    mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;

    ASSERT(ringp != NULL);
    ASSERT(mp_chain != NULL);
    ASSERT(tail != NULL);
    ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));

    mutex_enter(&ringp->s_ring_lock);
    ringp->s_ring_total_inpkt += cnt;
    ringp->s_ring_total_rbytes += sz;
    if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
        !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
        /* If on processor or blanking on, then enqueue and return */
        if (ringp->s_ring_state & S_RING_BLANK ||
            ringp->s_ring_state & S_RING_PROC) {
            SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
            mutex_exit(&ringp->s_ring_lock);
            return;
        }
        proc = ringp->s_ring_rx_func;
        arg1 = ringp->s_ring_rx_arg1;
        arg2 = ringp->s_ring_rx_arg2;
        /*
         * See if anything is already queued. If we are the
         * first packet, do inline processing, else queue the
         * packet and do the drain.
         */
        if (ringp->s_ring_first == NULL) {
            /*
             * Fast path; OK to process, and nothing is queued.
             */
            ringp->s_ring_run = curthread;
            ringp->s_ring_state |= (S_RING_PROC);

            mutex_exit(&ringp->s_ring_lock);

            /*
             * We are a chain of one packet, so
             * go through this fast path.
             */
            ASSERT(mp_chain->b_next == NULL);

            (*proc)(arg1, arg2, mp_chain, NULL);

            ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
            /*
             * If we have a soft ring set which is doing
             * bandwidth control, we need to decrement
             * srs_size and count so that the SRS has an
             * accurate idea of how much data is really
             * queued between the SRS and its soft rings.
             * We decrement the counters only when the
             * packet has been processed by both the SRS
             * and the soft ring.
             */
            mutex_enter(&mac_srs->srs_lock);
            MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
            MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
            mutex_exit(&mac_srs->srs_lock);

            mutex_enter(&ringp->s_ring_lock);
            ringp->s_ring_run = NULL;
            ringp->s_ring_state &= ~S_RING_PROC;
            if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
                cv_signal(&ringp->s_ring_client_cv);

            if ((ringp->s_ring_first == NULL) ||
                (ringp->s_ring_state & S_RING_BLANK)) {
                /*
                 * We processed our packet inline and
                 * nothing new has arrived, or our
                 * receiver doesn't want to receive
                 * any packets. We are done.
                 */
                mutex_exit(&ringp->s_ring_lock);
                return;
            }
        } else {
            SOFT_RING_ENQUEUE_CHAIN(ringp,
                mp_chain, tail, cnt, sz);
        }

        /*
         * We are here because either we couldn't do inline
         * processing (because something was already
         * queued), or we had a chain of more than one
         * packet, or something else arrived after we were
         * done with inline processing.
         */
        ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
        ASSERT(ringp->s_ring_first != NULL);

        ringp->s_ring_drain_func(ringp);
        mutex_exit(&ringp->s_ring_lock);
        return;
    } else {
        /* ST_RING_WORKER_ONLY case */
        SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
        mac_soft_ring_worker_wakeup(ringp);
        mutex_exit(&ringp->s_ring_lock);
    }
}

/*
 * TX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are only here for
 * a short period.
 */

#define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
    ASSERT(MUTEX_HELD(&ringp->s_ring_lock));			\
    ringp->s_ring_state |= S_RING_ENQUEUED;			\
    SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);	\
}

/*
 * mac_tx_sring_enqueue
 *
 * When we are out of transmit descriptors and we already have a
 * queue that exceeds hiwat (or the client called us with the
 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
 * soft ring pointer as the opaque cookie so that the client can
 * enable flow control.
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
    int cnt;
    size_t sz;
    mblk_t *tail;
    mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
    mac_tx_cookie_t cookie = NULL;
    boolean_t wakeup_worker = B_TRUE;

    ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
    MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
    if (flag & MAC_DROP_ON_NO_DESC) {
        mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
        /* increment freed stats */
        ringp->s_ring_drops += cnt;
        cookie = (mac_tx_cookie_t)ringp;
    } else {
        if (ringp->s_ring_first != NULL)
            wakeup_worker = B_FALSE;

        if (flag & MAC_TX_NO_ENQUEUE) {
            /*
             * If nothing is queued yet, queue the packet
             * and let mac_tx_soft_ring_drain() set
             * the TX_BLOCKED bit for the reasons
             * explained above. Otherwise, return the
             * mblks.
             */
            if (wakeup_worker) {
                TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
                    mp_chain, tail, cnt, sz);
            } else {
                ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
                cookie = (mac_tx_cookie_t)ringp;
                *ret_mp = mp_chain;
            }
        } else {
            boolean_t enqueue = B_TRUE;

            if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
                /*
                 * Flow controlled. Store ringp in the
                 * cookie so that it can be returned as
                 * the mac_tx_cookie_t to the client.
                 */
                ringp->s_ring_state |= S_RING_TX_HIWAT;
                cookie = (mac_tx_cookie_t)ringp;
                ringp->s_ring_hiwat_cnt++;
                if (ringp->s_ring_count >
                    ringp->s_ring_tx_max_q_cnt) {
                    /* increment freed stats */
                    ringp->s_ring_drops += cnt;
                    /*
                     * b_prev may be set to the fanout
                     * hint, hence we can't use freemsg
                     * directly.
                     */
                    mac_pkt_drop(NULL, NULL,
                        mp_chain, B_FALSE);
                    DTRACE_PROBE1(tx_queued_hiwat,
                        mac_soft_ring_t *, ringp);
                    enqueue = B_FALSE;
                }
            }
            if (enqueue) {
                TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
                    tail, cnt, sz);
            }
        }
        if (wakeup_worker)
            cv_signal(&ringp->s_ring_async);
    }
    return (cookie);
}

/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
    mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
    int cnt;
    size_t sz;
    mblk_t *tail;
    mac_tx_cookie_t cookie = NULL;

    ASSERT(ringp != NULL);
    ASSERT(mp_chain != NULL);
    ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
    /*
     * The following modes can come here: SRS_TX_BW_FANOUT,
     * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
     */
    ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
    ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
        mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
        mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
        mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);

    if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
        /* Serialization mode */

        mutex_enter(&ringp->s_ring_lock);
        if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
            cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                flag, ret_mp);
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
        TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
        if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
            /*
             * If the ring is blocked due to lack of Tx
             * descriptors, just return. The worker thread
             * will get scheduled when Tx descriptors
             * become available.
             */
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        mac_soft_ring_worker_wakeup(ringp);
        mutex_exit(&ringp->s_ring_lock);
        return (cookie);
    } else {
        /* Default fanout mode */
        /*
         * S_RING_BLOCK is set when the underlying NIC runs
         * out of Tx descriptors and messages start getting
         * queued. It won't get reset until
         * tx_srs_drain() completely drains out the
         * messages.
         */
        mac_tx_stats_t stats;

        if (ringp->s_ring_state & S_RING_ENQUEUED) {
            /* Tx descriptors/resources not available */
            mutex_enter(&ringp->s_ring_lock);
            if (ringp->s_ring_state & S_RING_ENQUEUED) {
                cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                    flag, ret_mp);
                mutex_exit(&ringp->s_ring_lock);
                return (cookie);
            }
            /*
             * While we were computing the mblk count, the
             * flow control condition got relieved.
             * Continue with the transmission.
             */
            mutex_exit(&ringp->s_ring_lock);
        }

        mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
            ringp->s_ring_tx_arg2, mp_chain, &stats);

        /*
         * Multiple threads could be here sending packets.
         * Under such conditions, it is not possible to
         * atomically set the S_RING_BLOCK bit to indicate
         * an out-of-Tx-descriptor condition. To set it
         * atomically, we queue the returned packet and do
         * the setting of S_RING_BLOCK in
         * mac_tx_soft_ring_drain().
         */
        if (mp_chain != NULL) {
            mutex_enter(&ringp->s_ring_lock);
            cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                flag, ret_mp);
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        SRS_TX_STATS_UPDATE(mac_srs, &stats);
        SOFTRING_TX_STATS_UPDATE(ringp, &stats);

        return (NULL);
    }
}
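/*
 * Illustrative sketch (not part of the build): what the returned
 * mac_tx_cookie_t means to a MAC client. A non-NULL cookie from the Tx path
 * (for example from mac_tx()) tells the client it is flow controlled on that
 * soft ring/SRS; the client typically stops sending and waits for the Tx
 * notification callback registered via mac_client_tx_notify() before
 * retrying. The xx_* names below are hypothetical.
 *
 *	mac_tx_cookie_t cookie;
 *	mblk_t *rest = NULL;
 *
 *	cookie = mac_tx(xsp->xx_mch, mp, fanout_hint, MAC_TX_NO_ENQUEUE,
 *	    &rest);
 *	if (cookie != NULL) {
 *		// Flow controlled: stash the unsent mblks and the cookie,
 *		// and back off until the notify callback fires for it.
 *		xx_blocked(xsp, cookie, rest);
 *	}
 */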