/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
	mac_tx_srs_mode_t	mac_tx_mode;
	mac_tx_func_t		mac_tx_func;
} mac_tx_mode_t;

/*
 * There are seven modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, or if the link is an aggr, etc.
 *
 * When the Tx SRS is operating in aggr mode (st_mode) or if there are
 * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
 * otherwise) will have a soft ring associated with it. These soft rings
 * are stored in srs_tx_soft_rings[] array.
 *
 * Additionally in the case of aggr, there is the st_soft_rings[] array
 * in the mac_srs_tx_t structure. This array is used to store the same
 * set of soft rings that are present in srs_tx_soft_rings[] array but
 * in a different manner. The soft ring associated with the pseudo Tx
 * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
 * array. This helps in quickly getting the soft ring associated with the
 * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
 * be used for transmit.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
	{SRS_TX_BW,		mac_tx_bw_mode},
	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode},
	{SRS_TX_AGGR,		mac_tx_aggr_mode},
	{SRS_TX_BW_AGGR,	mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The Run time code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout etc.
 *
 * We try to use H/W classification on NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always a SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to
 * the S/W classifier and tie a SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per ticks and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remainder
 * of the tick. The SRS poll thread only polls for bytes that are
 * allowed to come in the SRS. We typically let 4x the configured
 * B/W worth of packets to come in the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to respective SRS.
 * The S/W classification rule is always setup and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRS's are used on both Tx and Rx side. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each Soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only Max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing plus we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or let the SRS worker thread do the processing. This applies if
 * the SRS was not being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing. For throughput
 * under load, this should be false.
 *
 * This (and other similar) tunable should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;

/*
 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue a mp or chain in the soft ring set and increment the
 * local count (srs_count) for the SRS and the shared counter
 * (srs_poll_pkt_cnt - shared between SRS and its soft rings
 * to track the total unprocessed packets for polling to work
 * correctly).
 *
 * The size (total bytes queued) counters are incremented only
 * if we are doing B/W control.
 */
#define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if ((mac_srs)->srs_last != NULL) \
		(mac_srs)->srs_last->b_next = (head); \
	else \
		(mac_srs)->srs_first = (head); \
	(mac_srs)->srs_last = (tail); \
	(mac_srs)->srs_count += count; \
}

#define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx; \
 \
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
	srs_rx->sr_poll_pkt_cnt += count; \
	ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
		(mac_srs)->srs_size += (sz); \
		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
		(mac_srs)->srs_bw->mac_bw_sz += (sz); \
		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
	} \
}

#define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
	mac_srs->srs_state |= SRS_ENQUEUED; \
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
		(mac_srs)->srs_size += (sz); \
		(mac_srs)->srs_bw->mac_bw_sz += (sz); \
	} \
}

/*
 * Turn polling on routines
 */
#define	MAC_SRS_POLLING_ON(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if (((mac_srs)->srs_state & \
	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
		(mac_srs)->srs_state |= SRS_POLLING; \
		(void) mac_hwring_disable_intr((mac_ring_handle_t) \
		    (mac_srs)->srs_ring); \
		(mac_srs)->srs_rx.sr_poll_on++; \
	} \
}

#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if (((mac_srs)->srs_state & \
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
	    (SRS_POLLING_CAPAB|SRS_WORKER)) { \
		(mac_srs)->srs_state |= SRS_POLLING; \
		(void) mac_hwring_disable_intr((mac_ring_handle_t) \
		    (mac_srs)->srs_ring); \
		(mac_srs)->srs_rx.sr_worker_poll_on++; \
	} \
}

/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * Poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
 */
#define	MAC_SRS_POLL_RING(mac_srs) { \
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx; \
 \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	srs_rx->sr_poll_thr_sig++; \
	if (((mac_srs)->srs_state & \
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
	    (SRS_WORKER|SRS_POLLING_CAPAB)) { \
		(mac_srs)->srs_state |= SRS_GET_PKTS; \
		cv_signal(&(mac_srs)->srs_cv); \
	} else { \
		srs_rx->sr_poll_thr_busy++; \
	} \
}

/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come in the
 * system.
 */
#define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	ASSERT(((mac_srs)->srs_type & SRST_TX) || \
	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
	clock_t now = ddi_get_lbolt(); \
	if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \
		(mac_srs)->srs_bw->mac_bw_curr_time = now; \
		(mac_srs)->srs_bw->mac_bw_used = 0; \
		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
	} \
}

/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing the queue. If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if (!((mac_srs)->srs_state & SRS_PROC) && \
	    (mac_srs)->srs_tid == NULL) { \
		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \
		    (mac_srs_worker_wakeup_ticks == 0)) \
			cv_signal(&(mac_srs)->srs_async); \
		else \
			(mac_srs)->srs_tid = \
			    timeout(mac_srs_fire, (mac_srs), \
			    mac_srs_worker_wakeup_ticks); \
	} \
}

#define	TX_BANDWIDTH_MODE(mac_srs) \
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
	(mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT || \
	(mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)

#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
	if (tx_mode == SRS_TX_BW_FANOUT) \
		(void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL); \
	else \
		(void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL); \
}

/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp) { \
	ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
	if ((srs)->srs_tx.st_woken_up) { \
		(srs)->srs_tx.st_woken_up = B_FALSE; \
	} else { \
		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
		(srs)->srs_state |= SRS_TX_BLOCKED; \
		(srs)->srs_tx.st_stat.mts_blockcnt++; \
	} \
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
	boolean_t enqueue = 1; \
 \
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
		/* \
		 * flow-controlled. Store srs in cookie so that it \
		 * can be returned as mac_tx_cookie_t to client \
		 */ \
		(srs)->srs_state |= SRS_TX_HIWAT; \
		cookie = (mac_tx_cookie_t)srs; \
		(srs)->srs_tx.st_hiwat_cnt++; \
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
			/* increment freed stats */ \
			(srs)->srs_tx.st_stat.mts_sdrops += cnt; \
			/* \
			 * b_prev may be set to the fanout hint \
			 * hence can't use freemsg directly \
			 */ \
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
			DTRACE_PROBE1(tx_queued_hiwat, \
			    mac_soft_ring_set_t *, srs); \
			enqueue = 0; \
		} \
	} \
	if (enqueue) \
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
}

/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs) \
	if (!(srs->srs_type & SRST_TX)) \
		mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs) \
	if (!(srs->srs_type & SRST_TX)) \
		mutex_exit(&srs->srs_bw->mac_bw_lock);

#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
	mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
	/* increment freed stats */ \
	mac_srs->srs_tx.st_stat.mts_sdrops++; \
	cookie = (mac_tx_cookie_t)srs; \
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \
	cookie = (mac_tx_cookie_t)srs; \
	*ret_mp = mp_chain; \
}

/*
 * Drop the rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
	mac_srs_rx_t	*srs_rx = &srs->srs_rx;

	ASSERT(mp->b_next == NULL);
	mutex_enter(&srs->srs_lock);
	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
	mutex_exit(&srs->srs_lock);

	srs_rx->sr_stat.mrs_sdrops++;
	freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_tid == 0) {
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_tid = 0;
	if (!(mac_srs->srs_state & SRS_PROC))
		cv_signal(&mac_srs->srs_async);

	mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
 * and it is used on the TX path.
 */
#define	HASH_HINT(hint) \
	((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))


/*
 * hash based on the src address and the port information.
 */
#define	HASH_ADDR(src, ports) \
	(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
	((ports) >> 8) ^ (ports))

#define	COMPUTE_INDEX(key, sz)	(key % sz)

#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
	if ((tail) != NULL) { \
		ASSERT((tail)->b_next == NULL); \
		(tail)->b_next = (mp); \
	} else { \
		ASSERT((head) == NULL); \
		(head) = (mp); \
	} \
	(tail) = (mp); \
	(cnt)++; \
	if ((bw_ctl)) \
		(sz) += (sz0); \
}

#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
int mac_fanout_type = MAC_FANOUT_DEFAULT;

#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
	V4_TCP = 0,
	V4_UDP,
	OTH,
	UNDEF
};

/*
 * In general we do port based hashing to spread traffic over different
 * softrings. The below tunable allows overriding that behavior. Setting it
 * to B_TRUE allows doing the fanout based on the src ipv6 address. This
 * behavior is also applicable to ipv6 packets carrying multiple optional
 * headers and other uncommon packet types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;

/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4

/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined to an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined into TCP, UDP or OTH soft ring. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up etc.
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	size_t				hdrsize;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES];
	mblk_t				*tailmp[MAX_SR_TYPES];
	int				cnt[MAX_SR_TYPES];
	size_t				sz[MAX_SR_TYPES];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and it's a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs. Another way of disabling bypass is to set the
	 * MCIS_RX_BYPASS_DISABLE flag.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * sizeof (int));
	bzero(sz, MAX_SR_TYPES * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t	*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 *
		 * In the normal case the packet will have at least the L2
		 * header and the IP + Transport header in the same mblk.
		 * This is usually the case when the NIC driver sends up
		 * the packet. This is also true when the stack generates
		 * a packet that is looped back and when the stack uses the
		 * fastpath mechanism. The normal case is optimized for
		 * performance and may bypass DLS. All other cases go through
		 * the 'OTH' type path without DLS bypass.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
			type = OTH;

		if (type == OTH) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);
		/*
		 * We look for at least 4 bytes past the IP header to get
		 * the port information. If we get an IP fragment, we don't
		 * have the port information, and we use just the protocol
		 * information.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			type = V4_TCP;
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			type = OTH;
			break;
		}

		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
		    bw_ctl, sz[type], sz1, mp);
	}

	for (type = V4_TCP; type < UNDEF; type++) {
		if (headmp[type] != NULL) {
			mac_soft_ring_t	*softring;

			ASSERT(tailmp[type]->b_next == NULL);
			switch (type) {
			case V4_TCP:
				softring = mac_srs->srs_tcp_soft_rings[0];
				break;
			case V4_UDP:
				softring = mac_srs->srs_udp_soft_rings[0];
				break;
			case OTH:
				softring = mac_srs->srs_oth_soft_rings[0];
			}
			mac_rx_soft_ring_process(mcip, softring,
			    headmp[type], tailmp[type], cnt[type], sz[type]);
		}
	}
}

int	fanout_unalligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for IPv6
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
    uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
	ip6_t		*ip6h;
	uint8_t		*whereptr;
	uint_t		hash;
	uint16_t	remlen;
	uint8_t		nexthdr;
	uint16_t	hdr_len;

	if (sap == ETHERTYPE_IPV6) {
		boolean_t	modifiable = B_TRUE;

		ASSERT(MBLKL(mp) >= hdrsize);

		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
		if ((unsigned char *)ip6h == mp->b_wptr) {
			/*
			 * The first mblk_t only includes the mac header.
			 * Note that it is safe to change the mp pointer here,
			 * as the subsequent operation does not assume mp
			 * points to the start of the mac header.
			 */
			mp = mp->b_cont;

			/*
			 * Make sure ip6h holds the full ip6_t structure.
			 */
			if (mp == NULL)
				return (-1);

			if (MBLKL(mp) < IPV6_HDR_LEN) {
				modifiable = (DB_REF(mp) == 1);

				if (modifiable &&
				    !pullupmsg(mp, IPV6_HDR_LEN)) {
					return (-1);
				}
			}

			ip6h = (ip6_t *)mp->b_rptr;
		}

		if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
		    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
			/*
			 * If either ip6h is not aligned, or ip6h does not
			 * hold the complete ip6_t structure (a pullupmsg()
			 * is not an option since it would result in an
			 * unaligned ip6h), fanout to the default ring. Note
			 * that this may cause packet reordering.
			 */
			*indx = 0;
			*type = OTH;
			fanout_unalligned++;
			return (0);
		}

		remlen = ntohs(ip6h->ip6_plen);
		nexthdr = ip6h->ip6_nxt;

		if (remlen < MIN_EHDR_LEN)
			return (-1);
		/*
		 * Do src based fanout if the below tunable is set to B_TRUE or
		 * when mac_ip_hdr_length_v6() fails because of malformed
		 * packets or because mblks need to be concatenated using
		 * pullupmsg().
		 */
		if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(ip6h,
		    mp->b_wptr, &hdr_len, &nexthdr, NULL)) {
			goto src_based_fanout;
		}
		whereptr = (uint8_t *)ip6h + hdr_len;

		/* If the transport is one of the below, do port based fanout */
		switch (nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			/*
			 * If the ports in the transport header are not part of
			 * the mblk, do src_based_fanout, instead of calling
			 * pullupmsg().
			 */
			if (mp->b_cont != NULL &&
			    whereptr + PORTS_SIZE > mp->b_wptr) {
				goto src_based_fanout;
			}
			break;
		default:
			break;
		}

		switch (nexthdr) {
		case IPPROTO_TCP:
			hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
			    *(uint32_t *)whereptr);
			*indx = COMPUTE_INDEX(hash,
			    mac_srs->srs_tcp_ring_count);
			*type = OTH;
			break;

		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
				    *(uint32_t *)whereptr);
				*indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				*indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			*type = OTH;
			break;

		/* For all other protocols, do source based fanout */
		default:
			goto src_based_fanout;
		}
	} else {
		*indx = 0;
		*type = OTH;
	}
	return (0);

src_based_fanout:
	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
	*type = OTH;
	return (0);
}

/*
 * mac_rx_srs_fanout
 *
 * This routine delivers packets destined to an SRS into a soft ring member
 * of the set.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up etc.
 *
 * Note:
 * Since we know what is the maximum fanout possible, we create a 2D array
 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 * variables so that we can enter the softrings with a chain. We need the
 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 * for each packet would be expensive). If we ever want to have the
 * ability to have unlimited fanout, we should probably declare a head,
 * tail, cnt, sz with each soft ring (a data struct which contains a softring
 * along with these members) and create an array of this uber struct so we
 * don't have to do kmem_alloc.
 */
int	fanout_oth1 = 0;
int	fanout_oth2 = 0;
int	fanout_oth3 = 0;
int	fanout_oth4 = 0;
int	fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	uint_t				indx;
	size_t				ports_offset;
	size_t				ipha_len;
	size_t				hdrsize;
	uint_t				hash;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	int				fanout_cnt;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and it's a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs. Another way of disabling bypass is to set the
	 * MCIS_RX_BYPASS_DISABLE flag.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

	/*
	 * Since the softrings are never destroyed and we always
	 * create an equal number of softrings for TCP, UDP and the rest,
	 * it's OK to check one of them for count and use it without
	 * any lock. In future, if soft rings get destroyed because
	 * of reduction in fanout, we will need to ensure that happens
	 * behind the SRS_PROC.
	 */
	fanout_cnt = mac_srs->srs_tcp_ring_count;

	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}


		/*
		 * If we are using the default Rx ring where H/W or S/W
		 * classification has not happened, we need to verify if
		 * this unicast packet really belongs to us.
		 */
		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t	*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
			type = OTH;
			fanout_oth1++;
		}

		if (type != OTH) {
			uint16_t	frag_offset_flags;

			switch (ipha->ipha_protocol) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
			case IPPROTO_ESP:
				ipha_len = IPH_HDR_LENGTH(ipha);
				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
				    mp->b_wptr) {
					type = OTH;
					break;
				}
				frag_offset_flags =
				    ntohs(ipha->ipha_fragment_offset_and_flags);
				if ((frag_offset_flags &
				    (IPH_MF | IPH_OFFSET)) != 0) {
					type = OTH;
					fanout_oth3++;
					break;
				}
				ports_offset = hdrsize + ipha_len;
				break;
			default:
				type = OTH;
				fanout_oth4++;
				break;
			}
		}

		if (type == OTH) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);

		/*
		 * XXX-Sunay: We should hold srs_lock since ring_count
		 * below can change. But if we are always called from
		 * mac_rx_srs_drain and SRS_PROC is set, then we can
		 * enforce that ring_count can't be changed i.e.
		 * to change fanout type or ring count, the calling
		 * thread needs to be behind SRS_PROC.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			/*
			 * Note that for ESP, we fanout on SPI and it is at the
			 * same offset as the 2x16-bit ports. So it is clumped
			 * along with TCP, UDP and SCTP.
			 */
			hash = HASH_ADDR(ipha->ipha_src,
			    *(uint32_t *)(mp->b_rptr + ports_offset));
			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
			type = V4_TCP;
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(ipha->ipha_src,
				    *(uint32_t *)(mp->b_rptr + ports_offset));
				indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			indx = 0;
			type = OTH;
		}

		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
	}

	for (type = V4_TCP; type < UNDEF; type++) {
		int	i;

		for (i = 0; i < fanout_cnt; i++) {
			if (headmp[type][i] != NULL) {
				mac_soft_ring_t	*softring;

				ASSERT(tailmp[type][i]->b_next == NULL);
				switch (type) {
				case V4_TCP:
					softring =
					    mac_srs->srs_tcp_soft_rings[i];
					break;
				case V4_UDP:
					softring =
					    mac_srs->srs_udp_soft_rings[i];
					break;
				case OTH:
					softring =
					    mac_srs->srs_oth_soft_rings[i];
					break;
				}
				mac_rx_soft_ring_process(mcip,
				    softring, headmp[type][i], tailmp[type][i],
				    cnt[type][i], sz[type][i]);
			}
		}
	}
}

#define	SRS_BYTES_TO_PICKUP	150000
ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;

/*
 * mac_rx_srs_poll_ring
 *
 * The SRS poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker thread
 * to do the remaining processing.
 *
 * Since packets come in the system via interrupt or poll path, we also
 * update the stats and deal with promiscuous clients here.
 */
void
mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_cv;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mblk_t			*head, *tail, *mp;
	callb_cpr_t		cprinfo;
	ssize_t			bytes_to_pickup;
	size_t			sz;
	int			count;
	mac_client_impl_t	*smcip;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
	mutex_enter(lock);

start:
	for (;;) {
		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(async, lock);
		CALLB_CPR_SAFE_END(&cprinfo, lock);

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

check_again:
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			/*
			 * We pick as many bytes as we are allowed to queue.
			 * It's possible that we will exceed the total
			 * packets queued in case this SRS is part of the
			 * Rx ring group since > 1 poll thread can be pulling
			 * up to the max allowed packets at the same time
			 * but that should be OK.
			 */
			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
			bytes_to_pickup =
			    mac_srs->srs_bw->mac_bw_drop_threshold -
			    mac_srs->srs_bw->mac_bw_sz;
			/*
			 * We shouldn't have been signalled if we
			 * have 0 or less bytes to pick but since
			 * some of the bytes accounting is driver
			 * dependent, we do the safety check.
			 */
			if (bytes_to_pickup < 0)
				bytes_to_pickup = 0;
			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		} else {
			/*
			 * ToDO: Need to change the polling API
			 * to add a packet count and a flag which
			 * tells the driver whether we want packets
			 * based on a count, or bytes, or all the
			 * packets queued in the driver/HW. This
			 * way, we never have to check the limits
			 * on poll path. We truly let only as many
			 * packets enter the system as we are willing
			 * to process or queue.
			 *
			 * Something along the lines of
			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
			 *	mac_srs->srs_poll_pkt_cnt
			 */

			/*
			 * Since we are not doing B/W control, pick
			 * as many packets as allowed.
			 */
			bytes_to_pickup = max_bytes_to_pickup;
		}

		/* Poll the underlying Hardware */
		mutex_exit(lock);
		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
		mutex_enter(lock);

		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER);

		mp = tail = head;
		count = 0;
		sz = 0;
		while (mp != NULL) {
			tail = mp;
			sz += msgdsize(mp);
			mp = mp->b_next;
			count++;
		}

		if (head != NULL) {
			tail->b_next = NULL;
			smcip = mac_srs->srs_mcip;

			SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
			SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);

			/*
			 * If there are any promiscuous mode callbacks
			 * defined for this MAC client, pass them a copy
			 * if appropriate and also update the counters.
			 */
			if (smcip != NULL) {
				if (smcip->mci_mip->mi_promisc_list != NULL) {
					mutex_exit(lock);
					mac_promisc_dispatch(smcip->mci_mip,
					    head, NULL);
					mutex_enter(lock);
				}
			}
			if (mac_srs->srs_type & SRST_BW_CONTROL) {
				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
				mac_srs->srs_bw->mac_bw_polled += sz;
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
			}
			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
			    count, sz);
			if (count <= 10)
				srs_rx->sr_stat.mrs_chaincntundr10++;
			else if (count > 10 && count <= 50)
				srs_rx->sr_stat.mrs_chaincnt10to50++;
			else
				srs_rx->sr_stat.mrs_chaincntover50++;
		}

		/*
		 * We are guaranteed that SRS_PROC will be set if we
		 * are here. Also, the poll thread gets to run only if
		 * the drain was being done by a worker thread although
		 * it's possible that the worker thread is still running
		 * and the poll thread was sent down to keep the pipeline
		 * going instead of doing a complete drain and then
		 * trying to poll the NIC.
		 *
		 * So we need to check the SRS_WORKER flag to make sure
		 * that the worker thread is not processing the queue
		 * in parallel to us. The flags and conditions are
		 * protected by the srs_lock to prevent any race. We
		 * ensure that we don't drop the srs_lock from now
		 * till the end and similarly we don't drop the srs_lock
		 * in mac_rx_srs_drain() till similar condition checks
		 * are complete. The mac_rx_srs_drain() needs to ensure
		 * that the SRS_WORKER flag remains set as long as it's
		 * processing the queue.
		 */
		if (!(mac_srs->srs_state & SRS_WORKER) &&
		    (mac_srs->srs_first != NULL)) {
			/*
			 * We have packets to process and the worker thread
			 * is not running. Check to see if the poll thread is
			 * allowed to process.
			 */
			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
				if (!(mac_srs->srs_state & SRS_PAUSE) &&
				    srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat) {
					srs_rx->sr_poll_again++;
					goto check_again;
				}
				/*
				 * We are already above the low water mark
				 * so stay in the polling mode but no
				 * need to poll. Once we dip below
				 * the polling threshold, the processing
				 * thread (soft ring) will signal us
				 * to poll again (MAC_UPDATE_SRS_COUNT)
				 */
				srs_rx->sr_poll_drain_no_poll++;
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				/*
				 * In the B/W control case, it's possible
				 * that the backlog built up due to
				 * the B/W limit being reached and packets
				 * are queued only in the SRS. In this case,
				 * we should schedule the worker thread
				 * since no one else will wake us up.
				 */
				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
				    (mac_srs->srs_tid == NULL)) {
					mac_srs->srs_tid =
					    timeout(mac_srs_fire, mac_srs, 1);
					srs_rx->sr_poll_worker_wakeup++;
				}
			} else {
				/*
				 * Wakeup the worker thread for more processing.
				 * We optimize for throughput in this case.
				 */
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				MAC_SRS_WORKER_WAKEUP(mac_srs);
				srs_rx->sr_poll_sig_worker++;
			}
		} else if ((mac_srs->srs_first == NULL) &&
		    !(mac_srs->srs_state & SRS_WORKER)) {
			/*
			 * There is nothing queued in the SRS and
			 * no worker thread running. Plus we
			 * didn't get anything from the H/W
			 * as well (head == NULL);
			 */
			ASSERT(head == NULL);
			mac_srs->srs_state &=
			    ~(SRS_PROC|SRS_GET_PKTS);

			/*
			 * If we have packets in the soft ring, don't allow
			 * more packets to come into this SRS by keeping the
			 * interrupts off but not polling the H/W. The
			 * poll thread will get signaled as soon as
			 * srs_poll_pkt_cnt dips below the poll threshold.
			 */
			if (srs_rx->sr_poll_pkt_cnt == 0) {
				srs_rx->sr_poll_intr_enable++;
				MAC_SRS_POLLING_OFF(mac_srs);
			} else {
				/*
				 * We know nothing is queued in the SRS
				 * since we are here after checking
				 * srs_first is NULL. The backlog
				 * is entirely due to packets queued
				 * in the Soft ring which will wake us up
				 * and get the interface out of polling
				 * mode once the backlog dips below
				 * sr_poll_thres.
				 */
				srs_rx->sr_poll_no_poll++;
			}
		} else {
			/*
			 * The worker thread is already running.
			 * Nothing much to do. If the polling
			 * was enabled, the worker thread will deal
			 * with that.
			 */
			mac_srs->srs_state &= ~SRS_GET_PKTS;
			srs_rx->sr_poll_goto_sleep++;
		}
	}
done:
	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
	cv_signal(&mac_srs->srs_async);
	/*
	 * If this is a temporary quiesce then wait for the restart signal
	 * from the srs worker. Then clear the flags and signal the srs worker
	 * to ensure a positive handshake and go back to start.
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
		cv_wait(async, lock);
	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs->srs_state &=
		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
		cv_signal(&mac_srs->srs_async);
		goto start;
	} else {
		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
		cv_signal(&mac_srs->srs_async);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
	}
}

/*
 * mac_srs_pick_chain
 *
 * In the bandwidth control case, checks how many packets can be processed
 * and returns them in a sub chain.
 */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
	mblk_t	*head = NULL;
	mblk_t	*tail = NULL;
	size_t	sz;
	size_t	tsz = 0;
	int	cnt = 0;
	mblk_t	*mp;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
	    mac_srs->srs_bw->mac_bw_limit) ||
	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		head = mac_srs->srs_first;
		mac_srs->srs_first = NULL;
		*chain_tail = mac_srs->srs_last;
		mac_srs->srs_last = NULL;
		*chain_sz = mac_srs->srs_size;
		*chain_cnt = mac_srs->srs_count;
		mac_srs->srs_count = 0;
		mac_srs->srs_size = 0;
		return (head);
	}

	/*
	 * Can't clear the entire backlog.
	 * Need to find how many packets to pick
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
	while ((mp = mac_srs->srs_first) != NULL) {
		sz = msgdsize(mp);
		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
		    mac_srs->srs_bw->mac_bw_limit) {
			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
				mac_srs->srs_bw->mac_bw_state |=
				    SRS_BW_ENFORCED;
			break;
		}

		/*
		 * The _size & cnt is decremented from the softrings
		 * when they send up the packet for polling to work
		 * properly.
		 */
		tsz += sz;
		cnt++;
		mac_srs->srs_count--;
		mac_srs->srs_size -= sz;
		if (tail != NULL)
			tail->b_next = mp;
		else
			head = mp;
		tail = mp;
		mac_srs->srs_first = mac_srs->srs_first->b_next;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_first == NULL)
		mac_srs->srs_last = NULL;

	if (tail != NULL)
		tail->b_next = NULL;
	*chain_tail = tail;
	*chain_cnt = cnt;
	*chain_sz = tsz;

	return (head);
}

/*
 * mac_rx_srs_drain
 *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	if (mac_srs->srs_first == NULL)
		goto out;

	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
		/*
		 * In the normal case, the SRS worker thread does no
		 * work and we wait for a backlog to build up before
		 * we switch into polling mode. In case we are
		 * optimizing for throughput, we use the worker thread
		 * as well. The goal is to let the worker thread process
		 * the queue and the poll thread to feed packets into
		 * the queue. As such, we should signal the poll
		 * thread to try and get more packets.
		 *
		 * We could have pulled this check in the POLL_RING
		 * macro itself but keeping it explicit here makes
		 * the architecture more human understandable.
		 */
		MAC_SRS_POLL_RING(mac_srs);
	}

again:
	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);


	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL) {
		if (mcip->mci_promisc_list != NULL) {
			mutex_exit(&mac_srs->srs_lock);
			mac_promisc_client_dispatch(mcip, head);
			mutex_enter(&mac_srs->srs_lock);
		}
		if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
			mutex_exit(&mac_srs->srs_lock);
			mac_protect_intercept_dhcp(mcip, head);
			mutex_enter(&mac_srs->srs_lock);
		}
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resource constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
	    (mac_srs->srs_first != NULL)) {
		/*
		 * More packets arrived while we were clearing the
		 * SRS. This can be possible because of one of
		 * three conditions below:
		 * 1) The driver is using multiple worker threads
		 *    to send the packets to us.
		 * 2) The driver has a race in switching
		 *    between interrupt and polling mode or
		 * 3) Packets are arriving in this SRS via the
		 *    S/W classification as well.
		 *
		 * We should switch to polling mode and see if we
		 * need to send the poll thread down. Also, signal
		 * the worker thread to process what's just arrived.
		 */
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
			srs_rx->sr_drain_poll_sig++;
			MAC_SRS_POLL_RING(mac_srs);
		}

		/*
		 * If we didn't signal the poll thread, we need
		 * to deal with the pending packets ourselves.
		 */
		if (proc_type == SRS_WORKER) {
			srs_rx->sr_drain_again++;
			goto again;
		} else {
			srs_rx->sr_drain_worker_sig++;
			cv_signal(&mac_srs->srs_async);
		}
	}

out:
	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * The poll thread is already running. Leave
		 * SRS_PROC set and hand over the control to the
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in the SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in the soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done by the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * the low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turns polling on only for the worker thread.
	 * It's not worth turning polling on for the interrupt
	 * thread (since the NIC will not issue another interrupt)
	 * unless a backlog builds up.
1788 */ 1789 if ((srs_rx->sr_poll_pkt_cnt > 0) && 1790 (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 1791 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1792 srs_rx->sr_drain_keep_polling++; 1793 MAC_SRS_POLLING_ON(mac_srs); 1794 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1795 MAC_SRS_POLL_RING(mac_srs); 1796 return; 1797 } 1798 1799 /* Nothing else to do. Get out of poll mode */ 1800 MAC_SRS_POLLING_OFF(mac_srs); 1801 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1802 srs_rx->sr_drain_finish_intr++; 1803 } 1804 1805 /* 1806 * mac_rx_srs_drain_bw 1807 * 1808 * The SRS BW drain routine. Gets to run to clear the queue. Any thread 1809 * (worker, interrupt, poll) can call this based on processing model. 1810 * The first thing we do is disable interrupts if possible and then 1811 * drain the queue. we also try to poll the underlying hardware if 1812 * there is a dedicated hardware Rx ring assigned to this SRS. 1813 * 1814 * There is a equivalent drain routine in non bandwidth control mode 1815 * mac_rx_srs_drain. There is some code duplication between the two 1816 * routines but they are highly performance sensitive and are easier 1817 * to read/debug if they stay separate. Any code changes here might 1818 * also apply to mac_rx_srs_drain as well. 1819 */ 1820 void 1821 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1822 { 1823 mblk_t *head; 1824 mblk_t *tail; 1825 timeout_id_t tid; 1826 size_t sz = 0; 1827 int cnt = 0; 1828 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1829 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1830 clock_t now; 1831 1832 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1833 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 1834 again: 1835 /* Check if we are doing B/W control */ 1836 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1837 now = ddi_get_lbolt(); 1838 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 1839 mac_srs->srs_bw->mac_bw_curr_time = now; 1840 mac_srs->srs_bw->mac_bw_used = 0; 1841 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 1842 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 1843 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 1844 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1845 goto done; 1846 } else if (mac_srs->srs_bw->mac_bw_used > 1847 mac_srs->srs_bw->mac_bw_limit) { 1848 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 1849 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1850 goto done; 1851 } 1852 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1853 1854 /* If we are blanked i.e. can't do upcalls, then we are done */ 1855 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1856 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1857 (mac_srs->srs_state & SRS_PAUSE)); 1858 goto done; 1859 } 1860 1861 sz = 0; 1862 cnt = 0; 1863 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 1864 /* 1865 * We couldn't pick up a single packet. 1866 */ 1867 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1868 if ((mac_srs->srs_bw->mac_bw_used == 0) && 1869 (mac_srs->srs_size != 0) && 1870 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1871 /* 1872 * Seems like configured B/W doesn't 1873 * even allow processing of 1 packet 1874 * per tick. 1875 * 1876 * XXX: raise the limit to processing 1877 * at least 1 packet per tick. 
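 *
 * For example (hypothetical numbers): with mac_bw_limit at 1500
 * bytes per tick and a 9000 byte jumbo frame at the head of the
 * queue, the doubling below would be applied on successive drains
 * (1500 -> 3000 -> 6000 -> 12000) until the limit is large enough
 * for at least that one packet to be picked up per tick.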
1878 */ 1879 mac_srs->srs_bw->mac_bw_limit += 1880 mac_srs->srs_bw->mac_bw_limit; 1881 mac_srs->srs_bw->mac_bw_drop_threshold += 1882 mac_srs->srs_bw->mac_bw_drop_threshold; 1883 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1884 "raised B/W limit to %d since not even a " 1885 "single packet can be processed per " 1886 "tick %d\n", (void *)mac_srs, 1887 (int)mac_srs->srs_bw->mac_bw_limit, 1888 (int)msgdsize(mac_srs->srs_first)); 1889 } 1890 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1891 goto done; 1892 } 1893 1894 ASSERT(head != NULL); 1895 ASSERT(tail != NULL); 1896 1897 /* zero bandwidth: drop all and return to interrupt mode */ 1898 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1899 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1900 srs_rx->sr_stat.mrs_sdrops += cnt; 1901 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1902 mac_srs->srs_bw->mac_bw_sz -= sz; 1903 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1904 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1905 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1906 goto leave_poll; 1907 } else { 1908 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1909 } 1910 1911 if ((tid = mac_srs->srs_tid) != 0) 1912 mac_srs->srs_tid = 0; 1913 1914 mac_srs->srs_state |= (SRS_PROC|proc_type); 1915 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1916 1917 /* 1918 * mcip is NULL for broadcast and multicast flows. The promisc 1919 * callbacks for broadcast and multicast packets are delivered from 1920 * mac_rx() and we don't need to worry about that case in this path 1921 */ 1922 if (mcip != NULL) { 1923 if (mcip->mci_promisc_list != NULL) { 1924 mutex_exit(&mac_srs->srs_lock); 1925 mac_promisc_client_dispatch(mcip, head); 1926 mutex_enter(&mac_srs->srs_lock); 1927 } 1928 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) { 1929 mutex_exit(&mac_srs->srs_lock); 1930 mac_protect_intercept_dhcp(mcip, head); 1931 mutex_enter(&mac_srs->srs_lock); 1932 } 1933 } 1934 1935 /* 1936 * Check if SRS itself is doing the processing 1937 * This direct path does not apply when subflows are present. In this 1938 * case, packets need to be dispatched to a soft ring according to the 1939 * flow's bandwidth and other resources contraints. 1940 */ 1941 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1942 mac_direct_rx_t proc; 1943 void *arg1; 1944 mac_resource_handle_t arg2; 1945 1946 /* 1947 * This is the case when a Rx is directly 1948 * assigned and we have a fully classified 1949 * protocol chain. We can deal with it in 1950 * one shot. 1951 */ 1952 proc = srs_rx->sr_func; 1953 arg1 = srs_rx->sr_arg1; 1954 arg2 = srs_rx->sr_arg2; 1955 1956 mac_srs->srs_state |= SRS_CLIENT_PROC; 1957 mutex_exit(&mac_srs->srs_lock); 1958 if (tid != 0) { 1959 (void) untimeout(tid); 1960 tid = 0; 1961 } 1962 1963 proc(arg1, arg2, head, NULL); 1964 /* 1965 * Decrement the size and count here itelf 1966 * since the packet has been processed. 1967 */ 1968 mutex_enter(&mac_srs->srs_lock); 1969 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1970 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 1971 1972 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1973 cv_signal(&mac_srs->srs_client_cv); 1974 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1975 } else { 1976 /* Some kind of softrings based fanout is required */ 1977 mutex_exit(&mac_srs->srs_lock); 1978 if (tid != 0) { 1979 (void) untimeout(tid); 1980 tid = 0; 1981 } 1982 1983 /* 1984 * Since the fanout routines can deal with chains, 1985 * shoot the entire chain up. 
1986 */ 1987 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1988 mac_rx_srs_fanout(mac_srs, head); 1989 else 1990 mac_rx_srs_proto_fanout(mac_srs, head); 1991 mutex_enter(&mac_srs->srs_lock); 1992 } 1993 1994 /* 1995 * Send the poll thread to pick up any packets arrived 1996 * so far. This also serves as the last check in case 1997 * nothing else is queued in the SRS. The poll thread 1998 * is signalled only in the case the drain was done 1999 * by the worker thread and SRS_WORKER is set. The 2000 * worker thread can run in parallel as long as the 2001 * SRS_WORKER flag is set. We we have nothing else to 2002 * process, we can exit while leaving SRS_PROC set 2003 * which gives the poll thread control to process and 2004 * cleanup once it returns from the NIC. 2005 * 2006 * If we have nothing else to process, we need to 2007 * ensure that we keep holding the srs_lock till 2008 * all the checks below are done and control is 2009 * handed to the poll thread if it was running. 2010 */ 2011 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2012 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2013 if (mac_srs->srs_first != NULL) { 2014 if (proc_type == SRS_WORKER) { 2015 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2016 if (srs_rx->sr_poll_pkt_cnt <= 2017 srs_rx->sr_lowat) 2018 MAC_SRS_POLL_RING(mac_srs); 2019 goto again; 2020 } else { 2021 cv_signal(&mac_srs->srs_async); 2022 } 2023 } 2024 } 2025 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2026 2027 done: 2028 2029 if (mac_srs->srs_state & SRS_GET_PKTS) { 2030 /* 2031 * Poll thread is already running. Leave the 2032 * SRS_RPOC set and hand over the control to 2033 * poll thread. 2034 */ 2035 mac_srs->srs_state &= ~proc_type; 2036 return; 2037 } 2038 2039 /* 2040 * If we can't process packets because we have exceeded 2041 * B/W limit for this tick, just set the timeout 2042 * and leave. 2043 * 2044 * Even if there are no packets queued in SRS, we 2045 * need to make sure that the shared counter is 2046 * clear and any associated softrings have cleared 2047 * all the backlog. Otherwise, leave the interface 2048 * in polling mode and the poll thread will get 2049 * signalled once the count goes down to zero. 2050 * 2051 * If someone is already draining the queue (SRS_PROC is 2052 * set) when the srs_poll_pkt_cnt goes down to zero, 2053 * then it means that drain is already running and we 2054 * will turn off polling at that time if there is 2055 * no backlog. As long as there are packets queued either 2056 * is soft ring set or its soft rings, we will leave 2057 * the interface in polling mode. 2058 */ 2059 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2060 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 2061 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 2062 (srs_rx->sr_poll_pkt_cnt > 0))) { 2063 MAC_SRS_POLLING_ON(mac_srs); 2064 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2065 if ((mac_srs->srs_first != NULL) && 2066 (mac_srs->srs_tid == NULL)) 2067 mac_srs->srs_tid = timeout(mac_srs_fire, 2068 mac_srs, 1); 2069 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2070 return; 2071 } 2072 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2073 2074 leave_poll: 2075 2076 /* Nothing else to do. Get out of poll mode */ 2077 MAC_SRS_POLLING_OFF(mac_srs); 2078 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2079 } 2080 2081 /* 2082 * mac_srs_worker 2083 * 2084 * The SRS worker routine. Drains the queue when no one else is 2085 * processing it. 
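 *
 * A rough sketch of the loop below (illustrative summary only; the
 * actual wait condition also accounts for SRS_TX_BLOCKED and
 * SRS_PAUSE):
 *
 *	for (;;) {
 *		while (SRS_PROC is held by someone else ||
 *		    srs_first == NULL || bandwidth is enforced)
 *			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
 *		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
 *	}
 *
 * with SRS_PAUSE breaking out of the loop into the quiesce and
 * restart handling at the bottom of the function.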
2086 */ 2087 void 2088 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2089 { 2090 kmutex_t *lock = &mac_srs->srs_lock; 2091 kcondvar_t *async = &mac_srs->srs_async; 2092 callb_cpr_t cprinfo; 2093 boolean_t bw_ctl_flag; 2094 2095 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2096 mutex_enter(lock); 2097 2098 start: 2099 for (;;) { 2100 bw_ctl_flag = B_FALSE; 2101 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2102 MAC_SRS_BW_LOCK(mac_srs); 2103 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2104 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2105 bw_ctl_flag = B_TRUE; 2106 MAC_SRS_BW_UNLOCK(mac_srs); 2107 } 2108 /* 2109 * The SRS_BW_ENFORCED flag may change since we have dropped 2110 * the mac_bw_lock. However the drain function can handle both 2111 * a drainable SRS or a bandwidth controlled SRS, and the 2112 * effect of scheduling a timeout is to wakeup the worker 2113 * thread which in turn will call the drain function. Since 2114 * we release the srs_lock atomically only in the cv_wait there 2115 * isn't a fear of waiting for ever. 2116 */ 2117 while (((mac_srs->srs_state & SRS_PROC) || 2118 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2119 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2120 !(mac_srs->srs_state & SRS_PAUSE)) { 2121 /* 2122 * If we have packets queued and we are here 2123 * because B/W control is in place, we better 2124 * schedule the worker wakeup after 1 tick 2125 * to see if bandwidth control can be relaxed. 2126 */ 2127 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2128 /* 2129 * We need to ensure that a timer is already 2130 * scheduled or we force schedule one for 2131 * later so that we can continue processing 2132 * after this quanta is over. 2133 */ 2134 mac_srs->srs_tid = timeout(mac_srs_fire, 2135 mac_srs, 1); 2136 } 2137 wait: 2138 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2139 cv_wait(async, lock); 2140 CALLB_CPR_SAFE_END(&cprinfo, lock); 2141 2142 if (mac_srs->srs_state & SRS_PAUSE) 2143 goto done; 2144 if (mac_srs->srs_state & SRS_PROC) 2145 goto wait; 2146 2147 if (mac_srs->srs_first != NULL && 2148 mac_srs->srs_type & SRST_BW_CONTROL) { 2149 MAC_SRS_BW_LOCK(mac_srs); 2150 if (mac_srs->srs_bw->mac_bw_state & 2151 SRS_BW_ENFORCED) { 2152 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2153 } 2154 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2155 SRS_BW_ENFORCED; 2156 MAC_SRS_BW_UNLOCK(mac_srs); 2157 } 2158 } 2159 2160 if (mac_srs->srs_state & SRS_PAUSE) 2161 goto done; 2162 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2163 } 2164 done: 2165 /* 2166 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2167 * from both hard and soft classifications and waits for such threads 2168 * to finish before signaling the worker. So at this point the only 2169 * thread left that could be competing with the worker is the poll 2170 * thread. In the case of Tx, there shouldn't be any thread holding 2171 * SRS_PROC at this point. 
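 *
 * In outline, the hand-off that follows is (illustrative summary
 * only):
 *
 *	claim SRS_PROC (unless the poll thread still owns it);
 *	mac_srs_worker_quiesce(mac_srs);
 *	wait for SRS_RESTART or SRS_CONDEMNED from the initiator;
 *	if restarted, mac_srs_worker_restart() and resume the main loop;
 *	otherwise finish the condemned processing and thread_exit().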
2172 */ 2173 if (!(mac_srs->srs_state & SRS_PROC)) { 2174 mac_srs->srs_state |= SRS_PROC; 2175 } else { 2176 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2177 /* 2178 * Poll thread still owns the SRS and is still running 2179 */ 2180 ASSERT((mac_srs->srs_poll_thr == NULL) || 2181 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2182 SRS_POLL_THR_OWNER)); 2183 } 2184 mac_srs_worker_quiesce(mac_srs); 2185 /* 2186 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2187 * of the quiesce operation 2188 */ 2189 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2190 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2191 2192 if (mac_srs->srs_state & SRS_RESTART) { 2193 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2194 mac_srs_worker_restart(mac_srs); 2195 mac_srs->srs_state &= ~SRS_PROC; 2196 goto start; 2197 } 2198 2199 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2200 mac_srs_worker_quiesce(mac_srs); 2201 2202 mac_srs->srs_state &= ~SRS_PROC; 2203 /* The macro drops the srs_lock */ 2204 CALLB_CPR_EXIT(&cprinfo); 2205 thread_exit(); 2206 } 2207 2208 /* 2209 * mac_rx_srs_subflow_process 2210 * 2211 * Receive side routine called from interrupt path when there are 2212 * sub flows present on this SRS. 2213 */ 2214 /* ARGSUSED */ 2215 void 2216 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2217 mblk_t *mp_chain, boolean_t loopback) 2218 { 2219 flow_entry_t *flent = NULL; 2220 flow_entry_t *prev_flent = NULL; 2221 mblk_t *mp = NULL; 2222 mblk_t *tail = NULL; 2223 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2224 mac_client_impl_t *mcip; 2225 2226 mcip = mac_srs->srs_mcip; 2227 ASSERT(mcip != NULL); 2228 2229 /* 2230 * We need to determine the SRS for every packet 2231 * by walking the flow table, if we don't get any, 2232 * then we proceed using the SRS we came with. 2233 */ 2234 mp = tail = mp_chain; 2235 while (mp != NULL) { 2236 2237 /* 2238 * We will increment the stats for the mactching subflow. 2239 * when we get the bytes/pkt count for the classified packets 2240 * later in mac_rx_srs_process. 2241 */ 2242 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2243 FLOW_INBOUND, &flent); 2244 2245 if (mp == mp_chain || flent == prev_flent) { 2246 if (prev_flent != NULL) 2247 FLOW_REFRELE(prev_flent); 2248 prev_flent = flent; 2249 flent = NULL; 2250 tail = mp; 2251 mp = mp->b_next; 2252 continue; 2253 } 2254 tail->b_next = NULL; 2255 /* 2256 * A null indicates, this is for the mac_srs itself. 2257 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2258 */ 2259 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2260 mac_rx_srs_process(arg, 2261 (mac_resource_handle_t)mac_srs, mp_chain, 2262 loopback); 2263 } else { 2264 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2265 prev_flent->fe_cb_arg2, mp_chain, loopback); 2266 FLOW_REFRELE(prev_flent); 2267 } 2268 prev_flent = flent; 2269 flent = NULL; 2270 mp_chain = mp; 2271 tail = mp; 2272 mp = mp->b_next; 2273 } 2274 /* Last chain */ 2275 ASSERT(mp_chain != NULL); 2276 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2277 mac_rx_srs_process(arg, 2278 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2279 } else { 2280 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2281 prev_flent->fe_cb_arg2, mp_chain, loopback); 2282 FLOW_REFRELE(prev_flent); 2283 } 2284 } 2285 2286 /* 2287 * mac_rx_srs_process 2288 * 2289 * Receive side routine called from the interrupt path. 2290 * 2291 * loopback is set to force a context switch on the loopback 2292 * path between MAC clients. 
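 *
 * Once the chain has been counted and enqueued, the hand-off decision
 * at the bottom of this routine is roughly (illustrative summary only):
 *
 *	if (!(mac_srs->srs_state & SRS_PROC)) {
 *		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT))
 *			cv_signal(&mac_srs->srs_async);
 *		else
 *			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
 *	}
 *
 * so loopback traffic always takes the worker hand-off (to bound stack
 * depth), while a latency optimized SRS drains inline in the calling
 * thread.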
2293 */ 2294 /* ARGSUSED */ 2295 void 2296 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2297 boolean_t loopback) 2298 { 2299 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2300 mblk_t *mp, *tail, *head; 2301 int count = 0; 2302 int count1; 2303 size_t sz = 0; 2304 size_t chain_sz, sz1; 2305 mac_bw_ctl_t *mac_bw; 2306 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2307 2308 /* 2309 * Set the tail, count and sz. We set the sz irrespective 2310 * of whether we are doing B/W control or not for the 2311 * purpose of updating the stats. 2312 */ 2313 mp = tail = mp_chain; 2314 while (mp != NULL) { 2315 tail = mp; 2316 count++; 2317 sz += msgdsize(mp); 2318 mp = mp->b_next; 2319 } 2320 2321 mutex_enter(&mac_srs->srs_lock); 2322 2323 if (loopback) { 2324 SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz); 2325 SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count); 2326 2327 } else { 2328 SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz); 2329 SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count); 2330 } 2331 2332 /* 2333 * If the SRS in already being processed; has been blanked; 2334 * can be processed by worker thread only; or the B/W limit 2335 * has been reached, then queue the chain and check if 2336 * worker thread needs to be awakend. 2337 */ 2338 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2339 mac_bw = mac_srs->srs_bw; 2340 ASSERT(mac_bw != NULL); 2341 mutex_enter(&mac_bw->mac_bw_lock); 2342 mac_bw->mac_bw_intr += sz; 2343 if (mac_bw->mac_bw_limit == 0) { 2344 /* zero bandwidth: drop all */ 2345 srs_rx->sr_stat.mrs_sdrops += count; 2346 mac_bw->mac_bw_drop_bytes += sz; 2347 mutex_exit(&mac_bw->mac_bw_lock); 2348 mutex_exit(&mac_srs->srs_lock); 2349 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2350 return; 2351 } else { 2352 if ((mac_bw->mac_bw_sz + sz) <= 2353 mac_bw->mac_bw_drop_threshold) { 2354 mutex_exit(&mac_bw->mac_bw_lock); 2355 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2356 tail, count, sz); 2357 } else { 2358 mp = mp_chain; 2359 chain_sz = 0; 2360 count1 = 0; 2361 tail = NULL; 2362 head = NULL; 2363 while (mp != NULL) { 2364 sz1 = msgdsize(mp); 2365 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2366 mac_bw->mac_bw_drop_threshold) 2367 break; 2368 chain_sz += sz1; 2369 count1++; 2370 tail = mp; 2371 mp = mp->b_next; 2372 } 2373 mutex_exit(&mac_bw->mac_bw_lock); 2374 if (tail != NULL) { 2375 head = tail->b_next; 2376 tail->b_next = NULL; 2377 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2378 mp_chain, tail, count1, chain_sz); 2379 sz -= chain_sz; 2380 count -= count1; 2381 } else { 2382 /* Can't pick up any */ 2383 head = mp_chain; 2384 } 2385 if (head != NULL) { 2386 /* Drop any packet over the threshold */ 2387 srs_rx->sr_stat.mrs_sdrops += count; 2388 mutex_enter(&mac_bw->mac_bw_lock); 2389 mac_bw->mac_bw_drop_bytes += sz; 2390 mutex_exit(&mac_bw->mac_bw_lock); 2391 freemsgchain(head); 2392 } 2393 } 2394 MAC_SRS_WORKER_WAKEUP(mac_srs); 2395 mutex_exit(&mac_srs->srs_lock); 2396 return; 2397 } 2398 } 2399 2400 /* 2401 * If the total number of packets queued in the SRS and 2402 * its associated soft rings exceeds the max allowed, 2403 * then drop the chain. If we are polling capable, this 2404 * shouldn't be happening. 
2405 */
2406 if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2407 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2408 mac_bw = mac_srs->srs_bw;
2409 srs_rx->sr_stat.mrs_sdrops += count;
2410 mutex_enter(&mac_bw->mac_bw_lock);
2411 mac_bw->mac_bw_drop_bytes += sz;
2412 mutex_exit(&mac_bw->mac_bw_lock);
2413 freemsgchain(mp_chain);
2414 mutex_exit(&mac_srs->srs_lock);
2415 return;
2416 }
2417
2418 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2419
2420 if (!(mac_srs->srs_state & SRS_PROC)) {
2421 /*
2422 * If we are coming via loopback or if we are not
2423 * optimizing for latency, we should signal the
2424 * worker thread.
2425 */
2426 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
2427 /*
2428 * For loopback, we need to let the worker take
2429 * over as we don't want to continue in the same
2430 * thread even if we can. This could lead to stack
2431 * overflows and may also end up using
2432 * resources (cpu) incorrectly.
2433 */
2434 cv_signal(&mac_srs->srs_async);
2435 } else {
2436 /*
2437 * Seems like no one is processing the SRS and
2438 * there is no backlog. We also inline process
2439 * our packet if it's a single packet in the non
2440 * latency optimized case (in the latency optimized
2441 * case, we inline process chains of any size).
2442 */
2443 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2444 }
2445 }
2446 mutex_exit(&mac_srs->srs_lock);
2447 }
2448
2449 /* TX SIDE ROUTINES (RUNTIME) */
2450
2451 /*
2452 * mac_tx_srs_no_desc
2453 *
2454 * This routine is called in the Tx single ring (default) mode
2455 * when the Tx ring runs out of descs.
2456 */
2457 mac_tx_cookie_t
2458 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2459 uint16_t flag, mblk_t **ret_mp)
2460 {
2461 mac_tx_cookie_t cookie = NULL;
2462 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2463 boolean_t wakeup_worker = B_TRUE;
2464 uint32_t tx_mode = srs_tx->st_mode;
2465 int cnt, sz;
2466 mblk_t *tail;
2467
2468 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2469 if (flag & MAC_DROP_ON_NO_DESC) {
2470 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2471 } else {
2472 if (mac_srs->srs_first != NULL)
2473 wakeup_worker = B_FALSE;
2474 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2475 if (flag & MAC_TX_NO_ENQUEUE) {
2476 /*
2477 * If TX_QUEUED is not set, queue the
2478 * packet and let mac_tx_srs_drain()
2479 * set the TX_BLOCKED bit for the
2480 * reasons explained above. Otherwise,
2481 * return the mblks.
2482 */
2483 if (wakeup_worker) {
2484 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2485 mp_chain, tail, cnt, sz);
2486 } else {
2487 MAC_TX_SET_NO_ENQUEUE(mac_srs,
2488 mp_chain, ret_mp, cookie);
2489 }
2490 } else {
2491 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2492 tail, cnt, sz, cookie);
2493 }
2494 if (wakeup_worker)
2495 cv_signal(&mac_srs->srs_async);
2496 }
2497 return (cookie);
2498 }
2499
2500 /*
2501 * mac_tx_srs_enqueue
2502 *
2503 * This routine is called when the Tx SRS is operating in either serializer
2504 * or bandwidth mode. In serializer mode, a packet will get enqueued
2505 * when a thread cannot enter the SRS exclusively. In bandwidth mode,
2506 * packets get queued if the allowed byte-count limit for a tick is
2507 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
2508 * MAC_TX_NO_ENQUEUE is set is different from when operating in either
2509 * the default mode or fanout mode. Here packets get dropped or
2510 * returned back to the caller only after hi-watermark worth of data
2511 * is queued.
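 *
 * For context, a MAC client that prefers drops over queueing might call
 * mac_tx() roughly as follows (a sketch only; see the mac_tx() prototype
 * in the MAC client headers for the authoritative signature):
 *
 *	mblk_t *ret = NULL;
 *	cookie = mac_tx(mch, mp_chain, fanout_hint,
 *	    MAC_DROP_ON_NO_DESC, &ret);
 *
 * while a client such as vsw or aggr that wants un-transmitted mblks
 * handed back would pass MAC_TX_NO_ENQUEUE and examine ret on return.
 * In this routine those flags only take effect once hi-watermark worth
 * of data has already been queued.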
2512 */ 2513 static mac_tx_cookie_t 2514 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2515 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2516 { 2517 mac_tx_cookie_t cookie = NULL; 2518 int cnt, sz; 2519 mblk_t *tail; 2520 boolean_t wakeup_worker = B_TRUE; 2521 2522 /* 2523 * Ignore fanout hint if we don't have multiple tx rings. 2524 */ 2525 if (!MAC_TX_SOFT_RINGS(mac_srs)) 2526 fanout_hint = 0; 2527 2528 if (mac_srs->srs_first != NULL) 2529 wakeup_worker = B_FALSE; 2530 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2531 if (flag & MAC_DROP_ON_NO_DESC) { 2532 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2533 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2534 } else { 2535 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2536 mp_chain, tail, cnt, sz); 2537 } 2538 } else if (flag & MAC_TX_NO_ENQUEUE) { 2539 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2540 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2541 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2542 ret_mp, cookie); 2543 } else { 2544 mp_chain->b_prev = (mblk_t *)fanout_hint; 2545 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2546 mp_chain, tail, cnt, sz); 2547 } 2548 } else { 2549 /* 2550 * If you are BW_ENFORCED, just enqueue the 2551 * packet. srs_worker will drain it at the 2552 * prescribed rate. Before enqueueing, save 2553 * the fanout hint. 2554 */ 2555 mp_chain->b_prev = (mblk_t *)fanout_hint; 2556 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2557 tail, cnt, sz, cookie); 2558 } 2559 if (wakeup_worker) 2560 cv_signal(&mac_srs->srs_async); 2561 return (cookie); 2562 } 2563 2564 /* 2565 * There are seven tx modes: 2566 * 2567 * 1) Default mode (SRS_TX_DEFAULT) 2568 * 2) Serialization mode (SRS_TX_SERIALIZE) 2569 * 3) Fanout mode (SRS_TX_FANOUT) 2570 * 4) Bandwdith mode (SRS_TX_BW) 2571 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2572 * 6) aggr Tx mode (SRS_TX_AGGR) 2573 * 7) aggr Tx bw mode (SRS_TX_BW_AGGR) 2574 * 2575 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2576 * based on the number of Tx rings requested for an SRS and whether 2577 * bandwidth control is requested or not. 2578 * 2579 * The default mode (i.e., no fanout/no bandwidth) is used when the 2580 * underlying NIC does not have Tx rings or just one Tx ring. In this mode, 2581 * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send(). 2582 * When the underlying Tx ring runs out of Tx descs, it starts queueing up 2583 * packets in SRS. When flow-control is relieved, the srs_worker drains 2584 * the queued packets and informs blocked clients to restart sending 2585 * packets. 2586 * 2587 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This 2588 * mode is used when the link has no Tx rings or only one Tx ring. 2589 * 2590 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2591 * Tx rings. Each Tx ring will have a soft ring associated with it. 2592 * These soft rings will be hung off the Tx SRS. Queueing if it happens 2593 * due to lack of Tx desc will be in individual soft ring (and not srs) 2594 * associated with Tx ring. 2595 * 2596 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2597 * only if bw is available. Otherwise the packets will be queued in 2598 * SRS. If fanout to multiple Tx rings is configured, the packets will 2599 * be fanned out among the soft rings associated with the Tx rings. 2600 * 2601 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. 
This routine
2602 * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2603 * belonging to a port on which the packet has to be sent. Aggr will
2604 * always have a pseudo Tx ring associated with it even when it is an
2605 * aggregation over a single NIC that has no Tx rings. Even in such a
2606 * case, the single pseudo Tx ring will have a soft ring associated with
2607 * it and the soft ring will hang off the SRS.
2608 *
2609 * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2610 * In this mode, the bandwidth is first applied on the outgoing packets
2611 * and later the mac_tx_aggr_mode() function is called to send the packet out
2612 * of one of the pseudo Tx rings.
2613 *
2614 * Three flags are used in srs_state for indicating flow control
2615 * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2616 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2617 * driver below.
2618 * SRS_TX_HIWAT indicates the packet count enqueued in the Tx SRS exceeded
2619 * the Tx hiwat and flow-control pressure is applied back to clients. The
2620 * clients expect a wakeup when flow-control is relieved.
2621 * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and the mblk
2622 * got returned back to the client either due to lack of Tx descs or due to bw
2623 * control reasons. The clients expect a wakeup when the condition is relieved.
2624 *
2625 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2626 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2627 * MAC_TX_NO_ENQUEUE.
2628 * MAC clients that do not want packets to be enqueued in the mac layer set
2629 * the MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2630 * Tx soft rings but instead get dropped when the NIC runs out of descs. The
2631 * behaviour of this flag is different when the Tx is running in serializer
2632 * or bandwidth mode. Under these (serializer, bandwidth) modes, the packets
2633 * get dropped when the Tx high watermark is reached.
2634 * There are some MAC clients like vsw, aggr that want the mblks to be
2635 * returned back to clients instead of being queued in the Tx SRS (or Tx soft
2636 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2637 * conditions. These clients call mac_tx() with the MAC_TX_NO_ENQUEUE flag set.
2638 * In the default and Tx fanout modes, the un-transmitted mblks will be
2639 * returned back to the clients when the driver runs out of Tx descs.
2640 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in the SRS (or
2641 * soft ring) so that the clients can be woken up when Tx descs become
2642 * available. When running in serializer or bandwidth mode,
2643 * SRS_TX_WAKEUP_CLIENT will be set when the tx hi-watermark is reached.
2644 */
2645
2646 mac_tx_func_t
2647 mac_tx_get_func(uint32_t mode)
2648 {
2649 return (mac_tx_mode_list[mode].mac_tx_func);
2650 }
2651
2652 /* ARGSUSED */
2653 static mac_tx_cookie_t
2654 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2655 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2656 {
2657 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2658 mac_tx_stats_t stats;
2659 mac_tx_cookie_t cookie = NULL;
2660
2661 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2662
2663 /* Regular case with a single Tx ring */
2664 /*
2665 * SRS_TX_BLOCKED is set when the underlying NIC runs
2666 * out of Tx descs and messages start getting
2667 * queued. It won't get reset until
2668 * mac_tx_srs_drain() completely drains out the
2669 * messages.
2670 */ 2671 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2672 /* Tx descs/resources not available */ 2673 mutex_enter(&mac_srs->srs_lock); 2674 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2675 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2676 flag, ret_mp); 2677 mutex_exit(&mac_srs->srs_lock); 2678 return (cookie); 2679 } 2680 /* 2681 * While we were computing mblk count, the 2682 * flow control condition got relieved. 2683 * Continue with the transmission. 2684 */ 2685 mutex_exit(&mac_srs->srs_lock); 2686 } 2687 2688 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2689 mp_chain, &stats); 2690 2691 /* 2692 * Multiple threads could be here sending packets. 2693 * Under such conditions, it is not possible to 2694 * automically set SRS_TX_BLOCKED bit to indicate 2695 * out of tx desc condition. To atomically set 2696 * this, we queue the returned packet and do 2697 * the setting of SRS_TX_BLOCKED in 2698 * mac_tx_srs_drain(). 2699 */ 2700 if (mp_chain != NULL) { 2701 mutex_enter(&mac_srs->srs_lock); 2702 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2703 mutex_exit(&mac_srs->srs_lock); 2704 return (cookie); 2705 } 2706 SRS_TX_STATS_UPDATE(mac_srs, &stats); 2707 2708 return (NULL); 2709 } 2710 2711 /* 2712 * mac_tx_serialize_mode 2713 * 2714 * This is an experimental mode implemented as per the request of PAE. 2715 * In this mode, all callers attempting to send a packet to the NIC 2716 * will get serialized. Only one thread at any time will access the 2717 * NIC to send the packet out. 2718 */ 2719 /* ARGSUSED */ 2720 static mac_tx_cookie_t 2721 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2722 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2723 { 2724 mac_tx_stats_t stats; 2725 mac_tx_cookie_t cookie = NULL; 2726 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2727 2728 /* Single ring, serialize below */ 2729 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2730 mutex_enter(&mac_srs->srs_lock); 2731 if ((mac_srs->srs_first != NULL) || 2732 (mac_srs->srs_state & SRS_PROC)) { 2733 /* 2734 * In serialization mode, queue all packets until 2735 * TX_HIWAT is set. 2736 * If drop bit is set, drop if TX_HIWAT is set. 2737 * If no_enqueue is set, still enqueue until hiwat 2738 * is set and return mblks after TX_HIWAT is set. 2739 */ 2740 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2741 flag, NULL, ret_mp); 2742 mutex_exit(&mac_srs->srs_lock); 2743 return (cookie); 2744 } 2745 /* 2746 * No packets queued, nothing on proc and no flow 2747 * control condition. Fast-path, ok. Do inline 2748 * processing. 2749 */ 2750 mac_srs->srs_state |= SRS_PROC; 2751 mutex_exit(&mac_srs->srs_lock); 2752 2753 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2754 mp_chain, &stats); 2755 2756 mutex_enter(&mac_srs->srs_lock); 2757 mac_srs->srs_state &= ~SRS_PROC; 2758 if (mp_chain != NULL) { 2759 cookie = mac_tx_srs_enqueue(mac_srs, 2760 mp_chain, flag, NULL, ret_mp); 2761 } 2762 if (mac_srs->srs_first != NULL) { 2763 /* 2764 * We processed inline our packet and a new 2765 * packet/s got queued while we were 2766 * processing. Wakeup srs worker 2767 */ 2768 cv_signal(&mac_srs->srs_async); 2769 } 2770 mutex_exit(&mac_srs->srs_lock); 2771 2772 if (cookie == NULL) 2773 SRS_TX_STATS_UPDATE(mac_srs, &stats); 2774 2775 return (cookie); 2776 } 2777 2778 /* 2779 * mac_tx_fanout_mode 2780 * 2781 * In this mode, the SRS will have access to multiple Tx rings to send 2782 * the packet out. 
The fanout hint that is passed as an argument is 2783 * used to find an appropriate ring to fanout the traffic. Each Tx 2784 * ring, in turn, will have a soft ring associated with it. If a Tx 2785 * ring runs out of Tx desc's the returned packet will be queued in 2786 * the soft ring associated with that Tx ring. The srs itself will not 2787 * queue any packets. 2788 */ 2789 2790 #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2791 index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count), \ 2792 softring = mac_srs->srs_tx_soft_rings[index]; \ 2793 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2794 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2795 } 2796 2797 static mac_tx_cookie_t 2798 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2799 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2800 { 2801 mac_soft_ring_t *softring; 2802 uint64_t hash; 2803 uint_t index; 2804 mac_tx_cookie_t cookie = NULL; 2805 2806 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT || 2807 mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT); 2808 if (fanout_hint != 0) { 2809 /* 2810 * The hint is specified by the caller, simply pass the 2811 * whole chain to the soft ring. 2812 */ 2813 hash = HASH_HINT(fanout_hint); 2814 MAC_TX_SOFT_RING_PROCESS(mp_chain); 2815 } else { 2816 mblk_t *last_mp, *cur_mp, *sub_chain; 2817 uint64_t last_hash = 0; 2818 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2819 2820 /* 2821 * Compute the hash from the contents (headers) of the 2822 * packets of the mblk chain. Split the chains into 2823 * subchains of the same conversation. 2824 * 2825 * Since there may be more than one ring used for 2826 * sub-chains of the same call, and since the caller 2827 * does not maintain per conversation state since it 2828 * passed a zero hint, unsent subchains will be 2829 * dropped. 2830 */ 2831 2832 flag |= MAC_DROP_ON_NO_DESC; 2833 ret_mp = NULL; 2834 2835 ASSERT(ret_mp == NULL); 2836 2837 sub_chain = NULL; 2838 last_mp = NULL; 2839 2840 for (cur_mp = mp_chain; cur_mp != NULL; 2841 cur_mp = cur_mp->b_next) { 2842 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2843 B_TRUE); 2844 if (last_hash != 0 && hash != last_hash) { 2845 /* 2846 * Starting a different subchain, send current 2847 * chain out. 2848 */ 2849 ASSERT(last_mp != NULL); 2850 last_mp->b_next = NULL; 2851 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2852 sub_chain = NULL; 2853 } 2854 2855 /* add packet to subchain */ 2856 if (sub_chain == NULL) 2857 sub_chain = cur_mp; 2858 last_mp = cur_mp; 2859 last_hash = hash; 2860 } 2861 2862 if (sub_chain != NULL) { 2863 /* send last subchain */ 2864 ASSERT(last_mp != NULL); 2865 last_mp->b_next = NULL; 2866 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2867 } 2868 2869 cookie = NULL; 2870 } 2871 2872 return (cookie); 2873 } 2874 2875 /* 2876 * mac_tx_bw_mode 2877 * 2878 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2879 * only if bw is available. Otherwise the packets will be queued in 2880 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2881 * out to a Tx rings. 
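 *
 * In outline, the per-tick accounting performed below is (illustrative
 * summary only; the code that follows is authoritative):
 *
 *	now = ddi_get_lbolt();
 *	if (mac_bw_curr_time != now) {
 *		mac_bw_curr_time = now;		(new tick, reset usage)
 *		mac_bw_used = 0;
 *	} else if (mac_bw_used > mac_bw_limit) {
 *		mac_bw_state |= SRS_BW_ENFORCED;
 *		queue the chain and wake the worker, which arms a
 *		one tick timeout to resume transmission;
 *	}
 *	mac_bw_used += sz;		(charge the bytes being sent)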
2882 */ 2883 static mac_tx_cookie_t 2884 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2885 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2886 { 2887 int cnt, sz; 2888 mblk_t *tail; 2889 mac_tx_cookie_t cookie = NULL; 2890 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2891 clock_t now; 2892 2893 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2894 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2895 mutex_enter(&mac_srs->srs_lock); 2896 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2897 /* 2898 * zero bandwidth, no traffic is sent: drop the packets, 2899 * or return the whole chain if the caller requests all 2900 * unsent packets back. 2901 */ 2902 if (flag & MAC_TX_NO_ENQUEUE) { 2903 cookie = (mac_tx_cookie_t)mac_srs; 2904 *ret_mp = mp_chain; 2905 } else { 2906 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2907 } 2908 mutex_exit(&mac_srs->srs_lock); 2909 return (cookie); 2910 } else if ((mac_srs->srs_first != NULL) || 2911 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2912 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2913 fanout_hint, ret_mp); 2914 mutex_exit(&mac_srs->srs_lock); 2915 return (cookie); 2916 } 2917 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2918 now = ddi_get_lbolt(); 2919 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 2920 mac_srs->srs_bw->mac_bw_curr_time = now; 2921 mac_srs->srs_bw->mac_bw_used = 0; 2922 } else if (mac_srs->srs_bw->mac_bw_used > 2923 mac_srs->srs_bw->mac_bw_limit) { 2924 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2925 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2926 mp_chain, tail, cnt, sz); 2927 /* 2928 * Wakeup worker thread. Note that worker 2929 * thread has to be woken up so that it 2930 * can fire up the timer to be woken up 2931 * on the next tick. Also once 2932 * BW_ENFORCED is set, it can only be 2933 * reset by srs_worker thread. Until then 2934 * all packets will get queued up in SRS 2935 * and hence this this code path won't be 2936 * entered until BW_ENFORCED is reset. 2937 */ 2938 cv_signal(&mac_srs->srs_async); 2939 mutex_exit(&mac_srs->srs_lock); 2940 return (cookie); 2941 } 2942 2943 mac_srs->srs_bw->mac_bw_used += sz; 2944 mutex_exit(&mac_srs->srs_lock); 2945 2946 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2947 mac_soft_ring_t *softring; 2948 uint_t indx, hash; 2949 2950 hash = HASH_HINT(fanout_hint); 2951 indx = COMPUTE_INDEX(hash, 2952 mac_srs->srs_tx_ring_count); 2953 softring = mac_srs->srs_tx_soft_rings[indx]; 2954 return (mac_tx_soft_ring_process(softring, mp_chain, flag, 2955 ret_mp)); 2956 } else if (srs_tx->st_mode == SRS_TX_BW_AGGR) { 2957 return (mac_tx_aggr_mode(mac_srs, mp_chain, 2958 fanout_hint, flag, ret_mp)); 2959 } else { 2960 mac_tx_stats_t stats; 2961 2962 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2963 mp_chain, &stats); 2964 2965 if (mp_chain != NULL) { 2966 mutex_enter(&mac_srs->srs_lock); 2967 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2968 if (mac_srs->srs_bw->mac_bw_used > sz) 2969 mac_srs->srs_bw->mac_bw_used -= sz; 2970 else 2971 mac_srs->srs_bw->mac_bw_used = 0; 2972 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2973 fanout_hint, ret_mp); 2974 mutex_exit(&mac_srs->srs_lock); 2975 return (cookie); 2976 } 2977 SRS_TX_STATS_UPDATE(mac_srs, &stats); 2978 2979 return (NULL); 2980 } 2981 } 2982 2983 /* 2984 * mac_tx_aggr_mode 2985 * 2986 * This routine invokes an aggr function, aggr_find_tx_ring(), to find 2987 * a (pseudo) Tx ring belonging to a port on which the packet has to 2988 * be sent. 
aggr_find_tx_ring() first finds the outgoing port based on 2989 * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick 2990 * a Tx ring from the selected port. 2991 * 2992 * Note that a port can be deleted from the aggregation. In such a case, 2993 * the aggregation layer first separates the port from the rest of the 2994 * ports making sure that port (and thus any Tx rings associated with 2995 * it) won't get selected in the call to aggr_find_tx_ring() function. 2996 * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring 2997 * handles one by one which in turn will quiesce the Tx SRS and remove 2998 * the soft ring associated with the pseudo Tx ring. Unlike Rx side 2999 * where a cookie is used to protect against mac_rx_ring() calls on 3000 * rings that have been removed, no such cookie is needed on the Tx 3001 * side as the pseudo Tx ring won't be available anymore to 3002 * aggr_find_tx_ring() once the port has been removed. 3003 */ 3004 static mac_tx_cookie_t 3005 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 3006 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 3007 { 3008 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3009 mac_tx_ring_fn_t find_tx_ring_fn; 3010 mac_ring_handle_t ring = NULL; 3011 void *arg; 3012 mac_soft_ring_t *sringp; 3013 3014 find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn; 3015 arg = srs_tx->st_capab_aggr.mca_arg; 3016 if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL) 3017 return (NULL); 3018 sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index]; 3019 return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp)); 3020 } 3021 3022 void 3023 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie) 3024 { 3025 mac_cb_t *mcb; 3026 mac_tx_notify_cb_t *mtnfp; 3027 3028 /* Wakeup callback registered clients */ 3029 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3030 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3031 mcb = mcb->mcb_nextp) { 3032 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3033 mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie); 3034 } 3035 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3036 &mcip->mci_tx_notify_cb_list); 3037 } 3038 3039 /* ARGSUSED */ 3040 void 3041 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 3042 { 3043 mblk_t *head, *tail; 3044 size_t sz; 3045 uint32_t tx_mode; 3046 uint_t saved_pkt_count; 3047 mac_tx_stats_t stats; 3048 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3049 clock_t now; 3050 3051 saved_pkt_count = 0; 3052 ASSERT(mutex_owned(&mac_srs->srs_lock)); 3053 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 3054 3055 mac_srs->srs_state |= SRS_PROC; 3056 3057 tx_mode = srs_tx->st_mode; 3058 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 3059 if (mac_srs->srs_first != NULL) { 3060 head = mac_srs->srs_first; 3061 tail = mac_srs->srs_last; 3062 saved_pkt_count = mac_srs->srs_count; 3063 mac_srs->srs_first = NULL; 3064 mac_srs->srs_last = NULL; 3065 mac_srs->srs_count = 0; 3066 mutex_exit(&mac_srs->srs_lock); 3067 3068 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3069 head, &stats); 3070 3071 mutex_enter(&mac_srs->srs_lock); 3072 if (head != NULL) { 3073 /* Device out of tx desc, set block */ 3074 if (head->b_next == NULL) 3075 VERIFY(head == tail); 3076 tail->b_next = mac_srs->srs_first; 3077 mac_srs->srs_first = head; 3078 mac_srs->srs_count += 3079 (saved_pkt_count - stats.mts_opackets); 3080 if (mac_srs->srs_last == NULL) 3081 mac_srs->srs_last = tail; 3082 MAC_TX_SRS_BLOCK(mac_srs, 
head); 3083 } else { 3084 srs_tx->st_woken_up = B_FALSE; 3085 SRS_TX_STATS_UPDATE(mac_srs, &stats); 3086 } 3087 } 3088 } else if (tx_mode == SRS_TX_BW) { 3089 /* 3090 * We are here because the timer fired and we have some data 3091 * to tranmit. Also mac_tx_srs_worker should have reset 3092 * SRS_BW_ENFORCED flag 3093 */ 3094 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 3095 head = tail = mac_srs->srs_first; 3096 while (mac_srs->srs_first != NULL) { 3097 tail = mac_srs->srs_first; 3098 tail->b_prev = NULL; 3099 mac_srs->srs_first = tail->b_next; 3100 if (mac_srs->srs_first == NULL) 3101 mac_srs->srs_last = NULL; 3102 mac_srs->srs_count--; 3103 sz = msgdsize(tail); 3104 mac_srs->srs_size -= sz; 3105 saved_pkt_count++; 3106 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 3107 3108 if (mac_srs->srs_bw->mac_bw_used < 3109 mac_srs->srs_bw->mac_bw_limit) 3110 continue; 3111 3112 now = ddi_get_lbolt(); 3113 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3114 mac_srs->srs_bw->mac_bw_curr_time = now; 3115 mac_srs->srs_bw->mac_bw_used = sz; 3116 continue; 3117 } 3118 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3119 break; 3120 } 3121 3122 ASSERT((head == NULL && tail == NULL) || 3123 (head != NULL && tail != NULL)); 3124 if (tail != NULL) { 3125 tail->b_next = NULL; 3126 mutex_exit(&mac_srs->srs_lock); 3127 3128 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3129 head, &stats); 3130 3131 mutex_enter(&mac_srs->srs_lock); 3132 if (head != NULL) { 3133 uint_t size_sent; 3134 3135 /* Device out of tx desc, set block */ 3136 if (head->b_next == NULL) 3137 VERIFY(head == tail); 3138 tail->b_next = mac_srs->srs_first; 3139 mac_srs->srs_first = head; 3140 mac_srs->srs_count += 3141 (saved_pkt_count - stats.mts_opackets); 3142 if (mac_srs->srs_last == NULL) 3143 mac_srs->srs_last = tail; 3144 size_sent = sz - stats.mts_obytes; 3145 mac_srs->srs_size += size_sent; 3146 mac_srs->srs_bw->mac_bw_sz += size_sent; 3147 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 3148 mac_srs->srs_bw->mac_bw_used -= 3149 size_sent; 3150 } else { 3151 mac_srs->srs_bw->mac_bw_used = 0; 3152 } 3153 MAC_TX_SRS_BLOCK(mac_srs, head); 3154 } else { 3155 srs_tx->st_woken_up = B_FALSE; 3156 SRS_TX_STATS_UPDATE(mac_srs, &stats); 3157 } 3158 } 3159 } else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) { 3160 mblk_t *prev; 3161 uint64_t hint; 3162 3163 /* 3164 * We are here because the timer fired and we 3165 * have some quota to tranmit. 
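 *
 * The caller's fanout hint is stashed in the b_prev field when a chain
 * gets enqueued (see mac_tx_srs_enqueue()). In outline, the loop below
 * walks the queue and cuts it into sub-chains wherever the hint changes
 * (illustrative summary only):
 *
 *	hint = (ulong_t)tail->b_prev;
 *	if (hint differs from the previous mblk's hint) {
 *		terminate the current sub-chain;
 *		TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
 *		start a new sub-chain at the current mblk;
 *	}
 *
 * so packets belonging to the same conversation stay on the same soft
 * ring while the per-tick byte budget is still honored.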
3166 */ 3167 prev = NULL; 3168 head = tail = mac_srs->srs_first; 3169 while (mac_srs->srs_first != NULL) { 3170 tail = mac_srs->srs_first; 3171 mac_srs->srs_first = tail->b_next; 3172 if (mac_srs->srs_first == NULL) 3173 mac_srs->srs_last = NULL; 3174 mac_srs->srs_count--; 3175 sz = msgdsize(tail); 3176 mac_srs->srs_size -= sz; 3177 mac_srs->srs_bw->mac_bw_used += sz; 3178 if (prev == NULL) 3179 hint = (ulong_t)tail->b_prev; 3180 if (hint != (ulong_t)tail->b_prev) { 3181 prev->b_next = NULL; 3182 mutex_exit(&mac_srs->srs_lock); 3183 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3184 head = tail; 3185 hint = (ulong_t)tail->b_prev; 3186 mutex_enter(&mac_srs->srs_lock); 3187 } 3188 3189 prev = tail; 3190 tail->b_prev = NULL; 3191 if (mac_srs->srs_bw->mac_bw_used < 3192 mac_srs->srs_bw->mac_bw_limit) 3193 continue; 3194 3195 now = ddi_get_lbolt(); 3196 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3197 mac_srs->srs_bw->mac_bw_curr_time = now; 3198 mac_srs->srs_bw->mac_bw_used = 0; 3199 continue; 3200 } 3201 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3202 break; 3203 } 3204 ASSERT((head == NULL && tail == NULL) || 3205 (head != NULL && tail != NULL)); 3206 if (tail != NULL) { 3207 tail->b_next = NULL; 3208 mutex_exit(&mac_srs->srs_lock); 3209 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3210 mutex_enter(&mac_srs->srs_lock); 3211 } 3212 } 3213 /* 3214 * SRS_TX_FANOUT case not considered here because packets 3215 * won't be queued in the SRS for this case. Packets will 3216 * be sent directly to soft rings underneath and if there 3217 * is any queueing at all, it would be in Tx side soft 3218 * rings. 3219 */ 3220 3221 /* 3222 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3223 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3224 */ 3225 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3226 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3227 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3228 boolean_t wakeup_required = B_FALSE; 3229 3230 if (mac_srs->srs_state & 3231 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3232 wakeup_required = B_TRUE; 3233 } 3234 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3235 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3236 mutex_exit(&mac_srs->srs_lock); 3237 if (wakeup_required) { 3238 mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs); 3239 /* 3240 * If the client is not the primary MAC client, then we 3241 * need to send the notification to the clients upper 3242 * MAC, i.e. mci_upper_mip. 3243 */ 3244 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3245 mcip->mci_upper_mip : mcip->mci_mip); 3246 } 3247 mutex_enter(&mac_srs->srs_lock); 3248 } 3249 mac_srs->srs_state &= ~SRS_PROC; 3250 } 3251 3252 /* 3253 * Given a packet, get the flow_entry that identifies the flow 3254 * to which that packet belongs. The flow_entry will contain 3255 * the transmit function to be used to send the packet. If the 3256 * function returns NULL, the packet should be sent using the 3257 * underlying NIC. 3258 */ 3259 static flow_entry_t * 3260 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3261 { 3262 flow_entry_t *flent = NULL; 3263 mac_client_impl_t *mcip; 3264 int err; 3265 3266 /* 3267 * Do classification on the packet. 3268 */ 3269 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3270 if (err != 0) 3271 return (NULL); 3272 3273 /* 3274 * This flent might just be an additional one on the MAC client, 3275 * i.e. for classification purposes (different fdesc), however 3276 * the resources, SRS et. 
al., are in the mci_flent, so if 3277 * this isn't the mci_flent, we need to get it. 3278 */ 3279 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3280 FLOW_REFRELE(flent); 3281 flent = mcip->mci_flent; 3282 FLOW_TRY_REFHOLD(flent, err); 3283 if (err != 0) 3284 return (NULL); 3285 } 3286 3287 return (flent); 3288 } 3289 3290 /* 3291 * This macro is only meant to be used by mac_tx_send(). 3292 */ 3293 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3294 if (vid_check) { \ 3295 int err = 0; \ 3296 \ 3297 MAC_VID_CHECK(src_mcip, (mp), err); \ 3298 if (err != 0) { \ 3299 freemsg((mp)); \ 3300 (mp) = next; \ 3301 oerrors++; \ 3302 continue; \ 3303 } \ 3304 } \ 3305 if (add_tag) { \ 3306 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3307 if ((mp) == NULL) { \ 3308 (mp) = next; \ 3309 oerrors++; \ 3310 continue; \ 3311 } \ 3312 } \ 3313 } 3314 3315 mblk_t * 3316 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3317 mac_tx_stats_t *stats) 3318 { 3319 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3320 mac_impl_t *mip = src_mcip->mci_mip; 3321 uint_t obytes = 0, opackets = 0, oerrors = 0; 3322 mblk_t *mp = NULL, *next; 3323 boolean_t vid_check, add_tag; 3324 uint16_t vid = 0; 3325 3326 if (mip->mi_nclients > 1) { 3327 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3328 add_tag = MAC_TAG_NEEDED(src_mcip); 3329 if (add_tag) 3330 vid = mac_client_vid(mch); 3331 } else { 3332 ASSERT(mip->mi_nclients == 1); 3333 vid_check = add_tag = B_FALSE; 3334 } 3335 3336 /* 3337 * Fastpath: if there's only one client, we simply send 3338 * the packet down to the underlying NIC. 3339 */ 3340 if (mip->mi_nactiveclients == 1) { 3341 DTRACE_PROBE2(fastpath, 3342 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3343 3344 mp = mp_chain; 3345 while (mp != NULL) { 3346 next = mp->b_next; 3347 mp->b_next = NULL; 3348 opackets++; 3349 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3350 msgdsize(mp)); 3351 3352 CHECK_VID_AND_ADD_TAG(mp); 3353 MAC_TX(mip, ring, mp, src_mcip); 3354 3355 /* 3356 * If the driver is out of descriptors and does a 3357 * partial send it will return a chain of unsent 3358 * mblks. Adjust the accounting stats. 3359 */ 3360 if (mp != NULL) { 3361 opackets--; 3362 obytes -= msgdsize(mp); 3363 mp->b_next = next; 3364 break; 3365 } 3366 mp = next; 3367 } 3368 goto done; 3369 } 3370 3371 /* 3372 * No fastpath, we either have more than one MAC client 3373 * defined on top of the same MAC, or one or more MAC 3374 * client promiscuous callbacks. 3375 */ 3376 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3377 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3378 3379 mp = mp_chain; 3380 while (mp != NULL) { 3381 flow_entry_t *dst_flow_ent; 3382 void *flow_cookie; 3383 size_t pkt_size; 3384 mblk_t *mp1; 3385 3386 next = mp->b_next; 3387 mp->b_next = NULL; 3388 opackets++; 3389 pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); 3390 obytes += pkt_size; 3391 CHECK_VID_AND_ADD_TAG(mp); 3392 3393 /* 3394 * Find the destination. 
3395 */ 3396 dst_flow_ent = mac_tx_classify(mip, mp); 3397 3398 if (dst_flow_ent != NULL) { 3399 size_t hdrsize; 3400 int err = 0; 3401 3402 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3403 struct ether_vlan_header *evhp = 3404 (struct ether_vlan_header *)mp->b_rptr; 3405 3406 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3407 hdrsize = sizeof (*evhp); 3408 else 3409 hdrsize = sizeof (struct ether_header); 3410 } else { 3411 mac_header_info_t mhi; 3412 3413 err = mac_header_info((mac_handle_t)mip, 3414 mp, &mhi); 3415 if (err == 0) 3416 hdrsize = mhi.mhi_hdrsize; 3417 } 3418 3419 /* 3420 * Got a matching flow. It's either another 3421 * MAC client, or a broadcast/multicast flow. 3422 * Make sure the packet size is within the 3423 * allowed size. If not drop the packet and 3424 * move to next packet. 3425 */ 3426 if (err != 0 || 3427 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3428 oerrors++; 3429 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3430 mblk_t *, mp); 3431 freemsg(mp); 3432 mp = next; 3433 FLOW_REFRELE(dst_flow_ent); 3434 continue; 3435 } 3436 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3437 if (flow_cookie != NULL) { 3438 /* 3439 * The vnic_bcast_send function expects 3440 * to receive the sender MAC client 3441 * as value for arg2. 3442 */ 3443 mac_bcast_send(flow_cookie, src_mcip, mp, 3444 B_TRUE); 3445 } else { 3446 /* 3447 * loopback the packet to a local MAC 3448 * client. We force a context switch 3449 * if both source and destination MAC 3450 * clients are used by IP, i.e. 3451 * bypass is set. 3452 */ 3453 boolean_t do_switch; 3454 mac_client_impl_t *dst_mcip = 3455 dst_flow_ent->fe_mcip; 3456 3457 /* 3458 * Check if there are promiscuous mode 3459 * callbacks defined. This check is 3460 * done here in the 'else' case and 3461 * not in other cases because this 3462 * path is for local loopback 3463 * communication which does not go 3464 * through MAC_TX(). For paths that go 3465 * through MAC_TX(), the promisc_list 3466 * check is done inside the MAC_TX() 3467 * macro. 3468 */ 3469 if (mip->mi_promisc_list != NULL) 3470 mac_promisc_dispatch(mip, mp, src_mcip); 3471 3472 do_switch = ((src_mcip->mci_state_flags & 3473 dst_mcip->mci_state_flags & 3474 MCIS_CLIENT_POLL_CAPABLE) != 0); 3475 3476 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3477 (dst_flow_ent->fe_cb_fn)( 3478 dst_flow_ent->fe_cb_arg1, 3479 dst_flow_ent->fe_cb_arg2, 3480 mp1, do_switch); 3481 } 3482 } 3483 FLOW_REFRELE(dst_flow_ent); 3484 } else { 3485 /* 3486 * Unknown destination, send via the underlying 3487 * NIC. 3488 */ 3489 MAC_TX(mip, ring, mp, src_mcip); 3490 if (mp != NULL) { 3491 /* 3492 * Adjust for the last packet that 3493 * could not be transmitted 3494 */ 3495 opackets--; 3496 obytes -= pkt_size; 3497 mp->b_next = next; 3498 break; 3499 } 3500 } 3501 mp = next; 3502 } 3503 3504 done: 3505 stats->mts_obytes = obytes; 3506 stats->mts_opackets = opackets; 3507 stats->mts_oerrors = oerrors; 3508 return (mp); 3509 } 3510 3511 /* 3512 * mac_tx_srs_ring_present 3513 * 3514 * Returns whether the specified ring is part of the specified SRS. 
3515 */ 3516 boolean_t 3517 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3518 { 3519 int i; 3520 mac_soft_ring_t *soft_ring; 3521 3522 if (srs->srs_tx.st_arg2 == tx_ring) 3523 return (B_TRUE); 3524 3525 for (i = 0; i < srs->srs_tx_ring_count; i++) { 3526 soft_ring = srs->srs_tx_soft_rings[i]; 3527 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3528 return (B_TRUE); 3529 } 3530 3531 return (B_FALSE); 3532 } 3533 3534 /* 3535 * mac_tx_srs_get_soft_ring 3536 * 3537 * Returns the TX soft ring associated with the given ring, if present. 3538 */ 3539 mac_soft_ring_t * 3540 mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3541 { 3542 int i; 3543 mac_soft_ring_t *soft_ring; 3544 3545 if (srs->srs_tx.st_arg2 == tx_ring) 3546 return (NULL); 3547 3548 for (i = 0; i < srs->srs_tx_ring_count; i++) { 3549 soft_ring = srs->srs_tx_soft_rings[i]; 3550 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3551 return (soft_ring); 3552 } 3553 3554 return (NULL); 3555 } 3556 3557 /* 3558 * mac_tx_srs_wakeup 3559 * 3560 * Called when Tx desc become available. Wakeup the appropriate worker 3561 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3562 * state field. 3563 */ 3564 void 3565 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3566 { 3567 int i; 3568 mac_soft_ring_t *sringp; 3569 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3570 3571 mutex_enter(&mac_srs->srs_lock); 3572 /* 3573 * srs_tx_ring_count == 0 is the single ring mode case. In 3574 * this mode, there will not be Tx soft rings associated 3575 * with the SRS. 3576 */ 3577 if (!MAC_TX_SOFT_RINGS(mac_srs)) { 3578 if (srs_tx->st_arg2 == ring && 3579 mac_srs->srs_state & SRS_TX_BLOCKED) { 3580 mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3581 srs_tx->st_stat.mts_unblockcnt++; 3582 cv_signal(&mac_srs->srs_async); 3583 } 3584 /* 3585 * A wakeup can come before tx_srs_drain() could 3586 * grab srs lock and set SRS_TX_BLOCKED. So 3587 * always set woken_up flag when we come here. 3588 */ 3589 srs_tx->st_woken_up = B_TRUE; 3590 mutex_exit(&mac_srs->srs_lock); 3591 return; 3592 } 3593 3594 /* 3595 * If you are here, it is for FANOUT, BW_FANOUT, 3596 * AGGR_MODE or AGGR_BW_MODE case 3597 */ 3598 for (i = 0; i < mac_srs->srs_tx_ring_count; i++) { 3599 sringp = mac_srs->srs_tx_soft_rings[i]; 3600 mutex_enter(&sringp->s_ring_lock); 3601 if (sringp->s_ring_tx_arg2 == ring) { 3602 if (sringp->s_ring_state & S_RING_BLOCK) { 3603 sringp->s_ring_state &= ~S_RING_BLOCK; 3604 sringp->s_st_stat.mts_unblockcnt++; 3605 cv_signal(&sringp->s_ring_async); 3606 } 3607 sringp->s_ring_tx_woken_up = B_TRUE; 3608 } 3609 mutex_exit(&sringp->s_ring_lock); 3610 } 3611 mutex_exit(&mac_srs->srs_lock); 3612 } 3613 3614 /* 3615 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3616 * the blocked clients again. 3617 */ 3618 void 3619 mac_tx_notify(mac_impl_t *mip) 3620 { 3621 i_mac_notify(mip, MAC_NOTE_TX); 3622 } 3623 3624 /* 3625 * RX SOFTRING RELATED FUNCTIONS 3626 * 3627 * These functions really belong in mac_soft_ring.c and here for 3628 * a short period. 3629 */ 3630 3631 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3632 /* \ 3633 * Enqueue our mblk chain. 
3634      */                                                                \
3635     ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));                         \
3636                                                                        \
3637     if ((ringp)->s_ring_last != NULL)                                  \
3638         (ringp)->s_ring_last->b_next = (mp);                           \
3639     else                                                               \
3640         (ringp)->s_ring_first = (mp);                                  \
3641     (ringp)->s_ring_last = (tail);                                     \
3642     (ringp)->s_ring_count += (cnt);                                    \
3643     ASSERT((ringp)->s_ring_count > 0);                                 \
3644     if ((ringp)->s_ring_type & ST_RING_BW_CTL) {                       \
3645         (ringp)->s_ring_size += sz;                                    \
3646     }                                                                  \
3647 }
3648 
3649 /*
3650  * Default entry point to deliver a packet chain to a MAC client.
3651  * If the MAC client has flows, do the classification with these
3652  * flows as well.
3653  */
3654 /* ARGSUSED */
3655 void
3656 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3657     mac_header_info_t *arg3)
3658 {
3659     mac_client_impl_t *mcip = arg1;
3660 
3661     if (mcip->mci_nvids == 1 &&
3662         !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3663         /*
3664          * If the client has exactly one VID associated with it
3665          * and stripping of the VLAN header is not disabled,
3666          * remove the VLAN tag from the packet before
3667          * passing it on to the client's receive callback.
3668          * Note that this needs to be done after we dispatch
3669          * the packet to the promiscuous listeners of the
3670          * client, since they expect to see the whole
3671          * frame including the VLAN headers.
3672          */
3673         mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3674     }
3675 
3676     mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3677 }
3678 
3679 /*
3680  * mac_rx_soft_ring_process
3681  *
3682  * Process a chain for a given soft ring. If the number of packets queued
3683  * in the SRS and its associated soft rings (including this one) is
3684  * very small (tracked by sr_poll_pkt_cnt), then allow the entering
3685  * thread (interrupt or poll thread) to do inline processing. This
3686  * helps keep the latency down under low load.
3687  *
3688  * The proc and arg for each mblk are already stored in the mblk in
3689  * the appropriate places.
3690  */
3691 /* ARGSUSED */
3692 void
3693 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3694     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3695 {
3696     mac_direct_rx_t proc;
3697     void *arg1;
3698     mac_resource_handle_t arg2;
3699     mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3700 
3701     ASSERT(ringp != NULL);
3702     ASSERT(mp_chain != NULL);
3703     ASSERT(tail != NULL);
3704     ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3705 
3706     mutex_enter(&ringp->s_ring_lock);
3707     ringp->s_ring_total_inpkt += cnt;
3708     ringp->s_ring_total_rbytes += sz;
3709     if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3710         !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3711         /* If on processor or blanking on, then enqueue and return */
3712         if (ringp->s_ring_state & S_RING_BLANK ||
3713             ringp->s_ring_state & S_RING_PROC) {
3714             SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3715             mutex_exit(&ringp->s_ring_lock);
3716             return;
3717         }
3718         proc = ringp->s_ring_rx_func;
3719         arg1 = ringp->s_ring_rx_arg1;
3720         arg2 = ringp->s_ring_rx_arg2;
3721         /*
3722          * See if anything is already queued. If we are the
3723          * first packet, do inline processing, else queue the
3724          * packet and do the drain.
3725          */
3726         if (ringp->s_ring_first == NULL) {
3727             /*
3728              * Fast-path, ok to process and nothing queued.
3729              */
3730             ringp->s_ring_run = curthread;
3731             ringp->s_ring_state |= (S_RING_PROC);
3732 
3733             mutex_exit(&ringp->s_ring_lock);
3734 
3735             /*
3736              * We have a chain of one packet, so
3737              * go through this fast path.
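             * Note: proc/arg1/arg2 were captured above from the soft
             * ring; they are the client's direct receive callback and
             * its arguments (for a typical client this points at
             * mac_rx_deliver(), defined earlier in this file).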
3738              */
3739             ASSERT(mp_chain->b_next == NULL);
3740 
3741             (*proc)(arg1, arg2, mp_chain, NULL);
3742 
3743             ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3744             /*
3745              * If we have a soft ring set which is doing
3746              * bandwidth control, we need to decrement
3747              * srs_size and count so that the SRS has an
3748              * accurate idea of how much data is really
3749              * queued between the SRS and its soft rings. We
3750              * decrement the counters only when the packet
3751              * gets processed by both the SRS and the soft ring.
3752              */
3753             mutex_enter(&mac_srs->srs_lock);
3754             MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3755             MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3756             mutex_exit(&mac_srs->srs_lock);
3757 
3758             mutex_enter(&ringp->s_ring_lock);
3759             ringp->s_ring_run = NULL;
3760             ringp->s_ring_state &= ~S_RING_PROC;
3761             if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3762                 cv_signal(&ringp->s_ring_client_cv);
3763 
3764             if ((ringp->s_ring_first == NULL) ||
3765                 (ringp->s_ring_state & S_RING_BLANK)) {
3766                 /*
3767                  * We processed our packet inline and
3768                  * nothing new has arrived, or our
3769                  * receiver doesn't want to receive
3770                  * any packets. We are done.
3771                  */
3772                 mutex_exit(&ringp->s_ring_lock);
3773                 return;
3774             }
3775         } else {
3776             SOFT_RING_ENQUEUE_CHAIN(ringp,
3777                 mp_chain, tail, cnt, sz);
3778         }
3779 
3780         /*
3781          * We are here because either we couldn't do inline
3782          * processing (because something was already
3783          * queued), or we had a chain of more than one
3784          * packet, or something else arrived after we were
3785          * done with inline processing.
3786          */
3787         ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3788         ASSERT(ringp->s_ring_first != NULL);
3789 
3790         ringp->s_ring_drain_func(ringp);
3791         mutex_exit(&ringp->s_ring_lock);
3792         return;
3793     } else {
3794         /* ST_RING_WORKER_ONLY case */
3795         SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3796         mac_soft_ring_worker_wakeup(ringp);
3797         mutex_exit(&ringp->s_ring_lock);
3798     }
3799 }
3800 
3801 /*
3802  * TX SOFTRING RELATED FUNCTIONS
3803  *
3804  * These functions really belong in mac_soft_ring.c and are here for
3805  * a short period.
3806  */
3807 
3808 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {         \
3809     ASSERT(MUTEX_HELD(&ringp->s_ring_lock));                           \
3810     ringp->s_ring_state |= S_RING_ENQUEUED;                            \
3811     SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);           \
3812 }
3813 
3814 /*
3815  * mac_tx_sring_enqueue
3816  *
3817  * When we are out of transmit descriptors and we already have a
3818  * queue that exceeds hiwat (or the client called us with the
3819  * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3820  * soft ring pointer as the opaque cookie so that the client can
3821  * enable flow control.
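 *
 * A hedged sketch of the client side (illustrative only; "mch", "mp",
 * "hint" and "ret_mp" are assumed caller-side variables, and mac_tx() is
 * the public Tx entry point that eventually leads here):
 *
 *     mblk_t *ret_mp = NULL;
 *     mac_tx_cookie_t cookie;
 *
 *     cookie = mac_tx(mch, mp, hint, MAC_TX_NO_ENQUEUE, &ret_mp);
 *
 * A non-NULL cookie means the chain in ret_mp was not accepted; the
 * client holds on to it and retries after the blocked clients are
 * woken up again (see mac_tx_notify() earlier in this file).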
3822  */
3823 static mac_tx_cookie_t
3824 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3825     mblk_t **ret_mp)
3826 {
3827     int cnt;
3828     size_t sz;
3829     mblk_t *tail;
3830     mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3831     mac_tx_cookie_t cookie = NULL;
3832     boolean_t wakeup_worker = B_TRUE;
3833 
3834     ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3835     MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3836     if (flag & MAC_DROP_ON_NO_DESC) {
3837         mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3838         /* increment freed stats */
3839         ringp->s_ring_drops += cnt;
3840         cookie = (mac_tx_cookie_t)ringp;
3841     } else {
3842         if (ringp->s_ring_first != NULL)
3843             wakeup_worker = B_FALSE;
3844 
3845         if (flag & MAC_TX_NO_ENQUEUE) {
3846             /*
3847              * If nothing has been queued yet (S_RING_ENQUEUED
3848              * not set), queue the packet and let
3849              * mac_tx_soft_ring_drain() set the TX_BLOCKED bit
3850              * for the reasons explained above. Otherwise,
3851              * return the mblks.
3852              */
3853             if (wakeup_worker) {
3854                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3855                     mp_chain, tail, cnt, sz);
3856             } else {
3857                 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3858                 cookie = (mac_tx_cookie_t)ringp;
3859                 *ret_mp = mp_chain;
3860             }
3861         } else {
3862             boolean_t enqueue = B_TRUE;
3863 
3864             if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3865                 /*
3866                  * Flow-controlled. Store ringp in the cookie
3867                  * so that it can be returned as a
3868                  * mac_tx_cookie_t to the client.
3869                  */
3870                 ringp->s_ring_state |= S_RING_TX_HIWAT;
3871                 cookie = (mac_tx_cookie_t)ringp;
3872                 ringp->s_ring_hiwat_cnt++;
3873                 if (ringp->s_ring_count >
3874                     ringp->s_ring_tx_max_q_cnt) {
3875                     /* increment freed stats */
3876                     ringp->s_ring_drops += cnt;
3877                     /*
3878                      * b_prev may be set to the fanout hint,
3879                      * hence we can't use freemsg directly.
3880                      */
3881                     mac_pkt_drop(NULL, NULL,
3882                         mp_chain, B_FALSE);
3883                     DTRACE_PROBE1(tx_queued_hiwat,
3884                         mac_soft_ring_t *, ringp);
3885                     enqueue = B_FALSE;
3886                 }
3887             }
3888             if (enqueue) {
3889                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3890                     tail, cnt, sz);
3891             }
3892         }
3893         if (wakeup_worker)
3894             cv_signal(&ringp->s_ring_async);
3895     }
3896     return (cookie);
3897 }
3898 
3899 
3900 /*
3901  * mac_tx_soft_ring_process
3902  *
3903  * This routine is called when fanning out outgoing traffic among
3904  * multiple Tx rings.
3905  * Note that a soft ring is associated with a h/w Tx ring.
3906  */
3907 mac_tx_cookie_t
3908 mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3909     uint16_t flag, mblk_t **ret_mp)
3910 {
3911     mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3912     int cnt;
3913     size_t sz;
3914     mblk_t *tail;
3915     mac_tx_cookie_t cookie = NULL;
3916 
3917     ASSERT(ringp != NULL);
3918     ASSERT(mp_chain != NULL);
3919     ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3920     /*
3921      * The following modes can come here: SRS_TX_BW_FANOUT,
3922      * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
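     * In all of these modes each soft ring is bound to a single (pseudo
     * or hardware) Tx ring, and the actual transmit below goes through
     * mac_tx_send() with the soft ring's s_ring_tx_arg1/s_ring_tx_arg2
     * as the send arguments.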
3923      */
3924     ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
3925     ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3926         mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
3927         mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
3928         mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
3929 
3930     if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3931         /* Serialization mode */
3932 
3933         mutex_enter(&ringp->s_ring_lock);
3934         if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3935             cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3936                 flag, ret_mp);
3937             mutex_exit(&ringp->s_ring_lock);
3938             return (cookie);
3939         }
3940         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3941         TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3942         if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3943             /*
3944              * If the ring is blocked due to a lack of Tx
3945              * descs, just return. The worker thread
3946              * will get scheduled when Tx descs
3947              * become available.
3948              */
3949             mutex_exit(&ringp->s_ring_lock);
3950             return (cookie);
3951         }
3952         mac_soft_ring_worker_wakeup(ringp);
3953         mutex_exit(&ringp->s_ring_lock);
3954         return (cookie);
3955     } else {
3956         /* Default fanout mode */
3957         /*
3958          * S_RING_BLOCK is set when the underlying NIC runs
3959          * out of Tx descs and messages start getting
3960          * queued. It won't get reset until
3961          * tx_srs_drain() completely drains out the
3962          * messages.
3963          */
3964         mac_tx_stats_t stats;
3965 
3966         if (ringp->s_ring_state & S_RING_ENQUEUED) {
3967             /* Tx descs/resources not available */
3968             mutex_enter(&ringp->s_ring_lock);
3969             if (ringp->s_ring_state & S_RING_ENQUEUED) {
3970                 cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3971                     flag, ret_mp);
3972                 mutex_exit(&ringp->s_ring_lock);
3973                 return (cookie);
3974             }
3975             /*
3976              * While we were computing the mblk count, the
3977              * flow control condition got relieved.
3978              * Continue with the transmission.
3979              */
3980             mutex_exit(&ringp->s_ring_lock);
3981         }
3982 
3983         mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
3984             ringp->s_ring_tx_arg2, mp_chain, &stats);
3985 
3986         /*
3987          * Multiple threads could be here sending packets.
3988          * Under such conditions, it is not possible to
3989          * atomically set the S_RING_BLOCK bit to indicate an
3990          * out-of-Tx-desc condition. To atomically set
3991          * this, we queue the returned packet and do
3992          * the setting of S_RING_BLOCK in
3993          * mac_tx_soft_ring_drain().
3994          */
3995         if (mp_chain != NULL) {
3996             mutex_enter(&ringp->s_ring_lock);
3997             cookie =
3998                 mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
3999             mutex_exit(&ringp->s_ring_lock);
4000             return (cookie);
4001         }
4002         SRS_TX_STATS_UPDATE(mac_srs, &stats);
4003         SOFTRING_TX_STATS_UPDATE(ringp, &stats);
4004 
4005         return (NULL);
4006     }
4007 }
4008 