1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/callb.h> 28 #include <sys/sdt.h> 29 #include <sys/strsubr.h> 30 #include <sys/strsun.h> 31 #include <sys/vlan.h> 32 #include <inet/ipsec_impl.h> 33 #include <inet/ip_impl.h> 34 #include <inet/sadb.h> 35 #include <inet/ipsecesp.h> 36 #include <inet/ipsecah.h> 37 #include <inet/ip6.h> 38 39 #include <sys/mac_impl.h> 40 #include <sys/mac_client_impl.h> 41 #include <sys/mac_client_priv.h> 42 #include <sys/mac_soft_ring.h> 43 #include <sys/mac_flow_impl.h> 44 45 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *, 46 uintptr_t, uint16_t, mblk_t **); 47 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *, 48 uintptr_t, uint16_t, mblk_t **); 49 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *, 50 uintptr_t, uint16_t, mblk_t **); 51 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *, 52 uintptr_t, uint16_t, mblk_t **); 53 54 typedef struct mac_tx_mode_s { 55 mac_tx_srs_mode_t mac_tx_mode; 56 mac_tx_func_t mac_tx_func; 57 } mac_tx_mode_t; 58 59 /* 60 * There are five modes of operation on the Tx side. These modes get set 61 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode, 62 * none of the other modes are user configurable. They get selected by 63 * the system depending upon whether the link (or flow) has multiple Tx 64 * rings or a bandwidth configured, etc. 65 */ 66 mac_tx_mode_t mac_tx_mode_list[] = { 67 {SRS_TX_DEFAULT, mac_tx_single_ring_mode}, 68 {SRS_TX_SERIALIZE, mac_tx_serializer_mode}, 69 {SRS_TX_FANOUT, mac_tx_fanout_mode}, 70 {SRS_TX_BW, mac_tx_bw_mode}, 71 {SRS_TX_BW_FANOUT, mac_tx_bw_mode} 72 }; 73 74 /* 75 * Soft Ring Set (SRS) - The Run time code that deals with 76 * dynamic polling from the hardware, bandwidth enforcement, 77 * fanout etc. 78 * 79 * We try to use H/W classification on NIC and assign traffic for 80 * a MAC address to a particular Rx ring or ring group. There is a 81 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically 82 * switches the underlying Rx ring between interrupt and 83 * polling mode and enforces any specified B/W control. 84 * 85 * There is always a SRS created and tied to each H/W and S/W rule. 86 * Whenever we create a H/W rule, we always add the the same rule to 87 * S/W classifier and tie a SRS to it. 88 * 89 * In case a B/W control is specified, it is broken into bytes 90 * per ticks and as soon as the quota for a tick is exhausted, 91 * the underlying Rx ring is forced into poll mode for remainder of 92 * the tick. 
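 *
 * In concrete terms (see MAC_SRS_CHECK_BW_CONTROL() and
 * mac_rx_srs_drain_bw() below), mac_bw_used accumulates the bytes
 * processed during the current lbolt tick; once it exceeds
 * mac_bw_limit, SRS_BW_ENFORCED is set, and both are reset when
 * lbolt advances to the next tick.
 *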
The SRS poll thread only polls for bytes that are 93 * allowed to come in the SRS. We typically let 4x the configured 94 * B/W worth of packets to come in the SRS (to prevent unnecessary 95 * drops due to bursts) but only process the specified amount. 96 * 97 * A MAC client (e.g. a VNIC or aggr) can have 1 or more 98 * Rx rings (and corresponding SRSs) assigned to it. The SRS 99 * in turn can have softrings to do protocol level fanout or 100 * softrings to do S/W based fanout or both. In case the NIC 101 * has no Rx rings, we do S/W classification to respective SRS. 102 * The S/W classification rule is always setup and ready. This 103 * allows the MAC layer to reassign Rx rings whenever needed 104 * but packets still continue to flow via the default path and 105 * getting S/W classified to correct SRS. 106 * 107 * The SRS's are used on both Tx and Rx side. They use the same 108 * data structure but the processing routines have slightly different 109 * semantics due to the fact that Rx side needs to do dynamic 110 * polling etc. 111 * 112 * Dynamic Polling Notes 113 * ===================== 114 * 115 * Each Soft ring set is capable of switching its Rx ring between 116 * interrupt and poll mode and actively 'polls' for packets in 117 * poll mode. If the SRS is implementing a B/W limit, it makes 118 * sure that only Max allowed packets are pulled in poll mode 119 * and goes to poll mode as soon as B/W limit is exceeded. As 120 * such, there are no overheads to implement B/W limits. 121 * 122 * In poll mode, its better to keep the pipeline going where the 123 * SRS worker thread keeps processing packets and poll thread 124 * keeps bringing more packets (specially if they get to run 125 * on different CPUs). This also prevents the overheads associated 126 * by excessive signalling (on NUMA machines, this can be 127 * pretty devastating). The exception is latency optimized case 128 * where worker thread does no work and interrupt and poll thread 129 * are allowed to do their own drain. 130 * 131 * We use the following policy to control Dynamic Polling: 132 * 1) We switch to poll mode anytime the processing 133 * thread causes a backlog to build up in SRS and 134 * its associated Soft Rings (sr_poll_pkt_cnt > 0). 135 * 2) As long as the backlog stays under the low water 136 * mark (sr_lowat), we poll the H/W for more packets. 137 * 3) If the backlog (sr_poll_pkt_cnt) exceeds low 138 * water mark, we stay in poll mode but don't poll 139 * the H/W for more packets. 140 * 4) Anytime in polling mode, if we poll the H/W for 141 * packets and find nothing plus we have an existing 142 * backlog (sr_poll_pkt_cnt > 0), we stay in polling 143 * mode but don't poll the H/W for packets anymore 144 * (let the polling thread go to sleep). 145 * 5) Once the backlog is relived (packets are processed) 146 * we reenable polling (by signalling the poll thread) 147 * only when the backlog dips below sr_poll_thres. 148 * 6) sr_hiwat is used exclusively when we are not 149 * polling capable and is used to decide when to 150 * drop packets so the SRS queue length doesn't grow 151 * infinitely. 152 * 153 * NOTE: Also see the block level comment on top of mac_soft_ring.c 154 */ 155 156 /* 157 * mac_latency_optimize 158 * 159 * Controls whether the poll thread can process the packets inline 160 * or let the SRS worker thread do the processing. This applies if 161 * the SRS was not being processed. For latency sensitive traffic, 162 * this needs to be true to allow inline processing. 
For throughput 163 * under load, this should be false. 164 * 165 * This (and other similar) tunable should be rolled into a link 166 * or flow specific workload hint that can be set using dladm 167 * linkprop (instead of multiple such tunables). 168 */ 169 boolean_t mac_latency_optimize = B_TRUE; 170 171 /* 172 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 173 * 174 * queue a mp or chain in soft ring set and increment the 175 * local count (srs_count) for the SRS and the shared counter 176 * (srs_poll_pkt_cnt - shared between SRS and its soft rings 177 * to track the total unprocessed packets for polling to work 178 * correctly). 179 * 180 * The size (total bytes queued) counters are incremented only 181 * if we are doing B/W control. 182 */ 183 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 184 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 185 if ((mac_srs)->srs_last != NULL) \ 186 (mac_srs)->srs_last->b_next = (head); \ 187 else \ 188 (mac_srs)->srs_first = (head); \ 189 (mac_srs)->srs_last = (tail); \ 190 (mac_srs)->srs_count += count; \ 191 } 192 193 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 194 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 195 \ 196 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 197 srs_rx->sr_poll_pkt_cnt += count; \ 198 ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 199 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 200 (mac_srs)->srs_size += (sz); \ 201 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 202 (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 203 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 204 } \ 205 } 206 207 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 208 mac_srs->srs_state |= SRS_ENQUEUED; \ 209 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 210 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 211 (mac_srs)->srs_size += (sz); \ 212 (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 213 } \ 214 } 215 216 /* 217 * Turn polling on routines 218 */ 219 #define MAC_SRS_POLLING_ON(mac_srs) { \ 220 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 221 if (((mac_srs)->srs_state & \ 222 (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 223 (mac_srs)->srs_state |= SRS_POLLING; \ 224 (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 225 (mac_srs)->srs_ring); \ 226 (mac_srs)->srs_rx.sr_poll_on++; \ 227 } \ 228 } 229 230 #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 231 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 232 if (((mac_srs)->srs_state & \ 233 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 234 (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 235 (mac_srs)->srs_state |= SRS_POLLING; \ 236 (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 237 (mac_srs)->srs_ring); \ 238 (mac_srs)->srs_rx.sr_worker_poll_on++; \ 239 } \ 240 } 241 242 /* 243 * MAC_SRS_POLL_RING 244 * 245 * Signal the SRS poll thread to poll the underlying H/W ring 246 * provided it wasn't already polling (SRS_GET_PKTS was set). 247 * 248 * Poll thread gets to run only from mac_rx_srs_drain() and only 249 * if the drain was being done by the worker thread. 
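 *
 * Restating the state test in the macro below: the poll thread is
 * signalled only when
 *
 *	(srs_state & (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==
 *	    (SRS_POLLING_CAPAB|SRS_WORKER)
 *
 * i.e. the ring is poll capable, the worker owns the drain and no
 * poll is already outstanding; otherwise sr_poll_thr_busy is bumped
 * and no signal is sent.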
250 */ 251 #define MAC_SRS_POLL_RING(mac_srs) { \ 252 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 253 \ 254 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 255 srs_rx->sr_poll_thr_sig++; \ 256 if (((mac_srs)->srs_state & \ 257 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 258 (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 259 (mac_srs)->srs_state |= SRS_GET_PKTS; \ 260 cv_signal(&(mac_srs)->srs_cv); \ 261 } else { \ 262 srs_rx->sr_poll_thr_busy++; \ 263 } \ 264 } 265 266 /* 267 * MAC_SRS_CHECK_BW_CONTROL 268 * 269 * Check to see if next tick has started so we can reset the 270 * SRS_BW_ENFORCED flag and allow more packets to come in the 271 * system. 272 */ 273 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 274 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 275 ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 276 MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 277 if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \ 278 (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \ 279 (mac_srs)->srs_bw->mac_bw_used = 0; \ 280 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 281 (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 282 } \ 283 } 284 285 /* 286 * MAC_SRS_WORKER_WAKEUP 287 * 288 * Wake up the SRS worker thread to process the queue as long as 289 * no one else is processing the queue. If we are optimizing for 290 * latency, we wake up the worker thread immediately or else we 291 * wait mac_srs_worker_wakeup_ticks before worker thread gets 292 * woken up. 293 */ 294 int mac_srs_worker_wakeup_ticks = 0; 295 #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ 296 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 297 if (!((mac_srs)->srs_state & SRS_PROC) && \ 298 (mac_srs)->srs_tid == NULL) { \ 299 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \ 300 (mac_srs_worker_wakeup_ticks == 0)) \ 301 cv_signal(&(mac_srs)->srs_async); \ 302 else \ 303 (mac_srs)->srs_tid = \ 304 timeout(mac_srs_fire, (mac_srs), \ 305 mac_srs_worker_wakeup_ticks); \ 306 } \ 307 } 308 309 #define TX_SINGLE_RING_MODE(mac_srs) \ 310 ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ 311 (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ 312 (mac_srs)->srs_tx.st_mode == SRS_TX_BW) 313 314 #define TX_BANDWIDTH_MODE(mac_srs) \ 315 ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ 316 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) 317 318 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ 319 uint_t hash, indx; \ 320 hash = HASH_HINT(hint); \ 321 indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ 322 softring = mac_srs->srs_oth_soft_rings[indx]; \ 323 (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ 324 } 325 326 /* 327 * MAC_TX_SRS_BLOCK 328 * 329 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED 330 * will be set only if srs_tx_woken_up is FALSE. If 331 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived 332 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to 333 * attempt to transmit again and not setting SRS_TX_BLOCKED does 334 * that. 335 */ 336 #define MAC_TX_SRS_BLOCK(srs, mp) { \ 337 ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ 338 if ((srs)->srs_tx.st_woken_up) { \ 339 (srs)->srs_tx.st_woken_up = B_FALSE; \ 340 } else { \ 341 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ 342 (srs)->srs_state |= SRS_TX_BLOCKED; \ 343 (srs)->srs_tx.st_blocked_cnt++; \ 344 } \ 345 } 346 347 /* 348 * MAC_TX_SRS_TEST_HIWAT 349 * 350 * Called before queueing a packet onto Tx SRS to test and set 351 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. 
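 *
 * When srs_count also exceeds st_max_q_cnt, the chain is freed via
 * mac_pkt_drop() (b_prev may carry the fanout hint, so freemsg()
 * can't be used directly) and st_drop_count is bumped; otherwise the
 * chain is enqueued. Either way the cookie returned to the client
 * identifies this SRS as being flow controlled.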
352 */ 353 #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ 354 boolean_t enqueue = 1; \ 355 \ 356 if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ 357 /* \ 358 * flow-controlled. Store srs in cookie so that it \ 359 * can be returned as mac_tx_cookie_t to client \ 360 */ \ 361 (srs)->srs_state |= SRS_TX_HIWAT; \ 362 cookie = (mac_tx_cookie_t)srs; \ 363 (srs)->srs_tx.st_hiwat_cnt++; \ 364 if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ 365 /* increment freed stats */ \ 366 (srs)->srs_tx.st_drop_count += cnt; \ 367 /* \ 368 * b_prev may be set to the fanout hint \ 369 * hence can't use freemsg directly \ 370 */ \ 371 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ 372 DTRACE_PROBE1(tx_queued_hiwat, \ 373 mac_soft_ring_set_t *, srs); \ 374 enqueue = 0; \ 375 } \ 376 } \ 377 if (enqueue) \ 378 MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ 379 } 380 381 /* Some utility macros */ 382 #define MAC_SRS_BW_LOCK(srs) \ 383 if (!(srs->srs_type & SRST_TX)) \ 384 mutex_enter(&srs->srs_bw->mac_bw_lock); 385 386 #define MAC_SRS_BW_UNLOCK(srs) \ 387 if (!(srs->srs_type & SRST_TX)) \ 388 mutex_exit(&srs->srs_bw->mac_bw_lock); 389 390 #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ 391 mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ 392 /* increment freed stats */ \ 393 mac_srs->srs_tx.st_drop_count++; \ 394 cookie = (mac_tx_cookie_t)srs; \ 395 } 396 397 #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ 398 mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ 399 cookie = (mac_tx_cookie_t)srs; \ 400 *ret_mp = mp_chain; \ 401 } 402 403 /* 404 * Drop the rx packet and advance to the next one in the chain. 405 */ 406 static void 407 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp) 408 { 409 mac_srs_rx_t *srs_rx = &srs->srs_rx; 410 411 ASSERT(mp->b_next == NULL); 412 mutex_enter(&srs->srs_lock); 413 MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1); 414 MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp)); 415 mutex_exit(&srs->srs_lock); 416 417 srs_rx->sr_drop_count++; 418 freemsg(mp); 419 } 420 421 /* DATAPATH RUNTIME ROUTINES */ 422 423 /* 424 * mac_srs_fire 425 * 426 * Timer callback routine for waking up the SRS worker thread. 427 */ 428 static void 429 mac_srs_fire(void *arg) 430 { 431 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg; 432 433 mutex_enter(&mac_srs->srs_lock); 434 if (mac_srs->srs_tid == 0) { 435 mutex_exit(&mac_srs->srs_lock); 436 return; 437 } 438 439 mac_srs->srs_tid = 0; 440 if (!(mac_srs->srs_state & SRS_PROC)) 441 cv_signal(&mac_srs->srs_async); 442 443 mutex_exit(&mac_srs->srs_lock); 444 } 445 446 /* 447 * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack, 448 * and it is used on the TX path. 449 */ 450 #define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16)) 451 452 /* 453 * hash based on the src address and the port information. 
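 *
 * A rough sketch of the folding done by HASH_ADDR() below, where
 * 'ports' is the 32-bit local/remote port pair (see PORTS_SIZE):
 *
 *	hash = ntohl(src) ^ (ports >> 24) ^ (ports >> 16) ^
 *	    (ports >> 8) ^ ports;
 *	indx = hash % ring_count;	(COMPUTE_INDEX)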
454 */ 455 #define HASH_ADDR(src, ports) \ 456 (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 457 ((ports) >> 8) ^ (ports)) 458 459 #define COMPUTE_INDEX(key, sz) (key % sz) 460 461 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 462 if ((tail) != NULL) { \ 463 ASSERT((tail)->b_next == NULL); \ 464 (tail)->b_next = (mp); \ 465 } else { \ 466 ASSERT((head) == NULL); \ 467 (head) = (mp); \ 468 } \ 469 (tail) = (mp); \ 470 (cnt)++; \ 471 if ((bw_ctl)) \ 472 (sz) += (sz0); \ 473 } 474 475 #define MAC_FANOUT_DEFAULT 0 476 #define MAC_FANOUT_RND_ROBIN 1 477 int mac_fanout_type = MAC_FANOUT_DEFAULT; 478 479 #define MAX_SR_TYPES 3 480 /* fanout types for port based hashing */ 481 enum pkt_type { 482 V4_TCP = 0, 483 V4_UDP, 484 OTH, 485 UNDEF 486 }; 487 488 /* 489 * In general we do port based hashing to spread traffic over different 490 * softrings. The below tunable allows to override that behavior. Setting it 491 * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior 492 * is also the applicable to ipv6 packets carrying multiple optional headers 493 * and other uncommon packet types. 494 */ 495 boolean_t mac_src_ipv6_fanout = B_FALSE; 496 497 /* 498 * Pair of local and remote ports in the transport header 499 */ 500 #define PORTS_SIZE 4 501 502 /* 503 * mac_rx_srs_proto_fanout 504 * 505 * This routine delivers packets destined to an SRS into one of the 506 * protocol soft rings. 507 * 508 * Given a chain of packets we need to split it up into multiple sub chains 509 * destined into TCP, UDP or OTH soft ring. Instead of entering 510 * the soft ring one packet at a time, we want to enter it in the form of a 511 * chain otherwise we get this start/stop behaviour where the worker thread 512 * goes to sleep and then next packets comes in forcing it to wake up etc. 513 */ 514 static void 515 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 516 { 517 struct ether_header *ehp; 518 struct ether_vlan_header *evhp; 519 uint32_t sap; 520 ipha_t *ipha; 521 uint8_t *dstaddr; 522 size_t hdrsize; 523 mblk_t *mp; 524 mblk_t *headmp[MAX_SR_TYPES]; 525 mblk_t *tailmp[MAX_SR_TYPES]; 526 int cnt[MAX_SR_TYPES]; 527 size_t sz[MAX_SR_TYPES]; 528 size_t sz1; 529 boolean_t bw_ctl; 530 boolean_t hw_classified; 531 boolean_t dls_bypass; 532 boolean_t is_ether; 533 boolean_t is_unicast; 534 enum pkt_type type; 535 mac_client_impl_t *mcip = mac_srs->srs_mcip; 536 537 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 538 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 539 540 /* 541 * If we don't have a Rx ring, S/W classification would have done 542 * its job and its a packet meant for us. If we were polling on 543 * the default ring (i.e. there was a ring assigned to this SRS), 544 * then we need to make sure that the mac address really belongs 545 * to us. 546 */ 547 hw_classified = mac_srs->srs_ring != NULL && 548 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 549 550 /* 551 * Special clients (eg. VLAN, non ether, etc) need DLS 552 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 553 * such SRSs. Another way of disabling bypass is to set the 554 * MCIS_RX_BYPASS_DISABLE flag. 
555 */ 556 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 557 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 558 559 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 560 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 561 bzero(cnt, MAX_SR_TYPES * sizeof (int)); 562 bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 563 564 /* 565 * We got a chain from SRS that we need to send to the soft rings. 566 * Since squeues for TCP & IPv4 sap poll their soft rings (for 567 * performance reasons), we need to separate out v4_tcp, v4_udp 568 * and the rest goes in other. 569 */ 570 while (head != NULL) { 571 mp = head; 572 head = head->b_next; 573 mp->b_next = NULL; 574 575 type = OTH; 576 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 577 578 if (is_ether) { 579 /* 580 * At this point we can be sure the packet at least 581 * has an ether header. 582 */ 583 if (sz1 < sizeof (struct ether_header)) { 584 mac_rx_drop_pkt(mac_srs, mp); 585 continue; 586 } 587 ehp = (struct ether_header *)mp->b_rptr; 588 589 /* 590 * Determine if this is a VLAN or non-VLAN packet. 591 */ 592 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 593 evhp = (struct ether_vlan_header *)mp->b_rptr; 594 sap = ntohs(evhp->ether_type); 595 hdrsize = sizeof (struct ether_vlan_header); 596 /* 597 * Check if the VID of the packet, if any, 598 * belongs to this client. 599 */ 600 if (!mac_client_check_flow_vid(mcip, 601 VLAN_ID(ntohs(evhp->ether_tci)))) { 602 mac_rx_drop_pkt(mac_srs, mp); 603 continue; 604 } 605 } else { 606 hdrsize = sizeof (struct ether_header); 607 } 608 is_unicast = 609 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 610 dstaddr = (uint8_t *)&ehp->ether_dhost; 611 } else { 612 mac_header_info_t mhi; 613 614 if (mac_header_info((mac_handle_t)mcip->mci_mip, 615 mp, &mhi) != 0) { 616 mac_rx_drop_pkt(mac_srs, mp); 617 continue; 618 } 619 hdrsize = mhi.mhi_hdrsize; 620 sap = mhi.mhi_bindsap; 621 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 622 dstaddr = (uint8_t *)mhi.mhi_daddr; 623 } 624 625 if (!dls_bypass) { 626 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 627 cnt[type], bw_ctl, sz[type], sz1, mp); 628 continue; 629 } 630 631 if (sap == ETHERTYPE_IP) { 632 /* 633 * If we are H/W classified, but we have promisc 634 * on, then we need to check for the unicast address. 635 */ 636 if (hw_classified && mcip->mci_promisc_list != NULL) { 637 mac_address_t *map; 638 639 rw_enter(&mcip->mci_rw_lock, RW_READER); 640 map = mcip->mci_unicast; 641 if (bcmp(dstaddr, map->ma_addr, 642 map->ma_len) == 0) 643 type = UNDEF; 644 rw_exit(&mcip->mci_rw_lock); 645 } else if (is_unicast) { 646 type = UNDEF; 647 } 648 } 649 650 /* 651 * This needs to become a contract with the driver for 652 * the fast path. 653 * 654 * In the normal case the packet will have at least the L2 655 * header and the IP + Transport header in the same mblk. 656 * This is usually the case when the NIC driver sends up 657 * the packet. This is also true when the stack generates 658 * a packet that is looped back and when the stack uses the 659 * fastpath mechanism. The normal case is optimized for 660 * performance and may bypass DLS. All other cases go through 661 * the 'OTH' type path without DLS bypass. 
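 *
 * In other words, the check below assumes the IPv4 header starts at
 * b_rptr + hdrsize within the first mblk; MBLK_RX_FANOUT_SLOWPATH()
 * catches chains that don't satisfy that layout and demotes them to
 * the 'OTH' path.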
662 */ 663 664 ipha = (ipha_t *)(mp->b_rptr + hdrsize); 665 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 666 type = OTH; 667 668 if (type == OTH) { 669 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 670 cnt[type], bw_ctl, sz[type], sz1, mp); 671 continue; 672 } 673 674 ASSERT(type == UNDEF); 675 /* 676 * We look for at least 4 bytes past the IP header to get 677 * the port information. If we get an IP fragment, we don't 678 * have the port information, and we use just the protocol 679 * information. 680 */ 681 switch (ipha->ipha_protocol) { 682 case IPPROTO_TCP: 683 type = V4_TCP; 684 mp->b_rptr += hdrsize; 685 break; 686 case IPPROTO_UDP: 687 type = V4_UDP; 688 mp->b_rptr += hdrsize; 689 break; 690 default: 691 type = OTH; 692 break; 693 } 694 695 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 696 bw_ctl, sz[type], sz1, mp); 697 } 698 699 for (type = V4_TCP; type < UNDEF; type++) { 700 if (headmp[type] != NULL) { 701 mac_soft_ring_t *softring; 702 703 ASSERT(tailmp[type]->b_next == NULL); 704 switch (type) { 705 case V4_TCP: 706 softring = mac_srs->srs_tcp_soft_rings[0]; 707 break; 708 case V4_UDP: 709 softring = mac_srs->srs_udp_soft_rings[0]; 710 break; 711 case OTH: 712 softring = mac_srs->srs_oth_soft_rings[0]; 713 } 714 mac_rx_soft_ring_process(mcip, softring, 715 headmp[type], tailmp[type], cnt[type], sz[type]); 716 } 717 } 718 } 719 720 int fanout_unalligned = 0; 721 722 /* 723 * mac_rx_srs_long_fanout 724 * 725 * The fanout routine for IPv6 726 */ 727 static int 728 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, 729 uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) 730 { 731 ip6_t *ip6h; 732 uint8_t *whereptr; 733 uint_t hash; 734 uint16_t remlen; 735 uint8_t nexthdr; 736 uint16_t hdr_len; 737 738 if (sap == ETHERTYPE_IPV6) { 739 boolean_t modifiable = B_TRUE; 740 741 ASSERT(MBLKL(mp) >= hdrsize); 742 743 ip6h = (ip6_t *)(mp->b_rptr + hdrsize); 744 if ((unsigned char *)ip6h == mp->b_wptr) { 745 /* 746 * The first mblk_t only includes the mac header. 747 * Note that it is safe to change the mp pointer here, 748 * as the subsequent operation does not assume mp 749 * points to the start of the mac header. 750 */ 751 mp = mp->b_cont; 752 753 /* 754 * Make sure ip6h holds the full ip6_t structure. 755 */ 756 if (mp == NULL) 757 return (-1); 758 759 if (MBLKL(mp) < IPV6_HDR_LEN) { 760 modifiable = (DB_REF(mp) == 1); 761 762 if (modifiable && 763 !pullupmsg(mp, IPV6_HDR_LEN)) { 764 return (-1); 765 } 766 } 767 768 ip6h = (ip6_t *)mp->b_rptr; 769 } 770 771 if (!modifiable || !(OK_32PTR((char *)ip6h)) || 772 ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 773 /* 774 * If either ip6h is not alligned, or ip6h does not 775 * hold the complete ip6_t structure (a pullupmsg() 776 * is not an option since it would result in an 777 * unalligned ip6h), fanout to the default ring. Note 778 * that this may cause packets reordering. 779 */ 780 *indx = 0; 781 *type = OTH; 782 fanout_unalligned++; 783 return (0); 784 } 785 786 remlen = ntohs(ip6h->ip6_plen); 787 nexthdr = ip6h->ip6_nxt; 788 789 if (remlen < MIN_EHDR_LEN) 790 return (-1); 791 /* 792 * Do src based fanout if below tunable is set to B_TRUE or 793 * when mac_ip_hdr_length_v6() fails because of malformed 794 * packets or because mblk's need to be concatenated using 795 * pullupmsg(). 
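 *
 * src_based_fanout (the label at the bottom of this function) hashes
 * only V4_PART_OF_V6(ip6_src) with no port word and always selects
 * one of the 'OTH' soft rings.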
796 */ 797 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 798 &hdr_len, &nexthdr)) { 799 goto src_based_fanout; 800 } 801 whereptr = (uint8_t *)ip6h + hdr_len; 802 803 /* If the transport is one of below, we do port based fanout */ 804 switch (nexthdr) { 805 case IPPROTO_TCP: 806 case IPPROTO_UDP: 807 case IPPROTO_SCTP: 808 case IPPROTO_ESP: 809 /* 810 * If the ports in the transport header is not part of 811 * the mblk, do src_based_fanout, instead of calling 812 * pullupmsg(). 813 */ 814 if (mp->b_cont != NULL && 815 whereptr + PORTS_SIZE > mp->b_wptr) { 816 goto src_based_fanout; 817 } 818 break; 819 default: 820 break; 821 } 822 823 switch (nexthdr) { 824 case IPPROTO_TCP: 825 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 826 *(uint32_t *)whereptr); 827 *indx = COMPUTE_INDEX(hash, 828 mac_srs->srs_tcp_ring_count); 829 *type = OTH; 830 break; 831 832 case IPPROTO_UDP: 833 case IPPROTO_SCTP: 834 case IPPROTO_ESP: 835 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 836 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 837 *(uint32_t *)whereptr); 838 *indx = COMPUTE_INDEX(hash, 839 mac_srs->srs_udp_ring_count); 840 } else { 841 *indx = mac_srs->srs_ind % 842 mac_srs->srs_udp_ring_count; 843 mac_srs->srs_ind++; 844 } 845 *type = OTH; 846 break; 847 848 /* For all other protocol, do source based fanout */ 849 default: 850 goto src_based_fanout; 851 } 852 } else { 853 *indx = 0; 854 *type = OTH; 855 } 856 return (0); 857 858 src_based_fanout: 859 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 860 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 861 *type = OTH; 862 return (0); 863 } 864 865 /* 866 * mac_rx_srs_fanout 867 * 868 * This routine delivers packets destined to an SRS into a soft ring member 869 * of the set. 870 * 871 * Given a chain of packets we need to split it up into multiple sub chains 872 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 873 * the soft ring one packet at a time, we want to enter it in the form of a 874 * chain otherwise we get this start/stop behaviour where the worker thread 875 * goes to sleep and then next packets comes in forcing it to wake up etc. 876 * 877 * Note: 878 * Since we know what is the maximum fanout possible, we create a 2D array 879 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 880 * variables so that we can enter the softrings with chain. We need the 881 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 882 * for each packet would be expensive). If we ever want to have the 883 * ability to have unlimited fanout, we should probably declare a head, 884 * tail, cnt, sz with each soft ring (a data struct which contains a softring 885 * along with these members) and create an array of this uber struct so we 886 * don't have to do kmem_alloc. 
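 *
 * Concretely, each packet is appended to the sub chain at
 * headmp[type][indx]/tailmp[type][indx] (FANOUT_ENQUEUE_MP), with the
 * packet count and byte size accumulated in cnt[type][indx] and
 * sz[type][indx]; at the end, each non-empty (type, indx) chain is
 * handed to the matching soft ring in a single call.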
887 */ 888 int fanout_oth1 = 0; 889 int fanout_oth2 = 0; 890 int fanout_oth3 = 0; 891 int fanout_oth4 = 0; 892 int fanout_oth5 = 0; 893 894 static void 895 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 896 { 897 struct ether_header *ehp; 898 struct ether_vlan_header *evhp; 899 uint32_t sap; 900 ipha_t *ipha; 901 uint8_t *dstaddr; 902 uint_t indx; 903 size_t ports_offset; 904 size_t ipha_len; 905 size_t hdrsize; 906 uint_t hash; 907 mblk_t *mp; 908 mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 909 mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 910 int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; 911 size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; 912 size_t sz1; 913 boolean_t bw_ctl; 914 boolean_t hw_classified; 915 boolean_t dls_bypass; 916 boolean_t is_ether; 917 boolean_t is_unicast; 918 int fanout_cnt; 919 enum pkt_type type; 920 mac_client_impl_t *mcip = mac_srs->srs_mcip; 921 922 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 923 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 924 925 /* 926 * If we don't have a Rx ring, S/W classification would have done 927 * its job and its a packet meant for us. If we were polling on 928 * the default ring (i.e. there was a ring assigned to this SRS), 929 * then we need to make sure that the mac address really belongs 930 * to us. 931 */ 932 hw_classified = mac_srs->srs_ring != NULL && 933 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 934 935 /* 936 * Special clients (eg. VLAN, non ether, etc) need DLS 937 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 938 * such SRSs. Another way of disabling bypass is to set the 939 * MCIS_RX_BYPASS_DISABLE flag. 940 */ 941 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 942 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 943 944 /* 945 * Since the softrings are never destroyed and we always 946 * create equal number of softrings for TCP, UDP and rest, 947 * its OK to check one of them for count and use it without 948 * any lock. In future, if soft rings get destroyed because 949 * of reduction in fanout, we will need to ensure that happens 950 * behind the SRS_PROC. 951 */ 952 fanout_cnt = mac_srs->srs_tcp_ring_count; 953 954 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 955 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 956 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); 957 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); 958 959 /* 960 * We got a chain from SRS that we need to send to the soft rings. 961 * Since squeues for TCP & IPv4 sap poll their soft rings (for 962 * performance reasons), we need to separate out v4_tcp, v4_udp 963 * and the rest goes in other. 964 */ 965 while (head != NULL) { 966 mp = head; 967 head = head->b_next; 968 mp->b_next = NULL; 969 970 type = OTH; 971 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 972 973 if (is_ether) { 974 /* 975 * At this point we can be sure the packet at least 976 * has an ether header. 977 */ 978 if (sz1 < sizeof (struct ether_header)) { 979 mac_rx_drop_pkt(mac_srs, mp); 980 continue; 981 } 982 ehp = (struct ether_header *)mp->b_rptr; 983 984 /* 985 * Determine if this is a VLAN or non-VLAN packet. 986 */ 987 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 988 evhp = (struct ether_vlan_header *)mp->b_rptr; 989 sap = ntohs(evhp->ether_type); 990 hdrsize = sizeof (struct ether_vlan_header); 991 /* 992 * Check if the VID of the packet, if any, 993 * belongs to this client. 
994 */ 995 if (!mac_client_check_flow_vid(mcip, 996 VLAN_ID(ntohs(evhp->ether_tci)))) { 997 mac_rx_drop_pkt(mac_srs, mp); 998 continue; 999 } 1000 } else { 1001 hdrsize = sizeof (struct ether_header); 1002 } 1003 is_unicast = 1004 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 1005 dstaddr = (uint8_t *)&ehp->ether_dhost; 1006 } else { 1007 mac_header_info_t mhi; 1008 1009 if (mac_header_info((mac_handle_t)mcip->mci_mip, 1010 mp, &mhi) != 0) { 1011 mac_rx_drop_pkt(mac_srs, mp); 1012 continue; 1013 } 1014 hdrsize = mhi.mhi_hdrsize; 1015 sap = mhi.mhi_bindsap; 1016 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 1017 dstaddr = (uint8_t *)mhi.mhi_daddr; 1018 } 1019 1020 if (!dls_bypass) { 1021 if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 1022 hdrsize, &type, &indx) == -1) { 1023 mac_rx_drop_pkt(mac_srs, mp); 1024 continue; 1025 } 1026 1027 FANOUT_ENQUEUE_MP(headmp[type][indx], 1028 tailmp[type][indx], cnt[type][indx], bw_ctl, 1029 sz[type][indx], sz1, mp); 1030 continue; 1031 } 1032 1033 1034 /* 1035 * If we are using the default Rx ring where H/W or S/W 1036 * classification has not happened, we need to verify if 1037 * this unicast packet really belongs to us. 1038 */ 1039 if (sap == ETHERTYPE_IP) { 1040 /* 1041 * If we are H/W classified, but we have promisc 1042 * on, then we need to check for the unicast address. 1043 */ 1044 if (hw_classified && mcip->mci_promisc_list != NULL) { 1045 mac_address_t *map; 1046 1047 rw_enter(&mcip->mci_rw_lock, RW_READER); 1048 map = mcip->mci_unicast; 1049 if (bcmp(dstaddr, map->ma_addr, 1050 map->ma_len) == 0) 1051 type = UNDEF; 1052 rw_exit(&mcip->mci_rw_lock); 1053 } else if (is_unicast) { 1054 type = UNDEF; 1055 } 1056 } 1057 1058 /* 1059 * This needs to become a contract with the driver for 1060 * the fast path. 1061 */ 1062 1063 ipha = (ipha_t *)(mp->b_rptr + hdrsize); 1064 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { 1065 type = OTH; 1066 fanout_oth1++; 1067 } 1068 1069 if (type != OTH) { 1070 uint16_t frag_offset_flags; 1071 1072 switch (ipha->ipha_protocol) { 1073 case IPPROTO_TCP: 1074 case IPPROTO_UDP: 1075 case IPPROTO_SCTP: 1076 case IPPROTO_ESP: 1077 ipha_len = IPH_HDR_LENGTH(ipha); 1078 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > 1079 mp->b_wptr) { 1080 type = OTH; 1081 break; 1082 } 1083 frag_offset_flags = 1084 ntohs(ipha->ipha_fragment_offset_and_flags); 1085 if ((frag_offset_flags & 1086 (IPH_MF | IPH_OFFSET)) != 0) { 1087 type = OTH; 1088 fanout_oth3++; 1089 break; 1090 } 1091 ports_offset = hdrsize + ipha_len; 1092 break; 1093 default: 1094 type = OTH; 1095 fanout_oth4++; 1096 break; 1097 } 1098 } 1099 1100 if (type == OTH) { 1101 if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 1102 hdrsize, &type, &indx) == -1) { 1103 mac_rx_drop_pkt(mac_srs, mp); 1104 continue; 1105 } 1106 1107 FANOUT_ENQUEUE_MP(headmp[type][indx], 1108 tailmp[type][indx], cnt[type][indx], bw_ctl, 1109 sz[type][indx], sz1, mp); 1110 continue; 1111 } 1112 1113 ASSERT(type == UNDEF); 1114 1115 /* 1116 * XXX-Sunay: We should hold srs_lock since ring_count 1117 * below can change. But if we are always called from 1118 * mac_rx_srs_drain and SRS_PROC is set, then we can 1119 * enforce that ring_count can't be changed i.e. 1120 * to change fanout type or ring count, the calling 1121 * thread needs to be behind SRS_PROC. 1122 */ 1123 switch (ipha->ipha_protocol) { 1124 case IPPROTO_TCP: 1125 /* 1126 * Note that for ESP, we fanout on SPI and it is at the 1127 * same offset as the 2x16-bit ports. So it is clumped 1128 * along with TCP, UDP and SCTP. 
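 *
 * The 32-bit word read at ports_offset below is therefore either the
 * local/remote port pair or, for ESP, the SPI; both are fed to
 * HASH_ADDR() unchanged.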
1129 */ 1130 hash = HASH_ADDR(ipha->ipha_src, 1131 *(uint32_t *)(mp->b_rptr + ports_offset)); 1132 indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 1133 type = V4_TCP; 1134 mp->b_rptr += hdrsize; 1135 break; 1136 case IPPROTO_UDP: 1137 case IPPROTO_SCTP: 1138 case IPPROTO_ESP: 1139 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 1140 hash = HASH_ADDR(ipha->ipha_src, 1141 *(uint32_t *)(mp->b_rptr + ports_offset)); 1142 indx = COMPUTE_INDEX(hash, 1143 mac_srs->srs_udp_ring_count); 1144 } else { 1145 indx = mac_srs->srs_ind % 1146 mac_srs->srs_udp_ring_count; 1147 mac_srs->srs_ind++; 1148 } 1149 type = V4_UDP; 1150 mp->b_rptr += hdrsize; 1151 break; 1152 default: 1153 indx = 0; 1154 type = OTH; 1155 } 1156 1157 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 1158 cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 1159 } 1160 1161 for (type = V4_TCP; type < UNDEF; type++) { 1162 int i; 1163 1164 for (i = 0; i < fanout_cnt; i++) { 1165 if (headmp[type][i] != NULL) { 1166 mac_soft_ring_t *softring; 1167 1168 ASSERT(tailmp[type][i]->b_next == NULL); 1169 switch (type) { 1170 case V4_TCP: 1171 softring = 1172 mac_srs->srs_tcp_soft_rings[i]; 1173 break; 1174 case V4_UDP: 1175 softring = 1176 mac_srs->srs_udp_soft_rings[i]; 1177 break; 1178 case OTH: 1179 softring = 1180 mac_srs->srs_oth_soft_rings[i]; 1181 break; 1182 } 1183 mac_rx_soft_ring_process(mcip, 1184 softring, headmp[type][i], tailmp[type][i], 1185 cnt[type][i], sz[type][i]); 1186 } 1187 } 1188 } 1189 } 1190 1191 #define SRS_BYTES_TO_PICKUP 150000 1192 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 1193 1194 /* 1195 * mac_rx_srs_poll_ring 1196 * 1197 * This SRS Poll thread uses this routine to poll the underlying hardware 1198 * Rx ring to get a chain of packets. It can inline process that chain 1199 * if mac_latency_optimize is set (default) or signal the SRS worker thread 1200 * to do the remaining processing. 1201 * 1202 * Since packets come in the system via interrupt or poll path, we also 1203 * update the stats and deal with promiscous clients here. 1204 */ 1205 void 1206 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 1207 { 1208 kmutex_t *lock = &mac_srs->srs_lock; 1209 kcondvar_t *async = &mac_srs->srs_cv; 1210 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1211 mblk_t *head, *tail, *mp; 1212 callb_cpr_t cprinfo; 1213 ssize_t bytes_to_pickup; 1214 size_t sz; 1215 int count; 1216 mac_client_impl_t *smcip; 1217 1218 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 1219 mutex_enter(lock); 1220 1221 start: 1222 for (;;) { 1223 if (mac_srs->srs_state & SRS_PAUSE) 1224 goto done; 1225 1226 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1227 cv_wait(async, lock); 1228 CALLB_CPR_SAFE_END(&cprinfo, lock); 1229 1230 if (mac_srs->srs_state & SRS_PAUSE) 1231 goto done; 1232 1233 check_again: 1234 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1235 /* 1236 * We pick as many bytes as we are allowed to queue. 1237 * Its possible that we will exceed the total 1238 * packets queued in case this SRS is part of the 1239 * Rx ring group since > 1 poll thread can be pulling 1240 * upto the max allowed packets at the same time 1241 * but that should be OK. 1242 */ 1243 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1244 bytes_to_pickup = 1245 mac_srs->srs_bw->mac_bw_drop_threshold - 1246 mac_srs->srs_bw->mac_bw_sz; 1247 /* 1248 * We shouldn't have been signalled if we 1249 * have 0 or less bytes to pick but since 1250 * some of the bytes accounting is driver 1251 * dependant, we do the safety check. 
1252 */ 1253 if (bytes_to_pickup < 0) 1254 bytes_to_pickup = 0; 1255 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1256 } else { 1257 /* 1258 * ToDO: Need to change the polling API 1259 * to add a packet count and a flag which 1260 * tells the driver whether we want packets 1261 * based on a count, or bytes, or all the 1262 * packets queued in the driver/HW. This 1263 * way, we never have to check the limits 1264 * on poll path. We truly let only as many 1265 * packets enter the system as we are willing 1266 * to process or queue. 1267 * 1268 * Something along the lines of 1269 * pkts_to_pickup = mac_soft_ring_max_q_cnt - 1270 * mac_srs->srs_poll_pkt_cnt 1271 */ 1272 1273 /* 1274 * Since we are not doing B/W control, pick 1275 * as many packets as allowed. 1276 */ 1277 bytes_to_pickup = max_bytes_to_pickup; 1278 } 1279 1280 /* Poll the underlying Hardware */ 1281 mutex_exit(lock); 1282 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 1283 mutex_enter(lock); 1284 1285 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 1286 SRS_POLL_THR_OWNER); 1287 1288 mp = tail = head; 1289 count = 0; 1290 sz = 0; 1291 while (mp != NULL) { 1292 tail = mp; 1293 sz += msgdsize(mp); 1294 mp = mp->b_next; 1295 count++; 1296 } 1297 1298 if (head != NULL) { 1299 tail->b_next = NULL; 1300 smcip = mac_srs->srs_mcip; 1301 1302 if ((mac_srs->srs_type & SRST_FLOW) || 1303 (smcip == NULL)) { 1304 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1305 rbytes, sz); 1306 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1307 ipackets, count); 1308 } 1309 1310 /* 1311 * If there are any promiscuous mode callbacks 1312 * defined for this MAC client, pass them a copy 1313 * if appropriate and also update the counters. 1314 */ 1315 if (smcip != NULL) { 1316 smcip->mci_stat_ibytes += sz; 1317 smcip->mci_stat_ipackets += count; 1318 1319 if (smcip->mci_mip->mi_promisc_list != NULL) { 1320 mutex_exit(lock); 1321 mac_promisc_dispatch(smcip->mci_mip, 1322 head, NULL); 1323 mutex_enter(lock); 1324 } 1325 } 1326 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1327 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1328 mac_srs->srs_bw->mac_bw_polled += sz; 1329 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1330 } 1331 srs_rx->sr_poll_count += count; 1332 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 1333 count, sz); 1334 if (count <= 10) 1335 srs_rx->sr_chain_cnt_undr10++; 1336 else if (count > 10 && count <= 50) 1337 srs_rx->sr_chain_cnt_10to50++; 1338 else 1339 srs_rx->sr_chain_cnt_over50++; 1340 } 1341 1342 /* 1343 * We are guaranteed that SRS_PROC will be set if we 1344 * are here. Also, poll thread gets to run only if 1345 * the drain was being done by a worker thread although 1346 * its possible that worker thread is still running 1347 * and poll thread was sent down to keep the pipeline 1348 * going instead of doing a complete drain and then 1349 * trying to poll the NIC. 1350 * 1351 * So we need to check SRS_WORKER flag to make sure 1352 * that the worker thread is not processing the queue 1353 * in parallel to us. The flags and conditions are 1354 * protected by the srs_lock to prevent any race. We 1355 * ensure that we don't drop the srs_lock from now 1356 * till the end and similarly we don't drop the srs_lock 1357 * in mac_rx_srs_drain() till similar condition check 1358 * are complete. The mac_rx_srs_drain() needs to ensure 1359 * that SRS_WORKER flag remains set as long as its 1360 * processing the queue. 
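 *
 * The cases handled below, in order: the worker is idle and there is
 * a backlog (drain inline under SRS_LATENCY_OPT, or wake the worker
 * for throughput); nothing is queued anywhere (drop SRS_PROC and
 * SRS_GET_PKTS and, if sr_poll_pkt_cnt is zero, turn polling off so
 * interrupts resume); or the worker is already running (just clear
 * SRS_GET_PKTS and go back to sleep).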
1361 */ 1362 if (!(mac_srs->srs_state & SRS_WORKER) && 1363 (mac_srs->srs_first != NULL)) { 1364 /* 1365 * We have packets to process and worker thread 1366 * is not running. Check to see if poll thread is 1367 * allowed to process. 1368 */ 1369 if (mac_srs->srs_state & SRS_LATENCY_OPT) { 1370 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 1371 if (!(mac_srs->srs_state & SRS_PAUSE) && 1372 srs_rx->sr_poll_pkt_cnt <= 1373 srs_rx->sr_lowat) { 1374 srs_rx->sr_poll_again++; 1375 goto check_again; 1376 } 1377 /* 1378 * We are already above low water mark 1379 * so stay in the polling mode but no 1380 * need to poll. Once we dip below 1381 * the polling threshold, the processing 1382 * thread (soft ring) will signal us 1383 * to poll again (MAC_UPDATE_SRS_COUNT) 1384 */ 1385 srs_rx->sr_poll_drain_no_poll++; 1386 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1387 /* 1388 * In B/W control case, its possible 1389 * that the backlog built up due to 1390 * B/W limit being reached and packets 1391 * are queued only in SRS. In this case, 1392 * we should schedule worker thread 1393 * since no one else will wake us up. 1394 */ 1395 if ((mac_srs->srs_type & SRST_BW_CONTROL) && 1396 (mac_srs->srs_tid == NULL)) { 1397 mac_srs->srs_tid = 1398 timeout(mac_srs_fire, mac_srs, 1); 1399 srs_rx->sr_poll_worker_wakeup++; 1400 } 1401 } else { 1402 /* 1403 * Wakeup the worker thread for more processing. 1404 * We optimize for throughput in this case. 1405 */ 1406 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1407 MAC_SRS_WORKER_WAKEUP(mac_srs); 1408 srs_rx->sr_poll_sig_worker++; 1409 } 1410 } else if ((mac_srs->srs_first == NULL) && 1411 !(mac_srs->srs_state & SRS_WORKER)) { 1412 /* 1413 * There is nothing queued in SRS and 1414 * no worker thread running. Plus we 1415 * didn't get anything from the H/W 1416 * as well (head == NULL); 1417 */ 1418 ASSERT(head == NULL); 1419 mac_srs->srs_state &= 1420 ~(SRS_PROC|SRS_GET_PKTS); 1421 1422 /* 1423 * If we have a packets in soft ring, don't allow 1424 * more packets to come into this SRS by keeping the 1425 * interrupts off but not polling the H/W. The 1426 * poll thread will get signaled as soon as 1427 * srs_poll_pkt_cnt dips below poll threshold. 1428 */ 1429 if (srs_rx->sr_poll_pkt_cnt == 0) { 1430 srs_rx->sr_poll_intr_enable++; 1431 MAC_SRS_POLLING_OFF(mac_srs); 1432 } else { 1433 /* 1434 * We know nothing is queued in SRS 1435 * since we are here after checking 1436 * srs_first is NULL. The backlog 1437 * is entirely due to packets queued 1438 * in Soft ring which will wake us up 1439 * and get the interface out of polling 1440 * mode once the backlog dips below 1441 * sr_poll_thres. 1442 */ 1443 srs_rx->sr_poll_no_poll++; 1444 } 1445 } else { 1446 /* 1447 * Worker thread is already running. 1448 * Nothing much to do. If the polling 1449 * was enabled, worker thread will deal 1450 * with that. 1451 */ 1452 mac_srs->srs_state &= ~SRS_GET_PKTS; 1453 srs_rx->sr_poll_goto_sleep++; 1454 } 1455 } 1456 done: 1457 mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 1458 cv_signal(&mac_srs->srs_async); 1459 /* 1460 * If this is a temporary quiesce then wait for the restart signal 1461 * from the srs worker. Then clear the flags and signal the srs worker 1462 * to ensure a positive handshake and go back to start. 
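 *
 * At this point SRS_POLL_THR_QUIESCED has been set and srs_async has
 * been signalled; the worker answers with either SRS_POLL_THR_RESTART
 * (temporary quiesce, go back to start) or SRS_CONDEMNED (tear down
 * and exit), which is what the loop below distinguishes.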
1463 */ 1464 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 1465 cv_wait(async, lock); 1466 if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 1467 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 1468 mac_srs->srs_state &= 1469 ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 1470 cv_signal(&mac_srs->srs_async); 1471 goto start; 1472 } else { 1473 mac_srs->srs_state |= SRS_POLL_THR_EXITED; 1474 cv_signal(&mac_srs->srs_async); 1475 CALLB_CPR_EXIT(&cprinfo); 1476 thread_exit(); 1477 } 1478 } 1479 1480 /* 1481 * mac_srs_pick_chain 1482 * 1483 * In Bandwidth control case, checks how many packets can be processed 1484 * and return them in a sub chain. 1485 */ 1486 static mblk_t * 1487 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 1488 size_t *chain_sz, int *chain_cnt) 1489 { 1490 mblk_t *head = NULL; 1491 mblk_t *tail = NULL; 1492 size_t sz; 1493 size_t tsz = 0; 1494 int cnt = 0; 1495 mblk_t *mp; 1496 1497 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1498 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1499 if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 1500 mac_srs->srs_bw->mac_bw_limit) || 1501 (mac_srs->srs_bw->mac_bw_limit == 0)) { 1502 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1503 head = mac_srs->srs_first; 1504 mac_srs->srs_first = NULL; 1505 *chain_tail = mac_srs->srs_last; 1506 mac_srs->srs_last = NULL; 1507 *chain_sz = mac_srs->srs_size; 1508 *chain_cnt = mac_srs->srs_count; 1509 mac_srs->srs_count = 0; 1510 mac_srs->srs_size = 0; 1511 return (head); 1512 } 1513 1514 /* 1515 * Can't clear the entire backlog. 1516 * Need to find how many packets to pick 1517 */ 1518 ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 1519 while ((mp = mac_srs->srs_first) != NULL) { 1520 sz = msgdsize(mp); 1521 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 1522 mac_srs->srs_bw->mac_bw_limit) { 1523 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 1524 mac_srs->srs_bw->mac_bw_state |= 1525 SRS_BW_ENFORCED; 1526 break; 1527 } 1528 1529 /* 1530 * The _size & cnt is decremented from the softrings 1531 * when they send up the packet for polling to work 1532 * properly. 1533 */ 1534 tsz += sz; 1535 cnt++; 1536 mac_srs->srs_count--; 1537 mac_srs->srs_size -= sz; 1538 if (tail != NULL) 1539 tail->b_next = mp; 1540 else 1541 head = mp; 1542 tail = mp; 1543 mac_srs->srs_first = mac_srs->srs_first->b_next; 1544 } 1545 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1546 if (mac_srs->srs_first == NULL) 1547 mac_srs->srs_last = NULL; 1548 1549 if (tail != NULL) 1550 tail->b_next = NULL; 1551 *chain_tail = tail; 1552 *chain_cnt = cnt; 1553 *chain_sz = tsz; 1554 1555 return (head); 1556 } 1557 1558 /* 1559 * mac_rx_srs_drain 1560 * 1561 * The SRS drain routine. Gets to run to clear the queue. Any thread 1562 * (worker, interrupt, poll) can call this based on processing model. 1563 * The first thing we do is disable interrupts if possible and then 1564 * drain the queue. we also try to poll the underlying hardware if 1565 * there is a dedicated hardware Rx ring assigned to this SRS. 1566 * 1567 * There is a equivalent drain routine in bandwidth control mode 1568 * mac_rx_srs_drain_bw. There is some code duplication between the two 1569 * routines but they are highly performance sensitive and are easier 1570 * to read/debug if they stay separate. Any code changes here might 1571 * also apply to mac_rx_srs_drain_bw as well. 
1572 */ 1573 void 1574 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1575 { 1576 mblk_t *head; 1577 mblk_t *tail; 1578 timeout_id_t tid; 1579 int cnt = 0; 1580 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1581 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1582 1583 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1584 ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); 1585 1586 /* If we are blanked i.e. can't do upcalls, then we are done */ 1587 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1588 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1589 (mac_srs->srs_state & SRS_PAUSE)); 1590 goto out; 1591 } 1592 1593 if (mac_srs->srs_first == NULL) 1594 goto out; 1595 1596 if (!(mac_srs->srs_state & SRS_LATENCY_OPT) && 1597 (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) { 1598 /* 1599 * In the normal case, the SRS worker thread does no 1600 * work and we wait for a backlog to build up before 1601 * we switch into polling mode. In case we are 1602 * optimizing for throughput, we use the worker thread 1603 * as well. The goal is to let worker thread process 1604 * the queue and poll thread to feed packets into 1605 * the queue. As such, we should signal the poll 1606 * thread to try and get more packets. 1607 * 1608 * We could have pulled this check in the POLL_RING 1609 * macro itself but keeping it explicit here makes 1610 * the architecture more human understandable. 1611 */ 1612 MAC_SRS_POLL_RING(mac_srs); 1613 } 1614 1615 again: 1616 head = mac_srs->srs_first; 1617 mac_srs->srs_first = NULL; 1618 tail = mac_srs->srs_last; 1619 mac_srs->srs_last = NULL; 1620 cnt = mac_srs->srs_count; 1621 mac_srs->srs_count = 0; 1622 1623 ASSERT(head != NULL); 1624 ASSERT(tail != NULL); 1625 1626 if ((tid = mac_srs->srs_tid) != 0) 1627 mac_srs->srs_tid = 0; 1628 1629 mac_srs->srs_state |= (SRS_PROC|proc_type); 1630 1631 1632 /* 1633 * mcip is NULL for broadcast and multicast flows. The promisc 1634 * callbacks for broadcast and multicast packets are delivered from 1635 * mac_rx() and we don't need to worry about that case in this path 1636 */ 1637 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1638 mutex_exit(&mac_srs->srs_lock); 1639 mac_promisc_client_dispatch(mcip, head); 1640 mutex_enter(&mac_srs->srs_lock); 1641 } 1642 1643 /* 1644 * Check if SRS itself is doing the processing 1645 * This direct path does not apply when subflows are present. In this 1646 * case, packets need to be dispatched to a soft ring according to the 1647 * flow's bandwidth and other resources contraints. 1648 */ 1649 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1650 mac_direct_rx_t proc; 1651 void *arg1; 1652 mac_resource_handle_t arg2; 1653 1654 /* 1655 * This is the case when a Rx is directly 1656 * assigned and we have a fully classified 1657 * protocol chain. We can deal with it in 1658 * one shot. 1659 */ 1660 proc = srs_rx->sr_func; 1661 arg1 = srs_rx->sr_arg1; 1662 arg2 = srs_rx->sr_arg2; 1663 1664 mac_srs->srs_state |= SRS_CLIENT_PROC; 1665 mutex_exit(&mac_srs->srs_lock); 1666 if (tid != 0) { 1667 (void) untimeout(tid); 1668 tid = 0; 1669 } 1670 1671 proc(arg1, arg2, head, NULL); 1672 /* 1673 * Decrement the size and count here itelf 1674 * since the packet has been processed. 
1675 */ 1676 mutex_enter(&mac_srs->srs_lock); 1677 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1678 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1679 cv_signal(&mac_srs->srs_client_cv); 1680 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1681 } else { 1682 /* Some kind of softrings based fanout is required */ 1683 mutex_exit(&mac_srs->srs_lock); 1684 if (tid != 0) { 1685 (void) untimeout(tid); 1686 tid = 0; 1687 } 1688 1689 /* 1690 * Since the fanout routines can deal with chains, 1691 * shoot the entire chain up. 1692 */ 1693 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1694 mac_rx_srs_fanout(mac_srs, head); 1695 else 1696 mac_rx_srs_proto_fanout(mac_srs, head); 1697 mutex_enter(&mac_srs->srs_lock); 1698 } 1699 1700 if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) && 1701 (mac_srs->srs_first != NULL)) { 1702 /* 1703 * More packets arrived while we were clearing the 1704 * SRS. This can be possible because of one of 1705 * three conditions below: 1706 * 1) The driver is using multiple worker threads 1707 * to send the packets to us. 1708 * 2) The driver has a race in switching 1709 * between interrupt and polling mode or 1710 * 3) Packets are arriving in this SRS via the 1711 * S/W classification as well. 1712 * 1713 * We should switch to polling mode and see if we 1714 * need to send the poll thread down. Also, signal 1715 * the worker thread to process whats just arrived. 1716 */ 1717 MAC_SRS_POLLING_ON(mac_srs); 1718 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { 1719 srs_rx->sr_drain_poll_sig++; 1720 MAC_SRS_POLL_RING(mac_srs); 1721 } 1722 1723 /* 1724 * If we didn't signal the poll thread, we need 1725 * to deal with the pending packets ourselves. 1726 */ 1727 if (proc_type == SRS_WORKER) { 1728 srs_rx->sr_drain_again++; 1729 goto again; 1730 } else { 1731 srs_rx->sr_drain_worker_sig++; 1732 cv_signal(&mac_srs->srs_async); 1733 } 1734 } 1735 1736 out: 1737 if (mac_srs->srs_state & SRS_GET_PKTS) { 1738 /* 1739 * Poll thread is already running. Leave the 1740 * SRS_RPOC set and hand over the control to 1741 * poll thread. 1742 */ 1743 mac_srs->srs_state &= ~proc_type; 1744 srs_rx->sr_drain_poll_running++; 1745 return; 1746 } 1747 1748 /* 1749 * Even if there are no packets queued in SRS, we 1750 * need to make sure that the shared counter is 1751 * clear and any associated softrings have cleared 1752 * all the backlog. Otherwise, leave the interface 1753 * in polling mode and the poll thread will get 1754 * signalled once the count goes down to zero. 1755 * 1756 * If someone is already draining the queue (SRS_PROC is 1757 * set) when the srs_poll_pkt_cnt goes down to zero, 1758 * then it means that drain is already running and we 1759 * will turn off polling at that time if there is 1760 * no backlog. 1761 * 1762 * As long as there are packets queued either 1763 * in soft ring set or its soft rings, we will leave 1764 * the interface in polling mode (even if the drain 1765 * was done being the interrupt thread). We signal 1766 * the poll thread as well if we have dipped below 1767 * low water mark. 1768 * 1769 * NOTE: We can't use the MAC_SRS_POLLING_ON macro 1770 * since that turn polling on only for worker thread. 1771 * Its not worth turning polling on for interrupt 1772 * thread (since NIC will not issue another interrupt) 1773 * unless a backlog builds up. 
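 *
 * So the check below keeps the ring in poll mode whenever the shared
 * sr_poll_pkt_cnt is still non-zero, and additionally signals the
 * poll thread once we have dipped to or below sr_lowat.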
1774 */ 1775 if ((srs_rx->sr_poll_pkt_cnt > 0) && 1776 (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 1777 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1778 srs_rx->sr_drain_keep_polling++; 1779 MAC_SRS_POLLING_ON(mac_srs); 1780 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1781 MAC_SRS_POLL_RING(mac_srs); 1782 return; 1783 } 1784 1785 /* Nothing else to do. Get out of poll mode */ 1786 MAC_SRS_POLLING_OFF(mac_srs); 1787 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1788 srs_rx->sr_drain_finish_intr++; 1789 } 1790 1791 /* 1792 * mac_rx_srs_drain_bw 1793 * 1794 * The SRS BW drain routine. Gets to run to clear the queue. Any thread 1795 * (worker, interrupt, poll) can call this based on processing model. 1796 * The first thing we do is disable interrupts if possible and then 1797 * drain the queue. we also try to poll the underlying hardware if 1798 * there is a dedicated hardware Rx ring assigned to this SRS. 1799 * 1800 * There is a equivalent drain routine in non bandwidth control mode 1801 * mac_rx_srs_drain. There is some code duplication between the two 1802 * routines but they are highly performance sensitive and are easier 1803 * to read/debug if they stay separate. Any code changes here might 1804 * also apply to mac_rx_srs_drain as well. 1805 */ 1806 void 1807 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1808 { 1809 mblk_t *head; 1810 mblk_t *tail; 1811 timeout_id_t tid; 1812 size_t sz = 0; 1813 int cnt = 0; 1814 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1815 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1816 1817 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1818 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 1819 again: 1820 /* Check if we are doing B/W control */ 1821 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1822 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 1823 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 1824 mac_srs->srs_bw->mac_bw_used = 0; 1825 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 1826 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 1827 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 1828 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1829 goto done; 1830 } else if (mac_srs->srs_bw->mac_bw_used > 1831 mac_srs->srs_bw->mac_bw_limit) { 1832 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 1833 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1834 goto done; 1835 } 1836 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1837 1838 /* If we are blanked i.e. can't do upcalls, then we are done */ 1839 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1840 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1841 (mac_srs->srs_state & SRS_PAUSE)); 1842 goto done; 1843 } 1844 1845 sz = 0; 1846 cnt = 0; 1847 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 1848 /* 1849 * We couldn't pick up a single packet. 1850 */ 1851 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1852 if ((mac_srs->srs_bw->mac_bw_used == 0) && 1853 (mac_srs->srs_size != 0) && 1854 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1855 /* 1856 * Seems like configured B/W doesn't 1857 * even allow processing of 1 packet 1858 * per tick. 1859 * 1860 * XXX: raise the limit to processing 1861 * at least 1 packet per tick. 
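 *
 * The stopgap below simply doubles mac_bw_limit and
 * mac_bw_drop_threshold and logs a notice, so that at least one
 * packet per tick can eventually be processed.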
1862 */ 1863 mac_srs->srs_bw->mac_bw_limit += 1864 mac_srs->srs_bw->mac_bw_limit; 1865 mac_srs->srs_bw->mac_bw_drop_threshold += 1866 mac_srs->srs_bw->mac_bw_drop_threshold; 1867 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1868 "raised B/W limit to %d since not even a " 1869 "single packet can be processed per " 1870 "tick %d\n", (void *)mac_srs, 1871 (int)mac_srs->srs_bw->mac_bw_limit, 1872 (int)msgdsize(mac_srs->srs_first)); 1873 } 1874 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1875 goto done; 1876 } 1877 1878 ASSERT(head != NULL); 1879 ASSERT(tail != NULL); 1880 1881 /* zero bandwidth: drop all and return to interrupt mode */ 1882 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1883 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1884 srs_rx->sr_drop_count += cnt; 1885 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1886 mac_srs->srs_bw->mac_bw_sz -= sz; 1887 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1888 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1889 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1890 goto leave_poll; 1891 } else { 1892 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1893 } 1894 1895 if ((tid = mac_srs->srs_tid) != 0) 1896 mac_srs->srs_tid = 0; 1897 1898 mac_srs->srs_state |= (SRS_PROC|proc_type); 1899 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1900 1901 /* 1902 * mcip is NULL for broadcast and multicast flows. The promisc 1903 * callbacks for broadcast and multicast packets are delivered from 1904 * mac_rx() and we don't need to worry about that case in this path 1905 */ 1906 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1907 mutex_exit(&mac_srs->srs_lock); 1908 mac_promisc_client_dispatch(mcip, head); 1909 mutex_enter(&mac_srs->srs_lock); 1910 } 1911 1912 /* 1913 * Check if SRS itself is doing the processing 1914 * This direct path does not apply when subflows are present. In this 1915 * case, packets need to be dispatched to a soft ring according to the 1916 * flow's bandwidth and other resources contraints. 1917 */ 1918 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1919 mac_direct_rx_t proc; 1920 void *arg1; 1921 mac_resource_handle_t arg2; 1922 1923 /* 1924 * This is the case when a Rx is directly 1925 * assigned and we have a fully classified 1926 * protocol chain. We can deal with it in 1927 * one shot. 1928 */ 1929 proc = srs_rx->sr_func; 1930 arg1 = srs_rx->sr_arg1; 1931 arg2 = srs_rx->sr_arg2; 1932 1933 mac_srs->srs_state |= SRS_CLIENT_PROC; 1934 mutex_exit(&mac_srs->srs_lock); 1935 if (tid != 0) { 1936 (void) untimeout(tid); 1937 tid = 0; 1938 } 1939 1940 proc(arg1, arg2, head, NULL); 1941 /* 1942 * Decrement the size and count here itelf 1943 * since the packet has been processed. 1944 */ 1945 mutex_enter(&mac_srs->srs_lock); 1946 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1947 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 1948 1949 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1950 cv_signal(&mac_srs->srs_client_cv); 1951 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1952 } else { 1953 /* Some kind of softrings based fanout is required */ 1954 mutex_exit(&mac_srs->srs_lock); 1955 if (tid != 0) { 1956 (void) untimeout(tid); 1957 tid = 0; 1958 } 1959 1960 /* 1961 * Since the fanout routines can deal with chains, 1962 * shoot the entire chain up. 1963 */ 1964 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1965 mac_rx_srs_fanout(mac_srs, head); 1966 else 1967 mac_rx_srs_proto_fanout(mac_srs, head); 1968 mutex_enter(&mac_srs->srs_lock); 1969 } 1970 1971 /* 1972 * Send the poll thread to pick up any packets arrived 1973 * so far. 
This also serves as the last check in case
1974 * nothing else is queued in the SRS. The poll thread
1975 * is signalled only in the case the drain was done
1976 * by the worker thread and SRS_WORKER is set. The
1977 * worker thread can run in parallel as long as the
1978 * SRS_WORKER flag is set. When we have nothing else to
1979 * process, we can exit while leaving SRS_PROC set
1980 * which gives the poll thread control to process and
1981 * cleanup once it returns from the NIC.
1982 *
1983 * If we have nothing else to process, we need to
1984 * ensure that we keep holding the srs_lock till
1985 * all the checks below are done and control is
1986 * handed to the poll thread if it was running.
1987 */
1988 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1989 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1990 if (mac_srs->srs_first != NULL) {
1991 if (proc_type == SRS_WORKER) {
1992 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1993 if (srs_rx->sr_poll_pkt_cnt <=
1994 srs_rx->sr_lowat)
1995 MAC_SRS_POLL_RING(mac_srs);
1996 goto again;
1997 } else {
1998 cv_signal(&mac_srs->srs_async);
1999 }
2000 }
2001 }
2002 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2003
2004 done:
2005
2006 if (mac_srs->srs_state & SRS_GET_PKTS) {
2007 /*
2008 * Poll thread is already running. Leave the
2009 * SRS_PROC set and hand over the control to
2010 * poll thread.
2011 */
2012 mac_srs->srs_state &= ~proc_type;
2013 return;
2014 }
2015
2016 /*
2017 * If we can't process packets because we have exceeded
2018 * the B/W limit for this tick, just set the timeout
2019 * and leave.
2020 *
2021 * Even if there are no packets queued in SRS, we
2022 * need to make sure that the shared counter is
2023 * clear and any associated softrings have cleared
2024 * all the backlog. Otherwise, leave the interface
2025 * in polling mode and the poll thread will get
2026 * signalled once the count goes down to zero.
2027 *
2028 * If someone is already draining the queue (SRS_PROC is
2029 * set) when the srs_poll_pkt_cnt goes down to zero,
2030 * then it means that drain is already running and we
2031 * will turn off polling at that time if there is
2032 * no backlog. As long as there are packets queued either
2033 * in the soft ring set or its soft rings, we will leave
2034 * the interface in polling mode.
2035 */
2036 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2037 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2038 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2039 (srs_rx->sr_poll_pkt_cnt > 0))) {
2040 MAC_SRS_POLLING_ON(mac_srs);
2041 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2042 if ((mac_srs->srs_first != NULL) &&
2043 (mac_srs->srs_tid == NULL))
2044 mac_srs->srs_tid = timeout(mac_srs_fire,
2045 mac_srs, 1);
2046 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2047 return;
2048 }
2049 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2050
2051 leave_poll:
2052
2053 /* Nothing else to do. Get out of poll mode */
2054 MAC_SRS_POLLING_OFF(mac_srs);
2055 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2056 }
2057
2058 /*
2059 * mac_srs_worker
2060 *
2061 * The SRS worker routine. Drains the queue when no one else is
2062 * processing it.
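*
* In outline (illustration only; the CPR handling, pause/restart and
* bandwidth-timer details are in the code below), the worker does:
*
*     for (;;) {
*         while (SRS is being processed (SRS_PROC), has nothing queued,
*             or is bandwidth-enforced)
*                 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
*         mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
*     }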
2063 */ 2064 void 2065 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2066 { 2067 kmutex_t *lock = &mac_srs->srs_lock; 2068 kcondvar_t *async = &mac_srs->srs_async; 2069 callb_cpr_t cprinfo; 2070 boolean_t bw_ctl_flag; 2071 2072 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2073 mutex_enter(lock); 2074 2075 start: 2076 for (;;) { 2077 bw_ctl_flag = B_FALSE; 2078 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2079 MAC_SRS_BW_LOCK(mac_srs); 2080 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2081 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2082 bw_ctl_flag = B_TRUE; 2083 MAC_SRS_BW_UNLOCK(mac_srs); 2084 } 2085 /* 2086 * The SRS_BW_ENFORCED flag may change since we have dropped 2087 * the mac_bw_lock. However the drain function can handle both 2088 * a drainable SRS or a bandwidth controlled SRS, and the 2089 * effect of scheduling a timeout is to wakeup the worker 2090 * thread which in turn will call the drain function. Since 2091 * we release the srs_lock atomically only in the cv_wait there 2092 * isn't a fear of waiting for ever. 2093 */ 2094 while (((mac_srs->srs_state & SRS_PROC) || 2095 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2096 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2097 !(mac_srs->srs_state & SRS_PAUSE)) { 2098 /* 2099 * If we have packets queued and we are here 2100 * because B/W control is in place, we better 2101 * schedule the worker wakeup after 1 tick 2102 * to see if bandwidth control can be relaxed. 2103 */ 2104 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2105 /* 2106 * We need to ensure that a timer is already 2107 * scheduled or we force schedule one for 2108 * later so that we can continue processing 2109 * after this quanta is over. 2110 */ 2111 mac_srs->srs_tid = timeout(mac_srs_fire, 2112 mac_srs, 1); 2113 } 2114 wait: 2115 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2116 cv_wait(async, lock); 2117 CALLB_CPR_SAFE_END(&cprinfo, lock); 2118 2119 if (mac_srs->srs_state & SRS_PAUSE) 2120 goto done; 2121 if (mac_srs->srs_state & SRS_PROC) 2122 goto wait; 2123 2124 if (mac_srs->srs_first != NULL && 2125 mac_srs->srs_type & SRST_BW_CONTROL) { 2126 MAC_SRS_BW_LOCK(mac_srs); 2127 if (mac_srs->srs_bw->mac_bw_state & 2128 SRS_BW_ENFORCED) { 2129 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2130 } 2131 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2132 SRS_BW_ENFORCED; 2133 MAC_SRS_BW_UNLOCK(mac_srs); 2134 } 2135 } 2136 2137 if (mac_srs->srs_state & SRS_PAUSE) 2138 goto done; 2139 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2140 } 2141 done: 2142 /* 2143 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2144 * from both hard and soft classifications and waits for such threads 2145 * to finish before signaling the worker. So at this point the only 2146 * thread left that could be competing with the worker is the poll 2147 * thread. In the case of Tx, there shouldn't be any thread holding 2148 * SRS_PROC at this point. 
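*
* The hand-off that follows, in outline (illustration only):
*
*     mac_srs_worker_quiesce(mac_srs);
*     wait for SRS_RESTART or SRS_CONDEMNED from the initiator;
*     if SRS_RESTART was set:
*         mac_srs_worker_restart(mac_srs) and resume the main loop;
*     else:
*         quiesce once more (unless SRS_CONDEMNED_DONE is already set)
*         and thread_exit();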
2149 */ 2150 if (!(mac_srs->srs_state & SRS_PROC)) { 2151 mac_srs->srs_state |= SRS_PROC; 2152 } else { 2153 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2154 /* 2155 * Poll thread still owns the SRS and is still running 2156 */ 2157 ASSERT((mac_srs->srs_poll_thr == NULL) || 2158 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2159 SRS_POLL_THR_OWNER)); 2160 } 2161 mac_srs_worker_quiesce(mac_srs); 2162 /* 2163 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2164 * of the quiesce operation 2165 */ 2166 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2167 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2168 2169 if (mac_srs->srs_state & SRS_RESTART) { 2170 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2171 mac_srs_worker_restart(mac_srs); 2172 mac_srs->srs_state &= ~SRS_PROC; 2173 goto start; 2174 } 2175 2176 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2177 mac_srs_worker_quiesce(mac_srs); 2178 2179 mac_srs->srs_state &= ~SRS_PROC; 2180 /* The macro drops the srs_lock */ 2181 CALLB_CPR_EXIT(&cprinfo); 2182 thread_exit(); 2183 } 2184 2185 /* 2186 * mac_rx_srs_subflow_process 2187 * 2188 * Receive side routine called from interrupt path when there are 2189 * sub flows present on this SRS. 2190 */ 2191 /* ARGSUSED */ 2192 void 2193 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2194 mblk_t *mp_chain, boolean_t loopback) 2195 { 2196 flow_entry_t *flent = NULL; 2197 flow_entry_t *prev_flent = NULL; 2198 mblk_t *mp = NULL; 2199 mblk_t *tail = NULL; 2200 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2201 mac_client_impl_t *mcip; 2202 2203 mcip = mac_srs->srs_mcip; 2204 ASSERT(mcip != NULL); 2205 2206 /* 2207 * We need to determine the SRS for every packet 2208 * by walking the flow table, if we don't get any, 2209 * then we proceed using the SRS we came with. 2210 */ 2211 mp = tail = mp_chain; 2212 while (mp != NULL) { 2213 2214 /* 2215 * We will increment the stats for the mactching subflow. 2216 * when we get the bytes/pkt count for the classified packets 2217 * later in mac_rx_srs_process. 2218 */ 2219 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2220 FLOW_INBOUND, &flent); 2221 2222 if (mp == mp_chain || flent == prev_flent) { 2223 if (prev_flent != NULL) 2224 FLOW_REFRELE(prev_flent); 2225 prev_flent = flent; 2226 flent = NULL; 2227 tail = mp; 2228 mp = mp->b_next; 2229 continue; 2230 } 2231 tail->b_next = NULL; 2232 /* 2233 * A null indicates, this is for the mac_srs itself. 2234 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2235 */ 2236 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2237 mac_rx_srs_process(arg, 2238 (mac_resource_handle_t)mac_srs, mp_chain, 2239 loopback); 2240 } else { 2241 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2242 prev_flent->fe_cb_arg2, mp_chain, loopback); 2243 FLOW_REFRELE(prev_flent); 2244 } 2245 prev_flent = flent; 2246 flent = NULL; 2247 mp_chain = mp; 2248 tail = mp; 2249 mp = mp->b_next; 2250 } 2251 /* Last chain */ 2252 ASSERT(mp_chain != NULL); 2253 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2254 mac_rx_srs_process(arg, 2255 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2256 } else { 2257 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2258 prev_flent->fe_cb_arg2, mp_chain, loopback); 2259 FLOW_REFRELE(prev_flent); 2260 } 2261 } 2262 2263 /* 2264 * mac_rx_srs_process 2265 * 2266 * Receive side routine called from the interrupt path. 2267 * 2268 * loopback is set to force a context switch on the loopback 2269 * path between MAC clients. 
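*
* For example, mac_rx_srs_subflow_process() above hands a classified
* sub-chain back to this routine as:
*
*     mac_rx_srs_process(arg, (mac_resource_handle_t)mac_srs,
*         mp_chain, loopback);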
2270 */ 2271 /* ARGSUSED */ 2272 void 2273 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2274 boolean_t loopback) 2275 { 2276 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2277 mblk_t *mp, *tail, *head; 2278 int count = 0; 2279 int count1; 2280 size_t sz = 0; 2281 size_t chain_sz, sz1; 2282 mac_bw_ctl_t *mac_bw; 2283 mac_client_impl_t *smcip; 2284 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2285 2286 /* 2287 * Set the tail, count and sz. We set the sz irrespective 2288 * of whether we are doing B/W control or not for the 2289 * purpose of updating the stats. 2290 */ 2291 mp = tail = mp_chain; 2292 while (mp != NULL) { 2293 tail = mp; 2294 count++; 2295 sz += msgdsize(mp); 2296 mp = mp->b_next; 2297 } 2298 2299 mutex_enter(&mac_srs->srs_lock); 2300 smcip = mac_srs->srs_mcip; 2301 2302 if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) { 2303 FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz); 2304 FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count); 2305 } 2306 if (smcip != NULL) { 2307 smcip->mci_stat_ibytes += sz; 2308 smcip->mci_stat_ipackets += count; 2309 } 2310 2311 /* 2312 * If the SRS in already being processed; has been blanked; 2313 * can be processed by worker thread only; or the B/W limit 2314 * has been reached, then queue the chain and check if 2315 * worker thread needs to be awakend. 2316 */ 2317 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2318 mac_bw = mac_srs->srs_bw; 2319 ASSERT(mac_bw != NULL); 2320 mutex_enter(&mac_bw->mac_bw_lock); 2321 /* Count the packets and bytes via interrupt */ 2322 srs_rx->sr_intr_count += count; 2323 mac_bw->mac_bw_intr += sz; 2324 if (mac_bw->mac_bw_limit == 0) { 2325 /* zero bandwidth: drop all */ 2326 srs_rx->sr_drop_count += count; 2327 mac_bw->mac_bw_drop_bytes += sz; 2328 mutex_exit(&mac_bw->mac_bw_lock); 2329 mutex_exit(&mac_srs->srs_lock); 2330 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2331 return; 2332 } else { 2333 if ((mac_bw->mac_bw_sz + sz) <= 2334 mac_bw->mac_bw_drop_threshold) { 2335 mutex_exit(&mac_bw->mac_bw_lock); 2336 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2337 tail, count, sz); 2338 } else { 2339 mp = mp_chain; 2340 chain_sz = 0; 2341 count1 = 0; 2342 tail = NULL; 2343 head = NULL; 2344 while (mp != NULL) { 2345 sz1 = msgdsize(mp); 2346 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2347 mac_bw->mac_bw_drop_threshold) 2348 break; 2349 chain_sz += sz1; 2350 count1++; 2351 tail = mp; 2352 mp = mp->b_next; 2353 } 2354 mutex_exit(&mac_bw->mac_bw_lock); 2355 if (tail != NULL) { 2356 head = tail->b_next; 2357 tail->b_next = NULL; 2358 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2359 mp_chain, tail, count1, chain_sz); 2360 sz -= chain_sz; 2361 count -= count1; 2362 } else { 2363 /* Can't pick up any */ 2364 head = mp_chain; 2365 } 2366 if (head != NULL) { 2367 /* Drop any packet over the threshold */ 2368 srs_rx->sr_drop_count += count; 2369 mutex_enter(&mac_bw->mac_bw_lock); 2370 mac_bw->mac_bw_drop_bytes += sz; 2371 mutex_exit(&mac_bw->mac_bw_lock); 2372 freemsgchain(head); 2373 } 2374 } 2375 MAC_SRS_WORKER_WAKEUP(mac_srs); 2376 mutex_exit(&mac_srs->srs_lock); 2377 return; 2378 } 2379 } 2380 2381 /* 2382 * If the total number of packets queued in the SRS and 2383 * its associated soft rings exceeds the max allowed, 2384 * then drop the chain. If we are polling capable, this 2385 * shouldn't be happening. 
2386 */ 2387 if (!(mac_srs->srs_type & SRST_BW_CONTROL) && 2388 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) { 2389 mac_bw = mac_srs->srs_bw; 2390 srs_rx->sr_drop_count += count; 2391 mutex_enter(&mac_bw->mac_bw_lock); 2392 mac_bw->mac_bw_drop_bytes += sz; 2393 mutex_exit(&mac_bw->mac_bw_lock); 2394 freemsgchain(mp_chain); 2395 mutex_exit(&mac_srs->srs_lock); 2396 return; 2397 } 2398 2399 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz); 2400 /* Count the packets entering via interrupt path */ 2401 srs_rx->sr_intr_count += count; 2402 2403 if (!(mac_srs->srs_state & SRS_PROC)) { 2404 /* 2405 * If we are coming via loopback or if we are not 2406 * optimizing for latency, we should signal the 2407 * worker thread. 2408 */ 2409 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) { 2410 /* 2411 * For loopback, We need to let the worker take 2412 * over as we don't want to continue in the same 2413 * thread even if we can. This could lead to stack 2414 * overflows and may also end up using 2415 * resources (cpu) incorrectly. 2416 */ 2417 cv_signal(&mac_srs->srs_async); 2418 } else { 2419 /* 2420 * Seems like no one is processing the SRS and 2421 * there is no backlog. We also inline process 2422 * our packet if its a single packet in non 2423 * latency optimized case (in latency optimized 2424 * case, we inline process chains of any size). 2425 */ 2426 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST); 2427 } 2428 } 2429 mutex_exit(&mac_srs->srs_lock); 2430 } 2431 2432 /* TX SIDE ROUTINES (RUNTIME) */ 2433 2434 /* 2435 * mac_tx_srs_no_desc 2436 * 2437 * This routine is called by Tx single ring default mode 2438 * when Tx ring runs out of descs. 2439 */ 2440 mac_tx_cookie_t 2441 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2442 uint16_t flag, mblk_t **ret_mp) 2443 { 2444 mac_tx_cookie_t cookie = NULL; 2445 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2446 boolean_t wakeup_worker = B_TRUE; 2447 uint32_t tx_mode = srs_tx->st_mode; 2448 int cnt, sz; 2449 mblk_t *tail; 2450 2451 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2452 if (flag & MAC_DROP_ON_NO_DESC) { 2453 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2454 } else { 2455 if (mac_srs->srs_first != NULL) 2456 wakeup_worker = B_FALSE; 2457 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2458 if (flag & MAC_TX_NO_ENQUEUE) { 2459 /* 2460 * If TX_QUEUED is not set, queue the 2461 * packet and let mac_tx_srs_drain() 2462 * set the TX_BLOCKED bit for the 2463 * reasons explained above. Otherwise, 2464 * return the mblks. 2465 */ 2466 if (wakeup_worker) { 2467 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2468 mp_chain, tail, cnt, sz); 2469 } else { 2470 MAC_TX_SET_NO_ENQUEUE(mac_srs, 2471 mp_chain, ret_mp, cookie); 2472 } 2473 } else { 2474 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2475 tail, cnt, sz, cookie); 2476 } 2477 if (wakeup_worker) 2478 cv_signal(&mac_srs->srs_async); 2479 } 2480 return (cookie); 2481 } 2482 2483 /* 2484 * mac_tx_srs_enqueue 2485 * 2486 * This routine is called when Tx SRS is operating in either serializer 2487 * or bandwidth mode. In serializer mode, a packet will get enqueued 2488 * when a thread cannot enter SRS exclusively. In bandwidth mode, 2489 * packets gets queued if allowed byte-count limit for a tick is 2490 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and 2491 * MAC_TX_NO_ENQUEUE is set is different than when operaing in either 2492 * the default mode or fanout mode. 
Here packets get dropped or
2493 * returned back to the caller only after hi-watermark worth of data
2494 * is queued.
2495 */
2496 static mac_tx_cookie_t
2497 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2498 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2499 {
2500 mac_tx_cookie_t cookie = NULL;
2501 int cnt, sz;
2502 mblk_t *tail;
2503 boolean_t wakeup_worker = B_TRUE;
2504
2505 /*
2506 * Ignore fanout hint if we don't have multiple tx rings.
2507 */
2508 if (!TX_MULTI_RING_MODE(mac_srs))
2509 fanout_hint = 0;
2510
2511 if (mac_srs->srs_first != NULL)
2512 wakeup_worker = B_FALSE;
2513 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2514 if (flag & MAC_DROP_ON_NO_DESC) {
2515 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2516 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2517 } else {
2518 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2519 mp_chain, tail, cnt, sz);
2520 }
2521 } else if (flag & MAC_TX_NO_ENQUEUE) {
2522 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2523 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2524 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2525 ret_mp, cookie);
2526 } else {
2527 mp_chain->b_prev = (mblk_t *)fanout_hint;
2528 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2529 mp_chain, tail, cnt, sz);
2530 }
2531 } else {
2532 /*
2533 * If you are BW_ENFORCED, just enqueue the
2534 * packet. srs_worker will drain it at the
2535 * prescribed rate. Before enqueueing, save
2536 * the fanout hint.
2537 */
2538 mp_chain->b_prev = (mblk_t *)fanout_hint;
2539 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2540 tail, cnt, sz, cookie);
2541 }
2542 if (wakeup_worker)
2543 cv_signal(&mac_srs->srs_async);
2544 return (cookie);
2545 }
2546
2547 /*
2548 * There are five tx modes:
2549 *
2550 * 1) Default mode (SRS_TX_DEFAULT)
2551 * 2) Serialization mode (SRS_TX_SERIALIZE)
2552 * 3) Fanout mode (SRS_TX_FANOUT)
2553 * 4) Bandwidth mode (SRS_TX_BW)
2554 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2555 *
2556 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2557 * based on the number of Tx rings requested for an SRS and whether
2558 * bandwidth control is requested or not.
2559 *
2560 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
2561 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
2562 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
2563 * When flow-control is relieved, the srs_worker drains the queued
2564 * packets and informs blocked clients to restart sending packets.
2565 *
2566 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
2567 *
2568 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2569 * Tx rings. Each Tx ring will have a soft ring associated with it.
2570 * These soft rings will be hung off the Tx SRS. Any queueing that happens
2571 * due to lack of Tx descs will be in the individual soft ring (and not the
2572 * SRS) associated with that Tx ring.
2573 *
2574 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2575 * only if bw is available. Otherwise the packets will be queued in
2576 * SRS. If fanout to multiple Tx rings is configured, the packets will
2577 * be fanned out among the soft rings associated with the Tx rings.
2578 *
2579 * Three flags are used in srs_state for indicating flow control
2580 * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2581 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2582 * driver below.
2583 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2584 * and flow-control pressure is applied back to clients. The clients expect
2585 * a wakeup when flow-control is relieved.
2586 * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and the mblk
2587 * gets returned back to the client either due to lack of Tx descs or due to bw
2588 * control reasons. The clients expect a wakeup when the condition is relieved.
2589 *
2590 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2591 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2592 * MAC_TX_NO_ENQUEUE
2593 * Mac clients that do not want packets to be enqueued in the mac layer set
2594 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2595 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2596 * behaviour of this flag is different when the Tx is running in serializer
2597 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packets
2598 * get dropped when the Tx high watermark is reached.
2599 * There are some mac clients like vsw, aggr that want the mblks to be
2600 * returned back to clients instead of being queued in Tx SRS (or Tx soft
2601 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2602 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2603 * In the default and Tx fanout mode, the un-transmitted mblks will be
2604 * returned back to the clients when the driver runs out of Tx descs.
2605 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2606 * soft ring) so that the clients can be woken up when Tx descs become
2607 * available. When running in serializer or bandwidth mode,
2608 * SRS_TX_WAKEUP_CLIENT will be set when the tx hi-watermark is reached.
2609 */
2610
2611 mac_tx_func_t
2612 mac_tx_get_func(uint32_t mode)
2613 {
2614 return (mac_tx_mode_list[mode].mac_tx_func);
2615 }
2616
2617 /* ARGSUSED */
2618 static mac_tx_cookie_t
2619 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2620 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2621 {
2622 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2623 boolean_t is_subflow;
2624 mac_tx_stats_t stats;
2625 mac_tx_cookie_t cookie = NULL;
2626
2627 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2628
2629 /* Regular case with a single Tx ring */
2630 /*
2631 * SRS_TX_BLOCKED is set when underlying NIC runs
2632 * out of Tx descs and messages start getting
2633 * queued. It won't get reset until
2634 * tx_srs_drain() completely drains out the
2635 * messages.
2636 */
2637 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2638 /* Tx descs/resources not available */
2639 mutex_enter(&mac_srs->srs_lock);
2640 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2641 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2642 flag, ret_mp);
2643 mutex_exit(&mac_srs->srs_lock);
2644 return (cookie);
2645 }
2646 /*
2647 * While we were computing mblk count, the
2648 * flow control condition got relieved.
2649 * Continue with the transmission.
2650 */
2651 mutex_exit(&mac_srs->srs_lock);
2652 }
2653
2654 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2655
2656 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2657 mp_chain, (is_subflow ? &stats : NULL));
2658
2659 /*
2660 * Multiple threads could be here sending packets.
2661 * Under such conditions, it is not possible to
2662 * atomically set SRS_TX_BLOCKED bit to indicate
2663 * out of tx desc condition.
To atomically set 2664 * this, we queue the returned packet and do 2665 * the setting of SRS_TX_BLOCKED in 2666 * mac_tx_srs_drain(). 2667 */ 2668 if (mp_chain != NULL) { 2669 mutex_enter(&mac_srs->srs_lock); 2670 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2671 mutex_exit(&mac_srs->srs_lock); 2672 return (cookie); 2673 } 2674 2675 if (is_subflow) 2676 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2677 2678 return (NULL); 2679 } 2680 2681 /* 2682 * mac_tx_serialize_mode 2683 * 2684 * This is an experimental mode implemented as per the request of PAE. 2685 * In this mode, all callers attempting to send a packet to the NIC 2686 * will get serialized. Only one thread at any time will access the 2687 * NIC to send the packet out. 2688 */ 2689 /* ARGSUSED */ 2690 static mac_tx_cookie_t 2691 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2692 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2693 { 2694 boolean_t is_subflow; 2695 mac_tx_stats_t stats; 2696 mac_tx_cookie_t cookie = NULL; 2697 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2698 2699 /* Single ring, serialize below */ 2700 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2701 mutex_enter(&mac_srs->srs_lock); 2702 if ((mac_srs->srs_first != NULL) || 2703 (mac_srs->srs_state & SRS_PROC)) { 2704 /* 2705 * In serialization mode, queue all packets until 2706 * TX_HIWAT is set. 2707 * If drop bit is set, drop if TX_HIWAT is set. 2708 * If no_enqueue is set, still enqueue until hiwat 2709 * is set and return mblks after TX_HIWAT is set. 2710 */ 2711 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2712 flag, NULL, ret_mp); 2713 mutex_exit(&mac_srs->srs_lock); 2714 return (cookie); 2715 } 2716 /* 2717 * No packets queued, nothing on proc and no flow 2718 * control condition. Fast-path, ok. Do inline 2719 * processing. 2720 */ 2721 mac_srs->srs_state |= SRS_PROC; 2722 mutex_exit(&mac_srs->srs_lock); 2723 2724 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2725 2726 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2727 mp_chain, (is_subflow ? &stats : NULL)); 2728 2729 mutex_enter(&mac_srs->srs_lock); 2730 mac_srs->srs_state &= ~SRS_PROC; 2731 if (mp_chain != NULL) { 2732 cookie = mac_tx_srs_enqueue(mac_srs, 2733 mp_chain, flag, NULL, ret_mp); 2734 } 2735 if (mac_srs->srs_first != NULL) { 2736 /* 2737 * We processed inline our packet and a new 2738 * packet/s got queued while we were 2739 * processing. Wakeup srs worker 2740 */ 2741 cv_signal(&mac_srs->srs_async); 2742 } 2743 mutex_exit(&mac_srs->srs_lock); 2744 2745 if (is_subflow && cookie == NULL) 2746 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2747 2748 return (cookie); 2749 } 2750 2751 /* 2752 * mac_tx_fanout_mode 2753 * 2754 * In this mode, the SRS will have access to multiple Tx rings to send 2755 * the packet out. The fanout hint that is passed as an argument is 2756 * used to find an appropriate ring to fanout the traffic. Each Tx 2757 * ring, in turn, will have a soft ring associated with it. If a Tx 2758 * ring runs out of Tx desc's the returned packet will be queued in 2759 * the soft ring associated with that Tx ring. The srs itself will not 2760 * queue any packets. 
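*
* The ring selection itself is a simple hash on the caller's hint, as
* done by the MAC_TX_SOFT_RING_PROCESS() macro below (repeated here
* only for illustration):
*
*     hash = HASH_HINT(fanout_hint);
*     index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
*     softring = mac_srs->srs_oth_soft_rings[index];
*     cookie = mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp);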
2761 */ 2762 2763 #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2764 index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 2765 softring = mac_srs->srs_oth_soft_rings[index]; \ 2766 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2767 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2768 } 2769 2770 static mac_tx_cookie_t 2771 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2772 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2773 { 2774 mac_soft_ring_t *softring; 2775 uint64_t hash; 2776 uint_t index; 2777 mac_tx_cookie_t cookie = NULL; 2778 2779 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2780 if (fanout_hint != 0) { 2781 /* 2782 * The hint is specified by the caller, simply pass the 2783 * whole chain to the soft ring. 2784 */ 2785 hash = HASH_HINT(fanout_hint); 2786 MAC_TX_SOFT_RING_PROCESS(mp_chain); 2787 } else { 2788 mblk_t *last_mp, *cur_mp, *sub_chain; 2789 uint64_t last_hash = 0; 2790 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2791 2792 /* 2793 * Compute the hash from the contents (headers) of the 2794 * packets of the mblk chain. Split the chains into 2795 * subchains of the same conversation. 2796 * 2797 * Since there may be more than one ring used for 2798 * sub-chains of the same call, and since the caller 2799 * does not maintain per conversation state since it 2800 * passed a zero hint, unsent subchains will be 2801 * dropped. 2802 */ 2803 2804 flag |= MAC_DROP_ON_NO_DESC; 2805 ret_mp = NULL; 2806 2807 ASSERT(ret_mp == NULL); 2808 2809 sub_chain = NULL; 2810 last_mp = NULL; 2811 2812 for (cur_mp = mp_chain; cur_mp != NULL; 2813 cur_mp = cur_mp->b_next) { 2814 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2815 B_TRUE); 2816 if (last_hash != 0 && hash != last_hash) { 2817 /* 2818 * Starting a different subchain, send current 2819 * chain out. 2820 */ 2821 ASSERT(last_mp != NULL); 2822 last_mp->b_next = NULL; 2823 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2824 sub_chain = NULL; 2825 } 2826 2827 /* add packet to subchain */ 2828 if (sub_chain == NULL) 2829 sub_chain = cur_mp; 2830 last_mp = cur_mp; 2831 last_hash = hash; 2832 } 2833 2834 if (sub_chain != NULL) { 2835 /* send last subchain */ 2836 ASSERT(last_mp != NULL); 2837 last_mp->b_next = NULL; 2838 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2839 } 2840 2841 cookie = NULL; 2842 } 2843 2844 return (cookie); 2845 } 2846 2847 /* 2848 * mac_tx_bw_mode 2849 * 2850 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2851 * only if bw is available. Otherwise the packets will be queued in 2852 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2853 * out to a Tx rings. 2854 */ 2855 static mac_tx_cookie_t 2856 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2857 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2858 { 2859 int cnt, sz; 2860 mblk_t *tail; 2861 mac_tx_cookie_t cookie = NULL; 2862 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2863 2864 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2865 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2866 mutex_enter(&mac_srs->srs_lock); 2867 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2868 /* 2869 * zero bandwidth, no traffic is sent: drop the packets, 2870 * or return the whole chain if the caller requests all 2871 * unsent packets back. 
2872 */
2873 if (flag & MAC_TX_NO_ENQUEUE) {
2874 cookie = (mac_tx_cookie_t)mac_srs;
2875 *ret_mp = mp_chain;
2876 } else {
2877 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2878 }
2879 mutex_exit(&mac_srs->srs_lock);
2880 return (cookie);
2881 } else if ((mac_srs->srs_first != NULL) ||
2882 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2883 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2884 fanout_hint, ret_mp);
2885 mutex_exit(&mac_srs->srs_lock);
2886 return (cookie);
2887 }
2888 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2889 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
2890 mac_srs->srs_bw->mac_bw_curr_time = lbolt;
2891 mac_srs->srs_bw->mac_bw_used = 0;
2892 } else if (mac_srs->srs_bw->mac_bw_used >
2893 mac_srs->srs_bw->mac_bw_limit) {
2894 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2895 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2896 mp_chain, tail, cnt, sz);
2897 /*
2898 * Wake up the worker thread. Note that the worker
2899 * thread has to be woken up so that it
2900 * can fire up the timer to be woken up
2901 * on the next tick. Also once
2902 * BW_ENFORCED is set, it can only be
2903 * reset by the srs_worker thread. Until then
2904 * all packets will get queued up in SRS
2905 * and hence this code path won't be
2906 * entered until BW_ENFORCED is reset.
2907 */
2908 cv_signal(&mac_srs->srs_async);
2909 mutex_exit(&mac_srs->srs_lock);
2910 return (cookie);
2911 }
2912
2913 mac_srs->srs_bw->mac_bw_used += sz;
2914 mutex_exit(&mac_srs->srs_lock);
2915
2916 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2917 mac_soft_ring_t *softring;
2918 uint_t indx, hash;
2919
2920 hash = HASH_HINT(fanout_hint);
2921 indx = COMPUTE_INDEX(hash,
2922 mac_srs->srs_oth_ring_count);
2923 softring = mac_srs->srs_oth_soft_rings[indx];
2924 return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2925 ret_mp));
2926 } else {
2927 boolean_t is_subflow;
2928 mac_tx_stats_t stats;
2929
2930 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2931
2932 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2933 mp_chain, (is_subflow ?
&stats : NULL)); 2934 2935 if (mp_chain != NULL) { 2936 mutex_enter(&mac_srs->srs_lock); 2937 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2938 if (mac_srs->srs_bw->mac_bw_used > sz) 2939 mac_srs->srs_bw->mac_bw_used -= sz; 2940 else 2941 mac_srs->srs_bw->mac_bw_used = 0; 2942 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2943 fanout_hint, ret_mp); 2944 mutex_exit(&mac_srs->srs_lock); 2945 return (cookie); 2946 } 2947 if (is_subflow) 2948 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2949 2950 return (NULL); 2951 } 2952 } 2953 2954 /* ARGSUSED */ 2955 void 2956 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 2957 { 2958 mblk_t *head, *tail; 2959 size_t sz; 2960 uint32_t tx_mode; 2961 uint_t saved_pkt_count; 2962 boolean_t is_subflow; 2963 mac_tx_stats_t stats; 2964 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2965 2966 saved_pkt_count = 0; 2967 ASSERT(mutex_owned(&mac_srs->srs_lock)); 2968 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 2969 2970 mac_srs->srs_state |= SRS_PROC; 2971 2972 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2973 tx_mode = srs_tx->st_mode; 2974 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 2975 if (mac_srs->srs_first != NULL) { 2976 head = mac_srs->srs_first; 2977 tail = mac_srs->srs_last; 2978 saved_pkt_count = mac_srs->srs_count; 2979 mac_srs->srs_first = NULL; 2980 mac_srs->srs_last = NULL; 2981 mac_srs->srs_count = 0; 2982 mutex_exit(&mac_srs->srs_lock); 2983 2984 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2985 head, &stats); 2986 2987 mutex_enter(&mac_srs->srs_lock); 2988 if (head != NULL) { 2989 /* Device out of tx desc, set block */ 2990 if (head->b_next == NULL) 2991 VERIFY(head == tail); 2992 tail->b_next = mac_srs->srs_first; 2993 mac_srs->srs_first = head; 2994 mac_srs->srs_count += 2995 (saved_pkt_count - stats.ts_opackets); 2996 if (mac_srs->srs_last == NULL) 2997 mac_srs->srs_last = tail; 2998 MAC_TX_SRS_BLOCK(mac_srs, head); 2999 } else { 3000 srs_tx->st_woken_up = B_FALSE; 3001 if (is_subflow) { 3002 FLOW_TX_STATS_UPDATE( 3003 mac_srs->srs_flent, &stats); 3004 } 3005 } 3006 } 3007 } else if (tx_mode == SRS_TX_BW) { 3008 /* 3009 * We are here because the timer fired and we have some data 3010 * to tranmit. 
Also mac_tx_srs_worker should have reset 3011 * SRS_BW_ENFORCED flag 3012 */ 3013 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 3014 head = tail = mac_srs->srs_first; 3015 while (mac_srs->srs_first != NULL) { 3016 tail = mac_srs->srs_first; 3017 tail->b_prev = NULL; 3018 mac_srs->srs_first = tail->b_next; 3019 if (mac_srs->srs_first == NULL) 3020 mac_srs->srs_last = NULL; 3021 mac_srs->srs_count--; 3022 sz = msgdsize(tail); 3023 mac_srs->srs_size -= sz; 3024 saved_pkt_count++; 3025 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 3026 3027 if (mac_srs->srs_bw->mac_bw_used < 3028 mac_srs->srs_bw->mac_bw_limit) 3029 continue; 3030 3031 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 3032 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 3033 mac_srs->srs_bw->mac_bw_used = sz; 3034 continue; 3035 } 3036 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3037 break; 3038 } 3039 3040 ASSERT((head == NULL && tail == NULL) || 3041 (head != NULL && tail != NULL)); 3042 if (tail != NULL) { 3043 tail->b_next = NULL; 3044 mutex_exit(&mac_srs->srs_lock); 3045 3046 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3047 head, &stats); 3048 3049 mutex_enter(&mac_srs->srs_lock); 3050 if (head != NULL) { 3051 uint_t size_sent; 3052 3053 /* Device out of tx desc, set block */ 3054 if (head->b_next == NULL) 3055 VERIFY(head == tail); 3056 tail->b_next = mac_srs->srs_first; 3057 mac_srs->srs_first = head; 3058 mac_srs->srs_count += 3059 (saved_pkt_count - stats.ts_opackets); 3060 if (mac_srs->srs_last == NULL) 3061 mac_srs->srs_last = tail; 3062 size_sent = sz - stats.ts_obytes; 3063 mac_srs->srs_size += size_sent; 3064 mac_srs->srs_bw->mac_bw_sz += size_sent; 3065 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 3066 mac_srs->srs_bw->mac_bw_used -= 3067 size_sent; 3068 } else { 3069 mac_srs->srs_bw->mac_bw_used = 0; 3070 } 3071 MAC_TX_SRS_BLOCK(mac_srs, head); 3072 } else { 3073 srs_tx->st_woken_up = B_FALSE; 3074 if (is_subflow) { 3075 FLOW_TX_STATS_UPDATE( 3076 mac_srs->srs_flent, &stats); 3077 } 3078 } 3079 } 3080 } else if (tx_mode == SRS_TX_BW_FANOUT) { 3081 mblk_t *prev; 3082 mac_soft_ring_t *softring; 3083 uint64_t hint; 3084 3085 /* 3086 * We are here because the timer fired and we 3087 * have some quota to tranmit. 
3088 */ 3089 prev = NULL; 3090 head = tail = mac_srs->srs_first; 3091 while (mac_srs->srs_first != NULL) { 3092 tail = mac_srs->srs_first; 3093 mac_srs->srs_first = tail->b_next; 3094 if (mac_srs->srs_first == NULL) 3095 mac_srs->srs_last = NULL; 3096 mac_srs->srs_count--; 3097 sz = msgdsize(tail); 3098 mac_srs->srs_size -= sz; 3099 mac_srs->srs_bw->mac_bw_used += sz; 3100 if (prev == NULL) 3101 hint = (ulong_t)tail->b_prev; 3102 if (hint != (ulong_t)tail->b_prev) { 3103 prev->b_next = NULL; 3104 mutex_exit(&mac_srs->srs_lock); 3105 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3106 head = tail; 3107 hint = (ulong_t)tail->b_prev; 3108 mutex_enter(&mac_srs->srs_lock); 3109 } 3110 3111 prev = tail; 3112 tail->b_prev = NULL; 3113 if (mac_srs->srs_bw->mac_bw_used < 3114 mac_srs->srs_bw->mac_bw_limit) 3115 continue; 3116 3117 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 3118 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 3119 mac_srs->srs_bw->mac_bw_used = 0; 3120 continue; 3121 } 3122 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3123 break; 3124 } 3125 ASSERT((head == NULL && tail == NULL) || 3126 (head != NULL && tail != NULL)); 3127 if (tail != NULL) { 3128 tail->b_next = NULL; 3129 mutex_exit(&mac_srs->srs_lock); 3130 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3131 mutex_enter(&mac_srs->srs_lock); 3132 } 3133 } 3134 /* 3135 * SRS_TX_FANOUT case not considered here because packets 3136 * won't be queued in the SRS for this case. Packets will 3137 * be sent directly to soft rings underneath and if there 3138 * is any queueing at all, it would be in Tx side soft 3139 * rings. 3140 */ 3141 3142 /* 3143 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3144 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3145 */ 3146 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3147 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3148 mac_tx_notify_cb_t *mtnfp; 3149 mac_cb_t *mcb; 3150 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3151 boolean_t wakeup_required = B_FALSE; 3152 3153 if (mac_srs->srs_state & 3154 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3155 wakeup_required = B_TRUE; 3156 } 3157 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3158 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3159 mutex_exit(&mac_srs->srs_lock); 3160 if (wakeup_required) { 3161 /* Wakeup callback registered clients */ 3162 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3163 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3164 mcb = mcb->mcb_nextp) { 3165 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3166 mtnfp->mtnf_fn(mtnfp->mtnf_arg, 3167 (mac_tx_cookie_t)mac_srs); 3168 } 3169 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3170 &mcip->mci_tx_notify_cb_list); 3171 /* 3172 * If the client is not the primary MAC client, then we 3173 * need to send the notification to the clients upper 3174 * MAC, i.e. mci_upper_mip. 3175 */ 3176 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3177 mcip->mci_upper_mip : mcip->mci_mip); 3178 } 3179 mutex_enter(&mac_srs->srs_lock); 3180 } 3181 mac_srs->srs_state &= ~SRS_PROC; 3182 } 3183 3184 /* 3185 * Given a packet, get the flow_entry that identifies the flow 3186 * to which that packet belongs. The flow_entry will contain 3187 * the transmit function to be used to send the packet. If the 3188 * function returns NULL, the packet should be sent using the 3189 * underlying NIC. 
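*
* A typical caller (see mac_tx_send() below) uses it along these
* lines (illustration only):
*
*     dst_flow_ent = mac_tx_classify(mip, mp);
*     if (dst_flow_ent == NULL) {
*         send the packet down the underlying NIC via MAC_TX();
*     } else {
*         deliver via dst_flow_ent->fe_cb_fn() and then release the
*         reference with FLOW_REFRELE(dst_flow_ent);
*     }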
3190 */ 3191 static flow_entry_t * 3192 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3193 { 3194 flow_entry_t *flent = NULL; 3195 mac_client_impl_t *mcip; 3196 int err; 3197 3198 /* 3199 * Do classification on the packet. 3200 */ 3201 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3202 if (err != 0) 3203 return (NULL); 3204 3205 /* 3206 * This flent might just be an additional one on the MAC client, 3207 * i.e. for classification purposes (different fdesc), however 3208 * the resources, SRS et. al., are in the mci_flent, so if 3209 * this isn't the mci_flent, we need to get it. 3210 */ 3211 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3212 FLOW_REFRELE(flent); 3213 flent = mcip->mci_flent; 3214 FLOW_TRY_REFHOLD(flent, err); 3215 if (err != 0) 3216 return (NULL); 3217 } 3218 3219 return (flent); 3220 } 3221 3222 /* 3223 * This macro is only meant to be used by mac_tx_send(). 3224 */ 3225 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3226 if (vid_check) { \ 3227 int err = 0; \ 3228 \ 3229 MAC_VID_CHECK(src_mcip, (mp), err); \ 3230 if (err != 0) { \ 3231 freemsg((mp)); \ 3232 (mp) = next; \ 3233 oerrors++; \ 3234 continue; \ 3235 } \ 3236 } \ 3237 if (add_tag) { \ 3238 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3239 if ((mp) == NULL) { \ 3240 (mp) = next; \ 3241 oerrors++; \ 3242 continue; \ 3243 } \ 3244 } \ 3245 } 3246 3247 mblk_t * 3248 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3249 mac_tx_stats_t *stats) 3250 { 3251 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3252 mac_impl_t *mip = src_mcip->mci_mip; 3253 uint_t obytes = 0, opackets = 0, oerrors = 0; 3254 mblk_t *mp = NULL, *next; 3255 boolean_t vid_check, add_tag; 3256 uint16_t vid = 0; 3257 3258 if (mip->mi_nclients > 1) { 3259 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3260 add_tag = MAC_TAG_NEEDED(src_mcip); 3261 if (add_tag) 3262 vid = mac_client_vid(mch); 3263 } else { 3264 ASSERT(mip->mi_nclients == 1); 3265 vid_check = add_tag = B_FALSE; 3266 } 3267 3268 /* 3269 * Fastpath: if there's only one client, and there's no 3270 * multicast listeners, we simply send the packet down to the 3271 * underlying NIC. 3272 */ 3273 if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 3274 DTRACE_PROBE2(fastpath, 3275 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3276 3277 mp = mp_chain; 3278 while (mp != NULL) { 3279 next = mp->b_next; 3280 mp->b_next = NULL; 3281 opackets++; 3282 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3283 msgdsize(mp)); 3284 3285 CHECK_VID_AND_ADD_TAG(mp); 3286 MAC_TX(mip, ring, mp, 3287 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3288 0)); 3289 3290 /* 3291 * If the driver is out of descriptors and does a 3292 * partial send it will return a chain of unsent 3293 * mblks. Adjust the accounting stats. 3294 */ 3295 if (mp != NULL) { 3296 opackets--; 3297 obytes -= msgdsize(mp); 3298 mp->b_next = next; 3299 break; 3300 } 3301 mp = next; 3302 } 3303 goto done; 3304 } 3305 3306 /* 3307 * No fastpath, we either have more than one MAC client 3308 * defined on top of the same MAC, or one or more MAC 3309 * client promiscuous callbacks. 3310 */ 3311 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3312 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3313 3314 mp = mp_chain; 3315 while (mp != NULL) { 3316 flow_entry_t *dst_flow_ent; 3317 void *flow_cookie; 3318 size_t pkt_size; 3319 mblk_t *mp1; 3320 3321 next = mp->b_next; 3322 mp->b_next = NULL; 3323 opackets++; 3324 pkt_size = (mp->b_cont == NULL ? 
MBLKL(mp) : msgdsize(mp)); 3325 obytes += pkt_size; 3326 CHECK_VID_AND_ADD_TAG(mp); 3327 3328 /* 3329 * Check if there are promiscuous mode callbacks defined. 3330 */ 3331 if (mip->mi_promisc_list != NULL) 3332 mac_promisc_dispatch(mip, mp, src_mcip); 3333 3334 /* 3335 * Find the destination. 3336 */ 3337 dst_flow_ent = mac_tx_classify(mip, mp); 3338 3339 if (dst_flow_ent != NULL) { 3340 size_t hdrsize; 3341 int err = 0; 3342 3343 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3344 struct ether_vlan_header *evhp = 3345 (struct ether_vlan_header *)mp->b_rptr; 3346 3347 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3348 hdrsize = sizeof (*evhp); 3349 else 3350 hdrsize = sizeof (struct ether_header); 3351 } else { 3352 mac_header_info_t mhi; 3353 3354 err = mac_header_info((mac_handle_t)mip, 3355 mp, &mhi); 3356 if (err == 0) 3357 hdrsize = mhi.mhi_hdrsize; 3358 } 3359 3360 /* 3361 * Got a matching flow. It's either another 3362 * MAC client, or a broadcast/multicast flow. 3363 * Make sure the packet size is within the 3364 * allowed size. If not drop the packet and 3365 * move to next packet. 3366 */ 3367 if (err != 0 || 3368 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3369 oerrors++; 3370 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3371 mblk_t *, mp); 3372 freemsg(mp); 3373 mp = next; 3374 FLOW_REFRELE(dst_flow_ent); 3375 continue; 3376 } 3377 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3378 if (flow_cookie != NULL) { 3379 /* 3380 * The vnic_bcast_send function expects 3381 * to receive the sender MAC client 3382 * as value for arg2. 3383 */ 3384 mac_bcast_send(flow_cookie, src_mcip, mp, 3385 B_TRUE); 3386 } else { 3387 /* 3388 * loopback the packet to a 3389 * local MAC client. We force a context 3390 * switch if both source and destination 3391 * MAC clients are used by IP, i.e. bypass 3392 * is set. 3393 */ 3394 boolean_t do_switch; 3395 mac_client_impl_t *dst_mcip = 3396 dst_flow_ent->fe_mcip; 3397 3398 do_switch = ((src_mcip->mci_state_flags & 3399 dst_mcip->mci_state_flags & 3400 MCIS_CLIENT_POLL_CAPABLE) != 0); 3401 3402 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3403 (dst_flow_ent->fe_cb_fn)( 3404 dst_flow_ent->fe_cb_arg1, 3405 dst_flow_ent->fe_cb_arg2, 3406 mp1, do_switch); 3407 } 3408 } 3409 FLOW_REFRELE(dst_flow_ent); 3410 } else { 3411 /* 3412 * Unknown destination, send via the underlying 3413 * NIC. 3414 */ 3415 MAC_TX(mip, ring, mp, 3416 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3417 0)); 3418 if (mp != NULL) { 3419 /* 3420 * Adjust for the last packet that 3421 * could not be transmitted 3422 */ 3423 opackets--; 3424 obytes -= pkt_size; 3425 mp->b_next = next; 3426 break; 3427 } 3428 } 3429 mp = next; 3430 } 3431 3432 done: 3433 src_mcip->mci_stat_obytes += obytes; 3434 src_mcip->mci_stat_opackets += opackets; 3435 src_mcip->mci_stat_oerrors += oerrors; 3436 3437 if (stats != NULL) { 3438 stats->ts_opackets = opackets; 3439 stats->ts_obytes = obytes; 3440 stats->ts_oerrors = oerrors; 3441 } 3442 return (mp); 3443 } 3444 3445 /* 3446 * mac_tx_srs_ring_present 3447 * 3448 * Returns whether the specified ring is part of the specified SRS. 
3449 */ 3450 boolean_t 3451 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3452 { 3453 int i; 3454 mac_soft_ring_t *soft_ring; 3455 3456 if (srs->srs_tx.st_arg2 == tx_ring) 3457 return (B_TRUE); 3458 3459 for (i = 0; i < srs->srs_oth_ring_count; i++) { 3460 soft_ring = srs->srs_oth_soft_rings[i]; 3461 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3462 return (B_TRUE); 3463 } 3464 3465 return (B_FALSE); 3466 } 3467 3468 /* 3469 * mac_tx_srs_wakeup 3470 * 3471 * Called when Tx desc become available. Wakeup the appropriate worker 3472 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3473 * state field. 3474 */ 3475 void 3476 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3477 { 3478 int i; 3479 mac_soft_ring_t *sringp; 3480 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3481 3482 mutex_enter(&mac_srs->srs_lock); 3483 if (TX_SINGLE_RING_MODE(mac_srs)) { 3484 if (srs_tx->st_arg2 == ring && 3485 mac_srs->srs_state & SRS_TX_BLOCKED) { 3486 mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3487 srs_tx->st_unblocked_cnt++; 3488 cv_signal(&mac_srs->srs_async); 3489 } 3490 /* 3491 * A wakeup can come before tx_srs_drain() could 3492 * grab srs lock and set SRS_TX_BLOCKED. So 3493 * always set woken_up flag when we come here. 3494 */ 3495 srs_tx->st_woken_up = B_TRUE; 3496 mutex_exit(&mac_srs->srs_lock); 3497 return; 3498 } 3499 3500 /* If you are here, it is for FANOUT or BW_FANOUT case */ 3501 ASSERT(TX_MULTI_RING_MODE(mac_srs)); 3502 for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3503 sringp = mac_srs->srs_oth_soft_rings[i]; 3504 mutex_enter(&sringp->s_ring_lock); 3505 if (sringp->s_ring_tx_arg2 == ring) { 3506 if (sringp->s_ring_state & S_RING_BLOCK) { 3507 sringp->s_ring_state &= ~S_RING_BLOCK; 3508 sringp->s_ring_unblocked_cnt++; 3509 cv_signal(&sringp->s_ring_async); 3510 } 3511 sringp->s_ring_tx_woken_up = B_TRUE; 3512 } 3513 mutex_exit(&sringp->s_ring_lock); 3514 } 3515 mutex_exit(&mac_srs->srs_lock); 3516 } 3517 3518 /* 3519 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3520 * the blocked clients again. 3521 */ 3522 void 3523 mac_tx_notify(mac_impl_t *mip) 3524 { 3525 i_mac_notify(mip, MAC_NOTE_TX); 3526 } 3527 3528 /* 3529 * RX SOFTRING RELATED FUNCTIONS 3530 * 3531 * These functions really belong in mac_soft_ring.c and here for 3532 * a short period. 3533 */ 3534 3535 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3536 /* \ 3537 * Enqueue our mblk chain. \ 3538 */ \ 3539 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 3540 \ 3541 if ((ringp)->s_ring_last != NULL) \ 3542 (ringp)->s_ring_last->b_next = (mp); \ 3543 else \ 3544 (ringp)->s_ring_first = (mp); \ 3545 (ringp)->s_ring_last = (tail); \ 3546 (ringp)->s_ring_count += (cnt); \ 3547 ASSERT((ringp)->s_ring_count > 0); \ 3548 if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 3549 (ringp)->s_ring_size += sz; \ 3550 } \ 3551 } 3552 3553 /* 3554 * Default entry point to deliver a packet chain to a MAC client. 3555 * If the MAC client has flows, do the classification with these 3556 * flows as well. 
3557 */
3558 /* ARGSUSED */
3559 void
3560 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3561 mac_header_info_t *arg3)
3562 {
3563 mac_client_impl_t *mcip = arg1;
3564
3565 if (mcip->mci_nvids == 1 &&
3566 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3567 /*
3568 * If the client has exactly one VID associated with it
3569 * and stripping of the VLAN header is not disabled,
3570 * remove the VLAN tag from the packet before
3571 * passing it on to the client's receive callback.
3572 * Note that this needs to be done after we dispatch
3573 * the packet to the promiscuous listeners of the
3574 * client, since they expect to see the whole
3575 * frame including the VLAN headers.
3576 */
3577 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3578 }
3579
3580 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3581 }
3582
3583 /*
3584 * mac_rx_soft_ring_process
3585 *
3586 * Process a chain for a given soft ring. If the number of packets queued
3587 * in the SRS and its associated soft rings (including this one) is
3588 * very small (tracked by srs_poll_pkt_cnt), then allow the entering
3589 * thread (interrupt or poll thread) to do inline processing. This
3590 * helps keep the latency down under low load.
3591 *
3592 * The proc and arg for each mblk are already stored in the mblk in
3593 * appropriate places.
3594 */
3595 /* ARGSUSED */
3596 void
3597 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3598 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3599 {
3600 mac_direct_rx_t proc;
3601 void *arg1;
3602 mac_resource_handle_t arg2;
3603 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3604
3605 ASSERT(ringp != NULL);
3606 ASSERT(mp_chain != NULL);
3607 ASSERT(tail != NULL);
3608 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3609
3610 mutex_enter(&ringp->s_ring_lock);
3611 ringp->s_ring_total_inpkt += cnt;
3612 if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3613 !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3614 /* If on processor or blanking on, then enqueue and return */
3615 if (ringp->s_ring_state & S_RING_BLANK ||
3616 ringp->s_ring_state & S_RING_PROC) {
3617 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3618 mutex_exit(&ringp->s_ring_lock);
3619 return;
3620 }
3621 proc = ringp->s_ring_rx_func;
3622 arg1 = ringp->s_ring_rx_arg1;
3623 arg2 = ringp->s_ring_rx_arg2;
3624 /*
3625 * See if anything is already queued. If we are the
3626 * first packet, do inline processing else queue the
3627 * packet and do the drain.
3628 */
3629 if (ringp->s_ring_first == NULL) {
3630 /*
3631 * Fast-path, ok to process and nothing queued.
3632 */
3633 ringp->s_ring_run = curthread;
3634 ringp->s_ring_state |= (S_RING_PROC);
3635
3636 mutex_exit(&ringp->s_ring_lock);
3637
3638 /*
3639 * We have a chain of 1 packet so
3640 * go through this fast path.
3641 */
3642 ASSERT(mp_chain->b_next == NULL);
3643
3644 (*proc)(arg1, arg2, mp_chain, NULL);
3645
3646 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3647 /*
3648 * If we have a soft ring set which is doing
3649 * bandwidth control, we need to decrement
3650 * srs_size and count so that the SRS can have an
3651 * accurate idea of the real data
3652 * queued between SRS and its soft rings. We
3653 * decrement the counters only when the packet
3654 * gets processed by both SRS and the soft ring.
3655 */
3656 mutex_enter(&mac_srs->srs_lock);
3657 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3658 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3659 mutex_exit(&mac_srs->srs_lock);
3660
3661 mutex_enter(&ringp->s_ring_lock);
3662 ringp->s_ring_run = NULL;
3663 ringp->s_ring_state &= ~S_RING_PROC;
3664 if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3665 cv_signal(&ringp->s_ring_client_cv);
3666
3667 if ((ringp->s_ring_first == NULL) ||
3668 (ringp->s_ring_state & S_RING_BLANK)) {
3669 /*
3670 * We processed inline our packet and
3671 * nothing new has arrived or our
3672 * receiver doesn't want to receive
3673 * any packets. We are done.
3674 */
3675 mutex_exit(&ringp->s_ring_lock);
3676 return;
3677 }
3678 } else {
3679 SOFT_RING_ENQUEUE_CHAIN(ringp,
3680 mp_chain, tail, cnt, sz);
3681 }
3682
3683 /*
3684 * We are here because either we couldn't do inline
3685 * processing (because something was already
3686 * queued), or we had a chain of more than one
3687 * packet, or something else arrived after we were
3688 * done with inline processing.
3689 */
3690 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3691 ASSERT(ringp->s_ring_first != NULL);
3692
3693 ringp->s_ring_drain_func(ringp);
3694 mutex_exit(&ringp->s_ring_lock);
3695 return;
3696 } else {
3697 /* ST_RING_WORKER_ONLY case */
3698 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3699 mac_soft_ring_worker_wakeup(ringp);
3700 mutex_exit(&ringp->s_ring_lock);
3701 }
3702 }
3703
3704 /*
3705 * TX SOFTRING RELATED FUNCTIONS
3706 *
3707 * These functions really belong in mac_soft_ring.c and are here for
3708 * a short period.
3709 */
3710
3711 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
3712 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \
3713 ringp->s_ring_state |= S_RING_ENQUEUED; \
3714 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \
3715 }
3716
3717 /*
3718 * mac_tx_sring_enqueue
3719 *
3720 * When we are out of transmit descriptors and we already have a
3721 * queue that exceeds hiwat (or the client called us with
3722 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3723 * soft ring pointer as the opaque cookie for the client to enable
3724 * flow control.
3725 */
3726 static mac_tx_cookie_t
3727 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3728 mblk_t **ret_mp)
3729 {
3730 int cnt;
3731 size_t sz;
3732 mblk_t *tail;
3733 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3734 mac_tx_cookie_t cookie = NULL;
3735 boolean_t wakeup_worker = B_TRUE;
3736
3737 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3738 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3739 if (flag & MAC_DROP_ON_NO_DESC) {
3740 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3741 /* increment freed stats */
3742 ringp->s_ring_drops += cnt;
3743 cookie = (mac_tx_cookie_t)ringp;
3744 } else {
3745 if (ringp->s_ring_first != NULL)
3746 wakeup_worker = B_FALSE;
3747
3748 if (flag & MAC_TX_NO_ENQUEUE) {
3749 /*
3750 * If QUEUED is not set, queue the packet
3751 * and let mac_tx_soft_ring_drain() set
3752 * the TX_BLOCKED bit for the reasons
3753 * explained above. Otherwise, return the
3754 * mblks.
3755 */
3756 if (wakeup_worker) {
3757 TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3758 mp_chain, tail, cnt, sz);
3759 } else {
3760 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3761 cookie = (mac_tx_cookie_t)ringp;
3762 *ret_mp = mp_chain;
3763 }
3764 } else {
3765 boolean_t enqueue = B_TRUE;
3766
3767 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3768 /*
3769 * flow-controlled.

/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with an h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
        mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
        int cnt;
        size_t sz;
        mblk_t *tail;
        mac_tx_cookie_t cookie = NULL;

        ASSERT(ringp != NULL);
        ASSERT(mp_chain != NULL);
        ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
        /*
         * Only two modes can come here: SRS_TX_FANOUT or
         * SRS_TX_BW_FANOUT.
         */
        ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
            mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

        if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
                /* Serialization mode */

                mutex_enter(&ringp->s_ring_lock);
                if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
                        cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                            flag, ret_mp);
                        mutex_exit(&ringp->s_ring_lock);
                        return (cookie);
                }
                MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
                TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
                if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
                        /*
                         * If the ring is blocked due to a lack of Tx
                         * descriptors, just return. The worker thread
                         * will get scheduled when Tx descriptors
                         * become available.
                         */
                        mutex_exit(&ringp->s_ring_lock);
                        return (cookie);
                }
                mac_soft_ring_worker_wakeup(ringp);
                mutex_exit(&ringp->s_ring_lock);
                return (cookie);
        } else {
                /* Default fanout mode */
                /*
                 * S_RING_BLOCKED is set when the underlying NIC runs
                 * out of Tx descriptors and messages start getting
                 * queued. It won't get reset until tx_srs_drain()
                 * completely drains out the messages.
                 */
                boolean_t       is_subflow;
                mac_tx_stats_t  stats;

                if (ringp->s_ring_state & S_RING_ENQUEUED) {
                        /* Tx descriptors/resources not available */
                        mutex_enter(&ringp->s_ring_lock);
                        if (ringp->s_ring_state & S_RING_ENQUEUED) {
                                cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                                    flag, ret_mp);
                                mutex_exit(&ringp->s_ring_lock);
                                return (cookie);
                        }
                        /*
                         * While we were computing the mblk count, the
                         * flow control condition got relieved.
                         * Continue with the transmission.
                         */
                        mutex_exit(&ringp->s_ring_lock);
                }
                is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

                mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
                    ringp->s_ring_tx_arg2, mp_chain,
                    (is_subflow ? &stats : NULL));

                /*
                 * Multiple threads could be here sending packets.
                 * Under such conditions, it is not possible to
                 * atomically set the S_RING_BLOCKED bit to indicate
                 * an out-of-Tx-descriptor condition. To set it
                 * atomically, we queue the returned packet and do
                 * the setting of S_RING_BLOCKED in
                 * mac_tx_soft_ring_drain().
                 */
                if (mp_chain != NULL) {
                        mutex_enter(&ringp->s_ring_lock);
                        cookie =
                            mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
                        mutex_exit(&ringp->s_ring_lock);
                        return (cookie);
                }
                if (is_subflow) {
                        FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
                }
                return (NULL);
        }
}
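
/*
 * For reference, the default (non-ST_RING_WORKER_ONLY) fanout path of
 * mac_tx_soft_ring_process() above reduces to the sketch below. This
 * is an illustrative simplification only; it omits the serialization
 * branch, the statistics handling and the exact locking used by the
 * real code.
 *
 *	if (s_ring_state & S_RING_ENQUEUED) {
 *		recheck under s_ring_lock; if still enqueued, hand the
 *		chain to mac_tx_sring_enqueue() and return its cookie;
 *	}
 *	mp_chain = mac_tx_send(...);		(attempt the send)
 *	if (mp_chain != NULL) {
 *		the NIC ran out of Tx descriptors: queue the leftover
 *		chain under s_ring_lock (mac_tx_soft_ring_drain() will
 *		set S_RING_BLOCKED) and return the flow-control cookie;
 *	}
 *	return (NULL);				(everything was sent)
 */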