/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
	mac_tx_srs_mode_t	mac_tx_mode;
	mac_tx_func_t		mac_tx_func;
} mac_tx_mode_t;

/*
 * There are five modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, etc.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
	{SRS_TX_BW,		mac_tx_bw_mode},
	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The runtime code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout, etc.
 *
 * We try to use H/W classification on the NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always a SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to the
 * S/W classifier and tie a SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remainder
 * of the tick.
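 * For illustration only (assuming the default clock rate of hz = 100
 * ticks/sec), a 100 Mbps B/W limit works out to roughly 12,500,000
 * bytes/sec, i.e. about 125,000 bytes per tick; once that quota has
 * been consumed within a tick, the ring stays in poll mode until the
 * next tick starts.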
 * The SRS poll thread only polls for bytes that are
 * allowed to come in the SRS. We typically let 4x the configured
 * B/W worth of packets come into the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to the respective SRS.
 * The S/W classification rule is always set up and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * while packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * SRSes are used on both the Tx and Rx sides. They use the same
 * data structure but the processing routines have slightly different
 * semantics because the Rx side needs to do dynamic polling, etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only the max allowed packets are pulled in poll mode
 * and forces the ring into poll mode as soon as the B/W limit is
 * exceeded. As such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing in more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency-optimized case,
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control dynamic polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in the SRS and
 *    its associated soft rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing while we still have an
 *    existing backlog (sr_poll_pkt_cnt > 0), we stay in
 *    polling mode but don't poll the H/W for packets
 *    anymore (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we re-enable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block-level comment at the top of mac_soft_ring.c
 */

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or must let the SRS worker thread do the processing. This applies
 * if the SRS was not already being processed. For latency sensitive
 * traffic, this needs to be true to allow inline processing.
 * For throughput under load, this should be false.
 *
 * This tunable (and other similar ones) should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;

/*
 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue an mp or chain in the soft ring set and increment the
 * local count (srs_count) for the SRS and the shared counter
 * (srs_poll_pkt_cnt - shared between the SRS and its soft rings
 * to track the total unprocessed packets for polling to work
 * correctly).
 *
 * The size (total bytes queued) counters are incremented only
 * if we are doing B/W control.
 */
#define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if ((mac_srs)->srs_last != NULL)				\
		(mac_srs)->srs_last->b_next = (head);			\
	else								\
		(mac_srs)->srs_first = (head);				\
	(mac_srs)->srs_last = (tail);					\
	(mac_srs)->srs_count += count;					\
}

#define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
									\
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
	srs_rx->sr_poll_pkt_cnt += count;				\
	ASSERT(srs_rx->sr_poll_pkt_cnt > 0);				\
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
		(mac_srs)->srs_size += (sz);				\
		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);		\
		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);		\
	}								\
}

#define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	mac_srs->srs_state |= SRS_ENQUEUED;				\
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
		(mac_srs)->srs_size += (sz);				\
		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
	}								\
}

/*
 * Macros to turn polling on.
 */
#define	MAC_SRS_POLLING_ON(mac_srs) {					\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {	\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_poll_on++;				\
	}								\
}

#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) ==		\
	    (SRS_POLLING_CAPAB|SRS_WORKER)) {				\
		(mac_srs)->srs_state |= SRS_POLLING;			\
		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
		    (mac_srs)->srs_ring);				\
		(mac_srs)->srs_rx.sr_worker_poll_on++;			\
	}								\
}

/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring,
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * The poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
 */
#define	MAC_SRS_POLL_RING(mac_srs) {					\
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
									\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	srs_rx->sr_poll_thr_sig++;					\
	if (((mac_srs)->srs_state &					\
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==		\
	    (SRS_WORKER|SRS_POLLING_CAPAB)) {				\
		(mac_srs)->srs_state |= SRS_GET_PKTS;			\
		cv_signal(&(mac_srs)->srs_cv);				\
	} else {							\
		srs_rx->sr_poll_thr_busy++;				\
	}								\
}

/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if the next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come into the
 * system.
 */
#define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	ASSERT(((mac_srs)->srs_type & SRST_TX) ||			\
	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));		\
	if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) {		\
		(mac_srs)->srs_bw->mac_bw_curr_time = lbolt;		\
		(mac_srs)->srs_bw->mac_bw_used = 0;			\
		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)	\
			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
	}								\
}

/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing it. If we are optimizing for latency,
 * we wake up the worker thread immediately; otherwise we wait
 * mac_srs_worker_wakeup_ticks before the worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (!((mac_srs)->srs_state & SRS_PROC) &&			\
	    (mac_srs)->srs_tid == NULL) {				\
		if (mac_latency_optimize ||				\
		    (mac_srs_worker_wakeup_ticks == 0))			\
			cv_signal(&(mac_srs)->srs_async);		\
		else							\
			(mac_srs)->srs_tid =				\
			    timeout(mac_srs_fire, (mac_srs),		\
			    mac_srs_worker_wakeup_ticks);		\
	}								\
}

#define	TX_SINGLE_RING_MODE(mac_srs)					\
	((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT ||			\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE ||		\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW)

#define	TX_BANDWIDTH_MODE(mac_srs)					\
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||			\
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)

#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {			\
	uint_t hash, indx;						\
	hash = HASH_HINT(hint);						\
	indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);	\
	softring = mac_srs->srs_oth_soft_rings[indx];			\
	(void) (mac_tx_soft_ring_process(softring, head, 0, NULL));	\
}

/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from the mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again, and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp)	{			\
	ASSERT(MUTEX_HELD(&(srs)->srs_lock));			\
	if ((srs)->srs_tx.st_woken_up) {			\
		(srs)->srs_tx.st_woken_up = B_FALSE;		\
	} else {						\
		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));	\
		(srs)->srs_state |= SRS_TX_BLOCKED;		\
		(srs)->srs_tx.st_blocked_cnt++;			\
	}							\
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto the Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
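 *
 * If the high watermark has been crossed, the chain is still queued but
 * the SRS itself is handed back as the mac_tx_cookie_t so the caller
 * knows it is being flow controlled; once srs_count also exceeds
 * st_max_q_cnt the chain is dropped instead of being queued.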
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp_chain, tail, cnt, sz, cookie) {	\
	boolean_t enqueue = 1;						\
									\
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
		/*							\
		 * Flow controlled. Store the SRS in cookie so that it	\
		 * can be returned as mac_tx_cookie_t to the client.	\
		 */							\
		(srs)->srs_state |= SRS_TX_HIWAT;			\
		cookie = (mac_tx_cookie_t)srs;				\
		(srs)->srs_tx.st_hiwat_cnt++;				\
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
			/* increment freed stats */			\
			(srs)->srs_tx.st_drop_count += cnt;		\
			/*						\
			 * b_prev may be set to the fanout hint,	\
			 * hence we can't use freemsg directly.		\
			 */						\
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);	\
			DTRACE_PROBE1(tx_queued_hiwat,			\
			    mac_soft_ring_set_t *, srs);		\
			enqueue = 0;					\
		}							\
	}								\
	if (enqueue)							\
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp_chain, tail, cnt, sz);	\
}

/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs)						\
	if (!(srs->srs_type & SRST_TX))					\
		mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs)						\
	if (!(srs->srs_type & SRST_TX))					\
		mutex_exit(&srs->srs_bw->mac_bw_lock);

#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {		\
	mac_pkt_drop(NULL, NULL, mp, B_FALSE);			\
	/* increment freed stats */				\
	mac_srs->srs_tx.st_drop_count++;			\
	cookie = (mac_tx_cookie_t)srs;				\
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {		\
	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;			\
	cookie = (mac_tx_cookie_t)srs;					\
	*ret_mp = mp_chain;						\
}

/*
 * Drop the Rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
	mac_srs_rx_t	*srs_rx = &srs->srs_rx;

	ASSERT(mp->b_next == NULL);
	mutex_enter(&srs->srs_lock);
	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
	mutex_exit(&srs->srs_lock);

	srs_rx->sr_drop_count++;
	freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_tid == 0) {
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_tid = 0;
	if (!(mac_srs->srs_state & SRS_PROC))
		cv_signal(&mac_srs->srs_async);

	mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is the fanout_hint (a uint64_t) given by the TCP/IP stack;
 * it is used on the Tx path.
 */
#define	HASH_HINT(hint)	(((hint) << 17) | ((hint) >> 16))

/*
 * Hash based on the src address and the port information.
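 *
 * For example, the Rx fanout code below picks a TCP soft ring with
 * (roughly):
 *
 *	hash = HASH_ADDR(ipha->ipha_src,
 *	    *(uint32_t *)(mp->b_rptr + ports_offset));
 *	indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 *	softring = mac_srs->srs_tcp_soft_rings[indx];
 *
 * i.e. COMPUTE_INDEX() is simply a modulo over the soft ring count.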
454 */ 455 #define HASH_ADDR(src, ports) \ 456 (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 457 ((ports) >> 8) ^ (ports)) 458 459 #define COMPUTE_INDEX(key, sz) (key % sz) 460 461 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 462 if ((tail) != NULL) { \ 463 ASSERT((tail)->b_next == NULL); \ 464 (tail)->b_next = (mp); \ 465 } else { \ 466 ASSERT((head) == NULL); \ 467 (head) = (mp); \ 468 } \ 469 (tail) = (mp); \ 470 (cnt)++; \ 471 if ((bw_ctl)) \ 472 (sz) += (sz0); \ 473 } 474 475 #define MAC_FANOUT_DEFAULT 0 476 #define MAC_FANOUT_RND_ROBIN 1 477 int mac_fanout_type = MAC_FANOUT_DEFAULT; 478 479 #define MAX_SR_TYPES 3 480 /* fanout types for port based hashing */ 481 enum pkt_type { 482 V4_TCP = 0, 483 V4_UDP, 484 OTH, 485 UNDEF 486 }; 487 488 /* 489 * In general we do port based hashing to spread traffic over different 490 * softrings. The below tunable allows to override that behavior. Setting it 491 * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior 492 * is also the applicable to ipv6 packets carrying multiple optional headers 493 * and other uncommon packet types. 494 */ 495 boolean_t mac_src_ipv6_fanout = B_FALSE; 496 497 /* 498 * Pair of local and remote ports in the transport header 499 */ 500 #define PORTS_SIZE 4 501 502 /* 503 * mac_rx_srs_proto_fanout 504 * 505 * This routine delivers packets destined to an SRS into one of the 506 * protocol soft rings. 507 * 508 * Given a chain of packets we need to split it up into multiple sub chains 509 * destined into TCP, UDP or OTH soft ring. Instead of entering 510 * the soft ring one packet at a time, we want to enter it in the form of a 511 * chain otherwise we get this start/stop behaviour where the worker thread 512 * goes to sleep and then next packets comes in forcing it to wake up etc. 513 */ 514 static void 515 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 516 { 517 struct ether_header *ehp; 518 uint16_t etype; 519 ipha_t *ipha; 520 mac_soft_ring_t *softring; 521 size_t ether_hlen; 522 mblk_t *mp; 523 mblk_t *headmp[MAX_SR_TYPES]; 524 mblk_t *tailmp[MAX_SR_TYPES]; 525 int cnt[MAX_SR_TYPES]; 526 size_t sz[MAX_SR_TYPES]; 527 size_t sz1; 528 boolean_t bw_ctl = B_FALSE; 529 boolean_t hw_classified; 530 boolean_t dls_bypass = B_TRUE; 531 enum pkt_type type; 532 mac_client_impl_t *mcip = mac_srs->srs_mcip; 533 struct ether_vlan_header *evhp; 534 535 if (mac_srs->srs_type & SRST_BW_CONTROL) 536 bw_ctl = B_TRUE; 537 538 /* 539 * If we don't have a Rx ring, S/W classification would have done 540 * its job and its a packet meant for us. If we were polling on 541 * the default ring (i.e. there was a ring assigned to this SRS), 542 * then we need to make sure that the mac address really belongs 543 * to us. 544 */ 545 hw_classified = mac_srs->srs_ring != NULL && 546 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 547 548 /* 549 * Special clients (eg. VLAN, non ether, etc) need DLS 550 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 551 * such SRSs. 552 */ 553 if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) 554 dls_bypass = B_FALSE; 555 556 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 557 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 558 bzero(cnt, MAX_SR_TYPES * sizeof (int)); 559 bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 560 561 /* 562 * We got a chain from SRS that we need to send to the soft rings. 
563 * Since squeues for TCP & IPv4 sap poll their soft rings (for 564 * performance reasons), we need to separate out v4_tcp, v4_udp 565 * and the rest goes in other. 566 */ 567 while (head != NULL) { 568 mp = head; 569 head = head->b_next; 570 mp->b_next = NULL; 571 572 type = OTH; 573 sz1 = msgdsize(mp); 574 575 if (!dls_bypass) { 576 mac_impl_t *mip = mcip->mci_mip; 577 578 ehp = (struct ether_header *)mp->b_rptr; 579 580 /* 581 * For VLAN packets, if the VLAN id doesn't belong 582 * to this client, we drop the packet. 583 */ 584 if (mip->mi_info.mi_nativemedia == DL_ETHER && 585 ntohs(ehp->ether_type) == VLAN_TPID) { 586 /* 587 * LINTED: cast may result in improper 588 * alignment 589 */ 590 evhp = (struct ether_vlan_header *)ehp; 591 if (!mac_client_check_flow_vid(mcip, 592 VLAN_ID(ntohs(evhp->ether_tci)))) { 593 mac_rx_drop_pkt(mac_srs, mp); 594 continue; 595 } 596 } 597 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 598 cnt[type], bw_ctl, sz[type], sz1, mp); 599 continue; 600 } 601 602 /* 603 * At this point we can be sure the packet at least 604 * has an ether header. 605 */ 606 if (sz1 < sizeof (struct ether_header)) { 607 mac_rx_drop_pkt(mac_srs, mp); 608 continue; 609 } 610 /* LINTED: cast may result in improper alignment */ 611 ehp = (struct ether_header *)mp->b_rptr; 612 613 /* 614 * Determine if this is a VLAN or non-VLAN packet. 615 */ 616 if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { 617 /* LINTED: cast may result in improper alignment */ 618 evhp = (struct ether_vlan_header *)mp->b_rptr; 619 etype = ntohs(evhp->ether_type); 620 ether_hlen = sizeof (struct ether_vlan_header); 621 /* 622 * Check if the VID of the packet, if any, belongs 623 * to this client. 624 */ 625 if (!mac_client_check_flow_vid(mcip, 626 VLAN_ID(ntohs(evhp->ether_tci)))) { 627 mac_rx_drop_pkt(mac_srs, mp); 628 continue; 629 } 630 } else { 631 ether_hlen = sizeof (struct ether_header); 632 } 633 634 if (etype == ETHERTYPE_IP) { 635 /* 636 * If we are H/W classified, but we have promisc 637 * on, then we need to check for the unicast address. 638 */ 639 if (hw_classified && mcip->mci_promisc_list != NULL) { 640 mac_address_t *map; 641 642 rw_enter(&mcip->mci_rw_lock, RW_READER); 643 map = mcip->mci_unicast; 644 if (bcmp(&ehp->ether_dhost, map->ma_addr, 645 map->ma_len) == 0) 646 type = UNDEF; 647 rw_exit(&mcip->mci_rw_lock); 648 } else if (((((uint8_t *)&ehp->ether_dhost)[0] & 649 0x01) == 0)) { 650 type = UNDEF; 651 } 652 } 653 654 /* 655 * This needs to become a contract with the driver for 656 * the fast path. 657 * 658 * In the normal case the packet will have at least the L2 659 * header and the IP + Transport header in the same mblk. 660 * This is usually the case when the NIC driver sends up 661 * the packet. This is also true when the stack generates 662 * a packet that is looped back and when the stack uses the 663 * fastpath mechanism. The normal case is optimized for 664 * performance and may bypass DLS. All other cases go through 665 * the 'OTH' type path without DLS bypass. 666 */ 667 668 /* LINTED: cast may result in improper alignment */ 669 ipha = (ipha_t *)(mp->b_rptr + ether_hlen); 670 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 671 type = OTH; 672 673 if (type == OTH) { 674 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 675 cnt[type], bw_ctl, sz[type], sz1, mp); 676 continue; 677 } 678 679 ASSERT(type == UNDEF); 680 /* 681 * We look for at least 4 bytes past the IP header to get 682 * the port information. 
If we get an IP fragment, we don't 683 * have the port information, and we use just the protocol 684 * information. 685 */ 686 switch (ipha->ipha_protocol) { 687 case IPPROTO_TCP: 688 type = V4_TCP; 689 mp->b_rptr += ether_hlen; 690 break; 691 case IPPROTO_UDP: 692 type = V4_UDP; 693 mp->b_rptr += ether_hlen; 694 break; 695 default: 696 type = OTH; 697 break; 698 } 699 700 ASSERT(type != UNDEF); 701 702 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 703 bw_ctl, sz[type], sz1, mp); 704 } 705 706 for (type = V4_TCP; type < UNDEF; type++) { 707 if (headmp[type] != NULL) { 708 ASSERT(tailmp[type]->b_next == NULL); 709 switch (type) { 710 case V4_TCP: 711 softring = mac_srs->srs_tcp_soft_rings[0]; 712 break; 713 case V4_UDP: 714 softring = mac_srs->srs_udp_soft_rings[0]; 715 break; 716 case OTH: 717 softring = mac_srs->srs_oth_soft_rings[0]; 718 } 719 mac_rx_soft_ring_process(mac_srs->srs_mcip, softring, 720 headmp[type], tailmp[type], cnt[type], sz[type]); 721 } 722 } 723 } 724 725 int fanout_unalligned = 0; 726 727 /* 728 * mac_rx_srs_long_fanout 729 * 730 * The fanout routine for IPv6 731 */ 732 static int 733 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, 734 uint16_t etype, enum pkt_type *type, uint_t *indx) 735 { 736 ip6_t *ip6h; 737 uint8_t *whereptr; 738 uint_t hash; 739 uint16_t remlen; 740 uint8_t nexthdr; 741 uint16_t hdr_len; 742 743 if (etype == ETHERTYPE_IPV6) { 744 boolean_t modifiable = B_TRUE; 745 746 ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); 747 748 ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header)); 749 if ((unsigned char *)ip6h == mp->b_wptr) { 750 /* 751 * The first mblk_t only includes the ethernet header. 752 * Note that it is safe to change the mp pointer here, 753 * as the subsequent operation does not assume mp 754 * points to the start of the ethernet header. 755 */ 756 mp = mp->b_cont; 757 758 /* 759 * Make sure ip6h holds the full ip6_t structure. 760 */ 761 if (mp == NULL) 762 return (-1); 763 764 if (MBLKL(mp) < IPV6_HDR_LEN) { 765 modifiable = (DB_REF(mp) == 1); 766 767 if (modifiable && 768 !pullupmsg(mp, IPV6_HDR_LEN)) { 769 return (-1); 770 } 771 } 772 773 ip6h = (ip6_t *)mp->b_rptr; 774 } 775 776 if (!modifiable || !(OK_32PTR((char *)ip6h)) || 777 ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 778 /* 779 * If either ip6h is not alligned, or ip6h does not 780 * hold the complete ip6_t structure (a pullupmsg() 781 * is not an option since it would result in an 782 * unalligned ip6h), fanout to the default ring. Note 783 * that this may cause packets reordering. 784 */ 785 *indx = 0; 786 *type = OTH; 787 fanout_unalligned++; 788 return (0); 789 } 790 791 remlen = ntohs(ip6h->ip6_plen); 792 nexthdr = ip6h->ip6_nxt; 793 794 if (remlen < MIN_EHDR_LEN) 795 return (-1); 796 /* 797 * Do src based fanout if below tunable is set to B_TRUE or 798 * when mac_ip_hdr_length_v6() fails because of malformed 799 * packets or because mblk's need to be concatenated using 800 * pullupmsg(). 801 */ 802 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 803 &hdr_len, &nexthdr)) { 804 goto src_based_fanout; 805 } 806 whereptr = (uint8_t *)ip6h + hdr_len; 807 808 /* If the transport is one of below, we do port based fanout */ 809 switch (nexthdr) { 810 case IPPROTO_TCP: 811 case IPPROTO_UDP: 812 case IPPROTO_SCTP: 813 case IPPROTO_ESP: 814 /* 815 * If the ports in the transport header is not part of 816 * the mblk, do src_based_fanout, instead of calling 817 * pullupmsg(). 
818 */ 819 if (mp->b_cont != NULL && 820 whereptr + PORTS_SIZE > mp->b_wptr) { 821 goto src_based_fanout; 822 } 823 break; 824 default: 825 break; 826 } 827 828 switch (nexthdr) { 829 case IPPROTO_TCP: 830 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 831 *(uint32_t *)whereptr); 832 *indx = COMPUTE_INDEX(hash, 833 mac_srs->srs_tcp_ring_count); 834 *type = OTH; 835 break; 836 837 case IPPROTO_UDP: 838 case IPPROTO_SCTP: 839 case IPPROTO_ESP: 840 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 841 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 842 *(uint32_t *)whereptr); 843 *indx = COMPUTE_INDEX(hash, 844 mac_srs->srs_udp_ring_count); 845 } else { 846 *indx = mac_srs->srs_ind % 847 mac_srs->srs_udp_ring_count; 848 mac_srs->srs_ind++; 849 } 850 *type = OTH; 851 break; 852 853 /* For all other protocol, do source based fanout */ 854 default: 855 goto src_based_fanout; 856 } 857 } else { 858 *indx = 0; 859 *type = OTH; 860 } 861 return (0); 862 863 src_based_fanout: 864 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 865 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 866 *type = OTH; 867 return (0); 868 } 869 870 /* 871 * mac_rx_srs_fanout 872 * 873 * This routine delivers packets destined to an SRS into a soft ring member 874 * of the set. 875 * 876 * Given a chain of packets we need to split it up into multiple sub chains 877 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 878 * the soft ring one packet at a time, we want to enter it in the form of a 879 * chain otherwise we get this start/stop behaviour where the worker thread 880 * goes to sleep and then next packets comes in forcing it to wake up etc. 881 * 882 * Note: 883 * Since we know what is the maximum fanout possible, we create a 2D array 884 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 885 * variables so that we can enter the softrings with chain. We need the 886 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 887 * for each packet would be expensive). If we ever want to have the 888 * ability to have unlimited fanout, we should probably declare a head, 889 * tail, cnt, sz with each soft ring (a data struct which contains a softring 890 * along with these members) and create an array of this uber struct so we 891 * don't have to do kmem_alloc. 892 */ 893 int fanout_oth1 = 0; 894 int fanout_oth2 = 0; 895 int fanout_oth3 = 0; 896 int fanout_oth4 = 0; 897 int fanout_oth5 = 0; 898 899 static void 900 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 901 { 902 struct ether_header *ehp; 903 uint16_t etype; 904 ipha_t *ipha; 905 uint_t indx; 906 int ports_offset = -1; 907 int ipha_len; 908 uint_t hash; 909 mac_soft_ring_t *softring; 910 size_t ether_hlen; 911 uint16_t frag_offset_flags; 912 mblk_t *mp; 913 mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 914 mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 915 int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; 916 size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; 917 size_t sz1; 918 boolean_t bw_ctl = B_FALSE; 919 boolean_t hw_classified; 920 boolean_t dls_bypass = B_TRUE; 921 int i; 922 int fanout_cnt; 923 enum pkt_type type; 924 mac_client_impl_t *mcip = mac_srs->srs_mcip; 925 struct ether_vlan_header *evhp; 926 927 if (mac_srs->srs_type & SRST_BW_CONTROL) 928 bw_ctl = B_TRUE; 929 930 /* 931 * If we don't have a Rx ring, S/W classification would have done 932 * its job and its a packet meant for us. If we were polling on 933 * the default ring (i.e. 
there was a ring assigned to this SRS), 934 * then we need to make sure that the mac address really belongs 935 * to us. 936 */ 937 hw_classified = mac_srs->srs_ring != NULL && 938 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 939 940 /* 941 * Special clients (eg. VLAN, non ether, etc) need DLS 942 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 943 * such SRSs. 944 */ 945 if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) 946 dls_bypass = B_FALSE; 947 948 /* 949 * Since the softrings are never destroyed and we always 950 * create equal number of softrings for TCP, UDP and rest, 951 * its OK to check one of them for count and use it without 952 * any lock. In future, if soft rings get destroyed because 953 * of reduction in fanout, we will need to ensure that happens 954 * behind the SRS_PROC. 955 */ 956 fanout_cnt = mac_srs->srs_tcp_ring_count; 957 958 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 959 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 960 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); 961 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); 962 963 /* 964 * We got a chain from SRS that we need to send to the soft rings. 965 * Since squeues for TCP & IPv4 sap poll their soft rings (for 966 * performance reasons), we need to separate out v4_tcp, v4_udp 967 * and the rest goes in other. 968 */ 969 while (head != NULL) { 970 mp = head; 971 head = head->b_next; 972 mp->b_next = NULL; 973 974 type = OTH; 975 sz1 = msgdsize(mp); 976 977 if (!dls_bypass) { 978 mac_impl_t *mip = mcip->mci_mip; 979 980 indx = 0; 981 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 982 ehp = (struct ether_header *)mp->b_rptr; 983 etype = ntohs(ehp->ether_type); 984 /* 985 * For VLAN packets, if the VLAN id doesn't 986 * belong to this client, we drop the packet. 987 */ 988 if (etype == VLAN_TPID) { 989 /* 990 * LINTED: cast may result in improper 991 * alignment 992 */ 993 evhp = (struct ether_vlan_header *) 994 mp->b_rptr; 995 if (!mac_client_check_flow_vid(mcip, 996 VLAN_ID(ntohs(evhp->ether_tci)))) { 997 mac_rx_drop_pkt(mac_srs, mp); 998 continue; 999 } 1000 } 1001 if (mac_rx_srs_long_fanout(mac_srs, mp, etype, 1002 &type, &indx) == -1) { 1003 mac_rx_drop_pkt(mac_srs, mp); 1004 continue; 1005 } 1006 } 1007 1008 FANOUT_ENQUEUE_MP(headmp[type][indx], 1009 tailmp[type][indx], cnt[type][indx], bw_ctl, 1010 sz[type][indx], sz1, mp); 1011 continue; 1012 } 1013 1014 /* 1015 * At this point we can be sure the packet at least 1016 * has an ether header. On the outbound side, GLD/stack 1017 * ensure this. On the inbound side, the driver needs 1018 * to ensure this. 1019 */ 1020 if (sz1 < sizeof (struct ether_header)) { 1021 mac_rx_drop_pkt(mac_srs, mp); 1022 continue; 1023 } 1024 /* LINTED: cast may result in improper alignment */ 1025 ehp = (struct ether_header *)mp->b_rptr; 1026 1027 /* 1028 * Determine if this is a VLAN or non-VLAN packet. 1029 */ 1030 if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { 1031 /* LINTED: cast may result in improper alignment */ 1032 evhp = (struct ether_vlan_header *)mp->b_rptr; 1033 etype = ntohs(evhp->ether_type); 1034 ether_hlen = sizeof (struct ether_vlan_header); 1035 /* 1036 * Check if the VID of the packet, if any, belongs 1037 * to this client. 
1038 */ 1039 if (!mac_client_check_flow_vid(mcip, 1040 VLAN_ID(ntohs(evhp->ether_tci)))) { 1041 mac_rx_drop_pkt(mac_srs, mp); 1042 continue; 1043 } 1044 } else { 1045 ether_hlen = sizeof (struct ether_header); 1046 } 1047 1048 1049 /* 1050 * If we are using the default Rx ring where H/W or S/W 1051 * classification has not happened, we need to verify if 1052 * this unicast packet really belongs to us. 1053 */ 1054 if (etype == ETHERTYPE_IP) { 1055 /* 1056 * If we are H/W classified, but we have promisc 1057 * on, then we need to check for the unicast address. 1058 */ 1059 if (hw_classified && mcip->mci_promisc_list != NULL) { 1060 mac_address_t *map; 1061 1062 rw_enter(&mcip->mci_rw_lock, RW_READER); 1063 map = mcip->mci_unicast; 1064 if (bcmp(&ehp->ether_dhost, map->ma_addr, 1065 map->ma_len) == 0) 1066 type = UNDEF; 1067 rw_exit(&mcip->mci_rw_lock); 1068 } else if (((((uint8_t *)&ehp->ether_dhost)[0] & 1069 0x01) == 0)) { 1070 type = UNDEF; 1071 } 1072 } 1073 1074 /* 1075 * This needs to become a contract with the driver for 1076 * the fast path. 1077 */ 1078 1079 /* LINTED: cast may result in improper alignment */ 1080 ipha = (ipha_t *)(mp->b_rptr + ether_hlen); 1081 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { 1082 type = OTH; 1083 fanout_oth1++; 1084 } 1085 1086 if (type != OTH) { 1087 switch (ipha->ipha_protocol) { 1088 case IPPROTO_TCP: 1089 case IPPROTO_UDP: 1090 case IPPROTO_SCTP: 1091 case IPPROTO_ESP: 1092 ipha_len = IPH_HDR_LENGTH(ipha); 1093 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > 1094 mp->b_wptr) { 1095 type = OTH; 1096 break; 1097 } 1098 frag_offset_flags = 1099 ntohs(ipha->ipha_fragment_offset_and_flags); 1100 if ((frag_offset_flags & 1101 (IPH_MF | IPH_OFFSET)) != 0) { 1102 type = OTH; 1103 fanout_oth3++; 1104 break; 1105 } 1106 ports_offset = ether_hlen + ipha_len; 1107 break; 1108 default: 1109 type = OTH; 1110 fanout_oth4++; 1111 break; 1112 } 1113 } 1114 1115 if (type == OTH) { 1116 if (mac_rx_srs_long_fanout(mac_srs, mp, etype, 1117 &type, &indx) == -1) { 1118 mac_rx_drop_pkt(mac_srs, mp); 1119 continue; 1120 } 1121 1122 FANOUT_ENQUEUE_MP(headmp[type][indx], 1123 tailmp[type][indx], cnt[type][indx], bw_ctl, 1124 sz[type][indx], sz1, mp); 1125 continue; 1126 } 1127 1128 ASSERT(type == UNDEF); 1129 1130 /* 1131 * XXX-Sunay: We should hold srs_lock since ring_count 1132 * below can change. But if we are always called from 1133 * mac_rx_srs_drain and SRS_PROC is set, then we can 1134 * enforce that ring_count can't be changed i.e. 1135 * to change fanout type or ring count, the calling 1136 * thread needs to be behind SRS_PROC. 1137 */ 1138 switch (ipha->ipha_protocol) { 1139 case IPPROTO_TCP: 1140 /* 1141 * Note that for ESP, we fanout on SPI and it is at the 1142 * same offset as the 2x16-bit ports. So it is clumped 1143 * along with TCP, UDP and SCTP. 
1144 */ 1145 hash = HASH_ADDR(ipha->ipha_src, 1146 *(uint32_t *)(mp->b_rptr + ports_offset)); 1147 indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 1148 type = V4_TCP; 1149 mp->b_rptr += ether_hlen; 1150 break; 1151 case IPPROTO_UDP: 1152 case IPPROTO_SCTP: 1153 case IPPROTO_ESP: 1154 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 1155 hash = HASH_ADDR(ipha->ipha_src, 1156 *(uint32_t *)(mp->b_rptr + ports_offset)); 1157 indx = COMPUTE_INDEX(hash, 1158 mac_srs->srs_udp_ring_count); 1159 } else { 1160 indx = mac_srs->srs_ind % 1161 mac_srs->srs_udp_ring_count; 1162 mac_srs->srs_ind++; 1163 } 1164 type = V4_UDP; 1165 mp->b_rptr += ether_hlen; 1166 break; 1167 } 1168 1169 ASSERT(type != UNDEF); 1170 1171 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 1172 cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 1173 } 1174 1175 for (type = V4_TCP; type < UNDEF; type++) { 1176 for (i = 0; i < fanout_cnt; i++) { 1177 if (headmp[type][i] != NULL) { 1178 ASSERT(tailmp[type][i]->b_next == NULL); 1179 switch (type) { 1180 case V4_TCP: 1181 softring = 1182 mac_srs->srs_tcp_soft_rings[i]; 1183 break; 1184 case V4_UDP: 1185 softring = 1186 mac_srs->srs_udp_soft_rings[i]; 1187 break; 1188 case OTH: 1189 softring = 1190 mac_srs->srs_oth_soft_rings[i]; 1191 break; 1192 } 1193 mac_rx_soft_ring_process(mac_srs->srs_mcip, 1194 softring, headmp[type][i], tailmp[type][i], 1195 cnt[type][i], sz[type][i]); 1196 } 1197 } 1198 } 1199 } 1200 1201 #define SRS_BYTES_TO_PICKUP 150000 1202 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 1203 1204 /* 1205 * mac_rx_srs_poll_ring 1206 * 1207 * This SRS Poll thread uses this routine to poll the underlying hardware 1208 * Rx ring to get a chain of packets. It can inline process that chain 1209 * if mac_latency_optimize is set (default) or signal the SRS worker thread 1210 * to do the remaining processing. 1211 * 1212 * Since packets come in the system via interrupt or poll path, we also 1213 * update the stats and deal with promiscous clients here. 1214 */ 1215 void 1216 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 1217 { 1218 kmutex_t *lock = &mac_srs->srs_lock; 1219 kcondvar_t *async = &mac_srs->srs_cv; 1220 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1221 mblk_t *head, *tail, *mp; 1222 callb_cpr_t cprinfo; 1223 ssize_t bytes_to_pickup; 1224 size_t sz; 1225 int count; 1226 mac_client_impl_t *smcip; 1227 1228 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 1229 mutex_enter(lock); 1230 1231 start: 1232 for (;;) { 1233 if (mac_srs->srs_state & SRS_PAUSE) 1234 goto done; 1235 1236 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1237 cv_wait(async, lock); 1238 CALLB_CPR_SAFE_END(&cprinfo, lock); 1239 1240 if (mac_srs->srs_state & SRS_PAUSE) 1241 goto done; 1242 1243 check_again: 1244 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1245 /* 1246 * We pick as many bytes as we are allowed to queue. 1247 * Its possible that we will exceed the total 1248 * packets queued in case this SRS is part of the 1249 * Rx ring group since > 1 poll thread can be pulling 1250 * upto the max allowed packets at the same time 1251 * but that should be OK. 1252 */ 1253 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1254 bytes_to_pickup = 1255 mac_srs->srs_bw->mac_bw_drop_threshold - 1256 mac_srs->srs_bw->mac_bw_sz; 1257 /* 1258 * We shouldn't have been signalled if we 1259 * have 0 or less bytes to pick but since 1260 * some of the bytes accounting is driver 1261 * dependant, we do the safety check. 
1262 */ 1263 if (bytes_to_pickup < 0) 1264 bytes_to_pickup = 0; 1265 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1266 } else { 1267 /* 1268 * ToDO: Need to change the polling API 1269 * to add a packet count and a flag which 1270 * tells the driver whether we want packets 1271 * based on a count, or bytes, or all the 1272 * packets queued in the driver/HW. This 1273 * way, we never have to check the limits 1274 * on poll path. We truly let only as many 1275 * packets enter the system as we are willing 1276 * to process or queue. 1277 * 1278 * Something along the lines of 1279 * pkts_to_pickup = mac_soft_ring_max_q_cnt - 1280 * mac_srs->srs_poll_pkt_cnt 1281 */ 1282 1283 /* 1284 * Since we are not doing B/W control, pick 1285 * as many packets as allowed. 1286 */ 1287 bytes_to_pickup = max_bytes_to_pickup; 1288 } 1289 1290 /* Poll the underlying Hardware */ 1291 mutex_exit(lock); 1292 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 1293 mutex_enter(lock); 1294 1295 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 1296 SRS_POLL_THR_OWNER); 1297 1298 mp = tail = head; 1299 count = 0; 1300 sz = 0; 1301 while (mp != NULL) { 1302 tail = mp; 1303 sz += msgdsize(mp); 1304 mp = mp->b_next; 1305 count++; 1306 } 1307 1308 if (head != NULL) { 1309 tail->b_next = NULL; 1310 smcip = mac_srs->srs_mcip; 1311 1312 if ((mac_srs->srs_type & SRST_FLOW) || 1313 (smcip == NULL)) { 1314 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1315 rbytes, sz); 1316 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1317 ipackets, count); 1318 } 1319 1320 /* 1321 * If there are any promiscuous mode callbacks 1322 * defined for this MAC client, pass them a copy 1323 * if appropriate and also update the counters. 1324 */ 1325 if (smcip != NULL) { 1326 smcip->mci_stat_ibytes += sz; 1327 smcip->mci_stat_ipackets += count; 1328 1329 if (smcip->mci_mip->mi_promisc_list != NULL) { 1330 mutex_exit(lock); 1331 mac_promisc_dispatch(smcip->mci_mip, 1332 head, NULL); 1333 mutex_enter(lock); 1334 } 1335 } 1336 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1337 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1338 mac_srs->srs_bw->mac_bw_polled += sz; 1339 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1340 } 1341 srs_rx->sr_poll_count += count; 1342 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 1343 count, sz); 1344 if (count <= 10) 1345 srs_rx->sr_chain_cnt_undr10++; 1346 else if (count > 10 && count <= 50) 1347 srs_rx->sr_chain_cnt_10to50++; 1348 else 1349 srs_rx->sr_chain_cnt_over50++; 1350 } 1351 1352 /* 1353 * We are guaranteed that SRS_PROC will be set if we 1354 * are here. Also, poll thread gets to run only if 1355 * the drain was being done by a worker thread although 1356 * its possible that worker thread is still running 1357 * and poll thread was sent down to keep the pipeline 1358 * going instead of doing a complete drain and then 1359 * trying to poll the NIC. 1360 * 1361 * So we need to check SRS_WORKER flag to make sure 1362 * that the worker thread is not processing the queue 1363 * in parallel to us. The flags and conditions are 1364 * protected by the srs_lock to prevent any race. We 1365 * ensure that we don't drop the srs_lock from now 1366 * till the end and similarly we don't drop the srs_lock 1367 * in mac_rx_srs_drain() till similar condition check 1368 * are complete. The mac_rx_srs_drain() needs to ensure 1369 * that SRS_WORKER flag remains set as long as its 1370 * processing the queue. 
1371 */ 1372 if (!(mac_srs->srs_state & SRS_WORKER) && 1373 (mac_srs->srs_first != NULL)) { 1374 /* 1375 * We have packets to process and worker thread 1376 * is not running. Check to see if poll thread is 1377 * allowed to process. Let it do processing only if it 1378 * picked up some packets from the NIC otherwise 1379 * wakeup the worker thread. 1380 */ 1381 if ((mac_srs->srs_state & SRS_LATENCY_OPT) && 1382 (head != NULL)) { 1383 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 1384 if (srs_rx->sr_poll_pkt_cnt <= 1385 srs_rx->sr_lowat) { 1386 srs_rx->sr_poll_again++; 1387 goto check_again; 1388 } else { 1389 /* 1390 * We are already above low water mark 1391 * so stay in the polling mode but no 1392 * need to poll. Once we dip below 1393 * the polling threshold, the processing 1394 * thread (soft ring) will signal us 1395 * to poll again (MAC_UPDATE_SRS_COUNT) 1396 */ 1397 srs_rx->sr_poll_drain_no_poll++; 1398 mac_srs->srs_state &= 1399 ~(SRS_PROC|SRS_GET_PKTS); 1400 /* 1401 * In B/W control case, its possible 1402 * that the backlog built up due to 1403 * B/W limit being reached and packets 1404 * are queued only in SRS. In this case, 1405 * we should schedule worker thread 1406 * since no one else will wake us up. 1407 */ 1408 if ((mac_srs->srs_type & 1409 SRST_BW_CONTROL) && 1410 (mac_srs->srs_tid == NULL)) { 1411 mac_srs->srs_tid = 1412 timeout(mac_srs_fire, 1413 mac_srs, 1); 1414 srs_rx->sr_poll_worker_wakeup++; 1415 } 1416 } 1417 } else { 1418 /* 1419 * Wakeup the worker thread for more processing. 1420 * We optimize for throughput in this case. 1421 */ 1422 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1423 MAC_SRS_WORKER_WAKEUP(mac_srs); 1424 srs_rx->sr_poll_sig_worker++; 1425 } 1426 } else if ((mac_srs->srs_first == NULL) && 1427 !(mac_srs->srs_state & SRS_WORKER)) { 1428 /* 1429 * There is nothing queued in SRS and 1430 * no worker thread running. Plus we 1431 * didn't get anything from the H/W 1432 * as well (head == NULL); 1433 */ 1434 ASSERT(head == NULL); 1435 mac_srs->srs_state &= 1436 ~(SRS_PROC|SRS_GET_PKTS); 1437 1438 /* 1439 * If we have a packets in soft ring, don't allow 1440 * more packets to come into this SRS by keeping the 1441 * interrupts off but not polling the H/W. The 1442 * poll thread will get signaled as soon as 1443 * srs_poll_pkt_cnt dips below poll threshold. 1444 */ 1445 if (srs_rx->sr_poll_pkt_cnt == 0) { 1446 srs_rx->sr_poll_intr_enable++; 1447 MAC_SRS_POLLING_OFF(mac_srs); 1448 } else { 1449 /* 1450 * We know nothing is queued in SRS 1451 * since we are here after checking 1452 * srs_first is NULL. The backlog 1453 * is entirely due to packets queued 1454 * in Soft ring which will wake us up 1455 * and get the interface out of polling 1456 * mode once the backlog dips below 1457 * sr_poll_thres. 1458 */ 1459 srs_rx->sr_poll_no_poll++; 1460 } 1461 } else { 1462 /* 1463 * Worker thread is already running. 1464 * Nothing much to do. If the polling 1465 * was enabled, worker thread will deal 1466 * with that. 1467 */ 1468 mac_srs->srs_state &= ~SRS_GET_PKTS; 1469 srs_rx->sr_poll_goto_sleep++; 1470 } 1471 } 1472 done: 1473 mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 1474 cv_signal(&mac_srs->srs_async); 1475 /* 1476 * If this is a temporary quiesce then wait for the restart signal 1477 * from the srs worker. Then clear the flags and signal the srs worker 1478 * to ensure a positive handshake and go back to start. 
1479 */ 1480 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 1481 cv_wait(async, lock); 1482 if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 1483 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 1484 mac_srs->srs_state &= 1485 ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 1486 cv_signal(&mac_srs->srs_async); 1487 goto start; 1488 } else { 1489 mac_srs->srs_state |= SRS_POLL_THR_EXITED; 1490 cv_signal(&mac_srs->srs_async); 1491 CALLB_CPR_EXIT(&cprinfo); 1492 thread_exit(); 1493 } 1494 } 1495 1496 /* 1497 * mac_srs_pick_chain 1498 * 1499 * In Bandwidth control case, checks how many packets can be processed 1500 * and return them in a sub chain. 1501 */ 1502 static mblk_t * 1503 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 1504 size_t *chain_sz, int *chain_cnt) 1505 { 1506 mblk_t *head = NULL; 1507 mblk_t *tail = NULL; 1508 size_t sz; 1509 size_t tsz = 0; 1510 int cnt = 0; 1511 mblk_t *mp; 1512 1513 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1514 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1515 if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 1516 mac_srs->srs_bw->mac_bw_limit) || 1517 (mac_srs->srs_bw->mac_bw_limit == 0)) { 1518 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1519 head = mac_srs->srs_first; 1520 mac_srs->srs_first = NULL; 1521 *chain_tail = mac_srs->srs_last; 1522 mac_srs->srs_last = NULL; 1523 *chain_sz = mac_srs->srs_size; 1524 *chain_cnt = mac_srs->srs_count; 1525 mac_srs->srs_count = 0; 1526 mac_srs->srs_size = 0; 1527 return (head); 1528 } 1529 1530 /* 1531 * Can't clear the entire backlog. 1532 * Need to find how many packets to pick 1533 */ 1534 ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 1535 while ((mp = mac_srs->srs_first) != NULL) { 1536 sz = msgdsize(mp); 1537 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 1538 mac_srs->srs_bw->mac_bw_limit) { 1539 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 1540 mac_srs->srs_bw->mac_bw_state |= 1541 SRS_BW_ENFORCED; 1542 break; 1543 } 1544 1545 /* 1546 * The _size & cnt is decremented from the softrings 1547 * when they send up the packet for polling to work 1548 * properly. 1549 */ 1550 tsz += sz; 1551 cnt++; 1552 mac_srs->srs_count--; 1553 mac_srs->srs_size -= sz; 1554 if (tail != NULL) 1555 tail->b_next = mp; 1556 else 1557 head = mp; 1558 tail = mp; 1559 mac_srs->srs_first = mac_srs->srs_first->b_next; 1560 } 1561 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1562 if (mac_srs->srs_first == NULL) 1563 mac_srs->srs_last = NULL; 1564 1565 if (tail != NULL) 1566 tail->b_next = NULL; 1567 *chain_tail = tail; 1568 *chain_cnt = cnt; 1569 *chain_sz = tsz; 1570 1571 return (head); 1572 } 1573 1574 /* 1575 * mac_rx_srs_drain 1576 * 1577 * The SRS drain routine. Gets to run to clear the queue. Any thread 1578 * (worker, interrupt, poll) can call this based on processing model. 1579 * The first thing we do is disable interrupts if possible and then 1580 * drain the queue. we also try to poll the underlying hardware if 1581 * there is a dedicated hardware Rx ring assigned to this SRS. 1582 * 1583 * There is a equivalent drain routine in bandwidth control mode 1584 * mac_rx_srs_drain_bw. There is some code duplication between the two 1585 * routines but they are highly performance sensitive and are easier 1586 * to read/debug if they stay separate. Any code changes here might 1587 * also apply to mac_rx_srs_drain_bw as well. 
1588 */ 1589 void 1590 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1591 { 1592 mblk_t *head; 1593 mblk_t *tail; 1594 timeout_id_t tid; 1595 int cnt = 0; 1596 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1597 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1598 1599 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1600 ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); 1601 again: 1602 /* If we are blanked i.e. can't do upcalls, then we are done */ 1603 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1604 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1605 (mac_srs->srs_state & SRS_PAUSE)); 1606 goto out; 1607 } 1608 1609 if (mac_srs->srs_first == NULL) 1610 goto out; 1611 1612 head = mac_srs->srs_first; 1613 mac_srs->srs_first = NULL; 1614 tail = mac_srs->srs_last; 1615 mac_srs->srs_last = NULL; 1616 cnt = mac_srs->srs_count; 1617 mac_srs->srs_count = 0; 1618 1619 ASSERT(head != NULL); 1620 ASSERT(tail != NULL); 1621 1622 if ((tid = mac_srs->srs_tid) != 0) 1623 mac_srs->srs_tid = 0; 1624 1625 mac_srs->srs_state |= (SRS_PROC|proc_type); 1626 1627 /* Switch to polling mode */ 1628 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1629 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1630 MAC_SRS_POLL_RING(mac_srs); 1631 /* 1632 * mcip is NULL for broadcast and multicast flows. The promisc 1633 * callbacks for broadcast and multicast packets are delivered from 1634 * mac_rx() and we don't need to worry about that case in this path 1635 */ 1636 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1637 mutex_exit(&mac_srs->srs_lock); 1638 mac_promisc_client_dispatch(mcip, head); 1639 mutex_enter(&mac_srs->srs_lock); 1640 } 1641 1642 /* 1643 * Check if SRS itself is doing the processing 1644 * This direct path does not apply when subflows are present. In this 1645 * case, packets need to be dispatched to a soft ring according to the 1646 * flow's bandwidth and other resources contraints. 1647 */ 1648 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1649 mac_direct_rx_t proc; 1650 void *arg1; 1651 mac_resource_handle_t arg2; 1652 1653 /* 1654 * This is the case when a Rx is directly 1655 * assigned and we have a fully classified 1656 * protocol chain. We can deal with it in 1657 * one shot. 1658 */ 1659 proc = srs_rx->sr_func; 1660 arg1 = srs_rx->sr_arg1; 1661 arg2 = srs_rx->sr_arg2; 1662 1663 mac_srs->srs_state |= SRS_CLIENT_PROC; 1664 mutex_exit(&mac_srs->srs_lock); 1665 if (tid != 0) { 1666 (void) untimeout(tid); 1667 tid = 0; 1668 } 1669 1670 proc(arg1, arg2, head, NULL); 1671 /* 1672 * Decrement the size and count here itelf 1673 * since the packet has been processed. 1674 */ 1675 mutex_enter(&mac_srs->srs_lock); 1676 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1677 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1678 cv_signal(&mac_srs->srs_client_cv); 1679 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1680 } else { 1681 /* Some kind of softrings based fanout is required */ 1682 mutex_exit(&mac_srs->srs_lock); 1683 if (tid != 0) { 1684 (void) untimeout(tid); 1685 tid = 0; 1686 } 1687 1688 /* 1689 * Since the fanout routines can deal with chains, 1690 * shoot the entire chain up. 1691 */ 1692 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1693 mac_rx_srs_fanout(mac_srs, head); 1694 else 1695 mac_rx_srs_proto_fanout(mac_srs, head); 1696 mutex_enter(&mac_srs->srs_lock); 1697 } 1698 1699 /* 1700 * Send the poll thread to pick up any packets arrived 1701 * so far. This also serves as the last check in case 1702 * nothing else is queued in the SRS. 
The poll thread 1703 * is signalled only in the case the drain was done 1704 * by the worker thread and SRS_WORKER is set. The 1705 * worker thread can run in parallel as long as the 1706 * SRS_WORKER flag is set. We we have nothing else to 1707 * process, we can exit while leaving SRS_PROC set 1708 * which gives the poll thread control to process and 1709 * cleanup once it returns from the NIC. 1710 * 1711 * If we have nothing else to process, we need to 1712 * ensure that we keep holding the srs_lock till 1713 * all the checks below are done and control is 1714 * handed to the poll thread if it was running. 1715 */ 1716 if (mac_srs->srs_first != NULL) { 1717 if (proc_type == SRS_WORKER) { 1718 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1719 MAC_SRS_POLL_RING(mac_srs); 1720 srs_rx->sr_drain_again++; 1721 goto again; 1722 } else { 1723 srs_rx->sr_drain_worker_sig++; 1724 cv_signal(&mac_srs->srs_async); 1725 } 1726 } 1727 1728 out: 1729 1730 if (mac_srs->srs_state & SRS_GET_PKTS) { 1731 /* 1732 * Poll thread is already running. Leave the 1733 * SRS_RPOC set and hand over the control to 1734 * poll thread. 1735 */ 1736 mac_srs->srs_state &= ~proc_type; 1737 srs_rx->sr_drain_poll_running++; 1738 return; 1739 } 1740 1741 /* 1742 * Even if there are no packets queued in SRS, we 1743 * need to make sure that the shared counter is 1744 * clear and any associated softrings have cleared 1745 * all the backlog. Otherwise, leave the interface 1746 * in polling mode and the poll thread will get 1747 * signalled once the count goes down to zero. 1748 * 1749 * If someone is already draining the queue (SRS_PROC is 1750 * set) when the srs_poll_pkt_cnt goes down to zero, 1751 * then it means that drain is already running and we 1752 * will turn off polling at that time if there is 1753 * no backlog. 1754 * 1755 * As long as there are packets queued either 1756 * in soft ring set or its soft rings, we will leave 1757 * the interface in polling mode (even if the drain 1758 * was done being the interrupt thread). We signal 1759 * the poll thread as well if we have dipped below 1760 * low water mark. 1761 * 1762 * NOTE: We can't use the MAC_SRS_POLLING_ON macro 1763 * since that turn polling on only for worker thread. 1764 * Its not worth turning polling on for interrupt 1765 * thread (since NIC will not issue another interrupt) 1766 * unless a backlog builds up. 1767 */ 1768 if ((srs_rx->sr_poll_pkt_cnt > 0) && 1769 (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 1770 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1771 srs_rx->sr_drain_keep_polling++; 1772 MAC_SRS_POLLING_ON(mac_srs); 1773 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1774 MAC_SRS_POLL_RING(mac_srs); 1775 return; 1776 } 1777 1778 /* Nothing else to do. Get out of poll mode */ 1779 MAC_SRS_POLLING_OFF(mac_srs); 1780 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1781 srs_rx->sr_drain_finish_intr++; 1782 } 1783 1784 /* 1785 * mac_rx_srs_drain_bw 1786 * 1787 * The SRS BW drain routine. Gets to run to clear the queue. Any thread 1788 * (worker, interrupt, poll) can call this based on processing model. 1789 * The first thing we do is disable interrupts if possible and then 1790 * drain the queue. we also try to poll the underlying hardware if 1791 * there is a dedicated hardware Rx ring assigned to this SRS. 1792 * 1793 * There is a equivalent drain routine in non bandwidth control mode 1794 * mac_rx_srs_drain. 
There is some code duplication between the two 1795 * routines but they are highly performance sensitive and are easier 1796 * to read/debug if they stay separate. Any code changes here might 1797 * also apply to mac_rx_srs_drain as well. 1798 */ 1799 void 1800 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1801 { 1802 mblk_t *head; 1803 mblk_t *tail; 1804 timeout_id_t tid; 1805 size_t sz = 0; 1806 int cnt = 0; 1807 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1808 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1809 1810 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1811 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 1812 again: 1813 /* Check if we are doing B/W control */ 1814 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1815 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 1816 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 1817 mac_srs->srs_bw->mac_bw_used = 0; 1818 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 1819 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 1820 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 1821 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1822 goto done; 1823 } else if (mac_srs->srs_bw->mac_bw_used > 1824 mac_srs->srs_bw->mac_bw_limit) { 1825 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 1826 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1827 goto done; 1828 } 1829 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1830 1831 /* If we are blanked i.e. can't do upcalls, then we are done */ 1832 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1833 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1834 (mac_srs->srs_state & SRS_PAUSE)); 1835 goto done; 1836 } 1837 1838 sz = 0; 1839 cnt = 0; 1840 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 1841 /* 1842 * We couldn't pick up a single packet. 1843 */ 1844 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1845 if ((mac_srs->srs_bw->mac_bw_used == 0) && 1846 (mac_srs->srs_size != 0) && 1847 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1848 /* 1849 * Seems like configured B/W doesn't 1850 * even allow processing of 1 packet 1851 * per tick. 1852 * 1853 * XXX: raise the limit to processing 1854 * at least 1 packet per tick. 1855 */ 1856 mac_srs->srs_bw->mac_bw_limit += 1857 mac_srs->srs_bw->mac_bw_limit; 1858 mac_srs->srs_bw->mac_bw_drop_threshold += 1859 mac_srs->srs_bw->mac_bw_drop_threshold; 1860 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1861 "raised B/W limit to %d since not even a " 1862 "single packet can be processed per " 1863 "tick %d\n", (void *)mac_srs, 1864 (int)mac_srs->srs_bw->mac_bw_limit, 1865 (int)msgdsize(mac_srs->srs_first)); 1866 } 1867 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1868 goto done; 1869 } 1870 1871 ASSERT(head != NULL); 1872 ASSERT(tail != NULL); 1873 1874 /* zero bandwidth: drop all and return to interrupt mode */ 1875 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1876 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1877 srs_rx->sr_drop_count += cnt; 1878 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1879 mac_srs->srs_bw->mac_bw_sz -= sz; 1880 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1881 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1882 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1883 goto leave_poll; 1884 } else { 1885 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1886 } 1887 1888 /* 1889 * We can continue processing the queue. 1890 * We need to figure out if there is a fanout needed or 1891 * we can just process this here. 
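 *
 * In outline (a simplified sketch of the dispatch below, not a verbatim
 * excerpt), the choice is:
 *
 *	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS)
 *		srs_rx->sr_func(sr_arg1, sr_arg2, head, NULL);
 *	else if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
 *		mac_rx_srs_fanout(mac_srs, head);
 *	else
 *		mac_rx_srs_proto_fanout(mac_srs, head);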
1892 */ 1893 1894 if ((tid = mac_srs->srs_tid) != 0) 1895 mac_srs->srs_tid = 0; 1896 1897 mac_srs->srs_state |= (SRS_PROC|proc_type); 1898 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1899 1900 /* 1901 * mcip is NULL for broadcast and multicast flows. The promisc 1902 * callbacks for broadcast and multicast packets are delivered from 1903 * mac_rx() and we don't need to worry about that case in this path 1904 */ 1905 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1906 mutex_exit(&mac_srs->srs_lock); 1907 mac_promisc_client_dispatch(mcip, head); 1908 mutex_enter(&mac_srs->srs_lock); 1909 } 1910 1911 /* 1912 * Check if SRS itself is doing the processing 1913 * This direct path does not apply when subflows are present. In this 1914 * case, packets need to be dispatched to a soft ring according to the 1915 * flow's bandwidth and other resources contraints. 1916 */ 1917 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1918 mac_direct_rx_t proc; 1919 void *arg1; 1920 mac_resource_handle_t arg2; 1921 1922 /* 1923 * This is the case when a Rx is directly 1924 * assigned and we have a fully classified 1925 * protocol chain. We can deal with it in 1926 * one shot. 1927 */ 1928 proc = srs_rx->sr_func; 1929 arg1 = srs_rx->sr_arg1; 1930 arg2 = srs_rx->sr_arg2; 1931 1932 mac_srs->srs_state |= SRS_CLIENT_PROC; 1933 mutex_exit(&mac_srs->srs_lock); 1934 if (tid != 0) { 1935 (void) untimeout(tid); 1936 tid = 0; 1937 } 1938 1939 proc(arg1, arg2, head, NULL); 1940 /* 1941 * Decrement the size and count here itelf 1942 * since the packet has been processed. 1943 */ 1944 mutex_enter(&mac_srs->srs_lock); 1945 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1946 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 1947 1948 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1949 cv_signal(&mac_srs->srs_client_cv); 1950 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1951 } else { 1952 /* Some kind of softrings based fanout is required */ 1953 mutex_exit(&mac_srs->srs_lock); 1954 if (tid != 0) { 1955 (void) untimeout(tid); 1956 tid = 0; 1957 } 1958 1959 /* 1960 * Since the fanout routines can deal with chains, 1961 * shoot the entire chain up. 1962 */ 1963 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1964 mac_rx_srs_fanout(mac_srs, head); 1965 else 1966 mac_rx_srs_proto_fanout(mac_srs, head); 1967 mutex_enter(&mac_srs->srs_lock); 1968 } 1969 1970 /* 1971 * Send the poll thread to pick up any packets arrived 1972 * so far. This also serves as the last check in case 1973 * nothing else is queued in the SRS. The poll thread 1974 * is signalled only in the case the drain was done 1975 * by the worker thread and SRS_WORKER is set. The 1976 * worker thread can run in parallel as long as the 1977 * SRS_WORKER flag is set. We we have nothing else to 1978 * process, we can exit while leaving SRS_PROC set 1979 * which gives the poll thread control to process and 1980 * cleanup once it returns from the NIC. 1981 * 1982 * If we have nothing else to process, we need to 1983 * ensure that we keep holding the srs_lock till 1984 * all the checks below are done and control is 1985 * handed to the poll thread if it was running. 
1986 */ 1987 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1988 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1989 if (mac_srs->srs_first != NULL) { 1990 if (proc_type == SRS_WORKER) { 1991 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1992 if (srs_rx->sr_poll_pkt_cnt <= 1993 srs_rx->sr_lowat) 1994 MAC_SRS_POLL_RING(mac_srs); 1995 goto again; 1996 } else { 1997 cv_signal(&mac_srs->srs_async); 1998 } 1999 } 2000 } 2001 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2002 2003 done: 2004 2005 if (mac_srs->srs_state & SRS_GET_PKTS) { 2006 /* 2007 * Poll thread is already running. Leave the 2008 * SRS_RPOC set and hand over the control to 2009 * poll thread. 2010 */ 2011 mac_srs->srs_state &= ~proc_type; 2012 return; 2013 } 2014 2015 /* 2016 * If we can't process packets because we have exceeded 2017 * B/W limit for this tick, just set the timeout 2018 * and leave. 2019 * 2020 * Even if there are no packets queued in SRS, we 2021 * need to make sure that the shared counter is 2022 * clear and any associated softrings have cleared 2023 * all the backlog. Otherwise, leave the interface 2024 * in polling mode and the poll thread will get 2025 * signalled once the count goes down to zero. 2026 * 2027 * If someone is already draining the queue (SRS_PROC is 2028 * set) when the srs_poll_pkt_cnt goes down to zero, 2029 * then it means that drain is already running and we 2030 * will turn off polling at that time if there is 2031 * no backlog. As long as there are packets queued either 2032 * is soft ring set or its soft rings, we will leave 2033 * the interface in polling mode. 2034 */ 2035 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2036 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 2037 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 2038 (srs_rx->sr_poll_pkt_cnt > 0))) { 2039 MAC_SRS_POLLING_ON(mac_srs); 2040 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2041 if ((mac_srs->srs_first != NULL) && 2042 (mac_srs->srs_tid == NULL)) 2043 mac_srs->srs_tid = timeout(mac_srs_fire, 2044 mac_srs, 1); 2045 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2046 return; 2047 } 2048 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2049 2050 leave_poll: 2051 2052 /* Nothing else to do. Get out of poll mode */ 2053 MAC_SRS_POLLING_OFF(mac_srs); 2054 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2055 } 2056 2057 /* 2058 * mac_srs_worker 2059 * 2060 * The SRS worker routine. Drains the queue when no one else is 2061 * processing it. 2062 */ 2063 void 2064 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2065 { 2066 kmutex_t *lock = &mac_srs->srs_lock; 2067 kcondvar_t *async = &mac_srs->srs_async; 2068 callb_cpr_t cprinfo; 2069 boolean_t bw_ctl_flag; 2070 2071 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2072 mutex_enter(lock); 2073 2074 start: 2075 for (;;) { 2076 bw_ctl_flag = B_FALSE; 2077 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2078 MAC_SRS_BW_LOCK(mac_srs); 2079 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2080 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2081 bw_ctl_flag = B_TRUE; 2082 MAC_SRS_BW_UNLOCK(mac_srs); 2083 } 2084 /* 2085 * The SRS_BW_ENFORCED flag may change since we have dropped 2086 * the mac_bw_lock. However the drain function can handle both 2087 * a drainable SRS or a bandwidth controlled SRS, and the 2088 * effect of scheduling a timeout is to wakeup the worker 2089 * thread which in turn will call the drain function. Since 2090 * we release the srs_lock atomically only in the cv_wait there 2091 * isn't a fear of waiting for ever. 
2092 */ 2093 while (((mac_srs->srs_state & SRS_PROC) || 2094 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2095 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2096 !(mac_srs->srs_state & SRS_PAUSE)) { 2097 /* 2098 * If we have packets queued and we are here 2099 * because B/W control is in place, we better 2100 * schedule the worker wakeup after 1 tick 2101 * to see if bandwidth control can be relaxed. 2102 */ 2103 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2104 /* 2105 * We need to ensure that a timer is already 2106 * scheduled or we force schedule one for 2107 * later so that we can continue processing 2108 * after this quanta is over. 2109 */ 2110 mac_srs->srs_tid = timeout(mac_srs_fire, 2111 mac_srs, 1); 2112 } 2113 wait: 2114 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2115 cv_wait(async, lock); 2116 CALLB_CPR_SAFE_END(&cprinfo, lock); 2117 2118 if (mac_srs->srs_state & SRS_PAUSE) 2119 goto done; 2120 if (mac_srs->srs_state & SRS_PROC) 2121 goto wait; 2122 2123 if (mac_srs->srs_first != NULL && 2124 mac_srs->srs_type & SRST_BW_CONTROL) { 2125 MAC_SRS_BW_LOCK(mac_srs); 2126 if (mac_srs->srs_bw->mac_bw_state & 2127 SRS_BW_ENFORCED) { 2128 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2129 } 2130 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2131 SRS_BW_ENFORCED; 2132 MAC_SRS_BW_UNLOCK(mac_srs); 2133 } 2134 } 2135 2136 if (mac_srs->srs_state & SRS_PAUSE) 2137 goto done; 2138 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2139 } 2140 done: 2141 /* 2142 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2143 * from both hard and soft classifications and waits for such threads 2144 * to finish before signaling the worker. So at this point the only 2145 * thread left that could be competing with the worker is the poll 2146 * thread. In the case of Tx, there shouldn't be any thread holding 2147 * SRS_PROC at this point. 2148 */ 2149 if (!(mac_srs->srs_state & SRS_PROC)) { 2150 mac_srs->srs_state |= SRS_PROC; 2151 } else { 2152 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2153 /* 2154 * Poll thread still owns the SRS and is still running 2155 */ 2156 ASSERT((mac_srs->srs_poll_thr == NULL) || 2157 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2158 SRS_POLL_THR_OWNER)); 2159 } 2160 mac_srs_worker_quiesce(mac_srs); 2161 /* 2162 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2163 * of the quiesce operation 2164 */ 2165 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2166 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2167 2168 if (mac_srs->srs_state & SRS_RESTART) { 2169 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2170 mac_srs_worker_restart(mac_srs); 2171 mac_srs->srs_state &= ~SRS_PROC; 2172 goto start; 2173 } 2174 2175 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2176 mac_srs_worker_quiesce(mac_srs); 2177 2178 mac_srs->srs_state &= ~SRS_PROC; 2179 /* The macro drops the srs_lock */ 2180 CALLB_CPR_EXIT(&cprinfo); 2181 thread_exit(); 2182 } 2183 2184 /* 2185 * mac_rx_srs_subflow_process 2186 * 2187 * Receive side routine called from interrupt path when there are 2188 * sub flows present on this SRS. 
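 *
 * In outline (a simplified sketch of the loop below), the chain is cut
 * wherever the classified flow changes and each sub-chain is dispatched
 * as a unit:
 *
 *	for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
 *		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
 *		    FLOW_INBOUND, &flent);
 *		if (flent != prev_flent) {
 *			(terminate the sub-chain collected so far and hand
 *			it to prev_flent->fe_cb_fn(), or back to
 *			mac_rx_srs_process() when the flow has no SRS of
 *			its own, i.e. fe_rx_srs_cnt == 0, then start a new
 *			sub-chain with mp)
 *		}
 *	}
 *	(the final sub-chain is dispatched the same way after the loop)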
2189 */ 2190 /* ARGSUSED */ 2191 void 2192 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2193 mblk_t *mp_chain, boolean_t loopback) 2194 { 2195 flow_entry_t *flent = NULL; 2196 flow_entry_t *prev_flent = NULL; 2197 mblk_t *mp = NULL; 2198 mblk_t *tail = NULL; 2199 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2200 mac_client_impl_t *mcip; 2201 2202 mcip = mac_srs->srs_mcip; 2203 ASSERT(mcip != NULL); 2204 2205 /* 2206 * We need to determine the SRS for every packet 2207 * by walking the flow table, if we don't get any, 2208 * then we proceed using the SRS we came with. 2209 */ 2210 mp = tail = mp_chain; 2211 while (mp != NULL) { 2212 2213 /* 2214 * We will increment the stats for the mactching subflow. 2215 * when we get the bytes/pkt count for the classified packets 2216 * later in mac_rx_srs_process. 2217 */ 2218 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2219 FLOW_INBOUND, &flent); 2220 2221 if (mp == mp_chain || flent == prev_flent) { 2222 if (prev_flent != NULL) 2223 FLOW_REFRELE(prev_flent); 2224 prev_flent = flent; 2225 flent = NULL; 2226 tail = mp; 2227 mp = mp->b_next; 2228 continue; 2229 } 2230 tail->b_next = NULL; 2231 /* 2232 * A null indicates, this is for the mac_srs itself. 2233 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2234 */ 2235 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2236 mac_rx_srs_process(arg, 2237 (mac_resource_handle_t)mac_srs, mp_chain, 2238 loopback); 2239 } else { 2240 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2241 prev_flent->fe_cb_arg2, mp_chain, loopback); 2242 FLOW_REFRELE(prev_flent); 2243 } 2244 prev_flent = flent; 2245 flent = NULL; 2246 mp_chain = mp; 2247 tail = mp; 2248 mp = mp->b_next; 2249 } 2250 /* Last chain */ 2251 ASSERT(mp_chain != NULL); 2252 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2253 mac_rx_srs_process(arg, 2254 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2255 } else { 2256 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2257 prev_flent->fe_cb_arg2, mp_chain, loopback); 2258 FLOW_REFRELE(prev_flent); 2259 } 2260 } 2261 2262 /* 2263 * mac_rx_srs_process 2264 * 2265 * Receive side routine called from the interrupt path. 2266 * 2267 * loopback is set to force a context switch on the loopback 2268 * path between MAC clients. 2269 */ 2270 /* ARGSUSED */ 2271 void 2272 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2273 boolean_t loopback) 2274 { 2275 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2276 mblk_t *mp, *tail, *head; 2277 int count = 0; 2278 int count1; 2279 size_t sz = 0; 2280 size_t chain_sz, sz1; 2281 mac_bw_ctl_t *mac_bw; 2282 mac_client_impl_t *smcip; 2283 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2284 2285 /* 2286 * Set the tail, count and sz. We set the sz irrespective 2287 * of whether we are doing B/W control or not for the 2288 * purpose of updating the stats. 
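 *
 * When B/W control is active and the backlog is close to the drop
 * threshold, only a leading portion of the chain is enqueued further
 * down; roughly (simplified sketch, not a verbatim excerpt):
 *
 *	while (mp != NULL && mac_bw->mac_bw_sz + chain_sz + msgdsize(mp) <=
 *	    mac_bw->mac_bw_drop_threshold) {
 *		chain_sz += msgdsize(mp);
 *		count1++;
 *		tail = mp;
 *		mp = mp->b_next;
 *	}
 *	(mp_chain through tail is enqueued; anything beyond tail is freed
 *	and charged to sr_drop_count and mac_bw_drop_bytes)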
2289 */ 2290 mp = tail = mp_chain; 2291 while (mp != NULL) { 2292 tail = mp; 2293 count++; 2294 sz += msgdsize(mp); 2295 mp = mp->b_next; 2296 } 2297 2298 mutex_enter(&mac_srs->srs_lock); 2299 smcip = mac_srs->srs_mcip; 2300 2301 if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) { 2302 FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz); 2303 FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count); 2304 } 2305 if (smcip != NULL) { 2306 smcip->mci_stat_ibytes += sz; 2307 smcip->mci_stat_ipackets += count; 2308 } 2309 2310 /* 2311 * If the SRS in already being processed; has been blanked; 2312 * can be processed by worker thread only; or the B/W limit 2313 * has been reached, then queue the chain and check if 2314 * worker thread needs to be awakend. 2315 */ 2316 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2317 mac_bw = mac_srs->srs_bw; 2318 ASSERT(mac_bw != NULL); 2319 mutex_enter(&mac_bw->mac_bw_lock); 2320 /* Count the packets and bytes via interrupt */ 2321 srs_rx->sr_intr_count += count; 2322 mac_bw->mac_bw_intr += sz; 2323 if (mac_bw->mac_bw_limit == 0) { 2324 /* zero bandwidth: drop all */ 2325 srs_rx->sr_drop_count += count; 2326 mac_bw->mac_bw_drop_bytes += sz; 2327 mutex_exit(&mac_bw->mac_bw_lock); 2328 mutex_exit(&mac_srs->srs_lock); 2329 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2330 return; 2331 } else { 2332 if ((mac_bw->mac_bw_sz + sz) <= 2333 mac_bw->mac_bw_drop_threshold) { 2334 mutex_exit(&mac_bw->mac_bw_lock); 2335 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2336 tail, count, sz); 2337 } else { 2338 mp = mp_chain; 2339 chain_sz = 0; 2340 count1 = 0; 2341 tail = NULL; 2342 head = NULL; 2343 while (mp != NULL) { 2344 sz1 = msgdsize(mp); 2345 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2346 mac_bw->mac_bw_drop_threshold) 2347 break; 2348 chain_sz += sz1; 2349 count1++; 2350 tail = mp; 2351 mp = mp->b_next; 2352 } 2353 mutex_exit(&mac_bw->mac_bw_lock); 2354 if (tail != NULL) { 2355 head = tail->b_next; 2356 tail->b_next = NULL; 2357 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2358 mp_chain, tail, count1, chain_sz); 2359 sz -= chain_sz; 2360 count -= count1; 2361 } else { 2362 /* Can't pick up any */ 2363 head = mp_chain; 2364 } 2365 if (head != NULL) { 2366 /* Drop any packet over the threshold */ 2367 srs_rx->sr_drop_count += count; 2368 mutex_enter(&mac_bw->mac_bw_lock); 2369 mac_bw->mac_bw_drop_bytes += sz; 2370 mutex_exit(&mac_bw->mac_bw_lock); 2371 freemsgchain(head); 2372 } 2373 } 2374 MAC_SRS_WORKER_WAKEUP(mac_srs); 2375 mutex_exit(&mac_srs->srs_lock); 2376 return; 2377 } 2378 } 2379 2380 /* 2381 * If the total number of packets queued in the SRS and 2382 * its associated soft rings exceeds the max allowed, 2383 * then drop the chain. If we are polling capable, this 2384 * shouldn't be happening. 2385 */ 2386 if (!(mac_srs->srs_type & SRST_BW_CONTROL) && 2387 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) { 2388 mac_bw = mac_srs->srs_bw; 2389 srs_rx->sr_drop_count += count; 2390 mutex_enter(&mac_bw->mac_bw_lock); 2391 mac_bw->mac_bw_drop_bytes += sz; 2392 mutex_exit(&mac_bw->mac_bw_lock); 2393 freemsgchain(mp_chain); 2394 mutex_exit(&mac_srs->srs_lock); 2395 return; 2396 } 2397 2398 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz); 2399 /* Count the packets entering via interrupt path */ 2400 srs_rx->sr_intr_count += count; 2401 2402 if (!(mac_srs->srs_state & SRS_PROC)) { 2403 /* 2404 * If we are coming via loopback or if we are not 2405 * optimizing for latency, we should signal the 2406 * worker thread. 
2407 */ 2408 if (loopback || ((count > 1) && 2409 !(mac_srs->srs_state & SRS_LATENCY_OPT))) { 2410 /* 2411 * For loopback, We need to let the worker take 2412 * over as we don't want to continue in the same 2413 * thread even if we can. This could lead to stack 2414 * overflows and may also end up using 2415 * resources (cpu) incorrectly. 2416 */ 2417 cv_signal(&mac_srs->srs_async); 2418 } else { 2419 /* 2420 * Seems like no one is processing the SRS and 2421 * there is no backlog. We also inline process 2422 * our packet if its a single packet in non 2423 * latency optimized case (in latency optimized 2424 * case, we inline process chains of any size). 2425 */ 2426 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST); 2427 } 2428 } 2429 mutex_exit(&mac_srs->srs_lock); 2430 } 2431 2432 /* TX SIDE ROUTINES (RUNTIME) */ 2433 2434 /* 2435 * mac_tx_srs_no_desc 2436 * 2437 * This routine is called by Tx single ring default mode 2438 * when Tx ring runs out of descs. 2439 */ 2440 mac_tx_cookie_t 2441 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2442 uint16_t flag, mblk_t **ret_mp) 2443 { 2444 mac_tx_cookie_t cookie = NULL; 2445 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2446 boolean_t wakeup_worker = B_TRUE; 2447 uint32_t tx_mode = srs_tx->st_mode; 2448 int cnt, sz; 2449 mblk_t *tail; 2450 2451 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2452 if (flag & MAC_DROP_ON_NO_DESC) { 2453 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2454 } else { 2455 if (mac_srs->srs_first != NULL) 2456 wakeup_worker = B_FALSE; 2457 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2458 if (flag & MAC_TX_NO_ENQUEUE) { 2459 /* 2460 * If TX_QUEUED is not set, queue the 2461 * packet and let mac_tx_srs_drain() 2462 * set the TX_BLOCKED bit for the 2463 * reasons explained above. Otherwise, 2464 * return the mblks. 2465 */ 2466 if (wakeup_worker) { 2467 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2468 mp_chain, tail, cnt, sz); 2469 } else { 2470 MAC_TX_SET_NO_ENQUEUE(mac_srs, 2471 mp_chain, ret_mp, cookie); 2472 } 2473 } else { 2474 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2475 tail, cnt, sz, cookie); 2476 } 2477 if (wakeup_worker) 2478 cv_signal(&mac_srs->srs_async); 2479 } 2480 return (cookie); 2481 } 2482 2483 /* 2484 * mac_tx_srs_enqueue 2485 * 2486 * This routine is called when Tx SRS is operating in either serializer 2487 * or bandwidth mode. In serializer mode, a packet will get enqueued 2488 * when a thread cannot enter SRS exclusively. In bandwidth mode, 2489 * packets gets queued if allowed byte-count limit for a tick is 2490 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and 2491 * MAC_TX_NO_ENQUEUE is set is different than when operaing in either 2492 * the default mode or fanout mode. Here packets get dropped or 2493 * returned back to the caller only after hi-watermark worth of data 2494 * is queued. 
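 *
 * From the caller's side the flags and the returned cookie interact
 * roughly as below. This is only an illustration of the contract; the
 * exact mac_tx() prototype is the one exported by the MAC client API
 * and the recovery action is client specific:
 *
 *	mblk_t *unsent = NULL;
 *	mac_tx_cookie_t cookie;
 *
 *	cookie = mac_tx(mch, mp_chain, hint, MAC_TX_NO_ENQUEUE, &unsent);
 *	if (cookie != NULL) {
 *		(flow controlled: 'unsent' holds the mblks handed back;
 *		stop sending and wait for the wakeup that is generated
 *		when the condition identified by 'cookie' clears)
 *	}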
2495 */ 2496 static mac_tx_cookie_t 2497 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2498 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2499 { 2500 mac_tx_cookie_t cookie = NULL; 2501 int cnt, sz; 2502 mblk_t *tail; 2503 boolean_t wakeup_worker = B_TRUE; 2504 2505 if (mac_srs->srs_first != NULL) 2506 wakeup_worker = B_FALSE; 2507 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2508 if (flag & MAC_DROP_ON_NO_DESC) { 2509 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2510 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2511 } else { 2512 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2513 mp_chain, tail, cnt, sz); 2514 } 2515 } else if (flag & MAC_TX_NO_ENQUEUE) { 2516 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2517 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2518 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2519 ret_mp, cookie); 2520 } else { 2521 mp_chain->b_prev = (mblk_t *)fanout_hint; 2522 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2523 mp_chain, tail, cnt, sz); 2524 } 2525 } else { 2526 /* 2527 * If you are BW_ENFORCED, just enqueue the 2528 * packet. srs_worker will drain it at the 2529 * prescribed rate. Before enqueueing, save 2530 * the fanout hint. 2531 */ 2532 mp_chain->b_prev = (mblk_t *)fanout_hint; 2533 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2534 tail, cnt, sz, cookie); 2535 } 2536 if (wakeup_worker) 2537 cv_signal(&mac_srs->srs_async); 2538 return (cookie); 2539 } 2540 2541 /* 2542 * There are five tx modes: 2543 * 2544 * 1) Default mode (SRS_TX_DEFAULT) 2545 * 2) Serialization mode (SRS_TX_SERIALIZE) 2546 * 3) Fanout mode (SRS_TX_FANOUT) 2547 * 4) Bandwdith mode (SRS_TX_BW) 2548 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2549 * 2550 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2551 * based on the number of Tx rings requested for an SRS and whether 2552 * bandwidth control is requested or not. 2553 * 2554 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a 2555 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying 2556 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS. 2557 * When flow-control is relieved, the srs_worker drains the queued 2558 * packets and informs blocked clients to restart sending packets. 2559 * 2560 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. 2561 * 2562 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2563 * Tx rings. Each Tx ring will have a soft ring associated with it. 2564 * These soft rings will be hung off the Tx SRS. Queueing if it happens 2565 * due to lack of Tx desc will be in individual soft ring (and not srs) 2566 * associated with Tx ring. 2567 * 2568 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2569 * only if bw is available. Otherwise the packets will be queued in 2570 * SRS. If fanout to multiple Tx rings is configured, the packets will 2571 * be fanned out among the soft rings associated with the Tx rings. 2572 * 2573 * Four flags are used in srs_state for indicating flow control 2574 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT. 2575 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the 2576 * driver below. 2577 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat 2578 * and flow-control pressure is applied back to clients. The clients expect 2579 * wakeup when flow-control is relieved. 
2580 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk 2581 * got returned back to client either due to lack of Tx descs or due to bw 2582 * control reasons. The clients expect a wakeup when condition is relieved. 2583 * 2584 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but 2585 * some clients set the following values too: MAC_DROP_ON_NO_DESC, 2586 * MAC_TX_NO_ENQUEUE 2587 * Mac clients that do not want packets to be enqueued in the mac layer set 2588 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or 2589 * Tx soft rings but instead get dropped when the NIC runs out of desc. The 2590 * behaviour of this flag is different when the Tx is running in serializer 2591 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet 2592 * get dropped when Tx high watermark is reached. 2593 * There are some mac clients like vsw, aggr that want the mblks to be 2594 * returned back to clients instead of being queued in Tx SRS (or Tx soft 2595 * rings) under flow-control (i.e., out of desc or exceeding bw limits) 2596 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set. 2597 * In the default and Tx fanout mode, the un-transmitted mblks will be 2598 * returned back to the clients when the driver runs out of Tx descs. 2599 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or 2600 * soft ring) so that the clients can be woken up when Tx desc become 2601 * available. When running in serializer or bandwidth mode mode, 2602 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached. 2603 */ 2604 2605 mac_tx_func_t 2606 mac_tx_get_func(uint32_t mode) 2607 { 2608 return (mac_tx_mode_list[mode].mac_tx_func); 2609 } 2610 2611 /* ARGSUSED */ 2612 static mac_tx_cookie_t 2613 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2614 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2615 { 2616 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2617 boolean_t is_subflow; 2618 mac_tx_stats_t stats; 2619 mac_tx_cookie_t cookie = NULL; 2620 2621 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 2622 2623 /* Regular case with a single Tx ring */ 2624 /* 2625 * SRS_TX_BLOCKED is set when underlying NIC runs 2626 * out of Tx descs and messages start getting 2627 * queued. It won't get reset until 2628 * tx_srs_drain() completely drains out the 2629 * messages. 2630 */ 2631 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2632 /* Tx descs/resources not available */ 2633 mutex_enter(&mac_srs->srs_lock); 2634 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2635 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2636 flag, ret_mp); 2637 mutex_exit(&mac_srs->srs_lock); 2638 return (cookie); 2639 } 2640 /* 2641 * While we were computing mblk count, the 2642 * flow control condition got relieved. 2643 * Continue with the transmission. 2644 */ 2645 mutex_exit(&mac_srs->srs_lock); 2646 } 2647 2648 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2649 2650 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2651 mp_chain, (is_subflow ? &stats : NULL)); 2652 2653 /* 2654 * Multiple threads could be here sending packets. 2655 * Under such conditions, it is not possible to 2656 * automically set SRS_TX_BLOCKED bit to indicate 2657 * out of tx desc condition. To atomically set 2658 * this, we queue the returned packet and do 2659 * the setting of SRS_TX_BLOCKED in 2660 * mac_tx_srs_drain(). 
2661 */ 2662 if (mp_chain != NULL) { 2663 mutex_enter(&mac_srs->srs_lock); 2664 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2665 mutex_exit(&mac_srs->srs_lock); 2666 return (cookie); 2667 } 2668 2669 if (is_subflow) 2670 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2671 2672 return (NULL); 2673 } 2674 2675 /* 2676 * mac_tx_serialize_mode 2677 * 2678 * This is an experimental mode implemented as per the request of PAE. 2679 * In this mode, all callers attempting to send a packet to the NIC 2680 * will get serialized. Only one thread at any time will access the 2681 * NIC to send the packet out. 2682 */ 2683 /* ARGSUSED */ 2684 static mac_tx_cookie_t 2685 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2686 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2687 { 2688 boolean_t is_subflow; 2689 mac_tx_stats_t stats; 2690 mac_tx_cookie_t cookie = NULL; 2691 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2692 2693 /* Single ring, serialize below */ 2694 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2695 mutex_enter(&mac_srs->srs_lock); 2696 if ((mac_srs->srs_first != NULL) || 2697 (mac_srs->srs_state & SRS_PROC)) { 2698 /* 2699 * In serialization mode, queue all packets until 2700 * TX_HIWAT is set. 2701 * If drop bit is set, drop if TX_HIWAT is set. 2702 * If no_enqueue is set, still enqueue until hiwat 2703 * is set and return mblks after TX_HIWAT is set. 2704 */ 2705 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2706 flag, NULL, ret_mp); 2707 mutex_exit(&mac_srs->srs_lock); 2708 return (cookie); 2709 } 2710 /* 2711 * No packets queued, nothing on proc and no flow 2712 * control condition. Fast-path, ok. Do inline 2713 * processing. 2714 */ 2715 mac_srs->srs_state |= SRS_PROC; 2716 mutex_exit(&mac_srs->srs_lock); 2717 2718 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2719 2720 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2721 mp_chain, (is_subflow ? &stats : NULL)); 2722 2723 mutex_enter(&mac_srs->srs_lock); 2724 mac_srs->srs_state &= ~SRS_PROC; 2725 if (mp_chain != NULL) { 2726 cookie = mac_tx_srs_enqueue(mac_srs, 2727 mp_chain, flag, NULL, ret_mp); 2728 } 2729 if (mac_srs->srs_first != NULL) { 2730 /* 2731 * We processed inline our packet and a new 2732 * packet/s got queued while we were 2733 * processing. Wakeup srs worker 2734 */ 2735 cv_signal(&mac_srs->srs_async); 2736 } 2737 mutex_exit(&mac_srs->srs_lock); 2738 2739 if (is_subflow && cookie == NULL) 2740 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2741 2742 return (cookie); 2743 } 2744 2745 /* 2746 * mac_tx_fanout_mode 2747 * 2748 * In this mode, the SRS will have access to multiple Tx rings to send 2749 * the packet out. The fanout hint that is passed as an argument is 2750 * used to find an appropriate ring to fanout the traffic. Each Tx 2751 * ring, in turn, will have a soft ring associated with it. If a Tx 2752 * ring runs out of Tx desc's the returned packet will be queued in 2753 * the soft ring associated with that Tx ring. The srs itself will not 2754 * queue any packets. 
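 *
 * The ring selection itself is just a hash of the caller supplied hint;
 * in essence (illustrative only -- the real code uses the HASH_HINT()
 * and COMPUTE_INDEX() macros shown below):
 *
 *	hash = HASH_HINT(fanout_hint);
 *	indx = hash % mac_srs->srs_oth_ring_count;
 *	softring = mac_srs->srs_oth_soft_rings[indx];
 *	(void) mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp);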
2755 */ 2756 static mac_tx_cookie_t 2757 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2758 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2759 { 2760 mac_soft_ring_t *softring; 2761 uint_t indx, hash; 2762 2763 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2764 hash = HASH_HINT(fanout_hint); 2765 indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 2766 softring = mac_srs->srs_oth_soft_rings[indx]; 2767 return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp)); 2768 } 2769 2770 /* 2771 * mac_tx_bw_mode 2772 * 2773 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2774 * only if bw is available. Otherwise the packets will be queued in 2775 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2776 * out to a Tx rings. 2777 */ 2778 static mac_tx_cookie_t 2779 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2780 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2781 { 2782 int cnt, sz; 2783 mblk_t *tail; 2784 mac_tx_cookie_t cookie = NULL; 2785 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2786 2787 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2788 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2789 mutex_enter(&mac_srs->srs_lock); 2790 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2791 /* zero bandwidth: drop all */ 2792 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2793 mutex_exit(&mac_srs->srs_lock); 2794 return (cookie); 2795 } else if ((mac_srs->srs_first != NULL) || 2796 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2797 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2798 fanout_hint, ret_mp); 2799 mutex_exit(&mac_srs->srs_lock); 2800 return (cookie); 2801 } 2802 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2803 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 2804 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 2805 mac_srs->srs_bw->mac_bw_used = 0; 2806 } else if (mac_srs->srs_bw->mac_bw_used > 2807 mac_srs->srs_bw->mac_bw_limit) { 2808 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2809 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2810 mp_chain, tail, cnt, sz); 2811 /* 2812 * Wakeup worker thread. Note that worker 2813 * thread has to be woken up so that it 2814 * can fire up the timer to be woken up 2815 * on the next tick. Also once 2816 * BW_ENFORCED is set, it can only be 2817 * reset by srs_worker thread. Until then 2818 * all packets will get queued up in SRS 2819 * and hence this this code path won't be 2820 * entered until BW_ENFORCED is reset. 2821 */ 2822 cv_signal(&mac_srs->srs_async); 2823 mutex_exit(&mac_srs->srs_lock); 2824 return (cookie); 2825 } 2826 2827 mac_srs->srs_bw->mac_bw_used += sz; 2828 mutex_exit(&mac_srs->srs_lock); 2829 2830 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2831 mac_soft_ring_t *softring; 2832 uint_t indx, hash; 2833 2834 hash = HASH_HINT(fanout_hint); 2835 indx = COMPUTE_INDEX(hash, 2836 mac_srs->srs_oth_ring_count); 2837 softring = mac_srs->srs_oth_soft_rings[indx]; 2838 return (mac_tx_soft_ring_process(softring, mp_chain, flag, 2839 ret_mp)); 2840 } else { 2841 boolean_t is_subflow; 2842 mac_tx_stats_t stats; 2843 2844 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2845 2846 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2847 mp_chain, (is_subflow ? 
&stats : NULL)); 2848 2849 if (mp_chain != NULL) { 2850 mutex_enter(&mac_srs->srs_lock); 2851 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2852 if (mac_srs->srs_bw->mac_bw_used > sz) 2853 mac_srs->srs_bw->mac_bw_used -= sz; 2854 else 2855 mac_srs->srs_bw->mac_bw_used = 0; 2856 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2857 fanout_hint, ret_mp); 2858 mutex_exit(&mac_srs->srs_lock); 2859 return (cookie); 2860 } 2861 if (is_subflow) 2862 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2863 2864 return (NULL); 2865 } 2866 } 2867 2868 /* ARGSUSED */ 2869 void 2870 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 2871 { 2872 mblk_t *head, *tail; 2873 size_t sz; 2874 uint32_t tx_mode; 2875 uint_t saved_pkt_count; 2876 boolean_t is_subflow; 2877 mac_tx_stats_t stats; 2878 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2879 2880 saved_pkt_count = 0; 2881 ASSERT(mutex_owned(&mac_srs->srs_lock)); 2882 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 2883 2884 mac_srs->srs_state |= SRS_PROC; 2885 2886 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2887 tx_mode = srs_tx->st_mode; 2888 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 2889 if (mac_srs->srs_first != NULL) { 2890 head = mac_srs->srs_first; 2891 tail = mac_srs->srs_last; 2892 saved_pkt_count = mac_srs->srs_count; 2893 mac_srs->srs_first = NULL; 2894 mac_srs->srs_last = NULL; 2895 mac_srs->srs_count = 0; 2896 mutex_exit(&mac_srs->srs_lock); 2897 2898 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2899 head, &stats); 2900 2901 mutex_enter(&mac_srs->srs_lock); 2902 if (head != NULL) { 2903 /* Device out of tx desc, set block */ 2904 if (head->b_next == NULL) 2905 VERIFY(head == tail); 2906 tail->b_next = mac_srs->srs_first; 2907 mac_srs->srs_first = head; 2908 mac_srs->srs_count += 2909 (saved_pkt_count - stats.ts_opackets); 2910 if (mac_srs->srs_last == NULL) 2911 mac_srs->srs_last = tail; 2912 MAC_TX_SRS_BLOCK(mac_srs, head); 2913 } else { 2914 srs_tx->st_woken_up = B_FALSE; 2915 if (is_subflow) { 2916 FLOW_TX_STATS_UPDATE( 2917 mac_srs->srs_flent, &stats); 2918 } 2919 } 2920 } 2921 } else if (tx_mode == SRS_TX_BW) { 2922 /* 2923 * We are here because the timer fired and we have some data 2924 * to tranmit. 
Also mac_tx_srs_worker should have reset 2925 * SRS_BW_ENFORCED flag 2926 */ 2927 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 2928 head = tail = mac_srs->srs_first; 2929 while (mac_srs->srs_first != NULL) { 2930 tail = mac_srs->srs_first; 2931 tail->b_prev = NULL; 2932 mac_srs->srs_first = tail->b_next; 2933 if (mac_srs->srs_first == NULL) 2934 mac_srs->srs_last = NULL; 2935 mac_srs->srs_count--; 2936 sz = msgdsize(tail); 2937 mac_srs->srs_size -= sz; 2938 saved_pkt_count++; 2939 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 2940 2941 if (mac_srs->srs_bw->mac_bw_used < 2942 mac_srs->srs_bw->mac_bw_limit) 2943 continue; 2944 2945 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 2946 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 2947 mac_srs->srs_bw->mac_bw_used = sz; 2948 continue; 2949 } 2950 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2951 break; 2952 } 2953 2954 ASSERT((head == NULL && tail == NULL) || 2955 (head != NULL && tail != NULL)); 2956 if (tail != NULL) { 2957 tail->b_next = NULL; 2958 mutex_exit(&mac_srs->srs_lock); 2959 2960 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2961 head, &stats); 2962 2963 mutex_enter(&mac_srs->srs_lock); 2964 if (head != NULL) { 2965 uint_t size_sent; 2966 2967 /* Device out of tx desc, set block */ 2968 if (head->b_next == NULL) 2969 VERIFY(head == tail); 2970 tail->b_next = mac_srs->srs_first; 2971 mac_srs->srs_first = head; 2972 mac_srs->srs_count += 2973 (saved_pkt_count - stats.ts_opackets); 2974 if (mac_srs->srs_last == NULL) 2975 mac_srs->srs_last = tail; 2976 size_sent = sz - stats.ts_obytes; 2977 mac_srs->srs_size += size_sent; 2978 mac_srs->srs_bw->mac_bw_sz += size_sent; 2979 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 2980 mac_srs->srs_bw->mac_bw_used -= 2981 size_sent; 2982 } else { 2983 mac_srs->srs_bw->mac_bw_used = 0; 2984 } 2985 MAC_TX_SRS_BLOCK(mac_srs, head); 2986 } else { 2987 srs_tx->st_woken_up = B_FALSE; 2988 if (is_subflow) { 2989 FLOW_TX_STATS_UPDATE( 2990 mac_srs->srs_flent, &stats); 2991 } 2992 } 2993 } 2994 } else if (tx_mode == SRS_TX_BW_FANOUT) { 2995 mblk_t *prev; 2996 mac_soft_ring_t *softring; 2997 uint64_t hint; 2998 2999 /* 3000 * We are here because the timer fired and we 3001 * have some quota to tranmit. 
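 *
 * The fanout hint that mac_tx_srs_enqueue() saved in b_prev is used
 * here to keep packets destined for the same soft ring together; in
 * outline (simplified from the loop below):
 *
 *	while ((mp = mac_srs->srs_first) != NULL) {
 *		hint = (ulong_t)mp->b_prev;
 *		(accumulate packets while b_prev still equals hint,
 *		charging msgdsize() of each against mac_bw_used; when
 *		the hint changes, or the tick's quota runs out, terminate
 *		the sub-chain and pass it to
 *		TX_SRS_TO_SOFT_RING(mac_srs, sub_chain, hint))
 *	}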
3002 */ 3003 prev = NULL; 3004 head = tail = mac_srs->srs_first; 3005 while (mac_srs->srs_first != NULL) { 3006 tail = mac_srs->srs_first; 3007 mac_srs->srs_first = tail->b_next; 3008 if (mac_srs->srs_first == NULL) 3009 mac_srs->srs_last = NULL; 3010 mac_srs->srs_count--; 3011 sz = msgdsize(tail); 3012 mac_srs->srs_size -= sz; 3013 mac_srs->srs_bw->mac_bw_used += sz; 3014 if (prev == NULL) 3015 hint = (ulong_t)tail->b_prev; 3016 if (hint != (ulong_t)tail->b_prev) { 3017 prev->b_next = NULL; 3018 mutex_exit(&mac_srs->srs_lock); 3019 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3020 head = tail; 3021 hint = (ulong_t)tail->b_prev; 3022 mutex_enter(&mac_srs->srs_lock); 3023 } 3024 3025 prev = tail; 3026 tail->b_prev = NULL; 3027 if (mac_srs->srs_bw->mac_bw_used < 3028 mac_srs->srs_bw->mac_bw_limit) 3029 continue; 3030 3031 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 3032 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 3033 mac_srs->srs_bw->mac_bw_used = 0; 3034 continue; 3035 } 3036 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3037 break; 3038 } 3039 ASSERT((head == NULL && tail == NULL) || 3040 (head != NULL && tail != NULL)); 3041 if (tail != NULL) { 3042 tail->b_next = NULL; 3043 mutex_exit(&mac_srs->srs_lock); 3044 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3045 mutex_enter(&mac_srs->srs_lock); 3046 } 3047 } 3048 /* 3049 * SRS_TX_FANOUT case not considered here because packets 3050 * won't be queued in the SRS for this case. Packets will 3051 * be sent directly to soft rings underneath and if there 3052 * is any queueing at all, it would be in Tx side soft 3053 * rings. 3054 */ 3055 3056 /* 3057 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3058 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3059 */ 3060 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3061 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3062 mac_tx_notify_cb_t *mtnfp; 3063 mac_cb_t *mcb; 3064 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3065 boolean_t wakeup_required = B_FALSE; 3066 3067 if (mac_srs->srs_state & 3068 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3069 wakeup_required = B_TRUE; 3070 } 3071 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3072 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3073 mutex_exit(&mac_srs->srs_lock); 3074 if (wakeup_required) { 3075 /* Wakeup callback registered clients */ 3076 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3077 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3078 mcb = mcb->mcb_nextp) { 3079 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3080 mtnfp->mtnf_fn(mtnfp->mtnf_arg, 3081 (mac_tx_cookie_t)mac_srs); 3082 } 3083 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3084 &mcip->mci_tx_notify_cb_list); 3085 /* 3086 * If the client is not the primary MAC client, then we 3087 * need to send the notification to the clients upper 3088 * MAC, i.e. mci_upper_mip. 3089 */ 3090 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3091 mcip->mci_upper_mip : mcip->mci_mip); 3092 } 3093 mutex_enter(&mac_srs->srs_lock); 3094 } 3095 mac_srs->srs_state &= ~SRS_PROC; 3096 } 3097 3098 /* 3099 * Given a packet, get the flow_entry that identifies the flow 3100 * to which that packet belongs. The flow_entry will contain 3101 * the transmit function to be used to send the packet. If the 3102 * function returns NULL, the packet should be sent using the 3103 * underlying NIC. 
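 *
 * A typical caller therefore looks roughly like this (hedged sketch of
 * how mac_tx_send() below uses the return value, not a verbatim
 * excerpt):
 *
 *	flent = mac_tx_classify(mip, mp);
 *	if (flent == NULL) {
 *		MAC_TX(mip, ring, mp, src_mcip);	(send via the NIC)
 *	} else {
 *		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2,
 *		    mp, do_switch);			(local loopback)
 *		FLOW_REFRELE(flent);
 *	}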
3104 */ 3105 static flow_entry_t * 3106 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3107 { 3108 flow_entry_t *flent = NULL; 3109 mac_client_impl_t *mcip; 3110 int err; 3111 3112 /* 3113 * Do classification on the packet. 3114 */ 3115 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3116 if (err != 0) 3117 return (NULL); 3118 3119 /* 3120 * This flent might just be an additional one on the MAC client, 3121 * i.e. for classification purposes (different fdesc), however 3122 * the resources, SRS et. al., are in the mci_flent, so if 3123 * this isn't the mci_flent, we need to get it. 3124 */ 3125 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3126 FLOW_REFRELE(flent); 3127 flent = mcip->mci_flent; 3128 FLOW_TRY_REFHOLD(flent, err); 3129 if (err != 0) 3130 return (NULL); 3131 } 3132 3133 return (flent); 3134 } 3135 3136 /* 3137 * This macro is only meant to be used by mac_tx_send(). 3138 */ 3139 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3140 if (vid_check) { \ 3141 int err = 0; \ 3142 \ 3143 MAC_VID_CHECK(src_mcip, (mp), err); \ 3144 if (err != 0) { \ 3145 freemsg((mp)); \ 3146 (mp) = next; \ 3147 oerrors++; \ 3148 continue; \ 3149 } \ 3150 } \ 3151 if (add_tag) { \ 3152 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3153 if ((mp) == NULL) { \ 3154 (mp) = next; \ 3155 oerrors++; \ 3156 continue; \ 3157 } \ 3158 } \ 3159 } 3160 3161 mblk_t * 3162 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3163 mac_tx_stats_t *stats) 3164 { 3165 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3166 mac_impl_t *mip = src_mcip->mci_mip; 3167 uint_t obytes = 0, opackets = 0, oerrors = 0; 3168 mblk_t *mp = NULL, *next; 3169 boolean_t vid_check, add_tag; 3170 uint16_t vid = 0; 3171 3172 if (mip->mi_nclients > 1) { 3173 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3174 add_tag = MAC_TAG_NEEDED(src_mcip); 3175 if (add_tag) 3176 vid = mac_client_vid(mch); 3177 } else { 3178 ASSERT(mip->mi_nclients == 1); 3179 vid_check = add_tag = B_FALSE; 3180 } 3181 3182 /* 3183 * Fastpath: if there's only one client, and there's no 3184 * multicast listeners, we simply send the packet down to the 3185 * underlying NIC. 3186 */ 3187 if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 3188 DTRACE_PROBE2(fastpath, 3189 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3190 3191 mp = mp_chain; 3192 while (mp != NULL) { 3193 next = mp->b_next; 3194 mp->b_next = NULL; 3195 opackets++; 3196 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3197 msgdsize(mp)); 3198 3199 CHECK_VID_AND_ADD_TAG(mp); 3200 MAC_TX(mip, ring, mp, src_mcip); 3201 3202 /* 3203 * If the driver is out of descriptors and does a 3204 * partial send it will return a chain of unsent 3205 * mblks. Adjust the accounting stats. 3206 */ 3207 if (mp != NULL) { 3208 opackets--; 3209 obytes -= msgdsize(mp); 3210 mp->b_next = next; 3211 break; 3212 } 3213 mp = next; 3214 } 3215 goto done; 3216 } 3217 3218 /* 3219 * No fastpath, we either have more than one MAC client 3220 * defined on top of the same MAC, or one or more MAC 3221 * client promiscuous callbacks. 
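 *
 * Schematically the two paths through mac_tx_send() are (simplified
 * outline only; the accounting and partial-send handling are in the
 * code below):
 *
 *	if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) {
 *		(fastpath: CHECK_VID_AND_ADD_TAG() then MAC_TX() straight
 *		to the driver for every packet in the chain)
 *	} else {
 *		(slowpath: mac_promisc_dispatch() once for the chain, then
 *		per packet mac_tx_classify() to either loop the packet
 *		back to a local MAC client, hand it to mac_bcast_send(),
 *		or fall through to MAC_TX())
 *	}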
3222 */ 3223 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3224 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3225 3226 if (mip->mi_promisc_list != NULL) 3227 mac_promisc_dispatch(mip, mp_chain, src_mcip); 3228 3229 mp = mp_chain; 3230 while (mp != NULL) { 3231 flow_entry_t *dst_flow_ent; 3232 void *flow_cookie; 3233 size_t pkt_size; 3234 mblk_t *mp1; 3235 3236 next = mp->b_next; 3237 mp->b_next = NULL; 3238 opackets++; 3239 pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); 3240 obytes += pkt_size; 3241 CHECK_VID_AND_ADD_TAG(mp); 3242 3243 /* 3244 * Find the destination. 3245 */ 3246 dst_flow_ent = mac_tx_classify(mip, mp); 3247 3248 if (dst_flow_ent != NULL) { 3249 size_t hdrsize; 3250 int err = 0; 3251 3252 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3253 struct ether_vlan_header *evhp = 3254 (struct ether_vlan_header *)mp->b_rptr; 3255 3256 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3257 hdrsize = sizeof (*evhp); 3258 else 3259 hdrsize = sizeof (struct ether_header); 3260 } else { 3261 mac_header_info_t mhi; 3262 3263 err = mac_header_info((mac_handle_t)mip, 3264 mp, &mhi); 3265 if (err == 0) 3266 hdrsize = mhi.mhi_hdrsize; 3267 } 3268 3269 /* 3270 * Got a matching flow. It's either another 3271 * MAC client, or a broadcast/multicast flow. 3272 * Make sure the packet size is within the 3273 * allowed size. If not drop the packet and 3274 * move to next packet. 3275 */ 3276 if (err != 0 || 3277 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3278 oerrors++; 3279 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3280 mblk_t *, mp); 3281 freemsg(mp); 3282 mp = next; 3283 FLOW_REFRELE(dst_flow_ent); 3284 continue; 3285 } 3286 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3287 if (flow_cookie != NULL) { 3288 /* 3289 * The vnic_bcast_send function expects 3290 * to receive the sender MAC client 3291 * as value for arg2. 3292 */ 3293 mac_bcast_send(flow_cookie, src_mcip, mp, 3294 B_TRUE); 3295 } else { 3296 /* 3297 * loopback the packet to a 3298 * local MAC client. We force a context 3299 * switch if both source and destination 3300 * MAC clients are used by IP, i.e. bypass 3301 * is set. 3302 */ 3303 boolean_t do_switch; 3304 mac_client_impl_t *dst_mcip = 3305 dst_flow_ent->fe_mcip; 3306 3307 do_switch = ((src_mcip->mci_state_flags & 3308 dst_mcip->mci_state_flags & 3309 MCIS_CLIENT_POLL_CAPABLE) != 0); 3310 3311 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3312 (dst_flow_ent->fe_cb_fn)( 3313 dst_flow_ent->fe_cb_arg1, 3314 dst_flow_ent->fe_cb_arg2, 3315 mp1, do_switch); 3316 } 3317 } 3318 FLOW_REFRELE(dst_flow_ent); 3319 } else { 3320 /* 3321 * Unknown destination, send via the underlying 3322 * NIC. 3323 */ 3324 MAC_TX(mip, ring, mp, src_mcip); 3325 if (mp != NULL) { 3326 /* 3327 * Adjust for the last packet that 3328 * could not be transmitted 3329 */ 3330 opackets--; 3331 obytes -= pkt_size; 3332 mp->b_next = next; 3333 break; 3334 } 3335 } 3336 mp = next; 3337 } 3338 3339 done: 3340 src_mcip->mci_stat_obytes += obytes; 3341 src_mcip->mci_stat_opackets += opackets; 3342 src_mcip->mci_stat_oerrors += oerrors; 3343 3344 if (stats != NULL) { 3345 stats->ts_opackets = opackets; 3346 stats->ts_obytes = obytes; 3347 stats->ts_oerrors = oerrors; 3348 } 3349 return (mp); 3350 } 3351 3352 /* 3353 * mac_tx_srs_ring_present 3354 * 3355 * Returns whether the specified ring is part of the specified SRS. 
3356 */ 3357 boolean_t 3358 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3359 { 3360 int i; 3361 mac_soft_ring_t *soft_ring; 3362 3363 if (srs->srs_tx.st_arg2 == tx_ring) 3364 return (B_TRUE); 3365 3366 for (i = 0; i < srs->srs_oth_ring_count; i++) { 3367 soft_ring = srs->srs_oth_soft_rings[i]; 3368 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3369 return (B_TRUE); 3370 } 3371 3372 return (B_FALSE); 3373 } 3374 3375 /* 3376 * mac_tx_srs_wakeup 3377 * 3378 * Called when Tx desc become available. Wakeup the appropriate worker 3379 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3380 * state field. 3381 */ 3382 void 3383 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3384 { 3385 int i; 3386 mac_soft_ring_t *sringp; 3387 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3388 3389 mutex_enter(&mac_srs->srs_lock); 3390 if (TX_SINGLE_RING_MODE(mac_srs)) { 3391 if (srs_tx->st_arg2 == ring && 3392 mac_srs->srs_state & SRS_TX_BLOCKED) { 3393 mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3394 srs_tx->st_unblocked_cnt++; 3395 cv_signal(&mac_srs->srs_async); 3396 } 3397 /* 3398 * A wakeup can come before tx_srs_drain() could 3399 * grab srs lock and set SRS_TX_BLOCKED. So 3400 * always set woken_up flag when we come here. 3401 */ 3402 srs_tx->st_woken_up = B_TRUE; 3403 mutex_exit(&mac_srs->srs_lock); 3404 return; 3405 } 3406 3407 /* If you are here, it is for FANOUT or BW_FANOUT case */ 3408 ASSERT(TX_MULTI_RING_MODE(mac_srs)); 3409 for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3410 sringp = mac_srs->srs_oth_soft_rings[i]; 3411 mutex_enter(&sringp->s_ring_lock); 3412 if (sringp->s_ring_tx_arg2 == ring) { 3413 if (sringp->s_ring_state & S_RING_BLOCK) { 3414 sringp->s_ring_state &= ~S_RING_BLOCK; 3415 sringp->s_ring_unblocked_cnt++; 3416 cv_signal(&sringp->s_ring_async); 3417 } 3418 sringp->s_ring_tx_woken_up = B_TRUE; 3419 } 3420 mutex_exit(&sringp->s_ring_lock); 3421 } 3422 mutex_exit(&mac_srs->srs_lock); 3423 } 3424 3425 /* 3426 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3427 * the blocked clients again. 3428 */ 3429 void 3430 mac_tx_notify(mac_impl_t *mip) 3431 { 3432 i_mac_notify(mip, MAC_NOTE_TX); 3433 } 3434 3435 /* 3436 * RX SOFTRING RELATED FUNCTIONS 3437 * 3438 * These functions really belong in mac_soft_ring.c and here for 3439 * a short period. 3440 */ 3441 3442 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3443 /* \ 3444 * Enqueue our mblk chain. \ 3445 */ \ 3446 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 3447 \ 3448 if ((ringp)->s_ring_last != NULL) \ 3449 (ringp)->s_ring_last->b_next = (mp); \ 3450 else \ 3451 (ringp)->s_ring_first = (mp); \ 3452 (ringp)->s_ring_last = (tail); \ 3453 (ringp)->s_ring_count += (cnt); \ 3454 ASSERT((ringp)->s_ring_count > 0); \ 3455 if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 3456 (ringp)->s_ring_size += sz; \ 3457 } \ 3458 } 3459 3460 /* 3461 * Default entry point to deliver a packet chain to a MAC client. 3462 * If the MAC client has flows, do the classification with these 3463 * flows as well. 
3464 */ 3465 /* ARGSUSED */ 3466 void 3467 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 3468 mac_header_info_t *arg3) 3469 { 3470 mac_client_impl_t *mcip = arg1; 3471 3472 if (mcip->mci_nvids == 1 && 3473 !(mcip->mci_state_flags & MCIS_TAG_DISABLE)) { 3474 /* 3475 * If the client has exactly one VID associated with it 3476 * and striping of VLAN header is not disabled, 3477 * remove the VLAN tag from the packet before 3478 * passing it on to the client's receive callback. 3479 * Note that this needs to be done after we dispatch 3480 * the packet to the promiscuous listeners of the 3481 * client, since they expect to see the whole 3482 * frame including the VLAN headers. 3483 */ 3484 mp_chain = mac_strip_vlan_tag_chain(mp_chain); 3485 } 3486 3487 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 3488 } 3489 3490 /* 3491 * mac_rx_soft_ring_process 3492 * 3493 * process a chain for a given soft ring. The number of packets queued 3494 * in the SRS and its associated soft rings (including this one) is 3495 * very small (tracked by srs_poll_pkt_cnt), then allow the entering 3496 * thread (interrupt or poll thread) to do inline processing. This 3497 * helps keep the latency down under low load. 3498 * 3499 * The proc and arg for each mblk is already stored in the mblk in 3500 * appropriate places. 3501 */ 3502 /* ARGSUSED */ 3503 void 3504 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 3505 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 3506 { 3507 mac_direct_rx_t proc; 3508 void *arg1; 3509 mac_resource_handle_t arg2; 3510 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3511 3512 ASSERT(ringp != NULL); 3513 ASSERT(mp_chain != NULL); 3514 ASSERT(tail != NULL); 3515 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3516 3517 mutex_enter(&ringp->s_ring_lock); 3518 ringp->s_ring_total_inpkt += cnt; 3519 if ((ringp->s_ring_type & ST_RING_ANY) || 3520 ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 3521 !mac_srs->srs_rx.sr_enqueue_always)) { 3522 /* If on processor or blanking on, then enqueue and return */ 3523 if (ringp->s_ring_state & S_RING_BLANK || 3524 ringp->s_ring_state & S_RING_PROC) { 3525 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3526 mutex_exit(&ringp->s_ring_lock); 3527 return; 3528 } 3529 3530 proc = ringp->s_ring_rx_func; 3531 arg1 = ringp->s_ring_rx_arg1; 3532 arg2 = ringp->s_ring_rx_arg2; 3533 /* 3534 * See if anything is already queued. If we are the 3535 * first packet, do inline processing else queue the 3536 * packet and do the drain. 3537 */ 3538 if (ringp->s_ring_first == NULL) { 3539 /* 3540 * Fast-path, ok to process and nothing queued. 3541 */ 3542 ringp->s_ring_run = curthread; 3543 ringp->s_ring_state |= (S_RING_PROC); 3544 3545 mutex_exit(&ringp->s_ring_lock); 3546 3547 /* 3548 * We are the chain of 1 packet so 3549 * go through this fast path. 3550 */ 3551 ASSERT(mp_chain->b_next == NULL); 3552 3553 (*proc)(arg1, arg2, mp_chain, NULL); 3554 3555 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3556 /* 3557 * If we have a soft ring set which is doing 3558 * bandwidth control, we need to decrement 3559 * srs_size and count so it the SRS can have a 3560 * accurate idea of what is the real data 3561 * queued between SRS and its soft rings. We 3562 * decrement the counters only when the packet 3563 * gets processed by both SRS and the soft ring. 
3564 */ 3565 mutex_enter(&mac_srs->srs_lock); 3566 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 3567 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 3568 mutex_exit(&mac_srs->srs_lock); 3569 3570 mutex_enter(&ringp->s_ring_lock); 3571 ringp->s_ring_run = NULL; 3572 ringp->s_ring_state &= ~S_RING_PROC; 3573 if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 3574 cv_signal(&ringp->s_ring_client_cv); 3575 3576 if ((ringp->s_ring_first == NULL) || 3577 (ringp->s_ring_state & S_RING_BLANK)) { 3578 /* 3579 * We processed inline our packet and 3580 * nothing new has arrived or our 3581 * receiver doesn't want to receive 3582 * any packets. We are done. 3583 */ 3584 mutex_exit(&ringp->s_ring_lock); 3585 return; 3586 } 3587 } else { 3588 SOFT_RING_ENQUEUE_CHAIN(ringp, 3589 mp_chain, tail, cnt, sz); 3590 } 3591 3592 /* 3593 * We are here because either we couldn't do inline 3594 * processing (because something was already 3595 * queued), or we had a chain of more than one 3596 * packet, or something else arrived after we were 3597 * done with inline processing. 3598 */ 3599 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3600 ASSERT(ringp->s_ring_first != NULL); 3601 3602 ringp->s_ring_drain_func(ringp); 3603 mutex_exit(&ringp->s_ring_lock); 3604 return; 3605 } else { 3606 /* ST_RING_WORKER_ONLY case */ 3607 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3608 mac_soft_ring_worker_wakeup(ringp); 3609 mutex_exit(&ringp->s_ring_lock); 3610 } 3611 } 3612 3613 /* 3614 * TX SOFTRING RELATED FUNCTIONS 3615 * 3616 * These functions really belong in mac_soft_ring.c and here for 3617 * a short period. 3618 */ 3619 3620 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3621 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 3622 ringp->s_ring_state |= S_RING_ENQUEUED; \ 3623 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 3624 } 3625 3626 /* 3627 * mac_tx_sring_queued 3628 * 3629 * When we are out of transmit descriptors and we already have a 3630 * queue that exceeds hiwat (or the client called us with 3631 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 3632 * soft ring pointer as the opaque cookie for the client enable 3633 * flow control. 3634 */ 3635 static mac_tx_cookie_t 3636 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, 3637 mblk_t **ret_mp) 3638 { 3639 int cnt; 3640 size_t sz; 3641 mblk_t *tail; 3642 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3643 mac_tx_cookie_t cookie = NULL; 3644 boolean_t wakeup_worker = B_TRUE; 3645 3646 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3647 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 3648 if (flag & MAC_DROP_ON_NO_DESC) { 3649 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 3650 /* increment freed stats */ 3651 ringp->s_ring_drops += cnt; 3652 cookie = (mac_tx_cookie_t)ringp; 3653 } else { 3654 if (ringp->s_ring_first != NULL) 3655 wakeup_worker = B_FALSE; 3656 3657 if (flag & MAC_TX_NO_ENQUEUE) { 3658 /* 3659 * If QUEUED is not set, queue the packet 3660 * and let mac_tx_soft_ring_drain() set 3661 * the TX_BLOCKED bit for the reasons 3662 * explained above. Otherwise, return the 3663 * mblks. 3664 */ 3665 if (wakeup_worker) { 3666 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, 3667 mp_chain, tail, cnt, sz); 3668 } else { 3669 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT; 3670 cookie = (mac_tx_cookie_t)ringp; 3671 *ret_mp = mp_chain; 3672 } 3673 } else { 3674 boolean_t enqueue = B_TRUE; 3675 3676 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 3677 /* 3678 * flow-controlled. 

/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_tx_cookie_t cookie = NULL;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
	/*
	 * Only two modes can come here; it must be either
	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT.
	 */
	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
		/* Serialization mode */

		mutex_enter(&ringp->s_ring_lock);
		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
			    flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
			/*
			 * If the ring is blocked due to a lack of Tx
			 * descriptors, just return. The worker thread
			 * will get scheduled when Tx descriptors
			 * become available.
			 */
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return (cookie);
	} else {
		/* Default fanout mode */
		/*
		 * S_RING_BLOCKED is set when the underlying NIC runs
		 * out of Tx descriptors and messages start getting
		 * queued. It won't get reset until
		 * tx_srs_drain() completely drains out the
		 * messages.
		 */
		boolean_t	is_subflow;
		mac_tx_stats_t	stats;

		if (ringp->s_ring_state & S_RING_ENQUEUED) {
			/* Tx descs/resources not available */
			mutex_enter(&ringp->s_ring_lock);
			if (ringp->s_ring_state & S_RING_ENQUEUED) {
				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
				    flag, ret_mp);
				mutex_exit(&ringp->s_ring_lock);
				return (cookie);
			}
			/*
			 * While we were computing the mblk count, the
			 * flow control condition got relieved.
			 * Continue with the transmission.
			 */
			mutex_exit(&ringp->s_ring_lock);
		}
		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
		    ringp->s_ring_tx_arg2, mp_chain,
		    (is_subflow ? &stats : NULL));

		/*
		 * Multiple threads could be here sending packets.
		 * Under such conditions, it is not possible to
		 * atomically set the S_RING_BLOCKED bit here to
		 * indicate an out-of-Tx-descriptor condition. To set
		 * it atomically, we queue the returned packet and have
		 * mac_tx_soft_ring_drain() set S_RING_BLOCKED.
		 */
		if (mp_chain != NULL) {
			mutex_enter(&ringp->s_ring_lock);
			cookie =
			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		if (is_subflow) {
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
		}
		return (NULL);
	}
}
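
/*
 * Illustrative sketch (not part of the original file, never compiled):
 * the essence of Tx fanout.  In SRS_TX_FANOUT / SRS_TX_BW_FANOUT mode
 * the caller hashes a per-connection fanout hint to pick one Tx soft
 * ring (and therefore one h/w Tx ring) and then hands the chain to
 * mac_tx_soft_ring_process() above.  The array, count and modulo hash
 * below are simplified placeholders; the real selection logic lives in
 * mac_tx_fanout_mode()/mac_tx_bw_mode() declared at the top of this
 * file.
 */
#if 0
static mac_tx_cookie_t
tx_fanout_sketch(mac_soft_ring_t **srings, uint_t sring_cnt,
    mblk_t *mp_chain, uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	uint_t		index;
	mac_soft_ring_t	*ringp;

	/*
	 * Keep packets of one flow on one ring to preserve ordering;
	 * distinct hints spread distinct flows across all the rings.
	 */
	index = (fanout_hint == 0) ? 0 : (uint_t)(fanout_hint % sring_cnt);
	ringp = srings[index];

	return (mac_tx_soft_ring_process(ringp, mp_chain, flag, ret_mp));
}
#endif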