/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
	mac_tx_srs_mode_t	mac_tx_mode;
	mac_tx_func_t		mac_tx_func;
} mac_tx_mode_t;

/*
 * There are five modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, etc.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
	{SRS_TX_BW,		mac_tx_bw_mode},
	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The run time code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout etc.
 *
 * We try to use H/W classification on the NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always an SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to the
 * S/W classifier and tie an SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remainder of
 * the tick.
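 * As a rough illustration (assuming hz = 100 and that the limit is
 * tracked as bytes allowed per tick): a 100 Mbit/s limit is about
 * 12,500,000 bytes/sec, i.e. roughly 125,000 bytes per tick. Once
 * that many bytes have been charged against the current tick, the
 * rest waits for the next tick.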
 * The SRS poll thread only polls for bytes that are
 * allowed to come into the SRS. We typically let 4x the configured
 * B/W worth of packets come into the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to the respective SRS.
 * The S/W classification rule is always set up and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRSes are used on both the Tx and Rx side. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that the Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each Soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only the max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as the B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also avoids the overhead associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in the SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing while we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or must let the SRS worker thread do the processing. This applies
 * if the SRS was not already being processed. For latency sensitive
 * traffic, this needs to be true to allow inline processing.
For throughput 163 * under load, this should be false. 164 * 165 * This (and other similar) tunable should be rolled into a link 166 * or flow specific workload hint that can be set using dladm 167 * linkprop (instead of multiple such tunables). 168 */ 169 boolean_t mac_latency_optimize = B_TRUE; 170 171 /* 172 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 173 * 174 * queue a mp or chain in soft ring set and increment the 175 * local count (srs_count) for the SRS and the shared counter 176 * (srs_poll_pkt_cnt - shared between SRS and its soft rings 177 * to track the total unprocessed packets for polling to work 178 * correctly). 179 * 180 * The size (total bytes queued) counters are incremented only 181 * if we are doing B/W control. 182 */ 183 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 184 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 185 if ((mac_srs)->srs_last != NULL) \ 186 (mac_srs)->srs_last->b_next = (head); \ 187 else \ 188 (mac_srs)->srs_first = (head); \ 189 (mac_srs)->srs_last = (tail); \ 190 (mac_srs)->srs_count += count; \ 191 } 192 193 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 194 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 195 \ 196 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 197 srs_rx->sr_poll_pkt_cnt += count; \ 198 ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 199 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 200 (mac_srs)->srs_size += (sz); \ 201 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 202 (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 203 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 204 } \ 205 } 206 207 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 208 mac_srs->srs_state |= SRS_ENQUEUED; \ 209 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 210 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 211 (mac_srs)->srs_size += (sz); \ 212 (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 213 } \ 214 } 215 216 /* 217 * Turn polling on routines 218 */ 219 #define MAC_SRS_POLLING_ON(mac_srs) { \ 220 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 221 if (((mac_srs)->srs_state & \ 222 (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 223 (mac_srs)->srs_state |= SRS_POLLING; \ 224 (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 225 (mac_srs)->srs_ring); \ 226 (mac_srs)->srs_rx.sr_poll_on++; \ 227 } \ 228 } 229 230 #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 231 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 232 if (((mac_srs)->srs_state & \ 233 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 234 (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 235 (mac_srs)->srs_state |= SRS_POLLING; \ 236 (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 237 (mac_srs)->srs_ring); \ 238 (mac_srs)->srs_rx.sr_worker_poll_on++; \ 239 } \ 240 } 241 242 /* 243 * MAC_SRS_POLL_RING 244 * 245 * Signal the SRS poll thread to poll the underlying H/W ring 246 * provided it wasn't already polling (SRS_GET_PKTS was set). 247 * 248 * Poll thread gets to run only from mac_rx_srs_drain() and only 249 * if the drain was being done by the worker thread. 
250 */ 251 #define MAC_SRS_POLL_RING(mac_srs) { \ 252 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 253 \ 254 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 255 srs_rx->sr_poll_thr_sig++; \ 256 if (((mac_srs)->srs_state & \ 257 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 258 (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 259 (mac_srs)->srs_state |= SRS_GET_PKTS; \ 260 cv_signal(&(mac_srs)->srs_cv); \ 261 } else { \ 262 srs_rx->sr_poll_thr_busy++; \ 263 } \ 264 } 265 266 /* 267 * MAC_SRS_CHECK_BW_CONTROL 268 * 269 * Check to see if next tick has started so we can reset the 270 * SRS_BW_ENFORCED flag and allow more packets to come in the 271 * system. 272 */ 273 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 274 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 275 ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 276 MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 277 if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \ 278 (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \ 279 (mac_srs)->srs_bw->mac_bw_used = 0; \ 280 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 281 (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 282 } \ 283 } 284 285 /* 286 * MAC_SRS_WORKER_WAKEUP 287 * 288 * Wake up the SRS worker thread to process the queue as long as 289 * no one else is processing the queue. If we are optimizing for 290 * latency, we wake up the worker thread immediately or else we 291 * wait mac_srs_worker_wakeup_ticks before worker thread gets 292 * woken up. 293 */ 294 int mac_srs_worker_wakeup_ticks = 0; 295 #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ 296 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 297 if (!((mac_srs)->srs_state & SRS_PROC) && \ 298 (mac_srs)->srs_tid == NULL) { \ 299 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \ 300 (mac_srs_worker_wakeup_ticks == 0)) \ 301 cv_signal(&(mac_srs)->srs_async); \ 302 else \ 303 (mac_srs)->srs_tid = \ 304 timeout(mac_srs_fire, (mac_srs), \ 305 mac_srs_worker_wakeup_ticks); \ 306 } \ 307 } 308 309 #define TX_SINGLE_RING_MODE(mac_srs) \ 310 ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ 311 (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ 312 (mac_srs)->srs_tx.st_mode == SRS_TX_BW) 313 314 #define TX_BANDWIDTH_MODE(mac_srs) \ 315 ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ 316 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) 317 318 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ 319 uint_t hash, indx; \ 320 hash = HASH_HINT(hint); \ 321 indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ 322 softring = mac_srs->srs_oth_soft_rings[indx]; \ 323 (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ 324 } 325 326 /* 327 * MAC_TX_SRS_BLOCK 328 * 329 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED 330 * will be set only if srs_tx_woken_up is FALSE. If 331 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived 332 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to 333 * attempt to transmit again and not setting SRS_TX_BLOCKED does 334 * that. 335 */ 336 #define MAC_TX_SRS_BLOCK(srs, mp) { \ 337 ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ 338 if ((srs)->srs_tx.st_woken_up) { \ 339 (srs)->srs_tx.st_woken_up = B_FALSE; \ 340 } else { \ 341 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ 342 (srs)->srs_state |= SRS_TX_BLOCKED; \ 343 (srs)->srs_tx.st_blocked_cnt++; \ 344 } \ 345 } 346 347 /* 348 * MAC_TX_SRS_TEST_HIWAT 349 * 350 * Called before queueing a packet onto Tx SRS to test and set 351 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. 
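 * When flow control kicks in, the SRS pointer itself is handed back
 * in 'cookie' as the mac_tx_cookie_t, so the caller can tell that the
 * chain was queued (or dropped once srs_count exceeds st_max_q_cnt)
 * rather than transmitted.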
352 */ 353 #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ 354 boolean_t enqueue = 1; \ 355 \ 356 if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ 357 /* \ 358 * flow-controlled. Store srs in cookie so that it \ 359 * can be returned as mac_tx_cookie_t to client \ 360 */ \ 361 (srs)->srs_state |= SRS_TX_HIWAT; \ 362 cookie = (mac_tx_cookie_t)srs; \ 363 (srs)->srs_tx.st_hiwat_cnt++; \ 364 if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ 365 /* increment freed stats */ \ 366 (srs)->srs_tx.st_drop_count += cnt; \ 367 /* \ 368 * b_prev may be set to the fanout hint \ 369 * hence can't use freemsg directly \ 370 */ \ 371 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ 372 DTRACE_PROBE1(tx_queued_hiwat, \ 373 mac_soft_ring_set_t *, srs); \ 374 enqueue = 0; \ 375 } \ 376 } \ 377 if (enqueue) \ 378 MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ 379 } 380 381 /* Some utility macros */ 382 #define MAC_SRS_BW_LOCK(srs) \ 383 if (!(srs->srs_type & SRST_TX)) \ 384 mutex_enter(&srs->srs_bw->mac_bw_lock); 385 386 #define MAC_SRS_BW_UNLOCK(srs) \ 387 if (!(srs->srs_type & SRST_TX)) \ 388 mutex_exit(&srs->srs_bw->mac_bw_lock); 389 390 #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ 391 mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ 392 /* increment freed stats */ \ 393 mac_srs->srs_tx.st_drop_count++; \ 394 cookie = (mac_tx_cookie_t)srs; \ 395 } 396 397 #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ 398 mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ 399 cookie = (mac_tx_cookie_t)srs; \ 400 *ret_mp = mp_chain; \ 401 } 402 403 /* 404 * Drop the rx packet and advance to the next one in the chain. 405 */ 406 static void 407 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp) 408 { 409 mac_srs_rx_t *srs_rx = &srs->srs_rx; 410 411 ASSERT(mp->b_next == NULL); 412 mutex_enter(&srs->srs_lock); 413 MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1); 414 MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp)); 415 mutex_exit(&srs->srs_lock); 416 417 srs_rx->sr_drop_count++; 418 freemsg(mp); 419 } 420 421 /* DATAPATH RUNTIME ROUTINES */ 422 423 /* 424 * mac_srs_fire 425 * 426 * Timer callback routine for waking up the SRS worker thread. 427 */ 428 static void 429 mac_srs_fire(void *arg) 430 { 431 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg; 432 433 mutex_enter(&mac_srs->srs_lock); 434 if (mac_srs->srs_tid == 0) { 435 mutex_exit(&mac_srs->srs_lock); 436 return; 437 } 438 439 mac_srs->srs_tid = 0; 440 if (!(mac_srs->srs_state & SRS_PROC)) 441 cv_signal(&mac_srs->srs_async); 442 443 mutex_exit(&mac_srs->srs_lock); 444 } 445 446 /* 447 * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack, 448 * and it is used on the TX path. 449 */ 450 #define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16)) 451 452 /* 453 * hash based on the src address and the port information. 
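 * The hash is reduced to a soft ring index with COMPUTE_INDEX(), a
 * simple modulo over the ring count. For example, the inbound TCP
 * fanout path does (sketch of the code in mac_rx_srs_fanout()):
 *
 *	hash = HASH_ADDR(ipha->ipha_src,
 *	    *(uint32_t *)(mp->b_rptr + ports_offset));
 *	indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);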
454 */ 455 #define HASH_ADDR(src, ports) \ 456 (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 457 ((ports) >> 8) ^ (ports)) 458 459 #define COMPUTE_INDEX(key, sz) (key % sz) 460 461 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 462 if ((tail) != NULL) { \ 463 ASSERT((tail)->b_next == NULL); \ 464 (tail)->b_next = (mp); \ 465 } else { \ 466 ASSERT((head) == NULL); \ 467 (head) = (mp); \ 468 } \ 469 (tail) = (mp); \ 470 (cnt)++; \ 471 if ((bw_ctl)) \ 472 (sz) += (sz0); \ 473 } 474 475 #define MAC_FANOUT_DEFAULT 0 476 #define MAC_FANOUT_RND_ROBIN 1 477 int mac_fanout_type = MAC_FANOUT_DEFAULT; 478 479 #define MAX_SR_TYPES 3 480 /* fanout types for port based hashing */ 481 enum pkt_type { 482 V4_TCP = 0, 483 V4_UDP, 484 OTH, 485 UNDEF 486 }; 487 488 /* 489 * In general we do port based hashing to spread traffic over different 490 * softrings. The below tunable allows to override that behavior. Setting it 491 * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior 492 * is also the applicable to ipv6 packets carrying multiple optional headers 493 * and other uncommon packet types. 494 */ 495 boolean_t mac_src_ipv6_fanout = B_FALSE; 496 497 /* 498 * Pair of local and remote ports in the transport header 499 */ 500 #define PORTS_SIZE 4 501 502 /* 503 * mac_rx_srs_proto_fanout 504 * 505 * This routine delivers packets destined to an SRS into one of the 506 * protocol soft rings. 507 * 508 * Given a chain of packets we need to split it up into multiple sub chains 509 * destined into TCP, UDP or OTH soft ring. Instead of entering 510 * the soft ring one packet at a time, we want to enter it in the form of a 511 * chain otherwise we get this start/stop behaviour where the worker thread 512 * goes to sleep and then next packets comes in forcing it to wake up etc. 513 */ 514 static void 515 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 516 { 517 struct ether_header *ehp; 518 struct ether_vlan_header *evhp; 519 uint32_t sap; 520 ipha_t *ipha; 521 uint8_t *dstaddr; 522 size_t hdrsize; 523 mblk_t *mp; 524 mblk_t *headmp[MAX_SR_TYPES]; 525 mblk_t *tailmp[MAX_SR_TYPES]; 526 int cnt[MAX_SR_TYPES]; 527 size_t sz[MAX_SR_TYPES]; 528 size_t sz1; 529 boolean_t bw_ctl; 530 boolean_t hw_classified; 531 boolean_t dls_bypass; 532 boolean_t is_ether; 533 boolean_t is_unicast; 534 enum pkt_type type; 535 mac_client_impl_t *mcip = mac_srs->srs_mcip; 536 537 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 538 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 539 540 /* 541 * If we don't have a Rx ring, S/W classification would have done 542 * its job and its a packet meant for us. If we were polling on 543 * the default ring (i.e. there was a ring assigned to this SRS), 544 * then we need to make sure that the mac address really belongs 545 * to us. 546 */ 547 hw_classified = mac_srs->srs_ring != NULL && 548 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 549 550 /* 551 * Special clients (eg. VLAN, non ether, etc) need DLS 552 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 553 * such SRSs. 554 */ 555 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0); 556 557 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 558 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 559 bzero(cnt, MAX_SR_TYPES * sizeof (int)); 560 bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 561 562 /* 563 * We got a chain from SRS that we need to send to the soft rings. 
564 * Since squeues for TCP & IPv4 sap poll their soft rings (for 565 * performance reasons), we need to separate out v4_tcp, v4_udp 566 * and the rest goes in other. 567 */ 568 while (head != NULL) { 569 mp = head; 570 head = head->b_next; 571 mp->b_next = NULL; 572 573 type = OTH; 574 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 575 576 if (is_ether) { 577 /* 578 * At this point we can be sure the packet at least 579 * has an ether header. 580 */ 581 if (sz1 < sizeof (struct ether_header)) { 582 mac_rx_drop_pkt(mac_srs, mp); 583 continue; 584 } 585 ehp = (struct ether_header *)mp->b_rptr; 586 587 /* 588 * Determine if this is a VLAN or non-VLAN packet. 589 */ 590 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 591 evhp = (struct ether_vlan_header *)mp->b_rptr; 592 sap = ntohs(evhp->ether_type); 593 hdrsize = sizeof (struct ether_vlan_header); 594 /* 595 * Check if the VID of the packet, if any, 596 * belongs to this client. 597 */ 598 if (!mac_client_check_flow_vid(mcip, 599 VLAN_ID(ntohs(evhp->ether_tci)))) { 600 mac_rx_drop_pkt(mac_srs, mp); 601 continue; 602 } 603 } else { 604 hdrsize = sizeof (struct ether_header); 605 } 606 is_unicast = 607 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 608 dstaddr = (uint8_t *)&ehp->ether_dhost; 609 } else { 610 mac_header_info_t mhi; 611 612 if (mac_header_info((mac_handle_t)mcip->mci_mip, 613 mp, &mhi) != 0) { 614 mac_rx_drop_pkt(mac_srs, mp); 615 continue; 616 } 617 hdrsize = mhi.mhi_hdrsize; 618 sap = mhi.mhi_bindsap; 619 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 620 dstaddr = (uint8_t *)mhi.mhi_daddr; 621 } 622 623 if (!dls_bypass) { 624 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 625 cnt[type], bw_ctl, sz[type], sz1, mp); 626 continue; 627 } 628 629 if (sap == ETHERTYPE_IP) { 630 /* 631 * If we are H/W classified, but we have promisc 632 * on, then we need to check for the unicast address. 633 */ 634 if (hw_classified && mcip->mci_promisc_list != NULL) { 635 mac_address_t *map; 636 637 rw_enter(&mcip->mci_rw_lock, RW_READER); 638 map = mcip->mci_unicast; 639 if (bcmp(dstaddr, map->ma_addr, 640 map->ma_len) == 0) 641 type = UNDEF; 642 rw_exit(&mcip->mci_rw_lock); 643 } else if (is_unicast) { 644 type = UNDEF; 645 } 646 } 647 648 /* 649 * This needs to become a contract with the driver for 650 * the fast path. 651 * 652 * In the normal case the packet will have at least the L2 653 * header and the IP + Transport header in the same mblk. 654 * This is usually the case when the NIC driver sends up 655 * the packet. This is also true when the stack generates 656 * a packet that is looped back and when the stack uses the 657 * fastpath mechanism. The normal case is optimized for 658 * performance and may bypass DLS. All other cases go through 659 * the 'OTH' type path without DLS bypass. 660 */ 661 662 ipha = (ipha_t *)(mp->b_rptr + hdrsize); 663 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 664 type = OTH; 665 666 if (type == OTH) { 667 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 668 cnt[type], bw_ctl, sz[type], sz1, mp); 669 continue; 670 } 671 672 ASSERT(type == UNDEF); 673 /* 674 * We look for at least 4 bytes past the IP header to get 675 * the port information. If we get an IP fragment, we don't 676 * have the port information, and we use just the protocol 677 * information. 
678 */ 679 switch (ipha->ipha_protocol) { 680 case IPPROTO_TCP: 681 type = V4_TCP; 682 mp->b_rptr += hdrsize; 683 break; 684 case IPPROTO_UDP: 685 type = V4_UDP; 686 mp->b_rptr += hdrsize; 687 break; 688 default: 689 type = OTH; 690 break; 691 } 692 693 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 694 bw_ctl, sz[type], sz1, mp); 695 } 696 697 for (type = V4_TCP; type < UNDEF; type++) { 698 if (headmp[type] != NULL) { 699 mac_soft_ring_t *softring; 700 701 ASSERT(tailmp[type]->b_next == NULL); 702 switch (type) { 703 case V4_TCP: 704 softring = mac_srs->srs_tcp_soft_rings[0]; 705 break; 706 case V4_UDP: 707 softring = mac_srs->srs_udp_soft_rings[0]; 708 break; 709 case OTH: 710 softring = mac_srs->srs_oth_soft_rings[0]; 711 } 712 mac_rx_soft_ring_process(mcip, softring, 713 headmp[type], tailmp[type], cnt[type], sz[type]); 714 } 715 } 716 } 717 718 int fanout_unalligned = 0; 719 720 /* 721 * mac_rx_srs_long_fanout 722 * 723 * The fanout routine for IPv6 724 */ 725 static int 726 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, 727 uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) 728 { 729 ip6_t *ip6h; 730 uint8_t *whereptr; 731 uint_t hash; 732 uint16_t remlen; 733 uint8_t nexthdr; 734 uint16_t hdr_len; 735 736 if (sap == ETHERTYPE_IPV6) { 737 boolean_t modifiable = B_TRUE; 738 739 ASSERT(MBLKL(mp) >= hdrsize); 740 741 ip6h = (ip6_t *)(mp->b_rptr + hdrsize); 742 if ((unsigned char *)ip6h == mp->b_wptr) { 743 /* 744 * The first mblk_t only includes the mac header. 745 * Note that it is safe to change the mp pointer here, 746 * as the subsequent operation does not assume mp 747 * points to the start of the mac header. 748 */ 749 mp = mp->b_cont; 750 751 /* 752 * Make sure ip6h holds the full ip6_t structure. 753 */ 754 if (mp == NULL) 755 return (-1); 756 757 if (MBLKL(mp) < IPV6_HDR_LEN) { 758 modifiable = (DB_REF(mp) == 1); 759 760 if (modifiable && 761 !pullupmsg(mp, IPV6_HDR_LEN)) { 762 return (-1); 763 } 764 } 765 766 ip6h = (ip6_t *)mp->b_rptr; 767 } 768 769 if (!modifiable || !(OK_32PTR((char *)ip6h)) || 770 ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 771 /* 772 * If either ip6h is not alligned, or ip6h does not 773 * hold the complete ip6_t structure (a pullupmsg() 774 * is not an option since it would result in an 775 * unalligned ip6h), fanout to the default ring. Note 776 * that this may cause packets reordering. 777 */ 778 *indx = 0; 779 *type = OTH; 780 fanout_unalligned++; 781 return (0); 782 } 783 784 remlen = ntohs(ip6h->ip6_plen); 785 nexthdr = ip6h->ip6_nxt; 786 787 if (remlen < MIN_EHDR_LEN) 788 return (-1); 789 /* 790 * Do src based fanout if below tunable is set to B_TRUE or 791 * when mac_ip_hdr_length_v6() fails because of malformed 792 * packets or because mblk's need to be concatenated using 793 * pullupmsg(). 794 */ 795 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 796 &hdr_len, &nexthdr)) { 797 goto src_based_fanout; 798 } 799 whereptr = (uint8_t *)ip6h + hdr_len; 800 801 /* If the transport is one of below, we do port based fanout */ 802 switch (nexthdr) { 803 case IPPROTO_TCP: 804 case IPPROTO_UDP: 805 case IPPROTO_SCTP: 806 case IPPROTO_ESP: 807 /* 808 * If the ports in the transport header is not part of 809 * the mblk, do src_based_fanout, instead of calling 810 * pullupmsg(). 
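			 * (Reading the ports through 'whereptr' requires them
			 * to be contiguous in this mblk; falling back to
			 * source based hashing is cheaper than the copy a
			 * pullupmsg() would do, and it still keeps a given
			 * source on a single soft ring.)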
811 */ 812 if (mp->b_cont != NULL && 813 whereptr + PORTS_SIZE > mp->b_wptr) { 814 goto src_based_fanout; 815 } 816 break; 817 default: 818 break; 819 } 820 821 switch (nexthdr) { 822 case IPPROTO_TCP: 823 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 824 *(uint32_t *)whereptr); 825 *indx = COMPUTE_INDEX(hash, 826 mac_srs->srs_tcp_ring_count); 827 *type = OTH; 828 break; 829 830 case IPPROTO_UDP: 831 case IPPROTO_SCTP: 832 case IPPROTO_ESP: 833 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 834 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 835 *(uint32_t *)whereptr); 836 *indx = COMPUTE_INDEX(hash, 837 mac_srs->srs_udp_ring_count); 838 } else { 839 *indx = mac_srs->srs_ind % 840 mac_srs->srs_udp_ring_count; 841 mac_srs->srs_ind++; 842 } 843 *type = OTH; 844 break; 845 846 /* For all other protocol, do source based fanout */ 847 default: 848 goto src_based_fanout; 849 } 850 } else { 851 *indx = 0; 852 *type = OTH; 853 } 854 return (0); 855 856 src_based_fanout: 857 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 858 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 859 *type = OTH; 860 return (0); 861 } 862 863 /* 864 * mac_rx_srs_fanout 865 * 866 * This routine delivers packets destined to an SRS into a soft ring member 867 * of the set. 868 * 869 * Given a chain of packets we need to split it up into multiple sub chains 870 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 871 * the soft ring one packet at a time, we want to enter it in the form of a 872 * chain otherwise we get this start/stop behaviour where the worker thread 873 * goes to sleep and then next packets comes in forcing it to wake up etc. 874 * 875 * Note: 876 * Since we know what is the maximum fanout possible, we create a 2D array 877 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 878 * variables so that we can enter the softrings with chain. We need the 879 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 880 * for each packet would be expensive). If we ever want to have the 881 * ability to have unlimited fanout, we should probably declare a head, 882 * tail, cnt, sz with each soft ring (a data struct which contains a softring 883 * along with these members) and create an array of this uber struct so we 884 * don't have to do kmem_alloc. 885 */ 886 int fanout_oth1 = 0; 887 int fanout_oth2 = 0; 888 int fanout_oth3 = 0; 889 int fanout_oth4 = 0; 890 int fanout_oth5 = 0; 891 892 static void 893 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 894 { 895 struct ether_header *ehp; 896 struct ether_vlan_header *evhp; 897 uint32_t sap; 898 ipha_t *ipha; 899 uint8_t *dstaddr; 900 uint_t indx; 901 size_t ports_offset; 902 size_t ipha_len; 903 size_t hdrsize; 904 uint_t hash; 905 mblk_t *mp; 906 mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 907 mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 908 int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; 909 size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; 910 size_t sz1; 911 boolean_t bw_ctl; 912 boolean_t hw_classified; 913 boolean_t dls_bypass; 914 boolean_t is_ether; 915 boolean_t is_unicast; 916 int fanout_cnt; 917 enum pkt_type type; 918 mac_client_impl_t *mcip = mac_srs->srs_mcip; 919 920 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 921 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 922 923 /* 924 * If we don't have a Rx ring, S/W classification would have done 925 * its job and its a packet meant for us. If we were polling on 926 * the default ring (i.e. 
there was a ring assigned to this SRS), 927 * then we need to make sure that the mac address really belongs 928 * to us. 929 */ 930 hw_classified = mac_srs->srs_ring != NULL && 931 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 932 933 /* 934 * Special clients (eg. VLAN, non ether, etc) need DLS 935 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 936 * such SRSs. 937 */ 938 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0); 939 940 /* 941 * Since the softrings are never destroyed and we always 942 * create equal number of softrings for TCP, UDP and rest, 943 * its OK to check one of them for count and use it without 944 * any lock. In future, if soft rings get destroyed because 945 * of reduction in fanout, we will need to ensure that happens 946 * behind the SRS_PROC. 947 */ 948 fanout_cnt = mac_srs->srs_tcp_ring_count; 949 950 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 951 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 952 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); 953 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); 954 955 /* 956 * We got a chain from SRS that we need to send to the soft rings. 957 * Since squeues for TCP & IPv4 sap poll their soft rings (for 958 * performance reasons), we need to separate out v4_tcp, v4_udp 959 * and the rest goes in other. 960 */ 961 while (head != NULL) { 962 mp = head; 963 head = head->b_next; 964 mp->b_next = NULL; 965 966 type = OTH; 967 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 968 969 if (is_ether) { 970 /* 971 * At this point we can be sure the packet at least 972 * has an ether header. 973 */ 974 if (sz1 < sizeof (struct ether_header)) { 975 mac_rx_drop_pkt(mac_srs, mp); 976 continue; 977 } 978 ehp = (struct ether_header *)mp->b_rptr; 979 980 /* 981 * Determine if this is a VLAN or non-VLAN packet. 982 */ 983 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 984 evhp = (struct ether_vlan_header *)mp->b_rptr; 985 sap = ntohs(evhp->ether_type); 986 hdrsize = sizeof (struct ether_vlan_header); 987 /* 988 * Check if the VID of the packet, if any, 989 * belongs to this client. 990 */ 991 if (!mac_client_check_flow_vid(mcip, 992 VLAN_ID(ntohs(evhp->ether_tci)))) { 993 mac_rx_drop_pkt(mac_srs, mp); 994 continue; 995 } 996 } else { 997 hdrsize = sizeof (struct ether_header); 998 } 999 is_unicast = 1000 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 1001 dstaddr = (uint8_t *)&ehp->ether_dhost; 1002 } else { 1003 mac_header_info_t mhi; 1004 1005 if (mac_header_info((mac_handle_t)mcip->mci_mip, 1006 mp, &mhi) != 0) { 1007 mac_rx_drop_pkt(mac_srs, mp); 1008 continue; 1009 } 1010 hdrsize = mhi.mhi_hdrsize; 1011 sap = mhi.mhi_bindsap; 1012 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 1013 dstaddr = (uint8_t *)mhi.mhi_daddr; 1014 } 1015 1016 if (!dls_bypass) { 1017 if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 1018 hdrsize, &type, &indx) == -1) { 1019 mac_rx_drop_pkt(mac_srs, mp); 1020 continue; 1021 } 1022 1023 FANOUT_ENQUEUE_MP(headmp[type][indx], 1024 tailmp[type][indx], cnt[type][indx], bw_ctl, 1025 sz[type][indx], sz1, mp); 1026 continue; 1027 } 1028 1029 1030 /* 1031 * If we are using the default Rx ring where H/W or S/W 1032 * classification has not happened, we need to verify if 1033 * this unicast packet really belongs to us. 1034 */ 1035 if (sap == ETHERTYPE_IP) { 1036 /* 1037 * If we are H/W classified, but we have promisc 1038 * on, then we need to check for the unicast address. 
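			 * When promiscuous callbacks are present, only
			 * packets whose destination actually matches the
			 * client's configured unicast address move on to the
			 * port based fanout; everything else stays on the
			 * OTH path.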
1039 */ 1040 if (hw_classified && mcip->mci_promisc_list != NULL) { 1041 mac_address_t *map; 1042 1043 rw_enter(&mcip->mci_rw_lock, RW_READER); 1044 map = mcip->mci_unicast; 1045 if (bcmp(dstaddr, map->ma_addr, 1046 map->ma_len) == 0) 1047 type = UNDEF; 1048 rw_exit(&mcip->mci_rw_lock); 1049 } else if (is_unicast) { 1050 type = UNDEF; 1051 } 1052 } 1053 1054 /* 1055 * This needs to become a contract with the driver for 1056 * the fast path. 1057 */ 1058 1059 ipha = (ipha_t *)(mp->b_rptr + hdrsize); 1060 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { 1061 type = OTH; 1062 fanout_oth1++; 1063 } 1064 1065 if (type != OTH) { 1066 uint16_t frag_offset_flags; 1067 1068 switch (ipha->ipha_protocol) { 1069 case IPPROTO_TCP: 1070 case IPPROTO_UDP: 1071 case IPPROTO_SCTP: 1072 case IPPROTO_ESP: 1073 ipha_len = IPH_HDR_LENGTH(ipha); 1074 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > 1075 mp->b_wptr) { 1076 type = OTH; 1077 break; 1078 } 1079 frag_offset_flags = 1080 ntohs(ipha->ipha_fragment_offset_and_flags); 1081 if ((frag_offset_flags & 1082 (IPH_MF | IPH_OFFSET)) != 0) { 1083 type = OTH; 1084 fanout_oth3++; 1085 break; 1086 } 1087 ports_offset = hdrsize + ipha_len; 1088 break; 1089 default: 1090 type = OTH; 1091 fanout_oth4++; 1092 break; 1093 } 1094 } 1095 1096 if (type == OTH) { 1097 if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 1098 hdrsize, &type, &indx) == -1) { 1099 mac_rx_drop_pkt(mac_srs, mp); 1100 continue; 1101 } 1102 1103 FANOUT_ENQUEUE_MP(headmp[type][indx], 1104 tailmp[type][indx], cnt[type][indx], bw_ctl, 1105 sz[type][indx], sz1, mp); 1106 continue; 1107 } 1108 1109 ASSERT(type == UNDEF); 1110 1111 /* 1112 * XXX-Sunay: We should hold srs_lock since ring_count 1113 * below can change. But if we are always called from 1114 * mac_rx_srs_drain and SRS_PROC is set, then we can 1115 * enforce that ring_count can't be changed i.e. 1116 * to change fanout type or ring count, the calling 1117 * thread needs to be behind SRS_PROC. 1118 */ 1119 switch (ipha->ipha_protocol) { 1120 case IPPROTO_TCP: 1121 /* 1122 * Note that for ESP, we fanout on SPI and it is at the 1123 * same offset as the 2x16-bit ports. So it is clumped 1124 * along with TCP, UDP and SCTP. 
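			 * In other words, the single 32-bit load at
			 * ports_offset picks up the two 16-bit ports for
			 * TCP/UDP/SCTP and the SPI for ESP, so one hash
			 * expression serves all four protocols.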
1125 */ 1126 hash = HASH_ADDR(ipha->ipha_src, 1127 *(uint32_t *)(mp->b_rptr + ports_offset)); 1128 indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 1129 type = V4_TCP; 1130 mp->b_rptr += hdrsize; 1131 break; 1132 case IPPROTO_UDP: 1133 case IPPROTO_SCTP: 1134 case IPPROTO_ESP: 1135 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 1136 hash = HASH_ADDR(ipha->ipha_src, 1137 *(uint32_t *)(mp->b_rptr + ports_offset)); 1138 indx = COMPUTE_INDEX(hash, 1139 mac_srs->srs_udp_ring_count); 1140 } else { 1141 indx = mac_srs->srs_ind % 1142 mac_srs->srs_udp_ring_count; 1143 mac_srs->srs_ind++; 1144 } 1145 type = V4_UDP; 1146 mp->b_rptr += hdrsize; 1147 break; 1148 default: 1149 indx = 0; 1150 type = OTH; 1151 } 1152 1153 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 1154 cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 1155 } 1156 1157 for (type = V4_TCP; type < UNDEF; type++) { 1158 int i; 1159 1160 for (i = 0; i < fanout_cnt; i++) { 1161 if (headmp[type][i] != NULL) { 1162 mac_soft_ring_t *softring; 1163 1164 ASSERT(tailmp[type][i]->b_next == NULL); 1165 switch (type) { 1166 case V4_TCP: 1167 softring = 1168 mac_srs->srs_tcp_soft_rings[i]; 1169 break; 1170 case V4_UDP: 1171 softring = 1172 mac_srs->srs_udp_soft_rings[i]; 1173 break; 1174 case OTH: 1175 softring = 1176 mac_srs->srs_oth_soft_rings[i]; 1177 break; 1178 } 1179 mac_rx_soft_ring_process(mcip, 1180 softring, headmp[type][i], tailmp[type][i], 1181 cnt[type][i], sz[type][i]); 1182 } 1183 } 1184 } 1185 } 1186 1187 #define SRS_BYTES_TO_PICKUP 150000 1188 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 1189 1190 /* 1191 * mac_rx_srs_poll_ring 1192 * 1193 * This SRS Poll thread uses this routine to poll the underlying hardware 1194 * Rx ring to get a chain of packets. It can inline process that chain 1195 * if mac_latency_optimize is set (default) or signal the SRS worker thread 1196 * to do the remaining processing. 1197 * 1198 * Since packets come in the system via interrupt or poll path, we also 1199 * update the stats and deal with promiscous clients here. 1200 */ 1201 void 1202 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 1203 { 1204 kmutex_t *lock = &mac_srs->srs_lock; 1205 kcondvar_t *async = &mac_srs->srs_cv; 1206 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1207 mblk_t *head, *tail, *mp; 1208 callb_cpr_t cprinfo; 1209 ssize_t bytes_to_pickup; 1210 size_t sz; 1211 int count; 1212 mac_client_impl_t *smcip; 1213 1214 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 1215 mutex_enter(lock); 1216 1217 start: 1218 for (;;) { 1219 if (mac_srs->srs_state & SRS_PAUSE) 1220 goto done; 1221 1222 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1223 cv_wait(async, lock); 1224 CALLB_CPR_SAFE_END(&cprinfo, lock); 1225 1226 if (mac_srs->srs_state & SRS_PAUSE) 1227 goto done; 1228 1229 check_again: 1230 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1231 /* 1232 * We pick as many bytes as we are allowed to queue. 1233 * Its possible that we will exceed the total 1234 * packets queued in case this SRS is part of the 1235 * Rx ring group since > 1 poll thread can be pulling 1236 * upto the max allowed packets at the same time 1237 * but that should be OK. 1238 */ 1239 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1240 bytes_to_pickup = 1241 mac_srs->srs_bw->mac_bw_drop_threshold - 1242 mac_srs->srs_bw->mac_bw_sz; 1243 /* 1244 * We shouldn't have been signalled if we 1245 * have 0 or less bytes to pick but since 1246 * some of the bytes accounting is driver 1247 * dependant, we do the safety check. 
1248 */ 1249 if (bytes_to_pickup < 0) 1250 bytes_to_pickup = 0; 1251 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1252 } else { 1253 /* 1254 * ToDO: Need to change the polling API 1255 * to add a packet count and a flag which 1256 * tells the driver whether we want packets 1257 * based on a count, or bytes, or all the 1258 * packets queued in the driver/HW. This 1259 * way, we never have to check the limits 1260 * on poll path. We truly let only as many 1261 * packets enter the system as we are willing 1262 * to process or queue. 1263 * 1264 * Something along the lines of 1265 * pkts_to_pickup = mac_soft_ring_max_q_cnt - 1266 * mac_srs->srs_poll_pkt_cnt 1267 */ 1268 1269 /* 1270 * Since we are not doing B/W control, pick 1271 * as many packets as allowed. 1272 */ 1273 bytes_to_pickup = max_bytes_to_pickup; 1274 } 1275 1276 /* Poll the underlying Hardware */ 1277 mutex_exit(lock); 1278 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 1279 mutex_enter(lock); 1280 1281 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 1282 SRS_POLL_THR_OWNER); 1283 1284 mp = tail = head; 1285 count = 0; 1286 sz = 0; 1287 while (mp != NULL) { 1288 tail = mp; 1289 sz += msgdsize(mp); 1290 mp = mp->b_next; 1291 count++; 1292 } 1293 1294 if (head != NULL) { 1295 tail->b_next = NULL; 1296 smcip = mac_srs->srs_mcip; 1297 1298 if ((mac_srs->srs_type & SRST_FLOW) || 1299 (smcip == NULL)) { 1300 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1301 rbytes, sz); 1302 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1303 ipackets, count); 1304 } 1305 1306 /* 1307 * If there are any promiscuous mode callbacks 1308 * defined for this MAC client, pass them a copy 1309 * if appropriate and also update the counters. 1310 */ 1311 if (smcip != NULL) { 1312 smcip->mci_stat_ibytes += sz; 1313 smcip->mci_stat_ipackets += count; 1314 1315 if (smcip->mci_mip->mi_promisc_list != NULL) { 1316 mutex_exit(lock); 1317 mac_promisc_dispatch(smcip->mci_mip, 1318 head, NULL); 1319 mutex_enter(lock); 1320 } 1321 } 1322 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1323 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1324 mac_srs->srs_bw->mac_bw_polled += sz; 1325 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1326 } 1327 srs_rx->sr_poll_count += count; 1328 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 1329 count, sz); 1330 if (count <= 10) 1331 srs_rx->sr_chain_cnt_undr10++; 1332 else if (count > 10 && count <= 50) 1333 srs_rx->sr_chain_cnt_10to50++; 1334 else 1335 srs_rx->sr_chain_cnt_over50++; 1336 } 1337 1338 /* 1339 * We are guaranteed that SRS_PROC will be set if we 1340 * are here. Also, poll thread gets to run only if 1341 * the drain was being done by a worker thread although 1342 * its possible that worker thread is still running 1343 * and poll thread was sent down to keep the pipeline 1344 * going instead of doing a complete drain and then 1345 * trying to poll the NIC. 1346 * 1347 * So we need to check SRS_WORKER flag to make sure 1348 * that the worker thread is not processing the queue 1349 * in parallel to us. The flags and conditions are 1350 * protected by the srs_lock to prevent any race. We 1351 * ensure that we don't drop the srs_lock from now 1352 * till the end and similarly we don't drop the srs_lock 1353 * in mac_rx_srs_drain() till similar condition check 1354 * are complete. The mac_rx_srs_drain() needs to ensure 1355 * that SRS_WORKER flag remains set as long as its 1356 * processing the queue. 
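		 * To summarize the cases handled below: if packets are
		 * queued and no worker is running, we either drain them
		 * inline (latency optimized SRS) or signal the worker; if
		 * the SRS is empty and no worker is running, we stop
		 * asking for packets and, once the soft rings have also
		 * drained, interrupts are re-enabled; otherwise the worker
		 * is already draining and the poll thread just goes back
		 * to sleep.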
1357 */ 1358 if (!(mac_srs->srs_state & SRS_WORKER) && 1359 (mac_srs->srs_first != NULL)) { 1360 /* 1361 * We have packets to process and worker thread 1362 * is not running. Check to see if poll thread is 1363 * allowed to process. 1364 */ 1365 if (mac_srs->srs_state & SRS_LATENCY_OPT) { 1366 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 1367 if (!(mac_srs->srs_state & SRS_PAUSE) && 1368 srs_rx->sr_poll_pkt_cnt <= 1369 srs_rx->sr_lowat) { 1370 srs_rx->sr_poll_again++; 1371 goto check_again; 1372 } 1373 /* 1374 * We are already above low water mark 1375 * so stay in the polling mode but no 1376 * need to poll. Once we dip below 1377 * the polling threshold, the processing 1378 * thread (soft ring) will signal us 1379 * to poll again (MAC_UPDATE_SRS_COUNT) 1380 */ 1381 srs_rx->sr_poll_drain_no_poll++; 1382 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1383 /* 1384 * In B/W control case, its possible 1385 * that the backlog built up due to 1386 * B/W limit being reached and packets 1387 * are queued only in SRS. In this case, 1388 * we should schedule worker thread 1389 * since no one else will wake us up. 1390 */ 1391 if ((mac_srs->srs_type & SRST_BW_CONTROL) && 1392 (mac_srs->srs_tid == NULL)) { 1393 mac_srs->srs_tid = 1394 timeout(mac_srs_fire, mac_srs, 1); 1395 srs_rx->sr_poll_worker_wakeup++; 1396 } 1397 } else { 1398 /* 1399 * Wakeup the worker thread for more processing. 1400 * We optimize for throughput in this case. 1401 */ 1402 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1403 MAC_SRS_WORKER_WAKEUP(mac_srs); 1404 srs_rx->sr_poll_sig_worker++; 1405 } 1406 } else if ((mac_srs->srs_first == NULL) && 1407 !(mac_srs->srs_state & SRS_WORKER)) { 1408 /* 1409 * There is nothing queued in SRS and 1410 * no worker thread running. Plus we 1411 * didn't get anything from the H/W 1412 * as well (head == NULL); 1413 */ 1414 ASSERT(head == NULL); 1415 mac_srs->srs_state &= 1416 ~(SRS_PROC|SRS_GET_PKTS); 1417 1418 /* 1419 * If we have a packets in soft ring, don't allow 1420 * more packets to come into this SRS by keeping the 1421 * interrupts off but not polling the H/W. The 1422 * poll thread will get signaled as soon as 1423 * srs_poll_pkt_cnt dips below poll threshold. 1424 */ 1425 if (srs_rx->sr_poll_pkt_cnt == 0) { 1426 srs_rx->sr_poll_intr_enable++; 1427 MAC_SRS_POLLING_OFF(mac_srs); 1428 } else { 1429 /* 1430 * We know nothing is queued in SRS 1431 * since we are here after checking 1432 * srs_first is NULL. The backlog 1433 * is entirely due to packets queued 1434 * in Soft ring which will wake us up 1435 * and get the interface out of polling 1436 * mode once the backlog dips below 1437 * sr_poll_thres. 1438 */ 1439 srs_rx->sr_poll_no_poll++; 1440 } 1441 } else { 1442 /* 1443 * Worker thread is already running. 1444 * Nothing much to do. If the polling 1445 * was enabled, worker thread will deal 1446 * with that. 1447 */ 1448 mac_srs->srs_state &= ~SRS_GET_PKTS; 1449 srs_rx->sr_poll_goto_sleep++; 1450 } 1451 } 1452 done: 1453 mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 1454 cv_signal(&mac_srs->srs_async); 1455 /* 1456 * If this is a temporary quiesce then wait for the restart signal 1457 * from the srs worker. Then clear the flags and signal the srs worker 1458 * to ensure a positive handshake and go back to start. 
1459 */ 1460 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 1461 cv_wait(async, lock); 1462 if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 1463 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 1464 mac_srs->srs_state &= 1465 ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 1466 cv_signal(&mac_srs->srs_async); 1467 goto start; 1468 } else { 1469 mac_srs->srs_state |= SRS_POLL_THR_EXITED; 1470 cv_signal(&mac_srs->srs_async); 1471 CALLB_CPR_EXIT(&cprinfo); 1472 thread_exit(); 1473 } 1474 } 1475 1476 /* 1477 * mac_srs_pick_chain 1478 * 1479 * In Bandwidth control case, checks how many packets can be processed 1480 * and return them in a sub chain. 1481 */ 1482 static mblk_t * 1483 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 1484 size_t *chain_sz, int *chain_cnt) 1485 { 1486 mblk_t *head = NULL; 1487 mblk_t *tail = NULL; 1488 size_t sz; 1489 size_t tsz = 0; 1490 int cnt = 0; 1491 mblk_t *mp; 1492 1493 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1494 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1495 if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 1496 mac_srs->srs_bw->mac_bw_limit) || 1497 (mac_srs->srs_bw->mac_bw_limit == 0)) { 1498 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1499 head = mac_srs->srs_first; 1500 mac_srs->srs_first = NULL; 1501 *chain_tail = mac_srs->srs_last; 1502 mac_srs->srs_last = NULL; 1503 *chain_sz = mac_srs->srs_size; 1504 *chain_cnt = mac_srs->srs_count; 1505 mac_srs->srs_count = 0; 1506 mac_srs->srs_size = 0; 1507 return (head); 1508 } 1509 1510 /* 1511 * Can't clear the entire backlog. 1512 * Need to find how many packets to pick 1513 */ 1514 ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 1515 while ((mp = mac_srs->srs_first) != NULL) { 1516 sz = msgdsize(mp); 1517 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 1518 mac_srs->srs_bw->mac_bw_limit) { 1519 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 1520 mac_srs->srs_bw->mac_bw_state |= 1521 SRS_BW_ENFORCED; 1522 break; 1523 } 1524 1525 /* 1526 * The _size & cnt is decremented from the softrings 1527 * when they send up the packet for polling to work 1528 * properly. 1529 */ 1530 tsz += sz; 1531 cnt++; 1532 mac_srs->srs_count--; 1533 mac_srs->srs_size -= sz; 1534 if (tail != NULL) 1535 tail->b_next = mp; 1536 else 1537 head = mp; 1538 tail = mp; 1539 mac_srs->srs_first = mac_srs->srs_first->b_next; 1540 } 1541 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1542 if (mac_srs->srs_first == NULL) 1543 mac_srs->srs_last = NULL; 1544 1545 if (tail != NULL) 1546 tail->b_next = NULL; 1547 *chain_tail = tail; 1548 *chain_cnt = cnt; 1549 *chain_sz = tsz; 1550 1551 return (head); 1552 } 1553 1554 /* 1555 * mac_rx_srs_drain 1556 * 1557 * The SRS drain routine. Gets to run to clear the queue. Any thread 1558 * (worker, interrupt, poll) can call this based on processing model. 1559 * The first thing we do is disable interrupts if possible and then 1560 * drain the queue. we also try to poll the underlying hardware if 1561 * there is a dedicated hardware Rx ring assigned to this SRS. 1562 * 1563 * There is a equivalent drain routine in bandwidth control mode 1564 * mac_rx_srs_drain_bw. There is some code duplication between the two 1565 * routines but they are highly performance sensitive and are easier 1566 * to read/debug if they stay separate. Any code changes here might 1567 * also apply to mac_rx_srs_drain_bw as well. 
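 * 'proc_type' identifies the calling context (for example SRS_WORKER
 * when called from the worker thread, SRS_POLL_PROC when called from
 * the poll thread) and is OR'd into srs_state along with SRS_PROC for
 * the duration of the drain.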
1568 */ 1569 void 1570 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1571 { 1572 mblk_t *head; 1573 mblk_t *tail; 1574 timeout_id_t tid; 1575 int cnt = 0; 1576 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1577 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1578 1579 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1580 ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); 1581 1582 /* If we are blanked i.e. can't do upcalls, then we are done */ 1583 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1584 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1585 (mac_srs->srs_state & SRS_PAUSE)); 1586 goto out; 1587 } 1588 1589 if (mac_srs->srs_first == NULL) 1590 goto out; 1591 1592 if (!(mac_srs->srs_state & SRS_LATENCY_OPT) && 1593 (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) { 1594 /* 1595 * In the normal case, the SRS worker thread does no 1596 * work and we wait for a backlog to build up before 1597 * we switch into polling mode. In case we are 1598 * optimizing for throughput, we use the worker thread 1599 * as well. The goal is to let worker thread process 1600 * the queue and poll thread to feed packets into 1601 * the queue. As such, we should signal the poll 1602 * thread to try and get more packets. 1603 * 1604 * We could have pulled this check in the POLL_RING 1605 * macro itself but keeping it explicit here makes 1606 * the architecture more human understandable. 1607 */ 1608 MAC_SRS_POLL_RING(mac_srs); 1609 } 1610 1611 again: 1612 head = mac_srs->srs_first; 1613 mac_srs->srs_first = NULL; 1614 tail = mac_srs->srs_last; 1615 mac_srs->srs_last = NULL; 1616 cnt = mac_srs->srs_count; 1617 mac_srs->srs_count = 0; 1618 1619 ASSERT(head != NULL); 1620 ASSERT(tail != NULL); 1621 1622 if ((tid = mac_srs->srs_tid) != 0) 1623 mac_srs->srs_tid = 0; 1624 1625 mac_srs->srs_state |= (SRS_PROC|proc_type); 1626 1627 1628 /* 1629 * mcip is NULL for broadcast and multicast flows. The promisc 1630 * callbacks for broadcast and multicast packets are delivered from 1631 * mac_rx() and we don't need to worry about that case in this path 1632 */ 1633 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1634 mutex_exit(&mac_srs->srs_lock); 1635 mac_promisc_client_dispatch(mcip, head); 1636 mutex_enter(&mac_srs->srs_lock); 1637 } 1638 1639 /* 1640 * Check if SRS itself is doing the processing 1641 * This direct path does not apply when subflows are present. In this 1642 * case, packets need to be dispatched to a soft ring according to the 1643 * flow's bandwidth and other resources contraints. 1644 */ 1645 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1646 mac_direct_rx_t proc; 1647 void *arg1; 1648 mac_resource_handle_t arg2; 1649 1650 /* 1651 * This is the case when a Rx is directly 1652 * assigned and we have a fully classified 1653 * protocol chain. We can deal with it in 1654 * one shot. 1655 */ 1656 proc = srs_rx->sr_func; 1657 arg1 = srs_rx->sr_arg1; 1658 arg2 = srs_rx->sr_arg2; 1659 1660 mac_srs->srs_state |= SRS_CLIENT_PROC; 1661 mutex_exit(&mac_srs->srs_lock); 1662 if (tid != 0) { 1663 (void) untimeout(tid); 1664 tid = 0; 1665 } 1666 1667 proc(arg1, arg2, head, NULL); 1668 /* 1669 * Decrement the size and count here itelf 1670 * since the packet has been processed. 
1671 */ 1672 mutex_enter(&mac_srs->srs_lock); 1673 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1674 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1675 cv_signal(&mac_srs->srs_client_cv); 1676 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1677 } else { 1678 /* Some kind of softrings based fanout is required */ 1679 mutex_exit(&mac_srs->srs_lock); 1680 if (tid != 0) { 1681 (void) untimeout(tid); 1682 tid = 0; 1683 } 1684 1685 /* 1686 * Since the fanout routines can deal with chains, 1687 * shoot the entire chain up. 1688 */ 1689 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1690 mac_rx_srs_fanout(mac_srs, head); 1691 else 1692 mac_rx_srs_proto_fanout(mac_srs, head); 1693 mutex_enter(&mac_srs->srs_lock); 1694 } 1695 1696 if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) && 1697 (mac_srs->srs_first != NULL)) { 1698 /* 1699 * More packets arrived while we were clearing the 1700 * SRS. This can be possible because of one of 1701 * three conditions below: 1702 * 1) The driver is using multiple worker threads 1703 * to send the packets to us. 1704 * 2) The driver has a race in switching 1705 * between interrupt and polling mode or 1706 * 3) Packets are arriving in this SRS via the 1707 * S/W classification as well. 1708 * 1709 * We should switch to polling mode and see if we 1710 * need to send the poll thread down. Also, signal 1711 * the worker thread to process whats just arrived. 1712 */ 1713 MAC_SRS_POLLING_ON(mac_srs); 1714 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { 1715 srs_rx->sr_drain_poll_sig++; 1716 MAC_SRS_POLL_RING(mac_srs); 1717 } 1718 1719 /* 1720 * If we didn't signal the poll thread, we need 1721 * to deal with the pending packets ourselves. 1722 */ 1723 if (proc_type == SRS_WORKER) { 1724 srs_rx->sr_drain_again++; 1725 goto again; 1726 } else { 1727 srs_rx->sr_drain_worker_sig++; 1728 cv_signal(&mac_srs->srs_async); 1729 } 1730 } 1731 1732 out: 1733 if (mac_srs->srs_state & SRS_GET_PKTS) { 1734 /* 1735 * Poll thread is already running. Leave the 1736 * SRS_RPOC set and hand over the control to 1737 * poll thread. 1738 */ 1739 mac_srs->srs_state &= ~proc_type; 1740 srs_rx->sr_drain_poll_running++; 1741 return; 1742 } 1743 1744 /* 1745 * Even if there are no packets queued in SRS, we 1746 * need to make sure that the shared counter is 1747 * clear and any associated softrings have cleared 1748 * all the backlog. Otherwise, leave the interface 1749 * in polling mode and the poll thread will get 1750 * signalled once the count goes down to zero. 1751 * 1752 * If someone is already draining the queue (SRS_PROC is 1753 * set) when the srs_poll_pkt_cnt goes down to zero, 1754 * then it means that drain is already running and we 1755 * will turn off polling at that time if there is 1756 * no backlog. 1757 * 1758 * As long as there are packets queued either 1759 * in soft ring set or its soft rings, we will leave 1760 * the interface in polling mode (even if the drain 1761 * was done being the interrupt thread). We signal 1762 * the poll thread as well if we have dipped below 1763 * low water mark. 1764 * 1765 * NOTE: We can't use the MAC_SRS_POLLING_ON macro 1766 * since that turn polling on only for worker thread. 1767 * Its not worth turning polling on for interrupt 1768 * thread (since NIC will not issue another interrupt) 1769 * unless a backlog builds up. 
1770 */ 1771 if ((srs_rx->sr_poll_pkt_cnt > 0) && 1772 (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 1773 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1774 srs_rx->sr_drain_keep_polling++; 1775 MAC_SRS_POLLING_ON(mac_srs); 1776 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1777 MAC_SRS_POLL_RING(mac_srs); 1778 return; 1779 } 1780 1781 /* Nothing else to do. Get out of poll mode */ 1782 MAC_SRS_POLLING_OFF(mac_srs); 1783 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1784 srs_rx->sr_drain_finish_intr++; 1785 } 1786 1787 /* 1788 * mac_rx_srs_drain_bw 1789 * 1790 * The SRS BW drain routine. Gets to run to clear the queue. Any thread 1791 * (worker, interrupt, poll) can call this based on processing model. 1792 * The first thing we do is disable interrupts if possible and then 1793 * drain the queue. we also try to poll the underlying hardware if 1794 * there is a dedicated hardware Rx ring assigned to this SRS. 1795 * 1796 * There is a equivalent drain routine in non bandwidth control mode 1797 * mac_rx_srs_drain. There is some code duplication between the two 1798 * routines but they are highly performance sensitive and are easier 1799 * to read/debug if they stay separate. Any code changes here might 1800 * also apply to mac_rx_srs_drain as well. 1801 */ 1802 void 1803 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1804 { 1805 mblk_t *head; 1806 mblk_t *tail; 1807 timeout_id_t tid; 1808 size_t sz = 0; 1809 int cnt = 0; 1810 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1811 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1812 1813 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1814 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 1815 again: 1816 /* Check if we are doing B/W control */ 1817 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1818 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 1819 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 1820 mac_srs->srs_bw->mac_bw_used = 0; 1821 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 1822 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 1823 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 1824 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1825 goto done; 1826 } else if (mac_srs->srs_bw->mac_bw_used > 1827 mac_srs->srs_bw->mac_bw_limit) { 1828 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 1829 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1830 goto done; 1831 } 1832 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1833 1834 /* If we are blanked i.e. can't do upcalls, then we are done */ 1835 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1836 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1837 (mac_srs->srs_state & SRS_PAUSE)); 1838 goto done; 1839 } 1840 1841 sz = 0; 1842 cnt = 0; 1843 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 1844 /* 1845 * We couldn't pick up a single packet. 1846 */ 1847 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1848 if ((mac_srs->srs_bw->mac_bw_used == 0) && 1849 (mac_srs->srs_size != 0) && 1850 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1851 /* 1852 * Seems like configured B/W doesn't 1853 * even allow processing of 1 packet 1854 * per tick. 1855 * 1856 * XXX: raise the limit to processing 1857 * at least 1 packet per tick. 
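			 * The code below simply doubles both the B/W limit
			 * and the drop threshold (and logs a notice) so that
			 * at least one packet fits within a tick's budget the
			 * next time around.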
1858 */ 1859 mac_srs->srs_bw->mac_bw_limit += 1860 mac_srs->srs_bw->mac_bw_limit; 1861 mac_srs->srs_bw->mac_bw_drop_threshold += 1862 mac_srs->srs_bw->mac_bw_drop_threshold; 1863 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1864 "raised B/W limit to %d since not even a " 1865 "single packet can be processed per " 1866 "tick %d\n", (void *)mac_srs, 1867 (int)mac_srs->srs_bw->mac_bw_limit, 1868 (int)msgdsize(mac_srs->srs_first)); 1869 } 1870 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1871 goto done; 1872 } 1873 1874 ASSERT(head != NULL); 1875 ASSERT(tail != NULL); 1876 1877 /* zero bandwidth: drop all and return to interrupt mode */ 1878 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1879 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1880 srs_rx->sr_drop_count += cnt; 1881 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1882 mac_srs->srs_bw->mac_bw_sz -= sz; 1883 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1884 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1885 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1886 goto leave_poll; 1887 } else { 1888 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1889 } 1890 1891 if ((tid = mac_srs->srs_tid) != 0) 1892 mac_srs->srs_tid = 0; 1893 1894 mac_srs->srs_state |= (SRS_PROC|proc_type); 1895 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1896 1897 /* 1898 * mcip is NULL for broadcast and multicast flows. The promisc 1899 * callbacks for broadcast and multicast packets are delivered from 1900 * mac_rx() and we don't need to worry about that case in this path. 1901 */ 1902 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1903 mutex_exit(&mac_srs->srs_lock); 1904 mac_promisc_client_dispatch(mcip, head); 1905 mutex_enter(&mac_srs->srs_lock); 1906 } 1907 1908 /* 1909 * Check if the SRS itself is doing the processing. 1910 * This direct path does not apply when subflows are present. In this 1911 * case, packets need to be dispatched to a soft ring according to the 1912 * flow's bandwidth and other resource constraints. 1913 */ 1914 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1915 mac_direct_rx_t proc; 1916 void *arg1; 1917 mac_resource_handle_t arg2; 1918 1919 /* 1920 * This is the case when an Rx is directly 1921 * assigned and we have a fully classified 1922 * protocol chain. We can deal with it in 1923 * one shot. 1924 */ 1925 proc = srs_rx->sr_func; 1926 arg1 = srs_rx->sr_arg1; 1927 arg2 = srs_rx->sr_arg2; 1928 1929 mac_srs->srs_state |= SRS_CLIENT_PROC; 1930 mutex_exit(&mac_srs->srs_lock); 1931 if (tid != 0) { 1932 (void) untimeout(tid); 1933 tid = 0; 1934 } 1935 1936 proc(arg1, arg2, head, NULL); 1937 /* 1938 * Decrement the size and count here itself 1939 * since the packet has been processed. 1940 */ 1941 mutex_enter(&mac_srs->srs_lock); 1942 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1943 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 1944 1945 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1946 cv_signal(&mac_srs->srs_client_cv); 1947 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1948 } else { 1949 /* Some kind of softrings based fanout is required */ 1950 mutex_exit(&mac_srs->srs_lock); 1951 if (tid != 0) { 1952 (void) untimeout(tid); 1953 tid = 0; 1954 } 1955 1956 /* 1957 * Since the fanout routines can deal with chains, 1958 * shoot the entire chain up. 1959 */ 1960 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1961 mac_rx_srs_fanout(mac_srs, head); 1962 else 1963 mac_rx_srs_proto_fanout(mac_srs, head); 1964 mutex_enter(&mac_srs->srs_lock); 1965 } 1966 1967 /* 1968 * Send the poll thread to pick up any packets that arrived 1969 * so far.
This also serves as the last check in case 1970 * nothing else is queued in the SRS. The poll thread 1971 * is signalled only in the case the drain was done 1972 * by the worker thread and SRS_WORKER is set. The 1973 * worker thread can run in parallel as long as the 1974 * SRS_WORKER flag is set. If we have nothing else to 1975 * process, we can exit while leaving SRS_PROC set 1976 * which gives the poll thread control to process and 1977 * cleanup once it returns from the NIC. 1978 * 1979 * If we have nothing else to process, we need to 1980 * ensure that we keep holding the srs_lock till 1981 * all the checks below are done and control is 1982 * handed to the poll thread if it was running. 1983 */ 1984 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1985 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1986 if (mac_srs->srs_first != NULL) { 1987 if (proc_type == SRS_WORKER) { 1988 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1989 if (srs_rx->sr_poll_pkt_cnt <= 1990 srs_rx->sr_lowat) 1991 MAC_SRS_POLL_RING(mac_srs); 1992 goto again; 1993 } else { 1994 cv_signal(&mac_srs->srs_async); 1995 } 1996 } 1997 } 1998 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1999 2000 done: 2001 2002 if (mac_srs->srs_state & SRS_GET_PKTS) { 2003 /* 2004 * Poll thread is already running. Leave the 2005 * SRS_PROC set and hand over the control to 2006 * the poll thread. 2007 */ 2008 mac_srs->srs_state &= ~proc_type; 2009 return; 2010 } 2011 2012 /* 2013 * If we can't process packets because we have exceeded 2014 * the B/W limit for this tick, just set the timeout 2015 * and leave. 2016 * 2017 * Even if there are no packets queued in the SRS, we 2018 * need to make sure that the shared counter is 2019 * clear and any associated softrings have cleared 2020 * all the backlog. Otherwise, leave the interface 2021 * in polling mode and the poll thread will get 2022 * signalled once the count goes down to zero. 2023 * 2024 * If someone is already draining the queue (SRS_PROC is 2025 * set) when the srs_poll_pkt_cnt goes down to zero, 2026 * then it means that drain is already running and we 2027 * will turn off polling at that time if there is 2028 * no backlog. As long as there are packets queued either 2029 * in the soft ring set or its soft rings, we will leave 2030 * the interface in polling mode. 2031 */ 2032 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2033 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 2034 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 2035 (srs_rx->sr_poll_pkt_cnt > 0))) { 2036 MAC_SRS_POLLING_ON(mac_srs); 2037 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2038 if ((mac_srs->srs_first != NULL) && 2039 (mac_srs->srs_tid == NULL)) 2040 mac_srs->srs_tid = timeout(mac_srs_fire, 2041 mac_srs, 1); 2042 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2043 return; 2044 } 2045 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2046 2047 leave_poll: 2048 2049 /* Nothing else to do. Get out of poll mode */ 2050 MAC_SRS_POLLING_OFF(mac_srs); 2051 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2052 } 2053 2054 /* 2055 * mac_srs_worker 2056 * 2057 * The SRS worker routine. Drains the queue when no one else is 2058 * processing it.
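 *
 * In outline, the loop below is roughly (a simplified sketch,
 * not the exact code):
 *
 *	for (;;) {
 *		wait on srs_async until there is work, no one else
 *		holds SRS_PROC and B/W is not enforced (scheduling a
 *		1 tick timeout when it is);
 *		srs_drain_func(mac_srs, SRS_WORKER);
 *	}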
2059 */ 2060 void 2061 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2062 { 2063 kmutex_t *lock = &mac_srs->srs_lock; 2064 kcondvar_t *async = &mac_srs->srs_async; 2065 callb_cpr_t cprinfo; 2066 boolean_t bw_ctl_flag; 2067 2068 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2069 mutex_enter(lock); 2070 2071 start: 2072 for (;;) { 2073 bw_ctl_flag = B_FALSE; 2074 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2075 MAC_SRS_BW_LOCK(mac_srs); 2076 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2077 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2078 bw_ctl_flag = B_TRUE; 2079 MAC_SRS_BW_UNLOCK(mac_srs); 2080 } 2081 /* 2082 * The SRS_BW_ENFORCED flag may change since we have dropped 2083 * the mac_bw_lock. However the drain function can handle both 2084 * a drainable SRS or a bandwidth controlled SRS, and the 2085 * effect of scheduling a timeout is to wakeup the worker 2086 * thread which in turn will call the drain function. Since 2087 * we release the srs_lock atomically only in the cv_wait there 2088 * isn't a fear of waiting for ever. 2089 */ 2090 while (((mac_srs->srs_state & SRS_PROC) || 2091 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2092 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2093 !(mac_srs->srs_state & SRS_PAUSE)) { 2094 /* 2095 * If we have packets queued and we are here 2096 * because B/W control is in place, we better 2097 * schedule the worker wakeup after 1 tick 2098 * to see if bandwidth control can be relaxed. 2099 */ 2100 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2101 /* 2102 * We need to ensure that a timer is already 2103 * scheduled or we force schedule one for 2104 * later so that we can continue processing 2105 * after this quanta is over. 2106 */ 2107 mac_srs->srs_tid = timeout(mac_srs_fire, 2108 mac_srs, 1); 2109 } 2110 wait: 2111 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2112 cv_wait(async, lock); 2113 CALLB_CPR_SAFE_END(&cprinfo, lock); 2114 2115 if (mac_srs->srs_state & SRS_PAUSE) 2116 goto done; 2117 if (mac_srs->srs_state & SRS_PROC) 2118 goto wait; 2119 2120 if (mac_srs->srs_first != NULL && 2121 mac_srs->srs_type & SRST_BW_CONTROL) { 2122 MAC_SRS_BW_LOCK(mac_srs); 2123 if (mac_srs->srs_bw->mac_bw_state & 2124 SRS_BW_ENFORCED) { 2125 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2126 } 2127 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2128 SRS_BW_ENFORCED; 2129 MAC_SRS_BW_UNLOCK(mac_srs); 2130 } 2131 } 2132 2133 if (mac_srs->srs_state & SRS_PAUSE) 2134 goto done; 2135 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2136 } 2137 done: 2138 /* 2139 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2140 * from both hard and soft classifications and waits for such threads 2141 * to finish before signaling the worker. So at this point the only 2142 * thread left that could be competing with the worker is the poll 2143 * thread. In the case of Tx, there shouldn't be any thread holding 2144 * SRS_PROC at this point. 
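 *
 * In outline, the handshake below is roughly (a simplified sketch
 * of the code that follows): take SRS_PROC if it is free, quiesce,
 * then wait on srs_async until the initiator sets SRS_RESTART (go
 * back to the top of the loop) or SRS_CONDEMNED (quiesce once more
 * and thread_exit()).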
2145 */ 2146 if (!(mac_srs->srs_state & SRS_PROC)) { 2147 mac_srs->srs_state |= SRS_PROC; 2148 } else { 2149 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2150 /* 2151 * Poll thread still owns the SRS and is still running 2152 */ 2153 ASSERT((mac_srs->srs_poll_thr == NULL) || 2154 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2155 SRS_POLL_THR_OWNER)); 2156 } 2157 mac_srs_worker_quiesce(mac_srs); 2158 /* 2159 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2160 * of the quiesce operation 2161 */ 2162 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2163 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2164 2165 if (mac_srs->srs_state & SRS_RESTART) { 2166 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2167 mac_srs_worker_restart(mac_srs); 2168 mac_srs->srs_state &= ~SRS_PROC; 2169 goto start; 2170 } 2171 2172 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2173 mac_srs_worker_quiesce(mac_srs); 2174 2175 mac_srs->srs_state &= ~SRS_PROC; 2176 /* The macro drops the srs_lock */ 2177 CALLB_CPR_EXIT(&cprinfo); 2178 thread_exit(); 2179 } 2180 2181 /* 2182 * mac_rx_srs_subflow_process 2183 * 2184 * Receive side routine called from interrupt path when there are 2185 * sub flows present on this SRS. 2186 */ 2187 /* ARGSUSED */ 2188 void 2189 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2190 mblk_t *mp_chain, boolean_t loopback) 2191 { 2192 flow_entry_t *flent = NULL; 2193 flow_entry_t *prev_flent = NULL; 2194 mblk_t *mp = NULL; 2195 mblk_t *tail = NULL; 2196 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2197 mac_client_impl_t *mcip; 2198 2199 mcip = mac_srs->srs_mcip; 2200 ASSERT(mcip != NULL); 2201 2202 /* 2203 * We need to determine the SRS for every packet 2204 * by walking the flow table, if we don't get any, 2205 * then we proceed using the SRS we came with. 2206 */ 2207 mp = tail = mp_chain; 2208 while (mp != NULL) { 2209 2210 /* 2211 * We will increment the stats for the mactching subflow. 2212 * when we get the bytes/pkt count for the classified packets 2213 * later in mac_rx_srs_process. 2214 */ 2215 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2216 FLOW_INBOUND, &flent); 2217 2218 if (mp == mp_chain || flent == prev_flent) { 2219 if (prev_flent != NULL) 2220 FLOW_REFRELE(prev_flent); 2221 prev_flent = flent; 2222 flent = NULL; 2223 tail = mp; 2224 mp = mp->b_next; 2225 continue; 2226 } 2227 tail->b_next = NULL; 2228 /* 2229 * A null indicates, this is for the mac_srs itself. 2230 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2231 */ 2232 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2233 mac_rx_srs_process(arg, 2234 (mac_resource_handle_t)mac_srs, mp_chain, 2235 loopback); 2236 } else { 2237 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2238 prev_flent->fe_cb_arg2, mp_chain, loopback); 2239 FLOW_REFRELE(prev_flent); 2240 } 2241 prev_flent = flent; 2242 flent = NULL; 2243 mp_chain = mp; 2244 tail = mp; 2245 mp = mp->b_next; 2246 } 2247 /* Last chain */ 2248 ASSERT(mp_chain != NULL); 2249 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2250 mac_rx_srs_process(arg, 2251 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2252 } else { 2253 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2254 prev_flent->fe_cb_arg2, mp_chain, loopback); 2255 FLOW_REFRELE(prev_flent); 2256 } 2257 } 2258 2259 /* 2260 * mac_rx_srs_process 2261 * 2262 * Receive side routine called from the interrupt path. 2263 * 2264 * loopback is set to force a context switch on the loopback 2265 * path between MAC clients. 
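 *
 * In outline (a simplified sketch of the logic below): count the
 * chain and update stats, then either queue it on the SRS and wake
 * the worker (B/W controlled, blanked, or backlog over sr_hiwat),
 * or, when nothing is queued and we are latency optimized, process
 * it inline via srs_drain_func(mac_srs, SRS_PROC_FAST).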
2266 */ 2267 /* ARGSUSED */ 2268 void 2269 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2270 boolean_t loopback) 2271 { 2272 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2273 mblk_t *mp, *tail, *head; 2274 int count = 0; 2275 int count1; 2276 size_t sz = 0; 2277 size_t chain_sz, sz1; 2278 mac_bw_ctl_t *mac_bw; 2279 mac_client_impl_t *smcip; 2280 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2281 2282 /* 2283 * Set the tail, count and sz. We set the sz irrespective 2284 * of whether we are doing B/W control or not for the 2285 * purpose of updating the stats. 2286 */ 2287 mp = tail = mp_chain; 2288 while (mp != NULL) { 2289 tail = mp; 2290 count++; 2291 sz += msgdsize(mp); 2292 mp = mp->b_next; 2293 } 2294 2295 mutex_enter(&mac_srs->srs_lock); 2296 smcip = mac_srs->srs_mcip; 2297 2298 if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) { 2299 FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz); 2300 FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count); 2301 } 2302 if (smcip != NULL) { 2303 smcip->mci_stat_ibytes += sz; 2304 smcip->mci_stat_ipackets += count; 2305 } 2306 2307 /* 2308 * If the SRS is already being processed; has been blanked; 2309 * can be processed by the worker thread only; or the B/W limit 2310 * has been reached, then queue the chain and check if the 2311 * worker thread needs to be awakened. 2312 */ 2313 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2314 mac_bw = mac_srs->srs_bw; 2315 ASSERT(mac_bw != NULL); 2316 mutex_enter(&mac_bw->mac_bw_lock); 2317 /* Count the packets and bytes via interrupt */ 2318 srs_rx->sr_intr_count += count; 2319 mac_bw->mac_bw_intr += sz; 2320 if (mac_bw->mac_bw_limit == 0) { 2321 /* zero bandwidth: drop all */ 2322 srs_rx->sr_drop_count += count; 2323 mac_bw->mac_bw_drop_bytes += sz; 2324 mutex_exit(&mac_bw->mac_bw_lock); 2325 mutex_exit(&mac_srs->srs_lock); 2326 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2327 return; 2328 } else { 2329 if ((mac_bw->mac_bw_sz + sz) <= 2330 mac_bw->mac_bw_drop_threshold) { 2331 mutex_exit(&mac_bw->mac_bw_lock); 2332 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2333 tail, count, sz); 2334 } else { 2335 mp = mp_chain; 2336 chain_sz = 0; 2337 count1 = 0; 2338 tail = NULL; 2339 head = NULL; 2340 while (mp != NULL) { 2341 sz1 = msgdsize(mp); 2342 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2343 mac_bw->mac_bw_drop_threshold) 2344 break; 2345 chain_sz += sz1; 2346 count1++; 2347 tail = mp; 2348 mp = mp->b_next; 2349 } 2350 mutex_exit(&mac_bw->mac_bw_lock); 2351 if (tail != NULL) { 2352 head = tail->b_next; 2353 tail->b_next = NULL; 2354 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2355 mp_chain, tail, count1, chain_sz); 2356 sz -= chain_sz; 2357 count -= count1; 2358 } else { 2359 /* Can't pick up any */ 2360 head = mp_chain; 2361 } 2362 if (head != NULL) { 2363 /* Drop any packet over the threshold */ 2364 srs_rx->sr_drop_count += count; 2365 mutex_enter(&mac_bw->mac_bw_lock); 2366 mac_bw->mac_bw_drop_bytes += sz; 2367 mutex_exit(&mac_bw->mac_bw_lock); 2368 freemsgchain(head); 2369 } 2370 } 2371 MAC_SRS_WORKER_WAKEUP(mac_srs); 2372 mutex_exit(&mac_srs->srs_lock); 2373 return; 2374 } 2375 } 2376 2377 /* 2378 * If the total number of packets queued in the SRS and 2379 * its associated soft rings exceeds the max allowed, 2380 * then drop the chain. If we are polling capable, this 2381 * shouldn't be happening.
2382 */ 2383 if (!(mac_srs->srs_type & SRST_BW_CONTROL) && 2384 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) { 2385 mac_bw = mac_srs->srs_bw; 2386 srs_rx->sr_drop_count += count; 2387 mutex_enter(&mac_bw->mac_bw_lock); 2388 mac_bw->mac_bw_drop_bytes += sz; 2389 mutex_exit(&mac_bw->mac_bw_lock); 2390 freemsgchain(mp_chain); 2391 mutex_exit(&mac_srs->srs_lock); 2392 return; 2393 } 2394 2395 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz); 2396 /* Count the packets entering via interrupt path */ 2397 srs_rx->sr_intr_count += count; 2398 2399 if (!(mac_srs->srs_state & SRS_PROC)) { 2400 /* 2401 * If we are coming via loopback or if we are not 2402 * optimizing for latency, we should signal the 2403 * worker thread. 2404 */ 2405 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) { 2406 /* 2407 * For loopback, we need to let the worker take 2408 * over as we don't want to continue in the same 2409 * thread even if we can. This could lead to stack 2410 * overflows and may also end up using 2411 * resources (cpu) incorrectly. 2412 */ 2413 cv_signal(&mac_srs->srs_async); 2414 } else { 2415 /* 2416 * Seems like no one is processing the SRS and 2417 * there is no backlog. We also inline process 2418 * our packet if it's a single packet in the non 2419 * latency optimized case (in the latency optimized 2420 * case, we inline process chains of any size). 2421 */ 2422 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST); 2423 } 2424 } 2425 mutex_exit(&mac_srs->srs_lock); 2426 } 2427 2428 /* TX SIDE ROUTINES (RUNTIME) */ 2429 2430 /* 2431 * mac_tx_srs_no_desc 2432 * 2433 * This routine is called in Tx single ring default mode 2434 * when the Tx ring runs out of descs. 2435 */ 2436 mac_tx_cookie_t 2437 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2438 uint16_t flag, mblk_t **ret_mp) 2439 { 2440 mac_tx_cookie_t cookie = NULL; 2441 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2442 boolean_t wakeup_worker = B_TRUE; 2443 uint32_t tx_mode = srs_tx->st_mode; 2444 int cnt, sz; 2445 mblk_t *tail; 2446 2447 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2448 if (flag & MAC_DROP_ON_NO_DESC) { 2449 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2450 } else { 2451 if (mac_srs->srs_first != NULL) 2452 wakeup_worker = B_FALSE; 2453 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2454 if (flag & MAC_TX_NO_ENQUEUE) { 2455 /* 2456 * If TX_QUEUED is not set, queue the 2457 * packet and let mac_tx_srs_drain() 2458 * set the TX_BLOCKED bit for the 2459 * reasons explained above. Otherwise, 2460 * return the mblks. 2461 */ 2462 if (wakeup_worker) { 2463 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2464 mp_chain, tail, cnt, sz); 2465 } else { 2466 MAC_TX_SET_NO_ENQUEUE(mac_srs, 2467 mp_chain, ret_mp, cookie); 2468 } 2469 } else { 2470 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2471 tail, cnt, sz, cookie); 2472 } 2473 if (wakeup_worker) 2474 cv_signal(&mac_srs->srs_async); 2475 } 2476 return (cookie); 2477 } 2478 2479 /* 2480 * mac_tx_srs_enqueue 2481 * 2482 * This routine is called when the Tx SRS is operating in either serializer 2483 * or bandwidth mode. In serializer mode, a packet will get enqueued 2484 * when a thread cannot enter the SRS exclusively. In bandwidth mode, 2485 * packets get queued if the allowed byte-count limit for a tick is 2486 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and 2487 * MAC_TX_NO_ENQUEUE is set is different than when operating in either 2488 * the default mode or fanout mode.
Here packets get dropped or 2489 * returned back to the caller only after hi-watermark worth of data 2490 * is queued. 2491 */ 2492 static mac_tx_cookie_t 2493 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2494 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2495 { 2496 mac_tx_cookie_t cookie = NULL; 2497 int cnt, sz; 2498 mblk_t *tail; 2499 boolean_t wakeup_worker = B_TRUE; 2500 2501 /* 2502 * Ignore fanout hint if we don't have multiple tx rings. 2503 */ 2504 if (!TX_MULTI_RING_MODE(mac_srs)) 2505 fanout_hint = 0; 2506 2507 if (mac_srs->srs_first != NULL) 2508 wakeup_worker = B_FALSE; 2509 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2510 if (flag & MAC_DROP_ON_NO_DESC) { 2511 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2512 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2513 } else { 2514 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2515 mp_chain, tail, cnt, sz); 2516 } 2517 } else if (flag & MAC_TX_NO_ENQUEUE) { 2518 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2519 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2520 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2521 ret_mp, cookie); 2522 } else { 2523 mp_chain->b_prev = (mblk_t *)fanout_hint; 2524 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2525 mp_chain, tail, cnt, sz); 2526 } 2527 } else { 2528 /* 2529 * If you are BW_ENFORCED, just enqueue the 2530 * packet. srs_worker will drain it at the 2531 * prescribed rate. Before enqueueing, save 2532 * the fanout hint. 2533 */ 2534 mp_chain->b_prev = (mblk_t *)fanout_hint; 2535 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2536 tail, cnt, sz, cookie); 2537 } 2538 if (wakeup_worker) 2539 cv_signal(&mac_srs->srs_async); 2540 return (cookie); 2541 } 2542 2543 /* 2544 * There are five tx modes: 2545 * 2546 * 1) Default mode (SRS_TX_DEFAULT) 2547 * 2) Serialization mode (SRS_TX_SERIALIZE) 2548 * 3) Fanout mode (SRS_TX_FANOUT) 2549 * 4) Bandwdith mode (SRS_TX_BW) 2550 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2551 * 2552 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2553 * based on the number of Tx rings requested for an SRS and whether 2554 * bandwidth control is requested or not. 2555 * 2556 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a 2557 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying 2558 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS. 2559 * When flow-control is relieved, the srs_worker drains the queued 2560 * packets and informs blocked clients to restart sending packets. 2561 * 2562 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. 2563 * 2564 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2565 * Tx rings. Each Tx ring will have a soft ring associated with it. 2566 * These soft rings will be hung off the Tx SRS. Queueing if it happens 2567 * due to lack of Tx desc will be in individual soft ring (and not srs) 2568 * associated with Tx ring. 2569 * 2570 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2571 * only if bw is available. Otherwise the packets will be queued in 2572 * SRS. If fanout to multiple Tx rings is configured, the packets will 2573 * be fanned out among the soft rings associated with the Tx rings. 2574 * 2575 * Four flags are used in srs_state for indicating flow control 2576 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT. 2577 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the 2578 * driver below. 
2579 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat 2580 * and flow-control pressure is applied back to clients. The clients expect 2581 * wakeup when flow-control is relieved. 2582 * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and the mblk 2583 * got returned back to the client either due to lack of Tx descs or due to bw 2584 * control reasons. The clients expect a wakeup when the condition is relieved. 2585 * 2586 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but 2587 * some clients set the following values too: MAC_DROP_ON_NO_DESC, 2588 * MAC_TX_NO_ENQUEUE. 2589 * MAC clients that do not want packets to be enqueued in the mac layer set 2590 * the MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or 2591 * Tx soft rings but instead get dropped when the NIC runs out of desc. The 2592 * behaviour of this flag is different when the Tx is running in serializer 2593 * or bandwidth mode. Under these (serializer, bandwidth) modes, the packets 2594 * get dropped when the Tx high watermark is reached. 2595 * There are some MAC clients like vsw, aggr that want the mblks to be 2596 * returned back to clients instead of being queued in Tx SRS (or Tx soft 2597 * rings) under flow-control (i.e., out of desc or exceeding bw limits) 2598 * conditions. These clients call mac_tx() with the MAC_TX_NO_ENQUEUE flag set. 2599 * In the default and Tx fanout mode, the un-transmitted mblks will be 2600 * returned back to the clients when the driver runs out of Tx descs. 2601 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in the SRS (or 2602 * soft ring) so that the clients can be woken up when Tx descs become 2603 * available. When running in serializer or bandwidth mode, 2604 * SRS_TX_WAKEUP_CLIENT will be set when the tx hi-watermark is reached. 2605 */ 2606 2607 mac_tx_func_t 2608 mac_tx_get_func(uint32_t mode) 2609 { 2610 return (mac_tx_mode_list[mode].mac_tx_func); 2611 } 2612 2613 /* ARGSUSED */ 2614 static mac_tx_cookie_t 2615 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2616 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2617 { 2618 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2619 boolean_t is_subflow; 2620 mac_tx_stats_t stats; 2621 mac_tx_cookie_t cookie = NULL; 2622 2623 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 2624 2625 /* Regular case with a single Tx ring */ 2626 /* 2627 * SRS_TX_BLOCKED is set when the underlying NIC runs 2628 * out of Tx descs and messages start getting 2629 * queued. It won't get reset until 2630 * tx_srs_drain() completely drains out the 2631 * messages. 2632 */ 2633 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2634 /* Tx descs/resources not available */ 2635 mutex_enter(&mac_srs->srs_lock); 2636 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2637 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2638 flag, ret_mp); 2639 mutex_exit(&mac_srs->srs_lock); 2640 return (cookie); 2641 } 2642 /* 2643 * While we were computing the mblk count, the 2644 * flow control condition got relieved. 2645 * Continue with the transmission. 2646 */ 2647 mutex_exit(&mac_srs->srs_lock); 2648 } 2649 2650 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2651 2652 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2653 mp_chain, (is_subflow ? &stats : NULL)); 2654 2655 /* 2656 * Multiple threads could be here sending packets. 2657 * Under such conditions, it is not possible to 2658 * atomically set the SRS_TX_BLOCKED bit to indicate an 2659 * out of tx desc condition.
To atomically set 2660 * this, we queue the returned packet and do 2661 * the setting of SRS_TX_BLOCKED in 2662 * mac_tx_srs_drain(). 2663 */ 2664 if (mp_chain != NULL) { 2665 mutex_enter(&mac_srs->srs_lock); 2666 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2667 mutex_exit(&mac_srs->srs_lock); 2668 return (cookie); 2669 } 2670 2671 if (is_subflow) 2672 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2673 2674 return (NULL); 2675 } 2676 2677 /* 2678 * mac_tx_serialize_mode 2679 * 2680 * This is an experimental mode implemented as per the request of PAE. 2681 * In this mode, all callers attempting to send a packet to the NIC 2682 * will get serialized. Only one thread at any time will access the 2683 * NIC to send the packet out. 2684 */ 2685 /* ARGSUSED */ 2686 static mac_tx_cookie_t 2687 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2688 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2689 { 2690 boolean_t is_subflow; 2691 mac_tx_stats_t stats; 2692 mac_tx_cookie_t cookie = NULL; 2693 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2694 2695 /* Single ring, serialize below */ 2696 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2697 mutex_enter(&mac_srs->srs_lock); 2698 if ((mac_srs->srs_first != NULL) || 2699 (mac_srs->srs_state & SRS_PROC)) { 2700 /* 2701 * In serialization mode, queue all packets until 2702 * TX_HIWAT is set. 2703 * If drop bit is set, drop if TX_HIWAT is set. 2704 * If no_enqueue is set, still enqueue until hiwat 2705 * is set and return mblks after TX_HIWAT is set. 2706 */ 2707 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2708 flag, NULL, ret_mp); 2709 mutex_exit(&mac_srs->srs_lock); 2710 return (cookie); 2711 } 2712 /* 2713 * No packets queued, nothing on proc and no flow 2714 * control condition. Fast-path, ok. Do inline 2715 * processing. 2716 */ 2717 mac_srs->srs_state |= SRS_PROC; 2718 mutex_exit(&mac_srs->srs_lock); 2719 2720 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2721 2722 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2723 mp_chain, (is_subflow ? &stats : NULL)); 2724 2725 mutex_enter(&mac_srs->srs_lock); 2726 mac_srs->srs_state &= ~SRS_PROC; 2727 if (mp_chain != NULL) { 2728 cookie = mac_tx_srs_enqueue(mac_srs, 2729 mp_chain, flag, NULL, ret_mp); 2730 } 2731 if (mac_srs->srs_first != NULL) { 2732 /* 2733 * We processed inline our packet and a new 2734 * packet/s got queued while we were 2735 * processing. Wakeup srs worker 2736 */ 2737 cv_signal(&mac_srs->srs_async); 2738 } 2739 mutex_exit(&mac_srs->srs_lock); 2740 2741 if (is_subflow && cookie == NULL) 2742 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2743 2744 return (cookie); 2745 } 2746 2747 /* 2748 * mac_tx_fanout_mode 2749 * 2750 * In this mode, the SRS will have access to multiple Tx rings to send 2751 * the packet out. The fanout hint that is passed as an argument is 2752 * used to find an appropriate ring to fanout the traffic. Each Tx 2753 * ring, in turn, will have a soft ring associated with it. If a Tx 2754 * ring runs out of Tx desc's the returned packet will be queued in 2755 * the soft ring associated with that Tx ring. The srs itself will not 2756 * queue any packets. 
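 *
 * For example (illustrative only), a caller that wants all packets
 * of one connection to stay ordered on a single Tx ring passes the
 * same non-zero fanout hint for that connection; the hint is then
 * mapped to a soft ring roughly as (a sketch of the macro below):
 *
 *	hash = HASH_HINT(fanout_hint);
 *	index = COMPUTE_INDEX(hash, srs_oth_ring_count);
 *	softring = srs_oth_soft_rings[index];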
2757 */ 2758 2759 #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2760 index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 2761 softring = mac_srs->srs_oth_soft_rings[index]; \ 2762 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2763 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2764 } 2765 2766 static mac_tx_cookie_t 2767 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2768 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2769 { 2770 mac_soft_ring_t *softring; 2771 uint64_t hash; 2772 uint_t index; 2773 mac_tx_cookie_t cookie = NULL; 2774 2775 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2776 if (fanout_hint != 0) { 2777 /* 2778 * The hint is specified by the caller, simply pass the 2779 * whole chain to the soft ring. 2780 */ 2781 hash = HASH_HINT(fanout_hint); 2782 MAC_TX_SOFT_RING_PROCESS(mp_chain); 2783 } else { 2784 mblk_t *last_mp, *cur_mp, *sub_chain; 2785 uint64_t last_hash = 0; 2786 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2787 2788 /* 2789 * Compute the hash from the contents (headers) of the 2790 * packets of the mblk chain. Split the chains into 2791 * subchains of the same conversation. 2792 * 2793 * Since there may be more than one ring used for 2794 * sub-chains of the same call, and since the caller 2795 * does not maintain per conversation state since it 2796 * passed a zero hint, unsent subchains will be 2797 * dropped. 2798 */ 2799 2800 flag |= MAC_DROP_ON_NO_DESC; 2801 ret_mp = NULL; 2802 2803 ASSERT(ret_mp == NULL); 2804 2805 sub_chain = NULL; 2806 last_mp = NULL; 2807 2808 for (cur_mp = mp_chain; cur_mp != NULL; 2809 cur_mp = cur_mp->b_next) { 2810 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2811 B_TRUE); 2812 if (last_hash != 0 && hash != last_hash) { 2813 /* 2814 * Starting a different subchain, send current 2815 * chain out. 2816 */ 2817 ASSERT(last_mp != NULL); 2818 last_mp->b_next = NULL; 2819 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2820 sub_chain = NULL; 2821 } 2822 2823 /* add packet to subchain */ 2824 if (sub_chain == NULL) 2825 sub_chain = cur_mp; 2826 last_mp = cur_mp; 2827 last_hash = hash; 2828 } 2829 2830 if (sub_chain != NULL) { 2831 /* send last subchain */ 2832 ASSERT(last_mp != NULL); 2833 last_mp->b_next = NULL; 2834 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2835 } 2836 2837 cookie = NULL; 2838 } 2839 2840 return (cookie); 2841 } 2842 2843 /* 2844 * mac_tx_bw_mode 2845 * 2846 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2847 * only if bw is available. Otherwise the packets will be queued in 2848 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2849 * out to a Tx rings. 2850 */ 2851 static mac_tx_cookie_t 2852 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2853 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2854 { 2855 int cnt, sz; 2856 mblk_t *tail; 2857 mac_tx_cookie_t cookie = NULL; 2858 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2859 2860 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2861 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2862 mutex_enter(&mac_srs->srs_lock); 2863 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2864 /* 2865 * zero bandwidth, no traffic is sent: drop the packets, 2866 * or return the whole chain if the caller requests all 2867 * unsent packets back. 
2868 */ 2869 if (flag & MAC_TX_NO_ENQUEUE) { 2870 cookie = (mac_tx_cookie_t)mac_srs; 2871 *ret_mp = mp_chain; 2872 } else { 2873 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2874 } 2875 mutex_exit(&mac_srs->srs_lock); 2876 return (cookie); 2877 } else if ((mac_srs->srs_first != NULL) || 2878 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2879 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2880 fanout_hint, ret_mp); 2881 mutex_exit(&mac_srs->srs_lock); 2882 return (cookie); 2883 } 2884 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2885 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 2886 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 2887 mac_srs->srs_bw->mac_bw_used = 0; 2888 } else if (mac_srs->srs_bw->mac_bw_used > 2889 mac_srs->srs_bw->mac_bw_limit) { 2890 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2891 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2892 mp_chain, tail, cnt, sz); 2893 /* 2894 * Wake up the worker thread. Note that the worker 2895 * thread has to be woken up so that it 2896 * can fire up the timer to be woken up 2897 * on the next tick. Also once 2898 * BW_ENFORCED is set, it can only be 2899 * reset by the srs_worker thread. Until then 2900 * all packets will get queued up in the SRS 2901 * and hence this code path won't be 2902 * entered until BW_ENFORCED is reset. 2903 */ 2904 cv_signal(&mac_srs->srs_async); 2905 mutex_exit(&mac_srs->srs_lock); 2906 return (cookie); 2907 } 2908 2909 mac_srs->srs_bw->mac_bw_used += sz; 2910 mutex_exit(&mac_srs->srs_lock); 2911 2912 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2913 mac_soft_ring_t *softring; 2914 uint_t indx, hash; 2915 2916 hash = HASH_HINT(fanout_hint); 2917 indx = COMPUTE_INDEX(hash, 2918 mac_srs->srs_oth_ring_count); 2919 softring = mac_srs->srs_oth_soft_rings[indx]; 2920 return (mac_tx_soft_ring_process(softring, mp_chain, flag, 2921 ret_mp)); 2922 } else { 2923 boolean_t is_subflow; 2924 mac_tx_stats_t stats; 2925 2926 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2927 2928 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2929 mp_chain, (is_subflow ?
&stats : NULL)); 2930 2931 if (mp_chain != NULL) { 2932 mutex_enter(&mac_srs->srs_lock); 2933 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2934 if (mac_srs->srs_bw->mac_bw_used > sz) 2935 mac_srs->srs_bw->mac_bw_used -= sz; 2936 else 2937 mac_srs->srs_bw->mac_bw_used = 0; 2938 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2939 fanout_hint, ret_mp); 2940 mutex_exit(&mac_srs->srs_lock); 2941 return (cookie); 2942 } 2943 if (is_subflow) 2944 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2945 2946 return (NULL); 2947 } 2948 } 2949 2950 /* ARGSUSED */ 2951 void 2952 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 2953 { 2954 mblk_t *head, *tail; 2955 size_t sz; 2956 uint32_t tx_mode; 2957 uint_t saved_pkt_count; 2958 boolean_t is_subflow; 2959 mac_tx_stats_t stats; 2960 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2961 2962 saved_pkt_count = 0; 2963 ASSERT(mutex_owned(&mac_srs->srs_lock)); 2964 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 2965 2966 mac_srs->srs_state |= SRS_PROC; 2967 2968 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2969 tx_mode = srs_tx->st_mode; 2970 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 2971 if (mac_srs->srs_first != NULL) { 2972 head = mac_srs->srs_first; 2973 tail = mac_srs->srs_last; 2974 saved_pkt_count = mac_srs->srs_count; 2975 mac_srs->srs_first = NULL; 2976 mac_srs->srs_last = NULL; 2977 mac_srs->srs_count = 0; 2978 mutex_exit(&mac_srs->srs_lock); 2979 2980 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2981 head, &stats); 2982 2983 mutex_enter(&mac_srs->srs_lock); 2984 if (head != NULL) { 2985 /* Device out of tx desc, set block */ 2986 if (head->b_next == NULL) 2987 VERIFY(head == tail); 2988 tail->b_next = mac_srs->srs_first; 2989 mac_srs->srs_first = head; 2990 mac_srs->srs_count += 2991 (saved_pkt_count - stats.ts_opackets); 2992 if (mac_srs->srs_last == NULL) 2993 mac_srs->srs_last = tail; 2994 MAC_TX_SRS_BLOCK(mac_srs, head); 2995 } else { 2996 srs_tx->st_woken_up = B_FALSE; 2997 if (is_subflow) { 2998 FLOW_TX_STATS_UPDATE( 2999 mac_srs->srs_flent, &stats); 3000 } 3001 } 3002 } 3003 } else if (tx_mode == SRS_TX_BW) { 3004 /* 3005 * We are here because the timer fired and we have some data 3006 * to transmit.
Also mac_tx_srs_worker should have reset 3007 * SRS_BW_ENFORCED flag 3008 */ 3009 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 3010 head = tail = mac_srs->srs_first; 3011 while (mac_srs->srs_first != NULL) { 3012 tail = mac_srs->srs_first; 3013 tail->b_prev = NULL; 3014 mac_srs->srs_first = tail->b_next; 3015 if (mac_srs->srs_first == NULL) 3016 mac_srs->srs_last = NULL; 3017 mac_srs->srs_count--; 3018 sz = msgdsize(tail); 3019 mac_srs->srs_size -= sz; 3020 saved_pkt_count++; 3021 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 3022 3023 if (mac_srs->srs_bw->mac_bw_used < 3024 mac_srs->srs_bw->mac_bw_limit) 3025 continue; 3026 3027 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 3028 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 3029 mac_srs->srs_bw->mac_bw_used = sz; 3030 continue; 3031 } 3032 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3033 break; 3034 } 3035 3036 ASSERT((head == NULL && tail == NULL) || 3037 (head != NULL && tail != NULL)); 3038 if (tail != NULL) { 3039 tail->b_next = NULL; 3040 mutex_exit(&mac_srs->srs_lock); 3041 3042 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3043 head, &stats); 3044 3045 mutex_enter(&mac_srs->srs_lock); 3046 if (head != NULL) { 3047 uint_t size_sent; 3048 3049 /* Device out of tx desc, set block */ 3050 if (head->b_next == NULL) 3051 VERIFY(head == tail); 3052 tail->b_next = mac_srs->srs_first; 3053 mac_srs->srs_first = head; 3054 mac_srs->srs_count += 3055 (saved_pkt_count - stats.ts_opackets); 3056 if (mac_srs->srs_last == NULL) 3057 mac_srs->srs_last = tail; 3058 size_sent = sz - stats.ts_obytes; 3059 mac_srs->srs_size += size_sent; 3060 mac_srs->srs_bw->mac_bw_sz += size_sent; 3061 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 3062 mac_srs->srs_bw->mac_bw_used -= 3063 size_sent; 3064 } else { 3065 mac_srs->srs_bw->mac_bw_used = 0; 3066 } 3067 MAC_TX_SRS_BLOCK(mac_srs, head); 3068 } else { 3069 srs_tx->st_woken_up = B_FALSE; 3070 if (is_subflow) { 3071 FLOW_TX_STATS_UPDATE( 3072 mac_srs->srs_flent, &stats); 3073 } 3074 } 3075 } 3076 } else if (tx_mode == SRS_TX_BW_FANOUT) { 3077 mblk_t *prev; 3078 mac_soft_ring_t *softring; 3079 uint64_t hint; 3080 3081 /* 3082 * We are here because the timer fired and we 3083 * have some quota to tranmit. 
3084 */ 3085 prev = NULL; 3086 head = tail = mac_srs->srs_first; 3087 while (mac_srs->srs_first != NULL) { 3088 tail = mac_srs->srs_first; 3089 mac_srs->srs_first = tail->b_next; 3090 if (mac_srs->srs_first == NULL) 3091 mac_srs->srs_last = NULL; 3092 mac_srs->srs_count--; 3093 sz = msgdsize(tail); 3094 mac_srs->srs_size -= sz; 3095 mac_srs->srs_bw->mac_bw_used += sz; 3096 if (prev == NULL) 3097 hint = (ulong_t)tail->b_prev; 3098 if (hint != (ulong_t)tail->b_prev) { 3099 prev->b_next = NULL; 3100 mutex_exit(&mac_srs->srs_lock); 3101 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3102 head = tail; 3103 hint = (ulong_t)tail->b_prev; 3104 mutex_enter(&mac_srs->srs_lock); 3105 } 3106 3107 prev = tail; 3108 tail->b_prev = NULL; 3109 if (mac_srs->srs_bw->mac_bw_used < 3110 mac_srs->srs_bw->mac_bw_limit) 3111 continue; 3112 3113 if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 3114 mac_srs->srs_bw->mac_bw_curr_time = lbolt; 3115 mac_srs->srs_bw->mac_bw_used = 0; 3116 continue; 3117 } 3118 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3119 break; 3120 } 3121 ASSERT((head == NULL && tail == NULL) || 3122 (head != NULL && tail != NULL)); 3123 if (tail != NULL) { 3124 tail->b_next = NULL; 3125 mutex_exit(&mac_srs->srs_lock); 3126 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3127 mutex_enter(&mac_srs->srs_lock); 3128 } 3129 } 3130 /* 3131 * SRS_TX_FANOUT case not considered here because packets 3132 * won't be queued in the SRS for this case. Packets will 3133 * be sent directly to soft rings underneath and if there 3134 * is any queueing at all, it would be in Tx side soft 3135 * rings. 3136 */ 3137 3138 /* 3139 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3140 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3141 */ 3142 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3143 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3144 mac_tx_notify_cb_t *mtnfp; 3145 mac_cb_t *mcb; 3146 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3147 boolean_t wakeup_required = B_FALSE; 3148 3149 if (mac_srs->srs_state & 3150 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3151 wakeup_required = B_TRUE; 3152 } 3153 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3154 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3155 mutex_exit(&mac_srs->srs_lock); 3156 if (wakeup_required) { 3157 /* Wakeup callback registered clients */ 3158 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3159 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3160 mcb = mcb->mcb_nextp) { 3161 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3162 mtnfp->mtnf_fn(mtnfp->mtnf_arg, 3163 (mac_tx_cookie_t)mac_srs); 3164 } 3165 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3166 &mcip->mci_tx_notify_cb_list); 3167 /* 3168 * If the client is not the primary MAC client, then we 3169 * need to send the notification to the clients upper 3170 * MAC, i.e. mci_upper_mip. 3171 */ 3172 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3173 mcip->mci_upper_mip : mcip->mci_mip); 3174 } 3175 mutex_enter(&mac_srs->srs_lock); 3176 } 3177 mac_srs->srs_state &= ~SRS_PROC; 3178 } 3179 3180 /* 3181 * Given a packet, get the flow_entry that identifies the flow 3182 * to which that packet belongs. The flow_entry will contain 3183 * the transmit function to be used to send the packet. If the 3184 * function returns NULL, the packet should be sent using the 3185 * underlying NIC. 
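 *
 * A typical caller (see mac_tx_send() below) uses it roughly as
 * follows (a sketch, not the exact code):
 *
 *	dst_flow_ent = mac_tx_classify(mip, mp);
 *	if (dst_flow_ent == NULL)
 *		MAC_TX(mip, ring, mp, ...);	/- send via the NIC -/
 *	else
 *		deliver via dst_flow_ent->fe_cb_fn() and then
 *		FLOW_REFRELE(dst_flow_ent);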
3186 */ 3187 static flow_entry_t * 3188 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3189 { 3190 flow_entry_t *flent = NULL; 3191 mac_client_impl_t *mcip; 3192 int err; 3193 3194 /* 3195 * Do classification on the packet. 3196 */ 3197 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3198 if (err != 0) 3199 return (NULL); 3200 3201 /* 3202 * This flent might just be an additional one on the MAC client, 3203 * i.e. for classification purposes (different fdesc), however 3204 * the resources, SRS et. al., are in the mci_flent, so if 3205 * this isn't the mci_flent, we need to get it. 3206 */ 3207 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3208 FLOW_REFRELE(flent); 3209 flent = mcip->mci_flent; 3210 FLOW_TRY_REFHOLD(flent, err); 3211 if (err != 0) 3212 return (NULL); 3213 } 3214 3215 return (flent); 3216 } 3217 3218 /* 3219 * This macro is only meant to be used by mac_tx_send(). 3220 */ 3221 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3222 if (vid_check) { \ 3223 int err = 0; \ 3224 \ 3225 MAC_VID_CHECK(src_mcip, (mp), err); \ 3226 if (err != 0) { \ 3227 freemsg((mp)); \ 3228 (mp) = next; \ 3229 oerrors++; \ 3230 continue; \ 3231 } \ 3232 } \ 3233 if (add_tag) { \ 3234 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3235 if ((mp) == NULL) { \ 3236 (mp) = next; \ 3237 oerrors++; \ 3238 continue; \ 3239 } \ 3240 } \ 3241 } 3242 3243 mblk_t * 3244 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3245 mac_tx_stats_t *stats) 3246 { 3247 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3248 mac_impl_t *mip = src_mcip->mci_mip; 3249 uint_t obytes = 0, opackets = 0, oerrors = 0; 3250 mblk_t *mp = NULL, *next; 3251 boolean_t vid_check, add_tag; 3252 uint16_t vid = 0; 3253 3254 if (mip->mi_nclients > 1) { 3255 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3256 add_tag = MAC_TAG_NEEDED(src_mcip); 3257 if (add_tag) 3258 vid = mac_client_vid(mch); 3259 } else { 3260 ASSERT(mip->mi_nclients == 1); 3261 vid_check = add_tag = B_FALSE; 3262 } 3263 3264 /* 3265 * Fastpath: if there's only one client, and there's no 3266 * multicast listeners, we simply send the packet down to the 3267 * underlying NIC. 3268 */ 3269 if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 3270 DTRACE_PROBE2(fastpath, 3271 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3272 3273 mp = mp_chain; 3274 while (mp != NULL) { 3275 next = mp->b_next; 3276 mp->b_next = NULL; 3277 opackets++; 3278 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3279 msgdsize(mp)); 3280 3281 CHECK_VID_AND_ADD_TAG(mp); 3282 MAC_TX(mip, ring, mp, 3283 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3284 0)); 3285 3286 /* 3287 * If the driver is out of descriptors and does a 3288 * partial send it will return a chain of unsent 3289 * mblks. Adjust the accounting stats. 3290 */ 3291 if (mp != NULL) { 3292 opackets--; 3293 obytes -= msgdsize(mp); 3294 mp->b_next = next; 3295 break; 3296 } 3297 mp = next; 3298 } 3299 goto done; 3300 } 3301 3302 /* 3303 * No fastpath, we either have more than one MAC client 3304 * defined on top of the same MAC, or one or more MAC 3305 * client promiscuous callbacks. 3306 */ 3307 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3308 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3309 3310 mp = mp_chain; 3311 while (mp != NULL) { 3312 flow_entry_t *dst_flow_ent; 3313 void *flow_cookie; 3314 size_t pkt_size; 3315 mblk_t *mp1; 3316 3317 next = mp->b_next; 3318 mp->b_next = NULL; 3319 opackets++; 3320 pkt_size = (mp->b_cont == NULL ? 
MBLKL(mp) : msgdsize(mp)); 3321 obytes += pkt_size; 3322 CHECK_VID_AND_ADD_TAG(mp); 3323 3324 /* 3325 * Check if there are promiscuous mode callbacks defined. 3326 */ 3327 if (mip->mi_promisc_list != NULL) 3328 mac_promisc_dispatch(mip, mp, src_mcip); 3329 3330 /* 3331 * Find the destination. 3332 */ 3333 dst_flow_ent = mac_tx_classify(mip, mp); 3334 3335 if (dst_flow_ent != NULL) { 3336 size_t hdrsize; 3337 int err = 0; 3338 3339 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3340 struct ether_vlan_header *evhp = 3341 (struct ether_vlan_header *)mp->b_rptr; 3342 3343 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3344 hdrsize = sizeof (*evhp); 3345 else 3346 hdrsize = sizeof (struct ether_header); 3347 } else { 3348 mac_header_info_t mhi; 3349 3350 err = mac_header_info((mac_handle_t)mip, 3351 mp, &mhi); 3352 if (err == 0) 3353 hdrsize = mhi.mhi_hdrsize; 3354 } 3355 3356 /* 3357 * Got a matching flow. It's either another 3358 * MAC client, or a broadcast/multicast flow. 3359 * Make sure the packet size is within the 3360 * allowed size. If not drop the packet and 3361 * move to next packet. 3362 */ 3363 if (err != 0 || 3364 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3365 oerrors++; 3366 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3367 mblk_t *, mp); 3368 freemsg(mp); 3369 mp = next; 3370 FLOW_REFRELE(dst_flow_ent); 3371 continue; 3372 } 3373 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3374 if (flow_cookie != NULL) { 3375 /* 3376 * The vnic_bcast_send function expects 3377 * to receive the sender MAC client 3378 * as value for arg2. 3379 */ 3380 mac_bcast_send(flow_cookie, src_mcip, mp, 3381 B_TRUE); 3382 } else { 3383 /* 3384 * loopback the packet to a 3385 * local MAC client. We force a context 3386 * switch if both source and destination 3387 * MAC clients are used by IP, i.e. bypass 3388 * is set. 3389 */ 3390 boolean_t do_switch; 3391 mac_client_impl_t *dst_mcip = 3392 dst_flow_ent->fe_mcip; 3393 3394 do_switch = ((src_mcip->mci_state_flags & 3395 dst_mcip->mci_state_flags & 3396 MCIS_CLIENT_POLL_CAPABLE) != 0); 3397 3398 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3399 (dst_flow_ent->fe_cb_fn)( 3400 dst_flow_ent->fe_cb_arg1, 3401 dst_flow_ent->fe_cb_arg2, 3402 mp1, do_switch); 3403 } 3404 } 3405 FLOW_REFRELE(dst_flow_ent); 3406 } else { 3407 /* 3408 * Unknown destination, send via the underlying 3409 * NIC. 3410 */ 3411 MAC_TX(mip, ring, mp, 3412 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3413 0)); 3414 if (mp != NULL) { 3415 /* 3416 * Adjust for the last packet that 3417 * could not be transmitted 3418 */ 3419 opackets--; 3420 obytes -= pkt_size; 3421 mp->b_next = next; 3422 break; 3423 } 3424 } 3425 mp = next; 3426 } 3427 3428 done: 3429 src_mcip->mci_stat_obytes += obytes; 3430 src_mcip->mci_stat_opackets += opackets; 3431 src_mcip->mci_stat_oerrors += oerrors; 3432 3433 if (stats != NULL) { 3434 stats->ts_opackets = opackets; 3435 stats->ts_obytes = obytes; 3436 stats->ts_oerrors = oerrors; 3437 } 3438 return (mp); 3439 } 3440 3441 /* 3442 * mac_tx_srs_ring_present 3443 * 3444 * Returns whether the specified ring is part of the specified SRS. 
3445 */ 3446 boolean_t 3447 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3448 { 3449 int i; 3450 mac_soft_ring_t *soft_ring; 3451 3452 if (srs->srs_tx.st_arg2 == tx_ring) 3453 return (B_TRUE); 3454 3455 for (i = 0; i < srs->srs_oth_ring_count; i++) { 3456 soft_ring = srs->srs_oth_soft_rings[i]; 3457 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3458 return (B_TRUE); 3459 } 3460 3461 return (B_FALSE); 3462 } 3463 3464 /* 3465 * mac_tx_srs_wakeup 3466 * 3467 * Called when Tx desc become available. Wakeup the appropriate worker 3468 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3469 * state field. 3470 */ 3471 void 3472 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3473 { 3474 int i; 3475 mac_soft_ring_t *sringp; 3476 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3477 3478 mutex_enter(&mac_srs->srs_lock); 3479 if (TX_SINGLE_RING_MODE(mac_srs)) { 3480 if (srs_tx->st_arg2 == ring && 3481 mac_srs->srs_state & SRS_TX_BLOCKED) { 3482 mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3483 srs_tx->st_unblocked_cnt++; 3484 cv_signal(&mac_srs->srs_async); 3485 } 3486 /* 3487 * A wakeup can come before tx_srs_drain() could 3488 * grab srs lock and set SRS_TX_BLOCKED. So 3489 * always set woken_up flag when we come here. 3490 */ 3491 srs_tx->st_woken_up = B_TRUE; 3492 mutex_exit(&mac_srs->srs_lock); 3493 return; 3494 } 3495 3496 /* If you are here, it is for FANOUT or BW_FANOUT case */ 3497 ASSERT(TX_MULTI_RING_MODE(mac_srs)); 3498 for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3499 sringp = mac_srs->srs_oth_soft_rings[i]; 3500 mutex_enter(&sringp->s_ring_lock); 3501 if (sringp->s_ring_tx_arg2 == ring) { 3502 if (sringp->s_ring_state & S_RING_BLOCK) { 3503 sringp->s_ring_state &= ~S_RING_BLOCK; 3504 sringp->s_ring_unblocked_cnt++; 3505 cv_signal(&sringp->s_ring_async); 3506 } 3507 sringp->s_ring_tx_woken_up = B_TRUE; 3508 } 3509 mutex_exit(&sringp->s_ring_lock); 3510 } 3511 mutex_exit(&mac_srs->srs_lock); 3512 } 3513 3514 /* 3515 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3516 * the blocked clients again. 3517 */ 3518 void 3519 mac_tx_notify(mac_impl_t *mip) 3520 { 3521 i_mac_notify(mip, MAC_NOTE_TX); 3522 } 3523 3524 /* 3525 * RX SOFTRING RELATED FUNCTIONS 3526 * 3527 * These functions really belong in mac_soft_ring.c and here for 3528 * a short period. 3529 */ 3530 3531 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3532 /* \ 3533 * Enqueue our mblk chain. \ 3534 */ \ 3535 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 3536 \ 3537 if ((ringp)->s_ring_last != NULL) \ 3538 (ringp)->s_ring_last->b_next = (mp); \ 3539 else \ 3540 (ringp)->s_ring_first = (mp); \ 3541 (ringp)->s_ring_last = (tail); \ 3542 (ringp)->s_ring_count += (cnt); \ 3543 ASSERT((ringp)->s_ring_count > 0); \ 3544 if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 3545 (ringp)->s_ring_size += sz; \ 3546 } \ 3547 } 3548 3549 /* 3550 * Default entry point to deliver a packet chain to a MAC client. 3551 * If the MAC client has flows, do the classification with these 3552 * flows as well. 
3553 */ 3554 /* ARGSUSED */ 3555 void 3556 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 3557 mac_header_info_t *arg3) 3558 { 3559 mac_client_impl_t *mcip = arg1; 3560 3561 if (mcip->mci_nvids == 1 && 3562 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) { 3563 /* 3564 * If the client has exactly one VID associated with it 3565 * and stripping of the VLAN header is not disabled, 3566 * remove the VLAN tag from the packet before 3567 * passing it on to the client's receive callback. 3568 * Note that this needs to be done after we dispatch 3569 * the packet to the promiscuous listeners of the 3570 * client, since they expect to see the whole 3571 * frame including the VLAN headers. 3572 */ 3573 mp_chain = mac_strip_vlan_tag_chain(mp_chain); 3574 } 3575 3576 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 3577 } 3578 3579 /* 3580 * mac_rx_soft_ring_process 3581 * 3582 * Process a chain for a given soft ring. If the number of packets queued 3583 * in the SRS and its associated soft rings (including this one) is 3584 * very small (tracked by srs_poll_pkt_cnt), then allow the entering 3585 * thread (interrupt or poll thread) to do inline processing. This 3586 * helps keep the latency down under low load. 3587 * 3588 * The proc and arg for each mblk are already stored in the mblk in 3589 * the appropriate places. 3590 */ 3591 /* ARGSUSED */ 3592 void 3593 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 3594 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 3595 { 3596 mac_direct_rx_t proc; 3597 void *arg1; 3598 mac_resource_handle_t arg2; 3599 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3600 3601 ASSERT(ringp != NULL); 3602 ASSERT(mp_chain != NULL); 3603 ASSERT(tail != NULL); 3604 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3605 3606 mutex_enter(&ringp->s_ring_lock); 3607 ringp->s_ring_total_inpkt += cnt; 3608 if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 3609 !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { 3610 /* If on processor or blanking on, then enqueue and return */ 3611 if (ringp->s_ring_state & S_RING_BLANK || 3612 ringp->s_ring_state & S_RING_PROC) { 3613 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3614 mutex_exit(&ringp->s_ring_lock); 3615 return; 3616 } 3617 proc = ringp->s_ring_rx_func; 3618 arg1 = ringp->s_ring_rx_arg1; 3619 arg2 = ringp->s_ring_rx_arg2; 3620 /* 3621 * See if anything is already queued. If we are the 3622 * first packet, do inline processing else queue the 3623 * packet and do the drain. 3624 */ 3625 if (ringp->s_ring_first == NULL) { 3626 /* 3627 * Fast-path, ok to process and nothing queued. 3628 */ 3629 ringp->s_ring_run = curthread; 3630 ringp->s_ring_state |= (S_RING_PROC); 3631 3632 mutex_exit(&ringp->s_ring_lock); 3633 3634 /* 3635 * We are the chain of 1 packet so 3636 * go through this fast path. 3637 */ 3638 ASSERT(mp_chain->b_next == NULL); 3639 3640 (*proc)(arg1, arg2, mp_chain, NULL); 3641 3642 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3643 /* 3644 * If we have a soft ring set which is doing 3645 * bandwidth control, we need to decrement 3646 * srs_size and count so that the SRS can have an 3647 * accurate idea of the real data 3648 * queued between the SRS and its soft rings. We 3649 * decrement the counters only when the packet 3650 * gets processed by both the SRS and the soft ring.
3651 */ 3652 mutex_enter(&mac_srs->srs_lock); 3653 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 3654 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 3655 mutex_exit(&mac_srs->srs_lock); 3656 3657 mutex_enter(&ringp->s_ring_lock); 3658 ringp->s_ring_run = NULL; 3659 ringp->s_ring_state &= ~S_RING_PROC; 3660 if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 3661 cv_signal(&ringp->s_ring_client_cv); 3662 3663 if ((ringp->s_ring_first == NULL) || 3664 (ringp->s_ring_state & S_RING_BLANK)) { 3665 /* 3666 * We processed our packet inline and 3667 * nothing new has arrived or our 3668 * receiver doesn't want to receive 3669 * any packets. We are done. 3670 */ 3671 mutex_exit(&ringp->s_ring_lock); 3672 return; 3673 } 3674 } else { 3675 SOFT_RING_ENQUEUE_CHAIN(ringp, 3676 mp_chain, tail, cnt, sz); 3677 } 3678 3679 /* 3680 * We are here because either we couldn't do inline 3681 * processing (because something was already 3682 * queued), or we had a chain of more than one 3683 * packet, or something else arrived after we were 3684 * done with inline processing. 3685 */ 3686 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3687 ASSERT(ringp->s_ring_first != NULL); 3688 3689 ringp->s_ring_drain_func(ringp); 3690 mutex_exit(&ringp->s_ring_lock); 3691 return; 3692 } else { 3693 /* ST_RING_WORKER_ONLY case */ 3694 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3695 mac_soft_ring_worker_wakeup(ringp); 3696 mutex_exit(&ringp->s_ring_lock); 3697 } 3698 } 3699 3700 /* 3701 * TX SOFTRING RELATED FUNCTIONS 3702 * 3703 * These functions really belong in mac_soft_ring.c and are here for 3704 * a short period. 3705 */ 3706 3707 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3708 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 3709 ringp->s_ring_state |= S_RING_ENQUEUED; \ 3710 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 3711 } 3712 3713 /* 3714 * mac_tx_sring_enqueue 3715 * 3716 * When we are out of transmit descriptors and we already have a 3717 * queue that exceeds hiwat (or the client called us with the 3718 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 3719 * soft ring pointer as the opaque cookie for the client to enable 3720 * flow control. 3721 */ 3722 static mac_tx_cookie_t 3723 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, 3724 mblk_t **ret_mp) 3725 { 3726 int cnt; 3727 size_t sz; 3728 mblk_t *tail; 3729 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3730 mac_tx_cookie_t cookie = NULL; 3731 boolean_t wakeup_worker = B_TRUE; 3732 3733 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3734 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 3735 if (flag & MAC_DROP_ON_NO_DESC) { 3736 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 3737 /* increment freed stats */ 3738 ringp->s_ring_drops += cnt; 3739 cookie = (mac_tx_cookie_t)ringp; 3740 } else { 3741 if (ringp->s_ring_first != NULL) 3742 wakeup_worker = B_FALSE; 3743 3744 if (flag & MAC_TX_NO_ENQUEUE) { 3745 /* 3746 * If QUEUED is not set, queue the packet 3747 * and let mac_tx_soft_ring_drain() set 3748 * the TX_BLOCKED bit for the reasons 3749 * explained above. Otherwise, return the 3750 * mblks. 3751 */ 3752 if (wakeup_worker) { 3753 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, 3754 mp_chain, tail, cnt, sz); 3755 } else { 3756 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT; 3757 cookie = (mac_tx_cookie_t)ringp; 3758 *ret_mp = mp_chain; 3759 } 3760 } else { 3761 boolean_t enqueue = B_TRUE; 3762 3763 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 3764 /* 3765 * flow-controlled.
/*
 * mac_tx_sring_enqueue
 *
 * When we are out of transmit descriptors and we already have a
 * queue that exceeds hiwat (or the client called us with the
 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
 * soft ring pointer as the opaque cookie so that the client can
 * enable flow control.
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
        int cnt;
        size_t sz;
        mblk_t *tail;
        mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
        mac_tx_cookie_t cookie = NULL;
        boolean_t wakeup_worker = B_TRUE;

        ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
        MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
        if (flag & MAC_DROP_ON_NO_DESC) {
                mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
                /* increment freed stats */
                ringp->s_ring_drops += cnt;
                cookie = (mac_tx_cookie_t)ringp;
        } else {
                if (ringp->s_ring_first != NULL)
                        wakeup_worker = B_FALSE;

                if (flag & MAC_TX_NO_ENQUEUE) {
                        /*
                         * If nothing is queued yet (QUEUED is not set),
                         * queue the packet and let
                         * mac_tx_soft_ring_drain() set the TX_BLOCKED
                         * bit for the reasons explained above.
                         * Otherwise, return the mblks.
                         */
                        if (wakeup_worker) {
                                TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
                                    mp_chain, tail, cnt, sz);
                        } else {
                                ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
                                cookie = (mac_tx_cookie_t)ringp;
                                *ret_mp = mp_chain;
                        }
                } else {
                        boolean_t enqueue = B_TRUE;

                        if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
                                /*
                                 * Flow-controlled. Store ringp in cookie
                                 * so that it can be returned as a
                                 * mac_tx_cookie_t to the client.
                                 */
                                ringp->s_ring_state |= S_RING_TX_HIWAT;
                                cookie = (mac_tx_cookie_t)ringp;
                                ringp->s_ring_hiwat_cnt++;
                                if (ringp->s_ring_count >
                                    ringp->s_ring_tx_max_q_cnt) {
                                        /* increment freed stats */
                                        ringp->s_ring_drops += cnt;
                                        /*
                                         * b_prev may be set to the fanout
                                         * hint, hence we can't use freemsg
                                         * directly.
                                         */
                                        mac_pkt_drop(NULL, NULL,
                                            mp_chain, B_FALSE);
                                        DTRACE_PROBE1(tx_queued_hiwat,
                                            mac_soft_ring_t *, ringp);
                                        enqueue = B_FALSE;
                                }
                        }
                        if (enqueue) {
                                TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
                                    tail, cnt, sz);
                        }
                }
                if (wakeup_worker)
                        cv_signal(&ringp->s_ring_async);
        }
        return (cookie);
}

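/*
 * Illustrative sketch only: how a MAC client might react to the opaque
 * cookie returned above.  A non-NULL cookie from mac_tx() means the
 * transmit path is flow-controlled; with MAC_TX_NO_ENQUEUE the unsent
 * chain is handed back through ret_mp.  The client would typically park
 * the chain and retry once the blocked ring drains (for example, after
 * checking mac_tx_is_flow_blocked() from its notify callback).  The
 * function and variable names below are hypothetical.
 */
static boolean_t
example_client_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint)
{
        mblk_t *unsent = NULL;
        mac_tx_cookie_t cookie;

        cookie = mac_tx(mch, mp_chain, hint, MAC_TX_NO_ENQUEUE, &unsent);
        if (cookie != NULL) {
                /* Flow-controlled: stash 'unsent' and retry later. */
                return (B_FALSE);
        }
        return (B_TRUE);
}
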
/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
        mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
        int cnt;
        size_t sz;
        mblk_t *tail;
        mac_tx_cookie_t cookie = NULL;

        ASSERT(ringp != NULL);
        ASSERT(mp_chain != NULL);
        ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
        /*
         * Only two modes can come here; either it can be
         * SRS_TX_BW_FANOUT or SRS_TX_FANOUT.
         */
        ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
            mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

        if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
                /* Serialization mode */

                mutex_enter(&ringp->s_ring_lock);
                if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
                        cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                            flag, ret_mp);
                        mutex_exit(&ringp->s_ring_lock);
                        return (cookie);
                }
                MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
                TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
                if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
                        /*
                         * If the ring is blocked due to a lack of Tx
                         * descriptors, just return. The worker thread
                         * will get scheduled when Tx descriptors
                         * become available.
                         */
                        mutex_exit(&ringp->s_ring_lock);
                        return (cookie);
                }
                mac_soft_ring_worker_wakeup(ringp);
                mutex_exit(&ringp->s_ring_lock);
                return (cookie);
        } else {
                /* Default fanout mode */
                /*
                 * S_RING_BLOCK is set when the underlying NIC runs
                 * out of Tx descriptors and messages start getting
                 * queued. It won't get reset until
                 * mac_tx_soft_ring_drain() completely drains out the
                 * messages.
                 */
                boolean_t is_subflow;
                mac_tx_stats_t stats;

                if (ringp->s_ring_state & S_RING_ENQUEUED) {
                        /* Tx descriptors/resources not available */
                        mutex_enter(&ringp->s_ring_lock);
                        if (ringp->s_ring_state & S_RING_ENQUEUED) {
                                cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                                    flag, ret_mp);
                                mutex_exit(&ringp->s_ring_lock);
                                return (cookie);
                        }
                        /*
                         * While we were computing the mblk count, the
                         * flow control condition got relieved.
                         * Continue with the transmission.
                         */
                        mutex_exit(&ringp->s_ring_lock);
                }
                is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

                mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
                    ringp->s_ring_tx_arg2, mp_chain,
                    (is_subflow ? &stats : NULL));

                /*
                 * Multiple threads could be here sending packets.
                 * Under such conditions, it is not possible to
                 * atomically set the S_RING_BLOCK bit to indicate an
                 * out-of-Tx-descriptor condition. To set it
                 * atomically, we queue the returned packet and do
                 * the setting of S_RING_BLOCK in
                 * mac_tx_soft_ring_drain().
                 */
                if (mp_chain != NULL) {
                        mutex_enter(&ringp->s_ring_lock);
                        cookie =
                            mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
                        mutex_exit(&ringp->s_ring_lock);
                        return (cookie);
                }
                if (is_subflow) {
                        FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
                }
                return (NULL);
        }
}

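/*
 * Illustrative sketch only: a fanout-mode caller is expected to pick one
 * Tx soft ring (typically by hashing the fanout hint) and then hand the
 * chain to mac_tx_soft_ring_process(), propagating any flow-control
 * cookie back up.  The ring-selection policy and all names below are
 * hypothetical; the real selection is done in mac_tx_fanout_mode().
 */
static mac_tx_cookie_t
example_fanout_tx(mac_soft_ring_t **rings, uint_t nrings, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
        /* Hash the hint to spread flows across the available soft rings. */
        mac_soft_ring_t *ringp = rings[fanout_hint % nrings];

        return (mac_tx_soft_ring_process(ringp, mp_chain, flag, ret_mp));
}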