/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
	mac_tx_srs_mode_t	mac_tx_mode;
	mac_tx_func_t		mac_tx_func;
} mac_tx_mode_t;

/*
 * There are five modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, etc.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
	{SRS_TX_BW,		mac_tx_bw_mode},
	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The run time code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout etc.
 *
 * We try to use H/W classification on the NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always a SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to
 * the S/W classifier and tie a SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the
 * remainder of the tick.
 * The SRS poll thread only polls for bytes that are
 * allowed to come in the SRS. We typically let 4x the configured
 * B/W worth of packets come in the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to the respective SRS.
 * The S/W classification rule is always set up and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRS's are used on both Tx and Rx side. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that the Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each Soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only Max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as the B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing plus we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or let the SRS worker thread do the processing. This applies if
 * the SRS was not being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing.
 * For throughput under load, this should be false.
 *
 * This (and other similar) tunable should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;

/*
 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue an mp or chain in the soft ring set and increment the
 * local count (srs_count) for the SRS and the shared counter
 * (srs_poll_pkt_cnt - shared between SRS and its soft rings
 * to track the total unprocessed packets for polling to work
 * correctly).
 *
 * The size (total bytes queued) counters are incremented only
 * if we are doing B/W control.
 */
#define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if ((mac_srs)->srs_last != NULL) \
		(mac_srs)->srs_last->b_next = (head); \
	else \
		(mac_srs)->srs_first = (head); \
	(mac_srs)->srs_last = (tail); \
	(mac_srs)->srs_count += count; \
}

#define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx; \
 \
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
	srs_rx->sr_poll_pkt_cnt += count; \
	ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
		(mac_srs)->srs_size += (sz); \
		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
		(mac_srs)->srs_bw->mac_bw_sz += (sz); \
		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
	} \
}

#define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
	mac_srs->srs_state |= SRS_ENQUEUED; \
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
		(mac_srs)->srs_size += (sz); \
		(mac_srs)->srs_bw->mac_bw_sz += (sz); \
	} \
}

/*
 * Turn polling on routines
 */
#define	MAC_SRS_POLLING_ON(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if (((mac_srs)->srs_state & \
	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
		(mac_srs)->srs_state |= SRS_POLLING; \
		(void) mac_hwring_disable_intr((mac_ring_handle_t) \
		    (mac_srs)->srs_ring); \
		(mac_srs)->srs_rx.sr_poll_on++; \
	} \
}

#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if (((mac_srs)->srs_state & \
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
	    (SRS_POLLING_CAPAB|SRS_WORKER)) { \
		(mac_srs)->srs_state |= SRS_POLLING; \
		(void) mac_hwring_disable_intr((mac_ring_handle_t) \
		    (mac_srs)->srs_ring); \
		(mac_srs)->srs_rx.sr_worker_poll_on++; \
	} \
}

/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * The poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
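 *
 * Purely illustrative usage sketch (added for clarity, not a new code
 * path): the typical call site in mac_rx_srs_drain() below only fires
 * when the worker owns the drain and the backlog has dipped to the
 * low water mark, e.g.
 *
 *	if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
 *		MAC_SRS_POLL_RING(mac_srs);
 *
 * If SRS_GET_PKTS is already set, the macro only bumps
 * sr_poll_thr_busy and leaves the poll thread alone.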
 */
#define	MAC_SRS_POLL_RING(mac_srs) { \
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx; \
 \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	srs_rx->sr_poll_thr_sig++; \
	if (((mac_srs)->srs_state & \
	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
	    (SRS_WORKER|SRS_POLLING_CAPAB)) { \
		(mac_srs)->srs_state |= SRS_GET_PKTS; \
		cv_signal(&(mac_srs)->srs_cv); \
	} else { \
		srs_rx->sr_poll_thr_busy++; \
	} \
}

/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if the next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come in the
 * system.
 */
#define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	ASSERT(((mac_srs)->srs_type & SRST_TX) || \
	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
	clock_t now = ddi_get_lbolt(); \
	if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \
		(mac_srs)->srs_bw->mac_bw_curr_time = now; \
		(mac_srs)->srs_bw->mac_bw_used = 0; \
		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
	} \
}

/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing the queue. If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before the worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) { \
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
	if (!((mac_srs)->srs_state & SRS_PROC) && \
	    (mac_srs)->srs_tid == NULL) { \
		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \
		    (mac_srs_worker_wakeup_ticks == 0)) \
			cv_signal(&(mac_srs)->srs_async); \
		else \
			(mac_srs)->srs_tid = \
			    timeout(mac_srs_fire, (mac_srs), \
			    mac_srs_worker_wakeup_ticks); \
	} \
}

#define	TX_SINGLE_RING_MODE(mac_srs) \
	((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \
	    (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW)

#define	TX_BANDWIDTH_MODE(mac_srs) \
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)

#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
	uint_t hash, indx; \
	hash = HASH_HINT(hint); \
	indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \
	softring = mac_srs->srs_oth_soft_rings[indx]; \
	(void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \
}

/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp) { \
	ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
	if ((srs)->srs_tx.st_woken_up) { \
		(srs)->srs_tx.st_woken_up = B_FALSE; \
	} else { \
		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
		(srs)->srs_state |= SRS_TX_BLOCKED; \
		(srs)->srs_tx.st_blocked_cnt++; \
	} \
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto a Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
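 *
 * Rough decision sketch (mirrors the macro below; it adds no new
 * behaviour):
 *
 *	srs_count > st_max_q_cnt : drop the chain, bump st_drop_count
 *	srs_count > st_hiwat     : set SRS_TX_HIWAT, hand back the srs
 *				   as a mac_tx_cookie_t, still enqueue
 *	otherwise                : just enqueue the chain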
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
	boolean_t enqueue = 1; \
 \
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
		/* \
		 * flow-controlled. Store srs in cookie so that it \
		 * can be returned as mac_tx_cookie_t to client \
		 */ \
		(srs)->srs_state |= SRS_TX_HIWAT; \
		cookie = (mac_tx_cookie_t)srs; \
		(srs)->srs_tx.st_hiwat_cnt++; \
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
			/* increment freed stats */ \
			(srs)->srs_tx.st_drop_count += cnt; \
			/* \
			 * b_prev may be set to the fanout hint \
			 * hence can't use freemsg directly \
			 */ \
			mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
			DTRACE_PROBE1(tx_queued_hiwat, \
			    mac_soft_ring_set_t *, srs); \
			enqueue = 0; \
		} \
	} \
	if (enqueue) \
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
}

/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs) \
	if (!(srs->srs_type & SRST_TX)) \
		mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs) \
	if (!(srs->srs_type & SRST_TX)) \
		mutex_exit(&srs->srs_bw->mac_bw_lock);

#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
	mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
	/* increment freed stats */ \
	mac_srs->srs_tx.st_drop_count++; \
	cookie = (mac_tx_cookie_t)srs; \
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \
	cookie = (mac_tx_cookie_t)srs; \
	*ret_mp = mp_chain; \
}

/*
 * Drop the Rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
	mac_srs_rx_t	*srs_rx = &srs->srs_rx;

	ASSERT(mp->b_next == NULL);
	mutex_enter(&srs->srs_lock);
	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
	mutex_exit(&srs->srs_lock);

	srs_rx->sr_drop_count++;
	freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_tid == 0) {
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_tid = 0;
	if (!(mac_srs->srs_state & SRS_PROC))
		cv_signal(&mac_srs->srs_async);

	mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is the fanout_hint (a uint64_t) given by the TCP/IP stack;
 * it is used on the Tx path.
 */
#define	HASH_HINT(hint)	(((hint) << 17) | ((hint) >> 16))

/*
 * Hash based on the source address and the port information.
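 *
 * Illustrative example (mirrors how the Rx fanout code below uses
 * these macros; it adds no new logic): the 32-bit word holding the
 * two 16-bit ports is xored, byte by byte, with the host-order source
 * address and the result is reduced modulo the soft ring count, e.g.
 *
 *	hash = HASH_ADDR(ipha->ipha_src,
 *	    *(uint32_t *)(mp->b_rptr + ports_offset));
 *	indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);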
 */
#define	HASH_ADDR(src, ports) \
	(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
	((ports) >> 8) ^ (ports))

#define	COMPUTE_INDEX(key, sz)	(key % sz)

#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
	if ((tail) != NULL) { \
		ASSERT((tail)->b_next == NULL); \
		(tail)->b_next = (mp); \
	} else { \
		ASSERT((head) == NULL); \
		(head) = (mp); \
	} \
	(tail) = (mp); \
	(cnt)++; \
	if ((bw_ctl)) \
		(sz) += (sz0); \
}

#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
int mac_fanout_type = MAC_FANOUT_DEFAULT;

#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
	V4_TCP = 0,
	V4_UDP,
	OTH,
	UNDEF
};

/*
 * In general we do port based hashing to spread traffic over different
 * softrings. The tunable below allows overriding that behavior. Setting
 * it to B_TRUE does a fanout based on the source IPv6 address. This
 * behavior is also applicable to IPv6 packets carrying multiple optional
 * headers and other uncommon packet types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;

/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4

/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined to an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined for the TCP, UDP or OTH soft ring. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain; otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up etc.
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	size_t				hdrsize;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES];
	mblk_t				*tailmp[MAX_SR_TYPES];
	int				cnt[MAX_SR_TYPES];
	size_t				sz[MAX_SR_TYPES];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and it's a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs. Another way of disabling bypass is to set the
	 * MCIS_RX_BYPASS_DISABLE flag.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * sizeof (int));
	bzero(sz, MAX_SR_TYPES * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 *
		 * In the normal case the packet will have at least the L2
		 * header and the IP + Transport header in the same mblk.
		 * This is usually the case when the NIC driver sends up
		 * the packet. This is also true when the stack generates
		 * a packet that is looped back and when the stack uses the
		 * fastpath mechanism. The normal case is optimized for
		 * performance and may bypass DLS. All other cases go through
		 * the 'OTH' type path without DLS bypass.
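		 *
		 * A rough sketch of the layout the fast path assumes
		 * (purely illustrative, no extra checks are added here):
		 *
		 *	b_rptr
		 *	|- L2 header (hdrsize) -|- ipha_t -|- TCP/UDP ports -|
		 *
		 * Anything that does not look like this (split mblks,
		 * unaligned ipha, etc.) is caught by
		 * MBLK_RX_FANOUT_SLOWPATH() below and takes the 'OTH' path.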
		 */
		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
			type = OTH;

		if (type == OTH) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);
		/*
		 * We look for at least 4 bytes past the IP header to get
		 * the port information. If we get an IP fragment, we don't
		 * have the port information, and we use just the protocol
		 * information.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			type = V4_TCP;
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			type = OTH;
			break;
		}

		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
		    bw_ctl, sz[type], sz1, mp);
	}

	for (type = V4_TCP; type < UNDEF; type++) {
		if (headmp[type] != NULL) {
			mac_soft_ring_t	*softring;

			ASSERT(tailmp[type]->b_next == NULL);
			switch (type) {
			case V4_TCP:
				softring = mac_srs->srs_tcp_soft_rings[0];
				break;
			case V4_UDP:
				softring = mac_srs->srs_udp_soft_rings[0];
				break;
			case OTH:
				softring = mac_srs->srs_oth_soft_rings[0];
			}
			mac_rx_soft_ring_process(mcip, softring,
			    headmp[type], tailmp[type], cnt[type], sz[type]);
		}
	}
}

int fanout_unalligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for IPv6
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
    uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
	ip6_t		*ip6h;
	uint8_t		*whereptr;
	uint_t		hash;
	uint16_t	remlen;
	uint8_t		nexthdr;
	uint16_t	hdr_len;

	if (sap == ETHERTYPE_IPV6) {
		boolean_t modifiable = B_TRUE;

		ASSERT(MBLKL(mp) >= hdrsize);

		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
		if ((unsigned char *)ip6h == mp->b_wptr) {
			/*
			 * The first mblk_t only includes the mac header.
			 * Note that it is safe to change the mp pointer here,
			 * as the subsequent operation does not assume mp
			 * points to the start of the mac header.
			 */
			mp = mp->b_cont;

			/*
			 * Make sure ip6h holds the full ip6_t structure.
			 */
			if (mp == NULL)
				return (-1);

			if (MBLKL(mp) < IPV6_HDR_LEN) {
				modifiable = (DB_REF(mp) == 1);

				if (modifiable &&
				    !pullupmsg(mp, IPV6_HDR_LEN)) {
					return (-1);
				}
			}

			ip6h = (ip6_t *)mp->b_rptr;
		}

		if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
		    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
			/*
			 * If either ip6h is not aligned, or ip6h does not
			 * hold the complete ip6_t structure (a pullupmsg()
			 * is not an option since it would result in an
			 * unaligned ip6h), fanout to the default ring. Note
			 * that this may cause packet reordering.
			 */
			*indx = 0;
			*type = OTH;
			fanout_unalligned++;
			return (0);
		}

		remlen = ntohs(ip6h->ip6_plen);
		nexthdr = ip6h->ip6_nxt;

		if (remlen < MIN_EHDR_LEN)
			return (-1);
		/*
		 * Do src based fanout if the tunable below is set to B_TRUE or
		 * when mac_ip_hdr_length_v6() fails because of malformed
		 * packets or because mblks need to be concatenated using
		 * pullupmsg().
		 */
		if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h,
		    &hdr_len, &nexthdr, NULL, NULL)) {
			goto src_based_fanout;
		}
		whereptr = (uint8_t *)ip6h + hdr_len;

		/* If the transport is one of below, we do port based fanout */
		switch (nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			/*
			 * If the ports in the transport header are not part of
			 * the mblk, do src_based_fanout, instead of calling
			 * pullupmsg().
			 */
			if (mp->b_cont != NULL &&
			    whereptr + PORTS_SIZE > mp->b_wptr) {
				goto src_based_fanout;
			}
			break;
		default:
			break;
		}

		switch (nexthdr) {
		case IPPROTO_TCP:
			hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
			    *(uint32_t *)whereptr);
			*indx = COMPUTE_INDEX(hash,
			    mac_srs->srs_tcp_ring_count);
			*type = OTH;
			break;

		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
				    *(uint32_t *)whereptr);
				*indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				*indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			*type = OTH;
			break;

		/* For all other protocols, do source based fanout */
		default:
			goto src_based_fanout;
		}
	} else {
		*indx = 0;
		*type = OTH;
	}
	return (0);

src_based_fanout:
	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
	*type = OTH;
	return (0);
}

/*
 * mac_rx_srs_fanout
 *
 * This routine delivers packets destined to an SRS into a soft ring member
 * of the set.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain; otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then the next packet comes in forcing it to wake up etc.
 *
 * Note:
 * Since we know the maximum fanout possible, we create a 2D array
 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 * variables so that we can enter the softrings with a chain. We need the
 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 * for each packet would be expensive). If we ever want to have the
 * ability to have unlimited fanout, we should probably declare a head,
 * tail, cnt, sz with each soft ring (a data struct which contains a softring
 * along with these members) and create an array of this uber struct so we
 * don't have to do kmem_alloc.
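 *
 * For illustration only (this mirrors the loops below and adds no new
 * logic): each packet is appended to its (type, indx) bucket via
 *
 *	FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
 *	    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
 *
 * and once the whole chain has been classified, every non-empty bucket
 * is handed to its soft ring in a single mac_rx_soft_ring_process()
 * call.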
 */
int fanout_oth1 = 0;
int fanout_oth2 = 0;
int fanout_oth3 = 0;
int fanout_oth4 = 0;
int fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	uint_t				indx;
	size_t				ports_offset;
	size_t				ipha_len;
	size_t				hdrsize;
	uint_t				hash;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	int				fanout_cnt;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and it's a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs. Another way of disabling bypass is to set the
	 * MCIS_RX_BYPASS_DISABLE flag.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

	/*
	 * Since the softrings are never destroyed and we always
	 * create an equal number of softrings for TCP, UDP and the rest,
	 * it's OK to check one of them for count and use it without
	 * any lock. In future, if soft rings get destroyed because
	 * of reduction in fanout, we will need to ensure that happens
	 * behind the SRS_PROC.
	 */
	fanout_cnt = mac_srs->srs_tcp_ring_count;

	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		/*
		 * If we are using the default Rx ring where H/W or S/W
		 * classification has not happened, we need to verify if
		 * this unicast packet really belongs to us.
		 */
		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
			type = OTH;
			fanout_oth1++;
		}

		if (type != OTH) {
			uint16_t frag_offset_flags;

			switch (ipha->ipha_protocol) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
			case IPPROTO_ESP:
				ipha_len = IPH_HDR_LENGTH(ipha);
				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
				    mp->b_wptr) {
					type = OTH;
					break;
				}
				frag_offset_flags =
				    ntohs(ipha->ipha_fragment_offset_and_flags);
				if ((frag_offset_flags &
				    (IPH_MF | IPH_OFFSET)) != 0) {
					type = OTH;
					fanout_oth3++;
					break;
				}
				ports_offset = hdrsize + ipha_len;
				break;
			default:
				type = OTH;
				fanout_oth4++;
				break;
			}
		}

		if (type == OTH) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);

		/*
		 * XXX-Sunay: We should hold srs_lock since ring_count
		 * below can change. But if we are always called from
		 * mac_rx_srs_drain and SRS_PROC is set, then we can
		 * enforce that ring_count can't be changed i.e.
		 * to change fanout type or ring count, the calling
		 * thread needs to be behind SRS_PROC.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			/*
			 * Note that for ESP, we fanout on SPI and it is at the
			 * same offset as the 2x16-bit ports. So it is clumped
			 * along with TCP, UDP and SCTP.
			 */
			hash = HASH_ADDR(ipha->ipha_src,
			    *(uint32_t *)(mp->b_rptr + ports_offset));
			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
			type = V4_TCP;
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(ipha->ipha_src,
				    *(uint32_t *)(mp->b_rptr + ports_offset));
				indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			type = V4_UDP;
			mp->b_rptr += hdrsize;
			break;
		default:
			indx = 0;
			type = OTH;
		}

		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
	}

	for (type = V4_TCP; type < UNDEF; type++) {
		int	i;

		for (i = 0; i < fanout_cnt; i++) {
			if (headmp[type][i] != NULL) {
				mac_soft_ring_t	*softring;

				ASSERT(tailmp[type][i]->b_next == NULL);
				switch (type) {
				case V4_TCP:
					softring =
					    mac_srs->srs_tcp_soft_rings[i];
					break;
				case V4_UDP:
					softring =
					    mac_srs->srs_udp_soft_rings[i];
					break;
				case OTH:
					softring =
					    mac_srs->srs_oth_soft_rings[i];
					break;
				}
				mac_rx_soft_ring_process(mcip,
				    softring, headmp[type][i], tailmp[type][i],
				    cnt[type][i], sz[type][i]);
			}
		}
	}
}

#define	SRS_BYTES_TO_PICKUP	150000
ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;

/*
 * mac_rx_srs_poll_ring
 *
 * The SRS poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker thread
 * to do the remaining processing.
 *
 * Since packets come into the system via interrupt or poll path, we also
 * update the stats and deal with promiscuous clients here.
 */
void
mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_cv;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mblk_t			*head, *tail, *mp;
	callb_cpr_t		cprinfo;
	ssize_t			bytes_to_pickup;
	size_t			sz;
	int			count;
	mac_client_impl_t	*smcip;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
	mutex_enter(lock);

start:
	for (;;) {
		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(async, lock);
		CALLB_CPR_SAFE_END(&cprinfo, lock);

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

check_again:
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			/*
			 * We pick as many bytes as we are allowed to queue.
			 * It's possible that we will exceed the total
			 * packets queued in case this SRS is part of the
			 * Rx ring group since > 1 poll thread can be pulling
			 * up to the max allowed packets at the same time
			 * but that should be OK.
			 */
			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
			bytes_to_pickup =
			    mac_srs->srs_bw->mac_bw_drop_threshold -
			    mac_srs->srs_bw->mac_bw_sz;
			/*
			 * We shouldn't have been signalled if we
			 * have 0 or less bytes to pick but since
			 * some of the bytes accounting is driver
			 * dependent, we do the safety check.
			 */
			if (bytes_to_pickup < 0)
				bytes_to_pickup = 0;
			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		} else {
			/*
			 * TODO: Need to change the polling API
			 * to add a packet count and a flag which
			 * tells the driver whether we want packets
			 * based on a count, or bytes, or all the
			 * packets queued in the driver/HW. This
			 * way, we never have to check the limits
			 * on the poll path. We truly let only as many
			 * packets enter the system as we are willing
			 * to process or queue.
			 *
			 * Something along the lines of
			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
			 *	mac_srs->srs_poll_pkt_cnt
			 */

			/*
			 * Since we are not doing B/W control, pick
			 * as many packets as allowed.
			 */
			bytes_to_pickup = max_bytes_to_pickup;
		}

		/* Poll the underlying Hardware */
		mutex_exit(lock);
		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
		mutex_enter(lock);

		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER);

		mp = tail = head;
		count = 0;
		sz = 0;
		while (mp != NULL) {
			tail = mp;
			sz += msgdsize(mp);
			mp = mp->b_next;
			count++;
		}

		if (head != NULL) {
			tail->b_next = NULL;
			smcip = mac_srs->srs_mcip;

			if ((mac_srs->srs_type & SRST_FLOW) ||
			    (smcip == NULL)) {
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    rbytes, sz);
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    ipackets, count);
			}

			/*
			 * If there are any promiscuous mode callbacks
			 * defined for this MAC client, pass them a copy
			 * if appropriate and also update the counters.
			 */
			if (smcip != NULL) {
				smcip->mci_stat_ibytes += sz;
				smcip->mci_stat_ipackets += count;

				if (smcip->mci_mip->mi_promisc_list != NULL) {
					mutex_exit(lock);
					mac_promisc_dispatch(smcip->mci_mip,
					    head, NULL);
					mutex_enter(lock);
				}
			}
			if (mac_srs->srs_type & SRST_BW_CONTROL) {
				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
				mac_srs->srs_bw->mac_bw_polled += sz;
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
			}
			srs_rx->sr_poll_count += count;
			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
			    count, sz);
			if (count <= 10)
				srs_rx->sr_chain_cnt_undr10++;
			else if (count > 10 && count <= 50)
				srs_rx->sr_chain_cnt_10to50++;
			else
				srs_rx->sr_chain_cnt_over50++;
		}

		/*
		 * We are guaranteed that SRS_PROC will be set if we
		 * are here. Also, the poll thread gets to run only if
		 * the drain was being done by a worker thread, although
		 * it's possible that the worker thread is still running
		 * and the poll thread was sent down to keep the pipeline
		 * going instead of doing a complete drain and then
		 * trying to poll the NIC.
		 *
		 * So we need to check the SRS_WORKER flag to make sure
		 * that the worker thread is not processing the queue
		 * in parallel to us. The flags and conditions are
		 * protected by the srs_lock to prevent any race. We
		 * ensure that we don't drop the srs_lock from now
		 * till the end and similarly we don't drop the srs_lock
		 * in mac_rx_srs_drain() till similar condition checks
		 * are complete. The mac_rx_srs_drain() needs to ensure
		 * that the SRS_WORKER flag remains set as long as it's
		 * processing the queue.
		 */
		if (!(mac_srs->srs_state & SRS_WORKER) &&
		    (mac_srs->srs_first != NULL)) {
			/*
			 * We have packets to process and the worker thread
			 * is not running. Check to see if the poll thread is
			 * allowed to process.
			 */
			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
				if (!(mac_srs->srs_state & SRS_PAUSE) &&
				    srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat) {
					srs_rx->sr_poll_again++;
					goto check_again;
				}
				/*
				 * We are already above the low water mark
				 * so stay in the polling mode but no
				 * need to poll. Once we dip below
				 * the polling threshold, the processing
				 * thread (soft ring) will signal us
				 * to poll again (MAC_UPDATE_SRS_COUNT)
				 */
				srs_rx->sr_poll_drain_no_poll++;
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				/*
				 * In the B/W control case, it's possible
				 * that the backlog built up due to the
				 * B/W limit being reached and packets
				 * are queued only in SRS. In this case,
				 * we should schedule the worker thread
				 * since no one else will wake us up.
				 */
				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
				    (mac_srs->srs_tid == NULL)) {
					mac_srs->srs_tid =
					    timeout(mac_srs_fire, mac_srs, 1);
					srs_rx->sr_poll_worker_wakeup++;
				}
			} else {
				/*
				 * Wake up the worker thread for more
				 * processing. We optimize for throughput
				 * in this case.
				 */
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				MAC_SRS_WORKER_WAKEUP(mac_srs);
				srs_rx->sr_poll_sig_worker++;
			}
		} else if ((mac_srs->srs_first == NULL) &&
		    !(mac_srs->srs_state & SRS_WORKER)) {
			/*
			 * There is nothing queued in SRS and
			 * no worker thread running. Plus we
			 * didn't get anything from the H/W
			 * as well (head == NULL);
			 */
			ASSERT(head == NULL);
			mac_srs->srs_state &=
			    ~(SRS_PROC|SRS_GET_PKTS);

			/*
			 * If we have packets in the soft ring, don't allow
			 * more packets to come into this SRS by keeping the
			 * interrupts off but not polling the H/W. The
			 * poll thread will get signaled as soon as
			 * srs_poll_pkt_cnt dips below the poll threshold.
			 */
			if (srs_rx->sr_poll_pkt_cnt == 0) {
				srs_rx->sr_poll_intr_enable++;
				MAC_SRS_POLLING_OFF(mac_srs);
			} else {
				/*
				 * We know nothing is queued in SRS
				 * since we are here after checking
				 * srs_first is NULL. The backlog
				 * is entirely due to packets queued
				 * in the Soft ring which will wake us up
				 * and get the interface out of polling
				 * mode once the backlog dips below
				 * sr_poll_thres.
				 */
				srs_rx->sr_poll_no_poll++;
			}
		} else {
			/*
			 * Worker thread is already running.
			 * Nothing much to do. If the polling
			 * was enabled, the worker thread will deal
			 * with that.
			 */
			mac_srs->srs_state &= ~SRS_GET_PKTS;
			srs_rx->sr_poll_goto_sleep++;
		}
	}
done:
	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
	cv_signal(&mac_srs->srs_async);
	/*
	 * If this is a temporary quiesce then wait for the restart signal
	 * from the srs worker. Then clear the flags and signal the srs worker
	 * to ensure a positive handshake and go back to start.
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
		cv_wait(async, lock);
	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs->srs_state &=
		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
		cv_signal(&mac_srs->srs_async);
		goto start;
	} else {
		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
		cv_signal(&mac_srs->srs_async);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
	}
}

/*
 * mac_srs_pick_chain
 *
 * In the bandwidth control case, checks how many packets can be processed
 * and returns them in a sub chain.
 */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
	mblk_t	*head = NULL;
	mblk_t	*tail = NULL;
	size_t	sz;
	size_t	tsz = 0;
	int	cnt = 0;
	mblk_t	*mp;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
	    mac_srs->srs_bw->mac_bw_limit) ||
	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		head = mac_srs->srs_first;
		mac_srs->srs_first = NULL;
		*chain_tail = mac_srs->srs_last;
		mac_srs->srs_last = NULL;
		*chain_sz = mac_srs->srs_size;
		*chain_cnt = mac_srs->srs_count;
		mac_srs->srs_count = 0;
		mac_srs->srs_size = 0;
		return (head);
	}

	/*
	 * Can't clear the entire backlog.
	 * Need to find how many packets to pick
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
	while ((mp = mac_srs->srs_first) != NULL) {
		sz = msgdsize(mp);
		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
		    mac_srs->srs_bw->mac_bw_limit) {
			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
				mac_srs->srs_bw->mac_bw_state |=
				    SRS_BW_ENFORCED;
			break;
		}

		/*
		 * The _size & cnt is decremented from the softrings
		 * when they send up the packet for polling to work
		 * properly.
		 */
		tsz += sz;
		cnt++;
		mac_srs->srs_count--;
		mac_srs->srs_size -= sz;
		if (tail != NULL)
			tail->b_next = mp;
		else
			head = mp;
		tail = mp;
		mac_srs->srs_first = mac_srs->srs_first->b_next;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_first == NULL)
		mac_srs->srs_last = NULL;

	if (tail != NULL)
		tail->b_next = NULL;
	*chain_tail = tail;
	*chain_cnt = cnt;
	*chain_sz = tsz;

	return (head);
}

/*
 * mac_rx_srs_drain
 *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode,
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
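 *
 * A condensed sketch of the flow below (descriptive only, the code is
 * authoritative): with srs_lock held, the queued chain is detached
 * from srs_first/srs_last; if SRST_NO_SOFT_RINGS is set it is handed
 * directly to the client via srs_rx->sr_func, otherwise it is fanned
 * out through mac_rx_srs_fanout()/mac_rx_srs_proto_fanout(); finally
 * the routine decides whether to stay in poll mode, signal the poll
 * thread, or fall back to interrupt mode.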
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	if (mac_srs->srs_first == NULL)
		goto out;

	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
		/*
		 * In the normal case, the SRS worker thread does no
		 * work and we wait for a backlog to build up before
		 * we switch into polling mode. In case we are
		 * optimizing for throughput, we use the worker thread
		 * as well. The goal is to let the worker thread process
		 * the queue and the poll thread feed packets into
		 * the queue. As such, we should signal the poll
		 * thread to try and get more packets.
		 *
		 * We could have pulled this check into the POLL_RING
		 * macro itself but keeping it explicit here makes
		 * the architecture more human understandable.
		 */
		MAC_SRS_POLL_RING(mac_srs);
	}

again:
	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resource constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
	    (mac_srs->srs_first != NULL)) {
		/*
		 * More packets arrived while we were clearing the
		 * SRS. This can be possible because of one of
		 * three conditions below:
		 * 1) The driver is using multiple worker threads
		 *    to send the packets to us.
		 * 2) The driver has a race in switching
		 *    between interrupt and polling mode or
		 * 3) Packets are arriving in this SRS via the
		 *    S/W classification as well.
		 *
		 * We should switch to polling mode and see if we
		 * need to send the poll thread down. Also, signal
		 * the worker thread to process what's just arrived.
		 */
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
			srs_rx->sr_drain_poll_sig++;
			MAC_SRS_POLL_RING(mac_srs);
		}

		/*
		 * If we didn't signal the poll thread, we need
		 * to deal with the pending packets ourselves.
		 */
		if (proc_type == SRS_WORKER) {
			srs_rx->sr_drain_again++;
			goto again;
		} else {
			srs_rx->sr_drain_worker_sig++;
			cv_signal(&mac_srs->srs_async);
		}
	}

out:
	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave
		 * SRS_PROC set and hand over the control to
		 * the poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in the soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done by the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * the low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turns polling on only for the worker thread.
	 * It's not worth turning polling on for the interrupt
	 * thread (since the NIC will not issue another interrupt)
	 * unless a backlog builds up.
	 */
	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		srs_rx->sr_drain_keep_polling++;
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
			MAC_SRS_POLL_RING(mac_srs);
		return;
	}

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
	srs_rx->sr_drain_finish_intr++;
}

/*
 * mac_rx_srs_drain_bw
 *
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode,
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
 */
void
mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	size_t			sz = 0;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	clock_t			now;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
again:
	/* Check if we are doing B/W control */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	now = ddi_get_lbolt();
	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
		mac_srs->srs_bw->mac_bw_curr_time = now;
		mac_srs->srs_bw->mac_bw_used = 0;
		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto done;
	}

	sz = 0;
	cnt = 0;
	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
		/*
		 * We couldn't pick up a single packet.
		 */
		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
		    (mac_srs->srs_size != 0) &&
		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
			/*
			 * Seems like the configured B/W doesn't
			 * even allow processing of 1 packet
			 * per tick.
			 *
			 * XXX: raise the limit to processing
			 * at least 1 packet per tick.
1865 */ 1866 mac_srs->srs_bw->mac_bw_limit += 1867 mac_srs->srs_bw->mac_bw_limit; 1868 mac_srs->srs_bw->mac_bw_drop_threshold += 1869 mac_srs->srs_bw->mac_bw_drop_threshold; 1870 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1871 "raised B/W limit to %d since not even a " 1872 "single packet can be processed per " 1873 "tick %d\n", (void *)mac_srs, 1874 (int)mac_srs->srs_bw->mac_bw_limit, 1875 (int)msgdsize(mac_srs->srs_first)); 1876 } 1877 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1878 goto done; 1879 } 1880 1881 ASSERT(head != NULL); 1882 ASSERT(tail != NULL); 1883 1884 /* zero bandwidth: drop all and return to interrupt mode */ 1885 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1886 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1887 srs_rx->sr_drop_count += cnt; 1888 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1889 mac_srs->srs_bw->mac_bw_sz -= sz; 1890 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1891 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1892 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1893 goto leave_poll; 1894 } else { 1895 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1896 } 1897 1898 if ((tid = mac_srs->srs_tid) != 0) 1899 mac_srs->srs_tid = 0; 1900 1901 mac_srs->srs_state |= (SRS_PROC|proc_type); 1902 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1903 1904 /* 1905 * mcip is NULL for broadcast and multicast flows. The promisc 1906 * callbacks for broadcast and multicast packets are delivered from 1907 * mac_rx() and we don't need to worry about that case in this path 1908 */ 1909 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1910 mutex_exit(&mac_srs->srs_lock); 1911 mac_promisc_client_dispatch(mcip, head); 1912 mutex_enter(&mac_srs->srs_lock); 1913 } 1914 1915 /* 1916 * Check if SRS itself is doing the processing 1917 * This direct path does not apply when subflows are present. In this 1918 * case, packets need to be dispatched to a soft ring according to the 1919 * flow's bandwidth and other resources contraints. 1920 */ 1921 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1922 mac_direct_rx_t proc; 1923 void *arg1; 1924 mac_resource_handle_t arg2; 1925 1926 /* 1927 * This is the case when a Rx is directly 1928 * assigned and we have a fully classified 1929 * protocol chain. We can deal with it in 1930 * one shot. 1931 */ 1932 proc = srs_rx->sr_func; 1933 arg1 = srs_rx->sr_arg1; 1934 arg2 = srs_rx->sr_arg2; 1935 1936 mac_srs->srs_state |= SRS_CLIENT_PROC; 1937 mutex_exit(&mac_srs->srs_lock); 1938 if (tid != 0) { 1939 (void) untimeout(tid); 1940 tid = 0; 1941 } 1942 1943 proc(arg1, arg2, head, NULL); 1944 /* 1945 * Decrement the size and count here itelf 1946 * since the packet has been processed. 1947 */ 1948 mutex_enter(&mac_srs->srs_lock); 1949 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1950 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 1951 1952 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1953 cv_signal(&mac_srs->srs_client_cv); 1954 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1955 } else { 1956 /* Some kind of softrings based fanout is required */ 1957 mutex_exit(&mac_srs->srs_lock); 1958 if (tid != 0) { 1959 (void) untimeout(tid); 1960 tid = 0; 1961 } 1962 1963 /* 1964 * Since the fanout routines can deal with chains, 1965 * shoot the entire chain up. 1966 */ 1967 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1968 mac_rx_srs_fanout(mac_srs, head); 1969 else 1970 mac_rx_srs_proto_fanout(mac_srs, head); 1971 mutex_enter(&mac_srs->srs_lock); 1972 } 1973 1974 /* 1975 * Send the poll thread to pick up any packets arrived 1976 * so far. 
This also serves as the last check in case 1977 * nothing else is queued in the SRS. The poll thread 1978 * is signalled only in the case the drain was done 1979 * by the worker thread and SRS_WORKER is set. The 1980 * worker thread can run in parallel as long as the 1981 * SRS_WORKER flag is set. We we have nothing else to 1982 * process, we can exit while leaving SRS_PROC set 1983 * which gives the poll thread control to process and 1984 * cleanup once it returns from the NIC. 1985 * 1986 * If we have nothing else to process, we need to 1987 * ensure that we keep holding the srs_lock till 1988 * all the checks below are done and control is 1989 * handed to the poll thread if it was running. 1990 */ 1991 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1992 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1993 if (mac_srs->srs_first != NULL) { 1994 if (proc_type == SRS_WORKER) { 1995 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1996 if (srs_rx->sr_poll_pkt_cnt <= 1997 srs_rx->sr_lowat) 1998 MAC_SRS_POLL_RING(mac_srs); 1999 goto again; 2000 } else { 2001 cv_signal(&mac_srs->srs_async); 2002 } 2003 } 2004 } 2005 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2006 2007 done: 2008 2009 if (mac_srs->srs_state & SRS_GET_PKTS) { 2010 /* 2011 * Poll thread is already running. Leave the 2012 * SRS_RPOC set and hand over the control to 2013 * poll thread. 2014 */ 2015 mac_srs->srs_state &= ~proc_type; 2016 return; 2017 } 2018 2019 /* 2020 * If we can't process packets because we have exceeded 2021 * B/W limit for this tick, just set the timeout 2022 * and leave. 2023 * 2024 * Even if there are no packets queued in SRS, we 2025 * need to make sure that the shared counter is 2026 * clear and any associated softrings have cleared 2027 * all the backlog. Otherwise, leave the interface 2028 * in polling mode and the poll thread will get 2029 * signalled once the count goes down to zero. 2030 * 2031 * If someone is already draining the queue (SRS_PROC is 2032 * set) when the srs_poll_pkt_cnt goes down to zero, 2033 * then it means that drain is already running and we 2034 * will turn off polling at that time if there is 2035 * no backlog. As long as there are packets queued either 2036 * is soft ring set or its soft rings, we will leave 2037 * the interface in polling mode. 2038 */ 2039 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2040 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 2041 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 2042 (srs_rx->sr_poll_pkt_cnt > 0))) { 2043 MAC_SRS_POLLING_ON(mac_srs); 2044 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2045 if ((mac_srs->srs_first != NULL) && 2046 (mac_srs->srs_tid == NULL)) 2047 mac_srs->srs_tid = timeout(mac_srs_fire, 2048 mac_srs, 1); 2049 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2050 return; 2051 } 2052 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2053 2054 leave_poll: 2055 2056 /* Nothing else to do. Get out of poll mode */ 2057 MAC_SRS_POLLING_OFF(mac_srs); 2058 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2059 } 2060 2061 /* 2062 * mac_srs_worker 2063 * 2064 * The SRS worker routine. Drains the queue when no one else is 2065 * processing it. 
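 *
 * Stripped of the CPR, pause and quiesce handling, the loop below
 * is roughly (a sketch, not the actual code):
 *
 *	for (;;) {
 *		while (SRS being processed, or queue empty, or
 *		    B/W enforced, or Tx blocked)
 *			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
 *		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
 *	}
 *
 * with a 1-tick timeout armed before sleeping whenever the SRS is
 * bandwidth-throttled and no timer is pending, so the worker
 * revisits the queue on the next tick.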
2066 */ 2067 void 2068 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2069 { 2070 kmutex_t *lock = &mac_srs->srs_lock; 2071 kcondvar_t *async = &mac_srs->srs_async; 2072 callb_cpr_t cprinfo; 2073 boolean_t bw_ctl_flag; 2074 2075 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2076 mutex_enter(lock); 2077 2078 start: 2079 for (;;) { 2080 bw_ctl_flag = B_FALSE; 2081 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2082 MAC_SRS_BW_LOCK(mac_srs); 2083 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2084 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2085 bw_ctl_flag = B_TRUE; 2086 MAC_SRS_BW_UNLOCK(mac_srs); 2087 } 2088 /* 2089 * The SRS_BW_ENFORCED flag may change since we have dropped 2090 * the mac_bw_lock. However the drain function can handle both 2091 * a drainable SRS or a bandwidth controlled SRS, and the 2092 * effect of scheduling a timeout is to wakeup the worker 2093 * thread which in turn will call the drain function. Since 2094 * we release the srs_lock atomically only in the cv_wait there 2095 * isn't a fear of waiting for ever. 2096 */ 2097 while (((mac_srs->srs_state & SRS_PROC) || 2098 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2099 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2100 !(mac_srs->srs_state & SRS_PAUSE)) { 2101 /* 2102 * If we have packets queued and we are here 2103 * because B/W control is in place, we better 2104 * schedule the worker wakeup after 1 tick 2105 * to see if bandwidth control can be relaxed. 2106 */ 2107 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2108 /* 2109 * We need to ensure that a timer is already 2110 * scheduled or we force schedule one for 2111 * later so that we can continue processing 2112 * after this quanta is over. 2113 */ 2114 mac_srs->srs_tid = timeout(mac_srs_fire, 2115 mac_srs, 1); 2116 } 2117 wait: 2118 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2119 cv_wait(async, lock); 2120 CALLB_CPR_SAFE_END(&cprinfo, lock); 2121 2122 if (mac_srs->srs_state & SRS_PAUSE) 2123 goto done; 2124 if (mac_srs->srs_state & SRS_PROC) 2125 goto wait; 2126 2127 if (mac_srs->srs_first != NULL && 2128 mac_srs->srs_type & SRST_BW_CONTROL) { 2129 MAC_SRS_BW_LOCK(mac_srs); 2130 if (mac_srs->srs_bw->mac_bw_state & 2131 SRS_BW_ENFORCED) { 2132 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2133 } 2134 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2135 SRS_BW_ENFORCED; 2136 MAC_SRS_BW_UNLOCK(mac_srs); 2137 } 2138 } 2139 2140 if (mac_srs->srs_state & SRS_PAUSE) 2141 goto done; 2142 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2143 } 2144 done: 2145 /* 2146 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2147 * from both hard and soft classifications and waits for such threads 2148 * to finish before signaling the worker. So at this point the only 2149 * thread left that could be competing with the worker is the poll 2150 * thread. In the case of Tx, there shouldn't be any thread holding 2151 * SRS_PROC at this point. 
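 *
 * In outline, the code that follows:
 *
 *	- claims SRS_PROC (or notes that the poll thread still owns it),
 *	- calls mac_srs_worker_quiesce(),
 *	- waits for the initiator to set SRS_RESTART or SRS_CONDEMNED,
 *	- on SRS_RESTART: mac_srs_worker_restart() and resume the loop,
 *	- on SRS_CONDEMNED: quiesce once more if needed and thread_exit().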
2152 */ 2153 if (!(mac_srs->srs_state & SRS_PROC)) { 2154 mac_srs->srs_state |= SRS_PROC; 2155 } else { 2156 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2157 /* 2158 * Poll thread still owns the SRS and is still running 2159 */ 2160 ASSERT((mac_srs->srs_poll_thr == NULL) || 2161 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2162 SRS_POLL_THR_OWNER)); 2163 } 2164 mac_srs_worker_quiesce(mac_srs); 2165 /* 2166 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2167 * of the quiesce operation 2168 */ 2169 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2170 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2171 2172 if (mac_srs->srs_state & SRS_RESTART) { 2173 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2174 mac_srs_worker_restart(mac_srs); 2175 mac_srs->srs_state &= ~SRS_PROC; 2176 goto start; 2177 } 2178 2179 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2180 mac_srs_worker_quiesce(mac_srs); 2181 2182 mac_srs->srs_state &= ~SRS_PROC; 2183 /* The macro drops the srs_lock */ 2184 CALLB_CPR_EXIT(&cprinfo); 2185 thread_exit(); 2186 } 2187 2188 /* 2189 * mac_rx_srs_subflow_process 2190 * 2191 * Receive side routine called from interrupt path when there are 2192 * sub flows present on this SRS. 2193 */ 2194 /* ARGSUSED */ 2195 void 2196 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2197 mblk_t *mp_chain, boolean_t loopback) 2198 { 2199 flow_entry_t *flent = NULL; 2200 flow_entry_t *prev_flent = NULL; 2201 mblk_t *mp = NULL; 2202 mblk_t *tail = NULL; 2203 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2204 mac_client_impl_t *mcip; 2205 2206 mcip = mac_srs->srs_mcip; 2207 ASSERT(mcip != NULL); 2208 2209 /* 2210 * We need to determine the SRS for every packet 2211 * by walking the flow table, if we don't get any, 2212 * then we proceed using the SRS we came with. 2213 */ 2214 mp = tail = mp_chain; 2215 while (mp != NULL) { 2216 2217 /* 2218 * We will increment the stats for the mactching subflow. 2219 * when we get the bytes/pkt count for the classified packets 2220 * later in mac_rx_srs_process. 2221 */ 2222 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2223 FLOW_INBOUND, &flent); 2224 2225 if (mp == mp_chain || flent == prev_flent) { 2226 if (prev_flent != NULL) 2227 FLOW_REFRELE(prev_flent); 2228 prev_flent = flent; 2229 flent = NULL; 2230 tail = mp; 2231 mp = mp->b_next; 2232 continue; 2233 } 2234 tail->b_next = NULL; 2235 /* 2236 * A null indicates, this is for the mac_srs itself. 2237 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2238 */ 2239 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2240 mac_rx_srs_process(arg, 2241 (mac_resource_handle_t)mac_srs, mp_chain, 2242 loopback); 2243 } else { 2244 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2245 prev_flent->fe_cb_arg2, mp_chain, loopback); 2246 FLOW_REFRELE(prev_flent); 2247 } 2248 prev_flent = flent; 2249 flent = NULL; 2250 mp_chain = mp; 2251 tail = mp; 2252 mp = mp->b_next; 2253 } 2254 /* Last chain */ 2255 ASSERT(mp_chain != NULL); 2256 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2257 mac_rx_srs_process(arg, 2258 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2259 } else { 2260 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2261 prev_flent->fe_cb_arg2, mp_chain, loopback); 2262 FLOW_REFRELE(prev_flent); 2263 } 2264 } 2265 2266 /* 2267 * mac_rx_srs_process 2268 * 2269 * Receive side routine called from the interrupt path. 2270 * 2271 * loopback is set to force a context switch on the loopback 2272 * path between MAC clients. 
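 *
 * In outline, the function below:
 *
 *	1) walks the chain once to get its tail, packet count and size;
 *	2) updates the flow and client Rx statistics;
 *	3) if bandwidth controlled, drops the whole chain on a zero
 *	   limit, or queues only what fits under mac_bw_drop_threshold
 *	   and wakes the worker;
 *	4) otherwise drops the chain if sr_poll_pkt_cnt already exceeds
 *	   sr_hiwat, else queues it;
 *	5) if nobody is processing the SRS, signals the worker, or, for
 *	   latency-optimized non-loopback traffic, runs the drain
 *	   function inline.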
2273 */ 2274 /* ARGSUSED */ 2275 void 2276 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2277 boolean_t loopback) 2278 { 2279 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2280 mblk_t *mp, *tail, *head; 2281 int count = 0; 2282 int count1; 2283 size_t sz = 0; 2284 size_t chain_sz, sz1; 2285 mac_bw_ctl_t *mac_bw; 2286 mac_client_impl_t *smcip; 2287 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2288 2289 /* 2290 * Set the tail, count and sz. We set the sz irrespective 2291 * of whether we are doing B/W control or not for the 2292 * purpose of updating the stats. 2293 */ 2294 mp = tail = mp_chain; 2295 while (mp != NULL) { 2296 tail = mp; 2297 count++; 2298 sz += msgdsize(mp); 2299 mp = mp->b_next; 2300 } 2301 2302 mutex_enter(&mac_srs->srs_lock); 2303 smcip = mac_srs->srs_mcip; 2304 2305 if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) { 2306 FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz); 2307 FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count); 2308 } 2309 if (smcip != NULL) { 2310 smcip->mci_stat_ibytes += sz; 2311 smcip->mci_stat_ipackets += count; 2312 } 2313 2314 /* 2315 * If the SRS in already being processed; has been blanked; 2316 * can be processed by worker thread only; or the B/W limit 2317 * has been reached, then queue the chain and check if 2318 * worker thread needs to be awakend. 2319 */ 2320 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2321 mac_bw = mac_srs->srs_bw; 2322 ASSERT(mac_bw != NULL); 2323 mutex_enter(&mac_bw->mac_bw_lock); 2324 /* Count the packets and bytes via interrupt */ 2325 srs_rx->sr_intr_count += count; 2326 mac_bw->mac_bw_intr += sz; 2327 if (mac_bw->mac_bw_limit == 0) { 2328 /* zero bandwidth: drop all */ 2329 srs_rx->sr_drop_count += count; 2330 mac_bw->mac_bw_drop_bytes += sz; 2331 mutex_exit(&mac_bw->mac_bw_lock); 2332 mutex_exit(&mac_srs->srs_lock); 2333 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2334 return; 2335 } else { 2336 if ((mac_bw->mac_bw_sz + sz) <= 2337 mac_bw->mac_bw_drop_threshold) { 2338 mutex_exit(&mac_bw->mac_bw_lock); 2339 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2340 tail, count, sz); 2341 } else { 2342 mp = mp_chain; 2343 chain_sz = 0; 2344 count1 = 0; 2345 tail = NULL; 2346 head = NULL; 2347 while (mp != NULL) { 2348 sz1 = msgdsize(mp); 2349 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2350 mac_bw->mac_bw_drop_threshold) 2351 break; 2352 chain_sz += sz1; 2353 count1++; 2354 tail = mp; 2355 mp = mp->b_next; 2356 } 2357 mutex_exit(&mac_bw->mac_bw_lock); 2358 if (tail != NULL) { 2359 head = tail->b_next; 2360 tail->b_next = NULL; 2361 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2362 mp_chain, tail, count1, chain_sz); 2363 sz -= chain_sz; 2364 count -= count1; 2365 } else { 2366 /* Can't pick up any */ 2367 head = mp_chain; 2368 } 2369 if (head != NULL) { 2370 /* Drop any packet over the threshold */ 2371 srs_rx->sr_drop_count += count; 2372 mutex_enter(&mac_bw->mac_bw_lock); 2373 mac_bw->mac_bw_drop_bytes += sz; 2374 mutex_exit(&mac_bw->mac_bw_lock); 2375 freemsgchain(head); 2376 } 2377 } 2378 MAC_SRS_WORKER_WAKEUP(mac_srs); 2379 mutex_exit(&mac_srs->srs_lock); 2380 return; 2381 } 2382 } 2383 2384 /* 2385 * If the total number of packets queued in the SRS and 2386 * its associated soft rings exceeds the max allowed, 2387 * then drop the chain. If we are polling capable, this 2388 * shouldn't be happening. 
2389 */ 2390 if (!(mac_srs->srs_type & SRST_BW_CONTROL) && 2391 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) { 2392 mac_bw = mac_srs->srs_bw; 2393 srs_rx->sr_drop_count += count; 2394 mutex_enter(&mac_bw->mac_bw_lock); 2395 mac_bw->mac_bw_drop_bytes += sz; 2396 mutex_exit(&mac_bw->mac_bw_lock); 2397 freemsgchain(mp_chain); 2398 mutex_exit(&mac_srs->srs_lock); 2399 return; 2400 } 2401 2402 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz); 2403 /* Count the packets entering via interrupt path */ 2404 srs_rx->sr_intr_count += count; 2405 2406 if (!(mac_srs->srs_state & SRS_PROC)) { 2407 /* 2408 * If we are coming via loopback or if we are not 2409 * optimizing for latency, we should signal the 2410 * worker thread. 2411 */ 2412 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) { 2413 /* 2414 * For loopback, We need to let the worker take 2415 * over as we don't want to continue in the same 2416 * thread even if we can. This could lead to stack 2417 * overflows and may also end up using 2418 * resources (cpu) incorrectly. 2419 */ 2420 cv_signal(&mac_srs->srs_async); 2421 } else { 2422 /* 2423 * Seems like no one is processing the SRS and 2424 * there is no backlog. We also inline process 2425 * our packet if its a single packet in non 2426 * latency optimized case (in latency optimized 2427 * case, we inline process chains of any size). 2428 */ 2429 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST); 2430 } 2431 } 2432 mutex_exit(&mac_srs->srs_lock); 2433 } 2434 2435 /* TX SIDE ROUTINES (RUNTIME) */ 2436 2437 /* 2438 * mac_tx_srs_no_desc 2439 * 2440 * This routine is called by Tx single ring default mode 2441 * when Tx ring runs out of descs. 2442 */ 2443 mac_tx_cookie_t 2444 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2445 uint16_t flag, mblk_t **ret_mp) 2446 { 2447 mac_tx_cookie_t cookie = NULL; 2448 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2449 boolean_t wakeup_worker = B_TRUE; 2450 uint32_t tx_mode = srs_tx->st_mode; 2451 int cnt, sz; 2452 mblk_t *tail; 2453 2454 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2455 if (flag & MAC_DROP_ON_NO_DESC) { 2456 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2457 } else { 2458 if (mac_srs->srs_first != NULL) 2459 wakeup_worker = B_FALSE; 2460 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2461 if (flag & MAC_TX_NO_ENQUEUE) { 2462 /* 2463 * If TX_QUEUED is not set, queue the 2464 * packet and let mac_tx_srs_drain() 2465 * set the TX_BLOCKED bit for the 2466 * reasons explained above. Otherwise, 2467 * return the mblks. 2468 */ 2469 if (wakeup_worker) { 2470 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2471 mp_chain, tail, cnt, sz); 2472 } else { 2473 MAC_TX_SET_NO_ENQUEUE(mac_srs, 2474 mp_chain, ret_mp, cookie); 2475 } 2476 } else { 2477 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2478 tail, cnt, sz, cookie); 2479 } 2480 if (wakeup_worker) 2481 cv_signal(&mac_srs->srs_async); 2482 } 2483 return (cookie); 2484 } 2485 2486 /* 2487 * mac_tx_srs_enqueue 2488 * 2489 * This routine is called when Tx SRS is operating in either serializer 2490 * or bandwidth mode. In serializer mode, a packet will get enqueued 2491 * when a thread cannot enter SRS exclusively. In bandwidth mode, 2492 * packets gets queued if allowed byte-count limit for a tick is 2493 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and 2494 * MAC_TX_NO_ENQUEUE is set is different than when operaing in either 2495 * the default mode or fanout mode. 
Here packets get dropped or 2496 * returned back to the caller only after hi-watermark worth of data 2497 * is queued. 2498 */ 2499 static mac_tx_cookie_t 2500 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2501 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2502 { 2503 mac_tx_cookie_t cookie = NULL; 2504 int cnt, sz; 2505 mblk_t *tail; 2506 boolean_t wakeup_worker = B_TRUE; 2507 2508 /* 2509 * Ignore fanout hint if we don't have multiple tx rings. 2510 */ 2511 if (!TX_MULTI_RING_MODE(mac_srs)) 2512 fanout_hint = 0; 2513 2514 if (mac_srs->srs_first != NULL) 2515 wakeup_worker = B_FALSE; 2516 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2517 if (flag & MAC_DROP_ON_NO_DESC) { 2518 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2519 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2520 } else { 2521 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2522 mp_chain, tail, cnt, sz); 2523 } 2524 } else if (flag & MAC_TX_NO_ENQUEUE) { 2525 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2526 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2527 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2528 ret_mp, cookie); 2529 } else { 2530 mp_chain->b_prev = (mblk_t *)fanout_hint; 2531 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2532 mp_chain, tail, cnt, sz); 2533 } 2534 } else { 2535 /* 2536 * If you are BW_ENFORCED, just enqueue the 2537 * packet. srs_worker will drain it at the 2538 * prescribed rate. Before enqueueing, save 2539 * the fanout hint. 2540 */ 2541 mp_chain->b_prev = (mblk_t *)fanout_hint; 2542 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2543 tail, cnt, sz, cookie); 2544 } 2545 if (wakeup_worker) 2546 cv_signal(&mac_srs->srs_async); 2547 return (cookie); 2548 } 2549 2550 /* 2551 * There are five tx modes: 2552 * 2553 * 1) Default mode (SRS_TX_DEFAULT) 2554 * 2) Serialization mode (SRS_TX_SERIALIZE) 2555 * 3) Fanout mode (SRS_TX_FANOUT) 2556 * 4) Bandwdith mode (SRS_TX_BW) 2557 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2558 * 2559 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2560 * based on the number of Tx rings requested for an SRS and whether 2561 * bandwidth control is requested or not. 2562 * 2563 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a 2564 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying 2565 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS. 2566 * When flow-control is relieved, the srs_worker drains the queued 2567 * packets and informs blocked clients to restart sending packets. 2568 * 2569 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. 2570 * 2571 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2572 * Tx rings. Each Tx ring will have a soft ring associated with it. 2573 * These soft rings will be hung off the Tx SRS. Queueing if it happens 2574 * due to lack of Tx desc will be in individual soft ring (and not srs) 2575 * associated with Tx ring. 2576 * 2577 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2578 * only if bw is available. Otherwise the packets will be queued in 2579 * SRS. If fanout to multiple Tx rings is configured, the packets will 2580 * be fanned out among the soft rings associated with the Tx rings. 2581 * 2582 * Four flags are used in srs_state for indicating flow control 2583 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT. 2584 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the 2585 * driver below. 
2586 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat 2587 * and flow-control pressure is applied back to clients. The clients expect 2588 * wakeup when flow-control is relieved. 2589 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk 2590 * got returned back to client either due to lack of Tx descs or due to bw 2591 * control reasons. The clients expect a wakeup when condition is relieved. 2592 * 2593 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but 2594 * some clients set the following values too: MAC_DROP_ON_NO_DESC, 2595 * MAC_TX_NO_ENQUEUE 2596 * Mac clients that do not want packets to be enqueued in the mac layer set 2597 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or 2598 * Tx soft rings but instead get dropped when the NIC runs out of desc. The 2599 * behaviour of this flag is different when the Tx is running in serializer 2600 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet 2601 * get dropped when Tx high watermark is reached. 2602 * There are some mac clients like vsw, aggr that want the mblks to be 2603 * returned back to clients instead of being queued in Tx SRS (or Tx soft 2604 * rings) under flow-control (i.e., out of desc or exceeding bw limits) 2605 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set. 2606 * In the default and Tx fanout mode, the un-transmitted mblks will be 2607 * returned back to the clients when the driver runs out of Tx descs. 2608 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or 2609 * soft ring) so that the clients can be woken up when Tx desc become 2610 * available. When running in serializer or bandwidth mode mode, 2611 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached. 2612 */ 2613 2614 mac_tx_func_t 2615 mac_tx_get_func(uint32_t mode) 2616 { 2617 return (mac_tx_mode_list[mode].mac_tx_func); 2618 } 2619 2620 /* ARGSUSED */ 2621 static mac_tx_cookie_t 2622 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2623 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2624 { 2625 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2626 boolean_t is_subflow; 2627 mac_tx_stats_t stats; 2628 mac_tx_cookie_t cookie = NULL; 2629 2630 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 2631 2632 /* Regular case with a single Tx ring */ 2633 /* 2634 * SRS_TX_BLOCKED is set when underlying NIC runs 2635 * out of Tx descs and messages start getting 2636 * queued. It won't get reset until 2637 * tx_srs_drain() completely drains out the 2638 * messages. 2639 */ 2640 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2641 /* Tx descs/resources not available */ 2642 mutex_enter(&mac_srs->srs_lock); 2643 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2644 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2645 flag, ret_mp); 2646 mutex_exit(&mac_srs->srs_lock); 2647 return (cookie); 2648 } 2649 /* 2650 * While we were computing mblk count, the 2651 * flow control condition got relieved. 2652 * Continue with the transmission. 2653 */ 2654 mutex_exit(&mac_srs->srs_lock); 2655 } 2656 2657 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2658 2659 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2660 mp_chain, (is_subflow ? &stats : NULL)); 2661 2662 /* 2663 * Multiple threads could be here sending packets. 2664 * Under such conditions, it is not possible to 2665 * automically set SRS_TX_BLOCKED bit to indicate 2666 * out of tx desc condition. 
To atomically set 2667 * this, we queue the returned packet and do 2668 * the setting of SRS_TX_BLOCKED in 2669 * mac_tx_srs_drain(). 2670 */ 2671 if (mp_chain != NULL) { 2672 mutex_enter(&mac_srs->srs_lock); 2673 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2674 mutex_exit(&mac_srs->srs_lock); 2675 return (cookie); 2676 } 2677 2678 if (is_subflow) 2679 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2680 2681 return (NULL); 2682 } 2683 2684 /* 2685 * mac_tx_serialize_mode 2686 * 2687 * This is an experimental mode implemented as per the request of PAE. 2688 * In this mode, all callers attempting to send a packet to the NIC 2689 * will get serialized. Only one thread at any time will access the 2690 * NIC to send the packet out. 2691 */ 2692 /* ARGSUSED */ 2693 static mac_tx_cookie_t 2694 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2695 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2696 { 2697 boolean_t is_subflow; 2698 mac_tx_stats_t stats; 2699 mac_tx_cookie_t cookie = NULL; 2700 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2701 2702 /* Single ring, serialize below */ 2703 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2704 mutex_enter(&mac_srs->srs_lock); 2705 if ((mac_srs->srs_first != NULL) || 2706 (mac_srs->srs_state & SRS_PROC)) { 2707 /* 2708 * In serialization mode, queue all packets until 2709 * TX_HIWAT is set. 2710 * If drop bit is set, drop if TX_HIWAT is set. 2711 * If no_enqueue is set, still enqueue until hiwat 2712 * is set and return mblks after TX_HIWAT is set. 2713 */ 2714 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2715 flag, NULL, ret_mp); 2716 mutex_exit(&mac_srs->srs_lock); 2717 return (cookie); 2718 } 2719 /* 2720 * No packets queued, nothing on proc and no flow 2721 * control condition. Fast-path, ok. Do inline 2722 * processing. 2723 */ 2724 mac_srs->srs_state |= SRS_PROC; 2725 mutex_exit(&mac_srs->srs_lock); 2726 2727 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2728 2729 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2730 mp_chain, (is_subflow ? &stats : NULL)); 2731 2732 mutex_enter(&mac_srs->srs_lock); 2733 mac_srs->srs_state &= ~SRS_PROC; 2734 if (mp_chain != NULL) { 2735 cookie = mac_tx_srs_enqueue(mac_srs, 2736 mp_chain, flag, NULL, ret_mp); 2737 } 2738 if (mac_srs->srs_first != NULL) { 2739 /* 2740 * We processed inline our packet and a new 2741 * packet/s got queued while we were 2742 * processing. Wakeup srs worker 2743 */ 2744 cv_signal(&mac_srs->srs_async); 2745 } 2746 mutex_exit(&mac_srs->srs_lock); 2747 2748 if (is_subflow && cookie == NULL) 2749 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2750 2751 return (cookie); 2752 } 2753 2754 /* 2755 * mac_tx_fanout_mode 2756 * 2757 * In this mode, the SRS will have access to multiple Tx rings to send 2758 * the packet out. The fanout hint that is passed as an argument is 2759 * used to find an appropriate ring to fanout the traffic. Each Tx 2760 * ring, in turn, will have a soft ring associated with it. If a Tx 2761 * ring runs out of Tx desc's the returned packet will be queued in 2762 * the soft ring associated with that Tx ring. The srs itself will not 2763 * queue any packets. 
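 *
 * The hint-to-ring mapping used below (via MAC_TX_SOFT_RING_PROCESS)
 * is simply:
 *
 *	hash = HASH_HINT(fanout_hint);
 *	index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 *	softring = mac_srs->srs_oth_soft_rings[index];
 *
 * so every packet carrying the same hint lands on the same Tx ring
 * and ordering within a conversation is preserved. With a zero
 * hint, the hash is instead computed per packet from the headers
 * (mac_pkt_hash()) and the chain is cut into per-conversation
 * sub-chains.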
2764 */ 2765 2766 #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2767 index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 2768 softring = mac_srs->srs_oth_soft_rings[index]; \ 2769 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2770 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2771 } 2772 2773 static mac_tx_cookie_t 2774 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2775 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2776 { 2777 mac_soft_ring_t *softring; 2778 uint64_t hash; 2779 uint_t index; 2780 mac_tx_cookie_t cookie = NULL; 2781 2782 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2783 if (fanout_hint != 0) { 2784 /* 2785 * The hint is specified by the caller, simply pass the 2786 * whole chain to the soft ring. 2787 */ 2788 hash = HASH_HINT(fanout_hint); 2789 MAC_TX_SOFT_RING_PROCESS(mp_chain); 2790 } else { 2791 mblk_t *last_mp, *cur_mp, *sub_chain; 2792 uint64_t last_hash = 0; 2793 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2794 2795 /* 2796 * Compute the hash from the contents (headers) of the 2797 * packets of the mblk chain. Split the chains into 2798 * subchains of the same conversation. 2799 * 2800 * Since there may be more than one ring used for 2801 * sub-chains of the same call, and since the caller 2802 * does not maintain per conversation state since it 2803 * passed a zero hint, unsent subchains will be 2804 * dropped. 2805 */ 2806 2807 flag |= MAC_DROP_ON_NO_DESC; 2808 ret_mp = NULL; 2809 2810 ASSERT(ret_mp == NULL); 2811 2812 sub_chain = NULL; 2813 last_mp = NULL; 2814 2815 for (cur_mp = mp_chain; cur_mp != NULL; 2816 cur_mp = cur_mp->b_next) { 2817 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2818 B_TRUE); 2819 if (last_hash != 0 && hash != last_hash) { 2820 /* 2821 * Starting a different subchain, send current 2822 * chain out. 2823 */ 2824 ASSERT(last_mp != NULL); 2825 last_mp->b_next = NULL; 2826 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2827 sub_chain = NULL; 2828 } 2829 2830 /* add packet to subchain */ 2831 if (sub_chain == NULL) 2832 sub_chain = cur_mp; 2833 last_mp = cur_mp; 2834 last_hash = hash; 2835 } 2836 2837 if (sub_chain != NULL) { 2838 /* send last subchain */ 2839 ASSERT(last_mp != NULL); 2840 last_mp->b_next = NULL; 2841 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2842 } 2843 2844 cookie = NULL; 2845 } 2846 2847 return (cookie); 2848 } 2849 2850 /* 2851 * mac_tx_bw_mode 2852 * 2853 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2854 * only if bw is available. Otherwise the packets will be queued in 2855 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2856 * out to a Tx rings. 2857 */ 2858 static mac_tx_cookie_t 2859 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2860 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2861 { 2862 int cnt, sz; 2863 mblk_t *tail; 2864 mac_tx_cookie_t cookie = NULL; 2865 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2866 clock_t now; 2867 2868 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2869 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2870 mutex_enter(&mac_srs->srs_lock); 2871 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2872 /* 2873 * zero bandwidth, no traffic is sent: drop the packets, 2874 * or return the whole chain if the caller requests all 2875 * unsent packets back. 
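 *
 * The cookie handed back in the MAC_TX_NO_ENQUEUE case below is
 * simply the SRS pointer cast to mac_tx_cookie_t (soft rings return
 * their own pointer the same way); mac_tx_srs_drain() later passes
 * the same value to the registered Tx notify callbacks, so a client
 * can treat it as an opaque flow-control handle to match wakeups
 * against.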
2876 */ 2877 if (flag & MAC_TX_NO_ENQUEUE) { 2878 cookie = (mac_tx_cookie_t)mac_srs; 2879 *ret_mp = mp_chain; 2880 } else { 2881 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2882 } 2883 mutex_exit(&mac_srs->srs_lock); 2884 return (cookie); 2885 } else if ((mac_srs->srs_first != NULL) || 2886 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2887 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2888 fanout_hint, ret_mp); 2889 mutex_exit(&mac_srs->srs_lock); 2890 return (cookie); 2891 } 2892 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2893 now = ddi_get_lbolt(); 2894 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 2895 mac_srs->srs_bw->mac_bw_curr_time = now; 2896 mac_srs->srs_bw->mac_bw_used = 0; 2897 } else if (mac_srs->srs_bw->mac_bw_used > 2898 mac_srs->srs_bw->mac_bw_limit) { 2899 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2900 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2901 mp_chain, tail, cnt, sz); 2902 /* 2903 * Wakeup worker thread. Note that worker 2904 * thread has to be woken up so that it 2905 * can fire up the timer to be woken up 2906 * on the next tick. Also once 2907 * BW_ENFORCED is set, it can only be 2908 * reset by srs_worker thread. Until then 2909 * all packets will get queued up in SRS 2910 * and hence this this code path won't be 2911 * entered until BW_ENFORCED is reset. 2912 */ 2913 cv_signal(&mac_srs->srs_async); 2914 mutex_exit(&mac_srs->srs_lock); 2915 return (cookie); 2916 } 2917 2918 mac_srs->srs_bw->mac_bw_used += sz; 2919 mutex_exit(&mac_srs->srs_lock); 2920 2921 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2922 mac_soft_ring_t *softring; 2923 uint_t indx, hash; 2924 2925 hash = HASH_HINT(fanout_hint); 2926 indx = COMPUTE_INDEX(hash, 2927 mac_srs->srs_oth_ring_count); 2928 softring = mac_srs->srs_oth_soft_rings[indx]; 2929 return (mac_tx_soft_ring_process(softring, mp_chain, flag, 2930 ret_mp)); 2931 } else { 2932 boolean_t is_subflow; 2933 mac_tx_stats_t stats; 2934 2935 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2936 2937 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2938 mp_chain, (is_subflow ? 
&stats : NULL)); 2939 2940 if (mp_chain != NULL) { 2941 mutex_enter(&mac_srs->srs_lock); 2942 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2943 if (mac_srs->srs_bw->mac_bw_used > sz) 2944 mac_srs->srs_bw->mac_bw_used -= sz; 2945 else 2946 mac_srs->srs_bw->mac_bw_used = 0; 2947 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2948 fanout_hint, ret_mp); 2949 mutex_exit(&mac_srs->srs_lock); 2950 return (cookie); 2951 } 2952 if (is_subflow) 2953 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2954 2955 return (NULL); 2956 } 2957 } 2958 2959 /* ARGSUSED */ 2960 void 2961 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 2962 { 2963 mblk_t *head, *tail; 2964 size_t sz; 2965 uint32_t tx_mode; 2966 uint_t saved_pkt_count; 2967 boolean_t is_subflow; 2968 mac_tx_stats_t stats; 2969 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2970 clock_t now; 2971 2972 saved_pkt_count = 0; 2973 ASSERT(mutex_owned(&mac_srs->srs_lock)); 2974 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 2975 2976 mac_srs->srs_state |= SRS_PROC; 2977 2978 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2979 tx_mode = srs_tx->st_mode; 2980 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 2981 if (mac_srs->srs_first != NULL) { 2982 head = mac_srs->srs_first; 2983 tail = mac_srs->srs_last; 2984 saved_pkt_count = mac_srs->srs_count; 2985 mac_srs->srs_first = NULL; 2986 mac_srs->srs_last = NULL; 2987 mac_srs->srs_count = 0; 2988 mutex_exit(&mac_srs->srs_lock); 2989 2990 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2991 head, &stats); 2992 2993 mutex_enter(&mac_srs->srs_lock); 2994 if (head != NULL) { 2995 /* Device out of tx desc, set block */ 2996 if (head->b_next == NULL) 2997 VERIFY(head == tail); 2998 tail->b_next = mac_srs->srs_first; 2999 mac_srs->srs_first = head; 3000 mac_srs->srs_count += 3001 (saved_pkt_count - stats.ts_opackets); 3002 if (mac_srs->srs_last == NULL) 3003 mac_srs->srs_last = tail; 3004 MAC_TX_SRS_BLOCK(mac_srs, head); 3005 } else { 3006 srs_tx->st_woken_up = B_FALSE; 3007 if (is_subflow) { 3008 FLOW_TX_STATS_UPDATE( 3009 mac_srs->srs_flent, &stats); 3010 } 3011 } 3012 } 3013 } else if (tx_mode == SRS_TX_BW) { 3014 /* 3015 * We are here because the timer fired and we have some data 3016 * to tranmit. 
Also mac_tx_srs_worker should have reset 3017 * SRS_BW_ENFORCED flag 3018 */ 3019 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 3020 head = tail = mac_srs->srs_first; 3021 while (mac_srs->srs_first != NULL) { 3022 tail = mac_srs->srs_first; 3023 tail->b_prev = NULL; 3024 mac_srs->srs_first = tail->b_next; 3025 if (mac_srs->srs_first == NULL) 3026 mac_srs->srs_last = NULL; 3027 mac_srs->srs_count--; 3028 sz = msgdsize(tail); 3029 mac_srs->srs_size -= sz; 3030 saved_pkt_count++; 3031 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 3032 3033 if (mac_srs->srs_bw->mac_bw_used < 3034 mac_srs->srs_bw->mac_bw_limit) 3035 continue; 3036 3037 now = ddi_get_lbolt(); 3038 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3039 mac_srs->srs_bw->mac_bw_curr_time = now; 3040 mac_srs->srs_bw->mac_bw_used = sz; 3041 continue; 3042 } 3043 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3044 break; 3045 } 3046 3047 ASSERT((head == NULL && tail == NULL) || 3048 (head != NULL && tail != NULL)); 3049 if (tail != NULL) { 3050 tail->b_next = NULL; 3051 mutex_exit(&mac_srs->srs_lock); 3052 3053 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3054 head, &stats); 3055 3056 mutex_enter(&mac_srs->srs_lock); 3057 if (head != NULL) { 3058 uint_t size_sent; 3059 3060 /* Device out of tx desc, set block */ 3061 if (head->b_next == NULL) 3062 VERIFY(head == tail); 3063 tail->b_next = mac_srs->srs_first; 3064 mac_srs->srs_first = head; 3065 mac_srs->srs_count += 3066 (saved_pkt_count - stats.ts_opackets); 3067 if (mac_srs->srs_last == NULL) 3068 mac_srs->srs_last = tail; 3069 size_sent = sz - stats.ts_obytes; 3070 mac_srs->srs_size += size_sent; 3071 mac_srs->srs_bw->mac_bw_sz += size_sent; 3072 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 3073 mac_srs->srs_bw->mac_bw_used -= 3074 size_sent; 3075 } else { 3076 mac_srs->srs_bw->mac_bw_used = 0; 3077 } 3078 MAC_TX_SRS_BLOCK(mac_srs, head); 3079 } else { 3080 srs_tx->st_woken_up = B_FALSE; 3081 if (is_subflow) { 3082 FLOW_TX_STATS_UPDATE( 3083 mac_srs->srs_flent, &stats); 3084 } 3085 } 3086 } 3087 } else if (tx_mode == SRS_TX_BW_FANOUT) { 3088 mblk_t *prev; 3089 mac_soft_ring_t *softring; 3090 uint64_t hint; 3091 3092 /* 3093 * We are here because the timer fired and we 3094 * have some quota to tranmit. 
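 *
 * The loop below recovers the fanout hint that mac_tx_srs_enqueue()
 * stashed in b_prev and cuts the queue into per-hint sub-chains.
 * For illustration, a queue whose packets carry hints
 *
 *	h1 -> h1 -> h2 -> h2 -> h1
 *
 * is handed to TX_SRS_TO_SOFT_RING() as three chains, {h1,h1},
 * {h2,h2} and {h1}, each mapped to its own soft ring, until the
 * per-tick byte budget is exhausted and SRS_BW_ENFORCED is set
 * again.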
3095 */ 3096 prev = NULL; 3097 head = tail = mac_srs->srs_first; 3098 while (mac_srs->srs_first != NULL) { 3099 tail = mac_srs->srs_first; 3100 mac_srs->srs_first = tail->b_next; 3101 if (mac_srs->srs_first == NULL) 3102 mac_srs->srs_last = NULL; 3103 mac_srs->srs_count--; 3104 sz = msgdsize(tail); 3105 mac_srs->srs_size -= sz; 3106 mac_srs->srs_bw->mac_bw_used += sz; 3107 if (prev == NULL) 3108 hint = (ulong_t)tail->b_prev; 3109 if (hint != (ulong_t)tail->b_prev) { 3110 prev->b_next = NULL; 3111 mutex_exit(&mac_srs->srs_lock); 3112 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3113 head = tail; 3114 hint = (ulong_t)tail->b_prev; 3115 mutex_enter(&mac_srs->srs_lock); 3116 } 3117 3118 prev = tail; 3119 tail->b_prev = NULL; 3120 if (mac_srs->srs_bw->mac_bw_used < 3121 mac_srs->srs_bw->mac_bw_limit) 3122 continue; 3123 3124 now = ddi_get_lbolt(); 3125 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3126 mac_srs->srs_bw->mac_bw_curr_time = now; 3127 mac_srs->srs_bw->mac_bw_used = 0; 3128 continue; 3129 } 3130 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3131 break; 3132 } 3133 ASSERT((head == NULL && tail == NULL) || 3134 (head != NULL && tail != NULL)); 3135 if (tail != NULL) { 3136 tail->b_next = NULL; 3137 mutex_exit(&mac_srs->srs_lock); 3138 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3139 mutex_enter(&mac_srs->srs_lock); 3140 } 3141 } 3142 /* 3143 * SRS_TX_FANOUT case not considered here because packets 3144 * won't be queued in the SRS for this case. Packets will 3145 * be sent directly to soft rings underneath and if there 3146 * is any queueing at all, it would be in Tx side soft 3147 * rings. 3148 */ 3149 3150 /* 3151 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3152 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3153 */ 3154 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3155 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3156 mac_tx_notify_cb_t *mtnfp; 3157 mac_cb_t *mcb; 3158 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3159 boolean_t wakeup_required = B_FALSE; 3160 3161 if (mac_srs->srs_state & 3162 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3163 wakeup_required = B_TRUE; 3164 } 3165 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3166 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3167 mutex_exit(&mac_srs->srs_lock); 3168 if (wakeup_required) { 3169 /* Wakeup callback registered clients */ 3170 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3171 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3172 mcb = mcb->mcb_nextp) { 3173 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3174 mtnfp->mtnf_fn(mtnfp->mtnf_arg, 3175 (mac_tx_cookie_t)mac_srs); 3176 } 3177 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3178 &mcip->mci_tx_notify_cb_list); 3179 /* 3180 * If the client is not the primary MAC client, then we 3181 * need to send the notification to the clients upper 3182 * MAC, i.e. mci_upper_mip. 3183 */ 3184 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3185 mcip->mci_upper_mip : mcip->mci_mip); 3186 } 3187 mutex_enter(&mac_srs->srs_lock); 3188 } 3189 mac_srs->srs_state &= ~SRS_PROC; 3190 } 3191 3192 /* 3193 * Given a packet, get the flow_entry that identifies the flow 3194 * to which that packet belongs. The flow_entry will contain 3195 * the transmit function to be used to send the packet. If the 3196 * function returns NULL, the packet should be sent using the 3197 * underlying NIC. 
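 *
 * A rough sketch of how mac_tx_send() below acts on the result
 * (error handling and statistics omitted):
 *
 *	dst_flow_ent = mac_tx_classify(mip, mp);
 *	if (dst_flow_ent == NULL)
 *		MAC_TX(mip, ring, mp, ...);	packet leaves via the NIC
 *	else if (mac_flow_get_client_cookie(dst_flow_ent) != NULL)
 *		mac_bcast_send(...);		broadcast/multicast flow
 *	else
 *		dst_flow_ent->fe_cb_fn(...);	loopback to a local client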
3198 */ 3199 static flow_entry_t * 3200 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3201 { 3202 flow_entry_t *flent = NULL; 3203 mac_client_impl_t *mcip; 3204 int err; 3205 3206 /* 3207 * Do classification on the packet. 3208 */ 3209 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3210 if (err != 0) 3211 return (NULL); 3212 3213 /* 3214 * This flent might just be an additional one on the MAC client, 3215 * i.e. for classification purposes (different fdesc), however 3216 * the resources, SRS et. al., are in the mci_flent, so if 3217 * this isn't the mci_flent, we need to get it. 3218 */ 3219 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3220 FLOW_REFRELE(flent); 3221 flent = mcip->mci_flent; 3222 FLOW_TRY_REFHOLD(flent, err); 3223 if (err != 0) 3224 return (NULL); 3225 } 3226 3227 return (flent); 3228 } 3229 3230 /* 3231 * This macro is only meant to be used by mac_tx_send(). 3232 */ 3233 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3234 if (vid_check) { \ 3235 int err = 0; \ 3236 \ 3237 MAC_VID_CHECK(src_mcip, (mp), err); \ 3238 if (err != 0) { \ 3239 freemsg((mp)); \ 3240 (mp) = next; \ 3241 oerrors++; \ 3242 continue; \ 3243 } \ 3244 } \ 3245 if (add_tag) { \ 3246 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3247 if ((mp) == NULL) { \ 3248 (mp) = next; \ 3249 oerrors++; \ 3250 continue; \ 3251 } \ 3252 } \ 3253 } 3254 3255 mblk_t * 3256 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3257 mac_tx_stats_t *stats) 3258 { 3259 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3260 mac_impl_t *mip = src_mcip->mci_mip; 3261 uint_t obytes = 0, opackets = 0, oerrors = 0; 3262 mblk_t *mp = NULL, *next; 3263 boolean_t vid_check, add_tag; 3264 uint16_t vid = 0; 3265 3266 if (mip->mi_nclients > 1) { 3267 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3268 add_tag = MAC_TAG_NEEDED(src_mcip); 3269 if (add_tag) 3270 vid = mac_client_vid(mch); 3271 } else { 3272 ASSERT(mip->mi_nclients == 1); 3273 vid_check = add_tag = B_FALSE; 3274 } 3275 3276 /* 3277 * Fastpath: if there's only one client, and there's no 3278 * multicast listeners, we simply send the packet down to the 3279 * underlying NIC. 3280 */ 3281 if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 3282 DTRACE_PROBE2(fastpath, 3283 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3284 3285 mp = mp_chain; 3286 while (mp != NULL) { 3287 next = mp->b_next; 3288 mp->b_next = NULL; 3289 opackets++; 3290 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3291 msgdsize(mp)); 3292 3293 CHECK_VID_AND_ADD_TAG(mp); 3294 MAC_TX(mip, ring, mp, 3295 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3296 0)); 3297 3298 /* 3299 * If the driver is out of descriptors and does a 3300 * partial send it will return a chain of unsent 3301 * mblks. Adjust the accounting stats. 3302 */ 3303 if (mp != NULL) { 3304 opackets--; 3305 obytes -= msgdsize(mp); 3306 mp->b_next = next; 3307 break; 3308 } 3309 mp = next; 3310 } 3311 goto done; 3312 } 3313 3314 /* 3315 * No fastpath, we either have more than one MAC client 3316 * defined on top of the same MAC, or one or more MAC 3317 * client promiscuous callbacks. 3318 */ 3319 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3320 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3321 3322 mp = mp_chain; 3323 while (mp != NULL) { 3324 flow_entry_t *dst_flow_ent; 3325 void *flow_cookie; 3326 size_t pkt_size; 3327 mblk_t *mp1; 3328 3329 next = mp->b_next; 3330 mp->b_next = NULL; 3331 opackets++; 3332 pkt_size = (mp->b_cont == NULL ? 
MBLKL(mp) : msgdsize(mp)); 3333 obytes += pkt_size; 3334 CHECK_VID_AND_ADD_TAG(mp); 3335 3336 /* 3337 * Check if there are promiscuous mode callbacks defined. 3338 */ 3339 if (mip->mi_promisc_list != NULL) 3340 mac_promisc_dispatch(mip, mp, src_mcip); 3341 3342 /* 3343 * Find the destination. 3344 */ 3345 dst_flow_ent = mac_tx_classify(mip, mp); 3346 3347 if (dst_flow_ent != NULL) { 3348 size_t hdrsize; 3349 int err = 0; 3350 3351 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3352 struct ether_vlan_header *evhp = 3353 (struct ether_vlan_header *)mp->b_rptr; 3354 3355 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3356 hdrsize = sizeof (*evhp); 3357 else 3358 hdrsize = sizeof (struct ether_header); 3359 } else { 3360 mac_header_info_t mhi; 3361 3362 err = mac_header_info((mac_handle_t)mip, 3363 mp, &mhi); 3364 if (err == 0) 3365 hdrsize = mhi.mhi_hdrsize; 3366 } 3367 3368 /* 3369 * Got a matching flow. It's either another 3370 * MAC client, or a broadcast/multicast flow. 3371 * Make sure the packet size is within the 3372 * allowed size. If not drop the packet and 3373 * move to next packet. 3374 */ 3375 if (err != 0 || 3376 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3377 oerrors++; 3378 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3379 mblk_t *, mp); 3380 freemsg(mp); 3381 mp = next; 3382 FLOW_REFRELE(dst_flow_ent); 3383 continue; 3384 } 3385 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3386 if (flow_cookie != NULL) { 3387 /* 3388 * The vnic_bcast_send function expects 3389 * to receive the sender MAC client 3390 * as value for arg2. 3391 */ 3392 mac_bcast_send(flow_cookie, src_mcip, mp, 3393 B_TRUE); 3394 } else { 3395 /* 3396 * loopback the packet to a 3397 * local MAC client. We force a context 3398 * switch if both source and destination 3399 * MAC clients are used by IP, i.e. bypass 3400 * is set. 3401 */ 3402 boolean_t do_switch; 3403 mac_client_impl_t *dst_mcip = 3404 dst_flow_ent->fe_mcip; 3405 3406 do_switch = ((src_mcip->mci_state_flags & 3407 dst_mcip->mci_state_flags & 3408 MCIS_CLIENT_POLL_CAPABLE) != 0); 3409 3410 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3411 (dst_flow_ent->fe_cb_fn)( 3412 dst_flow_ent->fe_cb_arg1, 3413 dst_flow_ent->fe_cb_arg2, 3414 mp1, do_switch); 3415 } 3416 } 3417 FLOW_REFRELE(dst_flow_ent); 3418 } else { 3419 /* 3420 * Unknown destination, send via the underlying 3421 * NIC. 3422 */ 3423 MAC_TX(mip, ring, mp, 3424 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3425 0)); 3426 if (mp != NULL) { 3427 /* 3428 * Adjust for the last packet that 3429 * could not be transmitted 3430 */ 3431 opackets--; 3432 obytes -= pkt_size; 3433 mp->b_next = next; 3434 break; 3435 } 3436 } 3437 mp = next; 3438 } 3439 3440 done: 3441 src_mcip->mci_stat_obytes += obytes; 3442 src_mcip->mci_stat_opackets += opackets; 3443 src_mcip->mci_stat_oerrors += oerrors; 3444 3445 if (stats != NULL) { 3446 stats->ts_opackets = opackets; 3447 stats->ts_obytes = obytes; 3448 stats->ts_oerrors = oerrors; 3449 } 3450 return (mp); 3451 } 3452 3453 /* 3454 * mac_tx_srs_ring_present 3455 * 3456 * Returns whether the specified ring is part of the specified SRS. 
3457 */ 3458 boolean_t 3459 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3460 { 3461 int i; 3462 mac_soft_ring_t *soft_ring; 3463 3464 if (srs->srs_tx.st_arg2 == tx_ring) 3465 return (B_TRUE); 3466 3467 for (i = 0; i < srs->srs_oth_ring_count; i++) { 3468 soft_ring = srs->srs_oth_soft_rings[i]; 3469 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3470 return (B_TRUE); 3471 } 3472 3473 return (B_FALSE); 3474 } 3475 3476 /* 3477 * mac_tx_srs_wakeup 3478 * 3479 * Called when Tx desc become available. Wakeup the appropriate worker 3480 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3481 * state field. 3482 */ 3483 void 3484 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3485 { 3486 int i; 3487 mac_soft_ring_t *sringp; 3488 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3489 3490 mutex_enter(&mac_srs->srs_lock); 3491 if (TX_SINGLE_RING_MODE(mac_srs)) { 3492 if (srs_tx->st_arg2 == ring && 3493 mac_srs->srs_state & SRS_TX_BLOCKED) { 3494 mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3495 srs_tx->st_unblocked_cnt++; 3496 cv_signal(&mac_srs->srs_async); 3497 } 3498 /* 3499 * A wakeup can come before tx_srs_drain() could 3500 * grab srs lock and set SRS_TX_BLOCKED. So 3501 * always set woken_up flag when we come here. 3502 */ 3503 srs_tx->st_woken_up = B_TRUE; 3504 mutex_exit(&mac_srs->srs_lock); 3505 return; 3506 } 3507 3508 /* If you are here, it is for FANOUT or BW_FANOUT case */ 3509 ASSERT(TX_MULTI_RING_MODE(mac_srs)); 3510 for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3511 sringp = mac_srs->srs_oth_soft_rings[i]; 3512 mutex_enter(&sringp->s_ring_lock); 3513 if (sringp->s_ring_tx_arg2 == ring) { 3514 if (sringp->s_ring_state & S_RING_BLOCK) { 3515 sringp->s_ring_state &= ~S_RING_BLOCK; 3516 sringp->s_ring_unblocked_cnt++; 3517 cv_signal(&sringp->s_ring_async); 3518 } 3519 sringp->s_ring_tx_woken_up = B_TRUE; 3520 } 3521 mutex_exit(&sringp->s_ring_lock); 3522 } 3523 mutex_exit(&mac_srs->srs_lock); 3524 } 3525 3526 /* 3527 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3528 * the blocked clients again. 3529 */ 3530 void 3531 mac_tx_notify(mac_impl_t *mip) 3532 { 3533 i_mac_notify(mip, MAC_NOTE_TX); 3534 } 3535 3536 /* 3537 * RX SOFTRING RELATED FUNCTIONS 3538 * 3539 * These functions really belong in mac_soft_ring.c and here for 3540 * a short period. 3541 */ 3542 3543 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3544 /* \ 3545 * Enqueue our mblk chain. \ 3546 */ \ 3547 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 3548 \ 3549 if ((ringp)->s_ring_last != NULL) \ 3550 (ringp)->s_ring_last->b_next = (mp); \ 3551 else \ 3552 (ringp)->s_ring_first = (mp); \ 3553 (ringp)->s_ring_last = (tail); \ 3554 (ringp)->s_ring_count += (cnt); \ 3555 ASSERT((ringp)->s_ring_count > 0); \ 3556 if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 3557 (ringp)->s_ring_size += sz; \ 3558 } \ 3559 } 3560 3561 /* 3562 * Default entry point to deliver a packet chain to a MAC client. 3563 * If the MAC client has flows, do the classification with these 3564 * flows as well. 
3565 */ 3566 /* ARGSUSED */ 3567 void 3568 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 3569 mac_header_info_t *arg3) 3570 { 3571 mac_client_impl_t *mcip = arg1; 3572 3573 if (mcip->mci_nvids == 1 && 3574 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) { 3575 /* 3576 * If the client has exactly one VID associated with it 3577 * and striping of VLAN header is not disabled, 3578 * remove the VLAN tag from the packet before 3579 * passing it on to the client's receive callback. 3580 * Note that this needs to be done after we dispatch 3581 * the packet to the promiscuous listeners of the 3582 * client, since they expect to see the whole 3583 * frame including the VLAN headers. 3584 */ 3585 mp_chain = mac_strip_vlan_tag_chain(mp_chain); 3586 } 3587 3588 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 3589 } 3590 3591 /* 3592 * mac_rx_soft_ring_process 3593 * 3594 * process a chain for a given soft ring. The number of packets queued 3595 * in the SRS and its associated soft rings (including this one) is 3596 * very small (tracked by srs_poll_pkt_cnt), then allow the entering 3597 * thread (interrupt or poll thread) to do inline processing. This 3598 * helps keep the latency down under low load. 3599 * 3600 * The proc and arg for each mblk is already stored in the mblk in 3601 * appropriate places. 3602 */ 3603 /* ARGSUSED */ 3604 void 3605 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 3606 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 3607 { 3608 mac_direct_rx_t proc; 3609 void *arg1; 3610 mac_resource_handle_t arg2; 3611 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3612 3613 ASSERT(ringp != NULL); 3614 ASSERT(mp_chain != NULL); 3615 ASSERT(tail != NULL); 3616 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3617 3618 mutex_enter(&ringp->s_ring_lock); 3619 ringp->s_ring_total_inpkt += cnt; 3620 if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 3621 !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { 3622 /* If on processor or blanking on, then enqueue and return */ 3623 if (ringp->s_ring_state & S_RING_BLANK || 3624 ringp->s_ring_state & S_RING_PROC) { 3625 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3626 mutex_exit(&ringp->s_ring_lock); 3627 return; 3628 } 3629 proc = ringp->s_ring_rx_func; 3630 arg1 = ringp->s_ring_rx_arg1; 3631 arg2 = ringp->s_ring_rx_arg2; 3632 /* 3633 * See if anything is already queued. If we are the 3634 * first packet, do inline processing else queue the 3635 * packet and do the drain. 3636 */ 3637 if (ringp->s_ring_first == NULL) { 3638 /* 3639 * Fast-path, ok to process and nothing queued. 3640 */ 3641 ringp->s_ring_run = curthread; 3642 ringp->s_ring_state |= (S_RING_PROC); 3643 3644 mutex_exit(&ringp->s_ring_lock); 3645 3646 /* 3647 * We are the chain of 1 packet so 3648 * go through this fast path. 3649 */ 3650 ASSERT(mp_chain->b_next == NULL); 3651 3652 (*proc)(arg1, arg2, mp_chain, NULL); 3653 3654 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3655 /* 3656 * If we have a soft ring set which is doing 3657 * bandwidth control, we need to decrement 3658 * srs_size and count so it the SRS can have a 3659 * accurate idea of what is the real data 3660 * queued between SRS and its soft rings. We 3661 * decrement the counters only when the packet 3662 * gets processed by both SRS and the soft ring. 
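 *
 * This deferred decrement is also what keeps the shared count that
 * the dynamic polling logic looks at honest: the Rx ring stays in
 * poll mode until the client has actually consumed its backlog,
 * rather than only until the SRS handed the packets to this soft
 * ring.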
3663 */ 3664 mutex_enter(&mac_srs->srs_lock); 3665 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 3666 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 3667 mutex_exit(&mac_srs->srs_lock); 3668 3669 mutex_enter(&ringp->s_ring_lock); 3670 ringp->s_ring_run = NULL; 3671 ringp->s_ring_state &= ~S_RING_PROC; 3672 if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 3673 cv_signal(&ringp->s_ring_client_cv); 3674 3675 if ((ringp->s_ring_first == NULL) || 3676 (ringp->s_ring_state & S_RING_BLANK)) { 3677 /* 3678 * We processed inline our packet and 3679 * nothing new has arrived or our 3680 * receiver doesn't want to receive 3681 * any packets. We are done. 3682 */ 3683 mutex_exit(&ringp->s_ring_lock); 3684 return; 3685 } 3686 } else { 3687 SOFT_RING_ENQUEUE_CHAIN(ringp, 3688 mp_chain, tail, cnt, sz); 3689 } 3690 3691 /* 3692 * We are here because either we couldn't do inline 3693 * processing (because something was already 3694 * queued), or we had a chain of more than one 3695 * packet, or something else arrived after we were 3696 * done with inline processing. 3697 */ 3698 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3699 ASSERT(ringp->s_ring_first != NULL); 3700 3701 ringp->s_ring_drain_func(ringp); 3702 mutex_exit(&ringp->s_ring_lock); 3703 return; 3704 } else { 3705 /* ST_RING_WORKER_ONLY case */ 3706 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3707 mac_soft_ring_worker_wakeup(ringp); 3708 mutex_exit(&ringp->s_ring_lock); 3709 } 3710 } 3711 3712 /* 3713 * TX SOFTRING RELATED FUNCTIONS 3714 * 3715 * These functions really belong in mac_soft_ring.c and here for 3716 * a short period. 3717 */ 3718 3719 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3720 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 3721 ringp->s_ring_state |= S_RING_ENQUEUED; \ 3722 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 3723 } 3724 3725 /* 3726 * mac_tx_sring_queued 3727 * 3728 * When we are out of transmit descriptors and we already have a 3729 * queue that exceeds hiwat (or the client called us with 3730 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 3731 * soft ring pointer as the opaque cookie for the client enable 3732 * flow control. 3733 */ 3734 static mac_tx_cookie_t 3735 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, 3736 mblk_t **ret_mp) 3737 { 3738 int cnt; 3739 size_t sz; 3740 mblk_t *tail; 3741 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3742 mac_tx_cookie_t cookie = NULL; 3743 boolean_t wakeup_worker = B_TRUE; 3744 3745 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3746 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 3747 if (flag & MAC_DROP_ON_NO_DESC) { 3748 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 3749 /* increment freed stats */ 3750 ringp->s_ring_drops += cnt; 3751 cookie = (mac_tx_cookie_t)ringp; 3752 } else { 3753 if (ringp->s_ring_first != NULL) 3754 wakeup_worker = B_FALSE; 3755 3756 if (flag & MAC_TX_NO_ENQUEUE) { 3757 /* 3758 * If QUEUED is not set, queue the packet 3759 * and let mac_tx_soft_ring_drain() set 3760 * the TX_BLOCKED bit for the reasons 3761 * explained above. Otherwise, return the 3762 * mblks. 3763 */ 3764 if (wakeup_worker) { 3765 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, 3766 mp_chain, tail, cnt, sz); 3767 } else { 3768 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT; 3769 cookie = (mac_tx_cookie_t)ringp; 3770 *ret_mp = mp_chain; 3771 } 3772 } else { 3773 boolean_t enqueue = B_TRUE; 3774 3775 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 3776 /* 3777 * flow-controlled. 

/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
    mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
    int cnt;
    size_t sz;
    mblk_t *tail;
    mac_tx_cookie_t cookie = NULL;

    ASSERT(ringp != NULL);
    ASSERT(mp_chain != NULL);
    ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
    /*
     * Only two modes can come here: either SRS_TX_BW_FANOUT
     * or SRS_TX_FANOUT.
     */
    ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
        mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

    if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
        /* Serialization mode */

        mutex_enter(&ringp->s_ring_lock);
        if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
            cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                flag, ret_mp);
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
        TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
        if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
            /*
             * If the ring is blocked due to a lack of Tx
             * descriptors, just return. The worker thread
             * will get scheduled when Tx descriptors
             * become available.
             */
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        mac_soft_ring_worker_wakeup(ringp);
        mutex_exit(&ringp->s_ring_lock);
        return (cookie);
    } else {
        /* Default fanout mode */
        /*
         * S_RING_BLOCK is set when the underlying NIC runs
         * out of Tx descriptors and messages start getting
         * queued. It won't get reset until
         * mac_tx_soft_ring_drain() completely drains out the
         * messages.
         */
        boolean_t is_subflow;
        mac_tx_stats_t stats;

        if (ringp->s_ring_state & S_RING_ENQUEUED) {
            /* Tx descriptors/resources not available */
            mutex_enter(&ringp->s_ring_lock);
            if (ringp->s_ring_state & S_RING_ENQUEUED) {
                cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                    flag, ret_mp);
                mutex_exit(&ringp->s_ring_lock);
                return (cookie);
            }
            /*
             * While we were computing the mblk count, the
             * flow control condition got relieved.
             * Continue with the transmission.
             */
            mutex_exit(&ringp->s_ring_lock);
        }
        is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

        mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
            ringp->s_ring_tx_arg2, mp_chain,
            (is_subflow ? &stats : NULL));

        /*
         * Multiple threads could be here sending packets.
         * Under such conditions, it is not possible to
         * atomically set the S_RING_BLOCK bit to indicate an
         * out-of-Tx-descriptor condition. To set it atomically,
         * we queue the returned packet and do the setting of
         * S_RING_BLOCK in mac_tx_soft_ring_drain().
         */
        if (mp_chain != NULL) {
            mutex_enter(&ringp->s_ring_lock);
            cookie =
                mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        if (is_subflow) {
            FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
        }
        return (NULL);
    }
}