1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/callb.h> 28 #include <sys/sdt.h> 29 #include <sys/strsubr.h> 30 #include <sys/strsun.h> 31 #include <sys/vlan.h> 32 #include <inet/ipsec_impl.h> 33 #include <inet/ip_impl.h> 34 #include <inet/sadb.h> 35 #include <inet/ipsecesp.h> 36 #include <inet/ipsecah.h> 37 #include <inet/ip6.h> 38 39 #include <sys/mac_impl.h> 40 #include <sys/mac_client_impl.h> 41 #include <sys/mac_client_priv.h> 42 #include <sys/mac_soft_ring.h> 43 #include <sys/mac_flow_impl.h> 44 45 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *, 46 uintptr_t, uint16_t, mblk_t **); 47 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *, 48 uintptr_t, uint16_t, mblk_t **); 49 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *, 50 uintptr_t, uint16_t, mblk_t **); 51 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *, 52 uintptr_t, uint16_t, mblk_t **); 53 54 typedef struct mac_tx_mode_s { 55 mac_tx_srs_mode_t mac_tx_mode; 56 mac_tx_func_t mac_tx_func; 57 } mac_tx_mode_t; 58 59 /* 60 * There are five modes of operation on the Tx side. These modes get set 61 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode, 62 * none of the other modes are user configurable. They get selected by 63 * the system depending upon whether the link (or flow) has multiple Tx 64 * rings or a bandwidth configured, etc. 65 */ 66 mac_tx_mode_t mac_tx_mode_list[] = { 67 {SRS_TX_DEFAULT, mac_tx_single_ring_mode}, 68 {SRS_TX_SERIALIZE, mac_tx_serializer_mode}, 69 {SRS_TX_FANOUT, mac_tx_fanout_mode}, 70 {SRS_TX_BW, mac_tx_bw_mode}, 71 {SRS_TX_BW_FANOUT, mac_tx_bw_mode} 72 }; 73 74 /* 75 * Soft Ring Set (SRS) - The Run time code that deals with 76 * dynamic polling from the hardware, bandwidth enforcement, 77 * fanout etc. 78 * 79 * We try to use H/W classification on NIC and assign traffic for 80 * a MAC address to a particular Rx ring or ring group. There is a 81 * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically 82 * switches the underlying Rx ring between interrupt and 83 * polling mode and enforces any specified B/W control. 84 * 85 * There is always a SRS created and tied to each H/W and S/W rule. 86 * Whenever we create a H/W rule, we always add the the same rule to 87 * S/W classifier and tie a SRS to it. 88 * 89 * In case a B/W control is specified, it is broken into bytes 90 * per ticks and as soon as the quota for a tick is exhausted, 91 * the underlying Rx ring is forced into poll mode for remainder of 92 * the tick. 
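 *
 * (Worked example, added for illustration and not part of the original
 * comment: with a clock of hz = 100 ticks/sec, a configured limit of
 * 100 Mbit/s comes out to roughly
 *
 *	100,000,000 bits/s / 8 = 12,500,000 bytes/s
 *	12,500,000 bytes/s / 100 ticks/s = 125,000 bytes/tick
 *
 * so once about 125,000 bytes have been accepted within a tick, the
 * ring stays in poll mode until the next tick starts. The exact
 * per-tick quota depends on the running kernel's hz value.)
 *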
The SRS poll thread only polls for bytes that are
 * allowed to come in the SRS. We typically let 4x the configured
 * B/W worth of packets to come in the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to respective SRS.
 * The S/W classification rule is always setup and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * but packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRSs are used on both the Tx and Rx side. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that the Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each Soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only the max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as the B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in the SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing plus we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed),
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */

/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or let the SRS worker thread do the processing. This applies if
 * the SRS was not being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing.
For throughput 163 * under load, this should be false. 164 * 165 * This (and other similar) tunable should be rolled into a link 166 * or flow specific workload hint that can be set using dladm 167 * linkprop (instead of multiple such tunables). 168 */ 169 boolean_t mac_latency_optimize = B_TRUE; 170 171 /* 172 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 173 * 174 * queue a mp or chain in soft ring set and increment the 175 * local count (srs_count) for the SRS and the shared counter 176 * (srs_poll_pkt_cnt - shared between SRS and its soft rings 177 * to track the total unprocessed packets for polling to work 178 * correctly). 179 * 180 * The size (total bytes queued) counters are incremented only 181 * if we are doing B/W control. 182 */ 183 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 184 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 185 if ((mac_srs)->srs_last != NULL) \ 186 (mac_srs)->srs_last->b_next = (head); \ 187 else \ 188 (mac_srs)->srs_first = (head); \ 189 (mac_srs)->srs_last = (tail); \ 190 (mac_srs)->srs_count += count; \ 191 } 192 193 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 194 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 195 \ 196 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 197 srs_rx->sr_poll_pkt_cnt += count; \ 198 ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 199 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 200 (mac_srs)->srs_size += (sz); \ 201 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 202 (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 203 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 204 } \ 205 } 206 207 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 208 mac_srs->srs_state |= SRS_ENQUEUED; \ 209 MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 210 if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 211 (mac_srs)->srs_size += (sz); \ 212 (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 213 } \ 214 } 215 216 /* 217 * Turn polling on routines 218 */ 219 #define MAC_SRS_POLLING_ON(mac_srs) { \ 220 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 221 if (((mac_srs)->srs_state & \ 222 (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 223 (mac_srs)->srs_state |= SRS_POLLING; \ 224 (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 225 (mac_srs)->srs_ring); \ 226 (mac_srs)->srs_rx.sr_poll_on++; \ 227 } \ 228 } 229 230 #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 231 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 232 if (((mac_srs)->srs_state & \ 233 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 234 (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 235 (mac_srs)->srs_state |= SRS_POLLING; \ 236 (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 237 (mac_srs)->srs_ring); \ 238 (mac_srs)->srs_rx.sr_worker_poll_on++; \ 239 } \ 240 } 241 242 /* 243 * MAC_SRS_POLL_RING 244 * 245 * Signal the SRS poll thread to poll the underlying H/W ring 246 * provided it wasn't already polling (SRS_GET_PKTS was set). 247 * 248 * Poll thread gets to run only from mac_rx_srs_drain() and only 249 * if the drain was being done by the worker thread. 
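 *
 * (Summary added for illustration, not part of the original comment:
 * the macro below only issues the cv_signal() when
 *
 *	(srs_state & (SRS_POLLING_CAPAB | SRS_WORKER | SRS_GET_PKTS)) ==
 *	    (SRS_POLLING_CAPAB | SRS_WORKER)
 *
 * i.e. the SRS is polling capable, the worker thread owns the drain,
 * and no poll is already outstanding; otherwise the request is only
 * counted in sr_poll_thr_busy.)
 *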
250 */ 251 #define MAC_SRS_POLL_RING(mac_srs) { \ 252 mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 253 \ 254 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 255 srs_rx->sr_poll_thr_sig++; \ 256 if (((mac_srs)->srs_state & \ 257 (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 258 (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 259 (mac_srs)->srs_state |= SRS_GET_PKTS; \ 260 cv_signal(&(mac_srs)->srs_cv); \ 261 } else { \ 262 srs_rx->sr_poll_thr_busy++; \ 263 } \ 264 } 265 266 /* 267 * MAC_SRS_CHECK_BW_CONTROL 268 * 269 * Check to see if next tick has started so we can reset the 270 * SRS_BW_ENFORCED flag and allow more packets to come in the 271 * system. 272 */ 273 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 274 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 275 ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 276 MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 277 clock_t now = ddi_get_lbolt(); \ 278 if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \ 279 (mac_srs)->srs_bw->mac_bw_curr_time = now; \ 280 (mac_srs)->srs_bw->mac_bw_used = 0; \ 281 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 282 (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 283 } \ 284 } 285 286 /* 287 * MAC_SRS_WORKER_WAKEUP 288 * 289 * Wake up the SRS worker thread to process the queue as long as 290 * no one else is processing the queue. If we are optimizing for 291 * latency, we wake up the worker thread immediately or else we 292 * wait mac_srs_worker_wakeup_ticks before worker thread gets 293 * woken up. 294 */ 295 int mac_srs_worker_wakeup_ticks = 0; 296 #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ 297 ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 298 if (!((mac_srs)->srs_state & SRS_PROC) && \ 299 (mac_srs)->srs_tid == NULL) { \ 300 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \ 301 (mac_srs_worker_wakeup_ticks == 0)) \ 302 cv_signal(&(mac_srs)->srs_async); \ 303 else \ 304 (mac_srs)->srs_tid = \ 305 timeout(mac_srs_fire, (mac_srs), \ 306 mac_srs_worker_wakeup_ticks); \ 307 } \ 308 } 309 310 #define TX_SINGLE_RING_MODE(mac_srs) \ 311 ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ 312 (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ 313 (mac_srs)->srs_tx.st_mode == SRS_TX_BW) 314 315 #define TX_BANDWIDTH_MODE(mac_srs) \ 316 ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ 317 (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) 318 319 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ 320 uint_t hash, indx; \ 321 hash = HASH_HINT(hint); \ 322 indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ 323 softring = mac_srs->srs_oth_soft_rings[indx]; \ 324 (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ 325 } 326 327 /* 328 * MAC_TX_SRS_BLOCK 329 * 330 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED 331 * will be set only if srs_tx_woken_up is FALSE. If 332 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived 333 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to 334 * attempt to transmit again and not setting SRS_TX_BLOCKED does 335 * that. 336 */ 337 #define MAC_TX_SRS_BLOCK(srs, mp) { \ 338 ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ 339 if ((srs)->srs_tx.st_woken_up) { \ 340 (srs)->srs_tx.st_woken_up = B_FALSE; \ 341 } else { \ 342 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ 343 (srs)->srs_state |= SRS_TX_BLOCKED; \ 344 (srs)->srs_tx.st_blocked_cnt++; \ 345 } \ 346 } 347 348 /* 349 * MAC_TX_SRS_TEST_HIWAT 350 * 351 * Called before queueing a packet onto Tx SRS to test and set 352 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. 
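 *
 * (Illustrative caller sketch, added here and not from the original
 * file; it assumes mp_chain/tail/cnt/sz already describe the chain
 * being queued and that srs_lock is held, as the Tx mode routines in
 * this file do:
 *
 *	mac_tx_cookie_t cookie = NULL;
 *
 *	mutex_enter(&srs->srs_lock);
 *	MAC_TX_SRS_TEST_HIWAT(srs, mp_chain, tail, cnt, sz, cookie);
 *	mutex_exit(&srs->srs_lock);
 *	if (cookie != NULL) {
 *		back off: the SRS is flow-controlled and the client
 *		will be notified once the queue drains again
 *	}
 *
 * A non-NULL cookie is what gets handed back to the MAC client as the
 * mac_tx_cookie_t indicating flow control.)
 *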
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp_chain, tail, cnt, sz, cookie) {	\
	boolean_t enqueue = 1;						\
									\
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
		/*							\
		 * flow-controlled. Store srs in cookie so that it	\
		 * can be returned as mac_tx_cookie_t to client		\
		 */							\
		(srs)->srs_state |= SRS_TX_HIWAT;			\
		cookie = (mac_tx_cookie_t)srs;				\
		(srs)->srs_tx.st_hiwat_cnt++;				\
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
			/* increment freed stats */			\
			(srs)->srs_tx.st_drop_count += cnt;		\
			/*						\
			 * b_prev may be set to the fanout hint		\
			 * hence can't use freemsg directly		\
			 */						\
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);	\
			DTRACE_PROBE1(tx_queued_hiwat,			\
			    mac_soft_ring_set_t *, srs);		\
			enqueue = 0;					\
		}							\
	}								\
	if (enqueue)							\
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp_chain, tail, cnt, sz);	\
}

/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs)						\
	if (!(srs->srs_type & SRST_TX))					\
		mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs)						\
	if (!(srs->srs_type & SRST_TX))					\
		mutex_exit(&srs->srs_bw->mac_bw_lock);

#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {			\
	mac_pkt_drop(NULL, NULL, mp, B_FALSE);				\
	/* increment freed stats */					\
	mac_srs->srs_tx.st_drop_count++;				\
	cookie = (mac_tx_cookie_t)srs;					\
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {		\
	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;			\
	cookie = (mac_tx_cookie_t)srs;					\
	*ret_mp = mp_chain;						\
}

/*
 * Drop the rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
	mac_srs_rx_t	*srs_rx = &srs->srs_rx;

	ASSERT(mp->b_next == NULL);
	mutex_enter(&srs->srs_lock);
	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
	mutex_exit(&srs->srs_lock);

	srs_rx->sr_drop_count++;
	freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_tid == 0) {
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	mac_srs->srs_tid = 0;
	if (!(mac_srs->srs_state & SRS_PROC))
		cv_signal(&mac_srs->srs_async);

	mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is the fanout_hint (of type uint64_t) which is given by the
 * TCP/IP stack, and it is used on the TX path.
 */
#define	HASH_HINT(hint)	\
	((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))


/*
 * Hash based on the src address and the port information.
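 *
 * (Usage illustration, not part of the original comment: for an IPv4
 * packet the first 32-bit word of the transport header holds the
 * source and destination ports, so the Rx fanout below computes an
 * index roughly as
 *
 *	uint32_t ports = *(uint32_t *)(mp->b_rptr + ports_offset);
 *	uint_t hash = HASH_ADDR(ipha->ipha_src, ports);
 *	uint_t indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 *
 * which is how mac_rx_srs_fanout() spreads flows across the TCP soft
 * rings; all packets of a given flow hash to the same ring, so
 * per-flow ordering is preserved.)
 *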
457 */ 458 #define HASH_ADDR(src, ports) \ 459 (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 460 ((ports) >> 8) ^ (ports)) 461 462 #define COMPUTE_INDEX(key, sz) (key % sz) 463 464 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 465 if ((tail) != NULL) { \ 466 ASSERT((tail)->b_next == NULL); \ 467 (tail)->b_next = (mp); \ 468 } else { \ 469 ASSERT((head) == NULL); \ 470 (head) = (mp); \ 471 } \ 472 (tail) = (mp); \ 473 (cnt)++; \ 474 if ((bw_ctl)) \ 475 (sz) += (sz0); \ 476 } 477 478 #define MAC_FANOUT_DEFAULT 0 479 #define MAC_FANOUT_RND_ROBIN 1 480 int mac_fanout_type = MAC_FANOUT_DEFAULT; 481 482 #define MAX_SR_TYPES 3 483 /* fanout types for port based hashing */ 484 enum pkt_type { 485 V4_TCP = 0, 486 V4_UDP, 487 OTH, 488 UNDEF 489 }; 490 491 /* 492 * In general we do port based hashing to spread traffic over different 493 * softrings. The below tunable allows to override that behavior. Setting it 494 * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior 495 * is also the applicable to ipv6 packets carrying multiple optional headers 496 * and other uncommon packet types. 497 */ 498 boolean_t mac_src_ipv6_fanout = B_FALSE; 499 500 /* 501 * Pair of local and remote ports in the transport header 502 */ 503 #define PORTS_SIZE 4 504 505 /* 506 * mac_rx_srs_proto_fanout 507 * 508 * This routine delivers packets destined to an SRS into one of the 509 * protocol soft rings. 510 * 511 * Given a chain of packets we need to split it up into multiple sub chains 512 * destined into TCP, UDP or OTH soft ring. Instead of entering 513 * the soft ring one packet at a time, we want to enter it in the form of a 514 * chain otherwise we get this start/stop behaviour where the worker thread 515 * goes to sleep and then next packets comes in forcing it to wake up etc. 516 */ 517 static void 518 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 519 { 520 struct ether_header *ehp; 521 struct ether_vlan_header *evhp; 522 uint32_t sap; 523 ipha_t *ipha; 524 uint8_t *dstaddr; 525 size_t hdrsize; 526 mblk_t *mp; 527 mblk_t *headmp[MAX_SR_TYPES]; 528 mblk_t *tailmp[MAX_SR_TYPES]; 529 int cnt[MAX_SR_TYPES]; 530 size_t sz[MAX_SR_TYPES]; 531 size_t sz1; 532 boolean_t bw_ctl; 533 boolean_t hw_classified; 534 boolean_t dls_bypass; 535 boolean_t is_ether; 536 boolean_t is_unicast; 537 enum pkt_type type; 538 mac_client_impl_t *mcip = mac_srs->srs_mcip; 539 540 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 541 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 542 543 /* 544 * If we don't have a Rx ring, S/W classification would have done 545 * its job and its a packet meant for us. If we were polling on 546 * the default ring (i.e. there was a ring assigned to this SRS), 547 * then we need to make sure that the mac address really belongs 548 * to us. 549 */ 550 hw_classified = mac_srs->srs_ring != NULL && 551 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 552 553 /* 554 * Special clients (eg. VLAN, non ether, etc) need DLS 555 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 556 * such SRSs. Another way of disabling bypass is to set the 557 * MCIS_RX_BYPASS_DISABLE flag. 
558 */ 559 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 560 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 561 562 bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 563 bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 564 bzero(cnt, MAX_SR_TYPES * sizeof (int)); 565 bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 566 567 /* 568 * We got a chain from SRS that we need to send to the soft rings. 569 * Since squeues for TCP & IPv4 sap poll their soft rings (for 570 * performance reasons), we need to separate out v4_tcp, v4_udp 571 * and the rest goes in other. 572 */ 573 while (head != NULL) { 574 mp = head; 575 head = head->b_next; 576 mp->b_next = NULL; 577 578 type = OTH; 579 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 580 581 if (is_ether) { 582 /* 583 * At this point we can be sure the packet at least 584 * has an ether header. 585 */ 586 if (sz1 < sizeof (struct ether_header)) { 587 mac_rx_drop_pkt(mac_srs, mp); 588 continue; 589 } 590 ehp = (struct ether_header *)mp->b_rptr; 591 592 /* 593 * Determine if this is a VLAN or non-VLAN packet. 594 */ 595 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 596 evhp = (struct ether_vlan_header *)mp->b_rptr; 597 sap = ntohs(evhp->ether_type); 598 hdrsize = sizeof (struct ether_vlan_header); 599 /* 600 * Check if the VID of the packet, if any, 601 * belongs to this client. 602 */ 603 if (!mac_client_check_flow_vid(mcip, 604 VLAN_ID(ntohs(evhp->ether_tci)))) { 605 mac_rx_drop_pkt(mac_srs, mp); 606 continue; 607 } 608 } else { 609 hdrsize = sizeof (struct ether_header); 610 } 611 is_unicast = 612 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 613 dstaddr = (uint8_t *)&ehp->ether_dhost; 614 } else { 615 mac_header_info_t mhi; 616 617 if (mac_header_info((mac_handle_t)mcip->mci_mip, 618 mp, &mhi) != 0) { 619 mac_rx_drop_pkt(mac_srs, mp); 620 continue; 621 } 622 hdrsize = mhi.mhi_hdrsize; 623 sap = mhi.mhi_bindsap; 624 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 625 dstaddr = (uint8_t *)mhi.mhi_daddr; 626 } 627 628 if (!dls_bypass) { 629 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 630 cnt[type], bw_ctl, sz[type], sz1, mp); 631 continue; 632 } 633 634 if (sap == ETHERTYPE_IP) { 635 /* 636 * If we are H/W classified, but we have promisc 637 * on, then we need to check for the unicast address. 638 */ 639 if (hw_classified && mcip->mci_promisc_list != NULL) { 640 mac_address_t *map; 641 642 rw_enter(&mcip->mci_rw_lock, RW_READER); 643 map = mcip->mci_unicast; 644 if (bcmp(dstaddr, map->ma_addr, 645 map->ma_len) == 0) 646 type = UNDEF; 647 rw_exit(&mcip->mci_rw_lock); 648 } else if (is_unicast) { 649 type = UNDEF; 650 } 651 } 652 653 /* 654 * This needs to become a contract with the driver for 655 * the fast path. 656 * 657 * In the normal case the packet will have at least the L2 658 * header and the IP + Transport header in the same mblk. 659 * This is usually the case when the NIC driver sends up 660 * the packet. This is also true when the stack generates 661 * a packet that is looped back and when the stack uses the 662 * fastpath mechanism. The normal case is optimized for 663 * performance and may bypass DLS. All other cases go through 664 * the 'OTH' type path without DLS bypass. 
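 *
 * (Clarifying note, not part of the original comment: the actual test
 * is the MBLK_RX_FANOUT_SLOWPATH() macro used below, defined elsewhere
 * in the MAC headers. Conceptually the fast path is taken only when
 * something like the following holds
 *
 *	DB_REF(mp) == 1 &&
 *	OK_32PTR(ipha) &&
 *	(uchar_t *)ipha + sizeof (ipha_t) <= mp->b_wptr
 *
 * i.e. the mblk is not shared, the IP header is aligned, and the
 * header bytes are all readable in the first mblk; this is an
 * approximation of the check, not its exact definition.)
 *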
665 */ 666 667 ipha = (ipha_t *)(mp->b_rptr + hdrsize); 668 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 669 type = OTH; 670 671 if (type == OTH) { 672 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 673 cnt[type], bw_ctl, sz[type], sz1, mp); 674 continue; 675 } 676 677 ASSERT(type == UNDEF); 678 /* 679 * We look for at least 4 bytes past the IP header to get 680 * the port information. If we get an IP fragment, we don't 681 * have the port information, and we use just the protocol 682 * information. 683 */ 684 switch (ipha->ipha_protocol) { 685 case IPPROTO_TCP: 686 type = V4_TCP; 687 mp->b_rptr += hdrsize; 688 break; 689 case IPPROTO_UDP: 690 type = V4_UDP; 691 mp->b_rptr += hdrsize; 692 break; 693 default: 694 type = OTH; 695 break; 696 } 697 698 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 699 bw_ctl, sz[type], sz1, mp); 700 } 701 702 for (type = V4_TCP; type < UNDEF; type++) { 703 if (headmp[type] != NULL) { 704 mac_soft_ring_t *softring; 705 706 ASSERT(tailmp[type]->b_next == NULL); 707 switch (type) { 708 case V4_TCP: 709 softring = mac_srs->srs_tcp_soft_rings[0]; 710 break; 711 case V4_UDP: 712 softring = mac_srs->srs_udp_soft_rings[0]; 713 break; 714 case OTH: 715 softring = mac_srs->srs_oth_soft_rings[0]; 716 } 717 mac_rx_soft_ring_process(mcip, softring, 718 headmp[type], tailmp[type], cnt[type], sz[type]); 719 } 720 } 721 } 722 723 int fanout_unalligned = 0; 724 725 /* 726 * mac_rx_srs_long_fanout 727 * 728 * The fanout routine for IPv6 729 */ 730 static int 731 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, 732 uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) 733 { 734 ip6_t *ip6h; 735 uint8_t *whereptr; 736 uint_t hash; 737 uint16_t remlen; 738 uint8_t nexthdr; 739 uint16_t hdr_len; 740 741 if (sap == ETHERTYPE_IPV6) { 742 boolean_t modifiable = B_TRUE; 743 744 ASSERT(MBLKL(mp) >= hdrsize); 745 746 ip6h = (ip6_t *)(mp->b_rptr + hdrsize); 747 if ((unsigned char *)ip6h == mp->b_wptr) { 748 /* 749 * The first mblk_t only includes the mac header. 750 * Note that it is safe to change the mp pointer here, 751 * as the subsequent operation does not assume mp 752 * points to the start of the mac header. 753 */ 754 mp = mp->b_cont; 755 756 /* 757 * Make sure ip6h holds the full ip6_t structure. 758 */ 759 if (mp == NULL) 760 return (-1); 761 762 if (MBLKL(mp) < IPV6_HDR_LEN) { 763 modifiable = (DB_REF(mp) == 1); 764 765 if (modifiable && 766 !pullupmsg(mp, IPV6_HDR_LEN)) { 767 return (-1); 768 } 769 } 770 771 ip6h = (ip6_t *)mp->b_rptr; 772 } 773 774 if (!modifiable || !(OK_32PTR((char *)ip6h)) || 775 ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 776 /* 777 * If either ip6h is not alligned, or ip6h does not 778 * hold the complete ip6_t structure (a pullupmsg() 779 * is not an option since it would result in an 780 * unalligned ip6h), fanout to the default ring. Note 781 * that this may cause packets reordering. 782 */ 783 *indx = 0; 784 *type = OTH; 785 fanout_unalligned++; 786 return (0); 787 } 788 789 remlen = ntohs(ip6h->ip6_plen); 790 nexthdr = ip6h->ip6_nxt; 791 792 if (remlen < MIN_EHDR_LEN) 793 return (-1); 794 /* 795 * Do src based fanout if below tunable is set to B_TRUE or 796 * when mac_ip_hdr_length_v6() fails because of malformed 797 * packets or because mblk's need to be concatenated using 798 * pullupmsg(). 
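 *
 * (Illustration, not part of the original comment: src based fanout
 * hashes on the low 32 bits of the IPv6 source address with no port
 * contribution, exactly as done at the src_based_fanout label below:
 *
 *	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
 *	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 *
 * All traffic from one source therefore lands on one soft ring, which
 * keeps ordering but gives a coarser spread than port based fanout.)
 *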
799 */ 800 if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 801 &hdr_len, &nexthdr, NULL, NULL)) { 802 goto src_based_fanout; 803 } 804 whereptr = (uint8_t *)ip6h + hdr_len; 805 806 /* If the transport is one of below, we do port based fanout */ 807 switch (nexthdr) { 808 case IPPROTO_TCP: 809 case IPPROTO_UDP: 810 case IPPROTO_SCTP: 811 case IPPROTO_ESP: 812 /* 813 * If the ports in the transport header is not part of 814 * the mblk, do src_based_fanout, instead of calling 815 * pullupmsg(). 816 */ 817 if (mp->b_cont != NULL && 818 whereptr + PORTS_SIZE > mp->b_wptr) { 819 goto src_based_fanout; 820 } 821 break; 822 default: 823 break; 824 } 825 826 switch (nexthdr) { 827 case IPPROTO_TCP: 828 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 829 *(uint32_t *)whereptr); 830 *indx = COMPUTE_INDEX(hash, 831 mac_srs->srs_tcp_ring_count); 832 *type = OTH; 833 break; 834 835 case IPPROTO_UDP: 836 case IPPROTO_SCTP: 837 case IPPROTO_ESP: 838 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 839 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 840 *(uint32_t *)whereptr); 841 *indx = COMPUTE_INDEX(hash, 842 mac_srs->srs_udp_ring_count); 843 } else { 844 *indx = mac_srs->srs_ind % 845 mac_srs->srs_udp_ring_count; 846 mac_srs->srs_ind++; 847 } 848 *type = OTH; 849 break; 850 851 /* For all other protocol, do source based fanout */ 852 default: 853 goto src_based_fanout; 854 } 855 } else { 856 *indx = 0; 857 *type = OTH; 858 } 859 return (0); 860 861 src_based_fanout: 862 hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 863 *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 864 *type = OTH; 865 return (0); 866 } 867 868 /* 869 * mac_rx_srs_fanout 870 * 871 * This routine delivers packets destined to an SRS into a soft ring member 872 * of the set. 873 * 874 * Given a chain of packets we need to split it up into multiple sub chains 875 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 876 * the soft ring one packet at a time, we want to enter it in the form of a 877 * chain otherwise we get this start/stop behaviour where the worker thread 878 * goes to sleep and then next packets comes in forcing it to wake up etc. 879 * 880 * Note: 881 * Since we know what is the maximum fanout possible, we create a 2D array 882 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 883 * variables so that we can enter the softrings with chain. We need the 884 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 885 * for each packet would be expensive). If we ever want to have the 886 * ability to have unlimited fanout, we should probably declare a head, 887 * tail, cnt, sz with each soft ring (a data struct which contains a softring 888 * along with these members) and create an array of this uber struct so we 889 * don't have to do kmem_alloc. 
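 *
 * (Hypothetical sketch of that idea, added for illustration; the type
 * and field names below do not exist in this file:
 *
 *	typedef struct mac_fanout_slot_s {
 *		mac_soft_ring_t	*fs_softring;
 *		mblk_t		*fs_head;
 *		mblk_t		*fs_tail;
 *		int		fs_cnt;
 *		size_t		fs_sz;
 *	} mac_fanout_slot_t;
 *
 * One such slot per soft ring, allocated once along with the SRS,
 * would remove the MAX_SR_FANOUT sizing constraint on the on-stack
 * arrays used below.)
 *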
890 */ 891 int fanout_oth1 = 0; 892 int fanout_oth2 = 0; 893 int fanout_oth3 = 0; 894 int fanout_oth4 = 0; 895 int fanout_oth5 = 0; 896 897 static void 898 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 899 { 900 struct ether_header *ehp; 901 struct ether_vlan_header *evhp; 902 uint32_t sap; 903 ipha_t *ipha; 904 uint8_t *dstaddr; 905 uint_t indx; 906 size_t ports_offset; 907 size_t ipha_len; 908 size_t hdrsize; 909 uint_t hash; 910 mblk_t *mp; 911 mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 912 mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 913 int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; 914 size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; 915 size_t sz1; 916 boolean_t bw_ctl; 917 boolean_t hw_classified; 918 boolean_t dls_bypass; 919 boolean_t is_ether; 920 boolean_t is_unicast; 921 int fanout_cnt; 922 enum pkt_type type; 923 mac_client_impl_t *mcip = mac_srs->srs_mcip; 924 925 is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 926 bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 927 928 /* 929 * If we don't have a Rx ring, S/W classification would have done 930 * its job and its a packet meant for us. If we were polling on 931 * the default ring (i.e. there was a ring assigned to this SRS), 932 * then we need to make sure that the mac address really belongs 933 * to us. 934 */ 935 hw_classified = mac_srs->srs_ring != NULL && 936 mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 937 938 /* 939 * Special clients (eg. VLAN, non ether, etc) need DLS 940 * processing in the Rx path. SRST_DLS_BYPASS will be clear for 941 * such SRSs. Another way of disabling bypass is to set the 942 * MCIS_RX_BYPASS_DISABLE flag. 943 */ 944 dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 945 ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 946 947 /* 948 * Since the softrings are never destroyed and we always 949 * create equal number of softrings for TCP, UDP and rest, 950 * its OK to check one of them for count and use it without 951 * any lock. In future, if soft rings get destroyed because 952 * of reduction in fanout, we will need to ensure that happens 953 * behind the SRS_PROC. 954 */ 955 fanout_cnt = mac_srs->srs_tcp_ring_count; 956 957 bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 958 bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 959 bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); 960 bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); 961 962 /* 963 * We got a chain from SRS that we need to send to the soft rings. 964 * Since squeues for TCP & IPv4 sap poll their soft rings (for 965 * performance reasons), we need to separate out v4_tcp, v4_udp 966 * and the rest goes in other. 967 */ 968 while (head != NULL) { 969 mp = head; 970 head = head->b_next; 971 mp->b_next = NULL; 972 973 type = OTH; 974 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 975 976 if (is_ether) { 977 /* 978 * At this point we can be sure the packet at least 979 * has an ether header. 980 */ 981 if (sz1 < sizeof (struct ether_header)) { 982 mac_rx_drop_pkt(mac_srs, mp); 983 continue; 984 } 985 ehp = (struct ether_header *)mp->b_rptr; 986 987 /* 988 * Determine if this is a VLAN or non-VLAN packet. 989 */ 990 if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 991 evhp = (struct ether_vlan_header *)mp->b_rptr; 992 sap = ntohs(evhp->ether_type); 993 hdrsize = sizeof (struct ether_vlan_header); 994 /* 995 * Check if the VID of the packet, if any, 996 * belongs to this client. 
997 */ 998 if (!mac_client_check_flow_vid(mcip, 999 VLAN_ID(ntohs(evhp->ether_tci)))) { 1000 mac_rx_drop_pkt(mac_srs, mp); 1001 continue; 1002 } 1003 } else { 1004 hdrsize = sizeof (struct ether_header); 1005 } 1006 is_unicast = 1007 ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 1008 dstaddr = (uint8_t *)&ehp->ether_dhost; 1009 } else { 1010 mac_header_info_t mhi; 1011 1012 if (mac_header_info((mac_handle_t)mcip->mci_mip, 1013 mp, &mhi) != 0) { 1014 mac_rx_drop_pkt(mac_srs, mp); 1015 continue; 1016 } 1017 hdrsize = mhi.mhi_hdrsize; 1018 sap = mhi.mhi_bindsap; 1019 is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 1020 dstaddr = (uint8_t *)mhi.mhi_daddr; 1021 } 1022 1023 if (!dls_bypass) { 1024 if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 1025 hdrsize, &type, &indx) == -1) { 1026 mac_rx_drop_pkt(mac_srs, mp); 1027 continue; 1028 } 1029 1030 FANOUT_ENQUEUE_MP(headmp[type][indx], 1031 tailmp[type][indx], cnt[type][indx], bw_ctl, 1032 sz[type][indx], sz1, mp); 1033 continue; 1034 } 1035 1036 1037 /* 1038 * If we are using the default Rx ring where H/W or S/W 1039 * classification has not happened, we need to verify if 1040 * this unicast packet really belongs to us. 1041 */ 1042 if (sap == ETHERTYPE_IP) { 1043 /* 1044 * If we are H/W classified, but we have promisc 1045 * on, then we need to check for the unicast address. 1046 */ 1047 if (hw_classified && mcip->mci_promisc_list != NULL) { 1048 mac_address_t *map; 1049 1050 rw_enter(&mcip->mci_rw_lock, RW_READER); 1051 map = mcip->mci_unicast; 1052 if (bcmp(dstaddr, map->ma_addr, 1053 map->ma_len) == 0) 1054 type = UNDEF; 1055 rw_exit(&mcip->mci_rw_lock); 1056 } else if (is_unicast) { 1057 type = UNDEF; 1058 } 1059 } 1060 1061 /* 1062 * This needs to become a contract with the driver for 1063 * the fast path. 1064 */ 1065 1066 ipha = (ipha_t *)(mp->b_rptr + hdrsize); 1067 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { 1068 type = OTH; 1069 fanout_oth1++; 1070 } 1071 1072 if (type != OTH) { 1073 uint16_t frag_offset_flags; 1074 1075 switch (ipha->ipha_protocol) { 1076 case IPPROTO_TCP: 1077 case IPPROTO_UDP: 1078 case IPPROTO_SCTP: 1079 case IPPROTO_ESP: 1080 ipha_len = IPH_HDR_LENGTH(ipha); 1081 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > 1082 mp->b_wptr) { 1083 type = OTH; 1084 break; 1085 } 1086 frag_offset_flags = 1087 ntohs(ipha->ipha_fragment_offset_and_flags); 1088 if ((frag_offset_flags & 1089 (IPH_MF | IPH_OFFSET)) != 0) { 1090 type = OTH; 1091 fanout_oth3++; 1092 break; 1093 } 1094 ports_offset = hdrsize + ipha_len; 1095 break; 1096 default: 1097 type = OTH; 1098 fanout_oth4++; 1099 break; 1100 } 1101 } 1102 1103 if (type == OTH) { 1104 if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 1105 hdrsize, &type, &indx) == -1) { 1106 mac_rx_drop_pkt(mac_srs, mp); 1107 continue; 1108 } 1109 1110 FANOUT_ENQUEUE_MP(headmp[type][indx], 1111 tailmp[type][indx], cnt[type][indx], bw_ctl, 1112 sz[type][indx], sz1, mp); 1113 continue; 1114 } 1115 1116 ASSERT(type == UNDEF); 1117 1118 /* 1119 * XXX-Sunay: We should hold srs_lock since ring_count 1120 * below can change. But if we are always called from 1121 * mac_rx_srs_drain and SRS_PROC is set, then we can 1122 * enforce that ring_count can't be changed i.e. 1123 * to change fanout type or ring count, the calling 1124 * thread needs to be behind SRS_PROC. 1125 */ 1126 switch (ipha->ipha_protocol) { 1127 case IPPROTO_TCP: 1128 /* 1129 * Note that for ESP, we fanout on SPI and it is at the 1130 * same offset as the 2x16-bit ports. So it is clumped 1131 * along with TCP, UDP and SCTP. 
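 *
 * (Layout note added for clarity, not part of the original comment:
 * the first four bytes of each of these transport headers are
 *
 *	TCP/UDP/SCTP:	[ 16-bit src port ][ 16-bit dst port ]
 *	ESP:		[             32-bit SPI             ]
 *
 * so the single 32-bit load at ports_offset works for all of them,
 * hashing on the port pair or on the SPI respectively.)
 *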
1132 */ 1133 hash = HASH_ADDR(ipha->ipha_src, 1134 *(uint32_t *)(mp->b_rptr + ports_offset)); 1135 indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 1136 type = V4_TCP; 1137 mp->b_rptr += hdrsize; 1138 break; 1139 case IPPROTO_UDP: 1140 case IPPROTO_SCTP: 1141 case IPPROTO_ESP: 1142 if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 1143 hash = HASH_ADDR(ipha->ipha_src, 1144 *(uint32_t *)(mp->b_rptr + ports_offset)); 1145 indx = COMPUTE_INDEX(hash, 1146 mac_srs->srs_udp_ring_count); 1147 } else { 1148 indx = mac_srs->srs_ind % 1149 mac_srs->srs_udp_ring_count; 1150 mac_srs->srs_ind++; 1151 } 1152 type = V4_UDP; 1153 mp->b_rptr += hdrsize; 1154 break; 1155 default: 1156 indx = 0; 1157 type = OTH; 1158 } 1159 1160 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 1161 cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 1162 } 1163 1164 for (type = V4_TCP; type < UNDEF; type++) { 1165 int i; 1166 1167 for (i = 0; i < fanout_cnt; i++) { 1168 if (headmp[type][i] != NULL) { 1169 mac_soft_ring_t *softring; 1170 1171 ASSERT(tailmp[type][i]->b_next == NULL); 1172 switch (type) { 1173 case V4_TCP: 1174 softring = 1175 mac_srs->srs_tcp_soft_rings[i]; 1176 break; 1177 case V4_UDP: 1178 softring = 1179 mac_srs->srs_udp_soft_rings[i]; 1180 break; 1181 case OTH: 1182 softring = 1183 mac_srs->srs_oth_soft_rings[i]; 1184 break; 1185 } 1186 mac_rx_soft_ring_process(mcip, 1187 softring, headmp[type][i], tailmp[type][i], 1188 cnt[type][i], sz[type][i]); 1189 } 1190 } 1191 } 1192 } 1193 1194 #define SRS_BYTES_TO_PICKUP 150000 1195 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 1196 1197 /* 1198 * mac_rx_srs_poll_ring 1199 * 1200 * This SRS Poll thread uses this routine to poll the underlying hardware 1201 * Rx ring to get a chain of packets. It can inline process that chain 1202 * if mac_latency_optimize is set (default) or signal the SRS worker thread 1203 * to do the remaining processing. 1204 * 1205 * Since packets come in the system via interrupt or poll path, we also 1206 * update the stats and deal with promiscous clients here. 1207 */ 1208 void 1209 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 1210 { 1211 kmutex_t *lock = &mac_srs->srs_lock; 1212 kcondvar_t *async = &mac_srs->srs_cv; 1213 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1214 mblk_t *head, *tail, *mp; 1215 callb_cpr_t cprinfo; 1216 ssize_t bytes_to_pickup; 1217 size_t sz; 1218 int count; 1219 mac_client_impl_t *smcip; 1220 1221 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 1222 mutex_enter(lock); 1223 1224 start: 1225 for (;;) { 1226 if (mac_srs->srs_state & SRS_PAUSE) 1227 goto done; 1228 1229 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1230 cv_wait(async, lock); 1231 CALLB_CPR_SAFE_END(&cprinfo, lock); 1232 1233 if (mac_srs->srs_state & SRS_PAUSE) 1234 goto done; 1235 1236 check_again: 1237 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1238 /* 1239 * We pick as many bytes as we are allowed to queue. 1240 * Its possible that we will exceed the total 1241 * packets queued in case this SRS is part of the 1242 * Rx ring group since > 1 poll thread can be pulling 1243 * upto the max allowed packets at the same time 1244 * but that should be OK. 1245 */ 1246 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1247 bytes_to_pickup = 1248 mac_srs->srs_bw->mac_bw_drop_threshold - 1249 mac_srs->srs_bw->mac_bw_sz; 1250 /* 1251 * We shouldn't have been signalled if we 1252 * have 0 or less bytes to pick but since 1253 * some of the bytes accounting is driver 1254 * dependant, we do the safety check. 
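 *
 * (Example added for clarity, not part of the original comment: a
 * driver's poll entry point hands back whole packets, so it can
 * overshoot the byte count it was asked for, e.g. it is asked for
 * 1,000 bytes but the next frame on the ring is 1,500 bytes. That
 * can push mac_bw_sz past mac_bw_drop_threshold and make the
 * difference computed above negative for a moment.)
 *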
1255 */ 1256 if (bytes_to_pickup < 0) 1257 bytes_to_pickup = 0; 1258 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1259 } else { 1260 /* 1261 * ToDO: Need to change the polling API 1262 * to add a packet count and a flag which 1263 * tells the driver whether we want packets 1264 * based on a count, or bytes, or all the 1265 * packets queued in the driver/HW. This 1266 * way, we never have to check the limits 1267 * on poll path. We truly let only as many 1268 * packets enter the system as we are willing 1269 * to process or queue. 1270 * 1271 * Something along the lines of 1272 * pkts_to_pickup = mac_soft_ring_max_q_cnt - 1273 * mac_srs->srs_poll_pkt_cnt 1274 */ 1275 1276 /* 1277 * Since we are not doing B/W control, pick 1278 * as many packets as allowed. 1279 */ 1280 bytes_to_pickup = max_bytes_to_pickup; 1281 } 1282 1283 /* Poll the underlying Hardware */ 1284 mutex_exit(lock); 1285 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 1286 mutex_enter(lock); 1287 1288 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 1289 SRS_POLL_THR_OWNER); 1290 1291 mp = tail = head; 1292 count = 0; 1293 sz = 0; 1294 while (mp != NULL) { 1295 tail = mp; 1296 sz += msgdsize(mp); 1297 mp = mp->b_next; 1298 count++; 1299 } 1300 1301 if (head != NULL) { 1302 tail->b_next = NULL; 1303 smcip = mac_srs->srs_mcip; 1304 1305 if ((mac_srs->srs_type & SRST_FLOW) || 1306 (smcip == NULL)) { 1307 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1308 rbytes, sz); 1309 FLOW_STAT_UPDATE(mac_srs->srs_flent, 1310 ipackets, count); 1311 } 1312 1313 /* 1314 * If there are any promiscuous mode callbacks 1315 * defined for this MAC client, pass them a copy 1316 * if appropriate and also update the counters. 1317 */ 1318 if (smcip != NULL) { 1319 smcip->mci_stat_ibytes += sz; 1320 smcip->mci_stat_ipackets += count; 1321 1322 if (smcip->mci_mip->mi_promisc_list != NULL) { 1323 mutex_exit(lock); 1324 mac_promisc_dispatch(smcip->mci_mip, 1325 head, NULL); 1326 mutex_enter(lock); 1327 } 1328 } 1329 if (mac_srs->srs_type & SRST_BW_CONTROL) { 1330 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1331 mac_srs->srs_bw->mac_bw_polled += sz; 1332 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1333 } 1334 srs_rx->sr_poll_count += count; 1335 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 1336 count, sz); 1337 if (count <= 10) 1338 srs_rx->sr_chain_cnt_undr10++; 1339 else if (count > 10 && count <= 50) 1340 srs_rx->sr_chain_cnt_10to50++; 1341 else 1342 srs_rx->sr_chain_cnt_over50++; 1343 } 1344 1345 /* 1346 * We are guaranteed that SRS_PROC will be set if we 1347 * are here. Also, poll thread gets to run only if 1348 * the drain was being done by a worker thread although 1349 * its possible that worker thread is still running 1350 * and poll thread was sent down to keep the pipeline 1351 * going instead of doing a complete drain and then 1352 * trying to poll the NIC. 1353 * 1354 * So we need to check SRS_WORKER flag to make sure 1355 * that the worker thread is not processing the queue 1356 * in parallel to us. The flags and conditions are 1357 * protected by the srs_lock to prevent any race. We 1358 * ensure that we don't drop the srs_lock from now 1359 * till the end and similarly we don't drop the srs_lock 1360 * in mac_rx_srs_drain() till similar condition check 1361 * are complete. The mac_rx_srs_drain() needs to ensure 1362 * that SRS_WORKER flag remains set as long as its 1363 * processing the queue. 
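 *
 * (Summary added for orientation, not part of the original comment,
 * restating the ownership rules described above:
 *
 *	SRS_PROC	set by whichever thread is draining the SRS
 *			(worker, poll or interrupt); one drainer at a
 *			time.
 *	SRS_WORKER	set only while the worker thread is the drainer.
 *	SRS_GET_PKTS	set while the poll thread has been asked to, or
 *			is, pulling packets from the H/W ring.
 *
 * The code below only lets the poll thread take over the drain when
 * SRS_WORKER is clear and there is a backlog, all checked under
 * srs_lock.)
 *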
1364 */ 1365 if (!(mac_srs->srs_state & SRS_WORKER) && 1366 (mac_srs->srs_first != NULL)) { 1367 /* 1368 * We have packets to process and worker thread 1369 * is not running. Check to see if poll thread is 1370 * allowed to process. 1371 */ 1372 if (mac_srs->srs_state & SRS_LATENCY_OPT) { 1373 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 1374 if (!(mac_srs->srs_state & SRS_PAUSE) && 1375 srs_rx->sr_poll_pkt_cnt <= 1376 srs_rx->sr_lowat) { 1377 srs_rx->sr_poll_again++; 1378 goto check_again; 1379 } 1380 /* 1381 * We are already above low water mark 1382 * so stay in the polling mode but no 1383 * need to poll. Once we dip below 1384 * the polling threshold, the processing 1385 * thread (soft ring) will signal us 1386 * to poll again (MAC_UPDATE_SRS_COUNT) 1387 */ 1388 srs_rx->sr_poll_drain_no_poll++; 1389 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1390 /* 1391 * In B/W control case, its possible 1392 * that the backlog built up due to 1393 * B/W limit being reached and packets 1394 * are queued only in SRS. In this case, 1395 * we should schedule worker thread 1396 * since no one else will wake us up. 1397 */ 1398 if ((mac_srs->srs_type & SRST_BW_CONTROL) && 1399 (mac_srs->srs_tid == NULL)) { 1400 mac_srs->srs_tid = 1401 timeout(mac_srs_fire, mac_srs, 1); 1402 srs_rx->sr_poll_worker_wakeup++; 1403 } 1404 } else { 1405 /* 1406 * Wakeup the worker thread for more processing. 1407 * We optimize for throughput in this case. 1408 */ 1409 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 1410 MAC_SRS_WORKER_WAKEUP(mac_srs); 1411 srs_rx->sr_poll_sig_worker++; 1412 } 1413 } else if ((mac_srs->srs_first == NULL) && 1414 !(mac_srs->srs_state & SRS_WORKER)) { 1415 /* 1416 * There is nothing queued in SRS and 1417 * no worker thread running. Plus we 1418 * didn't get anything from the H/W 1419 * as well (head == NULL); 1420 */ 1421 ASSERT(head == NULL); 1422 mac_srs->srs_state &= 1423 ~(SRS_PROC|SRS_GET_PKTS); 1424 1425 /* 1426 * If we have a packets in soft ring, don't allow 1427 * more packets to come into this SRS by keeping the 1428 * interrupts off but not polling the H/W. The 1429 * poll thread will get signaled as soon as 1430 * srs_poll_pkt_cnt dips below poll threshold. 1431 */ 1432 if (srs_rx->sr_poll_pkt_cnt == 0) { 1433 srs_rx->sr_poll_intr_enable++; 1434 MAC_SRS_POLLING_OFF(mac_srs); 1435 } else { 1436 /* 1437 * We know nothing is queued in SRS 1438 * since we are here after checking 1439 * srs_first is NULL. The backlog 1440 * is entirely due to packets queued 1441 * in Soft ring which will wake us up 1442 * and get the interface out of polling 1443 * mode once the backlog dips below 1444 * sr_poll_thres. 1445 */ 1446 srs_rx->sr_poll_no_poll++; 1447 } 1448 } else { 1449 /* 1450 * Worker thread is already running. 1451 * Nothing much to do. If the polling 1452 * was enabled, worker thread will deal 1453 * with that. 1454 */ 1455 mac_srs->srs_state &= ~SRS_GET_PKTS; 1456 srs_rx->sr_poll_goto_sleep++; 1457 } 1458 } 1459 done: 1460 mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 1461 cv_signal(&mac_srs->srs_async); 1462 /* 1463 * If this is a temporary quiesce then wait for the restart signal 1464 * from the srs worker. Then clear the flags and signal the srs worker 1465 * to ensure a positive handshake and go back to start. 
1466 */ 1467 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 1468 cv_wait(async, lock); 1469 if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 1470 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 1471 mac_srs->srs_state &= 1472 ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 1473 cv_signal(&mac_srs->srs_async); 1474 goto start; 1475 } else { 1476 mac_srs->srs_state |= SRS_POLL_THR_EXITED; 1477 cv_signal(&mac_srs->srs_async); 1478 CALLB_CPR_EXIT(&cprinfo); 1479 thread_exit(); 1480 } 1481 } 1482 1483 /* 1484 * mac_srs_pick_chain 1485 * 1486 * In Bandwidth control case, checks how many packets can be processed 1487 * and return them in a sub chain. 1488 */ 1489 static mblk_t * 1490 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 1491 size_t *chain_sz, int *chain_cnt) 1492 { 1493 mblk_t *head = NULL; 1494 mblk_t *tail = NULL; 1495 size_t sz; 1496 size_t tsz = 0; 1497 int cnt = 0; 1498 mblk_t *mp; 1499 1500 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1501 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1502 if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 1503 mac_srs->srs_bw->mac_bw_limit) || 1504 (mac_srs->srs_bw->mac_bw_limit == 0)) { 1505 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1506 head = mac_srs->srs_first; 1507 mac_srs->srs_first = NULL; 1508 *chain_tail = mac_srs->srs_last; 1509 mac_srs->srs_last = NULL; 1510 *chain_sz = mac_srs->srs_size; 1511 *chain_cnt = mac_srs->srs_count; 1512 mac_srs->srs_count = 0; 1513 mac_srs->srs_size = 0; 1514 return (head); 1515 } 1516 1517 /* 1518 * Can't clear the entire backlog. 1519 * Need to find how many packets to pick 1520 */ 1521 ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 1522 while ((mp = mac_srs->srs_first) != NULL) { 1523 sz = msgdsize(mp); 1524 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 1525 mac_srs->srs_bw->mac_bw_limit) { 1526 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 1527 mac_srs->srs_bw->mac_bw_state |= 1528 SRS_BW_ENFORCED; 1529 break; 1530 } 1531 1532 /* 1533 * The _size & cnt is decremented from the softrings 1534 * when they send up the packet for polling to work 1535 * properly. 1536 */ 1537 tsz += sz; 1538 cnt++; 1539 mac_srs->srs_count--; 1540 mac_srs->srs_size -= sz; 1541 if (tail != NULL) 1542 tail->b_next = mp; 1543 else 1544 head = mp; 1545 tail = mp; 1546 mac_srs->srs_first = mac_srs->srs_first->b_next; 1547 } 1548 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1549 if (mac_srs->srs_first == NULL) 1550 mac_srs->srs_last = NULL; 1551 1552 if (tail != NULL) 1553 tail->b_next = NULL; 1554 *chain_tail = tail; 1555 *chain_cnt = cnt; 1556 *chain_sz = tsz; 1557 1558 return (head); 1559 } 1560 1561 /* 1562 * mac_rx_srs_drain 1563 * 1564 * The SRS drain routine. Gets to run to clear the queue. Any thread 1565 * (worker, interrupt, poll) can call this based on processing model. 1566 * The first thing we do is disable interrupts if possible and then 1567 * drain the queue. we also try to poll the underlying hardware if 1568 * there is a dedicated hardware Rx ring assigned to this SRS. 1569 * 1570 * There is a equivalent drain routine in bandwidth control mode 1571 * mac_rx_srs_drain_bw. There is some code duplication between the two 1572 * routines but they are highly performance sensitive and are easier 1573 * to read/debug if they stay separate. Any code changes here might 1574 * also apply to mac_rx_srs_drain_bw as well. 
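 *
 * (Outline added for orientation, not part of the original comment;
 * the steps the routine below performs, in order:
 *
 *	1. bail out if the SRS is blanked/paused or nothing is queued;
 *	2. if not latency-optimized and the backlog is at or below
 *	   sr_lowat, signal the poll thread to keep the pipeline fed;
 *	3. unlink the whole chain from the SRS under srs_lock;
 *	4. dispatch promiscuous callbacks if any are registered;
 *	5. either call the client's receive routine directly
 *	   (SRST_NO_SOFT_RINGS) or fan the chain out to the soft rings;
 *	6. re-check for new arrivals and either drain again, signal the
 *	   worker, or switch polling on/off before returning.)
 *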
1575 */ 1576 void 1577 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1578 { 1579 mblk_t *head; 1580 mblk_t *tail; 1581 timeout_id_t tid; 1582 int cnt = 0; 1583 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1584 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1585 1586 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1587 ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); 1588 1589 /* If we are blanked i.e. can't do upcalls, then we are done */ 1590 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1591 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1592 (mac_srs->srs_state & SRS_PAUSE)); 1593 goto out; 1594 } 1595 1596 if (mac_srs->srs_first == NULL) 1597 goto out; 1598 1599 if (!(mac_srs->srs_state & SRS_LATENCY_OPT) && 1600 (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) { 1601 /* 1602 * In the normal case, the SRS worker thread does no 1603 * work and we wait for a backlog to build up before 1604 * we switch into polling mode. In case we are 1605 * optimizing for throughput, we use the worker thread 1606 * as well. The goal is to let worker thread process 1607 * the queue and poll thread to feed packets into 1608 * the queue. As such, we should signal the poll 1609 * thread to try and get more packets. 1610 * 1611 * We could have pulled this check in the POLL_RING 1612 * macro itself but keeping it explicit here makes 1613 * the architecture more human understandable. 1614 */ 1615 MAC_SRS_POLL_RING(mac_srs); 1616 } 1617 1618 again: 1619 head = mac_srs->srs_first; 1620 mac_srs->srs_first = NULL; 1621 tail = mac_srs->srs_last; 1622 mac_srs->srs_last = NULL; 1623 cnt = mac_srs->srs_count; 1624 mac_srs->srs_count = 0; 1625 1626 ASSERT(head != NULL); 1627 ASSERT(tail != NULL); 1628 1629 if ((tid = mac_srs->srs_tid) != 0) 1630 mac_srs->srs_tid = 0; 1631 1632 mac_srs->srs_state |= (SRS_PROC|proc_type); 1633 1634 1635 /* 1636 * mcip is NULL for broadcast and multicast flows. The promisc 1637 * callbacks for broadcast and multicast packets are delivered from 1638 * mac_rx() and we don't need to worry about that case in this path 1639 */ 1640 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1641 mutex_exit(&mac_srs->srs_lock); 1642 mac_promisc_client_dispatch(mcip, head); 1643 mutex_enter(&mac_srs->srs_lock); 1644 } 1645 1646 /* 1647 * Check if SRS itself is doing the processing 1648 * This direct path does not apply when subflows are present. In this 1649 * case, packets need to be dispatched to a soft ring according to the 1650 * flow's bandwidth and other resources contraints. 1651 */ 1652 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1653 mac_direct_rx_t proc; 1654 void *arg1; 1655 mac_resource_handle_t arg2; 1656 1657 /* 1658 * This is the case when a Rx is directly 1659 * assigned and we have a fully classified 1660 * protocol chain. We can deal with it in 1661 * one shot. 1662 */ 1663 proc = srs_rx->sr_func; 1664 arg1 = srs_rx->sr_arg1; 1665 arg2 = srs_rx->sr_arg2; 1666 1667 mac_srs->srs_state |= SRS_CLIENT_PROC; 1668 mutex_exit(&mac_srs->srs_lock); 1669 if (tid != 0) { 1670 (void) untimeout(tid); 1671 tid = 0; 1672 } 1673 1674 proc(arg1, arg2, head, NULL); 1675 /* 1676 * Decrement the size and count here itelf 1677 * since the packet has been processed. 
1678 */ 1679 mutex_enter(&mac_srs->srs_lock); 1680 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1681 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1682 cv_signal(&mac_srs->srs_client_cv); 1683 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1684 } else { 1685 /* Some kind of softrings based fanout is required */ 1686 mutex_exit(&mac_srs->srs_lock); 1687 if (tid != 0) { 1688 (void) untimeout(tid); 1689 tid = 0; 1690 } 1691 1692 /* 1693 * Since the fanout routines can deal with chains, 1694 * shoot the entire chain up. 1695 */ 1696 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1697 mac_rx_srs_fanout(mac_srs, head); 1698 else 1699 mac_rx_srs_proto_fanout(mac_srs, head); 1700 mutex_enter(&mac_srs->srs_lock); 1701 } 1702 1703 if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) && 1704 (mac_srs->srs_first != NULL)) { 1705 /* 1706 * More packets arrived while we were clearing the 1707 * SRS. This can be possible because of one of 1708 * three conditions below: 1709 * 1) The driver is using multiple worker threads 1710 * to send the packets to us. 1711 * 2) The driver has a race in switching 1712 * between interrupt and polling mode or 1713 * 3) Packets are arriving in this SRS via the 1714 * S/W classification as well. 1715 * 1716 * We should switch to polling mode and see if we 1717 * need to send the poll thread down. Also, signal 1718 * the worker thread to process whats just arrived. 1719 */ 1720 MAC_SRS_POLLING_ON(mac_srs); 1721 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { 1722 srs_rx->sr_drain_poll_sig++; 1723 MAC_SRS_POLL_RING(mac_srs); 1724 } 1725 1726 /* 1727 * If we didn't signal the poll thread, we need 1728 * to deal with the pending packets ourselves. 1729 */ 1730 if (proc_type == SRS_WORKER) { 1731 srs_rx->sr_drain_again++; 1732 goto again; 1733 } else { 1734 srs_rx->sr_drain_worker_sig++; 1735 cv_signal(&mac_srs->srs_async); 1736 } 1737 } 1738 1739 out: 1740 if (mac_srs->srs_state & SRS_GET_PKTS) { 1741 /* 1742 * Poll thread is already running. Leave the 1743 * SRS_RPOC set and hand over the control to 1744 * poll thread. 1745 */ 1746 mac_srs->srs_state &= ~proc_type; 1747 srs_rx->sr_drain_poll_running++; 1748 return; 1749 } 1750 1751 /* 1752 * Even if there are no packets queued in SRS, we 1753 * need to make sure that the shared counter is 1754 * clear and any associated softrings have cleared 1755 * all the backlog. Otherwise, leave the interface 1756 * in polling mode and the poll thread will get 1757 * signalled once the count goes down to zero. 1758 * 1759 * If someone is already draining the queue (SRS_PROC is 1760 * set) when the srs_poll_pkt_cnt goes down to zero, 1761 * then it means that drain is already running and we 1762 * will turn off polling at that time if there is 1763 * no backlog. 1764 * 1765 * As long as there are packets queued either 1766 * in soft ring set or its soft rings, we will leave 1767 * the interface in polling mode (even if the drain 1768 * was done being the interrupt thread). We signal 1769 * the poll thread as well if we have dipped below 1770 * low water mark. 1771 * 1772 * NOTE: We can't use the MAC_SRS_POLLING_ON macro 1773 * since that turn polling on only for worker thread. 1774 * Its not worth turning polling on for interrupt 1775 * thread (since NIC will not issue another interrupt) 1776 * unless a backlog builds up. 
1777 */ 1778 if ((srs_rx->sr_poll_pkt_cnt > 0) && 1779 (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 1780 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1781 srs_rx->sr_drain_keep_polling++; 1782 MAC_SRS_POLLING_ON(mac_srs); 1783 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 1784 MAC_SRS_POLL_RING(mac_srs); 1785 return; 1786 } 1787 1788 /* Nothing else to do. Get out of poll mode */ 1789 MAC_SRS_POLLING_OFF(mac_srs); 1790 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 1791 srs_rx->sr_drain_finish_intr++; 1792 } 1793 1794 /* 1795 * mac_rx_srs_drain_bw 1796 * 1797 * The SRS BW drain routine. Gets to run to clear the queue. Any thread 1798 * (worker, interrupt, poll) can call this based on processing model. 1799 * The first thing we do is disable interrupts if possible and then 1800 * drain the queue. we also try to poll the underlying hardware if 1801 * there is a dedicated hardware Rx ring assigned to this SRS. 1802 * 1803 * There is a equivalent drain routine in non bandwidth control mode 1804 * mac_rx_srs_drain. There is some code duplication between the two 1805 * routines but they are highly performance sensitive and are easier 1806 * to read/debug if they stay separate. Any code changes here might 1807 * also apply to mac_rx_srs_drain as well. 1808 */ 1809 void 1810 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 1811 { 1812 mblk_t *head; 1813 mblk_t *tail; 1814 timeout_id_t tid; 1815 size_t sz = 0; 1816 int cnt = 0; 1817 mac_client_impl_t *mcip = mac_srs->srs_mcip; 1818 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1819 clock_t now; 1820 1821 ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 1822 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 1823 again: 1824 /* Check if we are doing B/W control */ 1825 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1826 now = ddi_get_lbolt(); 1827 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 1828 mac_srs->srs_bw->mac_bw_curr_time = now; 1829 mac_srs->srs_bw->mac_bw_used = 0; 1830 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 1831 mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 1832 } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 1833 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1834 goto done; 1835 } else if (mac_srs->srs_bw->mac_bw_used > 1836 mac_srs->srs_bw->mac_bw_limit) { 1837 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 1838 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1839 goto done; 1840 } 1841 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1842 1843 /* If we are blanked i.e. can't do upcalls, then we are done */ 1844 if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 1845 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 1846 (mac_srs->srs_state & SRS_PAUSE)); 1847 goto done; 1848 } 1849 1850 sz = 0; 1851 cnt = 0; 1852 if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 1853 /* 1854 * We couldn't pick up a single packet. 1855 */ 1856 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1857 if ((mac_srs->srs_bw->mac_bw_used == 0) && 1858 (mac_srs->srs_size != 0) && 1859 !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1860 /* 1861 * Seems like configured B/W doesn't 1862 * even allow processing of 1 packet 1863 * per tick. 1864 * 1865 * XXX: raise the limit to processing 1866 * at least 1 packet per tick. 
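 *
 * (Worked example added for clarity, not part of the original comment:
 * with hz = 1000, a 10 Mbit/s limit is 1,250,000 bytes/s / 1000 =
 * 1,250 bytes per tick, which is less than one 1,500 byte MTU frame,
 * so mac_srs_pick_chain() can never return a packet. The code below
 * works around this by doubling mac_bw_limit and mac_bw_drop_threshold
 * whenever it detects the situation.)
 *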
1867 */ 1868 mac_srs->srs_bw->mac_bw_limit += 1869 mac_srs->srs_bw->mac_bw_limit; 1870 mac_srs->srs_bw->mac_bw_drop_threshold += 1871 mac_srs->srs_bw->mac_bw_drop_threshold; 1872 cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 1873 "raised B/W limit to %d since not even a " 1874 "single packet can be processed per " 1875 "tick %d\n", (void *)mac_srs, 1876 (int)mac_srs->srs_bw->mac_bw_limit, 1877 (int)msgdsize(mac_srs->srs_first)); 1878 } 1879 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1880 goto done; 1881 } 1882 1883 ASSERT(head != NULL); 1884 ASSERT(tail != NULL); 1885 1886 /* zero bandwidth: drop all and return to interrupt mode */ 1887 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1888 if (mac_srs->srs_bw->mac_bw_limit == 0) { 1889 srs_rx->sr_drop_count += cnt; 1890 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 1891 mac_srs->srs_bw->mac_bw_sz -= sz; 1892 mac_srs->srs_bw->mac_bw_drop_bytes += sz; 1893 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1894 mac_pkt_drop(NULL, NULL, head, B_FALSE); 1895 goto leave_poll; 1896 } else { 1897 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1898 } 1899 1900 if ((tid = mac_srs->srs_tid) != 0) 1901 mac_srs->srs_tid = 0; 1902 1903 mac_srs->srs_state |= (SRS_PROC|proc_type); 1904 MAC_SRS_WORKER_POLLING_ON(mac_srs); 1905 1906 /* 1907 * mcip is NULL for broadcast and multicast flows. The promisc 1908 * callbacks for broadcast and multicast packets are delivered from 1909 * mac_rx() and we don't need to worry about that case in this path 1910 */ 1911 if (mcip != NULL && mcip->mci_promisc_list != NULL) { 1912 mutex_exit(&mac_srs->srs_lock); 1913 mac_promisc_client_dispatch(mcip, head); 1914 mutex_enter(&mac_srs->srs_lock); 1915 } 1916 1917 /* 1918 * Check if SRS itself is doing the processing 1919 * This direct path does not apply when subflows are present. In this 1920 * case, packets need to be dispatched to a soft ring according to the 1921 * flow's bandwidth and other resources contraints. 1922 */ 1923 if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 1924 mac_direct_rx_t proc; 1925 void *arg1; 1926 mac_resource_handle_t arg2; 1927 1928 /* 1929 * This is the case when a Rx is directly 1930 * assigned and we have a fully classified 1931 * protocol chain. We can deal with it in 1932 * one shot. 1933 */ 1934 proc = srs_rx->sr_func; 1935 arg1 = srs_rx->sr_arg1; 1936 arg2 = srs_rx->sr_arg2; 1937 1938 mac_srs->srs_state |= SRS_CLIENT_PROC; 1939 mutex_exit(&mac_srs->srs_lock); 1940 if (tid != 0) { 1941 (void) untimeout(tid); 1942 tid = 0; 1943 } 1944 1945 proc(arg1, arg2, head, NULL); 1946 /* 1947 * Decrement the size and count here itelf 1948 * since the packet has been processed. 1949 */ 1950 mutex_enter(&mac_srs->srs_lock); 1951 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 1952 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 1953 1954 if (mac_srs->srs_state & SRS_CLIENT_WAIT) 1955 cv_signal(&mac_srs->srs_client_cv); 1956 mac_srs->srs_state &= ~SRS_CLIENT_PROC; 1957 } else { 1958 /* Some kind of softrings based fanout is required */ 1959 mutex_exit(&mac_srs->srs_lock); 1960 if (tid != 0) { 1961 (void) untimeout(tid); 1962 tid = 0; 1963 } 1964 1965 /* 1966 * Since the fanout routines can deal with chains, 1967 * shoot the entire chain up. 1968 */ 1969 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 1970 mac_rx_srs_fanout(mac_srs, head); 1971 else 1972 mac_rx_srs_proto_fanout(mac_srs, head); 1973 mutex_enter(&mac_srs->srs_lock); 1974 } 1975 1976 /* 1977 * Send the poll thread to pick up any packets arrived 1978 * so far. 
This also serves as the last check in case 1979 * nothing else is queued in the SRS. The poll thread 1980 * is signalled only in the case the drain was done 1981 * by the worker thread and SRS_WORKER is set. The 1982 * worker thread can run in parallel as long as the 1983 * SRS_WORKER flag is set. We we have nothing else to 1984 * process, we can exit while leaving SRS_PROC set 1985 * which gives the poll thread control to process and 1986 * cleanup once it returns from the NIC. 1987 * 1988 * If we have nothing else to process, we need to 1989 * ensure that we keep holding the srs_lock till 1990 * all the checks below are done and control is 1991 * handed to the poll thread if it was running. 1992 */ 1993 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1994 if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 1995 if (mac_srs->srs_first != NULL) { 1996 if (proc_type == SRS_WORKER) { 1997 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 1998 if (srs_rx->sr_poll_pkt_cnt <= 1999 srs_rx->sr_lowat) 2000 MAC_SRS_POLL_RING(mac_srs); 2001 goto again; 2002 } else { 2003 cv_signal(&mac_srs->srs_async); 2004 } 2005 } 2006 } 2007 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2008 2009 done: 2010 2011 if (mac_srs->srs_state & SRS_GET_PKTS) { 2012 /* 2013 * Poll thread is already running. Leave the 2014 * SRS_RPOC set and hand over the control to 2015 * poll thread. 2016 */ 2017 mac_srs->srs_state &= ~proc_type; 2018 return; 2019 } 2020 2021 /* 2022 * If we can't process packets because we have exceeded 2023 * B/W limit for this tick, just set the timeout 2024 * and leave. 2025 * 2026 * Even if there are no packets queued in SRS, we 2027 * need to make sure that the shared counter is 2028 * clear and any associated softrings have cleared 2029 * all the backlog. Otherwise, leave the interface 2030 * in polling mode and the poll thread will get 2031 * signalled once the count goes down to zero. 2032 * 2033 * If someone is already draining the queue (SRS_PROC is 2034 * set) when the srs_poll_pkt_cnt goes down to zero, 2035 * then it means that drain is already running and we 2036 * will turn off polling at that time if there is 2037 * no backlog. As long as there are packets queued either 2038 * is soft ring set or its soft rings, we will leave 2039 * the interface in polling mode. 2040 */ 2041 mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 2042 if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 2043 ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 2044 (srs_rx->sr_poll_pkt_cnt > 0))) { 2045 MAC_SRS_POLLING_ON(mac_srs); 2046 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2047 if ((mac_srs->srs_first != NULL) && 2048 (mac_srs->srs_tid == NULL)) 2049 mac_srs->srs_tid = timeout(mac_srs_fire, 2050 mac_srs, 1); 2051 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2052 return; 2053 } 2054 mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 2055 2056 leave_poll: 2057 2058 /* Nothing else to do. Get out of poll mode */ 2059 MAC_SRS_POLLING_OFF(mac_srs); 2060 mac_srs->srs_state &= ~(SRS_PROC|proc_type); 2061 } 2062 2063 /* 2064 * mac_srs_worker 2065 * 2066 * The SRS worker routine. Drains the queue when no one else is 2067 * processing it. 
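 *
 * The loop below sleeps on srs_async and, whenever there is work
 * queued and no other thread holds SRS_PROC, calls
 * srs_drain_func(mac_srs, SRS_WORKER). If the SRS is under
 * bandwidth control and SRS_BW_ENFORCED is set, a one-tick
 * timeout (mac_srs_fire) is armed instead so the quota is
 * re-evaluated on the next tick. An SRS_PAUSE request breaks out
 * of the loop and runs the quiesce protocol at the 'done' label.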
2068 */ 2069 void 2070 mac_srs_worker(mac_soft_ring_set_t *mac_srs) 2071 { 2072 kmutex_t *lock = &mac_srs->srs_lock; 2073 kcondvar_t *async = &mac_srs->srs_async; 2074 callb_cpr_t cprinfo; 2075 boolean_t bw_ctl_flag; 2076 2077 CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); 2078 mutex_enter(lock); 2079 2080 start: 2081 for (;;) { 2082 bw_ctl_flag = B_FALSE; 2083 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2084 MAC_SRS_BW_LOCK(mac_srs); 2085 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2086 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 2087 bw_ctl_flag = B_TRUE; 2088 MAC_SRS_BW_UNLOCK(mac_srs); 2089 } 2090 /* 2091 * The SRS_BW_ENFORCED flag may change since we have dropped 2092 * the mac_bw_lock. However the drain function can handle both 2093 * a drainable SRS or a bandwidth controlled SRS, and the 2094 * effect of scheduling a timeout is to wakeup the worker 2095 * thread which in turn will call the drain function. Since 2096 * we release the srs_lock atomically only in the cv_wait there 2097 * isn't a fear of waiting for ever. 2098 */ 2099 while (((mac_srs->srs_state & SRS_PROC) || 2100 (mac_srs->srs_first == NULL) || bw_ctl_flag || 2101 (mac_srs->srs_state & SRS_TX_BLOCKED)) && 2102 !(mac_srs->srs_state & SRS_PAUSE)) { 2103 /* 2104 * If we have packets queued and we are here 2105 * because B/W control is in place, we better 2106 * schedule the worker wakeup after 1 tick 2107 * to see if bandwidth control can be relaxed. 2108 */ 2109 if (bw_ctl_flag && mac_srs->srs_tid == NULL) { 2110 /* 2111 * We need to ensure that a timer is already 2112 * scheduled or we force schedule one for 2113 * later so that we can continue processing 2114 * after this quanta is over. 2115 */ 2116 mac_srs->srs_tid = timeout(mac_srs_fire, 2117 mac_srs, 1); 2118 } 2119 wait: 2120 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2121 cv_wait(async, lock); 2122 CALLB_CPR_SAFE_END(&cprinfo, lock); 2123 2124 if (mac_srs->srs_state & SRS_PAUSE) 2125 goto done; 2126 if (mac_srs->srs_state & SRS_PROC) 2127 goto wait; 2128 2129 if (mac_srs->srs_first != NULL && 2130 mac_srs->srs_type & SRST_BW_CONTROL) { 2131 MAC_SRS_BW_LOCK(mac_srs); 2132 if (mac_srs->srs_bw->mac_bw_state & 2133 SRS_BW_ENFORCED) { 2134 MAC_SRS_CHECK_BW_CONTROL(mac_srs); 2135 } 2136 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & 2137 SRS_BW_ENFORCED; 2138 MAC_SRS_BW_UNLOCK(mac_srs); 2139 } 2140 } 2141 2142 if (mac_srs->srs_state & SRS_PAUSE) 2143 goto done; 2144 mac_srs->srs_drain_func(mac_srs, SRS_WORKER); 2145 } 2146 done: 2147 /* 2148 * The Rx SRS quiesce logic first cuts off packet supply to the SRS 2149 * from both hard and soft classifications and waits for such threads 2150 * to finish before signaling the worker. So at this point the only 2151 * thread left that could be competing with the worker is the poll 2152 * thread. In the case of Tx, there shouldn't be any thread holding 2153 * SRS_PROC at this point. 
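 *
 * The worker therefore claims SRS_PROC below if it is free (on
 * the Rx side only the poll thread may legitimately still own
 * it), quiesces, and then parks until the initiator posts
 * SRS_RESTART or SRS_CONDEMNED on srs_async.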
2154 */ 2155 if (!(mac_srs->srs_state & SRS_PROC)) { 2156 mac_srs->srs_state |= SRS_PROC; 2157 } else { 2158 ASSERT((mac_srs->srs_type & SRST_TX) == 0); 2159 /* 2160 * Poll thread still owns the SRS and is still running 2161 */ 2162 ASSERT((mac_srs->srs_poll_thr == NULL) || 2163 ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 2164 SRS_POLL_THR_OWNER)); 2165 } 2166 mac_srs_worker_quiesce(mac_srs); 2167 /* 2168 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator 2169 * of the quiesce operation 2170 */ 2171 while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) 2172 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); 2173 2174 if (mac_srs->srs_state & SRS_RESTART) { 2175 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 2176 mac_srs_worker_restart(mac_srs); 2177 mac_srs->srs_state &= ~SRS_PROC; 2178 goto start; 2179 } 2180 2181 if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) 2182 mac_srs_worker_quiesce(mac_srs); 2183 2184 mac_srs->srs_state &= ~SRS_PROC; 2185 /* The macro drops the srs_lock */ 2186 CALLB_CPR_EXIT(&cprinfo); 2187 thread_exit(); 2188 } 2189 2190 /* 2191 * mac_rx_srs_subflow_process 2192 * 2193 * Receive side routine called from interrupt path when there are 2194 * sub flows present on this SRS. 2195 */ 2196 /* ARGSUSED */ 2197 void 2198 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, 2199 mblk_t *mp_chain, boolean_t loopback) 2200 { 2201 flow_entry_t *flent = NULL; 2202 flow_entry_t *prev_flent = NULL; 2203 mblk_t *mp = NULL; 2204 mblk_t *tail = NULL; 2205 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2206 mac_client_impl_t *mcip; 2207 2208 mcip = mac_srs->srs_mcip; 2209 ASSERT(mcip != NULL); 2210 2211 /* 2212 * We need to determine the SRS for every packet 2213 * by walking the flow table, if we don't get any, 2214 * then we proceed using the SRS we came with. 2215 */ 2216 mp = tail = mp_chain; 2217 while (mp != NULL) { 2218 2219 /* 2220 * We will increment the stats for the mactching subflow. 2221 * when we get the bytes/pkt count for the classified packets 2222 * later in mac_rx_srs_process. 2223 */ 2224 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, 2225 FLOW_INBOUND, &flent); 2226 2227 if (mp == mp_chain || flent == prev_flent) { 2228 if (prev_flent != NULL) 2229 FLOW_REFRELE(prev_flent); 2230 prev_flent = flent; 2231 flent = NULL; 2232 tail = mp; 2233 mp = mp->b_next; 2234 continue; 2235 } 2236 tail->b_next = NULL; 2237 /* 2238 * A null indicates, this is for the mac_srs itself. 2239 * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 2240 */ 2241 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2242 mac_rx_srs_process(arg, 2243 (mac_resource_handle_t)mac_srs, mp_chain, 2244 loopback); 2245 } else { 2246 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2247 prev_flent->fe_cb_arg2, mp_chain, loopback); 2248 FLOW_REFRELE(prev_flent); 2249 } 2250 prev_flent = flent; 2251 flent = NULL; 2252 mp_chain = mp; 2253 tail = mp; 2254 mp = mp->b_next; 2255 } 2256 /* Last chain */ 2257 ASSERT(mp_chain != NULL); 2258 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) { 2259 mac_rx_srs_process(arg, 2260 (mac_resource_handle_t)mac_srs, mp_chain, loopback); 2261 } else { 2262 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1, 2263 prev_flent->fe_cb_arg2, mp_chain, loopback); 2264 FLOW_REFRELE(prev_flent); 2265 } 2266 } 2267 2268 /* 2269 * mac_rx_srs_process 2270 * 2271 * Receive side routine called from the interrupt path. 2272 * 2273 * loopback is set to force a context switch on the loopback 2274 * path between MAC clients. 
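 *
 * The chain is first counted and the flow/client byte and packet
 * stats are updated. Under bandwidth control the chain is queued
 * for the worker while it fits under mac_bw_drop_threshold and
 * the excess is dropped; without bandwidth control the whole
 * chain is dropped once sr_poll_pkt_cnt exceeds sr_hiwat.
 * Otherwise the chain is enqueued and either the worker is
 * signalled or, in the latency optimized case, it is drained
 * inline via srs_drain_func.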
2275 */ 2276 /* ARGSUSED */ 2277 void 2278 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, 2279 boolean_t loopback) 2280 { 2281 mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; 2282 mblk_t *mp, *tail, *head; 2283 int count = 0; 2284 int count1; 2285 size_t sz = 0; 2286 size_t chain_sz, sz1; 2287 mac_bw_ctl_t *mac_bw; 2288 mac_client_impl_t *smcip; 2289 mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 2290 2291 /* 2292 * Set the tail, count and sz. We set the sz irrespective 2293 * of whether we are doing B/W control or not for the 2294 * purpose of updating the stats. 2295 */ 2296 mp = tail = mp_chain; 2297 while (mp != NULL) { 2298 tail = mp; 2299 count++; 2300 sz += msgdsize(mp); 2301 mp = mp->b_next; 2302 } 2303 2304 mutex_enter(&mac_srs->srs_lock); 2305 smcip = mac_srs->srs_mcip; 2306 2307 if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) { 2308 FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz); 2309 FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count); 2310 } 2311 if (smcip != NULL) { 2312 smcip->mci_stat_ibytes += sz; 2313 smcip->mci_stat_ipackets += count; 2314 } 2315 2316 /* 2317 * If the SRS in already being processed; has been blanked; 2318 * can be processed by worker thread only; or the B/W limit 2319 * has been reached, then queue the chain and check if 2320 * worker thread needs to be awakend. 2321 */ 2322 if (mac_srs->srs_type & SRST_BW_CONTROL) { 2323 mac_bw = mac_srs->srs_bw; 2324 ASSERT(mac_bw != NULL); 2325 mutex_enter(&mac_bw->mac_bw_lock); 2326 /* Count the packets and bytes via interrupt */ 2327 srs_rx->sr_intr_count += count; 2328 mac_bw->mac_bw_intr += sz; 2329 if (mac_bw->mac_bw_limit == 0) { 2330 /* zero bandwidth: drop all */ 2331 srs_rx->sr_drop_count += count; 2332 mac_bw->mac_bw_drop_bytes += sz; 2333 mutex_exit(&mac_bw->mac_bw_lock); 2334 mutex_exit(&mac_srs->srs_lock); 2335 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 2336 return; 2337 } else { 2338 if ((mac_bw->mac_bw_sz + sz) <= 2339 mac_bw->mac_bw_drop_threshold) { 2340 mutex_exit(&mac_bw->mac_bw_lock); 2341 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, 2342 tail, count, sz); 2343 } else { 2344 mp = mp_chain; 2345 chain_sz = 0; 2346 count1 = 0; 2347 tail = NULL; 2348 head = NULL; 2349 while (mp != NULL) { 2350 sz1 = msgdsize(mp); 2351 if (mac_bw->mac_bw_sz + chain_sz + sz1 > 2352 mac_bw->mac_bw_drop_threshold) 2353 break; 2354 chain_sz += sz1; 2355 count1++; 2356 tail = mp; 2357 mp = mp->b_next; 2358 } 2359 mutex_exit(&mac_bw->mac_bw_lock); 2360 if (tail != NULL) { 2361 head = tail->b_next; 2362 tail->b_next = NULL; 2363 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, 2364 mp_chain, tail, count1, chain_sz); 2365 sz -= chain_sz; 2366 count -= count1; 2367 } else { 2368 /* Can't pick up any */ 2369 head = mp_chain; 2370 } 2371 if (head != NULL) { 2372 /* Drop any packet over the threshold */ 2373 srs_rx->sr_drop_count += count; 2374 mutex_enter(&mac_bw->mac_bw_lock); 2375 mac_bw->mac_bw_drop_bytes += sz; 2376 mutex_exit(&mac_bw->mac_bw_lock); 2377 freemsgchain(head); 2378 } 2379 } 2380 MAC_SRS_WORKER_WAKEUP(mac_srs); 2381 mutex_exit(&mac_srs->srs_lock); 2382 return; 2383 } 2384 } 2385 2386 /* 2387 * If the total number of packets queued in the SRS and 2388 * its associated soft rings exceeds the max allowed, 2389 * then drop the chain. If we are polling capable, this 2390 * shouldn't be happening. 
2391 */ 2392 if (!(mac_srs->srs_type & SRST_BW_CONTROL) && 2393 (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) { 2394 mac_bw = mac_srs->srs_bw; 2395 srs_rx->sr_drop_count += count; 2396 mutex_enter(&mac_bw->mac_bw_lock); 2397 mac_bw->mac_bw_drop_bytes += sz; 2398 mutex_exit(&mac_bw->mac_bw_lock); 2399 freemsgchain(mp_chain); 2400 mutex_exit(&mac_srs->srs_lock); 2401 return; 2402 } 2403 2404 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz); 2405 /* Count the packets entering via interrupt path */ 2406 srs_rx->sr_intr_count += count; 2407 2408 if (!(mac_srs->srs_state & SRS_PROC)) { 2409 /* 2410 * If we are coming via loopback or if we are not 2411 * optimizing for latency, we should signal the 2412 * worker thread. 2413 */ 2414 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) { 2415 /* 2416 * For loopback, We need to let the worker take 2417 * over as we don't want to continue in the same 2418 * thread even if we can. This could lead to stack 2419 * overflows and may also end up using 2420 * resources (cpu) incorrectly. 2421 */ 2422 cv_signal(&mac_srs->srs_async); 2423 } else { 2424 /* 2425 * Seems like no one is processing the SRS and 2426 * there is no backlog. We also inline process 2427 * our packet if its a single packet in non 2428 * latency optimized case (in latency optimized 2429 * case, we inline process chains of any size). 2430 */ 2431 mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST); 2432 } 2433 } 2434 mutex_exit(&mac_srs->srs_lock); 2435 } 2436 2437 /* TX SIDE ROUTINES (RUNTIME) */ 2438 2439 /* 2440 * mac_tx_srs_no_desc 2441 * 2442 * This routine is called by Tx single ring default mode 2443 * when Tx ring runs out of descs. 2444 */ 2445 mac_tx_cookie_t 2446 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2447 uint16_t flag, mblk_t **ret_mp) 2448 { 2449 mac_tx_cookie_t cookie = NULL; 2450 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2451 boolean_t wakeup_worker = B_TRUE; 2452 uint32_t tx_mode = srs_tx->st_mode; 2453 int cnt, sz; 2454 mblk_t *tail; 2455 2456 ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2457 if (flag & MAC_DROP_ON_NO_DESC) { 2458 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2459 } else { 2460 if (mac_srs->srs_first != NULL) 2461 wakeup_worker = B_FALSE; 2462 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2463 if (flag & MAC_TX_NO_ENQUEUE) { 2464 /* 2465 * If TX_QUEUED is not set, queue the 2466 * packet and let mac_tx_srs_drain() 2467 * set the TX_BLOCKED bit for the 2468 * reasons explained above. Otherwise, 2469 * return the mblks. 2470 */ 2471 if (wakeup_worker) { 2472 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2473 mp_chain, tail, cnt, sz); 2474 } else { 2475 MAC_TX_SET_NO_ENQUEUE(mac_srs, 2476 mp_chain, ret_mp, cookie); 2477 } 2478 } else { 2479 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2480 tail, cnt, sz, cookie); 2481 } 2482 if (wakeup_worker) 2483 cv_signal(&mac_srs->srs_async); 2484 } 2485 return (cookie); 2486 } 2487 2488 /* 2489 * mac_tx_srs_enqueue 2490 * 2491 * This routine is called when Tx SRS is operating in either serializer 2492 * or bandwidth mode. In serializer mode, a packet will get enqueued 2493 * when a thread cannot enter SRS exclusively. In bandwidth mode, 2494 * packets gets queued if allowed byte-count limit for a tick is 2495 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and 2496 * MAC_TX_NO_ENQUEUE is set is different than when operaing in either 2497 * the default mode or fanout mode. 
Here packets get dropped or 2498 * returned back to the caller only after hi-watermark worth of data 2499 * is queued. 2500 */ 2501 static mac_tx_cookie_t 2502 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2503 uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2504 { 2505 mac_tx_cookie_t cookie = NULL; 2506 int cnt, sz; 2507 mblk_t *tail; 2508 boolean_t wakeup_worker = B_TRUE; 2509 2510 /* 2511 * Ignore fanout hint if we don't have multiple tx rings. 2512 */ 2513 if (!TX_MULTI_RING_MODE(mac_srs)) 2514 fanout_hint = 0; 2515 2516 if (mac_srs->srs_first != NULL) 2517 wakeup_worker = B_FALSE; 2518 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2519 if (flag & MAC_DROP_ON_NO_DESC) { 2520 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2521 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2522 } else { 2523 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2524 mp_chain, tail, cnt, sz); 2525 } 2526 } else if (flag & MAC_TX_NO_ENQUEUE) { 2527 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2528 (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2529 MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2530 ret_mp, cookie); 2531 } else { 2532 mp_chain->b_prev = (mblk_t *)fanout_hint; 2533 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2534 mp_chain, tail, cnt, sz); 2535 } 2536 } else { 2537 /* 2538 * If you are BW_ENFORCED, just enqueue the 2539 * packet. srs_worker will drain it at the 2540 * prescribed rate. Before enqueueing, save 2541 * the fanout hint. 2542 */ 2543 mp_chain->b_prev = (mblk_t *)fanout_hint; 2544 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2545 tail, cnt, sz, cookie); 2546 } 2547 if (wakeup_worker) 2548 cv_signal(&mac_srs->srs_async); 2549 return (cookie); 2550 } 2551 2552 /* 2553 * There are five tx modes: 2554 * 2555 * 1) Default mode (SRS_TX_DEFAULT) 2556 * 2) Serialization mode (SRS_TX_SERIALIZE) 2557 * 3) Fanout mode (SRS_TX_FANOUT) 2558 * 4) Bandwdith mode (SRS_TX_BW) 2559 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2560 * 2561 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2562 * based on the number of Tx rings requested for an SRS and whether 2563 * bandwidth control is requested or not. 2564 * 2565 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a 2566 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying 2567 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS. 2568 * When flow-control is relieved, the srs_worker drains the queued 2569 * packets and informs blocked clients to restart sending packets. 2570 * 2571 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. 2572 * 2573 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2574 * Tx rings. Each Tx ring will have a soft ring associated with it. 2575 * These soft rings will be hung off the Tx SRS. Queueing if it happens 2576 * due to lack of Tx desc will be in individual soft ring (and not srs) 2577 * associated with Tx ring. 2578 * 2579 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2580 * only if bw is available. Otherwise the packets will be queued in 2581 * SRS. If fanout to multiple Tx rings is configured, the packets will 2582 * be fanned out among the soft rings associated with the Tx rings. 2583 * 2584 * Four flags are used in srs_state for indicating flow control 2585 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT. 2586 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the 2587 * driver below. 
2588 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat 2589 * and flow-control pressure is applied back to clients. The clients expect 2590 * wakeup when flow-control is relieved. 2591 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk 2592 * got returned back to client either due to lack of Tx descs or due to bw 2593 * control reasons. The clients expect a wakeup when condition is relieved. 2594 * 2595 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but 2596 * some clients set the following values too: MAC_DROP_ON_NO_DESC, 2597 * MAC_TX_NO_ENQUEUE 2598 * Mac clients that do not want packets to be enqueued in the mac layer set 2599 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or 2600 * Tx soft rings but instead get dropped when the NIC runs out of desc. The 2601 * behaviour of this flag is different when the Tx is running in serializer 2602 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet 2603 * get dropped when Tx high watermark is reached. 2604 * There are some mac clients like vsw, aggr that want the mblks to be 2605 * returned back to clients instead of being queued in Tx SRS (or Tx soft 2606 * rings) under flow-control (i.e., out of desc or exceeding bw limits) 2607 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set. 2608 * In the default and Tx fanout mode, the un-transmitted mblks will be 2609 * returned back to the clients when the driver runs out of Tx descs. 2610 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or 2611 * soft ring) so that the clients can be woken up when Tx desc become 2612 * available. When running in serializer or bandwidth mode mode, 2613 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached. 2614 */ 2615 2616 mac_tx_func_t 2617 mac_tx_get_func(uint32_t mode) 2618 { 2619 return (mac_tx_mode_list[mode].mac_tx_func); 2620 } 2621 2622 /* ARGSUSED */ 2623 static mac_tx_cookie_t 2624 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2625 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2626 { 2627 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2628 boolean_t is_subflow; 2629 mac_tx_stats_t stats; 2630 mac_tx_cookie_t cookie = NULL; 2631 2632 ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 2633 2634 /* Regular case with a single Tx ring */ 2635 /* 2636 * SRS_TX_BLOCKED is set when underlying NIC runs 2637 * out of Tx descs and messages start getting 2638 * queued. It won't get reset until 2639 * tx_srs_drain() completely drains out the 2640 * messages. 2641 */ 2642 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2643 /* Tx descs/resources not available */ 2644 mutex_enter(&mac_srs->srs_lock); 2645 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2646 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2647 flag, ret_mp); 2648 mutex_exit(&mac_srs->srs_lock); 2649 return (cookie); 2650 } 2651 /* 2652 * While we were computing mblk count, the 2653 * flow control condition got relieved. 2654 * Continue with the transmission. 2655 */ 2656 mutex_exit(&mac_srs->srs_lock); 2657 } 2658 2659 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2660 2661 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2662 mp_chain, (is_subflow ? &stats : NULL)); 2663 2664 /* 2665 * Multiple threads could be here sending packets. 2666 * Under such conditions, it is not possible to 2667 * automically set SRS_TX_BLOCKED bit to indicate 2668 * out of tx desc condition. 
To atomically set 2669 * this, we queue the returned packet and do 2670 * the setting of SRS_TX_BLOCKED in 2671 * mac_tx_srs_drain(). 2672 */ 2673 if (mp_chain != NULL) { 2674 mutex_enter(&mac_srs->srs_lock); 2675 cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2676 mutex_exit(&mac_srs->srs_lock); 2677 return (cookie); 2678 } 2679 2680 if (is_subflow) 2681 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2682 2683 return (NULL); 2684 } 2685 2686 /* 2687 * mac_tx_serialize_mode 2688 * 2689 * This is an experimental mode implemented as per the request of PAE. 2690 * In this mode, all callers attempting to send a packet to the NIC 2691 * will get serialized. Only one thread at any time will access the 2692 * NIC to send the packet out. 2693 */ 2694 /* ARGSUSED */ 2695 static mac_tx_cookie_t 2696 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2697 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2698 { 2699 boolean_t is_subflow; 2700 mac_tx_stats_t stats; 2701 mac_tx_cookie_t cookie = NULL; 2702 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2703 2704 /* Single ring, serialize below */ 2705 ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2706 mutex_enter(&mac_srs->srs_lock); 2707 if ((mac_srs->srs_first != NULL) || 2708 (mac_srs->srs_state & SRS_PROC)) { 2709 /* 2710 * In serialization mode, queue all packets until 2711 * TX_HIWAT is set. 2712 * If drop bit is set, drop if TX_HIWAT is set. 2713 * If no_enqueue is set, still enqueue until hiwat 2714 * is set and return mblks after TX_HIWAT is set. 2715 */ 2716 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2717 flag, NULL, ret_mp); 2718 mutex_exit(&mac_srs->srs_lock); 2719 return (cookie); 2720 } 2721 /* 2722 * No packets queued, nothing on proc and no flow 2723 * control condition. Fast-path, ok. Do inline 2724 * processing. 2725 */ 2726 mac_srs->srs_state |= SRS_PROC; 2727 mutex_exit(&mac_srs->srs_lock); 2728 2729 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2730 2731 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2732 mp_chain, (is_subflow ? &stats : NULL)); 2733 2734 mutex_enter(&mac_srs->srs_lock); 2735 mac_srs->srs_state &= ~SRS_PROC; 2736 if (mp_chain != NULL) { 2737 cookie = mac_tx_srs_enqueue(mac_srs, 2738 mp_chain, flag, NULL, ret_mp); 2739 } 2740 if (mac_srs->srs_first != NULL) { 2741 /* 2742 * We processed inline our packet and a new 2743 * packet/s got queued while we were 2744 * processing. Wakeup srs worker 2745 */ 2746 cv_signal(&mac_srs->srs_async); 2747 } 2748 mutex_exit(&mac_srs->srs_lock); 2749 2750 if (is_subflow && cookie == NULL) 2751 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2752 2753 return (cookie); 2754 } 2755 2756 /* 2757 * mac_tx_fanout_mode 2758 * 2759 * In this mode, the SRS will have access to multiple Tx rings to send 2760 * the packet out. The fanout hint that is passed as an argument is 2761 * used to find an appropriate ring to fanout the traffic. Each Tx 2762 * ring, in turn, will have a soft ring associated with it. If a Tx 2763 * ring runs out of Tx desc's the returned packet will be queued in 2764 * the soft ring associated with that Tx ring. The srs itself will not 2765 * queue any packets. 
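 *
 * Ring selection reduces to hashing the hint (or the packet
 * headers when no hint is given) and indexing into the soft ring
 * array, roughly:
 *
 *	hash = HASH_HINT(fanout_hint);
 *	index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 *	cookie = mac_tx_soft_ring_process(
 *	    mac_srs->srs_oth_soft_rings[index], mp_chain, flag, ret_mp);
 *
 * which is what the MAC_TX_SOFT_RING_PROCESS() macro below does.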
2766 */ 2767 2768 #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2769 index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 2770 softring = mac_srs->srs_oth_soft_rings[index]; \ 2771 cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2772 DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2773 } 2774 2775 static mac_tx_cookie_t 2776 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2777 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2778 { 2779 mac_soft_ring_t *softring; 2780 uint64_t hash; 2781 uint_t index; 2782 mac_tx_cookie_t cookie = NULL; 2783 2784 ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2785 if (fanout_hint != 0) { 2786 /* 2787 * The hint is specified by the caller, simply pass the 2788 * whole chain to the soft ring. 2789 */ 2790 hash = HASH_HINT(fanout_hint); 2791 MAC_TX_SOFT_RING_PROCESS(mp_chain); 2792 } else { 2793 mblk_t *last_mp, *cur_mp, *sub_chain; 2794 uint64_t last_hash = 0; 2795 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2796 2797 /* 2798 * Compute the hash from the contents (headers) of the 2799 * packets of the mblk chain. Split the chains into 2800 * subchains of the same conversation. 2801 * 2802 * Since there may be more than one ring used for 2803 * sub-chains of the same call, and since the caller 2804 * does not maintain per conversation state since it 2805 * passed a zero hint, unsent subchains will be 2806 * dropped. 2807 */ 2808 2809 flag |= MAC_DROP_ON_NO_DESC; 2810 ret_mp = NULL; 2811 2812 ASSERT(ret_mp == NULL); 2813 2814 sub_chain = NULL; 2815 last_mp = NULL; 2816 2817 for (cur_mp = mp_chain; cur_mp != NULL; 2818 cur_mp = cur_mp->b_next) { 2819 hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2820 B_TRUE); 2821 if (last_hash != 0 && hash != last_hash) { 2822 /* 2823 * Starting a different subchain, send current 2824 * chain out. 2825 */ 2826 ASSERT(last_mp != NULL); 2827 last_mp->b_next = NULL; 2828 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2829 sub_chain = NULL; 2830 } 2831 2832 /* add packet to subchain */ 2833 if (sub_chain == NULL) 2834 sub_chain = cur_mp; 2835 last_mp = cur_mp; 2836 last_hash = hash; 2837 } 2838 2839 if (sub_chain != NULL) { 2840 /* send last subchain */ 2841 ASSERT(last_mp != NULL); 2842 last_mp->b_next = NULL; 2843 MAC_TX_SOFT_RING_PROCESS(sub_chain); 2844 } 2845 2846 cookie = NULL; 2847 } 2848 2849 return (cookie); 2850 } 2851 2852 /* 2853 * mac_tx_bw_mode 2854 * 2855 * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2856 * only if bw is available. Otherwise the packets will be queued in 2857 * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2858 * out to a Tx rings. 2859 */ 2860 static mac_tx_cookie_t 2861 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2862 uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2863 { 2864 int cnt, sz; 2865 mblk_t *tail; 2866 mac_tx_cookie_t cookie = NULL; 2867 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2868 clock_t now; 2869 2870 ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2871 ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2872 mutex_enter(&mac_srs->srs_lock); 2873 if (mac_srs->srs_bw->mac_bw_limit == 0) { 2874 /* 2875 * zero bandwidth, no traffic is sent: drop the packets, 2876 * or return the whole chain if the caller requests all 2877 * unsent packets back. 
2878 */ 2879 if (flag & MAC_TX_NO_ENQUEUE) { 2880 cookie = (mac_tx_cookie_t)mac_srs; 2881 *ret_mp = mp_chain; 2882 } else { 2883 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2884 } 2885 mutex_exit(&mac_srs->srs_lock); 2886 return (cookie); 2887 } else if ((mac_srs->srs_first != NULL) || 2888 (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2889 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2890 fanout_hint, ret_mp); 2891 mutex_exit(&mac_srs->srs_lock); 2892 return (cookie); 2893 } 2894 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2895 now = ddi_get_lbolt(); 2896 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 2897 mac_srs->srs_bw->mac_bw_curr_time = now; 2898 mac_srs->srs_bw->mac_bw_used = 0; 2899 } else if (mac_srs->srs_bw->mac_bw_used > 2900 mac_srs->srs_bw->mac_bw_limit) { 2901 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2902 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2903 mp_chain, tail, cnt, sz); 2904 /* 2905 * Wakeup worker thread. Note that worker 2906 * thread has to be woken up so that it 2907 * can fire up the timer to be woken up 2908 * on the next tick. Also once 2909 * BW_ENFORCED is set, it can only be 2910 * reset by srs_worker thread. Until then 2911 * all packets will get queued up in SRS 2912 * and hence this this code path won't be 2913 * entered until BW_ENFORCED is reset. 2914 */ 2915 cv_signal(&mac_srs->srs_async); 2916 mutex_exit(&mac_srs->srs_lock); 2917 return (cookie); 2918 } 2919 2920 mac_srs->srs_bw->mac_bw_used += sz; 2921 mutex_exit(&mac_srs->srs_lock); 2922 2923 if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2924 mac_soft_ring_t *softring; 2925 uint_t indx, hash; 2926 2927 hash = HASH_HINT(fanout_hint); 2928 indx = COMPUTE_INDEX(hash, 2929 mac_srs->srs_oth_ring_count); 2930 softring = mac_srs->srs_oth_soft_rings[indx]; 2931 return (mac_tx_soft_ring_process(softring, mp_chain, flag, 2932 ret_mp)); 2933 } else { 2934 boolean_t is_subflow; 2935 mac_tx_stats_t stats; 2936 2937 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2938 2939 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2940 mp_chain, (is_subflow ? 
&stats : NULL)); 2941 2942 if (mp_chain != NULL) { 2943 mutex_enter(&mac_srs->srs_lock); 2944 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2945 if (mac_srs->srs_bw->mac_bw_used > sz) 2946 mac_srs->srs_bw->mac_bw_used -= sz; 2947 else 2948 mac_srs->srs_bw->mac_bw_used = 0; 2949 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2950 fanout_hint, ret_mp); 2951 mutex_exit(&mac_srs->srs_lock); 2952 return (cookie); 2953 } 2954 if (is_subflow) 2955 FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2956 2957 return (NULL); 2958 } 2959 } 2960 2961 /* ARGSUSED */ 2962 void 2963 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 2964 { 2965 mblk_t *head, *tail; 2966 size_t sz; 2967 uint32_t tx_mode; 2968 uint_t saved_pkt_count; 2969 boolean_t is_subflow; 2970 mac_tx_stats_t stats; 2971 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2972 clock_t now; 2973 2974 saved_pkt_count = 0; 2975 ASSERT(mutex_owned(&mac_srs->srs_lock)); 2976 ASSERT(!(mac_srs->srs_state & SRS_PROC)); 2977 2978 mac_srs->srs_state |= SRS_PROC; 2979 2980 is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2981 tx_mode = srs_tx->st_mode; 2982 if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 2983 if (mac_srs->srs_first != NULL) { 2984 head = mac_srs->srs_first; 2985 tail = mac_srs->srs_last; 2986 saved_pkt_count = mac_srs->srs_count; 2987 mac_srs->srs_first = NULL; 2988 mac_srs->srs_last = NULL; 2989 mac_srs->srs_count = 0; 2990 mutex_exit(&mac_srs->srs_lock); 2991 2992 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2993 head, &stats); 2994 2995 mutex_enter(&mac_srs->srs_lock); 2996 if (head != NULL) { 2997 /* Device out of tx desc, set block */ 2998 if (head->b_next == NULL) 2999 VERIFY(head == tail); 3000 tail->b_next = mac_srs->srs_first; 3001 mac_srs->srs_first = head; 3002 mac_srs->srs_count += 3003 (saved_pkt_count - stats.ts_opackets); 3004 if (mac_srs->srs_last == NULL) 3005 mac_srs->srs_last = tail; 3006 MAC_TX_SRS_BLOCK(mac_srs, head); 3007 } else { 3008 srs_tx->st_woken_up = B_FALSE; 3009 if (is_subflow) { 3010 FLOW_TX_STATS_UPDATE( 3011 mac_srs->srs_flent, &stats); 3012 } 3013 } 3014 } 3015 } else if (tx_mode == SRS_TX_BW) { 3016 /* 3017 * We are here because the timer fired and we have some data 3018 * to tranmit. 
Also mac_tx_srs_worker should have reset 3019 * SRS_BW_ENFORCED flag 3020 */ 3021 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 3022 head = tail = mac_srs->srs_first; 3023 while (mac_srs->srs_first != NULL) { 3024 tail = mac_srs->srs_first; 3025 tail->b_prev = NULL; 3026 mac_srs->srs_first = tail->b_next; 3027 if (mac_srs->srs_first == NULL) 3028 mac_srs->srs_last = NULL; 3029 mac_srs->srs_count--; 3030 sz = msgdsize(tail); 3031 mac_srs->srs_size -= sz; 3032 saved_pkt_count++; 3033 MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 3034 3035 if (mac_srs->srs_bw->mac_bw_used < 3036 mac_srs->srs_bw->mac_bw_limit) 3037 continue; 3038 3039 now = ddi_get_lbolt(); 3040 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3041 mac_srs->srs_bw->mac_bw_curr_time = now; 3042 mac_srs->srs_bw->mac_bw_used = sz; 3043 continue; 3044 } 3045 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3046 break; 3047 } 3048 3049 ASSERT((head == NULL && tail == NULL) || 3050 (head != NULL && tail != NULL)); 3051 if (tail != NULL) { 3052 tail->b_next = NULL; 3053 mutex_exit(&mac_srs->srs_lock); 3054 3055 head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 3056 head, &stats); 3057 3058 mutex_enter(&mac_srs->srs_lock); 3059 if (head != NULL) { 3060 uint_t size_sent; 3061 3062 /* Device out of tx desc, set block */ 3063 if (head->b_next == NULL) 3064 VERIFY(head == tail); 3065 tail->b_next = mac_srs->srs_first; 3066 mac_srs->srs_first = head; 3067 mac_srs->srs_count += 3068 (saved_pkt_count - stats.ts_opackets); 3069 if (mac_srs->srs_last == NULL) 3070 mac_srs->srs_last = tail; 3071 size_sent = sz - stats.ts_obytes; 3072 mac_srs->srs_size += size_sent; 3073 mac_srs->srs_bw->mac_bw_sz += size_sent; 3074 if (mac_srs->srs_bw->mac_bw_used > size_sent) { 3075 mac_srs->srs_bw->mac_bw_used -= 3076 size_sent; 3077 } else { 3078 mac_srs->srs_bw->mac_bw_used = 0; 3079 } 3080 MAC_TX_SRS_BLOCK(mac_srs, head); 3081 } else { 3082 srs_tx->st_woken_up = B_FALSE; 3083 if (is_subflow) { 3084 FLOW_TX_STATS_UPDATE( 3085 mac_srs->srs_flent, &stats); 3086 } 3087 } 3088 } 3089 } else if (tx_mode == SRS_TX_BW_FANOUT) { 3090 mblk_t *prev; 3091 mac_soft_ring_t *softring; 3092 uint64_t hint; 3093 3094 /* 3095 * We are here because the timer fired and we 3096 * have some quota to tranmit. 
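 *
 * The loop below peels packets off srs_first, charging
 * mac_bw_used as it goes, and uses the fanout hint that
 * mac_tx_srs_enqueue() stashed in b_prev to split the queue into
 * per-ring subchains that are handed to TX_SRS_TO_SOFT_RING().
 * It stops early and sets SRS_BW_ENFORCED once this tick's quota
 * is exhausted.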
3097 */ 3098 prev = NULL; 3099 head = tail = mac_srs->srs_first; 3100 while (mac_srs->srs_first != NULL) { 3101 tail = mac_srs->srs_first; 3102 mac_srs->srs_first = tail->b_next; 3103 if (mac_srs->srs_first == NULL) 3104 mac_srs->srs_last = NULL; 3105 mac_srs->srs_count--; 3106 sz = msgdsize(tail); 3107 mac_srs->srs_size -= sz; 3108 mac_srs->srs_bw->mac_bw_used += sz; 3109 if (prev == NULL) 3110 hint = (ulong_t)tail->b_prev; 3111 if (hint != (ulong_t)tail->b_prev) { 3112 prev->b_next = NULL; 3113 mutex_exit(&mac_srs->srs_lock); 3114 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3115 head = tail; 3116 hint = (ulong_t)tail->b_prev; 3117 mutex_enter(&mac_srs->srs_lock); 3118 } 3119 3120 prev = tail; 3121 tail->b_prev = NULL; 3122 if (mac_srs->srs_bw->mac_bw_used < 3123 mac_srs->srs_bw->mac_bw_limit) 3124 continue; 3125 3126 now = ddi_get_lbolt(); 3127 if (mac_srs->srs_bw->mac_bw_curr_time != now) { 3128 mac_srs->srs_bw->mac_bw_curr_time = now; 3129 mac_srs->srs_bw->mac_bw_used = 0; 3130 continue; 3131 } 3132 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3133 break; 3134 } 3135 ASSERT((head == NULL && tail == NULL) || 3136 (head != NULL && tail != NULL)); 3137 if (tail != NULL) { 3138 tail->b_next = NULL; 3139 mutex_exit(&mac_srs->srs_lock); 3140 TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3141 mutex_enter(&mac_srs->srs_lock); 3142 } 3143 } 3144 /* 3145 * SRS_TX_FANOUT case not considered here because packets 3146 * won't be queued in the SRS for this case. Packets will 3147 * be sent directly to soft rings underneath and if there 3148 * is any queueing at all, it would be in Tx side soft 3149 * rings. 3150 */ 3151 3152 /* 3153 * When srs_count becomes 0, reset SRS_TX_HIWAT and 3154 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3155 */ 3156 if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3157 (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3158 mac_tx_notify_cb_t *mtnfp; 3159 mac_cb_t *mcb; 3160 mac_client_impl_t *mcip = mac_srs->srs_mcip; 3161 boolean_t wakeup_required = B_FALSE; 3162 3163 if (mac_srs->srs_state & 3164 (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3165 wakeup_required = B_TRUE; 3166 } 3167 mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3168 SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3169 mutex_exit(&mac_srs->srs_lock); 3170 if (wakeup_required) { 3171 /* Wakeup callback registered clients */ 3172 MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3173 for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3174 mcb = mcb->mcb_nextp) { 3175 mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3176 mtnfp->mtnf_fn(mtnfp->mtnf_arg, 3177 (mac_tx_cookie_t)mac_srs); 3178 } 3179 MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3180 &mcip->mci_tx_notify_cb_list); 3181 /* 3182 * If the client is not the primary MAC client, then we 3183 * need to send the notification to the clients upper 3184 * MAC, i.e. mci_upper_mip. 3185 */ 3186 mac_tx_notify(mcip->mci_upper_mip != NULL ? 3187 mcip->mci_upper_mip : mcip->mci_mip); 3188 } 3189 mutex_enter(&mac_srs->srs_lock); 3190 } 3191 mac_srs->srs_state &= ~SRS_PROC; 3192 } 3193 3194 /* 3195 * Given a packet, get the flow_entry that identifies the flow 3196 * to which that packet belongs. The flow_entry will contain 3197 * the transmit function to be used to send the packet. If the 3198 * function returns NULL, the packet should be sent using the 3199 * underlying NIC. 
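 *
 * The returned flow_entry_t is refheld; the caller (the slow path
 * of mac_tx_send() below) must drop the hold with FLOW_REFRELE()
 * once the packet has been handed off.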
3200 */ 3201 static flow_entry_t * 3202 mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3203 { 3204 flow_entry_t *flent = NULL; 3205 mac_client_impl_t *mcip; 3206 int err; 3207 3208 /* 3209 * Do classification on the packet. 3210 */ 3211 err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3212 if (err != 0) 3213 return (NULL); 3214 3215 /* 3216 * This flent might just be an additional one on the MAC client, 3217 * i.e. for classification purposes (different fdesc), however 3218 * the resources, SRS et. al., are in the mci_flent, so if 3219 * this isn't the mci_flent, we need to get it. 3220 */ 3221 if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3222 FLOW_REFRELE(flent); 3223 flent = mcip->mci_flent; 3224 FLOW_TRY_REFHOLD(flent, err); 3225 if (err != 0) 3226 return (NULL); 3227 } 3228 3229 return (flent); 3230 } 3231 3232 /* 3233 * This macro is only meant to be used by mac_tx_send(). 3234 */ 3235 #define CHECK_VID_AND_ADD_TAG(mp) { \ 3236 if (vid_check) { \ 3237 int err = 0; \ 3238 \ 3239 MAC_VID_CHECK(src_mcip, (mp), err); \ 3240 if (err != 0) { \ 3241 freemsg((mp)); \ 3242 (mp) = next; \ 3243 oerrors++; \ 3244 continue; \ 3245 } \ 3246 } \ 3247 if (add_tag) { \ 3248 (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3249 if ((mp) == NULL) { \ 3250 (mp) = next; \ 3251 oerrors++; \ 3252 continue; \ 3253 } \ 3254 } \ 3255 } 3256 3257 mblk_t * 3258 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3259 mac_tx_stats_t *stats) 3260 { 3261 mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3262 mac_impl_t *mip = src_mcip->mci_mip; 3263 uint_t obytes = 0, opackets = 0, oerrors = 0; 3264 mblk_t *mp = NULL, *next; 3265 boolean_t vid_check, add_tag; 3266 uint16_t vid = 0; 3267 3268 if (mip->mi_nclients > 1) { 3269 vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3270 add_tag = MAC_TAG_NEEDED(src_mcip); 3271 if (add_tag) 3272 vid = mac_client_vid(mch); 3273 } else { 3274 ASSERT(mip->mi_nclients == 1); 3275 vid_check = add_tag = B_FALSE; 3276 } 3277 3278 /* 3279 * Fastpath: if there's only one client, and there's no 3280 * multicast listeners, we simply send the packet down to the 3281 * underlying NIC. 3282 */ 3283 if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 3284 DTRACE_PROBE2(fastpath, 3285 mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3286 3287 mp = mp_chain; 3288 while (mp != NULL) { 3289 next = mp->b_next; 3290 mp->b_next = NULL; 3291 opackets++; 3292 obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3293 msgdsize(mp)); 3294 3295 CHECK_VID_AND_ADD_TAG(mp); 3296 MAC_TX(mip, ring, mp, 3297 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3298 0)); 3299 3300 /* 3301 * If the driver is out of descriptors and does a 3302 * partial send it will return a chain of unsent 3303 * mblks. Adjust the accounting stats. 3304 */ 3305 if (mp != NULL) { 3306 opackets--; 3307 obytes -= msgdsize(mp); 3308 mp->b_next = next; 3309 break; 3310 } 3311 mp = next; 3312 } 3313 goto done; 3314 } 3315 3316 /* 3317 * No fastpath, we either have more than one MAC client 3318 * defined on top of the same MAC, or one or more MAC 3319 * client promiscuous callbacks. 3320 */ 3321 DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3322 src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3323 3324 mp = mp_chain; 3325 while (mp != NULL) { 3326 flow_entry_t *dst_flow_ent; 3327 void *flow_cookie; 3328 size_t pkt_size; 3329 mblk_t *mp1; 3330 3331 next = mp->b_next; 3332 mp->b_next = NULL; 3333 opackets++; 3334 pkt_size = (mp->b_cont == NULL ? 
MBLKL(mp) : msgdsize(mp)); 3335 obytes += pkt_size; 3336 CHECK_VID_AND_ADD_TAG(mp); 3337 3338 /* 3339 * Check if there are promiscuous mode callbacks defined. 3340 */ 3341 if (mip->mi_promisc_list != NULL) 3342 mac_promisc_dispatch(mip, mp, src_mcip); 3343 3344 /* 3345 * Find the destination. 3346 */ 3347 dst_flow_ent = mac_tx_classify(mip, mp); 3348 3349 if (dst_flow_ent != NULL) { 3350 size_t hdrsize; 3351 int err = 0; 3352 3353 if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3354 struct ether_vlan_header *evhp = 3355 (struct ether_vlan_header *)mp->b_rptr; 3356 3357 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3358 hdrsize = sizeof (*evhp); 3359 else 3360 hdrsize = sizeof (struct ether_header); 3361 } else { 3362 mac_header_info_t mhi; 3363 3364 err = mac_header_info((mac_handle_t)mip, 3365 mp, &mhi); 3366 if (err == 0) 3367 hdrsize = mhi.mhi_hdrsize; 3368 } 3369 3370 /* 3371 * Got a matching flow. It's either another 3372 * MAC client, or a broadcast/multicast flow. 3373 * Make sure the packet size is within the 3374 * allowed size. If not drop the packet and 3375 * move to next packet. 3376 */ 3377 if (err != 0 || 3378 (pkt_size - hdrsize) > mip->mi_sdu_max) { 3379 oerrors++; 3380 DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3381 mblk_t *, mp); 3382 freemsg(mp); 3383 mp = next; 3384 FLOW_REFRELE(dst_flow_ent); 3385 continue; 3386 } 3387 flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3388 if (flow_cookie != NULL) { 3389 /* 3390 * The vnic_bcast_send function expects 3391 * to receive the sender MAC client 3392 * as value for arg2. 3393 */ 3394 mac_bcast_send(flow_cookie, src_mcip, mp, 3395 B_TRUE); 3396 } else { 3397 /* 3398 * loopback the packet to a 3399 * local MAC client. We force a context 3400 * switch if both source and destination 3401 * MAC clients are used by IP, i.e. bypass 3402 * is set. 3403 */ 3404 boolean_t do_switch; 3405 mac_client_impl_t *dst_mcip = 3406 dst_flow_ent->fe_mcip; 3407 3408 do_switch = ((src_mcip->mci_state_flags & 3409 dst_mcip->mci_state_flags & 3410 MCIS_CLIENT_POLL_CAPABLE) != 0); 3411 3412 if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3413 (dst_flow_ent->fe_cb_fn)( 3414 dst_flow_ent->fe_cb_arg1, 3415 dst_flow_ent->fe_cb_arg2, 3416 mp1, do_switch); 3417 } 3418 } 3419 FLOW_REFRELE(dst_flow_ent); 3420 } else { 3421 /* 3422 * Unknown destination, send via the underlying 3423 * NIC. 3424 */ 3425 MAC_TX(mip, ring, mp, 3426 ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 3427 0)); 3428 if (mp != NULL) { 3429 /* 3430 * Adjust for the last packet that 3431 * could not be transmitted 3432 */ 3433 opackets--; 3434 obytes -= pkt_size; 3435 mp->b_next = next; 3436 break; 3437 } 3438 } 3439 mp = next; 3440 } 3441 3442 done: 3443 src_mcip->mci_stat_obytes += obytes; 3444 src_mcip->mci_stat_opackets += opackets; 3445 src_mcip->mci_stat_oerrors += oerrors; 3446 3447 if (stats != NULL) { 3448 stats->ts_opackets = opackets; 3449 stats->ts_obytes = obytes; 3450 stats->ts_oerrors = oerrors; 3451 } 3452 return (mp); 3453 } 3454 3455 /* 3456 * mac_tx_srs_ring_present 3457 * 3458 * Returns whether the specified ring is part of the specified SRS. 
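 *
 * The ring may either be the SRS's own default Tx ring (st_arg2)
 * or the hardware ring backing one of its fanout soft rings
 * (s_ring_tx_arg2).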
3459 */ 3460 boolean_t 3461 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3462 { 3463 int i; 3464 mac_soft_ring_t *soft_ring; 3465 3466 if (srs->srs_tx.st_arg2 == tx_ring) 3467 return (B_TRUE); 3468 3469 for (i = 0; i < srs->srs_oth_ring_count; i++) { 3470 soft_ring = srs->srs_oth_soft_rings[i]; 3471 if (soft_ring->s_ring_tx_arg2 == tx_ring) 3472 return (B_TRUE); 3473 } 3474 3475 return (B_FALSE); 3476 } 3477 3478 /* 3479 * mac_tx_srs_wakeup 3480 * 3481 * Called when Tx desc become available. Wakeup the appropriate worker 3482 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3483 * state field. 3484 */ 3485 void 3486 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3487 { 3488 int i; 3489 mac_soft_ring_t *sringp; 3490 mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3491 3492 mutex_enter(&mac_srs->srs_lock); 3493 if (TX_SINGLE_RING_MODE(mac_srs)) { 3494 if (srs_tx->st_arg2 == ring && 3495 mac_srs->srs_state & SRS_TX_BLOCKED) { 3496 mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3497 srs_tx->st_unblocked_cnt++; 3498 cv_signal(&mac_srs->srs_async); 3499 } 3500 /* 3501 * A wakeup can come before tx_srs_drain() could 3502 * grab srs lock and set SRS_TX_BLOCKED. So 3503 * always set woken_up flag when we come here. 3504 */ 3505 srs_tx->st_woken_up = B_TRUE; 3506 mutex_exit(&mac_srs->srs_lock); 3507 return; 3508 } 3509 3510 /* If you are here, it is for FANOUT or BW_FANOUT case */ 3511 ASSERT(TX_MULTI_RING_MODE(mac_srs)); 3512 for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3513 sringp = mac_srs->srs_oth_soft_rings[i]; 3514 mutex_enter(&sringp->s_ring_lock); 3515 if (sringp->s_ring_tx_arg2 == ring) { 3516 if (sringp->s_ring_state & S_RING_BLOCK) { 3517 sringp->s_ring_state &= ~S_RING_BLOCK; 3518 sringp->s_ring_unblocked_cnt++; 3519 cv_signal(&sringp->s_ring_async); 3520 } 3521 sringp->s_ring_tx_woken_up = B_TRUE; 3522 } 3523 mutex_exit(&sringp->s_ring_lock); 3524 } 3525 mutex_exit(&mac_srs->srs_lock); 3526 } 3527 3528 /* 3529 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3530 * the blocked clients again. 3531 */ 3532 void 3533 mac_tx_notify(mac_impl_t *mip) 3534 { 3535 i_mac_notify(mip, MAC_NOTE_TX); 3536 } 3537 3538 /* 3539 * RX SOFTRING RELATED FUNCTIONS 3540 * 3541 * These functions really belong in mac_soft_ring.c and here for 3542 * a short period. 3543 */ 3544 3545 #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3546 /* \ 3547 * Enqueue our mblk chain. \ 3548 */ \ 3549 ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 3550 \ 3551 if ((ringp)->s_ring_last != NULL) \ 3552 (ringp)->s_ring_last->b_next = (mp); \ 3553 else \ 3554 (ringp)->s_ring_first = (mp); \ 3555 (ringp)->s_ring_last = (tail); \ 3556 (ringp)->s_ring_count += (cnt); \ 3557 ASSERT((ringp)->s_ring_count > 0); \ 3558 if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 3559 (ringp)->s_ring_size += sz; \ 3560 } \ 3561 } 3562 3563 /* 3564 * Default entry point to deliver a packet chain to a MAC client. 3565 * If the MAC client has flows, do the classification with these 3566 * flows as well. 
3567 */ 3568 /* ARGSUSED */ 3569 void 3570 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 3571 mac_header_info_t *arg3) 3572 { 3573 mac_client_impl_t *mcip = arg1; 3574 3575 if (mcip->mci_nvids == 1 && 3576 !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) { 3577 /* 3578 * If the client has exactly one VID associated with it 3579 * and stripping of the VLAN header is not disabled, 3580 * remove the VLAN tag from the packet before 3581 * passing it on to the client's receive callback. 3582 * Note that this needs to be done after we dispatch 3583 * the packet to the promiscuous listeners of the 3584 * client, since they expect to see the whole 3585 * frame including the VLAN headers. 3586 */ 3587 mp_chain = mac_strip_vlan_tag_chain(mp_chain); 3588 } 3589 3590 mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 3591 } 3592 3593 /* 3594 * mac_rx_soft_ring_process 3595 * 3596 * Process a chain for a given soft ring. If the number of packets queued 3597 * in the SRS and its associated soft rings (including this one) is 3598 * very small (tracked by srs_poll_pkt_cnt), then allow the entering 3599 * thread (interrupt or poll thread) to do inline processing. This 3600 * helps keep the latency down under low load. 3601 * 3602 * The proc and arg for each mblk are already stored in the mblk in 3603 * appropriate places. 3604 */ 3605 /* ARGSUSED */ 3606 void 3607 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 3608 mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 3609 { 3610 mac_direct_rx_t proc; 3611 void *arg1; 3612 mac_resource_handle_t arg2; 3613 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3614 3615 ASSERT(ringp != NULL); 3616 ASSERT(mp_chain != NULL); 3617 ASSERT(tail != NULL); 3618 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3619 3620 mutex_enter(&ringp->s_ring_lock); 3621 ringp->s_ring_total_inpkt += cnt; 3622 if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 3623 !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { 3624 /* If on processor or blanking on, then enqueue and return */ 3625 if (ringp->s_ring_state & S_RING_BLANK || 3626 ringp->s_ring_state & S_RING_PROC) { 3627 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3628 mutex_exit(&ringp->s_ring_lock); 3629 return; 3630 } 3631 proc = ringp->s_ring_rx_func; 3632 arg1 = ringp->s_ring_rx_arg1; 3633 arg2 = ringp->s_ring_rx_arg2; 3634 /* 3635 * See if anything is already queued. If we are the 3636 * first packet, do inline processing else queue the 3637 * packet and do the drain. 3638 */ 3639 if (ringp->s_ring_first == NULL) { 3640 /* 3641 * Fast-path, ok to process and nothing queued. 3642 */ 3643 ringp->s_ring_run = curthread; 3644 ringp->s_ring_state |= (S_RING_PROC); 3645 3646 mutex_exit(&ringp->s_ring_lock); 3647 3648 /* 3649 * We have a chain of 1 packet, so 3650 * go through this fast path. 3651 */ 3652 ASSERT(mp_chain->b_next == NULL); 3653 3654 (*proc)(arg1, arg2, mp_chain, NULL); 3655 3656 ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3657 /* 3658 * If we have a soft ring set which is doing 3659 * bandwidth control, we need to decrement 3660 * srs_size and count so that the SRS can have an 3661 * accurate idea of the real data 3662 * queued between the SRS and its soft rings. We 3663 * decrement the counters only when the packet 3664 * gets processed by both SRS and the soft ring.
3665 */ 3666 mutex_enter(&mac_srs->srs_lock); 3667 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 3668 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 3669 mutex_exit(&mac_srs->srs_lock); 3670 3671 mutex_enter(&ringp->s_ring_lock); 3672 ringp->s_ring_run = NULL; 3673 ringp->s_ring_state &= ~S_RING_PROC; 3674 if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 3675 cv_signal(&ringp->s_ring_client_cv); 3676 3677 if ((ringp->s_ring_first == NULL) || 3678 (ringp->s_ring_state & S_RING_BLANK)) { 3679 /* 3680 * We processed inline our packet and 3681 * nothing new has arrived or our 3682 * receiver doesn't want to receive 3683 * any packets. We are done. 3684 */ 3685 mutex_exit(&ringp->s_ring_lock); 3686 return; 3687 } 3688 } else { 3689 SOFT_RING_ENQUEUE_CHAIN(ringp, 3690 mp_chain, tail, cnt, sz); 3691 } 3692 3693 /* 3694 * We are here because either we couldn't do inline 3695 * processing (because something was already 3696 * queued), or we had a chain of more than one 3697 * packet, or something else arrived after we were 3698 * done with inline processing. 3699 */ 3700 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3701 ASSERT(ringp->s_ring_first != NULL); 3702 3703 ringp->s_ring_drain_func(ringp); 3704 mutex_exit(&ringp->s_ring_lock); 3705 return; 3706 } else { 3707 /* ST_RING_WORKER_ONLY case */ 3708 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3709 mac_soft_ring_worker_wakeup(ringp); 3710 mutex_exit(&ringp->s_ring_lock); 3711 } 3712 } 3713 3714 /* 3715 * TX SOFTRING RELATED FUNCTIONS 3716 * 3717 * These functions really belong in mac_soft_ring.c and here for 3718 * a short period. 3719 */ 3720 3721 #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3722 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 3723 ringp->s_ring_state |= S_RING_ENQUEUED; \ 3724 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 3725 } 3726 3727 /* 3728 * mac_tx_sring_queued 3729 * 3730 * When we are out of transmit descriptors and we already have a 3731 * queue that exceeds hiwat (or the client called us with 3732 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 3733 * soft ring pointer as the opaque cookie for the client enable 3734 * flow control. 3735 */ 3736 static mac_tx_cookie_t 3737 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, 3738 mblk_t **ret_mp) 3739 { 3740 int cnt; 3741 size_t sz; 3742 mblk_t *tail; 3743 mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3744 mac_tx_cookie_t cookie = NULL; 3745 boolean_t wakeup_worker = B_TRUE; 3746 3747 ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3748 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 3749 if (flag & MAC_DROP_ON_NO_DESC) { 3750 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 3751 /* increment freed stats */ 3752 ringp->s_ring_drops += cnt; 3753 cookie = (mac_tx_cookie_t)ringp; 3754 } else { 3755 if (ringp->s_ring_first != NULL) 3756 wakeup_worker = B_FALSE; 3757 3758 if (flag & MAC_TX_NO_ENQUEUE) { 3759 /* 3760 * If QUEUED is not set, queue the packet 3761 * and let mac_tx_soft_ring_drain() set 3762 * the TX_BLOCKED bit for the reasons 3763 * explained above. Otherwise, return the 3764 * mblks. 3765 */ 3766 if (wakeup_worker) { 3767 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, 3768 mp_chain, tail, cnt, sz); 3769 } else { 3770 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT; 3771 cookie = (mac_tx_cookie_t)ringp; 3772 *ret_mp = mp_chain; 3773 } 3774 } else { 3775 boolean_t enqueue = B_TRUE; 3776 3777 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 3778 /* 3779 * flow-controlled. 

/*
 * TX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are here only for
 * a short period.
 */

#define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));			\
	ringp->s_ring_state |= S_RING_ENQUEUED;				\
	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz);		\
}

/*
 * mac_tx_sring_enqueue
 *
 * When we are out of transmit descriptors and we already have a
 * queue that exceeds hiwat (or the client called us with the
 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
 * soft ring pointer as the opaque cookie so that the client can
 * enable flow control.
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	mac_tx_cookie_t cookie = NULL;
	boolean_t wakeup_worker = B_TRUE;

	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
		/* increment freed stats */
		ringp->s_ring_drops += cnt;
		cookie = (mac_tx_cookie_t)ringp;
	} else {
		if (ringp->s_ring_first != NULL)
			wakeup_worker = B_FALSE;

		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If QUEUED is not set, queue the packet
			 * and let mac_tx_soft_ring_drain() set
			 * the TX_BLOCKED bit for the reasons
			 * explained above.  Otherwise, return the
			 * mblks.
			 */
			if (wakeup_worker) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
				    mp_chain, tail, cnt, sz);
			} else {
				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
				cookie = (mac_tx_cookie_t)ringp;
				*ret_mp = mp_chain;
			}
		} else {
			boolean_t enqueue = B_TRUE;

			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
				/*
				 * Flow-controlled.  Store ringp in the
				 * cookie so that it can be returned to
				 * the client as the mac_tx_cookie_t.
				 */
				ringp->s_ring_state |= S_RING_TX_HIWAT;
				cookie = (mac_tx_cookie_t)ringp;
				ringp->s_ring_hiwat_cnt++;
				if (ringp->s_ring_count >
				    ringp->s_ring_tx_max_q_cnt) {
					/* increment freed stats */
					ringp->s_ring_drops += cnt;
					/*
					 * b_prev may be set to the fanout
					 * hint, hence we can't use
					 * freemsg() directly.
					 */
					mac_pkt_drop(NULL, NULL,
					    mp_chain, B_FALSE);
					DTRACE_PROBE1(tx_queued_hiwat,
					    mac_soft_ring_t *, ringp);
					enqueue = B_FALSE;
				}
			}
			if (enqueue) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
				    tail, cnt, sz);
			}
		}
		if (wakeup_worker)
			cv_signal(&ringp->s_ring_async);
	}
	return (cookie);
}
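
/*
 * Illustrative sketch (not part of the original file): from a client's
 * point of view the cookie built above is the value that mac_tx() hands
 * back when the soft ring is flow controlled.  Assuming the usual mac_tx()
 * and mac_tx_is_flow_blocked() contract, a caller using MAC_TX_NO_ENQUEUE
 * would hold on to the returned chain until flow control lifts.  Here mch,
 * fanout_hint and unsent are hypothetical locals, and real consumers
 * resume from a Tx notification callback rather than polling as shown:
 *
 *	mblk_t *unsent = NULL;
 *	mac_tx_cookie_t cookie;
 *
 *	cookie = mac_tx(mch, mp_chain, fanout_hint,
 *	    MAC_TX_NO_ENQUEUE, &unsent);
 *	if (cookie != NULL) {
 *		while (mac_tx_is_flow_blocked(mch, cookie))
 *			delay(1);
 *		... resubmit the chain saved in "unsent" ...
 *	}
 */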

/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_tx_cookie_t cookie = NULL;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
	/*
	 * Only two modes can come here: either it can be
	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT.
	 */
	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
		/* Serialization mode */

		mutex_enter(&ringp->s_ring_lock);
		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
			    flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
			/*
			 * If the ring is blocked due to lack of Tx
			 * descriptors, just return.  The worker thread
			 * will get scheduled when Tx descriptors
			 * become available.
			 */
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return (cookie);
	} else {
		/* Default fanout mode */
		/*
		 * S_RING_BLOCKED is set when the underlying NIC runs
		 * out of Tx descriptors and messages start getting
		 * queued.  It won't get reset until
		 * tx_srs_drain() completely drains out the
		 * messages.
		 */
		boolean_t	is_subflow;
		mac_tx_stats_t	stats;

		if (ringp->s_ring_state & S_RING_ENQUEUED) {
			/* Tx descs/resources not available */
			mutex_enter(&ringp->s_ring_lock);
			if (ringp->s_ring_state & S_RING_ENQUEUED) {
				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
				    flag, ret_mp);
				mutex_exit(&ringp->s_ring_lock);
				return (cookie);
			}
			/*
			 * While we were computing the mblk count, the
			 * flow control condition got relieved.
			 * Continue with the transmission.
			 */
			mutex_exit(&ringp->s_ring_lock);
		}
		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
		    ringp->s_ring_tx_arg2, mp_chain,
		    (is_subflow ? &stats : NULL));

		/*
		 * Multiple threads could be here sending packets.
		 * Under such conditions, it is not possible to
		 * atomically set the S_RING_BLOCKED bit to indicate
		 * an out-of-Tx-descriptor condition.  To set it
		 * atomically, we queue the returned packet and do
		 * the setting of S_RING_BLOCKED in
		 * mac_tx_soft_ring_drain().
		 */
		if (mp_chain != NULL) {
			mutex_enter(&ringp->s_ring_lock);
			cookie =
			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		if (is_subflow) {
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
		}
		return (NULL);
	}
}
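
/*
 * Illustrative note (not part of the original file): the default fanout
 * path above uses an unlocked fast check of S_RING_ENQUEUED followed by a
 * locked recheck, so the common uncongested case never takes s_ring_lock
 * on the send path:
 *
 *	if (ringp->s_ring_state & S_RING_ENQUEUED) {
 *		mutex_enter(&ringp->s_ring_lock);
 *		if (ringp->s_ring_state & S_RING_ENQUEUED)
 *			... enqueue under the lock via
 *			    mac_tx_sring_enqueue() ...
 *		mutex_exit(&ringp->s_ring_lock);
 *	}
 *
 * Because, per the comments above, the queued state is not cleared until
 * the drain routine has emptied the ring, a thread that observes the flag
 * without the lock can safely take the slow path; at worst it enqueues
 * behind packets that are about to drain.
 */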