1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2026 Oxide Computer Company 26 */ 27 28 #ifndef _MAC_FLOW_IMPL_H 29 #define _MAC_FLOW_IMPL_H 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 #include <sys/param.h> 36 #include <sys/atomic.h> 37 #include <sys/time.h> 38 #include <sys/ksynch.h> 39 #include <sys/mac_flow.h> 40 #include <sys/stream.h> 41 #include <sys/sdt.h> 42 #include <net/if.h> 43 44 /* 45 * Macros to increment/decrement the reference count on a flow_entry_t. 46 */ 47 #define FLOW_REFHOLD(flent) { \ 48 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 49 mutex_enter(&(flent)->fe_lock); \ 50 (flent)->fe_refcnt++; \ 51 mutex_exit(&(flent)->fe_lock); \ 52 } 53 54 /* 55 * Data paths must not attempt to use a flow entry if it is marked INCIPIENT 56 * or QUIESCE. In the former case the set up is not yet complete and the 57 * data path could stumble on inconsistent data structures. In the latter 58 * case a control operation is waiting for quiescence so that it can 59 * change callbacks or other structures without the use of locks. 60 */ 61 #define FLOW_TRY_REFHOLD(flent, err) { \ 62 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 63 (err) = 0; \ 64 mutex_enter(&(flent)->fe_lock); \ 65 if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \ 66 FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH)) \ 67 (err) = -1; \ 68 else \ 69 (flent)->fe_refcnt++; \ 70 mutex_exit(&(flent)->fe_lock); \ 71 } 72 73 #define FLOW_REFRELE(flent) { \ 74 DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent)); \ 75 mutex_enter(&(flent)->fe_lock); \ 76 ASSERT((flent)->fe_refcnt != 0); \ 77 (flent)->fe_refcnt--; \ 78 if ((flent)->fe_flags & FE_WAITER) { \ 79 ASSERT((flent)->fe_refcnt != 0); \ 80 cv_signal(&(flent)->fe_cv); \ 81 mutex_exit(&(flent)->fe_lock); \ 82 } else if ((flent)->fe_refcnt == 0) { \ 83 mac_flow_destroy(flent); \ 84 } else { \ 85 mutex_exit(&(flent)->fe_lock); \ 86 } \ 87 } 88 89 #define FLOW_USER_REFHOLD(flent) { \ 90 mutex_enter(&(flent)->fe_lock); \ 91 (flent)->fe_user_refcnt++; \ 92 mutex_exit(&(flent)->fe_lock); \ 93 } 94 95 #define FLOW_USER_REFRELE(flent) { \ 96 mutex_enter(&(flent)->fe_lock); \ 97 ASSERT((flent)->fe_user_refcnt != 0); \ 98 if (--(flent)->fe_user_refcnt == 0 && \ 99 ((flent)->fe_flags & FE_WAITER)) \ 100 cv_signal(&(flent)->fe_cv); \ 101 mutex_exit(&(flent)->fe_lock); \ 102 } 103 104 #define FLOW_FINAL_REFRELE(flent) { \ 105 ASSERT(flent->fe_refcnt == 1 && flent->fe_user_refcnt == 0); \ 106 FLOW_REFRELE(flent); \ 107 } 108 109 /* 110 * Mark or unmark the flent with a bit flag 111 */ 112 #define FLOW_MARK(flent, flag) { \ 113 mutex_enter(&(flent)->fe_lock); \ 114 (flent)->fe_flags |= flag; \ 115 mutex_exit(&(flent)->fe_lock); \ 116 } 117 118 #define FLOW_UNMARK(flent, flag) { \ 119 mutex_enter(&(flent)->fe_lock); \ 120 (flent)->fe_flags &= ~flag; \ 121 mutex_exit(&(flent)->fe_lock); \ 122 } 123 124 #define FLENT_TO_MIP(flent) \ 125 (flent->fe_mbg != NULL ? mac_bcast_grp_mip(flent->fe_mbg) : \ 126 ((mac_client_impl_t *)flent->fe_mcip)->mci_mip) 127 128 /* Convert a bandwidth expressed in bps to a number of bytes per tick. */ 129 #define FLOW_BYTES_PER_TICK(bps) (((bps) >> 3) / hz) 130 131 /* 132 * Given an underlying range and a priority level, obtain the minimum for the 133 * new range. 134 */ 135 #define FLOW_MIN_PRIORITY(min, max, pri) \ 136 ((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri))) 137 138 /* 139 * Given an underlying range and a minimum level (base), obtain the maximum 140 * for the new range. 141 */ 142 #define FLOW_MAX_PRIORITY(min, max, base) \ 143 ((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS)) 144 145 /* 146 * Given an underlying range and a priority level, get the absolute 147 * priority value. For now there are just 3 values, high, low and 148 * medium so we can just return max, min or min + (max - min) / 2. 149 * If there are more than three we need to change this computation. 150 */ 151 #define FLOW_PRIORITY(min, max, pri) \ 152 (pri) == MPL_HIGH ? (max) : \ 153 (pri) == MPL_LOW ? (min) : \ 154 ((min) + (((max) - (min)) / 2)) 155 156 #define MAC_FLOW_TAB_SIZE 500 157 158 typedef struct flow_entry_s flow_entry_t; 159 typedef struct flow_tab_s flow_tab_t; 160 typedef struct flow_state_s flow_state_t; 161 struct mac_impl_s; 162 struct mac_client_impl_s; 163 struct mac_soft_ring_set_s; 164 struct mac_group_s; 165 struct mac_bcast_grp_s; 166 167 /* 168 * Classification flags used to lookup the flow. 169 */ 170 #define FLOW_INBOUND 0x01 171 #define FLOW_OUTBOUND 0x02 172 /* Don't compare VID when classifying the packets, see mac_rx_classify() */ 173 #define FLOW_IGNORE_VLAN 0x04 174 175 /* Generic flow client function signature */ 176 typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t); 177 178 /* Flow state */ 179 typedef enum { 180 FLOW_DRIVER_UPCALL, 181 FLOW_USER_REF 182 } mac_flow_state_t; 183 184 /* Matches a flow_entry_t using the extracted flow_state_t info */ 185 typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *, 186 flow_state_t *); 187 188 typedef enum { 189 /* Quiesce the flow */ 190 FE_QUIESCE = 0x01, 191 /* Flow has a waiter */ 192 FE_WAITER = 0x02, 193 /* Flow is in the flow tab list */ 194 FE_FLOW_TAB = 0x04, 195 /* Flow is in the global flow hash */ 196 FE_G_FLOW_HASH = 0x08, 197 /* Being setup */ 198 FE_INCIPIENT = 0x10, 199 /* Being deleted */ 200 FE_CONDEMNED = 0x20, 201 /* No datapath setup for User flow */ 202 FE_UF_NO_DATAPATH = 0x40, 203 /* No datapath setup for mac client */ 204 FE_MC_NO_DATAPATH = 0x80, 205 } flow_entry_flags_t; 206 207 typedef enum { 208 /* NIC primary MAC address */ 209 FLOW_PRIMARY_MAC = 0x01, 210 /* VNIC flow */ 211 FLOW_VNIC_MAC = 0x02, 212 /* Multicast (and broadcast) */ 213 FLOW_MCAST = 0x04, 214 /* Other flows configured */ 215 FLOW_OTHER = 0x08, 216 /* User defined flow */ 217 FLOW_USER = 0x10, 218 /* Don't create stats for the flow */ 219 FLOW_NO_STATS = 0x20, 220 } flow_entry_type_t; 221 222 #define FLOW_VNIC FLOW_VNIC_MAC 223 224 /* 225 * Bitflags denoting the state of an individual bandwidth control. 226 */ 227 typedef enum { 228 BW_ENABLED = 1 << 0, 229 BW_ENFORCED = 1 << 1, 230 } mac_bw_state_t; 231 232 /* 233 * Shared Bandwidth control counters between the soft ring set and its 234 * associated soft rings. In case the flow associated with NIC/VNIC 235 * has a group of Rx rings assigned to it, we have the same 236 * number of soft ring sets as we have the Rx ring in the group 237 * and each individual SRS (and its soft rings) decide when to 238 * poll their Rx ring independently. But if there is a B/W limit 239 * associated with the NIC/VNIC, then the B/W control counter is 240 * shared across all the SRS in the group and their associated 241 * soft rings. 242 * 243 * Bandwidth controls cause all affected SRSes (packet queues) to obey a shared 244 * policing/shaping criteria: 245 * 246 * - Total queue occupancy beyond `mac_bw_drop_threshold` will lead to packet 247 * drops. (Policing) 248 * 249 * - All queues can, amongst themselves, admit at most `mac_bw_limit` bytes 250 * to their softrings per system tick. (Shaping) 251 * 252 * The policing threshold is set today at 2 * `mac_bw_limit`. 253 * 254 * There is generally a many-to-1 mapping between SRSes and mac_bw_ctl. The Rx 255 * path's software classifier and SRSes for hardware rings will necessarily 256 * share a control, as will any Rx SRSes for subflows. In the Tx path, a 257 * bandwidth limit is used by just one SRS but may be referenced by its worker 258 * or `mac_tx`. 259 */ 260 typedef struct mac_bw_ctl_s { 261 kmutex_t mac_bw_lock; 262 mac_bw_state_t mac_bw_state; 263 size_t mac_bw_sz; /* Bytes enqueued in controlled SRSes */ 264 size_t mac_bw_limit; /* Max bytes to process per tick */ 265 size_t mac_bw_used; /* Bytes processed in current tick */ 266 size_t mac_bw_drop_threshold; /* Max queue length */ 267 hrtime_t mac_bw_curr_time; 268 269 /* stats */ 270 uint64_t mac_bw_drop_bytes; 271 uint64_t mac_bw_polled; 272 uint64_t mac_bw_intr; 273 } mac_bw_ctl_t; 274 275 struct flow_entry_s { /* Protected by */ 276 flow_entry_t *fe_next; /* ft_lock */ 277 278 datalink_id_t fe_link_id; /* WO */ 279 280 /* Properties as specified for this flow */ 281 mac_resource_props_t fe_resource_props; /* SL */ 282 283 /* Properties actually effective at run time for this flow */ 284 mac_resource_props_t fe_effective_props; /* SL */ 285 286 kmutex_t fe_lock; 287 char fe_flow_name[MAXFLOWNAMELEN]; /* fe_lock */ 288 flow_desc_t fe_flow_desc; /* fe_lock */ 289 kcondvar_t fe_cv; /* fe_lock */ 290 /* 291 * Initial flow ref is 1 on creation. A thread that lookups the 292 * flent typically by a mac_flow_lookup() dynamically holds a ref. 293 * If the ref is 1, it means there arent' any upcalls from the driver 294 * or downcalls from the stack using this flent. Structures pointing 295 * to the flent or flent inserted in lists don't count towards this 296 * refcnt. Instead they are tracked using fe_flags. Only a control 297 * thread doing a teardown operation deletes the flent, after waiting 298 * for upcalls to finish synchronously. The fe_refcnt tracks 299 * the number of upcall refs 300 */ 301 uint32_t fe_refcnt; /* fe_lock */ 302 303 /* 304 * This tracks lookups done using the global hash list for user 305 * generated flows. This refcnt only protects the flent itself 306 * from disappearing and helps walkers to read the flent info such 307 * as flow spec. However the flent may be quiesced and the SRS could 308 * be deleted. The fe_user_refcnt tracks the number of global flow 309 * has refs. 310 */ 311 uint32_t fe_user_refcnt; /* fe_lock */ 312 flow_entry_flags_t fe_flags; /* fe_lock */ 313 314 /* 315 * Function/args to invoke for delivering matching packets 316 * Only the function ff_fn may be changed dynamically and atomically. 317 * The ff_arg1 and ff_arg2 are set at creation time and may not 318 * be changed. 319 */ 320 flow_fn_t fe_cb_fn; /* fe_lock */ 321 void *fe_cb_arg1; /* fe_lock */ 322 void *fe_cb_arg2; /* fe_lock */ 323 324 void *fe_client_cookie; /* WO */ 325 struct mac_group_s *fe_rx_ring_group; /* SL */ 326 327 /* fe_lock */ 328 struct mac_soft_ring_set_s *fe_rx_srs[MAX_RINGS_PER_GROUP]; 329 uint32_t fe_rx_srs_cnt; /* fe_lock */ 330 struct mac_group_s *fe_tx_ring_group; 331 struct mac_soft_ring_set_s *fe_tx_srs; /* WO */ 332 333 /* 334 * This is a unicast flow, and is a mac_client_impl_t 335 */ 336 struct mac_client_impl_s *fe_mcip; /* WO */ 337 338 /* 339 * Used by mci_flent_list of mac_client_impl_t to track flows sharing 340 * the same mac_client_impl_t. 341 */ 342 flow_entry_t *fe_client_next; 343 344 /* 345 * This is a broadcast or multicast flow and is a mac_bcast_grp_t 346 */ 347 struct mac_bcast_grp_s *fe_mbg; /* WO */ 348 flow_entry_type_t fe_type; /* WO */ 349 350 /* 351 * BW control info. 352 */ 353 mac_bw_ctl_t fe_tx_bw; 354 mac_bw_ctl_t fe_rx_bw; 355 356 /* 357 * Used by flow table lookup code 358 */ 359 flow_match_fn_t fe_match; 360 361 /* 362 * Used by mac_flow_remove(). 363 */ 364 int fe_index; 365 flow_tab_t *fe_flow_tab; 366 367 kstat_t *fe_ksp; 368 kstat_t *fe_misc_stat_ksp; 369 370 boolean_t fe_desc_logged; 371 uint64_t fe_nic_speed; 372 }; 373 374 /* 375 * Various structures used by the flows framework for keeping track 376 * of packet state information. 377 */ 378 379 /* Layer 2 */ 380 typedef struct flow_l2info_s { 381 uchar_t *l2_start; 382 uint8_t *l2_daddr; 383 uint16_t l2_vid; 384 uint32_t l2_sap; 385 uint_t l2_hdrsize; 386 } flow_l2info_t; 387 388 /* Layer 3 */ 389 typedef struct flow_l3info_s { 390 uchar_t *l3_start; 391 uint8_t l3_protocol; 392 uint8_t l3_version; 393 boolean_t l3_dst_or_src; 394 uint_t l3_hdrsize; 395 boolean_t l3_fragmented; 396 } flow_l3info_t; 397 398 /* Layer 4 */ 399 typedef struct flow_l4info_s { 400 uchar_t *l4_start; 401 uint16_t l4_src_port; 402 uint16_t l4_dst_port; 403 uint16_t l4_hash_port; 404 } flow_l4info_t; 405 406 /* 407 * Combined state structure. 408 * Holds flow direction and an mblk_t pointer. 409 */ 410 struct flow_state_s { 411 uint_t fs_flags; 412 mblk_t *fs_mp; 413 flow_l2info_t fs_l2info; 414 flow_l3info_t fs_l3info; 415 flow_l4info_t fs_l4info; 416 }; 417 418 /* 419 * Flow ops vector. 420 * There are two groups of functions. The ones ending with _fe are 421 * called when a flow is being added. The others (hash, accept) are 422 * called at flow lookup time. 423 */ 424 #define FLOW_MAX_ACCEPT 16 425 typedef struct flow_ops_s { 426 /* 427 * fo_accept_fe(): 428 * Validates the contents of the flow and checks whether 429 * it's compatible with the flow table. sets the fe_match 430 * function of the flow. 431 */ 432 int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *); 433 /* 434 * fo_hash_fe(): 435 * Generates a hash index to the flow table. This function 436 * must use the same algorithm as fo_hash(), which is used 437 * by the flow lookup code path. 438 */ 439 uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *); 440 /* 441 * fo_match_fe(): 442 * This is used for finding identical flows. 443 */ 444 boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *, 445 flow_entry_t *); 446 /* 447 * fo_insert_fe(): 448 * Used for inserting a flow to a flow chain. 449 * Protocols that have special ordering requirements would 450 * need to implement this. For those that don't, 451 * flow_generic_insert_fe() may be used. 452 */ 453 int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **, 454 flow_entry_t *); 455 456 /* 457 * Calculates the flow hash index based on the accumulated 458 * state in flow_state_t. Must use the same algorithm as 459 * fo_hash_fe(). 460 */ 461 uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *); 462 463 /* 464 * Array of accept fuctions. 465 * Each function in the array will accumulate enough state 466 * (header length, protocol) to allow the next function to 467 * proceed. We support up to FLOW_MAX_ACCEPT functions which 468 * should be sufficient for all practical purposes. 469 */ 470 int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *, 471 flow_state_t *); 472 } flow_ops_t; 473 474 /* 475 * Generic flow table. 476 */ 477 struct flow_tab_s { 478 krwlock_t ft_lock; 479 /* 480 * Contains a list of functions (described above) 481 * specific to this table type. 482 */ 483 flow_ops_t ft_ops; 484 485 /* 486 * Indicates what types of flows are supported. 487 */ 488 flow_mask_t ft_mask; 489 490 /* 491 * An array of flow_entry_t * of size ft_size. 492 * Each element is the beginning of a hash chain. 493 */ 494 flow_entry_t **ft_table; 495 uint_t ft_size; 496 497 /* 498 * The number of flows inserted into ft_table. 499 */ 500 uint_t ft_flow_count; 501 struct mac_impl_s *ft_mip; 502 struct mac_client_impl_s *ft_mcip; 503 }; 504 505 /* 506 * This is used for describing what type of flow table can be created. 507 * mac_flow.c contains a list of these structures. 508 */ 509 typedef struct flow_tab_info_s { 510 flow_ops_t *fti_ops; 511 flow_mask_t fti_mask; 512 uint_t fti_size; 513 } flow_tab_info_t; 514 515 #define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0) 516 517 518 #define MCIP_STAT_UPDATE(m, s, c) { \ 519 ((mac_client_impl_t *)(m))->mci_misc_stat.mms_##s \ 520 += ((uint64_t)(c)); \ 521 } 522 523 #define SRS_RX_STAT_UPDATE(m, s, c) { \ 524 ((mac_soft_ring_set_t *)(m))->srs_rx.sr_stat.mrs_##s \ 525 += ((uint64_t)(c)); \ 526 } 527 528 #define SRS_TX_STAT_UPDATE(m, s, c) { \ 529 ((mac_soft_ring_set_t *)(m))->srs_tx.st_stat.mts_##s \ 530 += ((uint64_t)(c)); \ 531 } 532 533 #define SRS_TX_STATS_UPDATE(m, s) { \ 534 SRS_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets); \ 535 SRS_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes); \ 536 SRS_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors); \ 537 } 538 539 #define SOFTRING_TX_STAT_UPDATE(m, s, c) { \ 540 ((mac_soft_ring_t *)(m))->s_st_stat.mts_##s += ((uint64_t)(c)); \ 541 } 542 543 #define SOFTRING_TX_STATS_UPDATE(m, s) { \ 544 SOFTRING_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets); \ 545 SOFTRING_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes); \ 546 SOFTRING_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors); \ 547 } 548 549 extern void mac_flow_init(); 550 extern void mac_flow_fini(); 551 extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *, 552 char *, void *, uint_t, flow_entry_t **); 553 554 extern int mac_flow_add(flow_tab_t *, flow_entry_t *); 555 extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *, 556 boolean_t); 557 extern int mac_flow_hash_add(flow_entry_t *); 558 extern int mac_flow_lookup_byname(char *, flow_entry_t **); 559 extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t, 560 flow_entry_t **); 561 562 extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *), 563 void *); 564 565 extern int mac_flow_walk_nolock(flow_tab_t *, 566 int (*)(flow_entry_t *, void *), void *); 567 568 extern void mac_flow_modify(flow_tab_t *, flow_entry_t *, 569 mac_resource_props_t *); 570 571 extern void *mac_flow_get_client_cookie(flow_entry_t *); 572 573 extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *); 574 575 extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *); 576 extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *); 577 578 extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t); 579 extern void mac_flow_hash_remove(flow_entry_t *); 580 extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t); 581 extern void mac_flow_cleanup(flow_entry_t *); 582 extern void mac_flow_destroy(flow_entry_t *); 583 584 extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t, 585 struct mac_impl_s *, flow_tab_t **); 586 extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **); 587 extern void mac_flow_tab_destroy(flow_tab_t *); 588 extern void flow_stat_destroy(flow_entry_t *); 589 590 extern boolean_t mac_bw_ctl_is_enabled(const mac_bw_ctl_t *); 591 extern boolean_t mac_bw_ctl_is_enforced(const mac_bw_ctl_t *); 592 593 #ifdef __cplusplus 594 } 595 #endif 596 597 #endif /* _MAC_FLOW_IMPL_H */ 598