1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2026 Oxide Computer Company 26 */ 27 28 #ifndef _MAC_FLOW_IMPL_H 29 #define _MAC_FLOW_IMPL_H 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 #include <sys/param.h> 36 #include <sys/atomic.h> 37 #include <sys/ksynch.h> 38 #include <sys/mac_flow.h> 39 #include <sys/stream.h> 40 #include <sys/sdt.h> 41 #include <net/if.h> 42 43 /* 44 * Macros to increment/decrement the reference count on a flow_entry_t. 45 */ 46 #define FLOW_REFHOLD(flent) { \ 47 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 48 mutex_enter(&(flent)->fe_lock); \ 49 (flent)->fe_refcnt++; \ 50 mutex_exit(&(flent)->fe_lock); \ 51 } 52 53 /* 54 * Data paths must not attempt to use a flow entry if it is marked INCIPIENT 55 * or QUIESCE. In the former case the set up is not yet complete and the 56 * data path could stumble on inconsistent data structures. In the latter 57 * case a control operation is waiting for quiescence so that it can 58 * change callbacks or other structures without the use of locks. 
/*
 * Data paths must not attempt to use a flow entry if it is marked INCIPIENT
 * or QUIESCE. In the former case the set up is not yet complete and the
 * data path could stumble on inconsistent data structures. In the latter
 * case a control operation is waiting for quiescence so that it can
 * change callbacks or other structures without the use of locks.
 *
 * The hold is likewise refused when the entry is marked FE_CONDEMNED,
 * FE_UF_NO_DATAPATH or FE_MC_NO_DATAPATH. On refusal err is set to -1 and
 * fe_refcnt is left untouched; otherwise err is 0 and fe_refcnt is bumped.
 */
#define	FLOW_TRY_REFHOLD(flent, err) {				\
	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
	(err) = 0;						\
	mutex_enter(&(flent)->fe_lock);				\
	if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \
	    FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH))		\
		(err) = -1;					\
	else							\
		(flent)->fe_refcnt++;				\
	mutex_exit(&(flent)->fe_lock);				\
}

/*
 * Drop a reference taken with FLOW_REFHOLD/FLOW_TRY_REFHOLD.
 *
 * If FE_WAITER is set, the count must still be non-zero after the
 * decrement and fe_cv is signalled. Otherwise, when the count reaches
 * zero, mac_flow_destroy() is called; note that this macro does not drop
 * fe_lock itself on that path.
 */
#define	FLOW_REFRELE(flent) {					\
	DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent));	\
	mutex_enter(&(flent)->fe_lock);				\
	ASSERT((flent)->fe_refcnt != 0);			\
	(flent)->fe_refcnt--;					\
	if ((flent)->fe_flags & FE_WAITER) {			\
		ASSERT((flent)->fe_refcnt != 0);		\
		cv_signal(&(flent)->fe_cv);			\
		mutex_exit(&(flent)->fe_lock);			\
	} else if ((flent)->fe_refcnt == 0) {			\
		mac_flow_destroy(flent);			\
	} else {						\
		mutex_exit(&(flent)->fe_lock);			\
	}							\
}

/* Take a user reference (fe_user_refcnt) under fe_lock. */
#define	FLOW_USER_REFHOLD(flent) {				\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_user_refcnt++;				\
	mutex_exit(&(flent)->fe_lock);				\
}

/*
 * Drop a user reference. fe_cv is signalled only when the count reaches
 * zero while FE_WAITER is set.
 */
#define	FLOW_USER_REFRELE(flent) {				\
	mutex_enter(&(flent)->fe_lock);				\
	ASSERT((flent)->fe_user_refcnt != 0);			\
	if (--(flent)->fe_user_refcnt == 0 &&			\
	    ((flent)->fe_flags & FE_WAITER))			\
		cv_signal(&(flent)->fe_cv);			\
	mutex_exit(&(flent)->fe_lock);				\
}

/*
 * Release the last reference: asserts that exactly one ref and no user
 * refs remain, then goes through FLOW_REFRELE.
 */
#define	FLOW_FINAL_REFRELE(flent) {				\
	ASSERT((flent)->fe_refcnt == 1 &&			\
	    (flent)->fe_user_refcnt == 0);			\
	FLOW_REFRELE(flent);					\
}

/*
 * Mark or unmark the flent with a bit flag.
 *
 * The flag argument is parenthesized so that a compound expression such as
 * (FE_A | FE_B) is set or cleared as a whole; without the parentheses
 * "~flag" would only complement the first operand.
 */
#define	FLOW_MARK(flent, flag) {				\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_flags |= (flag);				\
	mutex_exit(&(flent)->fe_lock);				\
}

#define	FLOW_UNMARK(flent, flag) {				\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_flags &= ~(flag);				\
	mutex_exit(&(flent)->fe_lock);				\
}

/*
 * Obtain the mip for a flent: via mac_bcast_grp_mip() when fe_mbg is set,
 * otherwise via the mci_mip member of the mac_client_impl_t in fe_mcip.
 */
#define	FLENT_TO_MIP(flent)						\
	((flent)->fe_mbg != NULL ?					\
	mac_bcast_grp_mip((flent)->fe_mbg) :				\
	((mac_client_impl_t *)(flent)->fe_mcip)->mci_mip)

/* Convert a bandwidth expressed in bps to a number of bytes per tick. */
#define	FLOW_BYTES_PER_TICK(bps)	(((bps) >> 3) / hz)

/*
 * Given an underlying range and a priority level, obtain the minimum for the
 * new range.
 */
#define	FLOW_MIN_PRIORITY(min, max, pri)	\
	((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri)))

/*
 * Given an underlying range and a minimum level (base), obtain the maximum
 * for the new range.
 */
#define	FLOW_MAX_PRIORITY(min, max, base)	\
	((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS))

/*
 * Given an underlying range and a priority level, get the absolute
 * priority value. For now there are just 3 values, high, low and
 * medium so we can just return max, min or min + (max - min) / 2.
 * If there are more than three we need to change this computation.
 *
 * The whole expansion is parenthesized so the conditional expression
 * cannot be re-associated by operators surrounding the macro invocation.
 */
#define	FLOW_PRIORITY(min, max, pri)		\
	((pri) == MPL_HIGH ? (max) :		\
	(pri) == MPL_LOW ? (min) :		\
	((min) + (((max) - (min)) / 2)))

#define	MAC_FLOW_TAB_SIZE		500

typedef struct flow_entry_s		flow_entry_t;
typedef struct flow_tab_s		flow_tab_t;
typedef struct flow_state_s		flow_state_t;
struct mac_impl_s;
struct mac_client_impl_s;
struct mac_soft_ring_set_s;
struct mac_group_s;
struct mac_bcast_grp_s;

168 */ 169 #define FLOW_INBOUND 0x01 170 #define FLOW_OUTBOUND 0x02 171 /* Don't compare VID when classifying the packets, see mac_rx_classify() */ 172 #define FLOW_IGNORE_VLAN 0x04 173 174 /* Generic flow client function signature */ 175 typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t); 176 177 /* Flow state */ 178 typedef enum { 179 FLOW_DRIVER_UPCALL, 180 FLOW_USER_REF 181 } mac_flow_state_t; 182 183 /* Matches a flow_entry_t using the extracted flow_state_t info */ 184 typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *, 185 flow_state_t *); 186 187 typedef enum { 188 /* Quiesce the flow */ 189 FE_QUIESCE = 0x01, 190 /* Flow has a waiter */ 191 FE_WAITER = 0x02, 192 /* Flow is in the flow tab list */ 193 FE_FLOW_TAB = 0x04, 194 /* Flow is in the global flow hash */ 195 FE_G_FLOW_HASH = 0x08, 196 /* Being setup */ 197 FE_INCIPIENT = 0x10, 198 /* Being deleted */ 199 FE_CONDEMNED = 0x20, 200 /* No datapath setup for User flow */ 201 FE_UF_NO_DATAPATH = 0x40, 202 /* No datapath setup for mac client */ 203 FE_MC_NO_DATAPATH = 0x80, 204 } flow_entry_flags_t; 205 206 typedef enum { 207 /* NIC primary MAC address */ 208 FLOW_PRIMARY_MAC = 0x01, 209 /* VNIC flow */ 210 FLOW_VNIC_MAC = 0x02, 211 /* Multicast (and broadcast) */ 212 FLOW_MCAST = 0x04, 213 /* Other flows configured */ 214 FLOW_OTHER = 0x08, 215 /* User defined flow */ 216 FLOW_USER = 0x10, 217 /* Don't create stats for the flow */ 218 FLOW_NO_STATS = 0x20, 219 } flow_entry_type_t; 220 221 #define FLOW_VNIC FLOW_VNIC_MAC 222 223 /* 224 * Shared Bandwidth control counters between the soft ring set and its 225 * associated soft rings. In case the flow associated with NIC/VNIC 226 * has a group of Rx rings assigned to it, we have the same 227 * number of soft ring sets as we have the Rx ring in the group 228 * and each individual SRS (and its soft rings) decide when to 229 * poll their Rx ring independently. 
But if there is a B/W limit 230 * associated with the NIC/VNIC, then the B/W control counter is 231 * shared across all the SRS in the group and their associated 232 * soft rings. 233 * 234 * There is a many to 1 mapping between the SRS and 235 * mac_bw_ctl if the flow has a group of Rx rings associated with 236 * it. 237 */ 238 typedef struct mac_bw_ctl_s { 239 kmutex_t mac_bw_lock; 240 uint32_t mac_bw_state; 241 size_t mac_bw_sz; /* ?? Is it needed */ 242 size_t mac_bw_limit; /* Max bytes to process per tick */ 243 size_t mac_bw_used; /* Bytes processed in current tick */ 244 size_t mac_bw_drop_threshold; /* Max queue length */ 245 size_t mac_bw_drop_bytes; 246 size_t mac_bw_polled; 247 size_t mac_bw_intr; 248 clock_t mac_bw_curr_time; 249 } mac_bw_ctl_t; 250 251 struct flow_entry_s { /* Protected by */ 252 flow_entry_t *fe_next; /* ft_lock */ 253 254 datalink_id_t fe_link_id; /* WO */ 255 256 /* Properties as specified for this flow */ 257 mac_resource_props_t fe_resource_props; /* SL */ 258 259 /* Properties actually effective at run time for this flow */ 260 mac_resource_props_t fe_effective_props; /* SL */ 261 262 kmutex_t fe_lock; 263 char fe_flow_name[MAXFLOWNAMELEN]; /* fe_lock */ 264 flow_desc_t fe_flow_desc; /* fe_lock */ 265 kcondvar_t fe_cv; /* fe_lock */ 266 /* 267 * Initial flow ref is 1 on creation. A thread that lookups the 268 * flent typically by a mac_flow_lookup() dynamically holds a ref. 269 * If the ref is 1, it means there arent' any upcalls from the driver 270 * or downcalls from the stack using this flent. Structures pointing 271 * to the flent or flent inserted in lists don't count towards this 272 * refcnt. Instead they are tracked using fe_flags. Only a control 273 * thread doing a teardown operation deletes the flent, after waiting 274 * for upcalls to finish synchronously. 
The fe_refcnt tracks 275 * the number of upcall refs 276 */ 277 uint32_t fe_refcnt; /* fe_lock */ 278 279 /* 280 * This tracks lookups done using the global hash list for user 281 * generated flows. This refcnt only protects the flent itself 282 * from disappearing and helps walkers to read the flent info such 283 * as flow spec. However the flent may be quiesced and the SRS could 284 * be deleted. The fe_user_refcnt tracks the number of global flow 285 * has refs. 286 */ 287 uint32_t fe_user_refcnt; /* fe_lock */ 288 flow_entry_flags_t fe_flags; /* fe_lock */ 289 290 /* 291 * Function/args to invoke for delivering matching packets 292 * Only the function ff_fn may be changed dynamically and atomically. 293 * The ff_arg1 and ff_arg2 are set at creation time and may not 294 * be changed. 295 */ 296 flow_fn_t fe_cb_fn; /* fe_lock */ 297 void *fe_cb_arg1; /* fe_lock */ 298 void *fe_cb_arg2; /* fe_lock */ 299 300 void *fe_client_cookie; /* WO */ 301 struct mac_group_s *fe_rx_ring_group; /* SL */ 302 303 /* fe_lock */ 304 struct mac_soft_ring_set_s *fe_rx_srs[MAX_RINGS_PER_GROUP]; 305 uint32_t fe_rx_srs_cnt; /* fe_lock */ 306 struct mac_group_s *fe_tx_ring_group; 307 struct mac_soft_ring_set_s *fe_tx_srs; /* WO */ 308 309 /* 310 * This is a unicast flow, and is a mac_client_impl_t 311 */ 312 struct mac_client_impl_s *fe_mcip; /* WO */ 313 314 /* 315 * Used by mci_flent_list of mac_client_impl_t to track flows sharing 316 * the same mac_client_impl_t. 317 */ 318 flow_entry_t *fe_client_next; 319 320 /* 321 * This is a broadcast or multicast flow and is a mac_bcast_grp_t 322 */ 323 struct mac_bcast_grp_s *fe_mbg; /* WO */ 324 flow_entry_type_t fe_type; /* WO */ 325 326 /* 327 * BW control info. 328 */ 329 mac_bw_ctl_t fe_tx_bw; 330 mac_bw_ctl_t fe_rx_bw; 331 332 /* 333 * Used by flow table lookup code 334 */ 335 flow_match_fn_t fe_match; 336 337 /* 338 * Used by mac_flow_remove(). 
339 */ 340 int fe_index; 341 flow_tab_t *fe_flow_tab; 342 343 kstat_t *fe_ksp; 344 kstat_t *fe_misc_stat_ksp; 345 346 boolean_t fe_desc_logged; 347 uint64_t fe_nic_speed; 348 }; 349 350 /* 351 * Various structures used by the flows framework for keeping track 352 * of packet state information. 353 */ 354 355 /* Layer 2 */ 356 typedef struct flow_l2info_s { 357 uchar_t *l2_start; 358 uint8_t *l2_daddr; 359 uint16_t l2_vid; 360 uint32_t l2_sap; 361 uint_t l2_hdrsize; 362 } flow_l2info_t; 363 364 /* Layer 3 */ 365 typedef struct flow_l3info_s { 366 uchar_t *l3_start; 367 uint8_t l3_protocol; 368 uint8_t l3_version; 369 boolean_t l3_dst_or_src; 370 uint_t l3_hdrsize; 371 boolean_t l3_fragmented; 372 } flow_l3info_t; 373 374 /* Layer 4 */ 375 typedef struct flow_l4info_s { 376 uchar_t *l4_start; 377 uint16_t l4_src_port; 378 uint16_t l4_dst_port; 379 uint16_t l4_hash_port; 380 } flow_l4info_t; 381 382 /* 383 * Combined state structure. 384 * Holds flow direction and an mblk_t pointer. 385 */ 386 struct flow_state_s { 387 uint_t fs_flags; 388 mblk_t *fs_mp; 389 flow_l2info_t fs_l2info; 390 flow_l3info_t fs_l3info; 391 flow_l4info_t fs_l4info; 392 }; 393 394 /* 395 * Flow ops vector. 396 * There are two groups of functions. The ones ending with _fe are 397 * called when a flow is being added. The others (hash, accept) are 398 * called at flow lookup time. 399 */ 400 #define FLOW_MAX_ACCEPT 16 401 typedef struct flow_ops_s { 402 /* 403 * fo_accept_fe(): 404 * Validates the contents of the flow and checks whether 405 * it's compatible with the flow table. sets the fe_match 406 * function of the flow. 407 */ 408 int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *); 409 /* 410 * fo_hash_fe(): 411 * Generates a hash index to the flow table. This function 412 * must use the same algorithm as fo_hash(), which is used 413 * by the flow lookup code path. 
414 */ 415 uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *); 416 /* 417 * fo_match_fe(): 418 * This is used for finding identical flows. 419 */ 420 boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *, 421 flow_entry_t *); 422 /* 423 * fo_insert_fe(): 424 * Used for inserting a flow to a flow chain. 425 * Protocols that have special ordering requirements would 426 * need to implement this. For those that don't, 427 * flow_generic_insert_fe() may be used. 428 */ 429 int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **, 430 flow_entry_t *); 431 432 /* 433 * Calculates the flow hash index based on the accumulated 434 * state in flow_state_t. Must use the same algorithm as 435 * fo_hash_fe(). 436 */ 437 uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *); 438 439 /* 440 * Array of accept fuctions. 441 * Each function in the array will accumulate enough state 442 * (header length, protocol) to allow the next function to 443 * proceed. We support up to FLOW_MAX_ACCEPT functions which 444 * should be sufficient for all practical purposes. 445 */ 446 int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *, 447 flow_state_t *); 448 } flow_ops_t; 449 450 /* 451 * Generic flow table. 452 */ 453 struct flow_tab_s { 454 krwlock_t ft_lock; 455 /* 456 * Contains a list of functions (described above) 457 * specific to this table type. 458 */ 459 flow_ops_t ft_ops; 460 461 /* 462 * Indicates what types of flows are supported. 463 */ 464 flow_mask_t ft_mask; 465 466 /* 467 * An array of flow_entry_t * of size ft_size. 468 * Each element is the beginning of a hash chain. 469 */ 470 flow_entry_t **ft_table; 471 uint_t ft_size; 472 473 /* 474 * The number of flows inserted into ft_table. 475 */ 476 uint_t ft_flow_count; 477 struct mac_impl_s *ft_mip; 478 struct mac_client_impl_s *ft_mcip; 479 }; 480 481 /* 482 * This is used for describing what type of flow table can be created. 483 * mac_flow.c contains a list of these structures. 
484 */ 485 typedef struct flow_tab_info_s { 486 flow_ops_t *fti_ops; 487 flow_mask_t fti_mask; 488 uint_t fti_size; 489 } flow_tab_info_t; 490 491 #define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0) 492 493 494 #define MCIP_STAT_UPDATE(m, s, c) { \ 495 ((mac_client_impl_t *)(m))->mci_misc_stat.mms_##s \ 496 += ((uint64_t)(c)); \ 497 } 498 499 #define SRS_RX_STAT_UPDATE(m, s, c) { \ 500 ((mac_soft_ring_set_t *)(m))->srs_rx.sr_stat.mrs_##s \ 501 += ((uint64_t)(c)); \ 502 } 503 504 #define SRS_TX_STAT_UPDATE(m, s, c) { \ 505 ((mac_soft_ring_set_t *)(m))->srs_tx.st_stat.mts_##s \ 506 += ((uint64_t)(c)); \ 507 } 508 509 #define SRS_TX_STATS_UPDATE(m, s) { \ 510 SRS_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets); \ 511 SRS_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes); \ 512 SRS_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors); \ 513 } 514 515 #define SOFTRING_TX_STAT_UPDATE(m, s, c) { \ 516 ((mac_soft_ring_t *)(m))->s_st_stat.mts_##s += ((uint64_t)(c)); \ 517 } 518 519 #define SOFTRING_TX_STATS_UPDATE(m, s) { \ 520 SOFTRING_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets); \ 521 SOFTRING_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes); \ 522 SOFTRING_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors); \ 523 } 524 525 extern void mac_flow_init(); 526 extern void mac_flow_fini(); 527 extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *, 528 char *, void *, uint_t, flow_entry_t **); 529 530 extern int mac_flow_add(flow_tab_t *, flow_entry_t *); 531 extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *, 532 boolean_t); 533 extern int mac_flow_hash_add(flow_entry_t *); 534 extern int mac_flow_lookup_byname(char *, flow_entry_t **); 535 extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t, 536 flow_entry_t **); 537 538 extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *), 539 void *); 540 541 extern int mac_flow_walk_nolock(flow_tab_t *, 542 int (*)(flow_entry_t *, void *), void *); 543 544 extern void 
mac_flow_modify(flow_tab_t *, flow_entry_t *, 545 mac_resource_props_t *); 546 547 extern void *mac_flow_get_client_cookie(flow_entry_t *); 548 549 extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *); 550 551 extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *); 552 extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *); 553 554 extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t); 555 extern void mac_flow_hash_remove(flow_entry_t *); 556 extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t); 557 extern void mac_flow_cleanup(flow_entry_t *); 558 extern void mac_flow_destroy(flow_entry_t *); 559 560 extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t, 561 struct mac_impl_s *, flow_tab_t **); 562 extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **); 563 extern void mac_flow_tab_destroy(flow_tab_t *); 564 extern void flow_stat_destroy(flow_entry_t *); 565 566 #ifdef __cplusplus 567 } 568 #endif 569 570 #endif /* _MAC_FLOW_IMPL_H */ 571