1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #ifndef _MAC_FLOW_IMPL_H 28 #define _MAC_FLOW_IMPL_H 29 30 #ifdef __cplusplus 31 extern "C" { 32 #endif 33 34 #include <sys/param.h> 35 #include <sys/atomic.h> 36 #include <sys/ksynch.h> 37 #include <sys/mac_flow.h> 38 #include <sys/stream.h> 39 #include <sys/sdt.h> 40 #include <net/if.h> 41 42 /* 43 * Macros to increment/decrement the reference count on a flow_entry_t. 44 */ 45 #define FLOW_REFHOLD(flent) { \ 46 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 47 mutex_enter(&(flent)->fe_lock); \ 48 (flent)->fe_refcnt++; \ 49 mutex_exit(&(flent)->fe_lock); \ 50 } 51 52 /* 53 * Data paths must not attempt to use a flow entry if it is marked INCIPIENT 54 * or QUIESCE. In the former case the set up is not yet complete and the 55 * data path could stumble on inconsistent data structures. In the latter 56 * case a control operation is waiting for quiescence so that it can 57 * change callbacks or other structures without the use of locks. 58 */ 59 #define FLOW_TRY_REFHOLD(flent, err) { \ 60 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 61 (err) = 0; \ 62 mutex_enter(&(flent)->fe_lock); \ 63 if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \ 64 FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH)) \ 65 (err) = -1; \ 66 else \ 67 (flent)->fe_refcnt++; \ 68 mutex_exit(&(flent)->fe_lock); \ 69 } 70 71 #define FLOW_REFRELE(flent) { \ 72 DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent)); \ 73 mutex_enter(&(flent)->fe_lock); \ 74 ASSERT((flent)->fe_refcnt != 0); \ 75 (flent)->fe_refcnt--; \ 76 if ((flent)->fe_flags & FE_WAITER) { \ 77 ASSERT((flent)->fe_refcnt != 0); \ 78 cv_signal(&(flent)->fe_cv); \ 79 mutex_exit(&(flent)->fe_lock); \ 80 } else if ((flent)->fe_refcnt == 0) { \ 81 mac_flow_destroy(flent); \ 82 } else { \ 83 mutex_exit(&(flent)->fe_lock); \ 84 } \ 85 } 86 87 #define FLOW_USER_REFHOLD(flent) { \ 88 mutex_enter(&(flent)->fe_lock); \ 89 (flent)->fe_user_refcnt++; \ 90 mutex_exit(&(flent)->fe_lock); \ 91 } 92 93 #define FLOW_USER_REFRELE(flent) { \ 94 mutex_enter(&(flent)->fe_lock); \ 95 ASSERT((flent)->fe_user_refcnt != 0); \ 96 if (--(flent)->fe_user_refcnt == 0 && \ 97 ((flent)->fe_flags & FE_WAITER)) \ 98 cv_signal(&(flent)->fe_cv); \ 99 mutex_exit(&(flent)->fe_lock); \ 100 } 101 102 #define FLOW_FINAL_REFRELE(flent) { \ 103 ASSERT(flent->fe_refcnt == 1 && flent->fe_user_refcnt == 0); \ 104 FLOW_REFRELE(flent); \ 105 } 106 107 /* 108 * Mark or unmark the flent with a bit flag 109 */ 110 #define FLOW_MARK(flent, flag) { \ 111 mutex_enter(&(flent)->fe_lock); \ 112 (flent)->fe_flags |= flag; \ 113 mutex_exit(&(flent)->fe_lock); \ 114 } 115 116 #define FLOW_UNMARK(flent, flag) { \ 117 mutex_enter(&(flent)->fe_lock); \ 118 (flent)->fe_flags &= ~flag; \ 119 mutex_exit(&(flent)->fe_lock); \ 120 } 121 122 #define FLENT_TO_MIP(flent) \ 123 (flent->fe_mbg != NULL ? mac_bcast_grp_mip(flent->fe_mbg) : \ 124 ((mac_client_impl_t *)flent->fe_mcip)->mci_mip) 125 126 /* Convert a bandwidth expressed in bps to a number of bytes per tick. */ 127 #define FLOW_BYTES_PER_TICK(bps) (((bps) >> 3) / hz) 128 129 /* 130 * Given an underlying range and a priority level, obtain the minimum for the 131 * new range. 132 */ 133 #define FLOW_MIN_PRIORITY(min, max, pri) \ 134 ((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri))) 135 136 /* 137 * Given an underlying range and a minimum level (base), obtain the maximum 138 * for the new range. 139 */ 140 #define FLOW_MAX_PRIORITY(min, max, base) \ 141 ((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS)) 142 143 /* 144 * Given an underlying range and a priority level, get the absolute 145 * priority value. For now there are just 3 values, high, low and 146 * medium so we can just return max, min or min + (max - min) / 2. 147 * If there are more than three we need to change this computation. 148 */ 149 #define FLOW_PRIORITY(min, max, pri) \ 150 (pri) == MPL_HIGH ? (max) : \ 151 (pri) == MPL_LOW ? (min) : \ 152 ((min) + (((max) - (min)) / 2)) 153 154 #define MAC_FLOW_TAB_SIZE 500 155 156 typedef struct flow_entry_s flow_entry_t; 157 typedef struct flow_tab_s flow_tab_t; 158 typedef struct flow_state_s flow_state_t; 159 struct mac_impl_s; 160 struct mac_client_impl_s; 161 162 /* 163 * Classification flags used to lookup the flow. 164 */ 165 #define FLOW_INBOUND 0x01 166 #define FLOW_OUTBOUND 0x02 167 /* Don't compare VID when classifying the packets, see mac_rx_classify() */ 168 #define FLOW_IGNORE_VLAN 0x04 169 170 /* Generic flow client function signature */ 171 typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t); 172 173 /* Flow state */ 174 typedef enum { 175 FLOW_DRIVER_UPCALL, 176 FLOW_USER_REF 177 } mac_flow_state_t; 178 179 /* Matches a flow_entry_t using the extracted flow_state_t info */ 180 typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *, 181 flow_state_t *); 182 183 /* fe_flags */ 184 #define FE_QUIESCE 0x01 /* Quiesce the flow */ 185 #define FE_WAITER 0x02 /* Flow has a waiter */ 186 #define FE_FLOW_TAB 0x04 /* Flow is in the flow tab list */ 187 #define FE_G_FLOW_HASH 0x08 /* Flow is in the global flow hash */ 188 #define FE_INCIPIENT 0x10 /* Being setup */ 189 #define FE_CONDEMNED 0x20 /* Being deleted */ 190 #define FE_UF_NO_DATAPATH 0x40 /* No datapath setup for User flow */ 191 #define FE_MC_NO_DATAPATH 0x80 /* No datapath setup for mac client */ 192 193 /* fe_type */ 194 #define FLOW_PRIMARY_MAC 0x01 /* NIC primary MAC address */ 195 #define FLOW_VNIC_MAC 0x02 /* VNIC flow */ 196 #define FLOW_MCAST 0x04 /* Multicast (and broadcast) */ 197 #define FLOW_OTHER 0x08 /* Other flows configured */ 198 #define FLOW_USER 0x10 /* User defined flow */ 199 #define FLOW_VNIC FLOW_VNIC_MAC 200 #define FLOW_NO_STATS 0x20 /* Don't create stats for the flow */ 201 202 /* 203 * Shared Bandwidth control counters between the soft ring set and its 204 * associated soft rings. In case the flow associated with NIC/VNIC 205 * has a group of Rx rings assigned to it, we have the same 206 * number of soft ring sets as we have the Rx ring in the group 207 * and each individual SRS (and its soft rings) decide when to 208 * poll their Rx ring independently. But if there is a B/W limit 209 * associated with the NIC/VNIC, then the B/W control counter is 210 * shared across all the SRS in the group and their associated 211 * soft rings. 212 * 213 * There is a many to 1 mapping between the SRS and 214 * mac_bw_ctl if the flow has a group of Rx rings associated with 215 * it. 216 */ 217 typedef struct mac_bw_ctl_s { 218 kmutex_t mac_bw_lock; 219 uint32_t mac_bw_state; 220 size_t mac_bw_sz; /* ?? Is it needed */ 221 size_t mac_bw_limit; /* Max bytes to process per tick */ 222 size_t mac_bw_used; /* Bytes processed in current tick */ 223 size_t mac_bw_drop_threshold; /* Max queue length */ 224 size_t mac_bw_drop_bytes; 225 size_t mac_bw_polled; 226 size_t mac_bw_intr; 227 clock_t mac_bw_curr_time; 228 } mac_bw_ctl_t; 229 230 struct flow_entry_s { /* Protected by */ 231 struct flow_entry_s *fe_next; /* ft_lock */ 232 233 datalink_id_t fe_link_id; /* WO */ 234 235 /* Properties as specified for this flow */ 236 mac_resource_props_t fe_resource_props; /* SL */ 237 238 /* Properties actually effective at run time for this flow */ 239 mac_resource_props_t fe_effective_props; /* SL */ 240 241 kmutex_t fe_lock; 242 char fe_flow_name[MAXFLOWNAMELEN]; /* fe_lock */ 243 flow_desc_t fe_flow_desc; /* fe_lock */ 244 kcondvar_t fe_cv; /* fe_lock */ 245 /* 246 * Initial flow ref is 1 on creation. A thread that lookups the 247 * flent typically by a mac_flow_lookup() dynamically holds a ref. 248 * If the ref is 1, it means there arent' any upcalls from the driver 249 * or downcalls from the stack using this flent. Structures pointing 250 * to the flent or flent inserted in lists don't count towards this 251 * refcnt. Instead they are tracked using fe_flags. Only a control 252 * thread doing a teardown operation deletes the flent, after waiting 253 * for upcalls to finish synchronously. The fe_refcnt tracks 254 * the number of upcall refs 255 */ 256 uint32_t fe_refcnt; /* fe_lock */ 257 258 /* 259 * This tracks lookups done using the global hash list for user 260 * generated flows. This refcnt only protects the flent itself 261 * from disappearing and helps walkers to read the flent info such 262 * as flow spec. However the flent may be quiesced and the SRS could 263 * be deleted. The fe_user_refcnt tracks the number of global flow 264 * has refs. 265 */ 266 uint32_t fe_user_refcnt; /* fe_lock */ 267 uint_t fe_flags; /* fe_lock */ 268 269 /* 270 * Function/args to invoke for delivering matching packets 271 * Only the function ff_fn may be changed dynamically and atomically. 272 * The ff_arg1 and ff_arg2 are set at creation time and may not 273 * be changed. 274 */ 275 flow_fn_t fe_cb_fn; /* fe_lock */ 276 void *fe_cb_arg1; /* fe_lock */ 277 void *fe_cb_arg2; /* fe_lock */ 278 279 void *fe_client_cookie; /* WO */ 280 void *fe_rx_ring_group; /* SL */ 281 void *fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */ 282 int fe_rx_srs_cnt; /* fe_lock */ 283 void *fe_tx_ring_group; 284 void *fe_tx_srs; /* WO */ 285 int fe_tx_ring_cnt; 286 287 /* 288 * This is a unicast flow, and is a mac_client_impl_t 289 */ 290 void *fe_mcip; /* WO */ 291 292 /* 293 * Used by mci_flent_list of mac_client_impl_t to track flows sharing 294 * the same mac_client_impl_t. 295 */ 296 struct flow_entry_s *fe_client_next; 297 298 /* 299 * This is a broadcast or multicast flow and is a mac_bcast_grp_t 300 */ 301 void *fe_mbg; /* WO */ 302 uint_t fe_type; /* WO */ 303 304 /* 305 * BW control info. 306 */ 307 mac_bw_ctl_t fe_tx_bw; 308 mac_bw_ctl_t fe_rx_bw; 309 310 /* 311 * Used by flow table lookup code 312 */ 313 flow_match_fn_t fe_match; 314 315 /* 316 * Used by mac_flow_remove(). 317 */ 318 int fe_index; 319 flow_tab_t *fe_flow_tab; 320 321 kstat_t *fe_ksp; 322 kstat_t *fe_misc_stat_ksp; 323 324 boolean_t fe_desc_logged; 325 uint64_t fe_nic_speed; 326 }; 327 328 /* 329 * Various structures used by the flows framework for keeping track 330 * of packet state information. 331 */ 332 333 /* Layer 2 */ 334 typedef struct flow_l2info_s { 335 uchar_t *l2_start; 336 uint8_t *l2_daddr; 337 uint16_t l2_vid; 338 uint32_t l2_sap; 339 uint_t l2_hdrsize; 340 } flow_l2info_t; 341 342 /* Layer 3 */ 343 typedef struct flow_l3info_s { 344 uchar_t *l3_start; 345 uint8_t l3_protocol; 346 uint8_t l3_version; 347 boolean_t l3_dst_or_src; 348 uint_t l3_hdrsize; 349 boolean_t l3_fragmented; 350 } flow_l3info_t; 351 352 /* Layer 4 */ 353 typedef struct flow_l4info_s { 354 uchar_t *l4_start; 355 uint16_t l4_src_port; 356 uint16_t l4_dst_port; 357 uint16_t l4_hash_port; 358 } flow_l4info_t; 359 360 /* 361 * Combined state structure. 362 * Holds flow direction and an mblk_t pointer. 363 */ 364 struct flow_state_s { 365 uint_t fs_flags; 366 mblk_t *fs_mp; 367 flow_l2info_t fs_l2info; 368 flow_l3info_t fs_l3info; 369 flow_l4info_t fs_l4info; 370 }; 371 372 /* 373 * Flow ops vector. 374 * There are two groups of functions. The ones ending with _fe are 375 * called when a flow is being added. The others (hash, accept) are 376 * called at flow lookup time. 377 */ 378 #define FLOW_MAX_ACCEPT 16 379 typedef struct flow_ops_s { 380 /* 381 * fo_accept_fe(): 382 * Validates the contents of the flow and checks whether 383 * it's compatible with the flow table. sets the fe_match 384 * function of the flow. 385 */ 386 int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *); 387 /* 388 * fo_hash_fe(): 389 * Generates a hash index to the flow table. This function 390 * must use the same algorithm as fo_hash(), which is used 391 * by the flow lookup code path. 392 */ 393 uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *); 394 /* 395 * fo_match_fe(): 396 * This is used for finding identical flows. 397 */ 398 boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *, 399 flow_entry_t *); 400 /* 401 * fo_insert_fe(): 402 * Used for inserting a flow to a flow chain. 403 * Protocols that have special ordering requirements would 404 * need to implement this. For those that don't, 405 * flow_generic_insert_fe() may be used. 406 */ 407 int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **, 408 flow_entry_t *); 409 410 /* 411 * Calculates the flow hash index based on the accumulated 412 * state in flow_state_t. Must use the same algorithm as 413 * fo_hash_fe(). 414 */ 415 uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *); 416 417 /* 418 * Array of accept fuctions. 419 * Each function in the array will accumulate enough state 420 * (header length, protocol) to allow the next function to 421 * proceed. We support up to FLOW_MAX_ACCEPT functions which 422 * should be sufficient for all practical purposes. 423 */ 424 int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *, 425 flow_state_t *); 426 } flow_ops_t; 427 428 /* 429 * Generic flow table. 430 */ 431 struct flow_tab_s { 432 krwlock_t ft_lock; 433 /* 434 * Contains a list of functions (described above) 435 * specific to this table type. 436 */ 437 flow_ops_t ft_ops; 438 439 /* 440 * Indicates what types of flows are supported. 441 */ 442 flow_mask_t ft_mask; 443 444 /* 445 * An array of flow_entry_t * of size ft_size. 446 * Each element is the beginning of a hash chain. 447 */ 448 flow_entry_t **ft_table; 449 uint_t ft_size; 450 451 /* 452 * The number of flows inserted into ft_table. 453 */ 454 uint_t ft_flow_count; 455 struct mac_impl_s *ft_mip; 456 struct mac_client_impl_s *ft_mcip; 457 }; 458 459 /* 460 * This is used for describing what type of flow table can be created. 461 * mac_flow.c contains a list of these structures. 462 */ 463 typedef struct flow_tab_info_s { 464 flow_ops_t *fti_ops; 465 flow_mask_t fti_mask; 466 uint_t fti_size; 467 } flow_tab_info_t; 468 469 #define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0) 470 471 472 #define MCIP_STAT_UPDATE(m, s, c) { \ 473 ((mac_client_impl_t *)(m))->mci_misc_stat.mms_##s \ 474 += ((uint64_t)(c)); \ 475 } 476 477 #define SRS_RX_STAT_UPDATE(m, s, c) { \ 478 ((mac_soft_ring_set_t *)(m))->srs_rx.sr_stat.mrs_##s \ 479 += ((uint64_t)(c)); \ 480 } 481 482 #define SRS_TX_STAT_UPDATE(m, s, c) { \ 483 ((mac_soft_ring_set_t *)(m))->srs_tx.st_stat.mts_##s \ 484 += ((uint64_t)(c)); \ 485 } 486 487 #define SRS_TX_STATS_UPDATE(m, s) { \ 488 SRS_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets); \ 489 SRS_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes); \ 490 SRS_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors); \ 491 } 492 493 #define SOFTRING_TX_STAT_UPDATE(m, s, c) { \ 494 ((mac_soft_ring_t *)(m))->s_st_stat.mts_##s += ((uint64_t)(c)); \ 495 } 496 497 #define SOFTRING_TX_STATS_UPDATE(m, s) { \ 498 SOFTRING_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets); \ 499 SOFTRING_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes); \ 500 SOFTRING_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors); \ 501 } 502 503 extern void mac_flow_init(); 504 extern void mac_flow_fini(); 505 extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *, 506 char *, void *, uint_t, flow_entry_t **); 507 508 extern int mac_flow_add(flow_tab_t *, flow_entry_t *); 509 extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *, 510 boolean_t); 511 extern int mac_flow_hash_add(flow_entry_t *); 512 extern int mac_flow_lookup_byname(char *, flow_entry_t **); 513 extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t, 514 flow_entry_t **); 515 516 extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *), 517 void *); 518 519 extern int mac_flow_walk_nolock(flow_tab_t *, 520 int (*)(flow_entry_t *, void *), void *); 521 522 extern void mac_flow_modify(flow_tab_t *, flow_entry_t *, 523 mac_resource_props_t *); 524 525 extern void *mac_flow_get_client_cookie(flow_entry_t *); 526 527 extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *); 528 529 extern int mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *); 530 extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *); 531 extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *); 532 533 extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t); 534 extern void mac_flow_hash_remove(flow_entry_t *); 535 extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t); 536 extern void mac_flow_quiesce(flow_entry_t *); 537 extern void mac_flow_restart(flow_entry_t *); 538 extern void mac_flow_cleanup(flow_entry_t *); 539 extern void mac_flow_destroy(flow_entry_t *); 540 541 extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t, 542 struct mac_impl_s *, flow_tab_t **); 543 extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **); 544 extern void mac_flow_tab_destroy(flow_tab_t *); 545 extern void mac_flow_drop(void *, void *, mblk_t *); 546 extern void flow_stat_destroy(flow_entry_t *); 547 548 #ifdef __cplusplus 549 } 550 #endif 551 552 #endif /* _MAC_FLOW_IMPL_H */ 553