1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #ifndef _MAC_FLOW_IMPL_H 28 #define _MAC_FLOW_IMPL_H 29 30 #ifdef __cplusplus 31 extern "C" { 32 #endif 33 34 #include <sys/param.h> 35 #include <sys/atomic.h> 36 #include <sys/ksynch.h> 37 #include <sys/mac_flow.h> 38 #include <sys/stream.h> 39 #include <sys/sdt.h> 40 #include <net/if.h> 41 42 /* 43 * Macros to increment/decrement the reference count on a flow_entry_t. 44 */ 45 #define FLOW_REFHOLD(flent) { \ 46 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 47 mutex_enter(&(flent)->fe_lock); \ 48 (flent)->fe_refcnt++; \ 49 mutex_exit(&(flent)->fe_lock); \ 50 } 51 52 /* 53 * Data paths must not attempt to use a flow entry if it is marked INCIPIENT 54 * or QUIESCE. In the former case the set up is not yet complete and the 55 * data path could stumble on inconsistent data structures. In the latter 56 * case a control operation is waiting for quiescence so that it can 57 * change callbacks or other structures without the use of locks. 58 */ 59 #define FLOW_TRY_REFHOLD(flent, err) { \ 60 DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ 61 (err) = 0; \ 62 mutex_enter(&(flent)->fe_lock); \ 63 if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \ 64 FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH)) \ 65 (err) = -1; \ 66 else \ 67 (flent)->fe_refcnt++; \ 68 mutex_exit(&(flent)->fe_lock); \ 69 } 70 71 #define FLOW_REFRELE(flent) { \ 72 DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent)); \ 73 mutex_enter(&(flent)->fe_lock); \ 74 ASSERT((flent)->fe_refcnt != 0); \ 75 (flent)->fe_refcnt--; \ 76 if ((flent)->fe_flags & FE_WAITER) { \ 77 ASSERT((flent)->fe_refcnt != 0); \ 78 cv_signal(&(flent)->fe_cv); \ 79 mutex_exit(&(flent)->fe_lock); \ 80 } else if ((flent)->fe_refcnt == 0) { \ 81 mac_flow_destroy(flent); \ 82 } else { \ 83 mutex_exit(&(flent)->fe_lock); \ 84 } \ 85 } 86 87 #define FLOW_USER_REFHOLD(flent) { \ 88 mutex_enter(&(flent)->fe_lock); \ 89 (flent)->fe_user_refcnt++; \ 90 mutex_exit(&(flent)->fe_lock); \ 91 } 92 93 #define FLOW_USER_REFRELE(flent) { \ 94 mutex_enter(&(flent)->fe_lock); \ 95 ASSERT((flent)->fe_user_refcnt != 0); \ 96 if (--(flent)->fe_user_refcnt == 0 && \ 97 ((flent)->fe_flags & FE_WAITER)) \ 98 cv_signal(&(flent)->fe_cv); \ 99 mutex_exit(&(flent)->fe_lock); \ 100 } 101 102 #define FLOW_FINAL_REFRELE(flent) { \ 103 ASSERT(flent->fe_refcnt == 1 && flent->fe_user_refcnt == 0); \ 104 FLOW_REFRELE(flent); \ 105 } 106 107 /* 108 * Mark or unmark the flent with a bit flag 109 */ 110 #define FLOW_MARK(flent, flag) { \ 111 mutex_enter(&(flent)->fe_lock); \ 112 (flent)->fe_flags |= flag; \ 113 mutex_exit(&(flent)->fe_lock); \ 114 } 115 116 #define FLOW_UNMARK(flent, flag) { \ 117 mutex_enter(&(flent)->fe_lock); \ 118 (flent)->fe_flags &= ~flag; \ 119 mutex_exit(&(flent)->fe_lock); \ 120 } 121 122 #define FLENT_TO_MIP(flent) \ 123 (flent->fe_mbg != NULL ? mac_bcast_grp_mip(flent->fe_mbg) : \ 124 ((mac_client_impl_t *)flent->fe_mcip)->mci_mip) 125 126 /* Convert a bandwidth expressed in bps to a number of bytes per tick. */ 127 #define FLOW_BYTES_PER_TICK(bps) (((bps) >> 3) / hz) 128 129 /* 130 * Given an underlying range and a priority level, obtain the minimum for the 131 * new range. 132 */ 133 #define FLOW_MIN_PRIORITY(min, max, pri) \ 134 ((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri))) 135 136 /* 137 * Given an underlying range and a minimum level (base), obtain the maximum 138 * for the new range. 139 */ 140 #define FLOW_MAX_PRIORITY(min, max, base) \ 141 ((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS)) 142 143 /* 144 * Given an underlying range and a priority level, get the absolute 145 * priority value. For now there are just 3 values, high, low and 146 * medium so we can just return max, min or min + (max - min) / 2. 147 * If there are more than three we need to change this computation. 148 */ 149 #define FLOW_PRIORITY(min, max, pri) \ 150 (pri) == MPL_HIGH ? (max) : \ 151 (pri) == MPL_LOW ? (min) : \ 152 ((min) + (((max) - (min)) / 2)) 153 154 #define MAC_FLOW_TAB_SIZE 500 155 156 typedef struct flow_entry_s flow_entry_t; 157 typedef struct flow_tab_s flow_tab_t; 158 typedef struct flow_state_s flow_state_t; 159 struct mac_impl_s; 160 struct mac_client_impl_s; 161 162 /* 163 * Classification flags used to lookup the flow. 164 */ 165 #define FLOW_INBOUND 0x01 166 #define FLOW_OUTBOUND 0x02 167 /* Don't compare VID when classifying the packets, see mac_rx_classify() */ 168 #define FLOW_IGNORE_VLAN 0x04 169 170 /* Generic flow client function signature */ 171 typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t); 172 173 /* Flow state */ 174 typedef enum { 175 FLOW_DRIVER_UPCALL, 176 FLOW_USER_REF 177 } mac_flow_state_t; 178 179 /* Matches a flow_entry_t using the extracted flow_state_t info */ 180 typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *, 181 flow_state_t *); 182 183 /* fe_flags */ 184 #define FE_QUIESCE 0x01 /* Quiesce the flow */ 185 #define FE_WAITER 0x02 /* Flow has a waiter */ 186 #define FE_FLOW_TAB 0x04 /* Flow is in the flow tab list */ 187 #define FE_G_FLOW_HASH 0x08 /* Flow is in the global flow hash */ 188 #define FE_INCIPIENT 0x10 /* Being setup */ 189 #define FE_CONDEMNED 0x20 /* Being deleted */ 190 #define FE_UF_NO_DATAPATH 0x40 /* No datapath setup for User flow */ 191 #define FE_MC_NO_DATAPATH 0x80 /* No datapath setup for mac client */ 192 193 /* fe_type */ 194 #define FLOW_PRIMARY_MAC 0x01 /* NIC primary MAC address */ 195 #define FLOW_VNIC_MAC 0x02 /* VNIC flow */ 196 #define FLOW_MCAST 0x04 /* Multicast (and broadcast) */ 197 #define FLOW_OTHER 0x08 /* Other flows configured */ 198 #define FLOW_USER 0x10 /* User defined flow */ 199 #define FLOW_VNIC FLOW_VNIC_MAC 200 #define FLOW_NO_STATS 0x20 /* Don't create stats for the flow */ 201 202 /* 203 * Shared Bandwidth control counters between the soft ring set and its 204 * associated soft rings. In case the flow associated with NIC/VNIC 205 * has a group of Rx rings assigned to it, we have the same 206 * number of soft ring sets as we have the Rx ring in the group 207 * and each individual SRS (and its soft rings) decide when to 208 * poll their Rx ring independently. But if there is a B/W limit 209 * associated with the NIC/VNIC, then the B/W control counter is 210 * shared across all the SRS in the group and their associated 211 * soft rings. 212 * 213 * There is a many to 1 mapping between the SRS and 214 * mac_bw_ctl if the flow has a group of Rx rings associated with 215 * it. 216 */ 217 typedef struct mac_bw_ctl_s { 218 kmutex_t mac_bw_lock; 219 uint32_t mac_bw_state; 220 size_t mac_bw_sz; /* ?? Is it needed */ 221 size_t mac_bw_limit; /* Max bytes to process per tick */ 222 size_t mac_bw_used; /* Bytes processed in current tick */ 223 size_t mac_bw_drop_threshold; /* Max queue length */ 224 size_t mac_bw_drop_bytes; 225 size_t mac_bw_polled; 226 size_t mac_bw_intr; 227 clock_t mac_bw_curr_time; 228 } mac_bw_ctl_t; 229 230 struct flow_entry_s { /* Protected by */ 231 struct flow_entry_s *fe_next; /* ft_lock */ 232 233 datalink_id_t fe_link_id; /* WO */ 234 235 /* Properties as specified for this flow */ 236 mac_resource_props_t fe_resource_props; /* SL */ 237 238 /* Properties actually effective at run time for this flow */ 239 mac_resource_props_t fe_effective_props; /* SL */ 240 241 kmutex_t fe_lock; 242 char fe_flow_name[MAXFLOWNAMELEN]; /* fe_lock */ 243 flow_desc_t fe_flow_desc; /* fe_lock */ 244 kcondvar_t fe_cv; /* fe_lock */ 245 /* 246 * Initial flow ref is 1 on creation. A thread that lookups the 247 * flent typically by a mac_flow_lookup() dynamically holds a ref. 248 * If the ref is 1, it means there arent' any upcalls from the driver 249 * or downcalls from the stack using this flent. Structures pointing 250 * to the flent or flent inserted in lists don't count towards this 251 * refcnt. Instead they are tracked using fe_flags. Only a control 252 * thread doing a teardown operation deletes the flent, after waiting 253 * for upcalls to finish synchronously. The fe_refcnt tracks 254 * the number of upcall refs 255 */ 256 uint32_t fe_refcnt; /* fe_lock */ 257 258 /* 259 * This tracks lookups done using the global hash list for user 260 * generated flows. This refcnt only protects the flent itself 261 * from disappearing and helps walkers to read the flent info such 262 * as flow spec. However the flent may be quiesced and the SRS could 263 * be deleted. The fe_user_refcnt tracks the number of global flow 264 * has refs. 265 */ 266 uint32_t fe_user_refcnt; /* fe_lock */ 267 uint_t fe_flags; /* fe_lock */ 268 269 /* 270 * Function/args to invoke for delivering matching packets 271 * Only the function ff_fn may be changed dynamically and atomically. 272 * The ff_arg1 and ff_arg2 are set at creation time and may not 273 * be changed. 274 */ 275 flow_fn_t fe_cb_fn; /* fe_lock */ 276 void *fe_cb_arg1; /* fe_lock */ 277 void *fe_cb_arg2; /* fe_lock */ 278 279 void *fe_client_cookie; /* WO */ 280 void *fe_rx_ring_group; /* SL */ 281 void *fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */ 282 int fe_rx_srs_cnt; /* fe_lock */ 283 void *fe_tx_srs; /* WO */ 284 285 /* 286 * This is a unicast flow, and is a mac_client_impl_t 287 */ 288 void *fe_mcip; /* WO */ 289 290 /* 291 * Used by mci_flent_list of mac_client_impl_t to track flows sharing 292 * the same mac_client_impl_t. 293 */ 294 struct flow_entry_s *fe_client_next; 295 296 /* 297 * This is a broadcast or multicast flow and is a mac_bcast_grp_t 298 */ 299 void *fe_mbg; /* WO */ 300 uint_t fe_type; /* WO */ 301 302 /* 303 * BW control info. 304 */ 305 mac_bw_ctl_t fe_tx_bw; 306 mac_bw_ctl_t fe_rx_bw; 307 308 /* 309 * Used by flow table lookup code 310 */ 311 flow_match_fn_t fe_match; 312 313 /* 314 * Used by mac_flow_remove(). 315 */ 316 int fe_index; 317 flow_tab_t *fe_flow_tab; 318 319 kstat_t *fe_ksp; 320 flow_stats_t fe_flowstats; 321 boolean_t fe_desc_logged; 322 zoneid_t fe_zoneid; 323 uint64_t fe_nic_speed; 324 }; 325 326 /* 327 * Various structures used by the flows framework for keeping track 328 * of packet state information. 329 */ 330 331 /* Layer 2 */ 332 typedef struct flow_l2info_s { 333 uchar_t *l2_start; 334 uint8_t *l2_daddr; 335 uint16_t l2_vid; 336 uint32_t l2_sap; 337 uint_t l2_hdrsize; 338 } flow_l2info_t; 339 340 /* Layer 3 */ 341 typedef struct flow_l3info_s { 342 uchar_t *l3_start; 343 uint8_t l3_protocol; 344 uint8_t l3_version; 345 boolean_t l3_dst_or_src; 346 uint_t l3_hdrsize; 347 boolean_t l3_fragmented; 348 } flow_l3info_t; 349 350 /* Layer 4 */ 351 typedef struct flow_l4info_s { 352 uchar_t *l4_start; 353 uint16_t l4_src_port; 354 uint16_t l4_dst_port; 355 uint16_t l4_hash_port; 356 } flow_l4info_t; 357 358 /* 359 * Combined state structure. 360 * Holds flow direction and an mblk_t pointer. 361 */ 362 struct flow_state_s { 363 uint_t fs_flags; 364 mblk_t *fs_mp; 365 flow_l2info_t fs_l2info; 366 flow_l3info_t fs_l3info; 367 flow_l4info_t fs_l4info; 368 }; 369 370 /* 371 * Flow ops vector. 372 * There are two groups of functions. The ones ending with _fe are 373 * called when a flow is being added. The others (hash, accept) are 374 * called at flow lookup time. 375 */ 376 #define FLOW_MAX_ACCEPT 16 377 typedef struct flow_ops_s { 378 /* 379 * fo_accept_fe(): 380 * Validates the contents of the flow and checks whether 381 * it's compatible with the flow table. sets the fe_match 382 * function of the flow. 383 */ 384 int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *); 385 /* 386 * fo_hash_fe(): 387 * Generates a hash index to the flow table. This function 388 * must use the same algorithm as fo_hash(), which is used 389 * by the flow lookup code path. 390 */ 391 uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *); 392 /* 393 * fo_match_fe(): 394 * This is used for finding identical flows. 395 */ 396 boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *, 397 flow_entry_t *); 398 /* 399 * fo_insert_fe(): 400 * Used for inserting a flow to a flow chain. 401 * Protocols that have special ordering requirements would 402 * need to implement this. For those that don't, 403 * flow_generic_insert_fe() may be used. 404 */ 405 int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **, 406 flow_entry_t *); 407 408 /* 409 * Calculates the flow hash index based on the accumulated 410 * state in flow_state_t. Must use the same algorithm as 411 * fo_hash_fe(). 412 */ 413 uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *); 414 415 /* 416 * Array of accept fuctions. 417 * Each function in the array will accumulate enough state 418 * (header length, protocol) to allow the next function to 419 * proceed. We support up to FLOW_MAX_ACCEPT functions which 420 * should be sufficient for all practical purposes. 421 */ 422 int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *, 423 flow_state_t *); 424 } flow_ops_t; 425 426 /* 427 * Generic flow table. 428 */ 429 struct flow_tab_s { 430 krwlock_t ft_lock; 431 /* 432 * Contains a list of functions (described above) 433 * specific to this table type. 434 */ 435 flow_ops_t ft_ops; 436 437 /* 438 * Indicates what types of flows are supported. 439 */ 440 flow_mask_t ft_mask; 441 442 /* 443 * An array of flow_entry_t * of size ft_size. 444 * Each element is the beginning of a hash chain. 445 */ 446 flow_entry_t **ft_table; 447 uint_t ft_size; 448 449 /* 450 * The number of flows inserted into ft_table. 451 */ 452 uint_t ft_flow_count; 453 struct mac_impl_s *ft_mip; 454 struct mac_client_impl_s *ft_mcip; 455 }; 456 457 /* 458 * This is used for describing what type of flow table can be created. 459 * mac_flow.c contains a list of these structures. 460 */ 461 typedef struct flow_tab_info_s { 462 flow_ops_t *fti_ops; 463 flow_mask_t fti_mask; 464 uint_t fti_size; 465 } flow_tab_info_t; 466 467 #define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0) 468 469 /* 470 * This is used by mac_tx_send. 471 */ 472 typedef struct mac_tx_stats_s { 473 uint_t ts_opackets; 474 uint_t ts_obytes; 475 uint_t ts_oerrors; 476 } mac_tx_stats_t; 477 478 #define FLOW_STAT_UPDATE(f, s, c) { \ 479 ((flow_entry_t *)(f))->fe_flowstats.fs_##s += ((uint64_t)(c)); \ 480 } 481 482 #define FLOW_TX_STATS_UPDATE(f, s) { \ 483 FLOW_STAT_UPDATE((f), opackets, (s)->ts_opackets); \ 484 FLOW_STAT_UPDATE((f), obytes, (s)->ts_obytes); \ 485 FLOW_STAT_UPDATE((f), oerrors, (s)->ts_oerrors); \ 486 } 487 488 extern void mac_flow_init(); 489 extern void mac_flow_fini(); 490 extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *, 491 char *, void *, uint_t, flow_entry_t **); 492 493 extern int mac_flow_add(flow_tab_t *, flow_entry_t *); 494 extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *, 495 boolean_t); 496 extern int mac_flow_hash_add(flow_entry_t *); 497 extern int mac_flow_lookup_byname(char *, flow_entry_t **); 498 extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t, 499 flow_entry_t **); 500 501 extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *), 502 void *); 503 504 extern int mac_flow_walk_nolock(flow_tab_t *, 505 int (*)(flow_entry_t *, void *), void *); 506 507 extern void mac_flow_modify(flow_tab_t *, flow_entry_t *, 508 mac_resource_props_t *); 509 510 extern void *mac_flow_get_client_cookie(flow_entry_t *); 511 512 extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *); 513 514 extern int mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *); 515 extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *); 516 extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *); 517 518 extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t); 519 extern void mac_flow_hash_remove(flow_entry_t *); 520 extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t); 521 extern void mac_flow_quiesce(flow_entry_t *); 522 extern void mac_flow_restart(flow_entry_t *); 523 extern void mac_flow_cleanup(flow_entry_t *); 524 extern void mac_flow_destroy(flow_entry_t *); 525 526 extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t, 527 struct mac_impl_s *, flow_tab_t **); 528 extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **); 529 extern void mac_flow_tab_destroy(flow_tab_t *); 530 extern void mac_flow_drop(void *, void *, mblk_t *); 531 extern void flow_stat_destroy(flow_entry_t *); 532 533 #ifdef __cplusplus 534 } 535 #endif 536 537 #endif /* _MAC_FLOW_IMPL_H */ 538