xref: /illumos-gate/usr/src/uts/common/sys/mac_flow_impl.h (revision 00c09443b66b156809f3c9fc8f098e07c7842aa4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2026 Oxide Computer Company
26  */
27 
28 #ifndef	_MAC_FLOW_IMPL_H
29 #define	_MAC_FLOW_IMPL_H
30 
31 #ifdef	__cplusplus
32 extern "C" {
33 #endif
34 
35 #include <sys/param.h>
36 #include <sys/atomic.h>
37 #include <sys/ksynch.h>
38 #include <sys/mac_flow.h>
39 #include <sys/stream.h>
40 #include <sys/sdt.h>
41 #include <net/if.h>
42 
/*
 * Reference counting helpers for a flow_entry_t.
 *
 * FLOW_REFHOLD() unconditionally takes one reference: it fires the
 * flow_refhold DTrace probe and then increments fe_refcnt with fe_lock
 * held. The expansion is a bare brace block, not a single statement.
 */
#define	FLOW_REFHOLD(flent) {					\
	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
	mutex_enter(&(flent)->fe_lock);				\
	++(flent)->fe_refcnt;					\
	mutex_exit(&(flent)->fe_lock);				\
}
52 
/*
 * Take a reference only if the flow is usable by a data path.
 *
 * The hold is refused, and err is set to -1, when any of FE_INCIPIENT,
 * FE_QUIESCE, FE_CONDEMNED, FE_UF_NO_DATAPATH or FE_MC_NO_DATAPATH is set
 * in fe_flags. An INCIPIENT flow is not fully set up yet, so the data path
 * could stumble on inconsistent data structures; a QUIESCE flow has a
 * control operation waiting for quiescence so that it can change callbacks
 * or other structures without the use of locks. Otherwise fe_refcnt is
 * incremented and err is left at 0. The flag check and the increment are
 * both done under fe_lock.
 */
#define	FLOW_TRY_REFHOLD(flent, err) {				\
	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
	(err) = 0;						\
	mutex_enter(&(flent)->fe_lock);				\
	if (((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE |	\
	    FE_CONDEMNED | FE_UF_NO_DATAPATH |			\
	    FE_MC_NO_DATAPATH)) == 0)				\
		(flent)->fe_refcnt++;				\
	else							\
		(err) = -1;					\
	mutex_exit(&(flent)->fe_lock);				\
}
71 
/*
 * Drop one reference on the flow.
 *
 * fe_refcnt is decremented under fe_lock. If the count reaches zero and
 * no FE_WAITER is flagged, mac_flow_destroy() is called and this macro
 * does not release fe_lock itself on that path. In every other case
 * fe_lock is dropped here; when FE_WAITER is set the waiter is signalled
 * through fe_cv first, and the count is asserted to still be non-zero.
 */
#define	FLOW_REFRELE(flent) {					\
	DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent));	\
	mutex_enter(&(flent)->fe_lock);				\
	ASSERT((flent)->fe_refcnt != 0);			\
	(flent)->fe_refcnt -= 1;				\
	if (((flent)->fe_flags & FE_WAITER) == 0 &&		\
	    (flent)->fe_refcnt == 0) {				\
		mac_flow_destroy(flent);			\
	} else {						\
		if ((flent)->fe_flags & FE_WAITER) {		\
			ASSERT((flent)->fe_refcnt != 0);	\
			cv_signal(&(flent)->fe_cv);		\
		}						\
		mutex_exit(&(flent)->fe_lock);			\
	}							\
}
87 
/*
 * Take one user reference: increment fe_user_refcnt under fe_lock.
 * No flow state flags are checked.
 */
#define	FLOW_USER_REFHOLD(flent) {			\
	mutex_enter(&(flent)->fe_lock);			\
	(flent)->fe_user_refcnt += 1;			\
	mutex_exit(&(flent)->fe_lock);			\
}
93 
/*
 * Drop one user reference under fe_lock. When the user count reaches
 * zero and FE_WAITER is set, the waiter is signalled through fe_cv.
 * The flow is never destroyed from here.
 */
#define	FLOW_USER_REFRELE(flent) {			\
	mutex_enter(&(flent)->fe_lock);			\
	ASSERT((flent)->fe_user_refcnt != 0);		\
	(flent)->fe_user_refcnt--;			\
	if ((flent)->fe_user_refcnt == 0 &&		\
	    ((flent)->fe_flags & FE_WAITER) != 0) {	\
		cv_signal(&(flent)->fe_cv);		\
	}						\
	mutex_exit(&(flent)->fe_lock);			\
}
102 
/*
 * Drop what the caller asserts is the last reference: fe_refcnt must be
 * exactly 1 and fe_user_refcnt must be 0. The release itself is done by
 * FLOW_REFRELE(). The argument is parenthesized in the assertion so that
 * any pointer-valued expression may be passed, not just an identifier.
 */
#define	FLOW_FINAL_REFRELE(flent) {			\
	ASSERT((flent)->fe_refcnt == 1 &&		\
	    (flent)->fe_user_refcnt == 0);		\
	FLOW_REFRELE(flent);				\
}
107 
/*
 * Set the given flag bit(s) in fe_flags while holding fe_lock.
 * The flag argument is parenthesized so that it is handled the same way
 * as in FLOW_UNMARK(), whatever expression the caller passes.
 */
#define	FLOW_MARK(flent, flag) {		\
	mutex_enter(&(flent)->fe_lock);		\
	(flent)->fe_flags |= (flag);		\
	mutex_exit(&(flent)->fe_lock);		\
}
116 
/*
 * Clear the given flag bit(s) in fe_flags while holding fe_lock.
 * The flag argument must be parenthesized before it is complemented:
 * without the parentheses a compound argument such as (A | B) expands to
 * ~A | B, which clears the wrong bits.
 */
#define	FLOW_UNMARK(flent, flag) {		\
	mutex_enter(&(flent)->fe_lock);		\
	(flent)->fe_flags &= ~(flag);		\
	mutex_exit(&(flent)->fe_lock);		\
}
122 
/*
 * Yield the MAC instance pointer for a flow: via mac_bcast_grp_mip() when
 * the flow has a broadcast/multicast group (fe_mbg != NULL), otherwise the
 * mci_mip of the mac_client_impl_t that fe_mcip points to. The argument is
 * parenthesized so that any pointer-valued expression may be passed.
 */
#define	FLENT_TO_MIP(flent)						\
	((flent)->fe_mbg != NULL ?					\
	    mac_bcast_grp_mip((flent)->fe_mbg) :			\
	    ((mac_client_impl_t *)(flent)->fe_mcip)->mci_mip)
126 
/*
 * Convert a bandwidth given in bits per second into bytes per clock tick:
 * bits are turned into bytes with a right shift by 3, and the result is
 * divided by the tick rate hz. Integer division truncates, so a bandwidth
 * below 8 * hz bits per second yields 0.
 */
#define	FLOW_BYTES_PER_TICK(bps)	(((bps) >> 3) / (hz))
129 
/*
 * Lower bound of the sub-range for priority level pri, given the
 * underlying range [min, max]: the range is cut into MRP_PRIORITY_LEVELS
 * equal slices (integer division) and the result is min plus pri slices.
 */
#define	FLOW_MIN_PRIORITY(min, max, pri)	\
	((min) + ((pri) * (((max) - (min)) / MRP_PRIORITY_LEVELS)))
136 
/*
 * Upper bound of a sub-range that starts at base, given the underlying
 * range [min, max]: base plus one slice, where a slice is
 * (max - min) / MRP_PRIORITY_LEVELS using integer division.
 */
#define	FLOW_MAX_PRIORITY(min, max, base)	\
	((((max) - (min)) / MRP_PRIORITY_LEVELS) + (base))
143 
/*
 * Given an underlying range and a priority level, get the absolute
 * priority value. For now there are just three levels (high, medium and
 * low), so this returns max for MPL_HIGH, min for MPL_LOW and the
 * midpoint min + (max - min) / 2 for anything else. If more levels are
 * added this computation needs to change.
 *
 * The whole expansion is parenthesized so that the conditional expression
 * stays intact when the macro is used inside a larger expression.
 */
#define	FLOW_PRIORITY(min, max, pri)		\
	((pri) == MPL_HIGH ? (max) :		\
	(pri) == MPL_LOW ? (min) :		\
	((min) + (((max) - (min)) / 2)))
154 
/*
 * Flow table size constant.
 * NOTE(review): presumably the default number of hash buckets for a flow
 * table; it is not used in this header, so confirm against mac_flow.c.
 */
155 #define	MAC_FLOW_TAB_SIZE		500
156 
/*
 * Short type names for the flow structures defined later in this header,
 * followed by forward declarations of MAC structures that this header
 * only refers to through pointers.
 */
157 typedef struct flow_entry_s		flow_entry_t;
158 typedef struct flow_tab_s		flow_tab_t;
159 typedef struct flow_state_s		flow_state_t;
160 struct mac_impl_s;
161 struct mac_client_impl_s;
162 struct mac_soft_ring_set_s;
163 struct mac_group_s;
164 struct mac_bcast_grp_s;
165 
166 /*
167  * Classification flags used to look up the flow. The values are distinct
 * bits.
 * NOTE(review): presumably these are the uint_t flags handed to
 * mac_flow_lookup() and kept in flow_state_t's fs_flags; confirm.
168  */
169 #define	FLOW_INBOUND		0x01
170 #define	FLOW_OUTBOUND		0x02
171 /* Don't compare the VID when classifying packets, see mac_rx_classify() */
172 #define	FLOW_IGNORE_VLAN	0x04
173 
174 /*
 * Generic flow client function signature; this is the type of the
 * fe_cb_fn callback in flow_entry_t.
 * NOTE(review): the two void * arguments presumably correspond to
 * fe_cb_arg1 and fe_cb_arg2; the meaning of the boolean_t is not visible
 * in this header, confirm with the callers.
 */
175 typedef void		(*flow_fn_t)(void *, void *, mblk_t *, boolean_t);
176 
/*
 * Flow state selector; this is the type of the second argument of
 * mac_flow_wait().
 */
typedef enum {
	FLOW_DRIVER_UPCALL = 0,
	FLOW_USER_REF = 1
} mac_flow_state_t;
182 
183 /*
 * Matches a flow_entry_t using the extracted flow_state_t info. This is
 * the type of the per-flow fe_match hook in flow_entry_t.
 */
184 typedef boolean_t	(*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
185 			    flow_state_t *);
186 
/*
 * State bits kept in a flow entry's fe_flags. Each value is a distinct
 * bit so that several may be set at once.
 */
typedef enum {
	FE_QUIESCE		= 1 << 0,	/* Quiesce the flow */
	FE_WAITER		= 1 << 1,	/* Flow has a waiter */
	FE_FLOW_TAB		= 1 << 2,	/* In the flow tab list */
	FE_G_FLOW_HASH		= 1 << 3,	/* In the global flow hash */
	FE_INCIPIENT		= 1 << 4,	/* Being setup */
	FE_CONDEMNED		= 1 << 5,	/* Being deleted */
	FE_UF_NO_DATAPATH	= 1 << 6,	/* No datapath: user flow */
	FE_MC_NO_DATAPATH	= 1 << 7,	/* No datapath: mac client */
} flow_entry_flags_t;
205 
/*
 * Flow type bits kept in a flow entry's fe_type. Each value is a distinct
 * bit.
 */
typedef enum {
	FLOW_PRIMARY_MAC	= 1 << 0,	/* NIC primary MAC address */
	FLOW_VNIC_MAC		= 1 << 1,	/* VNIC flow */
	FLOW_MCAST		= 1 << 2,	/* Multicast (and broadcast) */
	FLOW_OTHER		= 1 << 3,	/* Other flows configured */
	FLOW_USER		= 1 << 4,	/* User defined flow */
	FLOW_NO_STATS		= 1 << 5,	/* Don't create flow stats */
} flow_entry_type_t;

/* Alternate spelling of FLOW_VNIC_MAC. */
#define	FLOW_VNIC		FLOW_VNIC_MAC
222 
223 /*
224  * Bandwidth control counters shared between the soft ring set (SRS) and
225  * its associated soft rings. In case the flow associated with a NIC/VNIC
226  * has a group of Rx rings assigned to it, there are as many soft ring
227  * sets as there are Rx rings in the group,
228  * and each individual SRS (and its soft rings) decides when to
229  * poll its Rx ring independently. But if there is a B/W limit
230  * associated with the NIC/VNIC, then the B/W control counter is
231  * shared across all the SRSes in the group and their associated
232  * soft rings.
233  *
234  * There is a many to 1 mapping between the SRS and
235  * mac_bw_ctl if the flow has a group of Rx rings associated with
236  * it.
 *
 * NOTE(review): mac_bw_lock presumably protects the other members; the
 * meaning of the uncommented counters is inferred from their names only.
237  */
238 typedef struct mac_bw_ctl_s {
239 	kmutex_t	mac_bw_lock;
240 	uint32_t	mac_bw_state;
241 	size_t		mac_bw_sz;	/* ?? Is it needed */
242 	size_t		mac_bw_limit;	/* Max bytes to process per tick */
243 	size_t		mac_bw_used;	/* Bytes processed in current tick */
244 	size_t		mac_bw_drop_threshold; /* Max queue length */
245 	size_t		mac_bw_drop_bytes;
246 	size_t		mac_bw_polled;
247 	size_t		mac_bw_intr;
248 	clock_t		mac_bw_curr_time;
249 } mac_bw_ctl_t;
250 
/*
 * A flow entry. The trailing comment on each member names what protects
 * it: a lock name (ft_lock, fe_lock) means that lock must be held.
 * NOTE(review): the "WO" and "SL" tags are not defined in this header;
 * presumably "write once" and the MAC serialization lock (perimeter).
 * Confirm against the MAC framework locking notes.
 */
251 struct flow_entry_s {					/* Protected by */
252 	flow_entry_t		*fe_next;		/* ft_lock */
253 
254 	datalink_id_t		fe_link_id;		/* WO */
255 
256 	/* Properties as specified for this flow */
257 	mac_resource_props_t	fe_resource_props;	/* SL */
258 
259 	/* Properties actually effective at run time for this flow */
260 	mac_resource_props_t	fe_effective_props;	/* SL */
261 
262 	kmutex_t		fe_lock;
263 	char			fe_flow_name[MAXFLOWNAMELEN];	/* fe_lock */
264 	flow_desc_t		fe_flow_desc;		/* fe_lock */
265 	kcondvar_t		fe_cv;			/* fe_lock */
266 	/*
267 	 * Initial flow ref is 1 on creation. A thread that looks up the
268 	 * flent, typically via mac_flow_lookup(), dynamically holds a ref.
269 	 * If the ref is 1, it means there aren't any upcalls from the driver
270 	 * or downcalls from the stack using this flent. Structures pointing
271 	 * to the flent or flent inserted in lists don't count towards this
272 	 * refcnt. Instead they are tracked using fe_flags. Only a control
273 	 * thread doing a teardown operation deletes the flent, after waiting
274 	 * for upcalls to finish synchronously. The fe_refcnt tracks
275 	 * the number of upcall refs.
276 	 */
277 	uint32_t		fe_refcnt;		/* fe_lock */
278 
279 	/*
280 	 * This tracks lookups done using the global hash list for user
281 	 * generated flows. This refcnt only protects the flent itself
282 	 * from disappearing and helps walkers to read the flent info such
283 	 * as flow spec. However the flent may be quiesced and the SRS could
284 	 * be deleted. The fe_user_refcnt tracks the number of global flow
285 	 * hash refs.
286 	 */
287 	uint32_t		fe_user_refcnt;		/* fe_lock */
288 	flow_entry_flags_t	fe_flags;		/* fe_lock */
289 
290 	/*
291 	 * Function/args to invoke for delivering matching packets.
292 	 * Only the function fe_cb_fn may be changed dynamically and
	 * atomically.
293 	 * The fe_cb_arg1 and fe_cb_arg2 are set at creation time and may not
294 	 * be changed.
295 	 */
296 	flow_fn_t		fe_cb_fn;		/* fe_lock */
297 	void			*fe_cb_arg1;		/* fe_lock */
298 	void			*fe_cb_arg2;		/* fe_lock */
299 
300 	void			*fe_client_cookie;	/* WO */
301 	struct mac_group_s	*fe_rx_ring_group;	/* SL */
302 
303 							/* fe_lock */
304 	struct mac_soft_ring_set_s	*fe_rx_srs[MAX_RINGS_PER_GROUP];
305 	uint32_t			fe_rx_srs_cnt;		/* fe_lock */
306 	struct mac_group_s		*fe_tx_ring_group;
307 	struct mac_soft_ring_set_s	*fe_tx_srs;		/* WO */
308 
309 	/*
310 	 * This is a unicast flow, and is a mac_client_impl_t
311 	 */
312 	struct mac_client_impl_s	*fe_mcip;		/* WO */
313 
314 	/*
315 	 * Used by mci_flent_list of mac_client_impl_t to track flows sharing
316 	 * the same mac_client_impl_t.
317 	 */
318 	flow_entry_t		*fe_client_next;
319 
320 	/*
321 	 * This is a broadcast or multicast flow and is a mac_bcast_grp_t
322 	 */
323 	struct mac_bcast_grp_s	*fe_mbg;		/* WO */
324 	flow_entry_type_t	fe_type;		/* WO */
325 
326 	/*
327 	 * BW control info.
328 	 */
329 	mac_bw_ctl_t		fe_tx_bw;
330 	mac_bw_ctl_t		fe_rx_bw;
331 
332 	/*
333 	 * Used by flow table lookup code
334 	 */
335 	flow_match_fn_t		fe_match;
336 
337 	/*
338 	 * Used by mac_flow_remove().
339 	 */
340 	int			fe_index;
341 	flow_tab_t		*fe_flow_tab;
342 
343 	kstat_t			*fe_ksp;
344 	kstat_t			*fe_misc_stat_ksp;
345 
346 	boolean_t		fe_desc_logged;
347 	uint64_t		fe_nic_speed;
348 };
349 
350 /*
351  * Various structures used by the flows framework for keeping track
352  * of packet state information.
353  */
354 
355 /*
 * Layer 2 state, embedded in flow_state_t as fs_l2info.
 * NOTE(review): member meanings below are inferred from their names
 * (start of the L2 header, destination address, VLAN id, SAP, header
 * size); confirm against the accept functions in mac_flow.c.
 */
356 typedef struct flow_l2info_s {
357 	uchar_t		*l2_start;
358 	uint8_t		*l2_daddr;
359 	uint16_t	l2_vid;
360 	uint32_t	l2_sap;
361 	uint_t		l2_hdrsize;
362 } flow_l2info_t;
363 
364 /*
 * Layer 3 state, embedded in flow_state_t as fs_l3info.
 * NOTE(review): member meanings are inferred from their names; in
 * particular which address l3_dst_or_src selects for each value is not
 * visible in this header. Confirm in mac_flow.c.
 */
365 typedef struct flow_l3info_s {
366 	uchar_t		*l3_start;
367 	uint8_t		l3_protocol;
368 	uint8_t		l3_version;
369 	boolean_t	l3_dst_or_src;
370 	uint_t		l3_hdrsize;
371 	boolean_t	l3_fragmented;
372 } flow_l3info_t;
373 
374 /*
 * Layer 4 state, embedded in flow_state_t as fs_l4info.
 * NOTE(review): byte order of the port members and how l4_hash_port is
 * chosen are not visible in this header; confirm in mac_flow.c.
 */
375 typedef struct flow_l4info_s {
376 	uchar_t		*l4_start;
377 	uint16_t	l4_src_port;
378 	uint16_t	l4_dst_port;
379 	uint16_t	l4_hash_port;
380 } flow_l4info_t;
381 
382 /*
383  * Combined state structure.
384  * Holds the flow direction flags, an mblk_t pointer and the per-layer
 * state structures (L2, L3 and L4). This is the state handed to the
 * fo_hash and fo_accept functions of flow_ops_t and to the fe_match
 * function.
385  */
386 struct flow_state_s {
387 	uint_t		fs_flags;
388 	mblk_t		*fs_mp;
389 	flow_l2info_t	fs_l2info;
390 	flow_l3info_t	fs_l3info;
391 	flow_l4info_t	fs_l4info;
392 };
393 
394 /*
395  * Flow ops vector.
396  * There are two groups of functions. The ones ending with _fe are
397  * called when a flow is being added. The others (hash, accept) are
398  * called at flow lookup time.
399  */
/* Number of slots in the fo_accept array below. */
400 #define	FLOW_MAX_ACCEPT	16
401 typedef struct flow_ops_s {
402 	/*
403 	 * fo_accept_fe():
404 	 * Validates the contents of the flow and checks whether
405 	 * it's compatible with the flow table. Sets the fe_match
406 	 * function of the flow.
407 	 */
408 	int		(*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
409 	/*
410 	 * fo_hash_fe():
411 	 * Generates a hash index to the flow table. This function
412 	 * must use the same algorithm as fo_hash(), which is used
413 	 * by the flow lookup code path.
414 	 */
415 	uint32_t	(*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
416 	/*
417 	 * fo_match_fe():
418 	 * This is used for finding identical flows.
419 	 */
420 	boolean_t	(*fo_match_fe)(flow_tab_t *, flow_entry_t *,
421 			    flow_entry_t *);
422 	/*
423 	 * fo_insert_fe():
424 	 * Used for inserting a flow to a flow chain.
425 	 * Protocols that have special ordering requirements would
426 	 * need to implement this. For those that don't,
427 	 * flow_generic_insert_fe() may be used.
428 	 */
429 	int		(*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
430 			    flow_entry_t *);
431 
432 	/*
433 	 * fo_hash():
	 * Calculates the flow hash index based on the accumulated
434 	 * state in flow_state_t. Must use the same algorithm as
435 	 * fo_hash_fe().
436 	 */
437 	uint32_t	(*fo_hash)(flow_tab_t *, flow_state_t *);
438 
439 	/*
440 	 * Array of accept functions.
441 	 * Each function in the array will accumulate enough state
442 	 * (header length, protocol) to allow the next function to
443 	 * proceed. We support up to FLOW_MAX_ACCEPT functions which
444 	 * should be sufficient for all practical purposes.
445 	 */
446 	int		(*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
447 			    flow_state_t *);
448 } flow_ops_t;
449 
450 /*
451  * Generic flow table: a hash table of flow_entry_t chains together with
 * the ops vector used to add and look up flows in it.
 * NOTE(review): ft_lock presumably protects the table contents (the
 * fe_next chain link in flow_entry_t is documented as protected by
 * ft_lock); the roles of ft_mip and ft_mcip are not described here.
452  */
453 struct flow_tab_s {
454 	krwlock_t		ft_lock;
455 	/*
456 	 * Contains a list of functions (described above)
457 	 * specific to this table type.
458 	 */
459 	flow_ops_t		ft_ops;
460 
461 	/*
462 	 * Indicates what types of flows are supported.
463 	 */
464 	flow_mask_t		ft_mask;
465 
466 	/*
467 	 * An array of flow_entry_t * of size ft_size.
468 	 * Each element is the beginning of a hash chain.
469 	 */
470 	flow_entry_t		**ft_table;
471 	uint_t			ft_size;
472 
473 	/*
474 	 * The number of flows inserted into ft_table.
475 	 */
476 	uint_t			ft_flow_count;
477 	struct mac_impl_s	*ft_mip;
478 	struct mac_client_impl_s	*ft_mcip;
479 };
480 
481 /*
482  * This is used for describing what type of flow table can be created:
 * an ops vector, the flow mask it supports and a table size.
483  * mac_flow.c contains a list of these structures.
484  */
485 typedef struct flow_tab_info_s {
486 	flow_ops_t		*fti_ops;
487 	flow_mask_t		fti_mask;
488 	uint_t			fti_size;
489 } flow_tab_info_t;
490 
/*
 * True when there is no flow table at all (NULL pointer) or when the
 * table currently holds no flows (ft_flow_count is zero).
 */
#define	FLOW_TAB_EMPTY(ft)	(!((ft) != NULL && (ft)->ft_flow_count != 0))
492 
493 
/*
 * Add c, converted to uint64_t, to the mms_<s> counter in the
 * mci_misc_stat of the mac_client_impl_t that m points to. The addition
 * is a plain, non-atomic "+=".
 */
#define	MCIP_STAT_UPDATE(m, s, c) {					\
	((mac_client_impl_t *)(m))->mci_misc_stat.mms_##s +=		\
	    (uint64_t)(c);						\
}
498 
/*
 * Add c, converted to uint64_t, to the mrs_<s> receive counter in
 * srs_rx.sr_stat of the mac_soft_ring_set_t that m points to. The
 * addition is a plain, non-atomic "+=".
 */
#define	SRS_RX_STAT_UPDATE(m, s, c)  {					\
	((mac_soft_ring_set_t *)(m))->srs_rx.sr_stat.mrs_##s +=		\
	    (uint64_t)(c);						\
}
503 
/*
 * Add c, converted to uint64_t, to the mts_<s> transmit counter in
 * srs_tx.st_stat of the mac_soft_ring_set_t that m points to. The
 * addition is a plain, non-atomic "+=".
 */
#define	SRS_TX_STAT_UPDATE(m, s, c)  {					\
	((mac_soft_ring_set_t *)(m))->srs_tx.st_stat.mts_##s +=		\
	    (uint64_t)(c);						\
}

/*
 * Fold a whole set of transmit counters into the SRS that m points to:
 * the mts_opackets, mts_obytes and mts_oerrors values read through s are
 * added, in that order, to the matching SRS counters.
 */
#define	SRS_TX_STATS_UPDATE(m, s) {					\
	SRS_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets);		\
	SRS_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes);		\
	SRS_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors);		\
}
514 
/*
 * Add c, converted to uint64_t, to the mts_<s> transmit counter in
 * s_st_stat of the mac_soft_ring_t that m points to. The addition is a
 * plain, non-atomic "+=".
 */
#define	SOFTRING_TX_STAT_UPDATE(m, s, c)  {				\
	((mac_soft_ring_t *)(m))->s_st_stat.mts_##s +=			\
	    (uint64_t)(c);						\
}

/*
 * Fold a whole set of transmit counters into the soft ring that m points
 * to: the mts_opackets, mts_obytes and mts_oerrors values read through s
 * are added, in that order, to the matching soft ring counters.
 */
#define	SOFTRING_TX_STATS_UPDATE(m, s) {				\
	SOFTRING_TX_STAT_UPDATE((m), opackets, (s)->mts_opackets);	\
	SOFTRING_TX_STAT_UPDATE((m), obytes, (s)->mts_obytes);		\
	SOFTRING_TX_STAT_UPDATE((m), oerrors, (s)->mts_oerrors);	\
}
524 
/*
 * Flow framework setup and teardown entry points. Both take no arguments
 * and return nothing; they are declared with (void) so that these are
 * real prototypes and a call that passes arguments is rejected at
 * compile time.
 */
extern void	mac_flow_init(void);
extern void	mac_flow_fini(void);
/*
 * Flow entry and flow table interfaces.
 * NOTE(review): only the signatures are visible in this header; see the
 * definitions (mac_flow.c) for argument meanings, return values and
 * locking requirements. mac_flow_destroy() is the function that
 * FLOW_REFRELE() calls when the last reference is dropped.
 */
527 extern int	mac_flow_create(flow_desc_t *, mac_resource_props_t *,
528 		    char *, void *, uint_t, flow_entry_t **);
529 
530 extern int	mac_flow_add(flow_tab_t *, flow_entry_t *);
531 extern int	mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *,
532 		    boolean_t);
533 extern int	mac_flow_hash_add(flow_entry_t *);
534 extern int	mac_flow_lookup_byname(char *, flow_entry_t **);
535 extern int	mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t,
536 		    flow_entry_t **);
537 
/* Table walkers: the callback takes a flow entry and the caller's void *. */
538 extern int	mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *),
539 		    void *);
540 
541 extern int	mac_flow_walk_nolock(flow_tab_t *,
542 		    int (*)(flow_entry_t *, void *), void *);
543 
544 extern void	mac_flow_modify(flow_tab_t *, flow_entry_t *,
545 		    mac_resource_props_t *);
546 
547 extern void	*mac_flow_get_client_cookie(flow_entry_t *);
548 
549 extern uint32_t	mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *);
550 
551 extern void	mac_flow_get_desc(flow_entry_t *, flow_desc_t *);
552 extern void	mac_flow_set_desc(flow_entry_t *, flow_desc_t *);
553 
554 extern void	mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t);
555 extern void	mac_flow_hash_remove(flow_entry_t *);
556 extern void	mac_flow_wait(flow_entry_t *, mac_flow_state_t);
557 extern void	mac_flow_cleanup(flow_entry_t *);
558 extern void	mac_flow_destroy(flow_entry_t *);
559 
560 extern void	mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t,
561 		    struct mac_impl_s *, flow_tab_t **);
562 extern void	mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **);
563 extern void	mac_flow_tab_destroy(flow_tab_t *);
564 extern void	flow_stat_destroy(flow_entry_t *);
565 
566 #ifdef	__cplusplus
567 }
568 #endif
569 
570 #endif	/* _MAC_FLOW_IMPL_H */
571