xref: /illumos-gate/usr/src/uts/common/sys/mac_flow_impl.h (revision 1a1a84a324206b6b1f5f704ab166c4ebf78aed76)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	_MAC_FLOW_IMPL_H
28 #define	_MAC_FLOW_IMPL_H
29 
30 #ifdef	__cplusplus
31 extern "C" {
32 #endif
33 
34 #include <sys/param.h>
35 #include <sys/atomic.h>
36 #include <sys/ksynch.h>
37 #include <sys/mac_flow.h>
38 #include <sys/stream.h>
39 #include <sys/sdt.h>
40 #include <net/if.h>
41 
/*
 * FLOW_REFHOLD: unconditionally take a reference on a flow_entry_t.
 * The flow_refhold DTrace probe fires first; fe_refcnt is then bumped
 * with fe_lock held.
 */
#define	FLOW_REFHOLD(flent) {					\
	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_refcnt += 1;				\
	mutex_exit(&(flent)->fe_lock);				\
}
51 
/*
 * FLOW_TRY_REFHOLD: take a data-path reference only if the flow is usable.
 *
 * The hold is refused when fe_flags contains any of FE_INCIPIENT,
 * FE_QUIESCE, FE_CONDEMNED, FE_UF_NO_DATAPATH or FE_MC_NO_DATAPATH.
 * While a flow is INCIPIENT its set up is not yet complete and the data
 * path could stumble on inconsistent data structures. While it is marked
 * QUIESCE a control operation is waiting for quiescence so that it can
 * change callbacks or other structures without the use of locks.
 *
 * On return err is 0 if fe_refcnt was incremented, or -1 if the hold was
 * refused (fe_refcnt is then left untouched).
 */
#define	FLOW_TRY_REFHOLD(flent, err) {				\
	DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent));	\
	(err) = 0;						\
	mutex_enter(&(flent)->fe_lock);				\
	if (((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE |	\
	    FE_CONDEMNED | FE_UF_NO_DATAPATH |			\
	    FE_MC_NO_DATAPATH)) == 0) {				\
		(flent)->fe_refcnt++;				\
	} else {						\
		(err) = -1;					\
	}							\
	mutex_exit(&(flent)->fe_lock);				\
}
70 
/*
 * FLOW_REFRELE: drop a reference taken with FLOW_REFHOLD or
 * FLOW_TRY_REFHOLD.
 *
 * If no waiter is registered (FE_WAITER clear) and the count reaches zero,
 * the entry is handed to mac_flow_destroy(); fe_lock is still held at that
 * point and this macro does not release it on that path. In every other
 * case fe_lock is dropped here, after signalling fe_cv when FE_WAITER is
 * set. A waiter is expected to hold its own reference, hence the assertion
 * that the count is still non-zero on the waiter path.
 */
#define	FLOW_REFRELE(flent) {					\
	DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent));	\
	mutex_enter(&(flent)->fe_lock);				\
	ASSERT((flent)->fe_refcnt != 0);			\
	(flent)->fe_refcnt--;					\
	if (!((flent)->fe_flags & FE_WAITER) &&			\
	    (flent)->fe_refcnt == 0) {				\
		mac_flow_destroy(flent);			\
	} else {						\
		if ((flent)->fe_flags & FE_WAITER) {		\
			ASSERT((flent)->fe_refcnt != 0);	\
			cv_signal(&(flent)->fe_cv);		\
		}						\
		mutex_exit(&(flent)->fe_lock);			\
	}							\
}
86 
/*
 * FLOW_USER_REFHOLD: bump fe_user_refcnt with fe_lock held.
 */
#define	FLOW_USER_REFHOLD(flent) {				\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_user_refcnt += 1;				\
	mutex_exit(&(flent)->fe_lock);				\
}
92 
/*
 * FLOW_USER_REFRELE: drop a reference taken with FLOW_USER_REFHOLD.
 * When the user count reaches zero and FE_WAITER is set, fe_cv is
 * signalled. Everything happens with fe_lock held.
 */
#define	FLOW_USER_REFRELE(flent) {				\
	mutex_enter(&(flent)->fe_lock);				\
	ASSERT((flent)->fe_user_refcnt != 0);			\
	(flent)->fe_user_refcnt--;				\
	if ((flent)->fe_user_refcnt == 0) {			\
		if ((flent)->fe_flags & FE_WAITER)		\
			cv_signal(&(flent)->fe_cv);		\
	}							\
	mutex_exit(&(flent)->fe_lock);				\
}
101 
/*
 * FLOW_FINAL_REFRELE: release what the caller asserts is the last
 * reference. On DEBUG kernels this checks that exactly one fe_refcnt
 * reference and no fe_user_refcnt references remain, then performs an
 * ordinary FLOW_REFRELE.
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression (not just a plain identifier) expands correctly.
 */
#define	FLOW_FINAL_REFRELE(flent) {				\
	ASSERT((flent)->fe_refcnt == 1 &&			\
	    (flent)->fe_user_refcnt == 0);			\
	FLOW_REFRELE(flent);					\
}
106 
/*
 * FLOW_MARK: set one or more bits in fe_flags, with fe_lock held.
 * The flag argument is parenthesized so that any expression may be passed.
 */
#define	FLOW_MARK(flent, flag) {				\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_flags |= (flag);				\
	mutex_exit(&(flent)->fe_lock);				\
}
115 
/*
 * FLOW_UNMARK: clear one or more bits in fe_flags, with fe_lock held.
 *
 * The flag argument must be parenthesized before it is complemented:
 * without the parentheses an argument such as (A | B) would expand to
 * ~A | B, which clears only A and leaves B (and everything else) alone.
 */
#define	FLOW_UNMARK(flent, flag) {				\
	mutex_enter(&(flent)->fe_lock);				\
	(flent)->fe_flags &= ~(flag);				\
	mutex_exit(&(flent)->fe_lock);				\
}
121 
/*
 * FLENT_TO_MIP: obtain the mip associated with a flow entry.
 * If fe_mbg is set the value comes from mac_bcast_grp_mip(fe_mbg);
 * otherwise fe_mcip is treated as a mac_client_impl_t and its mci_mip
 * member is returned.
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression expands correctly.
 */
#define	FLENT_TO_MIP(flent)					\
	((flent)->fe_mbg != NULL ?				\
	mac_bcast_grp_mip((flent)->fe_mbg) :			\
	((mac_client_impl_t *)(flent)->fe_mcip)->mci_mip)
125 
/*
 * Convert a bandwidth expressed in bits per second into a number of bytes
 * per tick: the shift by 3 turns bits into bytes and the division by hz
 * turns a per-second figure into a per-tick one.
 */
#define	FLOW_BYTES_PER_TICK(bps)				\
	(((bps) >> 3) / hz)
128 
/*
 * Lower bound of the sub-range for priority level pri, given the
 * underlying range [min, max]. The underlying range is cut into
 * MRP_PRIORITY_LEVELS equal steps (integer division) and the result is
 * min plus pri such steps.
 */
#define	FLOW_MIN_PRIORITY(min, max, pri)			\
	((min) + ((pri) * (((max) - (min)) / MRP_PRIORITY_LEVELS)))
135 
/*
 * Upper bound of the sub-range that starts at base, given the underlying
 * range [min, max]: base plus one step, where a step is the underlying
 * range divided (integer division) by MRP_PRIORITY_LEVELS.
 */
#define	FLOW_MAX_PRIORITY(min, max, base)			\
	((((max) - (min)) / MRP_PRIORITY_LEVELS) + (base))
142 
/*
 * Given an underlying range and a priority level, get the absolute
 * priority value. For now there are just 3 values (high, low and medium),
 * so the result is simply max, min or min + (max - min) / 2 respectively.
 * If there are ever more than three levels this computation must change.
 *
 * The whole expansion is wrapped in parentheses so the macro can be used
 * as an operand of a larger expression (e.g. FLOW_PRIORITY(...) + 1);
 * a bare conditional expression would otherwise absorb the neighbouring
 * operators.
 */
#define	FLOW_PRIORITY(min, max, pri)				\
	((pri) == MPL_HIGH ? (max) :				\
	(pri) == MPL_LOW ? (min) :				\
	((min) + (((max) - (min)) / 2)))
153 
#define	MAC_FLOW_TAB_SIZE		500

/*
 * Short type names for the flow structures defined later in this header.
 */
typedef struct flow_entry_s		flow_entry_t;
typedef struct flow_tab_s		flow_tab_t;
typedef struct flow_state_s		flow_state_t;

/*
 * Forward declarations only; these structures are not defined in this
 * header and are used here solely through pointers.
 */
struct mac_impl_s;
struct mac_client_impl_s;
161 
/*
 * Classification flags used when looking up a flow.
 */
#define	FLOW_INBOUND		0x01
#define	FLOW_OUTBOUND		0x02
/*
 * Don't compare the VID when classifying packets; see mac_rx_classify().
 */
#define	FLOW_IGNORE_VLAN	0x04
169 
170 /* Generic flow client function signature */
171 typedef void		(*flow_fn_t)(void *, void *, mblk_t *, boolean_t);
172 
/*
 * Flow state selector (taken, for instance, by mac_flow_wait()).
 */
typedef enum {
	FLOW_DRIVER_UPCALL = 0,
	FLOW_USER_REF = 1
} mac_flow_state_t;
178 
179 /* Matches a flow_entry_t using the extracted flow_state_t info */
180 typedef boolean_t	(*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
181 			    flow_state_t *);
182 
/*
 * Bits stored in flow_entry_t.fe_flags.
 */
#define	FE_QUIESCE		0x01	/* Flow is to be quiesced */
#define	FE_WAITER		0x02	/* A waiter exists for this flow */
#define	FE_FLOW_TAB		0x04	/* Flow is on the flow tab list */
#define	FE_G_FLOW_HASH		0x08	/* Flow is in the global flow hash */
#define	FE_INCIPIENT		0x10	/* Flow is still being set up */
#define	FE_CONDEMNED		0x20	/* Flow is being deleted */
#define	FE_UF_NO_DATAPATH	0x40	/* User flow: no datapath set up */
#define	FE_MC_NO_DATAPATH	0x80	/* Mac client: no datapath set up */
192 
/*
 * Bits stored in flow_entry_t.fe_type.
 */
#define	FLOW_PRIMARY_MAC	0x01	/* NIC primary MAC address */
#define	FLOW_VNIC_MAC		0x02	/* VNIC flow */
#define	FLOW_VNIC		FLOW_VNIC_MAC	/* Alias of FLOW_VNIC_MAC */
#define	FLOW_MCAST		0x04	/* Multicast (and broadcast) */
#define	FLOW_OTHER		0x08	/* Other flows configured */
#define	FLOW_USER		0x10	/* User defined flow */
#define	FLOW_NO_STATS		0x20	/* Don't create stats for the flow */
201 
202 /*
203  * Shared Bandwidth control counters between the soft ring set and its
204  * associated soft rings. In case the flow associated with NIC/VNIC
205  * has a group of Rx rings assigned to it, we have the same
206  * number of soft ring sets as we have the Rx ring in the group
207  * and each individual SRS (and its soft rings) decide when to
208  * poll their Rx ring independently. But if there is a B/W limit
209  * associated with the NIC/VNIC, then the B/W control counter is
210  * shared across all the SRS in the group and their associated
211  * soft rings.
212  *
213  * There is a many to 1 mapping between the SRS and
214  * mac_bw_ctl if the flow has a group of Rx rings associated with
215  * it.
216  */
217 typedef struct mac_bw_ctl_s {
218 	kmutex_t	mac_bw_lock;
219 	uint32_t	mac_bw_state;
220 	size_t		mac_bw_sz;	/* ?? Is it needed */
221 	size_t		mac_bw_limit;	/* Max bytes to process per tick */
222 	size_t		mac_bw_used;	/* Bytes processed in current tick */
223 	size_t		mac_bw_drop_threshold; /* Max queue length */
224 	size_t		mac_bw_drop_bytes;
225 	size_t		mac_bw_polled;
226 	size_t		mac_bw_intr;
227 	clock_t		mac_bw_curr_time;
228 } mac_bw_ctl_t;
229 
230 struct flow_entry_s {					/* Protected by */
231 	struct flow_entry_s	*fe_next;		/* ft_lock */
232 
233 	datalink_id_t		fe_link_id;		/* WO */
234 
235 	/* Properties as specified for this flow */
236 	mac_resource_props_t	fe_resource_props;	/* SL */
237 
238 	/* Properties actually effective at run time for this flow */
239 	mac_resource_props_t	fe_effective_props;	/* SL */
240 
241 	kmutex_t		fe_lock;
242 	char			fe_flow_name[MAXFLOWNAMELEN];	/* fe_lock */
243 	flow_desc_t		fe_flow_desc;		/* fe_lock */
244 	kcondvar_t		fe_cv;			/* fe_lock */
245 	/*
246 	 * Initial flow ref is 1 on creation. A thread that lookups the
247 	 * flent typically by a mac_flow_lookup() dynamically holds a ref.
248 	 * If the ref is 1, it means there arent' any upcalls from the driver
249 	 * or downcalls from the stack using this flent. Structures pointing
250 	 * to the flent or flent inserted in lists don't count towards this
251 	 * refcnt. Instead they are tracked using fe_flags. Only a control
252 	 * thread doing a teardown operation deletes the flent, after waiting
253 	 * for upcalls to finish synchronously. The fe_refcnt tracks
254 	 * the number of upcall refs
255 	 */
256 	uint32_t		fe_refcnt;		/* fe_lock */
257 
258 	/*
259 	 * This tracks lookups done using the global hash list for user
260 	 * generated flows. This refcnt only protects the flent itself
261 	 * from disappearing and helps walkers to read the flent info such
262 	 * as flow spec. However the flent may be quiesced and the SRS could
263 	 * be deleted. The fe_user_refcnt tracks the number of global flow
264 	 * has refs.
265 	 */
266 	uint32_t		fe_user_refcnt;		/* fe_lock */
267 	uint_t			fe_flags;		/* fe_lock */
268 
269 	/*
270 	 * Function/args to invoke for delivering matching packets
271 	 * Only the function ff_fn may be changed dynamically and atomically.
272 	 * The ff_arg1 and ff_arg2 are set at creation time and may not
273 	 * be changed.
274 	 */
275 	flow_fn_t		fe_cb_fn;		/* fe_lock */
276 	void 			*fe_cb_arg1;		/* fe_lock */
277 	void			*fe_cb_arg2;		/* fe_lock */
278 
279 	void			*fe_client_cookie;	/* WO */
280 	void			*fe_rx_ring_group;	/* SL */
281 	void			*fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */
282 	int			fe_rx_srs_cnt;		/* fe_lock */
283 	void			*fe_tx_srs;		/* WO */
284 
285 	/*
286 	 * This is a unicast flow, and is a mac_client_impl_t
287 	 */
288 	void			*fe_mcip; 		/* WO */
289 
290 	/*
291 	 * Used by mci_flent_list of mac_client_impl_t to track flows sharing
292 	 * the same mac_client_impl_t.
293 	 */
294 	struct flow_entry_s	*fe_client_next;
295 
296 	/*
297 	 * This is a broadcast or multicast flow and is a mac_bcast_grp_t
298 	 */
299 	void			*fe_mbg;		/* WO */
300 	uint_t			fe_type;		/* WO */
301 
302 	/*
303 	 * BW control info.
304 	 */
305 	mac_bw_ctl_t		fe_tx_bw;
306 	mac_bw_ctl_t		fe_rx_bw;
307 
308 	/*
309 	 * Used by flow table lookup code
310 	 */
311 	flow_match_fn_t		fe_match;
312 
313 	/*
314 	 * Used by mac_flow_remove().
315 	 */
316 	int			fe_index;
317 	flow_tab_t		*fe_flow_tab;
318 
319 	kstat_t			*fe_ksp;
320 	flow_stats_t		fe_flowstats;
321 	boolean_t		fe_desc_logged;
322 	zoneid_t		fe_zoneid;
323 	uint64_t		fe_nic_speed;
324 };
325 
326 /*
327  * Various structures used by the flows framework for keeping track
328  * of packet state information.
329  */
330 
331 /* Layer 2 */
332 typedef struct flow_l2info_s {
333 	uchar_t		*l2_start;
334 	uint8_t		*l2_daddr;
335 	uint16_t	l2_vid;
336 	uint32_t	l2_sap;
337 	uint_t		l2_hdrsize;
338 } flow_l2info_t;
339 
340 /* Layer 3 */
341 typedef struct flow_l3info_s {
342 	uchar_t		*l3_start;
343 	uint8_t		l3_protocol;
344 	uint8_t		l3_version;
345 	boolean_t	l3_dst_or_src;
346 	uint_t		l3_hdrsize;
347 	boolean_t	l3_fragmented;
348 } flow_l3info_t;
349 
350 /* Layer 4 */
351 typedef struct flow_l4info_s {
352 	uchar_t		*l4_start;
353 	uint16_t	l4_src_port;
354 	uint16_t	l4_dst_port;
355 	uint16_t	l4_hash_port;
356 } flow_l4info_t;
357 
358 /*
359  * Combined state structure.
360  * Holds flow direction and an mblk_t pointer.
361  */
362 struct flow_state_s {
363 	uint_t		fs_flags;
364 	mblk_t		*fs_mp;
365 	flow_l2info_t	fs_l2info;
366 	flow_l3info_t	fs_l3info;
367 	flow_l4info_t	fs_l4info;
368 };
369 
370 /*
371  * Flow ops vector.
372  * There are two groups of functions. The ones ending with _fe are
373  * called when a flow is being added. The others (hash, accept) are
374  * called at flow lookup time.
375  */
376 #define	FLOW_MAX_ACCEPT	16
377 typedef struct flow_ops_s {
378 	/*
379 	 * fo_accept_fe():
380 	 * Validates the contents of the flow and checks whether
381 	 * it's compatible with the flow table. sets the fe_match
382 	 * function of the flow.
383 	 */
384 	int		(*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
385 	/*
386 	 * fo_hash_fe():
387 	 * Generates a hash index to the flow table. This function
388 	 * must use the same algorithm as fo_hash(), which is used
389 	 * by the flow lookup code path.
390 	 */
391 	uint32_t	(*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
392 	/*
393 	 * fo_match_fe():
394 	 * This is used for finding identical flows.
395 	 */
396 	boolean_t	(*fo_match_fe)(flow_tab_t *, flow_entry_t *,
397 			    flow_entry_t *);
398 	/*
399 	 * fo_insert_fe():
400 	 * Used for inserting a flow to a flow chain.
401 	 * Protocols that have special ordering requirements would
402 	 * need to implement this. For those that don't,
403 	 * flow_generic_insert_fe() may be used.
404 	 */
405 	int		(*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
406 			    flow_entry_t *);
407 
408 	/*
409 	 * Calculates the flow hash index based on the accumulated
410 	 * state in flow_state_t. Must use the same algorithm as
411 	 * fo_hash_fe().
412 	 */
413 	uint32_t	(*fo_hash)(flow_tab_t *, flow_state_t *);
414 
415 	/*
416 	 * Array of accept fuctions.
417 	 * Each function in the array will accumulate enough state
418 	 * (header length, protocol) to allow the next function to
419 	 * proceed. We support up to FLOW_MAX_ACCEPT functions which
420 	 * should be sufficient for all practical purposes.
421 	 */
422 	int		(*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
423 			    flow_state_t *);
424 } flow_ops_t;
425 
426 /*
427  * Generic flow table.
428  */
429 struct flow_tab_s {
430 	krwlock_t		ft_lock;
431 	/*
432 	 * Contains a list of functions (described above)
433 	 * specific to this table type.
434 	 */
435 	flow_ops_t		ft_ops;
436 
437 	/*
438 	 * Indicates what types of flows are supported.
439 	 */
440 	flow_mask_t		ft_mask;
441 
442 	/*
443 	 * An array of flow_entry_t * of size ft_size.
444 	 * Each element is the beginning of a hash chain.
445 	 */
446 	flow_entry_t		**ft_table;
447 	uint_t			ft_size;
448 
449 	/*
450 	 * The number of flows inserted into ft_table.
451 	 */
452 	uint_t			ft_flow_count;
453 	struct mac_impl_s	*ft_mip;
454 	struct mac_client_impl_s	*ft_mcip;
455 };
456 
457 /*
458  * This is used for describing what type of flow table can be created.
459  * mac_flow.c contains a list of these structures.
460  */
461 typedef struct flow_tab_info_s {
462 	flow_ops_t		*fti_ops;
463 	flow_mask_t		fti_mask;
464 	uint_t			fti_size;
465 } flow_tab_info_t;
466 
/*
 * True when ft is NULL or holds no flows. ft_flow_count is only read
 * when ft is non-NULL.
 */
#define	FLOW_TAB_EMPTY(ft)					\
	(!((ft) != NULL && (ft)->ft_flow_count != 0))
468 
469 /*
470  * This is used by mac_tx_send.
471  */
472 typedef struct mac_tx_stats_s {
473 	uint_t			ts_opackets;
474 	uint_t			ts_obytes;
475 	uint_t			ts_oerrors;
476 } mac_tx_stats_t;
477 
/*
 * Add c, widened to uint64_t, to the fs_<s> counter in the fe_flowstats
 * of flow entry f. f is cast to flow_entry_t *, so a void pointer may be
 * passed. No lock is taken by this macro.
 */
#define	FLOW_STAT_UPDATE(f, s, c) {				\
	((flow_entry_t *)(f))->fe_flowstats.fs_##s +=		\
	    (uint64_t)(c);					\
}
481 
/*
 * Fold the three counters of the mac_tx_stats_t pointed to by s
 * (ts_opackets, ts_obytes, ts_oerrors) into the matching opackets,
 * obytes and oerrors flow statistics of flow entry f.
 */
#define	FLOW_TX_STATS_UPDATE(f, s) {				\
	FLOW_STAT_UPDATE((f), opackets, (s)->ts_opackets);	\
	FLOW_STAT_UPDATE((f), obytes, (s)->ts_obytes);		\
	FLOW_STAT_UPDATE((f), oerrors, (s)->ts_oerrors);	\
}
487 
/*
 * Neither function takes arguments. They are declared with (void) rather
 * than an empty parameter list so that these are real prototypes and the
 * compiler rejects any call that passes arguments.
 */
extern void	mac_flow_init(void);
extern void	mac_flow_fini(void);
490 extern int	mac_flow_create(flow_desc_t *, mac_resource_props_t *,
491 		    char *, void *, uint_t, flow_entry_t **);
492 
493 extern int	mac_flow_add(flow_tab_t *, flow_entry_t *);
494 extern int	mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *,
495 		    boolean_t);
496 extern int	mac_flow_hash_add(flow_entry_t *);
497 extern int	mac_flow_lookup_byname(char *, flow_entry_t **);
498 extern int	mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t,
499 		    flow_entry_t **);
500 
501 extern int	mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *),
502 		    void *);
503 
504 extern int	mac_flow_walk_nolock(flow_tab_t *,
505 		    int (*)(flow_entry_t *, void *), void *);
506 
507 extern void	mac_flow_modify(flow_tab_t *, flow_entry_t *,
508 		    mac_resource_props_t *);
509 
510 extern void	*mac_flow_get_client_cookie(flow_entry_t *);
511 
512 extern uint32_t	mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *);
513 
514 extern int	mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *);
515 extern void	mac_flow_get_desc(flow_entry_t *, flow_desc_t *);
516 extern void	mac_flow_set_desc(flow_entry_t *, flow_desc_t *);
517 
518 extern void	mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t);
519 extern void	mac_flow_hash_remove(flow_entry_t *);
520 extern void	mac_flow_wait(flow_entry_t *, mac_flow_state_t);
521 extern void	mac_flow_quiesce(flow_entry_t *);
522 extern void	mac_flow_restart(flow_entry_t *);
523 extern void	mac_flow_cleanup(flow_entry_t *);
524 extern void	mac_flow_destroy(flow_entry_t *);
525 
526 extern void	mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t,
527 		    struct mac_impl_s *, flow_tab_t **);
528 extern void	mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **);
529 extern void	mac_flow_tab_destroy(flow_tab_t *);
530 extern void	mac_flow_drop(void *, void *, mblk_t *);
531 extern void	flow_stat_destroy(flow_entry_t *);
532 
533 #ifdef	__cplusplus
534 }
535 #endif
536 
537 #endif	/* _MAC_FLOW_IMPL_H */
538