xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h (revision 381a2a9a387f449fab7d0c7e97c4184c26963abf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef	_MPD_TABLES_H
27 #define	_MPD_TABLES_H
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #ifdef	__cplusplus
32 extern "C" {
33 #endif
34 
35 /*
36  * Terminology:
37  *
38  * phyint: A NIC eg. hme0. This is represented as 'struct phyint'
39  *
40  * phyint instance: A protocol instance of a phyint. Eg. the IPv4 instance of
41  * 	hme0 or the IPv6 instance of hme0. (struct phyint_instance)
42  *
43  * logint: A logical interface eg. hme0:1 (struct logint)
44  *
45  * phyint_group: A group of phyints i.e. physical interfaces that are
46  *	(i) connected to the same level 2 topology e.g. the same ethernet
47  *	    switch AND
48  *	(ii) share the same phyint group name.
49  * Load spreading and failover occur across members of the same phyint group.
50  * phyint group members must be homogeneous, i.e. if a phyint belonging to a
51  * phyint group has an IPv6 protocol instance, then all members of the phyint
52  * group must have IPv6 protocol instances. (struct phyint_group)
53  */
54 
55 /*
56  * Values for the failover_type argument of try_failover(), indicating
57  * the type of failover that is requested.
58  */
59 #define	FAILOVER_NORMAL		1	/* Failover to another phyint */
60 					/* that is preferably a standby */
61 #define	FAILOVER_TO_NONSTANDBY	2	/* Failover to non-standby phyint */
62 #define	FAILOVER_TO_ANY		3	/* Failover to any available phyint */
63 
   /*
    * struct target declares tg_deferred[] with MAXDEFERREDRTT + 1 entries,
    * so changing this value changes the size of every target.
    */
64 #define	MAXDEFERREDRTT		1	/* Maximum number of deferred rtts */
65 
66 /*
67  * Status of the phyint, expressed by the return code of failure_state().
    * failure_state() is declared in this header as returning int.
68  */
69 #define	PHYINT_OK	0		/* No failure detected */
70 #define	PHYINT_FAILURE	1		/* NIC failure detected */
71 #define	GROUP_FAILURE	2		/* All NICs have failed */
72 
73 /*
74  * Return values of phyint_inst_update_from_k().
    *
    * Note that these share the PI_ prefix with the enumerators of
    * 'enum pi_state' (PI_NOTARGETS, PI_RUNNING, PI_FAILED, PI_OFFLINE) but
    * are an unrelated set of values; PI_OK and PI_NOTARGETS are both 1.
75  */
76 #define	PI_OK			1	/* Phyint matches in the kernel */
77 #define	PI_DELETED		2	/* Phyint has vanished in the kernel */
78 #define	PI_IFINDEX_CHANGED	3	/* Phyint's ifindex has changed */
79 #define	PI_IOCTL_ERROR		4	/* Some ioctl error */
80 #define	PI_GROUP_CHANGED	5	/* The phyint has changed group. */
81 
82 /*
83  * Though IFF_POINTOPOINT is a logint property, for the purpose of
84  * failover, we treat it as a phyint property. Note that we cannot failover
85  * individual logints.
    *
    * PHYINT_FLAGS() keeps only the IFF_STANDBY, IFF_INACTIVE, IFF_FAILED,
    * IFF_OFFLINE, IFF_POINTOPOINT and IFF_RUNNING bits of 'flags'. If
    * handle_link_notifications is zero, IFF_RUNNING is additionally forced
    * on in the result regardless of its value in 'flags'.
    *
    * handle_link_notifications is not declared in this header; a declaration
    * must be in scope wherever this macro is expanded.
86  */
87 #define	PHYINT_FLAGS(flags)	\
88 	(((flags) &  (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \
89 	IFF_POINTOPOINT | IFF_RUNNING)) | (handle_link_notifications ? \
90 	0 : IFF_RUNNING))
91 
   /*
    * A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance.
    * PHYINT_INSTANCE() yields pi_v4 when 'af' is AF_INET and pi_v6 for every
    * other value of 'af'; no check is made that 'af' is a known family.
    */
92 /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */
93 #define	PHYINT_INSTANCE(pi, af)	\
94 	((af) == AF_INET ? (pi)->pi_v4 : (pi)->pi_v6)
95 
96 /*
97  * A phyint instance is probe *enabled* if it has been configured with a
98  * unique probe address (i.e., an IFF_NOFAILOVER address).  It is probe
99  * *capable* if it is also able to send probes (i.e., has one or more
100  * targets available).
     *
     * PROBE_ENABLED() is safe to apply to a NULL pointer: the NULL test comes
     * first and the remaining tests are short-circuited. It then requires a
     * probe socket (pii_probe_sock != -1), a probe logint, and that the probe
     * logint is not flagged as a duplicate address (li_dupaddr == 0).
     *
     * Both macros evaluate 'pii' several times, so the argument must not
     * have side effects.
101  */
102 #define	PROBE_ENABLED(pii) \
103 	(((pii) != NULL) && ((pii)->pii_probe_sock != -1) &&	\
104 	((pii)->pii_probe_logint != NULL) &&			\
105 	(((pii)->pii_probe_logint->li_dupaddr == 0)))
106 
107 #define	PROBE_CAPABLE(pii) \
108 	(PROBE_ENABLED(pii) && ((pii)->pii_ntargets != 0))
109 
110 /* Subtract b from a modulo n. i.e. (a - b) mod n  */
    /*
     * 'n' is added before 'b' is subtracted so that the intermediate value
     * does not go negative. The result is therefore only the true
     * (a - b) mod n when 0 <= b <= a + n; in particular it is correct for
     * 0 <= a, b < n. 'n' is evaluated twice.
     */
111 #define	MOD_SUB(a, b, n)	\
112 	((((a) + (n)) - (b)) % (n))
113 
114 /* Increment modulo n ('n' is evaluated once, 'a' once) */
115 #define	MOD_INCR(a, n)		\
116 	(((a) + 1) % (n))
117 
118 /* Decrement modulo n; same operand constraints as MOD_SUB() */
119 #define	MOD_DCR(a, n)		\
120 	MOD_SUB(a, 1, n)
121 
122 /*
123  * 'index' represents an index into the circular probe stats array of
124  * size PROBE_STATS_COUNT.  0 <= index < PROBE_STATS_COUNT. This is used
125  * to access members of the pii_probes[] array defined in the phyint_instance
126  * structure.
     *
     * PROBE_INDEX_PREV() and PROBE_INDEX_NEXT() step the index backwards and
     * forwards by one, wrapping at the ends of the array. PROBE_STATS_COUNT
     * is not defined in this header.
127  */
128 #define	PROBE_INDEX_PREV(index)	\
129 	MOD_DCR(index, PROBE_STATS_COUNT)
130 
131 #define	PROBE_INDEX_NEXT(index)	\
132 	MOD_INCR(index, PROBE_STATS_COUNT)
133 
134 
135 /*
136  * If we receive more than LINK_UP_PERMIN "link up" notifications in a minute,
137  * then don't actually perform the repair operation until we've dropped back
138  * below the threshold (or we have a probe address and our probes indicate
139  * that the link is functioning again).  This is to prevent link flapping in
140  * the case where we don't have a probe address.
     *
     * LINK_UP_PERMIN is also the number of entries in the pi_whenup[] array
     * of struct phyint.
141  */
142 #define	LINK_UP_PERMIN	2
143 
    /*
     * Link state helpers; 'pi' is a pointer to a struct phyint.
     *
     * LINK_DOWN / LINK_UP	test the cached pi_link_state bit.
     * FLAGS_TO_LINK_STATE	non-zero iff IFF_RUNNING is set in pi_flags.
     * UPDATE_LINK_STATE	set pi_link_state to 1 or 0 according to
     *			FLAGS_TO_LINK_STATE(); the value of the expression
     *			is the value assigned.
     * INIT_LINK_STATE	set pi_link_state to 1, i.e. link up.
     */
144 #define	LINK_DOWN(pi) ((pi)->pi_link_state == 0)
145 #define	LINK_UP(pi) (!LINK_DOWN(pi))
146 #define	FLAGS_TO_LINK_STATE(pi) (((pi)->pi_flags & IFF_RUNNING) != 0)
147 #define	UPDATE_LINK_STATE(pi) ((pi)->pi_link_state = \
148 	FLAGS_TO_LINK_STATE(pi) ? 1 : 0)
149 #define	INIT_LINK_STATE(pi) ((pi)->pi_link_state = 1)
150 
151 /*
152  * Phyint group states; see below for the phyint group definition.
     * This is the type of the 'state' argument of phyint_group_chstate().
153  */
154 enum pg_state {
155 	PG_RUNNING	= 1,	/* at least one interface in group is working */
156 	PG_FAILED	= 2	/* group has failed completely */
157 };
158 
159 /*
160  * Convenience macro to check if the whole group has failed.
     * 'pg' is a pointer to a struct phyint_group; the macro reads its
     * one-bit pg_groupfailed flag.
161  */
162 #define	GROUP_FAILED(pg)	((pg)->pg_groupfailed)
163 
164 /*
165  * A doubly linked list of all phyint groups in the system.
166  * A phyint group is identified by its group name.
     *
     * Groups are chained to each other through pg_next/pg_prev. The member
     * phyints of a group hang off pg_phyint and are chained to each other
     * through the pi_pgnext/pi_pgprev fields of struct phyint.
167  */
168 struct phyint_group {
169 	char pg_name[LIFNAMSIZ + 1];	/* Phyint group name */
170 	struct phyint *pg_phyint;	/* List of phyints in this group */
171 	struct phyint_group *pg_next;	/* Next phyint group */
172 	struct phyint_group *pg_prev;	/* Prev phyint group */
173 	uint64_t pg_sig;		/* Current signature of this group */
174 	int	pg_probeint;		/* Interval between probes */
175 	int	pg_fdt;			/* Time needed to detect failure */
176 	uint_t
177 		pg_groupfailed : 1; /* The whole group has failed */
178 };
179 
180 /*
181  * Phyint states; see below for the phyint definition.
     * This is the type of the pi_state field of struct phyint and of the
     * 'state' argument of phyint_chstate().
182  */
183 enum pi_state {
184 	PI_NOTARGETS	= 1,	/* Phyint has no targets */
185 	PI_RUNNING	= 2,	/* Phyint is functioning */
186 	PI_FAILED	= 3,	/* Phyint is failed */
187 	PI_OFFLINE	= 4	/* Phyint is offline */
188 };
189 
190 /*
191  * Representation of a NIC or a phyint. There is a list of all known phyints.
192  * There is also a list of phyints belonging to a phyint group, one list
193  * per phyint group.
     *
     * A phyint is therefore on two doubly linked lists at once: the list of
     * all phyints (pi_next/pi_prev) and the list of its group's members
     * (pi_pgnext/pi_pgprev). pi_v4 and pi_v6 point at the per-protocol
     * instances; use PHYINT_INSTANCE() to select one by address family.
194  */
195 struct phyint {
196 	char	pi_name[LIFNAMSIZ + 1]; /* Phyint name e.g. le0 */
197 	struct phyint_instance *pi_v4;	/* The IPv4 instance */
198 	struct phyint_instance *pi_v6;	/* The IPv6 instance */
199 	struct phyint_group *pi_group;	/* Pointer to the group */
200 	struct phyint	*pi_next;	/* List of all phyints */
201 	struct phyint	*pi_prev;	/* List of all phyints */
202 	struct phyint	*pi_pgnext;	/* List of phyints in this group */
203 	struct phyint	*pi_pgprev;	/* List of phyints in this group */
204 	uint_t		pi_ifindex;	/* interface index */
205 	enum pi_state	pi_state;	/* State of the phyint */
206 	uint64_t	pi_flags;	/* Phyint flags from kernel */
207 	uint16_t	pi_icmpid;	/* icmp id in icmp echo request */
208 	/*
209 	 * The pi_whenup array is a circular buffer of the most recent
210 	 * times (in milliseconds since some arbitrary point of time in
211 	 * the past) that the interface was brought up; pi_whendx identifies
212 	 * the oldest element of the array.
213 	 */
214 	uint_t		pi_whenup[LINK_UP_PERMIN];
215 	unsigned int	pi_whendx;
216 
217 	uint_t
218 		pi_empty : 1,		/* failover done, empty */
219 		pi_full  : 1,		/* failback done, full  */
220 					/* More details in probe.c */
221 		pi_taddrmsg_printed : 1,	/* testaddr msg printed */
222 		pi_cfgmsg_printed : 1,	/* bad config msg printed */
223 		pi_lfmsg_printed : 1,   /* link-flapping msg printed */
224 		pi_link_state : 1;	/* interface link state; use the */
					/* LINK_UP()/LINK_DOWN() macros */
225 };
226 
227 /*
228  * A doubly linked list of all phyint_instances each of which contains a
229  * doubly linked list of logical interfaces and targets. E.g. if both
230  * IPv4 and IPv6 are used over hme0, we have 2 phyint instances, 1 for each
231  * protocol.
     *
     * The PR_* status values and the pr_time_lost/pr_time_acked aliases are
     * #defined inside the structure body only to keep them next to the
     * fields they describe; being preprocessor macros they remain defined
     * for the rest of the translation unit.
232  */
233 struct phyint_instance {
234 	struct phyint_instance	*pii_next;	/* List of all phyint insts */
235 	struct phyint_instance	*pii_prev;	/* List of all phyint insts */
236 
237 	struct phyint	*pii_phyint;	/* Back pointer to the phyint */
238 	struct target	*pii_targets;	/* List of targets on this link */
239 	struct logint	*pii_probe_logint; /* IFF_NOFAILOVER addr for probing */
240 	struct logint	*pii_logint;	/* Doubly linked list of logical ifs */
241 
242 	int	pii_probe_sock;		/* Socket for ICMP Probe packets; */
					/* PROBE_ENABLED() treats -1 as */
					/* "no socket" */
243 	int	pii_af;			/* Address family */
244 	uint16_t pii_rack;		/* highest acknowledged seq number */
245 	uint16_t pii_snxt;		/* sequence number of next probe */
246 	uint_t	pii_snxt_time;		/* actual next probe time that */
247 					/* includes some randomness */
248 
249 	uint_t	pii_snxt_basetime; 	/* strictly periodic base probe time */
250 					/* for all periodic probes */
251 	uint_t	pii_fd_snxt_basetime; 	/* strictly periodic base probe time */
252 					/* for failure detection probes */
253 
254 	hrtime_t 	pii_fd_hrtime;	/* hrtime_t before which we should */
255 					/* not send probes out this pii */
256 
257 	uint64_t	pii_flags;	/* Phyint flags from kernel */
258 
	/*
	 * Circular array of per-probe records, indexed with
	 * PROBE_INDEX_PREV()/PROBE_INDEX_NEXT(); pii_probe_next below is
	 * the next slot to use.
	 */
259 	struct probe_stats {
260 		struct target *pr_target;	/* Probe Target */
261 		uint_t	pr_time_sent; 	/* Time probe was sent */
262 		uint_t	pr_status;	/* probe status as below */
263 #define	PR_UNUSED	0		/* Probe slot unused */
264 #define	PR_UNACKED	1		/* Probe is unacknowledged */
265 #define	PR_ACKED	2		/* Probe has been acknowledged */
266 #define	PR_LOST		3		/* Probe is declared lost */
		/*
		 * tl and ta share the same storage. NOTE(review): which of
		 * the two is meaningful presumably follows pr_status
		 * (PR_LOST vs. PR_ACKED) - confirm against the users.
		 */
267 		union {
268 			uint_t  tl;	/* time probe is declared lost */
269 			uint_t	ta;	/* time probe is acked */
270 		} prt;
271 #define	pr_time_lost	prt.tl
272 #define	pr_time_acked	prt.ta
273 	} pii_probes[PROBE_STATS_COUNT];
274 
275 	uint_t
276 		pii_in_use : 1,			/* To detect removed phyints */
277 		pii_basetime_inited : 1,	/* probe time initialized */
278 		pii_targets_are_routers : 1;	/* routers or hosts ? */
279 
280 	uint_t	pii_probe_next;		/* next index to use in pii_probes[] */
281 	struct target *pii_target_next;	/* next target for probing */
282 	struct target *pii_rtt_target_next;
283 					/* next target for rtt probes */
284 
285 	int	pii_ntargets;		/* Number of active targets; */
					/* PROBE_CAPABLE() needs this != 0 */
286 	struct stats {			/* Cumulative statistics */
287 		uint64_t	lost;		/* Number of probes lost */
288 		uint64_t	acked;		/* Number of probes acked */
289 		uint64_t	sent;		/* Number of probes sent */
290 		uint64_t	unknown;	/* Number of ambiguous */
291 						/* probe acks */
292 	} pii_cum_stats;
293 };
294 
    /*
     * Shorthand for reaching fields of the owning phyint through a phyint
     * instance: pii->pii_name expands to pii->pii_phyint->pi_name, and so on.
     * These are object-like macros, so after this point the identifiers
     * pii_name, pii_ifindex, pii_state and pii_icmpid are rewritten wherever
     * they appear and cannot be used as ordinary names.
     */
295 #define	pii_name	pii_phyint->pi_name
296 #define	pii_ifindex	pii_phyint->pi_ifindex
297 #define	pii_state	pii_phyint->pi_state
298 #define	pii_icmpid	pii_phyint->pi_icmpid
299 
    /*
     * True if 'status' is one of the PR_* values. Only the upper bound is
     * tested: PR_UNUSED is 0 and pr_status is declared uint_t, so an
     * unsigned status has no lower bound to check.
     */
300 #define	PR_STATUS_VALID(status)		((status) <= PR_LOST)
301 
302 
303 /*
304  * A doubly linked list of prefixes or logicals, hanging off the
305  * phyint instance.
     *
     * The list head is the pii_logint field of struct phyint_instance;
     * li_phyint_inst points back at that instance. A logint used as the
     * probe (test) address is additionally referenced by pii_probe_logint,
     * and PROBE_ENABLED() requires its li_dupaddr bit to be clear.
306  */
307 struct logint {
308 	struct logint	*li_next;	/* Next logint of this phyint inst. */
309 	struct logint	*li_prev;	/* Prev logint of this phyint inst. */
310 	struct phyint_instance	*li_phyint_inst;
311 					/* Back pointer to phyint inst. */
312 
313 	char		li_name[LIFNAMSIZ + 1];	/* name e.g. hme0:1 */
314 	struct in6_addr	li_addr;	/* IP address */
315 	struct in6_addr	li_dstaddr;	/* Dst IP address for pointopoint */
316 	struct in6_addr	li_subnet;	/* prefix / subnet */
317 	uint_t		li_subnet_len;	/* prefix / subnet length */
318 	uint64_t	li_flags;	/* IFF_* flags */
319 	uint_t		li_oifindex;	/* original ifindex (SIOCGLIFOINDEX) */
320 	uint_t
321 			li_in_use : 1,	/* flag to detect deleted logints */
322 			li_dupaddr : 1,	/* This test address is not unique */
323 			li_dupaddrmsg_printed : 1;
324 					/* Error has been logged to console */
325 };
326 
327 
328 /*
329  * Doubly-linked list of probe targets on a phyint instance. Probe targets are
330  * usually onlink routers. If no onlink routers can be found, onlink hosts
331  * are used.
     *
     * The list head is the pii_targets field of struct phyint_instance;
     * tg_phyint_inst points back at that instance. The TG_* status values
     * are #defined inside the structure body for proximity only and remain
     * defined for the rest of the translation unit.
332  */
333 struct target {
334 	struct target	*tg_next;	/* Next target for this phyint inst. */
335 	struct target	*tg_prev;	/* Prev target for this phyint inst. */
336 	struct phyint_instance	*tg_phyint_inst;
337 					/* Back pointer to phyint instance */
338 
339 	struct in6_addr	tg_address;	/* Target IP address */
340 	int		tg_status;	/* Status of the target below */
341 #define	TG_ACTIVE	1		/* active probe target */
342 #define	TG_UNUSED	2		/* target not in use now */
343 #define	TG_SLOW		3		/* rtt is high - Not in use now */
344 #define	TG_DEAD		4		/* Target is not responding */
345 
346 	hrtime_t	tg_latime;	/* Target's last active time */
347 	int		tg_rtt_sa;	/* Scaled round trip time(RTT) avg. */
348 	int		tg_rtt_sd;	/* Scaled RTT deviation */
349 	int		tg_crtt;	/* Conservative RTT = A + 4D */
350 	uint32_t
351 			tg_in_use : 1;	/* In use flag */
352 	int		tg_deferred[MAXDEFERREDRTT + 1];
353 					/* Deferred rtt data points */
354 	int		tg_num_deferred;
355 					/* Number of deferred rtt data points */
356 };
357 
    /*
     * True if 'status' is one of the TG_* values above. Unlike
     * PR_STATUS_VALID(), both bounds are tested: tg_status is a signed int
     * and 0 is not a valid target status.
     */
358 #define	TG_STATUS_VALID(status) \
359 	(((status) >= TG_ACTIVE) && ((status) <= TG_DEAD))
360 
361 /*
362  * Statistics about consecutive probe failures are passed around between
363  * functions in this structure.
364  */
365 struct probe_fail_count
366 {
367 	uint_t	pf_tff;		/* Earliest time of failure in a series */
368 	int	pf_nfail;	/* Number of consecutive probe failures */
369 	int	pf_nfail_tg;	/* Number of consecutive probe fails for */
370 				/* some given target 'tg' */
371 };
372 
373 /*
374  * Statistics about consecutive probe successes are passed around between
375  * functions in this structure.
376  */
377 struct probe_success_count
378 {
379 	uint_t ps_tls;		/* Most recent time of probe success; */
				/* only meaningful if ps_tls_valid is set */
380 	boolean_t ps_tls_valid;	/* is ps_tls valid */
381 	int	ps_nsucc;	/* Number of consecutive probe successes */
382 				/* starting from the most recent */
383 	int	ps_nsucc_tg;	/* Number of consecutive probe successes */
384 				/* for some given target 'tg' */
385 };
386 
387 /*
388  * Statistics about missed probes that were never sent.
389  * Happens due to scheduling delay.
     * A single global instance, 'probes_missed', is declared below.
390  */
391 
392 struct probes_missed
393 {
394 	uint_t	pm_nprobes;	/* Cumulative number of missed probes */
395 	uint_t	pm_ntimes;	/* Total number of occasions */
396 };
397 
    /*
     * Node of a singly linked list of local addresses; the list head is the
     * global 'laddr_list' declared below.
     */
398 struct local_addr
399 {
400 	struct in6_addr addr;
401 	struct local_addr *next;
402 };
403 
404 /*
405  * Globals
     *
     * These are declarations only; each object is defined in one of the
     * daemon's source files, not in this header.
406  */
407 extern struct local_addr *laddr_list;
408 			/* List of all local addresses, including local zones */
409 extern struct phyint *phyints;		/* List of all phyints */
410 extern struct phyint_group *phyint_groups; /* List of all phyint groups */
411 extern struct phyint_group *phyint_anongroup; /* Pointer to the anon group */
412 extern struct phyint_instance *phyint_instances;
413 					/* List of all phyint instances */
414 extern struct probes_missed probes_missed;
415 					/* statistics about missed probes */
416 
417 /*
418  * Function prototypes
     *
     * Cross-references to the constants and types defined earlier in this
     * header:
     *
     *   try_failover()		failover_type is one of FAILOVER_NORMAL,
     *				FAILOVER_TO_NONSTANDBY, FAILOVER_TO_ANY
     *   failure_state()		returns PHYINT_OK, PHYINT_FAILURE or
     *				GROUP_FAILURE
     *   phyint_inst_update_from_k()
     *				returns one of PI_OK, PI_DELETED,
     *				PI_IFINDEX_CHANGED, PI_IOCTL_ERROR,
     *				PI_GROUP_CHANGED
     *   phyint_chstate()		takes an enum pi_state
     *   phyint_group_chstate()	takes an enum pg_state
     *
     * Addresses are passed by value as struct in6_addr throughout. The
     * ipmp_ifinfo_t, ipmp_groupinfo_t, ipmp_grouplist_t and ipmp_snap_t types
     * used by the last four prototypes are not declared in this header and
     * must already be in scope when it is included.
419  */
420 extern int phyint_init(void);
421 extern struct phyint *phyint_lookup(const char *name);
422 extern struct phyint_instance *phyint_inst_lookup(int af, char *name);
423 extern struct phyint_instance *phyint_inst_init_from_k(int af, char *name);
424 extern struct phyint_instance *phyint_inst_other(struct phyint_instance *pii);
425 extern int phyint_inst_update_from_k(struct phyint_instance *pii);
426 extern void phyint_inst_delete(struct phyint_instance *pii);
427 extern uint_t phyint_inst_timer(struct phyint_instance *pii);
428 extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii);
429 
430 extern void phyint_newtype(struct phyint *pi);
431 extern void phyint_chstate(struct phyint *pi, enum pi_state state);
432 extern void phyint_group_chstate(struct phyint_group *pg, enum pg_state state);
433 extern void phyint_check_for_repair(struct phyint *pi);
434 
435 extern void logint_init_from_k(struct phyint_instance *pii, char *li_name);
436 extern void logint_delete(struct logint *li);
437 
438 extern struct target *target_lookup(struct phyint_instance *pii,
439     struct in6_addr addr);
440 extern void target_create(struct phyint_instance *pii,
441     struct in6_addr addr, boolean_t is_router);
442 extern void target_delete(struct target *tg);
443 extern struct target *target_next(struct target *tg);
444 extern void target_add(struct phyint_instance *pii, struct in6_addr addr,
445     boolean_t is_router);
446 
447 extern void in_data(struct phyint_instance *pii);
448 extern void in6_data(struct phyint_instance *pii);
449 
450 extern int try_failover(struct phyint *pi, int failover_type);
451 extern int try_failback(struct phyint *pi);
452 extern int do_failback(struct phyint *pi);
453 extern boolean_t change_lif_flags(struct phyint *pi, uint64_t flags,
454     boolean_t setfl);
455 
456 extern void logperror_pii(struct phyint_instance *pii, char *str);
457 extern void logperror_li(struct logint *li, char *str);
458 extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len);
459 extern void phyint_inst_print_all(void);
460 
461 extern int logint_upcount(struct phyint *pi);
462 extern void restore_phyint(struct phyint *pi);
463 extern void reset_crtt_all(struct phyint *pi);
464 extern int failure_state(struct phyint_instance *pii);
465 extern void process_link_state_changes(void);
466 extern void clear_pii_probe_stats(struct phyint_instance *pii);
467 extern void start_timer(struct phyint_instance *pii);
468 
469 extern boolean_t own_address(struct in6_addr addr);
470 
471 extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag);
472 
473 extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **);
474 extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **);
475 extern unsigned int getgrouplist(ipmp_grouplist_t **);
476 extern unsigned int getsnap(ipmp_snap_t **);
477 
478 #ifdef	__cplusplus
479 }
480 #endif
481 
482 #endif	/* _MPD_TABLES_H */
483