1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #ifndef _MPD_TABLES_H 27 #define _MPD_TABLES_H 28 29 #pragma ident "%Z%%M% %I% %E% SMI" 30 31 #ifdef __cplusplus 32 extern "C" { 33 #endif 34 35 /* 36 * Terminology: 37 * 38 * phyint: A NIC eg. hme0. This is represented as 'struct phyint' 39 * 40 * phyint instance: A protocol instance of a phyint. Eg. the IPv4 instance of 41 * hme0 or the IPv6 instance of hme0. (struct phyint_instance) 42 * 43 * logint: A logical interface eg. hme0:1 (struct logint) 44 * 45 * phyint_group: A group of phyints i.e. physical interfaces that are 46 * (i) connected to the same level 2 topology e.g. the same ethernet 47 * switch AND 48 * (ii) share the same phyint group name. 49 * Load spreading and failover occur across members of the same phyint group. 50 * phyint group members must be homogenous. i.e. if a phyint belonging to a 51 * phyint group has a IPv6 protocol instance, then all members of the phyint 52 * group, must have IPv6 protocol instances. (struct phyint_group) 53 */ 54 55 /* 56 * Parameter passed to try_failover(), indicating the type of failover 57 * that is requested. 58 */ 59 #define FAILOVER_NORMAL 1 /* Failover to another phyint */ 60 /* that is preferably a standby */ 61 #define FAILOVER_TO_NONSTANDBY 2 /* Failover to non-standby phyint */ 62 #define FAILOVER_TO_ANY 3 /* Failover to any available phyint */ 63 64 #define MAXDEFERREDRTT 1 /* Maximum number of deferred rtts */ 65 66 /* 67 * Status of the phyint, expressed by the return code of failure_state() 68 */ 69 #define PHYINT_OK 0 /* No failure detected */ 70 #define PHYINT_FAILURE 1 /* NIC failure detected */ 71 #define GROUP_FAILURE 2 /* All NICs have failed */ 72 73 /* 74 * Return values of phyint_inst_update_from_k() 75 */ 76 #define PI_OK 1 /* Phyint matches in the kernel */ 77 #define PI_DELETED 2 /* Phyint has vanished in the kernel */ 78 #define PI_IFINDEX_CHANGED 3 /* Phyint's ifindex has changed */ 79 #define PI_IOCTL_ERROR 4 /* Some ioctl error */ 80 #define PI_GROUP_CHANGED 5 /* The phyint has changed group. */ 81 82 /* 83 * Though IFF_POINTOPOINT is a logint property, for the purpose of 84 * failover, we treat it as a phyint property. Note that we cannot failover 85 * individual logints. 86 */ 87 #define PHYINT_FLAGS(flags) \ 88 (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ 89 IFF_POINTOPOINT | IFF_RUNNING)) | (handle_link_notifications ? \ 90 0 : IFF_RUNNING)) 91 92 /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */ 93 #define PHYINT_INSTANCE(pi, af) \ 94 ((af) == AF_INET ? (pi)->pi_v4 : (pi)->pi_v6) 95 96 /* 97 * A phyint instance is probe *enabled* if it has been configured with a 98 * unique probe address (i.e., an IFF_NOFAILOVER address). It is probe 99 * *capable* if it is also able to send probes (i.e., has one or more 100 * targets available). 101 */ 102 #define PROBE_ENABLED(pii) \ 103 (((pii) != NULL) && ((pii)->pii_probe_sock != -1) && \ 104 ((pii)->pii_probe_logint != NULL) && \ 105 (((pii)->pii_probe_logint->li_dupaddr == 0))) 106 107 #define PROBE_CAPABLE(pii) \ 108 (PROBE_ENABLED(pii) && ((pii)->pii_ntargets != 0)) 109 110 /* Subtract b from a modulo n. i.e. (a - b) mod n */ 111 #define MOD_SUB(a, b, n) \ 112 ((((a) + (n)) - (b)) % (n)) 113 114 /* Increment modulo n */ 115 #define MOD_INCR(a, n) \ 116 (((a) + 1) % (n)) 117 118 /* Decrement modulo n */ 119 #define MOD_DCR(a, n) \ 120 MOD_SUB(a, 1, n) 121 122 /* 123 * 'index' represents an index into the circular probe stats array of 124 * size PROBE_STATS_COUNT. 0 <= index < PROBE_STATS_COUNT. This is used 125 * to access members of the pii_probes[] array defined in the phyint_instance 126 * structure. 127 */ 128 #define PROBE_INDEX_PREV(index) \ 129 MOD_DCR(index, PROBE_STATS_COUNT) 130 131 #define PROBE_INDEX_NEXT(index) \ 132 MOD_INCR(index, PROBE_STATS_COUNT) 133 134 135 /* 136 * If we receive more than LINK_UP_PERMIN "link up" notifications in a minute, 137 * then don't actually perform the repair operation until we've dropped back 138 * below the threshold (or we have a probe address and our probes indicate 139 * that the link is functioning again). This is to prevent link flapping in 140 * the case where we don't have a probe address. 141 */ 142 #define LINK_UP_PERMIN 2 143 144 #define LINK_DOWN(pi) ((pi)->pi_link_state == 0) 145 #define LINK_UP(pi) (!LINK_DOWN(pi)) 146 #define FLAGS_TO_LINK_STATE(pi) (((pi)->pi_flags & IFF_RUNNING) != 0) 147 #define UPDATE_LINK_STATE(pi) ((pi)->pi_link_state = \ 148 FLAGS_TO_LINK_STATE(pi) ? 1 : 0) 149 #define INIT_LINK_STATE(pi) ((pi)->pi_link_state = 1) 150 151 /* 152 * Phyint group states; see below for the phyint group definition. 153 */ 154 enum pg_state { 155 PG_RUNNING = 1, /* at least one interface in group is working */ 156 PG_FAILED = 2 /* group has failed completely */ 157 }; 158 159 /* 160 * Convenience macro to check if the whole group has failed. 161 */ 162 #define GROUP_FAILED(pg) ((pg)->pg_groupfailed) 163 164 /* 165 * A doubly linked list of all phyint groups in the system. 166 * A phyint group is identified by its group name. 167 */ 168 struct phyint_group { 169 char pg_name[LIFNAMSIZ + 1]; /* Phyint group name */ 170 struct phyint *pg_phyint; /* List of phyints in this group */ 171 struct phyint_group *pg_next; /* Next phyint group */ 172 struct phyint_group *pg_prev; /* Prev phyint group */ 173 uint64_t pg_sig; /* Current signature of this group */ 174 int pg_probeint; /* Interval between probes */ 175 int pg_fdt; /* Time needed to detect failure */ 176 uint_t 177 pg_groupfailed : 1; /* The whole group has failed */ 178 }; 179 180 /* 181 * Phyint states; see below for the phyint definition. 182 */ 183 enum pi_state { 184 PI_NOTARGETS = 1, /* Phyint has no targets */ 185 PI_RUNNING = 2, /* Phyint is functioning */ 186 PI_FAILED = 3, /* Phyint is failed */ 187 PI_OFFLINE = 4 /* Phyint is offline */ 188 }; 189 190 /* 191 * Representation of a NIC or a phyint. There is a list of all known phyints. 192 * There is also a list of phyints belonging to a phyint group, one list 193 * per phyint group. 194 */ 195 struct phyint { 196 char pi_name[LIFNAMSIZ + 1]; /* Phyint name eg. le0 */ 197 struct phyint_instance *pi_v4; /* The IPv4 instance */ 198 struct phyint_instance *pi_v6; /* The IPv6 instance */ 199 struct phyint_group *pi_group; /* Pointer to the group */ 200 struct phyint *pi_next; /* List of all phyints */ 201 struct phyint *pi_prev; /* List of all phyints */ 202 struct phyint *pi_pgnext; /* List of phyints in this group */ 203 struct phyint *pi_pgprev; /* List of phyints in this group */ 204 uint_t pi_ifindex; /* interface index */ 205 enum pi_state pi_state; /* State of the phyint */ 206 uint64_t pi_flags; /* Phyint flags from kernel */ 207 uint16_t pi_icmpid; /* icmp id in icmp echo request */ 208 uint64_t pi_taddrthresh; /* time (in secs) to delay logging */ 209 /* about missing test addresses */ 210 /* 211 * The pi_whenup array is a circular buffer of the most recent 212 * times (in milliseconds since some arbitrary point of time in 213 * the past) that the interface was brought up; pi_whendx identifies 214 * the oldest element of the array. 215 */ 216 uint_t pi_whenup[LINK_UP_PERMIN]; 217 unsigned int pi_whendx; 218 219 uint_t 220 pi_empty : 1, /* failover done, empty */ 221 pi_full : 1, /* failback done, full */ 222 /* More details in probe.c */ 223 pi_taddrmsg_printed : 1, /* testaddr msg printed */ 224 pi_duptaddrmsg_printed : 1, /* dup testaddr msg printed */ 225 pi_cfgmsg_printed : 1, /* bad config msg printed */ 226 pi_lfmsg_printed : 1, /* link-flapping msg printed */ 227 pi_link_state : 1; /* interface link state */ 228 }; 229 230 /* 231 * A doubly linked list of all phyint_instances each of which contains a 232 * doubly linked list of logical interfaces and targets. For eg. if both 233 * IPv4 and IPv6 are used over hme0, we have 2 phyint instances, 1 for each 234 * protocol. 235 */ 236 struct phyint_instance { 237 struct phyint_instance *pii_next; /* List of all phyint insts */ 238 struct phyint_instance *pii_prev; /* List of all phyint insts */ 239 240 struct phyint *pii_phyint; /* Back pointer to the phyint */ 241 struct target *pii_targets; /* List of targets on this link */ 242 struct logint *pii_probe_logint; /* IFF_NOFAILOVER addr for probing */ 243 struct logint *pii_logint; /* Doubly linked list of logical ifs */ 244 245 int pii_probe_sock; /* Socket for ICMP Probe packets */ 246 int pii_af; /* Address family */ 247 uint16_t pii_rack; /* highest acknowledged seq number */ 248 uint16_t pii_snxt; /* sequence number of next probe */ 249 uint_t pii_snxt_time; /* actual next probe time that */ 250 /* includes some randomness */ 251 252 uint_t pii_snxt_basetime; /* strictly periodic base probe time */ 253 /* for all periodic probes */ 254 uint_t pii_fd_snxt_basetime; /* strictly periodic base probe time */ 255 /* for failure detection probes */ 256 257 hrtime_t pii_fd_hrtime; /* hrtime_t before which we should */ 258 /* not send probes out this pii */ 259 260 uint64_t pii_flags; /* Phyint flags from kernel */ 261 262 struct probe_stats { 263 struct target *pr_target; /* Probe Target */ 264 uint_t pr_time_sent; /* Time probe was sent */ 265 uint_t pr_status; /* probe status as below */ 266 #define PR_UNUSED 0 /* Probe slot unused */ 267 #define PR_UNACKED 1 /* Probe is unacknowledged */ 268 #define PR_ACKED 2 /* Probe has been acknowledged */ 269 #define PR_LOST 3 /* Probe is declared lost */ 270 union { 271 uint_t tl; /* time probe is declared lost */ 272 uint_t ta; /* time probe is acked */ 273 } prt; 274 #define pr_time_lost prt.tl 275 #define pr_time_acked prt.ta 276 } pii_probes[PROBE_STATS_COUNT]; 277 278 uint_t 279 pii_in_use : 1, /* To detect removed phyints */ 280 pii_basetime_inited : 1, /* probe time initialized */ 281 pii_targets_are_routers : 1; /* routers or hosts ? */ 282 283 uint_t pii_probe_next; /* next index to use in pii_probes[] */ 284 struct target *pii_target_next; /* next target for probing */ 285 struct target *pii_rtt_target_next; 286 /* next target for rtt probes */ 287 288 int pii_ntargets; /* Number of active targets */ 289 struct stats { /* Cumulative statistics */ 290 uint64_t lost; /* Number of probes lost */ 291 uint64_t acked; /* Number of probes acked */ 292 uint64_t sent; /* Number of probes sent */ 293 uint64_t unknown; /* Number of ambiguous */ 294 /* probe acks */ 295 } pii_cum_stats; 296 }; 297 298 #define pii_name pii_phyint->pi_name 299 #define pii_ifindex pii_phyint->pi_ifindex 300 #define pii_state pii_phyint->pi_state 301 #define pii_icmpid pii_phyint->pi_icmpid 302 303 #define PR_STATUS_VALID(status) ((status) <= PR_LOST) 304 305 306 /* 307 * A doubly linked list of prefixes or logicals, hanging off the 308 * phyint instance. 309 */ 310 struct logint { 311 struct logint *li_next; /* Next logint of this phyint inst. */ 312 struct logint *li_prev; /* Prev logint of this phyint inst. */ 313 struct phyint_instance *li_phyint_inst; 314 /* Back pointer to phyint inst. */ 315 316 char li_name[LIFNAMSIZ + 1]; /* name Eg. hme0:1 */ 317 struct in6_addr li_addr; /* IP address */ 318 struct in6_addr li_dstaddr; /* Dst IP address for pointopoint */ 319 struct in6_addr li_subnet; /* prefix / subnet */ 320 uint_t li_subnet_len; /* prefix / subnet length */ 321 uint64_t li_flags; /* IFF_* flags */ 322 uint_t li_oifindex; /* original ifindex (SIOCGLIFOINDEX) */ 323 uint_t 324 li_in_use : 1, /* flag to detect deleted logints */ 325 li_dupaddr : 1; /* test address is not unique */ 326 }; 327 328 329 /* 330 * Doubly-linked list of probe targets on a phyint instance. Probe targets are 331 * usually onlink routers. If no onlink routers can be found, onlink hosts 332 * are used. 333 */ 334 struct target { 335 struct target *tg_next; /* Next target for this phyint inst. */ 336 struct target *tg_prev; /* Prev target for this phyint inst. */ 337 struct phyint_instance *tg_phyint_inst; 338 /* Back pointer to phyint instance */ 339 340 struct in6_addr tg_address; /* Target IP address */ 341 int tg_status; /* Status of the target below */ 342 #define TG_ACTIVE 1 /* active probe target */ 343 #define TG_UNUSED 2 /* target not in use now */ 344 #define TG_SLOW 3 /* rtt is high - Not in use now */ 345 #define TG_DEAD 4 /* Target is not responding */ 346 347 hrtime_t tg_latime; /* Target's last active time */ 348 int tg_rtt_sa; /* Scaled round trip time(RTT) avg. */ 349 int tg_rtt_sd; /* Scaled RTT deviation */ 350 int tg_crtt; /* Conservative RTT = A + 4D */ 351 uint32_t 352 tg_in_use : 1; /* In use flag */ 353 int tg_deferred[MAXDEFERREDRTT + 1]; 354 /* Deferred rtt data points */ 355 int tg_num_deferred; 356 /* Number of deferred rtt data points */ 357 }; 358 359 #define TG_STATUS_VALID(status) \ 360 (((status) >= TG_ACTIVE) && ((status) <= TG_DEAD)) 361 362 /* 363 * Statistics about consecutive probe failures are passed around between 364 * functions in this structure. 365 */ 366 struct probe_fail_count 367 { 368 uint_t pf_tff; /* Earliest time of failure in a series */ 369 int pf_nfail; /* Number of consecutive probe failures */ 370 int pf_nfail_tg; /* Number of consecutive probe fails for */ 371 /* some given target 'tg' */ 372 }; 373 374 /* 375 * Statistics about consecutive probe successes is passed around between 376 * functions in this structure. 377 */ 378 struct probe_success_count 379 { 380 uint_t ps_tls; /* Most recent time of probe success */ 381 boolean_t ps_tls_valid; /* is ps_tls valid */ 382 int ps_nsucc; /* Number of consecutive probe successes */ 383 /* starting from the most recent */ 384 int ps_nsucc_tg; /* Number of consecutive probe successes */ 385 /* for some given target 'tg' */ 386 }; 387 388 /* 389 * Statistics about missed probes that were never sent. 390 * Happens due to scheduling delay. 391 */ 392 393 struct probes_missed 394 { 395 uint_t pm_nprobes; /* Cumulative number of missed probes */ 396 uint_t pm_ntimes; /* Total number of occassions */ 397 }; 398 399 struct local_addr 400 { 401 struct in6_addr addr; 402 struct local_addr *next; 403 }; 404 405 /* 406 * Globals 407 */ 408 extern struct local_addr *laddr_list; 409 /* List of all local addresses, including local zones */ 410 extern struct phyint *phyints; /* List of all phyints */ 411 extern struct phyint_group *phyint_groups; /* List of all phyint groups */ 412 extern struct phyint_group *phyint_anongroup; /* Pointer to the anon group */ 413 extern struct phyint_instance *phyint_instances; 414 /* List of all phyint instances */ 415 extern struct probes_missed probes_missed; 416 /* statistics about missed probes */ 417 418 /* 419 * Function prototypes 420 */ 421 extern int phyint_init(void); 422 extern struct phyint *phyint_lookup(const char *name); 423 extern struct phyint_instance *phyint_inst_lookup(int af, char *name); 424 extern struct phyint_instance *phyint_inst_init_from_k(int af, char *name); 425 extern struct phyint_instance *phyint_inst_other(struct phyint_instance *pii); 426 extern int phyint_inst_update_from_k(struct phyint_instance *pii); 427 extern void phyint_inst_delete(struct phyint_instance *pii); 428 extern uint_t phyint_inst_timer(struct phyint_instance *pii); 429 extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii); 430 431 extern void phyint_newtype(struct phyint *pi); 432 extern void phyint_chstate(struct phyint *pi, enum pi_state state); 433 extern void phyint_group_chstate(struct phyint_group *pg, enum pg_state state); 434 extern void phyint_check_for_repair(struct phyint *pi); 435 436 extern void logint_init_from_k(struct phyint_instance *pii, char *li_name); 437 extern void logint_delete(struct logint *li); 438 439 extern struct target *target_lookup(struct phyint_instance *pii, 440 struct in6_addr addr); 441 extern void target_create(struct phyint_instance *pii, 442 struct in6_addr addr, boolean_t is_router); 443 extern void target_delete(struct target *tg); 444 extern struct target *target_next(struct target *tg); 445 extern void target_add(struct phyint_instance *pii, struct in6_addr addr, 446 boolean_t is_router); 447 448 extern void in_data(struct phyint_instance *pii); 449 extern void in6_data(struct phyint_instance *pii); 450 451 extern int try_failover(struct phyint *pi, int failover_type); 452 extern int try_failback(struct phyint *pi); 453 extern int do_failback(struct phyint *pi); 454 extern boolean_t change_lif_flags(struct phyint *pi, uint64_t flags, 455 boolean_t setfl); 456 457 extern void logperror_pii(struct phyint_instance *pii, const char *str); 458 extern void logperror_li(struct logint *li, const char *str); 459 extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len); 460 extern void phyint_inst_print_all(void); 461 462 extern int logint_upcount(struct phyint *pi); 463 extern void restore_phyint(struct phyint *pi); 464 extern void reset_crtt_all(struct phyint *pi); 465 extern int failure_state(struct phyint_instance *pii); 466 extern void process_link_state_changes(void); 467 extern void clear_pii_probe_stats(struct phyint_instance *pii); 468 extern void start_timer(struct phyint_instance *pii); 469 470 extern boolean_t own_address(struct in6_addr addr); 471 472 extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag); 473 474 extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **); 475 extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **); 476 extern unsigned int getgrouplist(ipmp_grouplist_t **); 477 extern unsigned int getsnap(ipmp_snap_t **); 478 479 #ifdef __cplusplus 480 } 481 #endif 482 483 #endif /* _MPD_TABLES_H */ 484