1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #ifndef _MPD_TABLES_H 27 #define _MPD_TABLES_H 28 29 #ifdef __cplusplus 30 extern "C" { 31 #endif 32 33 /* 34 * Terminology: 35 * 36 * phyint: A NIC eg. hme0. This is represented as 'struct phyint' 37 * 38 * phyint instance: A protocol instance of a phyint. Eg. the IPv4 instance of 39 * hme0 or the IPv6 instance of hme0. (struct phyint_instance) 40 * 41 * logint: A logical interface eg. hme0:1 (struct logint) 42 * 43 * phyint_group: A group of phyints i.e. physical interfaces that are 44 * (i) connected to the same level 2 topology e.g. the same ethernet 45 * switch AND 46 * (ii) share the same phyint group name. 47 * Load spreading and failover occur across members of the same phyint group. 48 * phyint group members must be homogeneous. i.e. if a phyint belonging to a 49 * phyint group has a IPv6 protocol instance, then all members of the phyint 50 * group, must have IPv6 protocol instances. (struct phyint_group) 51 */ 52 53 #define MAXDEFERREDRTT 1 /* Maximum number of deferred rtts */ 54 55 /* 56 * Status of the phyint, expressed by the return code of failure_state() 57 */ 58 #define PHYINT_OK 0 /* No failure detected */ 59 #define PHYINT_FAILURE 1 /* NIC failure detected */ 60 #define GROUP_FAILURE 2 /* All NICs have failed */ 61 62 /* 63 * Return values of phyint_inst_update_from_k() 64 */ 65 #define PI_OK 1 /* Phyint matches in the kernel */ 66 #define PI_DELETED 2 /* Phyint has vanished in the kernel */ 67 #define PI_IFINDEX_CHANGED 3 /* Phyint's ifindex has changed */ 68 #define PI_IOCTL_ERROR 4 /* Some ioctl error */ 69 #define PI_GROUP_CHANGED 5 /* The phyint has changed group. */ 70 71 #define PHYINT_FLAGS(flags) \ 72 (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \ 73 IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING)) 74 75 /* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */ 76 #define PHYINT_INSTANCE(pi, af) \ 77 ((af) == AF_INET ? (pi)->pi_v4 : (pi)->pi_v6) 78 79 /* 80 * A phyint instance is probe *enabled* if it has been configured with a 81 * unique probe address (i.e., an IFF_NOFAILOVER address). It is probe 82 * *capable* if it is also able to send probes (i.e., has one or more 83 * targets available). 84 */ 85 #define PROBE_ENABLED(pii) \ 86 (((pii) != NULL) && ((pii)->pii_probe_sock != -1) && \ 87 ((pii)->pii_probe_logint != NULL) && \ 88 (((pii)->pii_probe_logint->li_dupaddr == 0))) 89 90 #define PROBE_CAPABLE(pii) \ 91 (PROBE_ENABLED(pii) && ((pii)->pii_ntargets != 0)) 92 93 /* Subtract b from a modulo n. i.e. (a - b) mod n */ 94 #define MOD_SUB(a, b, n) \ 95 ((((a) + (n)) - (b)) % (n)) 96 97 /* Increment modulo n */ 98 #define MOD_INCR(a, n) \ 99 (((a) + 1) % (n)) 100 101 /* Decrement modulo n */ 102 #define MOD_DCR(a, n) \ 103 MOD_SUB(a, 1, n) 104 105 /* 106 * 'index' represents an index into the circular probe stats array of 107 * size PROBE_STATS_COUNT. 0 <= index < PROBE_STATS_COUNT. This is used 108 * to access members of the pii_probes[] array defined in the phyint_instance 109 * structure. 110 */ 111 #define PROBE_INDEX_PREV(index) \ 112 MOD_DCR(index, PROBE_STATS_COUNT) 113 114 #define PROBE_INDEX_NEXT(index) \ 115 MOD_INCR(index, PROBE_STATS_COUNT) 116 117 118 /* 119 * If we receive more than LINK_UP_PERMIN "link up" notifications in a minute, 120 * then don't actually perform the repair operation until we've dropped back 121 * below the threshold (or we have a probe address and our probes indicate 122 * that the link is functioning again). This is to prevent link flapping in 123 * the case where we don't have a probe address. 124 */ 125 #define LINK_UP_PERMIN 2 126 127 #define LINK_DOWN(pi) ((pi)->pi_link_state == 0) 128 #define LINK_UP(pi) (!LINK_DOWN(pi)) 129 #define FLAGS_TO_LINK_STATE(pi) (((pi)->pi_flags & IFF_RUNNING) != 0) 130 #define UPDATE_LINK_STATE(pi) ((pi)->pi_link_state = \ 131 FLAGS_TO_LINK_STATE(pi) ? 1 : 0) 132 #define INIT_LINK_STATE(pi) ((pi)->pi_link_state = 1) 133 134 /* 135 * Phyint group states; see below for the phyint group definition. 136 */ 137 enum pg_state { 138 PG_OK = 1, /* all interfaces in the group are working */ 139 PG_DEGRADED, /* some interfaces in the group are unusable */ 140 PG_FAILED /* all interfaces in the group are unusable */ 141 }; 142 143 /* 144 * Convenience macro to check if the whole group has failed. 145 */ 146 #define GROUP_FAILED(pg) ((pg)->pg_state == PG_FAILED) 147 148 /* 149 * A doubly linked list of all phyint groups in the system. 150 * A phyint group is identified by its group name. 151 */ 152 struct phyint_group { 153 char pg_name[LIFGRNAMSIZ]; /* Phyint group name */ 154 struct phyint *pg_phyint; /* List of phyints in this group */ 155 struct phyint_group *pg_next; /* Next phyint group */ 156 struct phyint_group *pg_prev; /* Prev phyint group */ 157 uint64_t pg_sig; /* Current signature of this group */ 158 int pg_probeint; /* Interval between probes */ 159 int pg_fdt; /* Time needed to detect failure */ 160 enum pg_state pg_state; /* Current group state */ 161 boolean_t pg_in_use; /* To detect removed groups */ 162 struct addrlist *pg_addrs; /* Data addresses in this group */ 163 boolean_t pg_failmsg_printed; /* Group failure msg printed */ 164 }; 165 166 /* 167 * Phyint states; see below for the phyint definition. 168 */ 169 enum pi_state { 170 PI_INIT = 0, /* Phyint is being initialized */ 171 PI_NOTARGETS = 1, /* Phyint has no targets */ 172 PI_RUNNING = 2, /* Phyint is functioning */ 173 PI_FAILED = 3, /* Phyint is failed */ 174 PI_OFFLINE = 4 /* Phyint is offline */ 175 }; 176 177 /* 178 * Representation of a NIC or a phyint. There is a list of all known phyints. 179 * There is also a list of phyints belonging to a phyint group, one list 180 * per phyint group. 181 */ 182 struct phyint { 183 char pi_name[LIFNAMSIZ + 1]; /* Phyint name eg. le0 */ 184 struct phyint_instance *pi_v4; /* The IPv4 instance */ 185 struct phyint_instance *pi_v6; /* The IPv6 instance */ 186 struct phyint_group *pi_group; /* Pointer to the group */ 187 struct phyint *pi_next; /* List of all phyints */ 188 struct phyint *pi_prev; /* List of all phyints */ 189 struct phyint *pi_pgnext; /* List of phyints in this group */ 190 struct phyint *pi_pgprev; /* List of phyints in this group */ 191 uint_t pi_ifindex; /* interface index */ 192 enum pi_state pi_state; /* State of the phyint */ 193 uint64_t pi_flags; /* Phyint flags from kernel */ 194 uint16_t pi_icmpid; /* icmp id in icmp echo request */ 195 uint64_t pi_taddrthresh; /* time (in secs) to delay logging */ 196 /* about missing test addresses */ 197 dlpi_handle_t pi_dh; /* DLPI handle to underlying link */ 198 uint_t pi_notes; /* enabled DLPI notifications */ 199 uchar_t pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */ 200 size_t pi_hwaddrlen; /* phyint's hw address length */ 201 202 /* 203 * The pi_whenup array is a circular buffer of the most recent 204 * times (in milliseconds since some arbitrary point of time in 205 * the past) that the interface was brought up; pi_whendx identifies 206 * the oldest element of the array. 207 */ 208 uint_t pi_whenup[LINK_UP_PERMIN]; 209 unsigned int pi_whendx; 210 211 uint_t 212 pi_taddrmsg_printed : 1, /* testaddr msg printed */ 213 pi_duptaddrmsg_printed : 1, /* dup testaddr msg printed */ 214 pi_cfgmsg_printed : 1, /* bad config msg printed */ 215 pi_lfmsg_printed : 1, /* link-flapping msg printed */ 216 pi_link_state : 1, /* interface link state */ 217 pi_hwaddrdup : 1; /* disabled due to dup hw address */ 218 }; 219 220 /* 221 * A doubly linked list of all phyint_instances each of which contains a 222 * doubly linked list of logical interfaces and targets. For eg. if both 223 * IPv4 and IPv6 are used over hme0, we have 2 phyint instances, 1 for each 224 * protocol. 225 */ 226 struct phyint_instance { 227 struct phyint_instance *pii_next; /* List of all phyint insts */ 228 struct phyint_instance *pii_prev; /* List of all phyint insts */ 229 230 struct phyint *pii_phyint; /* Back pointer to the phyint */ 231 struct target *pii_targets; /* List of targets on this link */ 232 struct logint *pii_probe_logint; /* IFF_NOFAILOVER addr for probing */ 233 struct logint *pii_logint; /* Doubly linked list of logical ifs */ 234 235 int pii_probe_sock; /* Socket for ICMP Probe packets */ 236 int pii_af; /* Address family */ 237 uint16_t pii_rack; /* highest acknowledged seq number */ 238 uint16_t pii_snxt; /* sequence number of next probe */ 239 uint_t pii_snxt_time; /* actual next probe time that */ 240 /* includes some randomness */ 241 242 uint_t pii_snxt_basetime; /* strictly periodic base probe time */ 243 /* for all periodic probes */ 244 uint_t pii_fd_snxt_basetime; /* strictly periodic base probe time */ 245 /* for failure detection probes */ 246 247 hrtime_t pii_fd_hrtime; /* hrtime_t before which we should */ 248 /* not send probes out this pii */ 249 250 uint64_t pii_flags; /* Phyint flags from kernel */ 251 252 struct probe_stats { 253 uint_t pr_id; /* Full ID of probe */ 254 struct target *pr_target; /* Probe Target */ 255 uint_t pr_time_lost; /* Time probe declared lost */ 256 struct timeval pr_tv_sent; /* Wall time probe was sent */ 257 hrtime_t pr_hrtime_start; /* hrtime probe op started */ 258 hrtime_t pr_hrtime_sent; /* hrtime probe was sent */ 259 hrtime_t pr_hrtime_ackrecv; /* hrtime probe ack received */ 260 hrtime_t pr_hrtime_ackproc; /* hrtime probe ack processed */ 261 uint_t pr_status; /* probe status as below */ 262 #define PR_UNUSED 0 /* Probe slot unused */ 263 #define PR_UNACKED 1 /* Probe is unacknowledged */ 264 #define PR_ACKED 2 /* Probe has been acknowledged */ 265 #define PR_LOST 3 /* Probe is declared lost */ 266 } pii_probes[PROBE_STATS_COUNT]; 267 268 uint_t 269 pii_in_use : 1, /* To detect removed phyints */ 270 pii_basetime_inited : 1, /* probe time initialized */ 271 pii_targets_are_routers : 1; /* routers or hosts ? */ 272 273 uint_t pii_probe_next; /* next index to use in pii_probes[] */ 274 struct target *pii_target_next; /* next target for probing */ 275 struct target *pii_rtt_target_next; 276 /* next target for rtt probes */ 277 278 int pii_ntargets; /* Number of active targets */ 279 struct stats { /* Cumulative statistics */ 280 uint64_t lost; /* Number of probes lost */ 281 uint64_t acked; /* Number of probes acked */ 282 uint64_t sent; /* Number of probes sent */ 283 uint64_t unknown; /* Number of ambiguous */ 284 /* probe acks */ 285 } pii_cum_stats; 286 }; 287 288 #define pii_name pii_phyint->pi_name 289 #define pii_ifindex pii_phyint->pi_ifindex 290 #define pii_state pii_phyint->pi_state 291 #define pii_icmpid pii_phyint->pi_icmpid 292 293 #define PR_STATUS_VALID(status) ((status) <= PR_LOST) 294 295 296 /* 297 * A doubly linked list of prefixes or logicals, hanging off the 298 * phyint instance. 299 */ 300 struct logint { 301 struct logint *li_next; /* Next logint of this phyint inst. */ 302 struct logint *li_prev; /* Prev logint of this phyint inst. */ 303 struct phyint_instance *li_phyint_inst; 304 /* Back pointer to phyint inst. */ 305 306 char li_name[LIFNAMSIZ + 1]; /* name Eg. hme0:1 */ 307 struct in6_addr li_addr; /* IP address */ 308 struct in6_addr li_dstaddr; /* Dst IP address for pointopoint */ 309 struct in6_addr li_subnet; /* prefix / subnet */ 310 uint_t li_subnet_len; /* prefix / subnet length */ 311 uint64_t li_flags; /* IFF_* flags */ 312 uint_t 313 li_in_use : 1, /* flag to detect deleted logints */ 314 li_dupaddr : 1; /* test address is not unique */ 315 }; 316 317 318 /* 319 * Doubly-linked list of probe targets on a phyint instance. Probe targets are 320 * usually onlink routers. If no onlink routers can be found, onlink hosts 321 * are used. 322 */ 323 struct target { 324 struct target *tg_next; /* Next target for this phyint inst. */ 325 struct target *tg_prev; /* Prev target for this phyint inst. */ 326 struct phyint_instance *tg_phyint_inst; 327 /* Back pointer to phyint instance */ 328 329 struct in6_addr tg_address; /* Target IP address */ 330 int tg_status; /* Status of the target below */ 331 #define TG_ACTIVE 1 /* active probe target */ 332 #define TG_UNUSED 2 /* target not in use now */ 333 #define TG_SLOW 3 /* rtt is high - Not in use now */ 334 #define TG_DEAD 4 /* Target is not responding */ 335 336 hrtime_t tg_latime; /* Target's last active time */ 337 int64_t tg_rtt_sa; /* Scaled RTT average (in ns) */ 338 int64_t tg_rtt_sd; /* Scaled RTT deviation (in ns) */ 339 int tg_crtt; /* Conservative RTT = A + 4D (in ms) */ 340 uint32_t 341 tg_in_use : 1; /* In use flag */ 342 int64_t tg_deferred[MAXDEFERREDRTT + 1]; 343 /* Deferred rtt data points */ 344 int tg_num_deferred; 345 /* Number of deferred rtt data points */ 346 }; 347 348 #define TG_STATUS_VALID(status) \ 349 (((status) >= TG_ACTIVE) && ((status) <= TG_DEAD)) 350 351 /* 352 * Statistics about consecutive probe failures are passed around between 353 * functions in this structure. 354 */ 355 struct probe_fail_count 356 { 357 uint_t pf_tff; /* Earliest time of failure in a series */ 358 int pf_nfail; /* Number of consecutive probe failures */ 359 int pf_nfail_tg; /* Number of consecutive probe fails for */ 360 /* some given target 'tg' */ 361 }; 362 363 /* 364 * Statistics about consecutive probe successes is passed around between 365 * functions in this structure. 366 */ 367 struct probe_success_count 368 { 369 uint_t ps_tls; /* Most recent time of probe success */ 370 boolean_t ps_tls_valid; /* is ps_tls valid */ 371 int ps_nsucc; /* Number of consecutive probe successes */ 372 /* starting from the most recent */ 373 int ps_nsucc_tg; /* Number of consecutive probe successes */ 374 /* for some given target 'tg' */ 375 }; 376 377 /* 378 * Statistics about missed probes that were never sent. 379 * Happens due to scheduling delay. 380 */ 381 382 struct probes_missed 383 { 384 uint_t pm_nprobes; /* Cumulative number of missed probes */ 385 uint_t pm_ntimes; /* Total number of occasions */ 386 }; 387 388 typedef struct addrlist { 389 struct addrlist *al_next; /* next address */ 390 char al_name[LIFNAMSIZ]; /* address lif name */ 391 uint64_t al_flags; /* address flags */ 392 struct sockaddr_storage al_addr; /* address */ 393 } addrlist_t; 394 395 /* 396 * Globals 397 */ 398 extern addrlist_t *localaddrs; 399 /* List of all local addresses, including local zones */ 400 extern struct phyint *phyints; /* List of all phyints */ 401 extern struct phyint_group *phyint_groups; /* List of all phyint groups */ 402 extern struct phyint_group *phyint_anongroup; /* Pointer to the anon group */ 403 extern struct phyint_instance *phyint_instances; 404 /* List of all phyint instances */ 405 extern struct probes_missed probes_missed; 406 /* statistics about missed probes */ 407 408 /* 409 * Function prototypes 410 */ 411 extern int phyint_init(void); 412 extern struct phyint *phyint_lookup(const char *name); 413 extern struct phyint_instance *phyint_inst_lookup(int af, char *name); 414 extern struct phyint_instance *phyint_inst_init_from_k(int af, char *name); 415 extern struct phyint_instance *phyint_inst_other(struct phyint_instance *pii); 416 extern int phyint_inst_update_from_k(struct phyint_instance *pii); 417 extern void phyint_inst_delete(struct phyint_instance *pii); 418 extern uint_t phyint_inst_timer(struct phyint_instance *pii); 419 extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii); 420 421 extern void phyint_changed(struct phyint *pi); 422 extern void phyint_chstate(struct phyint *pi, enum pi_state state); 423 extern void phyint_group_chstate(struct phyint_group *pg, enum pg_state state); 424 extern struct phyint_group *phyint_group_create(const char *pg_name); 425 extern struct phyint_group *phyint_group_lookup(const char *pg_name); 426 extern void phyint_group_insert(struct phyint_group *pg); 427 extern void phyint_group_delete(struct phyint_group *pg); 428 extern void phyint_group_refresh_state(struct phyint_group *pg); 429 extern void phyint_standby_refresh_inactive(struct phyint *pi); 430 extern void phyint_check_for_repair(struct phyint *pi); 431 extern void phyint_transition_to_running(struct phyint *pi); 432 extern void phyint_activate_another(struct phyint *pi); 433 extern int phyint_offline(struct phyint *pi, unsigned int); 434 extern int phyint_undo_offline(struct phyint *pi); 435 extern boolean_t phyint_is_functioning(struct phyint *pi); 436 437 extern void logint_init_from_k(struct phyint_instance *pii, char *li_name); 438 extern void logint_delete(struct logint *li); 439 440 extern struct target *target_lookup(struct phyint_instance *pii, 441 struct in6_addr addr); 442 extern void target_create(struct phyint_instance *pii, 443 struct in6_addr addr, boolean_t is_router); 444 extern void target_delete(struct target *tg); 445 extern struct target *target_next(struct target *tg); 446 extern void target_add(struct phyint_instance *pii, struct in6_addr addr, 447 boolean_t is_router); 448 449 extern void in_data(struct phyint_instance *pii); 450 extern void in6_data(struct phyint_instance *pii); 451 452 extern void logperror_pii(struct phyint_instance *pii, const char *str); 453 extern void logperror_li(struct logint *li, const char *str); 454 extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len); 455 extern void addr2storage(int af, const struct in6_addr *addr, 456 struct sockaddr_storage *ssp); 457 extern void phyint_inst_print_all(void); 458 extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t); 459 460 extern void reset_crtt_all(struct phyint *pi); 461 extern int failure_state(struct phyint_instance *pii); 462 extern void process_link_state_changes(void); 463 extern void clear_pii_probe_stats(struct phyint_instance *pii); 464 extern void start_timer(struct phyint_instance *pii); 465 extern void stop_probing(struct phyint *pi); 466 467 extern boolean_t own_address(struct in6_addr addr); 468 extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set, 469 uint64_t clear); 470 471 extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag); 472 extern int probe_state_event(struct probe_stats *, struct phyint_instance *); 473 extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int); 474 475 extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *, 476 ipmp_addrinfo_t **); 477 extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **); 478 extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **); 479 extern unsigned int getgrouplist(ipmp_grouplist_t **); 480 extern unsigned int getsnap(ipmp_snap_t **); 481 482 extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t, 483 struct sockaddr_storage *); 484 extern void addrlist_free(addrlist_t **); 485 486 #ifdef __cplusplus 487 } 488 #endif 489 490 #endif /* _MPD_TABLES_H */ 491