/* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 1987 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms are permitted * provided that the above copyright notice and this paragraph are * duplicated in all such forms and that any documentation, * advertising materials, and other materials related to such * distribution and use acknowledge that the software was developed * by the University of California, Berkeley. The name of the * University may not be used to endorse or promote products derived * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ #pragma ident "%Z%%M% %I% %E% SMI" #include "mpd_defs.h" #include "mpd_tables.h" /* * Probe types for probe() */ #define PROBE_UNI 0x1234 /* Unicast probe packet */ #define PROBE_MULTI 0x5678 /* Multicast probe packet */ #define PROBE_RTT 0x9abc /* RTT only probe packet */ #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ /* * Format of probe / probe response packets. This is an ICMP Echo request * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6 */ struct pr_icmp { uint8_t pr_icmp_type; /* type field */ uint8_t pr_icmp_code; /* code field */ uint16_t pr_icmp_cksum; /* checksum field */ uint16_t pr_icmp_id; /* Identification */ uint16_t pr_icmp_seq; /* sequence number */ uint32_t pr_icmp_timestamp; /* Time stamp */ uint32_t pr_icmp_mtype; /* Message type */ }; static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1 } }; static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ static void *find_ancillary(struct msghdr *msg, int cmsg_type); static void pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni); static void incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr); static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr); static void incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr); static boolean_t check_pg_crtt_improved(struct phyint_group *pg); static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); static boolean_t check_exception_target(struct phyint_instance *pii, struct target *target); static void probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_fail_count *pfinfo); static void probe_success_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_success_count *psinfo); static boolean_t phyint_repaired(struct phyint *pi); static int failover(struct phyint *from, struct phyint *to); static int failback(struct phyint *from, struct phyint *to); static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); static int in_cksum(ushort_t *addr, int len); static void reset_snxt_basetimes(void); /* * CRTT - Conservative Round Trip Time Estimate * Probe success - A matching probe reply received before CRTT ms has elapsed * after sending the probe. * Probe failure - No probe reply received and more than CRTT ms has elapsed * after sending the probe. * * TLS - Time last success. Most recent probe ack received at this time. * TFF - Time first fail. The time of the earliest probe failure in * a consecutive series of probe failures. * NUM_PROBE_REPAIRS - Number of consecutive successful probes required * before declaring phyint repair. * NUM_PROBE_FAILS - Number of consecutive probe failures required to * declare a phyint failure. * * Phyint state diagram * * The state of a phyint that is capable of being probed, is completely * specified by the 5-tuple . * * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state * of the link (according to the driver). If the phyint is also configured * with a test address (the common case) and probe targets, then a phyint must * also successfully be able to send and receive probes in order to remain in * the PI_RUNNING state (otherwise, it transitions to PI_FAILED). * * Further, if a PI_RUNNING phyint is configured with a test address but is * unable to find any probe targets, it will transition to the PI_NOTARGETS * state, which indicates that the link is apparently functional but that * in.mpathd is unable to send probes to verify functionality (in this case, * in.mpathd makes the optimistic assumption that the interface is working * correctly and thus does not perform a failover, but reports the interface * as IPMP_IF_UNKNOWN through the async events and query interfaces). * * At any point, a phyint may be administratively marked offline via if_mpadm. * In this case, the interface always transitions to PI_OFFLINE, regardless * of its previous state. When the interface is later brought back online, * in.mpathd acts as if the interface is new (and thus it transitions to * PI_RUNNING or PI_FAILED based on the status of the link and the result of * its probes, if probes are sent). * * pi_state - PI_RUNNING or PI_FAILED * PI_RUNNING: The failure detection logic says the phyint is good. * PI_FAILED: The failure detection logic says the phyint has failed. * * pg_groupfailed - Group failure, all interfaces in the group have failed. * The pi_state may be either PI_FAILED or PI_NOTARGETS. * In the case of router targets, we assume that the current list of * targets obtained from the routing table, is still valid, so the * phyint stat is PI_FAILED. In the case of host targets, we delete the * list of targets, and multicast to the all hosts, to reconstruct the * target list. So the phyints are in the PI_NOTARGETS state. * * I - value of (pi_flags & IFF_INACTIVE) * IFF_INACTIVE: No failovers have been done to this phyint, from * other phyints. This phyint is inactive. Phyint can be a Standby. * When failback has been disabled (FAILOVER=no configured), * phyint can also be a non-STANDBY. In this case IFF_INACTIVE * is set when phyint subsequently recovers after a failure. * * pi_empty * This phyint has failed over successfully to another phyint, and * this phyint is currently "empty". It does not host any addresses or * multicast membership etc. This is the state of a phyint after a * failover from the phyint has completed successfully and no subsequent * 'failover to' or 'failback to' has occurred on the phyint. * IP guarantees that no new logicals will be hosted nor any multicast * joins permitted on the phyint, since the phyint is either failed or * inactive. pi_empty is set implies the phyint is either failed or * inactive. * * pi_full * The phyint hosts all of its own addresses that it "owns". If the * phyint was previously failed or inactive, failbacks to the phyint * has completed successfully. i.e. No more failbacks to this phyint * can produce any change in system state whatsoever. * * Not all 32 possible combinations of the above 5-tuple are possible. * Furthermore some of the above combinations are transient. They may occur * only because the failover or failback did not complete successfully. The * failover/failback will be retried and eventually a stable state will be * reached. * * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. * The following are the state machines. 'from' and 'to' are the src and * dst of the failover/failback, below * * pi_empty state machine * --------------------------------------------------------------------------- * Event State -> New State * --------------------------------------------------------------------------- * successful completion from.pi_empty = 0 -> from.pi_empty = 1 * of failover * * Initiate failover to.pi_empty = X -> to.pi_empty = 0 * * Initiate failback to.pi_empty = X -> to.pi_empty = 0 * * group failure pi_empty = X -> pi_empty = 0 * --------------------------------------------------------------------------- * * pi_full state machine * --------------------------------------------------------------------------- * Event State -> New State * --------------------------------------------------------------------------- * successful completion to.pi_full = 0 -> to.pi_full = 1 * of failback from * each of the other phyints * * Initiate failover from.pi_full = X -> from.pi_full = 0 * * group failure pi_full = X -> pi_full = 0 * --------------------------------------------------------------------------- * * pi_state state machine * --------------------------------------------------------------------------- * Event State New State * Action: * --------------------------------------------------------------------------- * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint * : failover from this phyint to another * * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) * detection : set IFF_FAILED on this phyint * * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) * detection -> (PI_RUNNING, I == 0) * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint * : failback to this phyint if enabled * * NIC repair (PI_FAILED, I == 0, FAILBACK=no) * detection -> (PI_RUNNING, I == 1) * : to.pi_empty = 0 * : clear IFF_FAILED on this phyint * : if failback is disabled set I == 1 * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_FAILED * (Router targets) : set IFF_FAILED * : clear pi_empty and pi_full * * Group failure (perform on all phyints in the group) * detection PI_RUNNING PI_NOTARGETS * (Host targets) : set IFF_FAILED * : clear pi_empty and pi_full * : delete the target list on all phyints * --------------------------------------------------------------------------- * * I state machine * --------------------------------------------------------------------------- * Event State Action: * --------------------------------------------------------------------------- * Turn on I pi_empty == 0, STANDBY : failover from standby * * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 * pi_full == 0 : failback to this if enabled * --------------------------------------------------------------------------- * * Assertions: (Read '==>' as implies) * * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) * (pi_empty == 1) ==> (pi_full == 0) * (pi_full == 1) ==> (pi_empty == 0) * * Invariants * * pg_groupfailed = 0 && * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint * * 1. says that an inactive standby, that is not empty, has to be failed * over. For a standby to be truly inactive, it should not host any * addresses. So we move them to some other phyint. Usually we catch the * turn on of IFF_INACTIVE, and perform this action. However if the failover * did not complete successfully, then subsequently we have lost the edge * trigger, and this invariant kicks in and completes the action. * * 2. says that any failed phyint that is not empty must be failed over. * Usually we do the failover when we detect NIC failure. However if the * failover does not complete successfully, this invariant kicks in and * completes the failover. We exclude inactive standby which is covered by 1. * * 3. says that any running phyint that is not full must be failed back. * Usually we do the failback when we detect NIC repair. However if the * failback does not complete successfully, this invariant kicks in and * completes the failback. Note that we don't want to failback to an inactive * standby. * * The invariants 1 - 3 and the actions are in initifs(). */ struct probes_missed probes_missed; /* * Compose and transmit an ICMP ECHO REQUEST packet. The IP header * will be added on by the kernel. The id field identifies this phyint. * and the sequence number is an increasing (modulo 2^^16) integer. The data * portion holds the time value when the packet is sent. On echo this is * extracted to compute the round-trip time. Three different types of * probe packets are used. * * PROBE_UNI: This type is used to do failure detection / failure recovery * and RTT calculation. PROBE_UNI probes are spaced apart in time, * not less than the current CRTT. pii_probes[] stores data * about these probes. These packets consume sequence number space. * * PROBE_RTT: This type is used to make only rtt measurments. Normally these * are not used. Under heavy network load, the rtt may go up very high, * due to a spike, or may appear to go high, due to extreme scheduling * delays. Once the network stress is removed, mpathd takes long time to * recover, because the probe_interval is already high, and it takes * a long time to send out sufficient number of probes to bring down the * rtt. To avoid this problem, PROBE_RTT probes are sent out every * user_probe_interval ms. and will cause only rtt updates. These packets * do not consume sequence number space nor is information about these * packets stored in the pii_probes[] * * PROBE_MULTI: This type is only used to construct a list of targets, when * no targets are known. The packet is multicast to the all hosts addr. */ static void probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) { struct pr_icmp probe_pkt; /* Probe packet */ struct sockaddr_in6 whereto6; /* target address IPv6 */ struct sockaddr_in whereto; /* target address IPv4 */ int pr_ndx; /* probe index in pii->pii_probes[] */ boolean_t sent = _B_TRUE; if (debug & D_TARGET) { logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), pii->pii_name, probe_type, cur_time); } assert(pii->pii_probe_sock != -1); assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || probe_type == PROBE_RTT); probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; probe_pkt.pr_icmp_code = 0; probe_pkt.pr_icmp_cksum = 0; probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); /* * Since there is no need to do arithmetic on the icmpid, * (only equality check is done) pii_icmpid is stored in * network byte order at initialization itself. */ probe_pkt.pr_icmp_id = pii->pii_icmpid; probe_pkt.pr_icmp_timestamp = htonl(cur_time); probe_pkt.pr_icmp_mtype = htonl(probe_type); /* * If probe_type is PROBE_MULTI, this packet will be multicast to * the all hosts address. Otherwise it is unicast to the next target. */ assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && pii->pii_rtt_target_next != NULL)); if (pii->pii_af == AF_INET6) { bzero(&whereto6, sizeof (whereto6)); whereto6.sin6_family = AF_INET6; if (probe_type == PROBE_MULTI) { whereto6.sin6_addr = all_nodes_mcast_v6; } else if (probe_type == PROBE_UNI) { whereto6.sin6_addr = pii->pii_target_next->tg_address; } else { /* type is PROBE_RTT */ whereto6.sin6_addr = pii->pii_rtt_target_next->tg_address; } if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, sizeof (whereto6)) != sizeof (probe_pkt)) { logperror_pii(pii, "probe: probe sendto"); sent = _B_FALSE; } } else { bzero(&whereto, sizeof (whereto)); whereto.sin_family = AF_INET; if (probe_type == PROBE_MULTI) { whereto.sin_addr = all_nodes_mcast_v4; } else if (probe_type == PROBE_UNI) { IN6_V4MAPPED_TO_INADDR( &pii->pii_target_next->tg_address, &whereto.sin_addr); } else { /* type is PROBE_RTT */ IN6_V4MAPPED_TO_INADDR( &pii->pii_rtt_target_next->tg_address, &whereto.sin_addr); } /* * Compute the IPv4 icmp checksum. Does not cover the IP header. */ probe_pkt.pr_icmp_cksum = in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, sizeof (whereto)) != sizeof (probe_pkt)) { logperror_pii(pii, "probe: probe sendto"); sent = _B_FALSE; } } /* * If this is a PROBE_UNI probe packet being unicast to a target, then * update our tables. We will need this info in processing the probe * response. PROBE_MULTI and PROBE_RTT packets are not used for * the purpose of failure or recovery detection. PROBE_MULTI packets * are only used to construct a list of targets. PROBE_RTT packets are * used only for updating the rtt and not for failure detection. */ if (probe_type == PROBE_UNI && sent) { pr_ndx = pii->pii_probe_next; assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); /* Collect statistics, before we reuse the last slot. */ if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) pii->pii_cum_stats.lost++; else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) pii->pii_cum_stats.acked++; pii->pii_cum_stats.sent++; pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; pii->pii_probes[pr_ndx].pr_time_sent = cur_time; pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); pii->pii_target_next = target_next(pii->pii_target_next); assert(pii->pii_target_next != NULL); /* * If we have a single variable to denote the next target to * probe for both rtt probes and failure detection probes, we * could end up with a situation where the failure detection * probe targets become disjoint from the rtt probe targets. * Eg. if 2 targets and the actual fdt is double the user * specified fdt. So we have 2 variables. In this scheme * we also reset pii_rtt_target_next for every fdt probe, * though that may not be necessary. */ pii->pii_rtt_target_next = pii->pii_target_next; pii->pii_snxt++; } else if (probe_type == PROBE_RTT) { pii->pii_rtt_target_next = target_next(pii->pii_rtt_target_next); assert(pii->pii_rtt_target_next != NULL); } } /* * Incoming IPv4 data from wire, is received here. Called from main. */ void in_data(struct phyint_instance *pii) { struct sockaddr_in from; struct in6_addr fromaddr; uint_t fromlen; static uint_t in_packet[(IP_MAXPACKET + 1)/4]; struct ip *ip; int iphlen; int len; char abuf[INET_ADDRSTRLEN]; struct pr_icmp *reply; if (debug & D_PROBE) { logdebug("in_data(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } /* * Poll has already told us that a message is waiting, * on this socket. Read it now. We should not block. */ fromlen = sizeof (from); len = recvfrom(pii->pii_probe_sock, (char *)in_packet, sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); if (len < 0) { logperror_pii(pii, "in_data: recvfrom"); return; } /* * If the NIC has indicated the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) return; /* Get the printable address for error reporting */ (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); /* Make sure packet contains at least minimum ICMP header */ ip = (struct ip *)in_packet; iphlen = ip->ip_hl << 2; if (len < iphlen + ICMP_MINLEN) { if (debug & D_PKTBAD) { logdebug("in_data: packet too short (%d bytes)" " from %s\n", len, abuf); } return; } /* * Subtract the IP hdr length, 'len' will be length of the probe * reply, starting from the icmp hdr. */ len -= iphlen; /* LINTED */ reply = (struct pr_icmp *)((char *)in_packet + iphlen); /* Probe replies are icmp echo replies. Ignore anything else */ if (reply->pr_icmp_type != ICMP_ECHO_REPLY) return; /* * The icmp id should match what we sent, which is stored * in pi_icmpid. The icmp code for reply must be 0. * The reply content must be a struct pr_icmp */ if (reply->pr_icmp_id != pii->pii_icmpid) { /* Not in response to our probe */ return; } if (reply->pr_icmp_code != 0) { logtrace("probe reply code %d from %s on %s\n", reply->pr_icmp_code, abuf, pii->pii_name); return; } if (len < sizeof (struct pr_icmp)) { logtrace("probe reply too short: %d bytes from %s on %s\n", len, abuf, pii->pii_name); return; } IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) /* Unicast probe reply */ incoming_echo_reply(pii, reply, fromaddr); else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { /* Multicast reply */ incoming_mcast_reply(pii, reply, fromaddr); } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { incoming_rtt_reply(pii, reply, fromaddr); } else { /* Probably not in response to our probe */ logtrace("probe reply type: %d from %s on %s\n", reply->pr_icmp_mtype, abuf, pii->pii_name); return; } } /* * Incoming IPv6 data from wire is received here. Called from main. */ void in6_data(struct phyint_instance *pii) { struct sockaddr_in6 from; static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; int len; char abuf[INET6_ADDRSTRLEN]; struct msghdr msg; struct iovec iov; uchar_t *opt; struct pr_icmp *reply; if (debug & D_PROBE) { logdebug("in6_data(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } iov.iov_base = (char *)in_packet; iov.iov_len = sizeof (in_packet); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_name = (struct sockaddr *)&from; msg.msg_namelen = sizeof (from); msg.msg_control = ancillary_data; msg.msg_controllen = sizeof (ancillary_data); if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { logperror_pii(pii, "in6_data: recvfrom"); return; } /* * If the NIC has indicated that the link is down, don't go * any further. */ if (LINK_DOWN(pii->pii_phyint)) return; /* Get the printable address for error reporting */ (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); if (len < ICMP_MINLEN) { if (debug & D_PKTBAD) { logdebug("Truncated message: msg_flags 0x%x from %s\n", msg.msg_flags, abuf); } return; } /* Ignore packets > 64k or control buffers that don't fit */ if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { if (debug & D_PKTBAD) { logdebug("Truncated message: msg_flags 0x%x from %s\n", msg.msg_flags, abuf); } return; } reply = (struct pr_icmp *)in_packet; if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) return; if (reply->pr_icmp_id != pii->pii_icmpid) { /* Not in response to our probe */ return; } /* * The kernel has already verified the the ICMP checksum. */ if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { logtrace("ICMPv6 echo reply source address not linklocal from " "%s on %s\n", abuf, pii->pii_name); return; } opt = find_ancillary(&msg, IPV6_RTHDR); if (opt != NULL) { /* Can't allow routing headers in probe replies */ logtrace("message with routing header from %s on %s\n", abuf, pii->pii_name); return; } if (reply->pr_icmp_code != 0) { logtrace("probe reply code: %d from %s on %s\n", reply->pr_icmp_code, abuf, pii->pii_name); return; } if (len < (sizeof (struct pr_icmp))) { logtrace("probe reply too short: %d bytes from %s on %s\n", len, abuf, pii->pii_name); return; } if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { incoming_echo_reply(pii, reply, from.sin6_addr); } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { incoming_mcast_reply(pii, reply, from.sin6_addr); } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { incoming_rtt_reply(pii, reply, from.sin6_addr); } else { /* Probably not in response to our probe */ logtrace("probe reply type: %d from %s on %s\n", reply->pr_icmp_mtype, abuf, pii->pii_name); } } /* * Process the incoming rtt reply, in response to our rtt probe. * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't * have any stored information about the probe we sent. So we don't log * any errors if we receive bad replies. */ static void incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr) { int m; /* rtt measurment in ms */ uint32_t cur_time; /* in ms from some arbitrary point */ char abuf[INET6_ADDRSTRLEN]; struct target *target; uint32_t pr_icmp_timestamp; struct phyint_group *pg; /* Get the printable address for error reporting */ (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); if (debug & D_PROBE) { logdebug("incoming_rtt_reply: %s %s %s\n", AF_STR(pii->pii_af), pii->pii_name, abuf); } /* Do we know this target ? */ target = target_lookup(pii, fromaddr); if (target == NULL) return; pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); cur_time = getcurrenttime(); m = (int)(cur_time - pr_icmp_timestamp); /* Invalid rtt. It has wrapped around */ if (m < 0) return; /* * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses * The initial few responses after the interface is repaired may * contain high rtt's because they could have been queued up waiting * for ARP/NDP resolution on a failed interface. */ pg = pii->pii_phyint->pi_group; if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) return; /* * Update rtt only if the new rtt is lower than the current rtt. * (specified by the 3rd parameter to pi_set_crtt). * If a spike has caused the current probe_interval to be > * user_probe_interval, then this mechanism is used to bring down * the rtt rapidly once the network stress is removed. * If the new rtt is higher than the current rtt, we don't want to * update the rtt. We are having more than 1 outstanding probe and * the increase in rtt we are seeing is being unnecessarily weighted * many times. The regular rtt update will be handled by * incoming_echo_reply() and will take care of any rtt increase. */ pi_set_crtt(target, m, _B_FALSE); if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && (user_failure_detection_time < pg->pg_fdt) && (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { /* * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, * investigate if we can improve the failure detection time to * meet whatever the user specified. */ if (check_pg_crtt_improved(pg)) { pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, user_failure_detection_time); pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); if (pii->pii_phyint->pi_group != phyint_anongroup) { logerr("Improved failure detection time %d ms " "on (%s %s) for group \"%s\"\n", pg->pg_fdt, AF_STR(pii->pii_af), pii->pii_name, pii->pii_phyint->pi_group->pg_name); } if (user_failure_detection_time == pg->pg_fdt) { /* Avoid any truncation or rounding errors */ pg->pg_probeint = user_probe_interval; /* * No more rtt probes will be sent. The actual * fdt has dropped to the user specified value. * pii_fd_snxt_basetime and pii_snxt_basetime * will be in sync henceforth. */ reset_snxt_basetimes(); } } } } /* * Process the incoming echo reply, in response to our unicast probe. * Common for both IPv4 and IPv6 */ static void incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr) { int m; /* rtt measurment in ms */ uint32_t cur_time; /* in ms from some arbitrary point */ char abuf[INET6_ADDRSTRLEN]; int pr_ndx; struct target *target; boolean_t exception; uint32_t pr_icmp_timestamp; uint16_t pr_icmp_seq; struct phyint_group *pg = pii->pii_phyint->pi_group; /* Get the printable address for error reporting */ (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); if (debug & D_PROBE) { logdebug("incoming_echo_reply: %s %s %s seq %u\n", AF_STR(pii->pii_af), pii->pii_name, abuf, ntohs(reply->pr_icmp_seq)); } pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); pr_icmp_seq = ntohs(reply->pr_icmp_seq); /* Reject out of window probe replies */ if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { logtrace("out of window probe seq %u snxt %u on %s from %s\n", pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); pii->pii_cum_stats.unknown++; return; } cur_time = getcurrenttime(); m = (int)(cur_time - pr_icmp_timestamp); if (m < 0) { /* * This is a ridiculously high value of rtt. rtt has wrapped * around. Log a message, and ignore the rtt. */ logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " "timestamp %u\n", cur_time, pr_icmp_timestamp); } /* * Get the probe index pr_ndx corresponding to the received icmp seq. * number in our pii->pii_probes[] array. The icmp sequence number * pii_snxt corresponds to the probe index pii->pii_probe_next */ pr_ndx = MOD_SUB(pii->pii_probe_next, (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); target = pii->pii_probes[pr_ndx].pr_target; /* * Perform sanity checks, whether this probe reply that we * have received is genuine */ if (target != NULL) { /* * Compare the src. addr of the received ICMP or ICMPv6 * probe reply with the target address in our tables. */ if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { /* * We don't have any record of having sent a probe to * this target. This is a fake probe reply. Log an error */ logtrace("probe status %d Fake probe reply seq %u " "snxt %u on %s from %s\n", pii->pii_probes[pr_ndx].pr_status, pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); pii->pii_cum_stats.unknown++; return; } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { /* * The address matches, but our tables indicate that * this probe reply has been acked already. So this * is a duplicate probe reply. Log an error */ logtrace("probe status %d Duplicate probe reply seq %u " "snxt %u on %s from %s\n", pii->pii_probes[pr_ndx].pr_status, pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); pii->pii_cum_stats.unknown++; return; } } else { /* * Target must not be NULL in the PR_UNACKED state */ assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { /* * The probe stats slot is unused. So we didn't * send out any probe to this target. This is a fake. * Log an error. */ logtrace("probe status %d Fake probe reply seq %u " "snxt %u on %s from %s\n", pii->pii_probes[pr_ndx].pr_status, pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); } pii->pii_cum_stats.unknown++; return; } /* * If the rtt does not appear to be right, don't update the * rtt stats. This can happen if the system dropped into the * debugger, or the system was hung or too busy for a * substantial time that we didn't get a chance to run. */ if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { /* * If the probe corresponding to this receieved response * was truly sent 'm' ms. ago, then this response must * have been rejected by the sequence number checks. The * fact that it has passed the sequence number checks * means that the measured rtt is wrong. We were probably * scheduled long after the packet was received. */ goto out; } /* * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses * The initial few responses after the interface is repaired may * contain high rtt's because they could have been queued up waiting * for ARP/NDP resolution on a failed interface. */ if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) goto out; /* * Don't update the Conservative Round Trip Time estimate for this * (phint, target) pair if this is the not the highest ack seq seen * thus far on this target. */ if (!highest_ack_tg(pr_icmp_seq, target)) goto out; /* * Always update the rtt. This is a failure detection probe * and we want to measure both increase / decrease in rtt. */ pi_set_crtt(target, m, _B_TRUE); /* * If the crtt exceeds the average time between probes, * investigate if this slow target is an exception. If so we * can avoid this target and still meet the failure detection * time. Otherwise we can't meet the failure detection time. */ if (target->tg_crtt > pg->pg_probeint) { exception = check_exception_target(pii, target); if (exception) { /* * This target is exceptionally slow. Don't use it * for future probes. check_exception_target() has * made sure that we have at least MIN_PROBE_TARGETS * other active targets */ if (pii->pii_targets_are_routers) { /* * This is a slow router, mark it as slow * and don't use it for further probes. We * don't delete it, since it will be populated * again when we do a router scan. Hence we * need to maintain extra state (unlike the * host case below). Mark it as TG_SLOW. */ if (target->tg_status == TG_ACTIVE) pii->pii_ntargets--; target->tg_status = TG_SLOW; target->tg_latime = gethrtime(); target->tg_rtt_sa = -1; target->tg_crtt = 0; target->tg_rtt_sd = 0; if (pii->pii_target_next == target) { pii->pii_target_next = target_next(target); } } else { /* * the slow target is not a router, we can * just delete it. Send an icmp multicast and * pick the fastest responder that is not * already an active target. target_delete() * adjusts pii->pii_target_next */ target_delete(target); probe(pii, PROBE_MULTI, cur_time); } } else { /* * We can't meet the failure detection time. * Log a message, and update the detection time to * whatever we can achieve. */ pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); last_fdt_bumpup_time = gethrtime(); if (pg != phyint_anongroup) { logerr("Cannot meet requested failure detection" " time of %d ms on (%s %s) new failure" " detection time for group \"%s\" is %d" " ms\n", user_failure_detection_time, AF_STR(pii->pii_af), pii->pii_name, pg->pg_name, pg->pg_fdt); } } } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && (user_failure_detection_time < pg->pg_fdt) && (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { /* * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER * investigate if we can improve the failure detection time to * meet whatever the user specified. */ if (check_pg_crtt_improved(pg)) { pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, user_failure_detection_time); pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); if (pg != phyint_anongroup) { logerr("Improved failure detection time %d ms " "on (%s %s) for group \"%s\"\n", pg->pg_fdt, AF_STR(pii->pii_af), pii->pii_name, pg->pg_name); } if (user_failure_detection_time == pg->pg_fdt) { /* Avoid any truncation or rounding errors */ pg->pg_probeint = user_probe_interval; /* * No more rtt probes will be sent. The actual * fdt has dropped to the user specified value. * pii_fd_snxt_basetime and pii_snxt_basetime * will be in sync henceforth. */ reset_snxt_basetimes(); } } } out: pii->pii_probes[pr_ndx].pr_status = PR_ACKED; pii->pii_probes[pr_ndx].pr_time_acked = cur_time; /* * Update pii->pii_rack, i.e. the sequence number of the last received * probe response, based on the echo reply we have received now, if * either of the following conditions are satisfied. * a. pii_rack is outside the current receive window of * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). * This means we have not received probe responses for a * long time, and the sequence number has wrapped around. * b. pii_rack is within the current receive window and this echo * reply corresponds to the highest sequence number we have seen * so far. */ if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || SEQ_GT(pr_icmp_seq, pii->pii_rack)) { pii->pii_rack = pr_icmp_seq; } } /* * Returns true if seq is the highest unacknowledged seq for target tg * else returns false */ static boolean_t highest_ack_tg(uint16_t seq, struct target *tg) { struct phyint_instance *pii; int pr_ndx; uint16_t pr_seq; pii = tg->tg_phyint_inst; /* * Get the seq number of the most recent probe sent so far, * and also get the corresponding probe index in the probe stats * array. */ pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); pr_seq = pii->pii_snxt; pr_seq--; /* * Start from the most recent probe and walk back, trying to find * an acked probe corresponding to target tg. */ for (; pr_ndx != pii->pii_probe_next; pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { if (pii->pii_probes[pr_ndx].pr_target == tg && pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { if (SEQ_GT(pr_seq, seq)) return (_B_FALSE); } } return (_B_TRUE); } /* * Check whether the crtt for the group has improved by a factor of * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure * detection time flapping in the face of small crtt changes. */ static boolean_t check_pg_crtt_improved(struct phyint_group *pg) { struct phyint *pi; if (debug & D_PROBE) logdebug("check_pg_crtt_improved()\n"); /* * The crtt for the group is only improved if each phyint_instance * for both ipv4 and ipv6 is improved. */ for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { if (!check_pii_crtt_improved(pi->pi_v4) || !check_pii_crtt_improved(pi->pi_v6)) return (_B_FALSE); } return (_B_TRUE); } /* * Check whether the crtt has improved substantially on this phyint_instance. * Returns _B_TRUE if there's no crtt information available, because pii * is NULL or the phyint_instance is not capable of probing. */ boolean_t check_pii_crtt_improved(struct phyint_instance *pii) { struct target *tg; if (pii == NULL) return (_B_TRUE); if (!PROBE_CAPABLE(pii) || pii->pii_phyint->pi_state == PI_FAILED) return (_B_TRUE); for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { if (tg->tg_status != TG_ACTIVE) continue; if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / LOWER_FDT_TRIGGER)) { return (_B_FALSE); } } return (_B_TRUE); } /* * This target responds very slowly to probes. The target's crtt exceeds * the probe interval of its group. Compare against other targets * and determine if this target is an exception, if so return true, else false */ static boolean_t check_exception_target(struct phyint_instance *pii, struct target *target) { struct target *tg; char abuf[INET6_ADDRSTRLEN]; if (debug & D_PROBE) { logdebug("check_exception_target(%s %s target %s)\n", AF_STR(pii->pii_af), pii->pii_name, pr_addr(pii->pii_af, target->tg_address, abuf, sizeof (abuf))); } /* * We should have at least MIN_PROBE_TARGETS + 1 good targets now, * to make a good judgement. Otherwise don't drop this target. */ if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) return (_B_FALSE); /* * Determine whether only this particular target is slow. * We know that this target's crtt exceeds the group's probe interval. * If all other active targets have a * crtt < (this group's probe interval) / EXCEPTION_FACTOR, * then this target is considered slow. */ for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { if (tg != target && tg->tg_status == TG_ACTIVE) { if (tg->tg_crtt > pii->pii_phyint->pi_group->pg_probeint / EXCEPTION_FACTOR) { return (_B_FALSE); } } } return (_B_TRUE); } /* * Update the target list. The icmp all hosts multicast has given us * some host to which we can send probes. If we already have sufficient * targets, discard it. */ static void incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, struct in6_addr fromaddr) /* ARGSUSED */ { int af; char abuf[INET6_ADDRSTRLEN]; struct phyint *pi; if (debug & D_PROBE) { logdebug("incoming_mcast_reply(%s %s %s)\n", AF_STR(pii->pii_af), pii->pii_name, pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); } /* * Using host targets is a fallback mechanism. If we have * found a router, don't add this host target. If we already * know MAX_PROBE_TARGETS, don't add another target. */ assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); if (pii->pii_targets != NULL) { if (pii->pii_targets_are_routers || (pii->pii_ntargets == MAX_PROBE_TARGETS)) { return; } } if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { /* * Guard against response from 0.0.0.0 * and ::. Log a trace message */ logtrace("probe response from %s on %s\n", pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), pii->pii_name); return; } /* * This address is one of our own, so reject this address as a * valid probe target. */ af = pii->pii_af; if (own_address(fromaddr)) return; /* * If the phyint is part a named group, then add the address to all * members of the group. Otherwise, add the address only to the * phyint itself, since other phyints in the anongroup may not be on * the same subnet. */ pi = pii->pii_phyint; if (pi->pi_group == phyint_anongroup) { target_add(pii, fromaddr, _B_FALSE); } else { pi = pi->pi_group->pg_phyint; for (; pi != NULL; pi = pi->pi_pgnext) target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); } } /* * Compute CRTT given an existing scaled average, scaled deviation estimate * and a new rtt time. The formula is from Jacobson and Karels' * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names * are the same as those in Appendix A.2 of that paper. * * m = new measurement * sa = scaled RTT average (8 * average estimates) * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). * crtt = Conservative round trip time. Used to determine whether probe * has timed out. * * New scaled average and deviation are passed back via sap and svp */ static int compute_crtt(int *sap, int *svp, int m) { int sa = *sap; int sv = *svp; int crtt; int saved_m = m; assert(*sap >= -1); assert(*svp >= 0); if (sa != -1) { /* * Update average estimator: * new rtt = old rtt + 1/8 Error * where Error = m - old rtt * i.e. 8 * new rtt = 8 * old rtt + Error * i.e. new sa = old sa + Error */ m -= sa >> 3; /* m is now Error in estimate. */ if ((sa += m) < 0) { /* Don't allow the smoothed average to be negative. */ sa = 0; } /* * Update deviation estimator: * new mdev = old mdev + 1/4 (abs(Error) - old mdev) * i.e. 4 * new mdev = 4 * old mdev + * (abs(Error) - old mdev) * i.e. new sv = old sv + (abs(Error) - old mdev) */ if (m < 0) m = -m; m -= sv >> 2; sv += m; } else { /* Initialization. This is the first response received. */ sa = (m << 3); sv = (m << 1); } crtt = (sa >> 3) + sv; if (debug & D_PROBE) { logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " "%d\n", saved_m, sa, sv, crtt); } *sap = sa; *svp = sv; /* * CRTT = average estimates + 4 * deviation estimates * = sa / 8 + sv */ return (crtt); } static void pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) { struct phyint_instance *pii = tg->tg_phyint_inst; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; int sa = tg->tg_rtt_sa; int sv = tg->tg_rtt_sd; int new_crtt; int i; if (debug & D_PROBE) logdebug("pi_set_crtt: target - m %d\n", m); /* store the round trip time, in case we need to defer computation */ tg->tg_deferred[tg->tg_num_deferred] = m; new_crtt = compute_crtt(&sa, &sv, m); /* * If this probe's round trip time would singlehandedly cause an * increase in the group's probe interval consider it suspect. */ if ((new_crtt > probe_interval) && is_probe_uni) { if (debug & D_PROBE) { logdebug("Received a suspect probe on %s, new_crtt =" " %d, probe_interval = %d, num_deferred = %d\n", pii->pii_probe_logint->li_name, new_crtt, probe_interval, tg->tg_num_deferred); } /* * If we've deferred as many rtts as we plan on deferring, then * assume the link really did slow down and process all queued * rtts */ if (tg->tg_num_deferred == MAXDEFERREDRTT) { if (debug & D_PROBE) { logdebug("Received MAXDEFERREDRTT probes which " "would cause an increased probe_interval. " "Integrating queued rtt data points.\n"); } for (i = 0; i <= tg->tg_num_deferred; i++) { tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, &tg->tg_rtt_sd, tg->tg_deferred[i]); } tg->tg_num_deferred = 0; } else { tg->tg_num_deferred++; } return; } /* * If this is a normal probe, or an RTT probe that would lead to a * reduced CRTT, then update our CRTT data. Further, if this was * a normal probe, pitch any deferred probes since our probes are * again being answered within our CRTT estimates. */ if (is_probe_uni || new_crtt < tg->tg_crtt) { tg->tg_rtt_sa = sa; tg->tg_rtt_sd = sv; tg->tg_crtt = new_crtt; if (is_probe_uni) tg->tg_num_deferred = 0; } } /* * Return a pointer to the specified option buffer. * If not found return NULL. */ static void * find_ancillary(struct msghdr *msg, int cmsg_type) { struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == cmsg_type) { return (CMSG_DATA(cmsg)); } } return (NULL); } /* * See if a previously failed interface has started working again. */ void phyint_check_for_repair(struct phyint *pi) { if (phyint_repaired(pi)) { if (pi->pi_group == phyint_anongroup) { logerr("NIC repair detected on %s\n", pi->pi_name); } else { logerr("NIC repair detected on %s of group %s\n", pi->pi_name, pi->pi_group->pg_name); } /* * If the interface is offline, just clear the FAILED flag, * delaying the state change and failback operation until it * is brought back online. */ if (pi->pi_state == PI_OFFLINE) { (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); return; } if (pi->pi_flags & IFF_STANDBY) { (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); } else { if (try_failback(pi) != IPMP_FAILURE) { (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); /* Per state diagram */ pi->pi_empty = 0; } } phyint_chstate(pi, PI_RUNNING); if (GROUP_FAILED(pi->pi_group)) { /* * This is the 1st phyint to receive a response * after group failure. */ logerr("At least 1 interface (%s) of group %s has " "repaired\n", pi->pi_name, pi->pi_group->pg_name); phyint_group_chstate(pi->pi_group, PG_RUNNING); } } } /* * See if a previously functioning interface has failed, or if the * whole group of interfaces has failed. */ static void phyint_inst_check_for_failure(struct phyint_instance *pii) { struct phyint *pi; struct phyint *pi2; pi = pii->pii_phyint; switch (failure_state(pii)) { case PHYINT_FAILURE: (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); if (pi->pi_group == phyint_anongroup) { logerr("NIC failure detected on %s\n", pii->pii_name); } else { logerr("NIC failure detected on %s of group %s\n", pii->pii_name, pi->pi_group->pg_name); } /* * Do the failover, unless the interface is offline (in * which case we've already failed over). */ if (pi->pi_state != PI_OFFLINE) { phyint_chstate(pi, PI_FAILED); reset_crtt_all(pi); if (!(pi->pi_flags & IFF_INACTIVE)) (void) try_failover(pi, FAILOVER_NORMAL); } break; case GROUP_FAILURE: logerr("All Interfaces in group %s have failed\n", pi->pi_group->pg_name); for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { if (pi2->pi_flags & IFF_OFFLINE) continue; (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); reset_crtt_all(pi2); /* * In the case of host targets, we * would have flushed the targets, * and gone to PI_NOTARGETS state. */ if (pi2->pi_state == PI_RUNNING) phyint_chstate(pi2, PI_FAILED); pi2->pi_empty = 0; pi2->pi_full = 0; } break; default: break; } } /* * Determines if any timeout event has occurred and returns the number of * milliseconds until the next timeout event for the phyint. Returns * TIMER_INFINITY for "never". */ uint_t phyint_inst_timer(struct phyint_instance *pii) { int pr_ndx; uint_t timeout; struct target *cur_tg; struct probe_stats *pr_statp; struct phyint_instance *pii_other; struct phyint *pi; int valid_unack_count; int i; int interval; uint_t check_time; uint_t cur_time; hrtime_t cur_hrtime; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; cur_time = getcurrenttime(); if (debug & D_TIMER) { logdebug("phyint_inst_timer(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } pii_other = phyint_inst_other(pii); if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { /* * Check to see if we're here due to link up/down flapping; If * enough time has passed, then try to bring the interface * back up; otherwise, schedule a timer to bring it back up * when enough time *has* elapsed. */ pi = pii->pii_phyint; if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; if (check_time > cur_time) return (check_time - cur_time); phyint_check_for_repair(pi); } } /* * If probing is not enabled on this phyint instance, don't proceed. */ if (!PROBE_ENABLED(pii)) return (TIMER_INFINITY); /* * If the timer has fired too soon, probably triggered * by some other phyint instance, return the remaining * time */ if (TIME_LT(cur_time, pii->pii_snxt_time)) return (pii->pii_snxt_time - cur_time); /* * If the link is down, don't send any probes for now. */ if (LINK_DOWN(pii->pii_phyint)) return (TIMER_INFINITY); /* * Randomize the next probe time, between MIN_RANDOM_FACTOR * and MAX_RANDOM_FACTOR with respect to the base probe time. * Base probe time is strictly periodic. */ interval = GET_RANDOM( (int)(MIN_RANDOM_FACTOR * user_probe_interval), (int)(MAX_RANDOM_FACTOR * user_probe_interval)); pii->pii_snxt_time = pii->pii_snxt_basetime + interval; /* * Check if the current time > next time to probe. If so, we missed * sending 1 or more probes, probably due to heavy system load. At least * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we * were scheduled. Make adjustments to the times, in multiples of * user_probe_interval. */ if (TIME_GT(cur_time, pii->pii_snxt_time)) { int n; n = (cur_time - pii->pii_snxt_time) / user_probe_interval; pii->pii_snxt_time += (n + 1) * user_probe_interval; pii->pii_snxt_basetime += (n + 1) * user_probe_interval; logtrace("missed sending %d probes cur_time %u snxt_time %u" " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, pii->pii_snxt_basetime); /* Collect statistics about missed probes */ probes_missed.pm_nprobes += n + 1; probes_missed.pm_ntimes++; } pii->pii_snxt_basetime += user_probe_interval; interval = pii->pii_snxt_time - cur_time; if (debug & D_TARGET) { logdebug("cur_time %u snxt_time %u snxt_basetime %u" " interval %u\n", cur_time, pii->pii_snxt_time, pii->pii_snxt_basetime, interval); } /* * If no targets are known, we need to send an ICMP multicast. The * probe type is PROBE_MULTI. We'll check back in 'interval' msec * to see if we found a target. */ if (pii->pii_target_next == NULL) { assert(pii->pii_ntargets == 0); pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; probe(pii, PROBE_MULTI, cur_time); return (interval); } if ((user_probe_interval != probe_interval) && TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { /* * the failure detection (fd) probe timer has not yet fired. * Need to send only an rtt probe. The probe type is PROBE_RTT. */ probe(pii, PROBE_RTT, cur_time); return (interval); } /* * the fd probe timer has fired. Need to do all failure * detection / recovery calculations, and then send an fd probe * of type PROBE_UNI. */ if (user_probe_interval == probe_interval) { /* * We could have missed some probes, and then adjusted * pii_snxt_basetime above. Otherwise we could have * blindly added probe_interval to pii_fd_snxt_basetime. */ pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; } else { pii->pii_fd_snxt_basetime += probe_interval; if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { int n; n = (cur_time - pii->pii_fd_snxt_basetime) / probe_interval; pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; } } /* * We can have at most, the latest 2 probes that we sent, in * the PR_UNACKED state. All previous probes sent, are either * PR_LOST or PR_ACKED. An unacknowledged probe is considered * timed out if the probe's time_sent + the CRTT < currenttime. * For each of the last 2 probes, examine whether it has timed * out. If so, mark it PR_LOST. The probe stats is a circular array. */ pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); valid_unack_count = 0; for (i = 0; i < 2; i++) { pr_statp = &pii->pii_probes[pr_ndx]; cur_tg = pii->pii_probes[pr_ndx].pr_target; switch (pr_statp->pr_status) { case PR_ACKED: /* * We received back an ACK, so the switch clearly * is not dropping our traffic, and thus we can * enable failure detection immediately. */ if (pii->pii_fd_hrtime > gethrtime()) { if (debug & D_PROBE) { logdebug("successful probe on %s; " "ending quiet period\n", pii->pii_phyint->pi_name); } pii->pii_fd_hrtime = gethrtime(); } break; case PR_UNACKED: assert(cur_tg != NULL); /* * The crtt could be zero for some reason, * Eg. the phyint could be failed. If the crtt is * not available use group's probe interval, * which is a worst case estimate. */ if (cur_tg->tg_crtt != 0) { timeout = pr_statp->pr_time_sent + cur_tg->tg_crtt; } else { timeout = pr_statp->pr_time_sent + probe_interval; } if (TIME_LT(timeout, cur_time)) { pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = timeout; } else if (i == 1) { /* * We are forced to consider this probe * lost, as we can have at most 2 unack. * probes any time, and we will be sending a * probe at the end of this function. * Normally, we should not be here, but * this can happen if an incoming response * that was considered lost has increased * the crtt for this target, and also bumped * up the FDT. Note that we never cancel or * increase the current pii_time_left, so * when the timer fires, we find 2 valid * unacked probes, and they are yet to timeout */ pr_statp->pr_status = PR_LOST; pr_statp->pr_time_lost = cur_time; } else { /* * Only the most recent probe can enter * this 'else' arm. The second most recent * probe must take either of the above arms, * if it is unacked. */ valid_unack_count++; } break; } pr_ndx = PROBE_INDEX_PREV(pr_ndx); } /* * We send out 1 probe randomly in the interval between one half * and one probe interval for the group. Given that the CRTT is always * less than the group's probe interval, we can have at most 1 * unacknowledged probe now. All previous probes are either lost or * acked. */ assert(valid_unack_count == 0 || valid_unack_count == 1); /* * The timer has fired. Take appropriate action depending * on the current state of the phyint. * * PI_RUNNING state - Failure detection and failover * PI_FAILED state - Repair detection and failback */ switch (pii->pii_phyint->pi_state) { case PI_FAILED: /* * If the most recent probe (excluding unacked probes that * are yet to time out) has been acked, check whether the * phyint is now repaired. If the phyint is repaired, then * attempt failback, unless it is an inactive standby. */ if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { phyint_check_for_repair(pii->pii_phyint); } break; case PI_RUNNING: /* * It's possible our probes have been lost because of a * spanning-tree mandated quiet period on the switch. If so, * ignore the lost probes and consider the interface to still * be functioning. */ cur_hrtime = gethrtime(); if (pii->pii_fd_hrtime - cur_hrtime > 0) break; if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { /* * We have 1 or more failed probes (excluding unacked * probes that are yet to time out). Determine if the * phyint has failed. If so attempt a failover, * unless it is an inactive standby */ phyint_inst_check_for_failure(pii); } break; default: logerr("phyint_inst_timer: invalid state %d\n", pii->pii_phyint->pi_state); abort(); } /* * Start the next probe. probe() will also set pii->pii_probe_time_left * to the group's probe interval. If phyint_failed -> target_flush_hosts * was called, the target list may be empty. */ if (pii->pii_target_next != NULL) { probe(pii, PROBE_UNI, cur_time); /* * If we have just the one probe target, and we're not using * router targets, try to find another as we presently have * no resilience. */ if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) probe(pii, PROBE_MULTI, cur_time); } else { probe(pii, PROBE_MULTI, cur_time); } return (interval); } /* * Start the probe timer for an interface instance. */ void start_timer(struct phyint_instance *pii) { uint32_t interval; /* * Spread the base probe times (pi_snxt_basetime) across phyints * uniformly over the (curtime..curtime + the group's probe_interval). * pi_snxt_basetime is strictly periodic with a frequency of * the group's probe interval. The actual probe time pi_snxt_time * adds some randomness to pi_snxt_basetime and happens in probe(). * For the 1st probe on each phyint after the timer is started, * pi_snxt_time and pi_snxt_basetime are the same. */ interval = GET_RANDOM(0, (int)pii->pii_phyint->pi_group->pg_probeint); pii->pii_snxt_basetime = getcurrenttime() + interval; pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; pii->pii_snxt_time = pii->pii_snxt_basetime; timer_schedule(interval); } /* * Restart the probe timer on an interface instance. */ static void restart_timer(struct phyint_instance *pii) { /* * We don't need to restart the timer if it was never started in * the first place (pii->pii_basetime_inited not set), as the timer * won't have gone off yet. */ if (pii->pii_basetime_inited != 0) { if (debug & D_LINKNOTE) logdebug("restart timer: restarting timer on %s, " "address family %s\n", pii->pii_phyint->pi_name, AF_STR(pii->pii_af)); start_timer(pii); } } static void process_link_state_down(struct phyint *pi) { logerr("The link has gone down on %s\n", pi->pi_name); /* * Clear the probe statistics arrays, we don't want the repair * detection logic relying on probes that were succesful prior * to the link going down. */ if (PROBE_CAPABLE(pi->pi_v4)) clear_pii_probe_stats(pi->pi_v4); if (PROBE_CAPABLE(pi->pi_v6)) clear_pii_probe_stats(pi->pi_v6); /* * Check for interface failure. Although we know the interface * has failed, we don't know if all the other interfaces in the * group have failed as well. */ if ((pi->pi_state == PI_RUNNING) || (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { if (debug & D_LINKNOTE) { logdebug("process_link_state_down:" " checking for failure on %s\n", pi->pi_name); } if (pi->pi_v4 != NULL) phyint_inst_check_for_failure(pi->pi_v4); else if (pi->pi_v6 != NULL) phyint_inst_check_for_failure(pi->pi_v6); } } static void process_link_state_up(struct phyint *pi) { logerr("The link has come up on %s\n", pi->pi_name); /* * We stopped any running timers on each instance when the link * went down, so restart them. */ if (pi->pi_v4) restart_timer(pi->pi_v4); if (pi->pi_v6) restart_timer(pi->pi_v6); phyint_check_for_repair(pi); pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); if (pi->pi_whendx == LINK_UP_PERMIN) pi->pi_whendx = 0; } /* * Process any changes in link state passed up from the interfaces. */ void process_link_state_changes(void) { struct phyint *pi; /* Look for interfaces where the link state has just changed */ for (pi = phyints; pi != NULL; pi = pi->pi_next) { boolean_t old_link_state_up = LINK_UP(pi); /* * Except when the "phyint" structure is created, this is * the only place the link state is updated. This allows * this routine to detect changes in link state, rather * than just the current state. */ UPDATE_LINK_STATE(pi); if (LINK_DOWN(pi)) { /* * Has link just gone down? */ if (old_link_state_up) process_link_state_down(pi); } else { /* * Has link just gone back up? */ if (!old_link_state_up) process_link_state_up(pi); } } } void reset_crtt_all(struct phyint *pi) { struct phyint_instance *pii; struct target *tg; pii = pi->pi_v4; if (pii != NULL) { for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { tg->tg_crtt = 0; tg->tg_rtt_sa = -1; tg->tg_rtt_sd = 0; } } pii = pi->pi_v6; if (pii != NULL) { for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { tg->tg_crtt = 0; tg->tg_rtt_sa = -1; tg->tg_rtt_sd = 0; } } } /* * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive * probes on both instances IPv4 and IPv6. * If the interface has failed, return the time of the first probe failure * in "tff". */ static int phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) { uint_t pi_tff; struct target *cur_tg; struct probe_fail_count pfinfo; struct phyint_instance *pii_other; int pr_ndx; /* * Get the number of consecutive failed probes on * this phyint across all targets. Also get the number * of consecutive failed probes on this target only */ pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); cur_tg = pii->pii_probes[pr_ndx].pr_target; probe_fail_info(pii, cur_tg, &pfinfo); /* Get the time of first failure, for later use */ pi_tff = pfinfo.pf_tff; /* * If the current target has not responded to the * last NUM_PROBE_FAILS probes, and other targets are * responding delete this target. Dead gateway detection * will eventually remove this target (if router) from the * routing tables. If that does not occur, we may end * up adding this to our list again. */ if (pfinfo.pf_nfail < NUM_PROBE_FAILS && pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { if (pii->pii_targets_are_routers) { if (cur_tg->tg_status == TG_ACTIVE) pii->pii_ntargets--; cur_tg->tg_status = TG_DEAD; cur_tg->tg_crtt = 0; cur_tg->tg_rtt_sa = -1; cur_tg->tg_rtt_sd = 0; if (pii->pii_target_next == cur_tg) pii->pii_target_next = target_next(cur_tg); } else { target_delete(cur_tg); probe(pii, PROBE_MULTI, getcurrenttime()); } return (PHYINT_OK); } /* * If the phyint has lost NUM_PROBE_FAILS or more * consecutive probes, on both IPv4 and IPv6 protocol * instances of the phyint, then trigger failure * detection, else return false */ if (pfinfo.pf_nfail < NUM_PROBE_FAILS) return (PHYINT_OK); pii_other = phyint_inst_other(pii); if (PROBE_CAPABLE(pii_other)) { probe_fail_info(pii_other, NULL, &pfinfo); if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { /* * We have NUM_PROBE_FAILS or more failures * on both IPv4 and IPv6. Get the earliest * time when failure was detected on this * phyint across IPv4 and IPv6. */ if (TIME_LT(pfinfo.pf_tff, pi_tff)) pi_tff = pfinfo.pf_tff; } else { /* * This instance has < NUM_PROBE_FAILS failure. * So return false */ return (PHYINT_OK); } } *tff = pi_tff; return (PHYINT_FAILURE); } /* * Check if the link has gone down on this phyint, or it has failed the * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. * Also look at other phyints of this group, for group failures. */ int failure_state(struct phyint_instance *pii) { struct probe_success_count psinfo; uint_t pi2_tls; /* time last success */ uint_t pi_tff; /* time first fail */ struct phyint *pi2; struct phyint *pi; struct phyint_instance *pii2; struct phyint_group *pg; boolean_t alone; if (debug & D_FAILOVER) logdebug("phyint_failed(%s)\n", pii->pii_name); pi = pii->pii_phyint; pg = pi->pi_group; if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == PHYINT_OK) return (PHYINT_OK); /* * At this point, the link is down, or the phyint is suspect, * as it has lost NUM_PROBE_FAILS or more probes. If the phyint * does not belong to any group, or is the only member of the * group capable of being probed, return PHYINT_FAILURE. */ alone = _B_TRUE; if (pg != phyint_anongroup) { for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { if (pi2 == pi) continue; if (PROBE_CAPABLE(pi2->pi_v4) || PROBE_CAPABLE(pi2->pi_v6)) { alone = _B_FALSE; break; } } } if (alone) return (PHYINT_FAILURE); /* * Need to compare against other phyints of the same group * to exclude group failures. If the failure was detected via * probing, then if the time of last success (tls) of any * phyint is more recent than the time of first fail (tff) of the * phyint in question, and the link is up on the phyint, * then it is a phyint failure. Otherwise it is a group failure. * If failure was detected via a link down notification sent from * the driver to IP, we see if any phyints in the group are still * running and haven't received a link down notification. We * will usually be processing the link down notification shortly * after it was received, so there is no point looking at the tls * of other phyints. */ for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { /* Exclude ourself from comparison */ if (pi2 == pi) continue; if (LINK_DOWN(pi)) { /* * We use FLAGS_TO_LINK_STATE() to test the * flags directly, rather then LINK_UP() or * LINK_DOWN(), as we may not have got round * to processing the link state for the other * phyints in the group yet. * * The check for PI_RUNNING and group * failure handles the case when the * group begins to recover. The first * phyint to recover should not trigger * a failover from the soon-to-recover * other phyints to the first recovered * phyint. PI_RUNNING will be set, and * pg_groupfailed cleared only after * receipt of NUM_PROBE_REPAIRS, by * which time the other phyints should * have received at least 1 packet, * and so will not have NUM_PROBE_FAILS. */ if ((pi2->pi_state == PI_RUNNING) && !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) return (PHYINT_FAILURE); } else { /* * Need to compare against both IPv4 and * IPv6 instances. */ pii2 = pi2->pi_v4; if (pii2 != NULL) { probe_success_info(pii2, NULL, &psinfo); if (psinfo.ps_tls_valid) { pi2_tls = psinfo.ps_tls; /* * See comment above regarding check * for PI_RUNNING and group failure. */ if (TIME_GT(pi2_tls, pi_tff) && (pi2->pi_state == PI_RUNNING) && !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) return (PHYINT_FAILURE); } } pii2 = pi2->pi_v6; if (pii2 != NULL) { probe_success_info(pii2, NULL, &psinfo); if (psinfo.ps_tls_valid) { pi2_tls = psinfo.ps_tls; /* * See comment above regarding check * for PI_RUNNING and group failure. */ if (TIME_GT(pi2_tls, pi_tff) && (pi2->pi_state == PI_RUNNING) && !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) return (PHYINT_FAILURE); } } } } /* * Change the group state to PG_FAILED if it's not already. */ if (!GROUP_FAILED(pg)) phyint_group_chstate(pg, PG_FAILED); return (GROUP_FAILURE); } /* * Return the information associated with consecutive probe successes * starting with the most recent probe. At most the last 2 probes can be * in the unacknowledged state. All previous probes have either failed * or succeeded. */ static void probe_success_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_success_count *psinfo) { uint_t i; struct probe_stats *pr_statp; uint_t most_recent; uint_t second_most_recent; boolean_t pi_found_failure = _B_FALSE; boolean_t tg_found_failure = _B_FALSE; uint_t now; uint_t timeout; struct target *tg; if (debug & D_FAILOVER) logdebug("probe_success_info(%s)\n", pii->pii_name); bzero(psinfo, sizeof (*psinfo)); now = getcurrenttime(); /* * Start with the most recent probe, and count the number * of consecutive probe successes. Latch the number of successes * on hitting a failure. */ most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); second_most_recent = PROBE_INDEX_PREV(most_recent); for (i = most_recent; i != pii->pii_probe_next; i = PROBE_INDEX_PREV(i)) { pr_statp = &pii->pii_probes[i]; switch (pr_statp->pr_status) { case PR_UNACKED: /* * Only the most recent 2 probes can be unacknowledged */ assert(i == most_recent || i == second_most_recent); tg = pr_statp->pr_target; assert(tg != NULL); /* * The crtt could be zero for some reason, * Eg. the phyint could be failed. If the crtt is * not available use the value of the group's probe * interval which is a worst case estimate. */ if (tg->tg_crtt != 0) { timeout = pr_statp->pr_time_sent + tg->tg_crtt; } else { timeout = pr_statp->pr_time_sent + pii->pii_phyint->pi_group->pg_probeint; } if (TIME_LT(timeout, now)) { /* * We hit a failure. Latch the total number of * recent consecutive successes. */ pr_statp->pr_time_lost = timeout; pr_statp->pr_status = PR_LOST; pi_found_failure = _B_TRUE; if (cur_tg != NULL && tg == cur_tg) { /* * We hit a failure for the desired * target. Latch the number of recent * consecutive successes for this target */ tg_found_failure = _B_TRUE; } } break; case PR_ACKED: /* * Bump up the count of probe successes, if we * have not seen any failure so far. */ if (!pi_found_failure) psinfo->ps_nsucc++; if (cur_tg != NULL && pr_statp->pr_target == cur_tg && !tg_found_failure) { psinfo->ps_nsucc_tg++; } /* * Record the time of last success, if this is * the most recent probe success. */ if (!psinfo->ps_tls_valid) { psinfo->ps_tls = pr_statp->pr_time_acked; psinfo->ps_tls_valid = _B_TRUE; } break; case PR_LOST: /* * We hit a failure. Latch the total number of * recent consecutive successes. */ pi_found_failure = _B_TRUE; if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { /* * We hit a failure for the desired target. * Latch the number of recent consecutive * successes for this target */ tg_found_failure = _B_TRUE; } break; default: return; } } } /* * Return the information associated with consecutive probe failures * starting with the most recent probe. Only the last 2 probes can be in the * unacknowledged state. All previous probes have either failed or succeeded. */ static void probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_fail_count *pfinfo) { int i; struct probe_stats *pr_statp; boolean_t tg_found_success = _B_FALSE; boolean_t pi_found_success = _B_FALSE; int most_recent; int second_most_recent; uint_t now; uint_t timeout; struct target *tg; if (debug & D_FAILOVER) logdebug("probe_fail_info(%s)\n", pii->pii_name); bzero(pfinfo, sizeof (*pfinfo)); now = getcurrenttime(); /* * Start with the most recent probe, and count the number * of consecutive probe failures. Latch the number of failures * on hitting a probe success. */ most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); second_most_recent = PROBE_INDEX_PREV(most_recent); for (i = most_recent; i != pii->pii_probe_next; i = PROBE_INDEX_PREV(i)) { pr_statp = &pii->pii_probes[i]; assert(PR_STATUS_VALID(pr_statp->pr_status)); switch (pr_statp->pr_status) { case PR_UNACKED: /* * Only the most recent 2 probes can be unacknowledged */ assert(i == most_recent || i == second_most_recent); tg = pr_statp->pr_target; /* * Target is guaranteed to exist in the unack. state */ assert(tg != NULL); /* * The crtt could be zero for some reason, * Eg. the phyint could be failed. If the crtt is * not available use the group's probe interval, * which is a worst case estimate. */ if (tg->tg_crtt != 0) { timeout = pr_statp->pr_time_sent + tg->tg_crtt; } else { timeout = pr_statp->pr_time_sent + pii->pii_phyint->pi_group->pg_probeint; } if (TIME_GT(timeout, now)) break; pr_statp->pr_time_lost = timeout; pr_statp->pr_status = PR_LOST; /* FALLTHRU */ case PR_LOST: if (!pi_found_success) { pfinfo->pf_nfail++; pfinfo->pf_tff = pr_statp->pr_time_lost; } if (cur_tg != NULL && pr_statp->pr_target == cur_tg && !tg_found_success) { pfinfo->pf_nfail_tg++; } break; default: /* * We hit a success or unused slot. Latch the * total number of recent consecutive failures. */ pi_found_success = _B_TRUE; if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { /* * We hit a success for the desired target. * Latch the number of recent consecutive * failures for this target */ tg_found_success = _B_TRUE; } } } } /* * Check if the phyint has been repaired. If no test address has been * configured, then consider the interface repaired if the link is up (unless * the link is flapping; see below). Otherwise, look for proof of probes * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on * either IPv4 or IPv6 instance, the phyint can be considered repaired. */ static boolean_t phyint_repaired(struct phyint *pi) { struct probe_success_count psinfo; struct phyint_instance *pii; struct target *cur_tg; int pr_ndx; uint_t cur_time; if (debug & D_FAILOVER) logdebug("phyint_repaired(%s)\n", pi->pi_name); if (LINK_DOWN(pi)) return (_B_FALSE); /* * If we don't have any test addresses and the link is up, then * consider the interface repaired, unless we've received more than * LINK_UP_PERMIN link up notifications in the last minute, in * which case we keep the link down until we drop back below * the threshold. */ if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { cur_time = getcurrenttime(); if ((pi->pi_whenup[pi->pi_whendx] == 0 || (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { pi->pi_lfmsg_printed = 0; return (_B_TRUE); } if (!pi->pi_lfmsg_printed) { logerr("The link has come up on %s more than %d times " "in the last minute; disabling failback until it " "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); pi->pi_lfmsg_printed = 1; } return (_B_FALSE); } pii = pi->pi_v4; if (PROBE_CAPABLE(pii)) { pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); cur_tg = pii->pii_probes[pr_ndx].pr_target; probe_success_info(pii, cur_tg, &psinfo); if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) return (_B_TRUE); } pii = pi->pi_v6; if (PROBE_CAPABLE(pii)) { pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); cur_tg = pii->pii_probes[pr_ndx].pr_target; probe_success_info(pii, cur_tg, &psinfo); if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) return (_B_TRUE); } return (_B_FALSE); } /* * Try failover from phyint 'pi' to a suitable destination. */ int try_failover(struct phyint *pi, int failover_type) { struct phyint *dst; int err; if (debug & D_FAILOVER) logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); /* * Attempt to find a failover destination 'dst'. * dst will be null if any of the following is true * Phyint is not part of a group OR * Phyint is the only member of a group OR * No suitable failover dst was available */ dst = get_failover_dst(pi, failover_type); if (dst == NULL) return (IPMP_EMINRED); dst->pi_empty = 0; /* Per state diagram */ pi->pi_full = 0; /* Per state diagram */ err = failover(pi, dst); if (debug & D_FAILOVER) { logdebug("failed over from %s to %s ret %d\n", pi->pi_name, dst->pi_name, err); } if (err == 0) { pi->pi_empty = 1; /* Per state diagram */ /* * we don't want to print out this message if a * phyint is leaving the group, nor for failover from * standby */ if (failover_type == FAILOVER_NORMAL) { logerr("Successfully failed over from NIC %s to NIC " "%s\n", pi->pi_name, dst->pi_name); } return (0); } else { /* * The failover did not succeed. We must retry the failover * only after resyncing our state based on the kernel's. * For eg. either the src or the dst might have been unplumbed * causing this failure. initifs() will be called again, * from main, since full_scan_required has been set to true * by failover(); */ return (IPMP_FAILURE); } } /* * global_errno captures the errno value, if failover() or failback() * fails. This is sent to if_mpadm(1M). */ int global_errno; /* * Attempt failover from phyint 'from' to phyint 'to'. * IP moves everything from phyint 'from' to phyint 'to'. */ static int failover(struct phyint *from, struct phyint *to) { struct lifreq lifr; int ret; if (debug & D_FAILOVER) { logdebug("failing over from %s to %s\n", from->pi_name, to->pi_name); } /* * Perform the failover. Both IPv4 and IPv6 are failed over * using a single ioctl by passing in AF_UNSPEC family. */ lifr.lifr_addr.ss_family = AF_UNSPEC; (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); lifr.lifr_movetoindex = to->pi_ifindex; ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); if (ret < 0) { global_errno = errno; logperror("failover: ioctl (failover)"); } /* * Set full_scan_required to true. This will make us read * the state from the kernel in initifs() and update our tables, * to reflect the current state after the failover. If the * failover has failed it will then reissue the failover. */ full_scan_required = _B_TRUE; return (ret); } /* * phyint 'pi' has recovered. Attempt failback from every phyint in the same * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. * Return values: * IPMP_SUCCESS: Failback successful from each of the other * phyints in the group. * IPMP_EFBPARTIAL: Failback successful from some of the other * phyints in the group. * IPMP_FAILURE: Failback syscall failed with some error. * * Note that failback is attempted regardless of the setting of the * failback_enabled flag. */ int do_failback(struct phyint *pi) { struct phyint *from; boolean_t done; boolean_t partial; boolean_t attempted_failback = _B_FALSE; if (debug & D_FAILOVER) logdebug("do_failback(%s)\n", pi->pi_name); /* If this phyint is not part of a named group, return. */ if (pi->pi_group == phyint_anongroup) { pi->pi_full = 1; return (IPMP_SUCCESS); } /* * Attempt failback from every phyint in the group to 'pi'. * The reason for doing this, instead of only from the * phyint to which we did the failover is given below. * * After 'pi' failed, if any app. tries to join on a multicast * address (IPv6), on the failed phyint, IP picks any arbitrary * non-failed phyint in the group, instead of the failed phyint, * in.mpathd is not aware of this. Thus failing back only from the * interface to which 'pi' failed over, will failback the ipif's * but not the ilm's. So we need to failback from all members of * the phyint group */ done = _B_TRUE; partial = _B_FALSE; for (from = pi->pi_group->pg_phyint; from != NULL; from = from->pi_pgnext) { /* Exclude ourself as a failback src */ if (from == pi) continue; /* * If the 'from' phyint has IPv4 plumbed, the 'to' * phyint must also have IPv4 plumbed. Similar check * for IPv6. IP makes the same check. Otherwise the * failback will fail. */ if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { partial = _B_TRUE; continue; } pi->pi_empty = 0; /* Per state diagram */ attempted_failback = _B_TRUE; if (failback(from, pi) != 0) { done = _B_FALSE; break; } } /* * We are done. No more phyint from which we can src the failback */ if (done) { if (!partial) pi->pi_full = 1; /* Per state diagram */ /* * Don't print out a message unless there is a * transition from FAILED to RUNNING. For eg. * we don't want to print out this message if a * phyint is leaving the group, or at startup */ if (attempted_failback && (pi->pi_flags & (IFF_FAILED | IFF_OFFLINE))) { logerr("Successfully failed back to NIC %s\n", pi->pi_name); } return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); } return (IPMP_FAILURE); } /* * This function is similar to do_failback() above, but respects the * failback_enabled flag for phyints in named groups. */ int try_failback(struct phyint *pi) { if (debug & D_FAILOVER) logdebug("try_failback(%s)\n", pi->pi_name); if (pi->pi_group != phyint_anongroup && !failback_enabled) return (IPMP_EFBDISABLED); return (do_failback(pi)); } /* * Failback everything from phyint 'from' that has the same ifindex * as phyint to's ifindex. */ static int failback(struct phyint *from, struct phyint *to) { struct lifreq lifr; int ret; if (debug & D_FAILOVER) logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); lifr.lifr_addr.ss_family = AF_UNSPEC; (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); lifr.lifr_movetoindex = to->pi_ifindex; ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); if (ret < 0) { global_errno = errno; logperror("failback: ioctl (failback)"); } /* * Set full_scan_required to true. This will make us read * the state from the kernel in initifs() and update our tables, * to reflect the current state after the failback. If the * failback has failed it will then reissue the failback. */ full_scan_required = _B_TRUE; return (ret); } /* * Select a target phyint for failing over from 'pi'. * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred * target phyint is chosen as follows, * 1. Pick any inactive standby interface. * 2. If no inactive standby is available, select any phyint in the * same group that has the least number of logints, (excluding * IFF_NOFAILOVER and !IFF_UP logints) * If we are failing over from a standby, failover_type is * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, * and we won't return NULL, as long as there is at least 1 other phyint * in the group. */ static struct phyint * get_failover_dst(struct phyint *pi, int failover_type) { struct phyint *maybe = NULL; struct phyint *pi2; struct phyint *last_choice = NULL; if (pi->pi_group == phyint_anongroup) return (NULL); /* * Loop thru the phyints in the group, and pick the preferred * phyint for the target. */ for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { /* Exclude ourself and offlined interfaces */ if (pi2 == pi || pi2->pi_state == PI_OFFLINE) continue; /* * The chosen target phyint must have IPv4 instance * plumbed, if the src phyint has IPv4 plumbed. Similarly * for IPv6. */ if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) continue; /* The chosen target must be PI_RUNNING. */ if (pi2->pi_state != PI_RUNNING) { last_choice = pi2; continue; } if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && (failover_type != FAILOVER_TO_NONSTANDBY)) { return (pi2); } else { if (maybe == NULL) maybe = pi2; else if (logint_upcount(pi2) < logint_upcount(maybe)) maybe = pi2; } } if (maybe == NULL && failover_type == FAILOVER_TO_ANY) return (last_choice); else return (maybe); } /* * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. */ boolean_t change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) { int ifsock; struct lifreq lifr; uint64_t old_flags; if (debug & D_FAILOVER) { logdebug("change_lif_flags(%s): flags %llx setfl %d\n", pi->pi_name, flags, (int)setfl); } if (pi->pi_v4 != NULL) { ifsock = ifsock_v4; } else { ifsock = ifsock_v6; } /* * Get the current flags from the kernel, and set/clear the * desired phyint flags. Since we set only phyint flags, we can * do it on either IPv4 or IPv6 instance. */ (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) logperror("change_lif_flags: ioctl (get flags)"); return (_B_FALSE); } old_flags = lifr.lifr_flags; if (setfl) lifr.lifr_flags |= flags; else lifr.lifr_flags &= ~flags; if (old_flags == lifr.lifr_flags) { /* No change in the flags. No need to send ioctl */ return (_B_TRUE); } if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) logperror("change_lif_flags: ioctl (set flags)"); return (_B_FALSE); } /* * Keep pi_flags in synch. with actual flags. Assumes flags are * phyint flags. */ if (setfl) pi->pi_flags |= flags; else pi->pi_flags &= ~flags; if (pi->pi_v4) pi->pi_v4->pii_flags = pi->pi_flags; if (pi->pi_v6) pi->pi_v6->pii_flags = pi->pi_flags; return (_B_TRUE); } /* * icmp cksum computation for IPv4. */ static int in_cksum(ushort_t *addr, int len) { register int nleft = len; register ushort_t *w = addr; register ushort_t answer; ushort_t odd_byte = 0; register int sum = 0; /* * Our algorithm is simple, using a 32 bit accumulator (sum), * we add sequential 16 bit words to it, and at the end, fold * back all the carry bits from the top 16 bits into the lower * 16 bits. */ while (nleft > 1) { sum += *w++; nleft -= 2; } /* mop up an odd byte, if necessary */ if (nleft == 1) { *(uchar_t *)(&odd_byte) = *(uchar_t *)w; sum += odd_byte; } /* * add back carry outs from top 16 bits to low 16 bits */ sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ sum += (sum >> 16); /* add carry */ answer = ~sum; /* truncate to 16 bits */ return (answer); } static void reset_snxt_basetimes(void) { struct phyint_instance *pii; for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; } } /* * Is the address one of our own addresses? Unfortunately, * we cannot check our phyint tables to determine if the address * is our own. This is because, we don't track interfaces that * are not part of any group. We have to either use a 'bind' or * get the complete list of all interfaces using SIOCGLIFCONF, * to do this check. We could also use SIOCTMYADDR. * Bind fails for the local zone address, so we might include local zone * address as target address. If local zone address is a target address * and it is up, it is not possible to detect the interface failure. * SIOCTMYADDR also doesn't consider local zone address as own address. * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they * are stored in laddr_list. */ boolean_t own_address(struct in6_addr addr) { struct local_addr *taddr = laddr_list; for (; taddr != NULL; taddr = taddr->next) { if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { return (_B_TRUE); } } return (_B_FALSE); }