/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 1987 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms are permitted * provided that the above copyright notice and this paragraph are * duplicated in all such forms and that any documentation, * advertising materials, and other materials related to such * distribution and use acknowledge that the software was developed * by the University of California, Berkeley. The name of the * University may not be used to endorse or promote products derived * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ #include "mpd_defs.h" #include "mpd_tables.h" /* * Probe types for probe() */ #define PROBE_UNI 0x1234 /* Unicast probe packet */ #define PROBE_MULTI 0x5678 /* Multicast probe packet */ #define PROBE_RTT 0x9abc /* RTT only probe packet */ #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ /* * Format of probe / probe response packets. This is an ICMP Echo request * or ICMP Echo reply. 
Packet format is same for both IPv4 and IPv6 */
/*
 * struct pr_icmp is the complete probe payload handed to sendto() and the
 * layout expected in every reply. Multi-byte fields are stored in network
 * byte order by probe().
 */
struct pr_icmp {
	uint8_t pr_icmp_type;		/* type field: echo request / reply */
	uint8_t pr_icmp_code;		/* code field: 0 in probes */
	uint16_t pr_icmp_cksum;		/* checksum field (filled for IPv4) */
	uint16_t pr_icmp_id;		/* Identification: pii_icmpid */
	uint16_t pr_icmp_seq;		/* sequence number: pii_snxt */
	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t pr_icmp_mtype;		/* Message type: a PROBE_* value */
};

/* IPv6 all-nodes multicast address (ff02::1); PROBE_MULTI destination. */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* IPv4 all-hosts multicast address (224.0.0.1); PROBE_MULTI destination. */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

static hrtime_t last_fdt_bumpup_time;	/* When FDT was bumped up last */

/* Forward declarations of the file-local helpers. */
static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
    int cmsg_type);
static void	pi_set_crtt(struct target *tg, int64_t m,
    boolean_t is_probe_uni);
static void	incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void	incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void	incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void	probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void	probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 	in_cksum(ushort_t *addr, int len);
static void	reset_snxt_basetimes(void);
static int	ns2ms(int64_t ns);
static int64_t	tv2ns(struct timeval *);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe
reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
 * 			before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *			declare a phyint failure.
 *
 * 			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 3-tuple <pi_state, pg_state, I>.
 *
 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
 * IFF_OFFLINE is set.  If the phyint is also configured with a test address
 * (the common case) and probe targets, then a phyint must also successfully
 * be able to send and receive probes in order to remain in the PI_RUNNING
 * state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not mark the interface FAILED, but reports it as
 * IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state.  When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state -  PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
 *	PG_OK: All interfaces in the group are OK.
 *	PG_DEGRADED: Some interfaces in the group are unusable.
 *	PG_FAILED: All interfaces in the group are unusable.
 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: This phyint will not send or receive packets.
 *	Usually, inactive is tied to standby interfaces that are not yet
 *	needed (e.g., no non-standby interfaces in the group have failed).
 *	When failback has been disabled (FAILBACK=no configured), phyint can
 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 *	subsequently recovers after a failure.
 *
 * Not all 9 possible combinations of the above 3-tuple are possible.
 *
 * I is tracked by IP. pi_state is tracked by mpathd.
*
 *			pi_state state machine
 * ---------------------------------------------------------------------------
 *	Event			State			New State
 *				Action:
 * ---------------------------------------------------------------------------
 * IP interface failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
 * detection			: set IFF_FAILED on this phyint
 *
 * IP interface failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
 * detection			: set IFF_FAILED on this phyint
 *
 * IP interface repair		(PI_FAILED, I == 0, FAILBACK=yes)
 * detection					-> (PI_RUNNING, I == 0)
 *				: clear IFF_FAILED on this phyint
 *
 * IP interface repair		(PI_FAILED, I == 0, FAILBACK=no)
 * detection					-> (PI_RUNNING, I == 1)
 *				: clear IFF_FAILED on this phyint
 *				: if failback is disabled set I == 1
 *
 * Group failure		(perform on all phyints in the group)
 * detection			PI_RUNNING		PI_FAILED
 * (Router targets)		: set IFF_FAILED
 *
 * Group failure		(perform on all phyints in the group)
 * detection			PI_RUNNING		PI_NOTARGETS
 * (Host targets)		: set IFF_FAILED
 *				: delete the target list on all phyints
 * ---------------------------------------------------------------------------
 */

/*
 * File-scope instance of struct probes_missed. It is not declared static,
 * so it is visible to other translation units.
 * NOTE(review): nothing in this part of the file reads or writes it;
 * presumably the probe scheduling code maintains it - confirm.
 */
struct probes_missed probes_missed;

/*
 * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
 * will be added on by the kernel.  The id field identifies this phyint,
 * and the sequence number is an increasing (modulo 2^16) integer. The data
 * portion holds the time value when the packet is sent. On echo this is
 * extracted to compute the round-trip time. Three different types of
 * probe packets are used.
 *
 * PROBE_UNI:	This type is used to do failure detection / failure recovery
 *		and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *		not less than the current CRTT. pii_probes[] stores data
 *		about these probes. These packets consume sequence number space.
 *
 * PROBE_RTT:	This type is used to make only rtt measurements. Normally these
 *		are not used.
Under heavy network load, the rtt may go up very high, * due to a spike, or may appear to go high, due to extreme scheduling * delays. Once the network stress is removed, mpathd takes long time to * recover, because the probe_interval is already high, and it takes * a long time to send out sufficient number of probes to bring down the * rtt. To avoid this problem, PROBE_RTT probes are sent out every * user_probe_interval ms. and will cause only rtt updates. These packets * do not consume sequence number space nor is information about these * packets stored in the pii_probes[] * * PROBE_MULTI: This type is only used to construct a list of targets, when * no targets are known. The packet is multicast to the all hosts addr. */ static void probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) { hrtime_t sent_hrtime; struct timeval sent_tv; struct pr_icmp probe_pkt; /* Probe packet */ struct sockaddr_storage targ; /* target address */ uint_t targaddrlen; /* targed address length */ int pr_ndx; /* probe index in pii->pii_probes[] */ boolean_t sent = _B_FALSE; int rval; if (debug & D_TARGET) { logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), pii->pii_name, probe_type, start_hrtime); } assert(pii->pii_probe_sock != -1); assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || probe_type == PROBE_RTT); probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; probe_pkt.pr_icmp_code = 0; probe_pkt.pr_icmp_cksum = 0; probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); /* * Since there is no need to do arithmetic on the icmpid, * (only equality check is done) pii_icmpid is stored in * network byte order at initialization itself. */ probe_pkt.pr_icmp_id = pii->pii_icmpid; probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); probe_pkt.pr_icmp_mtype = htonl(probe_type); /* * If probe_type is PROBE_MULTI, this packet will be multicast to * the all hosts address. Otherwise it is unicast to the next target. 
*/ assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && pii->pii_rtt_target_next != NULL)); bzero(&targ, sizeof (targ)); targ.ss_family = pii->pii_af; if (pii->pii_af == AF_INET6) { struct in6_addr *addr6; addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; targaddrlen = sizeof (struct sockaddr_in6); if (probe_type == PROBE_MULTI) { *addr6 = all_nodes_mcast_v6; } else if (probe_type == PROBE_UNI) { *addr6 = pii->pii_target_next->tg_address; } else { /* type is PROBE_RTT */ *addr6 = pii->pii_rtt_target_next->tg_address; } } else { struct in_addr *addr4; addr4 = &((struct sockaddr_in *)&targ)->sin_addr; targaddrlen = sizeof (struct sockaddr_in); if (probe_type == PROBE_MULTI) { *addr4 = all_nodes_mcast_v4; } else if (probe_type == PROBE_UNI) { IN6_V4MAPPED_TO_INADDR( &pii->pii_target_next->tg_address, addr4); } else { /* type is PROBE_RTT */ IN6_V4MAPPED_TO_INADDR( &pii->pii_rtt_target_next->tg_address, addr4); } /* * Compute the IPv4 icmp checksum. Does not cover the IP header. */ probe_pkt.pr_icmp_cksum = in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); } /* * Use the current time as the time we sent. Not atomic, but the best * we can do from here. */ sent_hrtime = gethrtime(); (void) gettimeofday(&sent_tv, NULL); rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, (struct sockaddr *)&targ, targaddrlen); /* * If the send would block, this may either be transient or a hang in a * lower layer. We pretend the probe was actually sent, the daemon will * not see a reply to the probe and will fail the interface if normal * failure detection criteria are met. */ if (rval == sizeof (probe_pkt) || (rval == -1 && errno == EWOULDBLOCK)) { sent = _B_TRUE; } else { logperror_pii(pii, "probe: probe sendto"); } /* * If this is a PROBE_UNI probe packet being unicast to a target, then * update our tables. We will need this info in processing the probe * response. 
PROBE_MULTI and PROBE_RTT packets are not used for * the purpose of failure or recovery detection. PROBE_MULTI packets * are only used to construct a list of targets. PROBE_RTT packets are * used only for updating the rtt and not for failure detection. */ if (probe_type == PROBE_UNI && sent) { pr_ndx = pii->pii_probe_next; assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); /* Collect statistics, before we reuse the last slot. */ if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) pii->pii_cum_stats.lost++; else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) pii->pii_cum_stats.acked++; pii->pii_cum_stats.sent++; pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); pii->pii_target_next = target_next(pii->pii_target_next); assert(pii->pii_target_next != NULL); /* * If we have a single variable to denote the next target to * probe for both rtt probes and failure detection probes, we * could end up with a situation where the failure detection * probe targets become disjoint from the rtt probe targets. * Eg. if 2 targets and the actual fdt is double the user * specified fdt. So we have 2 variables. In this scheme * we also reset pii_rtt_target_next for every fdt probe, * though that may not be necessary. */ pii->pii_rtt_target_next = pii->pii_target_next; pii->pii_snxt++; } else if (probe_type == PROBE_RTT) { pii->pii_rtt_target_next = target_next(pii->pii_rtt_target_next); assert(pii->pii_rtt_target_next != NULL); } } /* * Incoming IPv4 data from wire, is received here. Called from main. 
*/
/*
 * Reads one message from the probe socket of 'pii', validates it as an
 * ICMP echo reply to one of our probes, and hands it to the handler that
 * matches the probe type carried in the payload. Anything that fails a
 * check is dropped, with a debug or trace message in most cases.
 */
void
in_data(struct phyint_instance *pii)
{
	static uint64_t	pktbuf[(IP_MAXPACKET + 1)/8];
	static uint64_t	cmsgbuf[(IP_MAXPACKET + 1)/8];
	struct sockaddr_in src;
	struct in6_addr	mapped_src;
	struct msghdr	mh;
	struct iovec	vec;
	struct ip	*iph;
	struct pr_icmp	*reply;
	struct timeval	*recv_tvp;
	uint32_t	mtype;
	char		srcstr[INET_ADDRSTRLEN];
	int		hdrlen;
	int		nbytes;

	if (debug & D_PROBE) {
		logdebug("in_data(%s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name);
	}

	/* One receive buffer, a source address and a control buffer. */
	vec.iov_base = (char *)pktbuf;
	vec.iov_len = sizeof (pktbuf);
	mh.msg_iov = &vec;
	mh.msg_iovlen = 1;
	mh.msg_name = (struct sockaddr *)&src;
	mh.msg_namelen = sizeof (src);
	mh.msg_control = cmsgbuf;
	mh.msg_controllen = sizeof (cmsgbuf);

	/*
	 * Poll has already reported a pending message on this socket,
	 * so this read is not expected to block.
	 */
	nbytes = recvmsg(pii->pii_probe_sock, &mh, 0);
	if (nbytes < 0) {
		logperror_pii(pii, "in_data: recvmsg");
		return;
	}

	/* Nothing received on a link the datalink reports down is used. */
	if (LINK_DOWN(pii->pii_phyint))
		return;

	/* Printable source address, used by the messages below. */
	(void) inet_ntop(AF_INET, &src.sin_addr, srcstr, sizeof (srcstr));

	/* Drop packets > 64k and control data that did not fit. */
	if ((mh.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) != 0) {
		if (debug & D_PKTBAD) {
			logdebug("Truncated message: msg_flags 0x%x from %s\n",
			    mh.msg_flags, srcstr);
		}
		return;
	}

	/* The IP header plus a minimal ICMP header must be present. */
	iph = (struct ip *)pktbuf;
	hdrlen = iph->ip_hl << 2;
	if (nbytes < hdrlen + ICMP_MINLEN) {
		if (debug & D_PKTBAD) {
			logdebug("in_data: packet too short (%d bytes)"
			    " from %s\n", nbytes, srcstr);
		}
		return;
	}

	/* From here on 'nbytes' counts from the start of the icmp header. */
	nbytes -= hdrlen;

	/* LINTED */
	reply = (struct pr_icmp *)((char *)pktbuf + hdrlen);

	/* Only echo replies are of interest. */
	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
		return;

	/* The id must be ours (pii_icmpid), otherwise it is not our probe. */
	if (reply->pr_icmp_id != pii->pii_icmpid)
		return;

	/* The icmp code of a reply must be 0. */
	if (reply->pr_icmp_code != 0) {
		logtrace("probe reply code %d from %s on %s\n",
		    reply->pr_icmp_code, srcstr, pii->pii_name);
		return;
	}

	/* The payload must be a full struct pr_icmp. */
	if (nbytes < sizeof (struct pr_icmp)) {
		logtrace("probe reply too short: %d bytes from %s on %s\n",
		    nbytes, srcstr, pii->pii_name);
		return;
	}

	/* A receive timestamp must accompany the message. */
	recv_tvp = find_ancillary(&mh, SOL_SOCKET, SCM_TIMESTAMP);
	if (recv_tvp == NULL) {
		logtrace("message without timestamp from %s on %s\n",
		    srcstr, pii->pii_name);
		return;
	}

	/* The common handlers take the source as a v4-mapped address. */
	IN6_INADDR_TO_V4MAPPED(&src.sin_addr, &mapped_src);

	mtype = reply->pr_icmp_mtype;
	if (mtype == htonl(PROBE_UNI)) {
		/* Unicast probe reply */
		incoming_echo_reply(pii, reply, mapped_src, recv_tvp);
	} else if (mtype == htonl(PROBE_MULTI)) {
		/* Multicast reply */
		incoming_mcast_reply(pii, reply, mapped_src);
	} else if (mtype == htonl(PROBE_RTT)) {
		incoming_rtt_reply(pii, reply, mapped_src);
	} else {
		/* Probably not in response to our probe */
		logtrace("probe reply type: %d from %s on %s\n",
		    mtype, srcstr, pii->pii_name);
	}
}

/*
 * Incoming IPv6 data from wire is received here. Called from main.
*/
/*
 * Reads one message from the probe socket of 'pii', validates it as an
 * ICMPv6 echo reply to one of our probes, and hands it to the handler that
 * matches the probe type carried in the payload. Replies are dropped when
 * they are too short, truncated, not an echo reply, carry a foreign id or
 * a non-zero code, come from a non-link-local source, carry a routing
 * header, or lack a receive timestamp.
 */
void
in6_data(struct phyint_instance *pii)
{
	struct sockaddr_in6 from;
	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
	int len;
	char abuf[INET6_ADDRSTRLEN];
	struct msghdr msg;
	struct iovec iov;
	void	*opt;
	struct pr_icmp *reply;
	struct timeval *recv_tvp;

	if (debug & D_PROBE) {
		logdebug("in6_data(%s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name);
	}

	iov.iov_base = (char *)in_packet;
	iov.iov_len = sizeof (in_packet);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_name = (struct sockaddr *)&from;
	msg.msg_namelen = sizeof (from);
	msg.msg_control = ancillary_data;
	msg.msg_controllen = sizeof (ancillary_data);

	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
		logperror_pii(pii, "in6_data: recvmsg");
		return;
	}

	/*
	 * If the datalink has indicated that the link is down, don't go
	 * any further.
	 */
	if (LINK_DOWN(pii->pii_phyint))
		return;

	/* Get the printable address for error reporting */
	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));

	/*
	 * Make sure the packet contains at least the minimum ICMP header.
	 * This is a length problem, not a truncation flag, so report it the
	 * same way in_data() does.
	 */
	if (len < ICMP_MINLEN) {
		if (debug & D_PKTBAD) {
			logdebug("in6_data: packet too short (%d bytes)"
			    " from %s\n", len, abuf);
		}
		return;
	}

	/* Ignore packets > 64k or control buffers that don't fit */
	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
		if (debug & D_PKTBAD) {
			logdebug("Truncated message: msg_flags 0x%x from %s\n",
			    msg.msg_flags, abuf);
		}
		return;
	}

	/* No IP header is stripped here; the buffer starts at the reply. */
	reply = (struct pr_icmp *)in_packet;
	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
		return;

	if (reply->pr_icmp_id != pii->pii_icmpid) {
		/* Not in response to our probe */
		return;
	}

	/*
	 * The kernel has already verified the ICMP checksum.
	 */
	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
		logtrace("ICMPv6 echo reply source address not linklocal from "
		    "%s on %s\n", abuf, pii->pii_name);
		return;
	}
	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
	if (opt != NULL) {
		/* Can't allow routing headers in probe replies  */
		logtrace("message with routing header from %s on %s\n",
		    abuf, pii->pii_name);
		return;
	}

	if (reply->pr_icmp_code != 0) {
		logtrace("probe reply code: %d from %s on %s\n",
		    reply->pr_icmp_code, abuf, pii->pii_name);
		return;
	}
	if (len < (sizeof (struct pr_icmp))) {
		logtrace("probe reply too short: %d bytes from %s on %s\n",
		    len, abuf, pii->pii_name);
		return;
	}

	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
	if (recv_tvp == NULL) {
		logtrace("message without timestamp from %s on %s\n",
		    abuf, pii->pii_name);
		return;
	}

	/* Dispatch on the probe type echoed back in the payload. */
	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
		incoming_mcast_reply(pii, reply, from.sin6_addr);
	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
		incoming_rtt_reply(pii, reply, from.sin6_addr);
	} else  {
		/* Probably not in response to our probe */
		logtrace("probe reply type: %d from %s on %s\n",
		    reply->pr_icmp_mtype, abuf, pii->pii_name);
	}
}

/*
 * Process the incoming rtt reply, in response to our rtt probe.
 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
 * have any stored information about the probe we sent. So we don't log
 * any errors if we receive bad replies.
*/
static void
incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
{
	struct phyint_group *grp;
	struct target	*tg;
	int64_t		rtt_ns;		/* measured round trip, in ns */
	char		addrstr[INET6_ADDRSTRLEN];

	/* Printable source address for the debug message. */
	(void) pr_addr(pii->pii_af, fromaddr, addrstr, sizeof (addrstr));

	if (debug & D_PROBE) {
		logdebug("incoming_rtt_reply: %s %s %s\n",
		    AF_STR(pii->pii_af), pii->pii_name, addrstr);
	}

	/* Replies from addresses that are not known targets are ignored. */
	tg = target_lookup(pii, fromaddr);
	if (tg == NULL)
		return;

	/* A negative difference means the rtt wrapped around; ignore it. */
	rtt_ns = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
	if (rtt_ns < 0)
		return;

	/*
	 * No rtt update until NUM_PROBE_REPAIRS probe responses have been
	 * seen. The first few responses after a repair may show a high rtt
	 * because they could have been queued up waiting for ARP/NDP
	 * resolution on a failed interface.
	 */
	grp = pii->pii_phyint->pi_group;
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(grp))
		return;

	/*
	 * The 3rd argument to pi_set_crtt() asks for the rtt to be updated
	 * only when the new value is lower than the current one. When a
	 * spike has pushed the probe interval above user_probe_interval,
	 * this brings the rtt down quickly once the network stress is gone.
	 * A higher rtt is not applied here: more than one probe is
	 * outstanding and the increase would be weighted many times over.
	 * Increases are left to incoming_echo_reply().
	 */
	pi_set_crtt(tg, rtt_ns, _B_FALSE);

	/*
	 * Only when the crtt has dropped by a factor of LOWER_FDT_TRIGGER,
	 * the group's fdt is above what the user asked for, and the last
	 * bump-up has had MIN_SETTLING_TIME to settle, do we look at whether
	 * the failure detection time can be improved.
	 */
	if (tg->tg_crtt >= (grp->pg_probeint / LOWER_FDT_TRIGGER))
		return;
	if (user_failure_detection_time >= grp->pg_fdt)
		return;
	if (last_fdt_bumpup_time + MIN_SETTLING_TIME >= gethrtime())
		return;
	if (!check_pg_crtt_improved(grp))
		return;

	grp->pg_fdt = MAX(grp->pg_fdt / NEXT_FDT_MULTIPLE,
	    user_failure_detection_time);
	grp->pg_probeint = grp->pg_fdt / (NUM_PROBE_FAILS + 2);

	if (pii->pii_phyint->pi_group != phyint_anongroup) {
		logerr("Improved failure detection time %d ms "
		    "on (%s %s) for group \"%s\"\n",
		    grp->pg_fdt, AF_STR(pii->pii_af), pii->pii_name,
		    pii->pii_phyint->pi_group->pg_name);
	}

	if (user_failure_detection_time != grp->pg_fdt)
		return;

	/* Avoid any truncation or rounding errors */
	grp->pg_probeint = user_probe_interval;

	/*
	 * The actual fdt is back at the user specified value, so no more
	 * rtt probes will be sent. pii_fd_snxt_basetime and
	 * pii_snxt_basetime will be in sync henceforth.
	 */
	reset_snxt_basetimes();
}

/*
 * Process the incoming echo reply, in response to our unicast probe.
* Common for both IPv4 and IPv6 */
/*
 * Arguments:
 *	pii		- instance on which the reply arrived.
 *	reply		- the echoed probe payload.
 *	fromaddr	- source address of the reply; it is compared with
 *			  the target recorded for the matching probe slot.
 *	recv_tvp	- receive time of the reply, used to derive
 *			  pr_hrtime_ackrecv.
 *
 * A reply that is out of window, fake or duplicate is counted in
 * pii_cum_stats.unknown and dropped. Otherwise the probe slot is moved to
 * PR_ACKED and pii_rack is updated; the crtt and the group's failure
 * detection time are only adjusted when the measured rtt is usable.
 */
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr, struct timeval *recv_tvp)
{
	int64_t	m;		/* rtt measurement in ns */
	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
	char	abuf[INET6_ADDRSTRLEN];
	int	pr_ndx;
	struct	target *target;
	boolean_t exception;
	uint64_t pr_icmp_timestamp;
	uint16_t pr_icmp_seq;
	struct	probe_stats *pr_statp;
	struct	phyint_group *pg = pii->pii_phyint->pi_group;

	/* Get the printable address for error reporting */
	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));

	if (debug & D_PROBE) {
		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
		    AF_STR(pii->pii_af), pii->pii_name, abuf,
		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
	}

	/* Both fields travel in network byte order. */
	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
	pr_icmp_seq = ntohs(reply->pr_icmp_seq);

	/* Reject out of window probe replies */
	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		pii->pii_cum_stats.unknown++;
		return;
	}

	cur_hrtime = gethrtime();
	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
	if (m < 0) {
		/*
		 * This is a ridiculously high value of rtt. rtt has wrapped
		 * around. Log a message, and ignore the rtt. Processing
		 * continues; the m < 0 test further down skips the rtt
		 * update.
		 */
		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
	}

	/*
	 * Get the probe index pr_ndx corresponding to the received icmp seq.
	 * number in our pii->pii_probes[] array. The icmp sequence number
	 * pii_snxt corresponds to the probe index pii->pii_probe_next
	 */
	pr_ndx = MOD_SUB(pii->pii_probe_next,
	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);

	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));

	target = pii->pii_probes[pr_ndx].pr_target;

	/*
	 * Perform sanity checks, whether this probe reply that we
	 * have received is genuine
	 */
	if (target != NULL) {
		/*
		 * Compare the src. addr of the received ICMP or ICMPv6
		 * probe reply with the target address in our tables.
		 */
		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
			/*
			 * We don't have any record of having sent a probe to
			 * this target. This is a fake probe reply. Log an error
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			/*
			 * The address matches, but our tables indicate that
			 * this probe reply has been acked already. So this
			 * is a duplicate probe reply. Log an error
			 */
			logtrace("probe status %d Duplicate probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		}
	} else {
		/*
		 * Target must not be NULL in the PR_UNACKED state
		 */
		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
			/*
			 * The probe stats slot is unused. So we didn't
			 * send out any probe to this target. This is a fake.
			 * Log an error.
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		}
		/* With no recorded target the reply is dropped in any case. */
		pii->pii_cum_stats.unknown++;
		return;
	}

	/*
	 * If the rtt does not appear to be right, don't update the
	 * rtt stats. This can happen if the system dropped into the
	 * debugger, or the system was hung or too busy for a
	 * substantial time that we didn't get a chance to run.
	 */
	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
		/*
		 * If the probe corresponding to this received response
		 * was truly sent 'm' ns. ago, then this response must
		 * have been rejected by the sequence number checks. The
		 * fact that it has passed the sequence number checks
		 * means that the measured rtt is wrong. We were probably
		 * scheduled long after the packet was received.
		 */
		goto out;
	}

	/*
	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
	 * The initial few responses after the interface is repaired may
	 * contain high rtt's because they could have been queued up waiting
	 * for ARP/NDP resolution on a failed interface.
	 */
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
		goto out;

	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
	if (!highest_ack_tg(pr_icmp_seq, target))
		goto out;

	/*
	 * Always update the rtt. This is a failure detection probe
	 * and we want to measure both increase / decrease in rtt.
	 */
	pi_set_crtt(target, m, _B_TRUE);

	/*
	 * If the crtt exceeds the average time between probes,
	 * investigate if this slow target is an exception. If so we
	 * can avoid this target and still meet the failure detection
	 * time. Otherwise we can't meet the failure detection time.
	 */
	if (target->tg_crtt > pg->pg_probeint) {
		exception = check_exception_target(pii, target);
		if (exception) {
			/*
			 * This target is exceptionally slow. Don't use it
			 * for future probes. check_exception_target() has
			 * made sure that we have at least MIN_PROBE_TARGETS
			 * other active targets
			 */
			if (pii->pii_targets_are_routers) {
				/*
				 * This is a slow router, mark it as slow
				 * and don't use it for further probes. We
				 * don't delete it, since it will be populated
				 * again when we do a router scan. Hence we
				 * need to maintain extra state (unlike the
				 * host case below).  Mark it as TG_SLOW.
				 */
				if (target->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				target->tg_status = TG_SLOW;
				target->tg_latime = gethrtime();
				target->tg_rtt_sa = -1;
				target->tg_crtt = 0;
				target->tg_rtt_sd = 0;
				if (pii->pii_target_next == target) {
					pii->pii_target_next =
					    target_next(target);
				}
			} else {
				/*
				 * the slow target is not a router, we can
				 * just delete it. Send an icmp multicast and
				 * pick the fastest responder that is not
				 * already an active target. target_delete()
				 * adjusts pii->pii_target_next
				 */
				target_delete(target);
				probe(pii, PROBE_MULTI, cur_hrtime);
			}
		} else {
			/*
			 * We can't meet the failure detection time.
			 * Log a message, and update the detection time to
			 * whatever we can achieve.
			 */
			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
			last_fdt_bumpup_time = gethrtime();
			if (pg != phyint_anongroup) {
				logtrace("Cannot meet requested failure"
				    " detection time of %d ms on (%s %s) new"
				    " failure detection time for group \"%s\""
				    " is %d ms\n", user_failure_detection_time,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name, pg->pg_fdt);
			}
		}
	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
	    (user_failure_detection_time < pg->pg_fdt) &&
	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
		/*
		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
		 * investigate if we can improve the failure detection time to
		 * meet whatever the user specified.
		 */
		if (check_pg_crtt_improved(pg)) {
			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
			    user_failure_detection_time);
			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
			if (pg != phyint_anongroup) {
				logtrace("Improved failure detection time %d ms"
				    " on (%s %s) for group \"%s\"\n",
				    pg->pg_fdt, AF_STR(pii->pii_af),
				    pii->pii_name, pg->pg_name);
			}
			if (user_failure_detection_time == pg->pg_fdt) {
				/* Avoid any truncation or rounding errors */
				pg->pg_probeint = user_probe_interval;
				/*
				 * No more rtt probes will be sent. The actual
				 * fdt has dropped to the user specified value.
				 * pii_fd_snxt_basetime and pii_snxt_basetime
				 * will be in sync henceforth.
				 */
				reset_snxt_basetimes();
			}
		}
	}
out:
	/*
	 * Record the ack. The receive time is expressed on the hrtime
	 * scale by adding the timeval difference between receive and send
	 * to the hrtime at which the probe was sent.
	 */
	pr_statp = &pii->pii_probes[pr_ndx];
	pr_statp->pr_hrtime_ackproc = cur_hrtime;
	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));

	probe_chstate(pr_statp, pii, PR_ACKED);

	/*
	 * Update pii->pii_rack, i.e. the sequence number of the last received
	 * probe response, based on the echo reply we have received now, if
	 * either of the following conditions are satisfied.
	 * a. pii_rack is outside the current receive window of
	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
	 *    This means we have not received probe responses for a
	 *    long time, and the sequence number has wrapped around.
	 * b. pii_rack is within the current receive window and this echo
	 *    reply corresponds to the highest sequence number we have seen
	 *    so far.
	 */
	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
		pii->pii_rack = pr_icmp_seq;
	}
}

/*
 * Returns _B_FALSE if the probe stats of tg's phyint instance record an
 * already acked probe to target tg whose sequence number is greater than
 * seq; returns _B_TRUE otherwise.
 */
static boolean_t
highest_ack_tg(uint16_t seq, struct target *tg)
{
	struct phyint_instance *pii;
	int	 pr_ndx;
	uint16_t pr_seq;

	pii = tg->tg_phyint_inst;

	/*
	 * Get the seq number of the most recent probe sent so far,
	 * and also get the corresponding probe index in the probe stats
	 * array.
	 */
	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	pr_seq = pii->pii_snxt;
	pr_seq--;

	/*
	 * Start from the most recent probe and walk back, trying to find
	 * an acked probe corresponding to target tg. The walk stops when
	 * the index comes back round to pii_probe_next, so that slot itself
	 * is not examined.
	 */
	for (; pr_ndx != pii->pii_probe_next;
	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
		if (pii->pii_probes[pr_ndx].pr_target == tg &&
		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			if (SEQ_GT(pr_seq, seq))
				return (_B_FALSE);
		}
	}
	return (_B_TRUE);
}

/*
 * Check whether the crtt for the group has improved by a factor of
 * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
 * detection time flapping in the face of small crtt changes.
 */
static boolean_t
check_pg_crtt_improved(struct phyint_group *pg)
{
	struct	phyint *pi;

	if (debug & D_PROBE)
		logdebug("check_pg_crtt_improved()\n");

	/*
	 * The crtt for the group is only improved if each phyint_instance
	 * for both ipv4 and ipv6 is improved.
	 */
	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
		if (!check_pii_crtt_improved(pi->pi_v4) ||
		    !check_pii_crtt_improved(pi->pi_v6))
			return (_B_FALSE);
	}

	return (_B_TRUE);
}

/*
 * Check whether the crtt has improved substantially on this phyint_instance.
 * Returns _B_TRUE if there's no crtt information available, because pii
 * is NULL or the phyint_instance is not capable of probing.
*/ boolean_t check_pii_crtt_improved(struct phyint_instance *pii) { struct target *tg; if (pii == NULL) return (_B_TRUE); if (!PROBE_CAPABLE(pii) || pii->pii_phyint->pi_state == PI_FAILED) return (_B_TRUE); for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { if (tg->tg_status != TG_ACTIVE) continue; if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / LOWER_FDT_TRIGGER)) { return (_B_FALSE); } } return (_B_TRUE); } /* * This target responds very slowly to probes. The target's crtt exceeds * the probe interval of its group. Compare against other targets * and determine if this target is an exception, if so return true, else false */ static boolean_t check_exception_target(struct phyint_instance *pii, struct target *target) { struct target *tg; char abuf[INET6_ADDRSTRLEN]; if (debug & D_PROBE) { logdebug("check_exception_target(%s %s target %s)\n", AF_STR(pii->pii_af), pii->pii_name, pr_addr(pii->pii_af, target->tg_address, abuf, sizeof (abuf))); } /* * We should have at least MIN_PROBE_TARGETS + 1 good targets now, * to make a good judgement. Otherwise don't drop this target. */ if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) return (_B_FALSE); /* * Determine whether only this particular target is slow. * We know that this target's crtt exceeds the group's probe interval. * If all other active targets have a * crtt < (this group's probe interval) / EXCEPTION_FACTOR, * then this target is considered slow. */ for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { if (tg != target && tg->tg_status == TG_ACTIVE) { if (tg->tg_crtt > pii->pii_phyint->pi_group->pg_probeint / EXCEPTION_FACTOR) { return (_B_FALSE); } } } return (_B_TRUE); } /* * Update the target list. The icmp all hosts multicast has given us * some host to which we can send probes. If we already have sufficient * targets, discard it. 
*/
static void
incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
/* ARGSUSED */
{
	char addrbuf[INET6_ADDRSTRLEN];
	struct phyint *member;
	int af = pii->pii_af;

	if (debug & D_PROBE) {
		logdebug("incoming_mcast_reply(%s %s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, fromaddr, addrbuf,
		    sizeof (addrbuf)));
	}

	/*
	 * Host targets are only a fallback.  Once a router target is known,
	 * or the instance already tracks MAX_PROBE_TARGETS targets, the
	 * responder is not needed.
	 */
	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
	if (pii->pii_targets != NULL &&
	    (pii->pii_targets_are_routers ||
	    (pii->pii_ntargets == MAX_PROBE_TARGETS))) {
		return;
	}

	/* Responses claiming to come from 0.0.0.0 or :: are only traced. */
	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
		logtrace("probe response from %s on %s\n",
		    pr_addr(pii->pii_af, fromaddr, addrbuf,
		    sizeof (addrbuf)), pii->pii_name);
		return;
	}

	/* One of our own addresses is never a valid probe target. */
	if (own_address(fromaddr))
		return;

	/*
	 * Phyints in the anonymous group may sit on different subnets, so
	 * only the receiving instance learns the target.
	 */
	member = pii->pii_phyint;
	if (member->pi_group == phyint_anongroup) {
		target_add(pii, fromaddr, _B_FALSE);
		return;
	}

	/* In a named group the target is added to every member phyint. */
	for (member = member->pi_group->pg_phyint; member != NULL;
	    member = member->pi_pgnext) {
		target_add(PHYINT_INSTANCE(member, af), fromaddr, _B_FALSE);
	}
}

/*
 * Compute CRTT given an existing scaled average, scaled deviation estimate
 * and a new rtt time.  The formula is from Jacobson and Karels'
 * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
 * are the same as those in Appendix A.2 of that paper.
 *
 * m = new measurement
 * sa = scaled RTT average (8 * average estimates)
 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
* crtt = Conservative round trip time. Used to determine whether probe * has timed out. * * New scaled average and deviation are passed back via sap and svp */ static int64_t compute_crtt(int64_t *sap, int64_t *svp, int64_t m) { int64_t sa = *sap; int64_t sv = *svp; int64_t crtt; int64_t saved_m = m; assert(*sap >= -1); assert(*svp >= 0); if (sa != -1) { /* * Update average estimator: * new rtt = old rtt + 1/8 Error * where Error = m - old rtt * i.e. 8 * new rtt = 8 * old rtt + Error * i.e. new sa = old sa + Error */ m -= sa >> 3; /* m is now Error in estimate. */ if ((sa += m) < 0) { /* Don't allow the smoothed average to be negative. */ sa = 0; } /* * Update deviation estimator: * new mdev = old mdev + 1/4 (abs(Error) - old mdev) * i.e. 4 * new mdev = 4 * old mdev + * (abs(Error) - old mdev) * i.e. new sv = old sv + (abs(Error) - old mdev) */ if (m < 0) m = -m; m -= sv >> 2; sv += m; } else { /* Initialization. This is the first response received. */ sa = (m << 3); sv = (m << 1); } crtt = (sa >> 3) + sv; if (debug & D_PROBE) { logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " "crtt = %lld\n", saved_m, sa, sv, crtt); } *sap = sa; *svp = sv; /* * CRTT = average estimates + 4 * deviation estimates * = sa / 8 + sv */ return (crtt); } static void pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) { struct phyint_instance *pii = tg->tg_phyint_inst; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; int64_t sa = tg->tg_rtt_sa; int64_t sv = tg->tg_rtt_sd; int new_crtt; int i; if (debug & D_PROBE) logdebug("pi_set_crtt: target - m %lld\n", m); /* store the round trip time, in case we need to defer computation */ tg->tg_deferred[tg->tg_num_deferred] = m; new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); /* * If this probe's round trip time would singlehandedly cause an * increase in the group's probe interval consider it suspect. 
*/ if ((new_crtt > probe_interval) && is_probe_uni) { if (debug & D_PROBE) { logdebug("Received a suspect probe on %s, new_crtt =" " %d, probe_interval = %d, num_deferred = %d\n", pii->pii_probe_logint->li_name, new_crtt, probe_interval, tg->tg_num_deferred); } /* * If we've deferred as many rtts as we plan on deferring, then * assume the link really did slow down and process all queued * rtts */ if (tg->tg_num_deferred == MAXDEFERREDRTT) { if (debug & D_PROBE) { logdebug("Received MAXDEFERREDRTT probes which " "would cause an increased probe_interval. " "Integrating queued rtt data points.\n"); } for (i = 0; i <= tg->tg_num_deferred; i++) { tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, &tg->tg_rtt_sd, tg->tg_deferred[i])); } tg->tg_num_deferred = 0; } else { tg->tg_num_deferred++; } return; } /* * If this is a normal probe, or an RTT probe that would lead to a * reduced CRTT, then update our CRTT data. Further, if this was * a normal probe, pitch any deferred probes since our probes are * again being answered within our CRTT estimates. */ if (is_probe_uni || new_crtt < tg->tg_crtt) { tg->tg_rtt_sa = sa; tg->tg_rtt_sd = sv; tg->tg_crtt = new_crtt; if (is_probe_uni) tg->tg_num_deferred = 0; } } /* * Return a pointer to the specified option buffer. * If not found return NULL. */ static void * find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) { struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (cmsg->cmsg_level == cmsg_level && cmsg->cmsg_type == cmsg_type) { return (CMSG_DATA(cmsg)); } } return (NULL); } /* * Try to activate another INACTIVE interface in the same group as `pi'. * Prefer STANDBY INACTIVE to just INACTIVE. 
*/ void phyint_activate_another(struct phyint *pi) { struct phyint *pi2; struct phyint *inactivepi = NULL; if (pi->pi_group == phyint_anongroup) return; for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { if (pi == pi2 || !phyint_is_functioning(pi2) || !(pi2->pi_flags & IFF_INACTIVE)) continue; inactivepi = pi2; if (pi2->pi_flags & IFF_STANDBY) break; } if (inactivepi != NULL) (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); } /* * Transition a phyint to PI_RUNNING. The caller must ensure that the * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if * appropriate. Also sets IFF_INACTIVE on this or other interfaces as * appropriate (see comment below). Finally, also updates the phyint's group * state to account for the change. */ void phyint_transition_to_running(struct phyint *pi) { struct phyint *pi2; struct phyint *actstandbypi = NULL; uint_t nactive = 0, nnonstandby = 0; boolean_t onlining = (pi->pi_state == PI_OFFLINE); boolean_t initial = (pi->pi_state == PI_INIT); uint64_t set, clear; /* * The interface is running again, but should it or another interface * in the group end up INACTIVE? There are three cases: * * 1. If it's a STANDBY interface, it should be end up INACTIVE if * the group is operating at capacity (i.e., there are at least as * many active interfaces as non-STANDBY interfaces in the group). * No other interfaces should be changed. * * 2. If it's a non-STANDBY interface and we're onlining it or * FAILBACK is enabled, then it should *not* end up INACTIVE. * Further, if the group is above capacity as a result of this * interface, then an active STANDBY interface in the group should * end up INACTIVE. * * 3. If it's a non-STANDBY interface, we're repairing it, and * FAILBACK is disabled, then it should end up INACTIVE *unless* * the group was failed (in which case we have no choice but to * use it). No other interfaces should be changed. 
*/ if (pi->pi_group != phyint_anongroup) { pi2 = pi->pi_group->pg_phyint; for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { if (!(pi2->pi_flags & IFF_STANDBY)) nnonstandby++; if (phyint_is_functioning(pi2) && !(pi2->pi_flags & IFF_INACTIVE)) { nactive++; if (pi2->pi_flags & IFF_STANDBY) actstandbypi = pi2; } } } set = 0; clear = (onlining ? IFF_OFFLINE : IFF_FAILED); if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ if (nactive >= nnonstandby) set |= IFF_INACTIVE; else clear |= IFF_INACTIVE; } else if (onlining || failback_enabled) { /* case 2 */ if (nactive >= nnonstandby && actstandbypi != NULL) (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); } else if (!initial && !GROUP_FAILED(pi->pi_group)) { /* case 3 */ set |= IFF_INACTIVE; } (void) change_pif_flags(pi, set, clear); phyint_chstate(pi, PI_RUNNING); /* * Update the group state to account for the change. */ phyint_group_refresh_state(pi->pi_group); } /* * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration * to have at least one active interface and as many active interfaces as * non-standby interfaces. */ void phyint_standby_refresh_inactive(struct phyint *pi) { struct phyint *pi2; uint_t nactive = 0, nnonstandby = 0; /* * All phyints in the anonymous group are effectively in their own * group and thus active regardless of whether they're marked standby. */ if (pi->pi_group == phyint_anongroup) { (void) change_pif_flags(pi, 0, IFF_INACTIVE); return; } /* * If the phyint isn't functioning we can't consider it. */ if (!phyint_is_functioning(pi)) return; for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { if (!(pi2->pi_flags & IFF_STANDBY)) nnonstandby++; if (phyint_is_functioning(pi2) && !(pi2->pi_flags & IFF_INACTIVE)) nactive++; } if (nactive == 0 || nactive < nnonstandby) (void) change_pif_flags(pi, 0, IFF_INACTIVE); else if (nactive > nnonstandby) (void) change_pif_flags(pi, IFF_INACTIVE, 0); } /* * See if a previously failed interface has started working again. 
*/ void phyint_check_for_repair(struct phyint *pi) { if (!phyint_repaired(pi)) return; if (pi->pi_group == phyint_anongroup) { logerr("IP interface repair detected on %s\n", pi->pi_name); } else { logerr("IP interface repair detected on %s of group %s\n", pi->pi_name, pi->pi_group->pg_name); } /* * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. * So just clear IFF_OFFLINE and defer phyint_transition_to_running() * until it is brought back online. */ if (pi->pi_state == PI_OFFLINE) { (void) change_pif_flags(pi, 0, IFF_FAILED); return; } phyint_transition_to_running(pi); /* calls phyint_chstate() */ } /* * See if an interface has failed, or if the whole group of interfaces has * failed. */ static void phyint_inst_check_for_failure(struct phyint_instance *pii) { struct phyint *pi = pii->pii_phyint; struct phyint *pi2; boolean_t was_active; switch (failure_state(pii)) { case PHYINT_FAILURE: was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); if (pi->pi_group == phyint_anongroup) { logerr("IP interface failure detected on %s\n", pii->pii_name); } else { logerr("IP interface failure detected on %s of group" " %s\n", pii->pii_name, pi->pi_group->pg_name); } /* * If the failed interface was active, activate another * INACTIVE interface in the group if possible. */ if (was_active) phyint_activate_another(pi); /* * If the interface is offline, the state change will be * noted when it comes back online. */ if (pi->pi_state != PI_OFFLINE) { phyint_chstate(pi, PI_FAILED); reset_crtt_all(pi); } break; case GROUP_FAILURE: pi2 = pi->pi_group->pg_phyint; for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); if (pi2->pi_state == PI_OFFLINE) /* see comment above */ continue; reset_crtt_all(pi2); /* * In the case of host targets, we would have flushed * the targets, and gone to PI_NOTARGETS state. 
*/ if (pi2->pi_state == PI_RUNNING) phyint_chstate(pi2, PI_FAILED); } break; default: break; } } /* * Determines if any timeout event has occurred and returns the number of * milliseconds until the next timeout event for the phyint. Returns * TIMER_INFINITY for "never". */ uint_t phyint_inst_timer(struct phyint_instance *pii) { int pr_ndx; uint_t timeout; struct target *cur_tg; struct probe_stats *pr_statp; struct phyint_instance *pii_other; struct phyint *pi; int valid_unack_count; int i; int interval; uint_t check_time; uint_t cur_time; hrtime_t cur_hrtime; int probe_interval = pii->pii_phyint->pi_group->pg_probeint; cur_hrtime = gethrtime(); cur_time = ns2ms(cur_hrtime); if (debug & D_TIMER) { logdebug("phyint_inst_timer(%s %s)\n", AF_STR(pii->pii_af), pii->pii_name); } pii_other = phyint_inst_other(pii); if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { /* * Check to see if we're here due to link up/down flapping; If * enough time has passed, then try to bring the interface * back up; otherwise, schedule a timer to bring it back up * when enough time *has* elapsed. */ pi = pii->pii_phyint; if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; if (check_time > cur_time) return (check_time - cur_time); phyint_check_for_repair(pi); } } /* * If probing is not enabled on this phyint instance, don't proceed. */ if (!PROBE_ENABLED(pii)) return (TIMER_INFINITY); /* * If the timer has fired too soon, probably triggered * by some other phyint instance, return the remaining * time */ if (TIME_LT(cur_time, pii->pii_snxt_time)) return (pii->pii_snxt_time - cur_time); /* * If the link is down, don't send any probes for now. */ if (LINK_DOWN(pii->pii_phyint)) return (TIMER_INFINITY); /* * Randomize the next probe time, between MIN_RANDOM_FACTOR * and MAX_RANDOM_FACTOR with respect to the base probe time. * Base probe time is strictly periodic. 
*/ interval = GET_RANDOM( (int)(MIN_RANDOM_FACTOR * user_probe_interval), (int)(MAX_RANDOM_FACTOR * user_probe_interval)); pii->pii_snxt_time = pii->pii_snxt_basetime + interval; /* * Check if the current time > next time to probe. If so, we missed * sending 1 or more probes, probably due to heavy system load. At least * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we * were scheduled. Make adjustments to the times, in multiples of * user_probe_interval. */ if (TIME_GT(cur_time, pii->pii_snxt_time)) { int n; n = (cur_time - pii->pii_snxt_time) / user_probe_interval; pii->pii_snxt_time += (n + 1) * user_probe_interval; pii->pii_snxt_basetime += (n + 1) * user_probe_interval; logtrace("missed sending %d probes cur_time %u snxt_time %u" " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, pii->pii_snxt_basetime); /* Collect statistics about missed probes */ probes_missed.pm_nprobes += n + 1; probes_missed.pm_ntimes++; } pii->pii_snxt_basetime += user_probe_interval; interval = pii->pii_snxt_time - cur_time; if (debug & D_TARGET) { logdebug("cur_time %u snxt_time %u snxt_basetime %u" " interval %u\n", cur_time, pii->pii_snxt_time, pii->pii_snxt_basetime, interval); } /* * If no targets are known, we need to send an ICMP multicast. The * probe type is PROBE_MULTI. We'll check back in 'interval' msec * to see if we found a target. */ if (pii->pii_target_next == NULL) { assert(pii->pii_ntargets == 0); pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; probe(pii, PROBE_MULTI, cur_time); return (interval); } if ((user_probe_interval != probe_interval) && TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { /* * the failure detection (fd) probe timer has not yet fired. * Need to send only an rtt probe. The probe type is PROBE_RTT. */ probe(pii, PROBE_RTT, cur_hrtime); return (interval); } /* * the fd probe timer has fired. Need to do all failure * detection / recovery calculations, and then send an fd probe * of type PROBE_UNI. 
*/ if (user_probe_interval == probe_interval) { /* * We could have missed some probes, and then adjusted * pii_snxt_basetime above. Otherwise we could have * blindly added probe_interval to pii_fd_snxt_basetime. */ pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; } else { pii->pii_fd_snxt_basetime += probe_interval; if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { int n; n = (cur_time - pii->pii_fd_snxt_basetime) / probe_interval; pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; } } /* * We can have at most, the latest 2 probes that we sent, in * the PR_UNACKED state. All previous probes sent, are either * PR_LOST or PR_ACKED. An unacknowledged probe is considered * timed out if the probe's time_start + the CRTT < currenttime. * For each of the last 2 probes, examine whether it has timed * out. If so, mark it PR_LOST. The probe stats is a circular array. */ pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); valid_unack_count = 0; for (i = 0; i < 2; i++) { pr_statp = &pii->pii_probes[pr_ndx]; cur_tg = pii->pii_probes[pr_ndx].pr_target; switch (pr_statp->pr_status) { case PR_ACKED: /* * We received back an ACK, so the switch clearly * is not dropping our traffic, and thus we can * enable failure detection immediately. */ if (pii->pii_fd_hrtime > gethrtime()) { if (debug & D_PROBE) { logdebug("successful probe on %s; " "ending quiet period\n", pii->pii_phyint->pi_name); } pii->pii_fd_hrtime = gethrtime(); } break; case PR_UNACKED: assert(cur_tg != NULL); /* * The crtt could be zero for some reason, * Eg. the phyint could be failed. If the crtt is * not available use group's probe interval, * which is a worst case estimate. 
*/ timeout = ns2ms(pr_statp->pr_hrtime_start); if (cur_tg->tg_crtt != 0) { timeout += cur_tg->tg_crtt; } else { timeout += probe_interval; } if (TIME_LT(timeout, cur_time)) { pr_statp->pr_time_lost = timeout; probe_chstate(pr_statp, pii, PR_LOST); } else if (i == 1) { /* * We are forced to consider this probe * lost, as we can have at most 2 unack. * probes any time, and we will be sending a * probe at the end of this function. * Normally, we should not be here, but * this can happen if an incoming response * that was considered lost has increased * the crtt for this target, and also bumped * up the FDT. Note that we never cancel or * increase the current pii_time_left, so * when the timer fires, we find 2 valid * unacked probes, and they are yet to timeout */ pr_statp->pr_time_lost = cur_time; probe_chstate(pr_statp, pii, PR_LOST); } else { /* * Only the most recent probe can enter * this 'else' arm. The second most recent * probe must take either of the above arms, * if it is unacked. */ valid_unack_count++; } break; } pr_ndx = PROBE_INDEX_PREV(pr_ndx); } /* * We send out 1 probe randomly in the interval between one half * and one probe interval for the group. Given that the CRTT is always * less than the group's probe interval, we can have at most 1 * unacknowledged probe now. All previous probes are either lost or * acked. */ assert(valid_unack_count == 0 || valid_unack_count == 1); /* * The timer has fired. Take appropriate action depending * on the current state of the phyint. * * PI_RUNNING state - Failure detection * PI_FAILED state - Repair detection */ switch (pii->pii_phyint->pi_state) { case PI_FAILED: /* * If the most recent probe (excluding unacked probes that * are yet to time out) has been acked, check whether the * phyint is now repaired. 
*/ if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { phyint_check_for_repair(pii->pii_phyint); } break; case PI_RUNNING: /* * It's possible our probes have been lost because of a * spanning-tree mandated quiet period on the switch. If so, * ignore the lost probes. */ if (pii->pii_fd_hrtime - cur_hrtime > 0) break; if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { /* * We have 1 or more failed probes (excluding unacked * probes that are yet to time out). Determine if the * phyint has failed. */ phyint_inst_check_for_failure(pii); } break; default: logerr("phyint_inst_timer: invalid state %d\n", pii->pii_phyint->pi_state); abort(); } /* * Start the next probe. probe() will also set pii->pii_probe_time_left * to the group's probe interval. If phyint_failed -> target_flush_hosts * was called, the target list may be empty. */ if (pii->pii_target_next != NULL) { probe(pii, PROBE_UNI, cur_hrtime); /* * If we have just the one probe target, and we're not using * router targets, try to find another as we presently have * no resilience. */ if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) probe(pii, PROBE_MULTI, cur_hrtime); } else { probe(pii, PROBE_MULTI, cur_hrtime); } return (interval); } /* * Start the probe timer for an interface instance. */ void start_timer(struct phyint_instance *pii) { uint32_t interval; /* * Spread the base probe times (pi_snxt_basetime) across phyints * uniformly over the (curtime..curtime + the group's probe_interval). * pi_snxt_basetime is strictly periodic with a frequency of * the group's probe interval. The actual probe time pi_snxt_time * adds some randomness to pi_snxt_basetime and happens in probe(). * For the 1st probe on each phyint after the timer is started, * pi_snxt_time and pi_snxt_basetime are the same. 
*/ interval = GET_RANDOM(0, (int)pii->pii_phyint->pi_group->pg_probeint); pii->pii_snxt_basetime = getcurrenttime() + interval; pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; pii->pii_snxt_time = pii->pii_snxt_basetime; timer_schedule(interval); } /* * Restart the probe timer on an interface instance. */ static void restart_timer(struct phyint_instance *pii) { /* * We don't need to restart the timer if it was never started in * the first place (pii->pii_basetime_inited not set), as the timer * won't have gone off yet. */ if (pii->pii_basetime_inited != 0) { if (debug & D_LINKNOTE) logdebug("restart timer: restarting timer on %s, " "address family %s\n", pii->pii_phyint->pi_name, AF_STR(pii->pii_af)); start_timer(pii); } } static void process_link_state_down(struct phyint *pi) { logerr("The link has gone down on %s\n", pi->pi_name); /* * Clear the probe statistics arrays, we don't want the repair * detection logic relying on probes that were successful prior * to the link going down. */ if (PROBE_CAPABLE(pi->pi_v4)) clear_pii_probe_stats(pi->pi_v4); if (PROBE_CAPABLE(pi->pi_v6)) clear_pii_probe_stats(pi->pi_v6); /* * Check for interface failure. Although we know the interface * has failed, we don't know if all the other interfaces in the * group have failed as well. */ if ((pi->pi_state == PI_RUNNING) || (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { if (debug & D_LINKNOTE) { logdebug("process_link_state_down:" " checking for failure on %s\n", pi->pi_name); } if (pi->pi_v4 != NULL) phyint_inst_check_for_failure(pi->pi_v4); else if (pi->pi_v6 != NULL) phyint_inst_check_for_failure(pi->pi_v6); } } static void process_link_state_up(struct phyint *pi) { logerr("The link has come up on %s\n", pi->pi_name); /* * We stopped any running timers on each instance when the link * went down, so restart them. 
*/ if (pi->pi_v4) restart_timer(pi->pi_v4); if (pi->pi_v6) restart_timer(pi->pi_v6); phyint_check_for_repair(pi); pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); if (pi->pi_whendx == LINK_UP_PERMIN) pi->pi_whendx = 0; } /* * Process any changes in link state passed up from the interfaces. */ void process_link_state_changes(void) { struct phyint *pi; /* Look for interfaces where the link state has just changed */ for (pi = phyints; pi != NULL; pi = pi->pi_next) { boolean_t old_link_state_up = LINK_UP(pi); /* * Except when the "phyint" structure is created, this is * the only place the link state is updated. This allows * this routine to detect changes in link state, rather * than just the current state. */ UPDATE_LINK_STATE(pi); if (LINK_DOWN(pi)) { /* * Has link just gone down? */ if (old_link_state_up) process_link_state_down(pi); } else { /* * Has link just gone back up? */ if (!old_link_state_up) process_link_state_up(pi); } } } void reset_crtt_all(struct phyint *pi) { struct phyint_instance *pii; struct target *tg; pii = pi->pi_v4; if (pii != NULL) { for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { tg->tg_crtt = 0; tg->tg_rtt_sa = -1; tg->tg_rtt_sd = 0; } } pii = pi->pi_v6; if (pii != NULL) { for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { tg->tg_crtt = 0; tg->tg_rtt_sa = -1; tg->tg_rtt_sd = 0; } } } /* * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive * probes on both instances IPv4 and IPv6. * If the interface has failed, return the time of the first probe failure * in "tff". */ static int phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) { uint_t pi_tff; struct target *cur_tg; struct probe_fail_count pfinfo; struct phyint_instance *pii_other; int pr_ndx; /* * Get the number of consecutive failed probes on * this phyint across all targets. 
Also get the number * of consecutive failed probes on this target only */ pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); cur_tg = pii->pii_probes[pr_ndx].pr_target; probe_fail_info(pii, cur_tg, &pfinfo); /* Get the time of first failure, for later use */ pi_tff = pfinfo.pf_tff; /* * If the current target has not responded to the * last NUM_PROBE_FAILS probes, and other targets are * responding delete this target. Dead gateway detection * will eventually remove this target (if router) from the * routing tables. If that does not occur, we may end * up adding this to our list again. */ if (pfinfo.pf_nfail < NUM_PROBE_FAILS && pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { if (pii->pii_targets_are_routers) { if (cur_tg->tg_status == TG_ACTIVE) pii->pii_ntargets--; cur_tg->tg_status = TG_DEAD; cur_tg->tg_crtt = 0; cur_tg->tg_rtt_sa = -1; cur_tg->tg_rtt_sd = 0; if (pii->pii_target_next == cur_tg) pii->pii_target_next = target_next(cur_tg); } else { target_delete(cur_tg); probe(pii, PROBE_MULTI, gethrtime()); } return (PHYINT_OK); } /* * If the phyint has lost NUM_PROBE_FAILS or more * consecutive probes, on both IPv4 and IPv6 protocol * instances of the phyint, then trigger failure * detection, else return false */ if (pfinfo.pf_nfail < NUM_PROBE_FAILS) return (PHYINT_OK); pii_other = phyint_inst_other(pii); if (PROBE_CAPABLE(pii_other)) { probe_fail_info(pii_other, NULL, &pfinfo); if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { /* * We have NUM_PROBE_FAILS or more failures * on both IPv4 and IPv6. Get the earliest * time when failure was detected on this * phyint across IPv4 and IPv6. */ if (TIME_LT(pfinfo.pf_tff, pi_tff)) pi_tff = pfinfo.pf_tff; } else { /* * This instance has < NUM_PROBE_FAILS failure. * So return false */ return (PHYINT_OK); } } *tff = pi_tff; return (PHYINT_FAILURE); } /* * Check if the link has gone down on this phyint, or it has failed the * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 
* Also look at other phyints of this group, for group failures.
 *
 * Returns PHYINT_OK when the link is up and probing reports no failure,
 * PHYINT_FAILURE when only this phyint is considered failed, and
 * GROUP_FAILURE when no other phyint of the group shows evidence of still
 * working.  The group state is refreshed before either failure verdict for
 * a named group is returned.
 */
int
failure_state(struct phyint_instance *pii)
{
	struct probe_success_count psinfo;
	uint_t	pi2_tls;		/* time last success */
	uint_t	pi_tff;			/* time first fail */
	struct phyint *pi2;
	struct phyint *pi;
	struct phyint_instance *pii2;
	struct phyint_group *pg;
	int retval;

	if (debug & D_FAILREP)
		logdebug("phyint_failed(%s)\n", pii->pii_name);

	pi = pii->pii_phyint;
	pg = pi->pi_group;

	/*
	 * pi_tff is only assigned when the probe check below actually runs,
	 * i.e. when the link is up.  With the link down it stays unset and
	 * the group walk further down never reads it.
	 */
	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
	    PHYINT_OK)
		return (PHYINT_OK);

	/*
	 * At this point, the link is down, or the phyint is suspect, as it
	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
	 * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
	 * on to determine whether this should be considered a PHYINT_FAILURE
	 * or GROUP_FAILURE.
	 */
	if (pg == phyint_anongroup)
		return (PHYINT_FAILURE);

	/*
	 * Need to compare against other phyints of the same group
	 * to exclude group failures. If the failure was detected via
	 * probing, then if the time of last success (tls) of any
	 * phyint is more recent than the time of first fail (tff) of the
	 * phyint in question, and the link is up on the phyint,
	 * then it is a phyint failure. Otherwise it is a group failure.
	 * If failure was detected via a link down notification sent from
	 * the driver to IP, we see if any phyints in the group are still
	 * running and haven't received a link down notification.  We
	 * will usually be processing the link down notification shortly
	 * after it was received, so there is no point looking at the tls
	 * of other phyints.
	 */
	/* Assume the worst; any working peer found below downgrades this. */
	retval = GROUP_FAILURE;
	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
		/* Exclude ourself from comparison */
		if (pi2 == pi)
			continue;

		if (LINK_DOWN(pi)) {
			/*
			 * We use FLAGS_TO_LINK_STATE() to test the flags
			 * directly, rather than LINK_UP() or LINK_DOWN(), as
			 * we may not have got round to processing the link
			 * state for the other phyints in the group yet.
			 *
			 * The check for PI_RUNNING and group failure handles
			 * the case when the group begins to recover.
			 * PI_RUNNING will be set, and group failure cleared
			 * only after receipt of NUM_PROBE_REPAIRS, by which
			 * time the other phyints should have received at
			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
			 */
			if ((pi2->pi_state == PI_RUNNING) &&
			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
				retval = PHYINT_FAILURE;
				break;
			}
			/* Link-down case never consults probe history. */
			continue;
		}

		/* A peer whose own link is down proves nothing. */
		if (LINK_DOWN(pi2))
			continue;

		/*
		 * If there's no probe-based failure detection on this
		 * interface, and its link is still up, then it's still
		 * working and thus the group has not failed.
		 */
		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
			retval = PHYINT_FAILURE;
			break;
		}

		/*
		 * Need to compare against both IPv4 and IPv6 instances.
		 */
		pii2 = pi2->pi_v4;
		if (pii2 != NULL) {
			probe_success_info(pii2, NULL, &psinfo);
			if (psinfo.ps_tls_valid) {
				pi2_tls = psinfo.ps_tls;
				/*
				 * See comment above regarding check
				 * for PI_RUNNING and group failure.
				 */
				if (TIME_GT(pi2_tls, pi_tff) &&
				    (pi2->pi_state == PI_RUNNING) &&
				    !GROUP_FAILED(pg) &&
				    FLAGS_TO_LINK_STATE(pi2)) {
					retval = PHYINT_FAILURE;
					break;
				}
			}
		}

		/* Same test on the IPv6 instance; skipped if IPv4 broke out. */
		pii2 = pi2->pi_v6;
		if (pii2 != NULL) {
			probe_success_info(pii2, NULL, &psinfo);
			if (psinfo.ps_tls_valid) {
				pi2_tls = psinfo.ps_tls;
				/*
				 * See comment above regarding check
				 * for PI_RUNNING and group failure.
				 */
				if (TIME_GT(pi2_tls, pi_tff) &&
				    (pi2->pi_state == PI_RUNNING) &&
				    !GROUP_FAILED(pg) &&
				    FLAGS_TO_LINK_STATE(pi2)) {
					retval = PHYINT_FAILURE;
					break;
				}
			}
		}
	}

	/*
	 * Update the group state to account for the changes.
	 */
	phyint_group_refresh_state(pg);
	return (retval);
}

/*
 * Return the information associated with consecutive probe successes
 * starting with the most recent probe. At most the last 2 probes can be
 * in the unacknowledged state. All previous probes have either failed
 * or succeeded.
*/ static void probe_success_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_success_count *psinfo) { uint_t i; struct probe_stats *pr_statp; uint_t most_recent; uint_t second_most_recent; boolean_t pi_found_failure = _B_FALSE; boolean_t tg_found_failure = _B_FALSE; uint_t now; uint_t timeout; struct target *tg; if (debug & D_FAILREP) logdebug("probe_success_info(%s)\n", pii->pii_name); bzero(psinfo, sizeof (*psinfo)); now = getcurrenttime(); /* * Start with the most recent probe, and count the number * of consecutive probe successes. Latch the number of successes * on hitting a failure. */ most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); second_most_recent = PROBE_INDEX_PREV(most_recent); for (i = most_recent; i != pii->pii_probe_next; i = PROBE_INDEX_PREV(i)) { pr_statp = &pii->pii_probes[i]; switch (pr_statp->pr_status) { case PR_UNACKED: /* * Only the most recent 2 probes can be unacknowledged */ assert(i == most_recent || i == second_most_recent); tg = pr_statp->pr_target; assert(tg != NULL); /* * The crtt could be zero for some reason, * Eg. the phyint could be failed. If the crtt is * not available use the value of the group's probe * interval which is a worst case estimate. */ timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { timeout += tg->tg_crtt; } else { timeout += pii->pii_phyint->pi_group->pg_probeint; } if (TIME_LT(timeout, now)) { /* * We hit a failure. Latch the total number of * recent consecutive successes. */ pr_statp->pr_time_lost = timeout; probe_chstate(pr_statp, pii, PR_LOST); pi_found_failure = _B_TRUE; if (cur_tg != NULL && tg == cur_tg) { /* * We hit a failure for the desired * target. Latch the number of recent * consecutive successes for this target */ tg_found_failure = _B_TRUE; } } break; case PR_ACKED: /* * Bump up the count of probe successes, if we * have not seen any failure so far. 
*/ if (!pi_found_failure) psinfo->ps_nsucc++; if (cur_tg != NULL && pr_statp->pr_target == cur_tg && !tg_found_failure) { psinfo->ps_nsucc_tg++; } /* * Record the time of last success, if this is * the most recent probe success. */ if (!psinfo->ps_tls_valid) { psinfo->ps_tls = ns2ms(pr_statp->pr_hrtime_ackproc); psinfo->ps_tls_valid = _B_TRUE; } break; case PR_LOST: /* * We hit a failure. Latch the total number of * recent consecutive successes. */ pi_found_failure = _B_TRUE; if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { /* * We hit a failure for the desired target. * Latch the number of recent consecutive * successes for this target */ tg_found_failure = _B_TRUE; } break; default: return; } } } /* * Return the information associated with consecutive probe failures * starting with the most recent probe. Only the last 2 probes can be in the * unacknowledged state. All previous probes have either failed or succeeded. */ static void probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, struct probe_fail_count *pfinfo) { int i; struct probe_stats *pr_statp; boolean_t tg_found_success = _B_FALSE; boolean_t pi_found_success = _B_FALSE; int most_recent; int second_most_recent; uint_t now; uint_t timeout; struct target *tg; if (debug & D_FAILREP) logdebug("probe_fail_info(%s)\n", pii->pii_name); bzero(pfinfo, sizeof (*pfinfo)); now = getcurrenttime(); /* * Start with the most recent probe, and count the number * of consecutive probe failures. Latch the number of failures * on hitting a probe success. 
*/ most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); second_most_recent = PROBE_INDEX_PREV(most_recent); for (i = most_recent; i != pii->pii_probe_next; i = PROBE_INDEX_PREV(i)) { pr_statp = &pii->pii_probes[i]; assert(PR_STATUS_VALID(pr_statp->pr_status)); switch (pr_statp->pr_status) { case PR_UNACKED: /* * Only the most recent 2 probes can be unacknowledged */ assert(i == most_recent || i == second_most_recent); tg = pr_statp->pr_target; /* * Target is guaranteed to exist in the unack. state */ assert(tg != NULL); /* * The crtt could be zero for some reason, * Eg. the phyint could be failed. If the crtt is * not available use the group's probe interval, * which is a worst case estimate. */ timeout = ns2ms(pr_statp->pr_hrtime_start); if (tg->tg_crtt != 0) { timeout += tg->tg_crtt; } else { timeout += pii->pii_phyint->pi_group->pg_probeint; } if (TIME_GT(timeout, now)) break; pr_statp->pr_time_lost = timeout; probe_chstate(pr_statp, pii, PR_LOST); /* FALLTHRU */ case PR_LOST: if (!pi_found_success) { pfinfo->pf_nfail++; pfinfo->pf_tff = pr_statp->pr_time_lost; } if (cur_tg != NULL && pr_statp->pr_target == cur_tg && !tg_found_success) { pfinfo->pf_nfail_tg++; } break; default: /* * We hit a success or unused slot. Latch the * total number of recent consecutive failures. */ pi_found_success = _B_TRUE; if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { /* * We hit a success for the desired target. * Latch the number of recent consecutive * failures for this target */ tg_found_success = _B_TRUE; } } } } /* * Change the state of probe `pr' on phyint_instance `pii' to state `state'. */ void probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) { if (pr->pr_status == state) return; pr->pr_status = state; (void) probe_state_event(pr, pii); } /* * Check if the phyint has been repaired. If no test address has been * configured, then consider the interface repaired if the link is up (unless * the link is flapping; see below). 
Otherwise, look for proof of probes
 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
 * either IPv4 or IPv6 instance, the phyint can be considered repaired.
 */
static boolean_t
phyint_repaired(struct phyint *pi)
{
	struct probe_success_count psinfo;
	struct phyint_instance *pii;
	struct target *cur_tg;
	uint_t cur_time;
	int pr_ndx;
	int pass;

	if (debug & D_FAILREP)
		logdebug("phyint_repaired(%s)\n", pi->pi_name);

	/* A phyint whose link is down is never repaired. */
	if (LINK_DOWN(pi))
		return (_B_FALSE);

	/*
	 * With no test addresses the link state is all we have.  The link is
	 * up, so call the interface repaired -- unless we've received more
	 * than LINK_UP_PERMIN link up notifications in the last minute, in
	 * which case we keep it down until we drop back below the threshold.
	 */
	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
		cur_time = getcurrenttime();

		if (pi->pi_whenup[pi->pi_whendx] != 0 &&
		    (cur_time - pi->pi_whenup[pi->pi_whendx]) <= MSEC_PERMIN) {
			/* Flapping; complain once per flapping episode. */
			if (!pi->pi_lfmsg_printed) {
				logerr("The link has come up on %s more than %d times "
				    "in the last minute; disabling repair until it "
				    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
				pi->pi_lfmsg_printed = 1;
			}
			return (_B_FALSE);
		}

		/* Stable: re-arm the message and report the repair. */
		pi->pi_lfmsg_printed = 0;
		return (_B_TRUE);
	}

	/*
	 * Probe-based detection: examine the IPv4 instance first, then the
	 * IPv6 instance.  Either one showing NUM_PROBE_REPAIRS consecutive
	 * successes -- overall, or to the target of its most recent probe --
	 * is sufficient.
	 */
	for (pass = 0; pass < 2; pass++) {
		pii = (pass == 0) ? pi->pi_v4 : pi->pi_v6;

		if (PROBE_CAPABLE(pii)) {
			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			cur_tg = pii->pii_probes[pr_ndx].pr_target;
			probe_success_info(pii, cur_tg, &psinfo);

			if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
			    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) {
				return (_B_TRUE);
			}
		}
	}

	return (_B_FALSE);
}

/*
 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
*/ boolean_t change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) { int ifsock; struct lifreq lifr; uint64_t old_flags; if (debug & D_FAILREP) { logdebug("change_pif_flags(%s): set %llx clear %llx\n", pi->pi_name, set, clear); } if (pi->pi_v4 != NULL) ifsock = ifsock_v4; else ifsock = ifsock_v6; /* * Get the current flags from the kernel, and set/clear the * desired phyint flags. Since we set only phyint flags, we can * do it on either IPv4 or IPv6 instance. */ (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) logperror("change_pif_flags: ioctl (get flags)"); return (_B_FALSE); } old_flags = lifr.lifr_flags; lifr.lifr_flags |= set; lifr.lifr_flags &= ~clear; if (old_flags == lifr.lifr_flags) { /* No change in the flags. No need to send ioctl */ return (_B_TRUE); } if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { if (errno != ENXIO) logperror("change_pif_flags: ioctl (set flags)"); return (_B_FALSE); } /* * Keep pi_flags in synch. with actual flags. Assumes flags are * phyint flags. */ pi->pi_flags |= set; pi->pi_flags &= ~clear; if (pi->pi_v4 != NULL) pi->pi_v4->pii_flags = pi->pi_flags; if (pi->pi_v6 != NULL) pi->pi_v6->pii_flags = pi->pi_flags; return (_B_TRUE); } /* * icmp cksum computation for IPv4. */ static int in_cksum(ushort_t *addr, int len) { register int nleft = len; register ushort_t *w = addr; register ushort_t answer; ushort_t odd_byte = 0; register int sum = 0; /* * Our algorithm is simple, using a 32 bit accumulator (sum), * we add sequential 16 bit words to it, and at the end, fold * back all the carry bits from the top 16 bits into the lower * 16 bits. 
*/ while (nleft > 1) { sum += *w++; nleft -= 2; } /* mop up an odd byte, if necessary */ if (nleft == 1) { *(uchar_t *)(&odd_byte) = *(uchar_t *)w; sum += odd_byte; } /* * add back carry outs from top 16 bits to low 16 bits */ sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ sum += (sum >> 16); /* add carry */ answer = ~sum; /* truncate to 16 bits */ return (answer); } static void reset_snxt_basetimes(void) { struct phyint_instance *pii; for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; } } /* * Is the address one of our own addresses? Unfortunately, * we cannot check our phyint tables to determine if the address * is our own. This is because, we don't track interfaces that * are not part of any group. We have to either use a 'bind' or * get the complete list of all interfaces using SIOCGLIFCONF, * to do this check. We could also use SIOCTMYADDR. * Bind fails for the local zone address, so we might include local zone * address as target address. If local zone address is a target address * and it is up, it is not possible to detect the interface failure. * SIOCTMYADDR also doesn't consider local zone address as own address. * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they * are stored in `localaddrs' */ boolean_t own_address(struct in6_addr addr) { addrlist_t *addrp; struct sockaddr_storage ss; int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6; addr2storage(af, &addr, &ss); for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { if (sockaddrcmp(&ss, &addrp->al_addr)) return (_B_TRUE); } return (_B_FALSE); } static int ns2ms(int64_t ns) { return (ns / (NANOSEC / MILLISEC)); } static int64_t tv2ns(struct timeval *tvp) { return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); }