/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1987 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#include "mpd_defs.h"
#include "mpd_tables.h"

/*
 * Probe types for probe(). The value is carried in the pr_icmp_mtype
 * field of the probe packet (converted with htonl() by probe()) and is
 * compared against these constants when a reply is received.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */

/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
 *
 * The first four members (type, code, checksum, id, seq) occupy the first
 * 8 bytes; the timestamp and message type follow as echo payload.
 *
 * NOTE: on ABIs that align uint64_t to 8 bytes the structure carries
 * trailing padding after pr_icmp_mtype. The whole structure
 * (sizeof (struct pr_icmp)) is what gets transmitted and checksummed, so
 * senders must zero the structure before filling it in.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t pr_icmp_mtype;		/* Message type */
};

507c478bd9Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
517c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0,
527c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0,
537c478bd9Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x1 } };
547c478bd9Sstevel@tonic-gate
557c478bd9Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
567c478bd9Sstevel@tonic-gate
577c478bd9Sstevel@tonic-gate static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */
587c478bd9Sstevel@tonic-gate
59e11c3f44Smeem static void *find_ancillary(struct msghdr *msg, int cmsg_level,
60e11c3f44Smeem int cmsg_type);
61e11c3f44Smeem static void pi_set_crtt(struct target *tg, int64_t m,
627c478bd9Sstevel@tonic-gate boolean_t is_probe_uni);
637c478bd9Sstevel@tonic-gate static void incoming_echo_reply(struct phyint_instance *pii,
64e11c3f44Smeem struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
657c478bd9Sstevel@tonic-gate static void incoming_rtt_reply(struct phyint_instance *pii,
667c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr);
677c478bd9Sstevel@tonic-gate static void incoming_mcast_reply(struct phyint_instance *pii,
687c478bd9Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr);
697c478bd9Sstevel@tonic-gate
707c478bd9Sstevel@tonic-gate static boolean_t check_pg_crtt_improved(struct phyint_group *pg);
717c478bd9Sstevel@tonic-gate static boolean_t check_pii_crtt_improved(struct phyint_instance *pii);
727c478bd9Sstevel@tonic-gate static boolean_t check_exception_target(struct phyint_instance *pii,
737c478bd9Sstevel@tonic-gate struct target *target);
747c478bd9Sstevel@tonic-gate static void probe_fail_info(struct phyint_instance *pii,
757c478bd9Sstevel@tonic-gate struct target *cur_tg, struct probe_fail_count *pfinfo);
767c478bd9Sstevel@tonic-gate static void probe_success_info(struct phyint_instance *pii,
777c478bd9Sstevel@tonic-gate struct target *cur_tg, struct probe_success_count *psinfo);
787c478bd9Sstevel@tonic-gate static boolean_t phyint_repaired(struct phyint *pi);
797c478bd9Sstevel@tonic-gate
807c478bd9Sstevel@tonic-gate static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
817c478bd9Sstevel@tonic-gate static int in_cksum(ushort_t *addr, int len);
827c478bd9Sstevel@tonic-gate static void reset_snxt_basetimes(void);
83e11c3f44Smeem static int ns2ms(int64_t ns);
84e11c3f44Smeem static int64_t tv2ns(struct timeval *);
857c478bd9Sstevel@tonic-gate
/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 3-tuple <pi_state, pg_state, I>.
 *
 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
 * IFF_OFFLINE is set. If the phyint is also configured with a test address
 * (the common case) and probe targets, then a phyint must also successfully
 * be able to send and receive probes in order to remain in the PI_RUNNING
 * state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not mark the interface FAILED, but reports it as
 * IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state. When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state - PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED.
 *	PG_OK: All interfaces in the group are OK.
 *	PG_DEGRADED: Some interfaces in the group are unusable.
 *	PG_FAILED: All interfaces in the group are unusable.
 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
 *
 * I - value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: This phyint will not send or receive packets.
 *	Usually, inactive is tied to standby interfaces that are not yet
 *	needed (e.g., no non-standby interfaces in the group have failed).
 *	When failback has been disabled (FAILBACK=no configured), phyint can
 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 *	subsequently recovers after a failure.
 *
 * Not all 9 possible combinations of the above 3-tuple are possible.
 *
 * I is tracked by IP. pi_state is tracked by mpathd.
 *
 *			pi_state state machine
 * ---------------------------------------------------------------------------
 * Event                 State                   New State
 *                       Action:
 * ---------------------------------------------------------------------------
 * IP interface failure  (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
 * detection             : set IFF_FAILED on this phyint
 *
 * IP interface failure  (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
 * detection             : set IFF_FAILED on this phyint
 *
 * IP interface repair   (PI_FAILED, I == 0, FAILBACK=yes)
 * detection                                  -> (PI_RUNNING, I == 0)
 *                       : clear IFF_FAILED on this phyint
 *
 * IP interface repair   (PI_FAILED, I == 0, FAILBACK=no)
 * detection                                  -> (PI_RUNNING, I == 1)
 *                       : clear IFF_FAILED on this phyint
 *                       : if failback is disabled set I == 1
 *
 * Group failure         (perform on all phyints in the group)
 * detection             PI_RUNNING              PI_FAILED
 * (Router targets)      : set IFF_FAILED
 *
 * Group failure         (perform on all phyints in the group)
 * detection             PI_RUNNING              PI_NOTARGETS
 * (Host targets)        : set IFF_FAILED
 *                       : delete the target list on all phyints
 * ---------------------------------------------------------------------------
 */

1857c478bd9Sstevel@tonic-gate struct probes_missed probes_missed;
1867c478bd9Sstevel@tonic-gate
1877c478bd9Sstevel@tonic-gate /*
1887c478bd9Sstevel@tonic-gate * Compose and transmit an ICMP ECHO REQUEST packet. The IP header
1897c478bd9Sstevel@tonic-gate * will be added on by the kernel. The id field identifies this phyint.
1907c478bd9Sstevel@tonic-gate * and the sequence number is an increasing (modulo 2^^16) integer. The data
1917c478bd9Sstevel@tonic-gate * portion holds the time value when the packet is sent. On echo this is
1927c478bd9Sstevel@tonic-gate * extracted to compute the round-trip time. Three different types of
1937c478bd9Sstevel@tonic-gate * probe packets are used.
1947c478bd9Sstevel@tonic-gate *
1957c478bd9Sstevel@tonic-gate * PROBE_UNI: This type is used to do failure detection / failure recovery
1967c478bd9Sstevel@tonic-gate * and RTT calculation. PROBE_UNI probes are spaced apart in time,
1977c478bd9Sstevel@tonic-gate * not less than the current CRTT. pii_probes[] stores data
1987c478bd9Sstevel@tonic-gate * about these probes. These packets consume sequence number space.
1997c478bd9Sstevel@tonic-gate *
200e11c3f44Smeem * PROBE_RTT: This type is used to make only rtt measurements. Normally these
2017c478bd9Sstevel@tonic-gate * are not used. Under heavy network load, the rtt may go up very high,
2027c478bd9Sstevel@tonic-gate * due to a spike, or may appear to go high, due to extreme scheduling
2037c478bd9Sstevel@tonic-gate * delays. Once the network stress is removed, mpathd takes long time to
2047c478bd9Sstevel@tonic-gate * recover, because the probe_interval is already high, and it takes
2057c478bd9Sstevel@tonic-gate * a long time to send out sufficient number of probes to bring down the
2067c478bd9Sstevel@tonic-gate * rtt. To avoid this problem, PROBE_RTT probes are sent out every
2077c478bd9Sstevel@tonic-gate * user_probe_interval ms. and will cause only rtt updates. These packets
2087c478bd9Sstevel@tonic-gate * do not consume sequence number space nor is information about these
2097c478bd9Sstevel@tonic-gate * packets stored in the pii_probes[]
2107c478bd9Sstevel@tonic-gate *
2117c478bd9Sstevel@tonic-gate * PROBE_MULTI: This type is only used to construct a list of targets, when
2127c478bd9Sstevel@tonic-gate * no targets are known. The packet is multicast to the all hosts addr.
2137c478bd9Sstevel@tonic-gate */
2147c478bd9Sstevel@tonic-gate static void
probe(struct phyint_instance * pii,uint_t probe_type,hrtime_t start_hrtime)215e11c3f44Smeem probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
2167c478bd9Sstevel@tonic-gate {
217e11c3f44Smeem hrtime_t sent_hrtime;
218e11c3f44Smeem struct timeval sent_tv;
2197c478bd9Sstevel@tonic-gate struct pr_icmp probe_pkt; /* Probe packet */
220e11c3f44Smeem struct sockaddr_storage targ; /* target address */
221e11c3f44Smeem uint_t targaddrlen; /* targed address length */
2227c478bd9Sstevel@tonic-gate int pr_ndx; /* probe index in pii->pii_probes[] */
223b6bc5f8fSGeorge Shepherd boolean_t sent = _B_FALSE;
224b6bc5f8fSGeorge Shepherd int rval;
2257c478bd9Sstevel@tonic-gate
2267c478bd9Sstevel@tonic-gate if (debug & D_TARGET) {
227e11c3f44Smeem logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228e11c3f44Smeem pii->pii_name, probe_type, start_hrtime);
2297c478bd9Sstevel@tonic-gate }
2307c478bd9Sstevel@tonic-gate
2317c478bd9Sstevel@tonic-gate assert(pii->pii_probe_sock != -1);
2327c478bd9Sstevel@tonic-gate assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
2337c478bd9Sstevel@tonic-gate probe_type == PROBE_RTT);
2347c478bd9Sstevel@tonic-gate
2357c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
2367c478bd9Sstevel@tonic-gate ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
2377c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_code = 0;
2387c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 0;
2397c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
2407c478bd9Sstevel@tonic-gate
2417c478bd9Sstevel@tonic-gate /*
2427c478bd9Sstevel@tonic-gate * Since there is no need to do arithmetic on the icmpid,
2437c478bd9Sstevel@tonic-gate * (only equality check is done) pii_icmpid is stored in
2447c478bd9Sstevel@tonic-gate * network byte order at initialization itself.
2457c478bd9Sstevel@tonic-gate */
2467c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_id = pii->pii_icmpid;
247e11c3f44Smeem probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
2487c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_mtype = htonl(probe_type);
2497c478bd9Sstevel@tonic-gate
2507c478bd9Sstevel@tonic-gate /*
2517c478bd9Sstevel@tonic-gate * If probe_type is PROBE_MULTI, this packet will be multicast to
2527c478bd9Sstevel@tonic-gate * the all hosts address. Otherwise it is unicast to the next target.
2537c478bd9Sstevel@tonic-gate */
2547c478bd9Sstevel@tonic-gate assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
2557c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next != NULL));
2567c478bd9Sstevel@tonic-gate
257e11c3f44Smeem bzero(&targ, sizeof (targ));
258e11c3f44Smeem targ.ss_family = pii->pii_af;
259e11c3f44Smeem
2607c478bd9Sstevel@tonic-gate if (pii->pii_af == AF_INET6) {
261e11c3f44Smeem struct in6_addr *addr6;
262e11c3f44Smeem
263e11c3f44Smeem addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264e11c3f44Smeem targaddrlen = sizeof (struct sockaddr_in6);
2657c478bd9Sstevel@tonic-gate if (probe_type == PROBE_MULTI) {
266e11c3f44Smeem *addr6 = all_nodes_mcast_v6;
2677c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) {
268e11c3f44Smeem *addr6 = pii->pii_target_next->tg_address;
269e11c3f44Smeem } else { /* type is PROBE_RTT */
270e11c3f44Smeem *addr6 = pii->pii_rtt_target_next->tg_address;
2717c478bd9Sstevel@tonic-gate }
2727c478bd9Sstevel@tonic-gate } else {
273e11c3f44Smeem struct in_addr *addr4;
274e11c3f44Smeem
275e11c3f44Smeem addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276e11c3f44Smeem targaddrlen = sizeof (struct sockaddr_in);
2777c478bd9Sstevel@tonic-gate if (probe_type == PROBE_MULTI) {
278e11c3f44Smeem *addr4 = all_nodes_mcast_v4;
2797c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) {
2807c478bd9Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR(
281e11c3f44Smeem &pii->pii_target_next->tg_address, addr4);
282e11c3f44Smeem } else { /* type is PROBE_RTT */
2837c478bd9Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR(
284e11c3f44Smeem &pii->pii_rtt_target_next->tg_address, addr4);
2857c478bd9Sstevel@tonic-gate }
2867c478bd9Sstevel@tonic-gate
2877c478bd9Sstevel@tonic-gate /*
2887c478bd9Sstevel@tonic-gate * Compute the IPv4 icmp checksum. Does not cover the IP header.
2897c478bd9Sstevel@tonic-gate */
2907c478bd9Sstevel@tonic-gate probe_pkt.pr_icmp_cksum =
2917c478bd9Sstevel@tonic-gate in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
292e11c3f44Smeem }
293e11c3f44Smeem
294e11c3f44Smeem /*
295e11c3f44Smeem * Use the current time as the time we sent. Not atomic, but the best
296e11c3f44Smeem * we can do from here.
297e11c3f44Smeem */
298e11c3f44Smeem sent_hrtime = gethrtime();
299e11c3f44Smeem (void) gettimeofday(&sent_tv, NULL);
300b6bc5f8fSGeorge Shepherd rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301b6bc5f8fSGeorge Shepherd (struct sockaddr *)&targ, targaddrlen);
302b6bc5f8fSGeorge Shepherd /*
303b6bc5f8fSGeorge Shepherd * If the send would block, this may either be transient or a hang in a
304b6bc5f8fSGeorge Shepherd * lower layer. We pretend the probe was actually sent, the daemon will
305b6bc5f8fSGeorge Shepherd * not see a reply to the probe and will fail the interface if normal
306b6bc5f8fSGeorge Shepherd * failure detection criteria are met.
307b6bc5f8fSGeorge Shepherd */
308b6bc5f8fSGeorge Shepherd if (rval == sizeof (probe_pkt) ||
309b6bc5f8fSGeorge Shepherd (rval == -1 && errno == EWOULDBLOCK)) {
310b6bc5f8fSGeorge Shepherd sent = _B_TRUE;
311b6bc5f8fSGeorge Shepherd } else {
3127c478bd9Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto");
3137c478bd9Sstevel@tonic-gate }
3147c478bd9Sstevel@tonic-gate
3157c478bd9Sstevel@tonic-gate /*
3167c478bd9Sstevel@tonic-gate * If this is a PROBE_UNI probe packet being unicast to a target, then
3177c478bd9Sstevel@tonic-gate * update our tables. We will need this info in processing the probe
3187c478bd9Sstevel@tonic-gate * response. PROBE_MULTI and PROBE_RTT packets are not used for
3197c478bd9Sstevel@tonic-gate * the purpose of failure or recovery detection. PROBE_MULTI packets
3207c478bd9Sstevel@tonic-gate * are only used to construct a list of targets. PROBE_RTT packets are
3217c478bd9Sstevel@tonic-gate * used only for updating the rtt and not for failure detection.
3227c478bd9Sstevel@tonic-gate */
3237c478bd9Sstevel@tonic-gate if (probe_type == PROBE_UNI && sent) {
3247c478bd9Sstevel@tonic-gate pr_ndx = pii->pii_probe_next;
3257c478bd9Sstevel@tonic-gate assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
3267c478bd9Sstevel@tonic-gate
3277c478bd9Sstevel@tonic-gate /* Collect statistics, before we reuse the last slot. */
3287c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
3297c478bd9Sstevel@tonic-gate pii->pii_cum_stats.lost++;
3307c478bd9Sstevel@tonic-gate else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
3317c478bd9Sstevel@tonic-gate pii->pii_cum_stats.acked++;
3327c478bd9Sstevel@tonic-gate pii->pii_cum_stats.sent++;
3337c478bd9Sstevel@tonic-gate
334e11c3f44Smeem pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
335e11c3f44Smeem pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
336e11c3f44Smeem pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
337e11c3f44Smeem pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
3387c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
339e11c3f44Smeem probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
340e11c3f44Smeem
3417c478bd9Sstevel@tonic-gate pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
3427c478bd9Sstevel@tonic-gate pii->pii_target_next = target_next(pii->pii_target_next);
3437c478bd9Sstevel@tonic-gate assert(pii->pii_target_next != NULL);
3447c478bd9Sstevel@tonic-gate /*
3457c478bd9Sstevel@tonic-gate * If we have a single variable to denote the next target to
3467c478bd9Sstevel@tonic-gate * probe for both rtt probes and failure detection probes, we
3477c478bd9Sstevel@tonic-gate * could end up with a situation where the failure detection
3487c478bd9Sstevel@tonic-gate * probe targets become disjoint from the rtt probe targets.
3497c478bd9Sstevel@tonic-gate * Eg. if 2 targets and the actual fdt is double the user
3507c478bd9Sstevel@tonic-gate * specified fdt. So we have 2 variables. In this scheme
3517c478bd9Sstevel@tonic-gate * we also reset pii_rtt_target_next for every fdt probe,
3527c478bd9Sstevel@tonic-gate * though that may not be necessary.
3537c478bd9Sstevel@tonic-gate */
3547c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next = pii->pii_target_next;
3557c478bd9Sstevel@tonic-gate pii->pii_snxt++;
3567c478bd9Sstevel@tonic-gate } else if (probe_type == PROBE_RTT) {
3577c478bd9Sstevel@tonic-gate pii->pii_rtt_target_next =
3587c478bd9Sstevel@tonic-gate target_next(pii->pii_rtt_target_next);
3597c478bd9Sstevel@tonic-gate assert(pii->pii_rtt_target_next != NULL);
3607c478bd9Sstevel@tonic-gate }
3617c478bd9Sstevel@tonic-gate }
3627c478bd9Sstevel@tonic-gate
3637c478bd9Sstevel@tonic-gate /*
3647c478bd9Sstevel@tonic-gate * Incoming IPv4 data from wire, is received here. Called from main.
3657c478bd9Sstevel@tonic-gate */
3667c478bd9Sstevel@tonic-gate void
in_data(struct phyint_instance * pii)3677c478bd9Sstevel@tonic-gate in_data(struct phyint_instance *pii)
3687c478bd9Sstevel@tonic-gate {
3697c478bd9Sstevel@tonic-gate struct sockaddr_in from;
3707c478bd9Sstevel@tonic-gate struct in6_addr fromaddr;
371e11c3f44Smeem static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
372e11c3f44Smeem static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
3737c478bd9Sstevel@tonic-gate struct ip *ip;
3747c478bd9Sstevel@tonic-gate int iphlen;
3757c478bd9Sstevel@tonic-gate int len;
3767c478bd9Sstevel@tonic-gate char abuf[INET_ADDRSTRLEN];
377e11c3f44Smeem struct msghdr msg;
378e11c3f44Smeem struct iovec iov;
3797c478bd9Sstevel@tonic-gate struct pr_icmp *reply;
380e11c3f44Smeem struct timeval *recv_tvp;
3817c478bd9Sstevel@tonic-gate
3827c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
3837c478bd9Sstevel@tonic-gate logdebug("in_data(%s %s)\n",
3847c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name);
3857c478bd9Sstevel@tonic-gate }
3867c478bd9Sstevel@tonic-gate
387e11c3f44Smeem iov.iov_base = (char *)in_packet;
388e11c3f44Smeem iov.iov_len = sizeof (in_packet);
389e11c3f44Smeem msg.msg_iov = &iov;
390e11c3f44Smeem msg.msg_iovlen = 1;
391e11c3f44Smeem msg.msg_name = (struct sockaddr *)&from;
392e11c3f44Smeem msg.msg_namelen = sizeof (from);
393e11c3f44Smeem msg.msg_control = ancillary_data;
394e11c3f44Smeem msg.msg_controllen = sizeof (ancillary_data);
395e11c3f44Smeem
3967c478bd9Sstevel@tonic-gate /*
3977c478bd9Sstevel@tonic-gate * Poll has already told us that a message is waiting,
3987c478bd9Sstevel@tonic-gate * on this socket. Read it now. We should not block.
3997c478bd9Sstevel@tonic-gate */
400e11c3f44Smeem if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
401e11c3f44Smeem logperror_pii(pii, "in_data: recvmsg");
4027c478bd9Sstevel@tonic-gate return;
4037c478bd9Sstevel@tonic-gate }
4047c478bd9Sstevel@tonic-gate
4057c478bd9Sstevel@tonic-gate /*
406e11c3f44Smeem * If the datalink has indicated the link is down, don't go
4077c478bd9Sstevel@tonic-gate * any further.
4087c478bd9Sstevel@tonic-gate */
4097c478bd9Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint))
4107c478bd9Sstevel@tonic-gate return;
4117c478bd9Sstevel@tonic-gate
4127c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */
4137c478bd9Sstevel@tonic-gate (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
4147c478bd9Sstevel@tonic-gate
415e11c3f44Smeem /* Ignore packets > 64k or control buffers that don't fit */
416e11c3f44Smeem if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
417e11c3f44Smeem if (debug & D_PKTBAD) {
418e11c3f44Smeem logdebug("Truncated message: msg_flags 0x%x from %s\n",
419e11c3f44Smeem msg.msg_flags, abuf);
420e11c3f44Smeem }
421e11c3f44Smeem return;
422e11c3f44Smeem }
423e11c3f44Smeem
4247c478bd9Sstevel@tonic-gate /* Make sure packet contains at least minimum ICMP header */
4257c478bd9Sstevel@tonic-gate ip = (struct ip *)in_packet;
4267c478bd9Sstevel@tonic-gate iphlen = ip->ip_hl << 2;
4277c478bd9Sstevel@tonic-gate if (len < iphlen + ICMP_MINLEN) {
4287c478bd9Sstevel@tonic-gate if (debug & D_PKTBAD) {
4297c478bd9Sstevel@tonic-gate logdebug("in_data: packet too short (%d bytes)"
4307c478bd9Sstevel@tonic-gate " from %s\n", len, abuf);
4317c478bd9Sstevel@tonic-gate }
4327c478bd9Sstevel@tonic-gate return;
4337c478bd9Sstevel@tonic-gate }
4347c478bd9Sstevel@tonic-gate
4357c478bd9Sstevel@tonic-gate /*
4367c478bd9Sstevel@tonic-gate * Subtract the IP hdr length, 'len' will be length of the probe
4377c478bd9Sstevel@tonic-gate * reply, starting from the icmp hdr.
4387c478bd9Sstevel@tonic-gate */
4397c478bd9Sstevel@tonic-gate len -= iphlen;
4407c478bd9Sstevel@tonic-gate /* LINTED */
4417c478bd9Sstevel@tonic-gate reply = (struct pr_icmp *)((char *)in_packet + iphlen);
4427c478bd9Sstevel@tonic-gate
4437c478bd9Sstevel@tonic-gate /* Probe replies are icmp echo replies. Ignore anything else */
4447c478bd9Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
4457c478bd9Sstevel@tonic-gate return;
4467c478bd9Sstevel@tonic-gate
4477c478bd9Sstevel@tonic-gate /*
4487c478bd9Sstevel@tonic-gate * The icmp id should match what we sent, which is stored
4497c478bd9Sstevel@tonic-gate * in pi_icmpid. The icmp code for reply must be 0.
4507c478bd9Sstevel@tonic-gate * The reply content must be a struct pr_icmp
4517c478bd9Sstevel@tonic-gate */
4527c478bd9Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) {
4537c478bd9Sstevel@tonic-gate /* Not in response to our probe */
4547c478bd9Sstevel@tonic-gate return;
4557c478bd9Sstevel@tonic-gate }
4567c478bd9Sstevel@tonic-gate
4577c478bd9Sstevel@tonic-gate if (reply->pr_icmp_code != 0) {
4587c478bd9Sstevel@tonic-gate logtrace("probe reply code %d from %s on %s\n",
4597c478bd9Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name);
4607c478bd9Sstevel@tonic-gate return;
4617c478bd9Sstevel@tonic-gate }
4627c478bd9Sstevel@tonic-gate
4637c478bd9Sstevel@tonic-gate if (len < sizeof (struct pr_icmp)) {
4647c478bd9Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n",
4657c478bd9Sstevel@tonic-gate len, abuf, pii->pii_name);
4667c478bd9Sstevel@tonic-gate return;
4677c478bd9Sstevel@tonic-gate }
4687c478bd9Sstevel@tonic-gate
469e11c3f44Smeem recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
470e11c3f44Smeem if (recv_tvp == NULL) {
471e11c3f44Smeem logtrace("message without timestamp from %s on %s\n",
472e11c3f44Smeem abuf, pii->pii_name);
473e11c3f44Smeem return;
474e11c3f44Smeem }
475e11c3f44Smeem
4767c478bd9Sstevel@tonic-gate IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
4777c478bd9Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
4787c478bd9Sstevel@tonic-gate /* Unicast probe reply */
479e11c3f44Smeem incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
4807c478bd9Sstevel@tonic-gate else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
4817c478bd9Sstevel@tonic-gate /* Multicast reply */
4827c478bd9Sstevel@tonic-gate incoming_mcast_reply(pii, reply, fromaddr);
4837c478bd9Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
4847c478bd9Sstevel@tonic-gate incoming_rtt_reply(pii, reply, fromaddr);
4857c478bd9Sstevel@tonic-gate } else {
4867c478bd9Sstevel@tonic-gate /* Probably not in response to our probe */
4877c478bd9Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n",
4887c478bd9Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name);
4897c478bd9Sstevel@tonic-gate return;
4907c478bd9Sstevel@tonic-gate }
4917c478bd9Sstevel@tonic-gate }
4927c478bd9Sstevel@tonic-gate
4937c478bd9Sstevel@tonic-gate /*
4947c478bd9Sstevel@tonic-gate * Incoming IPv6 data from wire is received here. Called from main.
4957c478bd9Sstevel@tonic-gate */
4967c478bd9Sstevel@tonic-gate void
in6_data(struct phyint_instance * pii)4977c478bd9Sstevel@tonic-gate in6_data(struct phyint_instance *pii)
4987c478bd9Sstevel@tonic-gate {
4997c478bd9Sstevel@tonic-gate struct sockaddr_in6 from;
5007c478bd9Sstevel@tonic-gate static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
5017c478bd9Sstevel@tonic-gate static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
5027c478bd9Sstevel@tonic-gate int len;
5037c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN];
5047c478bd9Sstevel@tonic-gate struct msghdr msg;
5057c478bd9Sstevel@tonic-gate struct iovec iov;
506e11c3f44Smeem void *opt;
5077c478bd9Sstevel@tonic-gate struct pr_icmp *reply;
508e11c3f44Smeem struct timeval *recv_tvp;
5097c478bd9Sstevel@tonic-gate
5107c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
5117c478bd9Sstevel@tonic-gate logdebug("in6_data(%s %s)\n",
5127c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name);
5137c478bd9Sstevel@tonic-gate }
5147c478bd9Sstevel@tonic-gate
5157c478bd9Sstevel@tonic-gate iov.iov_base = (char *)in_packet;
5167c478bd9Sstevel@tonic-gate iov.iov_len = sizeof (in_packet);
5177c478bd9Sstevel@tonic-gate msg.msg_iov = &iov;
5187c478bd9Sstevel@tonic-gate msg.msg_iovlen = 1;
5197c478bd9Sstevel@tonic-gate msg.msg_name = (struct sockaddr *)&from;
5207c478bd9Sstevel@tonic-gate msg.msg_namelen = sizeof (from);
5217c478bd9Sstevel@tonic-gate msg.msg_control = ancillary_data;
5227c478bd9Sstevel@tonic-gate msg.msg_controllen = sizeof (ancillary_data);
5237c478bd9Sstevel@tonic-gate
5247c478bd9Sstevel@tonic-gate if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
525e11c3f44Smeem logperror_pii(pii, "in6_data: recvmsg");
5267c478bd9Sstevel@tonic-gate return;
5277c478bd9Sstevel@tonic-gate }
5287c478bd9Sstevel@tonic-gate
5297c478bd9Sstevel@tonic-gate /*
530e11c3f44Smeem * If the datalink has indicated that the link is down, don't go
5317c478bd9Sstevel@tonic-gate * any further.
5327c478bd9Sstevel@tonic-gate */
5337c478bd9Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint))
5347c478bd9Sstevel@tonic-gate return;
5357c478bd9Sstevel@tonic-gate
5367c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */
5377c478bd9Sstevel@tonic-gate (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
5387c478bd9Sstevel@tonic-gate if (len < ICMP_MINLEN) {
5397c478bd9Sstevel@tonic-gate if (debug & D_PKTBAD) {
5407c478bd9Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n",
5417c478bd9Sstevel@tonic-gate msg.msg_flags, abuf);
5427c478bd9Sstevel@tonic-gate }
5437c478bd9Sstevel@tonic-gate return;
5447c478bd9Sstevel@tonic-gate }
5457c478bd9Sstevel@tonic-gate /* Ignore packets > 64k or control buffers that don't fit */
5467c478bd9Sstevel@tonic-gate if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
5477c478bd9Sstevel@tonic-gate if (debug & D_PKTBAD) {
5487c478bd9Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n",
5497c478bd9Sstevel@tonic-gate msg.msg_flags, abuf);
5507c478bd9Sstevel@tonic-gate }
5517c478bd9Sstevel@tonic-gate return;
5527c478bd9Sstevel@tonic-gate }
5537c478bd9Sstevel@tonic-gate
5547c478bd9Sstevel@tonic-gate reply = (struct pr_icmp *)in_packet;
5557c478bd9Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
5567c478bd9Sstevel@tonic-gate return;
5577c478bd9Sstevel@tonic-gate
5587c478bd9Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) {
5597c478bd9Sstevel@tonic-gate /* Not in response to our probe */
5607c478bd9Sstevel@tonic-gate return;
5617c478bd9Sstevel@tonic-gate }
5627c478bd9Sstevel@tonic-gate
5637c478bd9Sstevel@tonic-gate /*
5647c478bd9Sstevel@tonic-gate * The kernel has already verified the the ICMP checksum.
5657c478bd9Sstevel@tonic-gate */
5667c478bd9Sstevel@tonic-gate if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
5677c478bd9Sstevel@tonic-gate logtrace("ICMPv6 echo reply source address not linklocal from "
5687c478bd9Sstevel@tonic-gate "%s on %s\n", abuf, pii->pii_name);
5697c478bd9Sstevel@tonic-gate return;
5707c478bd9Sstevel@tonic-gate }
571e11c3f44Smeem opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
5727c478bd9Sstevel@tonic-gate if (opt != NULL) {
5737c478bd9Sstevel@tonic-gate /* Can't allow routing headers in probe replies */
5747c478bd9Sstevel@tonic-gate logtrace("message with routing header from %s on %s\n",
5757c478bd9Sstevel@tonic-gate abuf, pii->pii_name);
5767c478bd9Sstevel@tonic-gate return;
5777c478bd9Sstevel@tonic-gate }
578e11c3f44Smeem
5797c478bd9Sstevel@tonic-gate if (reply->pr_icmp_code != 0) {
5807c478bd9Sstevel@tonic-gate logtrace("probe reply code: %d from %s on %s\n",
5817c478bd9Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name);
5827c478bd9Sstevel@tonic-gate return;
5837c478bd9Sstevel@tonic-gate }
5847c478bd9Sstevel@tonic-gate if (len < (sizeof (struct pr_icmp))) {
5857c478bd9Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n",
5867c478bd9Sstevel@tonic-gate len, abuf, pii->pii_name);
5877c478bd9Sstevel@tonic-gate return;
5887c478bd9Sstevel@tonic-gate }
589e11c3f44Smeem
590e11c3f44Smeem recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
591e11c3f44Smeem if (recv_tvp == NULL) {
592e11c3f44Smeem logtrace("message without timestamp from %s on %s\n",
593e11c3f44Smeem abuf, pii->pii_name);
594e11c3f44Smeem return;
595e11c3f44Smeem }
596e11c3f44Smeem
5977c478bd9Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
598e11c3f44Smeem incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
5997c478bd9Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
6007c478bd9Sstevel@tonic-gate incoming_mcast_reply(pii, reply, from.sin6_addr);
6017c478bd9Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
6027c478bd9Sstevel@tonic-gate incoming_rtt_reply(pii, reply, from.sin6_addr);
6037c478bd9Sstevel@tonic-gate } else {
6047c478bd9Sstevel@tonic-gate /* Probably not in response to our probe */
6057c478bd9Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n",
6067c478bd9Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name);
6077c478bd9Sstevel@tonic-gate }
6087c478bd9Sstevel@tonic-gate }
6097c478bd9Sstevel@tonic-gate
6107c478bd9Sstevel@tonic-gate /*
6117c478bd9Sstevel@tonic-gate * Process the incoming rtt reply, in response to our rtt probe.
6127c478bd9Sstevel@tonic-gate * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
6137c478bd9Sstevel@tonic-gate * have any stored information about the probe we sent. So we don't log
6147c478bd9Sstevel@tonic-gate * any errors if we receive bad replies.
6157c478bd9Sstevel@tonic-gate */
6167c478bd9Sstevel@tonic-gate static void
incoming_rtt_reply(struct phyint_instance * pii,struct pr_icmp * reply,struct in6_addr fromaddr)6177c478bd9Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
6187c478bd9Sstevel@tonic-gate struct in6_addr fromaddr)
6197c478bd9Sstevel@tonic-gate {
620e11c3f44Smeem int64_t m; /* rtt measurement in ns */
6217c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN];
6227c478bd9Sstevel@tonic-gate struct target *target;
6237c478bd9Sstevel@tonic-gate struct phyint_group *pg;
6247c478bd9Sstevel@tonic-gate
6257c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */
6267c478bd9Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
6277c478bd9Sstevel@tonic-gate
6287c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
6297c478bd9Sstevel@tonic-gate logdebug("incoming_rtt_reply: %s %s %s\n",
6307c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf);
6317c478bd9Sstevel@tonic-gate }
6327c478bd9Sstevel@tonic-gate
6337c478bd9Sstevel@tonic-gate /* Do we know this target ? */
6347c478bd9Sstevel@tonic-gate target = target_lookup(pii, fromaddr);
6357c478bd9Sstevel@tonic-gate if (target == NULL)
6367c478bd9Sstevel@tonic-gate return;
6377c478bd9Sstevel@tonic-gate
638e11c3f44Smeem m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
6397c478bd9Sstevel@tonic-gate /* Invalid rtt. It has wrapped around */
6407c478bd9Sstevel@tonic-gate if (m < 0)
6417c478bd9Sstevel@tonic-gate return;
6427c478bd9Sstevel@tonic-gate
6437c478bd9Sstevel@tonic-gate /*
6447c478bd9Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
6457c478bd9Sstevel@tonic-gate * The initial few responses after the interface is repaired may
6467c478bd9Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting
6477c478bd9Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface.
6487c478bd9Sstevel@tonic-gate */
6497c478bd9Sstevel@tonic-gate pg = pii->pii_phyint->pi_group;
6507c478bd9Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
6517c478bd9Sstevel@tonic-gate return;
6527c478bd9Sstevel@tonic-gate
6537c478bd9Sstevel@tonic-gate /*
6547c478bd9Sstevel@tonic-gate * Update rtt only if the new rtt is lower than the current rtt.
6557c478bd9Sstevel@tonic-gate * (specified by the 3rd parameter to pi_set_crtt).
6567c478bd9Sstevel@tonic-gate * If a spike has caused the current probe_interval to be >
6577c478bd9Sstevel@tonic-gate * user_probe_interval, then this mechanism is used to bring down
6587c478bd9Sstevel@tonic-gate * the rtt rapidly once the network stress is removed.
6597c478bd9Sstevel@tonic-gate * If the new rtt is higher than the current rtt, we don't want to
6607c478bd9Sstevel@tonic-gate * update the rtt. We are having more than 1 outstanding probe and
6617c478bd9Sstevel@tonic-gate * the increase in rtt we are seeing is being unnecessarily weighted
6627c478bd9Sstevel@tonic-gate * many times. The regular rtt update will be handled by
6637c478bd9Sstevel@tonic-gate * incoming_echo_reply() and will take care of any rtt increase.
6647c478bd9Sstevel@tonic-gate */
6657c478bd9Sstevel@tonic-gate pi_set_crtt(target, m, _B_FALSE);
6667c478bd9Sstevel@tonic-gate if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
6677c478bd9Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) &&
6687c478bd9Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
6697c478bd9Sstevel@tonic-gate /*
6707c478bd9Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
6717c478bd9Sstevel@tonic-gate * investigate if we can improve the failure detection time to
6727c478bd9Sstevel@tonic-gate * meet whatever the user specified.
6737c478bd9Sstevel@tonic-gate */
6747c478bd9Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) {
6757c478bd9Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
6767c478bd9Sstevel@tonic-gate user_failure_detection_time);
6777c478bd9Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
6787c478bd9Sstevel@tonic-gate if (pii->pii_phyint->pi_group != phyint_anongroup) {
6797c478bd9Sstevel@tonic-gate logerr("Improved failure detection time %d ms "
6807c478bd9Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n",
6817c478bd9Sstevel@tonic-gate pg->pg_fdt, AF_STR(pii->pii_af),
6827c478bd9Sstevel@tonic-gate pii->pii_name,
6837c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_name);
6847c478bd9Sstevel@tonic-gate }
6857c478bd9Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) {
6867c478bd9Sstevel@tonic-gate /* Avoid any truncation or rounding errors */
6877c478bd9Sstevel@tonic-gate pg->pg_probeint = user_probe_interval;
6887c478bd9Sstevel@tonic-gate /*
6897c478bd9Sstevel@tonic-gate * No more rtt probes will be sent. The actual
6907c478bd9Sstevel@tonic-gate * fdt has dropped to the user specified value.
6917c478bd9Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime
6927c478bd9Sstevel@tonic-gate * will be in sync henceforth.
6937c478bd9Sstevel@tonic-gate */
6947c478bd9Sstevel@tonic-gate reset_snxt_basetimes();
6957c478bd9Sstevel@tonic-gate }
6967c478bd9Sstevel@tonic-gate }
6977c478bd9Sstevel@tonic-gate }
6987c478bd9Sstevel@tonic-gate }
6997c478bd9Sstevel@tonic-gate
7007c478bd9Sstevel@tonic-gate /*
7017c478bd9Sstevel@tonic-gate * Process the incoming echo reply, in response to our unicast probe.
7027c478bd9Sstevel@tonic-gate * Common for both IPv4 and IPv6
7037c478bd9Sstevel@tonic-gate */
7047c478bd9Sstevel@tonic-gate static void
incoming_echo_reply(struct phyint_instance * pii,struct pr_icmp * reply,struct in6_addr fromaddr,struct timeval * recv_tvp)7057c478bd9Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
706e11c3f44Smeem struct in6_addr fromaddr, struct timeval *recv_tvp)
7077c478bd9Sstevel@tonic-gate {
708e11c3f44Smeem int64_t m; /* rtt measurement in ns */
709e11c3f44Smeem hrtime_t cur_hrtime; /* in ns from some arbitrary point */
7107c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN];
7117c478bd9Sstevel@tonic-gate int pr_ndx;
7127c478bd9Sstevel@tonic-gate struct target *target;
7137c478bd9Sstevel@tonic-gate boolean_t exception;
714e11c3f44Smeem uint64_t pr_icmp_timestamp;
7157c478bd9Sstevel@tonic-gate uint16_t pr_icmp_seq;
716e11c3f44Smeem struct probe_stats *pr_statp;
7177c478bd9Sstevel@tonic-gate struct phyint_group *pg = pii->pii_phyint->pi_group;
7187c478bd9Sstevel@tonic-gate
7197c478bd9Sstevel@tonic-gate /* Get the printable address for error reporting */
7207c478bd9Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
7217c478bd9Sstevel@tonic-gate
7227c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
723e11c3f44Smeem logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
7247c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf,
725e11c3f44Smeem ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
7267c478bd9Sstevel@tonic-gate }
7277c478bd9Sstevel@tonic-gate
728e11c3f44Smeem pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
7297c478bd9Sstevel@tonic-gate pr_icmp_seq = ntohs(reply->pr_icmp_seq);
7307c478bd9Sstevel@tonic-gate
7317c478bd9Sstevel@tonic-gate /* Reject out of window probe replies */
7327c478bd9Sstevel@tonic-gate if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
7337c478bd9Sstevel@tonic-gate SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
7347c478bd9Sstevel@tonic-gate logtrace("out of window probe seq %u snxt %u on %s from %s\n",
7357c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7367c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++;
7377c478bd9Sstevel@tonic-gate return;
7387c478bd9Sstevel@tonic-gate }
739e11c3f44Smeem
740e11c3f44Smeem cur_hrtime = gethrtime();
741e11c3f44Smeem m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
7427c478bd9Sstevel@tonic-gate if (m < 0) {
7437c478bd9Sstevel@tonic-gate /*
7447c478bd9Sstevel@tonic-gate * This is a ridiculously high value of rtt. rtt has wrapped
7457c478bd9Sstevel@tonic-gate * around. Log a message, and ignore the rtt.
7467c478bd9Sstevel@tonic-gate */
747e11c3f44Smeem logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
748e11c3f44Smeem "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
7497c478bd9Sstevel@tonic-gate }
7507c478bd9Sstevel@tonic-gate
7517c478bd9Sstevel@tonic-gate /*
7527c478bd9Sstevel@tonic-gate * Get the probe index pr_ndx corresponding to the received icmp seq.
7537c478bd9Sstevel@tonic-gate * number in our pii->pii_probes[] array. The icmp sequence number
7547c478bd9Sstevel@tonic-gate * pii_snxt corresponds to the probe index pii->pii_probe_next
7557c478bd9Sstevel@tonic-gate */
7567c478bd9Sstevel@tonic-gate pr_ndx = MOD_SUB(pii->pii_probe_next,
7577c478bd9Sstevel@tonic-gate (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
7587c478bd9Sstevel@tonic-gate
7597c478bd9Sstevel@tonic-gate assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
7607c478bd9Sstevel@tonic-gate
7617c478bd9Sstevel@tonic-gate target = pii->pii_probes[pr_ndx].pr_target;
7627c478bd9Sstevel@tonic-gate
7637c478bd9Sstevel@tonic-gate /*
7647c478bd9Sstevel@tonic-gate * Perform sanity checks, whether this probe reply that we
7657c478bd9Sstevel@tonic-gate * have received is genuine
7667c478bd9Sstevel@tonic-gate */
7677c478bd9Sstevel@tonic-gate if (target != NULL) {
7687c478bd9Sstevel@tonic-gate /*
7697c478bd9Sstevel@tonic-gate * Compare the src. addr of the received ICMP or ICMPv6
7707c478bd9Sstevel@tonic-gate * probe reply with the target address in our tables.
7717c478bd9Sstevel@tonic-gate */
7727c478bd9Sstevel@tonic-gate if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
7737c478bd9Sstevel@tonic-gate /*
7747c478bd9Sstevel@tonic-gate * We don't have any record of having sent a probe to
7757c478bd9Sstevel@tonic-gate * this target. This is a fake probe reply. Log an error
7767c478bd9Sstevel@tonic-gate */
7777c478bd9Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u "
7787c478bd9Sstevel@tonic-gate "snxt %u on %s from %s\n",
7797c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status,
7807c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7817c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++;
7827c478bd9Sstevel@tonic-gate return;
7837c478bd9Sstevel@tonic-gate } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
7847c478bd9Sstevel@tonic-gate /*
7857c478bd9Sstevel@tonic-gate * The address matches, but our tables indicate that
7867c478bd9Sstevel@tonic-gate * this probe reply has been acked already. So this
7877c478bd9Sstevel@tonic-gate * is a duplicate probe reply. Log an error
7887c478bd9Sstevel@tonic-gate */
7897c478bd9Sstevel@tonic-gate logtrace("probe status %d Duplicate probe reply seq %u "
7907c478bd9Sstevel@tonic-gate "snxt %u on %s from %s\n",
7917c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status,
7927c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7937c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++;
7947c478bd9Sstevel@tonic-gate return;
7957c478bd9Sstevel@tonic-gate }
7967c478bd9Sstevel@tonic-gate } else {
7977c478bd9Sstevel@tonic-gate /*
7987c478bd9Sstevel@tonic-gate * Target must not be NULL in the PR_UNACKED state
7997c478bd9Sstevel@tonic-gate */
8007c478bd9Sstevel@tonic-gate assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
8017c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
8027c478bd9Sstevel@tonic-gate /*
8037c478bd9Sstevel@tonic-gate * The probe stats slot is unused. So we didn't
8047c478bd9Sstevel@tonic-gate * send out any probe to this target. This is a fake.
8057c478bd9Sstevel@tonic-gate * Log an error.
8067c478bd9Sstevel@tonic-gate */
8077c478bd9Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u "
8087c478bd9Sstevel@tonic-gate "snxt %u on %s from %s\n",
8097c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status,
8107c478bd9Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8117c478bd9Sstevel@tonic-gate }
8127c478bd9Sstevel@tonic-gate pii->pii_cum_stats.unknown++;
8137c478bd9Sstevel@tonic-gate return;
8147c478bd9Sstevel@tonic-gate }
8157c478bd9Sstevel@tonic-gate
8167c478bd9Sstevel@tonic-gate /*
8177c478bd9Sstevel@tonic-gate * If the rtt does not appear to be right, don't update the
8187c478bd9Sstevel@tonic-gate * rtt stats. This can happen if the system dropped into the
8197c478bd9Sstevel@tonic-gate * debugger, or the system was hung or too busy for a
8207c478bd9Sstevel@tonic-gate * substantial time that we didn't get a chance to run.
8217c478bd9Sstevel@tonic-gate */
822e11c3f44Smeem if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
8237c478bd9Sstevel@tonic-gate /*
824e11c3f44Smeem * If the probe corresponding to this received response
825e11c3f44Smeem * was truly sent 'm' ns. ago, then this response must
8267c478bd9Sstevel@tonic-gate * have been rejected by the sequence number checks. The
8277c478bd9Sstevel@tonic-gate * fact that it has passed the sequence number checks
8287c478bd9Sstevel@tonic-gate * means that the measured rtt is wrong. We were probably
8297c478bd9Sstevel@tonic-gate * scheduled long after the packet was received.
8307c478bd9Sstevel@tonic-gate */
8317c478bd9Sstevel@tonic-gate goto out;
8327c478bd9Sstevel@tonic-gate }
8337c478bd9Sstevel@tonic-gate
8347c478bd9Sstevel@tonic-gate /*
8357c478bd9Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
8367c478bd9Sstevel@tonic-gate * The initial few responses after the interface is repaired may
8377c478bd9Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting
8387c478bd9Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface.
8397c478bd9Sstevel@tonic-gate */
8407c478bd9Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
8417c478bd9Sstevel@tonic-gate goto out;
8427c478bd9Sstevel@tonic-gate
8437c478bd9Sstevel@tonic-gate /*
8447c478bd9Sstevel@tonic-gate * Don't update the Conservative Round Trip Time estimate for this
8457c478bd9Sstevel@tonic-gate * (phint, target) pair if this is the not the highest ack seq seen
8467c478bd9Sstevel@tonic-gate * thus far on this target.
8477c478bd9Sstevel@tonic-gate */
8487c478bd9Sstevel@tonic-gate if (!highest_ack_tg(pr_icmp_seq, target))
8497c478bd9Sstevel@tonic-gate goto out;
8507c478bd9Sstevel@tonic-gate
8517c478bd9Sstevel@tonic-gate /*
8527c478bd9Sstevel@tonic-gate * Always update the rtt. This is a failure detection probe
8537c478bd9Sstevel@tonic-gate * and we want to measure both increase / decrease in rtt.
8547c478bd9Sstevel@tonic-gate */
8557c478bd9Sstevel@tonic-gate pi_set_crtt(target, m, _B_TRUE);
8567c478bd9Sstevel@tonic-gate
8577c478bd9Sstevel@tonic-gate /*
8587c478bd9Sstevel@tonic-gate * If the crtt exceeds the average time between probes,
8597c478bd9Sstevel@tonic-gate * investigate if this slow target is an exception. If so we
8607c478bd9Sstevel@tonic-gate * can avoid this target and still meet the failure detection
8617c478bd9Sstevel@tonic-gate * time. Otherwise we can't meet the failure detection time.
8627c478bd9Sstevel@tonic-gate */
8637c478bd9Sstevel@tonic-gate if (target->tg_crtt > pg->pg_probeint) {
8647c478bd9Sstevel@tonic-gate exception = check_exception_target(pii, target);
8657c478bd9Sstevel@tonic-gate if (exception) {
8667c478bd9Sstevel@tonic-gate /*
8677c478bd9Sstevel@tonic-gate * This target is exceptionally slow. Don't use it
8687c478bd9Sstevel@tonic-gate * for future probes. check_exception_target() has
8697c478bd9Sstevel@tonic-gate * made sure that we have at least MIN_PROBE_TARGETS
8707c478bd9Sstevel@tonic-gate * other active targets
8717c478bd9Sstevel@tonic-gate */
8727c478bd9Sstevel@tonic-gate if (pii->pii_targets_are_routers) {
8737c478bd9Sstevel@tonic-gate /*
8747c478bd9Sstevel@tonic-gate * This is a slow router, mark it as slow
8757c478bd9Sstevel@tonic-gate * and don't use it for further probes. We
8767c478bd9Sstevel@tonic-gate * don't delete it, since it will be populated
8777c478bd9Sstevel@tonic-gate * again when we do a router scan. Hence we
8787c478bd9Sstevel@tonic-gate * need to maintain extra state (unlike the
8797c478bd9Sstevel@tonic-gate * host case below). Mark it as TG_SLOW.
8807c478bd9Sstevel@tonic-gate */
8817c478bd9Sstevel@tonic-gate if (target->tg_status == TG_ACTIVE)
8827c478bd9Sstevel@tonic-gate pii->pii_ntargets--;
8837c478bd9Sstevel@tonic-gate target->tg_status = TG_SLOW;
8847c478bd9Sstevel@tonic-gate target->tg_latime = gethrtime();
8857c478bd9Sstevel@tonic-gate target->tg_rtt_sa = -1;
8867c478bd9Sstevel@tonic-gate target->tg_crtt = 0;
8877c478bd9Sstevel@tonic-gate target->tg_rtt_sd = 0;
8887c478bd9Sstevel@tonic-gate if (pii->pii_target_next == target) {
8897c478bd9Sstevel@tonic-gate pii->pii_target_next =
8907c478bd9Sstevel@tonic-gate target_next(target);
8917c478bd9Sstevel@tonic-gate }
8927c478bd9Sstevel@tonic-gate } else {
8937c478bd9Sstevel@tonic-gate /*
8947c478bd9Sstevel@tonic-gate * the slow target is not a router, we can
8957c478bd9Sstevel@tonic-gate * just delete it. Send an icmp multicast and
8967c478bd9Sstevel@tonic-gate * pick the fastest responder that is not
8977c478bd9Sstevel@tonic-gate * already an active target. target_delete()
8987c478bd9Sstevel@tonic-gate * adjusts pii->pii_target_next
8997c478bd9Sstevel@tonic-gate */
9007c478bd9Sstevel@tonic-gate target_delete(target);
901e11c3f44Smeem probe(pii, PROBE_MULTI, cur_hrtime);
9027c478bd9Sstevel@tonic-gate }
9037c478bd9Sstevel@tonic-gate } else {
9047c478bd9Sstevel@tonic-gate /*
9057c478bd9Sstevel@tonic-gate * We can't meet the failure detection time.
9067c478bd9Sstevel@tonic-gate * Log a message, and update the detection time to
9077c478bd9Sstevel@tonic-gate * whatever we can achieve.
9087c478bd9Sstevel@tonic-gate */
9097c478bd9Sstevel@tonic-gate pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
9107c478bd9Sstevel@tonic-gate pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
9117c478bd9Sstevel@tonic-gate last_fdt_bumpup_time = gethrtime();
9127c478bd9Sstevel@tonic-gate if (pg != phyint_anongroup) {
9139bea6098Smeem logtrace("Cannot meet requested failure"
9149bea6098Smeem " detection time of %d ms on (%s %s) new"
9159bea6098Smeem " failure detection time for group \"%s\""
9169bea6098Smeem " is %d ms\n", user_failure_detection_time,
9177c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name,
9187c478bd9Sstevel@tonic-gate pg->pg_name, pg->pg_fdt);
9197c478bd9Sstevel@tonic-gate }
9207c478bd9Sstevel@tonic-gate }
9217c478bd9Sstevel@tonic-gate } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
9227c478bd9Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) &&
9237c478bd9Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
9247c478bd9Sstevel@tonic-gate /*
9257c478bd9Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
9267c478bd9Sstevel@tonic-gate * investigate if we can improve the failure detection time to
9277c478bd9Sstevel@tonic-gate * meet whatever the user specified.
9287c478bd9Sstevel@tonic-gate */
9297c478bd9Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) {
9307c478bd9Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
9317c478bd9Sstevel@tonic-gate user_failure_detection_time);
9327c478bd9Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
9337c478bd9Sstevel@tonic-gate if (pg != phyint_anongroup) {
9349bea6098Smeem logtrace("Improved failure detection time %d ms"
9359bea6098Smeem " on (%s %s) for group \"%s\"\n",
9369bea6098Smeem pg->pg_fdt, AF_STR(pii->pii_af),
9379bea6098Smeem pii->pii_name, pg->pg_name);
9387c478bd9Sstevel@tonic-gate }
9397c478bd9Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) {
9407c478bd9Sstevel@tonic-gate /* Avoid any truncation or rounding errors */
9417c478bd9Sstevel@tonic-gate pg->pg_probeint = user_probe_interval;
9427c478bd9Sstevel@tonic-gate /*
9437c478bd9Sstevel@tonic-gate * No more rtt probes will be sent. The actual
9447c478bd9Sstevel@tonic-gate * fdt has dropped to the user specified value.
9457c478bd9Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime
9467c478bd9Sstevel@tonic-gate * will be in sync henceforth.
9477c478bd9Sstevel@tonic-gate */
9487c478bd9Sstevel@tonic-gate reset_snxt_basetimes();
9497c478bd9Sstevel@tonic-gate }
9507c478bd9Sstevel@tonic-gate }
9517c478bd9Sstevel@tonic-gate }
9527c478bd9Sstevel@tonic-gate out:
953e11c3f44Smeem pr_statp = &pii->pii_probes[pr_ndx];
954e11c3f44Smeem pr_statp->pr_hrtime_ackproc = cur_hrtime;
955e11c3f44Smeem pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
956e11c3f44Smeem (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
957e11c3f44Smeem
958e11c3f44Smeem probe_chstate(pr_statp, pii, PR_ACKED);
9597c478bd9Sstevel@tonic-gate
9607c478bd9Sstevel@tonic-gate /*
9617c478bd9Sstevel@tonic-gate * Update pii->pii_rack, i.e. the sequence number of the last received
9627c478bd9Sstevel@tonic-gate * probe response, based on the echo reply we have received now, if
9637c478bd9Sstevel@tonic-gate * either of the following conditions are satisfied.
9647c478bd9Sstevel@tonic-gate * a. pii_rack is outside the current receive window of
9657c478bd9Sstevel@tonic-gate * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
9667c478bd9Sstevel@tonic-gate * This means we have not received probe responses for a
9677c478bd9Sstevel@tonic-gate * long time, and the sequence number has wrapped around.
9687c478bd9Sstevel@tonic-gate * b. pii_rack is within the current receive window and this echo
9697c478bd9Sstevel@tonic-gate * reply corresponds to the highest sequence number we have seen
9707c478bd9Sstevel@tonic-gate * so far.
9717c478bd9Sstevel@tonic-gate */
9727c478bd9Sstevel@tonic-gate if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
9737c478bd9Sstevel@tonic-gate SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
9747c478bd9Sstevel@tonic-gate SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
9757c478bd9Sstevel@tonic-gate pii->pii_rack = pr_icmp_seq;
9767c478bd9Sstevel@tonic-gate }
9777c478bd9Sstevel@tonic-gate }
9787c478bd9Sstevel@tonic-gate
9797c478bd9Sstevel@tonic-gate /*
9807c478bd9Sstevel@tonic-gate * Returns true if seq is the highest unacknowledged seq for target tg
9817c478bd9Sstevel@tonic-gate * else returns false
9827c478bd9Sstevel@tonic-gate */
9837c478bd9Sstevel@tonic-gate static boolean_t
highest_ack_tg(uint16_t seq,struct target * tg)9847c478bd9Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg)
9857c478bd9Sstevel@tonic-gate {
9867c478bd9Sstevel@tonic-gate struct phyint_instance *pii;
9877c478bd9Sstevel@tonic-gate int pr_ndx;
9887c478bd9Sstevel@tonic-gate uint16_t pr_seq;
9897c478bd9Sstevel@tonic-gate
9907c478bd9Sstevel@tonic-gate pii = tg->tg_phyint_inst;
9917c478bd9Sstevel@tonic-gate
9927c478bd9Sstevel@tonic-gate /*
9937c478bd9Sstevel@tonic-gate * Get the seq number of the most recent probe sent so far,
9947c478bd9Sstevel@tonic-gate * and also get the corresponding probe index in the probe stats
9957c478bd9Sstevel@tonic-gate * array.
9967c478bd9Sstevel@tonic-gate */
9977c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
9987c478bd9Sstevel@tonic-gate pr_seq = pii->pii_snxt;
9997c478bd9Sstevel@tonic-gate pr_seq--;
10007c478bd9Sstevel@tonic-gate
10017c478bd9Sstevel@tonic-gate /*
10027c478bd9Sstevel@tonic-gate * Start from the most recent probe and walk back, trying to find
10037c478bd9Sstevel@tonic-gate * an acked probe corresponding to target tg.
10047c478bd9Sstevel@tonic-gate */
10057c478bd9Sstevel@tonic-gate for (; pr_ndx != pii->pii_probe_next;
10067c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
10077c478bd9Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_target == tg &&
10087c478bd9Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
10097c478bd9Sstevel@tonic-gate if (SEQ_GT(pr_seq, seq))
10107c478bd9Sstevel@tonic-gate return (_B_FALSE);
10117c478bd9Sstevel@tonic-gate }
10127c478bd9Sstevel@tonic-gate }
10137c478bd9Sstevel@tonic-gate return (_B_TRUE);
10147c478bd9Sstevel@tonic-gate }
10157c478bd9Sstevel@tonic-gate
10167c478bd9Sstevel@tonic-gate /*
10177c478bd9Sstevel@tonic-gate * Check whether the crtt for the group has improved by a factor of
10187c478bd9Sstevel@tonic-gate * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure
10197c478bd9Sstevel@tonic-gate * detection time flapping in the face of small crtt changes.
10207c478bd9Sstevel@tonic-gate */
10217c478bd9Sstevel@tonic-gate static boolean_t
check_pg_crtt_improved(struct phyint_group * pg)10227c478bd9Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg)
10237c478bd9Sstevel@tonic-gate {
10247c478bd9Sstevel@tonic-gate struct phyint *pi;
10257c478bd9Sstevel@tonic-gate
10267c478bd9Sstevel@tonic-gate if (debug & D_PROBE)
10277c478bd9Sstevel@tonic-gate logdebug("check_pg_crtt_improved()\n");
10287c478bd9Sstevel@tonic-gate
10297c478bd9Sstevel@tonic-gate /*
10307c478bd9Sstevel@tonic-gate * The crtt for the group is only improved if each phyint_instance
10317c478bd9Sstevel@tonic-gate * for both ipv4 and ipv6 is improved.
10327c478bd9Sstevel@tonic-gate */
10337c478bd9Sstevel@tonic-gate for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
10347c478bd9Sstevel@tonic-gate if (!check_pii_crtt_improved(pi->pi_v4) ||
10357c478bd9Sstevel@tonic-gate !check_pii_crtt_improved(pi->pi_v6))
10367c478bd9Sstevel@tonic-gate return (_B_FALSE);
10377c478bd9Sstevel@tonic-gate }
10387c478bd9Sstevel@tonic-gate
10397c478bd9Sstevel@tonic-gate return (_B_TRUE);
10407c478bd9Sstevel@tonic-gate }
10417c478bd9Sstevel@tonic-gate
10427c478bd9Sstevel@tonic-gate /*
10437c478bd9Sstevel@tonic-gate * Check whether the crtt has improved substantially on this phyint_instance.
10447c478bd9Sstevel@tonic-gate * Returns _B_TRUE if there's no crtt information available, because pii
10457c478bd9Sstevel@tonic-gate * is NULL or the phyint_instance is not capable of probing.
10467c478bd9Sstevel@tonic-gate */
10477c478bd9Sstevel@tonic-gate boolean_t
check_pii_crtt_improved(struct phyint_instance * pii)10487c478bd9Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) {
10497c478bd9Sstevel@tonic-gate struct target *tg;
10507c478bd9Sstevel@tonic-gate
10517c478bd9Sstevel@tonic-gate if (pii == NULL)
10527c478bd9Sstevel@tonic-gate return (_B_TRUE);
10537c478bd9Sstevel@tonic-gate
10547c478bd9Sstevel@tonic-gate if (!PROBE_CAPABLE(pii) ||
10557c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_state == PI_FAILED)
10567c478bd9Sstevel@tonic-gate return (_B_TRUE);
10577c478bd9Sstevel@tonic-gate
10587c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
10597c478bd9Sstevel@tonic-gate if (tg->tg_status != TG_ACTIVE)
10607c478bd9Sstevel@tonic-gate continue;
10617c478bd9Sstevel@tonic-gate if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
10627c478bd9Sstevel@tonic-gate LOWER_FDT_TRIGGER)) {
10637c478bd9Sstevel@tonic-gate return (_B_FALSE);
10647c478bd9Sstevel@tonic-gate }
10657c478bd9Sstevel@tonic-gate }
10667c478bd9Sstevel@tonic-gate
10677c478bd9Sstevel@tonic-gate return (_B_TRUE);
10687c478bd9Sstevel@tonic-gate }
10697c478bd9Sstevel@tonic-gate
10707c478bd9Sstevel@tonic-gate /*
10717c478bd9Sstevel@tonic-gate * This target responds very slowly to probes. The target's crtt exceeds
10727c478bd9Sstevel@tonic-gate * the probe interval of its group. Compare against other targets
10737c478bd9Sstevel@tonic-gate * and determine if this target is an exception, if so return true, else false
10747c478bd9Sstevel@tonic-gate */
10757c478bd9Sstevel@tonic-gate static boolean_t
check_exception_target(struct phyint_instance * pii,struct target * target)10767c478bd9Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target)
10777c478bd9Sstevel@tonic-gate {
10787c478bd9Sstevel@tonic-gate struct target *tg;
10797c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN];
10807c478bd9Sstevel@tonic-gate
10817c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
10827c478bd9Sstevel@tonic-gate logdebug("check_exception_target(%s %s target %s)\n",
10837c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name,
10847c478bd9Sstevel@tonic-gate pr_addr(pii->pii_af, target->tg_address,
10857c478bd9Sstevel@tonic-gate abuf, sizeof (abuf)));
10867c478bd9Sstevel@tonic-gate }
10877c478bd9Sstevel@tonic-gate
10887c478bd9Sstevel@tonic-gate /*
10897c478bd9Sstevel@tonic-gate * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
10907c478bd9Sstevel@tonic-gate * to make a good judgement. Otherwise don't drop this target.
10917c478bd9Sstevel@tonic-gate */
10927c478bd9Sstevel@tonic-gate if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
10937c478bd9Sstevel@tonic-gate return (_B_FALSE);
10947c478bd9Sstevel@tonic-gate
10957c478bd9Sstevel@tonic-gate /*
10967c478bd9Sstevel@tonic-gate * Determine whether only this particular target is slow.
10977c478bd9Sstevel@tonic-gate * We know that this target's crtt exceeds the group's probe interval.
10987c478bd9Sstevel@tonic-gate * If all other active targets have a
10997c478bd9Sstevel@tonic-gate * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
11007c478bd9Sstevel@tonic-gate * then this target is considered slow.
11017c478bd9Sstevel@tonic-gate */
11027c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
11037c478bd9Sstevel@tonic-gate if (tg != target && tg->tg_status == TG_ACTIVE) {
11047c478bd9Sstevel@tonic-gate if (tg->tg_crtt >
11057c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint /
11067c478bd9Sstevel@tonic-gate EXCEPTION_FACTOR) {
11077c478bd9Sstevel@tonic-gate return (_B_FALSE);
11087c478bd9Sstevel@tonic-gate }
11097c478bd9Sstevel@tonic-gate }
11107c478bd9Sstevel@tonic-gate }
11117c478bd9Sstevel@tonic-gate
11127c478bd9Sstevel@tonic-gate return (_B_TRUE);
11137c478bd9Sstevel@tonic-gate }
11147c478bd9Sstevel@tonic-gate
11157c478bd9Sstevel@tonic-gate /*
11167c478bd9Sstevel@tonic-gate * Update the target list. The icmp all hosts multicast has given us
11177c478bd9Sstevel@tonic-gate * some host to which we can send probes. If we already have sufficient
11187c478bd9Sstevel@tonic-gate * targets, discard it.
11197c478bd9Sstevel@tonic-gate */
11207c478bd9Sstevel@tonic-gate static void
incoming_mcast_reply(struct phyint_instance * pii,struct pr_icmp * reply,struct in6_addr fromaddr)11217c478bd9Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
11227c478bd9Sstevel@tonic-gate struct in6_addr fromaddr)
11237c478bd9Sstevel@tonic-gate /* ARGSUSED */
11247c478bd9Sstevel@tonic-gate {
11257c478bd9Sstevel@tonic-gate int af;
11267c478bd9Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN];
11277c478bd9Sstevel@tonic-gate struct phyint *pi;
11287c478bd9Sstevel@tonic-gate
11297c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
11307c478bd9Sstevel@tonic-gate logdebug("incoming_mcast_reply(%s %s %s)\n",
11317c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name,
11327c478bd9Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
11337c478bd9Sstevel@tonic-gate }
11347c478bd9Sstevel@tonic-gate
11357c478bd9Sstevel@tonic-gate /*
11367c478bd9Sstevel@tonic-gate * Using host targets is a fallback mechanism. If we have
11377c478bd9Sstevel@tonic-gate * found a router, don't add this host target. If we already
11387c478bd9Sstevel@tonic-gate * know MAX_PROBE_TARGETS, don't add another target.
11397c478bd9Sstevel@tonic-gate */
11407c478bd9Sstevel@tonic-gate assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
11417c478bd9Sstevel@tonic-gate if (pii->pii_targets != NULL) {
11427c478bd9Sstevel@tonic-gate if (pii->pii_targets_are_routers ||
11437c478bd9Sstevel@tonic-gate (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
11447c478bd9Sstevel@tonic-gate return;
11457c478bd9Sstevel@tonic-gate }
11467c478bd9Sstevel@tonic-gate }
11477c478bd9Sstevel@tonic-gate
11487c478bd9Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
11497c478bd9Sstevel@tonic-gate IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
11507c478bd9Sstevel@tonic-gate /*
11517c478bd9Sstevel@tonic-gate * Guard against response from 0.0.0.0
11527c478bd9Sstevel@tonic-gate * and ::. Log a trace message
11537c478bd9Sstevel@tonic-gate */
11547c478bd9Sstevel@tonic-gate logtrace("probe response from %s on %s\n",
11557c478bd9Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
11567c478bd9Sstevel@tonic-gate pii->pii_name);
11577c478bd9Sstevel@tonic-gate return;
11587c478bd9Sstevel@tonic-gate }
11597c478bd9Sstevel@tonic-gate
11607c478bd9Sstevel@tonic-gate /*
11617c478bd9Sstevel@tonic-gate * This address is one of our own, so reject this address as a
11627c478bd9Sstevel@tonic-gate * valid probe target.
11637c478bd9Sstevel@tonic-gate */
11647c478bd9Sstevel@tonic-gate af = pii->pii_af;
116587e66ffcSrk129064 if (own_address(fromaddr))
11667c478bd9Sstevel@tonic-gate return;
11677c478bd9Sstevel@tonic-gate
11687c478bd9Sstevel@tonic-gate /*
11697c478bd9Sstevel@tonic-gate * If the phyint is part a named group, then add the address to all
11707c478bd9Sstevel@tonic-gate * members of the group. Otherwise, add the address only to the
11717c478bd9Sstevel@tonic-gate * phyint itself, since other phyints in the anongroup may not be on
11727c478bd9Sstevel@tonic-gate * the same subnet.
11737c478bd9Sstevel@tonic-gate */
11747c478bd9Sstevel@tonic-gate pi = pii->pii_phyint;
11757c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) {
11767c478bd9Sstevel@tonic-gate target_add(pii, fromaddr, _B_FALSE);
11777c478bd9Sstevel@tonic-gate } else {
11787c478bd9Sstevel@tonic-gate pi = pi->pi_group->pg_phyint;
11797c478bd9Sstevel@tonic-gate for (; pi != NULL; pi = pi->pi_pgnext)
11807c478bd9Sstevel@tonic-gate target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
11817c478bd9Sstevel@tonic-gate }
11827c478bd9Sstevel@tonic-gate }
11837c478bd9Sstevel@tonic-gate
11847c478bd9Sstevel@tonic-gate /*
11857c478bd9Sstevel@tonic-gate * Compute CRTT given an existing scaled average, scaled deviation estimate
11867c478bd9Sstevel@tonic-gate * and a new rtt time. The formula is from Jacobson and Karels'
11877c478bd9Sstevel@tonic-gate * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
11887c478bd9Sstevel@tonic-gate * are the same as those in Appendix A.2 of that paper.
11897c478bd9Sstevel@tonic-gate *
11907c478bd9Sstevel@tonic-gate * m = new measurement
11917c478bd9Sstevel@tonic-gate * sa = scaled RTT average (8 * average estimates)
11927c478bd9Sstevel@tonic-gate * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
11937c478bd9Sstevel@tonic-gate * crtt = Conservative round trip time. Used to determine whether probe
11947c478bd9Sstevel@tonic-gate * has timed out.
11957c478bd9Sstevel@tonic-gate *
11967c478bd9Sstevel@tonic-gate * New scaled average and deviation are passed back via sap and svp
11977c478bd9Sstevel@tonic-gate */
1198e11c3f44Smeem static int64_t
compute_crtt(int64_t * sap,int64_t * svp,int64_t m)1199e11c3f44Smeem compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
12007c478bd9Sstevel@tonic-gate {
1201e11c3f44Smeem int64_t sa = *sap;
1202e11c3f44Smeem int64_t sv = *svp;
1203e11c3f44Smeem int64_t crtt;
1204e11c3f44Smeem int64_t saved_m = m;
12057c478bd9Sstevel@tonic-gate
12067c478bd9Sstevel@tonic-gate assert(*sap >= -1);
12077c478bd9Sstevel@tonic-gate assert(*svp >= 0);
12087c478bd9Sstevel@tonic-gate
12097c478bd9Sstevel@tonic-gate if (sa != -1) {
12107c478bd9Sstevel@tonic-gate /*
12117c478bd9Sstevel@tonic-gate * Update average estimator:
12127c478bd9Sstevel@tonic-gate * new rtt = old rtt + 1/8 Error
12137c478bd9Sstevel@tonic-gate * where Error = m - old rtt
12147c478bd9Sstevel@tonic-gate * i.e. 8 * new rtt = 8 * old rtt + Error
12157c478bd9Sstevel@tonic-gate * i.e. new sa = old sa + Error
12167c478bd9Sstevel@tonic-gate */
12177c478bd9Sstevel@tonic-gate m -= sa >> 3; /* m is now Error in estimate. */
12187c478bd9Sstevel@tonic-gate if ((sa += m) < 0) {
12197c478bd9Sstevel@tonic-gate /* Don't allow the smoothed average to be negative. */
12207c478bd9Sstevel@tonic-gate sa = 0;
12217c478bd9Sstevel@tonic-gate }
12227c478bd9Sstevel@tonic-gate
12237c478bd9Sstevel@tonic-gate /*
12247c478bd9Sstevel@tonic-gate * Update deviation estimator:
12257c478bd9Sstevel@tonic-gate * new mdev = old mdev + 1/4 (abs(Error) - old mdev)
12267c478bd9Sstevel@tonic-gate * i.e. 4 * new mdev = 4 * old mdev +
12277c478bd9Sstevel@tonic-gate * (abs(Error) - old mdev)
12287c478bd9Sstevel@tonic-gate * i.e. new sv = old sv + (abs(Error) - old mdev)
12297c478bd9Sstevel@tonic-gate */
12307c478bd9Sstevel@tonic-gate if (m < 0)
12317c478bd9Sstevel@tonic-gate m = -m;
12327c478bd9Sstevel@tonic-gate m -= sv >> 2;
12337c478bd9Sstevel@tonic-gate sv += m;
12347c478bd9Sstevel@tonic-gate } else {
12357c478bd9Sstevel@tonic-gate /* Initialization. This is the first response received. */
12367c478bd9Sstevel@tonic-gate sa = (m << 3);
12377c478bd9Sstevel@tonic-gate sv = (m << 1);
12387c478bd9Sstevel@tonic-gate }
12397c478bd9Sstevel@tonic-gate
12407c478bd9Sstevel@tonic-gate crtt = (sa >> 3) + sv;
12417c478bd9Sstevel@tonic-gate
12427c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
1243e11c3f44Smeem logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1244e11c3f44Smeem "crtt = %lld\n", saved_m, sa, sv, crtt);
12457c478bd9Sstevel@tonic-gate }
12467c478bd9Sstevel@tonic-gate
12477c478bd9Sstevel@tonic-gate *sap = sa;
12487c478bd9Sstevel@tonic-gate *svp = sv;
12497c478bd9Sstevel@tonic-gate
12507c478bd9Sstevel@tonic-gate /*
12517c478bd9Sstevel@tonic-gate * CRTT = average estimates + 4 * deviation estimates
12527c478bd9Sstevel@tonic-gate * = sa / 8 + sv
12537c478bd9Sstevel@tonic-gate */
12547c478bd9Sstevel@tonic-gate return (crtt);
12557c478bd9Sstevel@tonic-gate }
12567c478bd9Sstevel@tonic-gate
12577c478bd9Sstevel@tonic-gate static void
pi_set_crtt(struct target * tg,int64_t m,boolean_t is_probe_uni)1258e11c3f44Smeem pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
12597c478bd9Sstevel@tonic-gate {
12607c478bd9Sstevel@tonic-gate struct phyint_instance *pii = tg->tg_phyint_inst;
12617c478bd9Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1262e11c3f44Smeem int64_t sa = tg->tg_rtt_sa;
1263e11c3f44Smeem int64_t sv = tg->tg_rtt_sd;
12647c478bd9Sstevel@tonic-gate int new_crtt;
12657c478bd9Sstevel@tonic-gate int i;
12667c478bd9Sstevel@tonic-gate
12677c478bd9Sstevel@tonic-gate if (debug & D_PROBE)
1268e11c3f44Smeem logdebug("pi_set_crtt: target - m %lld\n", m);
12697c478bd9Sstevel@tonic-gate
12707c478bd9Sstevel@tonic-gate /* store the round trip time, in case we need to defer computation */
12717c478bd9Sstevel@tonic-gate tg->tg_deferred[tg->tg_num_deferred] = m;
12727c478bd9Sstevel@tonic-gate
1273e11c3f44Smeem new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
12747c478bd9Sstevel@tonic-gate
12757c478bd9Sstevel@tonic-gate /*
12767c478bd9Sstevel@tonic-gate * If this probe's round trip time would singlehandedly cause an
12777c478bd9Sstevel@tonic-gate * increase in the group's probe interval consider it suspect.
12787c478bd9Sstevel@tonic-gate */
12797c478bd9Sstevel@tonic-gate if ((new_crtt > probe_interval) && is_probe_uni) {
12807c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
12817c478bd9Sstevel@tonic-gate logdebug("Received a suspect probe on %s, new_crtt ="
12827c478bd9Sstevel@tonic-gate " %d, probe_interval = %d, num_deferred = %d\n",
12837c478bd9Sstevel@tonic-gate pii->pii_probe_logint->li_name, new_crtt,
12847c478bd9Sstevel@tonic-gate probe_interval, tg->tg_num_deferred);
12857c478bd9Sstevel@tonic-gate }
12867c478bd9Sstevel@tonic-gate
12877c478bd9Sstevel@tonic-gate /*
12887c478bd9Sstevel@tonic-gate * If we've deferred as many rtts as we plan on deferring, then
12897c478bd9Sstevel@tonic-gate * assume the link really did slow down and process all queued
12907c478bd9Sstevel@tonic-gate * rtts
12917c478bd9Sstevel@tonic-gate */
12927c478bd9Sstevel@tonic-gate if (tg->tg_num_deferred == MAXDEFERREDRTT) {
12937c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
12947c478bd9Sstevel@tonic-gate logdebug("Received MAXDEFERREDRTT probes which "
12957c478bd9Sstevel@tonic-gate "would cause an increased probe_interval. "
12967c478bd9Sstevel@tonic-gate "Integrating queued rtt data points.\n");
12977c478bd9Sstevel@tonic-gate }
12987c478bd9Sstevel@tonic-gate
12997c478bd9Sstevel@tonic-gate for (i = 0; i <= tg->tg_num_deferred; i++) {
1300e11c3f44Smeem tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
1301e11c3f44Smeem &tg->tg_rtt_sd, tg->tg_deferred[i]));
13027c478bd9Sstevel@tonic-gate }
13037c478bd9Sstevel@tonic-gate
13047c478bd9Sstevel@tonic-gate tg->tg_num_deferred = 0;
13057c478bd9Sstevel@tonic-gate } else {
13067c478bd9Sstevel@tonic-gate tg->tg_num_deferred++;
13077c478bd9Sstevel@tonic-gate }
13087c478bd9Sstevel@tonic-gate return;
13097c478bd9Sstevel@tonic-gate }
13107c478bd9Sstevel@tonic-gate
13117c478bd9Sstevel@tonic-gate /*
13127c478bd9Sstevel@tonic-gate * If this is a normal probe, or an RTT probe that would lead to a
13137c478bd9Sstevel@tonic-gate * reduced CRTT, then update our CRTT data. Further, if this was
13147c478bd9Sstevel@tonic-gate * a normal probe, pitch any deferred probes since our probes are
13157c478bd9Sstevel@tonic-gate * again being answered within our CRTT estimates.
13167c478bd9Sstevel@tonic-gate */
13177c478bd9Sstevel@tonic-gate if (is_probe_uni || new_crtt < tg->tg_crtt) {
13187c478bd9Sstevel@tonic-gate tg->tg_rtt_sa = sa;
13197c478bd9Sstevel@tonic-gate tg->tg_rtt_sd = sv;
13207c478bd9Sstevel@tonic-gate tg->tg_crtt = new_crtt;
13217c478bd9Sstevel@tonic-gate if (is_probe_uni)
13227c478bd9Sstevel@tonic-gate tg->tg_num_deferred = 0;
13237c478bd9Sstevel@tonic-gate }
13247c478bd9Sstevel@tonic-gate }
13257c478bd9Sstevel@tonic-gate
/*
 * Walk the ancillary data (control messages) attached to `msg' and return
 * a pointer to the data of the first one whose level and type equal
 * `cmsg_level' and `cmsg_type'. Returns NULL if there is no such control
 * message.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == cmsg_level &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));

		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
13447c478bd9Sstevel@tonic-gate
13457c478bd9Sstevel@tonic-gate /*
1346e11c3f44Smeem * Try to activate another INACTIVE interface in the same group as `pi'.
1347e11c3f44Smeem * Prefer STANDBY INACTIVE to just INACTIVE.
1348e11c3f44Smeem */
1349e11c3f44Smeem void
phyint_activate_another(struct phyint * pi)1350e11c3f44Smeem phyint_activate_another(struct phyint *pi)
1351e11c3f44Smeem {
1352e11c3f44Smeem struct phyint *pi2;
1353e11c3f44Smeem struct phyint *inactivepi = NULL;
1354e11c3f44Smeem
1355e11c3f44Smeem if (pi->pi_group == phyint_anongroup)
1356e11c3f44Smeem return;
1357e11c3f44Smeem
1358e11c3f44Smeem for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
13599bea6098Smeem if (pi == pi2 || !phyint_is_functioning(pi2) ||
1360e11c3f44Smeem !(pi2->pi_flags & IFF_INACTIVE))
1361e11c3f44Smeem continue;
1362e11c3f44Smeem
1363e11c3f44Smeem inactivepi = pi2;
1364e11c3f44Smeem if (pi2->pi_flags & IFF_STANDBY)
1365e11c3f44Smeem break;
1366e11c3f44Smeem }
1367e11c3f44Smeem
1368e11c3f44Smeem if (inactivepi != NULL)
1369e11c3f44Smeem (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
1370e11c3f44Smeem }
1371e11c3f44Smeem
1372e11c3f44Smeem /*
1373fcdc8680Smeem * Transition a phyint to PI_RUNNING. The caller must ensure that the
1374fcdc8680Smeem * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if
1375fcdc8680Smeem * appropriate. Also sets IFF_INACTIVE on this or other interfaces as
1376fcdc8680Smeem * appropriate (see comment below). Finally, also updates the phyint's group
1377fcdc8680Smeem * state to account for the change.
1378e11c3f44Smeem */
1379e11c3f44Smeem void
phyint_transition_to_running(struct phyint * pi)1380e11c3f44Smeem phyint_transition_to_running(struct phyint *pi)
1381e11c3f44Smeem {
1382e11c3f44Smeem struct phyint *pi2;
1383e11c3f44Smeem struct phyint *actstandbypi = NULL;
1384e11c3f44Smeem uint_t nactive = 0, nnonstandby = 0;
1385e11c3f44Smeem boolean_t onlining = (pi->pi_state == PI_OFFLINE);
1386fcdc8680Smeem boolean_t initial = (pi->pi_state == PI_INIT);
1387e11c3f44Smeem uint64_t set, clear;
1388e11c3f44Smeem
1389e11c3f44Smeem /*
1390e11c3f44Smeem * The interface is running again, but should it or another interface
1391e11c3f44Smeem * in the group end up INACTIVE? There are three cases:
1392e11c3f44Smeem *
1393e11c3f44Smeem * 1. If it's a STANDBY interface, it should be end up INACTIVE if
1394e11c3f44Smeem * the group is operating at capacity (i.e., there are at least as
1395e11c3f44Smeem * many active interfaces as non-STANDBY interfaces in the group).
1396e11c3f44Smeem * No other interfaces should be changed.
1397e11c3f44Smeem *
1398e11c3f44Smeem * 2. If it's a non-STANDBY interface and we're onlining it or
1399e11c3f44Smeem * FAILBACK is enabled, then it should *not* end up INACTIVE.
1400e11c3f44Smeem * Further, if the group is above capacity as a result of this
1401e11c3f44Smeem * interface, then an active STANDBY interface in the group should
1402e11c3f44Smeem * end up INACTIVE.
1403e11c3f44Smeem *
1404e11c3f44Smeem * 3. If it's a non-STANDBY interface, we're repairing it, and
1405e11c3f44Smeem * FAILBACK is disabled, then it should end up INACTIVE *unless*
1406e11c3f44Smeem * the group was failed (in which case we have no choice but to
1407e11c3f44Smeem * use it). No other interfaces should be changed.
1408e11c3f44Smeem */
1409e11c3f44Smeem if (pi->pi_group != phyint_anongroup) {
1410e11c3f44Smeem pi2 = pi->pi_group->pg_phyint;
1411e11c3f44Smeem for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1412e11c3f44Smeem if (!(pi2->pi_flags & IFF_STANDBY))
1413e11c3f44Smeem nnonstandby++;
1414e11c3f44Smeem
14159bea6098Smeem if (phyint_is_functioning(pi2) &&
14169bea6098Smeem !(pi2->pi_flags & IFF_INACTIVE)) {
1417e11c3f44Smeem nactive++;
1418e11c3f44Smeem if (pi2->pi_flags & IFF_STANDBY)
1419e11c3f44Smeem actstandbypi = pi2;
1420e11c3f44Smeem }
1421e11c3f44Smeem }
1422e11c3f44Smeem }
1423e11c3f44Smeem
1424e11c3f44Smeem set = 0;
1425e11c3f44Smeem clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
1426e11c3f44Smeem
1427e11c3f44Smeem if (pi->pi_flags & IFF_STANDBY) { /* case 1 */
1428e11c3f44Smeem if (nactive >= nnonstandby)
1429e11c3f44Smeem set |= IFF_INACTIVE;
1430e11c3f44Smeem else
1431e11c3f44Smeem clear |= IFF_INACTIVE;
1432e11c3f44Smeem } else if (onlining || failback_enabled) { /* case 2 */
1433e11c3f44Smeem if (nactive >= nnonstandby && actstandbypi != NULL)
1434e11c3f44Smeem (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
1435fcdc8680Smeem } else if (!initial && !GROUP_FAILED(pi->pi_group)) { /* case 3 */
1436e11c3f44Smeem set |= IFF_INACTIVE;
1437e11c3f44Smeem }
1438e11c3f44Smeem (void) change_pif_flags(pi, set, clear);
1439e11c3f44Smeem
1440e11c3f44Smeem phyint_chstate(pi, PI_RUNNING);
1441e11c3f44Smeem
1442e11c3f44Smeem /*
1443e11c3f44Smeem * Update the group state to account for the change.
1444e11c3f44Smeem */
1445e11c3f44Smeem phyint_group_refresh_state(pi->pi_group);
1446e11c3f44Smeem }
1447e11c3f44Smeem
1448e11c3f44Smeem /*
14499bea6098Smeem * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
14509bea6098Smeem * to have at least one active interface and as many active interfaces as
14519bea6098Smeem * non-standby interfaces.
14529bea6098Smeem */
14539bea6098Smeem void
phyint_standby_refresh_inactive(struct phyint * pi)14549bea6098Smeem phyint_standby_refresh_inactive(struct phyint *pi)
14559bea6098Smeem {
14569bea6098Smeem struct phyint *pi2;
14579bea6098Smeem uint_t nactive = 0, nnonstandby = 0;
14589bea6098Smeem
14599bea6098Smeem /*
14609bea6098Smeem * All phyints in the anonymous group are effectively in their own
14619bea6098Smeem * group and thus active regardless of whether they're marked standby.
14629bea6098Smeem */
14639bea6098Smeem if (pi->pi_group == phyint_anongroup) {
14649bea6098Smeem (void) change_pif_flags(pi, 0, IFF_INACTIVE);
14659bea6098Smeem return;
14669bea6098Smeem }
14679bea6098Smeem
14689bea6098Smeem /*
14699bea6098Smeem * If the phyint isn't functioning we can't consider it.
14709bea6098Smeem */
14719bea6098Smeem if (!phyint_is_functioning(pi))
14729bea6098Smeem return;
14739bea6098Smeem
14749bea6098Smeem for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
14759bea6098Smeem if (!(pi2->pi_flags & IFF_STANDBY))
14769bea6098Smeem nnonstandby++;
14779bea6098Smeem
14789bea6098Smeem if (phyint_is_functioning(pi2) &&
14799bea6098Smeem !(pi2->pi_flags & IFF_INACTIVE))
14809bea6098Smeem nactive++;
14819bea6098Smeem }
14829bea6098Smeem
14839bea6098Smeem if (nactive == 0 || nactive < nnonstandby)
14849bea6098Smeem (void) change_pif_flags(pi, 0, IFF_INACTIVE);
14859bea6098Smeem else if (nactive > nnonstandby)
14869bea6098Smeem (void) change_pif_flags(pi, IFF_INACTIVE, 0);
14879bea6098Smeem }
14889bea6098Smeem
14899bea6098Smeem /*
14907c478bd9Sstevel@tonic-gate * See if a previously failed interface has started working again.
14917c478bd9Sstevel@tonic-gate */
14927c478bd9Sstevel@tonic-gate void
phyint_check_for_repair(struct phyint * pi)14937c478bd9Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi)
14947c478bd9Sstevel@tonic-gate {
1495e11c3f44Smeem if (!phyint_repaired(pi))
1496e11c3f44Smeem return;
1497e11c3f44Smeem
14987c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) {
1499e11c3f44Smeem logerr("IP interface repair detected on %s\n", pi->pi_name);
15007c478bd9Sstevel@tonic-gate } else {
1501e11c3f44Smeem logerr("IP interface repair detected on %s of group %s\n",
15027c478bd9Sstevel@tonic-gate pi->pi_name, pi->pi_group->pg_name);
15037c478bd9Sstevel@tonic-gate }
15047c478bd9Sstevel@tonic-gate
15057c478bd9Sstevel@tonic-gate /*
1506e11c3f44Smeem * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1507e11c3f44Smeem * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1508e11c3f44Smeem * until it is brought back online.
15097c478bd9Sstevel@tonic-gate */
15107c478bd9Sstevel@tonic-gate if (pi->pi_state == PI_OFFLINE) {
1511e11c3f44Smeem (void) change_pif_flags(pi, 0, IFF_FAILED);
15127c478bd9Sstevel@tonic-gate return;
15137c478bd9Sstevel@tonic-gate }
15147c478bd9Sstevel@tonic-gate
1515e11c3f44Smeem phyint_transition_to_running(pi); /* calls phyint_chstate() */
15167c478bd9Sstevel@tonic-gate }
15177c478bd9Sstevel@tonic-gate
15187c478bd9Sstevel@tonic-gate /*
1519e11c3f44Smeem * See if an interface has failed, or if the whole group of interfaces has
1520e11c3f44Smeem * failed.
15217c478bd9Sstevel@tonic-gate */
15227c478bd9Sstevel@tonic-gate static void
phyint_inst_check_for_failure(struct phyint_instance * pii)15237c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii)
15247c478bd9Sstevel@tonic-gate {
1525e11c3f44Smeem struct phyint *pi = pii->pii_phyint;
15267c478bd9Sstevel@tonic-gate struct phyint *pi2;
1527e11c3f44Smeem boolean_t was_active;
15287c478bd9Sstevel@tonic-gate
15297c478bd9Sstevel@tonic-gate switch (failure_state(pii)) {
15307c478bd9Sstevel@tonic-gate case PHYINT_FAILURE:
1531e11c3f44Smeem was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1532e11c3f44Smeem
1533e11c3f44Smeem (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
15347c478bd9Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) {
1535e11c3f44Smeem logerr("IP interface failure detected on %s\n",
1536e11c3f44Smeem pii->pii_name);
15377c478bd9Sstevel@tonic-gate } else {
1538e11c3f44Smeem logerr("IP interface failure detected on %s of group"
1539e11c3f44Smeem " %s\n", pii->pii_name, pi->pi_group->pg_name);
15407c478bd9Sstevel@tonic-gate }
1541e11c3f44Smeem
15427c478bd9Sstevel@tonic-gate /*
1543c445e3e1Smeem * If the failed interface was active, activate another
1544c445e3e1Smeem * INACTIVE interface in the group if possible.
1545e11c3f44Smeem */
1546e11c3f44Smeem if (was_active)
1547e11c3f44Smeem phyint_activate_another(pi);
1548e11c3f44Smeem
1549c445e3e1Smeem /*
1550c445e3e1Smeem * If the interface is offline, the state change will be
1551c445e3e1Smeem * noted when it comes back online.
1552c445e3e1Smeem */
1553c445e3e1Smeem if (pi->pi_state != PI_OFFLINE) {
15547c478bd9Sstevel@tonic-gate phyint_chstate(pi, PI_FAILED);
15557c478bd9Sstevel@tonic-gate reset_crtt_all(pi);
15567c478bd9Sstevel@tonic-gate }
15577c478bd9Sstevel@tonic-gate break;
15587c478bd9Sstevel@tonic-gate
15597c478bd9Sstevel@tonic-gate case GROUP_FAILURE:
1560e11c3f44Smeem pi2 = pi->pi_group->pg_phyint;
1561e11c3f44Smeem for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1562e11c3f44Smeem (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
1563e11c3f44Smeem if (pi2->pi_state == PI_OFFLINE) /* see comment above */
15647c478bd9Sstevel@tonic-gate continue;
15657c478bd9Sstevel@tonic-gate
1566e11c3f44Smeem reset_crtt_all(pi2);
15677c478bd9Sstevel@tonic-gate /*
1568e11c3f44Smeem * In the case of host targets, we would have flushed
1569e11c3f44Smeem * the targets, and gone to PI_NOTARGETS state.
15707c478bd9Sstevel@tonic-gate */
15717c478bd9Sstevel@tonic-gate if (pi2->pi_state == PI_RUNNING)
157249df4566Sethindra phyint_chstate(pi2, PI_FAILED);
15737c478bd9Sstevel@tonic-gate }
15747c478bd9Sstevel@tonic-gate break;
15757c478bd9Sstevel@tonic-gate
15767c478bd9Sstevel@tonic-gate default:
15777c478bd9Sstevel@tonic-gate break;
15787c478bd9Sstevel@tonic-gate }
15797c478bd9Sstevel@tonic-gate }
15807c478bd9Sstevel@tonic-gate
15817c478bd9Sstevel@tonic-gate /*
15827c478bd9Sstevel@tonic-gate * Determines if any timeout event has occurred and returns the number of
15837c478bd9Sstevel@tonic-gate * milliseconds until the next timeout event for the phyint. Returns
15847c478bd9Sstevel@tonic-gate * TIMER_INFINITY for "never".
15857c478bd9Sstevel@tonic-gate */
15867c478bd9Sstevel@tonic-gate uint_t
phyint_inst_timer(struct phyint_instance * pii)15877c478bd9Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii)
15887c478bd9Sstevel@tonic-gate {
15897c478bd9Sstevel@tonic-gate int pr_ndx;
15907c478bd9Sstevel@tonic-gate uint_t timeout;
15917c478bd9Sstevel@tonic-gate struct target *cur_tg;
15927c478bd9Sstevel@tonic-gate struct probe_stats *pr_statp;
15937c478bd9Sstevel@tonic-gate struct phyint_instance *pii_other;
15947c478bd9Sstevel@tonic-gate struct phyint *pi;
15957c478bd9Sstevel@tonic-gate int valid_unack_count;
15967c478bd9Sstevel@tonic-gate int i;
15977c478bd9Sstevel@tonic-gate int interval;
15987c478bd9Sstevel@tonic-gate uint_t check_time;
15997c478bd9Sstevel@tonic-gate uint_t cur_time;
16007c478bd9Sstevel@tonic-gate hrtime_t cur_hrtime;
16017c478bd9Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
16027c478bd9Sstevel@tonic-gate
1603e11c3f44Smeem cur_hrtime = gethrtime();
1604e11c3f44Smeem cur_time = ns2ms(cur_hrtime);
16057c478bd9Sstevel@tonic-gate
16067c478bd9Sstevel@tonic-gate if (debug & D_TIMER) {
16077c478bd9Sstevel@tonic-gate logdebug("phyint_inst_timer(%s %s)\n",
16087c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name);
16097c478bd9Sstevel@tonic-gate }
16107c478bd9Sstevel@tonic-gate
16117c478bd9Sstevel@tonic-gate pii_other = phyint_inst_other(pii);
16127c478bd9Sstevel@tonic-gate if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
16137c478bd9Sstevel@tonic-gate /*
16147c478bd9Sstevel@tonic-gate * Check to see if we're here due to link up/down flapping; If
16157c478bd9Sstevel@tonic-gate * enough time has passed, then try to bring the interface
16167c478bd9Sstevel@tonic-gate * back up; otherwise, schedule a timer to bring it back up
16177c478bd9Sstevel@tonic-gate * when enough time *has* elapsed.
16187c478bd9Sstevel@tonic-gate */
16197c478bd9Sstevel@tonic-gate pi = pii->pii_phyint;
16207c478bd9Sstevel@tonic-gate if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
16217c478bd9Sstevel@tonic-gate check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
16227c478bd9Sstevel@tonic-gate if (check_time > cur_time)
16237c478bd9Sstevel@tonic-gate return (check_time - cur_time);
16247c478bd9Sstevel@tonic-gate
16257c478bd9Sstevel@tonic-gate phyint_check_for_repair(pi);
16267c478bd9Sstevel@tonic-gate }
16277c478bd9Sstevel@tonic-gate }
16287c478bd9Sstevel@tonic-gate
16297c478bd9Sstevel@tonic-gate /*
163006cdd167Smeem * If probing is not enabled on this phyint instance, don't proceed.
16317c478bd9Sstevel@tonic-gate */
163206cdd167Smeem if (!PROBE_ENABLED(pii))
16337c478bd9Sstevel@tonic-gate return (TIMER_INFINITY);
16347c478bd9Sstevel@tonic-gate
16357c478bd9Sstevel@tonic-gate /*
16367c478bd9Sstevel@tonic-gate * If the timer has fired too soon, probably triggered
16377c478bd9Sstevel@tonic-gate * by some other phyint instance, return the remaining
16387c478bd9Sstevel@tonic-gate * time
16397c478bd9Sstevel@tonic-gate */
16407c478bd9Sstevel@tonic-gate if (TIME_LT(cur_time, pii->pii_snxt_time))
16417c478bd9Sstevel@tonic-gate return (pii->pii_snxt_time - cur_time);
16427c478bd9Sstevel@tonic-gate
16437c478bd9Sstevel@tonic-gate /*
16447c478bd9Sstevel@tonic-gate * If the link is down, don't send any probes for now.
16457c478bd9Sstevel@tonic-gate */
16467c478bd9Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint))
16477c478bd9Sstevel@tonic-gate return (TIMER_INFINITY);
16487c478bd9Sstevel@tonic-gate
16497c478bd9Sstevel@tonic-gate /*
16507c478bd9Sstevel@tonic-gate * Randomize the next probe time, between MIN_RANDOM_FACTOR
16517c478bd9Sstevel@tonic-gate * and MAX_RANDOM_FACTOR with respect to the base probe time.
16527c478bd9Sstevel@tonic-gate * Base probe time is strictly periodic.
16537c478bd9Sstevel@tonic-gate */
16547c478bd9Sstevel@tonic-gate interval = GET_RANDOM(
16557c478bd9Sstevel@tonic-gate (int)(MIN_RANDOM_FACTOR * user_probe_interval),
16567c478bd9Sstevel@tonic-gate (int)(MAX_RANDOM_FACTOR * user_probe_interval));
16577c478bd9Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
16587c478bd9Sstevel@tonic-gate
16597c478bd9Sstevel@tonic-gate /*
16607c478bd9Sstevel@tonic-gate * Check if the current time > next time to probe. If so, we missed
16617c478bd9Sstevel@tonic-gate * sending 1 or more probes, probably due to heavy system load. At least
16627c478bd9Sstevel@tonic-gate * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
16637c478bd9Sstevel@tonic-gate * were scheduled. Make adjustments to the times, in multiples of
16647c478bd9Sstevel@tonic-gate * user_probe_interval.
16657c478bd9Sstevel@tonic-gate */
16667c478bd9Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_snxt_time)) {
16677c478bd9Sstevel@tonic-gate int n;
16687c478bd9Sstevel@tonic-gate
16697c478bd9Sstevel@tonic-gate n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
16707c478bd9Sstevel@tonic-gate pii->pii_snxt_time += (n + 1) * user_probe_interval;
16717c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime += (n + 1) * user_probe_interval;
16727c478bd9Sstevel@tonic-gate logtrace("missed sending %d probes cur_time %u snxt_time %u"
16737c478bd9Sstevel@tonic-gate " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
16747c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime);
16757c478bd9Sstevel@tonic-gate
16767c478bd9Sstevel@tonic-gate /* Collect statistics about missed probes */
16777c478bd9Sstevel@tonic-gate probes_missed.pm_nprobes += n + 1;
16787c478bd9Sstevel@tonic-gate probes_missed.pm_ntimes++;
16797c478bd9Sstevel@tonic-gate }
16807c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime += user_probe_interval;
16817c478bd9Sstevel@tonic-gate interval = pii->pii_snxt_time - cur_time;
16827c478bd9Sstevel@tonic-gate if (debug & D_TARGET) {
16837c478bd9Sstevel@tonic-gate logdebug("cur_time %u snxt_time %u snxt_basetime %u"
16847c478bd9Sstevel@tonic-gate " interval %u\n", cur_time, pii->pii_snxt_time,
16857c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime, interval);
16867c478bd9Sstevel@tonic-gate }
16877c478bd9Sstevel@tonic-gate
16887c478bd9Sstevel@tonic-gate /*
16897c478bd9Sstevel@tonic-gate * If no targets are known, we need to send an ICMP multicast. The
16907c478bd9Sstevel@tonic-gate * probe type is PROBE_MULTI. We'll check back in 'interval' msec
16917c478bd9Sstevel@tonic-gate * to see if we found a target.
16927c478bd9Sstevel@tonic-gate */
16937c478bd9Sstevel@tonic-gate if (pii->pii_target_next == NULL) {
16947c478bd9Sstevel@tonic-gate assert(pii->pii_ntargets == 0);
16957c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
16967c478bd9Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time);
16977c478bd9Sstevel@tonic-gate return (interval);
16987c478bd9Sstevel@tonic-gate }
16997c478bd9Sstevel@tonic-gate
17007c478bd9Sstevel@tonic-gate if ((user_probe_interval != probe_interval) &&
17017c478bd9Sstevel@tonic-gate TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
17027c478bd9Sstevel@tonic-gate /*
17037c478bd9Sstevel@tonic-gate * the failure detection (fd) probe timer has not yet fired.
17047c478bd9Sstevel@tonic-gate * Need to send only an rtt probe. The probe type is PROBE_RTT.
17057c478bd9Sstevel@tonic-gate */
1706e11c3f44Smeem probe(pii, PROBE_RTT, cur_hrtime);
17077c478bd9Sstevel@tonic-gate return (interval);
17087c478bd9Sstevel@tonic-gate }
17097c478bd9Sstevel@tonic-gate /*
17107c478bd9Sstevel@tonic-gate * the fd probe timer has fired. Need to do all failure
17117c478bd9Sstevel@tonic-gate * detection / recovery calculations, and then send an fd probe
17127c478bd9Sstevel@tonic-gate * of type PROBE_UNI.
17137c478bd9Sstevel@tonic-gate */
17147c478bd9Sstevel@tonic-gate if (user_probe_interval == probe_interval) {
17157c478bd9Sstevel@tonic-gate /*
17167c478bd9Sstevel@tonic-gate * We could have missed some probes, and then adjusted
17177c478bd9Sstevel@tonic-gate * pii_snxt_basetime above. Otherwise we could have
17187c478bd9Sstevel@tonic-gate * blindly added probe_interval to pii_fd_snxt_basetime.
17197c478bd9Sstevel@tonic-gate */
17207c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
17217c478bd9Sstevel@tonic-gate } else {
17227c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime += probe_interval;
17237c478bd9Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
17247c478bd9Sstevel@tonic-gate int n;
17257c478bd9Sstevel@tonic-gate
17267c478bd9Sstevel@tonic-gate n = (cur_time - pii->pii_fd_snxt_basetime) /
17277c478bd9Sstevel@tonic-gate probe_interval;
17287c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
17297c478bd9Sstevel@tonic-gate }
17307c478bd9Sstevel@tonic-gate }
17317c478bd9Sstevel@tonic-gate
17327c478bd9Sstevel@tonic-gate /*
17337c478bd9Sstevel@tonic-gate * We can have at most, the latest 2 probes that we sent, in
17347c478bd9Sstevel@tonic-gate * the PR_UNACKED state. All previous probes sent, are either
17357c478bd9Sstevel@tonic-gate * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1736e11c3f44Smeem * timed out if the probe's time_start + the CRTT < currenttime.
17377c478bd9Sstevel@tonic-gate * For each of the last 2 probes, examine whether it has timed
17387c478bd9Sstevel@tonic-gate * out. If so, mark it PR_LOST. The probe stats is a circular array.
17397c478bd9Sstevel@tonic-gate */
17407c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
17417c478bd9Sstevel@tonic-gate valid_unack_count = 0;
17427c478bd9Sstevel@tonic-gate
17437c478bd9Sstevel@tonic-gate for (i = 0; i < 2; i++) {
17447c478bd9Sstevel@tonic-gate pr_statp = &pii->pii_probes[pr_ndx];
17457c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target;
17467c478bd9Sstevel@tonic-gate switch (pr_statp->pr_status) {
17477c478bd9Sstevel@tonic-gate case PR_ACKED:
17487c478bd9Sstevel@tonic-gate /*
17497c478bd9Sstevel@tonic-gate * We received back an ACK, so the switch clearly
17507c478bd9Sstevel@tonic-gate * is not dropping our traffic, and thus we can
17517c478bd9Sstevel@tonic-gate * enable failure detection immediately.
17527c478bd9Sstevel@tonic-gate */
17537c478bd9Sstevel@tonic-gate if (pii->pii_fd_hrtime > gethrtime()) {
17547c478bd9Sstevel@tonic-gate if (debug & D_PROBE) {
17557c478bd9Sstevel@tonic-gate logdebug("successful probe on %s; "
17567c478bd9Sstevel@tonic-gate "ending quiet period\n",
17577c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_name);
17587c478bd9Sstevel@tonic-gate }
17597c478bd9Sstevel@tonic-gate pii->pii_fd_hrtime = gethrtime();
17607c478bd9Sstevel@tonic-gate }
17617c478bd9Sstevel@tonic-gate break;
17627c478bd9Sstevel@tonic-gate
17637c478bd9Sstevel@tonic-gate case PR_UNACKED:
17647c478bd9Sstevel@tonic-gate assert(cur_tg != NULL);
17657c478bd9Sstevel@tonic-gate /*
17667c478bd9Sstevel@tonic-gate * The crtt could be zero for some reason,
17677c478bd9Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is
17687c478bd9Sstevel@tonic-gate * not available use group's probe interval,
17697c478bd9Sstevel@tonic-gate * which is a worst case estimate.
17707c478bd9Sstevel@tonic-gate */
1771e11c3f44Smeem timeout = ns2ms(pr_statp->pr_hrtime_start);
17727c478bd9Sstevel@tonic-gate if (cur_tg->tg_crtt != 0) {
1773e11c3f44Smeem timeout += cur_tg->tg_crtt;
17747c478bd9Sstevel@tonic-gate } else {
1775e11c3f44Smeem timeout += probe_interval;
17767c478bd9Sstevel@tonic-gate }
17777c478bd9Sstevel@tonic-gate if (TIME_LT(timeout, cur_time)) {
17787c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = timeout;
1779e11c3f44Smeem probe_chstate(pr_statp, pii, PR_LOST);
17807c478bd9Sstevel@tonic-gate } else if (i == 1) {
17817c478bd9Sstevel@tonic-gate /*
17827c478bd9Sstevel@tonic-gate * We are forced to consider this probe
17837c478bd9Sstevel@tonic-gate * lost, as we can have at most 2 unack.
17847c478bd9Sstevel@tonic-gate * probes any time, and we will be sending a
17857c478bd9Sstevel@tonic-gate * probe at the end of this function.
17867c478bd9Sstevel@tonic-gate * Normally, we should not be here, but
17877c478bd9Sstevel@tonic-gate * this can happen if an incoming response
17887c478bd9Sstevel@tonic-gate * that was considered lost has increased
17897c478bd9Sstevel@tonic-gate * the crtt for this target, and also bumped
17907c478bd9Sstevel@tonic-gate * up the FDT. Note that we never cancel or
17917c478bd9Sstevel@tonic-gate * increase the current pii_time_left, so
17927c478bd9Sstevel@tonic-gate * when the timer fires, we find 2 valid
17937c478bd9Sstevel@tonic-gate * unacked probes, and they are yet to timeout
17947c478bd9Sstevel@tonic-gate */
17957c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = cur_time;
1796e11c3f44Smeem probe_chstate(pr_statp, pii, PR_LOST);
17977c478bd9Sstevel@tonic-gate } else {
17987c478bd9Sstevel@tonic-gate /*
17997c478bd9Sstevel@tonic-gate * Only the most recent probe can enter
18007c478bd9Sstevel@tonic-gate * this 'else' arm. The second most recent
18017c478bd9Sstevel@tonic-gate * probe must take either of the above arms,
18027c478bd9Sstevel@tonic-gate * if it is unacked.
18037c478bd9Sstevel@tonic-gate */
18047c478bd9Sstevel@tonic-gate valid_unack_count++;
18057c478bd9Sstevel@tonic-gate }
18067c478bd9Sstevel@tonic-gate break;
18077c478bd9Sstevel@tonic-gate }
18087c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx);
18097c478bd9Sstevel@tonic-gate }
18107c478bd9Sstevel@tonic-gate
18117c478bd9Sstevel@tonic-gate /*
18127c478bd9Sstevel@tonic-gate * We send out 1 probe randomly in the interval between one half
18137c478bd9Sstevel@tonic-gate * and one probe interval for the group. Given that the CRTT is always
18147c478bd9Sstevel@tonic-gate * less than the group's probe interval, we can have at most 1
18157c478bd9Sstevel@tonic-gate * unacknowledged probe now. All previous probes are either lost or
18167c478bd9Sstevel@tonic-gate * acked.
18177c478bd9Sstevel@tonic-gate */
18187c478bd9Sstevel@tonic-gate assert(valid_unack_count == 0 || valid_unack_count == 1);
18197c478bd9Sstevel@tonic-gate
18207c478bd9Sstevel@tonic-gate /*
18217c478bd9Sstevel@tonic-gate * The timer has fired. Take appropriate action depending
18227c478bd9Sstevel@tonic-gate * on the current state of the phyint.
18237c478bd9Sstevel@tonic-gate *
1824e11c3f44Smeem * PI_RUNNING state - Failure detection
1825e11c3f44Smeem * PI_FAILED state - Repair detection
18267c478bd9Sstevel@tonic-gate */
18277c478bd9Sstevel@tonic-gate switch (pii->pii_phyint->pi_state) {
18287c478bd9Sstevel@tonic-gate case PI_FAILED:
18297c478bd9Sstevel@tonic-gate /*
18307c478bd9Sstevel@tonic-gate * If the most recent probe (excluding unacked probes that
18317c478bd9Sstevel@tonic-gate * are yet to time out) has been acked, check whether the
1832e11c3f44Smeem * phyint is now repaired.
18337c478bd9Sstevel@tonic-gate */
18347c478bd9Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
18357c478bd9Sstevel@tonic-gate phyint_check_for_repair(pii->pii_phyint);
18367c478bd9Sstevel@tonic-gate }
18377c478bd9Sstevel@tonic-gate break;
18387c478bd9Sstevel@tonic-gate
18397c478bd9Sstevel@tonic-gate case PI_RUNNING:
18407c478bd9Sstevel@tonic-gate /*
18417c478bd9Sstevel@tonic-gate * It's possible our probes have been lost because of a
18427c478bd9Sstevel@tonic-gate * spanning-tree mandated quiet period on the switch. If so,
1843e11c3f44Smeem * ignore the lost probes.
18447c478bd9Sstevel@tonic-gate */
18457c478bd9Sstevel@tonic-gate if (pii->pii_fd_hrtime - cur_hrtime > 0)
18467c478bd9Sstevel@tonic-gate break;
18477c478bd9Sstevel@tonic-gate
18487c478bd9Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
18497c478bd9Sstevel@tonic-gate /*
18507c478bd9Sstevel@tonic-gate * We have 1 or more failed probes (excluding unacked
18517c478bd9Sstevel@tonic-gate * probes that are yet to time out). Determine if the
1852e11c3f44Smeem * phyint has failed.
18537c478bd9Sstevel@tonic-gate */
18547c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(pii);
18557c478bd9Sstevel@tonic-gate }
18567c478bd9Sstevel@tonic-gate break;
18577c478bd9Sstevel@tonic-gate
18587c478bd9Sstevel@tonic-gate default:
18597c478bd9Sstevel@tonic-gate logerr("phyint_inst_timer: invalid state %d\n",
18607c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_state);
18617c478bd9Sstevel@tonic-gate abort();
18627c478bd9Sstevel@tonic-gate }
18637c478bd9Sstevel@tonic-gate
18647c478bd9Sstevel@tonic-gate /*
18657c478bd9Sstevel@tonic-gate * Start the next probe. probe() will also set pii->pii_probe_time_left
18667c478bd9Sstevel@tonic-gate * to the group's probe interval. If phyint_failed -> target_flush_hosts
18677c478bd9Sstevel@tonic-gate * was called, the target list may be empty.
18687c478bd9Sstevel@tonic-gate */
18697c478bd9Sstevel@tonic-gate if (pii->pii_target_next != NULL) {
1870e11c3f44Smeem probe(pii, PROBE_UNI, cur_hrtime);
18717c478bd9Sstevel@tonic-gate /*
18727c478bd9Sstevel@tonic-gate * If we have just the one probe target, and we're not using
18737c478bd9Sstevel@tonic-gate * router targets, try to find another as we presently have
18747c478bd9Sstevel@tonic-gate * no resilience.
18757c478bd9Sstevel@tonic-gate */
18767c478bd9Sstevel@tonic-gate if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1877e11c3f44Smeem probe(pii, PROBE_MULTI, cur_hrtime);
18787c478bd9Sstevel@tonic-gate } else {
1879e11c3f44Smeem probe(pii, PROBE_MULTI, cur_hrtime);
18807c478bd9Sstevel@tonic-gate }
18817c478bd9Sstevel@tonic-gate return (interval);
18827c478bd9Sstevel@tonic-gate }
18837c478bd9Sstevel@tonic-gate
18847c478bd9Sstevel@tonic-gate /*
18857c478bd9Sstevel@tonic-gate * Start the probe timer for an interface instance.
18867c478bd9Sstevel@tonic-gate */
18877c478bd9Sstevel@tonic-gate void
start_timer(struct phyint_instance * pii)18887c478bd9Sstevel@tonic-gate start_timer(struct phyint_instance *pii)
18897c478bd9Sstevel@tonic-gate {
18907c478bd9Sstevel@tonic-gate uint32_t interval;
18917c478bd9Sstevel@tonic-gate
18927c478bd9Sstevel@tonic-gate /*
18937c478bd9Sstevel@tonic-gate * Spread the base probe times (pi_snxt_basetime) across phyints
18947c478bd9Sstevel@tonic-gate * uniformly over the (curtime..curtime + the group's probe_interval).
18957c478bd9Sstevel@tonic-gate * pi_snxt_basetime is strictly periodic with a frequency of
18967c478bd9Sstevel@tonic-gate * the group's probe interval. The actual probe time pi_snxt_time
18977c478bd9Sstevel@tonic-gate * adds some randomness to pi_snxt_basetime and happens in probe().
18987c478bd9Sstevel@tonic-gate * For the 1st probe on each phyint after the timer is started,
18997c478bd9Sstevel@tonic-gate * pi_snxt_time and pi_snxt_basetime are the same.
19007c478bd9Sstevel@tonic-gate */
19017c478bd9Sstevel@tonic-gate interval = GET_RANDOM(0,
19027c478bd9Sstevel@tonic-gate (int)pii->pii_phyint->pi_group->pg_probeint);
19037c478bd9Sstevel@tonic-gate
19047c478bd9Sstevel@tonic-gate pii->pii_snxt_basetime = getcurrenttime() + interval;
19057c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
19067c478bd9Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime;
19077c478bd9Sstevel@tonic-gate timer_schedule(interval);
19087c478bd9Sstevel@tonic-gate }
19097c478bd9Sstevel@tonic-gate
19107c478bd9Sstevel@tonic-gate /*
19117c478bd9Sstevel@tonic-gate * Restart the probe timer on an interface instance.
19127c478bd9Sstevel@tonic-gate */
19137c478bd9Sstevel@tonic-gate static void
restart_timer(struct phyint_instance * pii)19147c478bd9Sstevel@tonic-gate restart_timer(struct phyint_instance *pii)
19157c478bd9Sstevel@tonic-gate {
19167c478bd9Sstevel@tonic-gate /*
19177c478bd9Sstevel@tonic-gate * We don't need to restart the timer if it was never started in
19187c478bd9Sstevel@tonic-gate * the first place (pii->pii_basetime_inited not set), as the timer
19197c478bd9Sstevel@tonic-gate * won't have gone off yet.
19207c478bd9Sstevel@tonic-gate */
19217c478bd9Sstevel@tonic-gate if (pii->pii_basetime_inited != 0) {
19227c478bd9Sstevel@tonic-gate
19237c478bd9Sstevel@tonic-gate if (debug & D_LINKNOTE)
19247c478bd9Sstevel@tonic-gate logdebug("restart timer: restarting timer on %s, "
19257c478bd9Sstevel@tonic-gate "address family %s\n", pii->pii_phyint->pi_name,
19267c478bd9Sstevel@tonic-gate AF_STR(pii->pii_af));
19277c478bd9Sstevel@tonic-gate
19287c478bd9Sstevel@tonic-gate start_timer(pii);
19297c478bd9Sstevel@tonic-gate }
19307c478bd9Sstevel@tonic-gate }
19317c478bd9Sstevel@tonic-gate
19327c478bd9Sstevel@tonic-gate static void
process_link_state_down(struct phyint * pi)19337c478bd9Sstevel@tonic-gate process_link_state_down(struct phyint *pi)
19347c478bd9Sstevel@tonic-gate {
19357c478bd9Sstevel@tonic-gate logerr("The link has gone down on %s\n", pi->pi_name);
19367c478bd9Sstevel@tonic-gate
19377c478bd9Sstevel@tonic-gate /*
19387c478bd9Sstevel@tonic-gate * Clear the probe statistics arrays, we don't want the repair
1939e11c3f44Smeem * detection logic relying on probes that were successful prior
19407c478bd9Sstevel@tonic-gate * to the link going down.
19417c478bd9Sstevel@tonic-gate */
19427c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v4))
19437c478bd9Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v4);
19447c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v6))
19457c478bd9Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v6);
19467c478bd9Sstevel@tonic-gate /*
19477c478bd9Sstevel@tonic-gate * Check for interface failure. Although we know the interface
19487c478bd9Sstevel@tonic-gate * has failed, we don't know if all the other interfaces in the
19497c478bd9Sstevel@tonic-gate * group have failed as well.
19507c478bd9Sstevel@tonic-gate */
19517c478bd9Sstevel@tonic-gate if ((pi->pi_state == PI_RUNNING) ||
19527c478bd9Sstevel@tonic-gate (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
19537c478bd9Sstevel@tonic-gate if (debug & D_LINKNOTE) {
19547c478bd9Sstevel@tonic-gate logdebug("process_link_state_down:"
19557c478bd9Sstevel@tonic-gate " checking for failure on %s\n", pi->pi_name);
19567c478bd9Sstevel@tonic-gate }
19577c478bd9Sstevel@tonic-gate
19587c478bd9Sstevel@tonic-gate if (pi->pi_v4 != NULL)
19597c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v4);
19607c478bd9Sstevel@tonic-gate else if (pi->pi_v6 != NULL)
19617c478bd9Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v6);
19627c478bd9Sstevel@tonic-gate }
19637c478bd9Sstevel@tonic-gate }
19647c478bd9Sstevel@tonic-gate
19657c478bd9Sstevel@tonic-gate static void
process_link_state_up(struct phyint * pi)19667c478bd9Sstevel@tonic-gate process_link_state_up(struct phyint *pi)
19677c478bd9Sstevel@tonic-gate {
19687c478bd9Sstevel@tonic-gate logerr("The link has come up on %s\n", pi->pi_name);
19697c478bd9Sstevel@tonic-gate
19707c478bd9Sstevel@tonic-gate /*
19717c478bd9Sstevel@tonic-gate * We stopped any running timers on each instance when the link
19727c478bd9Sstevel@tonic-gate * went down, so restart them.
19737c478bd9Sstevel@tonic-gate */
19747c478bd9Sstevel@tonic-gate if (pi->pi_v4)
19757c478bd9Sstevel@tonic-gate restart_timer(pi->pi_v4);
19767c478bd9Sstevel@tonic-gate if (pi->pi_v6)
19777c478bd9Sstevel@tonic-gate restart_timer(pi->pi_v6);
19787c478bd9Sstevel@tonic-gate
19797c478bd9Sstevel@tonic-gate phyint_check_for_repair(pi);
19807c478bd9Sstevel@tonic-gate
19817c478bd9Sstevel@tonic-gate pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
19827c478bd9Sstevel@tonic-gate if (pi->pi_whendx == LINK_UP_PERMIN)
19837c478bd9Sstevel@tonic-gate pi->pi_whendx = 0;
19847c478bd9Sstevel@tonic-gate }
19857c478bd9Sstevel@tonic-gate
19867c478bd9Sstevel@tonic-gate /*
19877c478bd9Sstevel@tonic-gate * Process any changes in link state passed up from the interfaces.
19887c478bd9Sstevel@tonic-gate */
19897c478bd9Sstevel@tonic-gate void
process_link_state_changes(void)19907c478bd9Sstevel@tonic-gate process_link_state_changes(void)
19917c478bd9Sstevel@tonic-gate {
19927c478bd9Sstevel@tonic-gate struct phyint *pi;
19937c478bd9Sstevel@tonic-gate
19947c478bd9Sstevel@tonic-gate /* Look for interfaces where the link state has just changed */
19957c478bd9Sstevel@tonic-gate
19967c478bd9Sstevel@tonic-gate for (pi = phyints; pi != NULL; pi = pi->pi_next) {
19977c478bd9Sstevel@tonic-gate boolean_t old_link_state_up = LINK_UP(pi);
19987c478bd9Sstevel@tonic-gate
19997c478bd9Sstevel@tonic-gate /*
20007c478bd9Sstevel@tonic-gate * Except when the "phyint" structure is created, this is
20017c478bd9Sstevel@tonic-gate * the only place the link state is updated. This allows
20027c478bd9Sstevel@tonic-gate * this routine to detect changes in link state, rather
20037c478bd9Sstevel@tonic-gate * than just the current state.
20047c478bd9Sstevel@tonic-gate */
20057c478bd9Sstevel@tonic-gate UPDATE_LINK_STATE(pi);
20067c478bd9Sstevel@tonic-gate
20077c478bd9Sstevel@tonic-gate if (LINK_DOWN(pi)) {
20087c478bd9Sstevel@tonic-gate /*
20097c478bd9Sstevel@tonic-gate * Has link just gone down?
20107c478bd9Sstevel@tonic-gate */
20117c478bd9Sstevel@tonic-gate if (old_link_state_up)
20127c478bd9Sstevel@tonic-gate process_link_state_down(pi);
20137c478bd9Sstevel@tonic-gate } else {
20147c478bd9Sstevel@tonic-gate /*
20157c478bd9Sstevel@tonic-gate * Has link just gone back up?
20167c478bd9Sstevel@tonic-gate */
20177c478bd9Sstevel@tonic-gate if (!old_link_state_up)
20187c478bd9Sstevel@tonic-gate process_link_state_up(pi);
20197c478bd9Sstevel@tonic-gate }
20207c478bd9Sstevel@tonic-gate }
20217c478bd9Sstevel@tonic-gate }
20227c478bd9Sstevel@tonic-gate
20237c478bd9Sstevel@tonic-gate void
reset_crtt_all(struct phyint * pi)20247c478bd9Sstevel@tonic-gate reset_crtt_all(struct phyint *pi)
20257c478bd9Sstevel@tonic-gate {
20267c478bd9Sstevel@tonic-gate struct phyint_instance *pii;
20277c478bd9Sstevel@tonic-gate struct target *tg;
20287c478bd9Sstevel@tonic-gate
20297c478bd9Sstevel@tonic-gate pii = pi->pi_v4;
20307c478bd9Sstevel@tonic-gate if (pii != NULL) {
20317c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
20327c478bd9Sstevel@tonic-gate tg->tg_crtt = 0;
20337c478bd9Sstevel@tonic-gate tg->tg_rtt_sa = -1;
20347c478bd9Sstevel@tonic-gate tg->tg_rtt_sd = 0;
20357c478bd9Sstevel@tonic-gate }
20367c478bd9Sstevel@tonic-gate }
20377c478bd9Sstevel@tonic-gate
20387c478bd9Sstevel@tonic-gate pii = pi->pi_v6;
20397c478bd9Sstevel@tonic-gate if (pii != NULL) {
20407c478bd9Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
20417c478bd9Sstevel@tonic-gate tg->tg_crtt = 0;
20427c478bd9Sstevel@tonic-gate tg->tg_rtt_sa = -1;
20437c478bd9Sstevel@tonic-gate tg->tg_rtt_sd = 0;
20447c478bd9Sstevel@tonic-gate }
20457c478bd9Sstevel@tonic-gate }
20467c478bd9Sstevel@tonic-gate }
20477c478bd9Sstevel@tonic-gate
20487c478bd9Sstevel@tonic-gate /*
20497c478bd9Sstevel@tonic-gate * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
20507c478bd9Sstevel@tonic-gate * probes on both instances IPv4 and IPv6.
20517c478bd9Sstevel@tonic-gate * If the interface has failed, return the time of the first probe failure
20527c478bd9Sstevel@tonic-gate * in "tff".
20537c478bd9Sstevel@tonic-gate */
20547c478bd9Sstevel@tonic-gate static int
phyint_inst_probe_failure_state(struct phyint_instance * pii,uint_t * tff)20557c478bd9Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
20567c478bd9Sstevel@tonic-gate {
20577c478bd9Sstevel@tonic-gate uint_t pi_tff;
20587c478bd9Sstevel@tonic-gate struct target *cur_tg;
20597c478bd9Sstevel@tonic-gate struct probe_fail_count pfinfo;
20607c478bd9Sstevel@tonic-gate struct phyint_instance *pii_other;
20617c478bd9Sstevel@tonic-gate int pr_ndx;
20627c478bd9Sstevel@tonic-gate
20637c478bd9Sstevel@tonic-gate /*
20647c478bd9Sstevel@tonic-gate * Get the number of consecutive failed probes on
20657c478bd9Sstevel@tonic-gate * this phyint across all targets. Also get the number
20667c478bd9Sstevel@tonic-gate * of consecutive failed probes on this target only
20677c478bd9Sstevel@tonic-gate */
20687c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
20697c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target;
20707c478bd9Sstevel@tonic-gate probe_fail_info(pii, cur_tg, &pfinfo);
20717c478bd9Sstevel@tonic-gate
20727c478bd9Sstevel@tonic-gate /* Get the time of first failure, for later use */
20737c478bd9Sstevel@tonic-gate pi_tff = pfinfo.pf_tff;
20747c478bd9Sstevel@tonic-gate
20757c478bd9Sstevel@tonic-gate /*
20767c478bd9Sstevel@tonic-gate * If the current target has not responded to the
20777c478bd9Sstevel@tonic-gate * last NUM_PROBE_FAILS probes, and other targets are
20787c478bd9Sstevel@tonic-gate * responding delete this target. Dead gateway detection
20797c478bd9Sstevel@tonic-gate * will eventually remove this target (if router) from the
20807c478bd9Sstevel@tonic-gate * routing tables. If that does not occur, we may end
20817c478bd9Sstevel@tonic-gate * up adding this to our list again.
20827c478bd9Sstevel@tonic-gate */
20837c478bd9Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
20847c478bd9Sstevel@tonic-gate pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
20857c478bd9Sstevel@tonic-gate if (pii->pii_targets_are_routers) {
20867c478bd9Sstevel@tonic-gate if (cur_tg->tg_status == TG_ACTIVE)
20877c478bd9Sstevel@tonic-gate pii->pii_ntargets--;
20887c478bd9Sstevel@tonic-gate cur_tg->tg_status = TG_DEAD;
20897c478bd9Sstevel@tonic-gate cur_tg->tg_crtt = 0;
20907c478bd9Sstevel@tonic-gate cur_tg->tg_rtt_sa = -1;
20917c478bd9Sstevel@tonic-gate cur_tg->tg_rtt_sd = 0;
20927c478bd9Sstevel@tonic-gate if (pii->pii_target_next == cur_tg)
20937c478bd9Sstevel@tonic-gate pii->pii_target_next = target_next(cur_tg);
20947c478bd9Sstevel@tonic-gate } else {
20957c478bd9Sstevel@tonic-gate target_delete(cur_tg);
2096e11c3f44Smeem probe(pii, PROBE_MULTI, gethrtime());
20977c478bd9Sstevel@tonic-gate }
20987c478bd9Sstevel@tonic-gate return (PHYINT_OK);
20997c478bd9Sstevel@tonic-gate }
21007c478bd9Sstevel@tonic-gate
21017c478bd9Sstevel@tonic-gate /*
21027c478bd9Sstevel@tonic-gate * If the phyint has lost NUM_PROBE_FAILS or more
21037c478bd9Sstevel@tonic-gate * consecutive probes, on both IPv4 and IPv6 protocol
21047c478bd9Sstevel@tonic-gate * instances of the phyint, then trigger failure
21057c478bd9Sstevel@tonic-gate * detection, else return false
21067c478bd9Sstevel@tonic-gate */
21077c478bd9Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
21087c478bd9Sstevel@tonic-gate return (PHYINT_OK);
21097c478bd9Sstevel@tonic-gate
21107c478bd9Sstevel@tonic-gate pii_other = phyint_inst_other(pii);
21117c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pii_other)) {
21127c478bd9Sstevel@tonic-gate probe_fail_info(pii_other, NULL, &pfinfo);
21137c478bd9Sstevel@tonic-gate if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
21147c478bd9Sstevel@tonic-gate /*
21157c478bd9Sstevel@tonic-gate * We have NUM_PROBE_FAILS or more failures
21167c478bd9Sstevel@tonic-gate * on both IPv4 and IPv6. Get the earliest
21177c478bd9Sstevel@tonic-gate * time when failure was detected on this
21187c478bd9Sstevel@tonic-gate * phyint across IPv4 and IPv6.
21197c478bd9Sstevel@tonic-gate */
21207c478bd9Sstevel@tonic-gate if (TIME_LT(pfinfo.pf_tff, pi_tff))
21217c478bd9Sstevel@tonic-gate pi_tff = pfinfo.pf_tff;
21227c478bd9Sstevel@tonic-gate } else {
21237c478bd9Sstevel@tonic-gate /*
21247c478bd9Sstevel@tonic-gate * This instance has < NUM_PROBE_FAILS failure.
21257c478bd9Sstevel@tonic-gate * So return false
21267c478bd9Sstevel@tonic-gate */
21277c478bd9Sstevel@tonic-gate return (PHYINT_OK);
21287c478bd9Sstevel@tonic-gate }
21297c478bd9Sstevel@tonic-gate }
21307c478bd9Sstevel@tonic-gate *tff = pi_tff;
21317c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE);
21327c478bd9Sstevel@tonic-gate }
21337c478bd9Sstevel@tonic-gate
21347c478bd9Sstevel@tonic-gate /*
21357c478bd9Sstevel@tonic-gate * Check if the link has gone down on this phyint, or it has failed the
21367c478bd9Sstevel@tonic-gate * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
21377c478bd9Sstevel@tonic-gate * Also look at other phyints of this group, for group failures.
21387c478bd9Sstevel@tonic-gate */
21397c478bd9Sstevel@tonic-gate int
failure_state(struct phyint_instance * pii)21407c478bd9Sstevel@tonic-gate failure_state(struct phyint_instance *pii)
21417c478bd9Sstevel@tonic-gate {
21427c478bd9Sstevel@tonic-gate struct probe_success_count psinfo;
21437c478bd9Sstevel@tonic-gate uint_t pi2_tls; /* time last success */
21447c478bd9Sstevel@tonic-gate uint_t pi_tff; /* time first fail */
21457c478bd9Sstevel@tonic-gate struct phyint *pi2;
21467c478bd9Sstevel@tonic-gate struct phyint *pi;
21477c478bd9Sstevel@tonic-gate struct phyint_instance *pii2;
21487c478bd9Sstevel@tonic-gate struct phyint_group *pg;
2149e11c3f44Smeem int retval;
21507c478bd9Sstevel@tonic-gate
2151e11c3f44Smeem if (debug & D_FAILREP)
21527c478bd9Sstevel@tonic-gate logdebug("phyint_failed(%s)\n", pii->pii_name);
21537c478bd9Sstevel@tonic-gate
21547c478bd9Sstevel@tonic-gate pi = pii->pii_phyint;
21557c478bd9Sstevel@tonic-gate pg = pi->pi_group;
21567c478bd9Sstevel@tonic-gate
21577c478bd9Sstevel@tonic-gate if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
21587c478bd9Sstevel@tonic-gate PHYINT_OK)
21597c478bd9Sstevel@tonic-gate return (PHYINT_OK);
21607c478bd9Sstevel@tonic-gate
21617c478bd9Sstevel@tonic-gate /*
2162e11c3f44Smeem * At this point, the link is down, or the phyint is suspect, as it
2163e11c3f44Smeem * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2164e11c3f44Smeem * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
2165e11c3f44Smeem * on to determine whether this should be considered a PHYINT_FAILURE
2166e11c3f44Smeem * or GROUP_FAILURE.
21677c478bd9Sstevel@tonic-gate */
2168e11c3f44Smeem if (pg == phyint_anongroup)
21697c478bd9Sstevel@tonic-gate return (PHYINT_FAILURE);
21707c478bd9Sstevel@tonic-gate
21717c478bd9Sstevel@tonic-gate /*
21727c478bd9Sstevel@tonic-gate * Need to compare against other phyints of the same group
21737c478bd9Sstevel@tonic-gate * to exclude group failures. If the failure was detected via
21747c478bd9Sstevel@tonic-gate * probing, then if the time of last success (tls) of any
21757c478bd9Sstevel@tonic-gate * phyint is more recent than the time of first fail (tff) of the
21767c478bd9Sstevel@tonic-gate * phyint in question, and the link is up on the phyint,
21777c478bd9Sstevel@tonic-gate * then it is a phyint failure. Otherwise it is a group failure.
21787c478bd9Sstevel@tonic-gate * If failure was detected via a link down notification sent from
21797c478bd9Sstevel@tonic-gate * the driver to IP, we see if any phyints in the group are still
21807c478bd9Sstevel@tonic-gate * running and haven't received a link down notification. We
21817c478bd9Sstevel@tonic-gate * will usually be processing the link down notification shortly
21827c478bd9Sstevel@tonic-gate * after it was received, so there is no point looking at the tls
21837c478bd9Sstevel@tonic-gate * of other phyints.
21847c478bd9Sstevel@tonic-gate */
2185e11c3f44Smeem retval = GROUP_FAILURE;
21867c478bd9Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
21877c478bd9Sstevel@tonic-gate /* Exclude ourself from comparison */
21887c478bd9Sstevel@tonic-gate if (pi2 == pi)
21897c478bd9Sstevel@tonic-gate continue;
21907c478bd9Sstevel@tonic-gate
21917c478bd9Sstevel@tonic-gate if (LINK_DOWN(pi)) {
21927c478bd9Sstevel@tonic-gate /*
2193e11c3f44Smeem * We use FLAGS_TO_LINK_STATE() to test the flags
2194e11c3f44Smeem * directly, rather then LINK_UP() or LINK_DOWN(), as
2195e11c3f44Smeem * we may not have got round to processing the link
2196e11c3f44Smeem * state for the other phyints in the group yet.
21977c478bd9Sstevel@tonic-gate *
2198e11c3f44Smeem * The check for PI_RUNNING and group failure handles
2199e11c3f44Smeem * the case when the group begins to recover.
2200e11c3f44Smeem * PI_RUNNING will be set, and group failure cleared
2201e11c3f44Smeem * only after receipt of NUM_PROBE_REPAIRS, by which
2202e11c3f44Smeem * time the other phyints should have received at
2203e11c3f44Smeem * least 1 packet, and so will not have NUM_PROBE_FAILS.
22047c478bd9Sstevel@tonic-gate */
22057c478bd9Sstevel@tonic-gate if ((pi2->pi_state == PI_RUNNING) &&
2206e11c3f44Smeem !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
2207e11c3f44Smeem retval = PHYINT_FAILURE;
2208e11c3f44Smeem break;
2209e11c3f44Smeem }
2210e11c3f44Smeem continue;
2211e11c3f44Smeem }
2212e11c3f44Smeem
2213e11c3f44Smeem if (LINK_DOWN(pi2))
2214e11c3f44Smeem continue;
2215e11c3f44Smeem
22167c478bd9Sstevel@tonic-gate /*
2217e11c3f44Smeem * If there's no probe-based failure detection on this
2218e11c3f44Smeem * interface, and its link is still up, then it's still
2219e11c3f44Smeem * working and thus the group has not failed.
2220e11c3f44Smeem */
2221e11c3f44Smeem if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
2222e11c3f44Smeem retval = PHYINT_FAILURE;
2223e11c3f44Smeem break;
2224e11c3f44Smeem }
2225e11c3f44Smeem
2226e11c3f44Smeem /*
2227e11c3f44Smeem * Need to compare against both IPv4 and IPv6 instances.
22287c478bd9Sstevel@tonic-gate */
22297c478bd9Sstevel@tonic-gate pii2 = pi2->pi_v4;
22307c478bd9Sstevel@tonic-gate if (pii2 != NULL) {
22317c478bd9Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo);
22327c478bd9Sstevel@tonic-gate if (psinfo.ps_tls_valid) {
22337c478bd9Sstevel@tonic-gate pi2_tls = psinfo.ps_tls;
22347c478bd9Sstevel@tonic-gate /*
22357c478bd9Sstevel@tonic-gate * See comment above regarding check
22367c478bd9Sstevel@tonic-gate * for PI_RUNNING and group failure.
22377c478bd9Sstevel@tonic-gate */
22387c478bd9Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) &&
22397c478bd9Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) &&
22407c478bd9Sstevel@tonic-gate !GROUP_FAILED(pg) &&
2241e11c3f44Smeem FLAGS_TO_LINK_STATE(pi2)) {
2242e11c3f44Smeem retval = PHYINT_FAILURE;
2243e11c3f44Smeem break;
2244e11c3f44Smeem }
22457c478bd9Sstevel@tonic-gate }
22467c478bd9Sstevel@tonic-gate }
22477c478bd9Sstevel@tonic-gate
22487c478bd9Sstevel@tonic-gate pii2 = pi2->pi_v6;
22497c478bd9Sstevel@tonic-gate if (pii2 != NULL) {
22507c478bd9Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo);
22517c478bd9Sstevel@tonic-gate if (psinfo.ps_tls_valid) {
22527c478bd9Sstevel@tonic-gate pi2_tls = psinfo.ps_tls;
22537c478bd9Sstevel@tonic-gate /*
22547c478bd9Sstevel@tonic-gate * See comment above regarding check
22557c478bd9Sstevel@tonic-gate * for PI_RUNNING and group failure.
22567c478bd9Sstevel@tonic-gate */
22577c478bd9Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) &&
22587c478bd9Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) &&
22597c478bd9Sstevel@tonic-gate !GROUP_FAILED(pg) &&
2260e11c3f44Smeem FLAGS_TO_LINK_STATE(pi2)) {
2261e11c3f44Smeem retval = PHYINT_FAILURE;
2262e11c3f44Smeem break;
22637c478bd9Sstevel@tonic-gate }
22647c478bd9Sstevel@tonic-gate }
22657c478bd9Sstevel@tonic-gate }
22667c478bd9Sstevel@tonic-gate }
22677c478bd9Sstevel@tonic-gate
22687c478bd9Sstevel@tonic-gate /*
2269e11c3f44Smeem * Update the group state to account for the changes.
22707c478bd9Sstevel@tonic-gate */
2271e11c3f44Smeem phyint_group_refresh_state(pg);
2272e11c3f44Smeem return (retval);
22737c478bd9Sstevel@tonic-gate }
22747c478bd9Sstevel@tonic-gate
22757c478bd9Sstevel@tonic-gate /*
22767c478bd9Sstevel@tonic-gate * Return the information associated with consecutive probe successes
22777c478bd9Sstevel@tonic-gate * starting with the most recent probe. At most the last 2 probes can be
22787c478bd9Sstevel@tonic-gate * in the unacknowledged state. All previous probes have either failed
22797c478bd9Sstevel@tonic-gate * or succeeded.
22807c478bd9Sstevel@tonic-gate */
22817c478bd9Sstevel@tonic-gate static void
probe_success_info(struct phyint_instance * pii,struct target * cur_tg,struct probe_success_count * psinfo)22827c478bd9Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
22837c478bd9Sstevel@tonic-gate struct probe_success_count *psinfo)
22847c478bd9Sstevel@tonic-gate {
22857c478bd9Sstevel@tonic-gate uint_t i;
22867c478bd9Sstevel@tonic-gate struct probe_stats *pr_statp;
22877c478bd9Sstevel@tonic-gate uint_t most_recent;
22887c478bd9Sstevel@tonic-gate uint_t second_most_recent;
22897c478bd9Sstevel@tonic-gate boolean_t pi_found_failure = _B_FALSE;
22907c478bd9Sstevel@tonic-gate boolean_t tg_found_failure = _B_FALSE;
22917c478bd9Sstevel@tonic-gate uint_t now;
22927c478bd9Sstevel@tonic-gate uint_t timeout;
22937c478bd9Sstevel@tonic-gate struct target *tg;
22947c478bd9Sstevel@tonic-gate
2295e11c3f44Smeem if (debug & D_FAILREP)
22967c478bd9Sstevel@tonic-gate logdebug("probe_success_info(%s)\n", pii->pii_name);
22977c478bd9Sstevel@tonic-gate
22987c478bd9Sstevel@tonic-gate bzero(psinfo, sizeof (*psinfo));
22997c478bd9Sstevel@tonic-gate now = getcurrenttime();
23007c478bd9Sstevel@tonic-gate
23017c478bd9Sstevel@tonic-gate /*
23027c478bd9Sstevel@tonic-gate * Start with the most recent probe, and count the number
23037c478bd9Sstevel@tonic-gate * of consecutive probe successes. Latch the number of successes
23047c478bd9Sstevel@tonic-gate * on hitting a failure.
23057c478bd9Sstevel@tonic-gate */
23067c478bd9Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
23077c478bd9Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent);
23087c478bd9Sstevel@tonic-gate
23097c478bd9Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next;
23107c478bd9Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) {
23117c478bd9Sstevel@tonic-gate pr_statp = &pii->pii_probes[i];
23127c478bd9Sstevel@tonic-gate
23137c478bd9Sstevel@tonic-gate switch (pr_statp->pr_status) {
23147c478bd9Sstevel@tonic-gate case PR_UNACKED:
23157c478bd9Sstevel@tonic-gate /*
23167c478bd9Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged
23177c478bd9Sstevel@tonic-gate */
23187c478bd9Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent);
23197c478bd9Sstevel@tonic-gate
23207c478bd9Sstevel@tonic-gate tg = pr_statp->pr_target;
23217c478bd9Sstevel@tonic-gate assert(tg != NULL);
23227c478bd9Sstevel@tonic-gate /*
23237c478bd9Sstevel@tonic-gate * The crtt could be zero for some reason,
23247c478bd9Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is
23257c478bd9Sstevel@tonic-gate * not available use the value of the group's probe
23267c478bd9Sstevel@tonic-gate * interval which is a worst case estimate.
23277c478bd9Sstevel@tonic-gate */
2328e11c3f44Smeem timeout = ns2ms(pr_statp->pr_hrtime_start);
23297c478bd9Sstevel@tonic-gate if (tg->tg_crtt != 0) {
2330e11c3f44Smeem timeout += tg->tg_crtt;
23317c478bd9Sstevel@tonic-gate } else {
2332e11c3f44Smeem timeout +=
23337c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint;
23347c478bd9Sstevel@tonic-gate }
23357c478bd9Sstevel@tonic-gate
23367c478bd9Sstevel@tonic-gate if (TIME_LT(timeout, now)) {
23377c478bd9Sstevel@tonic-gate /*
23387c478bd9Sstevel@tonic-gate * We hit a failure. Latch the total number of
23397c478bd9Sstevel@tonic-gate * recent consecutive successes.
23407c478bd9Sstevel@tonic-gate */
23417c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = timeout;
2342e11c3f44Smeem probe_chstate(pr_statp, pii, PR_LOST);
23437c478bd9Sstevel@tonic-gate pi_found_failure = _B_TRUE;
23447c478bd9Sstevel@tonic-gate if (cur_tg != NULL && tg == cur_tg) {
23457c478bd9Sstevel@tonic-gate /*
23467c478bd9Sstevel@tonic-gate * We hit a failure for the desired
23477c478bd9Sstevel@tonic-gate * target. Latch the number of recent
23487c478bd9Sstevel@tonic-gate * consecutive successes for this target
23497c478bd9Sstevel@tonic-gate */
23507c478bd9Sstevel@tonic-gate tg_found_failure = _B_TRUE;
23517c478bd9Sstevel@tonic-gate }
23527c478bd9Sstevel@tonic-gate }
23537c478bd9Sstevel@tonic-gate break;
23547c478bd9Sstevel@tonic-gate
23557c478bd9Sstevel@tonic-gate case PR_ACKED:
23567c478bd9Sstevel@tonic-gate /*
23577c478bd9Sstevel@tonic-gate * Bump up the count of probe successes, if we
23587c478bd9Sstevel@tonic-gate * have not seen any failure so far.
23597c478bd9Sstevel@tonic-gate */
23607c478bd9Sstevel@tonic-gate if (!pi_found_failure)
23617c478bd9Sstevel@tonic-gate psinfo->ps_nsucc++;
23627c478bd9Sstevel@tonic-gate
23637c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
23647c478bd9Sstevel@tonic-gate !tg_found_failure) {
23657c478bd9Sstevel@tonic-gate psinfo->ps_nsucc_tg++;
23667c478bd9Sstevel@tonic-gate }
23677c478bd9Sstevel@tonic-gate
23687c478bd9Sstevel@tonic-gate /*
23697c478bd9Sstevel@tonic-gate * Record the time of last success, if this is
23707c478bd9Sstevel@tonic-gate * the most recent probe success.
23717c478bd9Sstevel@tonic-gate */
23727c478bd9Sstevel@tonic-gate if (!psinfo->ps_tls_valid) {
2373e11c3f44Smeem psinfo->ps_tls =
2374e11c3f44Smeem ns2ms(pr_statp->pr_hrtime_ackproc);
23757c478bd9Sstevel@tonic-gate psinfo->ps_tls_valid = _B_TRUE;
23767c478bd9Sstevel@tonic-gate }
23777c478bd9Sstevel@tonic-gate break;
23787c478bd9Sstevel@tonic-gate
23797c478bd9Sstevel@tonic-gate case PR_LOST:
23807c478bd9Sstevel@tonic-gate /*
23817c478bd9Sstevel@tonic-gate * We hit a failure. Latch the total number of
23827c478bd9Sstevel@tonic-gate * recent consecutive successes.
23837c478bd9Sstevel@tonic-gate */
23847c478bd9Sstevel@tonic-gate pi_found_failure = _B_TRUE;
23857c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
23867c478bd9Sstevel@tonic-gate /*
23877c478bd9Sstevel@tonic-gate * We hit a failure for the desired target.
23887c478bd9Sstevel@tonic-gate * Latch the number of recent consecutive
23897c478bd9Sstevel@tonic-gate * successes for this target
23907c478bd9Sstevel@tonic-gate */
23917c478bd9Sstevel@tonic-gate tg_found_failure = _B_TRUE;
23927c478bd9Sstevel@tonic-gate }
23937c478bd9Sstevel@tonic-gate break;
23947c478bd9Sstevel@tonic-gate
23957c478bd9Sstevel@tonic-gate default:
23967c478bd9Sstevel@tonic-gate return;
23977c478bd9Sstevel@tonic-gate
23987c478bd9Sstevel@tonic-gate }
23997c478bd9Sstevel@tonic-gate }
24007c478bd9Sstevel@tonic-gate }
24017c478bd9Sstevel@tonic-gate
24027c478bd9Sstevel@tonic-gate /*
24037c478bd9Sstevel@tonic-gate * Return the information associated with consecutive probe failures
24047c478bd9Sstevel@tonic-gate * starting with the most recent probe. Only the last 2 probes can be in the
24057c478bd9Sstevel@tonic-gate * unacknowledged state. All previous probes have either failed or succeeded.
24067c478bd9Sstevel@tonic-gate */
24077c478bd9Sstevel@tonic-gate static void
probe_fail_info(struct phyint_instance * pii,struct target * cur_tg,struct probe_fail_count * pfinfo)24087c478bd9Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
24097c478bd9Sstevel@tonic-gate struct probe_fail_count *pfinfo)
24107c478bd9Sstevel@tonic-gate {
24117c478bd9Sstevel@tonic-gate int i;
24127c478bd9Sstevel@tonic-gate struct probe_stats *pr_statp;
24137c478bd9Sstevel@tonic-gate boolean_t tg_found_success = _B_FALSE;
24147c478bd9Sstevel@tonic-gate boolean_t pi_found_success = _B_FALSE;
24157c478bd9Sstevel@tonic-gate int most_recent;
24167c478bd9Sstevel@tonic-gate int second_most_recent;
24177c478bd9Sstevel@tonic-gate uint_t now;
24187c478bd9Sstevel@tonic-gate uint_t timeout;
24197c478bd9Sstevel@tonic-gate struct target *tg;
24207c478bd9Sstevel@tonic-gate
2421e11c3f44Smeem if (debug & D_FAILREP)
24227c478bd9Sstevel@tonic-gate logdebug("probe_fail_info(%s)\n", pii->pii_name);
24237c478bd9Sstevel@tonic-gate
24247c478bd9Sstevel@tonic-gate bzero(pfinfo, sizeof (*pfinfo));
24257c478bd9Sstevel@tonic-gate now = getcurrenttime();
24267c478bd9Sstevel@tonic-gate
24277c478bd9Sstevel@tonic-gate /*
24287c478bd9Sstevel@tonic-gate * Start with the most recent probe, and count the number
24297c478bd9Sstevel@tonic-gate * of consecutive probe failures. Latch the number of failures
24307c478bd9Sstevel@tonic-gate * on hitting a probe success.
24317c478bd9Sstevel@tonic-gate */
24327c478bd9Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
24337c478bd9Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent);
24347c478bd9Sstevel@tonic-gate
24357c478bd9Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next;
24367c478bd9Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) {
24377c478bd9Sstevel@tonic-gate pr_statp = &pii->pii_probes[i];
24387c478bd9Sstevel@tonic-gate
24397c478bd9Sstevel@tonic-gate assert(PR_STATUS_VALID(pr_statp->pr_status));
24407c478bd9Sstevel@tonic-gate
24417c478bd9Sstevel@tonic-gate switch (pr_statp->pr_status) {
24427c478bd9Sstevel@tonic-gate case PR_UNACKED:
24437c478bd9Sstevel@tonic-gate /*
24447c478bd9Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged
24457c478bd9Sstevel@tonic-gate */
24467c478bd9Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent);
24477c478bd9Sstevel@tonic-gate
24487c478bd9Sstevel@tonic-gate tg = pr_statp->pr_target;
24497c478bd9Sstevel@tonic-gate /*
24507c478bd9Sstevel@tonic-gate * Target is guaranteed to exist in the unack. state
24517c478bd9Sstevel@tonic-gate */
24527c478bd9Sstevel@tonic-gate assert(tg != NULL);
24537c478bd9Sstevel@tonic-gate /*
24547c478bd9Sstevel@tonic-gate * The crtt could be zero for some reason,
24557c478bd9Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is
24567c478bd9Sstevel@tonic-gate * not available use the group's probe interval,
24577c478bd9Sstevel@tonic-gate * which is a worst case estimate.
24587c478bd9Sstevel@tonic-gate */
2459e11c3f44Smeem timeout = ns2ms(pr_statp->pr_hrtime_start);
24607c478bd9Sstevel@tonic-gate if (tg->tg_crtt != 0) {
2461e11c3f44Smeem timeout += tg->tg_crtt;
24627c478bd9Sstevel@tonic-gate } else {
2463e11c3f44Smeem timeout +=
24647c478bd9Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint;
24657c478bd9Sstevel@tonic-gate }
24667c478bd9Sstevel@tonic-gate
24677c478bd9Sstevel@tonic-gate if (TIME_GT(timeout, now))
24687c478bd9Sstevel@tonic-gate break;
24697c478bd9Sstevel@tonic-gate
24707c478bd9Sstevel@tonic-gate pr_statp->pr_time_lost = timeout;
2471e11c3f44Smeem probe_chstate(pr_statp, pii, PR_LOST);
24727c478bd9Sstevel@tonic-gate /* FALLTHRU */
24737c478bd9Sstevel@tonic-gate
24747c478bd9Sstevel@tonic-gate case PR_LOST:
24757c478bd9Sstevel@tonic-gate if (!pi_found_success) {
24767c478bd9Sstevel@tonic-gate pfinfo->pf_nfail++;
24777c478bd9Sstevel@tonic-gate pfinfo->pf_tff = pr_statp->pr_time_lost;
24787c478bd9Sstevel@tonic-gate }
24797c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
24807c478bd9Sstevel@tonic-gate !tg_found_success) {
24817c478bd9Sstevel@tonic-gate pfinfo->pf_nfail_tg++;
24827c478bd9Sstevel@tonic-gate }
24837c478bd9Sstevel@tonic-gate break;
24847c478bd9Sstevel@tonic-gate
24857c478bd9Sstevel@tonic-gate default:
24867c478bd9Sstevel@tonic-gate /*
24877c478bd9Sstevel@tonic-gate * We hit a success or unused slot. Latch the
24887c478bd9Sstevel@tonic-gate * total number of recent consecutive failures.
24897c478bd9Sstevel@tonic-gate */
24907c478bd9Sstevel@tonic-gate pi_found_success = _B_TRUE;
24917c478bd9Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
24927c478bd9Sstevel@tonic-gate /*
24937c478bd9Sstevel@tonic-gate * We hit a success for the desired target.
24947c478bd9Sstevel@tonic-gate * Latch the number of recent consecutive
24957c478bd9Sstevel@tonic-gate * failures for this target
24967c478bd9Sstevel@tonic-gate */
24977c478bd9Sstevel@tonic-gate tg_found_success = _B_TRUE;
24987c478bd9Sstevel@tonic-gate }
24997c478bd9Sstevel@tonic-gate }
25007c478bd9Sstevel@tonic-gate }
25017c478bd9Sstevel@tonic-gate }
25027c478bd9Sstevel@tonic-gate
25037c478bd9Sstevel@tonic-gate /*
2504e11c3f44Smeem * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2505e11c3f44Smeem */
2506e11c3f44Smeem void
probe_chstate(struct probe_stats * pr,struct phyint_instance * pii,int state)2507e11c3f44Smeem probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
2508e11c3f44Smeem {
2509e11c3f44Smeem if (pr->pr_status == state)
2510e11c3f44Smeem return;
2511e11c3f44Smeem
2512e11c3f44Smeem pr->pr_status = state;
2513e11c3f44Smeem (void) probe_state_event(pr, pii);
2514e11c3f44Smeem }
2515e11c3f44Smeem
2516e11c3f44Smeem /*
25177c478bd9Sstevel@tonic-gate * Check if the phyint has been repaired. If no test address has been
25187c478bd9Sstevel@tonic-gate * configured, then consider the interface repaired if the link is up (unless
25197c478bd9Sstevel@tonic-gate * the link is flapping; see below). Otherwise, look for proof of probes
25207c478bd9Sstevel@tonic-gate * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
25217c478bd9Sstevel@tonic-gate * either IPv4 or IPv6 instance, the phyint can be considered repaired.
25227c478bd9Sstevel@tonic-gate */
25237c478bd9Sstevel@tonic-gate static boolean_t
phyint_repaired(struct phyint * pi)25247c478bd9Sstevel@tonic-gate phyint_repaired(struct phyint *pi)
25257c478bd9Sstevel@tonic-gate {
25267c478bd9Sstevel@tonic-gate struct probe_success_count psinfo;
25277c478bd9Sstevel@tonic-gate struct phyint_instance *pii;
25287c478bd9Sstevel@tonic-gate struct target *cur_tg;
25297c478bd9Sstevel@tonic-gate int pr_ndx;
25307c478bd9Sstevel@tonic-gate uint_t cur_time;
25317c478bd9Sstevel@tonic-gate
2532e11c3f44Smeem if (debug & D_FAILREP)
25337c478bd9Sstevel@tonic-gate logdebug("phyint_repaired(%s)\n", pi->pi_name);
25347c478bd9Sstevel@tonic-gate
25357c478bd9Sstevel@tonic-gate if (LINK_DOWN(pi))
25367c478bd9Sstevel@tonic-gate return (_B_FALSE);
25377c478bd9Sstevel@tonic-gate
25387c478bd9Sstevel@tonic-gate /*
25397c478bd9Sstevel@tonic-gate * If we don't have any test addresses and the link is up, then
25407c478bd9Sstevel@tonic-gate * consider the interface repaired, unless we've received more than
25417c478bd9Sstevel@tonic-gate * LINK_UP_PERMIN link up notifications in the last minute, in
25427c478bd9Sstevel@tonic-gate * which case we keep the link down until we drop back below
25437c478bd9Sstevel@tonic-gate * the threshold.
25447c478bd9Sstevel@tonic-gate */
25457c478bd9Sstevel@tonic-gate if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
25467c478bd9Sstevel@tonic-gate cur_time = getcurrenttime();
25477c478bd9Sstevel@tonic-gate if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
25487c478bd9Sstevel@tonic-gate (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
25497c478bd9Sstevel@tonic-gate pi->pi_lfmsg_printed = 0;
25507c478bd9Sstevel@tonic-gate return (_B_TRUE);
25517c478bd9Sstevel@tonic-gate }
25527c478bd9Sstevel@tonic-gate if (!pi->pi_lfmsg_printed) {
25537c478bd9Sstevel@tonic-gate logerr("The link has come up on %s more than %d times "
2554e11c3f44Smeem "in the last minute; disabling repair until it "
25557c478bd9Sstevel@tonic-gate "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
25567c478bd9Sstevel@tonic-gate pi->pi_lfmsg_printed = 1;
25577c478bd9Sstevel@tonic-gate }
25587c478bd9Sstevel@tonic-gate
25597c478bd9Sstevel@tonic-gate return (_B_FALSE);
25607c478bd9Sstevel@tonic-gate }
25617c478bd9Sstevel@tonic-gate
25627c478bd9Sstevel@tonic-gate pii = pi->pi_v4;
25637c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) {
25647c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
25657c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target;
25667c478bd9Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo);
25677c478bd9Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
25687c478bd9Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
25697c478bd9Sstevel@tonic-gate return (_B_TRUE);
25707c478bd9Sstevel@tonic-gate }
25717c478bd9Sstevel@tonic-gate
25727c478bd9Sstevel@tonic-gate pii = pi->pi_v6;
25737c478bd9Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) {
25747c478bd9Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
25757c478bd9Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target;
25767c478bd9Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo);
25777c478bd9Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
25787c478bd9Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
25797c478bd9Sstevel@tonic-gate return (_B_TRUE);
25807c478bd9Sstevel@tonic-gate }
25817c478bd9Sstevel@tonic-gate
25827c478bd9Sstevel@tonic-gate return (_B_FALSE);
25837c478bd9Sstevel@tonic-gate }
25847c478bd9Sstevel@tonic-gate
25857c478bd9Sstevel@tonic-gate /*
25867c478bd9Sstevel@tonic-gate * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
25877c478bd9Sstevel@tonic-gate */
25887c478bd9Sstevel@tonic-gate boolean_t
change_pif_flags(struct phyint * pi,uint64_t set,uint64_t clear)2589e11c3f44Smeem change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
25907c478bd9Sstevel@tonic-gate {
25917c478bd9Sstevel@tonic-gate int ifsock;
25927c478bd9Sstevel@tonic-gate struct lifreq lifr;
259328f13c35Srk129064 uint64_t old_flags;
25947c478bd9Sstevel@tonic-gate
2595e11c3f44Smeem if (debug & D_FAILREP) {
2596e11c3f44Smeem logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2597e11c3f44Smeem pi->pi_name, set, clear);
25987c478bd9Sstevel@tonic-gate }
25997c478bd9Sstevel@tonic-gate
2600e11c3f44Smeem if (pi->pi_v4 != NULL)
26017c478bd9Sstevel@tonic-gate ifsock = ifsock_v4;
2602e11c3f44Smeem else
26037c478bd9Sstevel@tonic-gate ifsock = ifsock_v6;
26047c478bd9Sstevel@tonic-gate
26057c478bd9Sstevel@tonic-gate /*
26067c478bd9Sstevel@tonic-gate * Get the current flags from the kernel, and set/clear the
26077c478bd9Sstevel@tonic-gate * desired phyint flags. Since we set only phyint flags, we can
26087c478bd9Sstevel@tonic-gate * do it on either IPv4 or IPv6 instance.
26097c478bd9Sstevel@tonic-gate */
2610e11c3f44Smeem (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2611e11c3f44Smeem
26127c478bd9Sstevel@tonic-gate if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
26137c478bd9Sstevel@tonic-gate if (errno != ENXIO)
2614e11c3f44Smeem logperror("change_pif_flags: ioctl (get flags)");
26157c478bd9Sstevel@tonic-gate return (_B_FALSE);
26167c478bd9Sstevel@tonic-gate }
261728f13c35Srk129064
261828f13c35Srk129064 old_flags = lifr.lifr_flags;
2619e11c3f44Smeem lifr.lifr_flags |= set;
2620e11c3f44Smeem lifr.lifr_flags &= ~clear;
262128f13c35Srk129064
262228f13c35Srk129064 if (old_flags == lifr.lifr_flags) {
262328f13c35Srk129064 /* No change in the flags. No need to send ioctl */
262428f13c35Srk129064 return (_B_TRUE);
262528f13c35Srk129064 }
262628f13c35Srk129064
26277c478bd9Sstevel@tonic-gate if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
26287c478bd9Sstevel@tonic-gate if (errno != ENXIO)
2629e11c3f44Smeem logperror("change_pif_flags: ioctl (set flags)");
26307c478bd9Sstevel@tonic-gate return (_B_FALSE);
26317c478bd9Sstevel@tonic-gate }
26327c478bd9Sstevel@tonic-gate
26337c478bd9Sstevel@tonic-gate /*
26347c478bd9Sstevel@tonic-gate * Keep pi_flags in synch. with actual flags. Assumes flags are
26357c478bd9Sstevel@tonic-gate * phyint flags.
26367c478bd9Sstevel@tonic-gate */
2637e11c3f44Smeem pi->pi_flags |= set;
2638e11c3f44Smeem pi->pi_flags &= ~clear;
26397c478bd9Sstevel@tonic-gate
2640e11c3f44Smeem if (pi->pi_v4 != NULL)
26417c478bd9Sstevel@tonic-gate pi->pi_v4->pii_flags = pi->pi_flags;
26427c478bd9Sstevel@tonic-gate
2643e11c3f44Smeem if (pi->pi_v6 != NULL)
26447c478bd9Sstevel@tonic-gate pi->pi_v6->pii_flags = pi->pi_flags;
26457c478bd9Sstevel@tonic-gate
26467c478bd9Sstevel@tonic-gate return (_B_TRUE);
26477c478bd9Sstevel@tonic-gate }
26487c478bd9Sstevel@tonic-gate
26497c478bd9Sstevel@tonic-gate /*
26507c478bd9Sstevel@tonic-gate * icmp cksum computation for IPv4.
26517c478bd9Sstevel@tonic-gate */
26527c478bd9Sstevel@tonic-gate static int
in_cksum(ushort_t * addr,int len)26537c478bd9Sstevel@tonic-gate in_cksum(ushort_t *addr, int len)
26547c478bd9Sstevel@tonic-gate {
26557c478bd9Sstevel@tonic-gate register int nleft = len;
26567c478bd9Sstevel@tonic-gate register ushort_t *w = addr;
26577c478bd9Sstevel@tonic-gate register ushort_t answer;
26587c478bd9Sstevel@tonic-gate ushort_t odd_byte = 0;
26597c478bd9Sstevel@tonic-gate register int sum = 0;
26607c478bd9Sstevel@tonic-gate
26617c478bd9Sstevel@tonic-gate /*
26627c478bd9Sstevel@tonic-gate * Our algorithm is simple, using a 32 bit accumulator (sum),
26637c478bd9Sstevel@tonic-gate * we add sequential 16 bit words to it, and at the end, fold
26647c478bd9Sstevel@tonic-gate * back all the carry bits from the top 16 bits into the lower
26657c478bd9Sstevel@tonic-gate * 16 bits.
26667c478bd9Sstevel@tonic-gate */
26677c478bd9Sstevel@tonic-gate while (nleft > 1) {
26687c478bd9Sstevel@tonic-gate sum += *w++;
26697c478bd9Sstevel@tonic-gate nleft -= 2;
26707c478bd9Sstevel@tonic-gate }
26717c478bd9Sstevel@tonic-gate
26727c478bd9Sstevel@tonic-gate /* mop up an odd byte, if necessary */
26737c478bd9Sstevel@tonic-gate if (nleft == 1) {
26747c478bd9Sstevel@tonic-gate *(uchar_t *)(&odd_byte) = *(uchar_t *)w;
26757c478bd9Sstevel@tonic-gate sum += odd_byte;
26767c478bd9Sstevel@tonic-gate }
26777c478bd9Sstevel@tonic-gate
26787c478bd9Sstevel@tonic-gate /*
26797c478bd9Sstevel@tonic-gate * add back carry outs from top 16 bits to low 16 bits
26807c478bd9Sstevel@tonic-gate */
26817c478bd9Sstevel@tonic-gate sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */
26827c478bd9Sstevel@tonic-gate sum += (sum >> 16); /* add carry */
26837c478bd9Sstevel@tonic-gate answer = ~sum; /* truncate to 16 bits */
26847c478bd9Sstevel@tonic-gate return (answer);
26857c478bd9Sstevel@tonic-gate }
26867c478bd9Sstevel@tonic-gate
26877c478bd9Sstevel@tonic-gate static void
reset_snxt_basetimes(void)26887c478bd9Sstevel@tonic-gate reset_snxt_basetimes(void)
26897c478bd9Sstevel@tonic-gate {
26907c478bd9Sstevel@tonic-gate struct phyint_instance *pii;
26917c478bd9Sstevel@tonic-gate
26927c478bd9Sstevel@tonic-gate for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
26937c478bd9Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
26947c478bd9Sstevel@tonic-gate }
26957c478bd9Sstevel@tonic-gate }
26967c478bd9Sstevel@tonic-gate
26977c478bd9Sstevel@tonic-gate /*
26987c478bd9Sstevel@tonic-gate * Is the address one of our own addresses? Unfortunately,
26997c478bd9Sstevel@tonic-gate * we cannot check our phyint tables to determine if the address
27007c478bd9Sstevel@tonic-gate * is our own. This is because, we don't track interfaces that
27017c478bd9Sstevel@tonic-gate * are not part of any group. We have to either use a 'bind' or
27027c478bd9Sstevel@tonic-gate * get the complete list of all interfaces using SIOCGLIFCONF,
270387e66ffcSrk129064 * to do this check. We could also use SIOCTMYADDR.
270487e66ffcSrk129064 * Bind fails for the local zone address, so we might include local zone
270587e66ffcSrk129064 * address as target address. If local zone address is a target address
270687e66ffcSrk129064 * and it is up, it is not possible to detect the interface failure.
270787e66ffcSrk129064 * SIOCTMYADDR also doesn't consider local zone address as own address.
270887e66ffcSrk129064 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2709e11c3f44Smeem * are stored in `localaddrs'
27107c478bd9Sstevel@tonic-gate */
27117c478bd9Sstevel@tonic-gate boolean_t
own_address(struct in6_addr addr)271287e66ffcSrk129064 own_address(struct in6_addr addr)
27137c478bd9Sstevel@tonic-gate {
2714e11c3f44Smeem addrlist_t *addrp;
2715e11c3f44Smeem struct sockaddr_storage ss;
2716e11c3f44Smeem int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
27177c478bd9Sstevel@tonic-gate
2718e11c3f44Smeem addr2storage(af, &addr, &ss);
2719e11c3f44Smeem for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
2720e11c3f44Smeem if (sockaddrcmp(&ss, &addrp->al_addr))
272187e66ffcSrk129064 return (_B_TRUE);
27227c478bd9Sstevel@tonic-gate }
272387e66ffcSrk129064 return (_B_FALSE);
27247c478bd9Sstevel@tonic-gate }
2725e11c3f44Smeem
/*
 * Convert the nanosecond value `ns' to milliseconds with NSEC2MSEC() and
 * return it as an int; the conversion to int discards any high-order bits
 * that do not fit.
 */
static int
ns2ms(int64_t ns)
{
	int	ms = NSEC2MSEC(ns);

	return (ms);
}
2731e11c3f44Smeem
2732e11c3f44Smeem static int64_t
tv2ns(struct timeval * tvp)2733e11c3f44Smeem tv2ns(struct timeval *tvp)
2734e11c3f44Smeem {
2735e11c3f44Smeem return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000);
2736e11c3f44Smeem }
2737