xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision 6213860b943e0dc644bdec5d9f94034cab88816e)
1 
2 /*
3  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
4  * Use is subject to license terms.
5  */
6 
7 /*
8  * Copyright (c) 1987 Regents of the University of California.
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms are permitted
12  * provided that the above copyright notice and this paragraph are
13  * duplicated in all such forms and that any documentation,
14  * advertising materials, and other materials related to such
15  * distribution and use acknowledge that the software was developed
16  * by the University of California, Berkeley. The name of the
17  * University may not be used to endorse or promote products derived
18  * from this software without specific prior written permission.
19  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
21  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22  */
23 
24 #include "mpd_defs.h"
25 #include "mpd_tables.h"
26 
/*
 * Probe types for probe().
 *
 * probe() stores the selected value, converted with htonl(), in the
 * pr_icmp_mtype field of each outgoing packet.  in_data() and in6_data()
 * compare the same field of an incoming echo reply against these values to
 * pick the reply handler (incoming_echo_reply(), incoming_mcast_reply() or
 * incoming_rtt_reply()); any other value is traced and ignored.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
35 
/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6.
 *
 * probe() fills in one of these and hands sizeof (struct pr_icmp) bytes to
 * sendto(); in_data() and in6_data() overlay it on the received ICMP payload.
 * pr_icmp_seq, pr_icmp_timestamp and pr_icmp_mtype are written with
 * htons()/htonll()/htonl(), i.e. they are in network byte order on the wire.
 * pr_icmp_id is copied verbatim from pii_icmpid.
 *
 * NOTE(review): on ABIs that align uint64_t to 8 bytes this struct presumably
 * carries 4 bytes of tail padding after pr_icmp_mtype, and those bytes are
 * part of what probe() transmits and checksums -- confirm for each target ABI.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* ICMP/ICMPv6 type (echo request/reply) */
	uint8_t  pr_icmp_code;		/* ICMP code; probe() sends 0 */
	uint16_t pr_icmp_cksum;		/* checksum; computed in probe() for IPv4 */
	uint16_t pr_icmp_id;		/* Identification (pii_icmpid) */
	uint16_t pr_icmp_seq;		/* sequence number (pii_snxt) */
	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t pr_icmp_mtype;		/* Message type (PROBE_*) */
};
50 
/*
 * IPv6 all-nodes multicast address (ff02::1).  probe() uses it as the
 * destination of PROBE_MULTI probes on AF_INET6 instances.
 */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/*
 * IPv4 multicast address 224.0.0.1.  probe() uses it as the destination of
 * PROBE_MULTI probes on AF_INET instances.
 */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

/*
 * incoming_rtt_reply() will not lower the failure detection time until
 * MIN_SETTLING_TIME has passed since this timestamp.
 */
static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
59 
/*
 * Forward declarations of helpers private to this file (all have internal
 * linkage).
 */

/* Ancillary-data lookup and reply handlers used by in_data()/in6_data(). */
static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
    int cmsg_type);
static void		pi_set_crtt(struct target *tg, int64_t m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

/* CRTT evaluation and probe success/failure accounting. */
static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

/* Small utilities: sequence tracking, checksum, time conversion. */
static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);
static int		ns2ms(int64_t ns);
static int64_t		tv2ns(struct timeval *);
86 
87 /*
88  * CRTT - Conservative Round Trip Time Estimate
89  * Probe success - A matching probe reply received before CRTT ms has elapsed
90  *	after sending the probe.
91  * Probe failure - No probe reply received and more than CRTT ms has elapsed
92  *	after sending the probe.
93  *
94  * TLS - Time last success. Most recent probe ack received at this time.
95  * TFF - Time first fail. The time of the earliest probe failure in
96  *	a consecutive series of probe failures.
97  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
98  * 	before declaring phyint repair.
99  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
100  *	declare a phyint failure.
101  *
102  * 			Phyint state diagram
103  *
104  * The state of a phyint that is capable of being probed, is completely
105  * specified by the 3-tuple <pi_state, pg_state, I>.
106  *
107  * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
108  * IFF_OFFLINE is set.  If the phyint is also configured with a test address
109  * (the common case) and probe targets, then a phyint must also successfully
110  * be able to send and receive probes in order to remain in the PI_RUNNING
111  * state (otherwise, it transitions to PI_FAILED).
112  *
113  * Further, if a PI_RUNNING phyint is configured with a test address but is
114  * unable to find any probe targets, it will transition to the PI_NOTARGETS
115  * state, which indicates that the link is apparently functional but that
116  * in.mpathd is unable to send probes to verify functionality (in this case,
117  * in.mpathd makes the optimistic assumption that the interface is working
118  * correctly and thus does not mark the interface FAILED, but reports it as
119  * IPMP_IF_UNKNOWN through the async events and query interfaces).
120  *
121  * At any point, a phyint may be administratively marked offline via if_mpadm.
122  * In this case, the interface always transitions to PI_OFFLINE, regardless
123  * of its previous state.  When the interface is later brought back online,
124  * in.mpathd acts as if the interface is new (and thus it transitions to
125  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
126  * its probes, if probes are sent).
127  *
128  * pi_state -  PI_RUNNING or PI_FAILED
129  *	PI_RUNNING: The failure detection logic says the phyint is good.
130  *	PI_FAILED: The failure detection logic says the phyint has failed.
131  *
132  * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
133  *	PG_OK: All interfaces in the group are OK.
134  *	PG_DEGRADED: Some interfaces in the group are unusable.
135  *	PG_FAILED: All interfaces in the group are unusable.
136  *
137  *	In the case of router targets, we assume that the current list of
138  *	targets obtained from the routing table, is still valid, so the
139  *	phyint state is PI_FAILED. In the case of host targets, we delete the
140  *	list of targets, and multicast to the all hosts, to reconstruct the
141  *	target list. So the phyints are in the PI_NOTARGETS state.
142  *
143  * I -	value of (pi_flags & IFF_INACTIVE)
144  *	IFF_INACTIVE: This phyint will not send or receive packets.
145  *	Usually, inactive is tied to standby interfaces that are not yet
146  *	needed (e.g., no non-standby interfaces in the group have failed).
147  *	When failback has been disabled (FAILBACK=no configured), phyint can
148  *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
149  *	subsequently recovers after a failure.
150  *
151  * Not all 9 possible combinations of the above 3-tuple are possible.
152  *
153  * I is tracked by IP. pi_state is tracked by mpathd.
154  *
155  *			pi_state state machine
156  * ---------------------------------------------------------------------------
157  *	Event			State			New State
158  *				Action:
159  * ---------------------------------------------------------------------------
160  *	IP interface failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
161  *	detection		: set IFF_FAILED on this phyint
162  *
163  *	IP interface failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
164  *	detection		: set IFF_FAILED on this phyint
165  *
166  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=yes)
167  *	detection				     -> (PI_RUNNING, I == 0)
168  *				: clear IFF_FAILED on this phyint
169  *
170  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=no)
171  *	detection				     ->	(PI_RUNNING, I == 1)
172  *				: clear IFF_FAILED on this phyint
173  *				: if failback is disabled set I == 1
174  *
175  *	Group failure		(perform on all phyints in the group)
176  *	detection 		PI_RUNNING		PI_FAILED
177  *	(Router targets)	: set IFF_FAILED
178  *
179  *	Group failure		(perform on all phyints in the group)
180  *	detection 		PI_RUNNING		PI_NOTARGETS
181  *	(Host targets)		: set IFF_FAILED
182  *				: delete the target list on all phyints
183  * ---------------------------------------------------------------------------
184  */
185 
/*
 * Global (external linkage) instance of struct probes_missed.
 * NOTE(review): neither the type nor any user of this object is visible in
 * this part of the file; presumably it records missed-probe bookkeeping --
 * confirm against the header that declares struct probes_missed.
 */
struct probes_missed probes_missed;
187 
188 /*
189  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
190  * will be added on by the kernel.  The id field identifies this phyint.
191  * and the sequence number is an increasing (modulo 2^^16) integer. The data
192  * portion holds the time value when the packet is sent. On echo this is
193  * extracted to compute the round-trip time. Three different types of
194  * probe packets are used.
195  *
196  * PROBE_UNI: This type is used to do failure detection / failure recovery
197  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
198  *	not less than the current CRTT. pii_probes[] stores data
199  *	about these probes. These packets consume sequence number space.
200  *
201  * PROBE_RTT: This type is used to make only rtt measurements. Normally these
202  * 	are not used. Under heavy network load, the rtt may go up very high,
203  *	due to a spike, or may appear to go high, due to extreme scheduling
204  * 	delays. Once the network stress is removed, mpathd takes long time to
205  *	recover, because the probe_interval is already high, and it takes
206  *	a long time to send out sufficient number of probes to bring down the
207  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
208  *	user_probe_interval ms. and will cause only rtt updates. These packets
209  *	do not consume sequence number space nor is information about these
210  *	packets stored in the pii_probes[]
211  *
212  * PROBE_MULTI: This type is only used to construct a list of targets, when
213  *	no targets are known. The packet is multicast to the all hosts addr.
214  */
215 static void
216 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
217 {
218 	hrtime_t sent_hrtime;
219 	struct timeval sent_tv;
220 	struct pr_icmp probe_pkt;	/* Probe packet */
221 	struct sockaddr_storage targ;	/* target address */
222 	uint_t	targaddrlen;		/* targed address length */
223 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
224 	boolean_t sent = _B_TRUE;
225 
226 	if (debug & D_TARGET) {
227 		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
228 		    pii->pii_name, probe_type, start_hrtime);
229 	}
230 
231 	assert(pii->pii_probe_sock != -1);
232 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
233 	    probe_type == PROBE_RTT);
234 
235 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
236 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
237 	probe_pkt.pr_icmp_code = 0;
238 	probe_pkt.pr_icmp_cksum = 0;
239 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
240 
241 	/*
242 	 * Since there is no need to do arithmetic on the icmpid,
243 	 * (only equality check is done) pii_icmpid is stored in
244 	 * network byte order at initialization itself.
245 	 */
246 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
247 	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
248 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
249 
250 	/*
251 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
252 	 * the all hosts address. Otherwise it is unicast to the next target.
253 	 */
254 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
255 	    pii->pii_rtt_target_next != NULL));
256 
257 	bzero(&targ, sizeof (targ));
258 	targ.ss_family = pii->pii_af;
259 
260 	if (pii->pii_af == AF_INET6) {
261 		struct in6_addr *addr6;
262 
263 		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
264 		targaddrlen = sizeof (struct sockaddr_in6);
265 		if (probe_type == PROBE_MULTI) {
266 			*addr6 = all_nodes_mcast_v6;
267 		} else if (probe_type == PROBE_UNI) {
268 			*addr6 = pii->pii_target_next->tg_address;
269 		} else { /* type is PROBE_RTT */
270 			*addr6 = pii->pii_rtt_target_next->tg_address;
271 		}
272 	} else {
273 		struct in_addr *addr4;
274 
275 		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
276 		targaddrlen = sizeof (struct sockaddr_in);
277 		if (probe_type == PROBE_MULTI) {
278 			*addr4 = all_nodes_mcast_v4;
279 		} else if (probe_type == PROBE_UNI) {
280 			IN6_V4MAPPED_TO_INADDR(
281 			    &pii->pii_target_next->tg_address, addr4);
282 		} else { /* type is PROBE_RTT */
283 			IN6_V4MAPPED_TO_INADDR(
284 			    &pii->pii_rtt_target_next->tg_address, addr4);
285 		}
286 
287 		/*
288 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
289 		 */
290 		probe_pkt.pr_icmp_cksum =
291 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
292 	}
293 
294 	/*
295 	 * Use the current time as the time we sent.  Not atomic, but the best
296 	 * we can do from here.
297 	 */
298 	sent_hrtime = gethrtime();
299 	(void) gettimeofday(&sent_tv, NULL);
300 	if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
301 	    (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) {
302 		logperror_pii(pii, "probe: probe sendto");
303 		sent = _B_FALSE;
304 	}
305 
306 	/*
307 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
308 	 * update our tables. We will need this info in processing the probe
309 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
310 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
311 	 * are only used to construct a list of targets. PROBE_RTT packets are
312 	 * used only for updating the rtt and not for failure detection.
313 	 */
314 	if (probe_type == PROBE_UNI && sent) {
315 		pr_ndx = pii->pii_probe_next;
316 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
317 
318 		/* Collect statistics, before we reuse the last slot. */
319 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
320 			pii->pii_cum_stats.lost++;
321 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
322 			pii->pii_cum_stats.acked++;
323 		pii->pii_cum_stats.sent++;
324 
325 		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
326 		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
327 		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
328 		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
329 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
330 		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
331 
332 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
333 		pii->pii_target_next = target_next(pii->pii_target_next);
334 		assert(pii->pii_target_next != NULL);
335 		/*
336 		 * If we have a single variable to denote the next target to
337 		 * probe for both rtt probes and failure detection probes, we
338 		 * could end up with a situation where the failure detection
339 		 * probe targets become disjoint from the rtt probe targets.
340 		 * Eg. if 2 targets and the actual fdt is double the user
341 		 * specified fdt. So we have 2 variables. In this scheme
342 		 * we also reset pii_rtt_target_next for every fdt probe,
343 		 * though that may not be necessary.
344 		 */
345 		pii->pii_rtt_target_next = pii->pii_target_next;
346 		pii->pii_snxt++;
347 	} else if (probe_type == PROBE_RTT) {
348 		pii->pii_rtt_target_next =
349 		    target_next(pii->pii_rtt_target_next);
350 		assert(pii->pii_rtt_target_next != NULL);
351 	}
352 }
353 
354 /*
355  * Incoming IPv4 data from wire, is received here. Called from main.
356  */
357 void
358 in_data(struct phyint_instance *pii)
359 {
360 	struct	sockaddr_in 	from;
361 	struct	in6_addr	fromaddr;
362 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
363 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
364 	struct ip *ip;
365 	int 	iphlen;
366 	int 	len;
367 	char 	abuf[INET_ADDRSTRLEN];
368 	struct msghdr msg;
369 	struct iovec iov;
370 	struct pr_icmp *reply;
371 	struct timeval *recv_tvp;
372 
373 	if (debug & D_PROBE) {
374 		logdebug("in_data(%s %s)\n",
375 		    AF_STR(pii->pii_af), pii->pii_name);
376 	}
377 
378 	iov.iov_base = (char *)in_packet;
379 	iov.iov_len = sizeof (in_packet);
380 	msg.msg_iov = &iov;
381 	msg.msg_iovlen = 1;
382 	msg.msg_name = (struct sockaddr *)&from;
383 	msg.msg_namelen = sizeof (from);
384 	msg.msg_control = ancillary_data;
385 	msg.msg_controllen = sizeof (ancillary_data);
386 
387 	/*
388 	 * Poll has already told us that a message is waiting,
389 	 * on this socket. Read it now. We should not block.
390 	 */
391 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
392 		logperror_pii(pii, "in_data: recvmsg");
393 		return;
394 	}
395 
396 	/*
397 	 * If the datalink has indicated the link is down, don't go
398 	 * any further.
399 	 */
400 	if (LINK_DOWN(pii->pii_phyint))
401 		return;
402 
403 	/* Get the printable address for error reporting */
404 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
405 
406 	/* Ignore packets > 64k or control buffers that don't fit */
407 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
408 		if (debug & D_PKTBAD) {
409 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
410 			    msg.msg_flags, abuf);
411 		}
412 		return;
413 	}
414 
415 	/* Make sure packet contains at least minimum ICMP header */
416 	ip = (struct ip *)in_packet;
417 	iphlen = ip->ip_hl << 2;
418 	if (len < iphlen + ICMP_MINLEN) {
419 		if (debug & D_PKTBAD) {
420 			logdebug("in_data: packet too short (%d bytes)"
421 			    " from %s\n", len, abuf);
422 		}
423 		return;
424 	}
425 
426 	/*
427 	 * Subtract the IP hdr length, 'len' will be length of the probe
428 	 * reply, starting from the icmp hdr.
429 	 */
430 	len -= iphlen;
431 	/* LINTED */
432 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
433 
434 	/* Probe replies are icmp echo replies. Ignore anything else */
435 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
436 		return;
437 
438 	/*
439 	 * The icmp id should match what we sent, which is stored
440 	 * in pi_icmpid. The icmp code for reply must be 0.
441 	 * The reply content must be a struct pr_icmp
442 	 */
443 	if (reply->pr_icmp_id != pii->pii_icmpid) {
444 		/* Not in response to our probe */
445 		return;
446 	}
447 
448 	if (reply->pr_icmp_code != 0) {
449 		logtrace("probe reply code %d from %s on %s\n",
450 		    reply->pr_icmp_code, abuf, pii->pii_name);
451 		return;
452 	}
453 
454 	if (len < sizeof (struct pr_icmp)) {
455 		logtrace("probe reply too short: %d bytes from %s on %s\n",
456 		    len, abuf, pii->pii_name);
457 		return;
458 	}
459 
460 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
461 	if (recv_tvp == NULL) {
462 		logtrace("message without timestamp from %s on %s\n",
463 		    abuf, pii->pii_name);
464 		return;
465 	}
466 
467 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
468 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
469 		/* Unicast probe reply */
470 		incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
471 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
472 		/* Multicast reply */
473 		incoming_mcast_reply(pii, reply, fromaddr);
474 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
475 		incoming_rtt_reply(pii, reply, fromaddr);
476 	} else {
477 		/* Probably not in response to our probe */
478 		logtrace("probe reply type: %d from %s on %s\n",
479 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
480 		return;
481 	}
482 }
483 
484 /*
485  * Incoming IPv6 data from wire is received here. Called from main.
486  */
487 void
488 in6_data(struct phyint_instance *pii)
489 {
490 	struct sockaddr_in6 from;
491 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
492 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
493 	int len;
494 	char abuf[INET6_ADDRSTRLEN];
495 	struct msghdr msg;
496 	struct iovec iov;
497 	void	*opt;
498 	struct	pr_icmp *reply;
499 	struct	timeval *recv_tvp;
500 
501 	if (debug & D_PROBE) {
502 		logdebug("in6_data(%s %s)\n",
503 		    AF_STR(pii->pii_af), pii->pii_name);
504 	}
505 
506 	iov.iov_base = (char *)in_packet;
507 	iov.iov_len = sizeof (in_packet);
508 	msg.msg_iov = &iov;
509 	msg.msg_iovlen = 1;
510 	msg.msg_name = (struct sockaddr *)&from;
511 	msg.msg_namelen = sizeof (from);
512 	msg.msg_control = ancillary_data;
513 	msg.msg_controllen = sizeof (ancillary_data);
514 
515 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
516 		logperror_pii(pii, "in6_data: recvmsg");
517 		return;
518 	}
519 
520 	/*
521 	 * If the datalink has indicated that the link is down, don't go
522 	 * any further.
523 	 */
524 	if (LINK_DOWN(pii->pii_phyint))
525 		return;
526 
527 	/* Get the printable address for error reporting */
528 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
529 	if (len < ICMP_MINLEN) {
530 		if (debug & D_PKTBAD) {
531 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
532 			    msg.msg_flags, abuf);
533 		}
534 		return;
535 	}
536 	/* Ignore packets > 64k or control buffers that don't fit */
537 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
538 		if (debug & D_PKTBAD) {
539 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
540 			    msg.msg_flags, abuf);
541 		}
542 		return;
543 	}
544 
545 	reply = (struct pr_icmp *)in_packet;
546 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
547 		return;
548 
549 	if (reply->pr_icmp_id != pii->pii_icmpid) {
550 		/* Not in response to our probe */
551 		return;
552 	}
553 
554 	/*
555 	 * The kernel has already verified the the ICMP checksum.
556 	 */
557 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
558 		logtrace("ICMPv6 echo reply source address not linklocal from "
559 		    "%s on %s\n", abuf, pii->pii_name);
560 		return;
561 	}
562 	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
563 	if (opt != NULL) {
564 		/* Can't allow routing headers in probe replies  */
565 		logtrace("message with routing header from %s on %s\n",
566 		    abuf, pii->pii_name);
567 		return;
568 	}
569 
570 	if (reply->pr_icmp_code != 0) {
571 		logtrace("probe reply code: %d from %s on %s\n",
572 		    reply->pr_icmp_code, abuf, pii->pii_name);
573 		return;
574 	}
575 	if (len < (sizeof (struct pr_icmp))) {
576 		logtrace("probe reply too short: %d bytes from %s on %s\n",
577 		    len, abuf, pii->pii_name);
578 		return;
579 	}
580 
581 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
582 	if (recv_tvp == NULL) {
583 		logtrace("message without timestamp from %s on %s\n",
584 		    abuf, pii->pii_name);
585 		return;
586 	}
587 
588 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
589 		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
590 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
591 		incoming_mcast_reply(pii, reply, from.sin6_addr);
592 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
593 		incoming_rtt_reply(pii, reply, from.sin6_addr);
594 	} else  {
595 		/* Probably not in response to our probe */
596 		logtrace("probe reply type: %d from %s on %s\n",
597 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
598 	}
599 }
600 
601 /*
602  * Process the incoming rtt reply, in response to our rtt probe.
603  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
604  * have any stored information about the probe we sent. So we don't log
605  * any errors if we receive bad replies.
606  */
607 static void
608 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
609     struct in6_addr fromaddr)
610 {
611 	int64_t	m;		/* rtt measurement in ns */
612 	char	abuf[INET6_ADDRSTRLEN];
613 	struct	target	*target;
614 	struct 	phyint_group *pg;
615 
616 	/* Get the printable address for error reporting */
617 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
618 
619 	if (debug & D_PROBE) {
620 		logdebug("incoming_rtt_reply: %s %s %s\n",
621 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
622 	}
623 
624 	/* Do we know this target ? */
625 	target = target_lookup(pii, fromaddr);
626 	if (target == NULL)
627 		return;
628 
629 	m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
630 	/* Invalid rtt. It has wrapped around */
631 	if (m < 0)
632 		return;
633 
634 	/*
635 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
636 	 * The initial few responses after the interface is repaired may
637 	 * contain high rtt's because they could have been queued up waiting
638 	 * for ARP/NDP resolution on a failed interface.
639 	 */
640 	pg = pii->pii_phyint->pi_group;
641 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
642 		return;
643 
644 	/*
645 	 * Update rtt only if the new rtt is lower than the current rtt.
646 	 * (specified by the 3rd parameter to pi_set_crtt).
647 	 * If a spike has caused the current probe_interval to be >
648 	 * user_probe_interval, then this mechanism is used to bring down
649 	 * the rtt rapidly once the network stress is removed.
650 	 * If the new rtt is higher than the current rtt, we don't want to
651 	 * update the rtt. We are having more than 1 outstanding probe and
652 	 * the increase in rtt we are seeing is being unnecessarily weighted
653 	 * many times. The regular rtt update will be handled by
654 	 * incoming_echo_reply() and will take care of any rtt increase.
655 	 */
656 	pi_set_crtt(target, m, _B_FALSE);
657 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
658 	    (user_failure_detection_time < pg->pg_fdt) &&
659 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
660 		/*
661 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
662 		 * investigate if we can improve the failure detection time to
663 		 * meet whatever the user specified.
664 		 */
665 		if (check_pg_crtt_improved(pg)) {
666 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
667 			    user_failure_detection_time);
668 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
669 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
670 				logerr("Improved failure detection time %d ms "
671 				    "on (%s %s) for group \"%s\"\n",
672 				    pg->pg_fdt, AF_STR(pii->pii_af),
673 				    pii->pii_name,
674 				    pii->pii_phyint->pi_group->pg_name);
675 			}
676 			if (user_failure_detection_time == pg->pg_fdt) {
677 				/* Avoid any truncation or rounding errors */
678 				pg->pg_probeint = user_probe_interval;
679 				/*
680 				 * No more rtt probes will be sent. The actual
681 				 * fdt has dropped to the user specified value.
682 				 * pii_fd_snxt_basetime and pii_snxt_basetime
683 				 * will be in sync henceforth.
684 				 */
685 				reset_snxt_basetimes();
686 			}
687 		}
688 	}
689 }
690 
691 /*
692  * Process the incoming echo reply, in response to our unicast probe.
693  * Common for both IPv4 and IPv6
694  */
695 static void
696 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
697     struct in6_addr fromaddr, struct timeval *recv_tvp)
698 {
699 	int64_t	m;		/* rtt measurement in ns */
700 	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
701 	char	abuf[INET6_ADDRSTRLEN];
702 	int	pr_ndx;
703 	struct	target	*target;
704 	boolean_t exception;
705 	uint64_t pr_icmp_timestamp;
706 	uint16_t pr_icmp_seq;
707 	struct	probe_stats *pr_statp;
708 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
709 
710 	/* Get the printable address for error reporting */
711 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
712 
713 	if (debug & D_PROBE) {
714 		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
715 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
716 		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
717 	}
718 
719 	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
720 	pr_icmp_seq = ntohs(reply->pr_icmp_seq);
721 
722 	/* Reject out of window probe replies */
723 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
724 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
725 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
726 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
727 		pii->pii_cum_stats.unknown++;
728 		return;
729 	}
730 
731 	cur_hrtime = gethrtime();
732 	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
733 	if (m < 0) {
734 		/*
735 		 * This is a ridiculously high value of rtt. rtt has wrapped
736 		 * around. Log a message, and ignore the rtt.
737 		 */
738 		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
739 		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
740 	}
741 
742 	/*
743 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
744 	 * number in our pii->pii_probes[] array. The icmp sequence number
745 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
746 	 */
747 	pr_ndx = MOD_SUB(pii->pii_probe_next,
748 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
749 
750 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
751 
752 	target = pii->pii_probes[pr_ndx].pr_target;
753 
754 	/*
755 	 * Perform sanity checks, whether this probe reply that we
756 	 * have received is genuine
757 	 */
758 	if (target != NULL) {
759 		/*
760 		 * Compare the src. addr of the received ICMP or ICMPv6
761 		 * probe reply with the target address in our tables.
762 		 */
763 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
764 			/*
765 			 * We don't have any record of having sent a probe to
766 			 * this target. This is a fake probe reply. Log an error
767 			 */
768 			logtrace("probe status %d Fake probe reply seq %u "
769 			    "snxt %u on %s from %s\n",
770 			    pii->pii_probes[pr_ndx].pr_status,
771 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
772 			pii->pii_cum_stats.unknown++;
773 			return;
774 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
775 			/*
776 			 * The address matches, but our tables indicate that
777 			 * this probe reply has been acked already. So this
778 			 * is a duplicate probe reply. Log an error
779 			 */
780 			logtrace("probe status %d Duplicate probe reply seq %u "
781 			    "snxt %u on %s from %s\n",
782 			    pii->pii_probes[pr_ndx].pr_status,
783 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
784 			pii->pii_cum_stats.unknown++;
785 			return;
786 		}
787 	} else {
788 		/*
789 		 * Target must not be NULL in the PR_UNACKED state
790 		 */
791 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
792 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
793 			/*
794 			 * The probe stats slot is unused. So we didn't
795 			 * send out any probe to this target. This is a fake.
796 			 * Log an error.
797 			 */
798 			logtrace("probe status %d Fake probe reply seq %u "
799 			    "snxt %u on %s from %s\n",
800 			    pii->pii_probes[pr_ndx].pr_status,
801 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
802 		}
803 		pii->pii_cum_stats.unknown++;
804 		return;
805 	}
806 
807 	/*
808 	 * If the rtt does not appear to be right, don't update the
809 	 * rtt stats. This can happen if the system dropped into the
810 	 * debugger, or the system was hung or too busy for a
811 	 * substantial time that we didn't get a chance to run.
812 	 */
813 	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
814 		/*
815 		 * If the probe corresponding to this received response
816 		 * was truly sent 'm' ns. ago, then this response must
817 		 * have been rejected by the sequence number checks. The
818 		 * fact that it has passed the sequence number checks
819 		 * means that the measured rtt is wrong. We were probably
820 		 * scheduled long after the packet was received.
821 		 */
822 		goto out;
823 	}
824 
825 	/*
826 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
827 	 * The initial few responses after the interface is repaired may
828 	 * contain high rtt's because they could have been queued up waiting
829 	 * for ARP/NDP resolution on a failed interface.
830 	 */
831 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
832 		goto out;
833 
	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
839 	if (!highest_ack_tg(pr_icmp_seq, target))
840 		goto out;
841 
842 	/*
843 	 * Always update the rtt. This is a failure detection probe
844 	 * and we want to measure both increase / decrease in rtt.
845 	 */
846 	pi_set_crtt(target, m, _B_TRUE);
847 
848 	/*
849 	 * If the crtt exceeds the average time between probes,
850 	 * investigate if this slow target is an exception. If so we
851 	 * can avoid this target and still meet the failure detection
852 	 * time. Otherwise we can't meet the failure detection time.
853 	 */
854 	if (target->tg_crtt > pg->pg_probeint) {
855 		exception = check_exception_target(pii, target);
856 		if (exception) {
857 			/*
858 			 * This target is exceptionally slow. Don't use it
859 			 * for future probes. check_exception_target() has
860 			 * made sure that we have at least MIN_PROBE_TARGETS
861 			 * other active targets
862 			 */
863 			if (pii->pii_targets_are_routers) {
864 				/*
865 				 * This is a slow router, mark it as slow
866 				 * and don't use it for further probes. We
867 				 * don't delete it, since it will be populated
868 				 * again when we do a router scan. Hence we
869 				 * need to maintain extra state (unlike the
870 				 * host case below).  Mark it as TG_SLOW.
871 				 */
872 				if (target->tg_status == TG_ACTIVE)
873 					pii->pii_ntargets--;
874 				target->tg_status = TG_SLOW;
875 				target->tg_latime = gethrtime();
876 				target->tg_rtt_sa = -1;
877 				target->tg_crtt = 0;
878 				target->tg_rtt_sd = 0;
879 				if (pii->pii_target_next == target) {
880 					pii->pii_target_next =
881 					    target_next(target);
882 				}
883 			} else {
884 				/*
885 				 * the slow target is not a router, we can
886 				 * just delete it. Send an icmp multicast and
887 				 * pick the fastest responder that is not
888 				 * already an active target. target_delete()
889 				 * adjusts pii->pii_target_next
890 				 */
891 				target_delete(target);
892 				probe(pii, PROBE_MULTI, cur_hrtime);
893 			}
894 		} else {
895 			/*
896 			 * We can't meet the failure detection time.
897 			 * Log a message, and update the detection time to
898 			 * whatever we can achieve.
899 			 */
900 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
901 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
902 			last_fdt_bumpup_time = gethrtime();
903 			if (pg != phyint_anongroup) {
904 				logerr("Cannot meet requested failure detection"
905 				    " time of %d ms on (%s %s) new failure"
906 				    " detection time for group \"%s\" is %d"
907 				    " ms\n", user_failure_detection_time,
908 				    AF_STR(pii->pii_af), pii->pii_name,
909 				    pg->pg_name, pg->pg_fdt);
910 			}
911 		}
912 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
913 	    (user_failure_detection_time < pg->pg_fdt) &&
914 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
915 		/*
916 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
917 		 * investigate if we can improve the failure detection time to
918 		 * meet whatever the user specified.
919 		 */
920 		if (check_pg_crtt_improved(pg)) {
921 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
922 			    user_failure_detection_time);
923 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
924 			if (pg != phyint_anongroup) {
925 				logerr("Improved failure detection time %d ms "
926 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
927 				    AF_STR(pii->pii_af), pii->pii_name,
928 				    pg->pg_name);
929 			}
930 			if (user_failure_detection_time == pg->pg_fdt) {
931 				/* Avoid any truncation or rounding errors */
932 				pg->pg_probeint = user_probe_interval;
933 				/*
934 				 * No more rtt probes will be sent. The actual
935 				 * fdt has dropped to the user specified value.
936 				 * pii_fd_snxt_basetime and pii_snxt_basetime
937 				 * will be in sync henceforth.
938 				 */
939 				reset_snxt_basetimes();
940 			}
941 		}
942 	}
943 out:
944 	pr_statp = &pii->pii_probes[pr_ndx];
945 	pr_statp->pr_hrtime_ackproc = cur_hrtime;
946 	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
947 	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
948 
949 	probe_chstate(pr_statp, pii, PR_ACKED);
950 
951 	/*
952 	 * Update pii->pii_rack, i.e. the sequence number of the last received
953 	 * probe response, based on the echo reply we have received now, if
954 	 * either of the following conditions are satisfied.
955 	 * a. pii_rack is outside the current receive window of
956 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
957 	 *    This means we have not received probe responses for a
958 	 *    long time, and the sequence number has wrapped around.
959 	 * b. pii_rack is within the current receive window and this echo
960 	 *    reply corresponds to the highest sequence number we have seen
961 	 *    so far.
962 	 */
963 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
964 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
965 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
966 		pii->pii_rack = pr_icmp_seq;
967 	}
968 }
969 
970 /*
971  * Returns true if seq is the highest unacknowledged seq for target tg
972  * else returns false
973  */
974 static boolean_t
975 highest_ack_tg(uint16_t seq, struct target *tg)
976 {
977 	struct phyint_instance *pii;
978 	int	 pr_ndx;
979 	uint16_t pr_seq;
980 
981 	pii = tg->tg_phyint_inst;
982 
983 	/*
984 	 * Get the seq number of the most recent probe sent so far,
985 	 * and also get the corresponding probe index in the probe stats
986 	 * array.
987 	 */
988 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
989 	pr_seq = pii->pii_snxt;
990 	pr_seq--;
991 
992 	/*
993 	 * Start from the most recent probe and walk back, trying to find
994 	 * an acked probe corresponding to target tg.
995 	 */
996 	for (; pr_ndx != pii->pii_probe_next;
997 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
998 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
999 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1000 			if (SEQ_GT(pr_seq, seq))
1001 				return (_B_FALSE);
1002 		}
1003 	}
1004 	return (_B_TRUE);
1005 }
1006 
1007 /*
1008  * Check whether the crtt for the group has improved by a factor of
1009  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1010  * detection time flapping in the face of small crtt changes.
1011  */
1012 static boolean_t
1013 check_pg_crtt_improved(struct phyint_group *pg)
1014 {
1015 	struct	phyint *pi;
1016 
1017 	if (debug & D_PROBE)
1018 		logdebug("check_pg_crtt_improved()\n");
1019 
1020 	/*
1021 	 * The crtt for the group is only improved if each phyint_instance
1022 	 * for both ipv4 and ipv6 is improved.
1023 	 */
1024 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1025 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1026 		    !check_pii_crtt_improved(pi->pi_v6))
1027 			return (_B_FALSE);
1028 	}
1029 
1030 	return (_B_TRUE);
1031 }
1032 
1033 /*
1034  * Check whether the crtt has improved substantially on this phyint_instance.
1035  * Returns _B_TRUE if there's no crtt information available, because pii
1036  * is NULL or the phyint_instance is not capable of probing.
1037  */
1038 boolean_t
1039 check_pii_crtt_improved(struct phyint_instance *pii) {
1040 	struct 	target *tg;
1041 
1042 	if (pii == NULL)
1043 		return (_B_TRUE);
1044 
1045 	if (!PROBE_CAPABLE(pii) ||
1046 	    pii->pii_phyint->pi_state == PI_FAILED)
1047 		return (_B_TRUE);
1048 
1049 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1050 		if (tg->tg_status != TG_ACTIVE)
1051 			continue;
1052 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1053 		    LOWER_FDT_TRIGGER)) {
1054 			return (_B_FALSE);
1055 		}
1056 	}
1057 
1058 	return (_B_TRUE);
1059 }
1060 
1061 /*
1062  * This target responds very slowly to probes. The target's crtt exceeds
1063  * the probe interval of its group. Compare against other targets
1064  * and determine if this target is an exception, if so return true, else false
1065  */
1066 static boolean_t
1067 check_exception_target(struct phyint_instance *pii, struct target *target)
1068 {
1069 	struct	target *tg;
1070 	char abuf[INET6_ADDRSTRLEN];
1071 
1072 	if (debug & D_PROBE) {
1073 		logdebug("check_exception_target(%s %s target %s)\n",
1074 		    AF_STR(pii->pii_af), pii->pii_name,
1075 		    pr_addr(pii->pii_af, target->tg_address,
1076 		    abuf, sizeof (abuf)));
1077 	}
1078 
1079 	/*
1080 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1081 	 * to make a good judgement. Otherwise don't drop this target.
1082 	 */
1083 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1084 		return (_B_FALSE);
1085 
1086 	/*
1087 	 * Determine whether only this particular target is slow.
1088 	 * We know that this target's crtt exceeds the group's probe interval.
1089 	 * If all other active targets have a
1090 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1091 	 * then this target is considered slow.
1092 	 */
1093 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1094 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1095 			if (tg->tg_crtt >
1096 			    pii->pii_phyint->pi_group->pg_probeint /
1097 			    EXCEPTION_FACTOR) {
1098 				return (_B_FALSE);
1099 			}
1100 		}
1101 	}
1102 
1103 	return (_B_TRUE);
1104 }
1105 
1106 /*
1107  * Update the target list. The icmp all hosts multicast has given us
1108  * some host to which we can send probes. If we already have sufficient
1109  * targets, discard it.
1110  */
1111 static void
1112 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1113     struct in6_addr fromaddr)
1114 /* ARGSUSED */
1115 {
1116 	int af;
1117 	char abuf[INET6_ADDRSTRLEN];
1118 	struct phyint *pi;
1119 
1120 	if (debug & D_PROBE) {
1121 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1122 		    AF_STR(pii->pii_af), pii->pii_name,
1123 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1124 	}
1125 
1126 	/*
1127 	 * Using host targets is a fallback mechanism. If we have
1128 	 * found a router, don't add this host target. If we already
1129 	 * know MAX_PROBE_TARGETS, don't add another target.
1130 	 */
1131 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1132 	if (pii->pii_targets != NULL) {
1133 		if (pii->pii_targets_are_routers ||
1134 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1135 			return;
1136 		}
1137 	}
1138 
1139 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1140 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1141 		/*
1142 		 * Guard against response from 0.0.0.0
1143 		 * and ::. Log a trace message
1144 		 */
1145 		logtrace("probe response from %s on %s\n",
1146 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1147 		    pii->pii_name);
1148 		return;
1149 	}
1150 
1151 	/*
1152 	 * This address is one of our own, so reject this address as a
1153 	 * valid probe target.
1154 	 */
1155 	af = pii->pii_af;
1156 	if (own_address(fromaddr))
1157 		return;
1158 
1159 	/*
1160 	 * If the phyint is part a named group, then add the address to all
1161 	 * members of the group.  Otherwise, add the address only to the
1162 	 * phyint itself, since other phyints in the anongroup may not be on
1163 	 * the same subnet.
1164 	 */
1165 	pi = pii->pii_phyint;
1166 	if (pi->pi_group == phyint_anongroup) {
1167 		target_add(pii, fromaddr, _B_FALSE);
1168 	} else {
1169 		pi = pi->pi_group->pg_phyint;
1170 		for (; pi != NULL; pi = pi->pi_pgnext)
1171 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1172 	}
1173 }
1174 
1175 /*
1176  * Compute CRTT given an existing scaled average, scaled deviation estimate
1177  * and a new rtt time.  The formula is from Jacobson and Karels'
1178  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1179  * are the same as those in Appendix A.2 of that paper.
1180  *
1181  * m = new measurement
1182  * sa = scaled RTT average (8 * average estimates)
1183  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1184  * crtt = Conservative round trip time. Used to determine whether probe
1185  * has timed out.
1186  *
1187  * New scaled average and deviation are passed back via sap and svp
1188  */
1189 static int64_t
1190 compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
1191 {
1192 	int64_t sa = *sap;
1193 	int64_t sv = *svp;
1194 	int64_t crtt;
1195 	int64_t saved_m = m;
1196 
1197 	assert(*sap >= -1);
1198 	assert(*svp >= 0);
1199 
1200 	if (sa != -1) {
1201 		/*
1202 		 * Update average estimator:
1203 		 *	new rtt = old rtt + 1/8 Error
1204 		 *	    where Error = m - old rtt
1205 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1206 		 *	i.e. new sa =  old sa + Error
1207 		 */
1208 		m -= sa >> 3;		/* m is now Error in estimate. */
1209 		if ((sa += m) < 0) {
1210 			/* Don't allow the smoothed average to be negative. */
1211 			sa = 0;
1212 		}
1213 
1214 		/*
1215 		 * Update deviation estimator:
1216 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1217 		 *	i.e. 4 * new mdev = 4 * old mdev +
1218 		 *		(abs(Error) - old mdev)
1219 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1220 		 */
1221 		if (m < 0)
1222 			m = -m;
1223 		m -= sv >> 2;
1224 		sv += m;
1225 	} else {
1226 		/* Initialization. This is the first response received. */
1227 		sa = (m << 3);
1228 		sv = (m << 1);
1229 	}
1230 
1231 	crtt = (sa >> 3) + sv;
1232 
1233 	if (debug & D_PROBE) {
1234 		logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
1235 		    "crtt = %lld\n", saved_m, sa, sv, crtt);
1236 	}
1237 
1238 	*sap = sa;
1239 	*svp = sv;
1240 
1241 	/*
1242 	 * CRTT = average estimates  + 4 * deviation estimates
1243 	 *	= sa / 8 + sv
1244 	 */
1245 	return (crtt);
1246 }
1247 
1248 static void
1249 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
1250 {
1251 	struct phyint_instance *pii = tg->tg_phyint_inst;
1252 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1253 	int64_t sa = tg->tg_rtt_sa;
1254 	int64_t sv = tg->tg_rtt_sd;
1255 	int new_crtt;
1256 	int i;
1257 
1258 	if (debug & D_PROBE)
1259 		logdebug("pi_set_crtt: target -  m %lld\n", m);
1260 
1261 	/* store the round trip time, in case we need to defer computation */
1262 	tg->tg_deferred[tg->tg_num_deferred] = m;
1263 
1264 	new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
1265 
1266 	/*
1267 	 * If this probe's round trip time would singlehandedly cause an
1268 	 * increase in the group's probe interval consider it suspect.
1269 	 */
1270 	if ((new_crtt > probe_interval) && is_probe_uni) {
1271 		if (debug & D_PROBE) {
1272 			logdebug("Received a suspect probe on %s, new_crtt ="
1273 			    " %d, probe_interval = %d, num_deferred = %d\n",
1274 			    pii->pii_probe_logint->li_name, new_crtt,
1275 			    probe_interval, tg->tg_num_deferred);
1276 		}
1277 
1278 		/*
1279 		 * If we've deferred as many rtts as we plan on deferring, then
1280 		 * assume the link really did slow down and process all queued
1281 		 * rtts
1282 		 */
1283 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1284 			if (debug & D_PROBE) {
1285 				logdebug("Received MAXDEFERREDRTT probes which "
1286 				    "would cause an increased probe_interval.  "
1287 				    "Integrating queued rtt data points.\n");
1288 			}
1289 
1290 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1291 				tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
1292 				    &tg->tg_rtt_sd, tg->tg_deferred[i]));
1293 			}
1294 
1295 			tg->tg_num_deferred = 0;
1296 		} else {
1297 			tg->tg_num_deferred++;
1298 		}
1299 		return;
1300 	}
1301 
1302 	/*
1303 	 * If this is a normal probe, or an RTT probe that would lead to a
1304 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1305 	 * a normal probe, pitch any deferred probes since our probes are
1306 	 * again being answered within our CRTT estimates.
1307 	 */
1308 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1309 		tg->tg_rtt_sa = sa;
1310 		tg->tg_rtt_sd = sv;
1311 		tg->tg_crtt = new_crtt;
1312 		if (is_probe_uni)
1313 			tg->tg_num_deferred = 0;
1314 	}
1315 }
1316 
/*
 * Look through the ancillary data of `msg' for the first control message
 * whose level and type equal `cmsg_level' and `cmsg_type'.
 *
 * Returns a pointer to that control message's data, or NULL if no control
 * message matches.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == cmsg_level &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));

		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1335 
1336 /*
1337  * Try to activate another INACTIVE interface in the same group as `pi'.
1338  * Prefer STANDBY INACTIVE to just INACTIVE.
1339  */
1340 void
1341 phyint_activate_another(struct phyint *pi)
1342 {
1343 	struct phyint *pi2;
1344 	struct phyint *inactivepi = NULL;
1345 
1346 	if (pi->pi_group == phyint_anongroup)
1347 		return;
1348 
1349 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1350 		if (pi == pi2 || pi2->pi_state != PI_RUNNING ||
1351 		    !(pi2->pi_flags & IFF_INACTIVE))
1352 			continue;
1353 
1354 		inactivepi = pi2;
1355 		if (pi2->pi_flags & IFF_STANDBY)
1356 			break;
1357 	}
1358 
1359 	if (inactivepi != NULL)
1360 		(void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
1361 }
1362 
1363 /*
1364  * Transition a phyint to PI_RUNNING.  The caller must ensure that the
1365  * transition is appropriate.  Clears IFF_OFFLINE or IFF_FAILED if
1366  * appropriate.  Also sets IFF_INACTIVE on this or other interfaces as
1367  * appropriate (see comment below).  Finally, also updates the phyint's group
1368  * state to account for the change.
1369  */
1370 void
1371 phyint_transition_to_running(struct phyint *pi)
1372 {
1373 	struct phyint *pi2;
1374 	struct phyint *actstandbypi = NULL;
1375 	uint_t nactive = 0, nnonstandby = 0;
1376 	boolean_t onlining = (pi->pi_state == PI_OFFLINE);
1377 	boolean_t initial = (pi->pi_state == PI_INIT);
1378 	uint64_t set, clear;
1379 
1380 	/*
1381 	 * The interface is running again, but should it or another interface
1382 	 * in the group end up INACTIVE?  There are three cases:
1383 	 *
1384 	 * 1. If it's a STANDBY interface, it should be end up INACTIVE if
1385 	 *    the group is operating at capacity (i.e., there are at least as
1386 	 *    many active interfaces as non-STANDBY interfaces in the group).
1387 	 *    No other interfaces should be changed.
1388 	 *
1389 	 * 2. If it's a non-STANDBY interface and we're onlining it or
1390 	 *    FAILBACK is enabled, then it should *not* end up INACTIVE.
1391 	 *    Further, if the group is above capacity as a result of this
1392 	 *    interface, then an active STANDBY interface in the group should
1393 	 *    end up INACTIVE.
1394 	 *
1395 	 * 3. If it's a non-STANDBY interface, we're repairing it, and
1396 	 *    FAILBACK is disabled, then it should end up INACTIVE *unless*
1397 	 *    the group was failed (in which case we have no choice but to
1398 	 *    use it).  No other interfaces should be changed.
1399 	 */
1400 	if (pi->pi_group != phyint_anongroup) {
1401 		pi2 = pi->pi_group->pg_phyint;
1402 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1403 			if (!(pi2->pi_flags & IFF_STANDBY))
1404 				nnonstandby++;
1405 
1406 			if (pi2->pi_state == PI_RUNNING) {
1407 				if (!(pi2->pi_flags & IFF_INACTIVE)) {
1408 					nactive++;
1409 					if (pi2->pi_flags & IFF_STANDBY)
1410 						actstandbypi = pi2;
1411 				}
1412 			}
1413 		}
1414 	}
1415 
1416 	set = 0;
1417 	clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
1418 
1419 	if (pi->pi_flags & IFF_STANDBY) {			/* case 1 */
1420 		if (nactive >= nnonstandby)
1421 			set |= IFF_INACTIVE;
1422 		else
1423 			clear |= IFF_INACTIVE;
1424 	} else if (onlining || failback_enabled) {		/* case 2 */
1425 		if (nactive >= nnonstandby && actstandbypi != NULL)
1426 			(void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
1427 	} else if (!initial && !GROUP_FAILED(pi->pi_group)) {	/* case 3 */
1428 		set |= IFF_INACTIVE;
1429 	}
1430 	(void) change_pif_flags(pi, set, clear);
1431 
1432 	phyint_chstate(pi, PI_RUNNING);
1433 
1434 	/*
1435 	 * Update the group state to account for the change.
1436 	 */
1437 	phyint_group_refresh_state(pi->pi_group);
1438 }
1439 
1440 /*
1441  * See if a previously failed interface has started working again.
1442  */
1443 void
1444 phyint_check_for_repair(struct phyint *pi)
1445 {
1446 	if (!phyint_repaired(pi))
1447 		return;
1448 
1449 	if (pi->pi_group == phyint_anongroup) {
1450 		logerr("IP interface repair detected on %s\n", pi->pi_name);
1451 	} else {
1452 		logerr("IP interface repair detected on %s of group %s\n",
1453 		    pi->pi_name, pi->pi_group->pg_name);
1454 	}
1455 
1456 	/*
1457 	 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
1458 	 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
1459 	 * until it is brought back online.
1460 	 */
1461 	if (pi->pi_state == PI_OFFLINE) {
1462 		(void) change_pif_flags(pi, 0, IFF_FAILED);
1463 		return;
1464 	}
1465 
1466 	phyint_transition_to_running(pi);	/* calls phyint_chstate() */
1467 }
1468 
1469 /*
1470  * See if an interface has failed, or if the whole group of interfaces has
1471  * failed.
1472  */
1473 static void
1474 phyint_inst_check_for_failure(struct phyint_instance *pii)
1475 {
1476 	struct phyint	*pi = pii->pii_phyint;
1477 	struct phyint	*pi2;
1478 	boolean_t	was_active;
1479 
1480 	switch (failure_state(pii)) {
1481 	case PHYINT_FAILURE:
1482 		was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1483 
1484 		(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1485 		if (pi->pi_group == phyint_anongroup) {
1486 			logerr("IP interface failure detected on %s\n",
1487 			    pii->pii_name);
1488 		} else {
1489 			logerr("IP interface failure detected on %s of group"
1490 			    " %s\n", pii->pii_name, pi->pi_group->pg_name);
1491 		}
1492 
1493 		/*
1494 		 * If the failed interface was active, activate another
1495 		 * INACTIVE interface in the group if possible.
1496 		 */
1497 		if (was_active)
1498 			phyint_activate_another(pi);
1499 
1500 		/*
1501 		 * If the interface is offline, the state change will be
1502 		 * noted when it comes back online.
1503 		 */
1504 		if (pi->pi_state != PI_OFFLINE) {
1505 			phyint_chstate(pi, PI_FAILED);
1506 			reset_crtt_all(pi);
1507 		}
1508 		break;
1509 
1510 	case GROUP_FAILURE:
1511 		pi2 = pi->pi_group->pg_phyint;
1512 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1513 			(void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
1514 			if (pi2->pi_state == PI_OFFLINE) /* see comment above */
1515 				continue;
1516 
1517 			reset_crtt_all(pi2);
1518 			/*
1519 			 * In the case of host targets, we would have flushed
1520 			 * the targets, and gone to PI_NOTARGETS state.
1521 			 */
1522 			if (pi2->pi_state == PI_RUNNING)
1523 				phyint_chstate(pi2, PI_FAILED);
1524 		}
1525 		break;
1526 
1527 	default:
1528 		break;
1529 	}
1530 }
1531 
1532 /*
1533  * Determines if any timeout event has occurred and returns the number of
1534  * milliseconds until the next timeout event for the phyint. Returns
1535  * TIMER_INFINITY for "never".
1536  */
1537 uint_t
1538 phyint_inst_timer(struct phyint_instance *pii)
1539 {
1540 	int 	pr_ndx;
1541 	uint_t	timeout;
1542 	struct	target	*cur_tg;
1543 	struct	probe_stats *pr_statp;
1544 	struct	phyint_instance *pii_other;
1545 	struct	phyint *pi;
1546 	int	valid_unack_count;
1547 	int	i;
1548 	int	interval;
1549 	uint_t	check_time;
1550 	uint_t	cur_time;
1551 	hrtime_t cur_hrtime;
1552 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1553 
1554 	cur_hrtime = gethrtime();
1555 	cur_time = ns2ms(cur_hrtime);
1556 
1557 	if (debug & D_TIMER) {
1558 		logdebug("phyint_inst_timer(%s %s)\n",
1559 		    AF_STR(pii->pii_af), pii->pii_name);
1560 	}
1561 
1562 	pii_other = phyint_inst_other(pii);
1563 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1564 		/*
1565 		 * Check to see if we're here due to link up/down flapping; If
1566 		 * enough time has passed, then try to bring the interface
1567 		 * back up; otherwise, schedule a timer to bring it back up
1568 		 * when enough time *has* elapsed.
1569 		 */
1570 		pi = pii->pii_phyint;
1571 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1572 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1573 			if (check_time > cur_time)
1574 				return (check_time - cur_time);
1575 
1576 			phyint_check_for_repair(pi);
1577 		}
1578 	}
1579 
1580 	/*
1581 	 * If probing is not enabled on this phyint instance, don't proceed.
1582 	 */
1583 	if (!PROBE_ENABLED(pii))
1584 		return (TIMER_INFINITY);
1585 
1586 	/*
1587 	 * If the timer has fired too soon, probably triggered
1588 	 * by some other phyint instance, return the remaining
1589 	 * time
1590 	 */
1591 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1592 		return (pii->pii_snxt_time - cur_time);
1593 
1594 	/*
1595 	 * If the link is down, don't send any probes for now.
1596 	 */
1597 	if (LINK_DOWN(pii->pii_phyint))
1598 		return (TIMER_INFINITY);
1599 
1600 	/*
1601 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1602 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1603 	 * Base probe time is strictly periodic.
1604 	 */
1605 	interval = GET_RANDOM(
1606 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1607 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1608 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1609 
1610 	/*
1611 	 * Check if the current time > next time to probe. If so, we missed
1612 	 * sending 1 or more probes, probably due to heavy system load. At least
1613 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1614 	 * were scheduled. Make adjustments to the times, in multiples of
1615 	 * user_probe_interval.
1616 	 */
1617 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1618 		int n;
1619 
1620 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1621 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1622 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1623 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1624 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1625 		    pii->pii_snxt_basetime);
1626 
1627 		/* Collect statistics about missed probes */
1628 		probes_missed.pm_nprobes += n + 1;
1629 		probes_missed.pm_ntimes++;
1630 	}
1631 	pii->pii_snxt_basetime += user_probe_interval;
1632 	interval = pii->pii_snxt_time - cur_time;
1633 	if (debug & D_TARGET) {
1634 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1635 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1636 		    pii->pii_snxt_basetime, interval);
1637 	}
1638 
1639 	/*
1640 	 * If no targets are known, we need to send an ICMP multicast. The
1641 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1642 	 * to see if we found a target.
1643 	 */
1644 	if (pii->pii_target_next == NULL) {
1645 		assert(pii->pii_ntargets == 0);
1646 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1647 		probe(pii, PROBE_MULTI, cur_time);
1648 		return (interval);
1649 	}
1650 
1651 	if ((user_probe_interval != probe_interval) &&
1652 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1653 		/*
1654 		 * the failure detection (fd) probe timer has not yet fired.
1655 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1656 		 */
1657 		probe(pii, PROBE_RTT, cur_hrtime);
1658 		return (interval);
1659 	}
1660 	/*
1661 	 * the fd probe timer has fired. Need to do all failure
1662 	 * detection / recovery calculations, and then send an fd probe
1663 	 * of type PROBE_UNI.
1664 	 */
1665 	if (user_probe_interval == probe_interval) {
1666 		/*
1667 		 * We could have missed some probes, and then adjusted
1668 		 * pii_snxt_basetime above. Otherwise we could have
1669 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1670 		 */
1671 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1672 	} else {
1673 		pii->pii_fd_snxt_basetime += probe_interval;
1674 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1675 			int n;
1676 
1677 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1678 			    probe_interval;
1679 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1680 		}
1681 	}
1682 
1683 	/*
1684 	 * We can have at most, the latest 2 probes that we sent, in
1685 	 * the PR_UNACKED state. All previous probes sent, are either
1686 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1687 	 * timed out if the probe's time_start + the CRTT < currenttime.
1688 	 * For each of the last 2 probes, examine whether it has timed
1689 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1690 	 */
1691 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1692 	valid_unack_count = 0;
1693 
1694 	for (i = 0; i < 2; i++) {
1695 		pr_statp = &pii->pii_probes[pr_ndx];
1696 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1697 		switch (pr_statp->pr_status) {
1698 		case PR_ACKED:
1699 			/*
1700 			 * We received back an ACK, so the switch clearly
1701 			 * is not dropping our traffic, and thus we can
1702 			 * enable failure detection immediately.
1703 			 */
1704 			if (pii->pii_fd_hrtime > gethrtime()) {
1705 				if (debug & D_PROBE) {
1706 					logdebug("successful probe on %s; "
1707 					    "ending quiet period\n",
1708 					    pii->pii_phyint->pi_name);
1709 				}
1710 				pii->pii_fd_hrtime = gethrtime();
1711 			}
1712 			break;
1713 
1714 		case PR_UNACKED:
1715 			assert(cur_tg != NULL);
1716 			/*
1717 			 * The crtt could be zero for some reason,
1718 			 * Eg. the phyint could be failed. If the crtt is
1719 			 * not available use group's probe interval,
1720 			 * which is a worst case estimate.
1721 			 */
1722 			timeout = ns2ms(pr_statp->pr_hrtime_start);
1723 			if (cur_tg->tg_crtt != 0) {
1724 				timeout += cur_tg->tg_crtt;
1725 			} else {
1726 				timeout += probe_interval;
1727 			}
1728 			if (TIME_LT(timeout, cur_time)) {
1729 				pr_statp->pr_time_lost = timeout;
1730 				probe_chstate(pr_statp, pii, PR_LOST);
1731 			} else if (i == 1) {
1732 				/*
1733 				 * We are forced to consider this probe
1734 				 * lost, as we can have at most 2 unack.
1735 				 * probes any time, and we will be sending a
1736 				 * probe at the end of this function.
1737 				 * Normally, we should not be here, but
1738 				 * this can happen if an incoming response
1739 				 * that was considered lost has increased
1740 				 * the crtt for this target, and also bumped
1741 				 * up the FDT. Note that we never cancel or
1742 				 * increase the current pii_time_left, so
1743 				 * when the timer fires, we find 2 valid
1744 				 * unacked probes, and they are yet to timeout
1745 				 */
1746 				pr_statp->pr_time_lost = cur_time;
1747 				probe_chstate(pr_statp, pii, PR_LOST);
1748 			} else {
1749 				/*
1750 				 * Only the most recent probe can enter
1751 				 * this 'else' arm. The second most recent
1752 				 * probe must take either of the above arms,
1753 				 * if it is unacked.
1754 				 */
1755 				valid_unack_count++;
1756 			}
1757 			break;
1758 		}
1759 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1760 	}
1761 
1762 	/*
1763 	 * We send out 1 probe randomly in the interval between one half
1764 	 * and one probe interval for the group. Given that the CRTT is always
1765 	 * less than the group's probe interval, we can have at most 1
1766 	 * unacknowledged probe now.  All previous probes are either lost or
1767 	 * acked.
1768 	 */
1769 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1770 
1771 	/*
1772 	 * The timer has fired. Take appropriate action depending
1773 	 * on the current state of the phyint.
1774 	 *
1775 	 * PI_RUNNING state 	- Failure detection
1776 	 * PI_FAILED state 	- Repair detection
1777 	 */
1778 	switch (pii->pii_phyint->pi_state) {
1779 	case PI_FAILED:
1780 		/*
1781 		 * If the most recent probe (excluding unacked probes that
1782 		 * are yet to time out) has been acked, check whether the
1783 		 * phyint is now repaired.
1784 		 */
1785 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1786 			phyint_check_for_repair(pii->pii_phyint);
1787 		}
1788 		break;
1789 
1790 	case PI_RUNNING:
1791 		/*
1792 		 * It's possible our probes have been lost because of a
1793 		 * spanning-tree mandated quiet period on the switch.  If so,
1794 		 * ignore the lost probes.
1795 		 */
1796 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1797 			break;
1798 
1799 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1800 			/*
1801 			 * We have 1 or more failed probes (excluding unacked
1802 			 * probes that are yet to time out). Determine if the
1803 			 * phyint has failed.
1804 			 */
1805 			phyint_inst_check_for_failure(pii);
1806 		}
1807 		break;
1808 
1809 	default:
1810 		logerr("phyint_inst_timer: invalid state %d\n",
1811 		    pii->pii_phyint->pi_state);
1812 		abort();
1813 	}
1814 
1815 	/*
1816 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1817 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1818 	 * was called, the target list may be empty.
1819 	 */
1820 	if (pii->pii_target_next != NULL) {
1821 		probe(pii, PROBE_UNI, cur_hrtime);
1822 		/*
1823 		 * If we have just the one probe target, and we're not using
1824 		 * router targets, try to find another as we presently have
1825 		 * no resilience.
1826 		 */
1827 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1828 			probe(pii, PROBE_MULTI, cur_hrtime);
1829 	} else {
1830 		probe(pii, PROBE_MULTI, cur_hrtime);
1831 	}
1832 	return (interval);
1833 }
1834 
1835 /*
1836  * Start the probe timer for an interface instance.
1837  */
1838 void
1839 start_timer(struct phyint_instance *pii)
1840 {
1841 	uint32_t interval;
1842 
1843 	/*
1844 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1845 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1846 	 * pi_snxt_basetime is strictly periodic with a frequency of
1847 	 * the group's probe interval. The actual probe time pi_snxt_time
1848 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1849 	 * For the 1st probe on each phyint after the timer is started,
1850 	 * pi_snxt_time and pi_snxt_basetime are the same.
1851 	 */
1852 	interval = GET_RANDOM(0,
1853 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1854 
1855 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1856 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1857 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1858 	timer_schedule(interval);
1859 }
1860 
1861 /*
1862  * Restart the probe timer on an interface instance.
1863  */
1864 static void
1865 restart_timer(struct phyint_instance *pii)
1866 {
1867 	/*
1868 	 * We don't need to restart the timer if it was never started in
1869 	 * the first place (pii->pii_basetime_inited not set), as the timer
1870 	 * won't have gone off yet.
1871 	 */
1872 	if (pii->pii_basetime_inited != 0) {
1873 
1874 		if (debug & D_LINKNOTE)
1875 			logdebug("restart timer: restarting timer on %s, "
1876 			    "address family %s\n", pii->pii_phyint->pi_name,
1877 			    AF_STR(pii->pii_af));
1878 
1879 		start_timer(pii);
1880 	}
1881 }
1882 
1883 static void
1884 process_link_state_down(struct phyint *pi)
1885 {
1886 	logerr("The link has gone down on %s\n", pi->pi_name);
1887 
1888 	/*
1889 	 * Clear the probe statistics arrays, we don't want the repair
1890 	 * detection logic relying on probes that were successful prior
1891 	 * to the link going down.
1892 	 */
1893 	if (PROBE_CAPABLE(pi->pi_v4))
1894 		clear_pii_probe_stats(pi->pi_v4);
1895 	if (PROBE_CAPABLE(pi->pi_v6))
1896 		clear_pii_probe_stats(pi->pi_v6);
1897 	/*
1898 	 * Check for interface failure.  Although we know the interface
1899 	 * has failed, we don't know if all the other interfaces in the
1900 	 * group have failed as well.
1901 	 */
1902 	if ((pi->pi_state == PI_RUNNING) ||
1903 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1904 		if (debug & D_LINKNOTE) {
1905 			logdebug("process_link_state_down:"
1906 			    " checking for failure on %s\n", pi->pi_name);
1907 		}
1908 
1909 		if (pi->pi_v4 != NULL)
1910 			phyint_inst_check_for_failure(pi->pi_v4);
1911 		else if (pi->pi_v6 != NULL)
1912 			phyint_inst_check_for_failure(pi->pi_v6);
1913 	}
1914 }
1915 
1916 static void
1917 process_link_state_up(struct phyint *pi)
1918 {
1919 	logerr("The link has come up on %s\n", pi->pi_name);
1920 
1921 	/*
1922 	 * We stopped any running timers on each instance when the link
1923 	 * went down, so restart them.
1924 	 */
1925 	if (pi->pi_v4)
1926 		restart_timer(pi->pi_v4);
1927 	if (pi->pi_v6)
1928 		restart_timer(pi->pi_v6);
1929 
1930 	phyint_check_for_repair(pi);
1931 
1932 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1933 	if (pi->pi_whendx == LINK_UP_PERMIN)
1934 		pi->pi_whendx = 0;
1935 }
1936 
1937 /*
1938  * Process any changes in link state passed up from the interfaces.
1939  */
1940 void
1941 process_link_state_changes(void)
1942 {
1943 	struct phyint *pi;
1944 
1945 	/* Look for interfaces where the link state has just changed */
1946 
1947 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1948 		boolean_t old_link_state_up = LINK_UP(pi);
1949 
1950 		/*
1951 		 * Except when the "phyint" structure is created, this is
1952 		 * the only place the link state is updated.  This allows
1953 		 * this routine to detect changes in link state, rather
1954 		 * than just the current state.
1955 		 */
1956 		UPDATE_LINK_STATE(pi);
1957 
1958 		if (LINK_DOWN(pi)) {
1959 			/*
1960 			 * Has link just gone down?
1961 			 */
1962 			if (old_link_state_up)
1963 				process_link_state_down(pi);
1964 		} else {
1965 			/*
1966 			 * Has link just gone back up?
1967 			 */
1968 			if (!old_link_state_up)
1969 				process_link_state_up(pi);
1970 		}
1971 	}
1972 }
1973 
1974 void
1975 reset_crtt_all(struct phyint *pi)
1976 {
1977 	struct phyint_instance *pii;
1978 	struct target *tg;
1979 
1980 	pii = pi->pi_v4;
1981 	if (pii != NULL) {
1982 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1983 			tg->tg_crtt = 0;
1984 			tg->tg_rtt_sa = -1;
1985 			tg->tg_rtt_sd = 0;
1986 		}
1987 	}
1988 
1989 	pii = pi->pi_v6;
1990 	if (pii != NULL) {
1991 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1992 			tg->tg_crtt = 0;
1993 			tg->tg_rtt_sa = -1;
1994 			tg->tg_rtt_sd = 0;
1995 		}
1996 	}
1997 }
1998 
1999 /*
2000  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
2001  * probes on both instances IPv4 and IPv6.
2002  * If the interface has failed, return the time of the first probe failure
2003  * in "tff".
2004  */
2005 static int
2006 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
2007 {
2008 	uint_t	pi_tff;
2009 	struct	target *cur_tg;
2010 	struct	probe_fail_count pfinfo;
2011 	struct	phyint_instance *pii_other;
2012 	int	pr_ndx;
2013 
2014 	/*
2015 	 * Get the number of consecutive failed probes on
2016 	 * this phyint across all targets. Also get the number
2017 	 * of consecutive failed probes on this target only
2018 	 */
2019 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2020 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
2021 	probe_fail_info(pii, cur_tg, &pfinfo);
2022 
2023 	/* Get the time of first failure, for later use */
2024 	pi_tff = pfinfo.pf_tff;
2025 
2026 	/*
2027 	 * If the current target has not responded to the
2028 	 * last NUM_PROBE_FAILS probes, and other targets are
2029 	 * responding delete this target. Dead gateway detection
2030 	 * will eventually remove this target (if router) from the
2031 	 * routing tables. If that does not occur, we may end
2032 	 * up adding this to our list again.
2033 	 */
2034 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2035 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2036 		if (pii->pii_targets_are_routers) {
2037 			if (cur_tg->tg_status == TG_ACTIVE)
2038 				pii->pii_ntargets--;
2039 			cur_tg->tg_status = TG_DEAD;
2040 			cur_tg->tg_crtt = 0;
2041 			cur_tg->tg_rtt_sa = -1;
2042 			cur_tg->tg_rtt_sd = 0;
2043 			if (pii->pii_target_next == cur_tg)
2044 				pii->pii_target_next = target_next(cur_tg);
2045 		} else {
2046 			target_delete(cur_tg);
2047 			probe(pii, PROBE_MULTI, gethrtime());
2048 		}
2049 		return (PHYINT_OK);
2050 	}
2051 
2052 	/*
2053 	 * If the phyint has lost NUM_PROBE_FAILS or more
2054 	 * consecutive probes, on both IPv4 and IPv6 protocol
2055 	 * instances of the phyint, then trigger failure
2056 	 * detection, else return false
2057 	 */
2058 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2059 		return (PHYINT_OK);
2060 
2061 	pii_other = phyint_inst_other(pii);
2062 	if (PROBE_CAPABLE(pii_other)) {
2063 		probe_fail_info(pii_other, NULL, &pfinfo);
2064 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2065 			/*
2066 			 * We have NUM_PROBE_FAILS or more failures
2067 			 * on both IPv4 and IPv6. Get the earliest
2068 			 * time when failure was detected on this
2069 			 * phyint across IPv4 and IPv6.
2070 			 */
2071 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2072 				pi_tff = pfinfo.pf_tff;
2073 		} else {
2074 			/*
2075 			 * This instance has < NUM_PROBE_FAILS failure.
2076 			 * So return false
2077 			 */
2078 			return (PHYINT_OK);
2079 		}
2080 	}
2081 	*tff = pi_tff;
2082 	return (PHYINT_FAILURE);
2083 }
2084 
2085 /*
2086  * Check if the link has gone down on this phyint, or it has failed the
2087  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2088  * Also look at other phyints of this group, for group failures.
2089  */
2090 int
2091 failure_state(struct phyint_instance *pii)
2092 {
2093 	struct	probe_success_count psinfo;
2094 	uint_t	pi2_tls;		/* time last success */
2095 	uint_t	pi_tff;			/* time first fail */
2096 	struct	phyint *pi2;
2097 	struct	phyint *pi;
2098 	struct	phyint_instance *pii2;
2099 	struct  phyint_group *pg;
2100 	int	retval;
2101 
2102 	if (debug & D_FAILREP)
2103 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2104 
2105 	pi = pii->pii_phyint;
2106 	pg = pi->pi_group;
2107 
2108 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2109 	    PHYINT_OK)
2110 		return (PHYINT_OK);
2111 
2112 	/*
2113 	 * At this point, the link is down, or the phyint is suspect, as it
2114 	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
2115 	 * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
2116 	 * on to determine whether this should be considered a PHYINT_FAILURE
2117 	 * or GROUP_FAILURE.
2118 	 */
2119 	if (pg == phyint_anongroup)
2120 		return (PHYINT_FAILURE);
2121 
2122 	/*
2123 	 * Need to compare against other phyints of the same group
2124 	 * to exclude group failures. If the failure was detected via
2125 	 * probing, then if the time of last success (tls) of any
2126 	 * phyint is more recent than the time of first fail (tff) of the
2127 	 * phyint in question, and the link is up on the phyint,
2128 	 * then it is a phyint failure. Otherwise it is a group failure.
2129 	 * If failure was detected via a link down notification sent from
2130 	 * the driver to IP, we see if any phyints in the group are still
2131 	 * running and haven't received a link down notification.  We
2132 	 * will usually be processing the link down notification shortly
2133 	 * after it was received, so there is no point looking at the tls
2134 	 * of other phyints.
2135 	 */
2136 	retval = GROUP_FAILURE;
2137 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2138 		/* Exclude ourself from comparison */
2139 		if (pi2 == pi)
2140 			continue;
2141 
2142 		if (LINK_DOWN(pi)) {
2143 			/*
2144 			 * We use FLAGS_TO_LINK_STATE() to test the flags
2145 			 * directly, rather then LINK_UP() or LINK_DOWN(), as
2146 			 * we may not have got round to processing the link
2147 			 * state for the other phyints in the group yet.
2148 			 *
2149 			 * The check for PI_RUNNING and group failure handles
2150 			 * the case when the group begins to recover.
2151 			 * PI_RUNNING will be set, and group failure cleared
2152 			 * only after receipt of NUM_PROBE_REPAIRS, by which
2153 			 * time the other phyints should have received at
2154 			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
2155 			 */
2156 			if ((pi2->pi_state == PI_RUNNING) &&
2157 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
2158 				retval = PHYINT_FAILURE;
2159 				break;
2160 			}
2161 			continue;
2162 		}
2163 
2164 		if (LINK_DOWN(pi2))
2165 			continue;
2166 
2167 		/*
2168 		 * If there's no probe-based failure detection on this
2169 		 * interface, and its link is still up, then it's still
2170 		 * working and thus the group has not failed.
2171 		 */
2172 		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
2173 			retval = PHYINT_FAILURE;
2174 			break;
2175 		}
2176 
2177 		/*
2178 		 * Need to compare against both IPv4 and IPv6 instances.
2179 		 */
2180 		pii2 = pi2->pi_v4;
2181 		if (pii2 != NULL) {
2182 			probe_success_info(pii2, NULL, &psinfo);
2183 			if (psinfo.ps_tls_valid) {
2184 				pi2_tls = psinfo.ps_tls;
2185 				/*
2186 				 * See comment above regarding check
2187 				 * for PI_RUNNING and group failure.
2188 				 */
2189 				if (TIME_GT(pi2_tls, pi_tff) &&
2190 				    (pi2->pi_state == PI_RUNNING) &&
2191 				    !GROUP_FAILED(pg) &&
2192 				    FLAGS_TO_LINK_STATE(pi2)) {
2193 					retval = PHYINT_FAILURE;
2194 					break;
2195 				}
2196 			}
2197 		}
2198 
2199 		pii2 = pi2->pi_v6;
2200 		if (pii2 != NULL) {
2201 			probe_success_info(pii2, NULL, &psinfo);
2202 			if (psinfo.ps_tls_valid) {
2203 				pi2_tls = psinfo.ps_tls;
2204 				/*
2205 				 * See comment above regarding check
2206 				 * for PI_RUNNING and group failure.
2207 				 */
2208 				if (TIME_GT(pi2_tls, pi_tff) &&
2209 				    (pi2->pi_state == PI_RUNNING) &&
2210 				    !GROUP_FAILED(pg) &&
2211 				    FLAGS_TO_LINK_STATE(pi2)) {
2212 					retval = PHYINT_FAILURE;
2213 					break;
2214 				}
2215 			}
2216 		}
2217 	}
2218 
2219 	/*
2220 	 * Update the group state to account for the changes.
2221 	 */
2222 	phyint_group_refresh_state(pg);
2223 	return (retval);
2224 }
2225 
2226 /*
2227  * Return the information associated with consecutive probe successes
2228  * starting with the most recent probe. At most the last 2 probes can be
2229  * in the unacknowledged state. All previous probes have either failed
2230  * or succeeded.
2231  */
2232 static void
2233 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2234     struct probe_success_count *psinfo)
2235 {
2236 	uint_t	i;
2237 	struct probe_stats *pr_statp;
2238 	uint_t most_recent;
2239 	uint_t second_most_recent;
2240 	boolean_t pi_found_failure = _B_FALSE;
2241 	boolean_t tg_found_failure = _B_FALSE;
2242 	uint_t now;
2243 	uint_t timeout;
2244 	struct target *tg;
2245 
2246 	if (debug & D_FAILREP)
2247 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2248 
2249 	bzero(psinfo, sizeof (*psinfo));
2250 	now = getcurrenttime();
2251 
2252 	/*
2253 	 * Start with the most recent probe, and count the number
2254 	 * of consecutive probe successes. Latch the number of successes
2255 	 * on hitting a failure.
2256 	 */
2257 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2258 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2259 
2260 	for (i = most_recent; i != pii->pii_probe_next;
2261 	    i = PROBE_INDEX_PREV(i)) {
2262 		pr_statp = &pii->pii_probes[i];
2263 
2264 		switch (pr_statp->pr_status) {
2265 		case PR_UNACKED:
2266 			/*
2267 			 * Only the most recent 2 probes can be unacknowledged
2268 			 */
2269 			assert(i == most_recent || i == second_most_recent);
2270 
2271 			tg = pr_statp->pr_target;
2272 			assert(tg != NULL);
2273 			/*
2274 			 * The crtt could be zero for some reason,
2275 			 * Eg. the phyint could be failed. If the crtt is
2276 			 * not available use the value of the group's probe
2277 			 * interval which is a worst case estimate.
2278 			 */
2279 			timeout = ns2ms(pr_statp->pr_hrtime_start);
2280 			if (tg->tg_crtt != 0) {
2281 				timeout += tg->tg_crtt;
2282 			} else {
2283 				timeout +=
2284 				    pii->pii_phyint->pi_group->pg_probeint;
2285 			}
2286 
2287 			if (TIME_LT(timeout, now)) {
2288 				/*
2289 				 * We hit a failure. Latch the total number of
2290 				 * recent consecutive successes.
2291 				 */
2292 				pr_statp->pr_time_lost = timeout;
2293 				probe_chstate(pr_statp, pii, PR_LOST);
2294 				pi_found_failure = _B_TRUE;
2295 				if (cur_tg != NULL && tg == cur_tg) {
2296 					/*
2297 					 * We hit a failure for the desired
2298 					 * target. Latch the number of recent
2299 					 * consecutive successes for this target
2300 					 */
2301 					tg_found_failure = _B_TRUE;
2302 				}
2303 			}
2304 			break;
2305 
2306 		case PR_ACKED:
2307 			/*
2308 			 * Bump up the count of probe successes, if we
2309 			 * have not seen any failure so far.
2310 			 */
2311 			if (!pi_found_failure)
2312 				psinfo->ps_nsucc++;
2313 
2314 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2315 			    !tg_found_failure) {
2316 				psinfo->ps_nsucc_tg++;
2317 			}
2318 
2319 			/*
2320 			 * Record the time of last success, if this is
2321 			 * the most recent probe success.
2322 			 */
2323 			if (!psinfo->ps_tls_valid) {
2324 				psinfo->ps_tls =
2325 				    ns2ms(pr_statp->pr_hrtime_ackproc);
2326 				psinfo->ps_tls_valid = _B_TRUE;
2327 			}
2328 			break;
2329 
2330 		case PR_LOST:
2331 			/*
2332 			 * We hit a failure. Latch the total number of
2333 			 * recent consecutive successes.
2334 			 */
2335 			pi_found_failure = _B_TRUE;
2336 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2337 				/*
2338 				 * We hit a failure for the desired target.
2339 				 * Latch the number of recent consecutive
2340 				 * successes for this target
2341 				 */
2342 				tg_found_failure = _B_TRUE;
2343 			}
2344 			break;
2345 
2346 		default:
2347 			return;
2348 
2349 		}
2350 	}
2351 }
2352 
2353 /*
2354  * Return the information associated with consecutive probe failures
2355  * starting with the most recent probe. Only the last 2 probes can be in the
2356  * unacknowledged state. All previous probes have either failed or succeeded.
2357  */
2358 static void
2359 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2360     struct probe_fail_count *pfinfo)
2361 {
2362 	int	i;
2363 	struct probe_stats *pr_statp;
2364 	boolean_t	tg_found_success = _B_FALSE;
2365 	boolean_t	pi_found_success = _B_FALSE;
2366 	int	most_recent;
2367 	int	second_most_recent;
2368 	uint_t	now;
2369 	uint_t	timeout;
2370 	struct	target *tg;
2371 
2372 	if (debug & D_FAILREP)
2373 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2374 
2375 	bzero(pfinfo, sizeof (*pfinfo));
2376 	now = getcurrenttime();
2377 
2378 	/*
2379 	 * Start with the most recent probe, and count the number
2380 	 * of consecutive probe failures. Latch the number of failures
2381 	 * on hitting a probe success.
2382 	 */
2383 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2384 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2385 
2386 	for (i = most_recent; i != pii->pii_probe_next;
2387 	    i = PROBE_INDEX_PREV(i)) {
2388 		pr_statp = &pii->pii_probes[i];
2389 
2390 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2391 
2392 		switch (pr_statp->pr_status) {
2393 		case PR_UNACKED:
2394 			/*
2395 			 * Only the most recent 2 probes can be unacknowledged
2396 			 */
2397 			assert(i == most_recent || i == second_most_recent);
2398 
2399 			tg = pr_statp->pr_target;
2400 			/*
2401 			 * Target is guaranteed to exist in the unack. state
2402 			 */
2403 			assert(tg != NULL);
2404 			/*
2405 			 * The crtt could be zero for some reason,
2406 			 * Eg. the phyint could be failed. If the crtt is
2407 			 * not available use the group's probe interval,
2408 			 * which is a worst case estimate.
2409 			 */
2410 			timeout = ns2ms(pr_statp->pr_hrtime_start);
2411 			if (tg->tg_crtt != 0) {
2412 				timeout += tg->tg_crtt;
2413 			} else {
2414 				timeout +=
2415 				    pii->pii_phyint->pi_group->pg_probeint;
2416 			}
2417 
2418 			if (TIME_GT(timeout, now))
2419 				break;
2420 
2421 			pr_statp->pr_time_lost = timeout;
2422 			probe_chstate(pr_statp, pii, PR_LOST);
2423 			/* FALLTHRU */
2424 
2425 		case PR_LOST:
2426 			if (!pi_found_success) {
2427 				pfinfo->pf_nfail++;
2428 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2429 			}
2430 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2431 			    !tg_found_success)  {
2432 				pfinfo->pf_nfail_tg++;
2433 			}
2434 			break;
2435 
2436 		default:
2437 			/*
2438 			 * We hit a success or unused slot. Latch the
2439 			 * total number of recent consecutive failures.
2440 			 */
2441 			pi_found_success = _B_TRUE;
2442 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2443 				/*
2444 				 * We hit a success for the desired target.
2445 				 * Latch the number of recent consecutive
2446 				 * failures for this target
2447 				 */
2448 				tg_found_success = _B_TRUE;
2449 			}
2450 		}
2451 	}
2452 }
2453 
2454 /*
2455  * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2456  */
2457 void
2458 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
2459 {
2460 	if (pr->pr_status == state)
2461 		return;
2462 
2463 	pr->pr_status = state;
2464 	(void) probe_state_event(pr, pii);
2465 }
2466 
2467 /*
2468  * Check if the phyint has been repaired.  If no test address has been
2469  * configured, then consider the interface repaired if the link is up (unless
2470  * the link is flapping; see below).  Otherwise, look for proof of probes
2471  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2472  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2473  */
2474 static boolean_t
2475 phyint_repaired(struct phyint *pi)
2476 {
2477 	struct	probe_success_count psinfo;
2478 	struct	phyint_instance *pii;
2479 	struct	target *cur_tg;
2480 	int	pr_ndx;
2481 	uint_t	cur_time;
2482 
2483 	if (debug & D_FAILREP)
2484 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2485 
2486 	if (LINK_DOWN(pi))
2487 		return (_B_FALSE);
2488 
2489 	/*
2490 	 * If we don't have any test addresses and the link is up, then
2491 	 * consider the interface repaired, unless we've received more than
2492 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2493 	 * which case we keep the link down until we drop back below
2494 	 * the threshold.
2495 	 */
2496 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2497 		cur_time = getcurrenttime();
2498 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2499 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2500 			pi->pi_lfmsg_printed = 0;
2501 			return (_B_TRUE);
2502 		}
2503 		if (!pi->pi_lfmsg_printed) {
2504 			logerr("The link has come up on %s more than %d times "
2505 			    "in the last minute; disabling repair until it "
2506 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2507 			pi->pi_lfmsg_printed = 1;
2508 		}
2509 
2510 		return (_B_FALSE);
2511 	}
2512 
2513 	pii = pi->pi_v4;
2514 	if (PROBE_CAPABLE(pii)) {
2515 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2516 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2517 		probe_success_info(pii, cur_tg, &psinfo);
2518 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2519 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2520 			return (_B_TRUE);
2521 	}
2522 
2523 	pii = pi->pi_v6;
2524 	if (PROBE_CAPABLE(pii)) {
2525 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2526 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2527 		probe_success_info(pii, cur_tg, &psinfo);
2528 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2529 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2530 			return (_B_TRUE);
2531 	}
2532 
2533 	return (_B_FALSE);
2534 }
2535 
2536 /*
2537  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2538  */
2539 boolean_t
2540 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
2541 {
2542 	int ifsock;
2543 	struct lifreq lifr;
2544 	uint64_t old_flags;
2545 
2546 	if (debug & D_FAILREP) {
2547 		logdebug("change_pif_flags(%s): set %llx clear %llx\n",
2548 		    pi->pi_name, set, clear);
2549 	}
2550 
2551 	if (pi->pi_v4 != NULL)
2552 		ifsock = ifsock_v4;
2553 	else
2554 		ifsock = ifsock_v6;
2555 
2556 	/*
2557 	 * Get the current flags from the kernel, and set/clear the
2558 	 * desired phyint flags. Since we set only phyint flags, we can
2559 	 * do it on either IPv4 or IPv6 instance.
2560 	 */
2561 	(void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2562 
2563 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2564 		if (errno != ENXIO)
2565 			logperror("change_pif_flags: ioctl (get flags)");
2566 		return (_B_FALSE);
2567 	}
2568 
2569 	old_flags = lifr.lifr_flags;
2570 	lifr.lifr_flags |= set;
2571 	lifr.lifr_flags &= ~clear;
2572 
2573 	if (old_flags == lifr.lifr_flags) {
2574 		/* No change in the flags. No need to send ioctl */
2575 		return (_B_TRUE);
2576 	}
2577 
2578 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2579 		if (errno != ENXIO)
2580 			logperror("change_pif_flags: ioctl (set flags)");
2581 		return (_B_FALSE);
2582 	}
2583 
2584 	/*
2585 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2586 	 * phyint flags.
2587 	 */
2588 	pi->pi_flags |= set;
2589 	pi->pi_flags &= ~clear;
2590 
2591 	if (pi->pi_v4 != NULL)
2592 		pi->pi_v4->pii_flags = pi->pi_flags;
2593 
2594 	if (pi->pi_v6 != NULL)
2595 		pi->pi_v6->pii_flags = pi->pi_flags;
2596 
2597 	return (_B_TRUE);
2598 }
2599 
/*
 * icmp cksum computation for IPv4.
 *
 * Returns the 16 bit one's complement of the one's complement sum of the
 * `len' bytes starting at `addr'.  The data is read as 16 bit words in
 * memory order; if `len' is odd, the last byte is treated as the first
 * byte of a word whose second byte is zero.  A `len' of zero or less
 * yields 0xffff.
 *
 * The sum is accumulated in an unsigned 64 bit integer, so that it can
 * neither overflow nor go negative however large `len' is.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	uint64_t sum = 0;
	ushort_t *w = addr;
	ushort_t odd_byte = 0;
	int nleft = len;

	/* Add up all the complete 16 bit words. */
	while (nleft > 1)  {
		sum += *w++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
		sum += odd_byte;
	}

	/*
	 * Fold the carries out of the low 16 bits back into them until
	 * none are left.
	 */
	while ((sum >> 16) != 0)
		sum = (sum >> 16) + (sum & 0xffff);

	/* Complement, and truncate to 16 bits. */
	return ((ushort_t)~sum);
}
2637 
2638 static void
2639 reset_snxt_basetimes(void)
2640 {
2641 	struct phyint_instance *pii;
2642 
2643 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2644 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2645 	}
2646 }
2647 
2648 /*
2649  * Is the address one of our own addresses? Unfortunately,
2650  * we cannot check our phyint tables to determine if the address
2651  * is our own. This is because, we don't track interfaces that
2652  * are not part of any group. We have to either use a 'bind' or
2653  * get the complete list of all interfaces using SIOCGLIFCONF,
2654  * to do this check. We could also use SIOCTMYADDR.
2655  * Bind fails for the local zone address, so we might include local zone
2656  * address as target address. If local zone address is a target address
2657  * and it is up, it is not possible to detect the interface failure.
2658  * SIOCTMYADDR also doesn't consider local zone address as own address.
2659  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2660  * are stored in `localaddrs'
2661  */
2662 boolean_t
2663 own_address(struct in6_addr addr)
2664 {
2665 	addrlist_t *addrp;
2666 	struct sockaddr_storage ss;
2667 	int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
2668 
2669 	addr2storage(af, &addr, &ss);
2670 	for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
2671 		if (sockaddrcmp(&ss, &addrp->al_addr))
2672 			return (_B_TRUE);
2673 	}
2674 	return (_B_FALSE);
2675 }
2676 
/*
 * Convert a time in nanoseconds to milliseconds, discarding any fraction
 * of a millisecond.  The result is narrowed to an int.
 */
static int
ns2ms(int64_t ns)
{
	const int64_t ns_per_ms = NANOSEC / MILLISEC;

	return ((int)(ns / ns_per_ms));
}
2682 
/*
 * Convert the time in `*tvp' (seconds plus microseconds) to nanoseconds.
 *
 * Both fields are widened to 64 bits before they are scaled, so that the
 * multiplications cannot overflow where tv_sec and tv_usec are 32 bit
 * quantities.
 */
static int64_t
tv2ns(struct timeval *tvp)
{
	int64_t sec = tvp->tv_sec;
	int64_t usec = tvp->tv_usec;

	return (sec * NANOSEC + usec * 1000);
}
2688