xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision 67e3a03ed4a2813074d36330f062ed6e593a4937)
1 /*
2  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1987 Regents of the University of California.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms are permitted
11  * provided that the above copyright notice and this paragraph are
12  * duplicated in all such forms and that any documentation,
13  * advertising materials, and other materials related to such
14  * distribution and use acknowledge that the software was developed
15  * by the University of California, Berkeley. The name of the
16  * University may not be used to endorse or promote products derived
17  * from this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21  */
22 
23 #pragma ident	"%Z%%M%	%I%	%E% SMI"
24 
25 #include "mpd_defs.h"
26 #include "mpd_tables.h"
27 
28 /*
29  * Probe types for probe()
30  */
31 #define	PROBE_UNI	0x1234		/* Unicast probe: failure detection and RTT */
32 #define	PROBE_MULTI	0x5678		/* Multicast probe: target list discovery */
33 #define	PROBE_RTT	0x9abc		/* Unicast probe: RTT measurement only */
34 
35 #define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
36 
37 /*
38  * Format of probe / probe response packets. This is an ICMP Echo request
39  * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
40  */
41 struct pr_icmp
42 {
43 	uint8_t  pr_icmp_type;		/* ICMP/ICMPv6 echo request or reply type */
44 	uint8_t  pr_icmp_code;		/* code field; probes are sent with 0 */
45 	uint16_t pr_icmp_cksum;		/* checksum; filled by in_cksum() for IPv4 */
46 	uint16_t pr_icmp_id;		/* pii_icmpid, kept in network byte order */
47 	uint16_t pr_icmp_seq;		/* sequence number, network byte order */
48 	uint32_t pr_icmp_timestamp;	/* send time in ms, network byte order */
49 	uint32_t pr_icmp_mtype;		/* PROBE_UNI/MULTI/RTT, network byte order */
50 };
51 
52 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
53 				    0x0, 0x0, 0x0, 0x0,
54 				    0x0, 0x0, 0x0, 0x0,
55 				    0x0, 0x0, 0x0, 0x1 } };	/* ff02::1 */
56 
57 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };	/* 224.0.0.1 */
58 
59 static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last; compared against gethrtime() */
60 
61 static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
62 static void		pi_set_crtt(struct target *tg, int m,
63     boolean_t is_probe_uni);
64 static void		incoming_echo_reply(struct phyint_instance *pii,
65     struct pr_icmp *reply, struct in6_addr fromaddr);
66 static void		incoming_rtt_reply(struct phyint_instance *pii,
67     struct pr_icmp *reply, struct in6_addr fromaddr);
68 static void		incoming_mcast_reply(struct phyint_instance *pii,
69     struct pr_icmp *reply, struct in6_addr fromaddr);
70 
71 static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
72 static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
73 static boolean_t	check_exception_target(struct phyint_instance *pii,
74     struct target *target);
75 static void		probe_fail_info(struct phyint_instance *pii,
76     struct target *cur_tg, struct probe_fail_count *pfinfo);
77 static void		probe_success_info(struct phyint_instance *pii,
78     struct target *cur_tg, struct probe_success_count *psinfo);
79 static boolean_t	phyint_repaired(struct phyint *pi);
80 
81 static int		failover(struct phyint *from, struct phyint *to);
82 static int		failback(struct phyint *from, struct phyint *to);
83 static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
84 
85 static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
86 static int 		in_cksum(ushort_t *addr, int len);
87 static void		reset_snxt_basetimes(void);
88 
89 /*
90  * CRTT - Conservative Round Trip Time Estimate
91  * Probe success - A matching probe reply received before CRTT ms has elapsed
92  *	after sending the probe.
93  * Probe failure - No probe reply received and more than CRTT ms has elapsed
94  *	after sending the probe.
95  *
96  * TLS - Time last success. Most recent probe ack received at this time.
97  * TFF - Time first fail. The time of the earliest probe failure in
98  *	a consecutive series of probe failures.
99  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
100  * 	before declaring phyint repair.
101  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
102  *	declare a phyint failure.
103  *
104  * 			Phyint state diagram
105  *
106  * The state of a phyint that is capable of being probed, is completely
107  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
108  *
109  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
110  * of the link (according to the driver).  If the phyint is also configured
111  * with a test address (the common case) and probe targets, then a phyint must
112  * also successfully be able to send and receive probes in order to remain in
113  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
114  *
115  * Further, if a PI_RUNNING phyint is configured with a test address but is
116  * unable to find any probe targets, it will transition to the PI_NOTARGETS
117  * state, which indicates that the link is apparently functional but that
118  * in.mpathd is unable to send probes to verify functionality (in this case,
119  * in.mpathd makes the optimistic assumption that the interface is working
120  * correctly and thus does not perform a failover, but reports the interface
121  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122  *
123  * At any point, a phyint may be administratively marked offline via if_mpadm.
124  * In this case, the interface always transitions to PI_OFFLINE, regardless
125  * of its previous state.  When the interface is later brought back online,
126  * in.mpathd acts as if the interface is new (and thus it transitions to
127  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
128  * its probes, if probes are sent).
129  *
130  * pi_state -  PI_RUNNING or PI_FAILED
131  *	PI_RUNNING: The failure detection logic says the phyint is good.
132  *	PI_FAILED: The failure detection logic says the phyint has failed.
133  *
134  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
135  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
136  *	In the case of router targets, we assume that the current list of
137  *	targets obtained from the routing table, is still valid, so the
138  *	phyint state is PI_FAILED. In the case of host targets, we delete the
139  *	list of targets, and multicast to the all hosts, to reconstruct the
140  *	target list. So the phyints are in the PI_NOTARGETS state.
141  *
142  * I -	value of (pi_flags & IFF_INACTIVE)
143  *	IFF_INACTIVE: No failovers have been done to this phyint, from
144  *		other phyints. This phyint is inactive. Phyint can be a Standby.
145  *		When failback has been disabled (FAILOVER=no configured),
146  *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
147  *		is set when phyint subsequently recovers after a failure.
148  *
149  * pi_empty
150  *	This phyint has failed over successfully to another phyint, and
151  *	this phyint is currently "empty". It does not host any addresses or
152  *	multicast membership etc. This is the state of a phyint after a
153  *	failover from the phyint has completed successfully and no subsequent
154  *	'failover to' or 'failback to' has occurred on the phyint.
155  *	IP guarantees that no new logicals will be hosted nor any multicast
156  *	joins permitted on the phyint, since the phyint is either failed or
157  *	inactive. pi_empty is set implies the phyint is either failed or
158  *	inactive. pi_empty being set implies the phyint is either failed or
159  *
160  * pi_full
161  *	The phyint hosts all of its own addresses that it "owns". If the
162  *	phyint was previously failed or inactive, failbacks to the phyint
163  *	has completed successfully. i.e. No more failbacks to this phyint
164  *	can produce any change in system state whatsoever.
165  *
166  * Not all 32 possible combinations of the above 5-tuple are possible.
167  * Furthermore some of the above combinations are transient. They may occur
168  * only because the failover or failback did not complete successfully. The
169  * failover/failback will be retried and eventually a stable state will be
170  * reached.
171  *
172  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
173  * The following are the state machines. 'from' and 'to' are the src and
174  * dst of the failover/failback, below
175  *
176  *			pi_empty state machine
177  * ---------------------------------------------------------------------------
178  *	Event				State	->	New State
179  * ---------------------------------------------------------------------------
180  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
181  *	of failover
182  *
183  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
184  *
185  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
186  *
187  * 	group failure			pi_empty = X	  -> pi_empty = 0
188  * ---------------------------------------------------------------------------
189  *
190  *			pi_full state machine
191  * ---------------------------------------------------------------------------
192  *	Event				State		  -> New State
193  * ---------------------------------------------------------------------------
194  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
195  *	of failback from
196  *	each of the other phyints
197  *
198  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
199  *
200  *	group failure			pi_full = X	  -> pi_full = 0
201  * ---------------------------------------------------------------------------
202  *
203  *			pi_state state machine
204  * ---------------------------------------------------------------------------
205  *	Event			State			New State
206  *				Action:
207  * ---------------------------------------------------------------------------
208  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
209  *	detection		: set IFF_FAILED on this phyint
210  *				: failover from this phyint to another
211  *
212  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
213  *	detection		: set IFF_FAILED on this phyint
214  *
215  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=yes)
216  *	detection				     -> (PI_RUNNING, I == 0)
217  *				: to.pi_empty = 0
218  *				: clear IFF_FAILED on this phyint
219  *				: failback to this phyint if enabled
220  *
221  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=no)
222  *	detection				     ->	(PI_RUNNING, I == 1)
223  *				: to.pi_empty = 0
224  *				: clear IFF_FAILED on this phyint
225  *				: if failback is disabled set I == 1
226  *
227  *	Group failure		(perform on all phyints in the group)
228  *	detection 		PI_RUNNING		PI_FAILED
229  *	(Router targets)	: set IFF_FAILED
230  *				: clear pi_empty and pi_full
231  *
232  *	Group failure		(perform on all phyints in the group)
233  *	detection 		PI_RUNNING		PI_NOTARGETS
234  *	(Host targets)		: set IFF_FAILED
235  *				: clear pi_empty and pi_full
236  *				: delete the target list on all phyints
237  * ---------------------------------------------------------------------------
238  *
239  *			I state machine
240  * ---------------------------------------------------------------------------
241  *	Event		State			Action:
242  * ---------------------------------------------------------------------------
243  *	Turn on I 	pi_empty == 0, STANDBY 	: failover from standby
244  *
245  *	Turn off I 	PI_RUNNING, STANDBY	: pi_empty = 0
246  *			pi_full == 0		: failback to this if enabled
247  * ---------------------------------------------------------------------------
248  *
249  * Assertions: (Read '==>' as implies)
250  *
251  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
252  * (pi_empty == 1) ==> (pi_full == 0)
253  * (pi_full  == 1) ==> (pi_empty == 0)
254  *
255  * Invariants
256  *
257  * pg_groupfailed = 0  &&
258  *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
259  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
260  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
261  *
262  * 1. says that an inactive standby, that is not empty, has to be failed
263  * over. For a standby to be truly inactive, it should not host any
264  * addresses. So we move them to some other phyint. Usually we catch the
265  * turn on of IFF_INACTIVE, and perform this action. However if the failover
266  * did not complete successfully, then subsequently we have lost the edge
267  * trigger, and this invariant kicks in and completes the action.
268  *
269  * 2. says that any failed phyint that is not empty must be failed over.
270  * Usually we do the failover when we detect NIC failure. However if the
271  * failover does not complete successfully, this invariant kicks in and
272  * completes the failover. We exclude inactive standby which is covered by 1.
273  *
274  * 3. says that any running phyint that is not full must be failed back.
275  * Usually we do the failback when we detect NIC repair. However if the
276  * failback does not complete successfully, this invariant kicks in and
277  * completes the failback. Note that we don't want to failback to an inactive
278  * standby.
279  *
280  * The invariants 1 - 3 and the actions are in initifs().
281  */
282 
283 struct probes_missed probes_missed;	/* NOTE(review): non-static global; struct is declared outside this file chunk - presumably tracks missed probes, confirm */
284 
285 /*
286  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
287  * will be added on by the kernel.  The id field identifies this phyint.
288  * and the sequence number is an increasing (modulo 2^^16) integer. The data
289  * portion holds the time value when the packet is sent. On echo this is
290  * extracted to compute the round-trip time. Three different types of
291  * probe packets are used.
292  *
293  * PROBE_UNI: This type is used to do failure detection / failure recovery
294  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
295  *	not less than the current CRTT. pii_probes[] stores data
296  *	about these probes. These packets consume sequence number space.
297  *
298  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
299  * 	are not used. Under heavy network load, the rtt may go up very high,
300  *	due to a spike, or may appear to go high, due to extreme scheduling
301  * 	delays. Once the network stress is removed, mpathd takes long time to
302  *	recover, because the probe_interval is already high, and it takes
303  *	a long time to send out sufficient number of probes to bring down the
304  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
305  *	user_probe_interval ms. and will cause only rtt updates. These packets
306  *	do not consume sequence number space nor is information about these
307  *	packets stored in the pii_probes[]
308  *
309  * PROBE_MULTI: This type is only used to construct a list of targets, when
310  *	no targets are known. The packet is multicast to the all hosts addr.
311  */
312 static void
313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
314 {
315 	struct pr_icmp probe_pkt;	/* Probe packet */
316 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
317 	struct sockaddr_in whereto; 	/* target address IPv4 */
318 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
319 	boolean_t sent = _B_TRUE;
320 
321 	if (debug & D_TARGET) {
322 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
323 		    pii->pii_name, probe_type, cur_time);
324 	}
325 
326 	assert(pii->pii_probe_sock != -1);
327 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
328 	    probe_type == PROBE_RTT);
329 
330 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
331 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
332 	probe_pkt.pr_icmp_code = 0;
333 	probe_pkt.pr_icmp_cksum = 0;
334 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
335 
336 	/*
337 	 * Since there is no need to do arithmetic on the icmpid,
338 	 * (only equality check is done) pii_icmpid is stored in
339 	 * network byte order at initialization itself.
340 	 */
341 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
342 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
343 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
344 
345 	/*
346 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
347 	 * the all hosts address. Otherwise it is unicast to the next target.
348 	 */
349 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
350 	    pii->pii_rtt_target_next != NULL));
351 
352 	if (pii->pii_af == AF_INET6) {
353 		bzero(&whereto6, sizeof (whereto6));
354 		whereto6.sin6_family = AF_INET6;
355 		if (probe_type == PROBE_MULTI) {
356 			whereto6.sin6_addr = all_nodes_mcast_v6;
357 		} else if (probe_type == PROBE_UNI) {
358 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
359 		} else  {
360 			/* type is PROBE_RTT */
361 			whereto6.sin6_addr =
362 			    pii->pii_rtt_target_next->tg_address;
363 		}
364 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
365 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
366 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
367 			logperror_pii(pii, "probe: probe sendto");
368 			sent = _B_FALSE;
369 		}
370 	} else {
371 		bzero(&whereto, sizeof (whereto));
372 		whereto.sin_family = AF_INET;
373 		if (probe_type == PROBE_MULTI) {
374 			whereto.sin_addr = all_nodes_mcast_v4;
375 		} else if (probe_type == PROBE_UNI) {
376 			IN6_V4MAPPED_TO_INADDR(
377 			    &pii->pii_target_next->tg_address,
378 			    &whereto.sin_addr);
379 		} else {
380 			/* type is PROBE_RTT */
381 			IN6_V4MAPPED_TO_INADDR(
382 			    &pii->pii_rtt_target_next->tg_address,
383 			    &whereto.sin_addr);
384 		}
385 
386 		/*
387 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
388 		 */
389 		probe_pkt.pr_icmp_cksum =
390 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
391 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
392 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
393 		    sizeof (whereto)) != sizeof (probe_pkt)) {
394 			logperror_pii(pii, "probe: probe sendto");
395 			sent = _B_FALSE;
396 		}
397 	}
398 
399 	/*
400 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
401 	 * update our tables. We will need this info in processing the probe
402 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
403 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
404 	 * are only used to construct a list of targets. PROBE_RTT packets are
405 	 * used only for updating the rtt and not for failure detection.
406 	 */
407 	if (probe_type == PROBE_UNI && sent) {
408 		pr_ndx = pii->pii_probe_next;
409 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
410 
411 		/* Collect statistics, before we reuse the last slot. */
412 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
413 			pii->pii_cum_stats.lost++;
414 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
415 			pii->pii_cum_stats.acked++;
416 		pii->pii_cum_stats.sent++;
417 
418 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
419 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
420 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
421 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
422 		pii->pii_target_next = target_next(pii->pii_target_next);
423 		assert(pii->pii_target_next != NULL);
424 		/*
425 		 * If we have a single variable to denote the next target to
426 		 * probe for both rtt probes and failure detection probes, we
427 		 * could end up with a situation where the failure detection
428 		 * probe targets become disjoint from the rtt probe targets.
429 		 * Eg. if 2 targets and the actual fdt is double the user
430 		 * specified fdt. So we have 2 variables. In this scheme
431 		 * we also reset pii_rtt_target_next for every fdt probe,
432 		 * though that may not be necessary.
433 		 */
434 		pii->pii_rtt_target_next = pii->pii_target_next;
435 		pii->pii_snxt++;
436 	} else if (probe_type == PROBE_RTT) {
437 		pii->pii_rtt_target_next =
438 		    target_next(pii->pii_rtt_target_next);
439 		assert(pii->pii_rtt_target_next != NULL);
440 	}
441 }
442 
443 /*
444  * Incoming IPv4 data from wire, is received here. Called from main.
445  */
446 void
447 in_data(struct phyint_instance *pii)
448 {
449 	struct	sockaddr_in 	from;
450 	struct	in6_addr	fromaddr;
451 	uint_t	fromlen;
452 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
453 	struct ip *ip;
454 	int 	iphlen;
455 	int 	len;
456 	char 	abuf[INET_ADDRSTRLEN];
457 	struct	pr_icmp	*reply;
458 
459 	if (debug & D_PROBE) {
460 		logdebug("in_data(%s %s)\n",
461 		    AF_STR(pii->pii_af), pii->pii_name);
462 	}
463 
464 	/*
465 	 * Poll has already told us that a message is waiting,
466 	 * on this socket. Read it now. We should not block.
467 	 */
468 	fromlen = sizeof (from);
469 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
470 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
471 	if (len < 0) {
472 		logperror_pii(pii, "in_data: recvfrom");
473 		return;
474 	}
475 
476 	/*
477 	 * If the NIC has indicated the link is down, don't go
478 	 * any further.
479 	 */
480 	if (LINK_DOWN(pii->pii_phyint))
481 		return;
482 
483 	/* Get the printable address for error reporting */
484 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
485 
486 	/* Make sure packet contains at least minimum ICMP header */
487 	ip = (struct ip *)in_packet;
488 	iphlen = ip->ip_hl << 2;
489 	if (len < iphlen + ICMP_MINLEN) {
490 		if (debug & D_PKTBAD) {
491 			logdebug("in_data: packet too short (%d bytes)"
492 			    " from %s\n", len, abuf);
493 		}
494 		return;
495 	}
496 
497 	/*
498 	 * Subtract the IP hdr length, 'len' will be length of the probe
499 	 * reply, starting from the icmp hdr.
500 	 */
501 	len -= iphlen;
502 	/* LINTED */
503 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
504 
505 	/* Probe replies are icmp echo replies. Ignore anything else */
506 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
507 		return;
508 
509 	/*
510 	 * The icmp id should match what we sent, which is stored
511 	 * in pi_icmpid. The icmp code for reply must be 0.
512 	 * The reply content must be a struct pr_icmp
513 	 */
514 	if (reply->pr_icmp_id != pii->pii_icmpid) {
515 		/* Not in response to our probe */
516 		return;
517 	}
518 
519 	if (reply->pr_icmp_code != 0) {
520 		logtrace("probe reply code %d from %s on %s\n",
521 		    reply->pr_icmp_code, abuf, pii->pii_name);
522 		return;
523 	}
524 
525 	if (len < sizeof (struct pr_icmp)) {
526 		logtrace("probe reply too short: %d bytes from %s on %s\n",
527 		    len, abuf, pii->pii_name);
528 		return;
529 	}
530 
531 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
532 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
533 		/* Unicast probe reply */
534 		incoming_echo_reply(pii, reply, fromaddr);
535 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
536 		/* Multicast reply */
537 		incoming_mcast_reply(pii, reply, fromaddr);
538 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
539 		incoming_rtt_reply(pii, reply, fromaddr);
540 	} else {
541 		/* Probably not in response to our probe */
542 		logtrace("probe reply type: %d from %s on %s\n",
543 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
544 		return;
545 	}
546 
547 }
548 
549 /*
550  * Incoming IPv6 data from wire is received here. Called from main.
551  */
552 void
553 in6_data(struct phyint_instance *pii)
554 {
555 	struct sockaddr_in6 from;
556 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
557 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
558 	int len;
559 	char abuf[INET6_ADDRSTRLEN];
560 	struct msghdr msg;
561 	struct iovec iov;
562 	uchar_t *opt;
563 	struct	pr_icmp *reply;
564 
565 	if (debug & D_PROBE) {
566 		logdebug("in6_data(%s %s)\n",
567 		    AF_STR(pii->pii_af), pii->pii_name);
568 	}
569 
570 	iov.iov_base = (char *)in_packet;
571 	iov.iov_len = sizeof (in_packet);
572 	msg.msg_iov = &iov;
573 	msg.msg_iovlen = 1;
574 	msg.msg_name = (struct sockaddr *)&from;
575 	msg.msg_namelen = sizeof (from);
576 	msg.msg_control = ancillary_data;
577 	msg.msg_controllen = sizeof (ancillary_data);
578 
579 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
580 		logperror_pii(pii, "in6_data: recvfrom");
581 		return;
582 	}
583 
584 	/*
585 	 * If the NIC has indicated that the link is down, don't go
586 	 * any further.
587 	 */
588 	if (LINK_DOWN(pii->pii_phyint))
589 		return;
590 
591 	/* Get the printable address for error reporting */
592 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
593 	if (len < ICMP_MINLEN) {
594 		if (debug & D_PKTBAD) {
595 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
596 			    msg.msg_flags, abuf);
597 		}
598 		return;
599 	}
600 	/* Ignore packets > 64k or control buffers that don't fit */
601 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
602 		if (debug & D_PKTBAD) {
603 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
604 			    msg.msg_flags, abuf);
605 		}
606 		return;
607 	}
608 
609 	reply = (struct pr_icmp *)in_packet;
610 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
611 		return;
612 
613 	if (reply->pr_icmp_id != pii->pii_icmpid) {
614 		/* Not in response to our probe */
615 		return;
616 	}
617 
618 	/*
619 	 * The kernel has already verified the the ICMP checksum.
620 	 */
621 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
622 		logtrace("ICMPv6 echo reply source address not linklocal from "
623 		    "%s on %s\n", abuf, pii->pii_name);
624 		return;
625 	}
626 	opt = find_ancillary(&msg, IPV6_RTHDR);
627 	if (opt != NULL) {
628 		/* Can't allow routing headers in probe replies  */
629 		logtrace("message with routing header from %s on %s\n",
630 		    abuf, pii->pii_name);
631 		return;
632 	}
633 	if (reply->pr_icmp_code != 0) {
634 		logtrace("probe reply code: %d from %s on %s\n",
635 		    reply->pr_icmp_code, abuf, pii->pii_name);
636 		return;
637 	}
638 	if (len < (sizeof (struct pr_icmp))) {
639 		logtrace("probe reply too short: %d bytes from %s on %s\n",
640 		    len, abuf, pii->pii_name);
641 		return;
642 	}
643 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
644 		incoming_echo_reply(pii, reply, from.sin6_addr);
645 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
646 		incoming_mcast_reply(pii, reply, from.sin6_addr);
647 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
648 		incoming_rtt_reply(pii, reply, from.sin6_addr);
649 	} else  {
650 		/* Probably not in response to our probe */
651 		logtrace("probe reply type: %d from %s on %s\n",
652 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
653 	}
654 }
655 
656 /*
657  * Process the incoming rtt reply, in response to our rtt probe.
658  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
659  * have any stored information about the probe we sent. So we don't log
660  * any errors if we receive bad replies.
661  */
662 static void
663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
664     struct in6_addr fromaddr)
665 {
666 	int 	m;		/* rtt measurment in ms */
667 	uint32_t cur_time;	/* in ms from some arbitrary point */
668 	char	abuf[INET6_ADDRSTRLEN];
669 	struct	target	*target;
670 	uint32_t pr_icmp_timestamp;
671 	struct 	phyint_group *pg;
672 
673 	/* Get the printable address for error reporting */
674 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
675 
676 	if (debug & D_PROBE) {
677 		logdebug("incoming_rtt_reply: %s %s %s\n",
678 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
679 	}
680 
681 	/* Do we know this target ? */
682 	target = target_lookup(pii, fromaddr);
683 	if (target == NULL)
684 		return;
685 
686 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
687 	cur_time = getcurrenttime();
688 	m = (int)(cur_time - pr_icmp_timestamp);
689 
690 	/* Invalid rtt. It has wrapped around */
691 	if (m < 0)
692 		return;
693 
694 	/*
695 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
696 	 * The initial few responses after the interface is repaired may
697 	 * contain high rtt's because they could have been queued up waiting
698 	 * for ARP/NDP resolution on a failed interface.
699 	 */
700 	pg = pii->pii_phyint->pi_group;
701 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
702 		return;
703 
704 	/*
705 	 * Update rtt only if the new rtt is lower than the current rtt.
706 	 * (specified by the 3rd parameter to pi_set_crtt).
707 	 * If a spike has caused the current probe_interval to be >
708 	 * user_probe_interval, then this mechanism is used to bring down
709 	 * the rtt rapidly once the network stress is removed.
710 	 * If the new rtt is higher than the current rtt, we don't want to
711 	 * update the rtt. We are having more than 1 outstanding probe and
712 	 * the increase in rtt we are seeing is being unnecessarily weighted
713 	 * many times. The regular rtt update will be handled by
714 	 * incoming_echo_reply() and will take care of any rtt increase.
715 	 */
716 	pi_set_crtt(target, m, _B_FALSE);
717 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
718 	    (user_failure_detection_time < pg->pg_fdt) &&
719 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
720 		/*
721 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
722 		 * investigate if we can improve the failure detection time to
723 		 * meet whatever the user specified.
724 		 */
725 		if (check_pg_crtt_improved(pg)) {
726 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
727 			    user_failure_detection_time);
728 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
729 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
730 				logerr("Improved failure detection time %d ms "
731 				    "on (%s %s) for group \"%s\"\n",
732 				    pg->pg_fdt, AF_STR(pii->pii_af),
733 				    pii->pii_name,
734 				    pii->pii_phyint->pi_group->pg_name);
735 			}
736 			if (user_failure_detection_time == pg->pg_fdt) {
737 				/* Avoid any truncation or rounding errors */
738 				pg->pg_probeint = user_probe_interval;
739 				/*
740 				 * No more rtt probes will be sent. The actual
741 				 * fdt has dropped to the user specified value.
742 				 * pii_fd_snxt_basetime and pii_snxt_basetime
743 				 * will be in sync henceforth.
744 				 */
745 				reset_snxt_basetimes();
746 			}
747 		}
748 	}
749 }
750 
751 /*
752  * Process the incoming echo reply, in response to our unicast probe.
753  * Common for both IPv4 and IPv6
754  */
755 static void
756 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
757     struct in6_addr fromaddr)
758 {
759 	int 	m;		/* rtt measurment in ms */
760 	uint32_t cur_time;	/* in ms from some arbitrary point */
761 	char	abuf[INET6_ADDRSTRLEN];
762 	int	pr_ndx;
763 	struct	target	*target;
764 	boolean_t exception;
765 	uint32_t pr_icmp_timestamp;
766 	uint16_t pr_icmp_seq;
767 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
768 
769 	/* Get the printable address for error reporting */
770 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
771 
772 	if (debug & D_PROBE) {
773 		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
774 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
775 		    ntohs(reply->pr_icmp_seq));
776 	}
777 
778 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
779 	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);
780 
781 	/* Reject out of window probe replies */
782 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
783 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
784 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
785 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
786 		pii->pii_cum_stats.unknown++;
787 		return;
788 	}
789 	cur_time = getcurrenttime();
790 	m = (int)(cur_time - pr_icmp_timestamp);
791 	if (m < 0) {
792 		/*
793 		 * This is a ridiculously high value of rtt. rtt has wrapped
794 		 * around. Log a message, and ignore the rtt.
795 		 */
796 		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
797 		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
798 	}
799 
800 	/*
801 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
802 	 * number in our pii->pii_probes[] array. The icmp sequence number
803 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
804 	 */
805 	pr_ndx = MOD_SUB(pii->pii_probe_next,
806 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
807 
808 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
809 
810 	target = pii->pii_probes[pr_ndx].pr_target;
811 
812 	/*
813 	 * Perform sanity checks, whether this probe reply that we
814 	 * have received is genuine
815 	 */
816 	if (target != NULL) {
817 		/*
818 		 * Compare the src. addr of the received ICMP or ICMPv6
819 		 * probe reply with the target address in our tables.
820 		 */
821 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
822 			/*
823 			 * We don't have any record of having sent a probe to
824 			 * this target. This is a fake probe reply. Log an error
825 			 */
826 			logtrace("probe status %d Fake probe reply seq %u "
827 			    "snxt %u on %s from %s\n",
828 			    pii->pii_probes[pr_ndx].pr_status,
829 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
830 			pii->pii_cum_stats.unknown++;
831 			return;
832 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
833 			/*
834 			 * The address matches, but our tables indicate that
835 			 * this probe reply has been acked already. So this
836 			 * is a duplicate probe reply. Log an error
837 			 */
838 			logtrace("probe status %d Duplicate probe reply seq %u "
839 			    "snxt %u on %s from %s\n",
840 			    pii->pii_probes[pr_ndx].pr_status,
841 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
842 			pii->pii_cum_stats.unknown++;
843 			return;
844 		}
845 	} else {
846 		/*
847 		 * Target must not be NULL in the PR_UNACKED state
848 		 */
849 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
850 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
851 			/*
852 			 * The probe stats slot is unused. So we didn't
853 			 * send out any probe to this target. This is a fake.
854 			 * Log an error.
855 			 */
856 			logtrace("probe status %d Fake probe reply seq %u "
857 			    "snxt %u on %s from %s\n",
858 			    pii->pii_probes[pr_ndx].pr_status,
859 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
860 		}
861 		pii->pii_cum_stats.unknown++;
862 		return;
863 	}
864 
865 	/*
866 	 * If the rtt does not appear to be right, don't update the
867 	 * rtt stats. This can happen if the system dropped into the
868 	 * debugger, or the system was hung or too busy for a
869 	 * substantial time that we didn't get a chance to run.
870 	 */
871 	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
872 		/*
873 		 * If the probe corresponding to this receieved response
874 		 * was truly sent 'm' ms. ago, then this response must
875 		 * have been rejected by the sequence number checks. The
876 		 * fact that it has passed the sequence number checks
877 		 * means that the measured rtt is wrong. We were probably
878 		 * scheduled long after the packet was received.
879 		 */
880 		goto out;
881 	}
882 
883 	/*
884 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
885 	 * The initial few responses after the interface is repaired may
886 	 * contain high rtt's because they could have been queued up waiting
887 	 * for ARP/NDP resolution on a failed interface.
888 	 */
889 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
890 		goto out;
891 
892 	/*
893 	 * Don't update the Conservative Round Trip Time estimate for this
894 	 * (phint, target) pair if this is the not the highest ack seq seen
895 	 * thus far on this target.
896 	 */
897 	if (!highest_ack_tg(pr_icmp_seq, target))
898 		goto out;
899 
900 	/*
901 	 * Always update the rtt. This is a failure detection probe
902 	 * and we want to measure both increase / decrease in rtt.
903 	 */
904 	pi_set_crtt(target, m, _B_TRUE);
905 
906 	/*
907 	 * If the crtt exceeds the average time between probes,
908 	 * investigate if this slow target is an exception. If so we
909 	 * can avoid this target and still meet the failure detection
910 	 * time. Otherwise we can't meet the failure detection time.
911 	 */
912 	if (target->tg_crtt > pg->pg_probeint) {
913 		exception = check_exception_target(pii, target);
914 		if (exception) {
915 			/*
916 			 * This target is exceptionally slow. Don't use it
917 			 * for future probes. check_exception_target() has
918 			 * made sure that we have at least MIN_PROBE_TARGETS
919 			 * other active targets
920 			 */
921 			if (pii->pii_targets_are_routers) {
922 				/*
923 				 * This is a slow router, mark it as slow
924 				 * and don't use it for further probes. We
925 				 * don't delete it, since it will be populated
926 				 * again when we do a router scan. Hence we
927 				 * need to maintain extra state (unlike the
928 				 * host case below).  Mark it as TG_SLOW.
929 				 */
930 				if (target->tg_status == TG_ACTIVE)
931 					pii->pii_ntargets--;
932 				target->tg_status = TG_SLOW;
933 				target->tg_latime = gethrtime();
934 				target->tg_rtt_sa = -1;
935 				target->tg_crtt = 0;
936 				target->tg_rtt_sd = 0;
937 				if (pii->pii_target_next == target) {
938 					pii->pii_target_next =
939 					    target_next(target);
940 				}
941 			} else {
942 				/*
943 				 * the slow target is not a router, we can
944 				 * just delete it. Send an icmp multicast and
945 				 * pick the fastest responder that is not
946 				 * already an active target. target_delete()
947 				 * adjusts pii->pii_target_next
948 				 */
949 				target_delete(target);
950 				probe(pii, PROBE_MULTI, cur_time);
951 			}
952 		} else {
953 			/*
954 			 * We can't meet the failure detection time.
955 			 * Log a message, and update the detection time to
956 			 * whatever we can achieve.
957 			 */
958 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
959 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
960 			last_fdt_bumpup_time = gethrtime();
961 			if (pg != phyint_anongroup) {
962 				logerr("Cannot meet requested failure detection"
963 				    " time of %d ms on (%s %s) new failure"
964 				    " detection time for group \"%s\" is %d"
965 				    " ms\n", user_failure_detection_time,
966 				    AF_STR(pii->pii_af), pii->pii_name,
967 				    pg->pg_name, pg->pg_fdt);
968 			}
969 		}
970 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
971 	    (user_failure_detection_time < pg->pg_fdt) &&
972 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
973 		/*
974 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
975 		 * investigate if we can improve the failure detection time to
976 		 * meet whatever the user specified.
977 		 */
978 		if (check_pg_crtt_improved(pg)) {
979 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
980 			    user_failure_detection_time);
981 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
982 			if (pg != phyint_anongroup) {
983 				logerr("Improved failure detection time %d ms "
984 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
985 				    AF_STR(pii->pii_af), pii->pii_name,
986 				    pg->pg_name);
987 			}
988 			if (user_failure_detection_time == pg->pg_fdt) {
989 				/* Avoid any truncation or rounding errors */
990 				pg->pg_probeint = user_probe_interval;
991 				/*
992 				 * No more rtt probes will be sent. The actual
993 				 * fdt has dropped to the user specified value.
994 				 * pii_fd_snxt_basetime and pii_snxt_basetime
995 				 * will be in sync henceforth.
996 				 */
997 				reset_snxt_basetimes();
998 			}
999 		}
1000 	}
1001 out:
1002 	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
1003 	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
1004 
1005 	/*
1006 	 * Update pii->pii_rack, i.e. the sequence number of the last received
1007 	 * probe response, based on the echo reply we have received now, if
1008 	 * either of the following conditions are satisfied.
1009 	 * a. pii_rack is outside the current receive window of
1010 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
1011 	 *    This means we have not received probe responses for a
1012 	 *    long time, and the sequence number has wrapped around.
1013 	 * b. pii_rack is within the current receive window and this echo
1014 	 *    reply corresponds to the highest sequence number we have seen
1015 	 *    so far.
1016 	 */
1017 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
1018 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
1019 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
1020 		pii->pii_rack = pr_icmp_seq;
1021 	}
1022 }
1023 
1024 /*
1025  * Returns true if seq is the highest unacknowledged seq for target tg
1026  * else returns false
1027  */
1028 static boolean_t
1029 highest_ack_tg(uint16_t seq, struct target *tg)
1030 {
1031 	struct phyint_instance *pii;
1032 	int	 pr_ndx;
1033 	uint16_t pr_seq;
1034 
1035 	pii = tg->tg_phyint_inst;
1036 
1037 	/*
1038 	 * Get the seq number of the most recent probe sent so far,
1039 	 * and also get the corresponding probe index in the probe stats
1040 	 * array.
1041 	 */
1042 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1043 	pr_seq = pii->pii_snxt;
1044 	pr_seq--;
1045 
1046 	/*
1047 	 * Start from the most recent probe and walk back, trying to find
1048 	 * an acked probe corresponding to target tg.
1049 	 */
1050 	for (; pr_ndx != pii->pii_probe_next;
1051 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1052 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1053 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1054 			if (SEQ_GT(pr_seq, seq))
1055 				return (_B_FALSE);
1056 		}
1057 	}
1058 	return (_B_TRUE);
1059 }
1060 
1061 /*
1062  * Check whether the crtt for the group has improved by a factor of
1063  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1064  * detection time flapping in the face of small crtt changes.
1065  */
1066 static boolean_t
1067 check_pg_crtt_improved(struct phyint_group *pg)
1068 {
1069 	struct	phyint *pi;
1070 
1071 	if (debug & D_PROBE)
1072 		logdebug("check_pg_crtt_improved()\n");
1073 
1074 	/*
1075 	 * The crtt for the group is only improved if each phyint_instance
1076 	 * for both ipv4 and ipv6 is improved.
1077 	 */
1078 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1079 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1080 		    !check_pii_crtt_improved(pi->pi_v6))
1081 			return (_B_FALSE);
1082 	}
1083 
1084 	return (_B_TRUE);
1085 }
1086 
1087 /*
1088  * Check whether the crtt has improved substantially on this phyint_instance.
1089  * Returns _B_TRUE if there's no crtt information available, because pii
1090  * is NULL or the phyint_instance is not capable of probing.
1091  */
1092 boolean_t
1093 check_pii_crtt_improved(struct phyint_instance *pii) {
1094 	struct 	target *tg;
1095 
1096 	if (pii == NULL)
1097 		return (_B_TRUE);
1098 
1099 	if (!PROBE_CAPABLE(pii) ||
1100 	    pii->pii_phyint->pi_state == PI_FAILED)
1101 		return (_B_TRUE);
1102 
1103 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1104 		if (tg->tg_status != TG_ACTIVE)
1105 			continue;
1106 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1107 		    LOWER_FDT_TRIGGER)) {
1108 			return (_B_FALSE);
1109 		}
1110 	}
1111 
1112 	return (_B_TRUE);
1113 }
1114 
1115 /*
1116  * This target responds very slowly to probes. The target's crtt exceeds
1117  * the probe interval of its group. Compare against other targets
1118  * and determine if this target is an exception, if so return true, else false
1119  */
1120 static boolean_t
1121 check_exception_target(struct phyint_instance *pii, struct target *target)
1122 {
1123 	struct	target *tg;
1124 	char abuf[INET6_ADDRSTRLEN];
1125 
1126 	if (debug & D_PROBE) {
1127 		logdebug("check_exception_target(%s %s target %s)\n",
1128 		    AF_STR(pii->pii_af), pii->pii_name,
1129 		    pr_addr(pii->pii_af, target->tg_address,
1130 		    abuf, sizeof (abuf)));
1131 	}
1132 
1133 	/*
1134 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1135 	 * to make a good judgement. Otherwise don't drop this target.
1136 	 */
1137 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1138 		return (_B_FALSE);
1139 
1140 	/*
1141 	 * Determine whether only this particular target is slow.
1142 	 * We know that this target's crtt exceeds the group's probe interval.
1143 	 * If all other active targets have a
1144 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1145 	 * then this target is considered slow.
1146 	 */
1147 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1148 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1149 			if (tg->tg_crtt >
1150 			    pii->pii_phyint->pi_group->pg_probeint /
1151 			    EXCEPTION_FACTOR) {
1152 				return (_B_FALSE);
1153 			}
1154 		}
1155 	}
1156 
1157 	return (_B_TRUE);
1158 }
1159 
1160 /*
1161  * Update the target list. The icmp all hosts multicast has given us
1162  * some host to which we can send probes. If we already have sufficient
1163  * targets, discard it.
1164  */
1165 static void
1166 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1167     struct in6_addr fromaddr)
1168 /* ARGSUSED */
1169 {
1170 	int af;
1171 	char abuf[INET6_ADDRSTRLEN];
1172 	struct phyint *pi;
1173 
1174 	if (debug & D_PROBE) {
1175 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1176 		    AF_STR(pii->pii_af), pii->pii_name,
1177 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1178 	}
1179 
1180 	/*
1181 	 * Using host targets is a fallback mechanism. If we have
1182 	 * found a router, don't add this host target. If we already
1183 	 * know MAX_PROBE_TARGETS, don't add another target.
1184 	 */
1185 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1186 	if (pii->pii_targets != NULL) {
1187 		if (pii->pii_targets_are_routers ||
1188 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1189 			return;
1190 		}
1191 	}
1192 
1193 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1194 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1195 		/*
1196 		 * Guard against response from 0.0.0.0
1197 		 * and ::. Log a trace message
1198 		 */
1199 		logtrace("probe response from %s on %s\n",
1200 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1201 		    pii->pii_name);
1202 		return;
1203 	}
1204 
1205 	/*
1206 	 * This address is one of our own, so reject this address as a
1207 	 * valid probe target.
1208 	 */
1209 	af = pii->pii_af;
1210 	if (own_address(fromaddr))
1211 		return;
1212 
1213 	/*
1214 	 * If the phyint is part a named group, then add the address to all
1215 	 * members of the group.  Otherwise, add the address only to the
1216 	 * phyint itself, since other phyints in the anongroup may not be on
1217 	 * the same subnet.
1218 	 */
1219 	pi = pii->pii_phyint;
1220 	if (pi->pi_group == phyint_anongroup) {
1221 		target_add(pii, fromaddr, _B_FALSE);
1222 	} else {
1223 		pi = pi->pi_group->pg_phyint;
1224 		for (; pi != NULL; pi = pi->pi_pgnext)
1225 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1226 	}
1227 }
1228 
1229 /*
1230  * Compute CRTT given an existing scaled average, scaled deviation estimate
1231  * and a new rtt time.  The formula is from Jacobson and Karels'
1232  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1233  * are the same as those in Appendix A.2 of that paper.
1234  *
1235  * m = new measurement
1236  * sa = scaled RTT average (8 * average estimates)
1237  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1238  * crtt = Conservative round trip time. Used to determine whether probe
1239  * has timed out.
1240  *
1241  * New scaled average and deviation are passed back via sap and svp
1242  */
1243 static int
1244 compute_crtt(int *sap, int *svp, int m)
1245 {
1246 	int sa = *sap;
1247 	int sv = *svp;
1248 	int crtt;
1249 	int saved_m = m;
1250 
1251 	assert(*sap >= -1);
1252 	assert(*svp >= 0);
1253 
1254 	if (sa != -1) {
1255 		/*
1256 		 * Update average estimator:
1257 		 *	new rtt = old rtt + 1/8 Error
1258 		 *	    where Error = m - old rtt
1259 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1260 		 *	i.e. new sa =  old sa + Error
1261 		 */
1262 		m -= sa >> 3;		/* m is now Error in estimate. */
1263 		if ((sa += m) < 0) {
1264 			/* Don't allow the smoothed average to be negative. */
1265 			sa = 0;
1266 		}
1267 
1268 		/*
1269 		 * Update deviation estimator:
1270 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1271 		 *	i.e. 4 * new mdev = 4 * old mdev +
1272 		 *		(abs(Error) - old mdev)
1273 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1274 		 */
1275 		if (m < 0)
1276 			m = -m;
1277 		m -= sv >> 2;
1278 		sv += m;
1279 	} else {
1280 		/* Initialization. This is the first response received. */
1281 		sa = (m << 3);
1282 		sv = (m << 1);
1283 	}
1284 
1285 	crtt = (sa >> 3) + sv;
1286 
1287 	if (debug & D_PROBE) {
1288 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
1289 		    "%d\n", saved_m, sa, sv, crtt);
1290 	}
1291 
1292 	*sap = sa;
1293 	*svp = sv;
1294 
1295 	/*
1296 	 * CRTT = average estimates  + 4 * deviation estimates
1297 	 *	= sa / 8 + sv
1298 	 */
1299 	return (crtt);
1300 }
1301 
1302 static void
1303 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
1304 {
1305 	struct phyint_instance *pii = tg->tg_phyint_inst;
1306 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1307 	int sa = tg->tg_rtt_sa;
1308 	int sv = tg->tg_rtt_sd;
1309 	int new_crtt;
1310 	int i;
1311 
1312 	if (debug & D_PROBE)
1313 		logdebug("pi_set_crtt: target -  m %d\n", m);
1314 
1315 	/* store the round trip time, in case we need to defer computation */
1316 	tg->tg_deferred[tg->tg_num_deferred] = m;
1317 
1318 	new_crtt = compute_crtt(&sa, &sv, m);
1319 
1320 	/*
1321 	 * If this probe's round trip time would singlehandedly cause an
1322 	 * increase in the group's probe interval consider it suspect.
1323 	 */
1324 	if ((new_crtt > probe_interval) && is_probe_uni) {
1325 		if (debug & D_PROBE) {
1326 			logdebug("Received a suspect probe on %s, new_crtt ="
1327 			    " %d, probe_interval = %d, num_deferred = %d\n",
1328 			    pii->pii_probe_logint->li_name, new_crtt,
1329 			    probe_interval, tg->tg_num_deferred);
1330 		}
1331 
1332 		/*
1333 		 * If we've deferred as many rtts as we plan on deferring, then
1334 		 * assume the link really did slow down and process all queued
1335 		 * rtts
1336 		 */
1337 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1338 			if (debug & D_PROBE) {
1339 				logdebug("Received MAXDEFERREDRTT probes which "
1340 				    "would cause an increased probe_interval.  "
1341 				    "Integrating queued rtt data points.\n");
1342 			}
1343 
1344 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1345 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
1346 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
1347 			}
1348 
1349 			tg->tg_num_deferred = 0;
1350 		} else {
1351 			tg->tg_num_deferred++;
1352 		}
1353 		return;
1354 	}
1355 
1356 	/*
1357 	 * If this is a normal probe, or an RTT probe that would lead to a
1358 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1359 	 * a normal probe, pitch any deferred probes since our probes are
1360 	 * again being answered within our CRTT estimates.
1361 	 */
1362 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1363 		tg->tg_rtt_sa = sa;
1364 		tg->tg_rtt_sd = sv;
1365 		tg->tg_crtt = new_crtt;
1366 		if (is_probe_uni)
1367 			tg->tg_num_deferred = 0;
1368 	}
1369 }
1370 
/*
 * Look through the ancillary data of `msg' for an object at level
 * IPPROTO_IPV6 whose type is `cmsg_type'.
 *
 * Returns a pointer to the data of the first such object, or NULL if
 * there is none.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == IPPROTO_IPV6 &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));
		hdr = CMSG_NXTHDR(msg, hdr);
	}
	return (NULL);
}
1389 
1390 /*
1391  * See if a previously failed interface has started working again.
1392  */
1393 void
1394 phyint_check_for_repair(struct phyint *pi)
1395 {
1396 	if (phyint_repaired(pi)) {
1397 		if (pi->pi_group == phyint_anongroup) {
1398 			logerr("NIC repair detected on %s\n", pi->pi_name);
1399 		} else {
1400 			logerr("NIC repair detected on %s of group %s\n",
1401 			    pi->pi_name, pi->pi_group->pg_name);
1402 		}
1403 
1404 		/*
1405 		 * If the interface is offline, just clear the FAILED flag,
1406 		 * delaying the state change and failback operation until it
1407 		 * is brought back online.
1408 		 */
1409 		if (pi->pi_state == PI_OFFLINE) {
1410 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1411 			return;
1412 		}
1413 
1414 		if (pi->pi_flags & IFF_STANDBY) {
1415 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1416 		} else {
1417 			if (try_failback(pi) != IPMP_FAILURE) {
1418 				(void) change_lif_flags(pi,
1419 				    IFF_FAILED, _B_FALSE);
1420 				/* Per state diagram */
1421 				pi->pi_empty = 0;
1422 			}
1423 		}
1424 
1425 		phyint_chstate(pi, PI_RUNNING);
1426 
1427 		if (GROUP_FAILED(pi->pi_group)) {
1428 			/*
1429 			 * This is the 1st phyint to receive a response
1430 			 * after group failure.
1431 			 */
1432 			logerr("At least 1 interface (%s) of group %s has "
1433 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
1434 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
1435 		}
1436 	}
1437 }
1438 
1439 /*
1440  * See if a previously functioning interface has failed, or if the
1441  * whole group of interfaces has failed.
1442  */
1443 static void
1444 phyint_inst_check_for_failure(struct phyint_instance *pii)
1445 {
1446 	struct	phyint	*pi;
1447 	struct	phyint	*pi2;
1448 
1449 	pi = pii->pii_phyint;
1450 
1451 	switch (failure_state(pii)) {
1452 	case PHYINT_FAILURE:
1453 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1454 		if (pi->pi_group == phyint_anongroup) {
1455 			logerr("NIC failure detected on %s\n", pii->pii_name);
1456 		} else {
1457 			logerr("NIC failure detected on %s of group %s\n",
1458 			    pii->pii_name, pi->pi_group->pg_name);
1459 		}
1460 		/*
1461 		 * Do the failover, unless the interface is offline (in
1462 		 * which case we've already failed over).
1463 		 */
1464 		if (pi->pi_state != PI_OFFLINE) {
1465 			phyint_chstate(pi, PI_FAILED);
1466 			reset_crtt_all(pi);
1467 			if (!(pi->pi_flags & IFF_INACTIVE))
1468 				(void) try_failover(pi, FAILOVER_NORMAL);
1469 		}
1470 		break;
1471 
1472 	case GROUP_FAILURE:
1473 		logerr("All Interfaces in group %s have failed\n",
1474 		    pi->pi_group->pg_name);
1475 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
1476 		    pi2 = pi2->pi_pgnext) {
1477 			if (pi2->pi_flags & IFF_OFFLINE)
1478 				continue;
1479 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
1480 			reset_crtt_all(pi2);
1481 
1482 			/*
1483 			 * In the case of host targets, we
1484 			 * would have flushed the targets,
1485 			 * and gone to PI_NOTARGETS state.
1486 			 */
1487 			if (pi2->pi_state == PI_RUNNING)
1488 				phyint_chstate(pi2, PI_FAILED);
1489 
1490 			pi2->pi_empty = 0;
1491 			pi2->pi_full = 0;
1492 		}
1493 		break;
1494 
1495 	default:
1496 		break;
1497 	}
1498 }
1499 
1500 /*
1501  * Determines if any timeout event has occurred and returns the number of
1502  * milliseconds until the next timeout event for the phyint. Returns
1503  * TIMER_INFINITY for "never".
1504  */
1505 uint_t
1506 phyint_inst_timer(struct phyint_instance *pii)
1507 {
1508 	int 	pr_ndx;
1509 	uint_t	timeout;
1510 	struct	target	*cur_tg;
1511 	struct	probe_stats *pr_statp;
1512 	struct	phyint_instance *pii_other;
1513 	struct	phyint *pi;
1514 	int	valid_unack_count;
1515 	int	i;
1516 	int	interval;
1517 	uint_t	check_time;
1518 	uint_t	cur_time;
1519 	hrtime_t cur_hrtime;
1520 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1521 
1522 	cur_time = getcurrenttime();
1523 
1524 	if (debug & D_TIMER) {
1525 		logdebug("phyint_inst_timer(%s %s)\n",
1526 		    AF_STR(pii->pii_af), pii->pii_name);
1527 	}
1528 
1529 	pii_other = phyint_inst_other(pii);
1530 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1531 		/*
1532 		 * Check to see if we're here due to link up/down flapping; If
1533 		 * enough time has passed, then try to bring the interface
1534 		 * back up; otherwise, schedule a timer to bring it back up
1535 		 * when enough time *has* elapsed.
1536 		 */
1537 		pi = pii->pii_phyint;
1538 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1539 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1540 			if (check_time > cur_time)
1541 				return (check_time - cur_time);
1542 
1543 			phyint_check_for_repair(pi);
1544 		}
1545 	}
1546 
1547 	/*
1548 	 * If probing is not enabled on this phyint instance, don't proceed.
1549 	 */
1550 	if (!PROBE_ENABLED(pii))
1551 		return (TIMER_INFINITY);
1552 
1553 	/*
1554 	 * If the timer has fired too soon, probably triggered
1555 	 * by some other phyint instance, return the remaining
1556 	 * time
1557 	 */
1558 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1559 		return (pii->pii_snxt_time - cur_time);
1560 
1561 	/*
1562 	 * If the link is down, don't send any probes for now.
1563 	 */
1564 	if (LINK_DOWN(pii->pii_phyint))
1565 		return (TIMER_INFINITY);
1566 
1567 	/*
1568 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1569 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1570 	 * Base probe time is strictly periodic.
1571 	 */
1572 	interval = GET_RANDOM(
1573 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1574 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1575 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1576 
1577 	/*
1578 	 * Check if the current time > next time to probe. If so, we missed
1579 	 * sending 1 or more probes, probably due to heavy system load. At least
1580 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1581 	 * were scheduled. Make adjustments to the times, in multiples of
1582 	 * user_probe_interval.
1583 	 */
1584 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1585 		int n;
1586 
1587 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1588 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1589 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1590 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1591 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1592 		    pii->pii_snxt_basetime);
1593 
1594 		/* Collect statistics about missed probes */
1595 		probes_missed.pm_nprobes += n + 1;
1596 		probes_missed.pm_ntimes++;
1597 	}
1598 	pii->pii_snxt_basetime += user_probe_interval;
1599 	interval = pii->pii_snxt_time - cur_time;
1600 	if (debug & D_TARGET) {
1601 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1602 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1603 		    pii->pii_snxt_basetime, interval);
1604 	}
1605 
1606 	/*
1607 	 * If no targets are known, we need to send an ICMP multicast. The
1608 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1609 	 * to see if we found a target.
1610 	 */
1611 	if (pii->pii_target_next == NULL) {
1612 		assert(pii->pii_ntargets == 0);
1613 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1614 		probe(pii, PROBE_MULTI, cur_time);
1615 		return (interval);
1616 	}
1617 
1618 	if ((user_probe_interval != probe_interval) &&
1619 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1620 		/*
1621 		 * the failure detection (fd) probe timer has not yet fired.
1622 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1623 		 */
1624 		probe(pii, PROBE_RTT, cur_time);
1625 		return (interval);
1626 	}
1627 	/*
1628 	 * the fd probe timer has fired. Need to do all failure
1629 	 * detection / recovery calculations, and then send an fd probe
1630 	 * of type PROBE_UNI.
1631 	 */
1632 	if (user_probe_interval == probe_interval) {
1633 		/*
1634 		 * We could have missed some probes, and then adjusted
1635 		 * pii_snxt_basetime above. Otherwise we could have
1636 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1637 		 */
1638 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1639 	} else {
1640 		pii->pii_fd_snxt_basetime += probe_interval;
1641 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1642 			int n;
1643 
1644 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1645 			    probe_interval;
1646 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1647 		}
1648 	}
1649 
1650 	/*
1651 	 * We can have at most, the latest 2 probes that we sent, in
1652 	 * the PR_UNACKED state. All previous probes sent, are either
1653 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1654 	 * timed out if the probe's time_sent + the CRTT < currenttime.
1655 	 * For each of the last 2 probes, examine whether it has timed
1656 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1657 	 */
1658 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1659 	valid_unack_count = 0;
1660 
1661 	for (i = 0; i < 2; i++) {
1662 		pr_statp = &pii->pii_probes[pr_ndx];
1663 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1664 		switch (pr_statp->pr_status) {
1665 		case PR_ACKED:
1666 			/*
1667 			 * We received back an ACK, so the switch clearly
1668 			 * is not dropping our traffic, and thus we can
1669 			 * enable failure detection immediately.
1670 			 */
1671 			if (pii->pii_fd_hrtime > gethrtime()) {
1672 				if (debug & D_PROBE) {
1673 					logdebug("successful probe on %s; "
1674 					    "ending quiet period\n",
1675 					    pii->pii_phyint->pi_name);
1676 				}
1677 				pii->pii_fd_hrtime = gethrtime();
1678 			}
1679 			break;
1680 
1681 		case PR_UNACKED:
1682 			assert(cur_tg != NULL);
1683 			/*
1684 			 * The crtt could be zero for some reason,
1685 			 * Eg. the phyint could be failed. If the crtt is
1686 			 * not available use group's probe interval,
1687 			 * which is a worst case estimate.
1688 			 */
1689 			if (cur_tg->tg_crtt != 0) {
1690 				timeout = pr_statp->pr_time_sent +
1691 				    cur_tg->tg_crtt;
1692 			} else {
1693 				timeout = pr_statp->pr_time_sent +
1694 				    probe_interval;
1695 			}
1696 			if (TIME_LT(timeout, cur_time)) {
1697 				pr_statp->pr_status = PR_LOST;
1698 				pr_statp->pr_time_lost = timeout;
1699 			} else if (i == 1) {
1700 				/*
1701 				 * We are forced to consider this probe
1702 				 * lost, as we can have at most 2 unack.
1703 				 * probes any time, and we will be sending a
1704 				 * probe at the end of this function.
1705 				 * Normally, we should not be here, but
1706 				 * this can happen if an incoming response
1707 				 * that was considered lost has increased
1708 				 * the crtt for this target, and also bumped
1709 				 * up the FDT. Note that we never cancel or
1710 				 * increase the current pii_time_left, so
1711 				 * when the timer fires, we find 2 valid
1712 				 * unacked probes, and they are yet to timeout
1713 				 */
1714 				pr_statp->pr_status = PR_LOST;
1715 				pr_statp->pr_time_lost = cur_time;
1716 			} else {
1717 				/*
1718 				 * Only the most recent probe can enter
1719 				 * this 'else' arm. The second most recent
1720 				 * probe must take either of the above arms,
1721 				 * if it is unacked.
1722 				 */
1723 				valid_unack_count++;
1724 			}
1725 			break;
1726 		}
1727 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1728 	}
1729 
1730 	/*
1731 	 * We send out 1 probe randomly in the interval between one half
1732 	 * and one probe interval for the group. Given that the CRTT is always
1733 	 * less than the group's probe interval, we can have at most 1
1734 	 * unacknowledged probe now.  All previous probes are either lost or
1735 	 * acked.
1736 	 */
1737 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1738 
1739 	/*
1740 	 * The timer has fired. Take appropriate action depending
1741 	 * on the current state of the phyint.
1742 	 *
1743 	 * PI_RUNNING state 	- Failure detection and failover
1744 	 * PI_FAILED state 	- Repair detection and failback
1745 	 */
1746 	switch (pii->pii_phyint->pi_state) {
1747 	case PI_FAILED:
1748 		/*
1749 		 * If the most recent probe (excluding unacked probes that
1750 		 * are yet to time out) has been acked, check whether the
1751 		 * phyint is now repaired. If the phyint is repaired, then
1752 		 * attempt failback, unless it is an inactive standby.
1753 		 */
1754 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1755 			phyint_check_for_repair(pii->pii_phyint);
1756 		}
1757 		break;
1758 
1759 	case PI_RUNNING:
1760 		/*
1761 		 * It's possible our probes have been lost because of a
1762 		 * spanning-tree mandated quiet period on the switch.  If so,
1763 		 * ignore the lost probes and consider the interface to still
1764 		 * be functioning.
1765 		 */
1766 		cur_hrtime = gethrtime();
1767 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1768 			break;
1769 
1770 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1771 			/*
1772 			 * We have 1 or more failed probes (excluding unacked
1773 			 * probes that are yet to time out). Determine if the
1774 			 * phyint has failed. If so attempt a failover,
1775 			 * unless it is an inactive standby
1776 			 */
1777 			phyint_inst_check_for_failure(pii);
1778 		}
1779 		break;
1780 
1781 	default:
1782 		logerr("phyint_inst_timer: invalid state %d\n",
1783 		    pii->pii_phyint->pi_state);
1784 		abort();
1785 	}
1786 
1787 	/*
1788 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1789 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1790 	 * was called, the target list may be empty.
1791 	 */
1792 	if (pii->pii_target_next != NULL) {
1793 		probe(pii, PROBE_UNI, cur_time);
1794 		/*
1795 		 * If we have just the one probe target, and we're not using
1796 		 * router targets, try to find another as we presently have
1797 		 * no resilience.
1798 		 */
1799 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1800 			probe(pii, PROBE_MULTI, cur_time);
1801 	} else {
1802 		probe(pii, PROBE_MULTI, cur_time);
1803 	}
1804 	return (interval);
1805 }
1806 
1807 /*
1808  * Start the probe timer for an interface instance.
1809  */
1810 void
1811 start_timer(struct phyint_instance *pii)
1812 {
1813 	uint32_t interval;
1814 
1815 	/*
1816 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1817 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1818 	 * pi_snxt_basetime is strictly periodic with a frequency of
1819 	 * the group's probe interval. The actual probe time pi_snxt_time
1820 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1821 	 * For the 1st probe on each phyint after the timer is started,
1822 	 * pi_snxt_time and pi_snxt_basetime are the same.
1823 	 */
1824 	interval = GET_RANDOM(0,
1825 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1826 
1827 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1828 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1829 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1830 	timer_schedule(interval);
1831 }
1832 
1833 /*
1834  * Restart the probe timer on an interface instance.
1835  */
1836 static void
1837 restart_timer(struct phyint_instance *pii)
1838 {
1839 	/*
1840 	 * We don't need to restart the timer if it was never started in
1841 	 * the first place (pii->pii_basetime_inited not set), as the timer
1842 	 * won't have gone off yet.
1843 	 */
1844 	if (pii->pii_basetime_inited != 0) {
1845 
1846 		if (debug & D_LINKNOTE)
1847 			logdebug("restart timer: restarting timer on %s, "
1848 			    "address family %s\n", pii->pii_phyint->pi_name,
1849 			    AF_STR(pii->pii_af));
1850 
1851 		start_timer(pii);
1852 	}
1853 }
1854 
1855 static void
1856 process_link_state_down(struct phyint *pi)
1857 {
1858 	logerr("The link has gone down on %s\n", pi->pi_name);
1859 
1860 	/*
1861 	 * Clear the probe statistics arrays, we don't want the repair
1862 	 * detection logic relying on probes that were succesful prior
1863 	 *  to the link going down.
1864 	 */
1865 	if (PROBE_CAPABLE(pi->pi_v4))
1866 		clear_pii_probe_stats(pi->pi_v4);
1867 	if (PROBE_CAPABLE(pi->pi_v6))
1868 		clear_pii_probe_stats(pi->pi_v6);
1869 	/*
1870 	 * Check for interface failure.  Although we know the interface
1871 	 * has failed, we don't know if all the other interfaces in the
1872 	 * group have failed as well.
1873 	 */
1874 	if ((pi->pi_state == PI_RUNNING) ||
1875 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1876 		if (debug & D_LINKNOTE) {
1877 			logdebug("process_link_state_down:"
1878 			    " checking for failure on %s\n", pi->pi_name);
1879 		}
1880 
1881 		if (pi->pi_v4 != NULL)
1882 			phyint_inst_check_for_failure(pi->pi_v4);
1883 		else if (pi->pi_v6 != NULL)
1884 			phyint_inst_check_for_failure(pi->pi_v6);
1885 	}
1886 }
1887 
1888 static void
1889 process_link_state_up(struct phyint *pi)
1890 {
1891 	logerr("The link has come up on %s\n", pi->pi_name);
1892 
1893 	/*
1894 	 * We stopped any running timers on each instance when the link
1895 	 * went down, so restart them.
1896 	 */
1897 	if (pi->pi_v4)
1898 		restart_timer(pi->pi_v4);
1899 	if (pi->pi_v6)
1900 		restart_timer(pi->pi_v6);
1901 
1902 	phyint_check_for_repair(pi);
1903 
1904 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1905 	if (pi->pi_whendx == LINK_UP_PERMIN)
1906 		pi->pi_whendx = 0;
1907 }
1908 
1909 /*
1910  * Process any changes in link state passed up from the interfaces.
1911  */
1912 void
1913 process_link_state_changes(void)
1914 {
1915 	struct phyint *pi;
1916 
1917 	/* Look for interfaces where the link state has just changed */
1918 
1919 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1920 		boolean_t old_link_state_up = LINK_UP(pi);
1921 
1922 		/*
1923 		 * Except when the "phyint" structure is created, this is
1924 		 * the only place the link state is updated.  This allows
1925 		 * this routine to detect changes in link state, rather
1926 		 * than just the current state.
1927 		 */
1928 		UPDATE_LINK_STATE(pi);
1929 
1930 		if (LINK_DOWN(pi)) {
1931 			/*
1932 			 * Has link just gone down?
1933 			 */
1934 			if (old_link_state_up)
1935 				process_link_state_down(pi);
1936 		} else {
1937 			/*
1938 			 * Has link just gone back up?
1939 			 */
1940 			if (!old_link_state_up)
1941 				process_link_state_up(pi);
1942 		}
1943 	}
1944 }
1945 
1946 void
1947 reset_crtt_all(struct phyint *pi)
1948 {
1949 	struct phyint_instance *pii;
1950 	struct target *tg;
1951 
1952 	pii = pi->pi_v4;
1953 	if (pii != NULL) {
1954 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1955 			tg->tg_crtt = 0;
1956 			tg->tg_rtt_sa = -1;
1957 			tg->tg_rtt_sd = 0;
1958 		}
1959 	}
1960 
1961 	pii = pi->pi_v6;
1962 	if (pii != NULL) {
1963 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1964 			tg->tg_crtt = 0;
1965 			tg->tg_rtt_sa = -1;
1966 			tg->tg_rtt_sd = 0;
1967 		}
1968 	}
1969 }
1970 
1971 /*
1972  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
1973  * probes on both instances IPv4 and IPv6.
1974  * If the interface has failed, return the time of the first probe failure
1975  * in "tff".
1976  */
1977 static int
1978 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
1979 {
1980 	uint_t	pi_tff;
1981 	struct	target *cur_tg;
1982 	struct	probe_fail_count pfinfo;
1983 	struct	phyint_instance *pii_other;
1984 	int	pr_ndx;
1985 
1986 	/*
1987 	 * Get the number of consecutive failed probes on
1988 	 * this phyint across all targets. Also get the number
1989 	 * of consecutive failed probes on this target only
1990 	 */
1991 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1992 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
1993 	probe_fail_info(pii, cur_tg, &pfinfo);
1994 
1995 	/* Get the time of first failure, for later use */
1996 	pi_tff = pfinfo.pf_tff;
1997 
1998 	/*
1999 	 * If the current target has not responded to the
2000 	 * last NUM_PROBE_FAILS probes, and other targets are
2001 	 * responding delete this target. Dead gateway detection
2002 	 * will eventually remove this target (if router) from the
2003 	 * routing tables. If that does not occur, we may end
2004 	 * up adding this to our list again.
2005 	 */
2006 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2007 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2008 		if (pii->pii_targets_are_routers) {
2009 			if (cur_tg->tg_status == TG_ACTIVE)
2010 				pii->pii_ntargets--;
2011 			cur_tg->tg_status = TG_DEAD;
2012 			cur_tg->tg_crtt = 0;
2013 			cur_tg->tg_rtt_sa = -1;
2014 			cur_tg->tg_rtt_sd = 0;
2015 			if (pii->pii_target_next == cur_tg)
2016 				pii->pii_target_next = target_next(cur_tg);
2017 		} else {
2018 			target_delete(cur_tg);
2019 			probe(pii, PROBE_MULTI, getcurrenttime());
2020 		}
2021 		return (PHYINT_OK);
2022 	}
2023 
2024 	/*
2025 	 * If the phyint has lost NUM_PROBE_FAILS or more
2026 	 * consecutive probes, on both IPv4 and IPv6 protocol
2027 	 * instances of the phyint, then trigger failure
2028 	 * detection, else return false
2029 	 */
2030 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2031 		return (PHYINT_OK);
2032 
2033 	pii_other = phyint_inst_other(pii);
2034 	if (PROBE_CAPABLE(pii_other)) {
2035 		probe_fail_info(pii_other, NULL, &pfinfo);
2036 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2037 			/*
2038 			 * We have NUM_PROBE_FAILS or more failures
2039 			 * on both IPv4 and IPv6. Get the earliest
2040 			 * time when failure was detected on this
2041 			 * phyint across IPv4 and IPv6.
2042 			 */
2043 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2044 				pi_tff = pfinfo.pf_tff;
2045 		} else {
2046 			/*
2047 			 * This instance has < NUM_PROBE_FAILS failure.
2048 			 * So return false
2049 			 */
2050 			return (PHYINT_OK);
2051 		}
2052 	}
2053 	*tff = pi_tff;
2054 	return (PHYINT_FAILURE);
2055 }
2056 
2057 /*
2058  * Check if the link has gone down on this phyint, or it has failed the
2059  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2060  * Also look at other phyints of this group, for group failures.
2061  */
2062 int
2063 failure_state(struct phyint_instance *pii)
2064 {
2065 	struct	probe_success_count psinfo;
2066 	uint_t	pi2_tls;		/* time last success */
2067 	uint_t	pi_tff;			/* time first fail */
2068 	struct	phyint	*pi2;
2069 	struct	phyint *pi;
2070 	struct	phyint_instance *pii2;
2071 	struct  phyint_group *pg;
2072 	boolean_t alone;
2073 
2074 	if (debug & D_FAILOVER)
2075 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2076 
2077 	pi = pii->pii_phyint;
2078 	pg = pi->pi_group;
2079 
2080 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2081 	    PHYINT_OK)
2082 		return (PHYINT_OK);
2083 
2084 	/*
2085 	 * At this point, the link is down, or the phyint is suspect,
2086 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
2087 	 * does not belong to any group, or is the only member of the
2088 	 * group capable of being probed, return PHYINT_FAILURE.
2089 	 */
2090 	alone = _B_TRUE;
2091 	if (pg != phyint_anongroup) {
2092 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2093 			if (pi2 == pi)
2094 				continue;
2095 			if (PROBE_CAPABLE(pi2->pi_v4) ||
2096 			    PROBE_CAPABLE(pi2->pi_v6)) {
2097 				alone = _B_FALSE;
2098 				break;
2099 			}
2100 		}
2101 	}
2102 	if (alone)
2103 		return (PHYINT_FAILURE);
2104 
2105 	/*
2106 	 * Need to compare against other phyints of the same group
2107 	 * to exclude group failures. If the failure was detected via
2108 	 * probing, then if the time of last success (tls) of any
2109 	 * phyint is more recent than the time of first fail (tff) of the
2110 	 * phyint in question, and the link is up on the phyint,
2111 	 * then it is a phyint failure. Otherwise it is a group failure.
2112 	 * If failure was detected via a link down notification sent from
2113 	 * the driver to IP, we see if any phyints in the group are still
2114 	 * running and haven't received a link down notification.  We
2115 	 * will usually be processing the link down notification shortly
2116 	 * after it was received, so there is no point looking at the tls
2117 	 * of other phyints.
2118 	 */
2119 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2120 		/* Exclude ourself from comparison */
2121 		if (pi2 == pi)
2122 			continue;
2123 
2124 		if (LINK_DOWN(pi)) {
2125 			/*
2126 			 * We use FLAGS_TO_LINK_STATE() to test the
2127 			 * flags directly, rather then LINK_UP() or
2128 			 * LINK_DOWN(), as we may not have got round
2129 			 * to processing the link state for the other
2130 			 * phyints in the group yet.
2131 			 *
2132 			 * The check for PI_RUNNING and group
2133 			 * failure handles the case when the
2134 			 * group begins to recover.  The first
2135 			 * phyint to recover should not trigger
2136 			 * a failover from the soon-to-recover
2137 			 * other phyints to the first recovered
2138 			 * phyint. PI_RUNNING will be set, and
2139 			 * pg_groupfailed cleared only after
2140 			 * receipt of NUM_PROBE_REPAIRS, by
2141 			 * which time the other phyints should
2142 			 * have received at least 1 packet,
2143 			 * and so will not have NUM_PROBE_FAILS.
2144 			 */
2145 			if ((pi2->pi_state == PI_RUNNING) &&
2146 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
2147 				return (PHYINT_FAILURE);
2148 		} else {
2149 			/*
2150 			 * Need to compare against both IPv4 and
2151 			 * IPv6 instances.
2152 			 */
2153 			pii2 = pi2->pi_v4;
2154 			if (pii2 != NULL) {
2155 				probe_success_info(pii2, NULL, &psinfo);
2156 				if (psinfo.ps_tls_valid) {
2157 					pi2_tls = psinfo.ps_tls;
2158 					/*
2159 					 * See comment above regarding check
2160 					 * for PI_RUNNING and group failure.
2161 					 */
2162 					if (TIME_GT(pi2_tls, pi_tff) &&
2163 					    (pi2->pi_state == PI_RUNNING) &&
2164 					    !GROUP_FAILED(pg) &&
2165 					    FLAGS_TO_LINK_STATE(pi2))
2166 						return (PHYINT_FAILURE);
2167 				}
2168 			}
2169 
2170 			pii2 = pi2->pi_v6;
2171 			if (pii2 != NULL) {
2172 				probe_success_info(pii2, NULL, &psinfo);
2173 				if (psinfo.ps_tls_valid) {
2174 					pi2_tls = psinfo.ps_tls;
2175 					/*
2176 					 * See comment above regarding check
2177 					 * for PI_RUNNING and group failure.
2178 					 */
2179 					if (TIME_GT(pi2_tls, pi_tff) &&
2180 					    (pi2->pi_state == PI_RUNNING) &&
2181 					    !GROUP_FAILED(pg) &&
2182 					    FLAGS_TO_LINK_STATE(pi2))
2183 						return (PHYINT_FAILURE);
2184 				}
2185 			}
2186 		}
2187 	}
2188 
2189 	/*
2190 	 * Change the group state to PG_FAILED if it's not already.
2191 	 */
2192 	if (!GROUP_FAILED(pg))
2193 		phyint_group_chstate(pg, PG_FAILED);
2194 
2195 	return (GROUP_FAILURE);
2196 }
2197 
2198 /*
2199  * Return the information associated with consecutive probe successes
2200  * starting with the most recent probe. At most the last 2 probes can be
2201  * in the unacknowledged state. All previous probes have either failed
2202  * or succeeded.
2203  */
2204 static void
2205 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2206     struct probe_success_count *psinfo)
2207 {
2208 	uint_t	i;
2209 	struct probe_stats *pr_statp;
2210 	uint_t most_recent;
2211 	uint_t second_most_recent;
2212 	boolean_t pi_found_failure = _B_FALSE;
2213 	boolean_t tg_found_failure = _B_FALSE;
2214 	uint_t now;
2215 	uint_t timeout;
2216 	struct target *tg;
2217 
2218 	if (debug & D_FAILOVER)
2219 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2220 
2221 	bzero(psinfo, sizeof (*psinfo));
2222 	now = getcurrenttime();
2223 
2224 	/*
2225 	 * Start with the most recent probe, and count the number
2226 	 * of consecutive probe successes. Latch the number of successes
2227 	 * on hitting a failure.
2228 	 */
2229 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2230 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2231 
2232 	for (i = most_recent; i != pii->pii_probe_next;
2233 	    i = PROBE_INDEX_PREV(i)) {
2234 		pr_statp = &pii->pii_probes[i];
2235 
2236 		switch (pr_statp->pr_status) {
2237 		case PR_UNACKED:
2238 			/*
2239 			 * Only the most recent 2 probes can be unacknowledged
2240 			 */
2241 			assert(i == most_recent || i == second_most_recent);
2242 
2243 			tg = pr_statp->pr_target;
2244 			assert(tg != NULL);
2245 			/*
2246 			 * The crtt could be zero for some reason,
2247 			 * Eg. the phyint could be failed. If the crtt is
2248 			 * not available use the value of the group's probe
2249 			 * interval which is a worst case estimate.
2250 			 */
2251 			if (tg->tg_crtt != 0) {
2252 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2253 			} else {
2254 				timeout = pr_statp->pr_time_sent +
2255 				    pii->pii_phyint->pi_group->pg_probeint;
2256 			}
2257 
2258 			if (TIME_LT(timeout, now)) {
2259 				/*
2260 				 * We hit a failure. Latch the total number of
2261 				 * recent consecutive successes.
2262 				 */
2263 				pr_statp->pr_time_lost = timeout;
2264 				pr_statp->pr_status = PR_LOST;
2265 				pi_found_failure = _B_TRUE;
2266 				if (cur_tg != NULL && tg == cur_tg) {
2267 					/*
2268 					 * We hit a failure for the desired
2269 					 * target. Latch the number of recent
2270 					 * consecutive successes for this target
2271 					 */
2272 					tg_found_failure = _B_TRUE;
2273 				}
2274 			}
2275 			break;
2276 
2277 		case PR_ACKED:
2278 			/*
2279 			 * Bump up the count of probe successes, if we
2280 			 * have not seen any failure so far.
2281 			 */
2282 			if (!pi_found_failure)
2283 				psinfo->ps_nsucc++;
2284 
2285 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2286 			    !tg_found_failure) {
2287 				psinfo->ps_nsucc_tg++;
2288 			}
2289 
2290 			/*
2291 			 * Record the time of last success, if this is
2292 			 * the most recent probe success.
2293 			 */
2294 			if (!psinfo->ps_tls_valid) {
2295 				psinfo->ps_tls = pr_statp->pr_time_acked;
2296 				psinfo->ps_tls_valid = _B_TRUE;
2297 			}
2298 			break;
2299 
2300 		case PR_LOST:
2301 			/*
2302 			 * We hit a failure. Latch the total number of
2303 			 * recent consecutive successes.
2304 			 */
2305 			pi_found_failure = _B_TRUE;
2306 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2307 				/*
2308 				 * We hit a failure for the desired target.
2309 				 * Latch the number of recent consecutive
2310 				 * successes for this target
2311 				 */
2312 				tg_found_failure = _B_TRUE;
2313 			}
2314 			break;
2315 
2316 		default:
2317 			return;
2318 
2319 		}
2320 	}
2321 }
2322 
2323 /*
2324  * Return the information associated with consecutive probe failures
2325  * starting with the most recent probe. Only the last 2 probes can be in the
2326  * unacknowledged state. All previous probes have either failed or succeeded.
2327  */
2328 static void
2329 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2330     struct probe_fail_count *pfinfo)
2331 {
2332 	int	i;
2333 	struct probe_stats *pr_statp;
2334 	boolean_t	tg_found_success = _B_FALSE;
2335 	boolean_t	pi_found_success = _B_FALSE;
2336 	int	most_recent;
2337 	int	second_most_recent;
2338 	uint_t	now;
2339 	uint_t	timeout;
2340 	struct	target *tg;
2341 
2342 	if (debug & D_FAILOVER)
2343 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2344 
2345 	bzero(pfinfo, sizeof (*pfinfo));
2346 	now = getcurrenttime();
2347 
2348 	/*
2349 	 * Start with the most recent probe, and count the number
2350 	 * of consecutive probe failures. Latch the number of failures
2351 	 * on hitting a probe success.
2352 	 */
2353 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2354 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2355 
2356 	for (i = most_recent; i != pii->pii_probe_next;
2357 	    i = PROBE_INDEX_PREV(i)) {
2358 		pr_statp = &pii->pii_probes[i];
2359 
2360 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2361 
2362 		switch (pr_statp->pr_status) {
2363 		case PR_UNACKED:
2364 			/*
2365 			 * Only the most recent 2 probes can be unacknowledged
2366 			 */
2367 			assert(i == most_recent || i == second_most_recent);
2368 
2369 			tg = pr_statp->pr_target;
2370 			/*
2371 			 * Target is guaranteed to exist in the unack. state
2372 			 */
2373 			assert(tg != NULL);
2374 			/*
2375 			 * The crtt could be zero for some reason,
2376 			 * Eg. the phyint could be failed. If the crtt is
2377 			 * not available use the group's probe interval,
2378 			 * which is a worst case estimate.
2379 			 */
2380 			if (tg->tg_crtt != 0) {
2381 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2382 			} else {
2383 				timeout = pr_statp->pr_time_sent +
2384 				    pii->pii_phyint->pi_group->pg_probeint;
2385 			}
2386 
2387 			if (TIME_GT(timeout, now))
2388 				break;
2389 
2390 			pr_statp->pr_time_lost = timeout;
2391 			pr_statp->pr_status = PR_LOST;
2392 			/* FALLTHRU */
2393 
2394 		case PR_LOST:
2395 			if (!pi_found_success) {
2396 				pfinfo->pf_nfail++;
2397 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2398 			}
2399 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2400 			    !tg_found_success)  {
2401 				pfinfo->pf_nfail_tg++;
2402 			}
2403 			break;
2404 
2405 		default:
2406 			/*
2407 			 * We hit a success or unused slot. Latch the
2408 			 * total number of recent consecutive failures.
2409 			 */
2410 			pi_found_success = _B_TRUE;
2411 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2412 				/*
2413 				 * We hit a success for the desired target.
2414 				 * Latch the number of recent consecutive
2415 				 * failures for this target
2416 				 */
2417 				tg_found_success = _B_TRUE;
2418 			}
2419 		}
2420 	}
2421 }
2422 
2423 /*
2424  * Check if the phyint has been repaired.  If no test address has been
2425  * configured, then consider the interface repaired if the link is up (unless
2426  * the link is flapping; see below).  Otherwise, look for proof of probes
2427  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2428  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2429  */
2430 static boolean_t
2431 phyint_repaired(struct phyint *pi)
2432 {
2433 	struct	probe_success_count psinfo;
2434 	struct	phyint_instance *pii;
2435 	struct	target *cur_tg;
2436 	int	pr_ndx;
2437 	uint_t	cur_time;
2438 
2439 	if (debug & D_FAILOVER)
2440 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2441 
2442 	if (LINK_DOWN(pi))
2443 		return (_B_FALSE);
2444 
2445 	/*
2446 	 * If we don't have any test addresses and the link is up, then
2447 	 * consider the interface repaired, unless we've received more than
2448 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2449 	 * which case we keep the link down until we drop back below
2450 	 * the threshold.
2451 	 */
2452 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2453 		cur_time = getcurrenttime();
2454 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2455 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2456 			pi->pi_lfmsg_printed = 0;
2457 			return (_B_TRUE);
2458 		}
2459 		if (!pi->pi_lfmsg_printed) {
2460 			logerr("The link has come up on %s more than %d times "
2461 			    "in the last minute; disabling failback until it "
2462 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2463 			pi->pi_lfmsg_printed = 1;
2464 		}
2465 
2466 		return (_B_FALSE);
2467 	}
2468 
2469 	pii = pi->pi_v4;
2470 	if (PROBE_CAPABLE(pii)) {
2471 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2472 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2473 		probe_success_info(pii, cur_tg, &psinfo);
2474 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2475 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2476 			return (_B_TRUE);
2477 	}
2478 
2479 	pii = pi->pi_v6;
2480 	if (PROBE_CAPABLE(pii)) {
2481 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2482 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2483 		probe_success_info(pii, cur_tg, &psinfo);
2484 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2485 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2486 			return (_B_TRUE);
2487 	}
2488 
2489 	return (_B_FALSE);
2490 }
2491 
2492 /*
2493  * Try failover from phyint 'pi' to a suitable destination.
2494  */
2495 int
2496 try_failover(struct phyint *pi, int failover_type)
2497 {
2498 	struct phyint *dst;
2499 	int err;
2500 
2501 	if (debug & D_FAILOVER)
2502 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
2503 
2504 	/*
2505 	 * Attempt to find a failover destination 'dst'.
2506 	 * dst will be null if any of the following is true
2507 	 * Phyint is not part of a group  OR
2508 	 * Phyint is the only member of a group OR
2509 	 * No suitable failover dst was available
2510 	 */
2511 	dst = get_failover_dst(pi, failover_type);
2512 	if (dst == NULL)
2513 		return (IPMP_EMINRED);
2514 
2515 	dst->pi_empty = 0;			/* Per state diagram */
2516 	pi->pi_full = 0;			/* Per state diagram */
2517 
2518 	err = failover(pi, dst);
2519 
2520 	if (debug & D_FAILOVER) {
2521 		logdebug("failed over from %s to %s ret %d\n",
2522 		    pi->pi_name, dst->pi_name, err);
2523 	}
2524 	if (err == 0) {
2525 		pi->pi_empty = 1;		/* Per state diagram */
2526 		/*
2527 		 * we don't want to print out this message if a
2528 		 * phyint is leaving the group, nor for failover from
2529 		 * standby
2530 		 */
2531 		if (failover_type == FAILOVER_NORMAL) {
2532 			logerr("Successfully failed over from NIC %s to NIC "
2533 			    "%s\n", pi->pi_name, dst->pi_name);
2534 		}
2535 		return (0);
2536 	} else {
2537 		/*
2538 		 * The failover did not succeed. We must retry the failover
2539 		 * only after resyncing our state based on the kernel's.
2540 		 * For eg. either the src or the dst might have been unplumbed
2541 		 * causing this failure. initifs() will be called again,
2542 		 * from main, since full_scan_required has been set to true
2543 		 * by failover();
2544 		 */
2545 		return (IPMP_FAILURE);
2546 	}
2547 }
2548 
/*
 * global_errno holds the errno value saved when failover() or failback()
 * fails. This value is sent to if_mpadm(1M).
 */
2553 int global_errno;
2554 
2555 /*
2556  * Attempt failover from phyint 'from' to phyint 'to'.
2557  * IP moves everything from phyint 'from' to phyint 'to'.
2558  */
2559 static int
2560 failover(struct phyint *from, struct phyint *to)
2561 {
2562 	struct	lifreq	lifr;
2563 	int 	ret;
2564 
2565 	if (debug & D_FAILOVER) {
2566 		logdebug("failing over from %s to %s\n",
2567 		    from->pi_name, to->pi_name);
2568 	}
2569 
2570 	/*
2571 	 * Perform the failover. Both IPv4 and IPv6 are failed over
2572 	 * using a single ioctl by passing in AF_UNSPEC family.
2573 	 */
2574 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2575 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2576 	lifr.lifr_movetoindex = to->pi_ifindex;
2577 
2578 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
2579 	if (ret < 0) {
2580 		global_errno = errno;
2581 		logperror("failover: ioctl (failover)");
2582 	}
2583 
2584 	/*
2585 	 * Set full_scan_required to true. This will make us read
2586 	 * the state from the kernel in initifs() and update our tables,
2587 	 * to reflect the current state after the failover. If the
2588 	 * failover has failed it will then reissue the failover.
2589 	 */
2590 	full_scan_required = _B_TRUE;
2591 	return (ret);
2592 }
2593 
2594 /*
2595  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
2596  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
2597  * Return values:
2598  * IPMP_SUCCESS:		Failback successful from each of the other
2599  *				phyints in the group.
2600  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
2601  *				phyints in the group.
2602  * IPMP_FAILURE:		Failback syscall failed with some error.
2603  *
2604  * Note that failback is attempted regardless of the setting of the
2605  * failback_enabled flag.
2606  */
2607 int
2608 do_failback(struct phyint *pi)
2609 {
2610 	struct  phyint *from;
2611 	boolean_t done;
2612 	boolean_t partial;
2613 	boolean_t attempted_failback = _B_FALSE;
2614 
2615 	if (debug & D_FAILOVER)
2616 		logdebug("do_failback(%s)\n", pi->pi_name);
2617 
2618 	/* If this phyint is not part of a named group, return. */
2619 	if (pi->pi_group == phyint_anongroup) {
2620 		pi->pi_full = 1;
2621 		return (IPMP_SUCCESS);
2622 	}
2623 
2624 	/*
2625 	 * Attempt failback from every phyint in the group to 'pi'.
2626 	 * The reason for doing this, instead of only from the
2627 	 * phyint to which we did the failover is given below.
2628 	 *
2629 	 * After 'pi' failed, if any app. tries to join on a multicast
2630 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
2631 	 * non-failed phyint in the group, instead of the failed phyint,
2632 	 * in.mpathd is not aware of this. Thus failing back only from the
2633 	 * interface to which 'pi' failed over, will failback the ipif's
2634 	 * but not the ilm's. So we need to failback from all members of
2635 	 * the phyint group
2636 	 */
2637 	done = _B_TRUE;
2638 	partial = _B_FALSE;
2639 	for (from = pi->pi_group->pg_phyint; from != NULL;
2640 	    from = from->pi_pgnext) {
2641 		/* Exclude ourself as a failback src */
2642 		if (from == pi)
2643 			continue;
2644 
2645 		/*
2646 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
2647 		 * phyint must also have IPv4 plumbed. Similar check
2648 		 * for IPv6. IP makes the same check. Otherwise the
2649 		 * failback will fail.
2650 		 */
2651 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
2652 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
2653 			partial = _B_TRUE;
2654 			continue;
2655 		}
2656 
2657 		pi->pi_empty = 0;	/* Per state diagram */
2658 		attempted_failback = _B_TRUE;
2659 		if (failback(from, pi) != 0) {
2660 			done = _B_FALSE;
2661 			break;
2662 		}
2663 	}
2664 
2665 	/*
2666 	 * We are done. No more phyint from which we can src the failback
2667 	 */
2668 	if (done) {
2669 		if (!partial)
2670 			pi->pi_full = 1;	/* Per state diagram */
2671 		/*
2672 		 * Don't print out a message unless there is a
2673 		 * transition from FAILED to RUNNING. For eg.
2674 		 * we don't want to print out this message if a
2675 		 * phyint is leaving the group, or at startup
2676 		 */
2677 		if (attempted_failback && (pi->pi_flags &
2678 		    (IFF_FAILED | IFF_OFFLINE))) {
2679 			logerr("Successfully failed back to NIC %s\n",
2680 			    pi->pi_name);
2681 		}
2682 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2683 	}
2684 
2685 	return (IPMP_FAILURE);
2686 }
2687 
2688 /*
2689  * This function is similar to do_failback() above, but respects the
2690  * failback_enabled flag for phyints in named groups.
2691  */
2692 int
2693 try_failback(struct phyint *pi)
2694 {
2695 	if (debug & D_FAILOVER)
2696 		logdebug("try_failback(%s)\n", pi->pi_name);
2697 
2698 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
2699 		return (IPMP_EFBDISABLED);
2700 
2701 	return (do_failback(pi));
2702 }
2703 
2704 /*
2705  * Failback everything from phyint 'from' that has the same ifindex
2706  * as phyint to's ifindex.
2707  */
2708 static int
2709 failback(struct phyint *from, struct phyint *to)
2710 {
2711 	struct lifreq lifr;
2712 	int ret;
2713 
2714 	if (debug & D_FAILOVER)
2715 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
2716 
2717 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2718 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2719 	lifr.lifr_movetoindex = to->pi_ifindex;
2720 
2721 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
2722 	if (ret < 0) {
2723 		global_errno = errno;
2724 		logperror("failback: ioctl (failback)");
2725 	}
2726 
2727 	/*
2728 	 * Set full_scan_required to true. This will make us read
2729 	 * the state from the kernel in initifs() and update our tables,
2730 	 * to reflect the current state after the failback. If the
2731 	 * failback has failed it will then reissue the failback.
2732 	 */
2733 	full_scan_required = _B_TRUE;
2734 
2735 	return (ret);
2736 }
2737 
2738 /*
2739  * Select a target phyint for failing over from 'pi'.
2740  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
2741  * target phyint is chosen as follows,
2742  *	1. Pick any inactive standby interface.
2743  *	2. If no inactive standby is available, select any phyint in the
2744  *	   same group that has the least number of logints, (excluding
2745  *	   IFF_NOFAILOVER and !IFF_UP logints)
2746  * If we are failing over from a standby, failover_type is
2747  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
2748  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
2749  * and we won't return NULL, as long as there is at least 1 other phyint
2750  * in the group.
2751  */
2752 static struct phyint *
2753 get_failover_dst(struct phyint *pi, int failover_type)
2754 {
2755 	struct phyint	*maybe = NULL;
2756 	struct phyint	*pi2;
2757 	struct phyint 	*last_choice = NULL;
2758 
2759 	if (pi->pi_group == phyint_anongroup)
2760 		return (NULL);
2761 
2762 	/*
2763 	 * Loop thru the phyints in the group, and pick the preferred
2764 	 * phyint for the target.
2765 	 */
2766 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2767 		/* Exclude ourself and offlined interfaces */
2768 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
2769 			continue;
2770 
2771 		/*
2772 		 * The chosen target phyint must have IPv4 instance
2773 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
2774 		 * for IPv6.
2775 		 */
2776 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
2777 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
2778 			continue;
2779 
2780 		/* The chosen target must be PI_RUNNING. */
2781 		if (pi2->pi_state != PI_RUNNING) {
2782 			last_choice = pi2;
2783 			continue;
2784 		}
2785 
2786 		if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
2787 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
2788 			return (pi2);
2789 		} else {
2790 			if (maybe == NULL)
2791 				maybe = pi2;
2792 			else if (logint_upcount(pi2) < logint_upcount(maybe))
2793 				maybe = pi2;
2794 		}
2795 	}
2796 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
2797 		return (last_choice);
2798 	else
2799 		return (maybe);
2800 }
2801 
2802 /*
2803  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2804  */
2805 boolean_t
2806 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
2807 {
2808 	int ifsock;
2809 	struct lifreq lifr;
2810 	uint64_t old_flags;
2811 
2812 	if (debug & D_FAILOVER) {
2813 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
2814 		    pi->pi_name, flags, (int)setfl);
2815 	}
2816 
2817 	if (pi->pi_v4 != NULL) {
2818 		ifsock = ifsock_v4;
2819 	} else  {
2820 		ifsock = ifsock_v6;
2821 	}
2822 
2823 	/*
2824 	 * Get the current flags from the kernel, and set/clear the
2825 	 * desired phyint flags. Since we set only phyint flags, we can
2826 	 * do it on either IPv4 or IPv6 instance.
2827 	 */
2828 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2829 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
2830 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2831 		if (errno != ENXIO)
2832 			logperror("change_lif_flags: ioctl (get flags)");
2833 		return (_B_FALSE);
2834 	}
2835 
2836 	old_flags = lifr.lifr_flags;
2837 	if (setfl)
2838 		lifr.lifr_flags |= flags;
2839 	else
2840 		lifr.lifr_flags &= ~flags;
2841 
2842 	if (old_flags == lifr.lifr_flags) {
2843 		/* No change in the flags. No need to send ioctl */
2844 		return (_B_TRUE);
2845 	}
2846 
2847 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2848 		if (errno != ENXIO)
2849 			logperror("change_lif_flags: ioctl (set flags)");
2850 		return (_B_FALSE);
2851 	}
2852 
2853 	/*
2854 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2855 	 * phyint flags.
2856 	 */
2857 	if (setfl)
2858 		pi->pi_flags |= flags;
2859 	else
2860 		pi->pi_flags &= ~flags;
2861 
2862 	if (pi->pi_v4)
2863 		pi->pi_v4->pii_flags = pi->pi_flags;
2864 
2865 	if (pi->pi_v6)
2866 		pi->pi_v6->pii_flags = pi->pi_flags;
2867 
2868 	return (_B_TRUE);
2869 }
2870 
/*
 * icmp cksum computation for IPv4.
 *
 * Computes the 16-bit one's complement of the one's complement sum of the
 * 'len' bytes starting at 'addr'. The data is consumed as host-order 16-bit
 * words; a trailing odd byte is placed in the first byte of an otherwise
 * zero word before being added. Returns the checksum in the low 16 bits of
 * the result. A 'len' of zero or less yields 0xffff.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t *w = addr;
	ushort_t odd_byte = 0;
	ushort_t answer;
	uint64_t sum = 0;
	int nleft = len;

	/*
	 * Add sequential 16 bit words into an unsigned 64 bit accumulator.
	 * A signed 32 bit accumulator overflows (undefined behaviour) once
	 * the running total exceeds INT_MAX, which just over 64K bytes of
	 * 0xff is enough to trigger. 64 bits cannot overflow for any 'len'
	 * that fits in an int.
	 */
	while (nleft > 1) {
		sum += *w++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
		sum += odd_byte;
	}

	/*
	 * Fold the carries held above bit 15 back into the low 16 bits
	 * until none remain (end-around carry).
	 */
	while ((sum >> 16) != 0)
		sum = (sum >> 16) + (sum & 0xffff);

	answer = (ushort_t)~sum;		/* truncate to 16 bits */
	return (answer);
}
2908 
2909 static void
2910 reset_snxt_basetimes(void)
2911 {
2912 	struct phyint_instance *pii;
2913 
2914 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2915 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2916 	}
2917 }
2918 
2919 /*
2920  * Is the address one of our own addresses? Unfortunately,
2921  * we cannot check our phyint tables to determine if the address
2922  * is our own. This is because, we don't track interfaces that
2923  * are not part of any group. We have to either use a 'bind' or
2924  * get the complete list of all interfaces using SIOCGLIFCONF,
2925  * to do this check. We could also use SIOCTMYADDR.
2926  * Bind fails for the local zone address, so we might include local zone
2927  * address as target address. If local zone address is a target address
2928  * and it is up, it is not possible to detect the interface failure.
2929  * SIOCTMYADDR also doesn't consider local zone address as own address.
2930  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2931  * are stored in laddr_list.
2932  */
2933 
2934 boolean_t
2935 own_address(struct in6_addr addr)
2936 {
2937 	struct local_addr *taddr = laddr_list;
2938 
2939 	for (; taddr != NULL; taddr = taddr->next) {
2940 		if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) {
2941 			return (_B_TRUE);
2942 		}
2943 	}
2944 	return (_B_FALSE);
2945 }
2946