xref: /titanic_44/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision e2738c5e21a9e5d9a6525e48af4738deda3df455)
1 /*
2  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1987 Regents of the University of California.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms are permitted
11  * provided that the above copyright notice and this paragraph are
12  * duplicated in all such forms and that any documentation,
13  * advertising materials, and other materials related to such
14  * distribution and use acknowledge that the software was developed
15  * by the University of California, Berkeley. The name of the
16  * University may not be used to endorse or promote products derived
17  * from this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21  */
22 
23 #pragma ident	"%Z%%M%	%I%	%E% SMI"
24 
25 #include "mpd_defs.h"
26 #include "mpd_tables.h"
27 
/*
 * Probe types for probe().  The selected value is placed in the
 * pr_icmp_mtype field of the probe packet (after htonl()) and, when the
 * packet is echoed back, in_data() / in6_data() compare that field against
 * these values to pick the incoming_*_reply() handler.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */
36 
/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6.
 *
 * The multi-byte fields travel in network byte order: probe() fills them
 * in with htons() / htonl() and the reply handlers convert them back with
 * ntohs() / ntohl() before use.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field: echo request / reply */
	uint8_t  pr_icmp_code;		/* code field; replies must carry 0 */
	/*
	 * probe() computes the checksum with in_cksum() for IPv4 only; for
	 * IPv6 it sends the field as zero.
	 */
	uint16_t pr_icmp_cksum;		/* checksum field */
	/* Copy of pii_icmpid, which is already kept in network byte order */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number (from pii_snxt) */
	/* Sender's time in ms; echoed back and used to compute the rtt */
	uint32_t pr_icmp_timestamp;	/* Time stamp	*/
	/* One of PROBE_UNI, PROBE_MULTI or PROBE_RTT */
	uint32_t pr_icmp_mtype;		/* Message type */
};
51 
/*
 * IPv6 all-nodes multicast address (ff02::1). probe() uses it as the
 * destination of PROBE_MULTI packets on IPv6 instances.
 */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/*
 * IPv4 all-hosts multicast address (224.0.0.1). probe() uses it as the
 * destination of PROBE_MULTI packets on IPv4 instances.
 */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

/*
 * Compared against gethrtime() in incoming_rtt_reply(): the failure
 * detection time is only lowered again once MIN_SETTLING_TIME has passed
 * since this timestamp.
 * NOTE(review): the code that sets it is not in this part of the file.
 */
static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
60 
/*
 * Forward declarations of the file-local helpers.
 *
 * The three incoming_*_reply() routines are the handlers that in_data()
 * and in6_data() dispatch to, based on the pr_icmp_mtype of a validated
 * probe reply. Each receives the source address as an in6_addr; in_data()
 * passes IPv4 sources in v4-mapped form.
 */
static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
static void		pi_set_crtt(struct target *tg, int m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static int		failover(struct phyint *from, struct phyint *to);
static int		failback(struct phyint *from, struct phyint *to);
static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);
88 
89 /*
90  * CRTT - Conservative Round Trip Time Estimate
91  * Probe success - A matching probe reply received before CRTT ms has elapsed
92  *	after sending the probe.
93  * Probe failure - No probe reply received and more than CRTT ms has elapsed
94  *	after sending the probe.
95  *
96  * TLS - Time last success. Most recent probe ack received at this time.
97  * TFF - Time first fail. The time of the earliest probe failure in
98  *	a consecutive series of probe failures.
99  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
100  * 	before declaring phyint repair.
101  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
102  *	declare a phyint failure.
103  *
104  * 			Phyint state diagram
105  *
106  * The state of a phyint that is capable of being probed, is completely
107  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
108  *
109  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
110  * of the link (according to the driver).  If the phyint is also configured
111  * with a test address (the common case) and probe targets, then a phyint must
112  * also successfully be able to send and receive probes in order to remain in
113  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
114  *
115  * Further, if a PI_RUNNING phyint is configured with a test address but is
116  * unable to find any probe targets, it will transition to the PI_NOTARGETS
117  * state, which indicates that the link is apparently functional but that
118  * in.mpathd is unable to send probes to verify functionality (in this case,
119  * in.mpathd makes the optimistic assumption that the interface is working
120  * correctly and thus does not perform a failover, but reports the interface
121  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122  *
123  * At any point, a phyint may be administratively marked offline via if_mpadm.
124  * In this case, the interface always transitions to PI_OFFLINE, regardless
125  * of its previous state.  When the interface is later brought back online,
126  * in.mpathd acts as if the interface is new (and thus it transitions to
127  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
128  * its probes, if probes are sent).
129  *
130  * pi_state -  PI_RUNNING or PI_FAILED
131  *	PI_RUNNING: The failure detection logic says the phyint is good.
132  *	PI_FAILED: The failure detection logic says the phyint has failed.
133  *
134  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
135  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
136  *	In the case of router targets, we assume that the current list of
137  *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
139  *	list of targets, and multicast to the all hosts, to reconstruct the
140  *	target list. So the phyints are in the PI_NOTARGETS state.
141  *
142  * I -	value of (pi_flags & IFF_INACTIVE)
143  *	IFF_INACTIVE: No failovers have been done to this phyint, from
144  *		other phyints. This phyint is inactive. Phyint can be a Standby.
145  *		When failback has been disabled (FAILOVER=no configured),
146  *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
147  *		is set when phyint subsequently recovers after a failure.
148  *
149  * pi_empty
150  *	This phyint has failed over successfully to another phyint, and
151  *	this phyint is currently "empty". It does not host any addresses or
152  *	multicast membership etc. This is the state of a phyint after a
153  *	failover from the phyint has completed successfully and no subsequent
154  *	'failover to' or 'failback to' has occurred on the phyint.
155  *	IP guarantees that no new logicals will be hosted nor any multicast
156  *	joins permitted on the phyint, since the phyint is either failed or
157  *	inactive. pi_empty is set implies the phyint is either failed or
158  *	inactive.
159  *
160  * pi_full
161  *	The phyint hosts all of its own addresses that it "owns". If the
162  *	phyint was previously failed or inactive, failbacks to the phyint
 *	have completed successfully, i.e. no more failbacks to this phyint
164  *	can produce any change in system state whatsoever.
165  *
166  * Not all 32 possible combinations of the above 5-tuple are possible.
167  * Furthermore some of the above combinations are transient. They may occur
168  * only because the failover or failback did not complete successfully. The
169  * failover/failback will be retried and eventually a stable state will be
170  * reached.
171  *
172  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
173  * The following are the state machines. 'from' and 'to' are the src and
174  * dst of the failover/failback, below
175  *
176  *			pi_empty state machine
177  * ---------------------------------------------------------------------------
178  *	Event				State	->	New State
179  * ---------------------------------------------------------------------------
180  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
181  *	of failover
182  *
183  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
184  *
185  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
186  *
187  * 	group failure			pi_empty = X	  -> pi_empty = 0
188  * ---------------------------------------------------------------------------
189  *
190  *			pi_full state machine
191  * ---------------------------------------------------------------------------
192  *	Event				State		  -> New State
193  * ---------------------------------------------------------------------------
194  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
195  *	of failback from
196  *	each of the other phyints
197  *
198  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
199  *
200  *	group failure			pi_full = X	  -> pi_full = 0
201  * ---------------------------------------------------------------------------
202  *
203  *			pi_state state machine
204  * ---------------------------------------------------------------------------
205  *	Event			State			New State
206  *				Action:
207  * ---------------------------------------------------------------------------
208  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
209  *	detection		: set IFF_FAILED on this phyint
210  *				: failover from this phyint to another
211  *
212  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
213  *	detection		: set IFF_FAILED on this phyint
214  *
215  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=yes)
216  *	detection				     -> (PI_RUNNING, I == 0)
217  *				: to.pi_empty = 0
218  *				: clear IFF_FAILED on this phyint
219  *				: failback to this phyint if enabled
220  *
221  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=no)
222  *	detection				     ->	(PI_RUNNING, I == 1)
223  *				: to.pi_empty = 0
224  *				: clear IFF_FAILED on this phyint
225  *				: if failback is disabled set I == 1
226  *
227  *	Group failure		(perform on all phyints in the group)
228  *	detection 		PI_RUNNING		PI_FAILED
229  *	(Router targets)	: set IFF_FAILED
230  *				: clear pi_empty and pi_full
231  *
232  *	Group failure		(perform on all phyints in the group)
233  *	detection 		PI_RUNNING		PI_NOTARGETS
234  *	(Host targets)		: set IFF_FAILED
235  *				: clear pi_empty and pi_full
236  *				: delete the target list on all phyints
237  * ---------------------------------------------------------------------------
238  *
239  *			I state machine
240  * ---------------------------------------------------------------------------
241  *	Event		State			Action:
242  * ---------------------------------------------------------------------------
243  *	Turn on I 	pi_empty == 0, STANDBY 	: failover from standby
244  *
245  *	Turn off I 	PI_RUNNING, STANDBY	: pi_empty = 0
246  *			pi_full == 0		: failback to this if enabled
247  * ---------------------------------------------------------------------------
248  *
249  * Assertions: (Read '==>' as implies)
250  *
251  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
252  * (pi_empty == 1) ==> (pi_full == 0)
253  * (pi_full  == 1) ==> (pi_empty == 0)
254  *
255  * Invariants
256  *
257  * pg_groupfailed = 0  &&
258  *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
259  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
260  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
261  *
262  * 1. says that an inactive standby, that is not empty, has to be failed
263  * over. For a standby to be truly inactive, it should not host any
264  * addresses. So we move them to some other phyint. Usually we catch the
265  * turn on of IFF_INACTIVE, and perform this action. However if the failover
266  * did not complete successfully, then subsequently we have lost the edge
267  * trigger, and this invariant kicks in and completes the action.
268  *
269  * 2. says that any failed phyint that is not empty must be failed over.
270  * Usually we do the failover when we detect NIC failure. However if the
271  * failover does not complete successfully, this invariant kicks in and
272  * completes the failover. We exclude inactive standby which is covered by 1.
273  *
274  * 3. says that any running phyint that is not full must be failed back.
275  * Usually we do the failback when we detect NIC repair. However if the
276  * failback does not complete successfully, this invariant kicks in and
277  * completes the failback. Note that we don't want to failback to an inactive
278  * standby.
279  *
280  * The invariants 1 - 3 and the actions are in initifs().
281  */
282 
/*
 * Definition of the global probes_missed record. The structure type is
 * declared outside this file.
 * NOTE(review): nothing in this part of the file reads or writes it; from
 * the name it presumably accounts for probes that could not be sent on
 * schedule -- confirm against its users before relying on that.
 */
struct probes_missed probes_missed;
284 
285 /*
286  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
287  * will be added on by the kernel.  The id field identifies this phyint.
288  * and the sequence number is an increasing (modulo 2^^16) integer. The data
289  * portion holds the time value when the packet is sent. On echo this is
290  * extracted to compute the round-trip time. Three different types of
291  * probe packets are used.
292  *
293  * PROBE_UNI: This type is used to do failure detection / failure recovery
294  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
295  *	not less than the current CRTT. pii_probes[] stores data
296  *	about these probes. These packets consume sequence number space.
297  *
298  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
299  * 	are not used. Under heavy network load, the rtt may go up very high,
300  *	due to a spike, or may appear to go high, due to extreme scheduling
301  * 	delays. Once the network stress is removed, mpathd takes long time to
302  *	recover, because the probe_interval is already high, and it takes
303  *	a long time to send out sufficient number of probes to bring down the
304  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
305  *	user_probe_interval ms. and will cause only rtt updates. These packets
306  *	do not consume sequence number space nor is information about these
307  *	packets stored in the pii_probes[]
308  *
309  * PROBE_MULTI: This type is only used to construct a list of targets, when
310  *	no targets are known. The packet is multicast to the all hosts addr.
311  */
312 static void
313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
314 {
315 	struct pr_icmp probe_pkt;	/* Probe packet */
316 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
317 	struct sockaddr_in whereto; 	/* target address IPv4 */
318 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
319 	boolean_t sent = _B_TRUE;
320 
321 	if (debug & D_TARGET) {
322 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
323 		    pii->pii_name, probe_type, cur_time);
324 	}
325 
326 	assert(pii->pii_probe_sock != -1);
327 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
328 	    probe_type == PROBE_RTT);
329 
330 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
331 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
332 	probe_pkt.pr_icmp_code = 0;
333 	probe_pkt.pr_icmp_cksum = 0;
334 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
335 
336 	/*
337 	 * Since there is no need to do arithmetic on the icmpid,
338 	 * (only equality check is done) pii_icmpid is stored in
339 	 * network byte order at initialization itself.
340 	 */
341 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
342 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
343 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
344 
345 	/*
346 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
347 	 * the all hosts address. Otherwise it is unicast to the next target.
348 	 */
349 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
350 	    pii->pii_rtt_target_next != NULL));
351 
352 	if (pii->pii_af == AF_INET6) {
353 		bzero(&whereto6, sizeof (whereto6));
354 		whereto6.sin6_family = AF_INET6;
355 		if (probe_type == PROBE_MULTI) {
356 			whereto6.sin6_addr = all_nodes_mcast_v6;
357 		} else if (probe_type == PROBE_UNI) {
358 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
359 		} else  {
360 			/* type is PROBE_RTT */
361 			whereto6.sin6_addr =
362 			    pii->pii_rtt_target_next->tg_address;
363 		}
364 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
365 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
366 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
367 			logperror_pii(pii, "probe: probe sendto");
368 			sent = _B_FALSE;
369 		}
370 	} else {
371 		bzero(&whereto, sizeof (whereto));
372 		whereto.sin_family = AF_INET;
373 		if (probe_type == PROBE_MULTI) {
374 			whereto.sin_addr = all_nodes_mcast_v4;
375 		} else if (probe_type == PROBE_UNI) {
376 			IN6_V4MAPPED_TO_INADDR(
377 			    &pii->pii_target_next->tg_address,
378 			    &whereto.sin_addr);
379 		} else {
380 			/* type is PROBE_RTT */
381 			IN6_V4MAPPED_TO_INADDR(
382 			    &pii->pii_rtt_target_next->tg_address,
383 			    &whereto.sin_addr);
384 		}
385 
386 		/*
387 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
388 		 */
389 		probe_pkt.pr_icmp_cksum =
390 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
391 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
392 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
393 		    sizeof (whereto)) != sizeof (probe_pkt)) {
394 			logperror_pii(pii, "probe: probe sendto");
395 			sent = _B_FALSE;
396 		}
397 	}
398 
399 	/*
400 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
401 	 * update our tables. We will need this info in processing the probe
402 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
403 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
404 	 * are only used to construct a list of targets. PROBE_RTT packets are
405 	 * used only for updating the rtt and not for failure detection.
406 	 */
407 	if (probe_type == PROBE_UNI && sent) {
408 		pr_ndx = pii->pii_probe_next;
409 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
410 
411 		/* Collect statistics, before we reuse the last slot. */
412 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
413 			pii->pii_cum_stats.lost++;
414 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
415 			pii->pii_cum_stats.acked++;
416 		pii->pii_cum_stats.sent++;
417 
418 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
419 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
420 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
421 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
422 		pii->pii_target_next = target_next(pii->pii_target_next);
423 		assert(pii->pii_target_next != NULL);
424 		/*
425 		 * If we have a single variable to denote the next target to
426 		 * probe for both rtt probes and failure detection probes, we
427 		 * could end up with a situation where the failure detection
428 		 * probe targets become disjoint from the rtt probe targets.
429 		 * Eg. if 2 targets and the actual fdt is double the user
430 		 * specified fdt. So we have 2 variables. In this scheme
431 		 * we also reset pii_rtt_target_next for every fdt probe,
432 		 * though that may not be necessary.
433 		 */
434 		pii->pii_rtt_target_next = pii->pii_target_next;
435 		pii->pii_snxt++;
436 	} else if (probe_type == PROBE_RTT) {
437 		pii->pii_rtt_target_next =
438 		    target_next(pii->pii_rtt_target_next);
439 		assert(pii->pii_rtt_target_next != NULL);
440 	}
441 }
442 
443 /*
444  * Incoming IPv4 data from wire, is received here. Called from main.
445  */
446 void
447 in_data(struct phyint_instance *pii)
448 {
449 	struct	sockaddr_in 	from;
450 	struct	in6_addr	fromaddr;
451 	uint_t	fromlen;
452 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
453 	struct ip *ip;
454 	int 	iphlen;
455 	int 	len;
456 	char 	abuf[INET_ADDRSTRLEN];
457 	struct	pr_icmp	*reply;
458 
459 	if (debug & D_PROBE) {
460 		logdebug("in_data(%s %s)\n",
461 		    AF_STR(pii->pii_af), pii->pii_name);
462 	}
463 
464 	/*
465 	 * Poll has already told us that a message is waiting,
466 	 * on this socket. Read it now. We should not block.
467 	 */
468 	fromlen = sizeof (from);
469 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
470 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
471 	if (len < 0) {
472 		logperror_pii(pii, "in_data: recvfrom");
473 		return;
474 	}
475 
476 	/*
477 	 * If the NIC has indicated the link is down, don't go
478 	 * any further.
479 	 */
480 	if (LINK_DOWN(pii->pii_phyint))
481 		return;
482 
483 	/* Get the printable address for error reporting */
484 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
485 
486 	/* Make sure packet contains at least minimum ICMP header */
487 	ip = (struct ip *)in_packet;
488 	iphlen = ip->ip_hl << 2;
489 	if (len < iphlen + ICMP_MINLEN) {
490 		if (debug & D_PKTBAD) {
491 			logdebug("in_data: packet too short (%d bytes)"
492 			    " from %s\n", len, abuf);
493 		}
494 		return;
495 	}
496 
497 	/*
498 	 * Subtract the IP hdr length, 'len' will be length of the probe
499 	 * reply, starting from the icmp hdr.
500 	 */
501 	len -= iphlen;
502 	/* LINTED */
503 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
504 
505 	/* Probe replies are icmp echo replies. Ignore anything else */
506 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
507 		return;
508 
509 	/*
510 	 * The icmp id should match what we sent, which is stored
511 	 * in pi_icmpid. The icmp code for reply must be 0.
512 	 * The reply content must be a struct pr_icmp
513 	 */
514 	if (reply->pr_icmp_id != pii->pii_icmpid) {
515 		/* Not in response to our probe */
516 		return;
517 	}
518 
519 	if (reply->pr_icmp_code != 0) {
520 		logtrace("probe reply code %d from %s on %s\n",
521 		    reply->pr_icmp_code, abuf, pii->pii_name);
522 		return;
523 	}
524 
525 	if (len < sizeof (struct pr_icmp)) {
526 		logtrace("probe reply too short: %d bytes from %s on %s\n",
527 		    len, abuf, pii->pii_name);
528 		return;
529 	}
530 
531 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
532 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
533 		/* Unicast probe reply */
534 		incoming_echo_reply(pii, reply, fromaddr);
535 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
536 		/* Multicast reply */
537 		incoming_mcast_reply(pii, reply, fromaddr);
538 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
539 		incoming_rtt_reply(pii, reply, fromaddr);
540 	} else {
541 		/* Probably not in response to our probe */
542 		logtrace("probe reply type: %d from %s on %s\n",
543 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
544 		return;
545 	}
546 
547 }
548 
549 /*
550  * Incoming IPv6 data from wire is received here. Called from main.
551  */
552 void
553 in6_data(struct phyint_instance *pii)
554 {
555 	struct sockaddr_in6 from;
556 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
557 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
558 	int len;
559 	char abuf[INET6_ADDRSTRLEN];
560 	struct msghdr msg;
561 	struct iovec iov;
562 	uchar_t *opt;
563 	struct	pr_icmp *reply;
564 
565 	if (debug & D_PROBE) {
566 		logdebug("in6_data(%s %s)\n",
567 		    AF_STR(pii->pii_af), pii->pii_name);
568 	}
569 
570 	iov.iov_base = (char *)in_packet;
571 	iov.iov_len = sizeof (in_packet);
572 	msg.msg_iov = &iov;
573 	msg.msg_iovlen = 1;
574 	msg.msg_name = (struct sockaddr *)&from;
575 	msg.msg_namelen = sizeof (from);
576 	msg.msg_control = ancillary_data;
577 	msg.msg_controllen = sizeof (ancillary_data);
578 
579 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
580 		logperror_pii(pii, "in6_data: recvfrom");
581 		return;
582 	}
583 
584 	/*
585 	 * If the NIC has indicated that the link is down, don't go
586 	 * any further.
587 	 */
588 	if (LINK_DOWN(pii->pii_phyint))
589 		return;
590 
591 	/* Get the printable address for error reporting */
592 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
593 	if (len < ICMP_MINLEN) {
594 		if (debug & D_PKTBAD) {
595 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
596 			    msg.msg_flags, abuf);
597 		}
598 		return;
599 	}
600 	/* Ignore packets > 64k or control buffers that don't fit */
601 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
602 		if (debug & D_PKTBAD) {
603 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
604 			    msg.msg_flags, abuf);
605 		}
606 		return;
607 	}
608 
609 	reply = (struct pr_icmp *)in_packet;
610 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
611 		return;
612 
613 	if (reply->pr_icmp_id != pii->pii_icmpid) {
614 		/* Not in response to our probe */
615 		return;
616 	}
617 
618 	/*
619 	 * The kernel has already verified the the ICMP checksum.
620 	 */
621 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
622 		logtrace("ICMPv6 echo reply source address not linklocal from "
623 		    "%s on %s\n", abuf, pii->pii_name);
624 		return;
625 	}
626 	opt = find_ancillary(&msg, IPV6_RTHDR);
627 	if (opt != NULL) {
628 		/* Can't allow routing headers in probe replies  */
629 		logtrace("message with routing header from %s on %s\n",
630 		    abuf, pii->pii_name);
631 		return;
632 	}
633 	if (reply->pr_icmp_code != 0) {
634 		logtrace("probe reply code: %d from %s on %s\n",
635 		    reply->pr_icmp_code, abuf, pii->pii_name);
636 		return;
637 	}
638 	if (len < (sizeof (struct pr_icmp))) {
639 		logtrace("probe reply too short: %d bytes from %s on %s\n",
640 		    len, abuf, pii->pii_name);
641 		return;
642 	}
643 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
644 		incoming_echo_reply(pii, reply, from.sin6_addr);
645 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
646 		incoming_mcast_reply(pii, reply, from.sin6_addr);
647 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
648 		incoming_rtt_reply(pii, reply, from.sin6_addr);
649 	} else  {
650 		/* Probably not in response to our probe */
651 		logtrace("probe reply type: %d from %s on %s\n",
652 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
653 	}
654 }
655 
656 /*
657  * Process the incoming rtt reply, in response to our rtt probe.
658  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
659  * have any stored information about the probe we sent. So we don't log
660  * any errors if we receive bad replies.
661  */
662 static void
663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
664     struct in6_addr fromaddr)
665 {
666 	int 	m;		/* rtt measurment in ms */
667 	uint32_t cur_time;	/* in ms from some arbitrary point */
668 	char	abuf[INET6_ADDRSTRLEN];
669 	struct	target	*target;
670 	uint32_t pr_icmp_timestamp;
671 	struct 	phyint_group *pg;
672 
673 	/* Get the printable address for error reporting */
674 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
675 
676 	if (debug & D_PROBE) {
677 		logdebug("incoming_rtt_reply: %s %s %s\n",
678 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
679 	}
680 
681 	/* Do we know this target ? */
682 	target = target_lookup(pii, fromaddr);
683 	if (target == NULL)
684 		return;
685 
686 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
687 	cur_time = getcurrenttime();
688 	m = (int)(cur_time - pr_icmp_timestamp);
689 
690 	/* Invalid rtt. It has wrapped around */
691 	if (m < 0)
692 		return;
693 
694 	/*
695 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
696 	 * The initial few responses after the interface is repaired may
697 	 * contain high rtt's because they could have been queued up waiting
698 	 * for ARP/NDP resolution on a failed interface.
699 	 */
700 	pg = pii->pii_phyint->pi_group;
701 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
702 		return;
703 
704 	/*
705 	 * Update rtt only if the new rtt is lower than the current rtt.
706 	 * (specified by the 3rd parameter to pi_set_crtt).
707 	 * If a spike has caused the current probe_interval to be >
708 	 * user_probe_interval, then this mechanism is used to bring down
709 	 * the rtt rapidly once the network stress is removed.
710 	 * If the new rtt is higher than the current rtt, we don't want to
711 	 * update the rtt. We are having more than 1 outstanding probe and
712 	 * the increase in rtt we are seeing is being unnecessarily weighted
713 	 * many times. The regular rtt update will be handled by
714 	 * incoming_echo_reply() and will take care of any rtt increase.
715 	 */
716 	pi_set_crtt(target, m, _B_FALSE);
717 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
718 	    (user_failure_detection_time < pg->pg_fdt) &&
719 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
720 		/*
721 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
722 		 * investigate if we can improve the failure detection time to
723 		 * meet whatever the user specified.
724 		 */
725 		if (check_pg_crtt_improved(pg)) {
726 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
727 			    user_failure_detection_time);
728 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
729 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
730 				logerr("Improved failure detection time %d ms "
731 				    "on (%s %s) for group \"%s\"\n",
732 				    pg->pg_fdt, AF_STR(pii->pii_af),
733 				    pii->pii_name,
734 				    pii->pii_phyint->pi_group->pg_name);
735 			}
736 			if (user_failure_detection_time == pg->pg_fdt) {
737 				/* Avoid any truncation or rounding errors */
738 				pg->pg_probeint = user_probe_interval;
739 				/*
740 				 * No more rtt probes will be sent. The actual
741 				 * fdt has dropped to the user specified value.
742 				 * pii_fd_snxt_basetime and pii_snxt_basetime
743 				 * will be in sync henceforth.
744 				 */
745 				reset_snxt_basetimes();
746 			}
747 		}
748 	}
749 }
750 
/*
 * Process the incoming echo reply, in response to our unicast probe.
 * Common for both IPv4 and IPv6.
 *
 * pii      - phyint instance on which the reply was received
 * reply    - the received ICMP echo reply; its sequence number and
 *	      timestamp fields are converted from network byte order below
 * fromaddr - source address of the reply
 *
 * The reply is matched to a slot of pii->pii_probes[] by sequence number.
 * Out of window replies, replies whose source does not match the probed
 * target, duplicates, and replies for slots that have no target are
 * counted in pii_cum_stats.unknown and otherwise ignored. For an accepted
 * reply the slot is marked PR_ACKED and pii_rack is updated. If the
 * measured rtt is usable, the target's crtt is updated as well; depending
 * on the new crtt the target may be dropped as exceptionally slow, or the
 * group's probe interval and failure detection time may be raised or
 * lowered.
 */
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
{
	int 	m;		/* rtt measurement in ms */
	uint32_t cur_time;	/* in ms from some arbitrary point */
	char	abuf[INET6_ADDRSTRLEN];	/* printable form of fromaddr */
	int	pr_ndx;		/* slot of this probe in pii_probes[] */
	struct	target	*target;	/* target recorded in that slot */
	boolean_t exception;
	uint32_t pr_icmp_timestamp;	/* reply timestamp, host byte order */
	uint16_t pr_icmp_seq;	/* reply sequence number, host byte order */
	struct 	phyint_group *pg = pii->pii_phyint->pi_group;

	/* Get the printable address for error reporting */
	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));

	if (debug & D_PROBE) {
		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
		    AF_STR(pii->pii_af), pii->pii_name, abuf,
		    ntohs(reply->pr_icmp_seq));
	}

	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);

	/*
	 * Reject out of window probe replies. The receive window is
	 * [pii_snxt - PROBE_STATS_COUNT, pii_snxt).
	 */
	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		pii->pii_cum_stats.unknown++;
		return;
	}
	cur_time = getcurrenttime();
	/*
	 * Unsigned difference reinterpreted as signed, so that a reply
	 * timestamp that is ahead of cur_time shows up as a negative rtt.
	 */
	m = (int)(cur_time - pr_icmp_timestamp);
	if (m < 0) {
		/*
		 * This is a ridiculously high value of rtt. rtt has wrapped
		 * around. Log a message, and ignore the rtt. (The rtt is
		 * actually discarded by the 'm < 0' test further below, after
		 * the sanity checks on the reply have been done.)
		 */
		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
	}

	/*
	 * Get the probe index pr_ndx corresponding to the received icmp seq.
	 * number in our pii->pii_probes[] array. The icmp sequence number
	 * pii_snxt corresponds to the probe index pii->pii_probe_next
	 */
	pr_ndx = MOD_SUB(pii->pii_probe_next,
	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);

	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));

	target = pii->pii_probes[pr_ndx].pr_target;

	/*
	 * Perform sanity checks, whether this probe reply that we
	 * have received is genuine
	 */
	if (target != NULL) {
		/*
		 * Compare the src. addr of the received ICMP or ICMPv6
		 * probe reply with the target address in our tables.
		 */
		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
			/*
			 * We don't have any record of having sent a probe to
			 * this target. This is a fake probe reply. Log an error
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			/*
			 * The address matches, but our tables indicate that
			 * this probe reply has been acked already. So this
			 * is a duplicate probe reply. Log an error
			 */
			logtrace("probe status %d Duplicate probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		}
	} else {
		/*
		 * Target must not be NULL in the PR_UNACKED state
		 */
		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
			/*
			 * The probe stats slot is unused. So we didn't
			 * send out any probe to this target. This is a fake.
			 * Log an error.
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		}
		/*
		 * Whatever the slot status, a reply for a slot without a
		 * target is only counted and then dropped.
		 */
		pii->pii_cum_stats.unknown++;
		return;
	}

	/*
	 * If the rtt does not appear to be right, don't update the
	 * rtt stats. This can happen if the system dropped into the
	 * debugger, or the system was hung or too busy for a
	 * substantial time that we didn't get a chance to run.
	 */
	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
		/*
		 * If the probe corresponding to this received response
		 * was truly sent 'm' ms. ago, then this response must
		 * have been rejected by the sequence number checks. The
		 * fact that it has passed the sequence number checks
		 * means that the measured rtt is wrong. We were probably
		 * scheduled long after the packet was received.
		 */
		goto out;
	}

	/*
	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
	 * The initial few responses after the interface is repaired may
	 * contain high rtt's because they could have been queued up waiting
	 * for ARP/NDP resolution on a failed interface.
	 *
	 * The test below skips the rtt update for as long as this phyint
	 * instance is not in the PI_RUNNING state or the group has failed.
	 */
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
		goto out;

	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
	if (!highest_ack_tg(pr_icmp_seq, target))
		goto out;

	/*
	 * Always update the rtt. This is a failure detection probe
	 * and we want to measure both increase / decrease in rtt.
	 */
	pi_set_crtt(target, m, _B_TRUE);

	/*
	 * If the crtt exceeds the average time between probes,
	 * investigate if this slow target is an exception. If so we
	 * can avoid this target and still meet the failure detection
	 * time. Otherwise we can't meet the failure detection time.
	 */
	if (target->tg_crtt > pg->pg_probeint) {
		exception = check_exception_target(pii, target);
		if (exception) {
			/*
			 * This target is exceptionally slow. Don't use it
			 * for future probes. check_exception_target() has
			 * made sure that we have at least MIN_PROBE_TARGETS
			 * other active targets
			 */
			if (pii->pii_targets_are_routers) {
				/*
				 * This is a slow router, mark it as slow
				 * and don't use it for further probes. We
				 * don't delete it, since it will be populated
				 * again when we do a router scan. Hence we
				 * need to maintain extra state (unlike the
				 * host case below).  Mark it as TG_SLOW.
				 */
				if (target->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				target->tg_status = TG_SLOW;
				target->tg_latime = gethrtime();
				/* Forget the rtt history of this target */
				target->tg_rtt_sa = -1;
				target->tg_crtt = 0;
				target->tg_rtt_sd = 0;
				if (pii->pii_target_next == target) {
					pii->pii_target_next =
					    target_next(target);
				}
			} else {
				/*
				 * the slow target is not a router, we can
				 * just delete it. Send an icmp multicast and
				 * pick the fastest responder that is not
				 * already an active target. target_delete()
				 * adjusts pii->pii_target_next
				 *
				 * 'target' is not referenced again in this
				 * function after the delete.
				 */
				target_delete(target);
				probe(pii, PROBE_MULTI, cur_time);
			}
		} else {
			/*
			 * We can't meet the failure detection time.
			 * Log a message, and update the detection time to
			 * whatever we can achieve.
			 */
			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
			last_fdt_bumpup_time = gethrtime();
			if (pg != phyint_anongroup) {
				logerr("Cannot meet requested failure detection"
				    " time of %d ms on (%s %s) new failure"
				    " detection time for group \"%s\" is %d"
				    " ms\n", user_failure_detection_time,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name, pg->pg_fdt);
			}
		}
	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
	    (user_failure_detection_time < pg->pg_fdt) &&
	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
		/*
		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
		 * investigate if we can improve the failure detection time to
		 * meet whatever the user specified. This is only attempted
		 * while the current fdt is above the user specified value and
		 * MIN_SETTLING_TIME has passed since the last bump up.
		 */
		if (check_pg_crtt_improved(pg)) {
			/* Never go below the user specified fdt */
			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
			    user_failure_detection_time);
			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
			if (pg != phyint_anongroup) {
				logerr("Improved failure detection time %d ms "
				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name);
			}
			if (user_failure_detection_time == pg->pg_fdt) {
				/* Avoid any truncation or rounding errors */
				pg->pg_probeint = user_probe_interval;
				/*
				 * No more rtt probes will be sent. The actual
				 * fdt has dropped to the user specified value.
				 * pii_fd_snxt_basetime and pii_snxt_basetime
				 * will be in sync henceforth.
				 */
				reset_snxt_basetimes();
			}
		}
	}
out:
	/*
	 * Every reply that passed the sanity checks ends up here, whether
	 * or not its rtt was used: record the probe as acknowledged.
	 */
	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;

	/*
	 * Update pii->pii_rack, i.e. the sequence number of the last received
	 * probe response, based on the echo reply we have received now, if
	 * either of the following conditions are satisfied.
	 * a. pii_rack is outside the current receive window of
	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
	 *    This means we have not received probe responses for a
	 *    long time, and the sequence number has wrapped around.
	 * b. pii_rack is within the current receive window and this echo
	 *    reply corresponds to the highest sequence number we have seen
	 *    so far.
	 */
	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
		pii->pii_rack = pr_icmp_seq;
	}
}
1023 
1024 /*
1025  * Returns true if seq is the highest unacknowledged seq for target tg
1026  * else returns false
1027  */
1028 static boolean_t
1029 highest_ack_tg(uint16_t seq, struct target *tg)
1030 {
1031 	struct phyint_instance *pii;
1032 	int	 pr_ndx;
1033 	uint16_t pr_seq;
1034 
1035 	pii = tg->tg_phyint_inst;
1036 
1037 	/*
1038 	 * Get the seq number of the most recent probe sent so far,
1039 	 * and also get the corresponding probe index in the probe stats
1040 	 * array.
1041 	 */
1042 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1043 	pr_seq = pii->pii_snxt;
1044 	pr_seq--;
1045 
1046 	/*
1047 	 * Start from the most recent probe and walk back, trying to find
1048 	 * an acked probe corresponding to target tg.
1049 	 */
1050 	for (; pr_ndx != pii->pii_probe_next;
1051 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1052 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1053 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1054 			if (SEQ_GT(pr_seq, seq))
1055 				return (_B_FALSE);
1056 		}
1057 	}
1058 	return (_B_TRUE);
1059 }
1060 
1061 /*
1062  * Check whether the crtt for the group has improved by a factor of
1063  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1064  * detection time flapping in the face of small crtt changes.
1065  */
1066 static boolean_t
1067 check_pg_crtt_improved(struct phyint_group *pg)
1068 {
1069 	struct	phyint *pi;
1070 
1071 	if (debug & D_PROBE)
1072 		logdebug("check_pg_crtt_improved()\n");
1073 
1074 	/*
1075 	 * The crtt for the group is only improved if each phyint_instance
1076 	 * for both ipv4 and ipv6 is improved.
1077 	 */
1078 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1079 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1080 		    !check_pii_crtt_improved(pi->pi_v6))
1081 			return (_B_FALSE);
1082 	}
1083 
1084 	return (_B_TRUE);
1085 }
1086 
1087 /*
1088  * Check whether the crtt has improved substantially on this phyint_instance.
1089  * Returns _B_TRUE if there's no crtt information available, because pii
1090  * is NULL or the phyint_instance is not capable of probing.
1091  */
1092 boolean_t
1093 check_pii_crtt_improved(struct phyint_instance *pii) {
1094 	struct 	target *tg;
1095 
1096 	if (pii == NULL)
1097 		return (_B_TRUE);
1098 
1099 	if (!PROBE_CAPABLE(pii) ||
1100 	    pii->pii_phyint->pi_state == PI_FAILED)
1101 		return (_B_TRUE);
1102 
1103 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1104 		if (tg->tg_status != TG_ACTIVE)
1105 			continue;
1106 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1107 		    LOWER_FDT_TRIGGER)) {
1108 			return (_B_FALSE);
1109 		}
1110 	}
1111 
1112 	return (_B_TRUE);
1113 }
1114 
1115 /*
1116  * This target responds very slowly to probes. The target's crtt exceeds
1117  * the probe interval of its group. Compare against other targets
1118  * and determine if this target is an exception, if so return true, else false
1119  */
1120 static boolean_t
1121 check_exception_target(struct phyint_instance *pii, struct target *target)
1122 {
1123 	struct	target *tg;
1124 	char abuf[INET6_ADDRSTRLEN];
1125 
1126 	if (debug & D_PROBE) {
1127 		logdebug("check_exception_target(%s %s target %s)\n",
1128 		    AF_STR(pii->pii_af), pii->pii_name,
1129 		    pr_addr(pii->pii_af, target->tg_address,
1130 			abuf, sizeof (abuf)));
1131 	}
1132 
1133 	/*
1134 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1135 	 * to make a good judgement. Otherwise don't drop this target.
1136 	 */
1137 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1138 		return (_B_FALSE);
1139 
1140 	/*
1141 	 * Determine whether only this particular target is slow.
1142 	 * We know that this target's crtt exceeds the group's probe interval.
1143 	 * If all other active targets have a
1144 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1145 	 * then this target is considered slow.
1146 	 */
1147 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1148 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1149 			if (tg->tg_crtt >
1150 			    pii->pii_phyint->pi_group->pg_probeint /
1151 			    EXCEPTION_FACTOR) {
1152 				return (_B_FALSE);
1153 			}
1154 		}
1155 	}
1156 
1157 	return (_B_TRUE);
1158 }
1159 
1160 /*
1161  * Update the target list. The icmp all hosts multicast has given us
1162  * some host to which we can send probes. If we already have sufficient
1163  * targets, discard it.
1164  */
1165 static void
1166 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1167     struct in6_addr fromaddr)
1168 /* ARGSUSED */
1169 {
1170 	int af;
1171 	char abuf[INET6_ADDRSTRLEN];
1172 	struct phyint *pi;
1173 
1174 	if (debug & D_PROBE) {
1175 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1176 		    AF_STR(pii->pii_af), pii->pii_name,
1177 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1178 	}
1179 
1180 	/*
1181 	 * Using host targets is a fallback mechanism. If we have
1182 	 * found a router, don't add this host target. If we already
1183 	 * know MAX_PROBE_TARGETS, don't add another target.
1184 	 */
1185 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1186 	if (pii->pii_targets != NULL) {
1187 		if (pii->pii_targets_are_routers ||
1188 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1189 			return;
1190 		}
1191 	}
1192 
1193 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1194 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1195 		/*
1196 		 * Guard against response from 0.0.0.0
1197 		 * and ::. Log a trace message
1198 		 */
1199 		logtrace("probe response from %s on %s\n",
1200 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1201 		    pii->pii_name);
1202 		return;
1203 	}
1204 
1205 	/*
1206 	 * This address is one of our own, so reject this address as a
1207 	 * valid probe target.
1208 	 */
1209 	af = pii->pii_af;
1210 	if (own_address(fromaddr))
1211 		return;
1212 
1213 	/*
1214 	 * If the phyint is part a named group, then add the address to all
1215 	 * members of the group.  Otherwise, add the address only to the
1216 	 * phyint itself, since other phyints in the anongroup may not be on
1217 	 * the same subnet.
1218 	 */
1219 	pi = pii->pii_phyint;
1220 	if (pi->pi_group == phyint_anongroup) {
1221 		target_add(pii, fromaddr, _B_FALSE);
1222 	} else {
1223 		pi = pi->pi_group->pg_phyint;
1224 		for (; pi != NULL; pi = pi->pi_pgnext)
1225 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1226 	}
1227 }
1228 
1229 /*
1230  * Compute CRTT given an existing scaled average, scaled deviation estimate
1231  * and a new rtt time.  The formula is from Jacobson and Karels'
1232  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1233  * are the same as those in Appendix A.2 of that paper.
1234  *
1235  * m = new measurement
1236  * sa = scaled RTT average (8 * average estimates)
1237  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1238  * crtt = Conservative round trip time. Used to determine whether probe
1239  * has timed out.
1240  *
1241  * New scaled average and deviation are passed back via sap and svp
1242  */
1243 static int
1244 compute_crtt(int *sap, int *svp, int m)
1245 {
1246 	int sa = *sap;
1247 	int sv = *svp;
1248 	int crtt;
1249 	int saved_m = m;
1250 
1251 	assert(*sap >= -1);
1252 	assert(*svp >= 0);
1253 
1254 	if (sa != -1) {
1255 		/*
1256 		 * Update average estimator:
1257 		 *	new rtt = old rtt + 1/8 Error
1258 		 *	    where Error = m - old rtt
1259 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1260 		 *	i.e. new sa =  old sa + Error
1261 		 */
1262 		m -= sa >> 3;		/* m is now Error in estimate. */
1263 		if ((sa += m) < 0) {
1264 			/* Don't allow the smoothed average to be negative. */
1265 			sa = 0;
1266 		}
1267 
1268 		/*
1269 		 * Update deviation estimator:
1270 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1271 		 *	i.e. 4 * new mdev = 4 * old mdev +
1272 		 *		(abs(Error) - old mdev)
1273 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1274 		 */
1275 		if (m < 0)
1276 			m = -m;
1277 		m -= sv >> 2;
1278 		sv += m;
1279 	} else {
1280 		/* Initialization. This is the first response received. */
1281 		sa = (m << 3);
1282 		sv = (m << 1);
1283 	}
1284 
1285 	crtt = (sa >> 3) + sv;
1286 
1287 	if (debug & D_PROBE) {
1288 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
1289 		    "%d\n", saved_m, sa, sv, crtt);
1290 	}
1291 
1292 	*sap = sa;
1293 	*svp = sv;
1294 
1295 	/*
1296 	 * CRTT = average estimates  + 4 * deviation estimates
1297 	 *	= sa / 8 + sv
1298 	 */
1299 	return (crtt);
1300 }
1301 
1302 static void
1303 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
1304 {
1305 	struct phyint_instance *pii = tg->tg_phyint_inst;
1306 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1307 	int sa = tg->tg_rtt_sa;
1308 	int sv = tg->tg_rtt_sd;
1309 	int new_crtt;
1310 	int i;
1311 
1312 	if (debug & D_PROBE)
1313 		logdebug("pi_set_crtt: target -  m %d\n", m);
1314 
1315 	/* store the round trip time, in case we need to defer computation */
1316 	tg->tg_deferred[tg->tg_num_deferred] = m;
1317 
1318 	new_crtt = compute_crtt(&sa, &sv, m);
1319 
1320 	/*
1321 	 * If this probe's round trip time would singlehandedly cause an
1322 	 * increase in the group's probe interval consider it suspect.
1323 	 */
1324 	if ((new_crtt > probe_interval) && is_probe_uni) {
1325 		if (debug & D_PROBE) {
1326 			logdebug("Received a suspect probe on %s, new_crtt ="
1327 			    " %d, probe_interval = %d, num_deferred = %d\n",
1328 			    pii->pii_probe_logint->li_name, new_crtt,
1329 			    probe_interval, tg->tg_num_deferred);
1330 		}
1331 
1332 		/*
1333 		 * If we've deferred as many rtts as we plan on deferring, then
1334 		 * assume the link really did slow down and process all queued
1335 		 * rtts
1336 		 */
1337 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1338 			if (debug & D_PROBE) {
1339 				logdebug("Received MAXDEFERREDRTT probes which "
1340 				    "would cause an increased probe_interval.  "
1341 				    "Integrating queued rtt data points.\n");
1342 			}
1343 
1344 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1345 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
1346 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
1347 			}
1348 
1349 			tg->tg_num_deferred = 0;
1350 		} else {
1351 			tg->tg_num_deferred++;
1352 		}
1353 		return;
1354 	}
1355 
1356 	/*
1357 	 * If this is a normal probe, or an RTT probe that would lead to a
1358 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1359 	 * a normal probe, pitch any deferred probes since our probes are
1360 	 * again being answered within our CRTT estimates.
1361 	 */
1362 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1363 		tg->tg_rtt_sa = sa;
1364 		tg->tg_rtt_sd = sv;
1365 		tg->tg_crtt = new_crtt;
1366 		if (is_probe_uni)
1367 			tg->tg_num_deferred = 0;
1368 	}
1369 }
1370 
/*
 * Search the ancillary data of msg for an IPPROTO_IPV6 level option of
 * type cmsg_type. Returns a pointer to the data of the first such option,
 * or NULL if there is none.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == IPPROTO_IPV6 &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1389 
1390 /*
1391  * See if a previously failed interface has started working again.
1392  */
1393 void
1394 phyint_check_for_repair(struct phyint *pi)
1395 {
1396 	if (phyint_repaired(pi)) {
1397 		if (pi->pi_group == phyint_anongroup) {
1398 			logerr("NIC repair detected on %s\n", pi->pi_name);
1399 		} else {
1400 			logerr("NIC repair detected on %s of group %s\n",
1401 			    pi->pi_name, pi->pi_group->pg_name);
1402 		}
1403 
1404 		/*
1405 		 * If the interface is offline, just clear the FAILED flag,
1406 		 * delaying the state change and failback operation until it
1407 		 * is brought back online.
1408 		 */
1409 		if (pi->pi_state == PI_OFFLINE) {
1410 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1411 			return;
1412 		}
1413 
1414 		if (pi->pi_flags & IFF_STANDBY) {
1415 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1416 		} else {
1417 			if (try_failback(pi) != IPMP_FAILURE) {
1418 				(void) change_lif_flags(pi,
1419 				    IFF_FAILED, _B_FALSE);
1420 				/* Per state diagram */
1421 				pi->pi_empty = 0;
1422 			}
1423 		}
1424 
1425 		phyint_chstate(pi, PI_RUNNING);
1426 
1427 		if (GROUP_FAILED(pi->pi_group)) {
1428 			/*
1429 			 * This is the 1st phyint to receive a response
1430 			 * after group failure.
1431 			 */
1432 			logerr("At least 1 interface (%s) of group %s has "
1433 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
1434 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
1435 			/*
1436 			 * If this is the STANDBY phyint to be repaired after a
1437 			 * group failure. Move data addresses on other failed
1438 			 * phyints in the group to this one.
1439 			 */
1440 			if (pi->pi_flags & IFF_STANDBY) {
1441 				struct phyint *fpi = pi->pi_group->pg_phyint;
1442 				for (; fpi != NULL; fpi = fpi->pi_pgnext) {
1443 					if (fpi != pi) {
1444 						(void) try_failover(fpi,
1445 						    FAILOVER_NORMAL);
1446 					}
1447 				}
1448 			}
1449 		}
1450 	}
1451 }
1452 
1453 /*
1454  * See if a previously functioning interface has failed, or if the
1455  * whole group of interfaces has failed.
1456  */
1457 static void
1458 phyint_inst_check_for_failure(struct phyint_instance *pii)
1459 {
1460 	struct	phyint	*pi;
1461 	struct	phyint	*pi2;
1462 
1463 	pi = pii->pii_phyint;
1464 
1465 	switch (failure_state(pii)) {
1466 	case PHYINT_FAILURE:
1467 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1468 		if (pi->pi_group == phyint_anongroup) {
1469 			logerr("NIC failure detected on %s\n", pii->pii_name);
1470 		} else {
1471 			logerr("NIC failure detected on %s of group %s\n",
1472 			    pii->pii_name, pi->pi_group->pg_name);
1473 		}
1474 		/*
1475 		 * Do the failover, unless the interface is offline (in
1476 		 * which case we've already failed over).
1477 		 */
1478 		if (pi->pi_state != PI_OFFLINE) {
1479 			phyint_chstate(pi, PI_FAILED);
1480 			reset_crtt_all(pi);
1481 			if (!(pi->pi_flags & IFF_INACTIVE))
1482 				(void) try_failover(pi, FAILOVER_NORMAL);
1483 		}
1484 		break;
1485 
1486 	case GROUP_FAILURE:
1487 		logerr("All Interfaces in group %s have failed\n",
1488 		    pi->pi_group->pg_name);
1489 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
1490 		    pi2 = pi2->pi_pgnext) {
1491 			if (pi2->pi_flags & IFF_OFFLINE)
1492 				continue;
1493 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
1494 			reset_crtt_all(pi2);
1495 
1496 			/*
1497 			 * In the case of host targets, we
1498 			 * would have flushed the targets,
1499 			 * and gone to PI_NOTARGETS state.
1500 			 */
1501 			if (pi2->pi_state == PI_RUNNING)
1502 				phyint_chstate(pi2, PI_FAILED);
1503 
1504 			pi2->pi_empty = 0;
1505 			pi2->pi_full = 0;
1506 		}
1507 		break;
1508 
1509 	default:
1510 		break;
1511 	}
1512 }
1513 
1514 /*
1515  * Determines if any timeout event has occurred and returns the number of
1516  * milliseconds until the next timeout event for the phyint. Returns
1517  * TIMER_INFINITY for "never".
1518  */
1519 uint_t
1520 phyint_inst_timer(struct phyint_instance *pii)
1521 {
1522 	int 	pr_ndx;
1523 	uint_t	timeout;
1524 	struct	target	*cur_tg;
1525 	struct	probe_stats *pr_statp;
1526 	struct	phyint_instance *pii_other;
1527 	struct	phyint *pi;
1528 	int	valid_unack_count;
1529 	int	i;
1530 	int	interval;
1531 	uint_t	check_time;
1532 	uint_t	cur_time;
1533 	hrtime_t cur_hrtime;
1534 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1535 
1536 	cur_time = getcurrenttime();
1537 
1538 	if (debug & D_TIMER) {
1539 		logdebug("phyint_inst_timer(%s %s)\n",
1540 		    AF_STR(pii->pii_af), pii->pii_name);
1541 	}
1542 
1543 	pii_other = phyint_inst_other(pii);
1544 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1545 		/*
1546 		 * Check to see if we're here due to link up/down flapping; If
1547 		 * enough time has passed, then try to bring the interface
1548 		 * back up; otherwise, schedule a timer to bring it back up
1549 		 * when enough time *has* elapsed.
1550 		 */
1551 		pi = pii->pii_phyint;
1552 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1553 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1554 			if (check_time > cur_time)
1555 				return (check_time - cur_time);
1556 
1557 			phyint_check_for_repair(pi);
1558 		}
1559 	}
1560 
1561 	/*
1562 	 * If probing is not enabled on this phyint instance, don't proceed.
1563 	 */
1564 	if (!PROBE_ENABLED(pii))
1565 		return (TIMER_INFINITY);
1566 
1567 	/*
1568 	 * If the timer has fired too soon, probably triggered
1569 	 * by some other phyint instance, return the remaining
1570 	 * time
1571 	 */
1572 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1573 		return (pii->pii_snxt_time - cur_time);
1574 
1575 	/*
1576 	 * If the link is down, don't send any probes for now.
1577 	 */
1578 	if (LINK_DOWN(pii->pii_phyint))
1579 		return (TIMER_INFINITY);
1580 
1581 	/*
1582 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1583 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1584 	 * Base probe time is strictly periodic.
1585 	 */
1586 	interval = GET_RANDOM(
1587 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1588 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1589 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1590 
1591 	/*
1592 	 * Check if the current time > next time to probe. If so, we missed
1593 	 * sending 1 or more probes, probably due to heavy system load. At least
1594 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1595 	 * were scheduled. Make adjustments to the times, in multiples of
1596 	 * user_probe_interval.
1597 	 */
1598 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1599 		int n;
1600 
1601 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1602 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1603 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1604 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1605 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1606 		    pii->pii_snxt_basetime);
1607 
1608 		/* Collect statistics about missed probes */
1609 		probes_missed.pm_nprobes += n + 1;
1610 		probes_missed.pm_ntimes++;
1611 	}
1612 	pii->pii_snxt_basetime += user_probe_interval;
1613 	interval = pii->pii_snxt_time - cur_time;
1614 	if (debug & D_TARGET) {
1615 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1616 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1617 		    pii->pii_snxt_basetime, interval);
1618 	}
1619 
1620 	/*
1621 	 * If no targets are known, we need to send an ICMP multicast. The
1622 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1623 	 * to see if we found a target.
1624 	 */
1625 	if (pii->pii_target_next == NULL) {
1626 		assert(pii->pii_ntargets == 0);
1627 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1628 		probe(pii, PROBE_MULTI, cur_time);
1629 		return (interval);
1630 	}
1631 
1632 	if ((user_probe_interval != probe_interval) &&
1633 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1634 		/*
1635 		 * the failure detection (fd) probe timer has not yet fired.
1636 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1637 		 */
1638 		probe(pii, PROBE_RTT, cur_time);
1639 		return (interval);
1640 	}
1641 	/*
1642 	 * the fd probe timer has fired. Need to do all failure
1643 	 * detection / recovery calculations, and then send an fd probe
1644 	 * of type PROBE_UNI.
1645 	 */
1646 	if (user_probe_interval == probe_interval) {
1647 		/*
1648 		 * We could have missed some probes, and then adjusted
1649 		 * pii_snxt_basetime above. Otherwise we could have
1650 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1651 		 */
1652 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1653 	} else {
1654 		pii->pii_fd_snxt_basetime += probe_interval;
1655 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1656 			int n;
1657 
1658 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1659 			    probe_interval;
1660 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1661 		}
1662 	}
1663 
1664 	/*
1665 	 * We can have at most, the latest 2 probes that we sent, in
1666 	 * the PR_UNACKED state. All previous probes sent, are either
1667 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1668 	 * timed out if the probe's time_sent + the CRTT < currenttime.
1669 	 * For each of the last 2 probes, examine whether it has timed
1670 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1671 	 */
1672 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1673 	valid_unack_count = 0;
1674 
1675 	for (i = 0; i < 2; i++) {
1676 		pr_statp = &pii->pii_probes[pr_ndx];
1677 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1678 		switch (pr_statp->pr_status) {
1679 		case PR_ACKED:
1680 			/*
1681 			 * We received back an ACK, so the switch clearly
1682 			 * is not dropping our traffic, and thus we can
1683 			 * enable failure detection immediately.
1684 			 */
1685 			if (pii->pii_fd_hrtime > gethrtime()) {
1686 				if (debug & D_PROBE) {
1687 					logdebug("successful probe on %s; "
1688 					    "ending quiet period\n",
1689 					    pii->pii_phyint->pi_name);
1690 				}
1691 				pii->pii_fd_hrtime = gethrtime();
1692 			}
1693 			break;
1694 
1695 		case PR_UNACKED:
1696 			assert(cur_tg != NULL);
1697 			/*
1698 			 * The crtt could be zero for some reason,
1699 			 * Eg. the phyint could be failed. If the crtt is
1700 			 * not available use group's probe interval,
1701 			 * which is a worst case estimate.
1702 			 */
1703 			if (cur_tg->tg_crtt != 0) {
1704 				timeout = pr_statp->pr_time_sent +
1705 				    cur_tg->tg_crtt;
1706 			} else {
1707 				timeout = pr_statp->pr_time_sent +
1708 				    probe_interval;
1709 			}
1710 			if (TIME_LT(timeout, cur_time)) {
1711 				pr_statp->pr_status = PR_LOST;
1712 				pr_statp->pr_time_lost = timeout;
1713 			} else if (i == 1) {
1714 				/*
1715 				 * We are forced to consider this probe
1716 				 * lost, as we can have at most 2 unack.
1717 				 * probes any time, and we will be sending a
1718 				 * probe at the end of this function.
1719 				 * Normally, we should not be here, but
1720 				 * this can happen if an incoming response
1721 				 * that was considered lost has increased
1722 				 * the crtt for this target, and also bumped
1723 				 * up the FDT. Note that we never cancel or
1724 				 * increase the current pii_time_left, so
1725 				 * when the timer fires, we find 2 valid
1726 				 * unacked probes, and they are yet to timeout
1727 				 */
1728 				pr_statp->pr_status = PR_LOST;
1729 				pr_statp->pr_time_lost = cur_time;
1730 			} else {
1731 				/*
1732 				 * Only the most recent probe can enter
1733 				 * this 'else' arm. The second most recent
1734 				 * probe must take either of the above arms,
1735 				 * if it is unacked.
1736 				 */
1737 				valid_unack_count++;
1738 			}
1739 			break;
1740 		}
1741 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1742 	}
1743 
1744 	/*
1745 	 * We send out 1 probe randomly in the interval between one half
1746 	 * and one probe interval for the group. Given that the CRTT is always
1747 	 * less than the group's probe interval, we can have at most 1
1748 	 * unacknowledged probe now.  All previous probes are either lost or
1749 	 * acked.
1750 	 */
1751 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1752 
1753 	/*
1754 	 * The timer has fired. Take appropriate action depending
1755 	 * on the current state of the phyint.
1756 	 *
1757 	 * PI_RUNNING state 	- Failure detection and failover
1758 	 * PI_FAILED state 	- Repair detection and failback
1759 	 */
1760 	switch (pii->pii_phyint->pi_state) {
1761 	case PI_FAILED:
1762 		/*
1763 		 * If the most recent probe (excluding unacked probes that
1764 		 * are yet to time out) has been acked, check whether the
1765 		 * phyint is now repaired. If the phyint is repaired, then
1766 		 * attempt failback, unless it is an inactive standby.
1767 		 */
1768 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1769 			phyint_check_for_repair(pii->pii_phyint);
1770 		}
1771 		break;
1772 
1773 	case PI_RUNNING:
1774 		/*
1775 		 * It's possible our probes have been lost because of a
1776 		 * spanning-tree mandated quiet period on the switch.  If so,
1777 		 * ignore the lost probes and consider the interface to still
1778 		 * be functioning.
1779 		 */
1780 		cur_hrtime = gethrtime();
1781 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1782 			break;
1783 
1784 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1785 			/*
1786 			 * We have 1 or more failed probes (excluding unacked
1787 			 * probes that are yet to time out). Determine if the
1788 			 * phyint has failed. If so attempt a failover,
1789 			 * unless it is an inactive standby
1790 			 */
1791 			phyint_inst_check_for_failure(pii);
1792 		}
1793 		break;
1794 
1795 	default:
1796 		logerr("phyint_inst_timer: invalid state %d\n",
1797 		    pii->pii_phyint->pi_state);
1798 		abort();
1799 	}
1800 
1801 	/*
1802 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1803 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1804 	 * was called, the target list may be empty.
1805 	 */
1806 	if (pii->pii_target_next != NULL) {
1807 		probe(pii, PROBE_UNI, cur_time);
1808 		/*
1809 		 * If we have just the one probe target, and we're not using
1810 		 * router targets, try to find another as we presently have
1811 		 * no resilience.
1812 		 */
1813 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1814 			probe(pii, PROBE_MULTI, cur_time);
1815 	} else {
1816 		probe(pii, PROBE_MULTI, cur_time);
1817 	}
1818 	return (interval);
1819 }
1820 
1821 /*
1822  * Start the probe timer for an interface instance.
1823  */
1824 void
1825 start_timer(struct phyint_instance *pii)
1826 {
1827 	uint32_t interval;
1828 
1829 	/*
1830 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1831 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1832 	 * pi_snxt_basetime is strictly periodic with a frequency of
1833 	 * the group's probe interval. The actual probe time pi_snxt_time
1834 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1835 	 * For the 1st probe on each phyint after the timer is started,
1836 	 * pi_snxt_time and pi_snxt_basetime are the same.
1837 	 */
1838 	interval = GET_RANDOM(0,
1839 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1840 
1841 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1842 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1843 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1844 	timer_schedule(interval);
1845 }
1846 
1847 /*
1848  * Restart the probe timer on an interface instance.
1849  */
1850 static void
1851 restart_timer(struct phyint_instance *pii)
1852 {
1853 	/*
1854 	 * We don't need to restart the timer if it was never started in
1855 	 * the first place (pii->pii_basetime_inited not set), as the timer
1856 	 * won't have gone off yet.
1857 	 */
1858 	if (pii->pii_basetime_inited != 0) {
1859 
1860 		if (debug & D_LINKNOTE)
1861 			logdebug("restart timer: restarting timer on %s, "
1862 			    "address family %s\n", pii->pii_phyint->pi_name,
1863 			    AF_STR(pii->pii_af));
1864 
1865 		start_timer(pii);
1866 	}
1867 }
1868 
1869 static void
1870 process_link_state_down(struct phyint *pi)
1871 {
1872 	logerr("The link has gone down on %s\n", pi->pi_name);
1873 
1874 	/*
1875 	 * Clear the probe statistics arrays, we don't want the repair
1876 	 * detection logic relying on probes that were succesful prior
1877 	 *  to the link going down.
1878 	 */
1879 	if (PROBE_CAPABLE(pi->pi_v4))
1880 		clear_pii_probe_stats(pi->pi_v4);
1881 	if (PROBE_CAPABLE(pi->pi_v6))
1882 		clear_pii_probe_stats(pi->pi_v6);
1883 	/*
1884 	 * Check for interface failure.  Although we know the interface
1885 	 * has failed, we don't know if all the other interfaces in the
1886 	 * group have failed as well.
1887 	 */
1888 	if ((pi->pi_state == PI_RUNNING) ||
1889 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1890 		if (debug & D_LINKNOTE) {
1891 			logdebug("process_link_state_down:"
1892 			    " checking for failure on %s\n", pi->pi_name);
1893 		}
1894 
1895 		if (pi->pi_v4 != NULL)
1896 			phyint_inst_check_for_failure(pi->pi_v4);
1897 		else if (pi->pi_v6 != NULL)
1898 			phyint_inst_check_for_failure(pi->pi_v6);
1899 	}
1900 }
1901 
1902 static void
1903 process_link_state_up(struct phyint *pi)
1904 {
1905 	logerr("The link has come up on %s\n", pi->pi_name);
1906 
1907 	/*
1908 	 * We stopped any running timers on each instance when the link
1909 	 * went down, so restart them.
1910 	 */
1911 	if (pi->pi_v4)
1912 		restart_timer(pi->pi_v4);
1913 	if (pi->pi_v6)
1914 		restart_timer(pi->pi_v6);
1915 
1916 	phyint_check_for_repair(pi);
1917 
1918 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1919 	if (pi->pi_whendx == LINK_UP_PERMIN)
1920 		pi->pi_whendx = 0;
1921 }
1922 
1923 /*
1924  * Process any changes in link state passed up from the interfaces.
1925  */
1926 void
1927 process_link_state_changes(void)
1928 {
1929 	struct phyint *pi;
1930 
1931 	/* Look for interfaces where the link state has just changed */
1932 
1933 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1934 		boolean_t old_link_state_up = LINK_UP(pi);
1935 
1936 		/*
1937 		 * Except when the "phyint" structure is created, this is
1938 		 * the only place the link state is updated.  This allows
1939 		 * this routine to detect changes in link state, rather
1940 		 * than just the current state.
1941 		 */
1942 		UPDATE_LINK_STATE(pi);
1943 
1944 		if (LINK_DOWN(pi)) {
1945 			/*
1946 			 * Has link just gone down?
1947 			 */
1948 			if (old_link_state_up)
1949 				process_link_state_down(pi);
1950 		} else {
1951 			/*
1952 			 * Has link just gone back up?
1953 			 */
1954 			if (!old_link_state_up)
1955 				process_link_state_up(pi);
1956 		}
1957 	}
1958 }
1959 
1960 void
1961 reset_crtt_all(struct phyint *pi)
1962 {
1963 	struct phyint_instance *pii;
1964 	struct target *tg;
1965 
1966 	pii = pi->pi_v4;
1967 	if (pii != NULL) {
1968 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1969 			tg->tg_crtt = 0;
1970 			tg->tg_rtt_sa = -1;
1971 			tg->tg_rtt_sd = 0;
1972 		}
1973 	}
1974 
1975 	pii = pi->pi_v6;
1976 	if (pii != NULL) {
1977 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1978 			tg->tg_crtt = 0;
1979 			tg->tg_rtt_sa = -1;
1980 			tg->tg_rtt_sd = 0;
1981 		}
1982 	}
1983 }
1984 
1985 /*
1986  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
1987  * probes on both instances IPv4 and IPv6.
1988  * If the interface has failed, return the time of the first probe failure
1989  * in "tff".
1990  */
1991 static int
1992 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
1993 {
1994 	uint_t	pi_tff;
1995 	struct	target *cur_tg;
1996 	struct	probe_fail_count pfinfo;
1997 	struct	phyint_instance *pii_other;
1998 	int	pr_ndx;
1999 
2000 	/*
2001 	 * Get the number of consecutive failed probes on
2002 	 * this phyint across all targets. Also get the number
2003 	 * of consecutive failed probes on this target only
2004 	 */
2005 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2006 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
2007 	probe_fail_info(pii, cur_tg, &pfinfo);
2008 
2009 	/* Get the time of first failure, for later use */
2010 	pi_tff = pfinfo.pf_tff;
2011 
2012 	/*
2013 	 * If the current target has not responded to the
2014 	 * last NUM_PROBE_FAILS probes, and other targets are
2015 	 * responding delete this target. Dead gateway detection
2016 	 * will eventually remove this target (if router) from the
2017 	 * routing tables. If that does not occur, we may end
2018 	 * up adding this to our list again.
2019 	 */
2020 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2021 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2022 		if (pii->pii_targets_are_routers) {
2023 			if (cur_tg->tg_status == TG_ACTIVE)
2024 				pii->pii_ntargets--;
2025 			cur_tg->tg_status = TG_DEAD;
2026 			cur_tg->tg_crtt = 0;
2027 			cur_tg->tg_rtt_sa = -1;
2028 			cur_tg->tg_rtt_sd = 0;
2029 			if (pii->pii_target_next == cur_tg)
2030 				pii->pii_target_next = target_next(cur_tg);
2031 		} else {
2032 			target_delete(cur_tg);
2033 			probe(pii, PROBE_MULTI, getcurrenttime());
2034 		}
2035 		return (PHYINT_OK);
2036 	}
2037 
2038 	/*
2039 	 * If the phyint has lost NUM_PROBE_FAILS or more
2040 	 * consecutive probes, on both IPv4 and IPv6 protocol
2041 	 * instances of the phyint, then trigger failure
2042 	 * detection, else return false
2043 	 */
2044 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2045 		return (PHYINT_OK);
2046 
2047 	pii_other = phyint_inst_other(pii);
2048 	if (PROBE_CAPABLE(pii_other)) {
2049 		probe_fail_info(pii_other, NULL, &pfinfo);
2050 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2051 			/*
2052 			 * We have NUM_PROBE_FAILS or more failures
2053 			 * on both IPv4 and IPv6. Get the earliest
2054 			 * time when failure was detected on this
2055 			 * phyint across IPv4 and IPv6.
2056 			 */
2057 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2058 				pi_tff = pfinfo.pf_tff;
2059 		} else {
2060 			/*
2061 			 * This instance has < NUM_PROBE_FAILS failure.
2062 			 * So return false
2063 			 */
2064 			return (PHYINT_OK);
2065 		}
2066 	}
2067 	*tff = pi_tff;
2068 	return (PHYINT_FAILURE);
2069 }
2070 
2071 /*
2072  * Check if the link has gone down on this phyint, or it has failed the
2073  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2074  * Also look at other phyints of this group, for group failures.
2075  */
2076 int
2077 failure_state(struct phyint_instance *pii)
2078 {
2079 	struct	probe_success_count psinfo;
2080 	uint_t	pi2_tls;		/* time last success */
2081 	uint_t	pi_tff;			/* time first fail */
2082 	struct	phyint	*pi2;
2083 	struct	phyint *pi;
2084 	struct	phyint_instance *pii2;
2085 	struct  phyint_group *pg;
2086 	boolean_t alone;
2087 
2088 	if (debug & D_FAILOVER)
2089 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2090 
2091 	pi = pii->pii_phyint;
2092 	pg = pi->pi_group;
2093 
2094 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2095 		PHYINT_OK)
2096 		return (PHYINT_OK);
2097 
2098 	/*
2099 	 * At this point, the link is down, or the phyint is suspect,
2100 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
2101 	 * does not belong to any group, or is the only member of the
2102 	 * group capable of being probed, return PHYINT_FAILURE.
2103 	 */
2104 	alone = _B_TRUE;
2105 	if (pg != phyint_anongroup) {
2106 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2107 			if (pi2 == pi)
2108 				continue;
2109 			if (PROBE_CAPABLE(pi2->pi_v4) ||
2110 			    PROBE_CAPABLE(pi2->pi_v6)) {
2111 				alone = _B_FALSE;
2112 				break;
2113 			}
2114 		}
2115 	}
2116 	if (alone)
2117 		return (PHYINT_FAILURE);
2118 
2119 	/*
2120 	 * Need to compare against other phyints of the same group
2121 	 * to exclude group failures. If the failure was detected via
2122 	 * probing, then if the time of last success (tls) of any
2123 	 * phyint is more recent than the time of first fail (tff) of the
2124 	 * phyint in question, and the link is up on the phyint,
2125 	 * then it is a phyint failure. Otherwise it is a group failure.
2126 	 * If failure was detected via a link down notification sent from
2127 	 * the driver to IP, we see if any phyints in the group are still
2128 	 * running and haven't received a link down notification.  We
2129 	 * will usually be processing the link down notification shortly
2130 	 * after it was received, so there is no point looking at the tls
2131 	 * of other phyints.
2132 	 */
2133 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2134 		/* Exclude ourself from comparison */
2135 		if (pi2 == pi)
2136 			continue;
2137 
2138 		if (LINK_DOWN(pi)) {
2139 			/*
2140 			 * We use FLAGS_TO_LINK_STATE() to test the
2141 			 * flags directly, rather then LINK_UP() or
2142 			 * LINK_DOWN(), as we may not have got round
2143 			 * to processing the link state for the other
2144 			 * phyints in the group yet.
2145 			 *
2146 			 * The check for PI_RUNNING and group
2147 			 * failure handles the case when the
2148 			 * group begins to recover.  The first
2149 			 * phyint to recover should not trigger
2150 			 * a failover from the soon-to-recover
2151 			 * other phyints to the first recovered
2152 			 * phyint. PI_RUNNING will be set, and
2153 			 * pg_groupfailed cleared only after
2154 			 * receipt of NUM_PROBE_REPAIRS, by
2155 			 * which time the other phyints should
2156 			 * have received at least 1 packet,
2157 			 * and so will not have NUM_PROBE_FAILS.
2158 			 */
2159 			if ((pi2->pi_state == PI_RUNNING) &&
2160 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
2161 				return (PHYINT_FAILURE);
2162 		} else {
2163 			/*
2164 			 * Need to compare against both IPv4 and
2165 			 * IPv6 instances.
2166 			 */
2167 			pii2 = pi2->pi_v4;
2168 			if (pii2 != NULL) {
2169 				probe_success_info(pii2, NULL, &psinfo);
2170 				if (psinfo.ps_tls_valid) {
2171 					pi2_tls = psinfo.ps_tls;
2172 					/*
2173 					 * See comment above regarding check
2174 					 * for PI_RUNNING and group failure.
2175 					 */
2176 					if (TIME_GT(pi2_tls, pi_tff) &&
2177 					    (pi2->pi_state == PI_RUNNING) &&
2178 					    !GROUP_FAILED(pg) &&
2179 					    FLAGS_TO_LINK_STATE(pi2))
2180 						return (PHYINT_FAILURE);
2181 				}
2182 			}
2183 
2184 			pii2 = pi2->pi_v6;
2185 			if (pii2 != NULL) {
2186 				probe_success_info(pii2, NULL, &psinfo);
2187 				if (psinfo.ps_tls_valid) {
2188 					pi2_tls = psinfo.ps_tls;
2189 					/*
2190 					 * See comment above regarding check
2191 					 * for PI_RUNNING and group failure.
2192 					 */
2193 					if (TIME_GT(pi2_tls, pi_tff) &&
2194 					    (pi2->pi_state == PI_RUNNING) &&
2195 					    !GROUP_FAILED(pg) &&
2196 					    FLAGS_TO_LINK_STATE(pi2))
2197 						return (PHYINT_FAILURE);
2198 				}
2199 			}
2200 		}
2201 	}
2202 
2203 	/*
2204 	 * Change the group state to PG_FAILED if it's not already.
2205 	 */
2206 	if (!GROUP_FAILED(pg))
2207 		phyint_group_chstate(pg, PG_FAILED);
2208 
2209 	return (GROUP_FAILURE);
2210 }
2211 
2212 /*
2213  * Return the information associated with consecutive probe successes
2214  * starting with the most recent probe. At most the last 2 probes can be
2215  * in the unacknowledged state. All previous probes have either failed
2216  * or succeeded.
2217  */
2218 static void
2219 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2220     struct probe_success_count *psinfo)
2221 {
2222 	uint_t	i;
2223 	struct probe_stats *pr_statp;
2224 	uint_t most_recent;
2225 	uint_t second_most_recent;
2226 	boolean_t pi_found_failure = _B_FALSE;
2227 	boolean_t tg_found_failure = _B_FALSE;
2228 	uint_t now;
2229 	uint_t timeout;
2230 	struct target *tg;
2231 
2232 	if (debug & D_FAILOVER)
2233 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2234 
2235 	bzero(psinfo, sizeof (*psinfo));
2236 	now = getcurrenttime();
2237 
2238 	/*
2239 	 * Start with the most recent probe, and count the number
2240 	 * of consecutive probe successes. Latch the number of successes
2241 	 * on hitting a failure.
2242 	 */
2243 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2244 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2245 
2246 	for (i = most_recent; i != pii->pii_probe_next;
2247 	    i = PROBE_INDEX_PREV(i)) {
2248 		pr_statp = &pii->pii_probes[i];
2249 
2250 		switch (pr_statp->pr_status) {
2251 		case PR_UNACKED:
2252 			/*
2253 			 * Only the most recent 2 probes can be unacknowledged
2254 			 */
2255 			assert(i == most_recent || i == second_most_recent);
2256 
2257 			tg = pr_statp->pr_target;
2258 			assert(tg != NULL);
2259 			/*
2260 			 * The crtt could be zero for some reason,
2261 			 * Eg. the phyint could be failed. If the crtt is
2262 			 * not available use the value of the group's probe
2263 			 * interval which is a worst case estimate.
2264 			 */
2265 			if (tg->tg_crtt != 0) {
2266 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2267 			} else {
2268 				timeout = pr_statp->pr_time_sent +
2269 				    pii->pii_phyint->pi_group->pg_probeint;
2270 			}
2271 
2272 			if (TIME_LT(timeout, now)) {
2273 				/*
2274 				 * We hit a failure. Latch the total number of
2275 				 * recent consecutive successes.
2276 				 */
2277 				pr_statp->pr_time_lost = timeout;
2278 				pr_statp->pr_status = PR_LOST;
2279 				pi_found_failure = _B_TRUE;
2280 				if (cur_tg != NULL && tg == cur_tg) {
2281 					/*
2282 					 * We hit a failure for the desired
2283 					 * target. Latch the number of recent
2284 					 * consecutive successes for this target
2285 					 */
2286 					tg_found_failure = _B_TRUE;
2287 				}
2288 			}
2289 			break;
2290 
2291 		case PR_ACKED:
2292 			/*
2293 			 * Bump up the count of probe successes, if we
2294 			 * have not seen any failure so far.
2295 			 */
2296 			if (!pi_found_failure)
2297 				psinfo->ps_nsucc++;
2298 
2299 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2300 			    !tg_found_failure) {
2301 				psinfo->ps_nsucc_tg++;
2302 			}
2303 
2304 			/*
2305 			 * Record the time of last success, if this is
2306 			 * the most recent probe success.
2307 			 */
2308 			if (!psinfo->ps_tls_valid) {
2309 				psinfo->ps_tls = pr_statp->pr_time_acked;
2310 				psinfo->ps_tls_valid = _B_TRUE;
2311 			}
2312 			break;
2313 
2314 		case PR_LOST:
2315 			/*
2316 			 * We hit a failure. Latch the total number of
2317 			 * recent consecutive successes.
2318 			 */
2319 			pi_found_failure = _B_TRUE;
2320 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2321 				/*
2322 				 * We hit a failure for the desired target.
2323 				 * Latch the number of recent consecutive
2324 				 * successes for this target
2325 				 */
2326 				tg_found_failure = _B_TRUE;
2327 			}
2328 			break;
2329 
2330 		default:
2331 			return;
2332 
2333 		}
2334 	}
2335 }
2336 
2337 /*
2338  * Return the information associated with consecutive probe failures
2339  * starting with the most recent probe. Only the last 2 probes can be in the
2340  * unacknowledged state. All previous probes have either failed or succeeded.
2341  */
2342 static void
2343 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2344     struct probe_fail_count *pfinfo)
2345 {
2346 	int	i;
2347 	struct probe_stats *pr_statp;
2348 	boolean_t	tg_found_success = _B_FALSE;
2349 	boolean_t	pi_found_success = _B_FALSE;
2350 	int	most_recent;
2351 	int	second_most_recent;
2352 	uint_t	now;
2353 	uint_t	timeout;
2354 	struct	target *tg;
2355 
2356 	if (debug & D_FAILOVER)
2357 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2358 
2359 	bzero(pfinfo, sizeof (*pfinfo));
2360 	now = getcurrenttime();
2361 
2362 	/*
2363 	 * Start with the most recent probe, and count the number
2364 	 * of consecutive probe failures. Latch the number of failures
2365 	 * on hitting a probe success.
2366 	 */
2367 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2368 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2369 
2370 	for (i = most_recent; i != pii->pii_probe_next;
2371 	    i = PROBE_INDEX_PREV(i)) {
2372 		pr_statp = &pii->pii_probes[i];
2373 
2374 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2375 
2376 		switch (pr_statp->pr_status) {
2377 		case PR_UNACKED:
2378 			/*
2379 			 * Only the most recent 2 probes can be unacknowledged
2380 			 */
2381 			assert(i == most_recent || i == second_most_recent);
2382 
2383 			tg = pr_statp->pr_target;
2384 			/*
2385 			 * Target is guaranteed to exist in the unack. state
2386 			 */
2387 			assert(tg != NULL);
2388 			/*
2389 			 * The crtt could be zero for some reason,
2390 			 * Eg. the phyint could be failed. If the crtt is
2391 			 * not available use the group's probe interval,
2392 			 * which is a worst case estimate.
2393 			 */
2394 			if (tg->tg_crtt != 0) {
2395 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2396 			} else {
2397 				timeout = pr_statp->pr_time_sent +
2398 				    pii->pii_phyint->pi_group->pg_probeint;
2399 			}
2400 
2401 			if (TIME_GT(timeout, now))
2402 				break;
2403 
2404 			pr_statp->pr_time_lost = timeout;
2405 			pr_statp->pr_status = PR_LOST;
2406 			/* FALLTHRU */
2407 
2408 		case PR_LOST:
2409 			if (!pi_found_success) {
2410 				pfinfo->pf_nfail++;
2411 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2412 			}
2413 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2414 			    !tg_found_success)  {
2415 				pfinfo->pf_nfail_tg++;
2416 			}
2417 			break;
2418 
2419 		default:
2420 			/*
2421 			 * We hit a success or unused slot. Latch the
2422 			 * total number of recent consecutive failures.
2423 			 */
2424 			pi_found_success = _B_TRUE;
2425 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2426 				/*
2427 				 * We hit a success for the desired target.
2428 				 * Latch the number of recent consecutive
2429 				 * failures for this target
2430 				 */
2431 				tg_found_success = _B_TRUE;
2432 			}
2433 		}
2434 	}
2435 }
2436 
2437 /*
2438  * Check if the phyint has been repaired.  If no test address has been
2439  * configured, then consider the interface repaired if the link is up (unless
2440  * the link is flapping; see below).  Otherwise, look for proof of probes
2441  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2442  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2443  */
2444 static boolean_t
2445 phyint_repaired(struct phyint *pi)
2446 {
2447 	struct	probe_success_count psinfo;
2448 	struct	phyint_instance *pii;
2449 	struct	target *cur_tg;
2450 	int	pr_ndx;
2451 	uint_t	cur_time;
2452 
2453 	if (debug & D_FAILOVER)
2454 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2455 
2456 	if (LINK_DOWN(pi))
2457 		return (_B_FALSE);
2458 
2459 	/*
2460 	 * If we don't have any test addresses and the link is up, then
2461 	 * consider the interface repaired, unless we've received more than
2462 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2463 	 * which case we keep the link down until we drop back below
2464 	 * the threshold.
2465 	 */
2466 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2467 		cur_time = getcurrenttime();
2468 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2469 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2470 			pi->pi_lfmsg_printed = 0;
2471 			return (_B_TRUE);
2472 		}
2473 		if (!pi->pi_lfmsg_printed) {
2474 			logerr("The link has come up on %s more than %d times "
2475 			    "in the last minute; disabling failback until it "
2476 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2477 			pi->pi_lfmsg_printed = 1;
2478 		}
2479 
2480 		return (_B_FALSE);
2481 	}
2482 
2483 	pii = pi->pi_v4;
2484 	if (PROBE_CAPABLE(pii)) {
2485 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2486 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2487 		probe_success_info(pii, cur_tg, &psinfo);
2488 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2489 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2490 			return (_B_TRUE);
2491 	}
2492 
2493 	pii = pi->pi_v6;
2494 	if (PROBE_CAPABLE(pii)) {
2495 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2496 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2497 		probe_success_info(pii, cur_tg, &psinfo);
2498 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2499 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2500 			return (_B_TRUE);
2501 	}
2502 
2503 	return (_B_FALSE);
2504 }
2505 
2506 /*
2507  * Try failover from phyint 'pi' to a suitable destination.
2508  */
2509 int
2510 try_failover(struct phyint *pi, int failover_type)
2511 {
2512 	struct phyint *dst;
2513 	int err;
2514 
2515 	if (debug & D_FAILOVER)
2516 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
2517 
2518 	/*
2519 	 * Attempt to find a failover destination 'dst'.
2520 	 * dst will be null if any of the following is true
2521 	 * Phyint is not part of a group  OR
2522 	 * Phyint is the only member of a group OR
2523 	 * No suitable failover dst was available
2524 	 */
2525 	dst = get_failover_dst(pi, failover_type);
2526 	if (dst == NULL)
2527 		return (IPMP_EMINRED);
2528 
2529 	dst->pi_empty = 0;			/* Per state diagram */
2530 	pi->pi_full = 0;			/* Per state diagram */
2531 
2532 	err = failover(pi, dst);
2533 
2534 	if (debug & D_FAILOVER) {
2535 		logdebug("failed over from %s to %s ret %d\n",
2536 		    pi->pi_name, dst->pi_name, err);
2537 	}
2538 	if (err == 0) {
2539 		pi->pi_empty = 1;		/* Per state diagram */
2540 		/*
2541 		 * we don't want to print out this message if a
2542 		 * phyint is leaving the group, nor for failover from
2543 		 * standby
2544 		 */
2545 		if (failover_type == FAILOVER_NORMAL) {
2546 			logerr("Successfully failed over from NIC %s to NIC "
2547 			    "%s\n", pi->pi_name, dst->pi_name);
2548 		}
2549 		return (0);
2550 	} else {
2551 		/*
2552 		 * The failover did not succeed. We must retry the failover
2553 		 * only after resyncing our state based on the kernel's.
2554 		 * For eg. either the src or the dst might have been unplumbed
2555 		 * causing this failure. initifs() will be called again,
2556 		 * from main, since full_scan_required has been set to true
2557 		 * by failover();
2558 		 */
2559 		return (IPMP_FAILURE);
2560 	}
2561 }
2562 
/*
 * global_errno captures the errno value, if failover() or failback()
 * fails. This is sent to if_mpadm(1M).
 *
 * It is only written when the corresponding ioctl returns an error; a
 * successful failover()/failback() leaves the previous value in place.
 */
int global_errno;
2568 
2569 /*
2570  * Attempt failover from phyint 'from' to phyint 'to'.
2571  * IP moves everything from phyint 'from' to phyint 'to'.
2572  */
2573 static int
2574 failover(struct phyint *from, struct phyint *to)
2575 {
2576 	struct	lifreq	lifr;
2577 	int 	ret;
2578 
2579 	if (debug & D_FAILOVER) {
2580 		logdebug("failing over from %s to %s\n",
2581 		    from->pi_name, to->pi_name);
2582 	}
2583 
2584 	/*
2585 	 * Perform the failover. Both IPv4 and IPv6 are failed over
2586 	 * using a single ioctl by passing in AF_UNSPEC family.
2587 	 */
2588 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2589 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2590 	lifr.lifr_movetoindex = to->pi_ifindex;
2591 
2592 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
2593 	if (ret < 0) {
2594 		global_errno = errno;
2595 		logperror("failover: ioctl (failover)");
2596 	}
2597 
2598 	/*
2599 	 * Set full_scan_required to true. This will make us read
2600 	 * the state from the kernel in initifs() and update our tables,
2601 	 * to reflect the current state after the failover. If the
2602 	 * failover has failed it will then reissue the failover.
2603 	 */
2604 	full_scan_required = _B_TRUE;
2605 	return (ret);
2606 }
2607 
2608 /*
2609  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
2610  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
2611  * Return values:
2612  * IPMP_SUCCESS:		Failback successful from each of the other
2613  *				phyints in the group.
2614  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
2615  *				phyints in the group.
2616  * IPMP_FAILURE:		Failback syscall failed with some error.
2617  *
2618  * Note that failback is attempted regardless of the setting of the
2619  * failback_enabled flag.
2620  */
2621 int
2622 do_failback(struct phyint *pi)
2623 {
2624 	struct  phyint *from;
2625 	boolean_t done;
2626 	boolean_t partial;
2627 	boolean_t attempted_failback = _B_FALSE;
2628 
2629 	if (debug & D_FAILOVER)
2630 		logdebug("do_failback(%s)\n", pi->pi_name);
2631 
2632 	/* If this phyint is not part of a named group, return. */
2633 	if (pi->pi_group == phyint_anongroup) {
2634 		pi->pi_full = 1;
2635 		return (IPMP_SUCCESS);
2636 	}
2637 
2638 	/*
2639 	 * Attempt failback from every phyint in the group to 'pi'.
2640 	 * The reason for doing this, instead of only from the
2641 	 * phyint to which we did the failover is given below.
2642 	 *
2643 	 * After 'pi' failed, if any app. tries to join on a multicast
2644 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
2645 	 * non-failed phyint in the group, instead of the failed phyint,
2646 	 * in.mpathd is not aware of this. Thus failing back only from the
2647 	 * interface to which 'pi' failed over, will failback the ipif's
2648 	 * but not the ilm's. So we need to failback from all members of
2649 	 * the phyint group
2650 	 */
2651 	done = _B_TRUE;
2652 	partial = _B_FALSE;
2653 	for (from = pi->pi_group->pg_phyint; from != NULL;
2654 	    from = from->pi_pgnext) {
2655 		/* Exclude ourself as a failback src */
2656 		if (from == pi)
2657 			continue;
2658 
2659 		/*
2660 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
2661 		 * phyint must also have IPv4 plumbed. Similar check
2662 		 * for IPv6. IP makes the same check. Otherwise the
2663 		 * failback will fail.
2664 		 */
2665 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
2666 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
2667 			partial = _B_TRUE;
2668 			continue;
2669 		}
2670 
2671 		pi->pi_empty = 0;	/* Per state diagram */
2672 		attempted_failback = _B_TRUE;
2673 		if (failback(from, pi) != 0) {
2674 			done = _B_FALSE;
2675 			break;
2676 		}
2677 	}
2678 
2679 	/*
2680 	 * We are done. No more phyint from which we can src the failback
2681 	 */
2682 	if (done) {
2683 		if (!partial)
2684 			pi->pi_full = 1;	/* Per state diagram */
2685 		/*
2686 		 * Don't print out a message unless there is a
2687 		 * transition from FAILED to RUNNING. For eg.
2688 		 * we don't want to print out this message if a
2689 		 * phyint is leaving the group, or at startup
2690 		 */
2691 		if (attempted_failback && (pi->pi_flags &
2692 		    (IFF_FAILED | IFF_OFFLINE))) {
2693 			logerr("Successfully failed back to NIC %s\n",
2694 			    pi->pi_name);
2695 		}
2696 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2697 	}
2698 
2699 	return (IPMP_FAILURE);
2700 }
2701 
2702 /*
2703  * This function is similar to do_failback() above, but respects the
2704  * failback_enabled flag for phyints in named groups.
2705  */
2706 int
2707 try_failback(struct phyint *pi)
2708 {
2709 	if (debug & D_FAILOVER)
2710 		logdebug("try_failback(%s)\n", pi->pi_name);
2711 
2712 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
2713 		return (IPMP_EFBDISABLED);
2714 
2715 	return (do_failback(pi));
2716 }
2717 
2718 /*
2719  * Failback everything from phyint 'from' that has the same ifindex
2720  * as phyint to's ifindex.
2721  */
2722 static int
2723 failback(struct phyint *from, struct phyint *to)
2724 {
2725 	struct lifreq lifr;
2726 	int ret;
2727 
2728 	if (debug & D_FAILOVER)
2729 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
2730 
2731 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2732 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2733 	lifr.lifr_movetoindex = to->pi_ifindex;
2734 
2735 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
2736 	if (ret < 0) {
2737 		global_errno = errno;
2738 		logperror("failback: ioctl (failback)");
2739 	}
2740 
2741 	/*
2742 	 * Set full_scan_required to true. This will make us read
2743 	 * the state from the kernel in initifs() and update our tables,
2744 	 * to reflect the current state after the failback. If the
2745 	 * failback has failed it will then reissue the failback.
2746 	 */
2747 	full_scan_required = _B_TRUE;
2748 
2749 	return (ret);
2750 }
2751 
2752 /*
2753  * Select a target phyint for failing over from 'pi'.
2754  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
2755  * target phyint is chosen as follows,
2756  *	1. Pick any inactive standby interface.
2757  *	2. If no inactive standby is available, select any phyint in the
2758  *	   same group that has the least number of logints, (excluding
2759  *	   IFF_NOFAILOVER and !IFF_UP logints)
2760  * If we are failing over from a standby, failover_type is
2761  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
2762  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
2763  * and we won't return NULL, as long as there is at least 1 other phyint
2764  * in the group.
2765  */
2766 static struct phyint *
2767 get_failover_dst(struct phyint *pi, int failover_type)
2768 {
2769 	struct phyint	*maybe = NULL;
2770 	struct phyint	*pi2;
2771 	struct phyint 	*last_choice = NULL;
2772 
2773 	if (pi->pi_group == phyint_anongroup)
2774 		return (NULL);
2775 
2776 	/*
2777 	 * Loop thru the phyints in the group, and pick the preferred
2778 	 * phyint for the target.
2779 	 */
2780 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2781 		/* Exclude ourself and offlined interfaces */
2782 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
2783 			continue;
2784 
2785 		/*
2786 		 * The chosen target phyint must have IPv4 instance
2787 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
2788 		 * for IPv6.
2789 		 */
2790 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
2791 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
2792 			continue;
2793 
2794 		/* The chosen target must be PI_RUNNING. */
2795 		if (pi2->pi_state != PI_RUNNING) {
2796 			last_choice = pi2;
2797 			continue;
2798 		}
2799 
2800 		if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
2801 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
2802 			return (pi2);
2803 		} else {
2804 			if (maybe == NULL)
2805 				maybe = pi2;
2806 			else if (logint_upcount(pi2) < logint_upcount(maybe))
2807 				maybe = pi2;
2808 		}
2809 	}
2810 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
2811 		return (last_choice);
2812 	else
2813 		return (maybe);
2814 }
2815 
2816 /*
2817  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2818  */
2819 boolean_t
2820 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
2821 {
2822 	int ifsock;
2823 	struct lifreq lifr;
2824 
2825 	if (debug & D_FAILOVER) {
2826 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
2827 		    pi->pi_name, flags, (int)setfl);
2828 	}
2829 
2830 	if (pi->pi_v4 != NULL) {
2831 		ifsock = ifsock_v4;
2832 	} else  {
2833 		ifsock = ifsock_v6;
2834 	}
2835 
2836 	/*
2837 	 * Get the current flags from the kernel, and set/clear the
2838 	 * desired phyint flags. Since we set only phyint flags, we can
2839 	 * do it on either IPv4 or IPv6 instance.
2840 	 */
2841 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2842 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
2843 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2844 		if (errno != ENXIO)
2845 			logperror("change_lif_flags: ioctl (get flags)");
2846 		return (_B_FALSE);
2847 	}
2848 	if (setfl)
2849 		lifr.lifr_flags |= flags;
2850 	else
2851 		lifr.lifr_flags &= ~flags;
2852 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2853 		if (errno != ENXIO)
2854 			logperror("change_lif_flags: ioctl (set flags)");
2855 		return (_B_FALSE);
2856 	}
2857 
2858 	/*
2859 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2860 	 * phyint flags.
2861 	 */
2862 	if (setfl)
2863 		pi->pi_flags |= flags;
2864 	else
2865 		pi->pi_flags &= ~flags;
2866 
2867 	if (pi->pi_v4)
2868 		pi->pi_v4->pii_flags = pi->pi_flags;
2869 
2870 	if (pi->pi_v6)
2871 		pi->pi_v6->pii_flags = pi->pi_flags;
2872 
2873 	return (_B_TRUE);
2874 }
2875 
/*
 * icmp cksum computation for IPv4.
 *
 * Computes the 16-bit one's complement of the one's complement sum of
 * the 'len' bytes starting at 'addr', taken as 16-bit words. If 'len' is
 * odd, the final byte is treated as the first byte of a 16-bit word whose
 * second byte is zero. A 'len' of zero or less yields 0xffff.
 *
 * 'addr' is dereferenced as ushort_t, so it must be suitably aligned.
 * The result is returned as an int in the range 0 .. 0xffff.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	int		nleft = len;
	ushort_t	*w = addr;
	ushort_t	odd_byte = 0;
	uint64_t	sum = 0;

	/*
	 * Add sequential 16 bit words into a 64 bit unsigned accumulator.
	 * An unsigned 64 bit accumulator cannot overflow for any 'len'
	 * that fits in an int, so no carries are ever lost.
	 */
	while (nleft > 1) {
		sum += *w++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
		sum += odd_byte;
	}

	/*
	 * Fold the carries out of the upper bits back into the low
	 * 16 bits until nothing is left above bit 15.
	 */
	while ((sum >> 16) != 0)
		sum = (sum >> 16) + (sum & 0xffff);

	/* one's complement, truncated to 16 bits */
	return ((ushort_t)~sum);
}
2913 
2914 static void
2915 reset_snxt_basetimes(void)
2916 {
2917 	struct phyint_instance *pii;
2918 
2919 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2920 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2921 	}
2922 }
2923 
2924 /*
2925  * Is the address one of our own addresses? Unfortunately,
2926  * we cannot check our phyint tables to determine if the address
2927  * is our own. This is because, we don't track interfaces that
2928  * are not part of any group. We have to either use a 'bind' or
2929  * get the complete list of all interfaces using SIOCGLIFCONF,
2930  * to do this check. We could also use SIOCTMYADDR.
2931  * Bind fails for the local zone address, so we might include local zone
2932  * address as target address. If local zone address is a target address
2933  * and it is up, it is not possible to detect the interface failure.
2934  * SIOCTMYADDR also doesn't consider local zone address as own address.
2935  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2936  * are stored in laddr_list.
2937  */
2938 
2939 boolean_t
2940 own_address(struct in6_addr addr)
2941 {
2942 	struct local_addr *taddr = laddr_list;
2943 
2944 	for (; taddr != NULL; taddr = taddr->next) {
2945 		if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) {
2946 			return (_B_TRUE);
2947 		}
2948 	}
2949 	return (_B_FALSE);
2950 }
2951