xref: /titanic_41/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision f9e2a31fcb74a362a725343aea080a7e5b858b70)
1 /*
2  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 
6 /*
7  * Copyright (c) 1987 Regents of the University of California.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms are permitted
11  * provided that the above copyright notice and this paragraph are
12  * duplicated in all such forms and that any documentation,
13  * advertising materials, and other materials related to such
14  * distribution and use acknowledge that the software was developed
15  * by the University of California, Berkeley. The name of the
16  * University may not be used to endorse or promote products derived
17  * from this software without specific prior written permission.
18  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
20  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21  */
22 
23 #pragma ident	"%Z%%M%	%I%	%E% SMI"
24 
25 #include "mpd_defs.h"
26 #include "mpd_tables.h"
27 
/*
 * Message types stored in pr_icmp_mtype. probe() writes one of these into
 * every outgoing packet, and the receive path uses the echoed value to pick
 * the matching incoming_*_reply() handler.
 */

/* Unicast probe packet */
#define	PROBE_UNI	0x1234

/* Multicast probe packet */
#define	PROBE_MULTI	0x5678

/* RTT only probe packet */
#define	PROBE_RTT	0x9abc

/* Number of milliseconds in a minute */
#define	MSEC_PERMIN	(MILLISEC * 60)
36 
/*
 * On-the-wire layout of a probe and of its reply: an ICMP Echo request or
 * ICMP Echo reply header followed by two words of probe data. The same
 * layout is used for IPv4 and IPv6.
 */
struct pr_icmp {
	/* ICMP header */
	uint8_t		pr_icmp_type;		/* type field */
	uint8_t		pr_icmp_code;		/* code field */
	uint16_t	pr_icmp_cksum;		/* checksum field */
	uint16_t	pr_icmp_id;		/* Identification */
	uint16_t	pr_icmp_seq;		/* sequence number */

	/* Probe payload */
	uint32_t	pr_icmp_timestamp;	/* Time stamp */
	uint32_t	pr_icmp_mtype;		/* Message type (PROBE_*) */
};
51 
52 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
53 				    0x0, 0x0, 0x0, 0x0,
54 				    0x0, 0x0, 0x0, 0x0,
55 				    0x0, 0x0, 0x0, 0x1 } };
56 
57 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
58 
59 static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
60 
61 static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
62 static void		pi_set_crtt(struct target *tg, int m,
63     boolean_t is_probe_uni);
64 static void		incoming_echo_reply(struct phyint_instance *pii,
65     struct pr_icmp *reply, struct in6_addr fromaddr);
66 static void		incoming_rtt_reply(struct phyint_instance *pii,
67     struct pr_icmp *reply, struct in6_addr fromaddr);
68 static void		incoming_mcast_reply(struct phyint_instance *pii,
69     struct pr_icmp *reply, struct in6_addr fromaddr);
70 
71 static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
72 static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
73 static boolean_t	check_exception_target(struct phyint_instance *pii,
74     struct target *target);
75 static void		probe_fail_info(struct phyint_instance *pii,
76     struct target *cur_tg, struct probe_fail_count *pfinfo);
77 static void		probe_success_info(struct phyint_instance *pii,
78     struct target *cur_tg, struct probe_success_count *psinfo);
79 static boolean_t	phyint_repaired(struct phyint *pi);
80 
81 static int		failover(struct phyint *from, struct phyint *to);
82 static int		failback(struct phyint *from, struct phyint *to);
83 static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
84 
85 static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
86 static int 		in_cksum(ushort_t *addr, int len);
87 static void		reset_snxt_basetimes(void);
88 
89 /*
90  * CRTT - Conservative Round Trip Time Estimate
91  * Probe success - A matching probe reply received before CRTT ms has elapsed
92  *	after sending the probe.
93  * Probe failure - No probe reply received and more than CRTT ms has elapsed
94  *	after sending the probe.
95  *
96  * TLS - Time last success. Most recent probe ack received at this time.
97  * TFF - Time first fail. The time of the earliest probe failure in
98  *	a consecutive series of probe failures.
99  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
100  * 	before declaring phyint repair.
101  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
102  *	declare a phyint failure.
103  *
104  * 			Phyint state diagram
105  *
106  * The state of a phyint that is capable of being probed, is completely
107  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
108  *
109  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
110  * of the link (according to the driver).  If the phyint is also configured
111  * with a test address (the common case) and probe targets, then a phyint must
112  * also successfully be able to send and receive probes in order to remain in
113  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
114  *
115  * Further, if a PI_RUNNING phyint is configured with a test address but is
116  * unable to find any probe targets, it will transition to the PI_NOTARGETS
117  * state, which indicates that the link is apparently functional but that
118  * in.mpathd is unable to send probes to verify functionality (in this case,
119  * in.mpathd makes the optimistic assumption that the interface is working
120  * correctly and thus does not perform a failover, but reports the interface
121  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122  *
123  * At any point, a phyint may be administratively marked offline via if_mpadm.
124  * In this case, the interface always transitions to PI_OFFLINE, regardless
125  * of its previous state.  When the interface is later brought back online,
126  * in.mpathd acts as if the interface is new (and thus it transitions to
127  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
128  * its probes, if probes are sent).
129  *
130  * pi_state -  PI_RUNNING or PI_FAILED
131  *	PI_RUNNING: The failure detection logic says the phyint is good.
132  *	PI_FAILED: The failure detection logic says the phyint has failed.
133  *
134  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
135  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
136  *	In the case of router targets, we assume that the current list of
137  *	targets obtained from the routing table, is still valid, so the
138  *	phyint state is PI_FAILED. In the case of host targets, we delete the
139  *	list of targets, and multicast to the all hosts, to reconstruct the
140  *	target list. So the phyints are in the PI_NOTARGETS state.
141  *
142  * I -	value of (pi_flags & IFF_INACTIVE)
143  *	IFF_INACTIVE: No failovers have been done to this phyint, from
144  *		other phyints. This phyint is inactive. Phyint can be a Standby.
145  *		When failback has been disabled (FAILOVER=no configured),
146  *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
147  *		is set when phyint subsequently recovers after a failure.
148  *
149  * pi_empty
150  *	This phyint has failed over successfully to another phyint, and
151  *	this phyint is currently "empty". It does not host any addresses or
152  *	multicast membership etc. This is the state of a phyint after a
153  *	failover from the phyint has completed successfully and no subsequent
154  *	'failover to' or 'failback to' has occurred on the phyint.
155  *	IP guarantees that no new logicals will be hosted nor any multicast
156  *	joins permitted on the phyint, since the phyint is either failed or
157  *	inactive. pi_empty is set implies the phyint is either failed or
158  *	inactive.
159  *
160  * pi_full
161  *	The phyint hosts all of its own addresses that it "owns". If the
162  *	phyint was previously failed or inactive, failbacks to the phyint
163  *	have completed successfully, i.e. no more failbacks to this phyint
164  *	can produce any change in system state whatsoever.
165  *
166  * Not all 32 possible combinations of the above 5-tuple are possible.
167  * Furthermore some of the above combinations are transient. They may occur
168  * only because the failover or failback did not complete successfully. The
169  * failover/failback will be retried and eventually a stable state will be
170  * reached.
171  *
172  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
173  * The following are the state machines. 'from' and 'to' are the src and
174  * dst of the failover/failback, below
175  *
176  *			pi_empty state machine
177  * ---------------------------------------------------------------------------
178  *	Event				State	->	New State
179  * ---------------------------------------------------------------------------
180  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
181  *	of failover
182  *
183  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
184  *
185  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
186  *
187  * 	group failure			pi_empty = X	  -> pi_empty = 0
188  * ---------------------------------------------------------------------------
189  *
190  *			pi_full state machine
191  * ---------------------------------------------------------------------------
192  *	Event				State		  -> New State
193  * ---------------------------------------------------------------------------
194  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
195  *	of failback from
196  *	each of the other phyints
197  *
198  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
199  *
200  *	group failure			pi_full = X	  -> pi_full = 0
201  * ---------------------------------------------------------------------------
202  *
203  *			pi_state state machine
204  * ---------------------------------------------------------------------------
205  *	Event			State			New State
206  *				Action:
207  * ---------------------------------------------------------------------------
208  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
209  *	detection		: set IFF_FAILED on this phyint
210  *				: failover from this phyint to another
211  *
212  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
213  *	detection		: set IFF_FAILED on this phyint
214  *
215  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=yes)
216  *	detection				     -> (PI_RUNNING, I == 0)
217  *				: to.pi_empty = 0
218  *				: clear IFF_FAILED on this phyint
219  *				: failback to this phyint if enabled
220  *
221  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=no)
222  *	detection				     ->	(PI_RUNNING, I == 1)
223  *				: to.pi_empty = 0
224  *				: clear IFF_FAILED on this phyint
225  *				: if failback is disabled set I == 1
226  *
227  *	Group failure		(perform on all phyints in the group)
228  *	detection 		PI_RUNNING		PI_FAILED
229  *	(Router targets)	: set IFF_FAILED
230  *				: clear pi_empty and pi_full
231  *
232  *	Group failure		(perform on all phyints in the group)
233  *	detection 		PI_RUNNING		PI_NOTARGETS
234  *	(Host targets)		: set IFF_FAILED
235  *				: clear pi_empty and pi_full
236  *				: delete the target list on all phyints
237  * ---------------------------------------------------------------------------
238  *
239  *			I state machine
240  * ---------------------------------------------------------------------------
241  *	Event		State			Action:
242  * ---------------------------------------------------------------------------
243  *	Turn on I 	pi_empty == 0, STANDBY 	: failover from standby
244  *
245  *	Turn off I 	PI_RUNNING, STANDBY	: pi_empty = 0
246  *			pi_full == 0		: failback to this if enabled
247  * ---------------------------------------------------------------------------
248  *
249  * Assertions: (Read '==>' as implies)
250  *
251  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
252  * (pi_empty == 1) ==> (pi_full == 0)
253  * (pi_full  == 1) ==> (pi_empty == 0)
254  *
255  * Invariants
256  *
257  * pg_groupfailed = 0  &&
258  *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
259  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
260  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
261  *
262  * 1. says that an inactive standby, that is not empty, has to be failed
263  * over. For a standby to be truly inactive, it should not host any
264  * addresses. So we move them to some other phyint. Usually we catch the
265  * turn on of IFF_INACTIVE, and perform this action. However if the failover
266  * did not complete successfully, then subsequently we have lost the edge
267  * trigger, and this invariant kicks in and completes the action.
268  *
269  * 2. says that any failed phyint that is not empty must be failed over.
270  * Usually we do the failover when we detect NIC failure. However if the
271  * failover does not complete successfully, this invariant kicks in and
272  * completes the failover. We exclude inactive standby which is covered by 1.
273  *
274  * 3. says that any running phyint that is not full must be failed back.
275  * Usually we do the failback when we detect NIC repair. However if the
276  * failback does not complete successfully, this invariant kicks in and
277  * completes the failback. Note that we don't want to failback to an inactive
278  * standby.
279  *
280  * The invariants 1 - 3 and the actions are in initifs().
281  */
282 
/*
 * File-scope probes_missed record. It is not static, so it has external
 * linkage; none of the code visible in this part of the file reads or
 * writes it. NOTE(review): presumably counts probes that could not be sent
 * on schedule -- confirm against the definition of struct probes_missed.
 */
struct probes_missed probes_missed;
284 
285 /*
286  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
287  * will be added on by the kernel.  The id field identifies this phyint.
288  * and the sequence number is an increasing (modulo 2^^16) integer. The data
289  * portion holds the time value when the packet is sent. On echo this is
290  * extracted to compute the round-trip time. Three different types of
291  * probe packets are used.
292  *
293  * PROBE_UNI: This type is used to do failure detection / failure recovery
294  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
295  *	not less than the current CRTT. pii_probes[] stores data
296  *	about these probes. These packets consume sequence number space.
297  *
298  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
299  * 	are not used. Under heavy network load, the rtt may go up very high,
300  *	due to a spike, or may appear to go high, due to extreme scheduling
301  * 	delays. Once the network stress is removed, mpathd takes long time to
302  *	recover, because the probe_interval is already high, and it takes
303  *	a long time to send out sufficient number of probes to bring down the
304  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
305  *	user_probe_interval ms. and will cause only rtt updates. These packets
306  *	do not consume sequence number space nor is information about these
307  *	packets stored in the pii_probes[]
308  *
309  * PROBE_MULTI: This type is only used to construct a list of targets, when
310  *	no targets are known. The packet is multicast to the all hosts addr.
311  */
312 static void
313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
314 {
315 	struct pr_icmp probe_pkt;	/* Probe packet */
316 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
317 	struct sockaddr_in whereto; 	/* target address IPv4 */
318 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
319 	boolean_t sent = _B_TRUE;
320 
321 	if (debug & D_TARGET) {
322 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
323 		    pii->pii_name, probe_type, cur_time);
324 	}
325 
326 	assert(pii->pii_probe_sock != -1);
327 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
328 	    probe_type == PROBE_RTT);
329 
330 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
331 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
332 	probe_pkt.pr_icmp_code = 0;
333 	probe_pkt.pr_icmp_cksum = 0;
334 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
335 
336 	/*
337 	 * Since there is no need to do arithmetic on the icmpid,
338 	 * (only equality check is done) pii_icmpid is stored in
339 	 * network byte order at initialization itself.
340 	 */
341 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
342 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
343 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
344 
345 	/*
346 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
347 	 * the all hosts address. Otherwise it is unicast to the next target.
348 	 */
349 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
350 	    pii->pii_rtt_target_next != NULL));
351 
352 	if (pii->pii_af == AF_INET6) {
353 		bzero(&whereto6, sizeof (whereto6));
354 		whereto6.sin6_family = AF_INET6;
355 		if (probe_type == PROBE_MULTI) {
356 			whereto6.sin6_addr = all_nodes_mcast_v6;
357 		} else if (probe_type == PROBE_UNI) {
358 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
359 		} else  {
360 			/* type is PROBE_RTT */
361 			whereto6.sin6_addr =
362 			    pii->pii_rtt_target_next->tg_address;
363 		}
364 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
365 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
366 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
367 			logperror_pii(pii, "probe: probe sendto");
368 			sent = _B_FALSE;
369 		}
370 	} else {
371 		bzero(&whereto, sizeof (whereto));
372 		whereto.sin_family = AF_INET;
373 		if (probe_type == PROBE_MULTI) {
374 			whereto.sin_addr = all_nodes_mcast_v4;
375 		} else if (probe_type == PROBE_UNI) {
376 			IN6_V4MAPPED_TO_INADDR(
377 			    &pii->pii_target_next->tg_address,
378 			    &whereto.sin_addr);
379 		} else {
380 			/* type is PROBE_RTT */
381 			IN6_V4MAPPED_TO_INADDR(
382 			    &pii->pii_rtt_target_next->tg_address,
383 			    &whereto.sin_addr);
384 		}
385 
386 		/*
387 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
388 		 */
389 		probe_pkt.pr_icmp_cksum =
390 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
391 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
392 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
393 		    sizeof (whereto)) != sizeof (probe_pkt)) {
394 			logperror_pii(pii, "probe: probe sendto");
395 			sent = _B_FALSE;
396 		}
397 	}
398 
399 	/*
400 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
401 	 * update our tables. We will need this info in processing the probe
402 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
403 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
404 	 * are only used to construct a list of targets. PROBE_RTT packets are
405 	 * used only for updating the rtt and not for failure detection.
406 	 */
407 	if (probe_type == PROBE_UNI && sent) {
408 		pr_ndx = pii->pii_probe_next;
409 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
410 
411 		/* Collect statistics, before we reuse the last slot. */
412 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
413 			pii->pii_cum_stats.lost++;
414 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
415 			pii->pii_cum_stats.acked++;
416 		pii->pii_cum_stats.sent++;
417 
418 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
419 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
420 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
421 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
422 		pii->pii_target_next = target_next(pii->pii_target_next);
423 		assert(pii->pii_target_next != NULL);
424 		/*
425 		 * If we have a single variable to denote the next target to
426 		 * probe for both rtt probes and failure detection probes, we
427 		 * could end up with a situation where the failure detection
428 		 * probe targets become disjoint from the rtt probe targets.
429 		 * Eg. if 2 targets and the actual fdt is double the user
430 		 * specified fdt. So we have 2 variables. In this scheme
431 		 * we also reset pii_rtt_target_next for every fdt probe,
432 		 * though that may not be necessary.
433 		 */
434 		pii->pii_rtt_target_next = pii->pii_target_next;
435 		pii->pii_snxt++;
436 	} else if (probe_type == PROBE_RTT) {
437 		pii->pii_rtt_target_next =
438 		    target_next(pii->pii_rtt_target_next);
439 		assert(pii->pii_rtt_target_next != NULL);
440 	}
441 }
442 
443 /*
444  * Incoming IPv4 data from wire, is received here. Called from main.
445  */
446 void
447 in_data(struct phyint_instance *pii)
448 {
449 	struct	sockaddr_in 	from;
450 	struct	in6_addr	fromaddr;
451 	uint_t	fromlen;
452 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
453 	struct ip *ip;
454 	int 	iphlen;
455 	int 	len;
456 	char 	abuf[INET_ADDRSTRLEN];
457 	struct	pr_icmp	*reply;
458 
459 	if (debug & D_PROBE) {
460 		logdebug("in_data(%s %s)\n",
461 		    AF_STR(pii->pii_af), pii->pii_name);
462 	}
463 
464 	/*
465 	 * Poll has already told us that a message is waiting,
466 	 * on this socket. Read it now. We should not block.
467 	 */
468 	fromlen = sizeof (from);
469 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
470 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
471 	if (len < 0) {
472 		logperror_pii(pii, "in_data: recvfrom");
473 		return;
474 	}
475 
476 	/*
477 	 * If the NIC has indicated the link is down, don't go
478 	 * any further.
479 	 */
480 	if (LINK_DOWN(pii->pii_phyint))
481 		return;
482 
483 	/* Get the printable address for error reporting */
484 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
485 
486 	/* Make sure packet contains at least minimum ICMP header */
487 	ip = (struct ip *)in_packet;
488 	iphlen = ip->ip_hl << 2;
489 	if (len < iphlen + ICMP_MINLEN) {
490 		if (debug & D_PKTBAD) {
491 			logdebug("in_data: packet too short (%d bytes)"
492 			    " from %s\n", len, abuf);
493 		}
494 		return;
495 	}
496 
497 	/*
498 	 * Subtract the IP hdr length, 'len' will be length of the probe
499 	 * reply, starting from the icmp hdr.
500 	 */
501 	len -= iphlen;
502 	/* LINTED */
503 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
504 
505 	/* Probe replies are icmp echo replies. Ignore anything else */
506 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
507 		return;
508 
509 	/*
510 	 * The icmp id should match what we sent, which is stored
511 	 * in pi_icmpid. The icmp code for reply must be 0.
512 	 * The reply content must be a struct pr_icmp
513 	 */
514 	if (reply->pr_icmp_id != pii->pii_icmpid) {
515 		/* Not in response to our probe */
516 		return;
517 	}
518 
519 	if (reply->pr_icmp_code != 0) {
520 		logtrace("probe reply code %d from %s on %s\n",
521 		    reply->pr_icmp_code, abuf, pii->pii_name);
522 		return;
523 	}
524 
525 	if (len < sizeof (struct pr_icmp)) {
526 		logtrace("probe reply too short: %d bytes from %s on %s\n",
527 		    len, abuf, pii->pii_name);
528 		return;
529 	}
530 
531 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
532 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
533 		/* Unicast probe reply */
534 		incoming_echo_reply(pii, reply, fromaddr);
535 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
536 		/* Multicast reply */
537 		incoming_mcast_reply(pii, reply, fromaddr);
538 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
539 		incoming_rtt_reply(pii, reply, fromaddr);
540 	} else {
541 		/* Probably not in response to our probe */
542 		logtrace("probe reply type: %d from %s on %s\n",
543 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
544 		return;
545 	}
546 
547 }
548 
549 /*
550  * Incoming IPv6 data from wire is received here. Called from main.
551  */
552 void
553 in6_data(struct phyint_instance *pii)
554 {
555 	struct sockaddr_in6 from;
556 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
557 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
558 	int len;
559 	char abuf[INET6_ADDRSTRLEN];
560 	struct msghdr msg;
561 	struct iovec iov;
562 	uchar_t *opt;
563 	struct	pr_icmp *reply;
564 
565 	if (debug & D_PROBE) {
566 		logdebug("in6_data(%s %s)\n",
567 		    AF_STR(pii->pii_af), pii->pii_name);
568 	}
569 
570 	iov.iov_base = (char *)in_packet;
571 	iov.iov_len = sizeof (in_packet);
572 	msg.msg_iov = &iov;
573 	msg.msg_iovlen = 1;
574 	msg.msg_name = (struct sockaddr *)&from;
575 	msg.msg_namelen = sizeof (from);
576 	msg.msg_control = ancillary_data;
577 	msg.msg_controllen = sizeof (ancillary_data);
578 
579 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
580 		logperror_pii(pii, "in6_data: recvfrom");
581 		return;
582 	}
583 
584 	/*
585 	 * If the NIC has indicated that the link is down, don't go
586 	 * any further.
587 	 */
588 	if (LINK_DOWN(pii->pii_phyint))
589 		return;
590 
591 	/* Get the printable address for error reporting */
592 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
593 	if (len < ICMP_MINLEN) {
594 		if (debug & D_PKTBAD) {
595 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
596 			    msg.msg_flags, abuf);
597 		}
598 		return;
599 	}
600 	/* Ignore packets > 64k or control buffers that don't fit */
601 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
602 		if (debug & D_PKTBAD) {
603 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
604 			    msg.msg_flags, abuf);
605 		}
606 		return;
607 	}
608 
609 	reply = (struct pr_icmp *)in_packet;
610 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
611 		return;
612 
613 	if (reply->pr_icmp_id != pii->pii_icmpid) {
614 		/* Not in response to our probe */
615 		return;
616 	}
617 
618 	/*
619 	 * The kernel has already verified the the ICMP checksum.
620 	 */
621 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
622 		logtrace("ICMPv6 echo reply source address not linklocal from "
623 		    "%s on %s\n", abuf, pii->pii_name);
624 		return;
625 	}
626 	opt = find_ancillary(&msg, IPV6_RTHDR);
627 	if (opt != NULL) {
628 		/* Can't allow routing headers in probe replies  */
629 		logtrace("message with routing header from %s on %s\n",
630 		    abuf, pii->pii_name);
631 		return;
632 	}
633 	if (reply->pr_icmp_code != 0) {
634 		logtrace("probe reply code: %d from %s on %s\n",
635 		    reply->pr_icmp_code, abuf, pii->pii_name);
636 		return;
637 	}
638 	if (len < (sizeof (struct pr_icmp))) {
639 		logtrace("probe reply too short: %d bytes from %s on %s\n",
640 		    len, abuf, pii->pii_name);
641 		return;
642 	}
643 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
644 		incoming_echo_reply(pii, reply, from.sin6_addr);
645 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
646 		incoming_mcast_reply(pii, reply, from.sin6_addr);
647 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
648 		incoming_rtt_reply(pii, reply, from.sin6_addr);
649 	} else  {
650 		/* Probably not in response to our probe */
651 		logtrace("probe reply type: %d from %s on %s\n",
652 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
653 	}
654 }
655 
656 /*
657  * Process the incoming rtt reply, in response to our rtt probe.
658  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
659  * have any stored information about the probe we sent. So we don't log
660  * any errors if we receive bad replies.
661  */
662 static void
663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
664     struct in6_addr fromaddr)
665 {
666 	int 	m;		/* rtt measurment in ms */
667 	uint32_t cur_time;	/* in ms from some arbitrary point */
668 	char	abuf[INET6_ADDRSTRLEN];
669 	struct	target	*target;
670 	uint32_t pr_icmp_timestamp;
671 	struct 	phyint_group *pg;
672 
673 	/* Get the printable address for error reporting */
674 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
675 
676 	if (debug & D_PROBE) {
677 		logdebug("incoming_rtt_reply: %s %s %s\n",
678 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
679 	}
680 
681 	/* Do we know this target ? */
682 	target = target_lookup(pii, fromaddr);
683 	if (target == NULL)
684 		return;
685 
686 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
687 	cur_time = getcurrenttime();
688 	m = (int)(cur_time - pr_icmp_timestamp);
689 
690 	/* Invalid rtt. It has wrapped around */
691 	if (m < 0)
692 		return;
693 
694 	/*
695 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
696 	 * The initial few responses after the interface is repaired may
697 	 * contain high rtt's because they could have been queued up waiting
698 	 * for ARP/NDP resolution on a failed interface.
699 	 */
700 	pg = pii->pii_phyint->pi_group;
701 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
702 		return;
703 
704 	/*
705 	 * Update rtt only if the new rtt is lower than the current rtt.
706 	 * (specified by the 3rd parameter to pi_set_crtt).
707 	 * If a spike has caused the current probe_interval to be >
708 	 * user_probe_interval, then this mechanism is used to bring down
709 	 * the rtt rapidly once the network stress is removed.
710 	 * If the new rtt is higher than the current rtt, we don't want to
711 	 * update the rtt. We are having more than 1 outstanding probe and
712 	 * the increase in rtt we are seeing is being unnecessarily weighted
713 	 * many times. The regular rtt update will be handled by
714 	 * incoming_echo_reply() and will take care of any rtt increase.
715 	 */
716 	pi_set_crtt(target, m, _B_FALSE);
717 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
718 	    (user_failure_detection_time < pg->pg_fdt) &&
719 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
720 		/*
721 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
722 		 * investigate if we can improve the failure detection time to
723 		 * meet whatever the user specified.
724 		 */
725 		if (check_pg_crtt_improved(pg)) {
726 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
727 			    user_failure_detection_time);
728 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
729 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
730 				logerr("Improved failure detection time %d ms "
731 				    "on (%s %s) for group \"%s\"\n",
732 				    pg->pg_fdt, AF_STR(pii->pii_af),
733 				    pii->pii_name,
734 				    pii->pii_phyint->pi_group->pg_name);
735 			}
736 			if (user_failure_detection_time == pg->pg_fdt) {
737 				/* Avoid any truncation or rounding errors */
738 				pg->pg_probeint = user_probe_interval;
739 				/*
740 				 * No more rtt probes will be sent. The actual
741 				 * fdt has dropped to the user specified value.
742 				 * pii_fd_snxt_basetime and pii_snxt_basetime
743 				 * will be in sync henceforth.
744 				 */
745 				reset_snxt_basetimes();
746 			}
747 		}
748 	}
749 }
750 
751 /*
752  * Process the incoming echo reply, in response to our unicast probe.
753  * Common for both IPv4 and IPv6
754  */
755 static void
756 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
757     struct in6_addr fromaddr)
758 {
759 	int 	m;		/* rtt measurment in ms */
760 	uint32_t cur_time;	/* in ms from some arbitrary point */
761 	char	abuf[INET6_ADDRSTRLEN];
762 	int	pr_ndx;
763 	struct	target	*target;
764 	boolean_t exception;
765 	uint32_t pr_icmp_timestamp;
766 	uint16_t pr_icmp_seq;
767 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
768 
769 	/* Get the printable address for error reporting */
770 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
771 
772 	if (debug & D_PROBE) {
773 		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
774 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
775 		    ntohs(reply->pr_icmp_seq));
776 	}
777 
778 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
779 	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);
780 
781 	/* Reject out of window probe replies */
782 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
783 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
784 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
785 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
786 		pii->pii_cum_stats.unknown++;
787 		return;
788 	}
789 	cur_time = getcurrenttime();
790 	m = (int)(cur_time - pr_icmp_timestamp);
791 	if (m < 0) {
792 		/*
793 		 * This is a ridiculously high value of rtt. rtt has wrapped
794 		 * around. Log a message, and ignore the rtt.
795 		 */
796 		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
797 		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
798 	}
799 
800 	/*
801 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
802 	 * number in our pii->pii_probes[] array. The icmp sequence number
803 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
804 	 */
805 	pr_ndx = MOD_SUB(pii->pii_probe_next,
806 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
807 
808 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
809 
810 	target = pii->pii_probes[pr_ndx].pr_target;
811 
812 	/*
813 	 * Perform sanity checks, whether this probe reply that we
814 	 * have received is genuine
815 	 */
816 	if (target != NULL) {
817 		/*
818 		 * Compare the src. addr of the received ICMP or ICMPv6
819 		 * probe reply with the target address in our tables.
820 		 */
821 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
822 			/*
823 			 * We don't have any record of having sent a probe to
824 			 * this target. This is a fake probe reply. Log an error
825 			 */
826 			logtrace("probe status %d Fake probe reply seq %u "
827 			    "snxt %u on %s from %s\n",
828 			    pii->pii_probes[pr_ndx].pr_status,
829 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
830 			pii->pii_cum_stats.unknown++;
831 			return;
832 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
833 			/*
834 			 * The address matches, but our tables indicate that
835 			 * this probe reply has been acked already. So this
836 			 * is a duplicate probe reply. Log an error
837 			 */
838 			logtrace("probe status %d Duplicate probe reply seq %u "
839 			    "snxt %u on %s from %s\n",
840 			    pii->pii_probes[pr_ndx].pr_status,
841 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
842 			pii->pii_cum_stats.unknown++;
843 			return;
844 		}
845 	} else {
846 		/*
847 		 * Target must not be NULL in the PR_UNACKED state
848 		 */
849 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
850 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
851 			/*
852 			 * The probe stats slot is unused. So we didn't
853 			 * send out any probe to this target. This is a fake.
854 			 * Log an error.
855 			 */
856 			logtrace("probe status %d Fake probe reply seq %u "
857 			    "snxt %u on %s from %s\n",
858 			    pii->pii_probes[pr_ndx].pr_status,
859 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
860 		}
861 		pii->pii_cum_stats.unknown++;
862 		return;
863 	}
864 
865 	/*
866 	 * If the rtt does not appear to be right, don't update the
867 	 * rtt stats. This can happen if the system dropped into the
868 	 * debugger, or the system was hung or too busy for a
869 	 * substantial time that we didn't get a chance to run.
870 	 */
871 	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
872 		/*
873 		 * If the probe corresponding to this receieved response
874 		 * was truly sent 'm' ms. ago, then this response must
875 		 * have been rejected by the sequence number checks. The
876 		 * fact that it has passed the sequence number checks
877 		 * means that the measured rtt is wrong. We were probably
878 		 * scheduled long after the packet was received.
879 		 */
880 		goto out;
881 	}
882 
883 	/*
884 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
885 	 * The initial few responses after the interface is repaired may
886 	 * contain high rtt's because they could have been queued up waiting
887 	 * for ARP/NDP resolution on a failed interface.
888 	 */
889 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
890 		goto out;
891 
892 	/*
893 	 * Don't update the Conservative Round Trip Time estimate for this
894 	 * (phint, target) pair if this is the not the highest ack seq seen
895 	 * thus far on this target.
896 	 */
897 	if (!highest_ack_tg(pr_icmp_seq, target))
898 		goto out;
899 
900 	/*
901 	 * Always update the rtt. This is a failure detection probe
902 	 * and we want to measure both increase / decrease in rtt.
903 	 */
904 	pi_set_crtt(target, m, _B_TRUE);
905 
906 	/*
907 	 * If the crtt exceeds the average time between probes,
908 	 * investigate if this slow target is an exception. If so we
909 	 * can avoid this target and still meet the failure detection
910 	 * time. Otherwise we can't meet the failure detection time.
911 	 */
912 	if (target->tg_crtt > pg->pg_probeint) {
913 		exception = check_exception_target(pii, target);
914 		if (exception) {
915 			/*
916 			 * This target is exceptionally slow. Don't use it
917 			 * for future probes. check_exception_target() has
918 			 * made sure that we have at least MIN_PROBE_TARGETS
919 			 * other active targets
920 			 */
921 			if (pii->pii_targets_are_routers) {
922 				/*
923 				 * This is a slow router, mark it as slow
924 				 * and don't use it for further probes. We
925 				 * don't delete it, since it will be populated
926 				 * again when we do a router scan. Hence we
927 				 * need to maintain extra state (unlike the
928 				 * host case below).  Mark it as TG_SLOW.
929 				 */
930 				if (target->tg_status == TG_ACTIVE)
931 					pii->pii_ntargets--;
932 				target->tg_status = TG_SLOW;
933 				target->tg_latime = gethrtime();
934 				target->tg_rtt_sa = -1;
935 				target->tg_crtt = 0;
936 				target->tg_rtt_sd = 0;
937 				if (pii->pii_target_next == target) {
938 					pii->pii_target_next =
939 					    target_next(target);
940 				}
941 			} else {
942 				/*
943 				 * the slow target is not a router, we can
944 				 * just delete it. Send an icmp multicast and
945 				 * pick the fastest responder that is not
946 				 * already an active target. target_delete()
947 				 * adjusts pii->pii_target_next
948 				 */
949 				target_delete(target);
950 				probe(pii, PROBE_MULTI, cur_time);
951 			}
952 		} else {
953 			/*
954 			 * We can't meet the failure detection time.
955 			 * Log a message, and update the detection time to
956 			 * whatever we can achieve.
957 			 */
958 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
959 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
960 			last_fdt_bumpup_time = gethrtime();
961 			if (pg != phyint_anongroup) {
962 				logerr("Cannot meet requested failure detection"
963 				    " time of %d ms on (%s %s) new failure"
964 				    " detection time for group \"%s\" is %d"
965 				    " ms\n", user_failure_detection_time,
966 				    AF_STR(pii->pii_af), pii->pii_name,
967 				    pg->pg_name, pg->pg_fdt);
968 			}
969 		}
970 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
971 	    (user_failure_detection_time < pg->pg_fdt) &&
972 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
973 		/*
974 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
975 		 * investigate if we can improve the failure detection time to
976 		 * meet whatever the user specified.
977 		 */
978 		if (check_pg_crtt_improved(pg)) {
979 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
980 			    user_failure_detection_time);
981 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
982 			if (pg != phyint_anongroup) {
983 				logerr("Improved failure detection time %d ms "
984 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
985 				    AF_STR(pii->pii_af), pii->pii_name,
986 				    pg->pg_name);
987 			}
988 			if (user_failure_detection_time == pg->pg_fdt) {
989 				/* Avoid any truncation or rounding errors */
990 				pg->pg_probeint = user_probe_interval;
991 				/*
992 				 * No more rtt probes will be sent. The actual
993 				 * fdt has dropped to the user specified value.
994 				 * pii_fd_snxt_basetime and pii_snxt_basetime
995 				 * will be in sync henceforth.
996 				 */
997 				reset_snxt_basetimes();
998 			}
999 		}
1000 	}
1001 out:
1002 	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
1003 	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
1004 
1005 	/*
1006 	 * Update pii->pii_rack, i.e. the sequence number of the last received
1007 	 * probe response, based on the echo reply we have received now, if
1008 	 * either of the following conditions are satisfied.
1009 	 * a. pii_rack is outside the current receive window of
1010 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
1011 	 *    This means we have not received probe responses for a
1012 	 *    long time, and the sequence number has wrapped around.
1013 	 * b. pii_rack is within the current receive window and this echo
1014 	 *    reply corresponds to the highest sequence number we have seen
1015 	 *    so far.
1016 	 */
1017 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
1018 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
1019 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
1020 		pii->pii_rack = pr_icmp_seq;
1021 	}
1022 }
1023 
1024 /*
1025  * Returns true if seq is the highest unacknowledged seq for target tg
1026  * else returns false
1027  */
1028 static boolean_t
1029 highest_ack_tg(uint16_t seq, struct target *tg)
1030 {
1031 	struct phyint_instance *pii;
1032 	int	 pr_ndx;
1033 	uint16_t pr_seq;
1034 
1035 	pii = tg->tg_phyint_inst;
1036 
1037 	/*
1038 	 * Get the seq number of the most recent probe sent so far,
1039 	 * and also get the corresponding probe index in the probe stats
1040 	 * array.
1041 	 */
1042 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1043 	pr_seq = pii->pii_snxt;
1044 	pr_seq--;
1045 
1046 	/*
1047 	 * Start from the most recent probe and walk back, trying to find
1048 	 * an acked probe corresponding to target tg.
1049 	 */
1050 	for (; pr_ndx != pii->pii_probe_next;
1051 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
1052 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
1053 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
1054 			if (SEQ_GT(pr_seq, seq))
1055 				return (_B_FALSE);
1056 		}
1057 	}
1058 	return (_B_TRUE);
1059 }
1060 
1061 /*
1062  * Check whether the crtt for the group has improved by a factor of
1063  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
1064  * detection time flapping in the face of small crtt changes.
1065  */
1066 static boolean_t
1067 check_pg_crtt_improved(struct phyint_group *pg)
1068 {
1069 	struct	phyint *pi;
1070 
1071 	if (debug & D_PROBE)
1072 		logdebug("check_pg_crtt_improved()\n");
1073 
1074 	/*
1075 	 * The crtt for the group is only improved if each phyint_instance
1076 	 * for both ipv4 and ipv6 is improved.
1077 	 */
1078 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1079 		if (!check_pii_crtt_improved(pi->pi_v4) ||
1080 		    !check_pii_crtt_improved(pi->pi_v6))
1081 			return (_B_FALSE);
1082 	}
1083 
1084 	return (_B_TRUE);
1085 }
1086 
1087 /*
1088  * Check whether the crtt has improved substantially on this phyint_instance.
1089  * Returns _B_TRUE if there's no crtt information available, because pii
1090  * is NULL or the phyint_instance is not capable of probing.
1091  */
1092 boolean_t
1093 check_pii_crtt_improved(struct phyint_instance *pii) {
1094 	struct 	target *tg;
1095 
1096 	if (pii == NULL)
1097 		return (_B_TRUE);
1098 
1099 	if (!PROBE_CAPABLE(pii) ||
1100 	    pii->pii_phyint->pi_state == PI_FAILED)
1101 		return (_B_TRUE);
1102 
1103 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1104 		if (tg->tg_status != TG_ACTIVE)
1105 			continue;
1106 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
1107 		    LOWER_FDT_TRIGGER)) {
1108 			return (_B_FALSE);
1109 		}
1110 	}
1111 
1112 	return (_B_TRUE);
1113 }
1114 
1115 /*
1116  * This target responds very slowly to probes. The target's crtt exceeds
1117  * the probe interval of its group. Compare against other targets
1118  * and determine if this target is an exception, if so return true, else false
1119  */
1120 static boolean_t
1121 check_exception_target(struct phyint_instance *pii, struct target *target)
1122 {
1123 	struct	target *tg;
1124 	char abuf[INET6_ADDRSTRLEN];
1125 
1126 	if (debug & D_PROBE) {
1127 		logdebug("check_exception_target(%s %s target %s)\n",
1128 		    AF_STR(pii->pii_af), pii->pii_name,
1129 		    pr_addr(pii->pii_af, target->tg_address,
1130 			abuf, sizeof (abuf)));
1131 	}
1132 
1133 	/*
1134 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
1135 	 * to make a good judgement. Otherwise don't drop this target.
1136 	 */
1137 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
1138 		return (_B_FALSE);
1139 
1140 	/*
1141 	 * Determine whether only this particular target is slow.
1142 	 * We know that this target's crtt exceeds the group's probe interval.
1143 	 * If all other active targets have a
1144 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
1145 	 * then this target is considered slow.
1146 	 */
1147 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1148 		if (tg != target && tg->tg_status == TG_ACTIVE) {
1149 			if (tg->tg_crtt >
1150 			    pii->pii_phyint->pi_group->pg_probeint /
1151 			    EXCEPTION_FACTOR) {
1152 				return (_B_FALSE);
1153 			}
1154 		}
1155 	}
1156 
1157 	return (_B_TRUE);
1158 }
1159 
1160 /*
1161  * Update the target list. The icmp all hosts multicast has given us
1162  * some host to which we can send probes. If we already have sufficient
1163  * targets, discard it.
1164  */
1165 static void
1166 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
1167     struct in6_addr fromaddr)
1168 /* ARGSUSED */
1169 {
1170 	int af;
1171 	char abuf[INET6_ADDRSTRLEN];
1172 	struct phyint *pi;
1173 
1174 	if (debug & D_PROBE) {
1175 		logdebug("incoming_mcast_reply(%s %s %s)\n",
1176 		    AF_STR(pii->pii_af), pii->pii_name,
1177 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
1178 	}
1179 
1180 	/*
1181 	 * Using host targets is a fallback mechanism. If we have
1182 	 * found a router, don't add this host target. If we already
1183 	 * know MAX_PROBE_TARGETS, don't add another target.
1184 	 */
1185 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1186 	if (pii->pii_targets != NULL) {
1187 		if (pii->pii_targets_are_routers ||
1188 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
1189 			return;
1190 		}
1191 	}
1192 
1193 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
1194 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
1195 		/*
1196 		 * Guard against response from 0.0.0.0
1197 		 * and ::. Log a trace message
1198 		 */
1199 		logtrace("probe response from %s on %s\n",
1200 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
1201 		    pii->pii_name);
1202 		return;
1203 	}
1204 
1205 	/*
1206 	 * This address is one of our own, so reject this address as a
1207 	 * valid probe target.
1208 	 */
1209 	af = pii->pii_af;
1210 	if (own_address(fromaddr))
1211 		return;
1212 
1213 	/*
1214 	 * If the phyint is part a named group, then add the address to all
1215 	 * members of the group.  Otherwise, add the address only to the
1216 	 * phyint itself, since other phyints in the anongroup may not be on
1217 	 * the same subnet.
1218 	 */
1219 	pi = pii->pii_phyint;
1220 	if (pi->pi_group == phyint_anongroup) {
1221 		target_add(pii, fromaddr, _B_FALSE);
1222 	} else {
1223 		pi = pi->pi_group->pg_phyint;
1224 		for (; pi != NULL; pi = pi->pi_pgnext)
1225 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
1226 	}
1227 }
1228 
1229 /*
1230  * Compute CRTT given an existing scaled average, scaled deviation estimate
1231  * and a new rtt time.  The formula is from Jacobson and Karels'
1232  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
1233  * are the same as those in Appendix A.2 of that paper.
1234  *
1235  * m = new measurement
1236  * sa = scaled RTT average (8 * average estimates)
1237  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
1238  * crtt = Conservative round trip time. Used to determine whether probe
1239  * has timed out.
1240  *
1241  * New scaled average and deviation are passed back via sap and svp
1242  */
1243 static int
1244 compute_crtt(int *sap, int *svp, int m)
1245 {
1246 	int sa = *sap;
1247 	int sv = *svp;
1248 	int crtt;
1249 	int saved_m = m;
1250 
1251 	assert(*sap >= -1);
1252 	assert(*svp >= 0);
1253 
1254 	if (sa != -1) {
1255 		/*
1256 		 * Update average estimator:
1257 		 *	new rtt = old rtt + 1/8 Error
1258 		 *	    where Error = m - old rtt
1259 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
1260 		 *	i.e. new sa =  old sa + Error
1261 		 */
1262 		m -= sa >> 3;		/* m is now Error in estimate. */
1263 		if ((sa += m) < 0) {
1264 			/* Don't allow the smoothed average to be negative. */
1265 			sa = 0;
1266 		}
1267 
1268 		/*
1269 		 * Update deviation estimator:
1270 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
1271 		 *	i.e. 4 * new mdev = 4 * old mdev +
1272 		 *		(abs(Error) - old mdev)
1273 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
1274 		 */
1275 		if (m < 0)
1276 			m = -m;
1277 		m -= sv >> 2;
1278 		sv += m;
1279 	} else {
1280 		/* Initialization. This is the first response received. */
1281 		sa = (m << 3);
1282 		sv = (m << 1);
1283 	}
1284 
1285 	crtt = (sa >> 3) + sv;
1286 
1287 	if (debug & D_PROBE) {
1288 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
1289 		    "%d\n", saved_m, sa, sv, crtt);
1290 	}
1291 
1292 	*sap = sa;
1293 	*svp = sv;
1294 
1295 	/*
1296 	 * CRTT = average estimates  + 4 * deviation estimates
1297 	 *	= sa / 8 + sv
1298 	 */
1299 	return (crtt);
1300 }
1301 
1302 static void
1303 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
1304 {
1305 	struct phyint_instance *pii = tg->tg_phyint_inst;
1306 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1307 	int sa = tg->tg_rtt_sa;
1308 	int sv = tg->tg_rtt_sd;
1309 	int new_crtt;
1310 	int i;
1311 
1312 	if (debug & D_PROBE)
1313 		logdebug("pi_set_crtt: target -  m %d\n", m);
1314 
1315 	/* store the round trip time, in case we need to defer computation */
1316 	tg->tg_deferred[tg->tg_num_deferred] = m;
1317 
1318 	new_crtt = compute_crtt(&sa, &sv, m);
1319 
1320 	/*
1321 	 * If this probe's round trip time would singlehandedly cause an
1322 	 * increase in the group's probe interval consider it suspect.
1323 	 */
1324 	if ((new_crtt > probe_interval) && is_probe_uni) {
1325 		if (debug & D_PROBE) {
1326 			logdebug("Received a suspect probe on %s, new_crtt ="
1327 			    " %d, probe_interval = %d, num_deferred = %d\n",
1328 			    pii->pii_probe_logint->li_name, new_crtt,
1329 			    probe_interval, tg->tg_num_deferred);
1330 		}
1331 
1332 		/*
1333 		 * If we've deferred as many rtts as we plan on deferring, then
1334 		 * assume the link really did slow down and process all queued
1335 		 * rtts
1336 		 */
1337 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
1338 			if (debug & D_PROBE) {
1339 				logdebug("Received MAXDEFERREDRTT probes which "
1340 				    "would cause an increased probe_interval.  "
1341 				    "Integrating queued rtt data points.\n");
1342 			}
1343 
1344 			for (i = 0; i <= tg->tg_num_deferred; i++) {
1345 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
1346 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
1347 			}
1348 
1349 			tg->tg_num_deferred = 0;
1350 		} else {
1351 			tg->tg_num_deferred++;
1352 		}
1353 		return;
1354 	}
1355 
1356 	/*
1357 	 * If this is a normal probe, or an RTT probe that would lead to a
1358 	 * reduced CRTT, then update our CRTT data.  Further, if this was
1359 	 * a normal probe, pitch any deferred probes since our probes are
1360 	 * again being answered within our CRTT estimates.
1361 	 */
1362 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
1363 		tg->tg_rtt_sa = sa;
1364 		tg->tg_rtt_sd = sv;
1365 		tg->tg_crtt = new_crtt;
1366 		if (is_probe_uni)
1367 			tg->tg_num_deferred = 0;
1368 	}
1369 }
1370 
/*
 * Search the ancillary data of `msg' for an IPPROTO_IPV6 level object of
 * type `cmsg_type'.  Returns a pointer to the data of the first such object,
 * or NULL if there is none.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_type == cmsg_type &&
		    hdr->cmsg_level == IPPROTO_IPV6) {
			return (CMSG_DATA(hdr));
		}
		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
1389 
1390 /*
1391  * See if a previously failed interface has started working again.
1392  */
1393 void
1394 phyint_check_for_repair(struct phyint *pi)
1395 {
1396 	if (phyint_repaired(pi)) {
1397 		if (pi->pi_group == phyint_anongroup) {
1398 			logerr("NIC repair detected on %s\n", pi->pi_name);
1399 		} else {
1400 			logerr("NIC repair detected on %s of group %s\n",
1401 			    pi->pi_name, pi->pi_group->pg_name);
1402 		}
1403 
1404 		/*
1405 		 * If the interface is offline, just clear the FAILED flag,
1406 		 * delaying the state change and failback operation until it
1407 		 * is brought back online.
1408 		 */
1409 		if (pi->pi_state == PI_OFFLINE) {
1410 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1411 			return;
1412 		}
1413 
1414 		if (pi->pi_flags & IFF_STANDBY) {
1415 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1416 		} else {
1417 			if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) {
1418 				(void) change_lif_flags(pi,
1419 				    IFF_FAILED, _B_FALSE);
1420 				/* Per state diagram */
1421 				pi->pi_empty = 0;
1422 			}
1423 		}
1424 
1425 		phyint_chstate(pi, PI_RUNNING);
1426 
1427 		if (GROUP_FAILED(pi->pi_group)) {
1428 			/*
1429 			 * This is the 1st phyint to receive a response
1430 			 * after group failure.
1431 			 */
1432 			logerr("At least 1 interface (%s) of group %s has "
1433 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
1434 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
1435 			/*
1436 			 * If this is the STANDBY phyint to be repaired after a
1437 			 * group failure. Move data addresses on other failed
1438 			 * phyints in the group to this one.
1439 			 */
1440 			if (pi->pi_flags & IFF_STANDBY) {
1441 				struct phyint *fpi = pi->pi_group->pg_phyint;
1442 				for (; fpi != NULL; fpi = fpi->pi_pgnext) {
1443 					if (fpi != pi) {
1444 						(void) try_failover(fpi,
1445 						    FAILOVER_NORMAL);
1446 					}
1447 				}
1448 			}
1449 		}
1450 	}
1451 }
1452 
1453 /*
1454  * See if a previously functioning interface has failed, or if the
1455  * whole group of interfaces has failed.
1456  */
1457 static void
1458 phyint_inst_check_for_failure(struct phyint_instance *pii)
1459 {
1460 	struct	phyint	*pi;
1461 	struct	phyint	*pi2;
1462 
1463 	pi = pii->pii_phyint;
1464 
1465 	switch (failure_state(pii)) {
1466 	case PHYINT_FAILURE:
1467 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1468 		if (pi->pi_group == phyint_anongroup) {
1469 			logerr("NIC failure detected on %s\n", pii->pii_name);
1470 		} else {
1471 			logerr("NIC failure detected on %s of group %s\n",
1472 			    pii->pii_name, pi->pi_group->pg_name);
1473 		}
1474 		/*
1475 		 * Do the failover, unless the interface is offline (in
1476 		 * which case we've already failed over).
1477 		 */
1478 		if (pi->pi_state != PI_OFFLINE) {
1479 			phyint_chstate(pi, PI_FAILED);
1480 			reset_crtt_all(pi);
1481 			if (!(pi->pi_flags & IFF_INACTIVE))
1482 				(void) try_failover(pi, FAILOVER_NORMAL);
1483 		}
1484 		break;
1485 
1486 	case GROUP_FAILURE:
1487 		logerr("All Interfaces in group %s have failed\n",
1488 		    pi->pi_group->pg_name);
1489 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
1490 		    pi2 = pi2->pi_pgnext) {
1491 			if (pi2->pi_flags & IFF_OFFLINE)
1492 				continue;
1493 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
1494 			reset_crtt_all(pi2);
1495 
1496 			/*
1497 			 * In the case of host targets, we
1498 			 * would have flushed the targets,
1499 			 * and gone to PI_NOTARGETS state.
1500 			 */
1501 			if (pi2->pi_state == PI_RUNNING)
1502 				phyint_chstate(pi2, PI_FAILED);
1503 
1504 			pi2->pi_empty = 0;
1505 			pi2->pi_full = 0;
1506 		}
1507 		break;
1508 
1509 	default:
1510 		break;
1511 	}
1512 }
1513 
1514 /*
1515  * Determines if any timeout event has occurred and returns the number of
1516  * milliseconds until the next timeout event for the phyint. Returns
1517  * TIMER_INFINITY for "never".
1518  */
1519 uint_t
1520 phyint_inst_timer(struct phyint_instance *pii)
1521 {
1522 	int 	pr_ndx;
1523 	uint_t	timeout;
1524 	struct	target	*cur_tg;
1525 	struct	probe_stats *pr_statp;
1526 	struct	phyint_instance *pii_other;
1527 	struct	phyint *pi;
1528 	int	valid_unack_count;
1529 	int	i;
1530 	int	interval;
1531 	uint_t	check_time;
1532 	uint_t	cur_time;
1533 	hrtime_t cur_hrtime;
1534 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
1535 
1536 	cur_time = getcurrenttime();
1537 
1538 	if (debug & D_TIMER) {
1539 		logdebug("phyint_inst_timer(%s %s)\n",
1540 		    AF_STR(pii->pii_af), pii->pii_name);
1541 	}
1542 
1543 	pii_other = phyint_inst_other(pii);
1544 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
1545 		/*
1546 		 * Check to see if we're here due to link up/down flapping; If
1547 		 * enough time has passed, then try to bring the interface
1548 		 * back up; otherwise, schedule a timer to bring it back up
1549 		 * when enough time *has* elapsed.
1550 		 */
1551 		pi = pii->pii_phyint;
1552 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
1553 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
1554 			if (check_time > cur_time)
1555 				return (check_time - cur_time);
1556 
1557 			phyint_check_for_repair(pi);
1558 		}
1559 	}
1560 
1561 	/*
1562 	 * If this phyint is not yet initialized for probes,
1563 	 * don't proceed further
1564 	 */
1565 	if (pii->pii_probe_sock == -1)
1566 		return (TIMER_INFINITY);
1567 
1568 	/*
1569 	 * If the timer has fired too soon, probably triggered
1570 	 * by some other phyint instance, return the remaining
1571 	 * time
1572 	 */
1573 	if (TIME_LT(cur_time, pii->pii_snxt_time))
1574 		return (pii->pii_snxt_time - cur_time);
1575 
1576 	/*
1577 	 * If the link is down, don't send any probes for now.
1578 	 */
1579 	if (LINK_DOWN(pii->pii_phyint))
1580 		return (TIMER_INFINITY);
1581 
1582 	/*
1583 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
1584 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
1585 	 * Base probe time is strictly periodic.
1586 	 */
1587 	interval = GET_RANDOM(
1588 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
1589 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
1590 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
1591 
1592 	/*
1593 	 * Check if the current time > next time to probe. If so, we missed
1594 	 * sending 1 or more probes, probably due to heavy system load. At least
1595 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
1596 	 * were scheduled. Make adjustments to the times, in multiples of
1597 	 * user_probe_interval.
1598 	 */
1599 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
1600 		int n;
1601 
1602 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
1603 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
1604 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
1605 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
1606 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
1607 		    pii->pii_snxt_basetime);
1608 
1609 		/* Collect statistics about missed probes */
1610 		probes_missed.pm_nprobes += n + 1;
1611 		probes_missed.pm_ntimes++;
1612 	}
1613 	pii->pii_snxt_basetime += user_probe_interval;
1614 	interval = pii->pii_snxt_time - cur_time;
1615 	if (debug & D_TARGET) {
1616 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
1617 		    " interval %u\n", cur_time, pii->pii_snxt_time,
1618 		    pii->pii_snxt_basetime, interval);
1619 	}
1620 
1621 	/*
1622 	 * If no targets are known, we need to send an ICMP multicast. The
1623 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
1624 	 * to see if we found a target.
1625 	 */
1626 	if (pii->pii_target_next == NULL) {
1627 		assert(pii->pii_ntargets == 0);
1628 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1629 		probe(pii, PROBE_MULTI, cur_time);
1630 		return (interval);
1631 	}
1632 
1633 	if ((user_probe_interval != probe_interval) &&
1634 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
1635 		/*
1636 		 * the failure detection (fd) probe timer has not yet fired.
1637 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
1638 		 */
1639 		probe(pii, PROBE_RTT, cur_time);
1640 		return (interval);
1641 	}
1642 	/*
1643 	 * the fd probe timer has fired. Need to do all failure
1644 	 * detection / recovery calculations, and then send an fd probe
1645 	 * of type PROBE_UNI.
1646 	 */
1647 	if (user_probe_interval == probe_interval) {
1648 		/*
1649 		 * We could have missed some probes, and then adjusted
1650 		 * pii_snxt_basetime above. Otherwise we could have
1651 		 * blindly added probe_interval to pii_fd_snxt_basetime.
1652 		 */
1653 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1654 	} else {
1655 		pii->pii_fd_snxt_basetime += probe_interval;
1656 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
1657 			int n;
1658 
1659 			n = (cur_time - pii->pii_fd_snxt_basetime) /
1660 			    probe_interval;
1661 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
1662 		}
1663 	}
1664 
1665 	/*
1666 	 * We can have at most, the latest 2 probes that we sent, in
1667 	 * the PR_UNACKED state. All previous probes sent, are either
1668 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
1669 	 * timed out if the probe's time_sent + the CRTT < currenttime.
1670 	 * For each of the last 2 probes, examine whether it has timed
1671 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
1672 	 */
1673 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
1674 	valid_unack_count = 0;
1675 
1676 	for (i = 0; i < 2; i++) {
1677 		pr_statp = &pii->pii_probes[pr_ndx];
1678 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
1679 		switch (pr_statp->pr_status) {
1680 		case PR_ACKED:
1681 			/*
1682 			 * We received back an ACK, so the switch clearly
1683 			 * is not dropping our traffic, and thus we can
1684 			 * enable failure detection immediately.
1685 			 */
1686 			if (pii->pii_fd_hrtime > gethrtime()) {
1687 				if (debug & D_PROBE) {
1688 					logdebug("successful probe on %s; "
1689 					    "ending quiet period\n",
1690 					    pii->pii_phyint->pi_name);
1691 				}
1692 				pii->pii_fd_hrtime = gethrtime();
1693 			}
1694 			break;
1695 
1696 		case PR_UNACKED:
1697 			assert(cur_tg != NULL);
1698 			/*
1699 			 * The crtt could be zero for some reason,
1700 			 * Eg. the phyint could be failed. If the crtt is
1701 			 * not available use group's probe interval,
1702 			 * which is a worst case estimate.
1703 			 */
1704 			if (cur_tg->tg_crtt != 0) {
1705 				timeout = pr_statp->pr_time_sent +
1706 				    cur_tg->tg_crtt;
1707 			} else {
1708 				timeout = pr_statp->pr_time_sent +
1709 				    probe_interval;
1710 			}
1711 			if (TIME_LT(timeout, cur_time)) {
1712 				pr_statp->pr_status = PR_LOST;
1713 				pr_statp->pr_time_lost = timeout;
1714 			} else if (i == 1) {
1715 				/*
1716 				 * We are forced to consider this probe
1717 				 * lost, as we can have at most 2 unack.
1718 				 * probes any time, and we will be sending a
1719 				 * probe at the end of this function.
1720 				 * Normally, we should not be here, but
1721 				 * this can happen if an incoming response
1722 				 * that was considered lost has increased
1723 				 * the crtt for this target, and also bumped
1724 				 * up the FDT. Note that we never cancel or
1725 				 * increase the current pii_time_left, so
1726 				 * when the timer fires, we find 2 valid
1727 				 * unacked probes, and they are yet to timeout
1728 				 */
1729 				pr_statp->pr_status = PR_LOST;
1730 				pr_statp->pr_time_lost = cur_time;
1731 			} else {
1732 				/*
1733 				 * Only the most recent probe can enter
1734 				 * this 'else' arm. The second most recent
1735 				 * probe must take either of the above arms,
1736 				 * if it is unacked.
1737 				 */
1738 				valid_unack_count++;
1739 			}
1740 			break;
1741 		}
1742 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
1743 	}
1744 
1745 	/*
1746 	 * We send out 1 probe randomly in the interval between one half
1747 	 * and one probe interval for the group. Given that the CRTT is always
1748 	 * less than the group's probe interval, we can have at most 1
1749 	 * unacknowledged probe now.  All previous probes are either lost or
1750 	 * acked.
1751 	 */
1752 	assert(valid_unack_count == 0 || valid_unack_count == 1);
1753 
1754 	/*
1755 	 * The timer has fired. Take appropriate action depending
1756 	 * on the current state of the phyint.
1757 	 *
1758 	 * PI_RUNNING state 	- Failure detection and failover
1759 	 * PI_FAILED state 	- Repair detection and failback
1760 	 */
1761 	switch (pii->pii_phyint->pi_state) {
1762 	case PI_FAILED:
1763 		/*
1764 		 * If the most recent probe (excluding unacked probes that
1765 		 * are yet to time out) has been acked, check whether the
1766 		 * phyint is now repaired. If the phyint is repaired, then
1767 		 * attempt failback, unless it is an inactive standby.
1768 		 */
1769 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
1770 			phyint_check_for_repair(pii->pii_phyint);
1771 		}
1772 		break;
1773 
1774 	case PI_RUNNING:
1775 		/*
1776 		 * It's possible our probes have been lost because of a
1777 		 * spanning-tree mandated quiet period on the switch.  If so,
1778 		 * ignore the lost probes and consider the interface to still
1779 		 * be functioning.
1780 		 */
1781 		cur_hrtime = gethrtime();
1782 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
1783 			break;
1784 
1785 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
1786 			/*
1787 			 * We have 1 or more failed probes (excluding unacked
1788 			 * probes that are yet to time out). Determine if the
1789 			 * phyint has failed. If so attempt a failover,
1790 			 * unless it is an inactive standby
1791 			 */
1792 			phyint_inst_check_for_failure(pii);
1793 		}
1794 		break;
1795 
1796 	default:
1797 		logerr("phyint_inst_timer: invalid state %d\n",
1798 		    pii->pii_phyint->pi_state);
1799 		abort();
1800 	}
1801 
1802 	/*
1803 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
1804 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
1805 	 * was called, the target list may be empty.
1806 	 */
1807 	if (pii->pii_target_next != NULL) {
1808 		probe(pii, PROBE_UNI, cur_time);
1809 		/*
1810 		 * If we have just the one probe target, and we're not using
1811 		 * router targets, try to find another as we presently have
1812 		 * no resilience.
1813 		 */
1814 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
1815 			probe(pii, PROBE_MULTI, cur_time);
1816 	} else {
1817 		probe(pii, PROBE_MULTI, cur_time);
1818 	}
1819 	return (interval);
1820 }
1821 
1822 /*
1823  * Start the probe timer for an interface instance.
1824  */
1825 void
1826 start_timer(struct phyint_instance *pii)
1827 {
1828 	uint32_t interval;
1829 
1830 	/*
1831 	 * Spread the base probe times (pi_snxt_basetime) across phyints
1832 	 * uniformly over the (curtime..curtime + the group's probe_interval).
1833 	 * pi_snxt_basetime is strictly periodic with a frequency of
1834 	 * the group's probe interval. The actual probe time pi_snxt_time
1835 	 * adds some randomness to pi_snxt_basetime and happens in probe().
1836 	 * For the 1st probe on each phyint after the timer is started,
1837 	 * pi_snxt_time and pi_snxt_basetime are the same.
1838 	 */
1839 	interval = GET_RANDOM(0,
1840 	    (int)pii->pii_phyint->pi_group->pg_probeint);
1841 
1842 	pii->pii_snxt_basetime = getcurrenttime() + interval;
1843 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
1844 	pii->pii_snxt_time = pii->pii_snxt_basetime;
1845 	timer_schedule(interval);
1846 }
1847 
1848 /*
1849  * Restart the probe timer on an interface instance.
1850  */
1851 static void
1852 restart_timer(struct phyint_instance *pii)
1853 {
1854 	/*
1855 	 * We don't need to restart the timer if it was never started in
1856 	 * the first place (pii->pii_basetime_inited not set), as the timer
1857 	 * won't have gone off yet.
1858 	 */
1859 	if (pii->pii_basetime_inited != 0) {
1860 
1861 		if (debug & D_LINKNOTE)
1862 			logdebug("restart timer: restarting timer on %s, "
1863 			    "address family %s\n", pii->pii_phyint->pi_name,
1864 			    AF_STR(pii->pii_af));
1865 
1866 		start_timer(pii);
1867 	}
1868 }
1869 
1870 static void
1871 process_link_state_down(struct phyint *pi)
1872 {
1873 	logerr("The link has gone down on %s\n", pi->pi_name);
1874 
1875 	/*
1876 	 * Clear the probe statistics arrays, we don't want the repair
1877 	 * detection logic relying on probes that were succesful prior
1878 	 *  to the link going down.
1879 	 */
1880 	if (PROBE_CAPABLE(pi->pi_v4))
1881 		clear_pii_probe_stats(pi->pi_v4);
1882 	if (PROBE_CAPABLE(pi->pi_v6))
1883 		clear_pii_probe_stats(pi->pi_v6);
1884 	/*
1885 	 * Check for interface failure.  Although we know the interface
1886 	 * has failed, we don't know if all the other interfaces in the
1887 	 * group have failed as well.
1888 	 */
1889 	if ((pi->pi_state == PI_RUNNING) ||
1890 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
1891 		if (debug & D_LINKNOTE) {
1892 			logdebug("process_link_state_down:"
1893 			    " checking for failure on %s\n", pi->pi_name);
1894 		}
1895 
1896 		if (pi->pi_v4 != NULL)
1897 			phyint_inst_check_for_failure(pi->pi_v4);
1898 		else if (pi->pi_v6 != NULL)
1899 			phyint_inst_check_for_failure(pi->pi_v6);
1900 	}
1901 }
1902 
1903 static void
1904 process_link_state_up(struct phyint *pi)
1905 {
1906 	logerr("The link has come up on %s\n", pi->pi_name);
1907 
1908 	/*
1909 	 * We stopped any running timers on each instance when the link
1910 	 * went down, so restart them.
1911 	 */
1912 	if (pi->pi_v4)
1913 		restart_timer(pi->pi_v4);
1914 	if (pi->pi_v6)
1915 		restart_timer(pi->pi_v6);
1916 
1917 	phyint_check_for_repair(pi);
1918 
1919 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
1920 	if (pi->pi_whendx == LINK_UP_PERMIN)
1921 		pi->pi_whendx = 0;
1922 }
1923 
1924 /*
1925  * Process any changes in link state passed up from the interfaces.
1926  */
1927 void
1928 process_link_state_changes(void)
1929 {
1930 	struct phyint *pi;
1931 
1932 	/* Look for interfaces where the link state has just changed */
1933 
1934 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
1935 		boolean_t old_link_state_up = LINK_UP(pi);
1936 
1937 		/*
1938 		 * Except when the "phyint" structure is created, this is
1939 		 * the only place the link state is updated.  This allows
1940 		 * this routine to detect changes in link state, rather
1941 		 * than just the current state.
1942 		 */
1943 		UPDATE_LINK_STATE(pi);
1944 
1945 		if (LINK_DOWN(pi)) {
1946 			/*
1947 			 * Has link just gone down?
1948 			 */
1949 			if (old_link_state_up)
1950 				process_link_state_down(pi);
1951 		} else {
1952 			/*
1953 			 * Has link just gone back up?
1954 			 */
1955 			if (!old_link_state_up)
1956 				process_link_state_up(pi);
1957 		}
1958 	}
1959 }
1960 
1961 void
1962 reset_crtt_all(struct phyint *pi)
1963 {
1964 	struct phyint_instance *pii;
1965 	struct target *tg;
1966 
1967 	pii = pi->pi_v4;
1968 	if (pii != NULL) {
1969 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1970 			tg->tg_crtt = 0;
1971 			tg->tg_rtt_sa = -1;
1972 			tg->tg_rtt_sd = 0;
1973 		}
1974 	}
1975 
1976 	pii = pi->pi_v6;
1977 	if (pii != NULL) {
1978 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1979 			tg->tg_crtt = 0;
1980 			tg->tg_rtt_sa = -1;
1981 			tg->tg_rtt_sd = 0;
1982 		}
1983 	}
1984 }
1985 
1986 /*
1987  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
1988  * probes on both instances IPv4 and IPv6.
1989  * If the interface has failed, return the time of the first probe failure
1990  * in "tff".
1991  */
1992 static int
1993 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
1994 {
1995 	uint_t	pi_tff;
1996 	struct	target *cur_tg;
1997 	struct	probe_fail_count pfinfo;
1998 	struct	phyint_instance *pii_other;
1999 	int	pr_ndx;
2000 
2001 	/*
2002 	 * Get the number of consecutive failed probes on
2003 	 * this phyint across all targets. Also get the number
2004 	 * of consecutive failed probes on this target only
2005 	 */
2006 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2007 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
2008 	probe_fail_info(pii, cur_tg, &pfinfo);
2009 
2010 	/* Get the time of first failure, for later use */
2011 	pi_tff = pfinfo.pf_tff;
2012 
2013 	/*
2014 	 * If the current target has not responded to the
2015 	 * last NUM_PROBE_FAILS probes, and other targets are
2016 	 * responding delete this target. Dead gateway detection
2017 	 * will eventually remove this target (if router) from the
2018 	 * routing tables. If that does not occur, we may end
2019 	 * up adding this to our list again.
2020 	 */
2021 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
2022 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
2023 		if (pii->pii_targets_are_routers) {
2024 			if (cur_tg->tg_status == TG_ACTIVE)
2025 				pii->pii_ntargets--;
2026 			cur_tg->tg_status = TG_DEAD;
2027 			cur_tg->tg_crtt = 0;
2028 			cur_tg->tg_rtt_sa = -1;
2029 			cur_tg->tg_rtt_sd = 0;
2030 			if (pii->pii_target_next == cur_tg)
2031 				pii->pii_target_next = target_next(cur_tg);
2032 		} else {
2033 			target_delete(cur_tg);
2034 			probe(pii, PROBE_MULTI, getcurrenttime());
2035 		}
2036 		return (PHYINT_OK);
2037 	}
2038 
2039 	/*
2040 	 * If the phyint has lost NUM_PROBE_FAILS or more
2041 	 * consecutive probes, on both IPv4 and IPv6 protocol
2042 	 * instances of the phyint, then trigger failure
2043 	 * detection, else return false
2044 	 */
2045 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
2046 		return (PHYINT_OK);
2047 
2048 	pii_other = phyint_inst_other(pii);
2049 	if (PROBE_CAPABLE(pii_other)) {
2050 		probe_fail_info(pii_other, NULL, &pfinfo);
2051 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
2052 			/*
2053 			 * We have NUM_PROBE_FAILS or more failures
2054 			 * on both IPv4 and IPv6. Get the earliest
2055 			 * time when failure was detected on this
2056 			 * phyint across IPv4 and IPv6.
2057 			 */
2058 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
2059 				pi_tff = pfinfo.pf_tff;
2060 		} else {
2061 			/*
2062 			 * This instance has < NUM_PROBE_FAILS failure.
2063 			 * So return false
2064 			 */
2065 			return (PHYINT_OK);
2066 		}
2067 	}
2068 	*tff = pi_tff;
2069 	return (PHYINT_FAILURE);
2070 }
2071 
2072 /*
2073  * Check if the link has gone down on this phyint, or it has failed the
2074  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
2075  * Also look at other phyints of this group, for group failures.
2076  */
2077 int
2078 failure_state(struct phyint_instance *pii)
2079 {
2080 	struct	probe_success_count psinfo;
2081 	uint_t	pi2_tls;		/* time last success */
2082 	uint_t	pi_tff;			/* time first fail */
2083 	struct	phyint	*pi2;
2084 	struct	phyint *pi;
2085 	struct	phyint_instance *pii2;
2086 	struct  phyint_group *pg;
2087 	boolean_t alone;
2088 
2089 	if (debug & D_FAILOVER)
2090 		logdebug("phyint_failed(%s)\n", pii->pii_name);
2091 
2092 	pi = pii->pii_phyint;
2093 	pg = pi->pi_group;
2094 
2095 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2096 		PHYINT_OK)
2097 		return (PHYINT_OK);
2098 
2099 	/*
2100 	 * At this point, the link is down, or the phyint is suspect,
2101 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
2102 	 * does not belong to any group, or is the only member of the
2103 	 * group capable of being probed, return PHYINT_FAILURE.
2104 	 */
2105 	alone = _B_TRUE;
2106 	if (pg != phyint_anongroup) {
2107 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2108 			if (pi2 == pi)
2109 				continue;
2110 			if (PROBE_CAPABLE(pi2->pi_v4) ||
2111 			    PROBE_CAPABLE(pi2->pi_v6)) {
2112 				alone = _B_FALSE;
2113 				break;
2114 			}
2115 		}
2116 	}
2117 	if (alone)
2118 		return (PHYINT_FAILURE);
2119 
2120 	/*
2121 	 * Need to compare against other phyints of the same group
2122 	 * to exclude group failures. If the failure was detected via
2123 	 * probing, then if the time of last success (tls) of any
2124 	 * phyint is more recent than the time of first fail (tff) of the
2125 	 * phyint in question, and the link is up on the phyint,
2126 	 * then it is a phyint failure. Otherwise it is a group failure.
2127 	 * If failure was detected via a link down notification sent from
2128 	 * the driver to IP, we see if any phyints in the group are still
2129 	 * running and haven't received a link down notification.  We
2130 	 * will usually be processing the link down notification shortly
2131 	 * after it was received, so there is no point looking at the tls
2132 	 * of other phyints.
2133 	 */
2134 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2135 		/* Exclude ourself from comparison */
2136 		if (pi2 == pi)
2137 			continue;
2138 
2139 		if (LINK_DOWN(pi)) {
2140 			/*
2141 			 * We use FLAGS_TO_LINK_STATE() to test the
2142 			 * flags directly, rather then LINK_UP() or
2143 			 * LINK_DOWN(), as we may not have got round
2144 			 * to processing the link state for the other
2145 			 * phyints in the group yet.
2146 			 *
2147 			 * The check for PI_RUNNING and group
2148 			 * failure handles the case when the
2149 			 * group begins to recover.  The first
2150 			 * phyint to recover should not trigger
2151 			 * a failover from the soon-to-recover
2152 			 * other phyints to the first recovered
2153 			 * phyint. PI_RUNNING will be set, and
2154 			 * pg_groupfailed cleared only after
2155 			 * receipt of NUM_PROBE_REPAIRS, by
2156 			 * which time the other phyints should
2157 			 * have received at least 1 packet,
2158 			 * and so will not have NUM_PROBE_FAILS.
2159 			 */
2160 			if ((pi2->pi_state == PI_RUNNING) &&
2161 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
2162 				return (PHYINT_FAILURE);
2163 		} else {
2164 			/*
2165 			 * Need to compare against both IPv4 and
2166 			 * IPv6 instances.
2167 			 */
2168 			pii2 = pi2->pi_v4;
2169 			if (pii2 != NULL) {
2170 				probe_success_info(pii2, NULL, &psinfo);
2171 				if (psinfo.ps_tls_valid) {
2172 					pi2_tls = psinfo.ps_tls;
2173 					/*
2174 					 * See comment above regarding check
2175 					 * for PI_RUNNING and group failure.
2176 					 */
2177 					if (TIME_GT(pi2_tls, pi_tff) &&
2178 					    (pi2->pi_state == PI_RUNNING) &&
2179 					    !GROUP_FAILED(pg) &&
2180 					    FLAGS_TO_LINK_STATE(pi2))
2181 						return (PHYINT_FAILURE);
2182 				}
2183 			}
2184 
2185 			pii2 = pi2->pi_v6;
2186 			if (pii2 != NULL) {
2187 				probe_success_info(pii2, NULL, &psinfo);
2188 				if (psinfo.ps_tls_valid) {
2189 					pi2_tls = psinfo.ps_tls;
2190 					/*
2191 					 * See comment above regarding check
2192 					 * for PI_RUNNING and group failure.
2193 					 */
2194 					if (TIME_GT(pi2_tls, pi_tff) &&
2195 					    (pi2->pi_state == PI_RUNNING) &&
2196 					    !GROUP_FAILED(pg) &&
2197 					    FLAGS_TO_LINK_STATE(pi2))
2198 						return (PHYINT_FAILURE);
2199 				}
2200 			}
2201 		}
2202 	}
2203 
2204 	/*
2205 	 * Change the group state to PG_FAILED if it's not already.
2206 	 */
2207 	if (!GROUP_FAILED(pg))
2208 		phyint_group_chstate(pg, PG_FAILED);
2209 
2210 	return (GROUP_FAILURE);
2211 }
2212 
2213 /*
2214  * Return the information associated with consecutive probe successes
2215  * starting with the most recent probe. At most the last 2 probes can be
2216  * in the unacknowledged state. All previous probes have either failed
2217  * or succeeded.
2218  */
2219 static void
2220 probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
2221     struct probe_success_count *psinfo)
2222 {
2223 	uint_t	i;
2224 	struct probe_stats *pr_statp;
2225 	uint_t most_recent;
2226 	uint_t second_most_recent;
2227 	boolean_t pi_found_failure = _B_FALSE;
2228 	boolean_t tg_found_failure = _B_FALSE;
2229 	uint_t now;
2230 	uint_t timeout;
2231 	struct target *tg;
2232 
2233 	if (debug & D_FAILOVER)
2234 		logdebug("probe_success_info(%s)\n", pii->pii_name);
2235 
2236 	bzero(psinfo, sizeof (*psinfo));
2237 	now = getcurrenttime();
2238 
2239 	/*
2240 	 * Start with the most recent probe, and count the number
2241 	 * of consecutive probe successes. Latch the number of successes
2242 	 * on hitting a failure.
2243 	 */
2244 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2245 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2246 
2247 	for (i = most_recent; i != pii->pii_probe_next;
2248 	    i = PROBE_INDEX_PREV(i)) {
2249 		pr_statp = &pii->pii_probes[i];
2250 
2251 		switch (pr_statp->pr_status) {
2252 		case PR_UNACKED:
2253 			/*
2254 			 * Only the most recent 2 probes can be unacknowledged
2255 			 */
2256 			assert(i == most_recent || i == second_most_recent);
2257 
2258 			tg = pr_statp->pr_target;
2259 			assert(tg != NULL);
2260 			/*
2261 			 * The crtt could be zero for some reason,
2262 			 * Eg. the phyint could be failed. If the crtt is
2263 			 * not available use the value of the group's probe
2264 			 * interval which is a worst case estimate.
2265 			 */
2266 			if (tg->tg_crtt != 0) {
2267 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2268 			} else {
2269 				timeout = pr_statp->pr_time_sent +
2270 				    pii->pii_phyint->pi_group->pg_probeint;
2271 			}
2272 
2273 			if (TIME_LT(timeout, now)) {
2274 				/*
2275 				 * We hit a failure. Latch the total number of
2276 				 * recent consecutive successes.
2277 				 */
2278 				pr_statp->pr_time_lost = timeout;
2279 				pr_statp->pr_status = PR_LOST;
2280 				pi_found_failure = _B_TRUE;
2281 				if (cur_tg != NULL && tg == cur_tg) {
2282 					/*
2283 					 * We hit a failure for the desired
2284 					 * target. Latch the number of recent
2285 					 * consecutive successes for this target
2286 					 */
2287 					tg_found_failure = _B_TRUE;
2288 				}
2289 			}
2290 			break;
2291 
2292 		case PR_ACKED:
2293 			/*
2294 			 * Bump up the count of probe successes, if we
2295 			 * have not seen any failure so far.
2296 			 */
2297 			if (!pi_found_failure)
2298 				psinfo->ps_nsucc++;
2299 
2300 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2301 			    !tg_found_failure) {
2302 				psinfo->ps_nsucc_tg++;
2303 			}
2304 
2305 			/*
2306 			 * Record the time of last success, if this is
2307 			 * the most recent probe success.
2308 			 */
2309 			if (!psinfo->ps_tls_valid) {
2310 				psinfo->ps_tls = pr_statp->pr_time_acked;
2311 				psinfo->ps_tls_valid = _B_TRUE;
2312 			}
2313 			break;
2314 
2315 		case PR_LOST:
2316 			/*
2317 			 * We hit a failure. Latch the total number of
2318 			 * recent consecutive successes.
2319 			 */
2320 			pi_found_failure = _B_TRUE;
2321 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2322 				/*
2323 				 * We hit a failure for the desired target.
2324 				 * Latch the number of recent consecutive
2325 				 * successes for this target
2326 				 */
2327 				tg_found_failure = _B_TRUE;
2328 			}
2329 			break;
2330 
2331 		default:
2332 			return;
2333 
2334 		}
2335 	}
2336 }
2337 
2338 /*
2339  * Return the information associated with consecutive probe failures
2340  * starting with the most recent probe. Only the last 2 probes can be in the
2341  * unacknowledged state. All previous probes have either failed or succeeded.
2342  */
2343 static void
2344 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
2345     struct probe_fail_count *pfinfo)
2346 {
2347 	int	i;
2348 	struct probe_stats *pr_statp;
2349 	boolean_t	tg_found_success = _B_FALSE;
2350 	boolean_t	pi_found_success = _B_FALSE;
2351 	int	most_recent;
2352 	int	second_most_recent;
2353 	uint_t	now;
2354 	uint_t	timeout;
2355 	struct	target *tg;
2356 
2357 	if (debug & D_FAILOVER)
2358 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
2359 
2360 	bzero(pfinfo, sizeof (*pfinfo));
2361 	now = getcurrenttime();
2362 
2363 	/*
2364 	 * Start with the most recent probe, and count the number
2365 	 * of consecutive probe failures. Latch the number of failures
2366 	 * on hitting a probe success.
2367 	 */
2368 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
2369 	second_most_recent = PROBE_INDEX_PREV(most_recent);
2370 
2371 	for (i = most_recent; i != pii->pii_probe_next;
2372 	    i = PROBE_INDEX_PREV(i)) {
2373 		pr_statp = &pii->pii_probes[i];
2374 
2375 		assert(PR_STATUS_VALID(pr_statp->pr_status));
2376 
2377 		switch (pr_statp->pr_status) {
2378 		case PR_UNACKED:
2379 			/*
2380 			 * Only the most recent 2 probes can be unacknowledged
2381 			 */
2382 			assert(i == most_recent || i == second_most_recent);
2383 
2384 			tg = pr_statp->pr_target;
2385 			/*
2386 			 * Target is guaranteed to exist in the unack. state
2387 			 */
2388 			assert(tg != NULL);
2389 			/*
2390 			 * The crtt could be zero for some reason,
2391 			 * Eg. the phyint could be failed. If the crtt is
2392 			 * not available use the group's probe interval,
2393 			 * which is a worst case estimate.
2394 			 */
2395 			if (tg->tg_crtt != 0) {
2396 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
2397 			} else {
2398 				timeout = pr_statp->pr_time_sent +
2399 				    pii->pii_phyint->pi_group->pg_probeint;
2400 			}
2401 
2402 			if (TIME_GT(timeout, now))
2403 				break;
2404 
2405 			pr_statp->pr_time_lost = timeout;
2406 			pr_statp->pr_status = PR_LOST;
2407 			/* FALLTHRU */
2408 
2409 		case PR_LOST:
2410 			if (!pi_found_success) {
2411 				pfinfo->pf_nfail++;
2412 				pfinfo->pf_tff = pr_statp->pr_time_lost;
2413 			}
2414 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
2415 			    !tg_found_success)  {
2416 				pfinfo->pf_nfail_tg++;
2417 			}
2418 			break;
2419 
2420 		default:
2421 			/*
2422 			 * We hit a success or unused slot. Latch the
2423 			 * total number of recent consecutive failures.
2424 			 */
2425 			pi_found_success = _B_TRUE;
2426 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
2427 				/*
2428 				 * We hit a success for the desired target.
2429 				 * Latch the number of recent consecutive
2430 				 * failures for this target
2431 				 */
2432 				tg_found_success = _B_TRUE;
2433 			}
2434 		}
2435 	}
2436 }
2437 
2438 /*
2439  * Check if the phyint has been repaired.  If no test address has been
2440  * configured, then consider the interface repaired if the link is up (unless
2441  * the link is flapping; see below).  Otherwise, look for proof of probes
2442  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
2443  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2444  */
2445 static boolean_t
2446 phyint_repaired(struct phyint *pi)
2447 {
2448 	struct	probe_success_count psinfo;
2449 	struct	phyint_instance *pii;
2450 	struct	target *cur_tg;
2451 	int	pr_ndx;
2452 	uint_t	cur_time;
2453 
2454 	if (debug & D_FAILOVER)
2455 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
2456 
2457 	if (LINK_DOWN(pi))
2458 		return (_B_FALSE);
2459 
2460 	/*
2461 	 * If we don't have any test addresses and the link is up, then
2462 	 * consider the interface repaired, unless we've received more than
2463 	 * LINK_UP_PERMIN link up notifications in the last minute, in
2464 	 * which case we keep the link down until we drop back below
2465 	 * the threshold.
2466 	 */
2467 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
2468 		cur_time = getcurrenttime();
2469 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
2470 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
2471 			pi->pi_lfmsg_printed = 0;
2472 			return (_B_TRUE);
2473 		}
2474 		if (!pi->pi_lfmsg_printed) {
2475 			logerr("The link has come up on %s more than %d times "
2476 			    "in the last minute; disabling failback until it "
2477 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
2478 			pi->pi_lfmsg_printed = 1;
2479 		}
2480 
2481 		return (_B_FALSE);
2482 	}
2483 
2484 	pii = pi->pi_v4;
2485 	if (PROBE_CAPABLE(pii)) {
2486 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2487 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2488 		probe_success_info(pii, cur_tg, &psinfo);
2489 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2490 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2491 			return (_B_TRUE);
2492 	}
2493 
2494 	pii = pi->pi_v6;
2495 	if (PROBE_CAPABLE(pii)) {
2496 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
2497 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
2498 		probe_success_info(pii, cur_tg, &psinfo);
2499 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
2500 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
2501 			return (_B_TRUE);
2502 	}
2503 
2504 	return (_B_FALSE);
2505 }
2506 
2507 /*
2508  * Try failover from phyint 'pi' to a suitable destination.
2509  */
2510 int
2511 try_failover(struct phyint *pi, int failover_type)
2512 {
2513 	struct phyint *dst;
2514 	int err;
2515 
2516 	if (debug & D_FAILOVER)
2517 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
2518 
2519 	/*
2520 	 * Attempt to find a failover destination 'dst'.
2521 	 * dst will be null if any of the following is true
2522 	 * Phyint is not part of a group  OR
2523 	 * Phyint is the only member of a group OR
2524 	 * No suitable failover dst was available
2525 	 */
2526 	dst = get_failover_dst(pi, failover_type);
2527 	if (dst == NULL)
2528 		return (IPMP_EMINRED);
2529 
2530 	dst->pi_empty = 0;			/* Per state diagram */
2531 	pi->pi_full = 0;			/* Per state diagram */
2532 
2533 	err = failover(pi, dst);
2534 
2535 	if (debug & D_FAILOVER) {
2536 		logdebug("failed over from %s to %s ret %d\n",
2537 		    pi->pi_name, dst->pi_name, err);
2538 	}
2539 	if (err == 0) {
2540 		pi->pi_empty = 1;		/* Per state diagram */
2541 		/*
2542 		 * we don't want to print out this message if a
2543 		 * phyint is leaving the group, nor for failover from
2544 		 * standby
2545 		 */
2546 		if (failover_type == FAILOVER_NORMAL) {
2547 			logerr("Successfully failed over from NIC %s to NIC "
2548 			    "%s\n", pi->pi_name, dst->pi_name);
2549 		}
2550 		return (0);
2551 	} else {
2552 		/*
2553 		 * The failover did not succeed. We must retry the failover
2554 		 * only after resyncing our state based on the kernel's.
2555 		 * For eg. either the src or the dst might have been unplumbed
2556 		 * causing this failure. initifs() will be called again,
2557 		 * from main, since full_scan_required has been set to true
2558 		 * by failover();
2559 		 */
2560 		return (IPMP_FAILURE);
2561 	}
2562 }
2563 
/*
 * global_errno captures the errno value, if failover() or failback()
 * fails. This is sent to if_mpadm(1M).
 *
 * Both functions store errno here right after their ioctl returns a
 * negative value, before anything else gets a chance to overwrite errno.
 * It is not reset on success, so it is only meaningful after a failure.
 */
int global_errno;
2569 
2570 /*
2571  * Attempt failover from phyint 'from' to phyint 'to'.
2572  * IP moves everything from phyint 'from' to phyint 'to'.
2573  */
2574 static int
2575 failover(struct phyint *from, struct phyint *to)
2576 {
2577 	struct	lifreq	lifr;
2578 	int 	ret;
2579 
2580 	if (debug & D_FAILOVER) {
2581 		logdebug("failing over from %s to %s\n",
2582 		    from->pi_name, to->pi_name);
2583 	}
2584 
2585 	/*
2586 	 * Perform the failover. Both IPv4 and IPv6 are failed over
2587 	 * using a single ioctl by passing in AF_UNSPEC family.
2588 	 */
2589 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2590 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2591 	lifr.lifr_movetoindex = to->pi_ifindex;
2592 
2593 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
2594 	if (ret < 0) {
2595 		global_errno = errno;
2596 		logperror("failover: ioctl (failover)");
2597 	}
2598 
2599 	/*
2600 	 * Set full_scan_required to true. This will make us read
2601 	 * the state from the kernel in initifs() and update our tables,
2602 	 * to reflect the current state after the failover. If the
2603 	 * failover has failed it will then reissue the failover.
2604 	 */
2605 	full_scan_required = _B_TRUE;
2606 	return (ret);
2607 }
2608 
2609 /*
2610  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
2611  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
2612  * Return values:
2613  * IPMP_SUCCESS:		Failback successful from each of the other
2614  *				phyints in the group.
2615  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
2616  *				phyints in the group.
2617  * IPMP_FAILURE:		Failback syscall failed with some error.
2618  *
2619  * Note that failback is attempted regardless of the setting of the
2620  * failback_enabled flag.
2621  */
2622 int
2623 do_failback(struct phyint *pi, boolean_t check_only)
2624 {
2625 	struct  phyint *from;
2626 	boolean_t done;
2627 	boolean_t partial;
2628 	boolean_t attempted_failback = _B_FALSE;
2629 
2630 	if (debug & D_FAILOVER)
2631 		logdebug("do_failback(%s)\n", pi->pi_name);
2632 
2633 	/* If this phyint is not part of a named group, return. */
2634 	if (pi->pi_group == phyint_anongroup) {
2635 		pi->pi_full = 1;
2636 		return (IPMP_SUCCESS);
2637 	}
2638 
2639 	/*
2640 	 * Attempt failback from every phyint in the group to 'pi'.
2641 	 * The reason for doing this, instead of only from the
2642 	 * phyint to which we did the failover is given below.
2643 	 *
2644 	 * After 'pi' failed, if any app. tries to join on a multicast
2645 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
2646 	 * non-failed phyint in the group, instead of the failed phyint,
2647 	 * in.mpathd is not aware of this. Thus failing back only from the
2648 	 * interface to which 'pi' failed over, will failback the ipif's
2649 	 * but not the ilm's. So we need to failback from all members of
2650 	 * the phyint group
2651 	 */
2652 	done = _B_TRUE;
2653 	partial = _B_FALSE;
2654 	for (from = pi->pi_group->pg_phyint; from != NULL;
2655 	    from = from->pi_pgnext) {
2656 		/* Exclude ourself as a failback src */
2657 		if (from == pi)
2658 			continue;
2659 
2660 		/*
2661 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
2662 		 * phyint must also have IPv4 plumbed. Similar check
2663 		 * for IPv6. IP makes the same check. Otherwise the
2664 		 * failback will fail.
2665 		 */
2666 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
2667 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
2668 			partial = _B_TRUE;
2669 			continue;
2670 		}
2671 
2672 		if (!check_only) {
2673 			pi->pi_empty = 0;	/* Per state diagram */
2674 			attempted_failback = _B_TRUE;
2675 			if (failback(from, pi) != 0) {
2676 				done = _B_FALSE;
2677 				break;
2678 			}
2679 		}
2680 	}
2681 
2682 	if (check_only) {
2683 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2684 	}
2685 
2686 	/*
2687 	 * We are done. No more phyint from which we can src the failback
2688 	 */
2689 	if (done) {
2690 		if (!partial)
2691 			pi->pi_full = 1;	/* Per state diagram */
2692 		/*
2693 		 * Don't print out a message unless there is a
2694 		 * transition from FAILED to RUNNING. For eg.
2695 		 * we don't want to print out this message if a
2696 		 * phyint is leaving the group, or at startup
2697 		 */
2698 		if (attempted_failback && (pi->pi_flags &
2699 		    (IFF_FAILED | IFF_OFFLINE))) {
2700 			logerr("Successfully failed back to NIC %s\n",
2701 			    pi->pi_name);
2702 		}
2703 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
2704 	}
2705 
2706 	return (IPMP_FAILURE);
2707 }
2708 
2709 /*
2710  * This function is similar to do_failback() above, but respects the
2711  * failback_enabled flag for phyints in named groups.
2712  */
2713 int
2714 try_failback(struct phyint *pi, boolean_t check_only)
2715 {
2716 	if (debug & D_FAILOVER)
2717 		logdebug("try_failback(%s)\n", pi->pi_name);
2718 
2719 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
2720 		return (IPMP_EFBDISABLED);
2721 
2722 	return (do_failback(pi, check_only));
2723 }
2724 
2725 /*
2726  * Failback everything from phyint 'from' that has the same ifindex
2727  * as phyint to's ifindex.
2728  */
2729 static int
2730 failback(struct phyint *from, struct phyint *to)
2731 {
2732 	struct lifreq lifr;
2733 	int ret;
2734 
2735 	if (debug & D_FAILOVER)
2736 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
2737 
2738 	lifr.lifr_addr.ss_family = AF_UNSPEC;
2739 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
2740 	lifr.lifr_movetoindex = to->pi_ifindex;
2741 
2742 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
2743 	if (ret < 0) {
2744 		global_errno = errno;
2745 		logperror("failback: ioctl (failback)");
2746 	}
2747 
2748 	/*
2749 	 * Set full_scan_required to true. This will make us read
2750 	 * the state from the kernel in initifs() and update our tables,
2751 	 * to reflect the current state after the failback. If the
2752 	 * failback has failed it will then reissue the failback.
2753 	 */
2754 	full_scan_required = _B_TRUE;
2755 
2756 	return (ret);
2757 }
2758 
2759 /*
2760  * Select a target phyint for failing over from 'pi'.
2761  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
2762  * target phyint is chosen as follows,
2763  *	1. Pick any inactive standby interface.
2764  *	2. If no inactive standby is available, select any phyint in the
2765  *	   same group that has the least number of logints, (excluding
2766  *	   IFF_NOFAILOVER and !IFF_UP logints)
2767  * If we are failing over from a standby, failover_type is
2768  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
2769  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
2770  * and we won't return NULL, as long as there is at least 1 other phyint
2771  * in the group.
2772  */
2773 static struct phyint *
2774 get_failover_dst(struct phyint *pi, int failover_type)
2775 {
2776 	struct phyint	*maybe = NULL;
2777 	struct phyint	*pi2;
2778 	struct phyint 	*last_choice = NULL;
2779 
2780 	if (pi->pi_group == phyint_anongroup)
2781 		return (NULL);
2782 
2783 	/*
2784 	 * Loop thru the phyints in the group, and pick the preferred
2785 	 * phyint for the target.
2786 	 */
2787 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
2788 		/* Exclude ourself and offlined interfaces */
2789 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
2790 			continue;
2791 
2792 		/*
2793 		 * The chosen target phyint must have IPv4 instance
2794 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
2795 		 * for IPv6.
2796 		 */
2797 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
2798 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
2799 			continue;
2800 
2801 		/* The chosen target must be PI_RUNNING. */
2802 		if (pi2->pi_state != PI_RUNNING) {
2803 			last_choice = pi2;
2804 			continue;
2805 		}
2806 
2807 		if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
2808 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
2809 			return (pi2);
2810 		} else {
2811 			if (maybe == NULL)
2812 				maybe = pi2;
2813 			else if (logint_upcount(pi2) < logint_upcount(maybe))
2814 				maybe = pi2;
2815 		}
2816 	}
2817 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
2818 		return (last_choice);
2819 	else
2820 		return (maybe);
2821 }
2822 
2823 /*
2824  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
2825  */
2826 boolean_t
2827 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
2828 {
2829 	int ifsock;
2830 	struct lifreq lifr;
2831 
2832 	if (debug & D_FAILOVER) {
2833 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
2834 		    pi->pi_name, flags, (int)setfl);
2835 	}
2836 
2837 	if (pi->pi_v4 != NULL) {
2838 		ifsock = ifsock_v4;
2839 	} else  {
2840 		ifsock = ifsock_v6;
2841 	}
2842 
2843 	/*
2844 	 * Get the current flags from the kernel, and set/clear the
2845 	 * desired phyint flags. Since we set only phyint flags, we can
2846 	 * do it on either IPv4 or IPv6 instance.
2847 	 */
2848 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
2849 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
2850 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
2851 		if (errno != ENXIO)
2852 			logperror("change_lif_flags: ioctl (get flags)");
2853 		return (_B_FALSE);
2854 	}
2855 	if (setfl)
2856 		lifr.lifr_flags |= flags;
2857 	else
2858 		lifr.lifr_flags &= ~flags;
2859 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
2860 		if (errno != ENXIO)
2861 			logperror("change_lif_flags: ioctl (set flags)");
2862 		return (_B_FALSE);
2863 	}
2864 
2865 	/*
2866 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
2867 	 * phyint flags.
2868 	 */
2869 	if (setfl)
2870 		pi->pi_flags |= flags;
2871 	else
2872 		pi->pi_flags &= ~flags;
2873 
2874 	if (pi->pi_v4)
2875 		pi->pi_v4->pii_flags = pi->pi_flags;
2876 
2877 	if (pi->pi_v6)
2878 		pi->pi_v6->pii_flags = pi->pi_flags;
2879 
2880 	return (_B_TRUE);
2881 }
2882 
/*
 * icmp cksum computation for IPv4.
 *
 * Computes the 16-bit one's complement of the one's complement sum of the
 * 'len' bytes starting at 'addr'. The data is read as host-order 16-bit
 * words; a trailing odd byte is treated as the first byte of a word whose
 * second byte is zero. A 'len' of zero or less sums nothing and yields
 * 0xffff.
 *
 * The accumulator is an unsigned 32-bit integer so that the additions and
 * shifts below are always well defined. A signed accumulator would
 * overflow, and then be right-shifted while negative, once the sum of the
 * words exceeded INT_MAX. The result is exact as long as the sum of the
 * words fits in 32 bits, which holds for any buffer of up to 128 Kbytes.
 *
 * Returns the checksum in the low 16 bits of the (non-negative) result.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	int		nleft = len;
	ushort_t	*w = addr;
	ushort_t	answer;
	ushort_t	odd_byte = 0;
	uint32_t	sum = 0;

	/*
	 *  Our algorithm is simple, using a 32 bit accumulator (sum),
	 *  we add sequential 16 bit words to it, and at the end, fold
	 *  back all the carry bits from the top 16 bits into the lower
	 *  16 bits.
	 */
	while (nleft > 1)  {
		sum += *w++;
		nleft -= 2;
	}

	/* mop up an odd byte, if necessary */
	if (nleft == 1) {
		*(uchar_t *)(&odd_byte) = *(uchar_t *)w;
		sum += odd_byte;
	}

	/*
	 * add back carry outs from top 16 bits to low 16 bits
	 */
	sum = (sum >> 16) + (sum & 0xffff);	/* add hi 16 to low 16 */
	sum += (sum >> 16);			/* add carry */
	answer = (ushort_t)~sum;		/* truncate to 16 bits */
	return (answer);
}
2920 
2921 static void
2922 reset_snxt_basetimes(void)
2923 {
2924 	struct phyint_instance *pii;
2925 
2926 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2927 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
2928 	}
2929 }
2930 
2931 /*
2932  * Is the address one of our own addresses? Unfortunately,
2933  * we cannot check our phyint tables to determine if the address
2934  * is our own. This is because, we don't track interfaces that
2935  * are not part of any group. We have to either use a 'bind' or
2936  * get the complete list of all interfaces using SIOCGLIFCONF,
2937  * to do this check. We could also use SIOCTMYADDR.
2938  * Bind fails for the local zone address, so we might include local zone
2939  * address as target address. If local zone address is a target address
2940  * and it is up, it is not possible to detect the interface failure.
2941  * SIOCTMYADDR also doesn't consider local zone address as own address.
2942  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
2943  * are stored in laddr_list.
2944  */
2945 
2946 boolean_t
2947 own_address(struct in6_addr addr)
2948 {
2949 	struct local_addr *taddr = laddr_list;
2950 
2951 	for (; taddr != NULL; taddr = taddr->next) {
2952 		if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) {
2953 			return (_B_TRUE);
2954 		}
2955 	}
2956 	return (_B_FALSE);
2957 }
2958