/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1987 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include "mpd_defs.h"
#include "mpd_tables.h"

/*
 * Probe types for probe().  The chosen value is carried in the
 * pr_icmp_mtype field of the probe packet and is used by in_data() and
 * in6_data() to select the handler for the reply.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */

/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. The packet format is the same for both IPv4 and IPv6.
 * probe() stores the sequence number, timestamp and message type in
 * network byte order.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification (pii_icmpid) */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint32_t pr_icmp_timestamp;	/* Time the probe was sent */
	uint32_t pr_icmp_mtype;		/* Message type, one of PROBE_* */
};

/* ff02::1, the destination of IPv6 PROBE_MULTI probes */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* 224.0.0.1, the destination of IPv4 PROBE_MULTI probes */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

static hrtime_t last_fdt_bumpup_time;	/* When FDT was bumped up last */

/*
 * Forward declarations of the file-local functions.
 */
static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
static void		pi_set_crtt(struct target *tg, int m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static int		failover(struct phyint *from, struct phyint *to);
static int		failback(struct phyint *from, struct phyint *to);
static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
 *
 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
 * of the link (according to the driver).  If the phyint is also configured
 * with a test address (the common case) and probe targets, then a phyint must
 * also successfully be able to send and receive probes in order to remain in
 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not perform a failover, but reports the interface
 * as IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state.  When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state -  PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_groupfailed  - Group failure, all interfaces in the group have failed.
 *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts address, to
 *	reconstruct the target list. So the phyints are in the PI_NOTARGETS
 *	state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: No failovers have been done to this phyint, from
 *		other phyints. This phyint is inactive. Phyint can be a Standby.
 *		When failback has been disabled (FAILBACK=no configured),
 *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
 *		is set when phyint subsequently recovers after a failure.
 *
 * pi_empty
 *	This phyint has failed over successfully to another phyint, and
 *	this phyint is currently "empty". It does not host any addresses or
 *	multicast membership etc. This is the state of a phyint after a
 *	failover from the phyint has completed successfully and no subsequent
 *	'failover to' or 'failback to' has occurred on the phyint.
155 * IP guarantees that no new logicals will be hosted nor any multicast 156 * joins permitted on the phyint, since the phyint is either failed or 157 * inactive. pi_empty is set implies the phyint is either failed or 158 * inactive. 159 * 160 * pi_full 161 * The phyint hosts all of its own addresses that it "owns". If the 162 * phyint was previously failed or inactive, failbacks to the phyint 163 * has completed successfully. i.e. No more failbacks to this phyint 164 * can produce any change in system state whatsoever. 165 * 166 * Not all 32 possible combinations of the above 5-tuple are possible. 167 * Furthermore some of the above combinations are transient. They may occur 168 * only because the failover or failback did not complete successfully. The 169 * failover/failback will be retried and eventually a stable state will be 170 * reached. 171 * 172 * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. 173 * The following are the state machines. 'from' and 'to' are the src and 174 * dst of the failover/failback, below 175 * 176 * pi_empty state machine 177 * --------------------------------------------------------------------------- 178 * Event State -> New State 179 * --------------------------------------------------------------------------- 180 * successful completion from.pi_empty = 0 -> from.pi_empty = 1 181 * of failover 182 * 183 * Initiate failover to.pi_empty = X -> to.pi_empty = 0 184 * 185 * Initiate failback to.pi_empty = X -> to.pi_empty = 0 186 * 187 * group failure pi_empty = X -> pi_empty = 0 188 * --------------------------------------------------------------------------- 189 * 190 * pi_full state machine 191 * --------------------------------------------------------------------------- 192 * Event State -> New State 193 * --------------------------------------------------------------------------- 194 * successful completion to.pi_full = 0 -> to.pi_full = 1 195 * of failback from 196 * each of the other phyints 197 * 198 
* Initiate failover from.pi_full = X -> from.pi_full = 0 199 * 200 * group failure pi_full = X -> pi_full = 0 201 * --------------------------------------------------------------------------- 202 * 203 * pi_state state machine 204 * --------------------------------------------------------------------------- 205 * Event State New State 206 * Action: 207 * --------------------------------------------------------------------------- 208 * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 209 * detection : set IFF_FAILED on this phyint 210 * : failover from this phyint to another 211 * 212 * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 213 * detection : set IFF_FAILED on this phyint 214 * 215 * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) 216 * detection -> (PI_RUNNING, I == 0) 217 * : to.pi_empty = 0 218 * : clear IFF_FAILED on this phyint 219 * : failback to this phyint if enabled 220 * 221 * NIC repair (PI_FAILED, I == 0, FAILBACK=no) 222 * detection -> (PI_RUNNING, I == 1) 223 * : to.pi_empty = 0 224 * : clear IFF_FAILED on this phyint 225 * : if failback is disabled set I == 1 226 * 227 * Group failure (perform on all phyints in the group) 228 * detection PI_RUNNING PI_FAILED 229 * (Router targets) : set IFF_FAILED 230 * : clear pi_empty and pi_full 231 * 232 * Group failure (perform on all phyints in the group) 233 * detection PI_RUNNING PI_NOTARGETS 234 * (Host targets) : set IFF_FAILED 235 * : clear pi_empty and pi_full 236 * : delete the target list on all phyints 237 * --------------------------------------------------------------------------- 238 * 239 * I state machine 240 * --------------------------------------------------------------------------- 241 * Event State Action: 242 * --------------------------------------------------------------------------- 243 * Turn on I pi_empty == 0, STANDBY : failover from standby 244 * 245 * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 246 * pi_full == 0 : failback to this if enabled 247 * 
 * ---------------------------------------------------------------------------
 *
 * Assertions: (Read '==>' as implies)
 *
 * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
 * (pi_empty == 1) ==> (pi_full == 0)
 * (pi_full == 1)  ==> (pi_empty == 0)
 *
 * Invariants
 *
 * pg_groupfailed = 0  &&
 *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
 *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
 *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
 *
 * 1. says that an inactive standby, that is not empty, has to be failed
 * over. For a standby to be truly inactive, it should not host any
 * addresses. So we move them to some other phyint. Usually we catch the
 * turn on of IFF_INACTIVE, and perform this action. However if the failover
 * did not complete successfully, then subsequently we have lost the edge
 * trigger, and this invariant kicks in and completes the action.
 *
 * 2. says that any failed phyint that is not empty must be failed over.
 * Usually we do the failover when we detect NIC failure. However if the
 * failover does not complete successfully, this invariant kicks in and
 * completes the failover. We exclude inactive standby which is covered by 1.
 *
 * 3. says that any running phyint that is not full must be failed back.
 * Usually we do the failback when we detect NIC repair. However if the
 * failback does not complete successfully, this invariant kicks in and
 * completes the failback. Note that we don't want to failback to an inactive
 * standby.
 *
 * The invariants 1 - 3 and the actions are in initifs().
 */

struct probes_missed probes_missed;

/*
 * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
 * will be added on by the kernel.  The id field identifies this phyint,
 * and the sequence number is an increasing (modulo 2^16) integer. The data
 * portion holds the time value when the packet is sent. On echo this is
 * extracted to compute the round-trip time. Three different types of
 * probe packets are used.
 *
 * PROBE_UNI: This type is used to do failure detection / failure recovery
 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *	not less than the current CRTT. pii_probes[] stores data
 *	about these probes. These packets consume sequence number space.
 *
 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
 *	are not used. Under heavy network load, the rtt may go up very high,
 *	due to a spike, or may appear to go high, due to extreme scheduling
 *	delays. Once the network stress is removed, mpathd takes a long time
 *	to recover, because the probe_interval is already high, and it takes
 *	a long time to send out sufficient number of probes to bring down the
 *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
 *	user_probe_interval ms. and will cause only rtt updates. These packets
 *	do not consume sequence number space nor is information about these
 *	packets stored in the pii_probes[]
 *
 * PROBE_MULTI: This type is only used to construct a list of targets, when
 *	no targets are known. The packet is multicast to the all hosts addr.
311 */ 312 static void 313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 314 { 315 struct pr_icmp probe_pkt; /* Probe packet */ 316 struct sockaddr_in6 whereto6; /* target address IPv6 */ 317 struct sockaddr_in whereto; /* target address IPv4 */ 318 int pr_ndx; /* probe index in pii->pii_probes[] */ 319 boolean_t sent = _B_TRUE; 320 321 if (debug & D_TARGET) { 322 logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 323 pii->pii_name, probe_type, cur_time); 324 } 325 326 assert(pii->pii_probe_sock != -1); 327 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 328 probe_type == PROBE_RTT); 329 330 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 331 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 332 probe_pkt.pr_icmp_code = 0; 333 probe_pkt.pr_icmp_cksum = 0; 334 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 335 336 /* 337 * Since there is no need to do arithmetic on the icmpid, 338 * (only equality check is done) pii_icmpid is stored in 339 * network byte order at initialization itself. 340 */ 341 probe_pkt.pr_icmp_id = pii->pii_icmpid; 342 probe_pkt.pr_icmp_timestamp = htonl(cur_time); 343 probe_pkt.pr_icmp_mtype = htonl(probe_type); 344 345 /* 346 * If probe_type is PROBE_MULTI, this packet will be multicast to 347 * the all hosts address. Otherwise it is unicast to the next target. 
348 */ 349 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 350 pii->pii_rtt_target_next != NULL)); 351 352 if (pii->pii_af == AF_INET6) { 353 bzero(&whereto6, sizeof (whereto6)); 354 whereto6.sin6_family = AF_INET6; 355 if (probe_type == PROBE_MULTI) { 356 whereto6.sin6_addr = all_nodes_mcast_v6; 357 } else if (probe_type == PROBE_UNI) { 358 whereto6.sin6_addr = pii->pii_target_next->tg_address; 359 } else { 360 /* type is PROBE_RTT */ 361 whereto6.sin6_addr = 362 pii->pii_rtt_target_next->tg_address; 363 } 364 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 365 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 366 sizeof (whereto6)) != sizeof (probe_pkt)) { 367 logperror_pii(pii, "probe: probe sendto"); 368 sent = _B_FALSE; 369 } 370 } else { 371 bzero(&whereto, sizeof (whereto)); 372 whereto.sin_family = AF_INET; 373 if (probe_type == PROBE_MULTI) { 374 whereto.sin_addr = all_nodes_mcast_v4; 375 } else if (probe_type == PROBE_UNI) { 376 IN6_V4MAPPED_TO_INADDR( 377 &pii->pii_target_next->tg_address, 378 &whereto.sin_addr); 379 } else { 380 /* type is PROBE_RTT */ 381 IN6_V4MAPPED_TO_INADDR( 382 &pii->pii_rtt_target_next->tg_address, 383 &whereto.sin_addr); 384 } 385 386 /* 387 * Compute the IPv4 icmp checksum. Does not cover the IP header. 388 */ 389 probe_pkt.pr_icmp_cksum = 390 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 391 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 392 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 393 sizeof (whereto)) != sizeof (probe_pkt)) { 394 logperror_pii(pii, "probe: probe sendto"); 395 sent = _B_FALSE; 396 } 397 } 398 399 /* 400 * If this is a PROBE_UNI probe packet being unicast to a target, then 401 * update our tables. We will need this info in processing the probe 402 * response. PROBE_MULTI and PROBE_RTT packets are not used for 403 * the purpose of failure or recovery detection. PROBE_MULTI packets 404 * are only used to construct a list of targets. 
PROBE_RTT packets are 405 * used only for updating the rtt and not for failure detection. 406 */ 407 if (probe_type == PROBE_UNI && sent) { 408 pr_ndx = pii->pii_probe_next; 409 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 410 411 /* Collect statistics, before we reuse the last slot. */ 412 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 413 pii->pii_cum_stats.lost++; 414 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 415 pii->pii_cum_stats.acked++; 416 pii->pii_cum_stats.sent++; 417 418 pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 419 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 420 pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 421 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 422 pii->pii_target_next = target_next(pii->pii_target_next); 423 assert(pii->pii_target_next != NULL); 424 /* 425 * If we have a single variable to denote the next target to 426 * probe for both rtt probes and failure detection probes, we 427 * could end up with a situation where the failure detection 428 * probe targets become disjoint from the rtt probe targets. 429 * Eg. if 2 targets and the actual fdt is double the user 430 * specified fdt. So we have 2 variables. In this scheme 431 * we also reset pii_rtt_target_next for every fdt probe, 432 * though that may not be necessary. 433 */ 434 pii->pii_rtt_target_next = pii->pii_target_next; 435 pii->pii_snxt++; 436 } else if (probe_type == PROBE_RTT) { 437 pii->pii_rtt_target_next = 438 target_next(pii->pii_rtt_target_next); 439 assert(pii->pii_rtt_target_next != NULL); 440 } 441 } 442 443 /* 444 * Incoming IPv4 data from wire, is received here. Called from main. 
445 */ 446 void 447 in_data(struct phyint_instance *pii) 448 { 449 struct sockaddr_in from; 450 struct in6_addr fromaddr; 451 uint_t fromlen; 452 static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 453 struct ip *ip; 454 int iphlen; 455 int len; 456 char abuf[INET_ADDRSTRLEN]; 457 struct pr_icmp *reply; 458 459 if (debug & D_PROBE) { 460 logdebug("in_data(%s %s)\n", 461 AF_STR(pii->pii_af), pii->pii_name); 462 } 463 464 /* 465 * Poll has already told us that a message is waiting, 466 * on this socket. Read it now. We should not block. 467 */ 468 fromlen = sizeof (from); 469 len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 470 sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 471 if (len < 0) { 472 logperror_pii(pii, "in_data: recvfrom"); 473 return; 474 } 475 476 /* 477 * If the NIC has indicated the link is down, don't go 478 * any further. 479 */ 480 if (LINK_DOWN(pii->pii_phyint)) 481 return; 482 483 /* Get the printable address for error reporting */ 484 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 485 486 /* Make sure packet contains at least minimum ICMP header */ 487 ip = (struct ip *)in_packet; 488 iphlen = ip->ip_hl << 2; 489 if (len < iphlen + ICMP_MINLEN) { 490 if (debug & D_PKTBAD) { 491 logdebug("in_data: packet too short (%d bytes)" 492 " from %s\n", len, abuf); 493 } 494 return; 495 } 496 497 /* 498 * Subtract the IP hdr length, 'len' will be length of the probe 499 * reply, starting from the icmp hdr. 500 */ 501 len -= iphlen; 502 /* LINTED */ 503 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 504 505 /* Probe replies are icmp echo replies. Ignore anything else */ 506 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 507 return; 508 509 /* 510 * The icmp id should match what we sent, which is stored 511 * in pi_icmpid. The icmp code for reply must be 0. 
512 * The reply content must be a struct pr_icmp 513 */ 514 if (reply->pr_icmp_id != pii->pii_icmpid) { 515 /* Not in response to our probe */ 516 return; 517 } 518 519 if (reply->pr_icmp_code != 0) { 520 logtrace("probe reply code %d from %s on %s\n", 521 reply->pr_icmp_code, abuf, pii->pii_name); 522 return; 523 } 524 525 if (len < sizeof (struct pr_icmp)) { 526 logtrace("probe reply too short: %d bytes from %s on %s\n", 527 len, abuf, pii->pii_name); 528 return; 529 } 530 531 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 532 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 533 /* Unicast probe reply */ 534 incoming_echo_reply(pii, reply, fromaddr); 535 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 536 /* Multicast reply */ 537 incoming_mcast_reply(pii, reply, fromaddr); 538 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 539 incoming_rtt_reply(pii, reply, fromaddr); 540 } else { 541 /* Probably not in response to our probe */ 542 logtrace("probe reply type: %d from %s on %s\n", 543 reply->pr_icmp_mtype, abuf, pii->pii_name); 544 return; 545 } 546 547 } 548 549 /* 550 * Incoming IPv6 data from wire is received here. Called from main. 
551 */ 552 void 553 in6_data(struct phyint_instance *pii) 554 { 555 struct sockaddr_in6 from; 556 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 557 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 558 int len; 559 char abuf[INET6_ADDRSTRLEN]; 560 struct msghdr msg; 561 struct iovec iov; 562 uchar_t *opt; 563 struct pr_icmp *reply; 564 565 if (debug & D_PROBE) { 566 logdebug("in6_data(%s %s)\n", 567 AF_STR(pii->pii_af), pii->pii_name); 568 } 569 570 iov.iov_base = (char *)in_packet; 571 iov.iov_len = sizeof (in_packet); 572 msg.msg_iov = &iov; 573 msg.msg_iovlen = 1; 574 msg.msg_name = (struct sockaddr *)&from; 575 msg.msg_namelen = sizeof (from); 576 msg.msg_control = ancillary_data; 577 msg.msg_controllen = sizeof (ancillary_data); 578 579 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 580 logperror_pii(pii, "in6_data: recvfrom"); 581 return; 582 } 583 584 /* 585 * If the NIC has indicated that the link is down, don't go 586 * any further. 587 */ 588 if (LINK_DOWN(pii->pii_phyint)) 589 return; 590 591 /* Get the printable address for error reporting */ 592 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 593 if (len < ICMP_MINLEN) { 594 if (debug & D_PKTBAD) { 595 logdebug("Truncated message: msg_flags 0x%x from %s\n", 596 msg.msg_flags, abuf); 597 } 598 return; 599 } 600 /* Ignore packets > 64k or control buffers that don't fit */ 601 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 602 if (debug & D_PKTBAD) { 603 logdebug("Truncated message: msg_flags 0x%x from %s\n", 604 msg.msg_flags, abuf); 605 } 606 return; 607 } 608 609 reply = (struct pr_icmp *)in_packet; 610 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 611 return; 612 613 if (reply->pr_icmp_id != pii->pii_icmpid) { 614 /* Not in response to our probe */ 615 return; 616 } 617 618 /* 619 * The kernel has already verified the the ICMP checksum. 
620 */ 621 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 622 logtrace("ICMPv6 echo reply source address not linklocal from " 623 "%s on %s\n", abuf, pii->pii_name); 624 return; 625 } 626 opt = find_ancillary(&msg, IPV6_RTHDR); 627 if (opt != NULL) { 628 /* Can't allow routing headers in probe replies */ 629 logtrace("message with routing header from %s on %s\n", 630 abuf, pii->pii_name); 631 return; 632 } 633 if (reply->pr_icmp_code != 0) { 634 logtrace("probe reply code: %d from %s on %s\n", 635 reply->pr_icmp_code, abuf, pii->pii_name); 636 return; 637 } 638 if (len < (sizeof (struct pr_icmp))) { 639 logtrace("probe reply too short: %d bytes from %s on %s\n", 640 len, abuf, pii->pii_name); 641 return; 642 } 643 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 644 incoming_echo_reply(pii, reply, from.sin6_addr); 645 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 646 incoming_mcast_reply(pii, reply, from.sin6_addr); 647 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 648 incoming_rtt_reply(pii, reply, from.sin6_addr); 649 } else { 650 /* Probably not in response to our probe */ 651 logtrace("probe reply type: %d from %s on %s\n", 652 reply->pr_icmp_mtype, abuf, pii->pii_name); 653 } 654 } 655 656 /* 657 * Process the incoming rtt reply, in response to our rtt probe. 658 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 659 * have any stored information about the probe we sent. So we don't log 660 * any errors if we receive bad replies. 
661 */ 662 static void 663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 664 struct in6_addr fromaddr) 665 { 666 int m; /* rtt measurment in ms */ 667 uint32_t cur_time; /* in ms from some arbitrary point */ 668 char abuf[INET6_ADDRSTRLEN]; 669 struct target *target; 670 uint32_t pr_icmp_timestamp; 671 struct phyint_group *pg; 672 673 /* Get the printable address for error reporting */ 674 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 675 676 if (debug & D_PROBE) { 677 logdebug("incoming_rtt_reply: %s %s %s\n", 678 AF_STR(pii->pii_af), pii->pii_name, abuf); 679 } 680 681 /* Do we know this target ? */ 682 target = target_lookup(pii, fromaddr); 683 if (target == NULL) 684 return; 685 686 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 687 cur_time = getcurrenttime(); 688 m = (int)(cur_time - pr_icmp_timestamp); 689 690 /* Invalid rtt. It has wrapped around */ 691 if (m < 0) 692 return; 693 694 /* 695 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 696 * The initial few responses after the interface is repaired may 697 * contain high rtt's because they could have been queued up waiting 698 * for ARP/NDP resolution on a failed interface. 699 */ 700 pg = pii->pii_phyint->pi_group; 701 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 702 return; 703 704 /* 705 * Update rtt only if the new rtt is lower than the current rtt. 706 * (specified by the 3rd parameter to pi_set_crtt). 707 * If a spike has caused the current probe_interval to be > 708 * user_probe_interval, then this mechanism is used to bring down 709 * the rtt rapidly once the network stress is removed. 710 * If the new rtt is higher than the current rtt, we don't want to 711 * update the rtt. We are having more than 1 outstanding probe and 712 * the increase in rtt we are seeing is being unnecessarily weighted 713 * many times. The regular rtt update will be handled by 714 * incoming_echo_reply() and will take care of any rtt increase. 
715 */ 716 pi_set_crtt(target, m, _B_FALSE); 717 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 718 (user_failure_detection_time < pg->pg_fdt) && 719 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 720 /* 721 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 722 * investigate if we can improve the failure detection time to 723 * meet whatever the user specified. 724 */ 725 if (check_pg_crtt_improved(pg)) { 726 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 727 user_failure_detection_time); 728 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 729 if (pii->pii_phyint->pi_group != phyint_anongroup) { 730 logerr("Improved failure detection time %d ms " 731 "on (%s %s) for group \"%s\"\n", 732 pg->pg_fdt, AF_STR(pii->pii_af), 733 pii->pii_name, 734 pii->pii_phyint->pi_group->pg_name); 735 } 736 if (user_failure_detection_time == pg->pg_fdt) { 737 /* Avoid any truncation or rounding errors */ 738 pg->pg_probeint = user_probe_interval; 739 /* 740 * No more rtt probes will be sent. The actual 741 * fdt has dropped to the user specified value. 742 * pii_fd_snxt_basetime and pii_snxt_basetime 743 * will be in sync henceforth. 744 */ 745 reset_snxt_basetimes(); 746 } 747 } 748 } 749 } 750 751 /* 752 * Process the incoming echo reply, in response to our unicast probe. 
753 * Common for both IPv4 and IPv6 754 */ 755 static void 756 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 757 struct in6_addr fromaddr) 758 { 759 int m; /* rtt measurment in ms */ 760 uint32_t cur_time; /* in ms from some arbitrary point */ 761 char abuf[INET6_ADDRSTRLEN]; 762 int pr_ndx; 763 struct target *target; 764 boolean_t exception; 765 uint32_t pr_icmp_timestamp; 766 uint16_t pr_icmp_seq; 767 struct phyint_group *pg = pii->pii_phyint->pi_group; 768 769 /* Get the printable address for error reporting */ 770 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 771 772 if (debug & D_PROBE) { 773 logdebug("incoming_echo_reply: %s %s %s seq %u\n", 774 AF_STR(pii->pii_af), pii->pii_name, abuf, 775 ntohs(reply->pr_icmp_seq)); 776 } 777 778 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 779 pr_icmp_seq = ntohs(reply->pr_icmp_seq); 780 781 /* Reject out of window probe replies */ 782 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 783 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 784 logtrace("out of window probe seq %u snxt %u on %s from %s\n", 785 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 786 pii->pii_cum_stats.unknown++; 787 return; 788 } 789 cur_time = getcurrenttime(); 790 m = (int)(cur_time - pr_icmp_timestamp); 791 if (m < 0) { 792 /* 793 * This is a ridiculously high value of rtt. rtt has wrapped 794 * around. Log a message, and ignore the rtt. 795 */ 796 logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " 797 "timestamp %u\n", cur_time, pr_icmp_timestamp); 798 } 799 800 /* 801 * Get the probe index pr_ndx corresponding to the received icmp seq. 802 * number in our pii->pii_probes[] array. 
The icmp sequence number 803 * pii_snxt corresponds to the probe index pii->pii_probe_next 804 */ 805 pr_ndx = MOD_SUB(pii->pii_probe_next, 806 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 807 808 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 809 810 target = pii->pii_probes[pr_ndx].pr_target; 811 812 /* 813 * Perform sanity checks, whether this probe reply that we 814 * have received is genuine 815 */ 816 if (target != NULL) { 817 /* 818 * Compare the src. addr of the received ICMP or ICMPv6 819 * probe reply with the target address in our tables. 820 */ 821 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 822 /* 823 * We don't have any record of having sent a probe to 824 * this target. This is a fake probe reply. Log an error 825 */ 826 logtrace("probe status %d Fake probe reply seq %u " 827 "snxt %u on %s from %s\n", 828 pii->pii_probes[pr_ndx].pr_status, 829 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 830 pii->pii_cum_stats.unknown++; 831 return; 832 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 833 /* 834 * The address matches, but our tables indicate that 835 * this probe reply has been acked already. So this 836 * is a duplicate probe reply. Log an error 837 */ 838 logtrace("probe status %d Duplicate probe reply seq %u " 839 "snxt %u on %s from %s\n", 840 pii->pii_probes[pr_ndx].pr_status, 841 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 842 pii->pii_cum_stats.unknown++; 843 return; 844 } 845 } else { 846 /* 847 * Target must not be NULL in the PR_UNACKED state 848 */ 849 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 850 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 851 /* 852 * The probe stats slot is unused. So we didn't 853 * send out any probe to this target. This is a fake. 854 * Log an error. 
855 */ 856 logtrace("probe status %d Fake probe reply seq %u " 857 "snxt %u on %s from %s\n", 858 pii->pii_probes[pr_ndx].pr_status, 859 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 860 } 861 pii->pii_cum_stats.unknown++; 862 return; 863 } 864 865 /* 866 * If the rtt does not appear to be right, don't update the 867 * rtt stats. This can happen if the system dropped into the 868 * debugger, or the system was hung or too busy for a 869 * substantial time that we didn't get a chance to run. 870 */ 871 if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { 872 /* 873 * If the probe corresponding to this receieved response 874 * was truly sent 'm' ms. ago, then this response must 875 * have been rejected by the sequence number checks. The 876 * fact that it has passed the sequence number checks 877 * means that the measured rtt is wrong. We were probably 878 * scheduled long after the packet was received. 879 */ 880 goto out; 881 } 882 883 /* 884 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 885 * The initial few responses after the interface is repaired may 886 * contain high rtt's because they could have been queued up waiting 887 * for ARP/NDP resolution on a failed interface. 888 */ 889 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 890 goto out; 891 892 /* 893 * Don't update the Conservative Round Trip Time estimate for this 894 * (phint, target) pair if this is the not the highest ack seq seen 895 * thus far on this target. 896 */ 897 if (!highest_ack_tg(pr_icmp_seq, target)) 898 goto out; 899 900 /* 901 * Always update the rtt. This is a failure detection probe 902 * and we want to measure both increase / decrease in rtt. 903 */ 904 pi_set_crtt(target, m, _B_TRUE); 905 906 /* 907 * If the crtt exceeds the average time between probes, 908 * investigate if this slow target is an exception. If so we 909 * can avoid this target and still meet the failure detection 910 * time. Otherwise we can't meet the failure detection time. 
911 */ 912 if (target->tg_crtt > pg->pg_probeint) { 913 exception = check_exception_target(pii, target); 914 if (exception) { 915 /* 916 * This target is exceptionally slow. Don't use it 917 * for future probes. check_exception_target() has 918 * made sure that we have at least MIN_PROBE_TARGETS 919 * other active targets 920 */ 921 if (pii->pii_targets_are_routers) { 922 /* 923 * This is a slow router, mark it as slow 924 * and don't use it for further probes. We 925 * don't delete it, since it will be populated 926 * again when we do a router scan. Hence we 927 * need to maintain extra state (unlike the 928 * host case below). Mark it as TG_SLOW. 929 */ 930 if (target->tg_status == TG_ACTIVE) 931 pii->pii_ntargets--; 932 target->tg_status = TG_SLOW; 933 target->tg_latime = gethrtime(); 934 target->tg_rtt_sa = -1; 935 target->tg_crtt = 0; 936 target->tg_rtt_sd = 0; 937 if (pii->pii_target_next == target) { 938 pii->pii_target_next = 939 target_next(target); 940 } 941 } else { 942 /* 943 * the slow target is not a router, we can 944 * just delete it. Send an icmp multicast and 945 * pick the fastest responder that is not 946 * already an active target. target_delete() 947 * adjusts pii->pii_target_next 948 */ 949 target_delete(target); 950 probe(pii, PROBE_MULTI, cur_time); 951 } 952 } else { 953 /* 954 * We can't meet the failure detection time. 955 * Log a message, and update the detection time to 956 * whatever we can achieve. 
957 */ 958 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 959 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 960 last_fdt_bumpup_time = gethrtime(); 961 if (pg != phyint_anongroup) { 962 logerr("Cannot meet requested failure detection" 963 " time of %d ms on (%s %s) new failure" 964 " detection time for group \"%s\" is %d" 965 " ms\n", user_failure_detection_time, 966 AF_STR(pii->pii_af), pii->pii_name, 967 pg->pg_name, pg->pg_fdt); 968 } 969 } 970 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 971 (user_failure_detection_time < pg->pg_fdt) && 972 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 973 /* 974 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 975 * investigate if we can improve the failure detection time to 976 * meet whatever the user specified. 977 */ 978 if (check_pg_crtt_improved(pg)) { 979 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 980 user_failure_detection_time); 981 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 982 if (pg != phyint_anongroup) { 983 logerr("Improved failure detection time %d ms " 984 "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 985 AF_STR(pii->pii_af), pii->pii_name, 986 pg->pg_name); 987 } 988 if (user_failure_detection_time == pg->pg_fdt) { 989 /* Avoid any truncation or rounding errors */ 990 pg->pg_probeint = user_probe_interval; 991 /* 992 * No more rtt probes will be sent. The actual 993 * fdt has dropped to the user specified value. 994 * pii_fd_snxt_basetime and pii_snxt_basetime 995 * will be in sync henceforth. 996 */ 997 reset_snxt_basetimes(); 998 } 999 } 1000 } 1001 out: 1002 pii->pii_probes[pr_ndx].pr_status = PR_ACKED; 1003 pii->pii_probes[pr_ndx].pr_time_acked = cur_time; 1004 1005 /* 1006 * Update pii->pii_rack, i.e. the sequence number of the last received 1007 * probe response, based on the echo reply we have received now, if 1008 * either of the following conditions are satisfied. 1009 * a. 
pii_rack is outside the current receive window of 1010 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 1011 * This means we have not received probe responses for a 1012 * long time, and the sequence number has wrapped around. 1013 * b. pii_rack is within the current receive window and this echo 1014 * reply corresponds to the highest sequence number we have seen 1015 * so far. 1016 */ 1017 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 1018 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 1019 SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 1020 pii->pii_rack = pr_icmp_seq; 1021 } 1022 } 1023 1024 /* 1025 * Returns true if seq is the highest unacknowledged seq for target tg 1026 * else returns false 1027 */ 1028 static boolean_t 1029 highest_ack_tg(uint16_t seq, struct target *tg) 1030 { 1031 struct phyint_instance *pii; 1032 int pr_ndx; 1033 uint16_t pr_seq; 1034 1035 pii = tg->tg_phyint_inst; 1036 1037 /* 1038 * Get the seq number of the most recent probe sent so far, 1039 * and also get the corresponding probe index in the probe stats 1040 * array. 1041 */ 1042 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1043 pr_seq = pii->pii_snxt; 1044 pr_seq--; 1045 1046 /* 1047 * Start from the most recent probe and walk back, trying to find 1048 * an acked probe corresponding to target tg. 1049 */ 1050 for (; pr_ndx != pii->pii_probe_next; 1051 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 1052 if (pii->pii_probes[pr_ndx].pr_target == tg && 1053 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 1054 if (SEQ_GT(pr_seq, seq)) 1055 return (_B_FALSE); 1056 } 1057 } 1058 return (_B_TRUE); 1059 } 1060 1061 /* 1062 * Check whether the crtt for the group has improved by a factor of 1063 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1064 * detection time flapping in the face of small crtt changes. 
1065 */ 1066 static boolean_t 1067 check_pg_crtt_improved(struct phyint_group *pg) 1068 { 1069 struct phyint *pi; 1070 1071 if (debug & D_PROBE) 1072 logdebug("check_pg_crtt_improved()\n"); 1073 1074 /* 1075 * The crtt for the group is only improved if each phyint_instance 1076 * for both ipv4 and ipv6 is improved. 1077 */ 1078 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1079 if (!check_pii_crtt_improved(pi->pi_v4) || 1080 !check_pii_crtt_improved(pi->pi_v6)) 1081 return (_B_FALSE); 1082 } 1083 1084 return (_B_TRUE); 1085 } 1086 1087 /* 1088 * Check whether the crtt has improved substantially on this phyint_instance. 1089 * Returns _B_TRUE if there's no crtt information available, because pii 1090 * is NULL or the phyint_instance is not capable of probing. 1091 */ 1092 boolean_t 1093 check_pii_crtt_improved(struct phyint_instance *pii) { 1094 struct target *tg; 1095 1096 if (pii == NULL) 1097 return (_B_TRUE); 1098 1099 if (!PROBE_CAPABLE(pii) || 1100 pii->pii_phyint->pi_state == PI_FAILED) 1101 return (_B_TRUE); 1102 1103 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1104 if (tg->tg_status != TG_ACTIVE) 1105 continue; 1106 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1107 LOWER_FDT_TRIGGER)) { 1108 return (_B_FALSE); 1109 } 1110 } 1111 1112 return (_B_TRUE); 1113 } 1114 1115 /* 1116 * This target responds very slowly to probes. The target's crtt exceeds 1117 * the probe interval of its group. 
Compare against other targets 1118 * and determine if this target is an exception, if so return true, else false 1119 */ 1120 static boolean_t 1121 check_exception_target(struct phyint_instance *pii, struct target *target) 1122 { 1123 struct target *tg; 1124 char abuf[INET6_ADDRSTRLEN]; 1125 1126 if (debug & D_PROBE) { 1127 logdebug("check_exception_target(%s %s target %s)\n", 1128 AF_STR(pii->pii_af), pii->pii_name, 1129 pr_addr(pii->pii_af, target->tg_address, 1130 abuf, sizeof (abuf))); 1131 } 1132 1133 /* 1134 * We should have at least MIN_PROBE_TARGETS + 1 good targets now, 1135 * to make a good judgement. Otherwise don't drop this target. 1136 */ 1137 if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) 1138 return (_B_FALSE); 1139 1140 /* 1141 * Determine whether only this particular target is slow. 1142 * We know that this target's crtt exceeds the group's probe interval. 1143 * If all other active targets have a 1144 * crtt < (this group's probe interval) / EXCEPTION_FACTOR, 1145 * then this target is considered slow. 1146 */ 1147 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1148 if (tg != target && tg->tg_status == TG_ACTIVE) { 1149 if (tg->tg_crtt > 1150 pii->pii_phyint->pi_group->pg_probeint / 1151 EXCEPTION_FACTOR) { 1152 return (_B_FALSE); 1153 } 1154 } 1155 } 1156 1157 return (_B_TRUE); 1158 } 1159 1160 /* 1161 * Update the target list. The icmp all hosts multicast has given us 1162 * some host to which we can send probes. If we already have sufficient 1163 * targets, discard it. 
1164 */ 1165 static void 1166 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1167 struct in6_addr fromaddr) 1168 /* ARGSUSED */ 1169 { 1170 int af; 1171 char abuf[INET6_ADDRSTRLEN]; 1172 struct phyint *pi; 1173 1174 if (debug & D_PROBE) { 1175 logdebug("incoming_mcast_reply(%s %s %s)\n", 1176 AF_STR(pii->pii_af), pii->pii_name, 1177 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1178 } 1179 1180 /* 1181 * Using host targets is a fallback mechanism. If we have 1182 * found a router, don't add this host target. If we already 1183 * know MAX_PROBE_TARGETS, don't add another target. 1184 */ 1185 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1186 if (pii->pii_targets != NULL) { 1187 if (pii->pii_targets_are_routers || 1188 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1189 return; 1190 } 1191 } 1192 1193 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1194 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1195 /* 1196 * Guard against response from 0.0.0.0 1197 * and ::. Log a trace message 1198 */ 1199 logtrace("probe response from %s on %s\n", 1200 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1201 pii->pii_name); 1202 return; 1203 } 1204 1205 /* 1206 * This address is one of our own, so reject this address as a 1207 * valid probe target. 1208 */ 1209 af = pii->pii_af; 1210 if (own_address(fromaddr)) 1211 return; 1212 1213 /* 1214 * If the phyint is part a named group, then add the address to all 1215 * members of the group. Otherwise, add the address only to the 1216 * phyint itself, since other phyints in the anongroup may not be on 1217 * the same subnet. 
1218 */ 1219 pi = pii->pii_phyint; 1220 if (pi->pi_group == phyint_anongroup) { 1221 target_add(pii, fromaddr, _B_FALSE); 1222 } else { 1223 pi = pi->pi_group->pg_phyint; 1224 for (; pi != NULL; pi = pi->pi_pgnext) 1225 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1226 } 1227 } 1228 1229 /* 1230 * Compute CRTT given an existing scaled average, scaled deviation estimate 1231 * and a new rtt time. The formula is from Jacobson and Karels' 1232 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1233 * are the same as those in Appendix A.2 of that paper. 1234 * 1235 * m = new measurement 1236 * sa = scaled RTT average (8 * average estimates) 1237 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1238 * crtt = Conservative round trip time. Used to determine whether probe 1239 * has timed out. 1240 * 1241 * New scaled average and deviation are passed back via sap and svp 1242 */ 1243 static int 1244 compute_crtt(int *sap, int *svp, int m) 1245 { 1246 int sa = *sap; 1247 int sv = *svp; 1248 int crtt; 1249 int saved_m = m; 1250 1251 assert(*sap >= -1); 1252 assert(*svp >= 0); 1253 1254 if (sa != -1) { 1255 /* 1256 * Update average estimator: 1257 * new rtt = old rtt + 1/8 Error 1258 * where Error = m - old rtt 1259 * i.e. 8 * new rtt = 8 * old rtt + Error 1260 * i.e. new sa = old sa + Error 1261 */ 1262 m -= sa >> 3; /* m is now Error in estimate. */ 1263 if ((sa += m) < 0) { 1264 /* Don't allow the smoothed average to be negative. */ 1265 sa = 0; 1266 } 1267 1268 /* 1269 * Update deviation estimator: 1270 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1271 * i.e. 4 * new mdev = 4 * old mdev + 1272 * (abs(Error) - old mdev) 1273 * i.e. new sv = old sv + (abs(Error) - old mdev) 1274 */ 1275 if (m < 0) 1276 m = -m; 1277 m -= sv >> 2; 1278 sv += m; 1279 } else { 1280 /* Initialization. This is the first response received. 
*/ 1281 sa = (m << 3); 1282 sv = (m << 1); 1283 } 1284 1285 crtt = (sa >> 3) + sv; 1286 1287 if (debug & D_PROBE) { 1288 logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " 1289 "%d\n", saved_m, sa, sv, crtt); 1290 } 1291 1292 *sap = sa; 1293 *svp = sv; 1294 1295 /* 1296 * CRTT = average estimates + 4 * deviation estimates 1297 * = sa / 8 + sv 1298 */ 1299 return (crtt); 1300 } 1301 1302 static void 1303 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) 1304 { 1305 struct phyint_instance *pii = tg->tg_phyint_inst; 1306 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1307 int sa = tg->tg_rtt_sa; 1308 int sv = tg->tg_rtt_sd; 1309 int new_crtt; 1310 int i; 1311 1312 if (debug & D_PROBE) 1313 logdebug("pi_set_crtt: target - m %d\n", m); 1314 1315 /* store the round trip time, in case we need to defer computation */ 1316 tg->tg_deferred[tg->tg_num_deferred] = m; 1317 1318 new_crtt = compute_crtt(&sa, &sv, m); 1319 1320 /* 1321 * If this probe's round trip time would singlehandedly cause an 1322 * increase in the group's probe interval consider it suspect. 1323 */ 1324 if ((new_crtt > probe_interval) && is_probe_uni) { 1325 if (debug & D_PROBE) { 1326 logdebug("Received a suspect probe on %s, new_crtt =" 1327 " %d, probe_interval = %d, num_deferred = %d\n", 1328 pii->pii_probe_logint->li_name, new_crtt, 1329 probe_interval, tg->tg_num_deferred); 1330 } 1331 1332 /* 1333 * If we've deferred as many rtts as we plan on deferring, then 1334 * assume the link really did slow down and process all queued 1335 * rtts 1336 */ 1337 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1338 if (debug & D_PROBE) { 1339 logdebug("Received MAXDEFERREDRTT probes which " 1340 "would cause an increased probe_interval. 
" 1341 "Integrating queued rtt data points.\n"); 1342 } 1343 1344 for (i = 0; i <= tg->tg_num_deferred; i++) { 1345 tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, 1346 &tg->tg_rtt_sd, tg->tg_deferred[i]); 1347 } 1348 1349 tg->tg_num_deferred = 0; 1350 } else { 1351 tg->tg_num_deferred++; 1352 } 1353 return; 1354 } 1355 1356 /* 1357 * If this is a normal probe, or an RTT probe that would lead to a 1358 * reduced CRTT, then update our CRTT data. Further, if this was 1359 * a normal probe, pitch any deferred probes since our probes are 1360 * again being answered within our CRTT estimates. 1361 */ 1362 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1363 tg->tg_rtt_sa = sa; 1364 tg->tg_rtt_sd = sv; 1365 tg->tg_crtt = new_crtt; 1366 if (is_probe_uni) 1367 tg->tg_num_deferred = 0; 1368 } 1369 } 1370 1371 /* 1372 * Return a pointer to the specified option buffer. 1373 * If not found return NULL. 1374 */ 1375 static void * 1376 find_ancillary(struct msghdr *msg, int cmsg_type) 1377 { 1378 struct cmsghdr *cmsg; 1379 1380 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1381 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1382 if (cmsg->cmsg_level == IPPROTO_IPV6 && 1383 cmsg->cmsg_type == cmsg_type) { 1384 return (CMSG_DATA(cmsg)); 1385 } 1386 } 1387 return (NULL); 1388 } 1389 1390 /* 1391 * See if a previously failed interface has started working again. 1392 */ 1393 void 1394 phyint_check_for_repair(struct phyint *pi) 1395 { 1396 if (phyint_repaired(pi)) { 1397 if (pi->pi_group == phyint_anongroup) { 1398 logerr("NIC repair detected on %s\n", pi->pi_name); 1399 } else { 1400 logerr("NIC repair detected on %s of group %s\n", 1401 pi->pi_name, pi->pi_group->pg_name); 1402 } 1403 1404 /* 1405 * If the interface is offline, just clear the FAILED flag, 1406 * delaying the state change and failback operation until it 1407 * is brought back online. 
1408 */ 1409 if (pi->pi_state == PI_OFFLINE) { 1410 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1411 return; 1412 } 1413 1414 if (pi->pi_flags & IFF_STANDBY) { 1415 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1416 } else { 1417 if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) { 1418 (void) change_lif_flags(pi, 1419 IFF_FAILED, _B_FALSE); 1420 /* Per state diagram */ 1421 pi->pi_empty = 0; 1422 } 1423 } 1424 1425 phyint_chstate(pi, PI_RUNNING); 1426 1427 if (GROUP_FAILED(pi->pi_group)) { 1428 /* 1429 * This is the 1st phyint to receive a response 1430 * after group failure. 1431 */ 1432 logerr("At least 1 interface (%s) of group %s has " 1433 "repaired\n", pi->pi_name, pi->pi_group->pg_name); 1434 phyint_group_chstate(pi->pi_group, PG_RUNNING); 1435 /* 1436 * If this is the STANDBY phyint to be repaired after a 1437 * group failure. Move data addresses on other failed 1438 * phyints in the group to this one. 1439 */ 1440 if (pi->pi_flags & IFF_STANDBY) { 1441 struct phyint *fpi = pi->pi_group->pg_phyint; 1442 for (; fpi != NULL; fpi = fpi->pi_pgnext) { 1443 if (fpi != pi) { 1444 (void) try_failover(fpi, 1445 FAILOVER_NORMAL); 1446 } 1447 } 1448 } 1449 } 1450 } 1451 } 1452 1453 /* 1454 * See if a previously functioning interface has failed, or if the 1455 * whole group of interfaces has failed. 1456 */ 1457 static void 1458 phyint_inst_check_for_failure(struct phyint_instance *pii) 1459 { 1460 struct phyint *pi; 1461 struct phyint *pi2; 1462 1463 pi = pii->pii_phyint; 1464 1465 switch (failure_state(pii)) { 1466 case PHYINT_FAILURE: 1467 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 1468 if (pi->pi_group == phyint_anongroup) { 1469 logerr("NIC failure detected on %s\n", pii->pii_name); 1470 } else { 1471 logerr("NIC failure detected on %s of group %s\n", 1472 pii->pii_name, pi->pi_group->pg_name); 1473 } 1474 /* 1475 * Do the failover, unless the interface is offline (in 1476 * which case we've already failed over). 
1477 */ 1478 if (pi->pi_state != PI_OFFLINE) { 1479 phyint_chstate(pi, PI_FAILED); 1480 reset_crtt_all(pi); 1481 if (!(pi->pi_flags & IFF_INACTIVE)) 1482 (void) try_failover(pi, FAILOVER_NORMAL); 1483 } 1484 break; 1485 1486 case GROUP_FAILURE: 1487 logerr("All Interfaces in group %s have failed\n", 1488 pi->pi_group->pg_name); 1489 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; 1490 pi2 = pi2->pi_pgnext) { 1491 if (pi2->pi_flags & IFF_OFFLINE) 1492 continue; 1493 (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); 1494 reset_crtt_all(pi2); 1495 1496 /* 1497 * In the case of host targets, we 1498 * would have flushed the targets, 1499 * and gone to PI_NOTARGETS state. 1500 */ 1501 if (pi2->pi_state == PI_RUNNING) 1502 phyint_chstate(pi2, PI_FAILED); 1503 1504 pi2->pi_empty = 0; 1505 pi2->pi_full = 0; 1506 } 1507 break; 1508 1509 default: 1510 break; 1511 } 1512 } 1513 1514 /* 1515 * Determines if any timeout event has occurred and returns the number of 1516 * milliseconds until the next timeout event for the phyint. Returns 1517 * TIMER_INFINITY for "never". 
1518 */ 1519 uint_t 1520 phyint_inst_timer(struct phyint_instance *pii) 1521 { 1522 int pr_ndx; 1523 uint_t timeout; 1524 struct target *cur_tg; 1525 struct probe_stats *pr_statp; 1526 struct phyint_instance *pii_other; 1527 struct phyint *pi; 1528 int valid_unack_count; 1529 int i; 1530 int interval; 1531 uint_t check_time; 1532 uint_t cur_time; 1533 hrtime_t cur_hrtime; 1534 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1535 1536 cur_time = getcurrenttime(); 1537 1538 if (debug & D_TIMER) { 1539 logdebug("phyint_inst_timer(%s %s)\n", 1540 AF_STR(pii->pii_af), pii->pii_name); 1541 } 1542 1543 pii_other = phyint_inst_other(pii); 1544 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1545 /* 1546 * Check to see if we're here due to link up/down flapping; If 1547 * enough time has passed, then try to bring the interface 1548 * back up; otherwise, schedule a timer to bring it back up 1549 * when enough time *has* elapsed. 1550 */ 1551 pi = pii->pii_phyint; 1552 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1553 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1554 if (check_time > cur_time) 1555 return (check_time - cur_time); 1556 1557 phyint_check_for_repair(pi); 1558 } 1559 } 1560 1561 /* 1562 * If this phyint is not yet initialized for probes, 1563 * don't proceed further 1564 */ 1565 if (pii->pii_probe_sock == -1) 1566 return (TIMER_INFINITY); 1567 1568 /* 1569 * If the timer has fired too soon, probably triggered 1570 * by some other phyint instance, return the remaining 1571 * time 1572 */ 1573 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1574 return (pii->pii_snxt_time - cur_time); 1575 1576 /* 1577 * If the link is down, don't send any probes for now. 1578 */ 1579 if (LINK_DOWN(pii->pii_phyint)) 1580 return (TIMER_INFINITY); 1581 1582 /* 1583 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1584 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1585 * Base probe time is strictly periodic. 
1586 */ 1587 interval = GET_RANDOM( 1588 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1589 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1590 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1591 1592 /* 1593 * Check if the current time > next time to probe. If so, we missed 1594 * sending 1 or more probes, probably due to heavy system load. At least 1595 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1596 * were scheduled. Make adjustments to the times, in multiples of 1597 * user_probe_interval. 1598 */ 1599 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1600 int n; 1601 1602 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1603 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1604 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1605 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1606 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1607 pii->pii_snxt_basetime); 1608 1609 /* Collect statistics about missed probes */ 1610 probes_missed.pm_nprobes += n + 1; 1611 probes_missed.pm_ntimes++; 1612 } 1613 pii->pii_snxt_basetime += user_probe_interval; 1614 interval = pii->pii_snxt_time - cur_time; 1615 if (debug & D_TARGET) { 1616 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1617 " interval %u\n", cur_time, pii->pii_snxt_time, 1618 pii->pii_snxt_basetime, interval); 1619 } 1620 1621 /* 1622 * If no targets are known, we need to send an ICMP multicast. The 1623 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1624 * to see if we found a target. 1625 */ 1626 if (pii->pii_target_next == NULL) { 1627 assert(pii->pii_ntargets == 0); 1628 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1629 probe(pii, PROBE_MULTI, cur_time); 1630 return (interval); 1631 } 1632 1633 if ((user_probe_interval != probe_interval) && 1634 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1635 /* 1636 * the failure detection (fd) probe timer has not yet fired. 
1637 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1638 */ 1639 probe(pii, PROBE_RTT, cur_time); 1640 return (interval); 1641 } 1642 /* 1643 * the fd probe timer has fired. Need to do all failure 1644 * detection / recovery calculations, and then send an fd probe 1645 * of type PROBE_UNI. 1646 */ 1647 if (user_probe_interval == probe_interval) { 1648 /* 1649 * We could have missed some probes, and then adjusted 1650 * pii_snxt_basetime above. Otherwise we could have 1651 * blindly added probe_interval to pii_fd_snxt_basetime. 1652 */ 1653 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1654 } else { 1655 pii->pii_fd_snxt_basetime += probe_interval; 1656 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1657 int n; 1658 1659 n = (cur_time - pii->pii_fd_snxt_basetime) / 1660 probe_interval; 1661 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1662 } 1663 } 1664 1665 /* 1666 * We can have at most, the latest 2 probes that we sent, in 1667 * the PR_UNACKED state. All previous probes sent, are either 1668 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1669 * timed out if the probe's time_sent + the CRTT < currenttime. 1670 * For each of the last 2 probes, examine whether it has timed 1671 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1672 */ 1673 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1674 valid_unack_count = 0; 1675 1676 for (i = 0; i < 2; i++) { 1677 pr_statp = &pii->pii_probes[pr_ndx]; 1678 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1679 switch (pr_statp->pr_status) { 1680 case PR_ACKED: 1681 /* 1682 * We received back an ACK, so the switch clearly 1683 * is not dropping our traffic, and thus we can 1684 * enable failure detection immediately. 
1685 */ 1686 if (pii->pii_fd_hrtime > gethrtime()) { 1687 if (debug & D_PROBE) { 1688 logdebug("successful probe on %s; " 1689 "ending quiet period\n", 1690 pii->pii_phyint->pi_name); 1691 } 1692 pii->pii_fd_hrtime = gethrtime(); 1693 } 1694 break; 1695 1696 case PR_UNACKED: 1697 assert(cur_tg != NULL); 1698 /* 1699 * The crtt could be zero for some reason, 1700 * Eg. the phyint could be failed. If the crtt is 1701 * not available use group's probe interval, 1702 * which is a worst case estimate. 1703 */ 1704 if (cur_tg->tg_crtt != 0) { 1705 timeout = pr_statp->pr_time_sent + 1706 cur_tg->tg_crtt; 1707 } else { 1708 timeout = pr_statp->pr_time_sent + 1709 probe_interval; 1710 } 1711 if (TIME_LT(timeout, cur_time)) { 1712 pr_statp->pr_status = PR_LOST; 1713 pr_statp->pr_time_lost = timeout; 1714 } else if (i == 1) { 1715 /* 1716 * We are forced to consider this probe 1717 * lost, as we can have at most 2 unack. 1718 * probes any time, and we will be sending a 1719 * probe at the end of this function. 1720 * Normally, we should not be here, but 1721 * this can happen if an incoming response 1722 * that was considered lost has increased 1723 * the crtt for this target, and also bumped 1724 * up the FDT. Note that we never cancel or 1725 * increase the current pii_time_left, so 1726 * when the timer fires, we find 2 valid 1727 * unacked probes, and they are yet to timeout 1728 */ 1729 pr_statp->pr_status = PR_LOST; 1730 pr_statp->pr_time_lost = cur_time; 1731 } else { 1732 /* 1733 * Only the most recent probe can enter 1734 * this 'else' arm. The second most recent 1735 * probe must take either of the above arms, 1736 * if it is unacked. 1737 */ 1738 valid_unack_count++; 1739 } 1740 break; 1741 } 1742 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1743 } 1744 1745 /* 1746 * We send out 1 probe randomly in the interval between one half 1747 * and one probe interval for the group. 
Given that the CRTT is always 1748 * less than the group's probe interval, we can have at most 1 1749 * unacknowledged probe now. All previous probes are either lost or 1750 * acked. 1751 */ 1752 assert(valid_unack_count == 0 || valid_unack_count == 1); 1753 1754 /* 1755 * The timer has fired. Take appropriate action depending 1756 * on the current state of the phyint. 1757 * 1758 * PI_RUNNING state - Failure detection and failover 1759 * PI_FAILED state - Repair detection and failback 1760 */ 1761 switch (pii->pii_phyint->pi_state) { 1762 case PI_FAILED: 1763 /* 1764 * If the most recent probe (excluding unacked probes that 1765 * are yet to time out) has been acked, check whether the 1766 * phyint is now repaired. If the phyint is repaired, then 1767 * attempt failback, unless it is an inactive standby. 1768 */ 1769 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1770 phyint_check_for_repair(pii->pii_phyint); 1771 } 1772 break; 1773 1774 case PI_RUNNING: 1775 /* 1776 * It's possible our probes have been lost because of a 1777 * spanning-tree mandated quiet period on the switch. If so, 1778 * ignore the lost probes and consider the interface to still 1779 * be functioning. 1780 */ 1781 cur_hrtime = gethrtime(); 1782 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1783 break; 1784 1785 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1786 /* 1787 * We have 1 or more failed probes (excluding unacked 1788 * probes that are yet to time out). Determine if the 1789 * phyint has failed. If so attempt a failover, 1790 * unless it is an inactive standby 1791 */ 1792 phyint_inst_check_for_failure(pii); 1793 } 1794 break; 1795 1796 default: 1797 logerr("phyint_inst_timer: invalid state %d\n", 1798 pii->pii_phyint->pi_state); 1799 abort(); 1800 } 1801 1802 /* 1803 * Start the next probe. probe() will also set pii->pii_probe_time_left 1804 * to the group's probe interval. 
If phyint_failed -> target_flush_hosts 1805 * was called, the target list may be empty. 1806 */ 1807 if (pii->pii_target_next != NULL) { 1808 probe(pii, PROBE_UNI, cur_time); 1809 /* 1810 * If we have just the one probe target, and we're not using 1811 * router targets, try to find another as we presently have 1812 * no resilience. 1813 */ 1814 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1815 probe(pii, PROBE_MULTI, cur_time); 1816 } else { 1817 probe(pii, PROBE_MULTI, cur_time); 1818 } 1819 return (interval); 1820 } 1821 1822 /* 1823 * Start the probe timer for an interface instance. 1824 */ 1825 void 1826 start_timer(struct phyint_instance *pii) 1827 { 1828 uint32_t interval; 1829 1830 /* 1831 * Spread the base probe times (pi_snxt_basetime) across phyints 1832 * uniformly over the (curtime..curtime + the group's probe_interval). 1833 * pi_snxt_basetime is strictly periodic with a frequency of 1834 * the group's probe interval. The actual probe time pi_snxt_time 1835 * adds some randomness to pi_snxt_basetime and happens in probe(). 1836 * For the 1st probe on each phyint after the timer is started, 1837 * pi_snxt_time and pi_snxt_basetime are the same. 1838 */ 1839 interval = GET_RANDOM(0, 1840 (int)pii->pii_phyint->pi_group->pg_probeint); 1841 1842 pii->pii_snxt_basetime = getcurrenttime() + interval; 1843 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1844 pii->pii_snxt_time = pii->pii_snxt_basetime; 1845 timer_schedule(interval); 1846 } 1847 1848 /* 1849 * Restart the probe timer on an interface instance. 1850 */ 1851 static void 1852 restart_timer(struct phyint_instance *pii) 1853 { 1854 /* 1855 * We don't need to restart the timer if it was never started in 1856 * the first place (pii->pii_basetime_inited not set), as the timer 1857 * won't have gone off yet. 
1858 */ 1859 if (pii->pii_basetime_inited != 0) { 1860 1861 if (debug & D_LINKNOTE) 1862 logdebug("restart timer: restarting timer on %s, " 1863 "address family %s\n", pii->pii_phyint->pi_name, 1864 AF_STR(pii->pii_af)); 1865 1866 start_timer(pii); 1867 } 1868 } 1869 1870 static void 1871 process_link_state_down(struct phyint *pi) 1872 { 1873 logerr("The link has gone down on %s\n", pi->pi_name); 1874 1875 /* 1876 * Clear the probe statistics arrays, we don't want the repair 1877 * detection logic relying on probes that were succesful prior 1878 * to the link going down. 1879 */ 1880 if (PROBE_CAPABLE(pi->pi_v4)) 1881 clear_pii_probe_stats(pi->pi_v4); 1882 if (PROBE_CAPABLE(pi->pi_v6)) 1883 clear_pii_probe_stats(pi->pi_v6); 1884 /* 1885 * Check for interface failure. Although we know the interface 1886 * has failed, we don't know if all the other interfaces in the 1887 * group have failed as well. 1888 */ 1889 if ((pi->pi_state == PI_RUNNING) || 1890 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1891 if (debug & D_LINKNOTE) { 1892 logdebug("process_link_state_down:" 1893 " checking for failure on %s\n", pi->pi_name); 1894 } 1895 1896 if (pi->pi_v4 != NULL) 1897 phyint_inst_check_for_failure(pi->pi_v4); 1898 else if (pi->pi_v6 != NULL) 1899 phyint_inst_check_for_failure(pi->pi_v6); 1900 } 1901 } 1902 1903 static void 1904 process_link_state_up(struct phyint *pi) 1905 { 1906 logerr("The link has come up on %s\n", pi->pi_name); 1907 1908 /* 1909 * We stopped any running timers on each instance when the link 1910 * went down, so restart them. 1911 */ 1912 if (pi->pi_v4) 1913 restart_timer(pi->pi_v4); 1914 if (pi->pi_v6) 1915 restart_timer(pi->pi_v6); 1916 1917 phyint_check_for_repair(pi); 1918 1919 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1920 if (pi->pi_whendx == LINK_UP_PERMIN) 1921 pi->pi_whendx = 0; 1922 } 1923 1924 /* 1925 * Process any changes in link state passed up from the interfaces. 
1926 */ 1927 void 1928 process_link_state_changes(void) 1929 { 1930 struct phyint *pi; 1931 1932 /* Look for interfaces where the link state has just changed */ 1933 1934 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1935 boolean_t old_link_state_up = LINK_UP(pi); 1936 1937 /* 1938 * Except when the "phyint" structure is created, this is 1939 * the only place the link state is updated. This allows 1940 * this routine to detect changes in link state, rather 1941 * than just the current state. 1942 */ 1943 UPDATE_LINK_STATE(pi); 1944 1945 if (LINK_DOWN(pi)) { 1946 /* 1947 * Has link just gone down? 1948 */ 1949 if (old_link_state_up) 1950 process_link_state_down(pi); 1951 } else { 1952 /* 1953 * Has link just gone back up? 1954 */ 1955 if (!old_link_state_up) 1956 process_link_state_up(pi); 1957 } 1958 } 1959 } 1960 1961 void 1962 reset_crtt_all(struct phyint *pi) 1963 { 1964 struct phyint_instance *pii; 1965 struct target *tg; 1966 1967 pii = pi->pi_v4; 1968 if (pii != NULL) { 1969 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1970 tg->tg_crtt = 0; 1971 tg->tg_rtt_sa = -1; 1972 tg->tg_rtt_sd = 0; 1973 } 1974 } 1975 1976 pii = pi->pi_v6; 1977 if (pii != NULL) { 1978 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1979 tg->tg_crtt = 0; 1980 tg->tg_rtt_sa = -1; 1981 tg->tg_rtt_sd = 0; 1982 } 1983 } 1984 } 1985 1986 /* 1987 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 1988 * probes on both instances IPv4 and IPv6. 1989 * If the interface has failed, return the time of the first probe failure 1990 * in "tff". 1991 */ 1992 static int 1993 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 1994 { 1995 uint_t pi_tff; 1996 struct target *cur_tg; 1997 struct probe_fail_count pfinfo; 1998 struct phyint_instance *pii_other; 1999 int pr_ndx; 2000 2001 /* 2002 * Get the number of consecutive failed probes on 2003 * this phyint across all targets. 
Also get the number 2004 * of consecutive failed probes on this target only 2005 */ 2006 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2007 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2008 probe_fail_info(pii, cur_tg, &pfinfo); 2009 2010 /* Get the time of first failure, for later use */ 2011 pi_tff = pfinfo.pf_tff; 2012 2013 /* 2014 * If the current target has not responded to the 2015 * last NUM_PROBE_FAILS probes, and other targets are 2016 * responding delete this target. Dead gateway detection 2017 * will eventually remove this target (if router) from the 2018 * routing tables. If that does not occur, we may end 2019 * up adding this to our list again. 2020 */ 2021 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2022 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2023 if (pii->pii_targets_are_routers) { 2024 if (cur_tg->tg_status == TG_ACTIVE) 2025 pii->pii_ntargets--; 2026 cur_tg->tg_status = TG_DEAD; 2027 cur_tg->tg_crtt = 0; 2028 cur_tg->tg_rtt_sa = -1; 2029 cur_tg->tg_rtt_sd = 0; 2030 if (pii->pii_target_next == cur_tg) 2031 pii->pii_target_next = target_next(cur_tg); 2032 } else { 2033 target_delete(cur_tg); 2034 probe(pii, PROBE_MULTI, getcurrenttime()); 2035 } 2036 return (PHYINT_OK); 2037 } 2038 2039 /* 2040 * If the phyint has lost NUM_PROBE_FAILS or more 2041 * consecutive probes, on both IPv4 and IPv6 protocol 2042 * instances of the phyint, then trigger failure 2043 * detection, else return false 2044 */ 2045 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2046 return (PHYINT_OK); 2047 2048 pii_other = phyint_inst_other(pii); 2049 if (PROBE_CAPABLE(pii_other)) { 2050 probe_fail_info(pii_other, NULL, &pfinfo); 2051 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2052 /* 2053 * We have NUM_PROBE_FAILS or more failures 2054 * on both IPv4 and IPv6. Get the earliest 2055 * time when failure was detected on this 2056 * phyint across IPv4 and IPv6. 
2057 */ 2058 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2059 pi_tff = pfinfo.pf_tff; 2060 } else { 2061 /* 2062 * This instance has < NUM_PROBE_FAILS failure. 2063 * So return false 2064 */ 2065 return (PHYINT_OK); 2066 } 2067 } 2068 *tff = pi_tff; 2069 return (PHYINT_FAILURE); 2070 } 2071 2072 /* 2073 * Check if the link has gone down on this phyint, or it has failed the 2074 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2075 * Also look at other phyints of this group, for group failures. 2076 */ 2077 int 2078 failure_state(struct phyint_instance *pii) 2079 { 2080 struct probe_success_count psinfo; 2081 uint_t pi2_tls; /* time last success */ 2082 uint_t pi_tff; /* time first fail */ 2083 struct phyint *pi2; 2084 struct phyint *pi; 2085 struct phyint_instance *pii2; 2086 struct phyint_group *pg; 2087 boolean_t alone; 2088 2089 if (debug & D_FAILOVER) 2090 logdebug("phyint_failed(%s)\n", pii->pii_name); 2091 2092 pi = pii->pii_phyint; 2093 pg = pi->pi_group; 2094 2095 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2096 PHYINT_OK) 2097 return (PHYINT_OK); 2098 2099 /* 2100 * At this point, the link is down, or the phyint is suspect, 2101 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 2102 * does not belong to any group, or is the only member of the 2103 * group capable of being probed, return PHYINT_FAILURE. 2104 */ 2105 alone = _B_TRUE; 2106 if (pg != phyint_anongroup) { 2107 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2108 if (pi2 == pi) 2109 continue; 2110 if (PROBE_CAPABLE(pi2->pi_v4) || 2111 PROBE_CAPABLE(pi2->pi_v6)) { 2112 alone = _B_FALSE; 2113 break; 2114 } 2115 } 2116 } 2117 if (alone) 2118 return (PHYINT_FAILURE); 2119 2120 /* 2121 * Need to compare against other phyints of the same group 2122 * to exclude group failures. 
If the failure was detected via 2123 * probing, then if the time of last success (tls) of any 2124 * phyint is more recent than the time of first fail (tff) of the 2125 * phyint in question, and the link is up on the phyint, 2126 * then it is a phyint failure. Otherwise it is a group failure. 2127 * If failure was detected via a link down notification sent from 2128 * the driver to IP, we see if any phyints in the group are still 2129 * running and haven't received a link down notification. We 2130 * will usually be processing the link down notification shortly 2131 * after it was received, so there is no point looking at the tls 2132 * of other phyints. 2133 */ 2134 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2135 /* Exclude ourself from comparison */ 2136 if (pi2 == pi) 2137 continue; 2138 2139 if (LINK_DOWN(pi)) { 2140 /* 2141 * We use FLAGS_TO_LINK_STATE() to test the 2142 * flags directly, rather then LINK_UP() or 2143 * LINK_DOWN(), as we may not have got round 2144 * to processing the link state for the other 2145 * phyints in the group yet. 2146 * 2147 * The check for PI_RUNNING and group 2148 * failure handles the case when the 2149 * group begins to recover. The first 2150 * phyint to recover should not trigger 2151 * a failover from the soon-to-recover 2152 * other phyints to the first recovered 2153 * phyint. PI_RUNNING will be set, and 2154 * pg_groupfailed cleared only after 2155 * receipt of NUM_PROBE_REPAIRS, by 2156 * which time the other phyints should 2157 * have received at least 1 packet, 2158 * and so will not have NUM_PROBE_FAILS. 2159 */ 2160 if ((pi2->pi_state == PI_RUNNING) && 2161 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 2162 return (PHYINT_FAILURE); 2163 } else { 2164 /* 2165 * Need to compare against both IPv4 and 2166 * IPv6 instances. 
2167 */ 2168 pii2 = pi2->pi_v4; 2169 if (pii2 != NULL) { 2170 probe_success_info(pii2, NULL, &psinfo); 2171 if (psinfo.ps_tls_valid) { 2172 pi2_tls = psinfo.ps_tls; 2173 /* 2174 * See comment above regarding check 2175 * for PI_RUNNING and group failure. 2176 */ 2177 if (TIME_GT(pi2_tls, pi_tff) && 2178 (pi2->pi_state == PI_RUNNING) && 2179 !GROUP_FAILED(pg) && 2180 FLAGS_TO_LINK_STATE(pi2)) 2181 return (PHYINT_FAILURE); 2182 } 2183 } 2184 2185 pii2 = pi2->pi_v6; 2186 if (pii2 != NULL) { 2187 probe_success_info(pii2, NULL, &psinfo); 2188 if (psinfo.ps_tls_valid) { 2189 pi2_tls = psinfo.ps_tls; 2190 /* 2191 * See comment above regarding check 2192 * for PI_RUNNING and group failure. 2193 */ 2194 if (TIME_GT(pi2_tls, pi_tff) && 2195 (pi2->pi_state == PI_RUNNING) && 2196 !GROUP_FAILED(pg) && 2197 FLAGS_TO_LINK_STATE(pi2)) 2198 return (PHYINT_FAILURE); 2199 } 2200 } 2201 } 2202 } 2203 2204 /* 2205 * Change the group state to PG_FAILED if it's not already. 2206 */ 2207 if (!GROUP_FAILED(pg)) 2208 phyint_group_chstate(pg, PG_FAILED); 2209 2210 return (GROUP_FAILURE); 2211 } 2212 2213 /* 2214 * Return the information associated with consecutive probe successes 2215 * starting with the most recent probe. At most the last 2 probes can be 2216 * in the unacknowledged state. All previous probes have either failed 2217 * or succeeded. 
2218 */ 2219 static void 2220 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2221 struct probe_success_count *psinfo) 2222 { 2223 uint_t i; 2224 struct probe_stats *pr_statp; 2225 uint_t most_recent; 2226 uint_t second_most_recent; 2227 boolean_t pi_found_failure = _B_FALSE; 2228 boolean_t tg_found_failure = _B_FALSE; 2229 uint_t now; 2230 uint_t timeout; 2231 struct target *tg; 2232 2233 if (debug & D_FAILOVER) 2234 logdebug("probe_success_info(%s)\n", pii->pii_name); 2235 2236 bzero(psinfo, sizeof (*psinfo)); 2237 now = getcurrenttime(); 2238 2239 /* 2240 * Start with the most recent probe, and count the number 2241 * of consecutive probe successes. Latch the number of successes 2242 * on hitting a failure. 2243 */ 2244 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2245 second_most_recent = PROBE_INDEX_PREV(most_recent); 2246 2247 for (i = most_recent; i != pii->pii_probe_next; 2248 i = PROBE_INDEX_PREV(i)) { 2249 pr_statp = &pii->pii_probes[i]; 2250 2251 switch (pr_statp->pr_status) { 2252 case PR_UNACKED: 2253 /* 2254 * Only the most recent 2 probes can be unacknowledged 2255 */ 2256 assert(i == most_recent || i == second_most_recent); 2257 2258 tg = pr_statp->pr_target; 2259 assert(tg != NULL); 2260 /* 2261 * The crtt could be zero for some reason, 2262 * Eg. the phyint could be failed. If the crtt is 2263 * not available use the value of the group's probe 2264 * interval which is a worst case estimate. 2265 */ 2266 if (tg->tg_crtt != 0) { 2267 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2268 } else { 2269 timeout = pr_statp->pr_time_sent + 2270 pii->pii_phyint->pi_group->pg_probeint; 2271 } 2272 2273 if (TIME_LT(timeout, now)) { 2274 /* 2275 * We hit a failure. Latch the total number of 2276 * recent consecutive successes. 
2277 */ 2278 pr_statp->pr_time_lost = timeout; 2279 pr_statp->pr_status = PR_LOST; 2280 pi_found_failure = _B_TRUE; 2281 if (cur_tg != NULL && tg == cur_tg) { 2282 /* 2283 * We hit a failure for the desired 2284 * target. Latch the number of recent 2285 * consecutive successes for this target 2286 */ 2287 tg_found_failure = _B_TRUE; 2288 } 2289 } 2290 break; 2291 2292 case PR_ACKED: 2293 /* 2294 * Bump up the count of probe successes, if we 2295 * have not seen any failure so far. 2296 */ 2297 if (!pi_found_failure) 2298 psinfo->ps_nsucc++; 2299 2300 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2301 !tg_found_failure) { 2302 psinfo->ps_nsucc_tg++; 2303 } 2304 2305 /* 2306 * Record the time of last success, if this is 2307 * the most recent probe success. 2308 */ 2309 if (!psinfo->ps_tls_valid) { 2310 psinfo->ps_tls = pr_statp->pr_time_acked; 2311 psinfo->ps_tls_valid = _B_TRUE; 2312 } 2313 break; 2314 2315 case PR_LOST: 2316 /* 2317 * We hit a failure. Latch the total number of 2318 * recent consecutive successes. 2319 */ 2320 pi_found_failure = _B_TRUE; 2321 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2322 /* 2323 * We hit a failure for the desired target. 2324 * Latch the number of recent consecutive 2325 * successes for this target 2326 */ 2327 tg_found_failure = _B_TRUE; 2328 } 2329 break; 2330 2331 default: 2332 return; 2333 2334 } 2335 } 2336 } 2337 2338 /* 2339 * Return the information associated with consecutive probe failures 2340 * starting with the most recent probe. Only the last 2 probes can be in the 2341 * unacknowledged state. All previous probes have either failed or succeeded. 
2342 */ 2343 static void 2344 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2345 struct probe_fail_count *pfinfo) 2346 { 2347 int i; 2348 struct probe_stats *pr_statp; 2349 boolean_t tg_found_success = _B_FALSE; 2350 boolean_t pi_found_success = _B_FALSE; 2351 int most_recent; 2352 int second_most_recent; 2353 uint_t now; 2354 uint_t timeout; 2355 struct target *tg; 2356 2357 if (debug & D_FAILOVER) 2358 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2359 2360 bzero(pfinfo, sizeof (*pfinfo)); 2361 now = getcurrenttime(); 2362 2363 /* 2364 * Start with the most recent probe, and count the number 2365 * of consecutive probe failures. Latch the number of failures 2366 * on hitting a probe success. 2367 */ 2368 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2369 second_most_recent = PROBE_INDEX_PREV(most_recent); 2370 2371 for (i = most_recent; i != pii->pii_probe_next; 2372 i = PROBE_INDEX_PREV(i)) { 2373 pr_statp = &pii->pii_probes[i]; 2374 2375 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2376 2377 switch (pr_statp->pr_status) { 2378 case PR_UNACKED: 2379 /* 2380 * Only the most recent 2 probes can be unacknowledged 2381 */ 2382 assert(i == most_recent || i == second_most_recent); 2383 2384 tg = pr_statp->pr_target; 2385 /* 2386 * Target is guaranteed to exist in the unack. state 2387 */ 2388 assert(tg != NULL); 2389 /* 2390 * The crtt could be zero for some reason, 2391 * Eg. the phyint could be failed. If the crtt is 2392 * not available use the group's probe interval, 2393 * which is a worst case estimate. 
2394 */ 2395 if (tg->tg_crtt != 0) { 2396 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2397 } else { 2398 timeout = pr_statp->pr_time_sent + 2399 pii->pii_phyint->pi_group->pg_probeint; 2400 } 2401 2402 if (TIME_GT(timeout, now)) 2403 break; 2404 2405 pr_statp->pr_time_lost = timeout; 2406 pr_statp->pr_status = PR_LOST; 2407 /* FALLTHRU */ 2408 2409 case PR_LOST: 2410 if (!pi_found_success) { 2411 pfinfo->pf_nfail++; 2412 pfinfo->pf_tff = pr_statp->pr_time_lost; 2413 } 2414 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2415 !tg_found_success) { 2416 pfinfo->pf_nfail_tg++; 2417 } 2418 break; 2419 2420 default: 2421 /* 2422 * We hit a success or unused slot. Latch the 2423 * total number of recent consecutive failures. 2424 */ 2425 pi_found_success = _B_TRUE; 2426 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2427 /* 2428 * We hit a success for the desired target. 2429 * Latch the number of recent consecutive 2430 * failures for this target 2431 */ 2432 tg_found_success = _B_TRUE; 2433 } 2434 } 2435 } 2436 } 2437 2438 /* 2439 * Check if the phyint has been repaired. If no test address has been 2440 * configured, then consider the interface repaired if the link is up (unless 2441 * the link is flapping; see below). Otherwise, look for proof of probes 2442 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2443 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 
2444 */ 2445 static boolean_t 2446 phyint_repaired(struct phyint *pi) 2447 { 2448 struct probe_success_count psinfo; 2449 struct phyint_instance *pii; 2450 struct target *cur_tg; 2451 int pr_ndx; 2452 uint_t cur_time; 2453 2454 if (debug & D_FAILOVER) 2455 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2456 2457 if (LINK_DOWN(pi)) 2458 return (_B_FALSE); 2459 2460 /* 2461 * If we don't have any test addresses and the link is up, then 2462 * consider the interface repaired, unless we've received more than 2463 * LINK_UP_PERMIN link up notifications in the last minute, in 2464 * which case we keep the link down until we drop back below 2465 * the threshold. 2466 */ 2467 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2468 cur_time = getcurrenttime(); 2469 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2470 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2471 pi->pi_lfmsg_printed = 0; 2472 return (_B_TRUE); 2473 } 2474 if (!pi->pi_lfmsg_printed) { 2475 logerr("The link has come up on %s more than %d times " 2476 "in the last minute; disabling failback until it " 2477 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2478 pi->pi_lfmsg_printed = 1; 2479 } 2480 2481 return (_B_FALSE); 2482 } 2483 2484 pii = pi->pi_v4; 2485 if (PROBE_CAPABLE(pii)) { 2486 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2487 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2488 probe_success_info(pii, cur_tg, &psinfo); 2489 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2490 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2491 return (_B_TRUE); 2492 } 2493 2494 pii = pi->pi_v6; 2495 if (PROBE_CAPABLE(pii)) { 2496 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2497 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2498 probe_success_info(pii, cur_tg, &psinfo); 2499 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2500 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2501 return (_B_TRUE); 2502 } 2503 2504 return (_B_FALSE); 2505 } 2506 2507 /* 2508 * Try failover from phyint 'pi' to a suitable destination. 
2509 */ 2510 int 2511 try_failover(struct phyint *pi, int failover_type) 2512 { 2513 struct phyint *dst; 2514 int err; 2515 2516 if (debug & D_FAILOVER) 2517 logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 2518 2519 /* 2520 * Attempt to find a failover destination 'dst'. 2521 * dst will be null if any of the following is true 2522 * Phyint is not part of a group OR 2523 * Phyint is the only member of a group OR 2524 * No suitable failover dst was available 2525 */ 2526 dst = get_failover_dst(pi, failover_type); 2527 if (dst == NULL) 2528 return (IPMP_EMINRED); 2529 2530 dst->pi_empty = 0; /* Per state diagram */ 2531 pi->pi_full = 0; /* Per state diagram */ 2532 2533 err = failover(pi, dst); 2534 2535 if (debug & D_FAILOVER) { 2536 logdebug("failed over from %s to %s ret %d\n", 2537 pi->pi_name, dst->pi_name, err); 2538 } 2539 if (err == 0) { 2540 pi->pi_empty = 1; /* Per state diagram */ 2541 /* 2542 * we don't want to print out this message if a 2543 * phyint is leaving the group, nor for failover from 2544 * standby 2545 */ 2546 if (failover_type == FAILOVER_NORMAL) { 2547 logerr("Successfully failed over from NIC %s to NIC " 2548 "%s\n", pi->pi_name, dst->pi_name); 2549 } 2550 return (0); 2551 } else { 2552 /* 2553 * The failover did not succeed. We must retry the failover 2554 * only after resyncing our state based on the kernel's. 2555 * For eg. either the src or the dst might have been unplumbed 2556 * causing this failure. initifs() will be called again, 2557 * from main, since full_scan_required has been set to true 2558 * by failover(); 2559 */ 2560 return (IPMP_FAILURE); 2561 } 2562 } 2563 2564 /* 2565 * global_errno captures the errno value, if failover() or failback() 2566 * fails. This is sent to if_mpadm(1M). 2567 */ 2568 int global_errno; 2569 2570 /* 2571 * Attempt failover from phyint 'from' to phyint 'to'. 2572 * IP moves everything from phyint 'from' to phyint 'to'. 
2573 */ 2574 static int 2575 failover(struct phyint *from, struct phyint *to) 2576 { 2577 struct lifreq lifr; 2578 int ret; 2579 2580 if (debug & D_FAILOVER) { 2581 logdebug("failing over from %s to %s\n", 2582 from->pi_name, to->pi_name); 2583 } 2584 2585 /* 2586 * Perform the failover. Both IPv4 and IPv6 are failed over 2587 * using a single ioctl by passing in AF_UNSPEC family. 2588 */ 2589 lifr.lifr_addr.ss_family = AF_UNSPEC; 2590 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2591 lifr.lifr_movetoindex = to->pi_ifindex; 2592 2593 ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 2594 if (ret < 0) { 2595 global_errno = errno; 2596 logperror("failover: ioctl (failover)"); 2597 } 2598 2599 /* 2600 * Set full_scan_required to true. This will make us read 2601 * the state from the kernel in initifs() and update our tables, 2602 * to reflect the current state after the failover. If the 2603 * failover has failed it will then reissue the failover. 2604 */ 2605 full_scan_required = _B_TRUE; 2606 return (ret); 2607 } 2608 2609 /* 2610 * phyint 'pi' has recovered. Attempt failback from every phyint in the same 2611 * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 2612 * Return values: 2613 * IPMP_SUCCESS: Failback successful from each of the other 2614 * phyints in the group. 2615 * IPMP_EFBPARTIAL: Failback successful from some of the other 2616 * phyints in the group. 2617 * IPMP_FAILURE: Failback syscall failed with some error. 2618 * 2619 * Note that failback is attempted regardless of the setting of the 2620 * failback_enabled flag. 2621 */ 2622 int 2623 do_failback(struct phyint *pi, boolean_t check_only) 2624 { 2625 struct phyint *from; 2626 boolean_t done; 2627 boolean_t partial; 2628 boolean_t attempted_failback = _B_FALSE; 2629 2630 if (debug & D_FAILOVER) 2631 logdebug("do_failback(%s)\n", pi->pi_name); 2632 2633 /* If this phyint is not part of a named group, return. 
*/ 2634 if (pi->pi_group == phyint_anongroup) { 2635 pi->pi_full = 1; 2636 return (IPMP_SUCCESS); 2637 } 2638 2639 /* 2640 * Attempt failback from every phyint in the group to 'pi'. 2641 * The reason for doing this, instead of only from the 2642 * phyint to which we did the failover is given below. 2643 * 2644 * After 'pi' failed, if any app. tries to join on a multicast 2645 * address (IPv6), on the failed phyint, IP picks any arbitrary 2646 * non-failed phyint in the group, instead of the failed phyint, 2647 * in.mpathd is not aware of this. Thus failing back only from the 2648 * interface to which 'pi' failed over, will failback the ipif's 2649 * but not the ilm's. So we need to failback from all members of 2650 * the phyint group 2651 */ 2652 done = _B_TRUE; 2653 partial = _B_FALSE; 2654 for (from = pi->pi_group->pg_phyint; from != NULL; 2655 from = from->pi_pgnext) { 2656 /* Exclude ourself as a failback src */ 2657 if (from == pi) 2658 continue; 2659 2660 /* 2661 * If the 'from' phyint has IPv4 plumbed, the 'to' 2662 * phyint must also have IPv4 plumbed. Similar check 2663 * for IPv6. IP makes the same check. Otherwise the 2664 * failback will fail. 2665 */ 2666 if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 2667 (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 2668 partial = _B_TRUE; 2669 continue; 2670 } 2671 2672 if (!check_only) { 2673 pi->pi_empty = 0; /* Per state diagram */ 2674 attempted_failback = _B_TRUE; 2675 if (failback(from, pi) != 0) { 2676 done = _B_FALSE; 2677 break; 2678 } 2679 } 2680 } 2681 2682 if (check_only) { 2683 return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2684 } 2685 2686 /* 2687 * We are done. No more phyint from which we can src the failback 2688 */ 2689 if (done) { 2690 if (!partial) 2691 pi->pi_full = 1; /* Per state diagram */ 2692 /* 2693 * Don't print out a message unless there is a 2694 * transition from FAILED to RUNNING. For eg. 
2695 * we don't want to print out this message if a 2696 * phyint is leaving the group, or at startup 2697 */ 2698 if (attempted_failback && (pi->pi_flags & 2699 (IFF_FAILED | IFF_OFFLINE))) { 2700 logerr("Successfully failed back to NIC %s\n", 2701 pi->pi_name); 2702 } 2703 return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2704 } 2705 2706 return (IPMP_FAILURE); 2707 } 2708 2709 /* 2710 * This function is similar to do_failback() above, but respects the 2711 * failback_enabled flag for phyints in named groups. 2712 */ 2713 int 2714 try_failback(struct phyint *pi, boolean_t check_only) 2715 { 2716 if (debug & D_FAILOVER) 2717 logdebug("try_failback(%s)\n", pi->pi_name); 2718 2719 if (pi->pi_group != phyint_anongroup && !failback_enabled) 2720 return (IPMP_EFBDISABLED); 2721 2722 return (do_failback(pi, check_only)); 2723 } 2724 2725 /* 2726 * Failback everything from phyint 'from' that has the same ifindex 2727 * as phyint to's ifindex. 2728 */ 2729 static int 2730 failback(struct phyint *from, struct phyint *to) 2731 { 2732 struct lifreq lifr; 2733 int ret; 2734 2735 if (debug & D_FAILOVER) 2736 logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 2737 2738 lifr.lifr_addr.ss_family = AF_UNSPEC; 2739 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2740 lifr.lifr_movetoindex = to->pi_ifindex; 2741 2742 ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 2743 if (ret < 0) { 2744 global_errno = errno; 2745 logperror("failback: ioctl (failback)"); 2746 } 2747 2748 /* 2749 * Set full_scan_required to true. This will make us read 2750 * the state from the kernel in initifs() and update our tables, 2751 * to reflect the current state after the failback. If the 2752 * failback has failed it will then reissue the failback. 2753 */ 2754 full_scan_required = _B_TRUE; 2755 2756 return (ret); 2757 } 2758 2759 /* 2760 * Select a target phyint for failing over from 'pi'. 2761 * In the normal case i.e. 
failover_type is FAILOVER_NORMAL, the preferred 2762 * target phyint is chosen as follows, 2763 * 1. Pick any inactive standby interface. 2764 * 2. If no inactive standby is available, select any phyint in the 2765 * same group that has the least number of logints, (excluding 2766 * IFF_NOFAILOVER and !IFF_UP logints) 2767 * If we are failing over from a standby, failover_type is 2768 * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 2769 * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 2770 * and we won't return NULL, as long as there is at least 1 other phyint 2771 * in the group. 2772 */ 2773 static struct phyint * 2774 get_failover_dst(struct phyint *pi, int failover_type) 2775 { 2776 struct phyint *maybe = NULL; 2777 struct phyint *pi2; 2778 struct phyint *last_choice = NULL; 2779 2780 if (pi->pi_group == phyint_anongroup) 2781 return (NULL); 2782 2783 /* 2784 * Loop thru the phyints in the group, and pick the preferred 2785 * phyint for the target. 2786 */ 2787 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2788 /* Exclude ourself and offlined interfaces */ 2789 if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 2790 continue; 2791 2792 /* 2793 * The chosen target phyint must have IPv4 instance 2794 * plumbed, if the src phyint has IPv4 plumbed. Similarly 2795 * for IPv6. 2796 */ 2797 if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 2798 (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 2799 continue; 2800 2801 /* The chosen target must be PI_RUNNING. 
*/ 2802 if (pi2->pi_state != PI_RUNNING) { 2803 last_choice = pi2; 2804 continue; 2805 } 2806 2807 if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && 2808 (failover_type != FAILOVER_TO_NONSTANDBY)) { 2809 return (pi2); 2810 } else { 2811 if (maybe == NULL) 2812 maybe = pi2; 2813 else if (logint_upcount(pi2) < logint_upcount(maybe)) 2814 maybe = pi2; 2815 } 2816 } 2817 if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 2818 return (last_choice); 2819 else 2820 return (maybe); 2821 } 2822 2823 /* 2824 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2825 */ 2826 boolean_t 2827 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 2828 { 2829 int ifsock; 2830 struct lifreq lifr; 2831 2832 if (debug & D_FAILOVER) { 2833 logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 2834 pi->pi_name, flags, (int)setfl); 2835 } 2836 2837 if (pi->pi_v4 != NULL) { 2838 ifsock = ifsock_v4; 2839 } else { 2840 ifsock = ifsock_v6; 2841 } 2842 2843 /* 2844 * Get the current flags from the kernel, and set/clear the 2845 * desired phyint flags. Since we set only phyint flags, we can 2846 * do it on either IPv4 or IPv6 instance. 2847 */ 2848 (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2849 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 2850 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2851 if (errno != ENXIO) 2852 logperror("change_lif_flags: ioctl (get flags)"); 2853 return (_B_FALSE); 2854 } 2855 if (setfl) 2856 lifr.lifr_flags |= flags; 2857 else 2858 lifr.lifr_flags &= ~flags; 2859 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2860 if (errno != ENXIO) 2861 logperror("change_lif_flags: ioctl (set flags)"); 2862 return (_B_FALSE); 2863 } 2864 2865 /* 2866 * Keep pi_flags in synch. with actual flags. Assumes flags are 2867 * phyint flags. 
2868 */ 2869 if (setfl) 2870 pi->pi_flags |= flags; 2871 else 2872 pi->pi_flags &= ~flags; 2873 2874 if (pi->pi_v4) 2875 pi->pi_v4->pii_flags = pi->pi_flags; 2876 2877 if (pi->pi_v6) 2878 pi->pi_v6->pii_flags = pi->pi_flags; 2879 2880 return (_B_TRUE); 2881 } 2882 2883 /* 2884 * icmp cksum computation for IPv4. 2885 */ 2886 static int 2887 in_cksum(ushort_t *addr, int len) 2888 { 2889 register int nleft = len; 2890 register ushort_t *w = addr; 2891 register ushort_t answer; 2892 ushort_t odd_byte = 0; 2893 register int sum = 0; 2894 2895 /* 2896 * Our algorithm is simple, using a 32 bit accumulator (sum), 2897 * we add sequential 16 bit words to it, and at the end, fold 2898 * back all the carry bits from the top 16 bits into the lower 2899 * 16 bits. 2900 */ 2901 while (nleft > 1) { 2902 sum += *w++; 2903 nleft -= 2; 2904 } 2905 2906 /* mop up an odd byte, if necessary */ 2907 if (nleft == 1) { 2908 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2909 sum += odd_byte; 2910 } 2911 2912 /* 2913 * add back carry outs from top 16 bits to low 16 bits 2914 */ 2915 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2916 sum += (sum >> 16); /* add carry */ 2917 answer = ~sum; /* truncate to 16 bits */ 2918 return (answer); 2919 } 2920 2921 static void 2922 reset_snxt_basetimes(void) 2923 { 2924 struct phyint_instance *pii; 2925 2926 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2927 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2928 } 2929 } 2930 2931 /* 2932 * Is the address one of our own addresses? Unfortunately, 2933 * we cannot check our phyint tables to determine if the address 2934 * is our own. This is because, we don't track interfaces that 2935 * are not part of any group. We have to either use a 'bind' or 2936 * get the complete list of all interfaces using SIOCGLIFCONF, 2937 * to do this check. We could also use SIOCTMYADDR. 
2938 * Bind fails for the local zone address, so we might include local zone 2939 * address as target address. If local zone address is a target address 2940 * and it is up, it is not possible to detect the interface failure. 2941 * SIOCTMYADDR also doesn't consider local zone address as own address. 2942 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2943 * are stored in laddr_list. 2944 */ 2945 2946 boolean_t 2947 own_address(struct in6_addr addr) 2948 { 2949 struct local_addr *taddr = laddr_list; 2950 2951 for (; taddr != NULL; taddr = taddr->next) { 2952 if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { 2953 return (_B_TRUE); 2954 } 2955 } 2956 return (_B_FALSE); 2957 } 2958