1 /* 2 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 * Copyright (c) 1987 Regents of the University of California. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms are permitted 11 * provided that the above copyright notice and this paragraph are 12 * duplicated in all such forms and that any documentation, 13 * advertising materials, and other materials related to such 14 * distribution and use acknowledge that the software was developed 15 * by the University of California, Berkeley. The name of the 16 * University may not be used to endorse or promote products derived 17 * from this software without specific prior written permission. 18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 21 */ 22 23 #pragma ident "%Z%%M% %I% %E% SMI" 24 25 #include "mpd_defs.h" 26 #include "mpd_tables.h" 27 28 /* 29 * Probe types for probe() 30 */ 31 #define PROBE_UNI 0x1234 /* Unicast probe packet */ 32 #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 33 #define PROBE_RTT 0x9abc /* RTT only probe packet */ 34 35 #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 36 37 /* 38 * Format of probe / probe response packets. This is an ICMP Echo request 39 * or ICMP Echo reply. 
 * Packet format is same for both IPv4 and IPv6
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* ICMP/ICMPv6 type (echo request/reply) */
	uint8_t  pr_icmp_code;		/* code field; probe() always sends 0 */
	uint16_t pr_icmp_cksum;		/* checksum; probe() fills it for IPv4 */
	uint16_t pr_icmp_id;		/* Identification (pii_icmpid) */
	uint16_t pr_icmp_seq;		/* sequence number, network byte order */
	uint32_t pr_icmp_timestamp;	/* send time, network byte order */
	uint32_t pr_icmp_mtype;		/* PROBE_* type, network byte order */
};

/* IPv6 destination for PROBE_MULTI probes: ff02::1 */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* IPv4 destination for PROBE_MULTI probes: 224.0.0.1 */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */

/* Helpers used while receiving and classifying probe replies */
static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
static void		pi_set_crtt(struct target *tg, int m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

/* CRTT and probe-statistics helpers */
static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

/* Failover / failback between phyints */
static int		failover(struct phyint *from, struct phyint *to);
static int		failback(struct phyint *from, struct phyint *to);
static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
 *
 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
 * of the link (according to the driver).  If the phyint is also configured
 * with a test address (the common case) and probe targets, then a phyint must
 * also successfully be able to send and receive probes in order to remain in
 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not perform a failover, but reports the interface
 * as IPMP_IF_UNKNOWN through the async events and query interfaces).
122 * 123 * At any point, a phyint may be administratively marked offline via if_mpadm. 124 * In this case, the interface always transitions to PI_OFFLINE, regardless 125 * of its previous state. When the interface is later brought back online, 126 * in.mpathd acts as if the interface is new (and thus it transitions to 127 * PI_RUNNING or PI_FAILED based on the status of the link and the result of 128 * its probes, if probes are sent). 129 * 130 * pi_state - PI_RUNNING or PI_FAILED 131 * PI_RUNNING: The failure detection logic says the phyint is good. 132 * PI_FAILED: The failure detection logic says the phyint has failed. 133 * 134 * pg_groupfailed - Group failure, all interfaces in the group have failed. 135 * The pi_state may be either PI_FAILED or PI_NOTARGETS. 136 * In the case of router targets, we assume that the current list of 137 * targets obtained from the routing table, is still valid, so the 138 * phyint stat is PI_FAILED. In the case of host targets, we delete the 139 * list of targets, and multicast to the all hosts, to reconstruct the 140 * target list. So the phyints are in the PI_NOTARGETS state. 141 * 142 * I - value of (pi_flags & IFF_INACTIVE) 143 * IFF_INACTIVE: No failovers have been done to this phyint, from 144 * other phyints. This phyint is inactive. Phyint can be a Standby. 145 * When failback has been disabled (FAILOVER=no configured), 146 * phyint can also be a non-STANDBY. In this case IFF_INACTIVE 147 * is set when phyint subsequently recovers after a failure. 148 * 149 * pi_empty 150 * This phyint has failed over successfully to another phyint, and 151 * this phyint is currently "empty". It does not host any addresses or 152 * multicast membership etc. This is the state of a phyint after a 153 * failover from the phyint has completed successfully and no subsequent 154 * 'failover to' or 'failback to' has occurred on the phyint. 
155 * IP guarantees that no new logicals will be hosted nor any multicast 156 * joins permitted on the phyint, since the phyint is either failed or 157 * inactive. pi_empty is set implies the phyint is either failed or 158 * inactive. 159 * 160 * pi_full 161 * The phyint hosts all of its own addresses that it "owns". If the 162 * phyint was previously failed or inactive, failbacks to the phyint 163 * has completed successfully. i.e. No more failbacks to this phyint 164 * can produce any change in system state whatsoever. 165 * 166 * Not all 32 possible combinations of the above 5-tuple are possible. 167 * Furthermore some of the above combinations are transient. They may occur 168 * only because the failover or failback did not complete successfully. The 169 * failover/failback will be retried and eventually a stable state will be 170 * reached. 171 * 172 * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. 173 * The following are the state machines. 'from' and 'to' are the src and 174 * dst of the failover/failback, below 175 * 176 * pi_empty state machine 177 * --------------------------------------------------------------------------- 178 * Event State -> New State 179 * --------------------------------------------------------------------------- 180 * successful completion from.pi_empty = 0 -> from.pi_empty = 1 181 * of failover 182 * 183 * Initiate failover to.pi_empty = X -> to.pi_empty = 0 184 * 185 * Initiate failback to.pi_empty = X -> to.pi_empty = 0 186 * 187 * group failure pi_empty = X -> pi_empty = 0 188 * --------------------------------------------------------------------------- 189 * 190 * pi_full state machine 191 * --------------------------------------------------------------------------- 192 * Event State -> New State 193 * --------------------------------------------------------------------------- 194 * successful completion to.pi_full = 0 -> to.pi_full = 1 195 * of failback from 196 * each of the other phyints 197 * 198 
* Initiate failover from.pi_full = X -> from.pi_full = 0 199 * 200 * group failure pi_full = X -> pi_full = 0 201 * --------------------------------------------------------------------------- 202 * 203 * pi_state state machine 204 * --------------------------------------------------------------------------- 205 * Event State New State 206 * Action: 207 * --------------------------------------------------------------------------- 208 * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 209 * detection : set IFF_FAILED on this phyint 210 * : failover from this phyint to another 211 * 212 * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 213 * detection : set IFF_FAILED on this phyint 214 * 215 * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) 216 * detection -> (PI_RUNNING, I == 0) 217 * : to.pi_empty = 0 218 * : clear IFF_FAILED on this phyint 219 * : failback to this phyint if enabled 220 * 221 * NIC repair (PI_FAILED, I == 0, FAILBACK=no) 222 * detection -> (PI_RUNNING, I == 1) 223 * : to.pi_empty = 0 224 * : clear IFF_FAILED on this phyint 225 * : if failback is disabled set I == 1 226 * 227 * Group failure (perform on all phyints in the group) 228 * detection PI_RUNNING PI_FAILED 229 * (Router targets) : set IFF_FAILED 230 * : clear pi_empty and pi_full 231 * 232 * Group failure (perform on all phyints in the group) 233 * detection PI_RUNNING PI_NOTARGETS 234 * (Host targets) : set IFF_FAILED 235 * : clear pi_empty and pi_full 236 * : delete the target list on all phyints 237 * --------------------------------------------------------------------------- 238 * 239 * I state machine 240 * --------------------------------------------------------------------------- 241 * Event State Action: 242 * --------------------------------------------------------------------------- 243 * Turn on I pi_empty == 0, STANDBY : failover from standby 244 * 245 * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 246 * pi_full == 0 : failback to this if enabled 247 * 
--------------------------------------------------------------------------- 248 * 249 * Assertions: (Read '==>' as implies) 250 * 251 * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) 252 * (pi_empty == 1) ==> (pi_full == 0) 253 * (pi_full == 1) ==> (pi_empty == 0) 254 * 255 * Invariants 256 * 257 * pg_groupfailed = 0 && 258 * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby 259 * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint 260 * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint 261 * 262 * 1. says that an inactive standby, that is not empty, has to be failed 263 * over. For a standby to be truly inactive, it should not host any 264 * addresses. So we move them to some other phyint. Usually we catch the 265 * turn on of IFF_INACTIVE, and perform this action. However if the failover 266 * did not complete successfully, then subsequently we have lost the edge 267 * trigger, and this invariant kicks in and completes the action. 268 * 269 * 2. says that any failed phyint that is not empty must be failed over. 270 * Usually we do the failover when we detect NIC failure. However if the 271 * failover does not complete successfully, this invariant kicks in and 272 * completes the failover. We exclude inactive standby which is covered by 1. 273 * 274 * 3. says that any running phyint that is not full must be failed back. 275 * Usually we do the failback when we detect NIC repair. However if the 276 * failback does not complete successfully, this invariant kicks in and 277 * completes the failback. Note that we don't want to failback to an inactive 278 * standby. 279 * 280 * The invariants 1 - 3 and the actions are in initifs(). 281 */ 282 283 struct probes_missed probes_missed; 284 285 /* 286 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 287 * will be added on by the kernel. The id field identifies this phyint. 288 * and the sequence number is an increasing (modulo 2^^16) integer. 
The data 289 * portion holds the time value when the packet is sent. On echo this is 290 * extracted to compute the round-trip time. Three different types of 291 * probe packets are used. 292 * 293 * PROBE_UNI: This type is used to do failure detection / failure recovery 294 * and RTT calculation. PROBE_UNI probes are spaced apart in time, 295 * not less than the current CRTT. pii_probes[] stores data 296 * about these probes. These packets consume sequence number space. 297 * 298 * PROBE_RTT: This type is used to make only rtt measurements. Normally these 299 * are not used. Under heavy network load, the rtt may go up very high, 300 * due to a spike, or may appear to go high, due to extreme scheduling 301 * delays. Once the network stress is removed, mpathd takes a long time to 302 * recover, because the probe_interval is already high, and it takes 303 * a long time to send out sufficient number of probes to bring down the 304 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 305 * user_probe_interval ms. and will cause only rtt updates. These packets 306 * do not consume sequence number space nor is information about these 307 * packets stored in the pii_probes[] 308 * 309 * PROBE_MULTI: This type is only used to construct a list of targets, when 310 * no targets are known. The packet is multicast to the all hosts addr. 
311 */ 312 static void 313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 314 { 315 struct pr_icmp probe_pkt; /* Probe packet */ 316 struct sockaddr_in6 whereto6; /* target address IPv6 */ 317 struct sockaddr_in whereto; /* target address IPv4 */ 318 int pr_ndx; /* probe index in pii->pii_probes[] */ 319 boolean_t sent = _B_TRUE; 320 321 if (debug & D_TARGET) { 322 logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 323 pii->pii_name, probe_type, cur_time); 324 } 325 326 assert(pii->pii_probe_sock != -1); 327 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 328 probe_type == PROBE_RTT); 329 330 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 331 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 332 probe_pkt.pr_icmp_code = 0; 333 probe_pkt.pr_icmp_cksum = 0; 334 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 335 336 /* 337 * Since there is no need to do arithmetic on the icmpid, 338 * (only equality check is done) pii_icmpid is stored in 339 * network byte order at initialization itself. 340 */ 341 probe_pkt.pr_icmp_id = pii->pii_icmpid; 342 probe_pkt.pr_icmp_timestamp = htonl(cur_time); 343 probe_pkt.pr_icmp_mtype = htonl(probe_type); 344 345 /* 346 * If probe_type is PROBE_MULTI, this packet will be multicast to 347 * the all hosts address. Otherwise it is unicast to the next target. 
348 */ 349 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 350 pii->pii_rtt_target_next != NULL)); 351 352 if (pii->pii_af == AF_INET6) { 353 bzero(&whereto6, sizeof (whereto6)); 354 whereto6.sin6_family = AF_INET6; 355 if (probe_type == PROBE_MULTI) { 356 whereto6.sin6_addr = all_nodes_mcast_v6; 357 } else if (probe_type == PROBE_UNI) { 358 whereto6.sin6_addr = pii->pii_target_next->tg_address; 359 } else { 360 /* type is PROBE_RTT */ 361 whereto6.sin6_addr = 362 pii->pii_rtt_target_next->tg_address; 363 } 364 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 365 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 366 sizeof (whereto6)) != sizeof (probe_pkt)) { 367 logperror_pii(pii, "probe: probe sendto"); 368 sent = _B_FALSE; 369 } 370 } else { 371 bzero(&whereto, sizeof (whereto)); 372 whereto.sin_family = AF_INET; 373 if (probe_type == PROBE_MULTI) { 374 whereto.sin_addr = all_nodes_mcast_v4; 375 } else if (probe_type == PROBE_UNI) { 376 IN6_V4MAPPED_TO_INADDR( 377 &pii->pii_target_next->tg_address, 378 &whereto.sin_addr); 379 } else { 380 /* type is PROBE_RTT */ 381 IN6_V4MAPPED_TO_INADDR( 382 &pii->pii_rtt_target_next->tg_address, 383 &whereto.sin_addr); 384 } 385 386 /* 387 * Compute the IPv4 icmp checksum. Does not cover the IP header. 388 */ 389 probe_pkt.pr_icmp_cksum = 390 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 391 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 392 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 393 sizeof (whereto)) != sizeof (probe_pkt)) { 394 logperror_pii(pii, "probe: probe sendto"); 395 sent = _B_FALSE; 396 } 397 } 398 399 /* 400 * If this is a PROBE_UNI probe packet being unicast to a target, then 401 * update our tables. We will need this info in processing the probe 402 * response. PROBE_MULTI and PROBE_RTT packets are not used for 403 * the purpose of failure or recovery detection. PROBE_MULTI packets 404 * are only used to construct a list of targets. 
PROBE_RTT packets are 405 * used only for updating the rtt and not for failure detection. 406 */ 407 if (probe_type == PROBE_UNI && sent) { 408 pr_ndx = pii->pii_probe_next; 409 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 410 411 /* Collect statistics, before we reuse the last slot. */ 412 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 413 pii->pii_cum_stats.lost++; 414 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 415 pii->pii_cum_stats.acked++; 416 pii->pii_cum_stats.sent++; 417 418 pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 419 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 420 pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 421 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 422 pii->pii_target_next = target_next(pii->pii_target_next); 423 assert(pii->pii_target_next != NULL); 424 /* 425 * If we have a single variable to denote the next target to 426 * probe for both rtt probes and failure detection probes, we 427 * could end up with a situation where the failure detection 428 * probe targets become disjoint from the rtt probe targets. 429 * Eg. if 2 targets and the actual fdt is double the user 430 * specified fdt. So we have 2 variables. In this scheme 431 * we also reset pii_rtt_target_next for every fdt probe, 432 * though that may not be necessary. 433 */ 434 pii->pii_rtt_target_next = pii->pii_target_next; 435 pii->pii_snxt++; 436 } else if (probe_type == PROBE_RTT) { 437 pii->pii_rtt_target_next = 438 target_next(pii->pii_rtt_target_next); 439 assert(pii->pii_rtt_target_next != NULL); 440 } 441 } 442 443 /* 444 * Incoming IPv4 data from wire, is received here. Called from main. 
445 */ 446 void 447 in_data(struct phyint_instance *pii) 448 { 449 struct sockaddr_in from; 450 struct in6_addr fromaddr; 451 uint_t fromlen; 452 static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 453 struct ip *ip; 454 int iphlen; 455 int len; 456 char abuf[INET_ADDRSTRLEN]; 457 struct pr_icmp *reply; 458 459 if (debug & D_PROBE) { 460 logdebug("in_data(%s %s)\n", 461 AF_STR(pii->pii_af), pii->pii_name); 462 } 463 464 /* 465 * Poll has already told us that a message is waiting, 466 * on this socket. Read it now. We should not block. 467 */ 468 fromlen = sizeof (from); 469 len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 470 sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 471 if (len < 0) { 472 logperror_pii(pii, "in_data: recvfrom"); 473 return; 474 } 475 476 /* 477 * If the NIC has indicated the link is down, don't go 478 * any further. 479 */ 480 if (LINK_DOWN(pii->pii_phyint)) 481 return; 482 483 /* Get the printable address for error reporting */ 484 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 485 486 /* Make sure packet contains at least minimum ICMP header */ 487 ip = (struct ip *)in_packet; 488 iphlen = ip->ip_hl << 2; 489 if (len < iphlen + ICMP_MINLEN) { 490 if (debug & D_PKTBAD) { 491 logdebug("in_data: packet too short (%d bytes)" 492 " from %s\n", len, abuf); 493 } 494 return; 495 } 496 497 /* 498 * Subtract the IP hdr length, 'len' will be length of the probe 499 * reply, starting from the icmp hdr. 500 */ 501 len -= iphlen; 502 /* LINTED */ 503 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 504 505 /* Probe replies are icmp echo replies. Ignore anything else */ 506 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 507 return; 508 509 /* 510 * The icmp id should match what we sent, which is stored 511 * in pi_icmpid. The icmp code for reply must be 0. 
512 * The reply content must be a struct pr_icmp 513 */ 514 if (reply->pr_icmp_id != pii->pii_icmpid) { 515 /* Not in response to our probe */ 516 return; 517 } 518 519 if (reply->pr_icmp_code != 0) { 520 logtrace("probe reply code %d from %s on %s\n", 521 reply->pr_icmp_code, abuf, pii->pii_name); 522 return; 523 } 524 525 if (len < sizeof (struct pr_icmp)) { 526 logtrace("probe reply too short: %d bytes from %s on %s\n", 527 len, abuf, pii->pii_name); 528 return; 529 } 530 531 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 532 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 533 /* Unicast probe reply */ 534 incoming_echo_reply(pii, reply, fromaddr); 535 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 536 /* Multicast reply */ 537 incoming_mcast_reply(pii, reply, fromaddr); 538 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 539 incoming_rtt_reply(pii, reply, fromaddr); 540 } else { 541 /* Probably not in response to our probe */ 542 logtrace("probe reply type: %d from %s on %s\n", 543 reply->pr_icmp_mtype, abuf, pii->pii_name); 544 return; 545 } 546 547 } 548 549 /* 550 * Incoming IPv6 data from wire is received here. Called from main. 
551 */ 552 void 553 in6_data(struct phyint_instance *pii) 554 { 555 struct sockaddr_in6 from; 556 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 557 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 558 int len; 559 char abuf[INET6_ADDRSTRLEN]; 560 struct msghdr msg; 561 struct iovec iov; 562 uchar_t *opt; 563 struct pr_icmp *reply; 564 565 if (debug & D_PROBE) { 566 logdebug("in6_data(%s %s)\n", 567 AF_STR(pii->pii_af), pii->pii_name); 568 } 569 570 iov.iov_base = (char *)in_packet; 571 iov.iov_len = sizeof (in_packet); 572 msg.msg_iov = &iov; 573 msg.msg_iovlen = 1; 574 msg.msg_name = (struct sockaddr *)&from; 575 msg.msg_namelen = sizeof (from); 576 msg.msg_control = ancillary_data; 577 msg.msg_controllen = sizeof (ancillary_data); 578 579 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 580 logperror_pii(pii, "in6_data: recvfrom"); 581 return; 582 } 583 584 /* 585 * If the NIC has indicated that the link is down, don't go 586 * any further. 587 */ 588 if (LINK_DOWN(pii->pii_phyint)) 589 return; 590 591 /* Get the printable address for error reporting */ 592 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 593 if (len < ICMP_MINLEN) { 594 if (debug & D_PKTBAD) { 595 logdebug("Truncated message: msg_flags 0x%x from %s\n", 596 msg.msg_flags, abuf); 597 } 598 return; 599 } 600 /* Ignore packets > 64k or control buffers that don't fit */ 601 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 602 if (debug & D_PKTBAD) { 603 logdebug("Truncated message: msg_flags 0x%x from %s\n", 604 msg.msg_flags, abuf); 605 } 606 return; 607 } 608 609 reply = (struct pr_icmp *)in_packet; 610 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 611 return; 612 613 if (reply->pr_icmp_id != pii->pii_icmpid) { 614 /* Not in response to our probe */ 615 return; 616 } 617 618 /* 619 * The kernel has already verified the the ICMP checksum. 
620 */ 621 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 622 logtrace("ICMPv6 echo reply source address not linklocal from " 623 "%s on %s\n", abuf, pii->pii_name); 624 return; 625 } 626 opt = find_ancillary(&msg, IPV6_RTHDR); 627 if (opt != NULL) { 628 /* Can't allow routing headers in probe replies */ 629 logtrace("message with routing header from %s on %s\n", 630 abuf, pii->pii_name); 631 return; 632 } 633 if (reply->pr_icmp_code != 0) { 634 logtrace("probe reply code: %d from %s on %s\n", 635 reply->pr_icmp_code, abuf, pii->pii_name); 636 return; 637 } 638 if (len < (sizeof (struct pr_icmp))) { 639 logtrace("probe reply too short: %d bytes from %s on %s\n", 640 len, abuf, pii->pii_name); 641 return; 642 } 643 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 644 incoming_echo_reply(pii, reply, from.sin6_addr); 645 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 646 incoming_mcast_reply(pii, reply, from.sin6_addr); 647 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 648 incoming_rtt_reply(pii, reply, from.sin6_addr); 649 } else { 650 /* Probably not in response to our probe */ 651 logtrace("probe reply type: %d from %s on %s\n", 652 reply->pr_icmp_mtype, abuf, pii->pii_name); 653 } 654 } 655 656 /* 657 * Process the incoming rtt reply, in response to our rtt probe. 658 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 659 * have any stored information about the probe we sent. So we don't log 660 * any errors if we receive bad replies. 
661 */ 662 static void 663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 664 struct in6_addr fromaddr) 665 { 666 int m; /* rtt measurment in ms */ 667 uint32_t cur_time; /* in ms from some arbitrary point */ 668 char abuf[INET6_ADDRSTRLEN]; 669 struct target *target; 670 uint32_t pr_icmp_timestamp; 671 struct phyint_group *pg; 672 673 /* Get the printable address for error reporting */ 674 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 675 676 if (debug & D_PROBE) { 677 logdebug("incoming_rtt_reply: %s %s %s\n", 678 AF_STR(pii->pii_af), pii->pii_name, abuf); 679 } 680 681 /* Do we know this target ? */ 682 target = target_lookup(pii, fromaddr); 683 if (target == NULL) 684 return; 685 686 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 687 cur_time = getcurrenttime(); 688 m = (int)(cur_time - pr_icmp_timestamp); 689 690 /* Invalid rtt. It has wrapped around */ 691 if (m < 0) 692 return; 693 694 /* 695 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 696 * The initial few responses after the interface is repaired may 697 * contain high rtt's because they could have been queued up waiting 698 * for ARP/NDP resolution on a failed interface. 699 */ 700 pg = pii->pii_phyint->pi_group; 701 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 702 return; 703 704 /* 705 * Update rtt only if the new rtt is lower than the current rtt. 706 * (specified by the 3rd parameter to pi_set_crtt). 707 * If a spike has caused the current probe_interval to be > 708 * user_probe_interval, then this mechanism is used to bring down 709 * the rtt rapidly once the network stress is removed. 710 * If the new rtt is higher than the current rtt, we don't want to 711 * update the rtt. We are having more than 1 outstanding probe and 712 * the increase in rtt we are seeing is being unnecessarily weighted 713 * many times. The regular rtt update will be handled by 714 * incoming_echo_reply() and will take care of any rtt increase. 
715 */ 716 pi_set_crtt(target, m, _B_FALSE); 717 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 718 (user_failure_detection_time < pg->pg_fdt) && 719 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 720 /* 721 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 722 * investigate if we can improve the failure detection time to 723 * meet whatever the user specified. 724 */ 725 if (check_pg_crtt_improved(pg)) { 726 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 727 user_failure_detection_time); 728 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 729 if (pii->pii_phyint->pi_group != phyint_anongroup) { 730 logerr("Improved failure detection time %d ms " 731 "on (%s %s) for group \"%s\"\n", 732 pg->pg_fdt, AF_STR(pii->pii_af), 733 pii->pii_name, 734 pii->pii_phyint->pi_group->pg_name); 735 } 736 if (user_failure_detection_time == pg->pg_fdt) { 737 /* Avoid any truncation or rounding errors */ 738 pg->pg_probeint = user_probe_interval; 739 /* 740 * No more rtt probes will be sent. The actual 741 * fdt has dropped to the user specified value. 742 * pii_fd_snxt_basetime and pii_snxt_basetime 743 * will be in sync henceforth. 744 */ 745 reset_snxt_basetimes(); 746 } 747 } 748 } 749 } 750 751 /* 752 * Process the incoming echo reply, in response to our unicast probe. 
753 * Common for both IPv4 and IPv6 754 */ 755 static void 756 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 757 struct in6_addr fromaddr) 758 { 759 int m; /* rtt measurment in ms */ 760 uint32_t cur_time; /* in ms from some arbitrary point */ 761 char abuf[INET6_ADDRSTRLEN]; 762 int pr_ndx; 763 struct target *target; 764 boolean_t exception; 765 uint32_t pr_icmp_timestamp; 766 uint16_t pr_icmp_seq; 767 struct phyint_group *pg = pii->pii_phyint->pi_group; 768 769 /* Get the printable address for error reporting */ 770 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 771 772 if (debug & D_PROBE) { 773 logdebug("incoming_echo_reply: %s %s %s seq %u\n", 774 AF_STR(pii->pii_af), pii->pii_name, abuf, 775 ntohs(reply->pr_icmp_seq)); 776 } 777 778 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 779 pr_icmp_seq = ntohs(reply->pr_icmp_seq); 780 781 /* Reject out of window probe replies */ 782 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 783 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 784 logtrace("out of window probe seq %u snxt %u on %s from %s\n", 785 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 786 pii->pii_cum_stats.unknown++; 787 return; 788 } 789 cur_time = getcurrenttime(); 790 m = (int)(cur_time - pr_icmp_timestamp); 791 if (m < 0) { 792 /* 793 * This is a ridiculously high value of rtt. rtt has wrapped 794 * around. Log a message, and ignore the rtt. 795 */ 796 logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " 797 "timestamp %u\n", cur_time, pr_icmp_timestamp); 798 } 799 800 /* 801 * Get the probe index pr_ndx corresponding to the received icmp seq. 802 * number in our pii->pii_probes[] array. 
The icmp sequence number 803 * pii_snxt corresponds to the probe index pii->pii_probe_next 804 */ 805 pr_ndx = MOD_SUB(pii->pii_probe_next, 806 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 807 808 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 809 810 target = pii->pii_probes[pr_ndx].pr_target; 811 812 /* 813 * Perform sanity checks, whether this probe reply that we 814 * have received is genuine 815 */ 816 if (target != NULL) { 817 /* 818 * Compare the src. addr of the received ICMP or ICMPv6 819 * probe reply with the target address in our tables. 820 */ 821 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 822 /* 823 * We don't have any record of having sent a probe to 824 * this target. This is a fake probe reply. Log an error 825 */ 826 logtrace("probe status %d Fake probe reply seq %u " 827 "snxt %u on %s from %s\n", 828 pii->pii_probes[pr_ndx].pr_status, 829 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 830 pii->pii_cum_stats.unknown++; 831 return; 832 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 833 /* 834 * The address matches, but our tables indicate that 835 * this probe reply has been acked already. So this 836 * is a duplicate probe reply. Log an error 837 */ 838 logtrace("probe status %d Duplicate probe reply seq %u " 839 "snxt %u on %s from %s\n", 840 pii->pii_probes[pr_ndx].pr_status, 841 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 842 pii->pii_cum_stats.unknown++; 843 return; 844 } 845 } else { 846 /* 847 * Target must not be NULL in the PR_UNACKED state 848 */ 849 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 850 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 851 /* 852 * The probe stats slot is unused. So we didn't 853 * send out any probe to this target. This is a fake. 854 * Log an error. 
855 */ 856 logtrace("probe status %d Fake probe reply seq %u " 857 "snxt %u on %s from %s\n", 858 pii->pii_probes[pr_ndx].pr_status, 859 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 860 } 861 pii->pii_cum_stats.unknown++; 862 return; 863 } 864 865 /* 866 * If the rtt does not appear to be right, don't update the 867 * rtt stats. This can happen if the system dropped into the 868 * debugger, or the system was hung or too busy for a 869 * substantial time that we didn't get a chance to run. 870 */ 871 if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { 872 /* 873 * If the probe corresponding to this receieved response 874 * was truly sent 'm' ms. ago, then this response must 875 * have been rejected by the sequence number checks. The 876 * fact that it has passed the sequence number checks 877 * means that the measured rtt is wrong. We were probably 878 * scheduled long after the packet was received. 879 */ 880 goto out; 881 } 882 883 /* 884 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 885 * The initial few responses after the interface is repaired may 886 * contain high rtt's because they could have been queued up waiting 887 * for ARP/NDP resolution on a failed interface. 888 */ 889 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 890 goto out; 891 892 /* 893 * Don't update the Conservative Round Trip Time estimate for this 894 * (phint, target) pair if this is the not the highest ack seq seen 895 * thus far on this target. 896 */ 897 if (!highest_ack_tg(pr_icmp_seq, target)) 898 goto out; 899 900 /* 901 * Always update the rtt. This is a failure detection probe 902 * and we want to measure both increase / decrease in rtt. 903 */ 904 pi_set_crtt(target, m, _B_TRUE); 905 906 /* 907 * If the crtt exceeds the average time between probes, 908 * investigate if this slow target is an exception. If so we 909 * can avoid this target and still meet the failure detection 910 * time. Otherwise we can't meet the failure detection time. 
911 */ 912 if (target->tg_crtt > pg->pg_probeint) { 913 exception = check_exception_target(pii, target); 914 if (exception) { 915 /* 916 * This target is exceptionally slow. Don't use it 917 * for future probes. check_exception_target() has 918 * made sure that we have at least MIN_PROBE_TARGETS 919 * other active targets 920 */ 921 if (pii->pii_targets_are_routers) { 922 /* 923 * This is a slow router, mark it as slow 924 * and don't use it for further probes. We 925 * don't delete it, since it will be populated 926 * again when we do a router scan. Hence we 927 * need to maintain extra state (unlike the 928 * host case below). Mark it as TG_SLOW. 929 */ 930 if (target->tg_status == TG_ACTIVE) 931 pii->pii_ntargets--; 932 target->tg_status = TG_SLOW; 933 target->tg_latime = gethrtime(); 934 target->tg_rtt_sa = -1; 935 target->tg_crtt = 0; 936 target->tg_rtt_sd = 0; 937 if (pii->pii_target_next == target) { 938 pii->pii_target_next = 939 target_next(target); 940 } 941 } else { 942 /* 943 * the slow target is not a router, we can 944 * just delete it. Send an icmp multicast and 945 * pick the fastest responder that is not 946 * already an active target. target_delete() 947 * adjusts pii->pii_target_next 948 */ 949 target_delete(target); 950 probe(pii, PROBE_MULTI, cur_time); 951 } 952 } else { 953 /* 954 * We can't meet the failure detection time. 955 * Log a message, and update the detection time to 956 * whatever we can achieve. 
957 */ 958 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 959 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 960 last_fdt_bumpup_time = gethrtime(); 961 if (pg != phyint_anongroup) { 962 logerr("Cannot meet requested failure detection" 963 " time of %d ms on (%s %s) new failure" 964 " detection time for group \"%s\" is %d" 965 " ms\n", user_failure_detection_time, 966 AF_STR(pii->pii_af), pii->pii_name, 967 pg->pg_name, pg->pg_fdt); 968 } 969 } 970 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 971 (user_failure_detection_time < pg->pg_fdt) && 972 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 973 /* 974 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 975 * investigate if we can improve the failure detection time to 976 * meet whatever the user specified. 977 */ 978 if (check_pg_crtt_improved(pg)) { 979 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 980 user_failure_detection_time); 981 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 982 if (pg != phyint_anongroup) { 983 logerr("Improved failure detection time %d ms " 984 "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 985 AF_STR(pii->pii_af), pii->pii_name, 986 pg->pg_name); 987 } 988 if (user_failure_detection_time == pg->pg_fdt) { 989 /* Avoid any truncation or rounding errors */ 990 pg->pg_probeint = user_probe_interval; 991 /* 992 * No more rtt probes will be sent. The actual 993 * fdt has dropped to the user specified value. 994 * pii_fd_snxt_basetime and pii_snxt_basetime 995 * will be in sync henceforth. 996 */ 997 reset_snxt_basetimes(); 998 } 999 } 1000 } 1001 out: 1002 pii->pii_probes[pr_ndx].pr_status = PR_ACKED; 1003 pii->pii_probes[pr_ndx].pr_time_acked = cur_time; 1004 1005 /* 1006 * Update pii->pii_rack, i.e. the sequence number of the last received 1007 * probe response, based on the echo reply we have received now, if 1008 * either of the following conditions are satisfied. 1009 * a. 
pii_rack is outside the current receive window of 1010 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 1011 * This means we have not received probe responses for a 1012 * long time, and the sequence number has wrapped around. 1013 * b. pii_rack is within the current receive window and this echo 1014 * reply corresponds to the highest sequence number we have seen 1015 * so far. 1016 */ 1017 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 1018 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 1019 SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 1020 pii->pii_rack = pr_icmp_seq; 1021 } 1022 } 1023 1024 /* 1025 * Returns true if seq is the highest unacknowledged seq for target tg 1026 * else returns false 1027 */ 1028 static boolean_t 1029 highest_ack_tg(uint16_t seq, struct target *tg) 1030 { 1031 struct phyint_instance *pii; 1032 int pr_ndx; 1033 uint16_t pr_seq; 1034 1035 pii = tg->tg_phyint_inst; 1036 1037 /* 1038 * Get the seq number of the most recent probe sent so far, 1039 * and also get the corresponding probe index in the probe stats 1040 * array. 1041 */ 1042 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1043 pr_seq = pii->pii_snxt; 1044 pr_seq--; 1045 1046 /* 1047 * Start from the most recent probe and walk back, trying to find 1048 * an acked probe corresponding to target tg. 1049 */ 1050 for (; pr_ndx != pii->pii_probe_next; 1051 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 1052 if (pii->pii_probes[pr_ndx].pr_target == tg && 1053 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 1054 if (SEQ_GT(pr_seq, seq)) 1055 return (_B_FALSE); 1056 } 1057 } 1058 return (_B_TRUE); 1059 } 1060 1061 /* 1062 * Check whether the crtt for the group has improved by a factor of 1063 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1064 * detection time flapping in the face of small crtt changes. 
1065 */ 1066 static boolean_t 1067 check_pg_crtt_improved(struct phyint_group *pg) 1068 { 1069 struct phyint *pi; 1070 1071 if (debug & D_PROBE) 1072 logdebug("check_pg_crtt_improved()\n"); 1073 1074 /* 1075 * The crtt for the group is only improved if each phyint_instance 1076 * for both ipv4 and ipv6 is improved. 1077 */ 1078 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1079 if (!check_pii_crtt_improved(pi->pi_v4) || 1080 !check_pii_crtt_improved(pi->pi_v6)) 1081 return (_B_FALSE); 1082 } 1083 1084 return (_B_TRUE); 1085 } 1086 1087 /* 1088 * Check whether the crtt has improved substantially on this phyint_instance. 1089 * Returns _B_TRUE if there's no crtt information available, because pii 1090 * is NULL or the phyint_instance is not capable of probing. 1091 */ 1092 boolean_t 1093 check_pii_crtt_improved(struct phyint_instance *pii) { 1094 struct target *tg; 1095 1096 if (pii == NULL) 1097 return (_B_TRUE); 1098 1099 if (!PROBE_CAPABLE(pii) || 1100 pii->pii_phyint->pi_state == PI_FAILED) 1101 return (_B_TRUE); 1102 1103 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1104 if (tg->tg_status != TG_ACTIVE) 1105 continue; 1106 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1107 LOWER_FDT_TRIGGER)) { 1108 return (_B_FALSE); 1109 } 1110 } 1111 1112 return (_B_TRUE); 1113 } 1114 1115 /* 1116 * This target responds very slowly to probes. The target's crtt exceeds 1117 * the probe interval of its group. 
Compare against other targets
 * and determine if this target is an exception, if so return true, else false
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	char abuf[INET6_ADDRSTRLEN];
	struct target *other;

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    abuf, sizeof (abuf)));
	}

	/*
	 * With fewer than MIN_PROBE_TARGETS + 1 targets there is not enough
	 * to compare against, so the target is kept.
	 */
	if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
		return (_B_FALSE);

	/*
	 * The caller knows this target's crtt exceeds the group's probe
	 * interval. It is an exception only if every other active target
	 * has a crtt of at most (group's probe interval) / EXCEPTION_FACTOR.
	 */
	for (other = pii->pii_targets; other != NULL; other = other->tg_next) {
		if (other == target || other->tg_status != TG_ACTIVE)
			continue;

		if (other->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint /
		    EXCEPTION_FACTOR)
			return (_B_FALSE);
	}

	return (_B_TRUE);
}

/*
 * Update the target list. The icmp all hosts multicast has given us
 * some host to which we can send probes. If we already have sufficient
 * targets, discard it.
1164 */ 1165 static void 1166 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1167 struct in6_addr fromaddr) 1168 /* ARGSUSED */ 1169 { 1170 int af; 1171 char abuf[INET6_ADDRSTRLEN]; 1172 struct phyint *pi; 1173 1174 if (debug & D_PROBE) { 1175 logdebug("incoming_mcast_reply(%s %s %s)\n", 1176 AF_STR(pii->pii_af), pii->pii_name, 1177 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1178 } 1179 1180 /* 1181 * Using host targets is a fallback mechanism. If we have 1182 * found a router, don't add this host target. If we already 1183 * know MAX_PROBE_TARGETS, don't add another target. 1184 */ 1185 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1186 if (pii->pii_targets != NULL) { 1187 if (pii->pii_targets_are_routers || 1188 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1189 return; 1190 } 1191 } 1192 1193 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1194 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1195 /* 1196 * Guard against response from 0.0.0.0 1197 * and ::. Log a trace message 1198 */ 1199 logtrace("probe response from %s on %s\n", 1200 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1201 pii->pii_name); 1202 return; 1203 } 1204 1205 /* 1206 * This address is one of our own, so reject this address as a 1207 * valid probe target. 1208 */ 1209 af = pii->pii_af; 1210 if (own_address(fromaddr)) 1211 return; 1212 1213 /* 1214 * If the phyint is part a named group, then add the address to all 1215 * members of the group. Otherwise, add the address only to the 1216 * phyint itself, since other phyints in the anongroup may not be on 1217 * the same subnet. 
1218 */ 1219 pi = pii->pii_phyint; 1220 if (pi->pi_group == phyint_anongroup) { 1221 target_add(pii, fromaddr, _B_FALSE); 1222 } else { 1223 pi = pi->pi_group->pg_phyint; 1224 for (; pi != NULL; pi = pi->pi_pgnext) 1225 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1226 } 1227 } 1228 1229 /* 1230 * Compute CRTT given an existing scaled average, scaled deviation estimate 1231 * and a new rtt time. The formula is from Jacobson and Karels' 1232 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1233 * are the same as those in Appendix A.2 of that paper. 1234 * 1235 * m = new measurement 1236 * sa = scaled RTT average (8 * average estimates) 1237 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1238 * crtt = Conservative round trip time. Used to determine whether probe 1239 * has timed out. 1240 * 1241 * New scaled average and deviation are passed back via sap and svp 1242 */ 1243 static int 1244 compute_crtt(int *sap, int *svp, int m) 1245 { 1246 int sa = *sap; 1247 int sv = *svp; 1248 int crtt; 1249 int saved_m = m; 1250 1251 assert(*sap >= -1); 1252 assert(*svp >= 0); 1253 1254 if (sa != -1) { 1255 /* 1256 * Update average estimator: 1257 * new rtt = old rtt + 1/8 Error 1258 * where Error = m - old rtt 1259 * i.e. 8 * new rtt = 8 * old rtt + Error 1260 * i.e. new sa = old sa + Error 1261 */ 1262 m -= sa >> 3; /* m is now Error in estimate. */ 1263 if ((sa += m) < 0) { 1264 /* Don't allow the smoothed average to be negative. */ 1265 sa = 0; 1266 } 1267 1268 /* 1269 * Update deviation estimator: 1270 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1271 * i.e. 4 * new mdev = 4 * old mdev + 1272 * (abs(Error) - old mdev) 1273 * i.e. new sv = old sv + (abs(Error) - old mdev) 1274 */ 1275 if (m < 0) 1276 m = -m; 1277 m -= sv >> 2; 1278 sv += m; 1279 } else { 1280 /* Initialization. This is the first response received. 
*/ 1281 sa = (m << 3); 1282 sv = (m << 1); 1283 } 1284 1285 crtt = (sa >> 3) + sv; 1286 1287 if (debug & D_PROBE) { 1288 logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " 1289 "%d\n", saved_m, sa, sv, crtt); 1290 } 1291 1292 *sap = sa; 1293 *svp = sv; 1294 1295 /* 1296 * CRTT = average estimates + 4 * deviation estimates 1297 * = sa / 8 + sv 1298 */ 1299 return (crtt); 1300 } 1301 1302 static void 1303 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) 1304 { 1305 struct phyint_instance *pii = tg->tg_phyint_inst; 1306 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1307 int sa = tg->tg_rtt_sa; 1308 int sv = tg->tg_rtt_sd; 1309 int new_crtt; 1310 int i; 1311 1312 if (debug & D_PROBE) 1313 logdebug("pi_set_crtt: target - m %d\n", m); 1314 1315 /* store the round trip time, in case we need to defer computation */ 1316 tg->tg_deferred[tg->tg_num_deferred] = m; 1317 1318 new_crtt = compute_crtt(&sa, &sv, m); 1319 1320 /* 1321 * If this probe's round trip time would singlehandedly cause an 1322 * increase in the group's probe interval consider it suspect. 1323 */ 1324 if ((new_crtt > probe_interval) && is_probe_uni) { 1325 if (debug & D_PROBE) { 1326 logdebug("Received a suspect probe on %s, new_crtt =" 1327 " %d, probe_interval = %d, num_deferred = %d\n", 1328 pii->pii_probe_logint->li_name, new_crtt, 1329 probe_interval, tg->tg_num_deferred); 1330 } 1331 1332 /* 1333 * If we've deferred as many rtts as we plan on deferring, then 1334 * assume the link really did slow down and process all queued 1335 * rtts 1336 */ 1337 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1338 if (debug & D_PROBE) { 1339 logdebug("Received MAXDEFERREDRTT probes which " 1340 "would cause an increased probe_interval. 
" 1341 "Integrating queued rtt data points.\n"); 1342 } 1343 1344 for (i = 0; i <= tg->tg_num_deferred; i++) { 1345 tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, 1346 &tg->tg_rtt_sd, tg->tg_deferred[i]); 1347 } 1348 1349 tg->tg_num_deferred = 0; 1350 } else { 1351 tg->tg_num_deferred++; 1352 } 1353 return; 1354 } 1355 1356 /* 1357 * If this is a normal probe, or an RTT probe that would lead to a 1358 * reduced CRTT, then update our CRTT data. Further, if this was 1359 * a normal probe, pitch any deferred probes since our probes are 1360 * again being answered within our CRTT estimates. 1361 */ 1362 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1363 tg->tg_rtt_sa = sa; 1364 tg->tg_rtt_sd = sv; 1365 tg->tg_crtt = new_crtt; 1366 if (is_probe_uni) 1367 tg->tg_num_deferred = 0; 1368 } 1369 } 1370 1371 /* 1372 * Return a pointer to the specified option buffer. 1373 * If not found return NULL. 1374 */ 1375 static void * 1376 find_ancillary(struct msghdr *msg, int cmsg_type) 1377 { 1378 struct cmsghdr *cmsg; 1379 1380 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1381 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1382 if (cmsg->cmsg_level == IPPROTO_IPV6 && 1383 cmsg->cmsg_type == cmsg_type) { 1384 return (CMSG_DATA(cmsg)); 1385 } 1386 } 1387 return (NULL); 1388 } 1389 1390 /* 1391 * See if a previously failed interface has started working again. 1392 */ 1393 void 1394 phyint_check_for_repair(struct phyint *pi) 1395 { 1396 if (phyint_repaired(pi)) { 1397 if (pi->pi_group == phyint_anongroup) { 1398 logerr("NIC repair detected on %s\n", pi->pi_name); 1399 } else { 1400 logerr("NIC repair detected on %s of group %s\n", 1401 pi->pi_name, pi->pi_group->pg_name); 1402 } 1403 1404 /* 1405 * If the interface is offline, just clear the FAILED flag, 1406 * delaying the state change and failback operation until it 1407 * is brought back online. 
1408 */ 1409 if (pi->pi_state == PI_OFFLINE) { 1410 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1411 return; 1412 } 1413 1414 if (pi->pi_flags & IFF_STANDBY) { 1415 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1416 } else { 1417 if (try_failback(pi) != IPMP_FAILURE) { 1418 (void) change_lif_flags(pi, 1419 IFF_FAILED, _B_FALSE); 1420 /* Per state diagram */ 1421 pi->pi_empty = 0; 1422 } 1423 } 1424 1425 phyint_chstate(pi, PI_RUNNING); 1426 1427 if (GROUP_FAILED(pi->pi_group)) { 1428 /* 1429 * This is the 1st phyint to receive a response 1430 * after group failure. 1431 */ 1432 logerr("At least 1 interface (%s) of group %s has " 1433 "repaired\n", pi->pi_name, pi->pi_group->pg_name); 1434 phyint_group_chstate(pi->pi_group, PG_RUNNING); 1435 /* 1436 * If this is the STANDBY phyint to be repaired after a 1437 * group failure. Move data addresses on other failed 1438 * phyints in the group to this one. 1439 */ 1440 if (pi->pi_flags & IFF_STANDBY) { 1441 struct phyint *fpi = pi->pi_group->pg_phyint; 1442 for (; fpi != NULL; fpi = fpi->pi_pgnext) { 1443 if (fpi != pi) { 1444 (void) try_failover(fpi, 1445 FAILOVER_NORMAL); 1446 } 1447 } 1448 } 1449 } 1450 } 1451 } 1452 1453 /* 1454 * See if a previously functioning interface has failed, or if the 1455 * whole group of interfaces has failed. 1456 */ 1457 static void 1458 phyint_inst_check_for_failure(struct phyint_instance *pii) 1459 { 1460 struct phyint *pi; 1461 struct phyint *pi2; 1462 1463 pi = pii->pii_phyint; 1464 1465 switch (failure_state(pii)) { 1466 case PHYINT_FAILURE: 1467 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 1468 if (pi->pi_group == phyint_anongroup) { 1469 logerr("NIC failure detected on %s\n", pii->pii_name); 1470 } else { 1471 logerr("NIC failure detected on %s of group %s\n", 1472 pii->pii_name, pi->pi_group->pg_name); 1473 } 1474 /* 1475 * Do the failover, unless the interface is offline (in 1476 * which case we've already failed over). 
1477 */ 1478 if (pi->pi_state != PI_OFFLINE) { 1479 phyint_chstate(pi, PI_FAILED); 1480 reset_crtt_all(pi); 1481 if (!(pi->pi_flags & IFF_INACTIVE)) 1482 (void) try_failover(pi, FAILOVER_NORMAL); 1483 } 1484 break; 1485 1486 case GROUP_FAILURE: 1487 logerr("All Interfaces in group %s have failed\n", 1488 pi->pi_group->pg_name); 1489 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; 1490 pi2 = pi2->pi_pgnext) { 1491 if (pi2->pi_flags & IFF_OFFLINE) 1492 continue; 1493 (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); 1494 reset_crtt_all(pi2); 1495 1496 /* 1497 * In the case of host targets, we 1498 * would have flushed the targets, 1499 * and gone to PI_NOTARGETS state. 1500 */ 1501 if (pi2->pi_state == PI_RUNNING) 1502 phyint_chstate(pi2, PI_FAILED); 1503 1504 pi2->pi_empty = 0; 1505 pi2->pi_full = 0; 1506 } 1507 break; 1508 1509 default: 1510 break; 1511 } 1512 } 1513 1514 /* 1515 * Determines if any timeout event has occurred and returns the number of 1516 * milliseconds until the next timeout event for the phyint. Returns 1517 * TIMER_INFINITY for "never". 
1518 */ 1519 uint_t 1520 phyint_inst_timer(struct phyint_instance *pii) 1521 { 1522 int pr_ndx; 1523 uint_t timeout; 1524 struct target *cur_tg; 1525 struct probe_stats *pr_statp; 1526 struct phyint_instance *pii_other; 1527 struct phyint *pi; 1528 int valid_unack_count; 1529 int i; 1530 int interval; 1531 uint_t check_time; 1532 uint_t cur_time; 1533 hrtime_t cur_hrtime; 1534 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1535 1536 cur_time = getcurrenttime(); 1537 1538 if (debug & D_TIMER) { 1539 logdebug("phyint_inst_timer(%s %s)\n", 1540 AF_STR(pii->pii_af), pii->pii_name); 1541 } 1542 1543 pii_other = phyint_inst_other(pii); 1544 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1545 /* 1546 * Check to see if we're here due to link up/down flapping; If 1547 * enough time has passed, then try to bring the interface 1548 * back up; otherwise, schedule a timer to bring it back up 1549 * when enough time *has* elapsed. 1550 */ 1551 pi = pii->pii_phyint; 1552 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1553 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1554 if (check_time > cur_time) 1555 return (check_time - cur_time); 1556 1557 phyint_check_for_repair(pi); 1558 } 1559 } 1560 1561 /* 1562 * If probing is not enabled on this phyint instance, don't proceed. 1563 */ 1564 if (!PROBE_ENABLED(pii)) 1565 return (TIMER_INFINITY); 1566 1567 /* 1568 * If the timer has fired too soon, probably triggered 1569 * by some other phyint instance, return the remaining 1570 * time 1571 */ 1572 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1573 return (pii->pii_snxt_time - cur_time); 1574 1575 /* 1576 * If the link is down, don't send any probes for now. 1577 */ 1578 if (LINK_DOWN(pii->pii_phyint)) 1579 return (TIMER_INFINITY); 1580 1581 /* 1582 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1583 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1584 * Base probe time is strictly periodic. 
1585 */ 1586 interval = GET_RANDOM( 1587 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1588 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1589 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1590 1591 /* 1592 * Check if the current time > next time to probe. If so, we missed 1593 * sending 1 or more probes, probably due to heavy system load. At least 1594 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1595 * were scheduled. Make adjustments to the times, in multiples of 1596 * user_probe_interval. 1597 */ 1598 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1599 int n; 1600 1601 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1602 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1603 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1604 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1605 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1606 pii->pii_snxt_basetime); 1607 1608 /* Collect statistics about missed probes */ 1609 probes_missed.pm_nprobes += n + 1; 1610 probes_missed.pm_ntimes++; 1611 } 1612 pii->pii_snxt_basetime += user_probe_interval; 1613 interval = pii->pii_snxt_time - cur_time; 1614 if (debug & D_TARGET) { 1615 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1616 " interval %u\n", cur_time, pii->pii_snxt_time, 1617 pii->pii_snxt_basetime, interval); 1618 } 1619 1620 /* 1621 * If no targets are known, we need to send an ICMP multicast. The 1622 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1623 * to see if we found a target. 1624 */ 1625 if (pii->pii_target_next == NULL) { 1626 assert(pii->pii_ntargets == 0); 1627 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1628 probe(pii, PROBE_MULTI, cur_time); 1629 return (interval); 1630 } 1631 1632 if ((user_probe_interval != probe_interval) && 1633 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1634 /* 1635 * the failure detection (fd) probe timer has not yet fired. 
1636 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1637 */ 1638 probe(pii, PROBE_RTT, cur_time); 1639 return (interval); 1640 } 1641 /* 1642 * the fd probe timer has fired. Need to do all failure 1643 * detection / recovery calculations, and then send an fd probe 1644 * of type PROBE_UNI. 1645 */ 1646 if (user_probe_interval == probe_interval) { 1647 /* 1648 * We could have missed some probes, and then adjusted 1649 * pii_snxt_basetime above. Otherwise we could have 1650 * blindly added probe_interval to pii_fd_snxt_basetime. 1651 */ 1652 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1653 } else { 1654 pii->pii_fd_snxt_basetime += probe_interval; 1655 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1656 int n; 1657 1658 n = (cur_time - pii->pii_fd_snxt_basetime) / 1659 probe_interval; 1660 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1661 } 1662 } 1663 1664 /* 1665 * We can have at most, the latest 2 probes that we sent, in 1666 * the PR_UNACKED state. All previous probes sent, are either 1667 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1668 * timed out if the probe's time_sent + the CRTT < currenttime. 1669 * For each of the last 2 probes, examine whether it has timed 1670 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1671 */ 1672 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1673 valid_unack_count = 0; 1674 1675 for (i = 0; i < 2; i++) { 1676 pr_statp = &pii->pii_probes[pr_ndx]; 1677 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1678 switch (pr_statp->pr_status) { 1679 case PR_ACKED: 1680 /* 1681 * We received back an ACK, so the switch clearly 1682 * is not dropping our traffic, and thus we can 1683 * enable failure detection immediately. 
1684 */ 1685 if (pii->pii_fd_hrtime > gethrtime()) { 1686 if (debug & D_PROBE) { 1687 logdebug("successful probe on %s; " 1688 "ending quiet period\n", 1689 pii->pii_phyint->pi_name); 1690 } 1691 pii->pii_fd_hrtime = gethrtime(); 1692 } 1693 break; 1694 1695 case PR_UNACKED: 1696 assert(cur_tg != NULL); 1697 /* 1698 * The crtt could be zero for some reason, 1699 * Eg. the phyint could be failed. If the crtt is 1700 * not available use group's probe interval, 1701 * which is a worst case estimate. 1702 */ 1703 if (cur_tg->tg_crtt != 0) { 1704 timeout = pr_statp->pr_time_sent + 1705 cur_tg->tg_crtt; 1706 } else { 1707 timeout = pr_statp->pr_time_sent + 1708 probe_interval; 1709 } 1710 if (TIME_LT(timeout, cur_time)) { 1711 pr_statp->pr_status = PR_LOST; 1712 pr_statp->pr_time_lost = timeout; 1713 } else if (i == 1) { 1714 /* 1715 * We are forced to consider this probe 1716 * lost, as we can have at most 2 unack. 1717 * probes any time, and we will be sending a 1718 * probe at the end of this function. 1719 * Normally, we should not be here, but 1720 * this can happen if an incoming response 1721 * that was considered lost has increased 1722 * the crtt for this target, and also bumped 1723 * up the FDT. Note that we never cancel or 1724 * increase the current pii_time_left, so 1725 * when the timer fires, we find 2 valid 1726 * unacked probes, and they are yet to timeout 1727 */ 1728 pr_statp->pr_status = PR_LOST; 1729 pr_statp->pr_time_lost = cur_time; 1730 } else { 1731 /* 1732 * Only the most recent probe can enter 1733 * this 'else' arm. The second most recent 1734 * probe must take either of the above arms, 1735 * if it is unacked. 1736 */ 1737 valid_unack_count++; 1738 } 1739 break; 1740 } 1741 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1742 } 1743 1744 /* 1745 * We send out 1 probe randomly in the interval between one half 1746 * and one probe interval for the group. 
Given that the CRTT is always 1747 * less than the group's probe interval, we can have at most 1 1748 * unacknowledged probe now. All previous probes are either lost or 1749 * acked. 1750 */ 1751 assert(valid_unack_count == 0 || valid_unack_count == 1); 1752 1753 /* 1754 * The timer has fired. Take appropriate action depending 1755 * on the current state of the phyint. 1756 * 1757 * PI_RUNNING state - Failure detection and failover 1758 * PI_FAILED state - Repair detection and failback 1759 */ 1760 switch (pii->pii_phyint->pi_state) { 1761 case PI_FAILED: 1762 /* 1763 * If the most recent probe (excluding unacked probes that 1764 * are yet to time out) has been acked, check whether the 1765 * phyint is now repaired. If the phyint is repaired, then 1766 * attempt failback, unless it is an inactive standby. 1767 */ 1768 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1769 phyint_check_for_repair(pii->pii_phyint); 1770 } 1771 break; 1772 1773 case PI_RUNNING: 1774 /* 1775 * It's possible our probes have been lost because of a 1776 * spanning-tree mandated quiet period on the switch. If so, 1777 * ignore the lost probes and consider the interface to still 1778 * be functioning. 1779 */ 1780 cur_hrtime = gethrtime(); 1781 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1782 break; 1783 1784 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1785 /* 1786 * We have 1 or more failed probes (excluding unacked 1787 * probes that are yet to time out). Determine if the 1788 * phyint has failed. If so attempt a failover, 1789 * unless it is an inactive standby 1790 */ 1791 phyint_inst_check_for_failure(pii); 1792 } 1793 break; 1794 1795 default: 1796 logerr("phyint_inst_timer: invalid state %d\n", 1797 pii->pii_phyint->pi_state); 1798 abort(); 1799 } 1800 1801 /* 1802 * Start the next probe. probe() will also set pii->pii_probe_time_left 1803 * to the group's probe interval. 
If phyint_failed -> target_flush_hosts 1804 * was called, the target list may be empty. 1805 */ 1806 if (pii->pii_target_next != NULL) { 1807 probe(pii, PROBE_UNI, cur_time); 1808 /* 1809 * If we have just the one probe target, and we're not using 1810 * router targets, try to find another as we presently have 1811 * no resilience. 1812 */ 1813 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1814 probe(pii, PROBE_MULTI, cur_time); 1815 } else { 1816 probe(pii, PROBE_MULTI, cur_time); 1817 } 1818 return (interval); 1819 } 1820 1821 /* 1822 * Start the probe timer for an interface instance. 1823 */ 1824 void 1825 start_timer(struct phyint_instance *pii) 1826 { 1827 uint32_t interval; 1828 1829 /* 1830 * Spread the base probe times (pi_snxt_basetime) across phyints 1831 * uniformly over the (curtime..curtime + the group's probe_interval). 1832 * pi_snxt_basetime is strictly periodic with a frequency of 1833 * the group's probe interval. The actual probe time pi_snxt_time 1834 * adds some randomness to pi_snxt_basetime and happens in probe(). 1835 * For the 1st probe on each phyint after the timer is started, 1836 * pi_snxt_time and pi_snxt_basetime are the same. 1837 */ 1838 interval = GET_RANDOM(0, 1839 (int)pii->pii_phyint->pi_group->pg_probeint); 1840 1841 pii->pii_snxt_basetime = getcurrenttime() + interval; 1842 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1843 pii->pii_snxt_time = pii->pii_snxt_basetime; 1844 timer_schedule(interval); 1845 } 1846 1847 /* 1848 * Restart the probe timer on an interface instance. 1849 */ 1850 static void 1851 restart_timer(struct phyint_instance *pii) 1852 { 1853 /* 1854 * We don't need to restart the timer if it was never started in 1855 * the first place (pii->pii_basetime_inited not set), as the timer 1856 * won't have gone off yet. 
1857 */ 1858 if (pii->pii_basetime_inited != 0) { 1859 1860 if (debug & D_LINKNOTE) 1861 logdebug("restart timer: restarting timer on %s, " 1862 "address family %s\n", pii->pii_phyint->pi_name, 1863 AF_STR(pii->pii_af)); 1864 1865 start_timer(pii); 1866 } 1867 } 1868 1869 static void 1870 process_link_state_down(struct phyint *pi) 1871 { 1872 logerr("The link has gone down on %s\n", pi->pi_name); 1873 1874 /* 1875 * Clear the probe statistics arrays, we don't want the repair 1876 * detection logic relying on probes that were succesful prior 1877 * to the link going down. 1878 */ 1879 if (PROBE_CAPABLE(pi->pi_v4)) 1880 clear_pii_probe_stats(pi->pi_v4); 1881 if (PROBE_CAPABLE(pi->pi_v6)) 1882 clear_pii_probe_stats(pi->pi_v6); 1883 /* 1884 * Check for interface failure. Although we know the interface 1885 * has failed, we don't know if all the other interfaces in the 1886 * group have failed as well. 1887 */ 1888 if ((pi->pi_state == PI_RUNNING) || 1889 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1890 if (debug & D_LINKNOTE) { 1891 logdebug("process_link_state_down:" 1892 " checking for failure on %s\n", pi->pi_name); 1893 } 1894 1895 if (pi->pi_v4 != NULL) 1896 phyint_inst_check_for_failure(pi->pi_v4); 1897 else if (pi->pi_v6 != NULL) 1898 phyint_inst_check_for_failure(pi->pi_v6); 1899 } 1900 } 1901 1902 static void 1903 process_link_state_up(struct phyint *pi) 1904 { 1905 logerr("The link has come up on %s\n", pi->pi_name); 1906 1907 /* 1908 * We stopped any running timers on each instance when the link 1909 * went down, so restart them. 1910 */ 1911 if (pi->pi_v4) 1912 restart_timer(pi->pi_v4); 1913 if (pi->pi_v6) 1914 restart_timer(pi->pi_v6); 1915 1916 phyint_check_for_repair(pi); 1917 1918 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1919 if (pi->pi_whendx == LINK_UP_PERMIN) 1920 pi->pi_whendx = 0; 1921 } 1922 1923 /* 1924 * Process any changes in link state passed up from the interfaces. 
1925 */ 1926 void 1927 process_link_state_changes(void) 1928 { 1929 struct phyint *pi; 1930 1931 /* Look for interfaces where the link state has just changed */ 1932 1933 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1934 boolean_t old_link_state_up = LINK_UP(pi); 1935 1936 /* 1937 * Except when the "phyint" structure is created, this is 1938 * the only place the link state is updated. This allows 1939 * this routine to detect changes in link state, rather 1940 * than just the current state. 1941 */ 1942 UPDATE_LINK_STATE(pi); 1943 1944 if (LINK_DOWN(pi)) { 1945 /* 1946 * Has link just gone down? 1947 */ 1948 if (old_link_state_up) 1949 process_link_state_down(pi); 1950 } else { 1951 /* 1952 * Has link just gone back up? 1953 */ 1954 if (!old_link_state_up) 1955 process_link_state_up(pi); 1956 } 1957 } 1958 } 1959 1960 void 1961 reset_crtt_all(struct phyint *pi) 1962 { 1963 struct phyint_instance *pii; 1964 struct target *tg; 1965 1966 pii = pi->pi_v4; 1967 if (pii != NULL) { 1968 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1969 tg->tg_crtt = 0; 1970 tg->tg_rtt_sa = -1; 1971 tg->tg_rtt_sd = 0; 1972 } 1973 } 1974 1975 pii = pi->pi_v6; 1976 if (pii != NULL) { 1977 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1978 tg->tg_crtt = 0; 1979 tg->tg_rtt_sa = -1; 1980 tg->tg_rtt_sd = 0; 1981 } 1982 } 1983 } 1984 1985 /* 1986 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 1987 * probes on both instances IPv4 and IPv6. 1988 * If the interface has failed, return the time of the first probe failure 1989 * in "tff". 1990 */ 1991 static int 1992 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 1993 { 1994 uint_t pi_tff; 1995 struct target *cur_tg; 1996 struct probe_fail_count pfinfo; 1997 struct phyint_instance *pii_other; 1998 int pr_ndx; 1999 2000 /* 2001 * Get the number of consecutive failed probes on 2002 * this phyint across all targets. 
Also get the number 2003 * of consecutive failed probes on this target only 2004 */ 2005 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2006 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2007 probe_fail_info(pii, cur_tg, &pfinfo); 2008 2009 /* Get the time of first failure, for later use */ 2010 pi_tff = pfinfo.pf_tff; 2011 2012 /* 2013 * If the current target has not responded to the 2014 * last NUM_PROBE_FAILS probes, and other targets are 2015 * responding delete this target. Dead gateway detection 2016 * will eventually remove this target (if router) from the 2017 * routing tables. If that does not occur, we may end 2018 * up adding this to our list again. 2019 */ 2020 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2021 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2022 if (pii->pii_targets_are_routers) { 2023 if (cur_tg->tg_status == TG_ACTIVE) 2024 pii->pii_ntargets--; 2025 cur_tg->tg_status = TG_DEAD; 2026 cur_tg->tg_crtt = 0; 2027 cur_tg->tg_rtt_sa = -1; 2028 cur_tg->tg_rtt_sd = 0; 2029 if (pii->pii_target_next == cur_tg) 2030 pii->pii_target_next = target_next(cur_tg); 2031 } else { 2032 target_delete(cur_tg); 2033 probe(pii, PROBE_MULTI, getcurrenttime()); 2034 } 2035 return (PHYINT_OK); 2036 } 2037 2038 /* 2039 * If the phyint has lost NUM_PROBE_FAILS or more 2040 * consecutive probes, on both IPv4 and IPv6 protocol 2041 * instances of the phyint, then trigger failure 2042 * detection, else return false 2043 */ 2044 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2045 return (PHYINT_OK); 2046 2047 pii_other = phyint_inst_other(pii); 2048 if (PROBE_CAPABLE(pii_other)) { 2049 probe_fail_info(pii_other, NULL, &pfinfo); 2050 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2051 /* 2052 * We have NUM_PROBE_FAILS or more failures 2053 * on both IPv4 and IPv6. Get the earliest 2054 * time when failure was detected on this 2055 * phyint across IPv4 and IPv6. 
2056 */ 2057 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2058 pi_tff = pfinfo.pf_tff; 2059 } else { 2060 /* 2061 * This instance has < NUM_PROBE_FAILS failure. 2062 * So return false 2063 */ 2064 return (PHYINT_OK); 2065 } 2066 } 2067 *tff = pi_tff; 2068 return (PHYINT_FAILURE); 2069 } 2070 2071 /* 2072 * Check if the link has gone down on this phyint, or it has failed the 2073 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2074 * Also look at other phyints of this group, for group failures. 2075 */ 2076 int 2077 failure_state(struct phyint_instance *pii) 2078 { 2079 struct probe_success_count psinfo; 2080 uint_t pi2_tls; /* time last success */ 2081 uint_t pi_tff; /* time first fail */ 2082 struct phyint *pi2; 2083 struct phyint *pi; 2084 struct phyint_instance *pii2; 2085 struct phyint_group *pg; 2086 boolean_t alone; 2087 2088 if (debug & D_FAILOVER) 2089 logdebug("phyint_failed(%s)\n", pii->pii_name); 2090 2091 pi = pii->pii_phyint; 2092 pg = pi->pi_group; 2093 2094 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2095 PHYINT_OK) 2096 return (PHYINT_OK); 2097 2098 /* 2099 * At this point, the link is down, or the phyint is suspect, 2100 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 2101 * does not belong to any group, or is the only member of the 2102 * group capable of being probed, return PHYINT_FAILURE. 2103 */ 2104 alone = _B_TRUE; 2105 if (pg != phyint_anongroup) { 2106 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2107 if (pi2 == pi) 2108 continue; 2109 if (PROBE_CAPABLE(pi2->pi_v4) || 2110 PROBE_CAPABLE(pi2->pi_v6)) { 2111 alone = _B_FALSE; 2112 break; 2113 } 2114 } 2115 } 2116 if (alone) 2117 return (PHYINT_FAILURE); 2118 2119 /* 2120 * Need to compare against other phyints of the same group 2121 * to exclude group failures. 
If the failure was detected via 2122 * probing, then if the time of last success (tls) of any 2123 * phyint is more recent than the time of first fail (tff) of the 2124 * phyint in question, and the link is up on the phyint, 2125 * then it is a phyint failure. Otherwise it is a group failure. 2126 * If failure was detected via a link down notification sent from 2127 * the driver to IP, we see if any phyints in the group are still 2128 * running and haven't received a link down notification. We 2129 * will usually be processing the link down notification shortly 2130 * after it was received, so there is no point looking at the tls 2131 * of other phyints. 2132 */ 2133 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2134 /* Exclude ourself from comparison */ 2135 if (pi2 == pi) 2136 continue; 2137 2138 if (LINK_DOWN(pi)) { 2139 /* 2140 * We use FLAGS_TO_LINK_STATE() to test the 2141 * flags directly, rather then LINK_UP() or 2142 * LINK_DOWN(), as we may not have got round 2143 * to processing the link state for the other 2144 * phyints in the group yet. 2145 * 2146 * The check for PI_RUNNING and group 2147 * failure handles the case when the 2148 * group begins to recover. The first 2149 * phyint to recover should not trigger 2150 * a failover from the soon-to-recover 2151 * other phyints to the first recovered 2152 * phyint. PI_RUNNING will be set, and 2153 * pg_groupfailed cleared only after 2154 * receipt of NUM_PROBE_REPAIRS, by 2155 * which time the other phyints should 2156 * have received at least 1 packet, 2157 * and so will not have NUM_PROBE_FAILS. 2158 */ 2159 if ((pi2->pi_state == PI_RUNNING) && 2160 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 2161 return (PHYINT_FAILURE); 2162 } else { 2163 /* 2164 * Need to compare against both IPv4 and 2165 * IPv6 instances. 
2166 */ 2167 pii2 = pi2->pi_v4; 2168 if (pii2 != NULL) { 2169 probe_success_info(pii2, NULL, &psinfo); 2170 if (psinfo.ps_tls_valid) { 2171 pi2_tls = psinfo.ps_tls; 2172 /* 2173 * See comment above regarding check 2174 * for PI_RUNNING and group failure. 2175 */ 2176 if (TIME_GT(pi2_tls, pi_tff) && 2177 (pi2->pi_state == PI_RUNNING) && 2178 !GROUP_FAILED(pg) && 2179 FLAGS_TO_LINK_STATE(pi2)) 2180 return (PHYINT_FAILURE); 2181 } 2182 } 2183 2184 pii2 = pi2->pi_v6; 2185 if (pii2 != NULL) { 2186 probe_success_info(pii2, NULL, &psinfo); 2187 if (psinfo.ps_tls_valid) { 2188 pi2_tls = psinfo.ps_tls; 2189 /* 2190 * See comment above regarding check 2191 * for PI_RUNNING and group failure. 2192 */ 2193 if (TIME_GT(pi2_tls, pi_tff) && 2194 (pi2->pi_state == PI_RUNNING) && 2195 !GROUP_FAILED(pg) && 2196 FLAGS_TO_LINK_STATE(pi2)) 2197 return (PHYINT_FAILURE); 2198 } 2199 } 2200 } 2201 } 2202 2203 /* 2204 * Change the group state to PG_FAILED if it's not already. 2205 */ 2206 if (!GROUP_FAILED(pg)) 2207 phyint_group_chstate(pg, PG_FAILED); 2208 2209 return (GROUP_FAILURE); 2210 } 2211 2212 /* 2213 * Return the information associated with consecutive probe successes 2214 * starting with the most recent probe. At most the last 2 probes can be 2215 * in the unacknowledged state. All previous probes have either failed 2216 * or succeeded. 
2217 */ 2218 static void 2219 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2220 struct probe_success_count *psinfo) 2221 { 2222 uint_t i; 2223 struct probe_stats *pr_statp; 2224 uint_t most_recent; 2225 uint_t second_most_recent; 2226 boolean_t pi_found_failure = _B_FALSE; 2227 boolean_t tg_found_failure = _B_FALSE; 2228 uint_t now; 2229 uint_t timeout; 2230 struct target *tg; 2231 2232 if (debug & D_FAILOVER) 2233 logdebug("probe_success_info(%s)\n", pii->pii_name); 2234 2235 bzero(psinfo, sizeof (*psinfo)); 2236 now = getcurrenttime(); 2237 2238 /* 2239 * Start with the most recent probe, and count the number 2240 * of consecutive probe successes. Latch the number of successes 2241 * on hitting a failure. 2242 */ 2243 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2244 second_most_recent = PROBE_INDEX_PREV(most_recent); 2245 2246 for (i = most_recent; i != pii->pii_probe_next; 2247 i = PROBE_INDEX_PREV(i)) { 2248 pr_statp = &pii->pii_probes[i]; 2249 2250 switch (pr_statp->pr_status) { 2251 case PR_UNACKED: 2252 /* 2253 * Only the most recent 2 probes can be unacknowledged 2254 */ 2255 assert(i == most_recent || i == second_most_recent); 2256 2257 tg = pr_statp->pr_target; 2258 assert(tg != NULL); 2259 /* 2260 * The crtt could be zero for some reason, 2261 * Eg. the phyint could be failed. If the crtt is 2262 * not available use the value of the group's probe 2263 * interval which is a worst case estimate. 2264 */ 2265 if (tg->tg_crtt != 0) { 2266 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2267 } else { 2268 timeout = pr_statp->pr_time_sent + 2269 pii->pii_phyint->pi_group->pg_probeint; 2270 } 2271 2272 if (TIME_LT(timeout, now)) { 2273 /* 2274 * We hit a failure. Latch the total number of 2275 * recent consecutive successes. 
2276 */ 2277 pr_statp->pr_time_lost = timeout; 2278 pr_statp->pr_status = PR_LOST; 2279 pi_found_failure = _B_TRUE; 2280 if (cur_tg != NULL && tg == cur_tg) { 2281 /* 2282 * We hit a failure for the desired 2283 * target. Latch the number of recent 2284 * consecutive successes for this target 2285 */ 2286 tg_found_failure = _B_TRUE; 2287 } 2288 } 2289 break; 2290 2291 case PR_ACKED: 2292 /* 2293 * Bump up the count of probe successes, if we 2294 * have not seen any failure so far. 2295 */ 2296 if (!pi_found_failure) 2297 psinfo->ps_nsucc++; 2298 2299 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2300 !tg_found_failure) { 2301 psinfo->ps_nsucc_tg++; 2302 } 2303 2304 /* 2305 * Record the time of last success, if this is 2306 * the most recent probe success. 2307 */ 2308 if (!psinfo->ps_tls_valid) { 2309 psinfo->ps_tls = pr_statp->pr_time_acked; 2310 psinfo->ps_tls_valid = _B_TRUE; 2311 } 2312 break; 2313 2314 case PR_LOST: 2315 /* 2316 * We hit a failure. Latch the total number of 2317 * recent consecutive successes. 2318 */ 2319 pi_found_failure = _B_TRUE; 2320 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2321 /* 2322 * We hit a failure for the desired target. 2323 * Latch the number of recent consecutive 2324 * successes for this target 2325 */ 2326 tg_found_failure = _B_TRUE; 2327 } 2328 break; 2329 2330 default: 2331 return; 2332 2333 } 2334 } 2335 } 2336 2337 /* 2338 * Return the information associated with consecutive probe failures 2339 * starting with the most recent probe. Only the last 2 probes can be in the 2340 * unacknowledged state. All previous probes have either failed or succeeded. 
2341 */ 2342 static void 2343 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2344 struct probe_fail_count *pfinfo) 2345 { 2346 int i; 2347 struct probe_stats *pr_statp; 2348 boolean_t tg_found_success = _B_FALSE; 2349 boolean_t pi_found_success = _B_FALSE; 2350 int most_recent; 2351 int second_most_recent; 2352 uint_t now; 2353 uint_t timeout; 2354 struct target *tg; 2355 2356 if (debug & D_FAILOVER) 2357 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2358 2359 bzero(pfinfo, sizeof (*pfinfo)); 2360 now = getcurrenttime(); 2361 2362 /* 2363 * Start with the most recent probe, and count the number 2364 * of consecutive probe failures. Latch the number of failures 2365 * on hitting a probe success. 2366 */ 2367 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2368 second_most_recent = PROBE_INDEX_PREV(most_recent); 2369 2370 for (i = most_recent; i != pii->pii_probe_next; 2371 i = PROBE_INDEX_PREV(i)) { 2372 pr_statp = &pii->pii_probes[i]; 2373 2374 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2375 2376 switch (pr_statp->pr_status) { 2377 case PR_UNACKED: 2378 /* 2379 * Only the most recent 2 probes can be unacknowledged 2380 */ 2381 assert(i == most_recent || i == second_most_recent); 2382 2383 tg = pr_statp->pr_target; 2384 /* 2385 * Target is guaranteed to exist in the unack. state 2386 */ 2387 assert(tg != NULL); 2388 /* 2389 * The crtt could be zero for some reason, 2390 * Eg. the phyint could be failed. If the crtt is 2391 * not available use the group's probe interval, 2392 * which is a worst case estimate. 
2393 */ 2394 if (tg->tg_crtt != 0) { 2395 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2396 } else { 2397 timeout = pr_statp->pr_time_sent + 2398 pii->pii_phyint->pi_group->pg_probeint; 2399 } 2400 2401 if (TIME_GT(timeout, now)) 2402 break; 2403 2404 pr_statp->pr_time_lost = timeout; 2405 pr_statp->pr_status = PR_LOST; 2406 /* FALLTHRU */ 2407 2408 case PR_LOST: 2409 if (!pi_found_success) { 2410 pfinfo->pf_nfail++; 2411 pfinfo->pf_tff = pr_statp->pr_time_lost; 2412 } 2413 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2414 !tg_found_success) { 2415 pfinfo->pf_nfail_tg++; 2416 } 2417 break; 2418 2419 default: 2420 /* 2421 * We hit a success or unused slot. Latch the 2422 * total number of recent consecutive failures. 2423 */ 2424 pi_found_success = _B_TRUE; 2425 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2426 /* 2427 * We hit a success for the desired target. 2428 * Latch the number of recent consecutive 2429 * failures for this target 2430 */ 2431 tg_found_success = _B_TRUE; 2432 } 2433 } 2434 } 2435 } 2436 2437 /* 2438 * Check if the phyint has been repaired. If no test address has been 2439 * configured, then consider the interface repaired if the link is up (unless 2440 * the link is flapping; see below). Otherwise, look for proof of probes 2441 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2442 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 
2443 */ 2444 static boolean_t 2445 phyint_repaired(struct phyint *pi) 2446 { 2447 struct probe_success_count psinfo; 2448 struct phyint_instance *pii; 2449 struct target *cur_tg; 2450 int pr_ndx; 2451 uint_t cur_time; 2452 2453 if (debug & D_FAILOVER) 2454 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2455 2456 if (LINK_DOWN(pi)) 2457 return (_B_FALSE); 2458 2459 /* 2460 * If we don't have any test addresses and the link is up, then 2461 * consider the interface repaired, unless we've received more than 2462 * LINK_UP_PERMIN link up notifications in the last minute, in 2463 * which case we keep the link down until we drop back below 2464 * the threshold. 2465 */ 2466 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2467 cur_time = getcurrenttime(); 2468 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2469 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2470 pi->pi_lfmsg_printed = 0; 2471 return (_B_TRUE); 2472 } 2473 if (!pi->pi_lfmsg_printed) { 2474 logerr("The link has come up on %s more than %d times " 2475 "in the last minute; disabling failback until it " 2476 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2477 pi->pi_lfmsg_printed = 1; 2478 } 2479 2480 return (_B_FALSE); 2481 } 2482 2483 pii = pi->pi_v4; 2484 if (PROBE_CAPABLE(pii)) { 2485 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2486 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2487 probe_success_info(pii, cur_tg, &psinfo); 2488 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2489 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2490 return (_B_TRUE); 2491 } 2492 2493 pii = pi->pi_v6; 2494 if (PROBE_CAPABLE(pii)) { 2495 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2496 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2497 probe_success_info(pii, cur_tg, &psinfo); 2498 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2499 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2500 return (_B_TRUE); 2501 } 2502 2503 return (_B_FALSE); 2504 } 2505 2506 /* 2507 * Try failover from phyint 'pi' to a suitable destination. 
2508 */ 2509 int 2510 try_failover(struct phyint *pi, int failover_type) 2511 { 2512 struct phyint *dst; 2513 int err; 2514 2515 if (debug & D_FAILOVER) 2516 logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 2517 2518 /* 2519 * Attempt to find a failover destination 'dst'. 2520 * dst will be null if any of the following is true 2521 * Phyint is not part of a group OR 2522 * Phyint is the only member of a group OR 2523 * No suitable failover dst was available 2524 */ 2525 dst = get_failover_dst(pi, failover_type); 2526 if (dst == NULL) 2527 return (IPMP_EMINRED); 2528 2529 dst->pi_empty = 0; /* Per state diagram */ 2530 pi->pi_full = 0; /* Per state diagram */ 2531 2532 err = failover(pi, dst); 2533 2534 if (debug & D_FAILOVER) { 2535 logdebug("failed over from %s to %s ret %d\n", 2536 pi->pi_name, dst->pi_name, err); 2537 } 2538 if (err == 0) { 2539 pi->pi_empty = 1; /* Per state diagram */ 2540 /* 2541 * we don't want to print out this message if a 2542 * phyint is leaving the group, nor for failover from 2543 * standby 2544 */ 2545 if (failover_type == FAILOVER_NORMAL) { 2546 logerr("Successfully failed over from NIC %s to NIC " 2547 "%s\n", pi->pi_name, dst->pi_name); 2548 } 2549 return (0); 2550 } else { 2551 /* 2552 * The failover did not succeed. We must retry the failover 2553 * only after resyncing our state based on the kernel's. 2554 * For eg. either the src or the dst might have been unplumbed 2555 * causing this failure. initifs() will be called again, 2556 * from main, since full_scan_required has been set to true 2557 * by failover(); 2558 */ 2559 return (IPMP_FAILURE); 2560 } 2561 } 2562 2563 /* 2564 * global_errno captures the errno value, if failover() or failback() 2565 * fails. This is sent to if_mpadm(1M). 2566 */ 2567 int global_errno; 2568 2569 /* 2570 * Attempt failover from phyint 'from' to phyint 'to'. 2571 * IP moves everything from phyint 'from' to phyint 'to'. 
2572 */ 2573 static int 2574 failover(struct phyint *from, struct phyint *to) 2575 { 2576 struct lifreq lifr; 2577 int ret; 2578 2579 if (debug & D_FAILOVER) { 2580 logdebug("failing over from %s to %s\n", 2581 from->pi_name, to->pi_name); 2582 } 2583 2584 /* 2585 * Perform the failover. Both IPv4 and IPv6 are failed over 2586 * using a single ioctl by passing in AF_UNSPEC family. 2587 */ 2588 lifr.lifr_addr.ss_family = AF_UNSPEC; 2589 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2590 lifr.lifr_movetoindex = to->pi_ifindex; 2591 2592 ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 2593 if (ret < 0) { 2594 global_errno = errno; 2595 logperror("failover: ioctl (failover)"); 2596 } 2597 2598 /* 2599 * Set full_scan_required to true. This will make us read 2600 * the state from the kernel in initifs() and update our tables, 2601 * to reflect the current state after the failover. If the 2602 * failover has failed it will then reissue the failover. 2603 */ 2604 full_scan_required = _B_TRUE; 2605 return (ret); 2606 } 2607 2608 /* 2609 * phyint 'pi' has recovered. Attempt failback from every phyint in the same 2610 * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 2611 * Return values: 2612 * IPMP_SUCCESS: Failback successful from each of the other 2613 * phyints in the group. 2614 * IPMP_EFBPARTIAL: Failback successful from some of the other 2615 * phyints in the group. 2616 * IPMP_FAILURE: Failback syscall failed with some error. 2617 * 2618 * Note that failback is attempted regardless of the setting of the 2619 * failback_enabled flag. 2620 */ 2621 int 2622 do_failback(struct phyint *pi) 2623 { 2624 struct phyint *from; 2625 boolean_t done; 2626 boolean_t partial; 2627 boolean_t attempted_failback = _B_FALSE; 2628 2629 if (debug & D_FAILOVER) 2630 logdebug("do_failback(%s)\n", pi->pi_name); 2631 2632 /* If this phyint is not part of a named group, return. 
*/ 2633 if (pi->pi_group == phyint_anongroup) { 2634 pi->pi_full = 1; 2635 return (IPMP_SUCCESS); 2636 } 2637 2638 /* 2639 * Attempt failback from every phyint in the group to 'pi'. 2640 * The reason for doing this, instead of only from the 2641 * phyint to which we did the failover is given below. 2642 * 2643 * After 'pi' failed, if any app. tries to join on a multicast 2644 * address (IPv6), on the failed phyint, IP picks any arbitrary 2645 * non-failed phyint in the group, instead of the failed phyint, 2646 * in.mpathd is not aware of this. Thus failing back only from the 2647 * interface to which 'pi' failed over, will failback the ipif's 2648 * but not the ilm's. So we need to failback from all members of 2649 * the phyint group 2650 */ 2651 done = _B_TRUE; 2652 partial = _B_FALSE; 2653 for (from = pi->pi_group->pg_phyint; from != NULL; 2654 from = from->pi_pgnext) { 2655 /* Exclude ourself as a failback src */ 2656 if (from == pi) 2657 continue; 2658 2659 /* 2660 * If the 'from' phyint has IPv4 plumbed, the 'to' 2661 * phyint must also have IPv4 plumbed. Similar check 2662 * for IPv6. IP makes the same check. Otherwise the 2663 * failback will fail. 2664 */ 2665 if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 2666 (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 2667 partial = _B_TRUE; 2668 continue; 2669 } 2670 2671 pi->pi_empty = 0; /* Per state diagram */ 2672 attempted_failback = _B_TRUE; 2673 if (failback(from, pi) != 0) { 2674 done = _B_FALSE; 2675 break; 2676 } 2677 } 2678 2679 /* 2680 * We are done. No more phyint from which we can src the failback 2681 */ 2682 if (done) { 2683 if (!partial) 2684 pi->pi_full = 1; /* Per state diagram */ 2685 /* 2686 * Don't print out a message unless there is a 2687 * transition from FAILED to RUNNING. For eg. 
2688 * we don't want to print out this message if a 2689 * phyint is leaving the group, or at startup 2690 */ 2691 if (attempted_failback && (pi->pi_flags & 2692 (IFF_FAILED | IFF_OFFLINE))) { 2693 logerr("Successfully failed back to NIC %s\n", 2694 pi->pi_name); 2695 } 2696 return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2697 } 2698 2699 return (IPMP_FAILURE); 2700 } 2701 2702 /* 2703 * This function is similar to do_failback() above, but respects the 2704 * failback_enabled flag for phyints in named groups. 2705 */ 2706 int 2707 try_failback(struct phyint *pi) 2708 { 2709 if (debug & D_FAILOVER) 2710 logdebug("try_failback(%s)\n", pi->pi_name); 2711 2712 if (pi->pi_group != phyint_anongroup && !failback_enabled) 2713 return (IPMP_EFBDISABLED); 2714 2715 return (do_failback(pi)); 2716 } 2717 2718 /* 2719 * Failback everything from phyint 'from' that has the same ifindex 2720 * as phyint to's ifindex. 2721 */ 2722 static int 2723 failback(struct phyint *from, struct phyint *to) 2724 { 2725 struct lifreq lifr; 2726 int ret; 2727 2728 if (debug & D_FAILOVER) 2729 logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 2730 2731 lifr.lifr_addr.ss_family = AF_UNSPEC; 2732 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2733 lifr.lifr_movetoindex = to->pi_ifindex; 2734 2735 ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 2736 if (ret < 0) { 2737 global_errno = errno; 2738 logperror("failback: ioctl (failback)"); 2739 } 2740 2741 /* 2742 * Set full_scan_required to true. This will make us read 2743 * the state from the kernel in initifs() and update our tables, 2744 * to reflect the current state after the failback. If the 2745 * failback has failed it will then reissue the failback. 2746 */ 2747 full_scan_required = _B_TRUE; 2748 2749 return (ret); 2750 } 2751 2752 /* 2753 * Select a target phyint for failing over from 'pi'. 2754 * In the normal case i.e. 
failover_type is FAILOVER_NORMAL, the preferred 2755 * target phyint is chosen as follows, 2756 * 1. Pick any inactive standby interface. 2757 * 2. If no inactive standby is available, select any phyint in the 2758 * same group that has the least number of logints, (excluding 2759 * IFF_NOFAILOVER and !IFF_UP logints) 2760 * If we are failing over from a standby, failover_type is 2761 * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 2762 * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 2763 * and we won't return NULL, as long as there is at least 1 other phyint 2764 * in the group. 2765 */ 2766 static struct phyint * 2767 get_failover_dst(struct phyint *pi, int failover_type) 2768 { 2769 struct phyint *maybe = NULL; 2770 struct phyint *pi2; 2771 struct phyint *last_choice = NULL; 2772 2773 if (pi->pi_group == phyint_anongroup) 2774 return (NULL); 2775 2776 /* 2777 * Loop thru the phyints in the group, and pick the preferred 2778 * phyint for the target. 2779 */ 2780 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2781 /* Exclude ourself and offlined interfaces */ 2782 if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 2783 continue; 2784 2785 /* 2786 * The chosen target phyint must have IPv4 instance 2787 * plumbed, if the src phyint has IPv4 plumbed. Similarly 2788 * for IPv6. 2789 */ 2790 if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 2791 (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 2792 continue; 2793 2794 /* The chosen target must be PI_RUNNING. 
*/ 2795 if (pi2->pi_state != PI_RUNNING) { 2796 last_choice = pi2; 2797 continue; 2798 } 2799 2800 if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && 2801 (failover_type != FAILOVER_TO_NONSTANDBY)) { 2802 return (pi2); 2803 } else { 2804 if (maybe == NULL) 2805 maybe = pi2; 2806 else if (logint_upcount(pi2) < logint_upcount(maybe)) 2807 maybe = pi2; 2808 } 2809 } 2810 if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 2811 return (last_choice); 2812 else 2813 return (maybe); 2814 } 2815 2816 /* 2817 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2818 */ 2819 boolean_t 2820 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 2821 { 2822 int ifsock; 2823 struct lifreq lifr; 2824 2825 if (debug & D_FAILOVER) { 2826 logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 2827 pi->pi_name, flags, (int)setfl); 2828 } 2829 2830 if (pi->pi_v4 != NULL) { 2831 ifsock = ifsock_v4; 2832 } else { 2833 ifsock = ifsock_v6; 2834 } 2835 2836 /* 2837 * Get the current flags from the kernel, and set/clear the 2838 * desired phyint flags. Since we set only phyint flags, we can 2839 * do it on either IPv4 or IPv6 instance. 2840 */ 2841 (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2842 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 2843 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2844 if (errno != ENXIO) 2845 logperror("change_lif_flags: ioctl (get flags)"); 2846 return (_B_FALSE); 2847 } 2848 if (setfl) 2849 lifr.lifr_flags |= flags; 2850 else 2851 lifr.lifr_flags &= ~flags; 2852 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2853 if (errno != ENXIO) 2854 logperror("change_lif_flags: ioctl (set flags)"); 2855 return (_B_FALSE); 2856 } 2857 2858 /* 2859 * Keep pi_flags in synch. with actual flags. Assumes flags are 2860 * phyint flags. 
2861 */ 2862 if (setfl) 2863 pi->pi_flags |= flags; 2864 else 2865 pi->pi_flags &= ~flags; 2866 2867 if (pi->pi_v4) 2868 pi->pi_v4->pii_flags = pi->pi_flags; 2869 2870 if (pi->pi_v6) 2871 pi->pi_v6->pii_flags = pi->pi_flags; 2872 2873 return (_B_TRUE); 2874 } 2875 2876 /* 2877 * icmp cksum computation for IPv4. 2878 */ 2879 static int 2880 in_cksum(ushort_t *addr, int len) 2881 { 2882 register int nleft = len; 2883 register ushort_t *w = addr; 2884 register ushort_t answer; 2885 ushort_t odd_byte = 0; 2886 register int sum = 0; 2887 2888 /* 2889 * Our algorithm is simple, using a 32 bit accumulator (sum), 2890 * we add sequential 16 bit words to it, and at the end, fold 2891 * back all the carry bits from the top 16 bits into the lower 2892 * 16 bits. 2893 */ 2894 while (nleft > 1) { 2895 sum += *w++; 2896 nleft -= 2; 2897 } 2898 2899 /* mop up an odd byte, if necessary */ 2900 if (nleft == 1) { 2901 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2902 sum += odd_byte; 2903 } 2904 2905 /* 2906 * add back carry outs from top 16 bits to low 16 bits 2907 */ 2908 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2909 sum += (sum >> 16); /* add carry */ 2910 answer = ~sum; /* truncate to 16 bits */ 2911 return (answer); 2912 } 2913 2914 static void 2915 reset_snxt_basetimes(void) 2916 { 2917 struct phyint_instance *pii; 2918 2919 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2920 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2921 } 2922 } 2923 2924 /* 2925 * Is the address one of our own addresses? Unfortunately, 2926 * we cannot check our phyint tables to determine if the address 2927 * is our own. This is because, we don't track interfaces that 2928 * are not part of any group. We have to either use a 'bind' or 2929 * get the complete list of all interfaces using SIOCGLIFCONF, 2930 * to do this check. We could also use SIOCTMYADDR. 
2931 * Bind fails for the local zone address, so we might include local zone 2932 * address as target address. If local zone address is a target address 2933 * and it is up, it is not possible to detect the interface failure. 2934 * SIOCTMYADDR also doesn't consider local zone address as own address. 2935 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2936 * are stored in laddr_list. 2937 */ 2938 2939 boolean_t 2940 own_address(struct in6_addr addr) 2941 { 2942 struct local_addr *taddr = laddr_list; 2943 2944 for (; taddr != NULL; taddr = taddr->next) { 2945 if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { 2946 return (_B_TRUE); 2947 } 2948 } 2949 return (_B_FALSE); 2950 } 2951