/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1987 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include "mpd_defs.h"
#include "mpd_tables.h"

/*
 * Probe types for probe()
 *
 * probe() stores the type (in network byte order) in the pr_icmp_mtype
 * field of the outgoing packet; in_data() and in6_data() read the same
 * field from the reply to decide which incoming_*_reply() handler to call.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */

/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6
 *
 * probe() fills in the sequence number, timestamp and message type with
 * htons()/htonl(), so those fields are in network byte order on the wire.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint32_t pr_icmp_timestamp;	/* Time stamp */
	uint32_t pr_icmp_mtype;		/* Message type */
};

/* IPv6 destination of PROBE_MULTI probes: ff02::1 */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* IPv4 destination of PROBE_MULTI probes: 224.0.0.1 */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

/*
 * incoming_rtt_reply() compares this (plus MIN_SETTLING_TIME) against
 * gethrtime() before it tries to lower the failure detection time again.
 */
static hrtime_t last_fdt_bumpup_time;	/* When FDT was bumped up last */

static void *find_ancillary(struct msghdr *msg, int cmsg_type);
static void pi_set_crtt(struct target *tg, int m,
    boolean_t is_probe_uni);
static void incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t phyint_repaired(struct phyint *pi);

static int failover(struct phyint *from, struct phyint *to);
static int failback(struct phyint *from, struct phyint *to);
static struct phyint *get_failover_dst(struct phyint *pi, int failover_type);

static boolean_t highest_ack_tg(uint16_t seq, struct target *tg);
static int in_cksum(ushort_t *addr, int len);
static void reset_snxt_basetimes(void);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
 *
 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
 * of the link (according to the driver). If the phyint is also configured
 * with a test address (the common case) and probe targets, then a phyint must
 * also successfully be able to send and receive probes in order to remain in
 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not perform a failover, but reports the interface
 * as IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state. When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state - PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_groupfailed - Group failure, all interfaces in the group have failed.
 *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: No failovers have been done to this phyint, from
 *		other phyints. This phyint is inactive. Phyint can be a Standby.
 *		When failback has been disabled (FAILBACK=no configured),
 *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
 *		is set when phyint subsequently recovers after a failure.
 *
 * pi_empty
 *	This phyint has failed over successfully to another phyint, and
 *	this phyint is currently "empty". It does not host any addresses or
 *	multicast membership etc. This is the state of a phyint after a
 *	failover from the phyint has completed successfully and no subsequent
 *	'failover to' or 'failback to' has occurred on the phyint.
155 * IP guarantees that no new logicals will be hosted nor any multicast 156 * joins permitted on the phyint, since the phyint is either failed or 157 * inactive. pi_empty is set implies the phyint is either failed or 158 * inactive. 159 * 160 * pi_full 161 * The phyint hosts all of its own addresses that it "owns". If the 162 * phyint was previously failed or inactive, failbacks to the phyint 163 * has completed successfully. i.e. No more failbacks to this phyint 164 * can produce any change in system state whatsoever. 165 * 166 * Not all 32 possible combinations of the above 5-tuple are possible. 167 * Furthermore some of the above combinations are transient. They may occur 168 * only because the failover or failback did not complete successfully. The 169 * failover/failback will be retried and eventually a stable state will be 170 * reached. 171 * 172 * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. 173 * The following are the state machines. 'from' and 'to' are the src and 174 * dst of the failover/failback, below 175 * 176 * pi_empty state machine 177 * --------------------------------------------------------------------------- 178 * Event State -> New State 179 * --------------------------------------------------------------------------- 180 * successful completion from.pi_empty = 0 -> from.pi_empty = 1 181 * of failover 182 * 183 * Initiate failover to.pi_empty = X -> to.pi_empty = 0 184 * 185 * Initiate failback to.pi_empty = X -> to.pi_empty = 0 186 * 187 * group failure pi_empty = X -> pi_empty = 0 188 * --------------------------------------------------------------------------- 189 * 190 * pi_full state machine 191 * --------------------------------------------------------------------------- 192 * Event State -> New State 193 * --------------------------------------------------------------------------- 194 * successful completion to.pi_full = 0 -> to.pi_full = 1 195 * of failback from 196 * each of the other phyints 197 * 198 
* Initiate failover from.pi_full = X -> from.pi_full = 0 199 * 200 * group failure pi_full = X -> pi_full = 0 201 * --------------------------------------------------------------------------- 202 * 203 * pi_state state machine 204 * --------------------------------------------------------------------------- 205 * Event State New State 206 * Action: 207 * --------------------------------------------------------------------------- 208 * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 209 * detection : set IFF_FAILED on this phyint 210 * : failover from this phyint to another 211 * 212 * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 213 * detection : set IFF_FAILED on this phyint 214 * 215 * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) 216 * detection -> (PI_RUNNING, I == 0) 217 * : to.pi_empty = 0 218 * : clear IFF_FAILED on this phyint 219 * : failback to this phyint if enabled 220 * 221 * NIC repair (PI_FAILED, I == 0, FAILBACK=no) 222 * detection -> (PI_RUNNING, I == 1) 223 * : to.pi_empty = 0 224 * : clear IFF_FAILED on this phyint 225 * : if failback is disabled set I == 1 226 * 227 * Group failure (perform on all phyints in the group) 228 * detection PI_RUNNING PI_FAILED 229 * (Router targets) : set IFF_FAILED 230 * : clear pi_empty and pi_full 231 * 232 * Group failure (perform on all phyints in the group) 233 * detection PI_RUNNING PI_NOTARGETS 234 * (Host targets) : set IFF_FAILED 235 * : clear pi_empty and pi_full 236 * : delete the target list on all phyints 237 * --------------------------------------------------------------------------- 238 * 239 * I state machine 240 * --------------------------------------------------------------------------- 241 * Event State Action: 242 * --------------------------------------------------------------------------- 243 * Turn on I pi_empty == 0, STANDBY : failover from standby 244 * 245 * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 246 * pi_full == 0 : failback to this if enabled 247 * 
 * ---------------------------------------------------------------------------
 *
 * Assertions: (Read '==>' as implies)
 *
 * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
 * (pi_empty == 1) ==> (pi_full == 0)
 * (pi_full == 1) ==> (pi_empty == 0)
 *
 * Invariants
 *
 * pg_groupfailed = 0 &&
 *   1. (I == 1, pi_empty == 0) ==> initiate failover from standby
 *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
 *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
 *
 * 1. says that an inactive standby, that is not empty, has to be failed
 * over. For a standby to be truly inactive, it should not host any
 * addresses. So we move them to some other phyint. Usually we catch the
 * turn on of IFF_INACTIVE, and perform this action. However if the failover
 * did not complete successfully, then subsequently we have lost the edge
 * trigger, and this invariant kicks in and completes the action.
 *
 * 2. says that any failed phyint that is not empty must be failed over.
 * Usually we do the failover when we detect NIC failure. However if the
 * failover does not complete successfully, this invariant kicks in and
 * completes the failover. We exclude inactive standby which is covered by 1.
 *
 * 3. says that any running phyint that is not full must be failed back.
 * Usually we do the failback when we detect NIC repair. However if the
 * failback does not complete successfully, this invariant kicks in and
 * completes the failback. Note that we don't want to failback to an inactive
 * standby.
 *
 * The invariants 1 - 3 and the actions are in initifs().
 */

/*
 * NOTE(review): externally visible object; struct probes_missed is declared
 * outside this part of the file and nothing here reads or writes it.
 * Presumably it records probes that could not be sent on schedule -- confirm
 * against the header and the timer code before relying on that.
 */
struct probes_missed probes_missed;

/*
 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header
 * will be added on by the kernel. The id field identifies this phyint,
 * and the sequence number is an increasing (modulo 2^16) integer.
 * The data portion holds the time value when the packet is sent. On echo
 * this is extracted to compute the round-trip time. Three different types of
 * probe packets are used.
 *
 * PROBE_UNI: This type is used to do failure detection / failure recovery
 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *	not less than the current CRTT. pii_probes[] stores data
 *	about these probes. These packets consume sequence number space.
 *
 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
 *	are not used. Under heavy network load, the rtt may go up very high,
 *	due to a spike, or may appear to go high, due to extreme scheduling
 *	delays. Once the network stress is removed, mpathd takes a long time
 *	to recover, because the probe_interval is already high, and it takes
 *	a long time to send out sufficient number of probes to bring down the
 *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
 *	user_probe_interval ms. and will cause only rtt updates. These packets
 *	do not consume sequence number space nor is information about these
 *	packets stored in the pii_probes[]
 *
 * PROBE_MULTI: This type is only used to construct a list of targets, when
 *	no targets are known. The packet is multicast to the all hosts addr.
311 */ 312 static void 313 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 314 { 315 struct pr_icmp probe_pkt; /* Probe packet */ 316 struct sockaddr_in6 whereto6; /* target address IPv6 */ 317 struct sockaddr_in whereto; /* target address IPv4 */ 318 int pr_ndx; /* probe index in pii->pii_probes[] */ 319 boolean_t sent = _B_TRUE; 320 321 if (debug & D_TARGET) { 322 logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 323 pii->pii_name, probe_type, cur_time); 324 } 325 326 assert(pii->pii_probe_sock != -1); 327 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 328 probe_type == PROBE_RTT); 329 330 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 331 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 332 probe_pkt.pr_icmp_code = 0; 333 probe_pkt.pr_icmp_cksum = 0; 334 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 335 336 /* 337 * Since there is no need to do arithmetic on the icmpid, 338 * (only equality check is done) pii_icmpid is stored in 339 * network byte order at initialization itself. 340 */ 341 probe_pkt.pr_icmp_id = pii->pii_icmpid; 342 probe_pkt.pr_icmp_timestamp = htonl(cur_time); 343 probe_pkt.pr_icmp_mtype = htonl(probe_type); 344 345 /* 346 * If probe_type is PROBE_MULTI, this packet will be multicast to 347 * the all hosts address. Otherwise it is unicast to the next target. 
348 */ 349 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 350 pii->pii_rtt_target_next != NULL)); 351 352 if (pii->pii_af == AF_INET6) { 353 bzero(&whereto6, sizeof (whereto6)); 354 whereto6.sin6_family = AF_INET6; 355 if (probe_type == PROBE_MULTI) { 356 whereto6.sin6_addr = all_nodes_mcast_v6; 357 } else if (probe_type == PROBE_UNI) { 358 whereto6.sin6_addr = pii->pii_target_next->tg_address; 359 } else { 360 /* type is PROBE_RTT */ 361 whereto6.sin6_addr = 362 pii->pii_rtt_target_next->tg_address; 363 } 364 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 365 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 366 sizeof (whereto6)) != sizeof (probe_pkt)) { 367 logperror_pii(pii, "probe: probe sendto"); 368 sent = _B_FALSE; 369 } 370 } else { 371 bzero(&whereto, sizeof (whereto)); 372 whereto.sin_family = AF_INET; 373 if (probe_type == PROBE_MULTI) { 374 whereto.sin_addr = all_nodes_mcast_v4; 375 } else if (probe_type == PROBE_UNI) { 376 IN6_V4MAPPED_TO_INADDR( 377 &pii->pii_target_next->tg_address, 378 &whereto.sin_addr); 379 } else { 380 /* type is PROBE_RTT */ 381 IN6_V4MAPPED_TO_INADDR( 382 &pii->pii_rtt_target_next->tg_address, 383 &whereto.sin_addr); 384 } 385 386 /* 387 * Compute the IPv4 icmp checksum. Does not cover the IP header. 388 */ 389 probe_pkt.pr_icmp_cksum = 390 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 391 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 392 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 393 sizeof (whereto)) != sizeof (probe_pkt)) { 394 logperror_pii(pii, "probe: probe sendto"); 395 sent = _B_FALSE; 396 } 397 } 398 399 /* 400 * If this is a PROBE_UNI probe packet being unicast to a target, then 401 * update our tables. We will need this info in processing the probe 402 * response. PROBE_MULTI and PROBE_RTT packets are not used for 403 * the purpose of failure or recovery detection. PROBE_MULTI packets 404 * are only used to construct a list of targets. 
PROBE_RTT packets are 405 * used only for updating the rtt and not for failure detection. 406 */ 407 if (probe_type == PROBE_UNI && sent) { 408 pr_ndx = pii->pii_probe_next; 409 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 410 411 /* Collect statistics, before we reuse the last slot. */ 412 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 413 pii->pii_cum_stats.lost++; 414 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 415 pii->pii_cum_stats.acked++; 416 pii->pii_cum_stats.sent++; 417 418 pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 419 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 420 pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 421 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 422 pii->pii_target_next = target_next(pii->pii_target_next); 423 assert(pii->pii_target_next != NULL); 424 /* 425 * If we have a single variable to denote the next target to 426 * probe for both rtt probes and failure detection probes, we 427 * could end up with a situation where the failure detection 428 * probe targets become disjoint from the rtt probe targets. 429 * Eg. if 2 targets and the actual fdt is double the user 430 * specified fdt. So we have 2 variables. In this scheme 431 * we also reset pii_rtt_target_next for every fdt probe, 432 * though that may not be necessary. 433 */ 434 pii->pii_rtt_target_next = pii->pii_target_next; 435 pii->pii_snxt++; 436 } else if (probe_type == PROBE_RTT) { 437 pii->pii_rtt_target_next = 438 target_next(pii->pii_rtt_target_next); 439 assert(pii->pii_rtt_target_next != NULL); 440 } 441 } 442 443 /* 444 * Incoming IPv4 data from wire, is received here. Called from main. 
445 */ 446 void 447 in_data(struct phyint_instance *pii) 448 { 449 struct sockaddr_in from; 450 struct in6_addr fromaddr; 451 uint_t fromlen; 452 static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 453 struct ip *ip; 454 int iphlen; 455 int len; 456 char abuf[INET_ADDRSTRLEN]; 457 struct pr_icmp *reply; 458 459 if (debug & D_PROBE) { 460 logdebug("in_data(%s %s)\n", 461 AF_STR(pii->pii_af), pii->pii_name); 462 } 463 464 /* 465 * Poll has already told us that a message is waiting, 466 * on this socket. Read it now. We should not block. 467 */ 468 fromlen = sizeof (from); 469 len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 470 sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 471 if (len < 0) { 472 logperror_pii(pii, "in_data: recvfrom"); 473 return; 474 } 475 476 /* 477 * If the NIC has indicated the link is down, don't go 478 * any further. 479 */ 480 if (LINK_DOWN(pii->pii_phyint)) 481 return; 482 483 /* Get the printable address for error reporting */ 484 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 485 486 /* Make sure packet contains at least minimum ICMP header */ 487 ip = (struct ip *)in_packet; 488 iphlen = ip->ip_hl << 2; 489 if (len < iphlen + ICMP_MINLEN) { 490 if (debug & D_PKTBAD) { 491 logdebug("in_data: packet too short (%d bytes)" 492 " from %s\n", len, abuf); 493 } 494 return; 495 } 496 497 /* 498 * Subtract the IP hdr length, 'len' will be length of the probe 499 * reply, starting from the icmp hdr. 500 */ 501 len -= iphlen; 502 /* LINTED */ 503 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 504 505 /* Probe replies are icmp echo replies. Ignore anything else */ 506 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 507 return; 508 509 /* 510 * The icmp id should match what we sent, which is stored 511 * in pi_icmpid. The icmp code for reply must be 0. 
512 * The reply content must be a struct pr_icmp 513 */ 514 if (reply->pr_icmp_id != pii->pii_icmpid) { 515 /* Not in response to our probe */ 516 return; 517 } 518 519 if (reply->pr_icmp_code != 0) { 520 logtrace("probe reply code %d from %s on %s\n", 521 reply->pr_icmp_code, abuf, pii->pii_name); 522 return; 523 } 524 525 if (len < sizeof (struct pr_icmp)) { 526 logtrace("probe reply too short: %d bytes from %s on %s\n", 527 len, abuf, pii->pii_name); 528 return; 529 } 530 531 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 532 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 533 /* Unicast probe reply */ 534 incoming_echo_reply(pii, reply, fromaddr); 535 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 536 /* Multicast reply */ 537 incoming_mcast_reply(pii, reply, fromaddr); 538 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 539 incoming_rtt_reply(pii, reply, fromaddr); 540 } else { 541 /* Probably not in response to our probe */ 542 logtrace("probe reply type: %d from %s on %s\n", 543 reply->pr_icmp_mtype, abuf, pii->pii_name); 544 return; 545 } 546 547 } 548 549 /* 550 * Incoming IPv6 data from wire is received here. Called from main. 
551 */ 552 void 553 in6_data(struct phyint_instance *pii) 554 { 555 struct sockaddr_in6 from; 556 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 557 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 558 int len; 559 char abuf[INET6_ADDRSTRLEN]; 560 struct msghdr msg; 561 struct iovec iov; 562 uchar_t *opt; 563 struct pr_icmp *reply; 564 565 if (debug & D_PROBE) { 566 logdebug("in6_data(%s %s)\n", 567 AF_STR(pii->pii_af), pii->pii_name); 568 } 569 570 iov.iov_base = (char *)in_packet; 571 iov.iov_len = sizeof (in_packet); 572 msg.msg_iov = &iov; 573 msg.msg_iovlen = 1; 574 msg.msg_name = (struct sockaddr *)&from; 575 msg.msg_namelen = sizeof (from); 576 msg.msg_control = ancillary_data; 577 msg.msg_controllen = sizeof (ancillary_data); 578 579 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 580 logperror_pii(pii, "in6_data: recvfrom"); 581 return; 582 } 583 584 /* 585 * If the NIC has indicated that the link is down, don't go 586 * any further. 587 */ 588 if (LINK_DOWN(pii->pii_phyint)) 589 return; 590 591 /* Get the printable address for error reporting */ 592 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 593 if (len < ICMP_MINLEN) { 594 if (debug & D_PKTBAD) { 595 logdebug("Truncated message: msg_flags 0x%x from %s\n", 596 msg.msg_flags, abuf); 597 } 598 return; 599 } 600 /* Ignore packets > 64k or control buffers that don't fit */ 601 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 602 if (debug & D_PKTBAD) { 603 logdebug("Truncated message: msg_flags 0x%x from %s\n", 604 msg.msg_flags, abuf); 605 } 606 return; 607 } 608 609 reply = (struct pr_icmp *)in_packet; 610 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 611 return; 612 613 if (reply->pr_icmp_id != pii->pii_icmpid) { 614 /* Not in response to our probe */ 615 return; 616 } 617 618 /* 619 * The kernel has already verified the the ICMP checksum. 
620 */ 621 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 622 logtrace("ICMPv6 echo reply source address not linklocal from " 623 "%s on %s\n", abuf, pii->pii_name); 624 return; 625 } 626 opt = find_ancillary(&msg, IPV6_RTHDR); 627 if (opt != NULL) { 628 /* Can't allow routing headers in probe replies */ 629 logtrace("message with routing header from %s on %s\n", 630 abuf, pii->pii_name); 631 return; 632 } 633 if (reply->pr_icmp_code != 0) { 634 logtrace("probe reply code: %d from %s on %s\n", 635 reply->pr_icmp_code, abuf, pii->pii_name); 636 return; 637 } 638 if (len < (sizeof (struct pr_icmp))) { 639 logtrace("probe reply too short: %d bytes from %s on %s\n", 640 len, abuf, pii->pii_name); 641 return; 642 } 643 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 644 incoming_echo_reply(pii, reply, from.sin6_addr); 645 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 646 incoming_mcast_reply(pii, reply, from.sin6_addr); 647 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 648 incoming_rtt_reply(pii, reply, from.sin6_addr); 649 } else { 650 /* Probably not in response to our probe */ 651 logtrace("probe reply type: %d from %s on %s\n", 652 reply->pr_icmp_mtype, abuf, pii->pii_name); 653 } 654 } 655 656 /* 657 * Process the incoming rtt reply, in response to our rtt probe. 658 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 659 * have any stored information about the probe we sent. So we don't log 660 * any errors if we receive bad replies. 
661 */ 662 static void 663 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 664 struct in6_addr fromaddr) 665 { 666 int m; /* rtt measurment in ms */ 667 uint32_t cur_time; /* in ms from some arbitrary point */ 668 char abuf[INET6_ADDRSTRLEN]; 669 struct target *target; 670 uint32_t pr_icmp_timestamp; 671 struct phyint_group *pg; 672 673 /* Get the printable address for error reporting */ 674 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 675 676 if (debug & D_PROBE) { 677 logdebug("incoming_rtt_reply: %s %s %s\n", 678 AF_STR(pii->pii_af), pii->pii_name, abuf); 679 } 680 681 /* Do we know this target ? */ 682 target = target_lookup(pii, fromaddr); 683 if (target == NULL) 684 return; 685 686 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 687 cur_time = getcurrenttime(); 688 m = (int)(cur_time - pr_icmp_timestamp); 689 690 /* Invalid rtt. It has wrapped around */ 691 if (m < 0) 692 return; 693 694 /* 695 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 696 * The initial few responses after the interface is repaired may 697 * contain high rtt's because they could have been queued up waiting 698 * for ARP/NDP resolution on a failed interface. 699 */ 700 pg = pii->pii_phyint->pi_group; 701 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 702 return; 703 704 /* 705 * Update rtt only if the new rtt is lower than the current rtt. 706 * (specified by the 3rd parameter to pi_set_crtt). 707 * If a spike has caused the current probe_interval to be > 708 * user_probe_interval, then this mechanism is used to bring down 709 * the rtt rapidly once the network stress is removed. 710 * If the new rtt is higher than the current rtt, we don't want to 711 * update the rtt. We are having more than 1 outstanding probe and 712 * the increase in rtt we are seeing is being unnecessarily weighted 713 * many times. The regular rtt update will be handled by 714 * incoming_echo_reply() and will take care of any rtt increase. 
715 */ 716 pi_set_crtt(target, m, _B_FALSE); 717 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 718 (user_failure_detection_time < pg->pg_fdt) && 719 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 720 /* 721 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 722 * investigate if we can improve the failure detection time to 723 * meet whatever the user specified. 724 */ 725 if (check_pg_crtt_improved(pg)) { 726 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 727 user_failure_detection_time); 728 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 729 if (pii->pii_phyint->pi_group != phyint_anongroup) { 730 logerr("Improved failure detection time %d ms " 731 "on (%s %s) for group \"%s\"\n", 732 pg->pg_fdt, AF_STR(pii->pii_af), 733 pii->pii_name, 734 pii->pii_phyint->pi_group->pg_name); 735 } 736 if (user_failure_detection_time == pg->pg_fdt) { 737 /* Avoid any truncation or rounding errors */ 738 pg->pg_probeint = user_probe_interval; 739 /* 740 * No more rtt probes will be sent. The actual 741 * fdt has dropped to the user specified value. 742 * pii_fd_snxt_basetime and pii_snxt_basetime 743 * will be in sync henceforth. 744 */ 745 reset_snxt_basetimes(); 746 } 747 } 748 } 749 } 750 751 /* 752 * Process the incoming echo reply, in response to our unicast probe. 
 * Common for both IPv4 and IPv6
 *
 * Outline of the processing below:
 *	- replies whose sequence number is outside the window
 *	  [pii_snxt - PROBE_STATS_COUNT, pii_snxt) are counted as unknown
 *	  and dropped.
 *	- replies whose source does not match the recorded target, duplicate
 *	  replies, and replies for slots without a target are counted as
 *	  unknown and dropped.
 *	- if the measured rtt 'm' is plausible, the phyint instance is
 *	  PI_RUNNING, the group has not failed and this is the highest acked
 *	  sequence number for the target, the target's crtt is updated and the
 *	  group's probe interval / failure detection time may be raised or
 *	  lowered.
 *	- in all accepted cases the probe slot is marked PR_ACKED and
 *	  pii_rack is updated.
 */
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
{
	int	m;	/* rtt measurement in ms */
	uint32_t cur_time;	/* in ms from some arbitrary point */
	char	abuf[INET6_ADDRSTRLEN];
	int	pr_ndx;
	struct	target *target;
	boolean_t exception;
	uint32_t pr_icmp_timestamp;
	uint16_t pr_icmp_seq;
	struct	phyint_group *pg = pii->pii_phyint->pi_group;

	/* Get the printable address for error reporting */
	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));

	if (debug & D_PROBE) {
		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
		    AF_STR(pii->pii_af), pii->pii_name, abuf,
		    ntohs(reply->pr_icmp_seq));
	}

	/* The timestamp and sequence number arrive in network byte order */
	pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp);
	pr_icmp_seq = ntohs(reply->pr_icmp_seq);

	/* Reject out of window probe replies */
	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		pii->pii_cum_stats.unknown++;
		return;
	}
	cur_time = getcurrenttime();
	m = (int)(cur_time - pr_icmp_timestamp);
	if (m < 0) {
		/*
		 * This is a ridiculously high value of rtt. rtt has wrapped
		 * around. Log a message, and ignore the rtt. (The negative
		 * value is filtered out by the 'm < 0' check further below.)
		 */
		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
	}

	/*
	 * Get the probe index pr_ndx corresponding to the received icmp seq.
	 * number in our pii->pii_probes[] array. The icmp sequence number
	 * pii_snxt corresponds to the probe index pii->pii_probe_next
	 */
	pr_ndx = MOD_SUB(pii->pii_probe_next,
	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);

	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));

	target = pii->pii_probes[pr_ndx].pr_target;

	/*
	 * Perform sanity checks, whether this probe reply that we
	 * have received is genuine
	 */
	if (target != NULL) {
		/*
		 * Compare the src. addr of the received ICMP or ICMPv6
		 * probe reply with the target address in our tables.
		 */
		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
			/*
			 * We don't have any record of having sent a probe to
			 * this target. This is a fake probe reply. Log an error
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			/*
			 * The address matches, but our tables indicate that
			 * this probe reply has been acked already. So this
			 * is a duplicate probe reply. Log an error
			 */
			logtrace("probe status %d Duplicate probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		}
	} else {
		/*
		 * Target must not be NULL in the PR_UNACKED state
		 */
		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
			/*
			 * The probe stats slot is unused. So we didn't
			 * send out any probe to this target. This is a fake.
			 * Log an error.
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		}
		/* Any reply for a slot without a target is counted as unknown */
		pii->pii_cum_stats.unknown++;
		return;
	}

	/*
	 * If the rtt does not appear to be right, don't update the
	 * rtt stats. This can happen if the system dropped into the
	 * debugger, or the system was hung or too busy for a
	 * substantial time that we didn't get a chance to run.
	 */
	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
		/*
		 * If the probe corresponding to this received response
		 * was truly sent 'm' ms. ago, then this response must
		 * have been rejected by the sequence number checks. The
		 * fact that it has passed the sequence number checks
		 * means that the measured rtt is wrong. We were probably
		 * scheduled long after the packet was received.
		 */
		goto out;
	}

	/*
	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
	 * The initial few responses after the interface is repaired may
	 * contain high rtt's because they could have been queued up waiting
	 * for ARP/NDP resolution on a failed interface.
	 */
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
		goto out;

	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
	if (!highest_ack_tg(pr_icmp_seq, target))
		goto out;

	/*
	 * Always update the rtt. This is a failure detection probe
	 * and we want to measure both increase / decrease in rtt.
	 */
	pi_set_crtt(target, m, _B_TRUE);

	/*
	 * If the crtt exceeds the average time between probes,
	 * investigate if this slow target is an exception. If so we
	 * can avoid this target and still meet the failure detection
	 * time. Otherwise we can't meet the failure detection time.
	 */
	if (target->tg_crtt > pg->pg_probeint) {
		exception = check_exception_target(pii, target);
		if (exception) {
			/*
			 * This target is exceptionally slow. Don't use it
			 * for future probes. check_exception_target() has
			 * made sure that we have at least MIN_PROBE_TARGETS
			 * other active targets
			 */
			if (pii->pii_targets_are_routers) {
				/*
				 * This is a slow router, mark it as slow
				 * and don't use it for further probes. We
				 * don't delete it, since it will be populated
				 * again when we do a router scan. Hence we
				 * need to maintain extra state (unlike the
				 * host case below).  Mark it as TG_SLOW.
				 */
				if (target->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				target->tg_status = TG_SLOW;
				target->tg_latime = gethrtime();
				target->tg_rtt_sa = -1;
				target->tg_crtt = 0;
				target->tg_rtt_sd = 0;
				if (pii->pii_target_next == target) {
					pii->pii_target_next =
					    target_next(target);
				}
			} else {
				/*
				 * the slow target is not a router, we can
				 * just delete it. Send an icmp multicast and
				 * pick the fastest responder that is not
				 * already an active target. target_delete()
				 * adjusts pii->pii_target_next
				 */
				target_delete(target);
				probe(pii, PROBE_MULTI, cur_time);
			}
		} else {
			/*
			 * We can't meet the failure detection time.
			 * Log a message, and update the detection time to
			 * whatever we can achieve.
			 */
			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
			last_fdt_bumpup_time = gethrtime();
			if (pg != phyint_anongroup) {
				logerr("Cannot meet requested failure detection"
				    " time of %d ms on (%s %s) new failure"
				    " detection time for group \"%s\" is %d"
				    " ms\n", user_failure_detection_time,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name, pg->pg_fdt);
			}
		}
	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
	    (user_failure_detection_time < pg->pg_fdt) &&
	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
		/*
		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
		 * investigate if we can improve the failure detection time to
		 * meet whatever the user specified.
		 */
		if (check_pg_crtt_improved(pg)) {
			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
			    user_failure_detection_time);
			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
			if (pg != phyint_anongroup) {
				logerr("Improved failure detection time %d ms "
				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name);
			}
			if (user_failure_detection_time == pg->pg_fdt) {
				/* Avoid any truncation or rounding errors */
				pg->pg_probeint = user_probe_interval;
				/*
				 * No more rtt probes will be sent. The actual
				 * fdt has dropped to the user specified value.
				 * pii_fd_snxt_basetime and pii_snxt_basetime
				 * will be in sync henceforth.
				 */
				reset_snxt_basetimes();
			}
		}
	}
out:
	/* Record the acknowledgement in the probe stats slot */
	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;

	/*
	 * Update pii->pii_rack, i.e. the sequence number of the last received
	 * probe response, based on the echo reply we have received now, if
	 * either of the following conditions are satisfied.
	 * a. pii_rack is outside the current receive window of
	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
	 *    This means we have not received probe responses for a
	 *    long time, and the sequence number has wrapped around.
	 * b. pii_rack is within the current receive window and this echo
	 *    reply corresponds to the highest sequence number we have seen
	 *    so far.
	 */
	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
		pii->pii_rack = pr_icmp_seq;
	}
}

/*
 * Returns true if seq is the highest acknowledged seq for target tg, i.e.
 * no probe slot that belongs to tg and is already in the PR_ACKED state
 * carries a sequence number greater than seq. Else returns false.
 */
static boolean_t
highest_ack_tg(uint16_t seq, struct target *tg)
{
	struct phyint_instance *pii;
	int	 pr_ndx;
	uint16_t pr_seq;

	pii = tg->tg_phyint_inst;

	/*
	 * Get the seq number of the most recent probe sent so far,
	 * and also get the corresponding probe index in the probe stats
	 * array.
	 */
	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	pr_seq = pii->pii_snxt;
	pr_seq--;

	/*
	 * Start from the most recent probe and walk back, trying to find
	 * an acked probe corresponding to target tg. The walk stops when
	 * the index comes back around to pii_probe_next; pr_seq is
	 * decremented in step with the index.
	 */
	for (; pr_ndx != pii->pii_probe_next;
	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
		if (pii->pii_probes[pr_ndx].pr_target == tg &&
		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			if (SEQ_GT(pr_seq, seq))
				return (_B_FALSE);
		}
	}
	return (_B_TRUE);
}

/*
 * Check whether the crtt for the group has improved by a factor of
 * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
 * detection time flapping in the face of small crtt changes.
1065 */ 1066 static boolean_t 1067 check_pg_crtt_improved(struct phyint_group *pg) 1068 { 1069 struct phyint *pi; 1070 1071 if (debug & D_PROBE) 1072 logdebug("check_pg_crtt_improved()\n"); 1073 1074 /* 1075 * The crtt for the group is only improved if each phyint_instance 1076 * for both ipv4 and ipv6 is improved. 1077 */ 1078 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1079 if (!check_pii_crtt_improved(pi->pi_v4) || 1080 !check_pii_crtt_improved(pi->pi_v6)) 1081 return (_B_FALSE); 1082 } 1083 1084 return (_B_TRUE); 1085 } 1086 1087 /* 1088 * Check whether the crtt has improved substantially on this phyint_instance. 1089 * Returns _B_TRUE if there's no crtt information available, because pii 1090 * is NULL or the phyint_instance is not capable of probing. 1091 */ 1092 boolean_t 1093 check_pii_crtt_improved(struct phyint_instance *pii) { 1094 struct target *tg; 1095 1096 if (pii == NULL) 1097 return (_B_TRUE); 1098 1099 if (!PROBE_CAPABLE(pii) || 1100 pii->pii_phyint->pi_state == PI_FAILED) 1101 return (_B_TRUE); 1102 1103 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1104 if (tg->tg_status != TG_ACTIVE) 1105 continue; 1106 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1107 LOWER_FDT_TRIGGER)) { 1108 return (_B_FALSE); 1109 } 1110 } 1111 1112 return (_B_TRUE); 1113 } 1114 1115 /* 1116 * This target responds very slowly to probes. The target's crtt exceeds 1117 * the probe interval of its group. 
Compare against other targets 1118 * and determine if this target is an exception, if so return true, else false 1119 */ 1120 static boolean_t 1121 check_exception_target(struct phyint_instance *pii, struct target *target) 1122 { 1123 struct target *tg; 1124 char abuf[INET6_ADDRSTRLEN]; 1125 1126 if (debug & D_PROBE) { 1127 logdebug("check_exception_target(%s %s target %s)\n", 1128 AF_STR(pii->pii_af), pii->pii_name, 1129 pr_addr(pii->pii_af, target->tg_address, 1130 abuf, sizeof (abuf))); 1131 } 1132 1133 /* 1134 * We should have at least MIN_PROBE_TARGETS + 1 good targets now, 1135 * to make a good judgement. Otherwise don't drop this target. 1136 */ 1137 if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) 1138 return (_B_FALSE); 1139 1140 /* 1141 * Determine whether only this particular target is slow. 1142 * We know that this target's crtt exceeds the group's probe interval. 1143 * If all other active targets have a 1144 * crtt < (this group's probe interval) / EXCEPTION_FACTOR, 1145 * then this target is considered slow. 1146 */ 1147 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1148 if (tg != target && tg->tg_status == TG_ACTIVE) { 1149 if (tg->tg_crtt > 1150 pii->pii_phyint->pi_group->pg_probeint / 1151 EXCEPTION_FACTOR) { 1152 return (_B_FALSE); 1153 } 1154 } 1155 } 1156 1157 return (_B_TRUE); 1158 } 1159 1160 /* 1161 * Update the target list. The icmp all hosts multicast has given us 1162 * some host to which we can send probes. If we already have sufficient 1163 * targets, discard it. 
 */
static void
incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr)
/* ARGSUSED */
{
	int af;
	char abuf[INET6_ADDRSTRLEN];
	struct phyint *pi;

	if (debug & D_PROBE) {
		logdebug("incoming_mcast_reply(%s %s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
	}

	/*
	 * Using host targets is a fallback mechanism. If we have
	 * found a router, don't add this host target. If we already
	 * know MAX_PROBE_TARGETS, don't add another target.
	 */
	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
	if (pii->pii_targets != NULL) {
		if (pii->pii_targets_are_routers ||
		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
			return;
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
		/*
		 * Guard against response from 0.0.0.0
		 * and ::. Log a trace message
		 */
		logtrace("probe response from %s on %s\n",
		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
		    pii->pii_name);
		return;
	}

	/*
	 * This address is one of our own, so reject this address as a
	 * valid probe target.
	 */
	af = pii->pii_af;
	if (own_address(fromaddr))
		return;

	/*
	 * If the phyint is part a named group, then add the address to all
	 * members of the group.  Otherwise, add the address only to the
	 * phyint itself, since other phyints in the anongroup may not be on
	 * the same subnet.
	 */
	pi = pii->pii_phyint;
	if (pi->pi_group == phyint_anongroup) {
		target_add(pii, fromaddr, _B_FALSE);
	} else {
		pi = pi->pi_group->pg_phyint;
		for (; pi != NULL; pi = pi->pi_pgnext)
			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
	}
}

/*
 * Compute CRTT given an existing scaled average, scaled deviation estimate
 * and a new rtt time.  The formula is from Jacobson and Karels'
 * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
 * are the same as those in Appendix A.2 of that paper.
 *
 * m = new measurement
 * sa = scaled RTT average (8 * average estimates)
 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
 * crtt = Conservative round trip time. Used to determine whether probe
 * has timed out.
 *
 * New scaled average and deviation are passed back via sap and svp
 *
 * A value of -1 in *sap means that no measurement has been folded in yet;
 * in that case the estimators are initialized from m. The return value is
 * the new crtt.
 */
static int
compute_crtt(int *sap, int *svp, int m)
{
	int sa = *sap;
	int sv = *svp;
	int crtt;
	int saved_m = m;	/* m is overwritten below; kept for logging */

	assert(*sap >= -1);
	assert(*svp >= 0);

	if (sa != -1) {
		/*
		 * Update average estimator:
		 *	new rtt = old rtt + 1/8 Error
		 *	    where Error = m - old rtt
		 *	i.e. 8 * new rtt = 8 * old rtt + Error
		 *	i.e. new sa =  old sa + Error
		 */
		m -= sa >> 3;		/* m is now Error in estimate. */
		if ((sa += m) < 0) {
			/* Don't allow the smoothed average to be negative. */
			sa = 0;
		}

		/*
		 * Update deviation estimator:
		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
		 *	i.e. 4 * new mdev = 4 * old mdev +
		 *		(abs(Error) - old mdev)
		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/* Initialization. This is the first response received. */
		sa = (m << 3);
		sv = (m << 1);
	}

	crtt = (sa >> 3) + sv;

	if (debug & D_PROBE) {
		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
		    "%d\n", saved_m, sa, sv, crtt);
	}

	*sap = sa;
	*svp = sv;

	/*
	 * CRTT = average estimates  + 4 * deviation estimates
	 *	= sa / 8 + sv
	 */
	return (crtt);
}

/*
 * Fold the round trip time measurement m into the crtt state of target tg.
 *
 * is_probe_uni is _B_TRUE for a normal (failure detection) probe. For such
 * a probe, a measurement that on its own would push the crtt above the
 * group's probe interval is treated as suspect: it is queued in
 * tg_deferred[] and only integrated, together with the other queued
 * values, once tg_num_deferred has reached MAXDEFERREDRTT. Otherwise the
 * new estimate is stored if this is a normal probe, or if it lowers the
 * target's crtt.
 */
static void
pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
{
	struct phyint_instance *pii = tg->tg_phyint_inst;
	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
	int sa = tg->tg_rtt_sa;
	int sv = tg->tg_rtt_sd;
	int new_crtt;
	int i;

	if (debug & D_PROBE)
		logdebug("pi_set_crtt: target -  m %d\n", m);

	/* store the round trip time, in case we need to defer computation */
	tg->tg_deferred[tg->tg_num_deferred] = m;

	/* Trial computation on local copies; tg itself is not modified yet */
	new_crtt = compute_crtt(&sa, &sv, m);

	/*
	 * If this probe's round trip time would singlehandedly cause an
	 * increase in the group's probe interval consider it suspect.
	 */
	if ((new_crtt > probe_interval) && is_probe_uni) {
		if (debug & D_PROBE) {
			logdebug("Received a suspect probe on %s, new_crtt ="
			    " %d, probe_interval = %d, num_deferred = %d\n",
			    pii->pii_probe_logint->li_name, new_crtt,
			    probe_interval, tg->tg_num_deferred);
		}

		/*
		 * If we've deferred as many rtts as we plan on deferring, then
		 * assume the link really did slow down and process all queued
		 * rtts
		 */
		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
			if (debug & D_PROBE) {
				logdebug("Received MAXDEFERREDRTT probes which "
				    "would cause an increased probe_interval. "
				    "Integrating queued rtt data points.\n");
			}

			/* Entries 0..tg_num_deferred inclusive are valid */
			for (i = 0; i <= tg->tg_num_deferred; i++) {
				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
			}

			tg->tg_num_deferred = 0;
		} else {
			tg->tg_num_deferred++;
		}
		return;
	}

	/*
	 * If this is a normal probe, or an RTT probe that would lead to a
	 * reduced CRTT, then update our CRTT data.  Further, if this was
	 * a normal probe, pitch any deferred probes since our probes are
	 * again being answered within our CRTT estimates.
	 */
	if (is_probe_uni || new_crtt < tg->tg_crtt) {
		tg->tg_rtt_sa = sa;
		tg->tg_rtt_sd = sv;
		tg->tg_crtt = new_crtt;
		if (is_probe_uni)
			tg->tg_num_deferred = 0;
	}
}

/*
 * Return a pointer to the specified option buffer.
 * If not found return NULL.
 *
 * Only control messages whose level is IPPROTO_IPV6 are considered; the
 * first one whose type equals cmsg_type is returned.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *cmsg;

	for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(msg, cmsg)) {
		if (cmsg->cmsg_level == IPPROTO_IPV6 &&
		    cmsg->cmsg_type == cmsg_type) {
			return (CMSG_DATA(cmsg));
		}
	}
	return (NULL);
}

/*
 * See if a previously failed interface has started working again.
 *
 * Nothing is done unless phyint_repaired() reports the phyint as repaired.
 */
void
phyint_check_for_repair(struct phyint *pi)
{
	if (phyint_repaired(pi)) {
		if (pi->pi_group == phyint_anongroup) {
			logerr("NIC repair detected on %s\n", pi->pi_name);
		} else {
			logerr("NIC repair detected on %s of group %s\n",
			    pi->pi_name, pi->pi_group->pg_name);
		}

		/*
		 * If the interface is offline, just clear the FAILED flag,
		 * delaying the state change and failback operation until it
		 * is brought back online.
		 */
		if (pi->pi_state == PI_OFFLINE) {
			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
			return;
		}

		/*
		 * A standby interface only has IFF_FAILED cleared; for any
		 * other interface the flag is cleared only if the failback
		 * attempt did not return IPMP_FAILURE.
		 */
		if (pi->pi_flags & IFF_STANDBY) {
			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
		} else {
			if (try_failback(pi) != IPMP_FAILURE) {
				(void) change_lif_flags(pi,
				    IFF_FAILED, _B_FALSE);
				/* Per state diagram */
				pi->pi_empty = 0;
			}
		}

		phyint_chstate(pi, PI_RUNNING);

		if (GROUP_FAILED(pi->pi_group)) {
			/*
			 * This is the 1st phyint to receive a response
			 * after group failure.
			 */
			logerr("At least 1 interface (%s) of group %s has "
			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
			phyint_group_chstate(pi->pi_group, PG_RUNNING);
		}
	}
}

/*
 * See if a previously functioning interface has failed, or if the
 * whole group of interfaces has failed.
 *
 * The action taken depends on the value returned by failure_state(pii);
 * values other than PHYINT_FAILURE and GROUP_FAILURE are ignored.
 */
static void
phyint_inst_check_for_failure(struct phyint_instance *pii)
{
	struct	phyint	*pi;
	struct	phyint	*pi2;

	pi = pii->pii_phyint;

	switch (failure_state(pii)) {
	case PHYINT_FAILURE:
		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
		if (pi->pi_group == phyint_anongroup) {
			logerr("NIC failure detected on %s\n", pii->pii_name);
		} else {
			logerr("NIC failure detected on %s of group %s\n",
			    pii->pii_name, pi->pi_group->pg_name);
		}
		/*
		 * Do the failover, unless the interface is offline (in
		 * which case we've already failed over).
		 */
		if (pi->pi_state != PI_OFFLINE) {
			phyint_chstate(pi, PI_FAILED);
			reset_crtt_all(pi);
			if (!(pi->pi_flags & IFF_INACTIVE))
				(void) try_failover(pi, FAILOVER_NORMAL);
		}
		break;

	case GROUP_FAILURE:
		logerr("All Interfaces in group %s have failed\n",
		    pi->pi_group->pg_name);
		/* Mark every phyint of the group that is not offline */
		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
		    pi2 = pi2->pi_pgnext) {
			if (pi2->pi_flags & IFF_OFFLINE)
				continue;
			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
			reset_crtt_all(pi2);

			/*
			 * In the case of host targets, we
			 * would have flushed the targets,
			 * and gone to PI_NOTARGETS state.
			 */
			if (pi2->pi_state == PI_RUNNING)
				phyint_chstate(pi2, PI_FAILED);

			pi2->pi_empty = 0;
			pi2->pi_full = 0;
		}
		break;

	default:
		break;
	}
}

/*
 * Determines if any timeout event has occurred and returns the number of
 * milliseconds until the next timeout event for the phyint.  Returns
 * TIMER_INFINITY for "never".
 */
uint_t
phyint_inst_timer(struct phyint_instance *pii)
{
	int	pr_ndx;
	uint_t	timeout;
	struct	target	*cur_tg;
	struct	probe_stats *pr_statp;
	struct	phyint_instance *pii_other;
	struct	phyint *pi;
	int	valid_unack_count;
	int	i;
	int	interval;
	uint_t	check_time;
	uint_t	cur_time;
	hrtime_t cur_hrtime;
	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;

	cur_time = getcurrenttime();

	if (debug & D_TIMER) {
		logdebug("phyint_inst_timer(%s %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name);
	}

	pii_other = phyint_inst_other(pii);
	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
		/*
		 * Check to see if we're here due to link up/down flapping; If
		 * enough time has passed, then try to bring the interface
		 * back up; otherwise, schedule a timer to bring it back up
		 * when enough time *has* elapsed.
		 */
		pi = pii->pii_phyint;
		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
			/*
			 * NOTE(review): this is a plain unsigned comparison,
			 * unlike the TIME_LT/TIME_GT macros used for the other
			 * millisecond timestamps in this function - confirm
			 * that this is intended.
			 */
			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
			if (check_time > cur_time)
				return (check_time - cur_time);

			phyint_check_for_repair(pi);
		}
	}

	/*
	 * If probing is not enabled on this phyint instance, don't proceed.
	 */
	if (!PROBE_ENABLED(pii))
		return (TIMER_INFINITY);

	/*
	 * If the timer has fired too soon, probably triggered
	 * by some other phyint instance, return the remaining
	 * time
	 */
	if (TIME_LT(cur_time, pii->pii_snxt_time))
		return (pii->pii_snxt_time - cur_time);

	/*
	 * If the link is down, don't send any probes for now.
	 */
	if (LINK_DOWN(pii->pii_phyint))
		return (TIMER_INFINITY);

	/*
	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
	 * Base probe time is strictly periodic.
	 */
	interval = GET_RANDOM(
	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;

	/*
	 * Check if the current time > next time to probe. If so, we missed
	 * sending 1 or more probes, probably due to heavy system load. At least
	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
	 * were scheduled. Make adjustments to the times, in multiples of
	 * user_probe_interval.
	 */
	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
		int n;

		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
		pii->pii_snxt_time += (n + 1) * user_probe_interval;
		pii->pii_snxt_basetime += (n + 1) * user_probe_interval;
		logtrace("missed sending %d probes cur_time %u snxt_time %u"
		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
		    pii->pii_snxt_basetime);

		/* Collect statistics about missed probes */
		probes_missed.pm_nprobes += n + 1;
		probes_missed.pm_ntimes++;
	}
	pii->pii_snxt_basetime += user_probe_interval;
	/* From here on 'interval' is the time left until the next probe */
	interval = pii->pii_snxt_time - cur_time;
	if (debug & D_TARGET) {
		/* NOTE(review): 'interval' is an int but is printed with %u */
		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
		    " interval %u\n", cur_time, pii->pii_snxt_time,
		    pii->pii_snxt_basetime, interval);
	}

	/*
	 * If no targets are known, we need to send an ICMP multicast. The
	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
	 * to see if we found a target.
	 */
	if (pii->pii_target_next == NULL) {
		assert(pii->pii_ntargets == 0);
		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
		probe(pii, PROBE_MULTI, cur_time);
		return (interval);
	}

	if ((user_probe_interval != probe_interval) &&
	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
		/*
		 * the failure detection (fd) probe timer has not yet fired.
		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
		 */
		probe(pii, PROBE_RTT, cur_time);
		return (interval);
	}
	/*
	 * the fd probe timer has fired. Need to do all failure
	 * detection / recovery calculations, and then send an fd probe
	 * of type PROBE_UNI.
	 */
	if (user_probe_interval == probe_interval) {
		/*
		 * We could have missed some probes, and then adjusted
		 * pii_snxt_basetime above. Otherwise we could have
		 * blindly added probe_interval to pii_fd_snxt_basetime.
		 */
		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
	} else {
		pii->pii_fd_snxt_basetime += probe_interval;
		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
			int n;

			/* Skip over the fd probe slots that were missed */
			n = (cur_time - pii->pii_fd_snxt_basetime) /
			    probe_interval;
			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
		}
	}

	/*
	 * We can have at most, the latest 2 probes that we sent, in
	 * the PR_UNACKED state. All previous probes sent, are either
	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
	 * timed out if the probe's time_sent + the CRTT < currenttime.
	 * For each of the last 2 probes, examine whether it has timed
	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
	 */
	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	valid_unack_count = 0;

	for (i = 0; i < 2; i++) {
		pr_statp = &pii->pii_probes[pr_ndx];
		cur_tg = pii->pii_probes[pr_ndx].pr_target;
		switch (pr_statp->pr_status) {
		case PR_ACKED:
			/*
			 * We received back an ACK, so the switch clearly
			 * is not dropping our traffic, and thus we can
			 * enable failure detection immediately.
			 */
			if (pii->pii_fd_hrtime > gethrtime()) {
				if (debug & D_PROBE) {
					logdebug("successful probe on %s; "
					    "ending quiet period\n",
					    pii->pii_phyint->pi_name);
				}
				pii->pii_fd_hrtime = gethrtime();
			}
			break;

		case PR_UNACKED:
			assert(cur_tg != NULL);
			/*
			 * The crtt could be zero for some reason,
			 * Eg. the phyint could be failed. If the crtt is
			 * not available use group's probe interval,
			 * which is a worst case estimate.
			 */
			if (cur_tg->tg_crtt != 0) {
				timeout = pr_statp->pr_time_sent +
				    cur_tg->tg_crtt;
			} else {
				timeout = pr_statp->pr_time_sent +
				    probe_interval;
			}
			if (TIME_LT(timeout, cur_time)) {
				pr_statp->pr_status = PR_LOST;
				pr_statp->pr_time_lost = timeout;
			} else if (i == 1) {
				/*
				 * We are forced to consider this probe
				 * lost, as we can have at most 2 unack.
				 * probes any time, and we will be sending a
				 * probe at the end of this function.
				 * Normally, we should not be here, but
				 * this can happen if an incoming response
				 * that was considered lost has increased
				 * the crtt for this target, and also bumped
				 * up the FDT. Note that we never cancel or
				 * increase the current pii_time_left, so
				 * when the timer fires, we find 2 valid
				 * unacked probes, and they are yet to timeout
				 */
				pr_statp->pr_status = PR_LOST;
				pr_statp->pr_time_lost = cur_time;
			} else {
				/*
				 * Only the most recent probe can enter
				 * this 'else' arm. The second most recent
				 * probe must take either of the above arms,
				 * if it is unacked.
				 */
				valid_unack_count++;
			}
			break;
		}
		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
	}

	/*
	 * We send out 1 probe randomly in the interval between one half
	 * and one probe interval for the group. Given that the CRTT is always
	 * less than the group's probe interval, we can have at most 1
	 * unacknowledged probe now.  All previous probes are either lost or
	 * acked.
	 */
	assert(valid_unack_count == 0 || valid_unack_count == 1);

	/*
	 * The timer has fired. Take appropriate action depending
	 * on the current state of the phyint.
	 *
	 * PI_RUNNING state 	- Failure detection and failover
	 * PI_FAILED state 	- Repair detection and failback
	 */
	switch (pii->pii_phyint->pi_state) {
	case PI_FAILED:
		/*
		 * If the most recent probe (excluding unacked probes that
		 * are yet to time out) has been acked, check whether the
		 * phyint is now repaired. If the phyint is repaired, then
		 * attempt failback, unless it is an inactive standby.
		 */
		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
			phyint_check_for_repair(pii->pii_phyint);
		}
		break;

	case PI_RUNNING:
		/*
		 * It's possible our probes have been lost because of a
		 * spanning-tree mandated quiet period on the switch.  If so,
		 * ignore the lost probes and consider the interface to still
		 * be functioning.
		 */
		cur_hrtime = gethrtime();
		if (pii->pii_fd_hrtime - cur_hrtime > 0)
			break;

		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
			/*
			 * We have 1 or more failed probes (excluding unacked
			 * probes that are yet to time out). Determine if the
			 * phyint has failed. If so attempt a failover,
			 * unless it is an inactive standby
			 */
			phyint_inst_check_for_failure(pii);
		}
		break;

	default:
		logerr("phyint_inst_timer: invalid state %d\n",
		    pii->pii_phyint->pi_state);
		abort();
	}

	/*
	 * Start the next probe. probe() will also set pii->pii_probe_time_left
	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
	 * was called, the target list may be empty.
	 */
	if (pii->pii_target_next != NULL) {
		probe(pii, PROBE_UNI, cur_time);
		/*
		 * If we have just the one probe target, and we're not using
		 * router targets, try to find another as we presently have
		 * no resilience.
		 */
		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
			probe(pii, PROBE_MULTI, cur_time);
	} else {
		probe(pii, PROBE_MULTI, cur_time);
	}
	return (interval);
}

/*
 * Start the probe timer for an interface instance.
 */
void
start_timer(struct phyint_instance *pii)
{
	uint32_t interval;

	/*
	 * Spread the base probe times (pii_snxt_basetime) across phyints
	 * uniformly over the (curtime..curtime + the group's probe_interval).
	 * pii_snxt_basetime is strictly periodic with a frequency of
	 * the group's probe interval. The actual probe time pii_snxt_time
	 * adds some randomness to pii_snxt_basetime and happens in probe().
	 * For the 1st probe on each phyint after the timer is started,
	 * pii_snxt_time and pii_snxt_basetime are the same.
	 *
	 * NOTE(review): in the code visible in this file section the
	 * randomized pii_snxt_time is computed in phyint_inst_timer();
	 * confirm whether probe() does so as well.
	 */
	interval = GET_RANDOM(0,
	    (int)pii->pii_phyint->pi_group->pg_probeint);

	pii->pii_snxt_basetime = getcurrenttime() + interval;
	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
	pii->pii_snxt_time = pii->pii_snxt_basetime;
	timer_schedule(interval);
}

/*
 * Restart the probe timer on an interface instance.
 */
static void
restart_timer(struct phyint_instance *pii)
{
	/*
	 * We don't need to restart the timer if it was never started in
	 * the first place (pii->pii_basetime_inited not set), as the timer
	 * won't have gone off yet.
	 */
	if (pii->pii_basetime_inited != 0) {

		if (debug & D_LINKNOTE)
			logdebug("restart timer: restarting timer on %s, "
			    "address family %s\n", pii->pii_phyint->pi_name,
			    AF_STR(pii->pii_af));

		start_timer(pii);
	}
}

/*
 * Handle a link up -> down transition on phyint pi: discard the probe
 * statistics and, depending on the phyint and group state, run the
 * failure check.
 */
static void
process_link_state_down(struct phyint *pi)
{
	logerr("The link has gone down on %s\n", pi->pi_name);

	/*
	 * Clear the probe statistics arrays, we don't want the repair
	 * detection logic relying on probes that were successful prior
	 * to the link going down.
	 */
	if (PROBE_CAPABLE(pi->pi_v4))
		clear_pii_probe_stats(pi->pi_v4);
	if (PROBE_CAPABLE(pi->pi_v6))
		clear_pii_probe_stats(pi->pi_v6);
	/*
	 * Check for interface failure.  Although we know the interface
	 * has failed, we don't know if all the other interfaces in the
	 * group have failed as well.
	 */
	if ((pi->pi_state == PI_RUNNING) ||
	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
		if (debug & D_LINKNOTE) {
			logdebug("process_link_state_down:"
			    " checking for failure on %s\n", pi->pi_name);
		}

		/* The check is run on one instance only, preferring IPv4 */
		if (pi->pi_v4 != NULL)
			phyint_inst_check_for_failure(pi->pi_v4);
		else if (pi->pi_v6 != NULL)
			phyint_inst_check_for_failure(pi->pi_v6);
	}
}

/*
 * Handle a link down -> up transition on phyint pi: restart the probe
 * timers, check for repair and record the time of this link-up event.
 */
static void
process_link_state_up(struct phyint *pi)
{
	logerr("The link has come up on %s\n", pi->pi_name);

	/*
	 * We stopped any running timers on each instance when the link
	 * went down, so restart them.
	 */
	if (pi->pi_v4)
		restart_timer(pi->pi_v4);
	if (pi->pi_v6)
		restart_timer(pi->pi_v6);

	phyint_check_for_repair(pi);

	/* pi_whenup[] is used as a circular buffer of LINK_UP_PERMIN entries */
	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
	if (pi->pi_whendx == LINK_UP_PERMIN)
		pi->pi_whendx = 0;
}

/*
 * Process any changes in link state passed up from the interfaces.
1911 */ 1912 void 1913 process_link_state_changes(void) 1914 { 1915 struct phyint *pi; 1916 1917 /* Look for interfaces where the link state has just changed */ 1918 1919 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1920 boolean_t old_link_state_up = LINK_UP(pi); 1921 1922 /* 1923 * Except when the "phyint" structure is created, this is 1924 * the only place the link state is updated. This allows 1925 * this routine to detect changes in link state, rather 1926 * than just the current state. 1927 */ 1928 UPDATE_LINK_STATE(pi); 1929 1930 if (LINK_DOWN(pi)) { 1931 /* 1932 * Has link just gone down? 1933 */ 1934 if (old_link_state_up) 1935 process_link_state_down(pi); 1936 } else { 1937 /* 1938 * Has link just gone back up? 1939 */ 1940 if (!old_link_state_up) 1941 process_link_state_up(pi); 1942 } 1943 } 1944 } 1945 1946 void 1947 reset_crtt_all(struct phyint *pi) 1948 { 1949 struct phyint_instance *pii; 1950 struct target *tg; 1951 1952 pii = pi->pi_v4; 1953 if (pii != NULL) { 1954 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1955 tg->tg_crtt = 0; 1956 tg->tg_rtt_sa = -1; 1957 tg->tg_rtt_sd = 0; 1958 } 1959 } 1960 1961 pii = pi->pi_v6; 1962 if (pii != NULL) { 1963 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1964 tg->tg_crtt = 0; 1965 tg->tg_rtt_sa = -1; 1966 tg->tg_rtt_sd = 0; 1967 } 1968 } 1969 } 1970 1971 /* 1972 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 1973 * probes on both instances IPv4 and IPv6. 1974 * If the interface has failed, return the time of the first probe failure 1975 * in "tff". 1976 */ 1977 static int 1978 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 1979 { 1980 uint_t pi_tff; 1981 struct target *cur_tg; 1982 struct probe_fail_count pfinfo; 1983 struct phyint_instance *pii_other; 1984 int pr_ndx; 1985 1986 /* 1987 * Get the number of consecutive failed probes on 1988 * this phyint across all targets. 
Also get the number 1989 * of consecutive failed probes on this target only 1990 */ 1991 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1992 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1993 probe_fail_info(pii, cur_tg, &pfinfo); 1994 1995 /* Get the time of first failure, for later use */ 1996 pi_tff = pfinfo.pf_tff; 1997 1998 /* 1999 * If the current target has not responded to the 2000 * last NUM_PROBE_FAILS probes, and other targets are 2001 * responding delete this target. Dead gateway detection 2002 * will eventually remove this target (if router) from the 2003 * routing tables. If that does not occur, we may end 2004 * up adding this to our list again. 2005 */ 2006 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2007 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2008 if (pii->pii_targets_are_routers) { 2009 if (cur_tg->tg_status == TG_ACTIVE) 2010 pii->pii_ntargets--; 2011 cur_tg->tg_status = TG_DEAD; 2012 cur_tg->tg_crtt = 0; 2013 cur_tg->tg_rtt_sa = -1; 2014 cur_tg->tg_rtt_sd = 0; 2015 if (pii->pii_target_next == cur_tg) 2016 pii->pii_target_next = target_next(cur_tg); 2017 } else { 2018 target_delete(cur_tg); 2019 probe(pii, PROBE_MULTI, getcurrenttime()); 2020 } 2021 return (PHYINT_OK); 2022 } 2023 2024 /* 2025 * If the phyint has lost NUM_PROBE_FAILS or more 2026 * consecutive probes, on both IPv4 and IPv6 protocol 2027 * instances of the phyint, then trigger failure 2028 * detection, else return false 2029 */ 2030 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2031 return (PHYINT_OK); 2032 2033 pii_other = phyint_inst_other(pii); 2034 if (PROBE_CAPABLE(pii_other)) { 2035 probe_fail_info(pii_other, NULL, &pfinfo); 2036 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2037 /* 2038 * We have NUM_PROBE_FAILS or more failures 2039 * on both IPv4 and IPv6. Get the earliest 2040 * time when failure was detected on this 2041 * phyint across IPv4 and IPv6. 
2042 */ 2043 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2044 pi_tff = pfinfo.pf_tff; 2045 } else { 2046 /* 2047 * This instance has < NUM_PROBE_FAILS failure. 2048 * So return false 2049 */ 2050 return (PHYINT_OK); 2051 } 2052 } 2053 *tff = pi_tff; 2054 return (PHYINT_FAILURE); 2055 } 2056 2057 /* 2058 * Check if the link has gone down on this phyint, or it has failed the 2059 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2060 * Also look at other phyints of this group, for group failures. 2061 */ 2062 int 2063 failure_state(struct phyint_instance *pii) 2064 { 2065 struct probe_success_count psinfo; 2066 uint_t pi2_tls; /* time last success */ 2067 uint_t pi_tff; /* time first fail */ 2068 struct phyint *pi2; 2069 struct phyint *pi; 2070 struct phyint_instance *pii2; 2071 struct phyint_group *pg; 2072 boolean_t alone; 2073 2074 if (debug & D_FAILOVER) 2075 logdebug("phyint_failed(%s)\n", pii->pii_name); 2076 2077 pi = pii->pii_phyint; 2078 pg = pi->pi_group; 2079 2080 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2081 PHYINT_OK) 2082 return (PHYINT_OK); 2083 2084 /* 2085 * At this point, the link is down, or the phyint is suspect, 2086 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 2087 * does not belong to any group, or is the only member of the 2088 * group capable of being probed, return PHYINT_FAILURE. 2089 */ 2090 alone = _B_TRUE; 2091 if (pg != phyint_anongroup) { 2092 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2093 if (pi2 == pi) 2094 continue; 2095 if (PROBE_CAPABLE(pi2->pi_v4) || 2096 PROBE_CAPABLE(pi2->pi_v6)) { 2097 alone = _B_FALSE; 2098 break; 2099 } 2100 } 2101 } 2102 if (alone) 2103 return (PHYINT_FAILURE); 2104 2105 /* 2106 * Need to compare against other phyints of the same group 2107 * to exclude group failures. 
If the failure was detected via 2108 * probing, then if the time of last success (tls) of any 2109 * phyint is more recent than the time of first fail (tff) of the 2110 * phyint in question, and the link is up on the phyint, 2111 * then it is a phyint failure. Otherwise it is a group failure. 2112 * If failure was detected via a link down notification sent from 2113 * the driver to IP, we see if any phyints in the group are still 2114 * running and haven't received a link down notification. We 2115 * will usually be processing the link down notification shortly 2116 * after it was received, so there is no point looking at the tls 2117 * of other phyints. 2118 */ 2119 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2120 /* Exclude ourself from comparison */ 2121 if (pi2 == pi) 2122 continue; 2123 2124 if (LINK_DOWN(pi)) { 2125 /* 2126 * We use FLAGS_TO_LINK_STATE() to test the 2127 * flags directly, rather then LINK_UP() or 2128 * LINK_DOWN(), as we may not have got round 2129 * to processing the link state for the other 2130 * phyints in the group yet. 2131 * 2132 * The check for PI_RUNNING and group 2133 * failure handles the case when the 2134 * group begins to recover. The first 2135 * phyint to recover should not trigger 2136 * a failover from the soon-to-recover 2137 * other phyints to the first recovered 2138 * phyint. PI_RUNNING will be set, and 2139 * pg_groupfailed cleared only after 2140 * receipt of NUM_PROBE_REPAIRS, by 2141 * which time the other phyints should 2142 * have received at least 1 packet, 2143 * and so will not have NUM_PROBE_FAILS. 2144 */ 2145 if ((pi2->pi_state == PI_RUNNING) && 2146 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 2147 return (PHYINT_FAILURE); 2148 } else { 2149 /* 2150 * Need to compare against both IPv4 and 2151 * IPv6 instances. 
2152 */ 2153 pii2 = pi2->pi_v4; 2154 if (pii2 != NULL) { 2155 probe_success_info(pii2, NULL, &psinfo); 2156 if (psinfo.ps_tls_valid) { 2157 pi2_tls = psinfo.ps_tls; 2158 /* 2159 * See comment above regarding check 2160 * for PI_RUNNING and group failure. 2161 */ 2162 if (TIME_GT(pi2_tls, pi_tff) && 2163 (pi2->pi_state == PI_RUNNING) && 2164 !GROUP_FAILED(pg) && 2165 FLAGS_TO_LINK_STATE(pi2)) 2166 return (PHYINT_FAILURE); 2167 } 2168 } 2169 2170 pii2 = pi2->pi_v6; 2171 if (pii2 != NULL) { 2172 probe_success_info(pii2, NULL, &psinfo); 2173 if (psinfo.ps_tls_valid) { 2174 pi2_tls = psinfo.ps_tls; 2175 /* 2176 * See comment above regarding check 2177 * for PI_RUNNING and group failure. 2178 */ 2179 if (TIME_GT(pi2_tls, pi_tff) && 2180 (pi2->pi_state == PI_RUNNING) && 2181 !GROUP_FAILED(pg) && 2182 FLAGS_TO_LINK_STATE(pi2)) 2183 return (PHYINT_FAILURE); 2184 } 2185 } 2186 } 2187 } 2188 2189 /* 2190 * Change the group state to PG_FAILED if it's not already. 2191 */ 2192 if (!GROUP_FAILED(pg)) 2193 phyint_group_chstate(pg, PG_FAILED); 2194 2195 return (GROUP_FAILURE); 2196 } 2197 2198 /* 2199 * Return the information associated with consecutive probe successes 2200 * starting with the most recent probe. At most the last 2 probes can be 2201 * in the unacknowledged state. All previous probes have either failed 2202 * or succeeded. 
2203 */ 2204 static void 2205 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2206 struct probe_success_count *psinfo) 2207 { 2208 uint_t i; 2209 struct probe_stats *pr_statp; 2210 uint_t most_recent; 2211 uint_t second_most_recent; 2212 boolean_t pi_found_failure = _B_FALSE; 2213 boolean_t tg_found_failure = _B_FALSE; 2214 uint_t now; 2215 uint_t timeout; 2216 struct target *tg; 2217 2218 if (debug & D_FAILOVER) 2219 logdebug("probe_success_info(%s)\n", pii->pii_name); 2220 2221 bzero(psinfo, sizeof (*psinfo)); 2222 now = getcurrenttime(); 2223 2224 /* 2225 * Start with the most recent probe, and count the number 2226 * of consecutive probe successes. Latch the number of successes 2227 * on hitting a failure. 2228 */ 2229 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2230 second_most_recent = PROBE_INDEX_PREV(most_recent); 2231 2232 for (i = most_recent; i != pii->pii_probe_next; 2233 i = PROBE_INDEX_PREV(i)) { 2234 pr_statp = &pii->pii_probes[i]; 2235 2236 switch (pr_statp->pr_status) { 2237 case PR_UNACKED: 2238 /* 2239 * Only the most recent 2 probes can be unacknowledged 2240 */ 2241 assert(i == most_recent || i == second_most_recent); 2242 2243 tg = pr_statp->pr_target; 2244 assert(tg != NULL); 2245 /* 2246 * The crtt could be zero for some reason, 2247 * Eg. the phyint could be failed. If the crtt is 2248 * not available use the value of the group's probe 2249 * interval which is a worst case estimate. 2250 */ 2251 if (tg->tg_crtt != 0) { 2252 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2253 } else { 2254 timeout = pr_statp->pr_time_sent + 2255 pii->pii_phyint->pi_group->pg_probeint; 2256 } 2257 2258 if (TIME_LT(timeout, now)) { 2259 /* 2260 * We hit a failure. Latch the total number of 2261 * recent consecutive successes. 
2262 */ 2263 pr_statp->pr_time_lost = timeout; 2264 pr_statp->pr_status = PR_LOST; 2265 pi_found_failure = _B_TRUE; 2266 if (cur_tg != NULL && tg == cur_tg) { 2267 /* 2268 * We hit a failure for the desired 2269 * target. Latch the number of recent 2270 * consecutive successes for this target 2271 */ 2272 tg_found_failure = _B_TRUE; 2273 } 2274 } 2275 break; 2276 2277 case PR_ACKED: 2278 /* 2279 * Bump up the count of probe successes, if we 2280 * have not seen any failure so far. 2281 */ 2282 if (!pi_found_failure) 2283 psinfo->ps_nsucc++; 2284 2285 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2286 !tg_found_failure) { 2287 psinfo->ps_nsucc_tg++; 2288 } 2289 2290 /* 2291 * Record the time of last success, if this is 2292 * the most recent probe success. 2293 */ 2294 if (!psinfo->ps_tls_valid) { 2295 psinfo->ps_tls = pr_statp->pr_time_acked; 2296 psinfo->ps_tls_valid = _B_TRUE; 2297 } 2298 break; 2299 2300 case PR_LOST: 2301 /* 2302 * We hit a failure. Latch the total number of 2303 * recent consecutive successes. 2304 */ 2305 pi_found_failure = _B_TRUE; 2306 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2307 /* 2308 * We hit a failure for the desired target. 2309 * Latch the number of recent consecutive 2310 * successes for this target 2311 */ 2312 tg_found_failure = _B_TRUE; 2313 } 2314 break; 2315 2316 default: 2317 return; 2318 2319 } 2320 } 2321 } 2322 2323 /* 2324 * Return the information associated with consecutive probe failures 2325 * starting with the most recent probe. Only the last 2 probes can be in the 2326 * unacknowledged state. All previous probes have either failed or succeeded. 
2327 */ 2328 static void 2329 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2330 struct probe_fail_count *pfinfo) 2331 { 2332 int i; 2333 struct probe_stats *pr_statp; 2334 boolean_t tg_found_success = _B_FALSE; 2335 boolean_t pi_found_success = _B_FALSE; 2336 int most_recent; 2337 int second_most_recent; 2338 uint_t now; 2339 uint_t timeout; 2340 struct target *tg; 2341 2342 if (debug & D_FAILOVER) 2343 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2344 2345 bzero(pfinfo, sizeof (*pfinfo)); 2346 now = getcurrenttime(); 2347 2348 /* 2349 * Start with the most recent probe, and count the number 2350 * of consecutive probe failures. Latch the number of failures 2351 * on hitting a probe success. 2352 */ 2353 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2354 second_most_recent = PROBE_INDEX_PREV(most_recent); 2355 2356 for (i = most_recent; i != pii->pii_probe_next; 2357 i = PROBE_INDEX_PREV(i)) { 2358 pr_statp = &pii->pii_probes[i]; 2359 2360 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2361 2362 switch (pr_statp->pr_status) { 2363 case PR_UNACKED: 2364 /* 2365 * Only the most recent 2 probes can be unacknowledged 2366 */ 2367 assert(i == most_recent || i == second_most_recent); 2368 2369 tg = pr_statp->pr_target; 2370 /* 2371 * Target is guaranteed to exist in the unack. state 2372 */ 2373 assert(tg != NULL); 2374 /* 2375 * The crtt could be zero for some reason, 2376 * Eg. the phyint could be failed. If the crtt is 2377 * not available use the group's probe interval, 2378 * which is a worst case estimate. 
2379 */ 2380 if (tg->tg_crtt != 0) { 2381 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2382 } else { 2383 timeout = pr_statp->pr_time_sent + 2384 pii->pii_phyint->pi_group->pg_probeint; 2385 } 2386 2387 if (TIME_GT(timeout, now)) 2388 break; 2389 2390 pr_statp->pr_time_lost = timeout; 2391 pr_statp->pr_status = PR_LOST; 2392 /* FALLTHRU */ 2393 2394 case PR_LOST: 2395 if (!pi_found_success) { 2396 pfinfo->pf_nfail++; 2397 pfinfo->pf_tff = pr_statp->pr_time_lost; 2398 } 2399 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2400 !tg_found_success) { 2401 pfinfo->pf_nfail_tg++; 2402 } 2403 break; 2404 2405 default: 2406 /* 2407 * We hit a success or unused slot. Latch the 2408 * total number of recent consecutive failures. 2409 */ 2410 pi_found_success = _B_TRUE; 2411 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2412 /* 2413 * We hit a success for the desired target. 2414 * Latch the number of recent consecutive 2415 * failures for this target 2416 */ 2417 tg_found_success = _B_TRUE; 2418 } 2419 } 2420 } 2421 } 2422 2423 /* 2424 * Check if the phyint has been repaired. If no test address has been 2425 * configured, then consider the interface repaired if the link is up (unless 2426 * the link is flapping; see below). Otherwise, look for proof of probes 2427 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2428 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 
2429 */ 2430 static boolean_t 2431 phyint_repaired(struct phyint *pi) 2432 { 2433 struct probe_success_count psinfo; 2434 struct phyint_instance *pii; 2435 struct target *cur_tg; 2436 int pr_ndx; 2437 uint_t cur_time; 2438 2439 if (debug & D_FAILOVER) 2440 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2441 2442 if (LINK_DOWN(pi)) 2443 return (_B_FALSE); 2444 2445 /* 2446 * If we don't have any test addresses and the link is up, then 2447 * consider the interface repaired, unless we've received more than 2448 * LINK_UP_PERMIN link up notifications in the last minute, in 2449 * which case we keep the link down until we drop back below 2450 * the threshold. 2451 */ 2452 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2453 cur_time = getcurrenttime(); 2454 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2455 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2456 pi->pi_lfmsg_printed = 0; 2457 return (_B_TRUE); 2458 } 2459 if (!pi->pi_lfmsg_printed) { 2460 logerr("The link has come up on %s more than %d times " 2461 "in the last minute; disabling failback until it " 2462 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2463 pi->pi_lfmsg_printed = 1; 2464 } 2465 2466 return (_B_FALSE); 2467 } 2468 2469 pii = pi->pi_v4; 2470 if (PROBE_CAPABLE(pii)) { 2471 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2472 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2473 probe_success_info(pii, cur_tg, &psinfo); 2474 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2475 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2476 return (_B_TRUE); 2477 } 2478 2479 pii = pi->pi_v6; 2480 if (PROBE_CAPABLE(pii)) { 2481 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2482 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2483 probe_success_info(pii, cur_tg, &psinfo); 2484 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2485 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2486 return (_B_TRUE); 2487 } 2488 2489 return (_B_FALSE); 2490 } 2491 2492 /* 2493 * Try failover from phyint 'pi' to a suitable destination. 
2494 */ 2495 int 2496 try_failover(struct phyint *pi, int failover_type) 2497 { 2498 struct phyint *dst; 2499 int err; 2500 2501 if (debug & D_FAILOVER) 2502 logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 2503 2504 /* 2505 * Attempt to find a failover destination 'dst'. 2506 * dst will be null if any of the following is true 2507 * Phyint is not part of a group OR 2508 * Phyint is the only member of a group OR 2509 * No suitable failover dst was available 2510 */ 2511 dst = get_failover_dst(pi, failover_type); 2512 if (dst == NULL) 2513 return (IPMP_EMINRED); 2514 2515 dst->pi_empty = 0; /* Per state diagram */ 2516 pi->pi_full = 0; /* Per state diagram */ 2517 2518 err = failover(pi, dst); 2519 2520 if (debug & D_FAILOVER) { 2521 logdebug("failed over from %s to %s ret %d\n", 2522 pi->pi_name, dst->pi_name, err); 2523 } 2524 if (err == 0) { 2525 pi->pi_empty = 1; /* Per state diagram */ 2526 /* 2527 * we don't want to print out this message if a 2528 * phyint is leaving the group, nor for failover from 2529 * standby 2530 */ 2531 if (failover_type == FAILOVER_NORMAL) { 2532 logerr("Successfully failed over from NIC %s to NIC " 2533 "%s\n", pi->pi_name, dst->pi_name); 2534 } 2535 return (0); 2536 } else { 2537 /* 2538 * The failover did not succeed. We must retry the failover 2539 * only after resyncing our state based on the kernel's. 2540 * For eg. either the src or the dst might have been unplumbed 2541 * causing this failure. initifs() will be called again, 2542 * from main, since full_scan_required has been set to true 2543 * by failover(); 2544 */ 2545 return (IPMP_FAILURE); 2546 } 2547 } 2548 2549 /* 2550 * global_errno captures the errno value, if failover() or failback() 2551 * fails. This is sent to if_mpadm(1M). 2552 */ 2553 int global_errno; 2554 2555 /* 2556 * Attempt failover from phyint 'from' to phyint 'to'. 2557 * IP moves everything from phyint 'from' to phyint 'to'. 
2558 */ 2559 static int 2560 failover(struct phyint *from, struct phyint *to) 2561 { 2562 struct lifreq lifr; 2563 int ret; 2564 2565 if (debug & D_FAILOVER) { 2566 logdebug("failing over from %s to %s\n", 2567 from->pi_name, to->pi_name); 2568 } 2569 2570 /* 2571 * Perform the failover. Both IPv4 and IPv6 are failed over 2572 * using a single ioctl by passing in AF_UNSPEC family. 2573 */ 2574 lifr.lifr_addr.ss_family = AF_UNSPEC; 2575 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2576 lifr.lifr_movetoindex = to->pi_ifindex; 2577 2578 ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 2579 if (ret < 0) { 2580 global_errno = errno; 2581 logperror("failover: ioctl (failover)"); 2582 } 2583 2584 /* 2585 * Set full_scan_required to true. This will make us read 2586 * the state from the kernel in initifs() and update our tables, 2587 * to reflect the current state after the failover. If the 2588 * failover has failed it will then reissue the failover. 2589 */ 2590 full_scan_required = _B_TRUE; 2591 return (ret); 2592 } 2593 2594 /* 2595 * phyint 'pi' has recovered. Attempt failback from every phyint in the same 2596 * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 2597 * Return values: 2598 * IPMP_SUCCESS: Failback successful from each of the other 2599 * phyints in the group. 2600 * IPMP_EFBPARTIAL: Failback successful from some of the other 2601 * phyints in the group. 2602 * IPMP_FAILURE: Failback syscall failed with some error. 2603 * 2604 * Note that failback is attempted regardless of the setting of the 2605 * failback_enabled flag. 2606 */ 2607 int 2608 do_failback(struct phyint *pi) 2609 { 2610 struct phyint *from; 2611 boolean_t done; 2612 boolean_t partial; 2613 boolean_t attempted_failback = _B_FALSE; 2614 2615 if (debug & D_FAILOVER) 2616 logdebug("do_failback(%s)\n", pi->pi_name); 2617 2618 /* If this phyint is not part of a named group, return. 
*/ 2619 if (pi->pi_group == phyint_anongroup) { 2620 pi->pi_full = 1; 2621 return (IPMP_SUCCESS); 2622 } 2623 2624 /* 2625 * Attempt failback from every phyint in the group to 'pi'. 2626 * The reason for doing this, instead of only from the 2627 * phyint to which we did the failover is given below. 2628 * 2629 * After 'pi' failed, if any app. tries to join on a multicast 2630 * address (IPv6), on the failed phyint, IP picks any arbitrary 2631 * non-failed phyint in the group, instead of the failed phyint, 2632 * in.mpathd is not aware of this. Thus failing back only from the 2633 * interface to which 'pi' failed over, will failback the ipif's 2634 * but not the ilm's. So we need to failback from all members of 2635 * the phyint group 2636 */ 2637 done = _B_TRUE; 2638 partial = _B_FALSE; 2639 for (from = pi->pi_group->pg_phyint; from != NULL; 2640 from = from->pi_pgnext) { 2641 /* Exclude ourself as a failback src */ 2642 if (from == pi) 2643 continue; 2644 2645 /* 2646 * If the 'from' phyint has IPv4 plumbed, the 'to' 2647 * phyint must also have IPv4 plumbed. Similar check 2648 * for IPv6. IP makes the same check. Otherwise the 2649 * failback will fail. 2650 */ 2651 if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 2652 (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 2653 partial = _B_TRUE; 2654 continue; 2655 } 2656 2657 pi->pi_empty = 0; /* Per state diagram */ 2658 attempted_failback = _B_TRUE; 2659 if (failback(from, pi) != 0) { 2660 done = _B_FALSE; 2661 break; 2662 } 2663 } 2664 2665 /* 2666 * We are done. No more phyint from which we can src the failback 2667 */ 2668 if (done) { 2669 if (!partial) 2670 pi->pi_full = 1; /* Per state diagram */ 2671 /* 2672 * Don't print out a message unless there is a 2673 * transition from FAILED to RUNNING. For eg. 
2674 * we don't want to print out this message if a 2675 * phyint is leaving the group, or at startup 2676 */ 2677 if (attempted_failback && (pi->pi_flags & 2678 (IFF_FAILED | IFF_OFFLINE))) { 2679 logerr("Successfully failed back to NIC %s\n", 2680 pi->pi_name); 2681 } 2682 return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2683 } 2684 2685 return (IPMP_FAILURE); 2686 } 2687 2688 /* 2689 * This function is similar to do_failback() above, but respects the 2690 * failback_enabled flag for phyints in named groups. 2691 */ 2692 int 2693 try_failback(struct phyint *pi) 2694 { 2695 if (debug & D_FAILOVER) 2696 logdebug("try_failback(%s)\n", pi->pi_name); 2697 2698 if (pi->pi_group != phyint_anongroup && !failback_enabled) 2699 return (IPMP_EFBDISABLED); 2700 2701 return (do_failback(pi)); 2702 } 2703 2704 /* 2705 * Failback everything from phyint 'from' that has the same ifindex 2706 * as phyint to's ifindex. 2707 */ 2708 static int 2709 failback(struct phyint *from, struct phyint *to) 2710 { 2711 struct lifreq lifr; 2712 int ret; 2713 2714 if (debug & D_FAILOVER) 2715 logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 2716 2717 lifr.lifr_addr.ss_family = AF_UNSPEC; 2718 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2719 lifr.lifr_movetoindex = to->pi_ifindex; 2720 2721 ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 2722 if (ret < 0) { 2723 global_errno = errno; 2724 logperror("failback: ioctl (failback)"); 2725 } 2726 2727 /* 2728 * Set full_scan_required to true. This will make us read 2729 * the state from the kernel in initifs() and update our tables, 2730 * to reflect the current state after the failback. If the 2731 * failback has failed it will then reissue the failback. 2732 */ 2733 full_scan_required = _B_TRUE; 2734 2735 return (ret); 2736 } 2737 2738 /* 2739 * Select a target phyint for failing over from 'pi'. 2740 * In the normal case i.e. 
failover_type is FAILOVER_NORMAL, the preferred 2741 * target phyint is chosen as follows, 2742 * 1. Pick any inactive standby interface. 2743 * 2. If no inactive standby is available, select any phyint in the 2744 * same group that has the least number of logints, (excluding 2745 * IFF_NOFAILOVER and !IFF_UP logints) 2746 * If we are failing over from a standby, failover_type is 2747 * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 2748 * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 2749 * and we won't return NULL, as long as there is at least 1 other phyint 2750 * in the group. 2751 */ 2752 static struct phyint * 2753 get_failover_dst(struct phyint *pi, int failover_type) 2754 { 2755 struct phyint *maybe = NULL; 2756 struct phyint *pi2; 2757 struct phyint *last_choice = NULL; 2758 2759 if (pi->pi_group == phyint_anongroup) 2760 return (NULL); 2761 2762 /* 2763 * Loop thru the phyints in the group, and pick the preferred 2764 * phyint for the target. 2765 */ 2766 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2767 /* Exclude ourself and offlined interfaces */ 2768 if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 2769 continue; 2770 2771 /* 2772 * The chosen target phyint must have IPv4 instance 2773 * plumbed, if the src phyint has IPv4 plumbed. Similarly 2774 * for IPv6. 2775 */ 2776 if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 2777 (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 2778 continue; 2779 2780 /* The chosen target must be PI_RUNNING. 
*/ 2781 if (pi2->pi_state != PI_RUNNING) { 2782 last_choice = pi2; 2783 continue; 2784 } 2785 2786 if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && 2787 (failover_type != FAILOVER_TO_NONSTANDBY)) { 2788 return (pi2); 2789 } else { 2790 if (maybe == NULL) 2791 maybe = pi2; 2792 else if (logint_upcount(pi2) < logint_upcount(maybe)) 2793 maybe = pi2; 2794 } 2795 } 2796 if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 2797 return (last_choice); 2798 else 2799 return (maybe); 2800 } 2801 2802 /* 2803 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2804 */ 2805 boolean_t 2806 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 2807 { 2808 int ifsock; 2809 struct lifreq lifr; 2810 uint64_t old_flags; 2811 2812 if (debug & D_FAILOVER) { 2813 logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 2814 pi->pi_name, flags, (int)setfl); 2815 } 2816 2817 if (pi->pi_v4 != NULL) { 2818 ifsock = ifsock_v4; 2819 } else { 2820 ifsock = ifsock_v6; 2821 } 2822 2823 /* 2824 * Get the current flags from the kernel, and set/clear the 2825 * desired phyint flags. Since we set only phyint flags, we can 2826 * do it on either IPv4 or IPv6 instance. 2827 */ 2828 (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2829 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 2830 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2831 if (errno != ENXIO) 2832 logperror("change_lif_flags: ioctl (get flags)"); 2833 return (_B_FALSE); 2834 } 2835 2836 old_flags = lifr.lifr_flags; 2837 if (setfl) 2838 lifr.lifr_flags |= flags; 2839 else 2840 lifr.lifr_flags &= ~flags; 2841 2842 if (old_flags == lifr.lifr_flags) { 2843 /* No change in the flags. No need to send ioctl */ 2844 return (_B_TRUE); 2845 } 2846 2847 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2848 if (errno != ENXIO) 2849 logperror("change_lif_flags: ioctl (set flags)"); 2850 return (_B_FALSE); 2851 } 2852 2853 /* 2854 * Keep pi_flags in synch. with actual flags. 
Assumes flags are 2855 * phyint flags. 2856 */ 2857 if (setfl) 2858 pi->pi_flags |= flags; 2859 else 2860 pi->pi_flags &= ~flags; 2861 2862 if (pi->pi_v4) 2863 pi->pi_v4->pii_flags = pi->pi_flags; 2864 2865 if (pi->pi_v6) 2866 pi->pi_v6->pii_flags = pi->pi_flags; 2867 2868 return (_B_TRUE); 2869 } 2870 2871 /* 2872 * icmp cksum computation for IPv4. 2873 */ 2874 static int 2875 in_cksum(ushort_t *addr, int len) 2876 { 2877 register int nleft = len; 2878 register ushort_t *w = addr; 2879 register ushort_t answer; 2880 ushort_t odd_byte = 0; 2881 register int sum = 0; 2882 2883 /* 2884 * Our algorithm is simple, using a 32 bit accumulator (sum), 2885 * we add sequential 16 bit words to it, and at the end, fold 2886 * back all the carry bits from the top 16 bits into the lower 2887 * 16 bits. 2888 */ 2889 while (nleft > 1) { 2890 sum += *w++; 2891 nleft -= 2; 2892 } 2893 2894 /* mop up an odd byte, if necessary */ 2895 if (nleft == 1) { 2896 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2897 sum += odd_byte; 2898 } 2899 2900 /* 2901 * add back carry outs from top 16 bits to low 16 bits 2902 */ 2903 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2904 sum += (sum >> 16); /* add carry */ 2905 answer = ~sum; /* truncate to 16 bits */ 2906 return (answer); 2907 } 2908 2909 static void 2910 reset_snxt_basetimes(void) 2911 { 2912 struct phyint_instance *pii; 2913 2914 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2915 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2916 } 2917 } 2918 2919 /* 2920 * Is the address one of our own addresses? Unfortunately, 2921 * we cannot check our phyint tables to determine if the address 2922 * is our own. This is because, we don't track interfaces that 2923 * are not part of any group. We have to either use a 'bind' or 2924 * get the complete list of all interfaces using SIOCGLIFCONF, 2925 * to do this check. We could also use SIOCTMYADDR. 
2926 * Bind fails for the local zone address, so we might include local zone 2927 * address as target address. If local zone address is a target address 2928 * and it is up, it is not possible to detect the interface failure. 2929 * SIOCTMYADDR also doesn't consider local zone address as own address. 2930 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2931 * are stored in laddr_list. 2932 */ 2933 2934 boolean_t 2935 own_address(struct in6_addr addr) 2936 { 2937 struct local_addr *taddr = laddr_list; 2938 2939 for (; taddr != NULL; taddr = taddr->next) { 2940 if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) { 2941 return (_B_TRUE); 2942 } 2943 } 2944 return (_B_FALSE); 2945 } 2946