/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1987 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include "mpd_defs.h"
#include "mpd_tables.h"

/*
 * Probe types for probe(). The value is carried in pr_icmp_mtype of the
 * echo request and is used by in_data() / in6_data() to dispatch the
 * matching echo reply.
 */
#define	PROBE_UNI	0x1234		/* Unicast probe packet */
#define	PROBE_MULTI	0x5678		/* Multicast probe packet */
#define	PROBE_RTT	0x9abc		/* RTT only probe packet */

#define	MSEC_PERMIN	(60 * MILLISEC)	/* Number of milliseconds in a minute */

/*
 * Format of probe / probe response packets. This is an ICMP Echo request
 * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6.
 *
 * probe() fills the sequence number, timestamp and message type in network
 * byte order (htons/htonl); pr_icmp_id is copied from pii_icmpid, which is
 * already kept in network byte order.
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint32_t pr_icmp_timestamp;	/* Time stamp	*/
	uint32_t pr_icmp_mtype;		/* Message type */
};

/* ff02::1 - destination used by probe() for IPv6 PROBE_MULTI probes */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* 224.0.0.1 - destination used by probe() for IPv4 PROBE_MULTI probes */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */

static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
static void		pi_set_crtt(struct target *tg, int m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static int		failover(struct phyint *from, struct phyint *to);
static int		failback(struct phyint *from, struct phyint *to);
static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
 *
 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
 * of the link (according to the driver).  If the phyint is also configured
 * with a test address (the common case) and probe targets, then a phyint must
 * also successfully be able to send and receive probes in order to remain in
 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not perform a failover, but reports the interface
 * as IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state.  When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state -  PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_groupfailed  - Group failure, all interfaces in the group have failed.
 *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts address, to
 *	reconstruct the target list. So the phyints are in the PI_NOTARGETS
 *	state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: No failovers have been done to the standby, from
 *	other phyints. This phyint is an inactive standby.
 *
 * pi_empty
 *	This phyint has failed over successfully to another phyint, and
 *	this phyint is currently "empty". It does not host any addresses or
 *	multicast membership etc. This is the state of a phyint after a
 *	failover from the phyint has completed successfully and no subsequent
 *	'failover to' or 'failback to' has occurred on the phyint.
 *	IP guarantees that no new logicals will be hosted nor any multicast
 *	joins permitted on the phyint, since the phyint is either failed or
 *	inactive. pi_empty being set implies the phyint is either failed or
 *	inactive.
156 * 157 * pi_full 158 * The phyint hosts all of its own addresses that it "owns". If the 159 * phyint was previously failed or inactive, failbacks to the phyint 160 * has completed successfully. i.e. No more failbacks to this phyint 161 * can produce any change in system state whatsoever. 162 * 163 * Not all 32 possible combinations of the above 5-tuple are possible. 164 * Furthermore some of the above combinations are transient. They may occur 165 * only because the failover or failback did not complete successfully. The 166 * failover/failback will be retried and eventually a stable state will be 167 * reached. 168 * 169 * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. 170 * The following are the state machines. 'from' and 'to' are the src and 171 * dst of the failover/failback, below 172 * 173 * pi_empty state machine 174 * --------------------------------------------------------------------------- 175 * Event State -> New State 176 * --------------------------------------------------------------------------- 177 * successful completion from.pi_empty = 0 -> from.pi_empty = 1 178 * of failover 179 * 180 * Initiate failover to.pi_empty = X -> to.pi_empty = 0 181 * 182 * Initiate failback to.pi_empty = X -> to.pi_empty = 0 183 * 184 * group failure pi_empty = X -> pi_empty = 0 185 * --------------------------------------------------------------------------- 186 * 187 * pi_full state machine 188 * --------------------------------------------------------------------------- 189 * Event State -> New State 190 * --------------------------------------------------------------------------- 191 * successful completion to.pi_full = 0 -> to.pi_full = 1 192 * of failback from 193 * each of the other phyints 194 * 195 * Initiate failover from.pi_full = X -> from.pi_full = 0 196 * 197 * group failure pi_full = X -> pi_full = 0 198 * --------------------------------------------------------------------------- 199 * 200 * pi_state state machine 201 * 
--------------------------------------------------------------------------- 202 * Event State New State 203 * Action: 204 * --------------------------------------------------------------------------- 205 * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 206 * detection : set IFF_FAILED on this phyint 207 * : failover from this phyint to another 208 * 209 * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 1) 210 * detection : set IFF_FAILED on this phyint 211 * 212 * NIC repair (PI_FAILED, I == 0) -> (PI_RUNNING, I == 0) 213 * detection : to.pi_empty = 0 214 * : failback to this phyint if enabled 215 * : clear IFF_FAILED on this phyint 216 * 217 * NIC repair (PI_FAILED, I == 1) -> (PI_RUNNING, I == 1) 218 * detection : clear IFF_FAILED on this phyint 219 * 220 * Group failure (perform on all phyints in the group) 221 * detection PI_RUNNING PI_FAILED 222 * (Router targets) : set IFF_FAILED 223 * : clear pi_empty and pi_full 224 * 225 * Group failure (perform on all phyints in the group) 226 * detection PI_RUNNING PI_NOTARGETS 227 * (Host targets) : set IFF_FAILED 228 * : clear pi_empty and pi_full 229 * : delete the target list on all phyints 230 * --------------------------------------------------------------------------- 231 * 232 * I state machine 233 * --------------------------------------------------------------------------- 234 * Event State Action: 235 * --------------------------------------------------------------------------- 236 * Turn on I pi_empty == 0 : failover from standby 237 * 238 * Turn off I PI_RUNNING, : pi_empty = 0 239 * pi_full == 0 : failback to this if enabled 240 * --------------------------------------------------------------------------- 241 * 242 * Assertions: (Read '==>' as implies) 243 * 244 * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) 245 * (pi_empty == 1) ==> (pi_full == 0) 246 * (pi_full == 1) ==> (pi_empty == 0) 247 * 248 * Invariants 249 * 250 * pg_groupfailed = 0 && 251 * 1. 
(I == 1, pi_empty == 0) ==> initiate failover from standby 252 * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint 253 * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint 254 * 255 * 1. says that an inactive standby, that is not empty, has to be failed 256 * over. For a standby to be truly inactive, it should not host any 257 * addresses. So we move them to some other phyint. Usually we catch the 258 * turn on of IFF_INACTIVE, and perform this action. However if the failover 259 * did not complete successfully, then subsequently we have lost the edge 260 * trigger, and this invariant kicks in and completes the action. 261 * 262 * 2. says that any failed phyint that is not empty must be failed over. 263 * Usually we do the failover when we detect NIC failure. However if the 264 * failover does not complete successfully, this invariant kicks in and 265 * completes the failover. We exclude inactive standby which is covered by 1. 266 * 267 * 3. says that any running phyint that is not full must be failed back. 268 * Usually we do the failback when we detect NIC repair. However if the 269 * failback does not complete successfully, this invariant kicks in and 270 * completes the failback. Note that we don't want to failback to an inactive 271 * standby. 272 * 273 * The invariants 1 - 3 and the actions are in initifs(). 274 */ 275 276 struct probes_missed probes_missed; 277 278 /* 279 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 280 * will be added on by the kernel. The id field identifies this phyint. 281 * and the sequence number is an increasing (modulo 2^^16) integer. The data 282 * portion holds the time value when the packet is sent. On echo this is 283 * extracted to compute the round-trip time. Three different types of 284 * probe packets are used. 285 * 286 * PROBE_UNI: This type is used to do failure detection / failure recovery 287 * and RTT calculation. 
PROBE_UNI probes are spaced apart in time, 288 * not less than the current CRTT. pii_probes[] stores data 289 * about these probes. These packets consume sequence number space. 290 * 291 * PROBE_RTT: This type is used to make only rtt measurments. Normally these 292 * are not used. Under heavy network load, the rtt may go up very high, 293 * due to a spike, or may appear to go high, due to extreme scheduling 294 * delays. Once the network stress is removed, mpathd takes long time to 295 * recover, because the probe_interval is already high, and it takes 296 * a long time to send out sufficient number of probes to bring down the 297 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 298 * user_probe_interval ms. and will cause only rtt updates. These packets 299 * do not consume sequence number space nor is information about these 300 * packets stored in the pii_probes[] 301 * 302 * PROBE_MULTI: This type is only used to construct a list of targets, when 303 * no targets are known. The packet is multicast to the all hosts addr. 304 */ 305 static void 306 probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 307 { 308 struct pr_icmp probe_pkt; /* Probe packet */ 309 struct sockaddr_in6 whereto6; /* target address IPv6 */ 310 struct sockaddr_in whereto; /* target address IPv4 */ 311 int pr_ndx; /* probe index in pii->pii_probes[] */ 312 boolean_t sent = _B_TRUE; 313 314 if (debug & D_TARGET) { 315 logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 316 pii->pii_name, probe_type, cur_time); 317 } 318 319 assert(pii->pii_probe_sock != -1); 320 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 321 probe_type == PROBE_RTT); 322 323 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 
324 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 325 probe_pkt.pr_icmp_code = 0; 326 probe_pkt.pr_icmp_cksum = 0; 327 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 328 329 /* 330 * Since there is no need to do arithmetic on the icmpid, 331 * (only equality check is done) pii_icmpid is stored in 332 * network byte order at initialization itself. 333 */ 334 probe_pkt.pr_icmp_id = pii->pii_icmpid; 335 probe_pkt.pr_icmp_timestamp = htonl(cur_time); 336 probe_pkt.pr_icmp_mtype = htonl(probe_type); 337 338 /* 339 * If probe_type is PROBE_MULTI, this packet will be multicast to 340 * the all hosts address. Otherwise it is unicast to the next target. 341 */ 342 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 343 pii->pii_rtt_target_next != NULL)); 344 345 if (pii->pii_af == AF_INET6) { 346 bzero(&whereto6, sizeof (whereto6)); 347 whereto6.sin6_family = AF_INET6; 348 if (probe_type == PROBE_MULTI) { 349 whereto6.sin6_addr = all_nodes_mcast_v6; 350 } else if (probe_type == PROBE_UNI) { 351 whereto6.sin6_addr = pii->pii_target_next->tg_address; 352 } else { 353 /* type is PROBE_RTT */ 354 whereto6.sin6_addr = 355 pii->pii_rtt_target_next->tg_address; 356 } 357 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 358 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 359 sizeof (whereto6)) != sizeof (probe_pkt)) { 360 logperror_pii(pii, "probe: probe sendto"); 361 sent = _B_FALSE; 362 } 363 } else { 364 bzero(&whereto, sizeof (whereto)); 365 whereto.sin_family = AF_INET; 366 if (probe_type == PROBE_MULTI) { 367 whereto.sin_addr = all_nodes_mcast_v4; 368 } else if (probe_type == PROBE_UNI) { 369 IN6_V4MAPPED_TO_INADDR( 370 &pii->pii_target_next->tg_address, 371 &whereto.sin_addr); 372 } else { 373 /* type is PROBE_RTT */ 374 IN6_V4MAPPED_TO_INADDR( 375 &pii->pii_rtt_target_next->tg_address, 376 &whereto.sin_addr); 377 } 378 379 /* 380 * Compute the IPv4 icmp checksum. Does not cover the IP header. 
381 */ 382 probe_pkt.pr_icmp_cksum = 383 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 384 if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 385 sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 386 sizeof (whereto)) != sizeof (probe_pkt)) { 387 logperror_pii(pii, "probe: probe sendto"); 388 sent = _B_FALSE; 389 } 390 } 391 392 /* 393 * If this is a PROBE_UNI probe packet being unicast to a target, then 394 * update our tables. We will need this info in processing the probe 395 * response. PROBE_MULTI and PROBE_RTT packets are not used for 396 * the purpose of failure or recovery detection. PROBE_MULTI packets 397 * are only used to construct a list of targets. PROBE_RTT packets are 398 * used only for updating the rtt and not for failure detection. 399 */ 400 if (probe_type == PROBE_UNI && sent) { 401 pr_ndx = pii->pii_probe_next; 402 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 403 404 /* Collect statistics, before we reuse the last slot. */ 405 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 406 pii->pii_cum_stats.lost++; 407 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 408 pii->pii_cum_stats.acked++; 409 pii->pii_cum_stats.sent++; 410 411 pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 412 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 413 pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 414 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 415 pii->pii_target_next = target_next(pii->pii_target_next); 416 assert(pii->pii_target_next != NULL); 417 /* 418 * If we have a single variable to denote the next target to 419 * probe for both rtt probes and failure detection probes, we 420 * could end up with a situation where the failure detection 421 * probe targets become disjoint from the rtt probe targets. 422 * Eg. if 2 targets and the actual fdt is double the user 423 * specified fdt. So we have 2 variables. 
In this scheme 424 * we also reset pii_rtt_target_next for every fdt probe, 425 * though that may not be necessary. 426 */ 427 pii->pii_rtt_target_next = pii->pii_target_next; 428 pii->pii_snxt++; 429 } else if (probe_type == PROBE_RTT) { 430 pii->pii_rtt_target_next = 431 target_next(pii->pii_rtt_target_next); 432 assert(pii->pii_rtt_target_next != NULL); 433 } 434 } 435 436 /* 437 * Incoming IPv4 data from wire, is received here. Called from main. 438 */ 439 void 440 in_data(struct phyint_instance *pii) 441 { 442 struct sockaddr_in from; 443 struct in6_addr fromaddr; 444 uint_t fromlen; 445 static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 446 struct ip *ip; 447 int iphlen; 448 int len; 449 char abuf[INET_ADDRSTRLEN]; 450 struct pr_icmp *reply; 451 452 if (debug & D_PROBE) { 453 logdebug("in_data(%s %s)\n", 454 AF_STR(pii->pii_af), pii->pii_name); 455 } 456 457 /* 458 * Poll has already told us that a message is waiting, 459 * on this socket. Read it now. We should not block. 460 */ 461 fromlen = sizeof (from); 462 len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 463 sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 464 if (len < 0) { 465 logperror_pii(pii, "in_data: recvfrom"); 466 return; 467 } 468 469 /* 470 * If the NIC has indicated the link is down, don't go 471 * any further. 472 */ 473 if (LINK_DOWN(pii->pii_phyint)) 474 return; 475 476 /* Get the printable address for error reporting */ 477 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 478 479 /* Make sure packet contains at least minimum ICMP header */ 480 ip = (struct ip *)in_packet; 481 iphlen = ip->ip_hl << 2; 482 if (len < iphlen + ICMP_MINLEN) { 483 if (debug & D_PKTBAD) { 484 logdebug("in_data: packet too short (%d bytes)" 485 " from %s\n", len, abuf); 486 } 487 return; 488 } 489 490 /* 491 * Subtract the IP hdr length, 'len' will be length of the probe 492 * reply, starting from the icmp hdr. 
493 */ 494 len -= iphlen; 495 /* LINTED */ 496 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 497 498 /* Probe replies are icmp echo replies. Ignore anything else */ 499 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 500 return; 501 502 /* 503 * The icmp id should match what we sent, which is stored 504 * in pi_icmpid. The icmp code for reply must be 0. 505 * The reply content must be a struct pr_icmp 506 */ 507 if (reply->pr_icmp_id != pii->pii_icmpid) { 508 /* Not in response to our probe */ 509 return; 510 } 511 512 if (reply->pr_icmp_code != 0) { 513 logtrace("probe reply code %d from %s on %s\n", 514 reply->pr_icmp_code, abuf, pii->pii_name); 515 return; 516 } 517 518 if (len < sizeof (struct pr_icmp)) { 519 logtrace("probe reply too short: %d bytes from %s on %s\n", 520 len, abuf, pii->pii_name); 521 return; 522 } 523 524 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 525 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 526 /* Unicast probe reply */ 527 incoming_echo_reply(pii, reply, fromaddr); 528 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 529 /* Multicast reply */ 530 incoming_mcast_reply(pii, reply, fromaddr); 531 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 532 incoming_rtt_reply(pii, reply, fromaddr); 533 } else { 534 /* Probably not in response to our probe */ 535 logtrace("probe reply type: %d from %s on %s\n", 536 reply->pr_icmp_mtype, abuf, pii->pii_name); 537 return; 538 } 539 540 } 541 542 /* 543 * Incoming IPv6 data from wire is received here. Called from main. 
544 */ 545 void 546 in6_data(struct phyint_instance *pii) 547 { 548 struct sockaddr_in6 from; 549 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 550 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 551 int len; 552 char abuf[INET6_ADDRSTRLEN]; 553 struct msghdr msg; 554 struct iovec iov; 555 uchar_t *opt; 556 struct pr_icmp *reply; 557 558 if (debug & D_PROBE) { 559 logdebug("in6_data(%s %s)\n", 560 AF_STR(pii->pii_af), pii->pii_name); 561 } 562 563 iov.iov_base = (char *)in_packet; 564 iov.iov_len = sizeof (in_packet); 565 msg.msg_iov = &iov; 566 msg.msg_iovlen = 1; 567 msg.msg_name = (struct sockaddr *)&from; 568 msg.msg_namelen = sizeof (from); 569 msg.msg_control = ancillary_data; 570 msg.msg_controllen = sizeof (ancillary_data); 571 572 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 573 logperror_pii(pii, "in6_data: recvfrom"); 574 return; 575 } 576 577 /* 578 * If the NIC has indicated that the link is down, don't go 579 * any further. 580 */ 581 if (LINK_DOWN(pii->pii_phyint)) 582 return; 583 584 /* Get the printable address for error reporting */ 585 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 586 if (len < ICMP_MINLEN) { 587 if (debug & D_PKTBAD) { 588 logdebug("Truncated message: msg_flags 0x%x from %s\n", 589 msg.msg_flags, abuf); 590 } 591 return; 592 } 593 /* Ignore packets > 64k or control buffers that don't fit */ 594 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 595 if (debug & D_PKTBAD) { 596 logdebug("Truncated message: msg_flags 0x%x from %s\n", 597 msg.msg_flags, abuf); 598 } 599 return; 600 } 601 602 reply = (struct pr_icmp *)in_packet; 603 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 604 return; 605 606 if (reply->pr_icmp_id != pii->pii_icmpid) { 607 /* Not in response to our probe */ 608 return; 609 } 610 611 /* 612 * The kernel has already verified the the ICMP checksum. 
613 */ 614 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 615 logtrace("ICMPv6 echo reply source address not linklocal from " 616 "%s on %s\n", abuf, pii->pii_name); 617 return; 618 } 619 opt = find_ancillary(&msg, IPV6_RTHDR); 620 if (opt != NULL) { 621 /* Can't allow routing headers in probe replies */ 622 logtrace("message with routing header from %s on %s\n", 623 abuf, pii->pii_name); 624 return; 625 } 626 if (reply->pr_icmp_code != 0) { 627 logtrace("probe reply code: %d from %s on %s\n", 628 reply->pr_icmp_code, abuf, pii->pii_name); 629 return; 630 } 631 if (len < (sizeof (struct pr_icmp))) { 632 logtrace("probe reply too short: %d bytes from %s on %s\n", 633 len, abuf, pii->pii_name); 634 return; 635 } 636 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 637 incoming_echo_reply(pii, reply, from.sin6_addr); 638 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 639 incoming_mcast_reply(pii, reply, from.sin6_addr); 640 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 641 incoming_rtt_reply(pii, reply, from.sin6_addr); 642 } else { 643 /* Probably not in response to our probe */ 644 logtrace("probe reply type: %d from %s on %s\n", 645 reply->pr_icmp_mtype, abuf, pii->pii_name); 646 } 647 } 648 649 /* 650 * Process the incoming rtt reply, in response to our rtt probe. 651 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 652 * have any stored information about the probe we sent. So we don't log 653 * any errors if we receive bad replies. 
654 */ 655 static void 656 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 657 struct in6_addr fromaddr) 658 { 659 int m; /* rtt measurment in ms */ 660 uint32_t cur_time; /* in ms from some arbitrary point */ 661 char abuf[INET6_ADDRSTRLEN]; 662 struct target *target; 663 uint32_t pr_icmp_timestamp; 664 struct phyint_group *pg; 665 666 /* Get the printable address for error reporting */ 667 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 668 669 if (debug & D_PROBE) { 670 logdebug("incoming_rtt_reply: %s %s %s\n", 671 AF_STR(pii->pii_af), pii->pii_name, abuf); 672 } 673 674 /* Do we know this target ? */ 675 target = target_lookup(pii, fromaddr); 676 if (target == NULL) 677 return; 678 679 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 680 cur_time = getcurrenttime(); 681 m = (int)(cur_time - pr_icmp_timestamp); 682 683 /* Invalid rtt. It has wrapped around */ 684 if (m < 0) 685 return; 686 687 /* 688 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 689 * The initial few responses after the interface is repaired may 690 * contain high rtt's because they could have been queued up waiting 691 * for ARP/NDP resolution on a failed interface. 692 */ 693 pg = pii->pii_phyint->pi_group; 694 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 695 return; 696 697 /* 698 * Update rtt only if the new rtt is lower than the current rtt. 699 * (specified by the 3rd parameter to pi_set_crtt). 700 * If a spike has caused the current probe_interval to be > 701 * user_probe_interval, then this mechanism is used to bring down 702 * the rtt rapidly once the network stress is removed. 703 * If the new rtt is higher than the current rtt, we don't want to 704 * update the rtt. We are having more than 1 outstanding probe and 705 * the increase in rtt we are seeing is being unnecessarily weighted 706 * many times. The regular rtt update will be handled by 707 * incoming_echo_reply() and will take care of any rtt increase. 
708 */ 709 pi_set_crtt(target, m, _B_FALSE); 710 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 711 (user_failure_detection_time < pg->pg_fdt) && 712 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 713 /* 714 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 715 * investigate if we can improve the failure detection time to 716 * meet whatever the user specified. 717 */ 718 if (check_pg_crtt_improved(pg)) { 719 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 720 user_failure_detection_time); 721 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 722 if (pii->pii_phyint->pi_group != phyint_anongroup) { 723 logerr("Improved failure detection time %d ms " 724 "on (%s %s) for group \"%s\"\n", 725 pg->pg_fdt, AF_STR(pii->pii_af), 726 pii->pii_name, 727 pii->pii_phyint->pi_group->pg_name); 728 } 729 if (user_failure_detection_time == pg->pg_fdt) { 730 /* Avoid any truncation or rounding errors */ 731 pg->pg_probeint = user_probe_interval; 732 /* 733 * No more rtt probes will be sent. The actual 734 * fdt has dropped to the user specified value. 735 * pii_fd_snxt_basetime and pii_snxt_basetime 736 * will be in sync henceforth. 737 */ 738 reset_snxt_basetimes(); 739 } 740 } 741 } 742 } 743 744 /* 745 * Process the incoming echo reply, in response to our unicast probe. 
746 * Common for both IPv4 and IPv6 747 */ 748 static void 749 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 750 struct in6_addr fromaddr) 751 { 752 int m; /* rtt measurment in ms */ 753 uint32_t cur_time; /* in ms from some arbitrary point */ 754 char abuf[INET6_ADDRSTRLEN]; 755 int pr_ndx; 756 struct target *target; 757 boolean_t exception; 758 uint32_t pr_icmp_timestamp; 759 uint16_t pr_icmp_seq; 760 struct phyint_group *pg = pii->pii_phyint->pi_group; 761 762 /* Get the printable address for error reporting */ 763 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 764 765 if (debug & D_PROBE) { 766 logdebug("incoming_echo_reply: %s %s %s seq %u\n", 767 AF_STR(pii->pii_af), pii->pii_name, abuf, 768 ntohs(reply->pr_icmp_seq)); 769 } 770 771 pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 772 pr_icmp_seq = ntohs(reply->pr_icmp_seq); 773 774 /* Reject out of window probe replies */ 775 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 776 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 777 logtrace("out of window probe seq %u snxt %u on %s from %s\n", 778 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 779 pii->pii_cum_stats.unknown++; 780 return; 781 } 782 cur_time = getcurrenttime(); 783 m = (int)(cur_time - pr_icmp_timestamp); 784 if (m < 0) { 785 /* 786 * This is a ridiculously high value of rtt. rtt has wrapped 787 * around. Log a message, and ignore the rtt. 788 */ 789 logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " 790 "timestamp %u\n", cur_time, pr_icmp_timestamp); 791 } 792 793 /* 794 * Get the probe index pr_ndx corresponding to the received icmp seq. 795 * number in our pii->pii_probes[] array. 
The icmp sequence number 796 * pii_snxt corresponds to the probe index pii->pii_probe_next 797 */ 798 pr_ndx = MOD_SUB(pii->pii_probe_next, 799 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 800 801 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 802 803 target = pii->pii_probes[pr_ndx].pr_target; 804 805 /* 806 * Perform sanity checks, whether this probe reply that we 807 * have received is genuine 808 */ 809 if (target != NULL) { 810 /* 811 * Compare the src. addr of the received ICMP or ICMPv6 812 * probe reply with the target address in our tables. 813 */ 814 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 815 /* 816 * We don't have any record of having sent a probe to 817 * this target. This is a fake probe reply. Log an error 818 */ 819 logtrace("probe status %d Fake probe reply seq %u " 820 "snxt %u on %s from %s\n", 821 pii->pii_probes[pr_ndx].pr_status, 822 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 823 pii->pii_cum_stats.unknown++; 824 return; 825 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 826 /* 827 * The address matches, but our tables indicate that 828 * this probe reply has been acked already. So this 829 * is a duplicate probe reply. Log an error 830 */ 831 logtrace("probe status %d Duplicate probe reply seq %u " 832 "snxt %u on %s from %s\n", 833 pii->pii_probes[pr_ndx].pr_status, 834 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 835 pii->pii_cum_stats.unknown++; 836 return; 837 } 838 } else { 839 /* 840 * Target must not be NULL in the PR_UNACKED state 841 */ 842 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 843 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 844 /* 845 * The probe stats slot is unused. So we didn't 846 * send out any probe to this target. This is a fake. 847 * Log an error. 
848 */ 849 logtrace("probe status %d Fake probe reply seq %u " 850 "snxt %u on %s from %s\n", 851 pii->pii_probes[pr_ndx].pr_status, 852 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 853 } 854 pii->pii_cum_stats.unknown++; 855 return; 856 } 857 858 /* 859 * If the rtt does not appear to be right, don't update the 860 * rtt stats. This can happen if the system dropped into the 861 * debugger, or the system was hung or too busy for a 862 * substantial time that we didn't get a chance to run. 863 */ 864 if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { 865 /* 866 * If the probe corresponding to this receieved response 867 * was truly sent 'm' ms. ago, then this response must 868 * have been rejected by the sequence number checks. The 869 * fact that it has passed the sequence number checks 870 * means that the measured rtt is wrong. We were probably 871 * scheduled long after the packet was received. 872 */ 873 goto out; 874 } 875 876 /* 877 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 878 * The initial few responses after the interface is repaired may 879 * contain high rtt's because they could have been queued up waiting 880 * for ARP/NDP resolution on a failed interface. 881 */ 882 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 883 goto out; 884 885 /* 886 * Don't update the Conservative Round Trip Time estimate for this 887 * (phint, target) pair if this is the not the highest ack seq seen 888 * thus far on this target. 889 */ 890 if (!highest_ack_tg(pr_icmp_seq, target)) 891 goto out; 892 893 /* 894 * Always update the rtt. This is a failure detection probe 895 * and we want to measure both increase / decrease in rtt. 896 */ 897 pi_set_crtt(target, m, _B_TRUE); 898 899 /* 900 * If the crtt exceeds the average time between probes, 901 * investigate if this slow target is an exception. If so we 902 * can avoid this target and still meet the failure detection 903 * time. Otherwise we can't meet the failure detection time. 
904 */ 905 if (target->tg_crtt > pg->pg_probeint) { 906 exception = check_exception_target(pii, target); 907 if (exception) { 908 /* 909 * This target is exceptionally slow. Don't use it 910 * for future probes. check_exception_target() has 911 * made sure that we have at least MIN_PROBE_TARGETS 912 * other active targets 913 */ 914 if (pii->pii_targets_are_routers) { 915 /* 916 * This is a slow router, mark it as slow 917 * and don't use it for further probes. We 918 * don't delete it, since it will be populated 919 * again when we do a router scan. Hence we 920 * need to maintain extra state (unlike the 921 * host case below). Mark it as TG_SLOW. 922 */ 923 if (target->tg_status == TG_ACTIVE) 924 pii->pii_ntargets--; 925 target->tg_status = TG_SLOW; 926 target->tg_latime = gethrtime(); 927 target->tg_rtt_sa = -1; 928 target->tg_crtt = 0; 929 target->tg_rtt_sd = 0; 930 if (pii->pii_target_next == target) { 931 pii->pii_target_next = 932 target_next(target); 933 } 934 } else { 935 /* 936 * the slow target is not a router, we can 937 * just delete it. Send an icmp multicast and 938 * pick the fastest responder that is not 939 * already an active target. target_delete() 940 * adjusts pii->pii_target_next 941 */ 942 target_delete(target); 943 probe(pii, PROBE_MULTI, cur_time); 944 } 945 } else { 946 /* 947 * We can't meet the failure detection time. 948 * Log a message, and update the detection time to 949 * whatever we can achieve. 
950 */ 951 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 952 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 953 last_fdt_bumpup_time = gethrtime(); 954 if (pg != phyint_anongroup) { 955 logerr("Cannot meet requested failure detection" 956 " time of %d ms on (%s %s) new failure" 957 " detection time for group \"%s\" is %d" 958 " ms\n", user_failure_detection_time, 959 AF_STR(pii->pii_af), pii->pii_name, 960 pg->pg_name, pg->pg_fdt); 961 } 962 } 963 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 964 (user_failure_detection_time < pg->pg_fdt) && 965 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 966 /* 967 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 968 * investigate if we can improve the failure detection time to 969 * meet whatever the user specified. 970 */ 971 if (check_pg_crtt_improved(pg)) { 972 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 973 user_failure_detection_time); 974 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 975 if (pg != phyint_anongroup) { 976 logerr("Improved failure detection time %d ms " 977 "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 978 AF_STR(pii->pii_af), pii->pii_name, 979 pg->pg_name); 980 } 981 if (user_failure_detection_time == pg->pg_fdt) { 982 /* Avoid any truncation or rounding errors */ 983 pg->pg_probeint = user_probe_interval; 984 /* 985 * No more rtt probes will be sent. The actual 986 * fdt has dropped to the user specified value. 987 * pii_fd_snxt_basetime and pii_snxt_basetime 988 * will be in sync henceforth. 989 */ 990 reset_snxt_basetimes(); 991 } 992 } 993 } 994 out: 995 pii->pii_probes[pr_ndx].pr_status = PR_ACKED; 996 pii->pii_probes[pr_ndx].pr_time_acked = cur_time; 997 998 /* 999 * Update pii->pii_rack, i.e. the sequence number of the last received 1000 * probe response, based on the echo reply we have received now, if 1001 * either of the following conditions are satisfied. 1002 * a. 
pii_rack is outside the current receive window of 1003 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 1004 * This means we have not received probe responses for a 1005 * long time, and the sequence number has wrapped around. 1006 * b. pii_rack is within the current receive window and this echo 1007 * reply corresponds to the highest sequence number we have seen 1008 * so far. 1009 */ 1010 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 1011 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 1012 SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 1013 pii->pii_rack = pr_icmp_seq; 1014 } 1015 } 1016 1017 /* 1018 * Returns true if seq is the highest unacknowledged seq for target tg 1019 * else returns false 1020 */ 1021 static boolean_t 1022 highest_ack_tg(uint16_t seq, struct target *tg) 1023 { 1024 struct phyint_instance *pii; 1025 int pr_ndx; 1026 uint16_t pr_seq; 1027 1028 pii = tg->tg_phyint_inst; 1029 1030 /* 1031 * Get the seq number of the most recent probe sent so far, 1032 * and also get the corresponding probe index in the probe stats 1033 * array. 1034 */ 1035 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1036 pr_seq = pii->pii_snxt; 1037 pr_seq--; 1038 1039 /* 1040 * Start from the most recent probe and walk back, trying to find 1041 * an acked probe corresponding to target tg. 1042 */ 1043 for (; pr_ndx != pii->pii_probe_next; 1044 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 1045 if (pii->pii_probes[pr_ndx].pr_target == tg && 1046 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 1047 if (SEQ_GT(pr_seq, seq)) 1048 return (_B_FALSE); 1049 } 1050 } 1051 return (_B_TRUE); 1052 } 1053 1054 /* 1055 * Check whether the crtt for the group has improved by a factor of 1056 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1057 * detection time flapping in the face of small crtt changes. 
1058 */ 1059 static boolean_t 1060 check_pg_crtt_improved(struct phyint_group *pg) 1061 { 1062 struct phyint *pi; 1063 1064 if (debug & D_PROBE) 1065 logdebug("check_pg_crtt_improved()\n"); 1066 1067 /* 1068 * The crtt for the group is only improved if each phyint_instance 1069 * for both ipv4 and ipv6 is improved. 1070 */ 1071 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1072 if (!check_pii_crtt_improved(pi->pi_v4) || 1073 !check_pii_crtt_improved(pi->pi_v6)) 1074 return (_B_FALSE); 1075 } 1076 1077 return (_B_TRUE); 1078 } 1079 1080 /* 1081 * Check whether the crtt has improved substantially on this phyint_instance. 1082 * Returns _B_TRUE if there's no crtt information available, because pii 1083 * is NULL or the phyint_instance is not capable of probing. 1084 */ 1085 boolean_t 1086 check_pii_crtt_improved(struct phyint_instance *pii) { 1087 struct target *tg; 1088 1089 if (pii == NULL) 1090 return (_B_TRUE); 1091 1092 if (!PROBE_CAPABLE(pii) || 1093 pii->pii_phyint->pi_state == PI_FAILED) 1094 return (_B_TRUE); 1095 1096 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1097 if (tg->tg_status != TG_ACTIVE) 1098 continue; 1099 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1100 LOWER_FDT_TRIGGER)) { 1101 return (_B_FALSE); 1102 } 1103 } 1104 1105 return (_B_TRUE); 1106 } 1107 1108 /* 1109 * This target responds very slowly to probes. The target's crtt exceeds 1110 * the probe interval of its group. 
Compare against other targets
 * and determine if this target is an exception, if so return true, else false
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	char		addr_str[INET6_ADDRSTRLEN];
	struct target	*other;

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    addr_str, sizeof (addr_str)));
	}

	/*
	 * With fewer than MIN_PROBE_TARGETS + 1 targets there is not enough
	 * to compare against, so the target is not dropped.
	 */
	if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
		return (_B_FALSE);

	/*
	 * The caller has established that this target's crtt exceeds the
	 * group's probe interval.  It is an exception only if every other
	 * active target has a crtt of at most
	 * (group's probe interval) / EXCEPTION_FACTOR.
	 */
	for (other = pii->pii_targets; other != NULL; other = other->tg_next) {
		if (other == target || other->tg_status != TG_ACTIVE)
			continue;
		if (other->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint /
		    EXCEPTION_FACTOR) {
			return (_B_FALSE);
		}
	}

	return (_B_TRUE);
}

/*
 * Update the target list. The icmp all hosts multicast has given us
 * some host to which we can send probes. If we already have sufficient
 * targets, discard it.
1157 */ 1158 static void 1159 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1160 struct in6_addr fromaddr) 1161 /* ARGSUSED */ 1162 { 1163 int af; 1164 char abuf[INET6_ADDRSTRLEN]; 1165 struct phyint *pi; 1166 1167 if (debug & D_PROBE) { 1168 logdebug("incoming_mcast_reply(%s %s %s)\n", 1169 AF_STR(pii->pii_af), pii->pii_name, 1170 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1171 } 1172 1173 /* 1174 * Using host targets is a fallback mechanism. If we have 1175 * found a router, don't add this host target. If we already 1176 * know MAX_PROBE_TARGETS, don't add another target. 1177 */ 1178 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1179 if (pii->pii_targets != NULL) { 1180 if (pii->pii_targets_are_routers || 1181 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1182 return; 1183 } 1184 } 1185 1186 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1187 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1188 /* 1189 * Guard against response from 0.0.0.0 1190 * and ::. Log a trace message 1191 */ 1192 logtrace("probe response from %s on %s\n", 1193 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1194 pii->pii_name); 1195 return; 1196 } 1197 1198 /* 1199 * This address is one of our own, so reject this address as a 1200 * valid probe target. 1201 */ 1202 af = pii->pii_af; 1203 if (own_address(af, fromaddr)) 1204 return; 1205 1206 /* 1207 * If the phyint is part a named group, then add the address to all 1208 * members of the group. Otherwise, add the address only to the 1209 * phyint itself, since other phyints in the anongroup may not be on 1210 * the same subnet. 
1211 */ 1212 pi = pii->pii_phyint; 1213 if (pi->pi_group == phyint_anongroup) { 1214 target_add(pii, fromaddr, _B_FALSE); 1215 } else { 1216 pi = pi->pi_group->pg_phyint; 1217 for (; pi != NULL; pi = pi->pi_pgnext) 1218 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1219 } 1220 } 1221 1222 /* 1223 * Compute CRTT given an existing scaled average, scaled deviation estimate 1224 * and a new rtt time. The formula is from Jacobson and Karels' 1225 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1226 * are the same as those in Appendix A.2 of that paper. 1227 * 1228 * m = new measurement 1229 * sa = scaled RTT average (8 * average estimates) 1230 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1231 * crtt = Conservative round trip time. Used to determine whether probe 1232 * has timed out. 1233 * 1234 * New scaled average and deviation are passed back via sap and svp 1235 */ 1236 static int 1237 compute_crtt(int *sap, int *svp, int m) 1238 { 1239 int sa = *sap; 1240 int sv = *svp; 1241 int crtt; 1242 int saved_m = m; 1243 1244 assert(*sap >= -1); 1245 assert(*svp >= 0); 1246 1247 if (sa != -1) { 1248 /* 1249 * Update average estimator: 1250 * new rtt = old rtt + 1/8 Error 1251 * where Error = m - old rtt 1252 * i.e. 8 * new rtt = 8 * old rtt + Error 1253 * i.e. new sa = old sa + Error 1254 */ 1255 m -= sa >> 3; /* m is now Error in estimate. */ 1256 if ((sa += m) < 0) { 1257 /* Don't allow the smoothed average to be negative. */ 1258 sa = 0; 1259 } 1260 1261 /* 1262 * Update deviation estimator: 1263 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1264 * i.e. 4 * new mdev = 4 * old mdev + 1265 * (abs(Error) - old mdev) 1266 * i.e. new sv = old sv + (abs(Error) - old mdev) 1267 */ 1268 if (m < 0) 1269 m = -m; 1270 m -= sv >> 2; 1271 sv += m; 1272 } else { 1273 /* Initialization. This is the first response received. 
*/ 1274 sa = (m << 3); 1275 sv = (m << 1); 1276 } 1277 1278 crtt = (sa >> 3) + sv; 1279 1280 if (debug & D_PROBE) { 1281 logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " 1282 "%d\n", saved_m, sa, sv, crtt); 1283 } 1284 1285 *sap = sa; 1286 *svp = sv; 1287 1288 /* 1289 * CRTT = average estimates + 4 * deviation estimates 1290 * = sa / 8 + sv 1291 */ 1292 return (crtt); 1293 } 1294 1295 static void 1296 pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) 1297 { 1298 struct phyint_instance *pii = tg->tg_phyint_inst; 1299 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1300 int sa = tg->tg_rtt_sa; 1301 int sv = tg->tg_rtt_sd; 1302 int new_crtt; 1303 int i; 1304 1305 if (debug & D_PROBE) 1306 logdebug("pi_set_crtt: target - m %d\n", m); 1307 1308 /* store the round trip time, in case we need to defer computation */ 1309 tg->tg_deferred[tg->tg_num_deferred] = m; 1310 1311 new_crtt = compute_crtt(&sa, &sv, m); 1312 1313 /* 1314 * If this probe's round trip time would singlehandedly cause an 1315 * increase in the group's probe interval consider it suspect. 1316 */ 1317 if ((new_crtt > probe_interval) && is_probe_uni) { 1318 if (debug & D_PROBE) { 1319 logdebug("Received a suspect probe on %s, new_crtt =" 1320 " %d, probe_interval = %d, num_deferred = %d\n", 1321 pii->pii_probe_logint->li_name, new_crtt, 1322 probe_interval, tg->tg_num_deferred); 1323 } 1324 1325 /* 1326 * If we've deferred as many rtts as we plan on deferring, then 1327 * assume the link really did slow down and process all queued 1328 * rtts 1329 */ 1330 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1331 if (debug & D_PROBE) { 1332 logdebug("Received MAXDEFERREDRTT probes which " 1333 "would cause an increased probe_interval. 
" 1334 "Integrating queued rtt data points.\n"); 1335 } 1336 1337 for (i = 0; i <= tg->tg_num_deferred; i++) { 1338 tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, 1339 &tg->tg_rtt_sd, tg->tg_deferred[i]); 1340 } 1341 1342 tg->tg_num_deferred = 0; 1343 } else { 1344 tg->tg_num_deferred++; 1345 } 1346 return; 1347 } 1348 1349 /* 1350 * If this is a normal probe, or an RTT probe that would lead to a 1351 * reduced CRTT, then update our CRTT data. Further, if this was 1352 * a normal probe, pitch any deferred probes since our probes are 1353 * again being answered within our CRTT estimates. 1354 */ 1355 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1356 tg->tg_rtt_sa = sa; 1357 tg->tg_rtt_sd = sv; 1358 tg->tg_crtt = new_crtt; 1359 if (is_probe_uni) 1360 tg->tg_num_deferred = 0; 1361 } 1362 } 1363 1364 /* 1365 * Return a pointer to the specified option buffer. 1366 * If not found return NULL. 1367 */ 1368 static void * 1369 find_ancillary(struct msghdr *msg, int cmsg_type) 1370 { 1371 struct cmsghdr *cmsg; 1372 1373 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1374 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1375 if (cmsg->cmsg_level == IPPROTO_IPV6 && 1376 cmsg->cmsg_type == cmsg_type) { 1377 return (CMSG_DATA(cmsg)); 1378 } 1379 } 1380 return (NULL); 1381 } 1382 1383 /* 1384 * See if a previously failed interface has started working again. 1385 */ 1386 void 1387 phyint_check_for_repair(struct phyint *pi) 1388 { 1389 if (phyint_repaired(pi)) { 1390 if (pi->pi_group == phyint_anongroup) { 1391 logerr("NIC repair detected on %s\n", pi->pi_name); 1392 } else { 1393 logerr("NIC repair detected on %s of group %s\n", 1394 pi->pi_name, pi->pi_group->pg_name); 1395 } 1396 1397 /* 1398 * If the interface is offline, just clear the FAILED flag, 1399 * delaying the state change and failback operation until it 1400 * is brought back online. 
1401 */ 1402 if (pi->pi_state == PI_OFFLINE) { 1403 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1404 return; 1405 } 1406 1407 if (pi->pi_flags & IFF_INACTIVE) { 1408 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1409 } else { 1410 if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) { 1411 (void) change_lif_flags(pi, 1412 IFF_FAILED, _B_FALSE); 1413 /* Per state diagram */ 1414 pi->pi_empty = 0; 1415 } 1416 } 1417 1418 phyint_chstate(pi, PI_RUNNING); 1419 1420 if (GROUP_FAILED(pi->pi_group)) { 1421 /* 1422 * This is the 1st phyint to receive a response 1423 * after group failure. 1424 */ 1425 logerr("At least 1 interface (%s) of group %s has " 1426 "repaired\n", pi->pi_name, pi->pi_group->pg_name); 1427 phyint_group_chstate(pi->pi_group, PG_RUNNING); 1428 } 1429 } 1430 } 1431 1432 /* 1433 * See if a previously functioning interface has failed, or if the 1434 * whole group of interfaces has failed. 1435 */ 1436 static void 1437 phyint_inst_check_for_failure(struct phyint_instance *pii) 1438 { 1439 struct phyint *pi; 1440 struct phyint *pi2; 1441 1442 pi = pii->pii_phyint; 1443 1444 switch (failure_state(pii)) { 1445 case PHYINT_FAILURE: 1446 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 1447 if (pi->pi_group == phyint_anongroup) { 1448 logerr("NIC failure detected on %s\n", pii->pii_name); 1449 } else { 1450 logerr("NIC failure detected on %s of group %s\n", 1451 pii->pii_name, pi->pi_group->pg_name); 1452 } 1453 /* 1454 * Do the failover, unless the interface is offline (in 1455 * which case we've already failed over). 
1456 */ 1457 if (pi->pi_state != PI_OFFLINE) { 1458 phyint_chstate(pi, PI_FAILED); 1459 reset_crtt_all(pi); 1460 if (!(pi->pi_flags & IFF_INACTIVE)) 1461 (void) try_failover(pi, FAILOVER_NORMAL); 1462 } 1463 break; 1464 1465 case GROUP_FAILURE: 1466 logerr("All Interfaces in group %s have failed\n", 1467 pi->pi_group->pg_name); 1468 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; 1469 pi2 = pi2->pi_pgnext) { 1470 if (pi2->pi_flags & IFF_OFFLINE) 1471 continue; 1472 (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); 1473 reset_crtt_all(pi2); 1474 1475 /* 1476 * In the case of host targets, we 1477 * would have flushed the targets, 1478 * and gone to PI_NOTARGETS state. 1479 */ 1480 if (pi2->pi_state == PI_RUNNING) 1481 phyint_chstate(pi, PI_FAILED); 1482 1483 pi2->pi_empty = 0; 1484 pi2->pi_full = 0; 1485 } 1486 break; 1487 1488 default: 1489 break; 1490 } 1491 } 1492 1493 /* 1494 * Determines if any timeout event has occurred and returns the number of 1495 * milliseconds until the next timeout event for the phyint. Returns 1496 * TIMER_INFINITY for "never". 
1497 */ 1498 uint_t 1499 phyint_inst_timer(struct phyint_instance *pii) 1500 { 1501 int pr_ndx; 1502 uint_t timeout; 1503 struct target *cur_tg; 1504 struct probe_stats *pr_statp; 1505 struct phyint_instance *pii_other; 1506 struct phyint *pi; 1507 int valid_unack_count; 1508 int i; 1509 int interval; 1510 uint_t check_time; 1511 uint_t cur_time; 1512 hrtime_t cur_hrtime; 1513 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1514 1515 cur_time = getcurrenttime(); 1516 1517 if (debug & D_TIMER) { 1518 logdebug("phyint_inst_timer(%s %s)\n", 1519 AF_STR(pii->pii_af), pii->pii_name); 1520 } 1521 1522 pii_other = phyint_inst_other(pii); 1523 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1524 /* 1525 * Check to see if we're here due to link up/down flapping; If 1526 * enough time has passed, then try to bring the interface 1527 * back up; otherwise, schedule a timer to bring it back up 1528 * when enough time *has* elapsed. 1529 */ 1530 pi = pii->pii_phyint; 1531 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1532 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1533 if (check_time > cur_time) 1534 return (check_time - cur_time); 1535 1536 phyint_check_for_repair(pi); 1537 } 1538 } 1539 1540 /* 1541 * If this phyint is not yet initialized for probes, 1542 * don't proceed further 1543 */ 1544 if (pii->pii_probe_sock == -1) 1545 return (TIMER_INFINITY); 1546 1547 /* 1548 * If the timer has fired too soon, probably triggered 1549 * by some other phyint instance, return the remaining 1550 * time 1551 */ 1552 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1553 return (pii->pii_snxt_time - cur_time); 1554 1555 /* 1556 * If the link is down, don't send any probes for now. 1557 */ 1558 if (LINK_DOWN(pii->pii_phyint)) 1559 return (TIMER_INFINITY); 1560 1561 /* 1562 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1563 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1564 * Base probe time is strictly periodic. 
1565 */ 1566 interval = GET_RANDOM( 1567 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1568 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1569 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1570 1571 /* 1572 * Check if the current time > next time to probe. If so, we missed 1573 * sending 1 or more probes, probably due to heavy system load. At least 1574 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1575 * were scheduled. Make adjustments to the times, in multiples of 1576 * user_probe_interval. 1577 */ 1578 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1579 int n; 1580 1581 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1582 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1583 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1584 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1585 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1586 pii->pii_snxt_basetime); 1587 1588 /* Collect statistics about missed probes */ 1589 probes_missed.pm_nprobes += n + 1; 1590 probes_missed.pm_ntimes++; 1591 } 1592 pii->pii_snxt_basetime += user_probe_interval; 1593 interval = pii->pii_snxt_time - cur_time; 1594 if (debug & D_TARGET) { 1595 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1596 " interval %u\n", cur_time, pii->pii_snxt_time, 1597 pii->pii_snxt_basetime, interval); 1598 } 1599 1600 /* 1601 * If no targets are known, we need to send an ICMP multicast. The 1602 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1603 * to see if we found a target. 1604 */ 1605 if (pii->pii_target_next == NULL) { 1606 assert(pii->pii_ntargets == 0); 1607 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1608 probe(pii, PROBE_MULTI, cur_time); 1609 return (interval); 1610 } 1611 1612 if ((user_probe_interval != probe_interval) && 1613 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1614 /* 1615 * the failure detection (fd) probe timer has not yet fired. 
1616 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1617 */ 1618 probe(pii, PROBE_RTT, cur_time); 1619 return (interval); 1620 } 1621 /* 1622 * the fd probe timer has fired. Need to do all failure 1623 * detection / recovery calculations, and then send an fd probe 1624 * of type PROBE_UNI. 1625 */ 1626 if (user_probe_interval == probe_interval) { 1627 /* 1628 * We could have missed some probes, and then adjusted 1629 * pii_snxt_basetime above. Otherwise we could have 1630 * blindly added probe_interval to pii_fd_snxt_basetime. 1631 */ 1632 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1633 } else { 1634 pii->pii_fd_snxt_basetime += probe_interval; 1635 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1636 int n; 1637 1638 n = (cur_time - pii->pii_fd_snxt_basetime) / 1639 probe_interval; 1640 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1641 } 1642 } 1643 1644 /* 1645 * We can have at most, the latest 2 probes that we sent, in 1646 * the PR_UNACKED state. All previous probes sent, are either 1647 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1648 * timed out if the probe's time_sent + the CRTT < currenttime. 1649 * For each of the last 2 probes, examine whether it has timed 1650 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1651 */ 1652 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1653 valid_unack_count = 0; 1654 1655 for (i = 0; i < 2; i++) { 1656 pr_statp = &pii->pii_probes[pr_ndx]; 1657 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1658 switch (pr_statp->pr_status) { 1659 case PR_ACKED: 1660 /* 1661 * We received back an ACK, so the switch clearly 1662 * is not dropping our traffic, and thus we can 1663 * enable failure detection immediately. 
1664 */ 1665 if (pii->pii_fd_hrtime > gethrtime()) { 1666 if (debug & D_PROBE) { 1667 logdebug("successful probe on %s; " 1668 "ending quiet period\n", 1669 pii->pii_phyint->pi_name); 1670 } 1671 pii->pii_fd_hrtime = gethrtime(); 1672 } 1673 break; 1674 1675 case PR_UNACKED: 1676 assert(cur_tg != NULL); 1677 /* 1678 * The crtt could be zero for some reason, 1679 * Eg. the phyint could be failed. If the crtt is 1680 * not available use group's probe interval, 1681 * which is a worst case estimate. 1682 */ 1683 if (cur_tg->tg_crtt != 0) { 1684 timeout = pr_statp->pr_time_sent + 1685 cur_tg->tg_crtt; 1686 } else { 1687 timeout = pr_statp->pr_time_sent + 1688 probe_interval; 1689 } 1690 if (TIME_LT(timeout, cur_time)) { 1691 pr_statp->pr_status = PR_LOST; 1692 pr_statp->pr_time_lost = timeout; 1693 } else if (i == 1) { 1694 /* 1695 * We are forced to consider this probe 1696 * lost, as we can have at most 2 unack. 1697 * probes any time, and we will be sending a 1698 * probe at the end of this function. 1699 * Normally, we should not be here, but 1700 * this can happen if an incoming response 1701 * that was considered lost has increased 1702 * the crtt for this target, and also bumped 1703 * up the FDT. Note that we never cancel or 1704 * increase the current pii_time_left, so 1705 * when the timer fires, we find 2 valid 1706 * unacked probes, and they are yet to timeout 1707 */ 1708 pr_statp->pr_status = PR_LOST; 1709 pr_statp->pr_time_lost = cur_time; 1710 } else { 1711 /* 1712 * Only the most recent probe can enter 1713 * this 'else' arm. The second most recent 1714 * probe must take either of the above arms, 1715 * if it is unacked. 1716 */ 1717 valid_unack_count++; 1718 } 1719 break; 1720 } 1721 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1722 } 1723 1724 /* 1725 * We send out 1 probe randomly in the interval between one half 1726 * and one probe interval for the group. 
Given that the CRTT is always 1727 * less than the group's probe interval, we can have at most 1 1728 * unacknowledged probe now. All previous probes are either lost or 1729 * acked. 1730 */ 1731 assert(valid_unack_count == 0 || valid_unack_count == 1); 1732 1733 /* 1734 * The timer has fired. Take appropriate action depending 1735 * on the current state of the phyint. 1736 * 1737 * PI_RUNNING state - Failure detection and failover 1738 * PI_FAILED state - Repair detection and failback 1739 */ 1740 switch (pii->pii_phyint->pi_state) { 1741 case PI_FAILED: 1742 /* 1743 * If the most recent probe (excluding unacked probes that 1744 * are yet to time out) has been acked, check whether the 1745 * phyint is now repaired. If the phyint is repaired, then 1746 * attempt failback, unless it is an inactive standby. 1747 */ 1748 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1749 phyint_check_for_repair(pii->pii_phyint); 1750 } 1751 break; 1752 1753 case PI_RUNNING: 1754 /* 1755 * It's possible our probes have been lost because of a 1756 * spanning-tree mandated quiet period on the switch. If so, 1757 * ignore the lost probes and consider the interface to still 1758 * be functioning. 1759 */ 1760 cur_hrtime = gethrtime(); 1761 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1762 break; 1763 1764 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1765 /* 1766 * We have 1 or more failed probes (excluding unacked 1767 * probes that are yet to time out). Determine if the 1768 * phyint has failed. If so attempt a failover, 1769 * unless it is an inactive standby 1770 */ 1771 phyint_inst_check_for_failure(pii); 1772 } 1773 break; 1774 1775 default: 1776 logerr("phyint_inst_timer: invalid state %d\n", 1777 pii->pii_phyint->pi_state); 1778 abort(); 1779 } 1780 1781 /* 1782 * Start the next probe. probe() will also set pii->pii_probe_time_left 1783 * to the group's probe interval. 
If phyint_failed -> target_flush_hosts 1784 * was called, the target list may be empty. 1785 */ 1786 if (pii->pii_target_next != NULL) { 1787 probe(pii, PROBE_UNI, cur_time); 1788 /* 1789 * If we have just the one probe target, and we're not using 1790 * router targets, try to find another as we presently have 1791 * no resilience. 1792 */ 1793 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1794 probe(pii, PROBE_MULTI, cur_time); 1795 } else { 1796 probe(pii, PROBE_MULTI, cur_time); 1797 } 1798 return (interval); 1799 } 1800 1801 /* 1802 * Start the probe timer for an interface instance. 1803 */ 1804 void 1805 start_timer(struct phyint_instance *pii) 1806 { 1807 uint32_t interval; 1808 1809 /* 1810 * Spread the base probe times (pi_snxt_basetime) across phyints 1811 * uniformly over the (curtime..curtime + the group's probe_interval). 1812 * pi_snxt_basetime is strictly periodic with a frequency of 1813 * the group's probe interval. The actual probe time pi_snxt_time 1814 * adds some randomness to pi_snxt_basetime and happens in probe(). 1815 * For the 1st probe on each phyint after the timer is started, 1816 * pi_snxt_time and pi_snxt_basetime are the same. 1817 */ 1818 interval = GET_RANDOM(0, 1819 (int)pii->pii_phyint->pi_group->pg_probeint); 1820 1821 pii->pii_snxt_basetime = getcurrenttime() + interval; 1822 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1823 pii->pii_snxt_time = pii->pii_snxt_basetime; 1824 timer_schedule(interval); 1825 } 1826 1827 /* 1828 * Restart the probe timer on an interface instance. 1829 */ 1830 static void 1831 restart_timer(struct phyint_instance *pii) 1832 { 1833 /* 1834 * We don't need to restart the timer if it was never started in 1835 * the first place (pii->pii_basetime_inited not set), as the timer 1836 * won't have gone off yet. 
1837 */ 1838 if (pii->pii_basetime_inited != 0) { 1839 1840 if (debug & D_LINKNOTE) 1841 logdebug("restart timer: restarting timer on %s, " 1842 "address family %s\n", pii->pii_phyint->pi_name, 1843 AF_STR(pii->pii_af)); 1844 1845 start_timer(pii); 1846 } 1847 } 1848 1849 static void 1850 process_link_state_down(struct phyint *pi) 1851 { 1852 logerr("The link has gone down on %s\n", pi->pi_name); 1853 1854 /* 1855 * Clear the probe statistics arrays, we don't want the repair 1856 * detection logic relying on probes that were succesful prior 1857 * to the link going down. 1858 */ 1859 if (PROBE_CAPABLE(pi->pi_v4)) 1860 clear_pii_probe_stats(pi->pi_v4); 1861 if (PROBE_CAPABLE(pi->pi_v6)) 1862 clear_pii_probe_stats(pi->pi_v6); 1863 /* 1864 * Check for interface failure. Although we know the interface 1865 * has failed, we don't know if all the other interfaces in the 1866 * group have failed as well. 1867 */ 1868 if ((pi->pi_state == PI_RUNNING) || 1869 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1870 if (debug & D_LINKNOTE) { 1871 logdebug("process_link_state_down:" 1872 " checking for failure on %s\n", pi->pi_name); 1873 } 1874 1875 if (pi->pi_v4 != NULL) 1876 phyint_inst_check_for_failure(pi->pi_v4); 1877 else if (pi->pi_v6 != NULL) 1878 phyint_inst_check_for_failure(pi->pi_v6); 1879 } 1880 } 1881 1882 static void 1883 process_link_state_up(struct phyint *pi) 1884 { 1885 logerr("The link has come up on %s\n", pi->pi_name); 1886 1887 /* 1888 * We stopped any running timers on each instance when the link 1889 * went down, so restart them. 1890 */ 1891 if (pi->pi_v4) 1892 restart_timer(pi->pi_v4); 1893 if (pi->pi_v6) 1894 restart_timer(pi->pi_v6); 1895 1896 phyint_check_for_repair(pi); 1897 1898 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1899 if (pi->pi_whendx == LINK_UP_PERMIN) 1900 pi->pi_whendx = 0; 1901 } 1902 1903 /* 1904 * Process any changes in link state passed up from the interfaces. 
1905 */ 1906 void 1907 process_link_state_changes(void) 1908 { 1909 struct phyint *pi; 1910 1911 /* Look for interfaces where the link state has just changed */ 1912 1913 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1914 boolean_t old_link_state_up = LINK_UP(pi); 1915 1916 /* 1917 * Except when the "phyint" structure is created, this is 1918 * the only place the link state is updated. This allows 1919 * this routine to detect changes in link state, rather 1920 * than just the current state. 1921 */ 1922 UPDATE_LINK_STATE(pi); 1923 1924 if (LINK_DOWN(pi)) { 1925 /* 1926 * Has link just gone down? 1927 */ 1928 if (old_link_state_up) 1929 process_link_state_down(pi); 1930 } else { 1931 /* 1932 * Has link just gone back up? 1933 */ 1934 if (!old_link_state_up) 1935 process_link_state_up(pi); 1936 } 1937 } 1938 } 1939 1940 void 1941 reset_crtt_all(struct phyint *pi) 1942 { 1943 struct phyint_instance *pii; 1944 struct target *tg; 1945 1946 pii = pi->pi_v4; 1947 if (pii != NULL) { 1948 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1949 tg->tg_crtt = 0; 1950 tg->tg_rtt_sa = -1; 1951 tg->tg_rtt_sd = 0; 1952 } 1953 } 1954 1955 pii = pi->pi_v6; 1956 if (pii != NULL) { 1957 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1958 tg->tg_crtt = 0; 1959 tg->tg_rtt_sa = -1; 1960 tg->tg_rtt_sd = 0; 1961 } 1962 } 1963 } 1964 1965 /* 1966 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 1967 * probes on both instances IPv4 and IPv6. 1968 * If the interface has failed, return the time of the first probe failure 1969 * in "tff". 1970 */ 1971 static int 1972 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 1973 { 1974 uint_t pi_tff; 1975 struct target *cur_tg; 1976 struct probe_fail_count pfinfo; 1977 struct phyint_instance *pii_other; 1978 int pr_ndx; 1979 1980 /* 1981 * Get the number of consecutive failed probes on 1982 * this phyint across all targets. 
Also get the number 1983 * of consecutive failed probes on this target only 1984 */ 1985 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1986 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1987 probe_fail_info(pii, cur_tg, &pfinfo); 1988 1989 /* Get the time of first failure, for later use */ 1990 pi_tff = pfinfo.pf_tff; 1991 1992 /* 1993 * If the current target has not responded to the 1994 * last NUM_PROBE_FAILS probes, and other targets are 1995 * responding delete this target. Dead gateway detection 1996 * will eventually remove this target (if router) from the 1997 * routing tables. If that does not occur, we may end 1998 * up adding this to our list again. 1999 */ 2000 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2001 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2002 if (pii->pii_targets_are_routers) { 2003 if (cur_tg->tg_status == TG_ACTIVE) 2004 pii->pii_ntargets--; 2005 cur_tg->tg_status = TG_DEAD; 2006 cur_tg->tg_crtt = 0; 2007 cur_tg->tg_rtt_sa = -1; 2008 cur_tg->tg_rtt_sd = 0; 2009 if (pii->pii_target_next == cur_tg) 2010 pii->pii_target_next = target_next(cur_tg); 2011 } else { 2012 target_delete(cur_tg); 2013 probe(pii, PROBE_MULTI, getcurrenttime()); 2014 } 2015 return (PHYINT_OK); 2016 } 2017 2018 /* 2019 * If the phyint has lost NUM_PROBE_FAILS or more 2020 * consecutive probes, on both IPv4 and IPv6 protocol 2021 * instances of the phyint, then trigger failure 2022 * detection, else return false 2023 */ 2024 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2025 return (PHYINT_OK); 2026 2027 pii_other = phyint_inst_other(pii); 2028 if (PROBE_CAPABLE(pii_other)) { 2029 probe_fail_info(pii_other, NULL, &pfinfo); 2030 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2031 /* 2032 * We have NUM_PROBE_FAILS or more failures 2033 * on both IPv4 and IPv6. Get the earliest 2034 * time when failure was detected on this 2035 * phyint across IPv4 and IPv6. 
2036 */ 2037 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2038 pi_tff = pfinfo.pf_tff; 2039 } else { 2040 /* 2041 * This instance has < NUM_PROBE_FAILS failure. 2042 * So return false 2043 */ 2044 return (PHYINT_OK); 2045 } 2046 } 2047 *tff = pi_tff; 2048 return (PHYINT_FAILURE); 2049 } 2050 2051 /* 2052 * Check if the link has gone down on this phyint, or it has failed the 2053 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2054 * Also look at other phyints of this group, for group failures. 2055 */ 2056 int 2057 failure_state(struct phyint_instance *pii) 2058 { 2059 struct probe_success_count psinfo; 2060 uint_t pi2_tls; /* time last success */ 2061 uint_t pi_tff; /* time first fail */ 2062 struct phyint *pi2; 2063 struct phyint *pi; 2064 struct phyint_instance *pii2; 2065 struct phyint_group *pg; 2066 boolean_t alone; 2067 2068 if (debug & D_FAILOVER) 2069 logdebug("phyint_failed(%s)\n", pii->pii_name); 2070 2071 pi = pii->pii_phyint; 2072 pg = pi->pi_group; 2073 2074 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2075 PHYINT_OK) 2076 return (PHYINT_OK); 2077 2078 /* 2079 * At this point, the link is down, or the phyint is suspect, 2080 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 2081 * does not belong to any group, or is the only member of the 2082 * group capable of being probed, return PHYINT_FAILURE. 2083 */ 2084 alone = _B_TRUE; 2085 if (pg != phyint_anongroup) { 2086 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2087 if (pi2 == pi) 2088 continue; 2089 if (PROBE_CAPABLE(pi2->pi_v4) || 2090 PROBE_CAPABLE(pi2->pi_v6)) { 2091 alone = _B_FALSE; 2092 break; 2093 } 2094 } 2095 } 2096 if (alone) 2097 return (PHYINT_FAILURE); 2098 2099 /* 2100 * Need to compare against other phyints of the same group 2101 * to exclude group failures. 
If the failure was detected via 2102 * probing, then if the time of last success (tls) of any 2103 * phyint is more recent than the time of first fail (tff) of the 2104 * phyint in question, and the link is up on the phyint, 2105 * then it is a phyint failure. Otherwise it is a group failure. 2106 * If failure was detected via a link down notification sent from 2107 * the driver to IP, we see if any phyints in the group are still 2108 * running and haven't received a link down notification. We 2109 * will usually be processing the link down notification shortly 2110 * after it was received, so there is no point looking at the tls 2111 * of other phyints. 2112 */ 2113 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2114 /* Exclude ourself from comparison */ 2115 if (pi2 == pi) 2116 continue; 2117 2118 if (LINK_DOWN(pi)) { 2119 /* 2120 * We use FLAGS_TO_LINK_STATE() to test the 2121 * flags directly, rather then LINK_UP() or 2122 * LINK_DOWN(), as we may not have got round 2123 * to processing the link state for the other 2124 * phyints in the group yet. 2125 * 2126 * The check for PI_RUNNING and group 2127 * failure handles the case when the 2128 * group begins to recover. The first 2129 * phyint to recover should not trigger 2130 * a failover from the soon-to-recover 2131 * other phyints to the first recovered 2132 * phyint. PI_RUNNING will be set, and 2133 * pg_groupfailed cleared only after 2134 * receipt of NUM_PROBE_REPAIRS, by 2135 * which time the other phyints should 2136 * have received at least 1 packet, 2137 * and so will not have NUM_PROBE_FAILS. 2138 */ 2139 if ((pi2->pi_state == PI_RUNNING) && 2140 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 2141 return (PHYINT_FAILURE); 2142 } else { 2143 /* 2144 * Need to compare against both IPv4 and 2145 * IPv6 instances. 
2146 */ 2147 pii2 = pi2->pi_v4; 2148 if (pii2 != NULL) { 2149 probe_success_info(pii2, NULL, &psinfo); 2150 if (psinfo.ps_tls_valid) { 2151 pi2_tls = psinfo.ps_tls; 2152 /* 2153 * See comment above regarding check 2154 * for PI_RUNNING and group failure. 2155 */ 2156 if (TIME_GT(pi2_tls, pi_tff) && 2157 (pi2->pi_state == PI_RUNNING) && 2158 !GROUP_FAILED(pg) && 2159 FLAGS_TO_LINK_STATE(pi2)) 2160 return (PHYINT_FAILURE); 2161 } 2162 } 2163 2164 pii2 = pi2->pi_v6; 2165 if (pii2 != NULL) { 2166 probe_success_info(pii2, NULL, &psinfo); 2167 if (psinfo.ps_tls_valid) { 2168 pi2_tls = psinfo.ps_tls; 2169 /* 2170 * See comment above regarding check 2171 * for PI_RUNNING and group failure. 2172 */ 2173 if (TIME_GT(pi2_tls, pi_tff) && 2174 (pi2->pi_state == PI_RUNNING) && 2175 !GROUP_FAILED(pg) && 2176 FLAGS_TO_LINK_STATE(pi2)) 2177 return (PHYINT_FAILURE); 2178 } 2179 } 2180 } 2181 } 2182 2183 /* 2184 * Change the group state to PG_FAILED if it's not already. 2185 */ 2186 if (!GROUP_FAILED(pg)) 2187 phyint_group_chstate(pg, PG_FAILED); 2188 2189 return (GROUP_FAILURE); 2190 } 2191 2192 /* 2193 * Return the information associated with consecutive probe successes 2194 * starting with the most recent probe. At most the last 2 probes can be 2195 * in the unacknowledged state. All previous probes have either failed 2196 * or succeeded. 
2197 */ 2198 static void 2199 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2200 struct probe_success_count *psinfo) 2201 { 2202 uint_t i; 2203 struct probe_stats *pr_statp; 2204 uint_t most_recent; 2205 uint_t second_most_recent; 2206 boolean_t pi_found_failure = _B_FALSE; 2207 boolean_t tg_found_failure = _B_FALSE; 2208 uint_t now; 2209 uint_t timeout; 2210 struct target *tg; 2211 2212 if (debug & D_FAILOVER) 2213 logdebug("probe_success_info(%s)\n", pii->pii_name); 2214 2215 bzero(psinfo, sizeof (*psinfo)); 2216 now = getcurrenttime(); 2217 2218 /* 2219 * Start with the most recent probe, and count the number 2220 * of consecutive probe successes. Latch the number of successes 2221 * on hitting a failure. 2222 */ 2223 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2224 second_most_recent = PROBE_INDEX_PREV(most_recent); 2225 2226 for (i = most_recent; i != pii->pii_probe_next; 2227 i = PROBE_INDEX_PREV(i)) { 2228 pr_statp = &pii->pii_probes[i]; 2229 2230 switch (pr_statp->pr_status) { 2231 case PR_UNACKED: 2232 /* 2233 * Only the most recent 2 probes can be unacknowledged 2234 */ 2235 assert(i == most_recent || i == second_most_recent); 2236 2237 tg = pr_statp->pr_target; 2238 assert(tg != NULL); 2239 /* 2240 * The crtt could be zero for some reason, 2241 * Eg. the phyint could be failed. If the crtt is 2242 * not available use the value of the group's probe 2243 * interval which is a worst case estimate. 2244 */ 2245 if (tg->tg_crtt != 0) { 2246 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2247 } else { 2248 timeout = pr_statp->pr_time_sent + 2249 pii->pii_phyint->pi_group->pg_probeint; 2250 } 2251 2252 if (TIME_LT(timeout, now)) { 2253 /* 2254 * We hit a failure. Latch the total number of 2255 * recent consecutive successes. 
2256 */ 2257 pr_statp->pr_time_lost = timeout; 2258 pr_statp->pr_status = PR_LOST; 2259 pi_found_failure = _B_TRUE; 2260 if (cur_tg != NULL && tg == cur_tg) { 2261 /* 2262 * We hit a failure for the desired 2263 * target. Latch the number of recent 2264 * consecutive successes for this target 2265 */ 2266 tg_found_failure = _B_TRUE; 2267 } 2268 } 2269 break; 2270 2271 case PR_ACKED: 2272 /* 2273 * Bump up the count of probe successes, if we 2274 * have not seen any failure so far. 2275 */ 2276 if (!pi_found_failure) 2277 psinfo->ps_nsucc++; 2278 2279 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2280 !tg_found_failure) { 2281 psinfo->ps_nsucc_tg++; 2282 } 2283 2284 /* 2285 * Record the time of last success, if this is 2286 * the most recent probe success. 2287 */ 2288 if (!psinfo->ps_tls_valid) { 2289 psinfo->ps_tls = pr_statp->pr_time_acked; 2290 psinfo->ps_tls_valid = _B_TRUE; 2291 } 2292 break; 2293 2294 case PR_LOST: 2295 /* 2296 * We hit a failure. Latch the total number of 2297 * recent consecutive successes. 2298 */ 2299 pi_found_failure = _B_TRUE; 2300 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2301 /* 2302 * We hit a failure for the desired target. 2303 * Latch the number of recent consecutive 2304 * successes for this target 2305 */ 2306 tg_found_failure = _B_TRUE; 2307 } 2308 break; 2309 2310 default: 2311 return; 2312 2313 } 2314 } 2315 } 2316 2317 /* 2318 * Return the information associated with consecutive probe failures 2319 * starting with the most recent probe. Only the last 2 probes can be in the 2320 * unacknowledged state. All previous probes have either failed or succeeded. 
2321 */ 2322 static void 2323 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2324 struct probe_fail_count *pfinfo) 2325 { 2326 int i; 2327 struct probe_stats *pr_statp; 2328 boolean_t tg_found_success = _B_FALSE; 2329 boolean_t pi_found_success = _B_FALSE; 2330 int most_recent; 2331 int second_most_recent; 2332 uint_t now; 2333 uint_t timeout; 2334 struct target *tg; 2335 2336 if (debug & D_FAILOVER) 2337 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2338 2339 bzero(pfinfo, sizeof (*pfinfo)); 2340 now = getcurrenttime(); 2341 2342 /* 2343 * Start with the most recent probe, and count the number 2344 * of consecutive probe failures. Latch the number of failures 2345 * on hitting a probe success. 2346 */ 2347 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2348 second_most_recent = PROBE_INDEX_PREV(most_recent); 2349 2350 for (i = most_recent; i != pii->pii_probe_next; 2351 i = PROBE_INDEX_PREV(i)) { 2352 pr_statp = &pii->pii_probes[i]; 2353 2354 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2355 2356 switch (pr_statp->pr_status) { 2357 case PR_UNACKED: 2358 /* 2359 * Only the most recent 2 probes can be unacknowledged 2360 */ 2361 assert(i == most_recent || i == second_most_recent); 2362 2363 tg = pr_statp->pr_target; 2364 /* 2365 * Target is guaranteed to exist in the unack. state 2366 */ 2367 assert(tg != NULL); 2368 /* 2369 * The crtt could be zero for some reason, 2370 * Eg. the phyint could be failed. If the crtt is 2371 * not available use the group's probe interval, 2372 * which is a worst case estimate. 
2373 */ 2374 if (tg->tg_crtt != 0) { 2375 timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2376 } else { 2377 timeout = pr_statp->pr_time_sent + 2378 pii->pii_phyint->pi_group->pg_probeint; 2379 } 2380 2381 if (TIME_GT(timeout, now)) 2382 break; 2383 2384 pr_statp->pr_time_lost = timeout; 2385 pr_statp->pr_status = PR_LOST; 2386 /* FALLTHRU */ 2387 2388 case PR_LOST: 2389 if (!pi_found_success) { 2390 pfinfo->pf_nfail++; 2391 pfinfo->pf_tff = pr_statp->pr_time_lost; 2392 } 2393 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2394 !tg_found_success) { 2395 pfinfo->pf_nfail_tg++; 2396 } 2397 break; 2398 2399 default: 2400 /* 2401 * We hit a success or unused slot. Latch the 2402 * total number of recent consecutive failures. 2403 */ 2404 pi_found_success = _B_TRUE; 2405 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2406 /* 2407 * We hit a success for the desired target. 2408 * Latch the number of recent consecutive 2409 * failures for this target 2410 */ 2411 tg_found_success = _B_TRUE; 2412 } 2413 } 2414 } 2415 } 2416 2417 /* 2418 * Check if the phyint has been repaired. If no test address has been 2419 * configured, then consider the interface repaired if the link is up (unless 2420 * the link is flapping; see below). Otherwise, look for proof of probes 2421 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2422 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 
2423 */ 2424 static boolean_t 2425 phyint_repaired(struct phyint *pi) 2426 { 2427 struct probe_success_count psinfo; 2428 struct phyint_instance *pii; 2429 struct target *cur_tg; 2430 int pr_ndx; 2431 uint_t cur_time; 2432 2433 if (debug & D_FAILOVER) 2434 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2435 2436 if (LINK_DOWN(pi)) 2437 return (_B_FALSE); 2438 2439 /* 2440 * If we don't have any test addresses and the link is up, then 2441 * consider the interface repaired, unless we've received more than 2442 * LINK_UP_PERMIN link up notifications in the last minute, in 2443 * which case we keep the link down until we drop back below 2444 * the threshold. 2445 */ 2446 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2447 cur_time = getcurrenttime(); 2448 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2449 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2450 pi->pi_lfmsg_printed = 0; 2451 return (_B_TRUE); 2452 } 2453 if (!pi->pi_lfmsg_printed) { 2454 logerr("The link has come up on %s more than %d times " 2455 "in the last minute; disabling failback until it " 2456 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2457 pi->pi_lfmsg_printed = 1; 2458 } 2459 2460 return (_B_FALSE); 2461 } 2462 2463 pii = pi->pi_v4; 2464 if (PROBE_CAPABLE(pii)) { 2465 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2466 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2467 probe_success_info(pii, cur_tg, &psinfo); 2468 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2469 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2470 return (_B_TRUE); 2471 } 2472 2473 pii = pi->pi_v6; 2474 if (PROBE_CAPABLE(pii)) { 2475 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2476 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2477 probe_success_info(pii, cur_tg, &psinfo); 2478 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2479 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2480 return (_B_TRUE); 2481 } 2482 2483 return (_B_FALSE); 2484 } 2485 2486 /* 2487 * Try failover from phyint 'pi' to a suitable destination. 
2488 */ 2489 int 2490 try_failover(struct phyint *pi, int failover_type) 2491 { 2492 struct phyint *dst; 2493 int err; 2494 2495 if (debug & D_FAILOVER) 2496 logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 2497 2498 /* 2499 * Attempt to find a failover destination 'dst'. 2500 * dst will be null if any of the following is true 2501 * Phyint is not part of a group OR 2502 * Phyint is the only member of a group OR 2503 * No suitable failover dst was available 2504 */ 2505 dst = get_failover_dst(pi, failover_type); 2506 if (dst == NULL) 2507 return (IPMP_EMINRED); 2508 2509 dst->pi_empty = 0; /* Per state diagram */ 2510 pi->pi_full = 0; /* Per state diagram */ 2511 2512 err = failover(pi, dst); 2513 2514 if (debug & D_FAILOVER) { 2515 logdebug("failed over from %s to %s ret %d\n", 2516 pi->pi_name, dst->pi_name, err); 2517 } 2518 if (err == 0) { 2519 pi->pi_empty = 1; /* Per state diagram */ 2520 /* 2521 * we don't want to print out this message if a 2522 * phyint is leaving the group, nor for failover from 2523 * standby 2524 */ 2525 if (failover_type == FAILOVER_NORMAL) { 2526 logerr("Successfully failed over from NIC %s to NIC " 2527 "%s\n", pi->pi_name, dst->pi_name); 2528 } 2529 return (0); 2530 } else { 2531 /* 2532 * The failover did not succeed. We must retry the failover 2533 * only after resyncing our state based on the kernel's. 2534 * For eg. either the src or the dst might have been unplumbed 2535 * causing this failure. initifs() will be called again, 2536 * from main, since full_scan_required has been set to true 2537 * by failover(); 2538 */ 2539 return (IPMP_FAILURE); 2540 } 2541 } 2542 2543 /* 2544 * global_errno captures the errno value, if failover() or failback() 2545 * fails. This is sent to if_mpadm(1M). 2546 */ 2547 int global_errno; 2548 2549 /* 2550 * Attempt failover from phyint 'from' to phyint 'to'. 2551 * IP moves everything from phyint 'from' to phyint 'to'. 
2552 */ 2553 static int 2554 failover(struct phyint *from, struct phyint *to) 2555 { 2556 struct lifreq lifr; 2557 int ret; 2558 2559 if (debug & D_FAILOVER) { 2560 logdebug("failing over from %s to %s\n", 2561 from->pi_name, to->pi_name); 2562 } 2563 2564 /* 2565 * Perform the failover. Both IPv4 and IPv6 are failed over 2566 * using a single ioctl by passing in AF_UNSPEC family. 2567 */ 2568 lifr.lifr_addr.ss_family = AF_UNSPEC; 2569 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2570 lifr.lifr_movetoindex = to->pi_ifindex; 2571 2572 ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 2573 if (ret < 0) { 2574 global_errno = errno; 2575 logperror("failover: ioctl (failover)"); 2576 } 2577 2578 /* 2579 * Set full_scan_required to true. This will make us read 2580 * the state from the kernel in initifs() and update our tables, 2581 * to reflect the current state after the failover. If the 2582 * failover has failed it will then reissue the failover. 2583 */ 2584 full_scan_required = _B_TRUE; 2585 return (ret); 2586 } 2587 2588 /* 2589 * phyint 'pi' has recovered. Attempt failback from every phyint in the same 2590 * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 2591 * Return values: 2592 * IPMP_SUCCESS: Failback successful from each of the other 2593 * phyints in the group. 2594 * IPMP_EFBPARTIAL: Failback successful from some of the other 2595 * phyints in the group. 2596 * IPMP_FAILURE: Failback syscall failed with some error. 2597 * 2598 * Note that failback is attempted regardless of the setting of the 2599 * failback_enabled flag. 2600 */ 2601 int 2602 do_failback(struct phyint *pi, boolean_t check_only) 2603 { 2604 struct phyint *from; 2605 boolean_t done; 2606 boolean_t partial; 2607 boolean_t attempted_failback = _B_FALSE; 2608 2609 if (debug & D_FAILOVER) 2610 logdebug("do_failback(%s)\n", pi->pi_name); 2611 2612 /* If this phyint is not part of a named group, return. 
*/ 2613 if (pi->pi_group == phyint_anongroup) { 2614 pi->pi_full = 1; 2615 return (IPMP_SUCCESS); 2616 } 2617 2618 /* 2619 * Attempt failback from every phyint in the group to 'pi'. 2620 * The reason for doing this, instead of only from the 2621 * phyint to which we did the failover is given below. 2622 * 2623 * After 'pi' failed, if any app. tries to join on a multicast 2624 * address (IPv6), on the failed phyint, IP picks any arbitrary 2625 * non-failed phyint in the group, instead of the failed phyint, 2626 * in.mpathd is not aware of this. Thus failing back only from the 2627 * interface to which 'pi' failed over, will failback the ipif's 2628 * but not the ilm's. So we need to failback from all members of 2629 * the phyint group 2630 */ 2631 done = _B_TRUE; 2632 partial = _B_FALSE; 2633 for (from = pi->pi_group->pg_phyint; from != NULL; 2634 from = from->pi_pgnext) { 2635 /* Exclude ourself as a failback src */ 2636 if (from == pi) 2637 continue; 2638 2639 /* 2640 * If the 'from' phyint has IPv4 plumbed, the 'to' 2641 * phyint must also have IPv4 plumbed. Similar check 2642 * for IPv6. IP makes the same check. Otherwise the 2643 * failback will fail. 2644 */ 2645 if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 2646 (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 2647 partial = _B_TRUE; 2648 continue; 2649 } 2650 2651 if (!check_only) { 2652 pi->pi_empty = 0; /* Per state diagram */ 2653 attempted_failback = _B_TRUE; 2654 if (failback(from, pi) != 0) { 2655 done = _B_FALSE; 2656 break; 2657 } 2658 } 2659 } 2660 2661 if (check_only) { 2662 return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2663 } 2664 2665 /* 2666 * We are done. No more phyint from which we can src the failback 2667 */ 2668 if (done) { 2669 if (!partial) 2670 pi->pi_full = 1; /* Per state diagram */ 2671 /* 2672 * Don't print out a message unless there is a 2673 * transition from FAILED to RUNNING. For eg. 
2674 * we don't want to print out this message if a 2675 * phyint is leaving the group, or at startup 2676 */ 2677 if (attempted_failback && (pi->pi_flags & 2678 (IFF_FAILED | IFF_OFFLINE))) { 2679 logerr("Successfully failed back to NIC %s\n", 2680 pi->pi_name); 2681 } 2682 return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2683 } 2684 2685 return (IPMP_FAILURE); 2686 } 2687 2688 /* 2689 * This function is similar to do_failback() above, but respects the 2690 * failback_enabled flag for phyints in named groups. 2691 */ 2692 int 2693 try_failback(struct phyint *pi, boolean_t check_only) 2694 { 2695 if (debug & D_FAILOVER) 2696 logdebug("try_failback(%s)\n", pi->pi_name); 2697 2698 if (pi->pi_group != phyint_anongroup && !failback_enabled) 2699 return (IPMP_EFBDISABLED); 2700 2701 return (do_failback(pi, check_only)); 2702 } 2703 2704 /* 2705 * Failback everything from phyint 'from' that has the same ifindex 2706 * as phyint to's ifindex. 2707 */ 2708 static int 2709 failback(struct phyint *from, struct phyint *to) 2710 { 2711 struct lifreq lifr; 2712 int ret; 2713 2714 if (debug & D_FAILOVER) 2715 logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 2716 2717 lifr.lifr_addr.ss_family = AF_UNSPEC; 2718 (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2719 lifr.lifr_movetoindex = to->pi_ifindex; 2720 2721 ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 2722 if (ret < 0) { 2723 global_errno = errno; 2724 logperror("failback: ioctl (failback)"); 2725 } 2726 2727 /* 2728 * Set full_scan_required to true. This will make us read 2729 * the state from the kernel in initifs() and update our tables, 2730 * to reflect the current state after the failback. If the 2731 * failback has failed it will then reissue the failback. 2732 */ 2733 full_scan_required = _B_TRUE; 2734 2735 return (ret); 2736 } 2737 2738 /* 2739 * Select a target phyint for failing over from 'pi'. 2740 * In the normal case i.e. 
failover_type is FAILOVER_NORMAL, the preferred 2741 * target phyint is chosen as follows, 2742 * 1. Pick any inactive standby interface. 2743 * 2. If no inactive standby is available, select any phyint in the 2744 * same group that has the least number of logints, (excluding 2745 * IFF_NOFAILOVER and !IFF_UP logints) 2746 * If we are failing over from a standby, failover_type is 2747 * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 2748 * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 2749 * and we won't return NULL, as long as there is at least 1 other phyint 2750 * in the group. 2751 */ 2752 static struct phyint * 2753 get_failover_dst(struct phyint *pi, int failover_type) 2754 { 2755 struct phyint *maybe = NULL; 2756 struct phyint *pi2; 2757 struct phyint *last_choice = NULL; 2758 2759 if (pi->pi_group == phyint_anongroup) 2760 return (NULL); 2761 2762 /* 2763 * Loop thru the phyints in the group, and pick the preferred 2764 * phyint for the target. 2765 */ 2766 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2767 /* Exclude ourself and offlined interfaces */ 2768 if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 2769 continue; 2770 2771 /* 2772 * The chosen target phyint must have IPv4 instance 2773 * plumbed, if the src phyint has IPv4 plumbed. Similarly 2774 * for IPv6. 2775 */ 2776 if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 2777 (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 2778 continue; 2779 2780 /* The chosen target must be PI_RUNNING. 
*/ 2781 if (pi2->pi_state != PI_RUNNING) { 2782 last_choice = pi2; 2783 continue; 2784 } 2785 2786 if ((pi2->pi_flags & IFF_INACTIVE) && 2787 (failover_type != FAILOVER_TO_NONSTANDBY)) { 2788 return (pi2); 2789 } else { 2790 if (maybe == NULL) 2791 maybe = pi2; 2792 else if (logint_upcount(pi2) < logint_upcount(maybe)) 2793 maybe = pi2; 2794 } 2795 } 2796 if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 2797 return (last_choice); 2798 else 2799 return (maybe); 2800 } 2801 2802 /* 2803 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2804 */ 2805 boolean_t 2806 change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 2807 { 2808 int ifsock; 2809 struct lifreq lifr; 2810 2811 if (debug & D_FAILOVER) { 2812 logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 2813 pi->pi_name, flags, (int)setfl); 2814 } 2815 2816 if (pi->pi_v4 != NULL) { 2817 ifsock = ifsock_v4; 2818 } else { 2819 ifsock = ifsock_v6; 2820 } 2821 2822 /* 2823 * Get the current flags from the kernel, and set/clear the 2824 * desired phyint flags. Since we set only phyint flags, we can 2825 * do it on either IPv4 or IPv6 instance. 2826 */ 2827 (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2828 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 2829 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2830 if (errno != ENXIO) 2831 logperror("change_lif_flags: ioctl (get flags)"); 2832 return (_B_FALSE); 2833 } 2834 if (setfl) 2835 lifr.lifr_flags |= flags; 2836 else 2837 lifr.lifr_flags &= ~flags; 2838 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2839 if (errno != ENXIO) 2840 logperror("change_lif_flags: ioctl (set flags)"); 2841 return (_B_FALSE); 2842 } 2843 2844 /* 2845 * Keep pi_flags in synch. with actual flags. Assumes flags are 2846 * phyint flags. 
2847 */ 2848 if (setfl) 2849 pi->pi_flags |= flags; 2850 else 2851 pi->pi_flags &= ~flags; 2852 2853 if (pi->pi_v4) 2854 pi->pi_v4->pii_flags = pi->pi_flags; 2855 2856 if (pi->pi_v6) 2857 pi->pi_v6->pii_flags = pi->pi_flags; 2858 2859 return (_B_TRUE); 2860 } 2861 2862 /* 2863 * icmp cksum computation for IPv4. 2864 */ 2865 static int 2866 in_cksum(ushort_t *addr, int len) 2867 { 2868 register int nleft = len; 2869 register ushort_t *w = addr; 2870 register ushort_t answer; 2871 ushort_t odd_byte = 0; 2872 register int sum = 0; 2873 2874 /* 2875 * Our algorithm is simple, using a 32 bit accumulator (sum), 2876 * we add sequential 16 bit words to it, and at the end, fold 2877 * back all the carry bits from the top 16 bits into the lower 2878 * 16 bits. 2879 */ 2880 while (nleft > 1) { 2881 sum += *w++; 2882 nleft -= 2; 2883 } 2884 2885 /* mop up an odd byte, if necessary */ 2886 if (nleft == 1) { 2887 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2888 sum += odd_byte; 2889 } 2890 2891 /* 2892 * add back carry outs from top 16 bits to low 16 bits 2893 */ 2894 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2895 sum += (sum >> 16); /* add carry */ 2896 answer = ~sum; /* truncate to 16 bits */ 2897 return (answer); 2898 } 2899 2900 static void 2901 reset_snxt_basetimes(void) 2902 { 2903 struct phyint_instance *pii; 2904 2905 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2906 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2907 } 2908 } 2909 2910 /* 2911 * Is the address one of our own addresses? Unfortunately, 2912 * we cannot check our phyint tables to determine if the address 2913 * is our own. This is because, we don't track interfaces that 2914 * are not part of any group. We have to either use a 'bind' or 2915 * get the complete list of all interfaces using SIOCGLIFCONF, 2916 * to do this check. We choose to use 'bind'. We could use 2917 * SIOCTMYADDR, but bind is preferred, since it is stronger. 
2918 * SIOCTMYADDR excludes down interfaces, while bind includes even 2919 * down interfaces. 2920 */ 2921 boolean_t 2922 own_address(int af, struct in6_addr addr) 2923 { 2924 int sock; 2925 boolean_t ours = _B_TRUE; 2926 2927 sock = socket(AF_INET6, SOCK_DGRAM, 0); 2928 if (sock == -1) { 2929 logperror("own_address: socket"); 2930 /* 2931 * If the socket call fails, err on the side of caution, 2932 * and return true. 2933 */ 2934 } else { 2935 struct sockaddr_in6 sin6; 2936 2937 (void) memset(&sin6, 0, sizeof (struct sockaddr_in6)); 2938 sin6.sin6_family = AF_INET6; 2939 sin6.sin6_addr = addr; 2940 /* 2941 * If the bind succeeds, then this address is one of our 2942 * addresses. 2943 * If bind returns error EADDRNOTAVAIL, the address is 2944 * not one of ours. 2945 * If bind returns an error other than EADDRNOTAVAIL, err 2946 * on the side of caution and report the address as one of 2947 * our own. 2948 */ 2949 if (bind(sock, (struct sockaddr *)&sin6, 2950 sizeof (struct sockaddr_in6)) == -1) { 2951 if (errno == EADDRNOTAVAIL) 2952 ours = _B_FALSE; 2953 else 2954 logperror("own_address: bind"); 2955 } 2956 (void) close(sock); 2957 } 2958 if (debug & D_TARGET) { 2959 char abuf[INET6_ADDRSTRLEN]; 2960 2961 logdebug("own_address: addr %s is %s ours\n", 2962 pr_addr(af, addr, abuf, sizeof (abuf)), 2963 ours ? "one of" : "not"); 2964 } 2965 return (ours); 2966 } 2967