1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 * Copyright (c) 1987 Regents of the University of California. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms are permitted 11 * provided that the above copyright notice and this paragraph are 12 * duplicated in all such forms and that any documentation, 13 * advertising materials, and other materials related to such 14 * distribution and use acknowledge that the software was developed 15 * by the University of California, Berkeley. The name of the 16 * University may not be used to endorse or promote products derived 17 * from this software without specific prior written permission. 18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 21 */ 22 23 #include "mpd_defs.h" 24 #include "mpd_tables.h" 25 26 /* 27 * Probe types for probe() 28 */ 29 #define PROBE_UNI 0x1234 /* Unicast probe packet */ 30 #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 31 #define PROBE_RTT 0x9abc /* RTT only probe packet */ 32 33 #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 34 35 /* 36 * Format of probe / probe response packets. This is an ICMP Echo request 37 * or ICMP Echo reply. 
Packet format is same for both IPv4 and IPv6 38 */ 39 struct pr_icmp 40 { 41 uint8_t pr_icmp_type; /* type field */ 42 uint8_t pr_icmp_code; /* code field */ 43 uint16_t pr_icmp_cksum; /* checksum field */ 44 uint16_t pr_icmp_id; /* Identification */ 45 uint16_t pr_icmp_seq; /* sequence number */ 46 uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */ 47 uint32_t pr_icmp_mtype; /* Message type */ 48 }; 49 50 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 51 0x0, 0x0, 0x0, 0x0, 52 0x0, 0x0, 0x0, 0x0, 53 0x0, 0x0, 0x0, 0x1 } }; 54 55 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; 56 57 static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ 58 59 static void *find_ancillary(struct msghdr *msg, int cmsg_level, 60 int cmsg_type); 61 static void pi_set_crtt(struct target *tg, int64_t m, 62 boolean_t is_probe_uni); 63 static void incoming_echo_reply(struct phyint_instance *pii, 64 struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp); 65 static void incoming_rtt_reply(struct phyint_instance *pii, 66 struct pr_icmp *reply, struct in6_addr fromaddr); 67 static void incoming_mcast_reply(struct phyint_instance *pii, 68 struct pr_icmp *reply, struct in6_addr fromaddr); 69 70 static boolean_t check_pg_crtt_improved(struct phyint_group *pg); 71 static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); 72 static boolean_t check_exception_target(struct phyint_instance *pii, 73 struct target *target); 74 static void probe_fail_info(struct phyint_instance *pii, 75 struct target *cur_tg, struct probe_fail_count *pfinfo); 76 static void probe_success_info(struct phyint_instance *pii, 77 struct target *cur_tg, struct probe_success_count *psinfo); 78 static boolean_t phyint_repaired(struct phyint *pi); 79 80 static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); 81 static int in_cksum(ushort_t *addr, int len); 82 static void reset_snxt_basetimes(void); 83 static int 
ns2ms(int64_t ns); 84 static int64_t tv2ns(struct timeval *); 85 86 /* 87 * CRTT - Conservative Round Trip Time Estimate 88 * Probe success - A matching probe reply received before CRTT ms has elapsed 89 * after sending the probe. 90 * Probe failure - No probe reply received and more than CRTT ms has elapsed 91 * after sending the probe. 92 * 93 * TLS - Time last success. Most recent probe ack received at this time. 94 * TFF - Time first fail. The time of the earliest probe failure in 95 * a consecutive series of probe failures. 96 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required 97 * before declaring phyint repair. 98 * NUM_PROBE_FAILS - Number of consecutive probe failures required to 99 * declare a phyint failure. 100 * 101 * Phyint state diagram 102 * 103 * The state of a phyint that is capable of being probed, is completely 104 * specified by the 3-tuple <pi_state, pg_state, I>. 105 * 106 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether 107 * IFF_OFFLINE is set. If the phyint is also configured with a test address 108 * (the common case) and probe targets, then a phyint must also successfully 109 * be able to send and receive probes in order to remain in the PI_RUNNING 110 * state (otherwise, it transitions to PI_FAILED). 111 * 112 * Further, if a PI_RUNNING phyint is configured with a test address but is 113 * unable to find any probe targets, it will transition to the PI_NOTARGETS 114 * state, which indicates that the link is apparently functional but that 115 * in.mpathd is unable to send probes to verify functionality (in this case, 116 * in.mpathd makes the optimistic assumption that the interface is working 117 * correctly and thus does not mark the interface FAILED, but reports it as 118 * IPMP_IF_UNKNOWN through the async events and query interfaces). 119 * 120 * At any point, a phyint may be administratively marked offline via if_mpadm. 
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state.  When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state -  PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
 *	PG_OK: All interfaces in the group are OK.
 *	PG_DEGRADED: Some interfaces in the group are unusable.
 *	PG_FAILED: All interfaces in the group are unusable.
 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts address, to
 *	reconstruct the target list. So the phyints are in the PI_NOTARGETS
 *	state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: This phyint will not send or receive packets.
 *	Usually, inactive is tied to standby interfaces that are not yet
 *	needed (e.g., no non-standby interfaces in the group have failed).
 *	When failback has been disabled (FAILBACK=no configured), phyint can
 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 *	subsequently recovers after a failure.
 *
 * Not all 9 possible combinations of the above 3-tuple are possible.
 *
 * I is tracked by IP. pi_state is tracked by mpathd.
153 * 154 * pi_state state machine 155 * --------------------------------------------------------------------------- 156 * Event State New State 157 * Action: 158 * --------------------------------------------------------------------------- 159 * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 160 * detection : set IFF_FAILED on this phyint 161 * 162 * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 163 * detection : set IFF_FAILED on this phyint 164 * 165 * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) 166 * detection -> (PI_RUNNING, I == 0) 167 * : clear IFF_FAILED on this phyint 168 * 169 * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) 170 * detection -> (PI_RUNNING, I == 1) 171 * : clear IFF_FAILED on this phyint 172 * : if failback is disabled set I == 1 173 * 174 * Group failure (perform on all phyints in the group) 175 * detection PI_RUNNING PI_FAILED 176 * (Router targets) : set IFF_FAILED 177 * 178 * Group failure (perform on all phyints in the group) 179 * detection PI_RUNNING PI_NOTARGETS 180 * (Host targets) : set IFF_FAILED 181 * : delete the target list on all phyints 182 * --------------------------------------------------------------------------- 183 */ 184 185 struct probes_missed probes_missed; 186 187 /* 188 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 189 * will be added on by the kernel. The id field identifies this phyint. 190 * and the sequence number is an increasing (modulo 2^^16) integer. The data 191 * portion holds the time value when the packet is sent. On echo this is 192 * extracted to compute the round-trip time. Three different types of 193 * probe packets are used. 194 * 195 * PROBE_UNI: This type is used to do failure detection / failure recovery 196 * and RTT calculation. PROBE_UNI probes are spaced apart in time, 197 * not less than the current CRTT. pii_probes[] stores data 198 * about these probes. These packets consume sequence number space. 
199 * 200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these 201 * are not used. Under heavy network load, the rtt may go up very high, 202 * due to a spike, or may appear to go high, due to extreme scheduling 203 * delays. Once the network stress is removed, mpathd takes long time to 204 * recover, because the probe_interval is already high, and it takes 205 * a long time to send out sufficient number of probes to bring down the 206 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 207 * user_probe_interval ms. and will cause only rtt updates. These packets 208 * do not consume sequence number space nor is information about these 209 * packets stored in the pii_probes[] 210 * 211 * PROBE_MULTI: This type is only used to construct a list of targets, when 212 * no targets are known. The packet is multicast to the all hosts addr. 213 */ 214 static void 215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) 216 { 217 hrtime_t sent_hrtime; 218 struct timeval sent_tv; 219 struct pr_icmp probe_pkt; /* Probe packet */ 220 struct sockaddr_storage targ; /* target address */ 221 uint_t targaddrlen; /* targed address length */ 222 int pr_ndx; /* probe index in pii->pii_probes[] */ 223 boolean_t sent = _B_FALSE; 224 int rval; 225 226 if (debug & D_TARGET) { 227 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), 228 pii->pii_name, probe_type, start_hrtime); 229 } 230 231 assert(pii->pii_probe_sock != -1); 232 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 233 probe_type == PROBE_RTT); 234 235 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 236 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 237 probe_pkt.pr_icmp_code = 0; 238 probe_pkt.pr_icmp_cksum = 0; 239 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 240 241 /* 242 * Since there is no need to do arithmetic on the icmpid, 243 * (only equality check is done) pii_icmpid is stored in 244 * network byte order at initialization itself. 
245 */ 246 probe_pkt.pr_icmp_id = pii->pii_icmpid; 247 probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); 248 probe_pkt.pr_icmp_mtype = htonl(probe_type); 249 250 /* 251 * If probe_type is PROBE_MULTI, this packet will be multicast to 252 * the all hosts address. Otherwise it is unicast to the next target. 253 */ 254 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 255 pii->pii_rtt_target_next != NULL)); 256 257 bzero(&targ, sizeof (targ)); 258 targ.ss_family = pii->pii_af; 259 260 if (pii->pii_af == AF_INET6) { 261 struct in6_addr *addr6; 262 263 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; 264 targaddrlen = sizeof (struct sockaddr_in6); 265 if (probe_type == PROBE_MULTI) { 266 *addr6 = all_nodes_mcast_v6; 267 } else if (probe_type == PROBE_UNI) { 268 *addr6 = pii->pii_target_next->tg_address; 269 } else { /* type is PROBE_RTT */ 270 *addr6 = pii->pii_rtt_target_next->tg_address; 271 } 272 } else { 273 struct in_addr *addr4; 274 275 addr4 = &((struct sockaddr_in *)&targ)->sin_addr; 276 targaddrlen = sizeof (struct sockaddr_in); 277 if (probe_type == PROBE_MULTI) { 278 *addr4 = all_nodes_mcast_v4; 279 } else if (probe_type == PROBE_UNI) { 280 IN6_V4MAPPED_TO_INADDR( 281 &pii->pii_target_next->tg_address, addr4); 282 } else { /* type is PROBE_RTT */ 283 IN6_V4MAPPED_TO_INADDR( 284 &pii->pii_rtt_target_next->tg_address, addr4); 285 } 286 287 /* 288 * Compute the IPv4 icmp checksum. Does not cover the IP header. 289 */ 290 probe_pkt.pr_icmp_cksum = 291 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 292 } 293 294 /* 295 * Use the current time as the time we sent. Not atomic, but the best 296 * we can do from here. 297 */ 298 sent_hrtime = gethrtime(); 299 (void) gettimeofday(&sent_tv, NULL); 300 rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, 301 (struct sockaddr *)&targ, targaddrlen); 302 /* 303 * If the send would block, this may either be transient or a hang in a 304 * lower layer. 
We pretend the probe was actually sent, the daemon will 305 * not see a reply to the probe and will fail the interface if normal 306 * failure detection criteria are met. 307 */ 308 if (rval == sizeof (probe_pkt) || 309 (rval == -1 && errno == EWOULDBLOCK)) { 310 sent = _B_TRUE; 311 } else { 312 logperror_pii(pii, "probe: probe sendto"); 313 } 314 315 /* 316 * If this is a PROBE_UNI probe packet being unicast to a target, then 317 * update our tables. We will need this info in processing the probe 318 * response. PROBE_MULTI and PROBE_RTT packets are not used for 319 * the purpose of failure or recovery detection. PROBE_MULTI packets 320 * are only used to construct a list of targets. PROBE_RTT packets are 321 * used only for updating the rtt and not for failure detection. 322 */ 323 if (probe_type == PROBE_UNI && sent) { 324 pr_ndx = pii->pii_probe_next; 325 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 326 327 /* Collect statistics, before we reuse the last slot. */ 328 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 329 pii->pii_cum_stats.lost++; 330 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 331 pii->pii_cum_stats.acked++; 332 pii->pii_cum_stats.sent++; 333 334 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; 335 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; 336 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; 337 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; 338 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 339 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); 340 341 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 342 pii->pii_target_next = target_next(pii->pii_target_next); 343 assert(pii->pii_target_next != NULL); 344 /* 345 * If we have a single variable to denote the next target to 346 * probe for both rtt probes and failure detection probes, we 347 * could end up with a situation where the failure detection 348 * probe targets become disjoint from the rtt probe targets. 349 * Eg. 
if 2 targets and the actual fdt is double the user 350 * specified fdt. So we have 2 variables. In this scheme 351 * we also reset pii_rtt_target_next for every fdt probe, 352 * though that may not be necessary. 353 */ 354 pii->pii_rtt_target_next = pii->pii_target_next; 355 pii->pii_snxt++; 356 } else if (probe_type == PROBE_RTT) { 357 pii->pii_rtt_target_next = 358 target_next(pii->pii_rtt_target_next); 359 assert(pii->pii_rtt_target_next != NULL); 360 } 361 } 362 363 /* 364 * Incoming IPv4 data from wire, is received here. Called from main. 365 */ 366 void 367 in_data(struct phyint_instance *pii) 368 { 369 struct sockaddr_in from; 370 struct in6_addr fromaddr; 371 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 372 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 373 struct ip *ip; 374 int iphlen; 375 int len; 376 char abuf[INET_ADDRSTRLEN]; 377 struct msghdr msg; 378 struct iovec iov; 379 struct pr_icmp *reply; 380 struct timeval *recv_tvp; 381 382 if (debug & D_PROBE) { 383 logdebug("in_data(%s %s)\n", 384 AF_STR(pii->pii_af), pii->pii_name); 385 } 386 387 iov.iov_base = (char *)in_packet; 388 iov.iov_len = sizeof (in_packet); 389 msg.msg_iov = &iov; 390 msg.msg_iovlen = 1; 391 msg.msg_name = (struct sockaddr *)&from; 392 msg.msg_namelen = sizeof (from); 393 msg.msg_control = ancillary_data; 394 msg.msg_controllen = sizeof (ancillary_data); 395 396 /* 397 * Poll has already told us that a message is waiting, 398 * on this socket. Read it now. We should not block. 399 */ 400 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 401 logperror_pii(pii, "in_data: recvmsg"); 402 return; 403 } 404 405 /* 406 * If the datalink has indicated the link is down, don't go 407 * any further. 
408 */ 409 if (LINK_DOWN(pii->pii_phyint)) 410 return; 411 412 /* Get the printable address for error reporting */ 413 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 414 415 /* Ignore packets > 64k or control buffers that don't fit */ 416 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 417 if (debug & D_PKTBAD) { 418 logdebug("Truncated message: msg_flags 0x%x from %s\n", 419 msg.msg_flags, abuf); 420 } 421 return; 422 } 423 424 /* Make sure packet contains at least minimum ICMP header */ 425 ip = (struct ip *)in_packet; 426 iphlen = ip->ip_hl << 2; 427 if (len < iphlen + ICMP_MINLEN) { 428 if (debug & D_PKTBAD) { 429 logdebug("in_data: packet too short (%d bytes)" 430 " from %s\n", len, abuf); 431 } 432 return; 433 } 434 435 /* 436 * Subtract the IP hdr length, 'len' will be length of the probe 437 * reply, starting from the icmp hdr. 438 */ 439 len -= iphlen; 440 /* LINTED */ 441 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 442 443 /* Probe replies are icmp echo replies. Ignore anything else */ 444 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 445 return; 446 447 /* 448 * The icmp id should match what we sent, which is stored 449 * in pi_icmpid. The icmp code for reply must be 0. 
450 * The reply content must be a struct pr_icmp 451 */ 452 if (reply->pr_icmp_id != pii->pii_icmpid) { 453 /* Not in response to our probe */ 454 return; 455 } 456 457 if (reply->pr_icmp_code != 0) { 458 logtrace("probe reply code %d from %s on %s\n", 459 reply->pr_icmp_code, abuf, pii->pii_name); 460 return; 461 } 462 463 if (len < sizeof (struct pr_icmp)) { 464 logtrace("probe reply too short: %d bytes from %s on %s\n", 465 len, abuf, pii->pii_name); 466 return; 467 } 468 469 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 470 if (recv_tvp == NULL) { 471 logtrace("message without timestamp from %s on %s\n", 472 abuf, pii->pii_name); 473 return; 474 } 475 476 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 477 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 478 /* Unicast probe reply */ 479 incoming_echo_reply(pii, reply, fromaddr, recv_tvp); 480 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 481 /* Multicast reply */ 482 incoming_mcast_reply(pii, reply, fromaddr); 483 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 484 incoming_rtt_reply(pii, reply, fromaddr); 485 } else { 486 /* Probably not in response to our probe */ 487 logtrace("probe reply type: %d from %s on %s\n", 488 reply->pr_icmp_mtype, abuf, pii->pii_name); 489 return; 490 } 491 } 492 493 /* 494 * Incoming IPv6 data from wire is received here. Called from main. 
495 */ 496 void 497 in6_data(struct phyint_instance *pii) 498 { 499 struct sockaddr_in6 from; 500 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 501 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 502 int len; 503 char abuf[INET6_ADDRSTRLEN]; 504 struct msghdr msg; 505 struct iovec iov; 506 void *opt; 507 struct pr_icmp *reply; 508 struct timeval *recv_tvp; 509 510 if (debug & D_PROBE) { 511 logdebug("in6_data(%s %s)\n", 512 AF_STR(pii->pii_af), pii->pii_name); 513 } 514 515 iov.iov_base = (char *)in_packet; 516 iov.iov_len = sizeof (in_packet); 517 msg.msg_iov = &iov; 518 msg.msg_iovlen = 1; 519 msg.msg_name = (struct sockaddr *)&from; 520 msg.msg_namelen = sizeof (from); 521 msg.msg_control = ancillary_data; 522 msg.msg_controllen = sizeof (ancillary_data); 523 524 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 525 logperror_pii(pii, "in6_data: recvmsg"); 526 return; 527 } 528 529 /* 530 * If the datalink has indicated that the link is down, don't go 531 * any further. 532 */ 533 if (LINK_DOWN(pii->pii_phyint)) 534 return; 535 536 /* Get the printable address for error reporting */ 537 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 538 if (len < ICMP_MINLEN) { 539 if (debug & D_PKTBAD) { 540 logdebug("Truncated message: msg_flags 0x%x from %s\n", 541 msg.msg_flags, abuf); 542 } 543 return; 544 } 545 /* Ignore packets > 64k or control buffers that don't fit */ 546 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 547 if (debug & D_PKTBAD) { 548 logdebug("Truncated message: msg_flags 0x%x from %s\n", 549 msg.msg_flags, abuf); 550 } 551 return; 552 } 553 554 reply = (struct pr_icmp *)in_packet; 555 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 556 return; 557 558 if (reply->pr_icmp_id != pii->pii_icmpid) { 559 /* Not in response to our probe */ 560 return; 561 } 562 563 /* 564 * The kernel has already verified the the ICMP checksum. 
565 */ 566 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 567 logtrace("ICMPv6 echo reply source address not linklocal from " 568 "%s on %s\n", abuf, pii->pii_name); 569 return; 570 } 571 opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); 572 if (opt != NULL) { 573 /* Can't allow routing headers in probe replies */ 574 logtrace("message with routing header from %s on %s\n", 575 abuf, pii->pii_name); 576 return; 577 } 578 579 if (reply->pr_icmp_code != 0) { 580 logtrace("probe reply code: %d from %s on %s\n", 581 reply->pr_icmp_code, abuf, pii->pii_name); 582 return; 583 } 584 if (len < (sizeof (struct pr_icmp))) { 585 logtrace("probe reply too short: %d bytes from %s on %s\n", 586 len, abuf, pii->pii_name); 587 return; 588 } 589 590 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 591 if (recv_tvp == NULL) { 592 logtrace("message without timestamp from %s on %s\n", 593 abuf, pii->pii_name); 594 return; 595 } 596 597 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 598 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); 599 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 600 incoming_mcast_reply(pii, reply, from.sin6_addr); 601 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 602 incoming_rtt_reply(pii, reply, from.sin6_addr); 603 } else { 604 /* Probably not in response to our probe */ 605 logtrace("probe reply type: %d from %s on %s\n", 606 reply->pr_icmp_mtype, abuf, pii->pii_name); 607 } 608 } 609 610 /* 611 * Process the incoming rtt reply, in response to our rtt probe. 612 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 613 * have any stored information about the probe we sent. So we don't log 614 * any errors if we receive bad replies. 
615 */ 616 static void 617 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 618 struct in6_addr fromaddr) 619 { 620 int64_t m; /* rtt measurement in ns */ 621 char abuf[INET6_ADDRSTRLEN]; 622 struct target *target; 623 struct phyint_group *pg; 624 625 /* Get the printable address for error reporting */ 626 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 627 628 if (debug & D_PROBE) { 629 logdebug("incoming_rtt_reply: %s %s %s\n", 630 AF_STR(pii->pii_af), pii->pii_name, abuf); 631 } 632 633 /* Do we know this target ? */ 634 target = target_lookup(pii, fromaddr); 635 if (target == NULL) 636 return; 637 638 m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); 639 /* Invalid rtt. It has wrapped around */ 640 if (m < 0) 641 return; 642 643 /* 644 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 645 * The initial few responses after the interface is repaired may 646 * contain high rtt's because they could have been queued up waiting 647 * for ARP/NDP resolution on a failed interface. 648 */ 649 pg = pii->pii_phyint->pi_group; 650 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 651 return; 652 653 /* 654 * Update rtt only if the new rtt is lower than the current rtt. 655 * (specified by the 3rd parameter to pi_set_crtt). 656 * If a spike has caused the current probe_interval to be > 657 * user_probe_interval, then this mechanism is used to bring down 658 * the rtt rapidly once the network stress is removed. 659 * If the new rtt is higher than the current rtt, we don't want to 660 * update the rtt. We are having more than 1 outstanding probe and 661 * the increase in rtt we are seeing is being unnecessarily weighted 662 * many times. The regular rtt update will be handled by 663 * incoming_echo_reply() and will take care of any rtt increase. 
664 */ 665 pi_set_crtt(target, m, _B_FALSE); 666 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 667 (user_failure_detection_time < pg->pg_fdt) && 668 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 669 /* 670 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 671 * investigate if we can improve the failure detection time to 672 * meet whatever the user specified. 673 */ 674 if (check_pg_crtt_improved(pg)) { 675 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 676 user_failure_detection_time); 677 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 678 if (pii->pii_phyint->pi_group != phyint_anongroup) { 679 logerr("Improved failure detection time %d ms " 680 "on (%s %s) for group \"%s\"\n", 681 pg->pg_fdt, AF_STR(pii->pii_af), 682 pii->pii_name, 683 pii->pii_phyint->pi_group->pg_name); 684 } 685 if (user_failure_detection_time == pg->pg_fdt) { 686 /* Avoid any truncation or rounding errors */ 687 pg->pg_probeint = user_probe_interval; 688 /* 689 * No more rtt probes will be sent. The actual 690 * fdt has dropped to the user specified value. 691 * pii_fd_snxt_basetime and pii_snxt_basetime 692 * will be in sync henceforth. 693 */ 694 reset_snxt_basetimes(); 695 } 696 } 697 } 698 } 699 700 /* 701 * Process the incoming echo reply, in response to our unicast probe. 
702 * Common for both IPv4 and IPv6 703 */ 704 static void 705 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 706 struct in6_addr fromaddr, struct timeval *recv_tvp) 707 { 708 int64_t m; /* rtt measurement in ns */ 709 hrtime_t cur_hrtime; /* in ns from some arbitrary point */ 710 char abuf[INET6_ADDRSTRLEN]; 711 int pr_ndx; 712 struct target *target; 713 boolean_t exception; 714 uint64_t pr_icmp_timestamp; 715 uint16_t pr_icmp_seq; 716 struct probe_stats *pr_statp; 717 struct phyint_group *pg = pii->pii_phyint->pi_group; 718 719 /* Get the printable address for error reporting */ 720 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 721 722 if (debug & D_PROBE) { 723 logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n", 724 AF_STR(pii->pii_af), pii->pii_name, abuf, 725 ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp)); 726 } 727 728 pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp); 729 pr_icmp_seq = ntohs(reply->pr_icmp_seq); 730 731 /* Reject out of window probe replies */ 732 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 733 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 734 logtrace("out of window probe seq %u snxt %u on %s from %s\n", 735 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 736 pii->pii_cum_stats.unknown++; 737 return; 738 } 739 740 cur_hrtime = gethrtime(); 741 m = (int64_t)(cur_hrtime - pr_icmp_timestamp); 742 if (m < 0) { 743 /* 744 * This is a ridiculously high value of rtt. rtt has wrapped 745 * around. Log a message, and ignore the rtt. 746 */ 747 logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld " 748 "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp); 749 } 750 751 /* 752 * Get the probe index pr_ndx corresponding to the received icmp seq. 753 * number in our pii->pii_probes[] array. 
The icmp sequence number 754 * pii_snxt corresponds to the probe index pii->pii_probe_next 755 */ 756 pr_ndx = MOD_SUB(pii->pii_probe_next, 757 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 758 759 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 760 761 target = pii->pii_probes[pr_ndx].pr_target; 762 763 /* 764 * Perform sanity checks, whether this probe reply that we 765 * have received is genuine 766 */ 767 if (target != NULL) { 768 /* 769 * Compare the src. addr of the received ICMP or ICMPv6 770 * probe reply with the target address in our tables. 771 */ 772 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 773 /* 774 * We don't have any record of having sent a probe to 775 * this target. This is a fake probe reply. Log an error 776 */ 777 logtrace("probe status %d Fake probe reply seq %u " 778 "snxt %u on %s from %s\n", 779 pii->pii_probes[pr_ndx].pr_status, 780 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 781 pii->pii_cum_stats.unknown++; 782 return; 783 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 784 /* 785 * The address matches, but our tables indicate that 786 * this probe reply has been acked already. So this 787 * is a duplicate probe reply. Log an error 788 */ 789 logtrace("probe status %d Duplicate probe reply seq %u " 790 "snxt %u on %s from %s\n", 791 pii->pii_probes[pr_ndx].pr_status, 792 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 793 pii->pii_cum_stats.unknown++; 794 return; 795 } 796 } else { 797 /* 798 * Target must not be NULL in the PR_UNACKED state 799 */ 800 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 801 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 802 /* 803 * The probe stats slot is unused. So we didn't 804 * send out any probe to this target. This is a fake. 805 * Log an error. 
806 */ 807 logtrace("probe status %d Fake probe reply seq %u " 808 "snxt %u on %s from %s\n", 809 pii->pii_probes[pr_ndx].pr_status, 810 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 811 } 812 pii->pii_cum_stats.unknown++; 813 return; 814 } 815 816 /* 817 * If the rtt does not appear to be right, don't update the 818 * rtt stats. This can happen if the system dropped into the 819 * debugger, or the system was hung or too busy for a 820 * substantial time that we didn't get a chance to run. 821 */ 822 if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) { 823 /* 824 * If the probe corresponding to this received response 825 * was truly sent 'm' ns. ago, then this response must 826 * have been rejected by the sequence number checks. The 827 * fact that it has passed the sequence number checks 828 * means that the measured rtt is wrong. We were probably 829 * scheduled long after the packet was received. 830 */ 831 goto out; 832 } 833 834 /* 835 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 836 * The initial few responses after the interface is repaired may 837 * contain high rtt's because they could have been queued up waiting 838 * for ARP/NDP resolution on a failed interface. 839 */ 840 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 841 goto out; 842 843 /* 844 * Don't update the Conservative Round Trip Time estimate for this 845 * (phint, target) pair if this is the not the highest ack seq seen 846 * thus far on this target. 847 */ 848 if (!highest_ack_tg(pr_icmp_seq, target)) 849 goto out; 850 851 /* 852 * Always update the rtt. This is a failure detection probe 853 * and we want to measure both increase / decrease in rtt. 854 */ 855 pi_set_crtt(target, m, _B_TRUE); 856 857 /* 858 * If the crtt exceeds the average time between probes, 859 * investigate if this slow target is an exception. If so we 860 * can avoid this target and still meet the failure detection 861 * time. 
Otherwise we can't meet the failure detection time. 862 */ 863 if (target->tg_crtt > pg->pg_probeint) { 864 exception = check_exception_target(pii, target); 865 if (exception) { 866 /* 867 * This target is exceptionally slow. Don't use it 868 * for future probes. check_exception_target() has 869 * made sure that we have at least MIN_PROBE_TARGETS 870 * other active targets 871 */ 872 if (pii->pii_targets_are_routers) { 873 /* 874 * This is a slow router, mark it as slow 875 * and don't use it for further probes. We 876 * don't delete it, since it will be populated 877 * again when we do a router scan. Hence we 878 * need to maintain extra state (unlike the 879 * host case below). Mark it as TG_SLOW. 880 */ 881 if (target->tg_status == TG_ACTIVE) 882 pii->pii_ntargets--; 883 target->tg_status = TG_SLOW; 884 target->tg_latime = gethrtime(); 885 target->tg_rtt_sa = -1; 886 target->tg_crtt = 0; 887 target->tg_rtt_sd = 0; 888 if (pii->pii_target_next == target) { 889 pii->pii_target_next = 890 target_next(target); 891 } 892 } else { 893 /* 894 * the slow target is not a router, we can 895 * just delete it. Send an icmp multicast and 896 * pick the fastest responder that is not 897 * already an active target. target_delete() 898 * adjusts pii->pii_target_next 899 */ 900 target_delete(target); 901 probe(pii, PROBE_MULTI, cur_hrtime); 902 } 903 } else { 904 /* 905 * We can't meet the failure detection time. 906 * Log a message, and update the detection time to 907 * whatever we can achieve. 
908 */ 909 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 910 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 911 last_fdt_bumpup_time = gethrtime(); 912 if (pg != phyint_anongroup) { 913 logtrace("Cannot meet requested failure" 914 " detection time of %d ms on (%s %s) new" 915 " failure detection time for group \"%s\"" 916 " is %d ms\n", user_failure_detection_time, 917 AF_STR(pii->pii_af), pii->pii_name, 918 pg->pg_name, pg->pg_fdt); 919 } 920 } 921 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 922 (user_failure_detection_time < pg->pg_fdt) && 923 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 924 /* 925 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 926 * investigate if we can improve the failure detection time to 927 * meet whatever the user specified. 928 */ 929 if (check_pg_crtt_improved(pg)) { 930 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 931 user_failure_detection_time); 932 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 933 if (pg != phyint_anongroup) { 934 logtrace("Improved failure detection time %d ms" 935 " on (%s %s) for group \"%s\"\n", 936 pg->pg_fdt, AF_STR(pii->pii_af), 937 pii->pii_name, pg->pg_name); 938 } 939 if (user_failure_detection_time == pg->pg_fdt) { 940 /* Avoid any truncation or rounding errors */ 941 pg->pg_probeint = user_probe_interval; 942 /* 943 * No more rtt probes will be sent. The actual 944 * fdt has dropped to the user specified value. 945 * pii_fd_snxt_basetime and pii_snxt_basetime 946 * will be in sync henceforth. 947 */ 948 reset_snxt_basetimes(); 949 } 950 } 951 } 952 out: 953 pr_statp = &pii->pii_probes[pr_ndx]; 954 pr_statp->pr_hrtime_ackproc = cur_hrtime; 955 pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent + 956 (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent)); 957 958 probe_chstate(pr_statp, pii, PR_ACKED); 959 960 /* 961 * Update pii->pii_rack, i.e. 
the sequence number of the last received 962 * probe response, based on the echo reply we have received now, if 963 * either of the following conditions are satisfied. 964 * a. pii_rack is outside the current receive window of 965 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 966 * This means we have not received probe responses for a 967 * long time, and the sequence number has wrapped around. 968 * b. pii_rack is within the current receive window and this echo 969 * reply corresponds to the highest sequence number we have seen 970 * so far. 971 */ 972 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 973 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 974 SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 975 pii->pii_rack = pr_icmp_seq; 976 } 977 } 978 979 /* 980 * Returns true if seq is the highest unacknowledged seq for target tg 981 * else returns false 982 */ 983 static boolean_t 984 highest_ack_tg(uint16_t seq, struct target *tg) 985 { 986 struct phyint_instance *pii; 987 int pr_ndx; 988 uint16_t pr_seq; 989 990 pii = tg->tg_phyint_inst; 991 992 /* 993 * Get the seq number of the most recent probe sent so far, 994 * and also get the corresponding probe index in the probe stats 995 * array. 996 */ 997 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 998 pr_seq = pii->pii_snxt; 999 pr_seq--; 1000 1001 /* 1002 * Start from the most recent probe and walk back, trying to find 1003 * an acked probe corresponding to target tg. 1004 */ 1005 for (; pr_ndx != pii->pii_probe_next; 1006 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 1007 if (pii->pii_probes[pr_ndx].pr_target == tg && 1008 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 1009 if (SEQ_GT(pr_seq, seq)) 1010 return (_B_FALSE); 1011 } 1012 } 1013 return (_B_TRUE); 1014 } 1015 1016 /* 1017 * Check whether the crtt for the group has improved by a factor of 1018 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1019 * detection time flapping in the face of small crtt changes. 
1020 */ 1021 static boolean_t 1022 check_pg_crtt_improved(struct phyint_group *pg) 1023 { 1024 struct phyint *pi; 1025 1026 if (debug & D_PROBE) 1027 logdebug("check_pg_crtt_improved()\n"); 1028 1029 /* 1030 * The crtt for the group is only improved if each phyint_instance 1031 * for both ipv4 and ipv6 is improved. 1032 */ 1033 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1034 if (!check_pii_crtt_improved(pi->pi_v4) || 1035 !check_pii_crtt_improved(pi->pi_v6)) 1036 return (_B_FALSE); 1037 } 1038 1039 return (_B_TRUE); 1040 } 1041 1042 /* 1043 * Check whether the crtt has improved substantially on this phyint_instance. 1044 * Returns _B_TRUE if there's no crtt information available, because pii 1045 * is NULL or the phyint_instance is not capable of probing. 1046 */ 1047 boolean_t 1048 check_pii_crtt_improved(struct phyint_instance *pii) { 1049 struct target *tg; 1050 1051 if (pii == NULL) 1052 return (_B_TRUE); 1053 1054 if (!PROBE_CAPABLE(pii) || 1055 pii->pii_phyint->pi_state == PI_FAILED) 1056 return (_B_TRUE); 1057 1058 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1059 if (tg->tg_status != TG_ACTIVE) 1060 continue; 1061 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1062 LOWER_FDT_TRIGGER)) { 1063 return (_B_FALSE); 1064 } 1065 } 1066 1067 return (_B_TRUE); 1068 } 1069 1070 /* 1071 * This target responds very slowly to probes. The target's crtt exceeds 1072 * the probe interval of its group. 
Compare against other targets
 * and determine if this target is an exception, if so return true, else false
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	struct target *peer;
	char addrstr[INET6_ADDRSTRLEN];

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    addrstr, sizeof (addrstr)));
	}

	/*
	 * Without at least MIN_PROBE_TARGETS + 1 targets counted in
	 * pii_ntargets there is not enough to compare against, so the
	 * target is never declared an exception.
	 */
	if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
		return (_B_FALSE);

	/*
	 * The caller has established that this target's crtt exceeds the
	 * group's probe interval.  It is an exception only when no other
	 * TG_ACTIVE target has a crtt above
	 * (group's probe interval) / EXCEPTION_FACTOR.
	 */
	for (peer = pii->pii_targets; peer != NULL; peer = peer->tg_next) {
		if (peer == target || peer->tg_status != TG_ACTIVE)
			continue;

		if (peer->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint /
		    EXCEPTION_FACTOR)
			return (_B_FALSE);
	}

	return (_B_TRUE);
}

/*
 * Update the target list. The icmp all hosts multicast has given us
 * some host to which we can send probes. If we already have sufficient
 * targets, discard it.
1119 */ 1120 static void 1121 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1122 struct in6_addr fromaddr) 1123 /* ARGSUSED */ 1124 { 1125 int af; 1126 char abuf[INET6_ADDRSTRLEN]; 1127 struct phyint *pi; 1128 1129 if (debug & D_PROBE) { 1130 logdebug("incoming_mcast_reply(%s %s %s)\n", 1131 AF_STR(pii->pii_af), pii->pii_name, 1132 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1133 } 1134 1135 /* 1136 * Using host targets is a fallback mechanism. If we have 1137 * found a router, don't add this host target. If we already 1138 * know MAX_PROBE_TARGETS, don't add another target. 1139 */ 1140 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1141 if (pii->pii_targets != NULL) { 1142 if (pii->pii_targets_are_routers || 1143 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1144 return; 1145 } 1146 } 1147 1148 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1149 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1150 /* 1151 * Guard against response from 0.0.0.0 1152 * and ::. Log a trace message 1153 */ 1154 logtrace("probe response from %s on %s\n", 1155 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1156 pii->pii_name); 1157 return; 1158 } 1159 1160 /* 1161 * This address is one of our own, so reject this address as a 1162 * valid probe target. 1163 */ 1164 af = pii->pii_af; 1165 if (own_address(fromaddr)) 1166 return; 1167 1168 /* 1169 * If the phyint is part a named group, then add the address to all 1170 * members of the group. Otherwise, add the address only to the 1171 * phyint itself, since other phyints in the anongroup may not be on 1172 * the same subnet. 
1173 */ 1174 pi = pii->pii_phyint; 1175 if (pi->pi_group == phyint_anongroup) { 1176 target_add(pii, fromaddr, _B_FALSE); 1177 } else { 1178 pi = pi->pi_group->pg_phyint; 1179 for (; pi != NULL; pi = pi->pi_pgnext) 1180 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1181 } 1182 } 1183 1184 /* 1185 * Compute CRTT given an existing scaled average, scaled deviation estimate 1186 * and a new rtt time. The formula is from Jacobson and Karels' 1187 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1188 * are the same as those in Appendix A.2 of that paper. 1189 * 1190 * m = new measurement 1191 * sa = scaled RTT average (8 * average estimates) 1192 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1193 * crtt = Conservative round trip time. Used to determine whether probe 1194 * has timed out. 1195 * 1196 * New scaled average and deviation are passed back via sap and svp 1197 */ 1198 static int64_t 1199 compute_crtt(int64_t *sap, int64_t *svp, int64_t m) 1200 { 1201 int64_t sa = *sap; 1202 int64_t sv = *svp; 1203 int64_t crtt; 1204 int64_t saved_m = m; 1205 1206 assert(*sap >= -1); 1207 assert(*svp >= 0); 1208 1209 if (sa != -1) { 1210 /* 1211 * Update average estimator: 1212 * new rtt = old rtt + 1/8 Error 1213 * where Error = m - old rtt 1214 * i.e. 8 * new rtt = 8 * old rtt + Error 1215 * i.e. new sa = old sa + Error 1216 */ 1217 m -= sa >> 3; /* m is now Error in estimate. */ 1218 if ((sa += m) < 0) { 1219 /* Don't allow the smoothed average to be negative. */ 1220 sa = 0; 1221 } 1222 1223 /* 1224 * Update deviation estimator: 1225 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1226 * i.e. 4 * new mdev = 4 * old mdev + 1227 * (abs(Error) - old mdev) 1228 * i.e. new sv = old sv + (abs(Error) - old mdev) 1229 */ 1230 if (m < 0) 1231 m = -m; 1232 m -= sv >> 2; 1233 sv += m; 1234 } else { 1235 /* Initialization. This is the first response received. 
*/ 1236 sa = (m << 3); 1237 sv = (m << 1); 1238 } 1239 1240 crtt = (sa >> 3) + sv; 1241 1242 if (debug & D_PROBE) { 1243 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " 1244 "crtt = %lld\n", saved_m, sa, sv, crtt); 1245 } 1246 1247 *sap = sa; 1248 *svp = sv; 1249 1250 /* 1251 * CRTT = average estimates + 4 * deviation estimates 1252 * = sa / 8 + sv 1253 */ 1254 return (crtt); 1255 } 1256 1257 static void 1258 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) 1259 { 1260 struct phyint_instance *pii = tg->tg_phyint_inst; 1261 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1262 int64_t sa = tg->tg_rtt_sa; 1263 int64_t sv = tg->tg_rtt_sd; 1264 int new_crtt; 1265 int i; 1266 1267 if (debug & D_PROBE) 1268 logdebug("pi_set_crtt: target - m %lld\n", m); 1269 1270 /* store the round trip time, in case we need to defer computation */ 1271 tg->tg_deferred[tg->tg_num_deferred] = m; 1272 1273 new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); 1274 1275 /* 1276 * If this probe's round trip time would singlehandedly cause an 1277 * increase in the group's probe interval consider it suspect. 1278 */ 1279 if ((new_crtt > probe_interval) && is_probe_uni) { 1280 if (debug & D_PROBE) { 1281 logdebug("Received a suspect probe on %s, new_crtt =" 1282 " %d, probe_interval = %d, num_deferred = %d\n", 1283 pii->pii_probe_logint->li_name, new_crtt, 1284 probe_interval, tg->tg_num_deferred); 1285 } 1286 1287 /* 1288 * If we've deferred as many rtts as we plan on deferring, then 1289 * assume the link really did slow down and process all queued 1290 * rtts 1291 */ 1292 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1293 if (debug & D_PROBE) { 1294 logdebug("Received MAXDEFERREDRTT probes which " 1295 "would cause an increased probe_interval. 
" 1296 "Integrating queued rtt data points.\n"); 1297 } 1298 1299 for (i = 0; i <= tg->tg_num_deferred; i++) { 1300 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, 1301 &tg->tg_rtt_sd, tg->tg_deferred[i])); 1302 } 1303 1304 tg->tg_num_deferred = 0; 1305 } else { 1306 tg->tg_num_deferred++; 1307 } 1308 return; 1309 } 1310 1311 /* 1312 * If this is a normal probe, or an RTT probe that would lead to a 1313 * reduced CRTT, then update our CRTT data. Further, if this was 1314 * a normal probe, pitch any deferred probes since our probes are 1315 * again being answered within our CRTT estimates. 1316 */ 1317 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1318 tg->tg_rtt_sa = sa; 1319 tg->tg_rtt_sd = sv; 1320 tg->tg_crtt = new_crtt; 1321 if (is_probe_uni) 1322 tg->tg_num_deferred = 0; 1323 } 1324 } 1325 1326 /* 1327 * Return a pointer to the specified option buffer. 1328 * If not found return NULL. 1329 */ 1330 static void * 1331 find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) 1332 { 1333 struct cmsghdr *cmsg; 1334 1335 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1336 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1337 if (cmsg->cmsg_level == cmsg_level && 1338 cmsg->cmsg_type == cmsg_type) { 1339 return (CMSG_DATA(cmsg)); 1340 } 1341 } 1342 return (NULL); 1343 } 1344 1345 /* 1346 * Try to activate another INACTIVE interface in the same group as `pi'. 1347 * Prefer STANDBY INACTIVE to just INACTIVE. 
1348 */ 1349 void 1350 phyint_activate_another(struct phyint *pi) 1351 { 1352 struct phyint *pi2; 1353 struct phyint *inactivepi = NULL; 1354 1355 if (pi->pi_group == phyint_anongroup) 1356 return; 1357 1358 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1359 if (pi == pi2 || !phyint_is_functioning(pi2) || 1360 !(pi2->pi_flags & IFF_INACTIVE)) 1361 continue; 1362 1363 inactivepi = pi2; 1364 if (pi2->pi_flags & IFF_STANDBY) 1365 break; 1366 } 1367 1368 if (inactivepi != NULL) 1369 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); 1370 } 1371 1372 /* 1373 * Transition a phyint to PI_RUNNING. The caller must ensure that the 1374 * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if 1375 * appropriate. Also sets IFF_INACTIVE on this or other interfaces as 1376 * appropriate (see comment below). Finally, also updates the phyint's group 1377 * state to account for the change. 1378 */ 1379 void 1380 phyint_transition_to_running(struct phyint *pi) 1381 { 1382 struct phyint *pi2; 1383 struct phyint *actstandbypi = NULL; 1384 uint_t nactive = 0, nnonstandby = 0; 1385 boolean_t onlining = (pi->pi_state == PI_OFFLINE); 1386 boolean_t initial = (pi->pi_state == PI_INIT); 1387 uint64_t set, clear; 1388 1389 /* 1390 * The interface is running again, but should it or another interface 1391 * in the group end up INACTIVE? There are three cases: 1392 * 1393 * 1. If it's a STANDBY interface, it should be end up INACTIVE if 1394 * the group is operating at capacity (i.e., there are at least as 1395 * many active interfaces as non-STANDBY interfaces in the group). 1396 * No other interfaces should be changed. 1397 * 1398 * 2. If it's a non-STANDBY interface and we're onlining it or 1399 * FAILBACK is enabled, then it should *not* end up INACTIVE. 1400 * Further, if the group is above capacity as a result of this 1401 * interface, then an active STANDBY interface in the group should 1402 * end up INACTIVE. 1403 * 1404 * 3. 
If it's a non-STANDBY interface, we're repairing it, and 1405 * FAILBACK is disabled, then it should end up INACTIVE *unless* 1406 * the group was failed (in which case we have no choice but to 1407 * use it). No other interfaces should be changed. 1408 */ 1409 if (pi->pi_group != phyint_anongroup) { 1410 pi2 = pi->pi_group->pg_phyint; 1411 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1412 if (!(pi2->pi_flags & IFF_STANDBY)) 1413 nnonstandby++; 1414 1415 if (phyint_is_functioning(pi2) && 1416 !(pi2->pi_flags & IFF_INACTIVE)) { 1417 nactive++; 1418 if (pi2->pi_flags & IFF_STANDBY) 1419 actstandbypi = pi2; 1420 } 1421 } 1422 } 1423 1424 set = 0; 1425 clear = (onlining ? IFF_OFFLINE : IFF_FAILED); 1426 1427 if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ 1428 if (nactive >= nnonstandby) 1429 set |= IFF_INACTIVE; 1430 else 1431 clear |= IFF_INACTIVE; 1432 } else if (onlining || failback_enabled) { /* case 2 */ 1433 if (nactive >= nnonstandby && actstandbypi != NULL) 1434 (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); 1435 } else if (!initial && !GROUP_FAILED(pi->pi_group)) { /* case 3 */ 1436 set |= IFF_INACTIVE; 1437 } 1438 (void) change_pif_flags(pi, set, clear); 1439 1440 phyint_chstate(pi, PI_RUNNING); 1441 1442 /* 1443 * Update the group state to account for the change. 1444 */ 1445 phyint_group_refresh_state(pi->pi_group); 1446 } 1447 1448 /* 1449 * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration 1450 * to have at least one active interface and as many active interfaces as 1451 * non-standby interfaces. 1452 */ 1453 void 1454 phyint_standby_refresh_inactive(struct phyint *pi) 1455 { 1456 struct phyint *pi2; 1457 uint_t nactive = 0, nnonstandby = 0; 1458 1459 /* 1460 * All phyints in the anonymous group are effectively in their own 1461 * group and thus active regardless of whether they're marked standby. 
1462 */ 1463 if (pi->pi_group == phyint_anongroup) { 1464 (void) change_pif_flags(pi, 0, IFF_INACTIVE); 1465 return; 1466 } 1467 1468 /* 1469 * If the phyint isn't functioning we can't consider it. 1470 */ 1471 if (!phyint_is_functioning(pi)) 1472 return; 1473 1474 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1475 if (!(pi2->pi_flags & IFF_STANDBY)) 1476 nnonstandby++; 1477 1478 if (phyint_is_functioning(pi2) && 1479 !(pi2->pi_flags & IFF_INACTIVE)) 1480 nactive++; 1481 } 1482 1483 if (nactive == 0 || nactive < nnonstandby) 1484 (void) change_pif_flags(pi, 0, IFF_INACTIVE); 1485 else if (nactive > nnonstandby) 1486 (void) change_pif_flags(pi, IFF_INACTIVE, 0); 1487 } 1488 1489 /* 1490 * See if a previously failed interface has started working again. 1491 */ 1492 void 1493 phyint_check_for_repair(struct phyint *pi) 1494 { 1495 if (!phyint_repaired(pi)) 1496 return; 1497 1498 if (pi->pi_group == phyint_anongroup) { 1499 logerr("IP interface repair detected on %s\n", pi->pi_name); 1500 } else { 1501 logerr("IP interface repair detected on %s of group %s\n", 1502 pi->pi_name, pi->pi_group->pg_name); 1503 } 1504 1505 /* 1506 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. 1507 * So just clear IFF_OFFLINE and defer phyint_transition_to_running() 1508 * until it is brought back online. 1509 */ 1510 if (pi->pi_state == PI_OFFLINE) { 1511 (void) change_pif_flags(pi, 0, IFF_FAILED); 1512 return; 1513 } 1514 1515 phyint_transition_to_running(pi); /* calls phyint_chstate() */ 1516 } 1517 1518 /* 1519 * See if an interface has failed, or if the whole group of interfaces has 1520 * failed. 
1521 */ 1522 static void 1523 phyint_inst_check_for_failure(struct phyint_instance *pii) 1524 { 1525 struct phyint *pi = pii->pii_phyint; 1526 struct phyint *pi2; 1527 boolean_t was_active; 1528 1529 switch (failure_state(pii)) { 1530 case PHYINT_FAILURE: 1531 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); 1532 1533 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); 1534 if (pi->pi_group == phyint_anongroup) { 1535 logerr("IP interface failure detected on %s\n", 1536 pii->pii_name); 1537 } else { 1538 logerr("IP interface failure detected on %s of group" 1539 " %s\n", pii->pii_name, pi->pi_group->pg_name); 1540 } 1541 1542 /* 1543 * If the failed interface was active, activate another 1544 * INACTIVE interface in the group if possible. 1545 */ 1546 if (was_active) 1547 phyint_activate_another(pi); 1548 1549 /* 1550 * If the interface is offline, the state change will be 1551 * noted when it comes back online. 1552 */ 1553 if (pi->pi_state != PI_OFFLINE) { 1554 phyint_chstate(pi, PI_FAILED); 1555 reset_crtt_all(pi); 1556 } 1557 break; 1558 1559 case GROUP_FAILURE: 1560 pi2 = pi->pi_group->pg_phyint; 1561 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1562 (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); 1563 if (pi2->pi_state == PI_OFFLINE) /* see comment above */ 1564 continue; 1565 1566 reset_crtt_all(pi2); 1567 /* 1568 * In the case of host targets, we would have flushed 1569 * the targets, and gone to PI_NOTARGETS state. 1570 */ 1571 if (pi2->pi_state == PI_RUNNING) 1572 phyint_chstate(pi2, PI_FAILED); 1573 } 1574 break; 1575 1576 default: 1577 break; 1578 } 1579 } 1580 1581 /* 1582 * Determines if any timeout event has occurred and returns the number of 1583 * milliseconds until the next timeout event for the phyint. Returns 1584 * TIMER_INFINITY for "never". 
1585 */ 1586 uint_t 1587 phyint_inst_timer(struct phyint_instance *pii) 1588 { 1589 int pr_ndx; 1590 uint_t timeout; 1591 struct target *cur_tg; 1592 struct probe_stats *pr_statp; 1593 struct phyint_instance *pii_other; 1594 struct phyint *pi; 1595 int valid_unack_count; 1596 int i; 1597 int interval; 1598 uint_t check_time; 1599 uint_t cur_time; 1600 hrtime_t cur_hrtime; 1601 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1602 1603 cur_hrtime = gethrtime(); 1604 cur_time = ns2ms(cur_hrtime); 1605 1606 if (debug & D_TIMER) { 1607 logdebug("phyint_inst_timer(%s %s)\n", 1608 AF_STR(pii->pii_af), pii->pii_name); 1609 } 1610 1611 pii_other = phyint_inst_other(pii); 1612 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1613 /* 1614 * Check to see if we're here due to link up/down flapping; If 1615 * enough time has passed, then try to bring the interface 1616 * back up; otherwise, schedule a timer to bring it back up 1617 * when enough time *has* elapsed. 1618 */ 1619 pi = pii->pii_phyint; 1620 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1621 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1622 if (check_time > cur_time) 1623 return (check_time - cur_time); 1624 1625 phyint_check_for_repair(pi); 1626 } 1627 } 1628 1629 /* 1630 * If probing is not enabled on this phyint instance, don't proceed. 1631 */ 1632 if (!PROBE_ENABLED(pii)) 1633 return (TIMER_INFINITY); 1634 1635 /* 1636 * If the timer has fired too soon, probably triggered 1637 * by some other phyint instance, return the remaining 1638 * time 1639 */ 1640 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1641 return (pii->pii_snxt_time - cur_time); 1642 1643 /* 1644 * If the link is down, don't send any probes for now. 1645 */ 1646 if (LINK_DOWN(pii->pii_phyint)) 1647 return (TIMER_INFINITY); 1648 1649 /* 1650 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1651 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1652 * Base probe time is strictly periodic. 
1653 */ 1654 interval = GET_RANDOM( 1655 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1656 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1657 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1658 1659 /* 1660 * Check if the current time > next time to probe. If so, we missed 1661 * sending 1 or more probes, probably due to heavy system load. At least 1662 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1663 * were scheduled. Make adjustments to the times, in multiples of 1664 * user_probe_interval. 1665 */ 1666 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1667 int n; 1668 1669 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1670 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1671 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1672 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1673 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1674 pii->pii_snxt_basetime); 1675 1676 /* Collect statistics about missed probes */ 1677 probes_missed.pm_nprobes += n + 1; 1678 probes_missed.pm_ntimes++; 1679 } 1680 pii->pii_snxt_basetime += user_probe_interval; 1681 interval = pii->pii_snxt_time - cur_time; 1682 if (debug & D_TARGET) { 1683 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1684 " interval %u\n", cur_time, pii->pii_snxt_time, 1685 pii->pii_snxt_basetime, interval); 1686 } 1687 1688 /* 1689 * If no targets are known, we need to send an ICMP multicast. The 1690 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1691 * to see if we found a target. 1692 */ 1693 if (pii->pii_target_next == NULL) { 1694 assert(pii->pii_ntargets == 0); 1695 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1696 probe(pii, PROBE_MULTI, cur_time); 1697 return (interval); 1698 } 1699 1700 if ((user_probe_interval != probe_interval) && 1701 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1702 /* 1703 * the failure detection (fd) probe timer has not yet fired. 
1704 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1705 */ 1706 probe(pii, PROBE_RTT, cur_hrtime); 1707 return (interval); 1708 } 1709 /* 1710 * the fd probe timer has fired. Need to do all failure 1711 * detection / recovery calculations, and then send an fd probe 1712 * of type PROBE_UNI. 1713 */ 1714 if (user_probe_interval == probe_interval) { 1715 /* 1716 * We could have missed some probes, and then adjusted 1717 * pii_snxt_basetime above. Otherwise we could have 1718 * blindly added probe_interval to pii_fd_snxt_basetime. 1719 */ 1720 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1721 } else { 1722 pii->pii_fd_snxt_basetime += probe_interval; 1723 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1724 int n; 1725 1726 n = (cur_time - pii->pii_fd_snxt_basetime) / 1727 probe_interval; 1728 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1729 } 1730 } 1731 1732 /* 1733 * We can have at most, the latest 2 probes that we sent, in 1734 * the PR_UNACKED state. All previous probes sent, are either 1735 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1736 * timed out if the probe's time_start + the CRTT < currenttime. 1737 * For each of the last 2 probes, examine whether it has timed 1738 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1739 */ 1740 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1741 valid_unack_count = 0; 1742 1743 for (i = 0; i < 2; i++) { 1744 pr_statp = &pii->pii_probes[pr_ndx]; 1745 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1746 switch (pr_statp->pr_status) { 1747 case PR_ACKED: 1748 /* 1749 * We received back an ACK, so the switch clearly 1750 * is not dropping our traffic, and thus we can 1751 * enable failure detection immediately. 
1752 */ 1753 if (pii->pii_fd_hrtime > gethrtime()) { 1754 if (debug & D_PROBE) { 1755 logdebug("successful probe on %s; " 1756 "ending quiet period\n", 1757 pii->pii_phyint->pi_name); 1758 } 1759 pii->pii_fd_hrtime = gethrtime(); 1760 } 1761 break; 1762 1763 case PR_UNACKED: 1764 assert(cur_tg != NULL); 1765 /* 1766 * The crtt could be zero for some reason, 1767 * Eg. the phyint could be failed. If the crtt is 1768 * not available use group's probe interval, 1769 * which is a worst case estimate. 1770 */ 1771 timeout = ns2ms(pr_statp->pr_hrtime_start); 1772 if (cur_tg->tg_crtt != 0) { 1773 timeout += cur_tg->tg_crtt; 1774 } else { 1775 timeout += probe_interval; 1776 } 1777 if (TIME_LT(timeout, cur_time)) { 1778 pr_statp->pr_time_lost = timeout; 1779 probe_chstate(pr_statp, pii, PR_LOST); 1780 } else if (i == 1) { 1781 /* 1782 * We are forced to consider this probe 1783 * lost, as we can have at most 2 unack. 1784 * probes any time, and we will be sending a 1785 * probe at the end of this function. 1786 * Normally, we should not be here, but 1787 * this can happen if an incoming response 1788 * that was considered lost has increased 1789 * the crtt for this target, and also bumped 1790 * up the FDT. Note that we never cancel or 1791 * increase the current pii_time_left, so 1792 * when the timer fires, we find 2 valid 1793 * unacked probes, and they are yet to timeout 1794 */ 1795 pr_statp->pr_time_lost = cur_time; 1796 probe_chstate(pr_statp, pii, PR_LOST); 1797 } else { 1798 /* 1799 * Only the most recent probe can enter 1800 * this 'else' arm. The second most recent 1801 * probe must take either of the above arms, 1802 * if it is unacked. 1803 */ 1804 valid_unack_count++; 1805 } 1806 break; 1807 } 1808 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1809 } 1810 1811 /* 1812 * We send out 1 probe randomly in the interval between one half 1813 * and one probe interval for the group. 
Given that the CRTT is always 1814 * less than the group's probe interval, we can have at most 1 1815 * unacknowledged probe now. All previous probes are either lost or 1816 * acked. 1817 */ 1818 assert(valid_unack_count == 0 || valid_unack_count == 1); 1819 1820 /* 1821 * The timer has fired. Take appropriate action depending 1822 * on the current state of the phyint. 1823 * 1824 * PI_RUNNING state - Failure detection 1825 * PI_FAILED state - Repair detection 1826 */ 1827 switch (pii->pii_phyint->pi_state) { 1828 case PI_FAILED: 1829 /* 1830 * If the most recent probe (excluding unacked probes that 1831 * are yet to time out) has been acked, check whether the 1832 * phyint is now repaired. 1833 */ 1834 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1835 phyint_check_for_repair(pii->pii_phyint); 1836 } 1837 break; 1838 1839 case PI_RUNNING: 1840 /* 1841 * It's possible our probes have been lost because of a 1842 * spanning-tree mandated quiet period on the switch. If so, 1843 * ignore the lost probes. 1844 */ 1845 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1846 break; 1847 1848 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1849 /* 1850 * We have 1 or more failed probes (excluding unacked 1851 * probes that are yet to time out). Determine if the 1852 * phyint has failed. 1853 */ 1854 phyint_inst_check_for_failure(pii); 1855 } 1856 break; 1857 1858 default: 1859 logerr("phyint_inst_timer: invalid state %d\n", 1860 pii->pii_phyint->pi_state); 1861 abort(); 1862 } 1863 1864 /* 1865 * Start the next probe. probe() will also set pii->pii_probe_time_left 1866 * to the group's probe interval. If phyint_failed -> target_flush_hosts 1867 * was called, the target list may be empty. 1868 */ 1869 if (pii->pii_target_next != NULL) { 1870 probe(pii, PROBE_UNI, cur_hrtime); 1871 /* 1872 * If we have just the one probe target, and we're not using 1873 * router targets, try to find another as we presently have 1874 * no resilience. 
		 */
		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
			probe(pii, PROBE_MULTI, cur_hrtime);
	} else {
		/* No next unicast target is set; send a multicast probe. */
		probe(pii, PROBE_MULTI, cur_hrtime);
	}
	return (interval);
}

/*
 * Start the probe timer for an interface instance.
 *
 * An offset obtained from GET_RANDOM(0, pg_probeint) is added to the
 * current time to form the first probe time.  pii_snxt_basetime,
 * pii_fd_snxt_basetime and pii_snxt_time are all set to that value and
 * the offset is handed to timer_schedule().
 */
void
start_timer(struct phyint_instance *pii)
{
	uint32_t interval;

	/*
	 * Spread the base probe times (pi_snxt_basetime) across phyints
	 * uniformly over the (curtime..curtime + the group's probe_interval).
	 * pi_snxt_basetime is strictly periodic with a frequency of
	 * the group's probe interval. The actual probe time pi_snxt_time
	 * adds some randomness to pi_snxt_basetime and happens in probe().
	 * For the 1st probe on each phyint after the timer is started,
	 * pi_snxt_time and pi_snxt_basetime are the same.
	 */
	interval = GET_RANDOM(0,
	    (int)pii->pii_phyint->pi_group->pg_probeint);

	pii->pii_snxt_basetime = getcurrenttime() + interval;
	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
	pii->pii_snxt_time = pii->pii_snxt_basetime;
	timer_schedule(interval);
}

/*
 * Restart the probe timer on an interface instance.
 *
 * This is a thin guard around start_timer(): nothing happens unless
 * pii_basetime_inited is non-zero.
 */
static void
restart_timer(struct phyint_instance *pii)
{
	/*
	 * We don't need to restart the timer if it was never started in
	 * the first place (pii->pii_basetime_inited not set), as the timer
	 * won't have gone off yet.
	 */
	if (pii->pii_basetime_inited != 0) {

		if (debug & D_LINKNOTE)
			logdebug("restart timer: restarting timer on %s, "
			    "address family %s\n", pii->pii_phyint->pi_name,
			    AF_STR(pii->pii_af));

		start_timer(pii);
	}
}

/*
 * Handle a link up -> down transition on `pi'.
 *
 * Logs the event, clears the probe statistics of every probe-capable
 * instance, and then (depending on the phyint and group state) runs the
 * failure check on one instance of the phyint.
 */
static void
process_link_state_down(struct phyint *pi)
{
	logerr("The link has gone down on %s\n", pi->pi_name);

	/*
	 * Clear the probe statistics arrays, we don't want the repair
	 * detection logic relying on probes that were successful prior
	 * to the link going down.
	 */
	if (PROBE_CAPABLE(pi->pi_v4))
		clear_pii_probe_stats(pi->pi_v4);
	if (PROBE_CAPABLE(pi->pi_v6))
		clear_pii_probe_stats(pi->pi_v6);
	/*
	 * Check for interface failure.  Although we know the interface
	 * has failed, we don't know if all the other interfaces in the
	 * group have failed as well.
	 */
	if ((pi->pi_state == PI_RUNNING) ||
	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
		if (debug & D_LINKNOTE) {
			logdebug("process_link_state_down:"
			    " checking for failure on %s\n", pi->pi_name);
		}

		/*
		 * Only one instance is checked: IPv4 when it exists,
		 * otherwise IPv6.
		 */
		if (pi->pi_v4 != NULL)
			phyint_inst_check_for_failure(pi->pi_v4);
		else if (pi->pi_v6 != NULL)
			phyint_inst_check_for_failure(pi->pi_v6);
	}
}

/*
 * Handle a link down -> up transition on `pi'.
 *
 * Logs the event, restarts the probe timers of the instances that exist,
 * runs the repair check, and records the time of this link-up event in
 * the pi_whenup[] ring.
 */
static void
process_link_state_up(struct phyint *pi)
{
	logerr("The link has come up on %s\n", pi->pi_name);

	/*
	 * We stopped any running timers on each instance when the link
	 * went down, so restart them.
	 */
	if (pi->pi_v4)
		restart_timer(pi->pi_v4);
	if (pi->pi_v6)
		restart_timer(pi->pi_v6);

	phyint_check_for_repair(pi);

	/*
	 * pi_whenup[] is used as a circular buffer of link-up times;
	 * pi_whendx wraps back to 0 after LINK_UP_PERMIN entries.
	 */
	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
	if (pi->pi_whendx == LINK_UP_PERMIN)
		pi->pi_whendx = 0;
}

/*
 * Process any changes in link state passed up from the interfaces.
1988 */ 1989 void 1990 process_link_state_changes(void) 1991 { 1992 struct phyint *pi; 1993 1994 /* Look for interfaces where the link state has just changed */ 1995 1996 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1997 boolean_t old_link_state_up = LINK_UP(pi); 1998 1999 /* 2000 * Except when the "phyint" structure is created, this is 2001 * the only place the link state is updated. This allows 2002 * this routine to detect changes in link state, rather 2003 * than just the current state. 2004 */ 2005 UPDATE_LINK_STATE(pi); 2006 2007 if (LINK_DOWN(pi)) { 2008 /* 2009 * Has link just gone down? 2010 */ 2011 if (old_link_state_up) 2012 process_link_state_down(pi); 2013 } else { 2014 /* 2015 * Has link just gone back up? 2016 */ 2017 if (!old_link_state_up) 2018 process_link_state_up(pi); 2019 } 2020 } 2021 } 2022 2023 void 2024 reset_crtt_all(struct phyint *pi) 2025 { 2026 struct phyint_instance *pii; 2027 struct target *tg; 2028 2029 pii = pi->pi_v4; 2030 if (pii != NULL) { 2031 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2032 tg->tg_crtt = 0; 2033 tg->tg_rtt_sa = -1; 2034 tg->tg_rtt_sd = 0; 2035 } 2036 } 2037 2038 pii = pi->pi_v6; 2039 if (pii != NULL) { 2040 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2041 tg->tg_crtt = 0; 2042 tg->tg_rtt_sa = -1; 2043 tg->tg_rtt_sd = 0; 2044 } 2045 } 2046 } 2047 2048 /* 2049 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 2050 * probes on both instances IPv4 and IPv6. 2051 * If the interface has failed, return the time of the first probe failure 2052 * in "tff". 2053 */ 2054 static int 2055 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 2056 { 2057 uint_t pi_tff; 2058 struct target *cur_tg; 2059 struct probe_fail_count pfinfo; 2060 struct phyint_instance *pii_other; 2061 int pr_ndx; 2062 2063 /* 2064 * Get the number of consecutive failed probes on 2065 * this phyint across all targets. 
Also get the number 2066 * of consecutive failed probes on this target only 2067 */ 2068 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2069 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2070 probe_fail_info(pii, cur_tg, &pfinfo); 2071 2072 /* Get the time of first failure, for later use */ 2073 pi_tff = pfinfo.pf_tff; 2074 2075 /* 2076 * If the current target has not responded to the 2077 * last NUM_PROBE_FAILS probes, and other targets are 2078 * responding delete this target. Dead gateway detection 2079 * will eventually remove this target (if router) from the 2080 * routing tables. If that does not occur, we may end 2081 * up adding this to our list again. 2082 */ 2083 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2084 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2085 if (pii->pii_targets_are_routers) { 2086 if (cur_tg->tg_status == TG_ACTIVE) 2087 pii->pii_ntargets--; 2088 cur_tg->tg_status = TG_DEAD; 2089 cur_tg->tg_crtt = 0; 2090 cur_tg->tg_rtt_sa = -1; 2091 cur_tg->tg_rtt_sd = 0; 2092 if (pii->pii_target_next == cur_tg) 2093 pii->pii_target_next = target_next(cur_tg); 2094 } else { 2095 target_delete(cur_tg); 2096 probe(pii, PROBE_MULTI, gethrtime()); 2097 } 2098 return (PHYINT_OK); 2099 } 2100 2101 /* 2102 * If the phyint has lost NUM_PROBE_FAILS or more 2103 * consecutive probes, on both IPv4 and IPv6 protocol 2104 * instances of the phyint, then trigger failure 2105 * detection, else return false 2106 */ 2107 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2108 return (PHYINT_OK); 2109 2110 pii_other = phyint_inst_other(pii); 2111 if (PROBE_CAPABLE(pii_other)) { 2112 probe_fail_info(pii_other, NULL, &pfinfo); 2113 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2114 /* 2115 * We have NUM_PROBE_FAILS or more failures 2116 * on both IPv4 and IPv6. Get the earliest 2117 * time when failure was detected on this 2118 * phyint across IPv4 and IPv6. 
2119 */ 2120 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2121 pi_tff = pfinfo.pf_tff; 2122 } else { 2123 /* 2124 * This instance has < NUM_PROBE_FAILS failure. 2125 * So return false 2126 */ 2127 return (PHYINT_OK); 2128 } 2129 } 2130 *tff = pi_tff; 2131 return (PHYINT_FAILURE); 2132 } 2133 2134 /* 2135 * Check if the link has gone down on this phyint, or it has failed the 2136 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2137 * Also look at other phyints of this group, for group failures. 2138 */ 2139 int 2140 failure_state(struct phyint_instance *pii) 2141 { 2142 struct probe_success_count psinfo; 2143 uint_t pi2_tls; /* time last success */ 2144 uint_t pi_tff; /* time first fail */ 2145 struct phyint *pi2; 2146 struct phyint *pi; 2147 struct phyint_instance *pii2; 2148 struct phyint_group *pg; 2149 int retval; 2150 2151 if (debug & D_FAILREP) 2152 logdebug("phyint_failed(%s)\n", pii->pii_name); 2153 2154 pi = pii->pii_phyint; 2155 pg = pi->pi_group; 2156 2157 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2158 PHYINT_OK) 2159 return (PHYINT_OK); 2160 2161 /* 2162 * At this point, the link is down, or the phyint is suspect, as it 2163 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not 2164 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue 2165 * on to determine whether this should be considered a PHYINT_FAILURE 2166 * or GROUP_FAILURE. 2167 */ 2168 if (pg == phyint_anongroup) 2169 return (PHYINT_FAILURE); 2170 2171 /* 2172 * Need to compare against other phyints of the same group 2173 * to exclude group failures. If the failure was detected via 2174 * probing, then if the time of last success (tls) of any 2175 * phyint is more recent than the time of first fail (tff) of the 2176 * phyint in question, and the link is up on the phyint, 2177 * then it is a phyint failure. Otherwise it is a group failure. 
2178 * If failure was detected via a link down notification sent from 2179 * the driver to IP, we see if any phyints in the group are still 2180 * running and haven't received a link down notification. We 2181 * will usually be processing the link down notification shortly 2182 * after it was received, so there is no point looking at the tls 2183 * of other phyints. 2184 */ 2185 retval = GROUP_FAILURE; 2186 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2187 /* Exclude ourself from comparison */ 2188 if (pi2 == pi) 2189 continue; 2190 2191 if (LINK_DOWN(pi)) { 2192 /* 2193 * We use FLAGS_TO_LINK_STATE() to test the flags 2194 * directly, rather then LINK_UP() or LINK_DOWN(), as 2195 * we may not have got round to processing the link 2196 * state for the other phyints in the group yet. 2197 * 2198 * The check for PI_RUNNING and group failure handles 2199 * the case when the group begins to recover. 2200 * PI_RUNNING will be set, and group failure cleared 2201 * only after receipt of NUM_PROBE_REPAIRS, by which 2202 * time the other phyints should have received at 2203 * least 1 packet, and so will not have NUM_PROBE_FAILS. 2204 */ 2205 if ((pi2->pi_state == PI_RUNNING) && 2206 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { 2207 retval = PHYINT_FAILURE; 2208 break; 2209 } 2210 continue; 2211 } 2212 2213 if (LINK_DOWN(pi2)) 2214 continue; 2215 2216 /* 2217 * If there's no probe-based failure detection on this 2218 * interface, and its link is still up, then it's still 2219 * working and thus the group has not failed. 2220 */ 2221 if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { 2222 retval = PHYINT_FAILURE; 2223 break; 2224 } 2225 2226 /* 2227 * Need to compare against both IPv4 and IPv6 instances. 
2228 */ 2229 pii2 = pi2->pi_v4; 2230 if (pii2 != NULL) { 2231 probe_success_info(pii2, NULL, &psinfo); 2232 if (psinfo.ps_tls_valid) { 2233 pi2_tls = psinfo.ps_tls; 2234 /* 2235 * See comment above regarding check 2236 * for PI_RUNNING and group failure. 2237 */ 2238 if (TIME_GT(pi2_tls, pi_tff) && 2239 (pi2->pi_state == PI_RUNNING) && 2240 !GROUP_FAILED(pg) && 2241 FLAGS_TO_LINK_STATE(pi2)) { 2242 retval = PHYINT_FAILURE; 2243 break; 2244 } 2245 } 2246 } 2247 2248 pii2 = pi2->pi_v6; 2249 if (pii2 != NULL) { 2250 probe_success_info(pii2, NULL, &psinfo); 2251 if (psinfo.ps_tls_valid) { 2252 pi2_tls = psinfo.ps_tls; 2253 /* 2254 * See comment above regarding check 2255 * for PI_RUNNING and group failure. 2256 */ 2257 if (TIME_GT(pi2_tls, pi_tff) && 2258 (pi2->pi_state == PI_RUNNING) && 2259 !GROUP_FAILED(pg) && 2260 FLAGS_TO_LINK_STATE(pi2)) { 2261 retval = PHYINT_FAILURE; 2262 break; 2263 } 2264 } 2265 } 2266 } 2267 2268 /* 2269 * Update the group state to account for the changes. 2270 */ 2271 phyint_group_refresh_state(pg); 2272 return (retval); 2273 } 2274 2275 /* 2276 * Return the information associated with consecutive probe successes 2277 * starting with the most recent probe. At most the last 2 probes can be 2278 * in the unacknowledged state. All previous probes have either failed 2279 * or succeeded. 
2280 */ 2281 static void 2282 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2283 struct probe_success_count *psinfo) 2284 { 2285 uint_t i; 2286 struct probe_stats *pr_statp; 2287 uint_t most_recent; 2288 uint_t second_most_recent; 2289 boolean_t pi_found_failure = _B_FALSE; 2290 boolean_t tg_found_failure = _B_FALSE; 2291 uint_t now; 2292 uint_t timeout; 2293 struct target *tg; 2294 2295 if (debug & D_FAILREP) 2296 logdebug("probe_success_info(%s)\n", pii->pii_name); 2297 2298 bzero(psinfo, sizeof (*psinfo)); 2299 now = getcurrenttime(); 2300 2301 /* 2302 * Start with the most recent probe, and count the number 2303 * of consecutive probe successes. Latch the number of successes 2304 * on hitting a failure. 2305 */ 2306 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2307 second_most_recent = PROBE_INDEX_PREV(most_recent); 2308 2309 for (i = most_recent; i != pii->pii_probe_next; 2310 i = PROBE_INDEX_PREV(i)) { 2311 pr_statp = &pii->pii_probes[i]; 2312 2313 switch (pr_statp->pr_status) { 2314 case PR_UNACKED: 2315 /* 2316 * Only the most recent 2 probes can be unacknowledged 2317 */ 2318 assert(i == most_recent || i == second_most_recent); 2319 2320 tg = pr_statp->pr_target; 2321 assert(tg != NULL); 2322 /* 2323 * The crtt could be zero for some reason, 2324 * Eg. the phyint could be failed. If the crtt is 2325 * not available use the value of the group's probe 2326 * interval which is a worst case estimate. 2327 */ 2328 timeout = ns2ms(pr_statp->pr_hrtime_start); 2329 if (tg->tg_crtt != 0) { 2330 timeout += tg->tg_crtt; 2331 } else { 2332 timeout += 2333 pii->pii_phyint->pi_group->pg_probeint; 2334 } 2335 2336 if (TIME_LT(timeout, now)) { 2337 /* 2338 * We hit a failure. Latch the total number of 2339 * recent consecutive successes. 
2340 */ 2341 pr_statp->pr_time_lost = timeout; 2342 probe_chstate(pr_statp, pii, PR_LOST); 2343 pi_found_failure = _B_TRUE; 2344 if (cur_tg != NULL && tg == cur_tg) { 2345 /* 2346 * We hit a failure for the desired 2347 * target. Latch the number of recent 2348 * consecutive successes for this target 2349 */ 2350 tg_found_failure = _B_TRUE; 2351 } 2352 } 2353 break; 2354 2355 case PR_ACKED: 2356 /* 2357 * Bump up the count of probe successes, if we 2358 * have not seen any failure so far. 2359 */ 2360 if (!pi_found_failure) 2361 psinfo->ps_nsucc++; 2362 2363 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2364 !tg_found_failure) { 2365 psinfo->ps_nsucc_tg++; 2366 } 2367 2368 /* 2369 * Record the time of last success, if this is 2370 * the most recent probe success. 2371 */ 2372 if (!psinfo->ps_tls_valid) { 2373 psinfo->ps_tls = 2374 ns2ms(pr_statp->pr_hrtime_ackproc); 2375 psinfo->ps_tls_valid = _B_TRUE; 2376 } 2377 break; 2378 2379 case PR_LOST: 2380 /* 2381 * We hit a failure. Latch the total number of 2382 * recent consecutive successes. 2383 */ 2384 pi_found_failure = _B_TRUE; 2385 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2386 /* 2387 * We hit a failure for the desired target. 2388 * Latch the number of recent consecutive 2389 * successes for this target 2390 */ 2391 tg_found_failure = _B_TRUE; 2392 } 2393 break; 2394 2395 default: 2396 return; 2397 2398 } 2399 } 2400 } 2401 2402 /* 2403 * Return the information associated with consecutive probe failures 2404 * starting with the most recent probe. Only the last 2 probes can be in the 2405 * unacknowledged state. All previous probes have either failed or succeeded. 
 */
/*
 * pii    - instance whose probe ring (pii_probes[]) is examined
 * cur_tg - target for the per-target count, or NULL to skip that count
 * pfinfo - output; zeroed first, then filled with pf_nfail, pf_nfail_tg
 *          and pf_tff
 *
 * Side effect: an unacknowledged probe whose deadline has passed is
 * switched to PR_LOST via probe_chstate().
 */
static void
probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
    struct probe_fail_count *pfinfo)
{
	int	i;
	struct probe_stats *pr_statp;
	boolean_t	tg_found_success = _B_FALSE;
	boolean_t	pi_found_success = _B_FALSE;
	int	most_recent;
	int	second_most_recent;
	uint_t	now;
	uint_t	timeout;
	struct	target *tg;

	if (debug & D_FAILREP)
		logdebug("probe_fail_info(%s)\n", pii->pii_name);

	bzero(pfinfo, sizeof (*pfinfo));
	now = getcurrenttime();

	/*
	 * Start with the most recent probe, and count the number
	 * of consecutive probe failures. Latch the number of failures
	 * on hitting a probe success.
	 */
	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
	second_most_recent = PROBE_INDEX_PREV(most_recent);

	for (i = most_recent; i != pii->pii_probe_next;
	    i = PROBE_INDEX_PREV(i)) {
		pr_statp = &pii->pii_probes[i];

		assert(PR_STATUS_VALID(pr_statp->pr_status));

		switch (pr_statp->pr_status) {
		case PR_UNACKED:
			/*
			 * Only the most recent 2 probes can be unacknowledged
			 */
			assert(i == most_recent || i == second_most_recent);

			tg = pr_statp->pr_target;
			/*
			 * Target is guaranteed to exist in the unack. state
			 */
			assert(tg != NULL);
			/*
			 * The crtt could be zero for some reason,
			 * Eg. the phyint could be failed. If the crtt is
			 * not available use the group's probe interval,
			 * which is a worst case estimate.
			 */
			timeout = ns2ms(pr_statp->pr_hrtime_start);
			if (tg->tg_crtt != 0) {
				timeout += tg->tg_crtt;
			} else {
				timeout +=
				    pii->pii_phyint->pi_group->pg_probeint;
			}

			/*
			 * Deadline still in the future: the probe counts as
			 * neither a failure nor a success, so move on.
			 */
			if (TIME_GT(timeout, now))
				break;

			/*
			 * The deadline has passed: mark the probe lost and
			 * account for it exactly like an already-lost probe.
			 */
			pr_statp->pr_time_lost = timeout;
			probe_chstate(pr_statp, pii, PR_LOST);
			/* FALLTHRU */

		case PR_LOST:
			/*
			 * pf_tff is overwritten for every failure in the
			 * run; since the walk goes from newest to oldest it
			 * ends up holding pr_time_lost of the oldest failure
			 * in the run, i.e. the time of first failure.
			 */
			if (!pi_found_success) {
				pfinfo->pf_nfail++;
				pfinfo->pf_tff = pr_statp->pr_time_lost;
			}
			/*
			 * The per-target run is latched independently: it
			 * only ends on a success recorded for cur_tg itself.
			 */
			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
			    !tg_found_success) {
				pfinfo->pf_nfail_tg++;
			}
			break;

		default:
			/*
			 * We hit a success or unused slot. Latch the
			 * total number of recent consecutive failures.
			 */
			pi_found_success = _B_TRUE;
			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
				/*
				 * We hit a success for the desired target.
				 * Latch the number of recent consecutive
				 * failures for this target
				 */
				tg_found_success = _B_TRUE;
			}
		}
	}
}

/*
 * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
 * Nothing happens if the probe is already in that state; otherwise the new
 * status is stored and probe_state_event() is called (its return value is
 * deliberately ignored).
 */
void
probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
{
	if (pr->pr_status == state)
		return;

	pr->pr_status = state;
	(void) probe_state_event(pr, pii);
}

/*
 * Check if the phyint has been repaired. If no test address has been
 * configured, then consider the interface repaired if the link is up (unless
 * the link is flapping; see below). Otherwise, look for proof of probes
 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
 * either IPv4 or IPv6 instance, the phyint can be considered repaired.
2522 */ 2523 static boolean_t 2524 phyint_repaired(struct phyint *pi) 2525 { 2526 struct probe_success_count psinfo; 2527 struct phyint_instance *pii; 2528 struct target *cur_tg; 2529 int pr_ndx; 2530 uint_t cur_time; 2531 2532 if (debug & D_FAILREP) 2533 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2534 2535 if (LINK_DOWN(pi)) 2536 return (_B_FALSE); 2537 2538 /* 2539 * If we don't have any test addresses and the link is up, then 2540 * consider the interface repaired, unless we've received more than 2541 * LINK_UP_PERMIN link up notifications in the last minute, in 2542 * which case we keep the link down until we drop back below 2543 * the threshold. 2544 */ 2545 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2546 cur_time = getcurrenttime(); 2547 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2548 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2549 pi->pi_lfmsg_printed = 0; 2550 return (_B_TRUE); 2551 } 2552 if (!pi->pi_lfmsg_printed) { 2553 logerr("The link has come up on %s more than %d times " 2554 "in the last minute; disabling repair until it " 2555 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2556 pi->pi_lfmsg_printed = 1; 2557 } 2558 2559 return (_B_FALSE); 2560 } 2561 2562 pii = pi->pi_v4; 2563 if (PROBE_CAPABLE(pii)) { 2564 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2565 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2566 probe_success_info(pii, cur_tg, &psinfo); 2567 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2568 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2569 return (_B_TRUE); 2570 } 2571 2572 pii = pi->pi_v6; 2573 if (PROBE_CAPABLE(pii)) { 2574 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2575 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2576 probe_success_info(pii, cur_tg, &psinfo); 2577 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2578 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2579 return (_B_TRUE); 2580 } 2581 2582 return (_B_FALSE); 2583 } 2584 2585 /* 2586 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS 
call. 2587 */ 2588 boolean_t 2589 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) 2590 { 2591 int ifsock; 2592 struct lifreq lifr; 2593 uint64_t old_flags; 2594 2595 if (debug & D_FAILREP) { 2596 logdebug("change_pif_flags(%s): set %llx clear %llx\n", 2597 pi->pi_name, set, clear); 2598 } 2599 2600 if (pi->pi_v4 != NULL) 2601 ifsock = ifsock_v4; 2602 else 2603 ifsock = ifsock_v6; 2604 2605 /* 2606 * Get the current flags from the kernel, and set/clear the 2607 * desired phyint flags. Since we set only phyint flags, we can 2608 * do it on either IPv4 or IPv6 instance. 2609 */ 2610 (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2611 2612 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2613 if (errno != ENXIO) 2614 logperror("change_pif_flags: ioctl (get flags)"); 2615 return (_B_FALSE); 2616 } 2617 2618 old_flags = lifr.lifr_flags; 2619 lifr.lifr_flags |= set; 2620 lifr.lifr_flags &= ~clear; 2621 2622 if (old_flags == lifr.lifr_flags) { 2623 /* No change in the flags. No need to send ioctl */ 2624 return (_B_TRUE); 2625 } 2626 2627 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2628 if (errno != ENXIO) 2629 logperror("change_pif_flags: ioctl (set flags)"); 2630 return (_B_FALSE); 2631 } 2632 2633 /* 2634 * Keep pi_flags in synch. with actual flags. Assumes flags are 2635 * phyint flags. 2636 */ 2637 pi->pi_flags |= set; 2638 pi->pi_flags &= ~clear; 2639 2640 if (pi->pi_v4 != NULL) 2641 pi->pi_v4->pii_flags = pi->pi_flags; 2642 2643 if (pi->pi_v6 != NULL) 2644 pi->pi_v6->pii_flags = pi->pi_flags; 2645 2646 return (_B_TRUE); 2647 } 2648 2649 /* 2650 * icmp cksum computation for IPv4. 
2651 */ 2652 static int 2653 in_cksum(ushort_t *addr, int len) 2654 { 2655 register int nleft = len; 2656 register ushort_t *w = addr; 2657 register ushort_t answer; 2658 ushort_t odd_byte = 0; 2659 register int sum = 0; 2660 2661 /* 2662 * Our algorithm is simple, using a 32 bit accumulator (sum), 2663 * we add sequential 16 bit words to it, and at the end, fold 2664 * back all the carry bits from the top 16 bits into the lower 2665 * 16 bits. 2666 */ 2667 while (nleft > 1) { 2668 sum += *w++; 2669 nleft -= 2; 2670 } 2671 2672 /* mop up an odd byte, if necessary */ 2673 if (nleft == 1) { 2674 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2675 sum += odd_byte; 2676 } 2677 2678 /* 2679 * add back carry outs from top 16 bits to low 16 bits 2680 */ 2681 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2682 sum += (sum >> 16); /* add carry */ 2683 answer = ~sum; /* truncate to 16 bits */ 2684 return (answer); 2685 } 2686 2687 static void 2688 reset_snxt_basetimes(void) 2689 { 2690 struct phyint_instance *pii; 2691 2692 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2693 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2694 } 2695 } 2696 2697 /* 2698 * Is the address one of our own addresses? Unfortunately, 2699 * we cannot check our phyint tables to determine if the address 2700 * is our own. This is because, we don't track interfaces that 2701 * are not part of any group. We have to either use a 'bind' or 2702 * get the complete list of all interfaces using SIOCGLIFCONF, 2703 * to do this check. We could also use SIOCTMYADDR. 2704 * Bind fails for the local zone address, so we might include local zone 2705 * address as target address. If local zone address is a target address 2706 * and it is up, it is not possible to detect the interface failure. 2707 * SIOCTMYADDR also doesn't consider local zone address as own address. 
2708 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2709 * are stored in `localaddrs' 2710 */ 2711 boolean_t 2712 own_address(struct in6_addr addr) 2713 { 2714 addrlist_t *addrp; 2715 struct sockaddr_storage ss; 2716 int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6; 2717 2718 addr2storage(af, &addr, &ss); 2719 for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { 2720 if (sockaddrcmp(&ss, &addrp->al_addr)) 2721 return (_B_TRUE); 2722 } 2723 return (_B_FALSE); 2724 } 2725 2726 static int 2727 ns2ms(int64_t ns) 2728 { 2729 return (NSEC2MSEC(ns)); 2730 } 2731 2732 static int64_t 2733 tv2ns(struct timeval *tvp) 2734 { 2735 return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); 2736 } 2737