1 2 /* 3 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 4 * Use is subject to license terms. 5 */ 6 7 /* 8 * Copyright (c) 1987 Regents of the University of California. 9 * All rights reserved. 10 * 11 * Redistribution and use in source and binary forms are permitted 12 * provided that the above copyright notice and this paragraph are 13 * duplicated in all such forms and that any documentation, 14 * advertising materials, and other materials related to such 15 * distribution and use acknowledge that the software was developed 16 * by the University of California, Berkeley. The name of the 17 * University may not be used to endorse or promote products derived 18 * from this software without specific prior written permission. 19 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 21 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 22 */ 23 24 #include "mpd_defs.h" 25 #include "mpd_tables.h" 26 27 /* 28 * Probe types for probe() 29 */ 30 #define PROBE_UNI 0x1234 /* Unicast probe packet */ 31 #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 32 #define PROBE_RTT 0x9abc /* RTT only probe packet */ 33 34 #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 35 36 /* 37 * Format of probe / probe response packets. This is an ICMP Echo request 38 * or ICMP Echo reply. 
 * Packet format is the same for both IPv4 and IPv6.
 *
 * All multi-byte fields are carried in network byte order: probe() fills
 * them in with htons()/htonl()/htonll(), and the reply handlers convert
 * them back with ntohs()/ntohl()/ntohll().
 */
struct pr_icmp
{
	uint8_t  pr_icmp_type;		/* type field */
	uint8_t  pr_icmp_code;		/* code field */
	uint16_t pr_icmp_cksum;		/* checksum field */
	uint16_t pr_icmp_id;		/* Identification */
	uint16_t pr_icmp_seq;		/* sequence number */
	uint64_t pr_icmp_timestamp;	/* Time stamp (in ns) */
	uint32_t pr_icmp_mtype;		/* Message type */
};

/* IPv6 all-nodes multicast address (ff02::1); PROBE_MULTI destination */
static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x0,
				    0x0, 0x0, 0x0, 0x1 } };

/* IPv4 all-hosts multicast address (224.0.0.1); PROBE_MULTI destination */
static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };

static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */

static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
    int cmsg_type);
static void		pi_set_crtt(struct target *tg, int64_t m,
    boolean_t is_probe_uni);
static void		incoming_echo_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
static void		incoming_rtt_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);
static void		incoming_mcast_reply(struct phyint_instance *pii,
    struct pr_icmp *reply, struct in6_addr fromaddr);

static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
static boolean_t	check_exception_target(struct phyint_instance *pii,
    struct target *target);
static void		probe_fail_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_fail_count *pfinfo);
static void		probe_success_info(struct phyint_instance *pii,
    struct target *cur_tg, struct probe_success_count *psinfo);
static boolean_t	phyint_repaired(struct phyint *pi);

static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
static int 		in_cksum(ushort_t *addr, int len);
static void		reset_snxt_basetimes(void);
static int		ns2ms(int64_t ns);
static int64_t		tv2ns(struct timeval *);

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 3-tuple <pi_state, pg_state, I>.
 *
 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
 * IFF_OFFLINE is set.  If the phyint is also configured with a test address
 * (the common case) and probe targets, then a phyint must also successfully
 * be able to send and receive probes in order to remain in the PI_RUNNING
 * state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not mark the interface FAILED, but reports it as
 * IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state.  When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state -  PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
 *	PG_OK: All interfaces in the group are OK.
 *	PG_DEGRADED: Some interfaces in the group are unusable.
 *	PG_FAILED: All interfaces in the group are unusable.
 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all-hosts address, to
 *	reconstruct the target list. So the phyints are in the PI_NOTARGETS
 *	state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: This phyint will not send or receive packets.
 *	Usually, inactive is tied to standby interfaces that are not yet
 *	needed (e.g., no non-standby interfaces in the group have failed).
 *	When failback has been disabled (FAILBACK=no configured), phyint can
 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 *	subsequently recovers after a failure.
 *
 * Not all 9 possible combinations of the above 3-tuple are possible.
 *
 * I is tracked by IP. pi_state is tracked by mpathd.
154 * 155 * pi_state state machine 156 * --------------------------------------------------------------------------- 157 * Event State New State 158 * Action: 159 * --------------------------------------------------------------------------- 160 * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 161 * detection : set IFF_FAILED on this phyint 162 * 163 * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 164 * detection : set IFF_FAILED on this phyint 165 * 166 * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) 167 * detection -> (PI_RUNNING, I == 0) 168 * : clear IFF_FAILED on this phyint 169 * 170 * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) 171 * detection -> (PI_RUNNING, I == 1) 172 * : clear IFF_FAILED on this phyint 173 * : if failback is disabled set I == 1 174 * 175 * Group failure (perform on all phyints in the group) 176 * detection PI_RUNNING PI_FAILED 177 * (Router targets) : set IFF_FAILED 178 * 179 * Group failure (perform on all phyints in the group) 180 * detection PI_RUNNING PI_NOTARGETS 181 * (Host targets) : set IFF_FAILED 182 * : delete the target list on all phyints 183 * --------------------------------------------------------------------------- 184 */ 185 186 struct probes_missed probes_missed; 187 188 /* 189 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 190 * will be added on by the kernel. The id field identifies this phyint. 191 * and the sequence number is an increasing (modulo 2^^16) integer. The data 192 * portion holds the time value when the packet is sent. On echo this is 193 * extracted to compute the round-trip time. Three different types of 194 * probe packets are used. 195 * 196 * PROBE_UNI: This type is used to do failure detection / failure recovery 197 * and RTT calculation. PROBE_UNI probes are spaced apart in time, 198 * not less than the current CRTT. pii_probes[] stores data 199 * about these probes. These packets consume sequence number space. 
200 * 201 * PROBE_RTT: This type is used to make only rtt measurements. Normally these 202 * are not used. Under heavy network load, the rtt may go up very high, 203 * due to a spike, or may appear to go high, due to extreme scheduling 204 * delays. Once the network stress is removed, mpathd takes long time to 205 * recover, because the probe_interval is already high, and it takes 206 * a long time to send out sufficient number of probes to bring down the 207 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 208 * user_probe_interval ms. and will cause only rtt updates. These packets 209 * do not consume sequence number space nor is information about these 210 * packets stored in the pii_probes[] 211 * 212 * PROBE_MULTI: This type is only used to construct a list of targets, when 213 * no targets are known. The packet is multicast to the all hosts addr. 214 */ 215 static void 216 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) 217 { 218 hrtime_t sent_hrtime; 219 struct timeval sent_tv; 220 struct pr_icmp probe_pkt; /* Probe packet */ 221 struct sockaddr_storage targ; /* target address */ 222 uint_t targaddrlen; /* targed address length */ 223 int pr_ndx; /* probe index in pii->pii_probes[] */ 224 boolean_t sent = _B_TRUE; 225 226 if (debug & D_TARGET) { 227 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), 228 pii->pii_name, probe_type, start_hrtime); 229 } 230 231 assert(pii->pii_probe_sock != -1); 232 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 233 probe_type == PROBE_RTT); 234 235 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 236 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 237 probe_pkt.pr_icmp_code = 0; 238 probe_pkt.pr_icmp_cksum = 0; 239 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 240 241 /* 242 * Since there is no need to do arithmetic on the icmpid, 243 * (only equality check is done) pii_icmpid is stored in 244 * network byte order at initialization itself. 
245 */ 246 probe_pkt.pr_icmp_id = pii->pii_icmpid; 247 probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); 248 probe_pkt.pr_icmp_mtype = htonl(probe_type); 249 250 /* 251 * If probe_type is PROBE_MULTI, this packet will be multicast to 252 * the all hosts address. Otherwise it is unicast to the next target. 253 */ 254 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 255 pii->pii_rtt_target_next != NULL)); 256 257 bzero(&targ, sizeof (targ)); 258 targ.ss_family = pii->pii_af; 259 260 if (pii->pii_af == AF_INET6) { 261 struct in6_addr *addr6; 262 263 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; 264 targaddrlen = sizeof (struct sockaddr_in6); 265 if (probe_type == PROBE_MULTI) { 266 *addr6 = all_nodes_mcast_v6; 267 } else if (probe_type == PROBE_UNI) { 268 *addr6 = pii->pii_target_next->tg_address; 269 } else { /* type is PROBE_RTT */ 270 *addr6 = pii->pii_rtt_target_next->tg_address; 271 } 272 } else { 273 struct in_addr *addr4; 274 275 addr4 = &((struct sockaddr_in *)&targ)->sin_addr; 276 targaddrlen = sizeof (struct sockaddr_in); 277 if (probe_type == PROBE_MULTI) { 278 *addr4 = all_nodes_mcast_v4; 279 } else if (probe_type == PROBE_UNI) { 280 IN6_V4MAPPED_TO_INADDR( 281 &pii->pii_target_next->tg_address, addr4); 282 } else { /* type is PROBE_RTT */ 283 IN6_V4MAPPED_TO_INADDR( 284 &pii->pii_rtt_target_next->tg_address, addr4); 285 } 286 287 /* 288 * Compute the IPv4 icmp checksum. Does not cover the IP header. 289 */ 290 probe_pkt.pr_icmp_cksum = 291 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 292 } 293 294 /* 295 * Use the current time as the time we sent. Not atomic, but the best 296 * we can do from here. 
297 */ 298 sent_hrtime = gethrtime(); 299 (void) gettimeofday(&sent_tv, NULL); 300 if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, 301 (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) { 302 logperror_pii(pii, "probe: probe sendto"); 303 sent = _B_FALSE; 304 } 305 306 /* 307 * If this is a PROBE_UNI probe packet being unicast to a target, then 308 * update our tables. We will need this info in processing the probe 309 * response. PROBE_MULTI and PROBE_RTT packets are not used for 310 * the purpose of failure or recovery detection. PROBE_MULTI packets 311 * are only used to construct a list of targets. PROBE_RTT packets are 312 * used only for updating the rtt and not for failure detection. 313 */ 314 if (probe_type == PROBE_UNI && sent) { 315 pr_ndx = pii->pii_probe_next; 316 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 317 318 /* Collect statistics, before we reuse the last slot. */ 319 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 320 pii->pii_cum_stats.lost++; 321 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 322 pii->pii_cum_stats.acked++; 323 pii->pii_cum_stats.sent++; 324 325 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; 326 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; 327 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; 328 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; 329 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 330 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); 331 332 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 333 pii->pii_target_next = target_next(pii->pii_target_next); 334 assert(pii->pii_target_next != NULL); 335 /* 336 * If we have a single variable to denote the next target to 337 * probe for both rtt probes and failure detection probes, we 338 * could end up with a situation where the failure detection 339 * probe targets become disjoint from the rtt probe targets. 340 * Eg. 
if 2 targets and the actual fdt is double the user 341 * specified fdt. So we have 2 variables. In this scheme 342 * we also reset pii_rtt_target_next for every fdt probe, 343 * though that may not be necessary. 344 */ 345 pii->pii_rtt_target_next = pii->pii_target_next; 346 pii->pii_snxt++; 347 } else if (probe_type == PROBE_RTT) { 348 pii->pii_rtt_target_next = 349 target_next(pii->pii_rtt_target_next); 350 assert(pii->pii_rtt_target_next != NULL); 351 } 352 } 353 354 /* 355 * Incoming IPv4 data from wire, is received here. Called from main. 356 */ 357 void 358 in_data(struct phyint_instance *pii) 359 { 360 struct sockaddr_in from; 361 struct in6_addr fromaddr; 362 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 363 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 364 struct ip *ip; 365 int iphlen; 366 int len; 367 char abuf[INET_ADDRSTRLEN]; 368 struct msghdr msg; 369 struct iovec iov; 370 struct pr_icmp *reply; 371 struct timeval *recv_tvp; 372 373 if (debug & D_PROBE) { 374 logdebug("in_data(%s %s)\n", 375 AF_STR(pii->pii_af), pii->pii_name); 376 } 377 378 iov.iov_base = (char *)in_packet; 379 iov.iov_len = sizeof (in_packet); 380 msg.msg_iov = &iov; 381 msg.msg_iovlen = 1; 382 msg.msg_name = (struct sockaddr *)&from; 383 msg.msg_namelen = sizeof (from); 384 msg.msg_control = ancillary_data; 385 msg.msg_controllen = sizeof (ancillary_data); 386 387 /* 388 * Poll has already told us that a message is waiting, 389 * on this socket. Read it now. We should not block. 390 */ 391 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 392 logperror_pii(pii, "in_data: recvmsg"); 393 return; 394 } 395 396 /* 397 * If the datalink has indicated the link is down, don't go 398 * any further. 
399 */ 400 if (LINK_DOWN(pii->pii_phyint)) 401 return; 402 403 /* Get the printable address for error reporting */ 404 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 405 406 /* Ignore packets > 64k or control buffers that don't fit */ 407 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 408 if (debug & D_PKTBAD) { 409 logdebug("Truncated message: msg_flags 0x%x from %s\n", 410 msg.msg_flags, abuf); 411 } 412 return; 413 } 414 415 /* Make sure packet contains at least minimum ICMP header */ 416 ip = (struct ip *)in_packet; 417 iphlen = ip->ip_hl << 2; 418 if (len < iphlen + ICMP_MINLEN) { 419 if (debug & D_PKTBAD) { 420 logdebug("in_data: packet too short (%d bytes)" 421 " from %s\n", len, abuf); 422 } 423 return; 424 } 425 426 /* 427 * Subtract the IP hdr length, 'len' will be length of the probe 428 * reply, starting from the icmp hdr. 429 */ 430 len -= iphlen; 431 /* LINTED */ 432 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 433 434 /* Probe replies are icmp echo replies. Ignore anything else */ 435 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 436 return; 437 438 /* 439 * The icmp id should match what we sent, which is stored 440 * in pi_icmpid. The icmp code for reply must be 0. 
441 * The reply content must be a struct pr_icmp 442 */ 443 if (reply->pr_icmp_id != pii->pii_icmpid) { 444 /* Not in response to our probe */ 445 return; 446 } 447 448 if (reply->pr_icmp_code != 0) { 449 logtrace("probe reply code %d from %s on %s\n", 450 reply->pr_icmp_code, abuf, pii->pii_name); 451 return; 452 } 453 454 if (len < sizeof (struct pr_icmp)) { 455 logtrace("probe reply too short: %d bytes from %s on %s\n", 456 len, abuf, pii->pii_name); 457 return; 458 } 459 460 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 461 if (recv_tvp == NULL) { 462 logtrace("message without timestamp from %s on %s\n", 463 abuf, pii->pii_name); 464 return; 465 } 466 467 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 468 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 469 /* Unicast probe reply */ 470 incoming_echo_reply(pii, reply, fromaddr, recv_tvp); 471 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 472 /* Multicast reply */ 473 incoming_mcast_reply(pii, reply, fromaddr); 474 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 475 incoming_rtt_reply(pii, reply, fromaddr); 476 } else { 477 /* Probably not in response to our probe */ 478 logtrace("probe reply type: %d from %s on %s\n", 479 reply->pr_icmp_mtype, abuf, pii->pii_name); 480 return; 481 } 482 } 483 484 /* 485 * Incoming IPv6 data from wire is received here. Called from main. 
486 */ 487 void 488 in6_data(struct phyint_instance *pii) 489 { 490 struct sockaddr_in6 from; 491 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 492 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 493 int len; 494 char abuf[INET6_ADDRSTRLEN]; 495 struct msghdr msg; 496 struct iovec iov; 497 void *opt; 498 struct pr_icmp *reply; 499 struct timeval *recv_tvp; 500 501 if (debug & D_PROBE) { 502 logdebug("in6_data(%s %s)\n", 503 AF_STR(pii->pii_af), pii->pii_name); 504 } 505 506 iov.iov_base = (char *)in_packet; 507 iov.iov_len = sizeof (in_packet); 508 msg.msg_iov = &iov; 509 msg.msg_iovlen = 1; 510 msg.msg_name = (struct sockaddr *)&from; 511 msg.msg_namelen = sizeof (from); 512 msg.msg_control = ancillary_data; 513 msg.msg_controllen = sizeof (ancillary_data); 514 515 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 516 logperror_pii(pii, "in6_data: recvmsg"); 517 return; 518 } 519 520 /* 521 * If the datalink has indicated that the link is down, don't go 522 * any further. 523 */ 524 if (LINK_DOWN(pii->pii_phyint)) 525 return; 526 527 /* Get the printable address for error reporting */ 528 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 529 if (len < ICMP_MINLEN) { 530 if (debug & D_PKTBAD) { 531 logdebug("Truncated message: msg_flags 0x%x from %s\n", 532 msg.msg_flags, abuf); 533 } 534 return; 535 } 536 /* Ignore packets > 64k or control buffers that don't fit */ 537 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 538 if (debug & D_PKTBAD) { 539 logdebug("Truncated message: msg_flags 0x%x from %s\n", 540 msg.msg_flags, abuf); 541 } 542 return; 543 } 544 545 reply = (struct pr_icmp *)in_packet; 546 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 547 return; 548 549 if (reply->pr_icmp_id != pii->pii_icmpid) { 550 /* Not in response to our probe */ 551 return; 552 } 553 554 /* 555 * The kernel has already verified the the ICMP checksum. 
556 */ 557 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 558 logtrace("ICMPv6 echo reply source address not linklocal from " 559 "%s on %s\n", abuf, pii->pii_name); 560 return; 561 } 562 opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); 563 if (opt != NULL) { 564 /* Can't allow routing headers in probe replies */ 565 logtrace("message with routing header from %s on %s\n", 566 abuf, pii->pii_name); 567 return; 568 } 569 570 if (reply->pr_icmp_code != 0) { 571 logtrace("probe reply code: %d from %s on %s\n", 572 reply->pr_icmp_code, abuf, pii->pii_name); 573 return; 574 } 575 if (len < (sizeof (struct pr_icmp))) { 576 logtrace("probe reply too short: %d bytes from %s on %s\n", 577 len, abuf, pii->pii_name); 578 return; 579 } 580 581 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 582 if (recv_tvp == NULL) { 583 logtrace("message without timestamp from %s on %s\n", 584 abuf, pii->pii_name); 585 return; 586 } 587 588 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 589 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); 590 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 591 incoming_mcast_reply(pii, reply, from.sin6_addr); 592 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 593 incoming_rtt_reply(pii, reply, from.sin6_addr); 594 } else { 595 /* Probably not in response to our probe */ 596 logtrace("probe reply type: %d from %s on %s\n", 597 reply->pr_icmp_mtype, abuf, pii->pii_name); 598 } 599 } 600 601 /* 602 * Process the incoming rtt reply, in response to our rtt probe. 603 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 604 * have any stored information about the probe we sent. So we don't log 605 * any errors if we receive bad replies. 
606 */ 607 static void 608 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 609 struct in6_addr fromaddr) 610 { 611 int64_t m; /* rtt measurement in ns */ 612 char abuf[INET6_ADDRSTRLEN]; 613 struct target *target; 614 struct phyint_group *pg; 615 616 /* Get the printable address for error reporting */ 617 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 618 619 if (debug & D_PROBE) { 620 logdebug("incoming_rtt_reply: %s %s %s\n", 621 AF_STR(pii->pii_af), pii->pii_name, abuf); 622 } 623 624 /* Do we know this target ? */ 625 target = target_lookup(pii, fromaddr); 626 if (target == NULL) 627 return; 628 629 m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); 630 /* Invalid rtt. It has wrapped around */ 631 if (m < 0) 632 return; 633 634 /* 635 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 636 * The initial few responses after the interface is repaired may 637 * contain high rtt's because they could have been queued up waiting 638 * for ARP/NDP resolution on a failed interface. 639 */ 640 pg = pii->pii_phyint->pi_group; 641 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 642 return; 643 644 /* 645 * Update rtt only if the new rtt is lower than the current rtt. 646 * (specified by the 3rd parameter to pi_set_crtt). 647 * If a spike has caused the current probe_interval to be > 648 * user_probe_interval, then this mechanism is used to bring down 649 * the rtt rapidly once the network stress is removed. 650 * If the new rtt is higher than the current rtt, we don't want to 651 * update the rtt. We are having more than 1 outstanding probe and 652 * the increase in rtt we are seeing is being unnecessarily weighted 653 * many times. The regular rtt update will be handled by 654 * incoming_echo_reply() and will take care of any rtt increase. 
655 */ 656 pi_set_crtt(target, m, _B_FALSE); 657 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 658 (user_failure_detection_time < pg->pg_fdt) && 659 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 660 /* 661 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 662 * investigate if we can improve the failure detection time to 663 * meet whatever the user specified. 664 */ 665 if (check_pg_crtt_improved(pg)) { 666 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 667 user_failure_detection_time); 668 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 669 if (pii->pii_phyint->pi_group != phyint_anongroup) { 670 logerr("Improved failure detection time %d ms " 671 "on (%s %s) for group \"%s\"\n", 672 pg->pg_fdt, AF_STR(pii->pii_af), 673 pii->pii_name, 674 pii->pii_phyint->pi_group->pg_name); 675 } 676 if (user_failure_detection_time == pg->pg_fdt) { 677 /* Avoid any truncation or rounding errors */ 678 pg->pg_probeint = user_probe_interval; 679 /* 680 * No more rtt probes will be sent. The actual 681 * fdt has dropped to the user specified value. 682 * pii_fd_snxt_basetime and pii_snxt_basetime 683 * will be in sync henceforth. 684 */ 685 reset_snxt_basetimes(); 686 } 687 } 688 } 689 } 690 691 /* 692 * Process the incoming echo reply, in response to our unicast probe. 
 * Common for both IPv4 and IPv6
 *
 * pii is the phyint instance the reply arrived on, reply points at the
 * received packet, fromaddr is the sender and recv_tvp the receive
 * timestamp taken from the message's ancillary data.
 *
 * The reply is matched against pii->pii_probes[] by sequence number. Out of
 * window, fake and duplicate replies only bump pii_cum_stats.unknown. For a
 * genuine reply the target's crtt is updated when the measurement looks
 * sane, the group's probe interval / failure detection time may be raised
 * or lowered as a result, and finally the probe slot is marked PR_ACKED and
 * pii_rack is advanced.
 */
static void
incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
    struct in6_addr fromaddr, struct timeval *recv_tvp)
{
	int64_t	m;		/* rtt measurement in ns */
	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
	char	abuf[INET6_ADDRSTRLEN];
	int	pr_ndx;
	struct	target *target;
	boolean_t exception;
	uint64_t pr_icmp_timestamp;
	uint16_t pr_icmp_seq;
	struct	probe_stats *pr_statp;
	struct phyint_group *pg = pii->pii_phyint->pi_group;

	/* Get the printable address for error reporting */
	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));

	if (debug & D_PROBE) {
		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
		    AF_STR(pii->pii_af), pii->pii_name, abuf,
		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
	}

	/* Convert the fields we need to host byte order once */
	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
	pr_icmp_seq = ntohs(reply->pr_icmp_seq);

	/* Reject out of window probe replies */
	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		pii->pii_cum_stats.unknown++;
		return;
	}

	cur_hrtime = gethrtime();
	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
	if (m < 0) {
		/*
		 * This is a ridiculously high value of rtt. rtt has wrapped
		 * around. Log a message, and ignore the rtt. (Processing
		 * continues; the m < 0 test further down skips the rtt
		 * update.)
		 */
		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
	}

	/*
	 * Get the probe index pr_ndx corresponding to the received icmp seq.
	 * number in our pii->pii_probes[] array. The icmp sequence number
	 * pii_snxt corresponds to the probe index pii->pii_probe_next
	 */
	pr_ndx = MOD_SUB(pii->pii_probe_next,
	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);

	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));

	target = pii->pii_probes[pr_ndx].pr_target;

	/*
	 * Perform sanity checks, whether this probe reply that we
	 * have received is genuine
	 */
	if (target != NULL) {
		/*
		 * Compare the src. addr of the received ICMP or ICMPv6
		 * probe reply with the target address in our tables.
		 */
		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
			/*
			 * We don't have any record of having sent a probe to
			 * this target. This is a fake probe reply. Log an error
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			/*
			 * The address matches, but our tables indicate that
			 * this probe reply has been acked already. So this
			 * is a duplicate probe reply. Log an error
			 */
			logtrace("probe status %d Duplicate probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
			pii->pii_cum_stats.unknown++;
			return;
		}
	} else {
		/*
		 * Target must not be NULL in the PR_UNACKED state
		 */
		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
			/*
			 * The probe stats slot is unused. So we didn't
			 * send out any probe to this target. This is a fake.
			 * Log an error.
			 */
			logtrace("probe status %d Fake probe reply seq %u "
			    "snxt %u on %s from %s\n",
			    pii->pii_probes[pr_ndx].pr_status,
			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
		}
		/* Without a recorded target the reply cannot be used */
		pii->pii_cum_stats.unknown++;
		return;
	}

	/*
	 * If the rtt does not appear to be right, don't update the
	 * rtt stats. This can happen if the system dropped into the
	 * debugger, or the system was hung or too busy for a
	 * substantial time that we didn't get a chance to run.
	 */
	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
		/*
		 * If the probe corresponding to this received response
		 * was truly sent 'm' ns. ago, then this response must
		 * have been rejected by the sequence number checks. The
		 * fact that it has passed the sequence number checks
		 * means that the measured rtt is wrong. We were probably
		 * scheduled long after the packet was received.
		 */
		goto out;
	}

	/*
	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
	 * The initial few responses after the interface is repaired may
	 * contain high rtt's because they could have been queued up waiting
	 * for ARP/NDP resolution on a failed interface.
	 */
	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
		goto out;

	/*
	 * Don't update the Conservative Round Trip Time estimate for this
	 * (phyint, target) pair if this is not the highest ack seq seen
	 * thus far on this target.
	 */
	if (!highest_ack_tg(pr_icmp_seq, target))
		goto out;

	/*
	 * Always update the rtt. This is a failure detection probe
	 * and we want to measure both increase / decrease in rtt.
	 */
	pi_set_crtt(target, m, _B_TRUE);

	/*
	 * If the crtt exceeds the average time between probes,
	 * investigate if this slow target is an exception. If so we
	 * can avoid this target and still meet the failure detection
	 * time. Otherwise we can't meet the failure detection time.
	 */
	if (target->tg_crtt > pg->pg_probeint) {
		exception = check_exception_target(pii, target);
		if (exception) {
			/*
			 * This target is exceptionally slow. Don't use it
			 * for future probes. check_exception_target() has
			 * made sure that we have at least MIN_PROBE_TARGETS
			 * other active targets
			 */
			if (pii->pii_targets_are_routers) {
				/*
				 * This is a slow router, mark it as slow
				 * and don't use it for further probes. We
				 * don't delete it, since it will be populated
				 * again when we do a router scan. Hence we
				 * need to maintain extra state (unlike the
				 * host case below).  Mark it as TG_SLOW.
				 */
				if (target->tg_status == TG_ACTIVE)
					pii->pii_ntargets--;
				target->tg_status = TG_SLOW;
				target->tg_latime = gethrtime();
				target->tg_rtt_sa = -1;
				target->tg_crtt = 0;
				target->tg_rtt_sd = 0;
				if (pii->pii_target_next == target) {
					pii->pii_target_next =
					    target_next(target);
				}
			} else {
				/*
				 * the slow target is not a router, we can
				 * just delete it. Send an icmp multicast and
				 * pick the fastest responder that is not
				 * already an active target. target_delete()
				 * adjusts pii->pii_target_next
				 */
				target_delete(target);
				probe(pii, PROBE_MULTI, cur_hrtime);
			}
		} else {
			/*
			 * We can't meet the failure detection time.
			 * Log a message, and update the detection time to
			 * whatever we can achieve.
			 */
			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
			last_fdt_bumpup_time = gethrtime();
			if (pg != phyint_anongroup) {
				logerr("Cannot meet requested failure detection"
				    " time of %d ms on (%s %s) new failure"
				    " detection time for group \"%s\" is %d"
				    " ms\n", user_failure_detection_time,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name, pg->pg_fdt);
			}
		}
	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
	    (user_failure_detection_time < pg->pg_fdt) &&
	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
		/*
		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
		 * investigate if we can improve the failure detection time to
		 * meet whatever the user specified.
		 */
		if (check_pg_crtt_improved(pg)) {
			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
			    user_failure_detection_time);
			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
			if (pg != phyint_anongroup) {
				logerr("Improved failure detection time %d ms "
				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
				    AF_STR(pii->pii_af), pii->pii_name,
				    pg->pg_name);
			}
			if (user_failure_detection_time == pg->pg_fdt) {
				/* Avoid any truncation or rounding errors */
				pg->pg_probeint = user_probe_interval;
				/*
				 * No more rtt probes will be sent. The actual
				 * fdt has dropped to the user specified value.
				 * pii_fd_snxt_basetime and pii_snxt_basetime
				 * will be in sync henceforth.
				 */
				reset_snxt_basetimes();
			}
		}
	}
out:
	/*
	 * Record when the ack was processed, and estimate when it was
	 * received by adding the gettimeofday-based interval between send
	 * and receive to the hrtime at which the probe was sent.
	 */
	pr_statp = &pii->pii_probes[pr_ndx];
	pr_statp->pr_hrtime_ackproc = cur_hrtime;
	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));

	probe_chstate(pr_statp, pii, PR_ACKED);

	/*
	 * Update pii->pii_rack, i.e. the sequence number of the last received
	 * probe response, based on the echo reply we have received now, if
	 * either of the following conditions are satisfied.
	 * a. pii_rack is outside the current receive window of
	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
	 *    This means we have not received probe responses for a
	 *    long time, and the sequence number has wrapped around.
	 * b. pii_rack is within the current receive window and this echo
	 *    reply corresponds to the highest sequence number we have seen
	 *    so far.
	 */
	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
		pii->pii_rack = pr_icmp_seq;
	}
}

/*
 * Returns true if no probe to target tg with a sequence number greater
 * than seq has been acked so far, i.e. seq is the highest acked seq for
 * tg. Returns false as soon as a more recent acked probe for tg is found.
 */
static boolean_t
highest_ack_tg(uint16_t seq, struct target *tg)
{
	struct phyint_instance *pii;
	int	 pr_ndx;
	uint16_t pr_seq;

	pii = tg->tg_phyint_inst;

	/*
	 * Get the seq number of the most recent probe sent so far,
	 * and also get the corresponding probe index in the probe stats
	 * array.
	 */
	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
	pr_seq = pii->pii_snxt;
	pr_seq--;

	/*
	 * Start from the most recent probe and walk back, trying to find
	 * an acked probe corresponding to target tg. The walk stops when
	 * the index comes back round to pii_probe_next, so that slot itself
	 * is not examined.
	 */
	for (; pr_ndx != pii->pii_probe_next;
	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
		if (pii->pii_probes[pr_ndx].pr_target == tg &&
		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
			if (SEQ_GT(pr_seq, seq))
				return (_B_FALSE);
		}
	}
	return (_B_TRUE);
}

/*
 * Check whether the crtt for the group has improved by a factor of
 * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
 * detection time flapping in the face of small crtt changes.
1011 */ 1012 static boolean_t 1013 check_pg_crtt_improved(struct phyint_group *pg) 1014 { 1015 struct phyint *pi; 1016 1017 if (debug & D_PROBE) 1018 logdebug("check_pg_crtt_improved()\n"); 1019 1020 /* 1021 * The crtt for the group is only improved if each phyint_instance 1022 * for both ipv4 and ipv6 is improved. 1023 */ 1024 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1025 if (!check_pii_crtt_improved(pi->pi_v4) || 1026 !check_pii_crtt_improved(pi->pi_v6)) 1027 return (_B_FALSE); 1028 } 1029 1030 return (_B_TRUE); 1031 } 1032 1033 /* 1034 * Check whether the crtt has improved substantially on this phyint_instance. 1035 * Returns _B_TRUE if there's no crtt information available, because pii 1036 * is NULL or the phyint_instance is not capable of probing. 1037 */ 1038 boolean_t 1039 check_pii_crtt_improved(struct phyint_instance *pii) { 1040 struct target *tg; 1041 1042 if (pii == NULL) 1043 return (_B_TRUE); 1044 1045 if (!PROBE_CAPABLE(pii) || 1046 pii->pii_phyint->pi_state == PI_FAILED) 1047 return (_B_TRUE); 1048 1049 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1050 if (tg->tg_status != TG_ACTIVE) 1051 continue; 1052 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1053 LOWER_FDT_TRIGGER)) { 1054 return (_B_FALSE); 1055 } 1056 } 1057 1058 return (_B_TRUE); 1059 } 1060 1061 /* 1062 * This target responds very slowly to probes. The target's crtt exceeds 1063 * the probe interval of its group. 
Compare against other targets
 * and determine if this target is an exception, if so return true, else false
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	struct target *peer;
	char abuf[INET6_ADDRSTRLEN];

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    abuf, sizeof (abuf)));
	}

	/*
	 * A verdict needs more than MIN_PROBE_TARGETS targets to compare
	 * against; with fewer, never declare the target an exception.
	 */
	if (pii->pii_ntargets <= MIN_PROBE_TARGETS)
		return (_B_FALSE);

	/*
	 * `target' is already known to exceed the group's probe interval.
	 * It is the odd one out only if every other active target has a
	 * crtt no larger than (group probe interval / EXCEPTION_FACTOR).
	 */
	for (peer = pii->pii_targets; peer != NULL; peer = peer->tg_next) {
		if (peer == target || peer->tg_status != TG_ACTIVE)
			continue;

		if (peer->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint /
		    EXCEPTION_FACTOR)
			return (_B_FALSE);
	}

	return (_B_TRUE);
}

/*
 * Update the target list. The icmp all hosts multicast has given us
 * some host to which we can send probes. If we already have sufficient
 * targets, discard it.
1110 */ 1111 static void 1112 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1113 struct in6_addr fromaddr) 1114 /* ARGSUSED */ 1115 { 1116 int af; 1117 char abuf[INET6_ADDRSTRLEN]; 1118 struct phyint *pi; 1119 1120 if (debug & D_PROBE) { 1121 logdebug("incoming_mcast_reply(%s %s %s)\n", 1122 AF_STR(pii->pii_af), pii->pii_name, 1123 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1124 } 1125 1126 /* 1127 * Using host targets is a fallback mechanism. If we have 1128 * found a router, don't add this host target. If we already 1129 * know MAX_PROBE_TARGETS, don't add another target. 1130 */ 1131 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1132 if (pii->pii_targets != NULL) { 1133 if (pii->pii_targets_are_routers || 1134 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1135 return; 1136 } 1137 } 1138 1139 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1140 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1141 /* 1142 * Guard against response from 0.0.0.0 1143 * and ::. Log a trace message 1144 */ 1145 logtrace("probe response from %s on %s\n", 1146 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1147 pii->pii_name); 1148 return; 1149 } 1150 1151 /* 1152 * This address is one of our own, so reject this address as a 1153 * valid probe target. 1154 */ 1155 af = pii->pii_af; 1156 if (own_address(fromaddr)) 1157 return; 1158 1159 /* 1160 * If the phyint is part a named group, then add the address to all 1161 * members of the group. Otherwise, add the address only to the 1162 * phyint itself, since other phyints in the anongroup may not be on 1163 * the same subnet. 
1164 */ 1165 pi = pii->pii_phyint; 1166 if (pi->pi_group == phyint_anongroup) { 1167 target_add(pii, fromaddr, _B_FALSE); 1168 } else { 1169 pi = pi->pi_group->pg_phyint; 1170 for (; pi != NULL; pi = pi->pi_pgnext) 1171 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1172 } 1173 } 1174 1175 /* 1176 * Compute CRTT given an existing scaled average, scaled deviation estimate 1177 * and a new rtt time. The formula is from Jacobson and Karels' 1178 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1179 * are the same as those in Appendix A.2 of that paper. 1180 * 1181 * m = new measurement 1182 * sa = scaled RTT average (8 * average estimates) 1183 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1184 * crtt = Conservative round trip time. Used to determine whether probe 1185 * has timed out. 1186 * 1187 * New scaled average and deviation are passed back via sap and svp 1188 */ 1189 static int64_t 1190 compute_crtt(int64_t *sap, int64_t *svp, int64_t m) 1191 { 1192 int64_t sa = *sap; 1193 int64_t sv = *svp; 1194 int64_t crtt; 1195 int64_t saved_m = m; 1196 1197 assert(*sap >= -1); 1198 assert(*svp >= 0); 1199 1200 if (sa != -1) { 1201 /* 1202 * Update average estimator: 1203 * new rtt = old rtt + 1/8 Error 1204 * where Error = m - old rtt 1205 * i.e. 8 * new rtt = 8 * old rtt + Error 1206 * i.e. new sa = old sa + Error 1207 */ 1208 m -= sa >> 3; /* m is now Error in estimate. */ 1209 if ((sa += m) < 0) { 1210 /* Don't allow the smoothed average to be negative. */ 1211 sa = 0; 1212 } 1213 1214 /* 1215 * Update deviation estimator: 1216 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1217 * i.e. 4 * new mdev = 4 * old mdev + 1218 * (abs(Error) - old mdev) 1219 * i.e. new sv = old sv + (abs(Error) - old mdev) 1220 */ 1221 if (m < 0) 1222 m = -m; 1223 m -= sv >> 2; 1224 sv += m; 1225 } else { 1226 /* Initialization. This is the first response received. 
*/ 1227 sa = (m << 3); 1228 sv = (m << 1); 1229 } 1230 1231 crtt = (sa >> 3) + sv; 1232 1233 if (debug & D_PROBE) { 1234 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " 1235 "crtt = %lld\n", saved_m, sa, sv, crtt); 1236 } 1237 1238 *sap = sa; 1239 *svp = sv; 1240 1241 /* 1242 * CRTT = average estimates + 4 * deviation estimates 1243 * = sa / 8 + sv 1244 */ 1245 return (crtt); 1246 } 1247 1248 static void 1249 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) 1250 { 1251 struct phyint_instance *pii = tg->tg_phyint_inst; 1252 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1253 int64_t sa = tg->tg_rtt_sa; 1254 int64_t sv = tg->tg_rtt_sd; 1255 int new_crtt; 1256 int i; 1257 1258 if (debug & D_PROBE) 1259 logdebug("pi_set_crtt: target - m %lld\n", m); 1260 1261 /* store the round trip time, in case we need to defer computation */ 1262 tg->tg_deferred[tg->tg_num_deferred] = m; 1263 1264 new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); 1265 1266 /* 1267 * If this probe's round trip time would singlehandedly cause an 1268 * increase in the group's probe interval consider it suspect. 1269 */ 1270 if ((new_crtt > probe_interval) && is_probe_uni) { 1271 if (debug & D_PROBE) { 1272 logdebug("Received a suspect probe on %s, new_crtt =" 1273 " %d, probe_interval = %d, num_deferred = %d\n", 1274 pii->pii_probe_logint->li_name, new_crtt, 1275 probe_interval, tg->tg_num_deferred); 1276 } 1277 1278 /* 1279 * If we've deferred as many rtts as we plan on deferring, then 1280 * assume the link really did slow down and process all queued 1281 * rtts 1282 */ 1283 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1284 if (debug & D_PROBE) { 1285 logdebug("Received MAXDEFERREDRTT probes which " 1286 "would cause an increased probe_interval. 
" 1287 "Integrating queued rtt data points.\n"); 1288 } 1289 1290 for (i = 0; i <= tg->tg_num_deferred; i++) { 1291 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, 1292 &tg->tg_rtt_sd, tg->tg_deferred[i])); 1293 } 1294 1295 tg->tg_num_deferred = 0; 1296 } else { 1297 tg->tg_num_deferred++; 1298 } 1299 return; 1300 } 1301 1302 /* 1303 * If this is a normal probe, or an RTT probe that would lead to a 1304 * reduced CRTT, then update our CRTT data. Further, if this was 1305 * a normal probe, pitch any deferred probes since our probes are 1306 * again being answered within our CRTT estimates. 1307 */ 1308 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1309 tg->tg_rtt_sa = sa; 1310 tg->tg_rtt_sd = sv; 1311 tg->tg_crtt = new_crtt; 1312 if (is_probe_uni) 1313 tg->tg_num_deferred = 0; 1314 } 1315 } 1316 1317 /* 1318 * Return a pointer to the specified option buffer. 1319 * If not found return NULL. 1320 */ 1321 static void * 1322 find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) 1323 { 1324 struct cmsghdr *cmsg; 1325 1326 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1327 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1328 if (cmsg->cmsg_level == cmsg_level && 1329 cmsg->cmsg_type == cmsg_type) { 1330 return (CMSG_DATA(cmsg)); 1331 } 1332 } 1333 return (NULL); 1334 } 1335 1336 /* 1337 * Try to activate another INACTIVE interface in the same group as `pi'. 1338 * Prefer STANDBY INACTIVE to just INACTIVE. 
1339 */ 1340 void 1341 phyint_activate_another(struct phyint *pi) 1342 { 1343 struct phyint *pi2; 1344 struct phyint *inactivepi = NULL; 1345 1346 if (pi->pi_group == phyint_anongroup) 1347 return; 1348 1349 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1350 if (pi == pi2 || pi2->pi_state != PI_RUNNING || 1351 !(pi2->pi_flags & IFF_INACTIVE)) 1352 continue; 1353 1354 inactivepi = pi2; 1355 if (pi2->pi_flags & IFF_STANDBY) 1356 break; 1357 } 1358 1359 if (inactivepi != NULL) 1360 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); 1361 } 1362 1363 /* 1364 * Transition a phyint to PI_RUNNING. The caller must ensure that the 1365 * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if 1366 * appropriate. Also sets IFF_INACTIVE on this or other interfaces as 1367 * appropriate (see comment below). Finally, also updates the phyint's group 1368 * state to account for the change. 1369 */ 1370 void 1371 phyint_transition_to_running(struct phyint *pi) 1372 { 1373 struct phyint *pi2; 1374 struct phyint *actstandbypi = NULL; 1375 uint_t nactive = 0, nnonstandby = 0; 1376 boolean_t onlining = (pi->pi_state == PI_OFFLINE); 1377 boolean_t initial = (pi->pi_state == PI_INIT); 1378 uint64_t set, clear; 1379 1380 /* 1381 * The interface is running again, but should it or another interface 1382 * in the group end up INACTIVE? There are three cases: 1383 * 1384 * 1. If it's a STANDBY interface, it should be end up INACTIVE if 1385 * the group is operating at capacity (i.e., there are at least as 1386 * many active interfaces as non-STANDBY interfaces in the group). 1387 * No other interfaces should be changed. 1388 * 1389 * 2. If it's a non-STANDBY interface and we're onlining it or 1390 * FAILBACK is enabled, then it should *not* end up INACTIVE. 1391 * Further, if the group is above capacity as a result of this 1392 * interface, then an active STANDBY interface in the group should 1393 * end up INACTIVE. 1394 * 1395 * 3. 
If it's a non-STANDBY interface, we're repairing it, and 1396 * FAILBACK is disabled, then it should end up INACTIVE *unless* 1397 * the group was failed (in which case we have no choice but to 1398 * use it). No other interfaces should be changed. 1399 */ 1400 if (pi->pi_group != phyint_anongroup) { 1401 pi2 = pi->pi_group->pg_phyint; 1402 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1403 if (!(pi2->pi_flags & IFF_STANDBY)) 1404 nnonstandby++; 1405 1406 if (pi2->pi_state == PI_RUNNING) { 1407 if (!(pi2->pi_flags & IFF_INACTIVE)) { 1408 nactive++; 1409 if (pi2->pi_flags & IFF_STANDBY) 1410 actstandbypi = pi2; 1411 } 1412 } 1413 } 1414 } 1415 1416 set = 0; 1417 clear = (onlining ? IFF_OFFLINE : IFF_FAILED); 1418 1419 if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ 1420 if (nactive >= nnonstandby) 1421 set |= IFF_INACTIVE; 1422 else 1423 clear |= IFF_INACTIVE; 1424 } else if (onlining || failback_enabled) { /* case 2 */ 1425 if (nactive >= nnonstandby && actstandbypi != NULL) 1426 (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); 1427 } else if (!initial && !GROUP_FAILED(pi->pi_group)) { /* case 3 */ 1428 set |= IFF_INACTIVE; 1429 } 1430 (void) change_pif_flags(pi, set, clear); 1431 1432 phyint_chstate(pi, PI_RUNNING); 1433 1434 /* 1435 * Update the group state to account for the change. 1436 */ 1437 phyint_group_refresh_state(pi->pi_group); 1438 } 1439 1440 /* 1441 * See if a previously failed interface has started working again. 1442 */ 1443 void 1444 phyint_check_for_repair(struct phyint *pi) 1445 { 1446 if (!phyint_repaired(pi)) 1447 return; 1448 1449 if (pi->pi_group == phyint_anongroup) { 1450 logerr("IP interface repair detected on %s\n", pi->pi_name); 1451 } else { 1452 logerr("IP interface repair detected on %s of group %s\n", 1453 pi->pi_name, pi->pi_group->pg_name); 1454 } 1455 1456 /* 1457 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. 
1458 * So just clear IFF_OFFLINE and defer phyint_transition_to_running() 1459 * until it is brought back online. 1460 */ 1461 if (pi->pi_state == PI_OFFLINE) { 1462 (void) change_pif_flags(pi, 0, IFF_FAILED); 1463 return; 1464 } 1465 1466 phyint_transition_to_running(pi); /* calls phyint_chstate() */ 1467 } 1468 1469 /* 1470 * See if an interface has failed, or if the whole group of interfaces has 1471 * failed. 1472 */ 1473 static void 1474 phyint_inst_check_for_failure(struct phyint_instance *pii) 1475 { 1476 struct phyint *pi = pii->pii_phyint; 1477 struct phyint *pi2; 1478 boolean_t was_active; 1479 1480 switch (failure_state(pii)) { 1481 case PHYINT_FAILURE: 1482 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); 1483 1484 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); 1485 if (pi->pi_group == phyint_anongroup) { 1486 logerr("IP interface failure detected on %s\n", 1487 pii->pii_name); 1488 } else { 1489 logerr("IP interface failure detected on %s of group" 1490 " %s\n", pii->pii_name, pi->pi_group->pg_name); 1491 } 1492 1493 /* 1494 * If the failed interface was active, activate another 1495 * INACTIVE interface in the group if possible. 1496 */ 1497 if (was_active) 1498 phyint_activate_another(pi); 1499 1500 /* 1501 * If the interface is offline, the state change will be 1502 * noted when it comes back online. 1503 */ 1504 if (pi->pi_state != PI_OFFLINE) { 1505 phyint_chstate(pi, PI_FAILED); 1506 reset_crtt_all(pi); 1507 } 1508 break; 1509 1510 case GROUP_FAILURE: 1511 pi2 = pi->pi_group->pg_phyint; 1512 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1513 (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); 1514 if (pi2->pi_state == PI_OFFLINE) /* see comment above */ 1515 continue; 1516 1517 reset_crtt_all(pi2); 1518 /* 1519 * In the case of host targets, we would have flushed 1520 * the targets, and gone to PI_NOTARGETS state. 
1521 */ 1522 if (pi2->pi_state == PI_RUNNING) 1523 phyint_chstate(pi2, PI_FAILED); 1524 } 1525 break; 1526 1527 default: 1528 break; 1529 } 1530 } 1531 1532 /* 1533 * Determines if any timeout event has occurred and returns the number of 1534 * milliseconds until the next timeout event for the phyint. Returns 1535 * TIMER_INFINITY for "never". 1536 */ 1537 uint_t 1538 phyint_inst_timer(struct phyint_instance *pii) 1539 { 1540 int pr_ndx; 1541 uint_t timeout; 1542 struct target *cur_tg; 1543 struct probe_stats *pr_statp; 1544 struct phyint_instance *pii_other; 1545 struct phyint *pi; 1546 int valid_unack_count; 1547 int i; 1548 int interval; 1549 uint_t check_time; 1550 uint_t cur_time; 1551 hrtime_t cur_hrtime; 1552 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1553 1554 cur_hrtime = gethrtime(); 1555 cur_time = ns2ms(cur_hrtime); 1556 1557 if (debug & D_TIMER) { 1558 logdebug("phyint_inst_timer(%s %s)\n", 1559 AF_STR(pii->pii_af), pii->pii_name); 1560 } 1561 1562 pii_other = phyint_inst_other(pii); 1563 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1564 /* 1565 * Check to see if we're here due to link up/down flapping; If 1566 * enough time has passed, then try to bring the interface 1567 * back up; otherwise, schedule a timer to bring it back up 1568 * when enough time *has* elapsed. 1569 */ 1570 pi = pii->pii_phyint; 1571 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1572 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1573 if (check_time > cur_time) 1574 return (check_time - cur_time); 1575 1576 phyint_check_for_repair(pi); 1577 } 1578 } 1579 1580 /* 1581 * If probing is not enabled on this phyint instance, don't proceed. 
1582 */ 1583 if (!PROBE_ENABLED(pii)) 1584 return (TIMER_INFINITY); 1585 1586 /* 1587 * If the timer has fired too soon, probably triggered 1588 * by some other phyint instance, return the remaining 1589 * time 1590 */ 1591 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1592 return (pii->pii_snxt_time - cur_time); 1593 1594 /* 1595 * If the link is down, don't send any probes for now. 1596 */ 1597 if (LINK_DOWN(pii->pii_phyint)) 1598 return (TIMER_INFINITY); 1599 1600 /* 1601 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1602 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1603 * Base probe time is strictly periodic. 1604 */ 1605 interval = GET_RANDOM( 1606 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1607 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1608 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1609 1610 /* 1611 * Check if the current time > next time to probe. If so, we missed 1612 * sending 1 or more probes, probably due to heavy system load. At least 1613 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1614 * were scheduled. Make adjustments to the times, in multiples of 1615 * user_probe_interval. 
1616 */ 1617 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1618 int n; 1619 1620 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1621 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1622 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1623 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1624 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1625 pii->pii_snxt_basetime); 1626 1627 /* Collect statistics about missed probes */ 1628 probes_missed.pm_nprobes += n + 1; 1629 probes_missed.pm_ntimes++; 1630 } 1631 pii->pii_snxt_basetime += user_probe_interval; 1632 interval = pii->pii_snxt_time - cur_time; 1633 if (debug & D_TARGET) { 1634 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1635 " interval %u\n", cur_time, pii->pii_snxt_time, 1636 pii->pii_snxt_basetime, interval); 1637 } 1638 1639 /* 1640 * If no targets are known, we need to send an ICMP multicast. The 1641 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1642 * to see if we found a target. 1643 */ 1644 if (pii->pii_target_next == NULL) { 1645 assert(pii->pii_ntargets == 0); 1646 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1647 probe(pii, PROBE_MULTI, cur_time); 1648 return (interval); 1649 } 1650 1651 if ((user_probe_interval != probe_interval) && 1652 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1653 /* 1654 * the failure detection (fd) probe timer has not yet fired. 1655 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1656 */ 1657 probe(pii, PROBE_RTT, cur_hrtime); 1658 return (interval); 1659 } 1660 /* 1661 * the fd probe timer has fired. Need to do all failure 1662 * detection / recovery calculations, and then send an fd probe 1663 * of type PROBE_UNI. 1664 */ 1665 if (user_probe_interval == probe_interval) { 1666 /* 1667 * We could have missed some probes, and then adjusted 1668 * pii_snxt_basetime above. Otherwise we could have 1669 * blindly added probe_interval to pii_fd_snxt_basetime. 
1670 */ 1671 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1672 } else { 1673 pii->pii_fd_snxt_basetime += probe_interval; 1674 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1675 int n; 1676 1677 n = (cur_time - pii->pii_fd_snxt_basetime) / 1678 probe_interval; 1679 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1680 } 1681 } 1682 1683 /* 1684 * We can have at most, the latest 2 probes that we sent, in 1685 * the PR_UNACKED state. All previous probes sent, are either 1686 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1687 * timed out if the probe's time_start + the CRTT < currenttime. 1688 * For each of the last 2 probes, examine whether it has timed 1689 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1690 */ 1691 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1692 valid_unack_count = 0; 1693 1694 for (i = 0; i < 2; i++) { 1695 pr_statp = &pii->pii_probes[pr_ndx]; 1696 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1697 switch (pr_statp->pr_status) { 1698 case PR_ACKED: 1699 /* 1700 * We received back an ACK, so the switch clearly 1701 * is not dropping our traffic, and thus we can 1702 * enable failure detection immediately. 1703 */ 1704 if (pii->pii_fd_hrtime > gethrtime()) { 1705 if (debug & D_PROBE) { 1706 logdebug("successful probe on %s; " 1707 "ending quiet period\n", 1708 pii->pii_phyint->pi_name); 1709 } 1710 pii->pii_fd_hrtime = gethrtime(); 1711 } 1712 break; 1713 1714 case PR_UNACKED: 1715 assert(cur_tg != NULL); 1716 /* 1717 * The crtt could be zero for some reason, 1718 * Eg. the phyint could be failed. If the crtt is 1719 * not available use group's probe interval, 1720 * which is a worst case estimate. 
1721 */ 1722 timeout = ns2ms(pr_statp->pr_hrtime_start); 1723 if (cur_tg->tg_crtt != 0) { 1724 timeout += cur_tg->tg_crtt; 1725 } else { 1726 timeout += probe_interval; 1727 } 1728 if (TIME_LT(timeout, cur_time)) { 1729 pr_statp->pr_time_lost = timeout; 1730 probe_chstate(pr_statp, pii, PR_LOST); 1731 } else if (i == 1) { 1732 /* 1733 * We are forced to consider this probe 1734 * lost, as we can have at most 2 unack. 1735 * probes any time, and we will be sending a 1736 * probe at the end of this function. 1737 * Normally, we should not be here, but 1738 * this can happen if an incoming response 1739 * that was considered lost has increased 1740 * the crtt for this target, and also bumped 1741 * up the FDT. Note that we never cancel or 1742 * increase the current pii_time_left, so 1743 * when the timer fires, we find 2 valid 1744 * unacked probes, and they are yet to timeout 1745 */ 1746 pr_statp->pr_time_lost = cur_time; 1747 probe_chstate(pr_statp, pii, PR_LOST); 1748 } else { 1749 /* 1750 * Only the most recent probe can enter 1751 * this 'else' arm. The second most recent 1752 * probe must take either of the above arms, 1753 * if it is unacked. 1754 */ 1755 valid_unack_count++; 1756 } 1757 break; 1758 } 1759 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1760 } 1761 1762 /* 1763 * We send out 1 probe randomly in the interval between one half 1764 * and one probe interval for the group. Given that the CRTT is always 1765 * less than the group's probe interval, we can have at most 1 1766 * unacknowledged probe now. All previous probes are either lost or 1767 * acked. 1768 */ 1769 assert(valid_unack_count == 0 || valid_unack_count == 1); 1770 1771 /* 1772 * The timer has fired. Take appropriate action depending 1773 * on the current state of the phyint. 
1774 * 1775 * PI_RUNNING state - Failure detection 1776 * PI_FAILED state - Repair detection 1777 */ 1778 switch (pii->pii_phyint->pi_state) { 1779 case PI_FAILED: 1780 /* 1781 * If the most recent probe (excluding unacked probes that 1782 * are yet to time out) has been acked, check whether the 1783 * phyint is now repaired. 1784 */ 1785 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1786 phyint_check_for_repair(pii->pii_phyint); 1787 } 1788 break; 1789 1790 case PI_RUNNING: 1791 /* 1792 * It's possible our probes have been lost because of a 1793 * spanning-tree mandated quiet period on the switch. If so, 1794 * ignore the lost probes. 1795 */ 1796 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1797 break; 1798 1799 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1800 /* 1801 * We have 1 or more failed probes (excluding unacked 1802 * probes that are yet to time out). Determine if the 1803 * phyint has failed. 1804 */ 1805 phyint_inst_check_for_failure(pii); 1806 } 1807 break; 1808 1809 default: 1810 logerr("phyint_inst_timer: invalid state %d\n", 1811 pii->pii_phyint->pi_state); 1812 abort(); 1813 } 1814 1815 /* 1816 * Start the next probe. probe() will also set pii->pii_probe_time_left 1817 * to the group's probe interval. If phyint_failed -> target_flush_hosts 1818 * was called, the target list may be empty. 1819 */ 1820 if (pii->pii_target_next != NULL) { 1821 probe(pii, PROBE_UNI, cur_hrtime); 1822 /* 1823 * If we have just the one probe target, and we're not using 1824 * router targets, try to find another as we presently have 1825 * no resilience. 1826 */ 1827 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1828 probe(pii, PROBE_MULTI, cur_hrtime); 1829 } else { 1830 probe(pii, PROBE_MULTI, cur_hrtime); 1831 } 1832 return (interval); 1833 } 1834 1835 /* 1836 * Start the probe timer for an interface instance. 
1837 */ 1838 void 1839 start_timer(struct phyint_instance *pii) 1840 { 1841 uint32_t interval; 1842 1843 /* 1844 * Spread the base probe times (pi_snxt_basetime) across phyints 1845 * uniformly over the (curtime..curtime + the group's probe_interval). 1846 * pi_snxt_basetime is strictly periodic with a frequency of 1847 * the group's probe interval. The actual probe time pi_snxt_time 1848 * adds some randomness to pi_snxt_basetime and happens in probe(). 1849 * For the 1st probe on each phyint after the timer is started, 1850 * pi_snxt_time and pi_snxt_basetime are the same. 1851 */ 1852 interval = GET_RANDOM(0, 1853 (int)pii->pii_phyint->pi_group->pg_probeint); 1854 1855 pii->pii_snxt_basetime = getcurrenttime() + interval; 1856 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1857 pii->pii_snxt_time = pii->pii_snxt_basetime; 1858 timer_schedule(interval); 1859 } 1860 1861 /* 1862 * Restart the probe timer on an interface instance. 1863 */ 1864 static void 1865 restart_timer(struct phyint_instance *pii) 1866 { 1867 /* 1868 * We don't need to restart the timer if it was never started in 1869 * the first place (pii->pii_basetime_inited not set), as the timer 1870 * won't have gone off yet. 1871 */ 1872 if (pii->pii_basetime_inited != 0) { 1873 1874 if (debug & D_LINKNOTE) 1875 logdebug("restart timer: restarting timer on %s, " 1876 "address family %s\n", pii->pii_phyint->pi_name, 1877 AF_STR(pii->pii_af)); 1878 1879 start_timer(pii); 1880 } 1881 } 1882 1883 static void 1884 process_link_state_down(struct phyint *pi) 1885 { 1886 logerr("The link has gone down on %s\n", pi->pi_name); 1887 1888 /* 1889 * Clear the probe statistics arrays, we don't want the repair 1890 * detection logic relying on probes that were successful prior 1891 * to the link going down. 1892 */ 1893 if (PROBE_CAPABLE(pi->pi_v4)) 1894 clear_pii_probe_stats(pi->pi_v4); 1895 if (PROBE_CAPABLE(pi->pi_v6)) 1896 clear_pii_probe_stats(pi->pi_v6); 1897 /* 1898 * Check for interface failure. 
Although we know the interface 1899 * has failed, we don't know if all the other interfaces in the 1900 * group have failed as well. 1901 */ 1902 if ((pi->pi_state == PI_RUNNING) || 1903 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1904 if (debug & D_LINKNOTE) { 1905 logdebug("process_link_state_down:" 1906 " checking for failure on %s\n", pi->pi_name); 1907 } 1908 1909 if (pi->pi_v4 != NULL) 1910 phyint_inst_check_for_failure(pi->pi_v4); 1911 else if (pi->pi_v6 != NULL) 1912 phyint_inst_check_for_failure(pi->pi_v6); 1913 } 1914 } 1915 1916 static void 1917 process_link_state_up(struct phyint *pi) 1918 { 1919 logerr("The link has come up on %s\n", pi->pi_name); 1920 1921 /* 1922 * We stopped any running timers on each instance when the link 1923 * went down, so restart them. 1924 */ 1925 if (pi->pi_v4) 1926 restart_timer(pi->pi_v4); 1927 if (pi->pi_v6) 1928 restart_timer(pi->pi_v6); 1929 1930 phyint_check_for_repair(pi); 1931 1932 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1933 if (pi->pi_whendx == LINK_UP_PERMIN) 1934 pi->pi_whendx = 0; 1935 } 1936 1937 /* 1938 * Process any changes in link state passed up from the interfaces. 1939 */ 1940 void 1941 process_link_state_changes(void) 1942 { 1943 struct phyint *pi; 1944 1945 /* Look for interfaces where the link state has just changed */ 1946 1947 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1948 boolean_t old_link_state_up = LINK_UP(pi); 1949 1950 /* 1951 * Except when the "phyint" structure is created, this is 1952 * the only place the link state is updated. This allows 1953 * this routine to detect changes in link state, rather 1954 * than just the current state. 1955 */ 1956 UPDATE_LINK_STATE(pi); 1957 1958 if (LINK_DOWN(pi)) { 1959 /* 1960 * Has link just gone down? 1961 */ 1962 if (old_link_state_up) 1963 process_link_state_down(pi); 1964 } else { 1965 /* 1966 * Has link just gone back up? 
1967 */ 1968 if (!old_link_state_up) 1969 process_link_state_up(pi); 1970 } 1971 } 1972 } 1973 1974 void 1975 reset_crtt_all(struct phyint *pi) 1976 { 1977 struct phyint_instance *pii; 1978 struct target *tg; 1979 1980 pii = pi->pi_v4; 1981 if (pii != NULL) { 1982 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1983 tg->tg_crtt = 0; 1984 tg->tg_rtt_sa = -1; 1985 tg->tg_rtt_sd = 0; 1986 } 1987 } 1988 1989 pii = pi->pi_v6; 1990 if (pii != NULL) { 1991 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1992 tg->tg_crtt = 0; 1993 tg->tg_rtt_sa = -1; 1994 tg->tg_rtt_sd = 0; 1995 } 1996 } 1997 } 1998 1999 /* 2000 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 2001 * probes on both instances IPv4 and IPv6. 2002 * If the interface has failed, return the time of the first probe failure 2003 * in "tff". 2004 */ 2005 static int 2006 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 2007 { 2008 uint_t pi_tff; 2009 struct target *cur_tg; 2010 struct probe_fail_count pfinfo; 2011 struct phyint_instance *pii_other; 2012 int pr_ndx; 2013 2014 /* 2015 * Get the number of consecutive failed probes on 2016 * this phyint across all targets. Also get the number 2017 * of consecutive failed probes on this target only 2018 */ 2019 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2020 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2021 probe_fail_info(pii, cur_tg, &pfinfo); 2022 2023 /* Get the time of first failure, for later use */ 2024 pi_tff = pfinfo.pf_tff; 2025 2026 /* 2027 * If the current target has not responded to the 2028 * last NUM_PROBE_FAILS probes, and other targets are 2029 * responding delete this target. Dead gateway detection 2030 * will eventually remove this target (if router) from the 2031 * routing tables. If that does not occur, we may end 2032 * up adding this to our list again. 
2033 */ 2034 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2035 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2036 if (pii->pii_targets_are_routers) { 2037 if (cur_tg->tg_status == TG_ACTIVE) 2038 pii->pii_ntargets--; 2039 cur_tg->tg_status = TG_DEAD; 2040 cur_tg->tg_crtt = 0; 2041 cur_tg->tg_rtt_sa = -1; 2042 cur_tg->tg_rtt_sd = 0; 2043 if (pii->pii_target_next == cur_tg) 2044 pii->pii_target_next = target_next(cur_tg); 2045 } else { 2046 target_delete(cur_tg); 2047 probe(pii, PROBE_MULTI, gethrtime()); 2048 } 2049 return (PHYINT_OK); 2050 } 2051 2052 /* 2053 * If the phyint has lost NUM_PROBE_FAILS or more 2054 * consecutive probes, on both IPv4 and IPv6 protocol 2055 * instances of the phyint, then trigger failure 2056 * detection, else return false 2057 */ 2058 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2059 return (PHYINT_OK); 2060 2061 pii_other = phyint_inst_other(pii); 2062 if (PROBE_CAPABLE(pii_other)) { 2063 probe_fail_info(pii_other, NULL, &pfinfo); 2064 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2065 /* 2066 * We have NUM_PROBE_FAILS or more failures 2067 * on both IPv4 and IPv6. Get the earliest 2068 * time when failure was detected on this 2069 * phyint across IPv4 and IPv6. 2070 */ 2071 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2072 pi_tff = pfinfo.pf_tff; 2073 } else { 2074 /* 2075 * This instance has < NUM_PROBE_FAILS failure. 2076 * So return false 2077 */ 2078 return (PHYINT_OK); 2079 } 2080 } 2081 *tff = pi_tff; 2082 return (PHYINT_FAILURE); 2083 } 2084 2085 /* 2086 * Check if the link has gone down on this phyint, or it has failed the 2087 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2088 * Also look at other phyints of this group, for group failures. 
 *
 * Returns PHYINT_OK if the link is up and the probe history shows no
 * failure, PHYINT_FAILURE if only this phyint is considered failed, and
 * GROUP_FAILURE if no other phyint of the group could be shown to be
 * working. The group state is refreshed before returning on the path that
 * compares against the other phyints of the group.
 */
int
failure_state(struct phyint_instance *pii)
{
	struct probe_success_count psinfo;
	uint_t	pi2_tls;		/* time last success */
	uint_t	pi_tff;			/* time first fail */
	struct phyint *pi2;
	struct phyint *pi;
	struct phyint_instance *pii2;
	struct phyint_group *pg;
	int	retval;

	/* NOTE: the trace below still uses the name "phyint_failed". */
	if (debug & D_FAILREP)
		logdebug("phyint_failed(%s)\n", pii->pii_name);

	pi = pii->pii_phyint;
	pg = pi->pi_group;

	/*
	 * The probe history is consulted only while the link is up. As a
	 * consequence pi_tff is set only on the link-up path; the loop
	 * below never reads it when LINK_DOWN(pi) holds.
	 */
	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
	    PHYINT_OK)
		return (PHYINT_OK);

	/*
	 * At this point, the link is down, or the phyint is suspect, as it
	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
	 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue
	 * on to determine whether this should be considered a PHYINT_FAILURE
	 * or GROUP_FAILURE.
	 */
	if (pg == phyint_anongroup)
		return (PHYINT_FAILURE);

	/*
	 * Need to compare against other phyints of the same group
	 * to exclude group failures. If the failure was detected via
	 * probing, then if the time of last success (tls) of any
	 * phyint is more recent than the time of first fail (tff) of the
	 * phyint in question, and the link is up on the phyint,
	 * then it is a phyint failure. Otherwise it is a group failure.
	 * If failure was detected via a link down notification sent from
	 * the driver to IP, we see if any phyints in the group are still
	 * running and haven't received a link down notification. We
	 * will usually be processing the link down notification shortly
	 * after it was received, so there is no point looking at the tls
	 * of other phyints.
	 */
	retval = GROUP_FAILURE;
	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
		/* Exclude ourself from comparison */
		if (pi2 == pi)
			continue;

		if (LINK_DOWN(pi)) {
			/*
			 * We use FLAGS_TO_LINK_STATE() to test the flags
			 * directly, rather than LINK_UP() or LINK_DOWN(), as
			 * we may not have got round to processing the link
			 * state for the other phyints in the group yet.
			 *
			 * The check for PI_RUNNING and group failure handles
			 * the case when the group begins to recover.
			 * PI_RUNNING will be set, and group failure cleared
			 * only after receipt of NUM_PROBE_REPAIRS, by which
			 * time the other phyints should have received at
			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
			 */
			if ((pi2->pi_state == PI_RUNNING) &&
			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
				retval = PHYINT_FAILURE;
				break;
			}
			/* Link-down case: never fall through to the tls test */
			continue;
		}

		/* A phyint whose own link is down proves nothing. */
		if (LINK_DOWN(pi2))
			continue;

		/*
		 * If there's no probe-based failure detection on this
		 * interface, and its link is still up, then it's still
		 * working and thus the group has not failed.
		 */
		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
			retval = PHYINT_FAILURE;
			break;
		}

		/*
		 * Need to compare against both IPv4 and IPv6 instances.
		 */
		pii2 = pi2->pi_v4;
		if (pii2 != NULL) {
			probe_success_info(pii2, NULL, &psinfo);
			if (psinfo.ps_tls_valid) {
				pi2_tls = psinfo.ps_tls;
				/*
				 * See comment above regarding check
				 * for PI_RUNNING and group failure.
				 */
				if (TIME_GT(pi2_tls, pi_tff) &&
				    (pi2->pi_state == PI_RUNNING) &&
				    !GROUP_FAILED(pg) &&
				    FLAGS_TO_LINK_STATE(pi2)) {
					retval = PHYINT_FAILURE;
					break;
				}
			}
		}

		pii2 = pi2->pi_v6;
		if (pii2 != NULL) {
			probe_success_info(pii2, NULL, &psinfo);
			if (psinfo.ps_tls_valid) {
				pi2_tls = psinfo.ps_tls;
				/*
				 * See comment above regarding check
				 * for PI_RUNNING and group failure.
				 */
				if (TIME_GT(pi2_tls, pi_tff) &&
				    (pi2->pi_state == PI_RUNNING) &&
				    !GROUP_FAILED(pg) &&
				    FLAGS_TO_LINK_STATE(pi2)) {
					retval = PHYINT_FAILURE;
					break;
				}
			}
		}
	}

	/*
	 * Update the group state to account for the changes.
	 */
	phyint_group_refresh_state(pg);
	return (retval);
}

/*
 * Return the information associated with consecutive probe successes
 * starting with the most recent probe. At most the last 2 probes can be
 * in the unacknowledged state. All previous probes have either failed
 * or succeeded.
 *
 * `psinfo' is zeroed and then filled in:
 *	ps_nsucc	number of PR_ACKED probes seen before the first
 *			failure on this instance
 *	ps_nsucc_tg	the same, restricted to probes sent to `cur_tg'
 *			(stays 0 when `cur_tg' is NULL)
 *	ps_tls		acknowledgement time, in ms, of the most recent
 *			PR_ACKED probe; only meaningful if ps_tls_valid is set
 *
 * Side effect: an unacknowledged probe whose timeout has expired is moved
 * to PR_LOST via probe_chstate(), with pr_time_lost set to the timeout.
 */
static void
probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
    struct probe_success_count *psinfo)
{
	uint_t	i;
	struct probe_stats *pr_statp;
	uint_t most_recent;
	uint_t second_most_recent;
	boolean_t pi_found_failure = _B_FALSE;
	boolean_t tg_found_failure = _B_FALSE;
	uint_t now;
	uint_t timeout;
	struct target *tg;

	if (debug & D_FAILREP)
		logdebug("probe_success_info(%s)\n", pii->pii_name);

	bzero(psinfo, sizeof (*psinfo));
	now = getcurrenttime();

	/*
	 * Start with the most recent probe, and count the number
	 * of consecutive probe successes. Latch the number of successes
	 * on hitting a failure.
	 */
	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
	second_most_recent = PROBE_INDEX_PREV(most_recent);

	/*
	 * Walk backwards through pii_probes[] until the index comes back
	 * round to pii_probe_next; that slot itself is not examined.
	 */
	for (i = most_recent; i != pii->pii_probe_next;
	    i = PROBE_INDEX_PREV(i)) {
		pr_statp = &pii->pii_probes[i];

		switch (pr_statp->pr_status) {
		case PR_UNACKED:
			/*
			 * Only the most recent 2 probes can be unacknowledged
			 */
			assert(i == most_recent || i == second_most_recent);

			tg = pr_statp->pr_target;
			assert(tg != NULL);
			/*
			 * The crtt could be zero for some reason,
			 * Eg. the phyint could be failed. If the crtt is
			 * not available use the value of the group's probe
			 * interval which is a worst case estimate.
			 */
			timeout = ns2ms(pr_statp->pr_hrtime_start);
			if (tg->tg_crtt != 0) {
				timeout += tg->tg_crtt;
			} else {
				timeout +=
				    pii->pii_phyint->pi_group->pg_probeint;
			}

			/*
			 * A probe still within its timeout is neither a
			 * success nor a failure; the scan simply moves on.
			 */
			if (TIME_LT(timeout, now)) {
				/*
				 * We hit a failure. Latch the total number of
				 * recent consecutive successes.
				 */
				pr_statp->pr_time_lost = timeout;
				probe_chstate(pr_statp, pii, PR_LOST);
				pi_found_failure = _B_TRUE;
				if (cur_tg != NULL && tg == cur_tg) {
					/*
					 * We hit a failure for the desired
					 * target. Latch the number of recent
					 * consecutive successes for this target
					 */
					tg_found_failure = _B_TRUE;
				}
			}
			break;

		case PR_ACKED:
			/*
			 * Bump up the count of probe successes, if we
			 * have not seen any failure so far.
			 */
			if (!pi_found_failure)
				psinfo->ps_nsucc++;

			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
			    !tg_found_failure) {
				psinfo->ps_nsucc_tg++;
			}

			/*
			 * Record the time of last success, if this is
			 * the most recent probe success. This happens even
			 * if a failure was seen earlier in the scan.
			 */
			if (!psinfo->ps_tls_valid) {
				psinfo->ps_tls =
				    ns2ms(pr_statp->pr_hrtime_ackproc);
				psinfo->ps_tls_valid = _B_TRUE;
			}
			break;

		case PR_LOST:
			/*
			 * We hit a failure. Latch the total number of
			 * recent consecutive successes.
			 */
			pi_found_failure = _B_TRUE;
			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
				/*
				 * We hit a failure for the desired target.
				 * Latch the number of recent consecutive
				 * successes for this target
				 */
				tg_found_failure = _B_TRUE;
			}
			break;

		default:
			/*
			 * Any other status ends the scan immediately
			 * (presumably an unused slot -- confirm against the
			 * probe status definitions).
			 */
			return;

		}
	}
}

/*
 * Return the information associated with consecutive probe failures
 * starting with the most recent probe. Only the last 2 probes can be in the
 * unacknowledged state. All previous probes have either failed or succeeded.
 *
 * `pfinfo' is zeroed and then filled in:
 *	pf_nfail	number of lost probes seen before the first success
 *			(or unused slot) on this instance
 *	pf_nfail_tg	the same, restricted to probes sent to `cur_tg'
 *			(stays 0 when `cur_tg' is NULL)
 *	pf_tff		pr_time_lost of the oldest probe counted in pf_nfail
 *
 * Side effect: an unacknowledged probe whose timeout has expired is moved
 * to PR_LOST via probe_chstate(), with pr_time_lost set to the timeout.
 */
static void
probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
    struct probe_fail_count *pfinfo)
{
	int	i;
	struct probe_stats *pr_statp;
	boolean_t	tg_found_success = _B_FALSE;
	boolean_t	pi_found_success = _B_FALSE;
	int	most_recent;
	int	second_most_recent;
	uint_t	now;
	uint_t	timeout;
	struct	target *tg;

	if (debug & D_FAILREP)
		logdebug("probe_fail_info(%s)\n", pii->pii_name);

	bzero(pfinfo, sizeof (*pfinfo));
	now = getcurrenttime();

	/*
	 * Start with the most recent probe, and count the number
	 * of consecutive probe failures. Latch the number of failures
	 * on hitting a probe success.
	 */
	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
	second_most_recent = PROBE_INDEX_PREV(most_recent);

	/*
	 * Walk backwards through pii_probes[] until the index comes back
	 * round to pii_probe_next; that slot itself is not examined.
	 */
	for (i = most_recent; i != pii->pii_probe_next;
	    i = PROBE_INDEX_PREV(i)) {
		pr_statp = &pii->pii_probes[i];

		assert(PR_STATUS_VALID(pr_statp->pr_status));

		switch (pr_statp->pr_status) {
		case PR_UNACKED:
			/*
			 * Only the most recent 2 probes can be unacknowledged
			 */
			assert(i == most_recent || i == second_most_recent);

			tg = pr_statp->pr_target;
			/*
			 * Target is guaranteed to exist in the
			 * unacknowledged state
			 */
			assert(tg != NULL);
			/*
			 * The crtt could be zero for some reason,
			 * Eg. the phyint could be failed. If the crtt is
			 * not available use the group's probe interval,
			 * which is a worst case estimate.
			 */
			timeout = ns2ms(pr_statp->pr_hrtime_start);
			if (tg->tg_crtt != 0) {
				timeout += tg->tg_crtt;
			} else {
				timeout +=
				    pii->pii_phyint->pi_group->pg_probeint;
			}

			/*
			 * Still within its timeout: neither a failure nor
			 * a success, so nothing is counted or latched.
			 */
			if (TIME_GT(timeout, now))
				break;

			/* Timed out: mark it lost and count it below. */
			pr_statp->pr_time_lost = timeout;
			probe_chstate(pr_statp, pii, PR_LOST);
			/* FALLTHRU */

		case PR_LOST:
			/*
			 * pf_tff is overwritten on every counted failure, so
			 * it ends up holding the loss time of the oldest
			 * probe in the run of consecutive failures.
			 */
			if (!pi_found_success) {
				pfinfo->pf_nfail++;
				pfinfo->pf_tff = pr_statp->pr_time_lost;
			}
			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
			    !tg_found_success) {
				pfinfo->pf_nfail_tg++;
			}
			break;

		default:
			/*
			 * We hit a success or unused slot. Latch the
			 * total number of recent consecutive failures.
			 */
			pi_found_success = _B_TRUE;
			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
				/*
				 * We hit a success for the desired target.
				 * Latch the number of recent consecutive
				 * failures for this target
				 */
				tg_found_success = _B_TRUE;
			}
		}
	}
}

/*
 * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
2456 */ 2457 void 2458 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) 2459 { 2460 if (pr->pr_status == state) 2461 return; 2462 2463 pr->pr_status = state; 2464 (void) probe_state_event(pr, pii); 2465 } 2466 2467 /* 2468 * Check if the phyint has been repaired. If no test address has been 2469 * configured, then consider the interface repaired if the link is up (unless 2470 * the link is flapping; see below). Otherwise, look for proof of probes 2471 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2472 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 2473 */ 2474 static boolean_t 2475 phyint_repaired(struct phyint *pi) 2476 { 2477 struct probe_success_count psinfo; 2478 struct phyint_instance *pii; 2479 struct target *cur_tg; 2480 int pr_ndx; 2481 uint_t cur_time; 2482 2483 if (debug & D_FAILREP) 2484 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2485 2486 if (LINK_DOWN(pi)) 2487 return (_B_FALSE); 2488 2489 /* 2490 * If we don't have any test addresses and the link is up, then 2491 * consider the interface repaired, unless we've received more than 2492 * LINK_UP_PERMIN link up notifications in the last minute, in 2493 * which case we keep the link down until we drop back below 2494 * the threshold. 
2495 */ 2496 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2497 cur_time = getcurrenttime(); 2498 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2499 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2500 pi->pi_lfmsg_printed = 0; 2501 return (_B_TRUE); 2502 } 2503 if (!pi->pi_lfmsg_printed) { 2504 logerr("The link has come up on %s more than %d times " 2505 "in the last minute; disabling repair until it " 2506 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2507 pi->pi_lfmsg_printed = 1; 2508 } 2509 2510 return (_B_FALSE); 2511 } 2512 2513 pii = pi->pi_v4; 2514 if (PROBE_CAPABLE(pii)) { 2515 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2516 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2517 probe_success_info(pii, cur_tg, &psinfo); 2518 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2519 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2520 return (_B_TRUE); 2521 } 2522 2523 pii = pi->pi_v6; 2524 if (PROBE_CAPABLE(pii)) { 2525 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2526 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2527 probe_success_info(pii, cur_tg, &psinfo); 2528 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2529 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2530 return (_B_TRUE); 2531 } 2532 2533 return (_B_FALSE); 2534 } 2535 2536 /* 2537 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2538 */ 2539 boolean_t 2540 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) 2541 { 2542 int ifsock; 2543 struct lifreq lifr; 2544 uint64_t old_flags; 2545 2546 if (debug & D_FAILREP) { 2547 logdebug("change_pif_flags(%s): set %llx clear %llx\n", 2548 pi->pi_name, set, clear); 2549 } 2550 2551 if (pi->pi_v4 != NULL) 2552 ifsock = ifsock_v4; 2553 else 2554 ifsock = ifsock_v6; 2555 2556 /* 2557 * Get the current flags from the kernel, and set/clear the 2558 * desired phyint flags. Since we set only phyint flags, we can 2559 * do it on either IPv4 or IPv6 instance. 
2560 */ 2561 (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2562 2563 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2564 if (errno != ENXIO) 2565 logperror("change_pif_flags: ioctl (get flags)"); 2566 return (_B_FALSE); 2567 } 2568 2569 old_flags = lifr.lifr_flags; 2570 lifr.lifr_flags |= set; 2571 lifr.lifr_flags &= ~clear; 2572 2573 if (old_flags == lifr.lifr_flags) { 2574 /* No change in the flags. No need to send ioctl */ 2575 return (_B_TRUE); 2576 } 2577 2578 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2579 if (errno != ENXIO) 2580 logperror("change_pif_flags: ioctl (set flags)"); 2581 return (_B_FALSE); 2582 } 2583 2584 /* 2585 * Keep pi_flags in synch. with actual flags. Assumes flags are 2586 * phyint flags. 2587 */ 2588 pi->pi_flags |= set; 2589 pi->pi_flags &= ~clear; 2590 2591 if (pi->pi_v4 != NULL) 2592 pi->pi_v4->pii_flags = pi->pi_flags; 2593 2594 if (pi->pi_v6 != NULL) 2595 pi->pi_v6->pii_flags = pi->pi_flags; 2596 2597 return (_B_TRUE); 2598 } 2599 2600 /* 2601 * icmp cksum computation for IPv4. 2602 */ 2603 static int 2604 in_cksum(ushort_t *addr, int len) 2605 { 2606 register int nleft = len; 2607 register ushort_t *w = addr; 2608 register ushort_t answer; 2609 ushort_t odd_byte = 0; 2610 register int sum = 0; 2611 2612 /* 2613 * Our algorithm is simple, using a 32 bit accumulator (sum), 2614 * we add sequential 16 bit words to it, and at the end, fold 2615 * back all the carry bits from the top 16 bits into the lower 2616 * 16 bits. 
2617 */ 2618 while (nleft > 1) { 2619 sum += *w++; 2620 nleft -= 2; 2621 } 2622 2623 /* mop up an odd byte, if necessary */ 2624 if (nleft == 1) { 2625 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2626 sum += odd_byte; 2627 } 2628 2629 /* 2630 * add back carry outs from top 16 bits to low 16 bits 2631 */ 2632 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2633 sum += (sum >> 16); /* add carry */ 2634 answer = ~sum; /* truncate to 16 bits */ 2635 return (answer); 2636 } 2637 2638 static void 2639 reset_snxt_basetimes(void) 2640 { 2641 struct phyint_instance *pii; 2642 2643 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2644 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2645 } 2646 } 2647 2648 /* 2649 * Is the address one of our own addresses? Unfortunately, 2650 * we cannot check our phyint tables to determine if the address 2651 * is our own. This is because, we don't track interfaces that 2652 * are not part of any group. We have to either use a 'bind' or 2653 * get the complete list of all interfaces using SIOCGLIFCONF, 2654 * to do this check. We could also use SIOCTMYADDR. 2655 * Bind fails for the local zone address, so we might include local zone 2656 * address as target address. If local zone address is a target address 2657 * and it is up, it is not possible to detect the interface failure. 2658 * SIOCTMYADDR also doesn't consider local zone address as own address. 2659 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2660 * are stored in `localaddrs' 2661 */ 2662 boolean_t 2663 own_address(struct in6_addr addr) 2664 { 2665 addrlist_t *addrp; 2666 struct sockaddr_storage ss; 2667 int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; 2668 2669 addr2storage(af, &addr, &ss); 2670 for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { 2671 if (sockaddrcmp(&ss, &addrp->al_addr)) 2672 return (_B_TRUE); 2673 } 2674 return (_B_FALSE); 2675 } 2676 2677 static int 2678 ns2ms(int64_t ns) 2679 { 2680 return (ns / (NANOSEC / MILLISEC)); 2681 } 2682 2683 static int64_t 2684 tv2ns(struct timeval *tvp) 2685 { 2686 return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); 2687 } 2688