1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 * Copyright (c) 1987 Regents of the University of California. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms are permitted 11 * provided that the above copyright notice and this paragraph are 12 * duplicated in all such forms and that any documentation, 13 * advertising materials, and other materials related to such 14 * distribution and use acknowledge that the software was developed 15 * by the University of California, Berkeley. The name of the 16 * University may not be used to endorse or promote products derived 17 * from this software without specific prior written permission. 18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 21 */ 22 23 #include "mpd_defs.h" 24 #include "mpd_tables.h" 25 26 /* 27 * Probe types for probe() 28 */ 29 #define PROBE_UNI 0x1234 /* Unicast probe packet */ 30 #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 31 #define PROBE_RTT 0x9abc /* RTT only probe packet */ 32 33 #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 34 35 /* 36 * Format of probe / probe response packets. This is an ICMP Echo request 37 * or ICMP Echo reply. 
Packet format is same for both IPv4 and IPv6 38 */ 39 struct pr_icmp 40 { 41 uint8_t pr_icmp_type; /* type field */ 42 uint8_t pr_icmp_code; /* code field */ 43 uint16_t pr_icmp_cksum; /* checksum field */ 44 uint16_t pr_icmp_id; /* Identification */ 45 uint16_t pr_icmp_seq; /* sequence number */ 46 uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */ 47 uint32_t pr_icmp_mtype; /* Message type */ 48 }; 49 50 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 51 0x0, 0x0, 0x0, 0x0, 52 0x0, 0x0, 0x0, 0x0, 53 0x0, 0x0, 0x0, 0x1 } }; 54 55 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; 56 57 static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ 58 59 static void *find_ancillary(struct msghdr *msg, int cmsg_level, 60 int cmsg_type); 61 static void pi_set_crtt(struct target *tg, int64_t m, 62 boolean_t is_probe_uni); 63 static void incoming_echo_reply(struct phyint_instance *pii, 64 struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp); 65 static void incoming_rtt_reply(struct phyint_instance *pii, 66 struct pr_icmp *reply, struct in6_addr fromaddr); 67 static void incoming_mcast_reply(struct phyint_instance *pii, 68 struct pr_icmp *reply, struct in6_addr fromaddr); 69 70 static boolean_t check_pg_crtt_improved(struct phyint_group *pg); 71 static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); 72 static boolean_t check_exception_target(struct phyint_instance *pii, 73 struct target *target); 74 static void probe_fail_info(struct phyint_instance *pii, 75 struct target *cur_tg, struct probe_fail_count *pfinfo); 76 static void probe_success_info(struct phyint_instance *pii, 77 struct target *cur_tg, struct probe_success_count *psinfo); 78 static boolean_t phyint_repaired(struct phyint *pi); 79 80 static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); 81 static int in_cksum(ushort_t *addr, int len); 82 static void reset_snxt_basetimes(void); 83 static int 
ns2ms(int64_t ns); 84 static int64_t tv2ns(struct timeval *); 85 86 /* 87 * CRTT - Conservative Round Trip Time Estimate 88 * Probe success - A matching probe reply received before CRTT ms has elapsed 89 * after sending the probe. 90 * Probe failure - No probe reply received and more than CRTT ms has elapsed 91 * after sending the probe. 92 * 93 * TLS - Time last success. Most recent probe ack received at this time. 94 * TFF - Time first fail. The time of the earliest probe failure in 95 * a consecutive series of probe failures. 96 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required 97 * before declaring phyint repair. 98 * NUM_PROBE_FAILS - Number of consecutive probe failures required to 99 * declare a phyint failure. 100 * 101 * Phyint state diagram 102 * 103 * The state of a phyint that is capable of being probed, is completely 104 * specified by the 3-tuple <pi_state, pg_state, I>. 105 * 106 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state 107 * of the link (according to the driver). If the phyint is also configured 108 * with a test address (the common case) and probe targets, then a phyint must 109 * also successfully be able to send and receive probes in order to remain in 110 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED). 111 * 112 * Further, if a PI_RUNNING phyint is configured with a test address but is 113 * unable to find any probe targets, it will transition to the PI_NOTARGETS 114 * state, which indicates that the link is apparently functional but that 115 * in.mpathd is unable to send probes to verify functionality (in this case, 116 * in.mpathd makes the optimistic assumption that the interface is working 117 * correctly and thus does not mark the interface FAILED, but reports it as 118 * IPMP_IF_UNKNOWN through the async events and query interfaces). 119 * 120 * At any point, a phyint may be administratively marked offline via if_mpadm. 
121 * In this case, the interface always transitions to PI_OFFLINE, regardless 122 * of its previous state. When the interface is later brought back online, 123 * in.mpathd acts as if the interface is new (and thus it transitions to 124 * PI_RUNNING or PI_FAILED based on the status of the link and the result of 125 * its probes, if probes are sent). 126 * 127 * pi_state - PI_RUNNING or PI_FAILED 128 * PI_RUNNING: The failure detection logic says the phyint is good. 129 * PI_FAILED: The failure detection logic says the phyint has failed. 130 * 131 * pg_state - PG_OK, PG_DEGRADED, or PG_FAILED. 132 * PG_OK: All interfaces in the group are OK. 133 * PG_DEGRADED: Some interfaces in the group are unusable. 134 * PG_FAILED: All interfaces in the group are unusable. 135 * 136 * In the case of router targets, we assume that the current list of 137 * targets obtained from the routing table, is still valid, so the 138 * phyint stat is PI_FAILED. In the case of host targets, we delete the 139 * list of targets, and multicast to the all hosts, to reconstruct the 140 * target list. So the phyints are in the PI_NOTARGETS state. 141 * 142 * I - value of (pi_flags & IFF_INACTIVE) 143 * IFF_INACTIVE: This phyint will not send or receive packets. 144 * Usually, inactive is tied to standby interfaces that are not yet 145 * needed (e.g., no non-standby interfaces in the group have failed). 146 * When failback has been disabled (FAILBACK=no configured), phyint can 147 * also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint 148 * subsequently recovers after a failure. 149 * 150 * Not all 9 possible combinations of the above 3-tuple are possible. 151 * 152 * I is tracked by IP. pi_state is tracked by mpathd. 
153 * 154 * pi_state state machine 155 * --------------------------------------------------------------------------- 156 * Event State New State 157 * Action: 158 * --------------------------------------------------------------------------- 159 * IP interface failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 160 * detection : set IFF_FAILED on this phyint 161 * 162 * IP interface failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 163 * detection : set IFF_FAILED on this phyint 164 * 165 * IP interface repair (PI_FAILED, I == 0, FAILBACK=yes) 166 * detection -> (PI_RUNNING, I == 0) 167 * : clear IFF_FAILED on this phyint 168 * 169 * IP interface repair (PI_FAILED, I == 0, FAILBACK=no) 170 * detection -> (PI_RUNNING, I == 1) 171 * : clear IFF_FAILED on this phyint 172 * : if failback is disabled set I == 1 173 * 174 * Group failure (perform on all phyints in the group) 175 * detection PI_RUNNING PI_FAILED 176 * (Router targets) : set IFF_FAILED 177 * 178 * Group failure (perform on all phyints in the group) 179 * detection PI_RUNNING PI_NOTARGETS 180 * (Host targets) : set IFF_FAILED 181 * : delete the target list on all phyints 182 * --------------------------------------------------------------------------- 183 */ 184 185 struct probes_missed probes_missed; 186 187 /* 188 * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 189 * will be added on by the kernel. The id field identifies this phyint. 190 * and the sequence number is an increasing (modulo 2^^16) integer. The data 191 * portion holds the time value when the packet is sent. On echo this is 192 * extracted to compute the round-trip time. Three different types of 193 * probe packets are used. 194 * 195 * PROBE_UNI: This type is used to do failure detection / failure recovery 196 * and RTT calculation. PROBE_UNI probes are spaced apart in time, 197 * not less than the current CRTT. pii_probes[] stores data 198 * about these probes. These packets consume sequence number space. 
199 * 200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these 201 * are not used. Under heavy network load, the rtt may go up very high, 202 * due to a spike, or may appear to go high, due to extreme scheduling 203 * delays. Once the network stress is removed, mpathd takes long time to 204 * recover, because the probe_interval is already high, and it takes 205 * a long time to send out sufficient number of probes to bring down the 206 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 207 * user_probe_interval ms. and will cause only rtt updates. These packets 208 * do not consume sequence number space nor is information about these 209 * packets stored in the pii_probes[] 210 * 211 * PROBE_MULTI: This type is only used to construct a list of targets, when 212 * no targets are known. The packet is multicast to the all hosts addr. 213 */ 214 static void 215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) 216 { 217 hrtime_t sent_hrtime; 218 struct timeval sent_tv; 219 struct pr_icmp probe_pkt; /* Probe packet */ 220 struct sockaddr_storage targ; /* target address */ 221 uint_t targaddrlen; /* targed address length */ 222 int pr_ndx; /* probe index in pii->pii_probes[] */ 223 boolean_t sent = _B_TRUE; 224 225 if (debug & D_TARGET) { 226 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), 227 pii->pii_name, probe_type, start_hrtime); 228 } 229 230 assert(pii->pii_probe_sock != -1); 231 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 232 probe_type == PROBE_RTT); 233 234 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 235 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 236 probe_pkt.pr_icmp_code = 0; 237 probe_pkt.pr_icmp_cksum = 0; 238 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 239 240 /* 241 * Since there is no need to do arithmetic on the icmpid, 242 * (only equality check is done) pii_icmpid is stored in 243 * network byte order at initialization itself. 
244 */ 245 probe_pkt.pr_icmp_id = pii->pii_icmpid; 246 probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); 247 probe_pkt.pr_icmp_mtype = htonl(probe_type); 248 249 /* 250 * If probe_type is PROBE_MULTI, this packet will be multicast to 251 * the all hosts address. Otherwise it is unicast to the next target. 252 */ 253 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 254 pii->pii_rtt_target_next != NULL)); 255 256 bzero(&targ, sizeof (targ)); 257 targ.ss_family = pii->pii_af; 258 259 if (pii->pii_af == AF_INET6) { 260 struct in6_addr *addr6; 261 262 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; 263 targaddrlen = sizeof (struct sockaddr_in6); 264 if (probe_type == PROBE_MULTI) { 265 *addr6 = all_nodes_mcast_v6; 266 } else if (probe_type == PROBE_UNI) { 267 *addr6 = pii->pii_target_next->tg_address; 268 } else { /* type is PROBE_RTT */ 269 *addr6 = pii->pii_rtt_target_next->tg_address; 270 } 271 } else { 272 struct in_addr *addr4; 273 274 addr4 = &((struct sockaddr_in *)&targ)->sin_addr; 275 targaddrlen = sizeof (struct sockaddr_in); 276 if (probe_type == PROBE_MULTI) { 277 *addr4 = all_nodes_mcast_v4; 278 } else if (probe_type == PROBE_UNI) { 279 IN6_V4MAPPED_TO_INADDR( 280 &pii->pii_target_next->tg_address, addr4); 281 } else { /* type is PROBE_RTT */ 282 IN6_V4MAPPED_TO_INADDR( 283 &pii->pii_rtt_target_next->tg_address, addr4); 284 } 285 286 /* 287 * Compute the IPv4 icmp checksum. Does not cover the IP header. 288 */ 289 probe_pkt.pr_icmp_cksum = 290 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 291 } 292 293 /* 294 * Use the current time as the time we sent. Not atomic, but the best 295 * we can do from here. 
296 */ 297 sent_hrtime = gethrtime(); 298 (void) gettimeofday(&sent_tv, NULL); 299 if (sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, 300 (struct sockaddr *)&targ, targaddrlen) != sizeof (probe_pkt)) { 301 logperror_pii(pii, "probe: probe sendto"); 302 sent = _B_FALSE; 303 } 304 305 /* 306 * If this is a PROBE_UNI probe packet being unicast to a target, then 307 * update our tables. We will need this info in processing the probe 308 * response. PROBE_MULTI and PROBE_RTT packets are not used for 309 * the purpose of failure or recovery detection. PROBE_MULTI packets 310 * are only used to construct a list of targets. PROBE_RTT packets are 311 * used only for updating the rtt and not for failure detection. 312 */ 313 if (probe_type == PROBE_UNI && sent) { 314 pr_ndx = pii->pii_probe_next; 315 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 316 317 /* Collect statistics, before we reuse the last slot. */ 318 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 319 pii->pii_cum_stats.lost++; 320 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 321 pii->pii_cum_stats.acked++; 322 pii->pii_cum_stats.sent++; 323 324 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; 325 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; 326 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; 327 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; 328 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 329 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); 330 331 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 332 pii->pii_target_next = target_next(pii->pii_target_next); 333 assert(pii->pii_target_next != NULL); 334 /* 335 * If we have a single variable to denote the next target to 336 * probe for both rtt probes and failure detection probes, we 337 * could end up with a situation where the failure detection 338 * probe targets become disjoint from the rtt probe targets. 339 * Eg. 
if 2 targets and the actual fdt is double the user 340 * specified fdt. So we have 2 variables. In this scheme 341 * we also reset pii_rtt_target_next for every fdt probe, 342 * though that may not be necessary. 343 */ 344 pii->pii_rtt_target_next = pii->pii_target_next; 345 pii->pii_snxt++; 346 } else if (probe_type == PROBE_RTT) { 347 pii->pii_rtt_target_next = 348 target_next(pii->pii_rtt_target_next); 349 assert(pii->pii_rtt_target_next != NULL); 350 } 351 } 352 353 /* 354 * Incoming IPv4 data from wire, is received here. Called from main. 355 */ 356 void 357 in_data(struct phyint_instance *pii) 358 { 359 struct sockaddr_in from; 360 struct in6_addr fromaddr; 361 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 362 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 363 struct ip *ip; 364 int iphlen; 365 int len; 366 char abuf[INET_ADDRSTRLEN]; 367 struct msghdr msg; 368 struct iovec iov; 369 struct pr_icmp *reply; 370 struct timeval *recv_tvp; 371 372 if (debug & D_PROBE) { 373 logdebug("in_data(%s %s)\n", 374 AF_STR(pii->pii_af), pii->pii_name); 375 } 376 377 iov.iov_base = (char *)in_packet; 378 iov.iov_len = sizeof (in_packet); 379 msg.msg_iov = &iov; 380 msg.msg_iovlen = 1; 381 msg.msg_name = (struct sockaddr *)&from; 382 msg.msg_namelen = sizeof (from); 383 msg.msg_control = ancillary_data; 384 msg.msg_controllen = sizeof (ancillary_data); 385 386 /* 387 * Poll has already told us that a message is waiting, 388 * on this socket. Read it now. We should not block. 389 */ 390 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 391 logperror_pii(pii, "in_data: recvmsg"); 392 return; 393 } 394 395 /* 396 * If the datalink has indicated the link is down, don't go 397 * any further. 
398 */ 399 if (LINK_DOWN(pii->pii_phyint)) 400 return; 401 402 /* Get the printable address for error reporting */ 403 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 404 405 /* Ignore packets > 64k or control buffers that don't fit */ 406 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 407 if (debug & D_PKTBAD) { 408 logdebug("Truncated message: msg_flags 0x%x from %s\n", 409 msg.msg_flags, abuf); 410 } 411 return; 412 } 413 414 /* Make sure packet contains at least minimum ICMP header */ 415 ip = (struct ip *)in_packet; 416 iphlen = ip->ip_hl << 2; 417 if (len < iphlen + ICMP_MINLEN) { 418 if (debug & D_PKTBAD) { 419 logdebug("in_data: packet too short (%d bytes)" 420 " from %s\n", len, abuf); 421 } 422 return; 423 } 424 425 /* 426 * Subtract the IP hdr length, 'len' will be length of the probe 427 * reply, starting from the icmp hdr. 428 */ 429 len -= iphlen; 430 /* LINTED */ 431 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 432 433 /* Probe replies are icmp echo replies. Ignore anything else */ 434 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 435 return; 436 437 /* 438 * The icmp id should match what we sent, which is stored 439 * in pi_icmpid. The icmp code for reply must be 0. 
440 * The reply content must be a struct pr_icmp 441 */ 442 if (reply->pr_icmp_id != pii->pii_icmpid) { 443 /* Not in response to our probe */ 444 return; 445 } 446 447 if (reply->pr_icmp_code != 0) { 448 logtrace("probe reply code %d from %s on %s\n", 449 reply->pr_icmp_code, abuf, pii->pii_name); 450 return; 451 } 452 453 if (len < sizeof (struct pr_icmp)) { 454 logtrace("probe reply too short: %d bytes from %s on %s\n", 455 len, abuf, pii->pii_name); 456 return; 457 } 458 459 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 460 if (recv_tvp == NULL) { 461 logtrace("message without timestamp from %s on %s\n", 462 abuf, pii->pii_name); 463 return; 464 } 465 466 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 467 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 468 /* Unicast probe reply */ 469 incoming_echo_reply(pii, reply, fromaddr, recv_tvp); 470 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 471 /* Multicast reply */ 472 incoming_mcast_reply(pii, reply, fromaddr); 473 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 474 incoming_rtt_reply(pii, reply, fromaddr); 475 } else { 476 /* Probably not in response to our probe */ 477 logtrace("probe reply type: %d from %s on %s\n", 478 reply->pr_icmp_mtype, abuf, pii->pii_name); 479 return; 480 } 481 } 482 483 /* 484 * Incoming IPv6 data from wire is received here. Called from main. 
485 */ 486 void 487 in6_data(struct phyint_instance *pii) 488 { 489 struct sockaddr_in6 from; 490 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 491 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 492 int len; 493 char abuf[INET6_ADDRSTRLEN]; 494 struct msghdr msg; 495 struct iovec iov; 496 void *opt; 497 struct pr_icmp *reply; 498 struct timeval *recv_tvp; 499 500 if (debug & D_PROBE) { 501 logdebug("in6_data(%s %s)\n", 502 AF_STR(pii->pii_af), pii->pii_name); 503 } 504 505 iov.iov_base = (char *)in_packet; 506 iov.iov_len = sizeof (in_packet); 507 msg.msg_iov = &iov; 508 msg.msg_iovlen = 1; 509 msg.msg_name = (struct sockaddr *)&from; 510 msg.msg_namelen = sizeof (from); 511 msg.msg_control = ancillary_data; 512 msg.msg_controllen = sizeof (ancillary_data); 513 514 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 515 logperror_pii(pii, "in6_data: recvmsg"); 516 return; 517 } 518 519 /* 520 * If the datalink has indicated that the link is down, don't go 521 * any further. 522 */ 523 if (LINK_DOWN(pii->pii_phyint)) 524 return; 525 526 /* Get the printable address for error reporting */ 527 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 528 if (len < ICMP_MINLEN) { 529 if (debug & D_PKTBAD) { 530 logdebug("Truncated message: msg_flags 0x%x from %s\n", 531 msg.msg_flags, abuf); 532 } 533 return; 534 } 535 /* Ignore packets > 64k or control buffers that don't fit */ 536 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 537 if (debug & D_PKTBAD) { 538 logdebug("Truncated message: msg_flags 0x%x from %s\n", 539 msg.msg_flags, abuf); 540 } 541 return; 542 } 543 544 reply = (struct pr_icmp *)in_packet; 545 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 546 return; 547 548 if (reply->pr_icmp_id != pii->pii_icmpid) { 549 /* Not in response to our probe */ 550 return; 551 } 552 553 /* 554 * The kernel has already verified the the ICMP checksum. 
555 */ 556 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 557 logtrace("ICMPv6 echo reply source address not linklocal from " 558 "%s on %s\n", abuf, pii->pii_name); 559 return; 560 } 561 opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); 562 if (opt != NULL) { 563 /* Can't allow routing headers in probe replies */ 564 logtrace("message with routing header from %s on %s\n", 565 abuf, pii->pii_name); 566 return; 567 } 568 569 if (reply->pr_icmp_code != 0) { 570 logtrace("probe reply code: %d from %s on %s\n", 571 reply->pr_icmp_code, abuf, pii->pii_name); 572 return; 573 } 574 if (len < (sizeof (struct pr_icmp))) { 575 logtrace("probe reply too short: %d bytes from %s on %s\n", 576 len, abuf, pii->pii_name); 577 return; 578 } 579 580 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 581 if (recv_tvp == NULL) { 582 logtrace("message without timestamp from %s on %s\n", 583 abuf, pii->pii_name); 584 return; 585 } 586 587 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 588 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); 589 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 590 incoming_mcast_reply(pii, reply, from.sin6_addr); 591 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 592 incoming_rtt_reply(pii, reply, from.sin6_addr); 593 } else { 594 /* Probably not in response to our probe */ 595 logtrace("probe reply type: %d from %s on %s\n", 596 reply->pr_icmp_mtype, abuf, pii->pii_name); 597 } 598 } 599 600 /* 601 * Process the incoming rtt reply, in response to our rtt probe. 602 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 603 * have any stored information about the probe we sent. So we don't log 604 * any errors if we receive bad replies. 
605 */ 606 static void 607 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 608 struct in6_addr fromaddr) 609 { 610 int64_t m; /* rtt measurement in ns */ 611 char abuf[INET6_ADDRSTRLEN]; 612 struct target *target; 613 struct phyint_group *pg; 614 615 /* Get the printable address for error reporting */ 616 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 617 618 if (debug & D_PROBE) { 619 logdebug("incoming_rtt_reply: %s %s %s\n", 620 AF_STR(pii->pii_af), pii->pii_name, abuf); 621 } 622 623 /* Do we know this target ? */ 624 target = target_lookup(pii, fromaddr); 625 if (target == NULL) 626 return; 627 628 m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); 629 /* Invalid rtt. It has wrapped around */ 630 if (m < 0) 631 return; 632 633 /* 634 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 635 * The initial few responses after the interface is repaired may 636 * contain high rtt's because they could have been queued up waiting 637 * for ARP/NDP resolution on a failed interface. 638 */ 639 pg = pii->pii_phyint->pi_group; 640 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 641 return; 642 643 /* 644 * Update rtt only if the new rtt is lower than the current rtt. 645 * (specified by the 3rd parameter to pi_set_crtt). 646 * If a spike has caused the current probe_interval to be > 647 * user_probe_interval, then this mechanism is used to bring down 648 * the rtt rapidly once the network stress is removed. 649 * If the new rtt is higher than the current rtt, we don't want to 650 * update the rtt. We are having more than 1 outstanding probe and 651 * the increase in rtt we are seeing is being unnecessarily weighted 652 * many times. The regular rtt update will be handled by 653 * incoming_echo_reply() and will take care of any rtt increase. 
654 */ 655 pi_set_crtt(target, m, _B_FALSE); 656 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 657 (user_failure_detection_time < pg->pg_fdt) && 658 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 659 /* 660 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 661 * investigate if we can improve the failure detection time to 662 * meet whatever the user specified. 663 */ 664 if (check_pg_crtt_improved(pg)) { 665 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 666 user_failure_detection_time); 667 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 668 if (pii->pii_phyint->pi_group != phyint_anongroup) { 669 logerr("Improved failure detection time %d ms " 670 "on (%s %s) for group \"%s\"\n", 671 pg->pg_fdt, AF_STR(pii->pii_af), 672 pii->pii_name, 673 pii->pii_phyint->pi_group->pg_name); 674 } 675 if (user_failure_detection_time == pg->pg_fdt) { 676 /* Avoid any truncation or rounding errors */ 677 pg->pg_probeint = user_probe_interval; 678 /* 679 * No more rtt probes will be sent. The actual 680 * fdt has dropped to the user specified value. 681 * pii_fd_snxt_basetime and pii_snxt_basetime 682 * will be in sync henceforth. 683 */ 684 reset_snxt_basetimes(); 685 } 686 } 687 } 688 } 689 690 /* 691 * Process the incoming echo reply, in response to our unicast probe. 
692 * Common for both IPv4 and IPv6 693 */ 694 static void 695 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 696 struct in6_addr fromaddr, struct timeval *recv_tvp) 697 { 698 int64_t m; /* rtt measurement in ns */ 699 hrtime_t cur_hrtime; /* in ns from some arbitrary point */ 700 char abuf[INET6_ADDRSTRLEN]; 701 int pr_ndx; 702 struct target *target; 703 boolean_t exception; 704 uint64_t pr_icmp_timestamp; 705 uint16_t pr_icmp_seq; 706 struct probe_stats *pr_statp; 707 struct phyint_group *pg = pii->pii_phyint->pi_group; 708 709 /* Get the printable address for error reporting */ 710 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 711 712 if (debug & D_PROBE) { 713 logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n", 714 AF_STR(pii->pii_af), pii->pii_name, abuf, 715 ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp)); 716 } 717 718 pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp); 719 pr_icmp_seq = ntohs(reply->pr_icmp_seq); 720 721 /* Reject out of window probe replies */ 722 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 723 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 724 logtrace("out of window probe seq %u snxt %u on %s from %s\n", 725 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 726 pii->pii_cum_stats.unknown++; 727 return; 728 } 729 730 cur_hrtime = gethrtime(); 731 m = (int64_t)(cur_hrtime - pr_icmp_timestamp); 732 if (m < 0) { 733 /* 734 * This is a ridiculously high value of rtt. rtt has wrapped 735 * around. Log a message, and ignore the rtt. 736 */ 737 logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld " 738 "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp); 739 } 740 741 /* 742 * Get the probe index pr_ndx corresponding to the received icmp seq. 743 * number in our pii->pii_probes[] array. 
The icmp sequence number 744 * pii_snxt corresponds to the probe index pii->pii_probe_next 745 */ 746 pr_ndx = MOD_SUB(pii->pii_probe_next, 747 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 748 749 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 750 751 target = pii->pii_probes[pr_ndx].pr_target; 752 753 /* 754 * Perform sanity checks, whether this probe reply that we 755 * have received is genuine 756 */ 757 if (target != NULL) { 758 /* 759 * Compare the src. addr of the received ICMP or ICMPv6 760 * probe reply with the target address in our tables. 761 */ 762 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 763 /* 764 * We don't have any record of having sent a probe to 765 * this target. This is a fake probe reply. Log an error 766 */ 767 logtrace("probe status %d Fake probe reply seq %u " 768 "snxt %u on %s from %s\n", 769 pii->pii_probes[pr_ndx].pr_status, 770 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 771 pii->pii_cum_stats.unknown++; 772 return; 773 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 774 /* 775 * The address matches, but our tables indicate that 776 * this probe reply has been acked already. So this 777 * is a duplicate probe reply. Log an error 778 */ 779 logtrace("probe status %d Duplicate probe reply seq %u " 780 "snxt %u on %s from %s\n", 781 pii->pii_probes[pr_ndx].pr_status, 782 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 783 pii->pii_cum_stats.unknown++; 784 return; 785 } 786 } else { 787 /* 788 * Target must not be NULL in the PR_UNACKED state 789 */ 790 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 791 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 792 /* 793 * The probe stats slot is unused. So we didn't 794 * send out any probe to this target. This is a fake. 795 * Log an error. 
796 */ 797 logtrace("probe status %d Fake probe reply seq %u " 798 "snxt %u on %s from %s\n", 799 pii->pii_probes[pr_ndx].pr_status, 800 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 801 } 802 pii->pii_cum_stats.unknown++; 803 return; 804 } 805 806 /* 807 * If the rtt does not appear to be right, don't update the 808 * rtt stats. This can happen if the system dropped into the 809 * debugger, or the system was hung or too busy for a 810 * substantial time that we didn't get a chance to run. 811 */ 812 if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) { 813 /* 814 * If the probe corresponding to this received response 815 * was truly sent 'm' ns. ago, then this response must 816 * have been rejected by the sequence number checks. The 817 * fact that it has passed the sequence number checks 818 * means that the measured rtt is wrong. We were probably 819 * scheduled long after the packet was received. 820 */ 821 goto out; 822 } 823 824 /* 825 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 826 * The initial few responses after the interface is repaired may 827 * contain high rtt's because they could have been queued up waiting 828 * for ARP/NDP resolution on a failed interface. 829 */ 830 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 831 goto out; 832 833 /* 834 * Don't update the Conservative Round Trip Time estimate for this 835 * (phint, target) pair if this is the not the highest ack seq seen 836 * thus far on this target. 837 */ 838 if (!highest_ack_tg(pr_icmp_seq, target)) 839 goto out; 840 841 /* 842 * Always update the rtt. This is a failure detection probe 843 * and we want to measure both increase / decrease in rtt. 844 */ 845 pi_set_crtt(target, m, _B_TRUE); 846 847 /* 848 * If the crtt exceeds the average time between probes, 849 * investigate if this slow target is an exception. If so we 850 * can avoid this target and still meet the failure detection 851 * time. 
Otherwise we can't meet the failure detection time. 852 */ 853 if (target->tg_crtt > pg->pg_probeint) { 854 exception = check_exception_target(pii, target); 855 if (exception) { 856 /* 857 * This target is exceptionally slow. Don't use it 858 * for future probes. check_exception_target() has 859 * made sure that we have at least MIN_PROBE_TARGETS 860 * other active targets 861 */ 862 if (pii->pii_targets_are_routers) { 863 /* 864 * This is a slow router, mark it as slow 865 * and don't use it for further probes. We 866 * don't delete it, since it will be populated 867 * again when we do a router scan. Hence we 868 * need to maintain extra state (unlike the 869 * host case below). Mark it as TG_SLOW. 870 */ 871 if (target->tg_status == TG_ACTIVE) 872 pii->pii_ntargets--; 873 target->tg_status = TG_SLOW; 874 target->tg_latime = gethrtime(); 875 target->tg_rtt_sa = -1; 876 target->tg_crtt = 0; 877 target->tg_rtt_sd = 0; 878 if (pii->pii_target_next == target) { 879 pii->pii_target_next = 880 target_next(target); 881 } 882 } else { 883 /* 884 * the slow target is not a router, we can 885 * just delete it. Send an icmp multicast and 886 * pick the fastest responder that is not 887 * already an active target. target_delete() 888 * adjusts pii->pii_target_next 889 */ 890 target_delete(target); 891 probe(pii, PROBE_MULTI, cur_hrtime); 892 } 893 } else { 894 /* 895 * We can't meet the failure detection time. 896 * Log a message, and update the detection time to 897 * whatever we can achieve. 
898 */ 899 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 900 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 901 last_fdt_bumpup_time = gethrtime(); 902 if (pg != phyint_anongroup) { 903 logerr("Cannot meet requested failure detection" 904 " time of %d ms on (%s %s) new failure" 905 " detection time for group \"%s\" is %d" 906 " ms\n", user_failure_detection_time, 907 AF_STR(pii->pii_af), pii->pii_name, 908 pg->pg_name, pg->pg_fdt); 909 } 910 } 911 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 912 (user_failure_detection_time < pg->pg_fdt) && 913 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 914 /* 915 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 916 * investigate if we can improve the failure detection time to 917 * meet whatever the user specified. 918 */ 919 if (check_pg_crtt_improved(pg)) { 920 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 921 user_failure_detection_time); 922 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 923 if (pg != phyint_anongroup) { 924 logerr("Improved failure detection time %d ms " 925 "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 926 AF_STR(pii->pii_af), pii->pii_name, 927 pg->pg_name); 928 } 929 if (user_failure_detection_time == pg->pg_fdt) { 930 /* Avoid any truncation or rounding errors */ 931 pg->pg_probeint = user_probe_interval; 932 /* 933 * No more rtt probes will be sent. The actual 934 * fdt has dropped to the user specified value. 935 * pii_fd_snxt_basetime and pii_snxt_basetime 936 * will be in sync henceforth. 937 */ 938 reset_snxt_basetimes(); 939 } 940 } 941 } 942 out: 943 pr_statp = &pii->pii_probes[pr_ndx]; 944 pr_statp->pr_hrtime_ackproc = cur_hrtime; 945 pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent + 946 (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent)); 947 948 probe_chstate(pr_statp, pii, PR_ACKED); 949 950 /* 951 * Update pii->pii_rack, i.e. 
the sequence number of the last received 952 * probe response, based on the echo reply we have received now, if 953 * either of the following conditions are satisfied. 954 * a. pii_rack is outside the current receive window of 955 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 956 * This means we have not received probe responses for a 957 * long time, and the sequence number has wrapped around. 958 * b. pii_rack is within the current receive window and this echo 959 * reply corresponds to the highest sequence number we have seen 960 * so far. 961 */ 962 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 963 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 964 SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 965 pii->pii_rack = pr_icmp_seq; 966 } 967 } 968 969 /* 970 * Returns true if seq is the highest unacknowledged seq for target tg 971 * else returns false 972 */ 973 static boolean_t 974 highest_ack_tg(uint16_t seq, struct target *tg) 975 { 976 struct phyint_instance *pii; 977 int pr_ndx; 978 uint16_t pr_seq; 979 980 pii = tg->tg_phyint_inst; 981 982 /* 983 * Get the seq number of the most recent probe sent so far, 984 * and also get the corresponding probe index in the probe stats 985 * array. 986 */ 987 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 988 pr_seq = pii->pii_snxt; 989 pr_seq--; 990 991 /* 992 * Start from the most recent probe and walk back, trying to find 993 * an acked probe corresponding to target tg. 994 */ 995 for (; pr_ndx != pii->pii_probe_next; 996 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 997 if (pii->pii_probes[pr_ndx].pr_target == tg && 998 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 999 if (SEQ_GT(pr_seq, seq)) 1000 return (_B_FALSE); 1001 } 1002 } 1003 return (_B_TRUE); 1004 } 1005 1006 /* 1007 * Check whether the crtt for the group has improved by a factor of 1008 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1009 * detection time flapping in the face of small crtt changes. 
1010 */ 1011 static boolean_t 1012 check_pg_crtt_improved(struct phyint_group *pg) 1013 { 1014 struct phyint *pi; 1015 1016 if (debug & D_PROBE) 1017 logdebug("check_pg_crtt_improved()\n"); 1018 1019 /* 1020 * The crtt for the group is only improved if each phyint_instance 1021 * for both ipv4 and ipv6 is improved. 1022 */ 1023 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1024 if (!check_pii_crtt_improved(pi->pi_v4) || 1025 !check_pii_crtt_improved(pi->pi_v6)) 1026 return (_B_FALSE); 1027 } 1028 1029 return (_B_TRUE); 1030 } 1031 1032 /* 1033 * Check whether the crtt has improved substantially on this phyint_instance. 1034 * Returns _B_TRUE if there's no crtt information available, because pii 1035 * is NULL or the phyint_instance is not capable of probing. 1036 */ 1037 boolean_t 1038 check_pii_crtt_improved(struct phyint_instance *pii) { 1039 struct target *tg; 1040 1041 if (pii == NULL) 1042 return (_B_TRUE); 1043 1044 if (!PROBE_CAPABLE(pii) || 1045 pii->pii_phyint->pi_state == PI_FAILED) 1046 return (_B_TRUE); 1047 1048 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1049 if (tg->tg_status != TG_ACTIVE) 1050 continue; 1051 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1052 LOWER_FDT_TRIGGER)) { 1053 return (_B_FALSE); 1054 } 1055 } 1056 1057 return (_B_TRUE); 1058 } 1059 1060 /* 1061 * This target responds very slowly to probes. The target's crtt exceeds 1062 * the probe interval of its group. 
Compare against other targets
 * and determine if this target is an exception, if so return true, else false
 */
static boolean_t
check_exception_target(struct phyint_instance *pii, struct target *target)
{
	char addrstr[INET6_ADDRSTRLEN];
	struct target *other;

	if (debug & D_PROBE) {
		logdebug("check_exception_target(%s %s target %s)\n",
		    AF_STR(pii->pii_af), pii->pii_name,
		    pr_addr(pii->pii_af, target->tg_address,
		    addrstr, sizeof (addrstr)));
	}

	/*
	 * Without at least MIN_PROBE_TARGETS + 1 targets there is not
	 * enough to compare against, so the target is kept.
	 */
	if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1)
		return (_B_FALSE);

	/*
	 * `target' is already known to exceed the group's probe interval.
	 * It is an exception only when every other active target has a
	 * crtt no larger than (group probe interval) / EXCEPTION_FACTOR;
	 * the first other active target above that bound settles it.
	 */
	other = pii->pii_targets;
	while (other != NULL) {
		if (other != target && other->tg_status == TG_ACTIVE &&
		    other->tg_crtt >
		    pii->pii_phyint->pi_group->pg_probeint /
		    EXCEPTION_FACTOR)
			return (_B_FALSE);

		other = other->tg_next;
	}

	return (_B_TRUE);
}

/*
 * Update the target list. The icmp all hosts multicast has given us
 * some host to which we can send probes. If we already have sufficient
 * targets, discard it.
1109 */ 1110 static void 1111 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1112 struct in6_addr fromaddr) 1113 /* ARGSUSED */ 1114 { 1115 int af; 1116 char abuf[INET6_ADDRSTRLEN]; 1117 struct phyint *pi; 1118 1119 if (debug & D_PROBE) { 1120 logdebug("incoming_mcast_reply(%s %s %s)\n", 1121 AF_STR(pii->pii_af), pii->pii_name, 1122 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1123 } 1124 1125 /* 1126 * Using host targets is a fallback mechanism. If we have 1127 * found a router, don't add this host target. If we already 1128 * know MAX_PROBE_TARGETS, don't add another target. 1129 */ 1130 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1131 if (pii->pii_targets != NULL) { 1132 if (pii->pii_targets_are_routers || 1133 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1134 return; 1135 } 1136 } 1137 1138 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1139 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1140 /* 1141 * Guard against response from 0.0.0.0 1142 * and ::. Log a trace message 1143 */ 1144 logtrace("probe response from %s on %s\n", 1145 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1146 pii->pii_name); 1147 return; 1148 } 1149 1150 /* 1151 * This address is one of our own, so reject this address as a 1152 * valid probe target. 1153 */ 1154 af = pii->pii_af; 1155 if (own_address(fromaddr)) 1156 return; 1157 1158 /* 1159 * If the phyint is part a named group, then add the address to all 1160 * members of the group. Otherwise, add the address only to the 1161 * phyint itself, since other phyints in the anongroup may not be on 1162 * the same subnet. 
1163 */ 1164 pi = pii->pii_phyint; 1165 if (pi->pi_group == phyint_anongroup) { 1166 target_add(pii, fromaddr, _B_FALSE); 1167 } else { 1168 pi = pi->pi_group->pg_phyint; 1169 for (; pi != NULL; pi = pi->pi_pgnext) 1170 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1171 } 1172 } 1173 1174 /* 1175 * Compute CRTT given an existing scaled average, scaled deviation estimate 1176 * and a new rtt time. The formula is from Jacobson and Karels' 1177 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1178 * are the same as those in Appendix A.2 of that paper. 1179 * 1180 * m = new measurement 1181 * sa = scaled RTT average (8 * average estimates) 1182 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1183 * crtt = Conservative round trip time. Used to determine whether probe 1184 * has timed out. 1185 * 1186 * New scaled average and deviation are passed back via sap and svp 1187 */ 1188 static int64_t 1189 compute_crtt(int64_t *sap, int64_t *svp, int64_t m) 1190 { 1191 int64_t sa = *sap; 1192 int64_t sv = *svp; 1193 int64_t crtt; 1194 int64_t saved_m = m; 1195 1196 assert(*sap >= -1); 1197 assert(*svp >= 0); 1198 1199 if (sa != -1) { 1200 /* 1201 * Update average estimator: 1202 * new rtt = old rtt + 1/8 Error 1203 * where Error = m - old rtt 1204 * i.e. 8 * new rtt = 8 * old rtt + Error 1205 * i.e. new sa = old sa + Error 1206 */ 1207 m -= sa >> 3; /* m is now Error in estimate. */ 1208 if ((sa += m) < 0) { 1209 /* Don't allow the smoothed average to be negative. */ 1210 sa = 0; 1211 } 1212 1213 /* 1214 * Update deviation estimator: 1215 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1216 * i.e. 4 * new mdev = 4 * old mdev + 1217 * (abs(Error) - old mdev) 1218 * i.e. new sv = old sv + (abs(Error) - old mdev) 1219 */ 1220 if (m < 0) 1221 m = -m; 1222 m -= sv >> 2; 1223 sv += m; 1224 } else { 1225 /* Initialization. This is the first response received. 
*/ 1226 sa = (m << 3); 1227 sv = (m << 1); 1228 } 1229 1230 crtt = (sa >> 3) + sv; 1231 1232 if (debug & D_PROBE) { 1233 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " 1234 "crtt = %lld\n", saved_m, sa, sv, crtt); 1235 } 1236 1237 *sap = sa; 1238 *svp = sv; 1239 1240 /* 1241 * CRTT = average estimates + 4 * deviation estimates 1242 * = sa / 8 + sv 1243 */ 1244 return (crtt); 1245 } 1246 1247 static void 1248 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) 1249 { 1250 struct phyint_instance *pii = tg->tg_phyint_inst; 1251 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1252 int64_t sa = tg->tg_rtt_sa; 1253 int64_t sv = tg->tg_rtt_sd; 1254 int new_crtt; 1255 int i; 1256 1257 if (debug & D_PROBE) 1258 logdebug("pi_set_crtt: target - m %lld\n", m); 1259 1260 /* store the round trip time, in case we need to defer computation */ 1261 tg->tg_deferred[tg->tg_num_deferred] = m; 1262 1263 new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); 1264 1265 /* 1266 * If this probe's round trip time would singlehandedly cause an 1267 * increase in the group's probe interval consider it suspect. 1268 */ 1269 if ((new_crtt > probe_interval) && is_probe_uni) { 1270 if (debug & D_PROBE) { 1271 logdebug("Received a suspect probe on %s, new_crtt =" 1272 " %d, probe_interval = %d, num_deferred = %d\n", 1273 pii->pii_probe_logint->li_name, new_crtt, 1274 probe_interval, tg->tg_num_deferred); 1275 } 1276 1277 /* 1278 * If we've deferred as many rtts as we plan on deferring, then 1279 * assume the link really did slow down and process all queued 1280 * rtts 1281 */ 1282 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1283 if (debug & D_PROBE) { 1284 logdebug("Received MAXDEFERREDRTT probes which " 1285 "would cause an increased probe_interval. 
" 1286 "Integrating queued rtt data points.\n"); 1287 } 1288 1289 for (i = 0; i <= tg->tg_num_deferred; i++) { 1290 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, 1291 &tg->tg_rtt_sd, tg->tg_deferred[i])); 1292 } 1293 1294 tg->tg_num_deferred = 0; 1295 } else { 1296 tg->tg_num_deferred++; 1297 } 1298 return; 1299 } 1300 1301 /* 1302 * If this is a normal probe, or an RTT probe that would lead to a 1303 * reduced CRTT, then update our CRTT data. Further, if this was 1304 * a normal probe, pitch any deferred probes since our probes are 1305 * again being answered within our CRTT estimates. 1306 */ 1307 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1308 tg->tg_rtt_sa = sa; 1309 tg->tg_rtt_sd = sv; 1310 tg->tg_crtt = new_crtt; 1311 if (is_probe_uni) 1312 tg->tg_num_deferred = 0; 1313 } 1314 } 1315 1316 /* 1317 * Return a pointer to the specified option buffer. 1318 * If not found return NULL. 1319 */ 1320 static void * 1321 find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) 1322 { 1323 struct cmsghdr *cmsg; 1324 1325 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1326 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1327 if (cmsg->cmsg_level == cmsg_level && 1328 cmsg->cmsg_type == cmsg_type) { 1329 return (CMSG_DATA(cmsg)); 1330 } 1331 } 1332 return (NULL); 1333 } 1334 1335 /* 1336 * Try to activate another INACTIVE interface in the same group as `pi'. 1337 * Prefer STANDBY INACTIVE to just INACTIVE. 
1338 */ 1339 void 1340 phyint_activate_another(struct phyint *pi) 1341 { 1342 struct phyint *pi2; 1343 struct phyint *inactivepi = NULL; 1344 1345 if (pi->pi_group == phyint_anongroup) 1346 return; 1347 1348 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1349 if (pi == pi2 || pi2->pi_state != PI_RUNNING || 1350 !(pi2->pi_flags & IFF_INACTIVE)) 1351 continue; 1352 1353 inactivepi = pi2; 1354 if (pi2->pi_flags & IFF_STANDBY) 1355 break; 1356 } 1357 1358 if (inactivepi != NULL) 1359 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); 1360 } 1361 1362 /* 1363 * Transition a phyint back to PI_RUNNING (from PI_FAILED or PI_OFFLINE). The 1364 * caller must ensure that the transition is appropriate. Clears IFF_OFFLINE 1365 * or IFF_FAILED, as appropriate. Also sets IFF_INACTIVE on this or other 1366 * interfaces as appropriate (see comment below). Finally, also updates the 1367 * phyint's group state to account for the change. 1368 */ 1369 void 1370 phyint_transition_to_running(struct phyint *pi) 1371 { 1372 struct phyint *pi2; 1373 struct phyint *actstandbypi = NULL; 1374 uint_t nactive = 0, nnonstandby = 0; 1375 boolean_t onlining = (pi->pi_state == PI_OFFLINE); 1376 uint64_t set, clear; 1377 1378 /* 1379 * The interface is running again, but should it or another interface 1380 * in the group end up INACTIVE? There are three cases: 1381 * 1382 * 1. If it's a STANDBY interface, it should be end up INACTIVE if 1383 * the group is operating at capacity (i.e., there are at least as 1384 * many active interfaces as non-STANDBY interfaces in the group). 1385 * No other interfaces should be changed. 1386 * 1387 * 2. If it's a non-STANDBY interface and we're onlining it or 1388 * FAILBACK is enabled, then it should *not* end up INACTIVE. 1389 * Further, if the group is above capacity as a result of this 1390 * interface, then an active STANDBY interface in the group should 1391 * end up INACTIVE. 1392 * 1393 * 3. 
If it's a non-STANDBY interface, we're repairing it, and 1394 * FAILBACK is disabled, then it should end up INACTIVE *unless* 1395 * the group was failed (in which case we have no choice but to 1396 * use it). No other interfaces should be changed. 1397 */ 1398 if (pi->pi_group != phyint_anongroup) { 1399 pi2 = pi->pi_group->pg_phyint; 1400 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1401 if (!(pi2->pi_flags & IFF_STANDBY)) 1402 nnonstandby++; 1403 1404 if (pi2->pi_state == PI_RUNNING) { 1405 if (!(pi2->pi_flags & IFF_INACTIVE)) { 1406 nactive++; 1407 if (pi2->pi_flags & IFF_STANDBY) 1408 actstandbypi = pi2; 1409 } 1410 } 1411 } 1412 } 1413 1414 set = 0; 1415 clear = (onlining ? IFF_OFFLINE : IFF_FAILED); 1416 1417 if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ 1418 if (nactive >= nnonstandby) 1419 set |= IFF_INACTIVE; 1420 else 1421 clear |= IFF_INACTIVE; 1422 } else if (onlining || failback_enabled) { /* case 2 */ 1423 if (nactive >= nnonstandby && actstandbypi != NULL) 1424 (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); 1425 } else if (!GROUP_FAILED(pi->pi_group)) { /* case 3 */ 1426 set |= IFF_INACTIVE; 1427 } 1428 (void) change_pif_flags(pi, set, clear); 1429 1430 phyint_chstate(pi, PI_RUNNING); 1431 1432 /* 1433 * Update the group state to account for the change. 1434 */ 1435 phyint_group_refresh_state(pi->pi_group); 1436 } 1437 1438 /* 1439 * See if a previously failed interface has started working again. 1440 */ 1441 void 1442 phyint_check_for_repair(struct phyint *pi) 1443 { 1444 if (!phyint_repaired(pi)) 1445 return; 1446 1447 if (pi->pi_group == phyint_anongroup) { 1448 logerr("IP interface repair detected on %s\n", pi->pi_name); 1449 } else { 1450 logerr("IP interface repair detected on %s of group %s\n", 1451 pi->pi_name, pi->pi_group->pg_name); 1452 } 1453 1454 /* 1455 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. 
1456 * So just clear IFF_OFFLINE and defer phyint_transition_to_running() 1457 * until it is brought back online. 1458 */ 1459 if (pi->pi_state == PI_OFFLINE) { 1460 (void) change_pif_flags(pi, 0, IFF_FAILED); 1461 return; 1462 } 1463 1464 phyint_transition_to_running(pi); /* calls phyint_chstate() */ 1465 } 1466 1467 /* 1468 * See if an interface has failed, or if the whole group of interfaces has 1469 * failed. 1470 */ 1471 static void 1472 phyint_inst_check_for_failure(struct phyint_instance *pii) 1473 { 1474 struct phyint *pi = pii->pii_phyint; 1475 struct phyint *pi2; 1476 boolean_t was_active; 1477 1478 switch (failure_state(pii)) { 1479 case PHYINT_FAILURE: 1480 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); 1481 1482 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); 1483 if (pi->pi_group == phyint_anongroup) { 1484 logerr("IP interface failure detected on %s\n", 1485 pii->pii_name); 1486 } else { 1487 logerr("IP interface failure detected on %s of group" 1488 " %s\n", pii->pii_name, pi->pi_group->pg_name); 1489 } 1490 1491 /* 1492 * If the interface is offline, the state change will be 1493 * noted when it comes back online. 1494 */ 1495 if (pi->pi_state != PI_OFFLINE) { 1496 /* 1497 * If the failed interface was active, activate 1498 * another INACTIVE interface in the group if 1499 * possible. (If the interface is PI_OFFLINE, 1500 * we already activated another.) 1501 */ 1502 if (was_active) 1503 phyint_activate_another(pi); 1504 1505 phyint_chstate(pi, PI_FAILED); 1506 reset_crtt_all(pi); 1507 } 1508 break; 1509 1510 case GROUP_FAILURE: 1511 pi2 = pi->pi_group->pg_phyint; 1512 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1513 (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); 1514 if (pi2->pi_state == PI_OFFLINE) /* see comment above */ 1515 continue; 1516 1517 reset_crtt_all(pi2); 1518 /* 1519 * In the case of host targets, we would have flushed 1520 * the targets, and gone to PI_NOTARGETS state. 
1521 */ 1522 if (pi2->pi_state == PI_RUNNING) 1523 phyint_chstate(pi2, PI_FAILED); 1524 } 1525 break; 1526 1527 default: 1528 break; 1529 } 1530 } 1531 1532 /* 1533 * Determines if any timeout event has occurred and returns the number of 1534 * milliseconds until the next timeout event for the phyint. Returns 1535 * TIMER_INFINITY for "never". 1536 */ 1537 uint_t 1538 phyint_inst_timer(struct phyint_instance *pii) 1539 { 1540 int pr_ndx; 1541 uint_t timeout; 1542 struct target *cur_tg; 1543 struct probe_stats *pr_statp; 1544 struct phyint_instance *pii_other; 1545 struct phyint *pi; 1546 int valid_unack_count; 1547 int i; 1548 int interval; 1549 uint_t check_time; 1550 uint_t cur_time; 1551 hrtime_t cur_hrtime; 1552 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1553 1554 cur_hrtime = gethrtime(); 1555 cur_time = ns2ms(cur_hrtime); 1556 1557 if (debug & D_TIMER) { 1558 logdebug("phyint_inst_timer(%s %s)\n", 1559 AF_STR(pii->pii_af), pii->pii_name); 1560 } 1561 1562 pii_other = phyint_inst_other(pii); 1563 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1564 /* 1565 * Check to see if we're here due to link up/down flapping; If 1566 * enough time has passed, then try to bring the interface 1567 * back up; otherwise, schedule a timer to bring it back up 1568 * when enough time *has* elapsed. 1569 */ 1570 pi = pii->pii_phyint; 1571 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1572 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1573 if (check_time > cur_time) 1574 return (check_time - cur_time); 1575 1576 phyint_check_for_repair(pi); 1577 } 1578 } 1579 1580 /* 1581 * If probing is not enabled on this phyint instance, don't proceed. 
1582 */ 1583 if (!PROBE_ENABLED(pii)) 1584 return (TIMER_INFINITY); 1585 1586 /* 1587 * If the timer has fired too soon, probably triggered 1588 * by some other phyint instance, return the remaining 1589 * time 1590 */ 1591 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1592 return (pii->pii_snxt_time - cur_time); 1593 1594 /* 1595 * If the link is down, don't send any probes for now. 1596 */ 1597 if (LINK_DOWN(pii->pii_phyint)) 1598 return (TIMER_INFINITY); 1599 1600 /* 1601 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1602 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1603 * Base probe time is strictly periodic. 1604 */ 1605 interval = GET_RANDOM( 1606 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1607 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1608 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1609 1610 /* 1611 * Check if the current time > next time to probe. If so, we missed 1612 * sending 1 or more probes, probably due to heavy system load. At least 1613 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1614 * were scheduled. Make adjustments to the times, in multiples of 1615 * user_probe_interval. 
1616 */ 1617 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1618 int n; 1619 1620 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1621 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1622 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1623 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1624 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1625 pii->pii_snxt_basetime); 1626 1627 /* Collect statistics about missed probes */ 1628 probes_missed.pm_nprobes += n + 1; 1629 probes_missed.pm_ntimes++; 1630 } 1631 pii->pii_snxt_basetime += user_probe_interval; 1632 interval = pii->pii_snxt_time - cur_time; 1633 if (debug & D_TARGET) { 1634 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1635 " interval %u\n", cur_time, pii->pii_snxt_time, 1636 pii->pii_snxt_basetime, interval); 1637 } 1638 1639 /* 1640 * If no targets are known, we need to send an ICMP multicast. The 1641 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1642 * to see if we found a target. 1643 */ 1644 if (pii->pii_target_next == NULL) { 1645 assert(pii->pii_ntargets == 0); 1646 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1647 probe(pii, PROBE_MULTI, cur_time); 1648 return (interval); 1649 } 1650 1651 if ((user_probe_interval != probe_interval) && 1652 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1653 /* 1654 * the failure detection (fd) probe timer has not yet fired. 1655 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1656 */ 1657 probe(pii, PROBE_RTT, cur_hrtime); 1658 return (interval); 1659 } 1660 /* 1661 * the fd probe timer has fired. Need to do all failure 1662 * detection / recovery calculations, and then send an fd probe 1663 * of type PROBE_UNI. 1664 */ 1665 if (user_probe_interval == probe_interval) { 1666 /* 1667 * We could have missed some probes, and then adjusted 1668 * pii_snxt_basetime above. Otherwise we could have 1669 * blindly added probe_interval to pii_fd_snxt_basetime. 
1670 */ 1671 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1672 } else { 1673 pii->pii_fd_snxt_basetime += probe_interval; 1674 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1675 int n; 1676 1677 n = (cur_time - pii->pii_fd_snxt_basetime) / 1678 probe_interval; 1679 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1680 } 1681 } 1682 1683 /* 1684 * We can have at most, the latest 2 probes that we sent, in 1685 * the PR_UNACKED state. All previous probes sent, are either 1686 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1687 * timed out if the probe's time_start + the CRTT < currenttime. 1688 * For each of the last 2 probes, examine whether it has timed 1689 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1690 */ 1691 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1692 valid_unack_count = 0; 1693 1694 for (i = 0; i < 2; i++) { 1695 pr_statp = &pii->pii_probes[pr_ndx]; 1696 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1697 switch (pr_statp->pr_status) { 1698 case PR_ACKED: 1699 /* 1700 * We received back an ACK, so the switch clearly 1701 * is not dropping our traffic, and thus we can 1702 * enable failure detection immediately. 1703 */ 1704 if (pii->pii_fd_hrtime > gethrtime()) { 1705 if (debug & D_PROBE) { 1706 logdebug("successful probe on %s; " 1707 "ending quiet period\n", 1708 pii->pii_phyint->pi_name); 1709 } 1710 pii->pii_fd_hrtime = gethrtime(); 1711 } 1712 break; 1713 1714 case PR_UNACKED: 1715 assert(cur_tg != NULL); 1716 /* 1717 * The crtt could be zero for some reason, 1718 * Eg. the phyint could be failed. If the crtt is 1719 * not available use group's probe interval, 1720 * which is a worst case estimate. 
1721 */ 1722 timeout = ns2ms(pr_statp->pr_hrtime_start); 1723 if (cur_tg->tg_crtt != 0) { 1724 timeout += cur_tg->tg_crtt; 1725 } else { 1726 timeout += probe_interval; 1727 } 1728 if (TIME_LT(timeout, cur_time)) { 1729 pr_statp->pr_time_lost = timeout; 1730 probe_chstate(pr_statp, pii, PR_LOST); 1731 } else if (i == 1) { 1732 /* 1733 * We are forced to consider this probe 1734 * lost, as we can have at most 2 unack. 1735 * probes any time, and we will be sending a 1736 * probe at the end of this function. 1737 * Normally, we should not be here, but 1738 * this can happen if an incoming response 1739 * that was considered lost has increased 1740 * the crtt for this target, and also bumped 1741 * up the FDT. Note that we never cancel or 1742 * increase the current pii_time_left, so 1743 * when the timer fires, we find 2 valid 1744 * unacked probes, and they are yet to timeout 1745 */ 1746 pr_statp->pr_time_lost = cur_time; 1747 probe_chstate(pr_statp, pii, PR_LOST); 1748 } else { 1749 /* 1750 * Only the most recent probe can enter 1751 * this 'else' arm. The second most recent 1752 * probe must take either of the above arms, 1753 * if it is unacked. 1754 */ 1755 valid_unack_count++; 1756 } 1757 break; 1758 } 1759 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1760 } 1761 1762 /* 1763 * We send out 1 probe randomly in the interval between one half 1764 * and one probe interval for the group. Given that the CRTT is always 1765 * less than the group's probe interval, we can have at most 1 1766 * unacknowledged probe now. All previous probes are either lost or 1767 * acked. 1768 */ 1769 assert(valid_unack_count == 0 || valid_unack_count == 1); 1770 1771 /* 1772 * The timer has fired. Take appropriate action depending 1773 * on the current state of the phyint. 
1774 * 1775 * PI_RUNNING state - Failure detection 1776 * PI_FAILED state - Repair detection 1777 */ 1778 switch (pii->pii_phyint->pi_state) { 1779 case PI_FAILED: 1780 /* 1781 * If the most recent probe (excluding unacked probes that 1782 * are yet to time out) has been acked, check whether the 1783 * phyint is now repaired. 1784 */ 1785 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1786 phyint_check_for_repair(pii->pii_phyint); 1787 } 1788 break; 1789 1790 case PI_RUNNING: 1791 /* 1792 * It's possible our probes have been lost because of a 1793 * spanning-tree mandated quiet period on the switch. If so, 1794 * ignore the lost probes. 1795 */ 1796 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1797 break; 1798 1799 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1800 /* 1801 * We have 1 or more failed probes (excluding unacked 1802 * probes that are yet to time out). Determine if the 1803 * phyint has failed. 1804 */ 1805 phyint_inst_check_for_failure(pii); 1806 } 1807 break; 1808 1809 default: 1810 logerr("phyint_inst_timer: invalid state %d\n", 1811 pii->pii_phyint->pi_state); 1812 abort(); 1813 } 1814 1815 /* 1816 * Start the next probe. probe() will also set pii->pii_probe_time_left 1817 * to the group's probe interval. If phyint_failed -> target_flush_hosts 1818 * was called, the target list may be empty. 1819 */ 1820 if (pii->pii_target_next != NULL) { 1821 probe(pii, PROBE_UNI, cur_hrtime); 1822 /* 1823 * If we have just the one probe target, and we're not using 1824 * router targets, try to find another as we presently have 1825 * no resilience. 1826 */ 1827 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1828 probe(pii, PROBE_MULTI, cur_hrtime); 1829 } else { 1830 probe(pii, PROBE_MULTI, cur_hrtime); 1831 } 1832 return (interval); 1833 } 1834 1835 /* 1836 * Start the probe timer for an interface instance. 
1837 */ 1838 void 1839 start_timer(struct phyint_instance *pii) 1840 { 1841 uint32_t interval; 1842 1843 /* 1844 * Spread the base probe times (pi_snxt_basetime) across phyints 1845 * uniformly over the (curtime..curtime + the group's probe_interval). 1846 * pi_snxt_basetime is strictly periodic with a frequency of 1847 * the group's probe interval. The actual probe time pi_snxt_time 1848 * adds some randomness to pi_snxt_basetime and happens in probe(). 1849 * For the 1st probe on each phyint after the timer is started, 1850 * pi_snxt_time and pi_snxt_basetime are the same. 1851 */ 1852 interval = GET_RANDOM(0, 1853 (int)pii->pii_phyint->pi_group->pg_probeint); 1854 1855 pii->pii_snxt_basetime = getcurrenttime() + interval; 1856 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1857 pii->pii_snxt_time = pii->pii_snxt_basetime; 1858 timer_schedule(interval); 1859 } 1860 1861 /* 1862 * Restart the probe timer on an interface instance. 1863 */ 1864 static void 1865 restart_timer(struct phyint_instance *pii) 1866 { 1867 /* 1868 * We don't need to restart the timer if it was never started in 1869 * the first place (pii->pii_basetime_inited not set), as the timer 1870 * won't have gone off yet. 1871 */ 1872 if (pii->pii_basetime_inited != 0) { 1873 1874 if (debug & D_LINKNOTE) 1875 logdebug("restart timer: restarting timer on %s, " 1876 "address family %s\n", pii->pii_phyint->pi_name, 1877 AF_STR(pii->pii_af)); 1878 1879 start_timer(pii); 1880 } 1881 } 1882 1883 static void 1884 process_link_state_down(struct phyint *pi) 1885 { 1886 logerr("The link has gone down on %s\n", pi->pi_name); 1887 1888 /* 1889 * Clear the probe statistics arrays, we don't want the repair 1890 * detection logic relying on probes that were successful prior 1891 * to the link going down. 1892 */ 1893 if (PROBE_CAPABLE(pi->pi_v4)) 1894 clear_pii_probe_stats(pi->pi_v4); 1895 if (PROBE_CAPABLE(pi->pi_v6)) 1896 clear_pii_probe_stats(pi->pi_v6); 1897 /* 1898 * Check for interface failure. 
Although we know the interface 1899 * has failed, we don't know if all the other interfaces in the 1900 * group have failed as well. 1901 */ 1902 if ((pi->pi_state == PI_RUNNING) || 1903 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1904 if (debug & D_LINKNOTE) { 1905 logdebug("process_link_state_down:" 1906 " checking for failure on %s\n", pi->pi_name); 1907 } 1908 1909 if (pi->pi_v4 != NULL) 1910 phyint_inst_check_for_failure(pi->pi_v4); 1911 else if (pi->pi_v6 != NULL) 1912 phyint_inst_check_for_failure(pi->pi_v6); 1913 } 1914 } 1915 1916 static void 1917 process_link_state_up(struct phyint *pi) 1918 { 1919 logerr("The link has come up on %s\n", pi->pi_name); 1920 1921 /* 1922 * We stopped any running timers on each instance when the link 1923 * went down, so restart them. 1924 */ 1925 if (pi->pi_v4) 1926 restart_timer(pi->pi_v4); 1927 if (pi->pi_v6) 1928 restart_timer(pi->pi_v6); 1929 1930 phyint_check_for_repair(pi); 1931 1932 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1933 if (pi->pi_whendx == LINK_UP_PERMIN) 1934 pi->pi_whendx = 0; 1935 } 1936 1937 /* 1938 * Process any changes in link state passed up from the interfaces. 1939 */ 1940 void 1941 process_link_state_changes(void) 1942 { 1943 struct phyint *pi; 1944 1945 /* Look for interfaces where the link state has just changed */ 1946 1947 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1948 boolean_t old_link_state_up = LINK_UP(pi); 1949 1950 /* 1951 * Except when the "phyint" structure is created, this is 1952 * the only place the link state is updated. This allows 1953 * this routine to detect changes in link state, rather 1954 * than just the current state. 1955 */ 1956 UPDATE_LINK_STATE(pi); 1957 1958 if (LINK_DOWN(pi)) { 1959 /* 1960 * Has link just gone down? 1961 */ 1962 if (old_link_state_up) 1963 process_link_state_down(pi); 1964 } else { 1965 /* 1966 * Has link just gone back up? 
1967 */ 1968 if (!old_link_state_up) 1969 process_link_state_up(pi); 1970 } 1971 } 1972 } 1973 1974 void 1975 reset_crtt_all(struct phyint *pi) 1976 { 1977 struct phyint_instance *pii; 1978 struct target *tg; 1979 1980 pii = pi->pi_v4; 1981 if (pii != NULL) { 1982 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1983 tg->tg_crtt = 0; 1984 tg->tg_rtt_sa = -1; 1985 tg->tg_rtt_sd = 0; 1986 } 1987 } 1988 1989 pii = pi->pi_v6; 1990 if (pii != NULL) { 1991 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1992 tg->tg_crtt = 0; 1993 tg->tg_rtt_sa = -1; 1994 tg->tg_rtt_sd = 0; 1995 } 1996 } 1997 } 1998 1999 /* 2000 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 2001 * probes on both instances IPv4 and IPv6. 2002 * If the interface has failed, return the time of the first probe failure 2003 * in "tff". 2004 */ 2005 static int 2006 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 2007 { 2008 uint_t pi_tff; 2009 struct target *cur_tg; 2010 struct probe_fail_count pfinfo; 2011 struct phyint_instance *pii_other; 2012 int pr_ndx; 2013 2014 /* 2015 * Get the number of consecutive failed probes on 2016 * this phyint across all targets. Also get the number 2017 * of consecutive failed probes on this target only 2018 */ 2019 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2020 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2021 probe_fail_info(pii, cur_tg, &pfinfo); 2022 2023 /* Get the time of first failure, for later use */ 2024 pi_tff = pfinfo.pf_tff; 2025 2026 /* 2027 * If the current target has not responded to the 2028 * last NUM_PROBE_FAILS probes, and other targets are 2029 * responding delete this target. Dead gateway detection 2030 * will eventually remove this target (if router) from the 2031 * routing tables. If that does not occur, we may end 2032 * up adding this to our list again. 
2033 */ 2034 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2035 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2036 if (pii->pii_targets_are_routers) { 2037 if (cur_tg->tg_status == TG_ACTIVE) 2038 pii->pii_ntargets--; 2039 cur_tg->tg_status = TG_DEAD; 2040 cur_tg->tg_crtt = 0; 2041 cur_tg->tg_rtt_sa = -1; 2042 cur_tg->tg_rtt_sd = 0; 2043 if (pii->pii_target_next == cur_tg) 2044 pii->pii_target_next = target_next(cur_tg); 2045 } else { 2046 target_delete(cur_tg); 2047 probe(pii, PROBE_MULTI, gethrtime()); 2048 } 2049 return (PHYINT_OK); 2050 } 2051 2052 /* 2053 * If the phyint has lost NUM_PROBE_FAILS or more 2054 * consecutive probes, on both IPv4 and IPv6 protocol 2055 * instances of the phyint, then trigger failure 2056 * detection, else return false 2057 */ 2058 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2059 return (PHYINT_OK); 2060 2061 pii_other = phyint_inst_other(pii); 2062 if (PROBE_CAPABLE(pii_other)) { 2063 probe_fail_info(pii_other, NULL, &pfinfo); 2064 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2065 /* 2066 * We have NUM_PROBE_FAILS or more failures 2067 * on both IPv4 and IPv6. Get the earliest 2068 * time when failure was detected on this 2069 * phyint across IPv4 and IPv6. 2070 */ 2071 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2072 pi_tff = pfinfo.pf_tff; 2073 } else { 2074 /* 2075 * This instance has < NUM_PROBE_FAILS failure. 2076 * So return false 2077 */ 2078 return (PHYINT_OK); 2079 } 2080 } 2081 *tff = pi_tff; 2082 return (PHYINT_FAILURE); 2083 } 2084 2085 /* 2086 * Check if the link has gone down on this phyint, or it has failed the 2087 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2088 * Also look at other phyints of this group, for group failures. 
2089 */ 2090 int 2091 failure_state(struct phyint_instance *pii) 2092 { 2093 struct probe_success_count psinfo; 2094 uint_t pi2_tls; /* time last success */ 2095 uint_t pi_tff; /* time first fail */ 2096 struct phyint *pi2; 2097 struct phyint *pi; 2098 struct phyint_instance *pii2; 2099 struct phyint_group *pg; 2100 int retval; 2101 2102 if (debug & D_FAILREP) 2103 logdebug("phyint_failed(%s)\n", pii->pii_name); 2104 2105 pi = pii->pii_phyint; 2106 pg = pi->pi_group; 2107 2108 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2109 PHYINT_OK) 2110 return (PHYINT_OK); 2111 2112 /* 2113 * At this point, the link is down, or the phyint is suspect, as it 2114 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not 2115 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue 2116 * on to determine whether this should be considered a PHYINT_FAILURE 2117 * or GROUP_FAILURE. 2118 */ 2119 if (pg == phyint_anongroup) 2120 return (PHYINT_FAILURE); 2121 2122 /* 2123 * Need to compare against other phyints of the same group 2124 * to exclude group failures. If the failure was detected via 2125 * probing, then if the time of last success (tls) of any 2126 * phyint is more recent than the time of first fail (tff) of the 2127 * phyint in question, and the link is up on the phyint, 2128 * then it is a phyint failure. Otherwise it is a group failure. 2129 * If failure was detected via a link down notification sent from 2130 * the driver to IP, we see if any phyints in the group are still 2131 * running and haven't received a link down notification. We 2132 * will usually be processing the link down notification shortly 2133 * after it was received, so there is no point looking at the tls 2134 * of other phyints. 
2135 */ 2136 retval = GROUP_FAILURE; 2137 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2138 /* Exclude ourself from comparison */ 2139 if (pi2 == pi) 2140 continue; 2141 2142 if (LINK_DOWN(pi)) { 2143 /* 2144 * We use FLAGS_TO_LINK_STATE() to test the flags 2145 * directly, rather then LINK_UP() or LINK_DOWN(), as 2146 * we may not have got round to processing the link 2147 * state for the other phyints in the group yet. 2148 * 2149 * The check for PI_RUNNING and group failure handles 2150 * the case when the group begins to recover. 2151 * PI_RUNNING will be set, and group failure cleared 2152 * only after receipt of NUM_PROBE_REPAIRS, by which 2153 * time the other phyints should have received at 2154 * least 1 packet, and so will not have NUM_PROBE_FAILS. 2155 */ 2156 if ((pi2->pi_state == PI_RUNNING) && 2157 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { 2158 retval = PHYINT_FAILURE; 2159 break; 2160 } 2161 continue; 2162 } 2163 2164 if (LINK_DOWN(pi2)) 2165 continue; 2166 2167 /* 2168 * If there's no probe-based failure detection on this 2169 * interface, and its link is still up, then it's still 2170 * working and thus the group has not failed. 2171 */ 2172 if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { 2173 retval = PHYINT_FAILURE; 2174 break; 2175 } 2176 2177 /* 2178 * Need to compare against both IPv4 and IPv6 instances. 2179 */ 2180 pii2 = pi2->pi_v4; 2181 if (pii2 != NULL) { 2182 probe_success_info(pii2, NULL, &psinfo); 2183 if (psinfo.ps_tls_valid) { 2184 pi2_tls = psinfo.ps_tls; 2185 /* 2186 * See comment above regarding check 2187 * for PI_RUNNING and group failure. 
2188 */ 2189 if (TIME_GT(pi2_tls, pi_tff) && 2190 (pi2->pi_state == PI_RUNNING) && 2191 !GROUP_FAILED(pg) && 2192 FLAGS_TO_LINK_STATE(pi2)) { 2193 retval = PHYINT_FAILURE; 2194 break; 2195 } 2196 } 2197 } 2198 2199 pii2 = pi2->pi_v6; 2200 if (pii2 != NULL) { 2201 probe_success_info(pii2, NULL, &psinfo); 2202 if (psinfo.ps_tls_valid) { 2203 pi2_tls = psinfo.ps_tls; 2204 /* 2205 * See comment above regarding check 2206 * for PI_RUNNING and group failure. 2207 */ 2208 if (TIME_GT(pi2_tls, pi_tff) && 2209 (pi2->pi_state == PI_RUNNING) && 2210 !GROUP_FAILED(pg) && 2211 FLAGS_TO_LINK_STATE(pi2)) { 2212 retval = PHYINT_FAILURE; 2213 break; 2214 } 2215 } 2216 } 2217 } 2218 2219 /* 2220 * Update the group state to account for the changes. 2221 */ 2222 phyint_group_refresh_state(pg); 2223 return (retval); 2224 } 2225 2226 /* 2227 * Return the information associated with consecutive probe successes 2228 * starting with the most recent probe. At most the last 2 probes can be 2229 * in the unacknowledged state. All previous probes have either failed 2230 * or succeeded. 2231 */ 2232 static void 2233 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2234 struct probe_success_count *psinfo) 2235 { 2236 uint_t i; 2237 struct probe_stats *pr_statp; 2238 uint_t most_recent; 2239 uint_t second_most_recent; 2240 boolean_t pi_found_failure = _B_FALSE; 2241 boolean_t tg_found_failure = _B_FALSE; 2242 uint_t now; 2243 uint_t timeout; 2244 struct target *tg; 2245 2246 if (debug & D_FAILREP) 2247 logdebug("probe_success_info(%s)\n", pii->pii_name); 2248 2249 bzero(psinfo, sizeof (*psinfo)); 2250 now = getcurrenttime(); 2251 2252 /* 2253 * Start with the most recent probe, and count the number 2254 * of consecutive probe successes. Latch the number of successes 2255 * on hitting a failure. 
2256 */ 2257 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2258 second_most_recent = PROBE_INDEX_PREV(most_recent); 2259 2260 for (i = most_recent; i != pii->pii_probe_next; 2261 i = PROBE_INDEX_PREV(i)) { 2262 pr_statp = &pii->pii_probes[i]; 2263 2264 switch (pr_statp->pr_status) { 2265 case PR_UNACKED: 2266 /* 2267 * Only the most recent 2 probes can be unacknowledged 2268 */ 2269 assert(i == most_recent || i == second_most_recent); 2270 2271 tg = pr_statp->pr_target; 2272 assert(tg != NULL); 2273 /* 2274 * The crtt could be zero for some reason, 2275 * Eg. the phyint could be failed. If the crtt is 2276 * not available use the value of the group's probe 2277 * interval which is a worst case estimate. 2278 */ 2279 timeout = ns2ms(pr_statp->pr_hrtime_start); 2280 if (tg->tg_crtt != 0) { 2281 timeout += tg->tg_crtt; 2282 } else { 2283 timeout += 2284 pii->pii_phyint->pi_group->pg_probeint; 2285 } 2286 2287 if (TIME_LT(timeout, now)) { 2288 /* 2289 * We hit a failure. Latch the total number of 2290 * recent consecutive successes. 2291 */ 2292 pr_statp->pr_time_lost = timeout; 2293 probe_chstate(pr_statp, pii, PR_LOST); 2294 pi_found_failure = _B_TRUE; 2295 if (cur_tg != NULL && tg == cur_tg) { 2296 /* 2297 * We hit a failure for the desired 2298 * target. Latch the number of recent 2299 * consecutive successes for this target 2300 */ 2301 tg_found_failure = _B_TRUE; 2302 } 2303 } 2304 break; 2305 2306 case PR_ACKED: 2307 /* 2308 * Bump up the count of probe successes, if we 2309 * have not seen any failure so far. 2310 */ 2311 if (!pi_found_failure) 2312 psinfo->ps_nsucc++; 2313 2314 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2315 !tg_found_failure) { 2316 psinfo->ps_nsucc_tg++; 2317 } 2318 2319 /* 2320 * Record the time of last success, if this is 2321 * the most recent probe success. 
2322 */ 2323 if (!psinfo->ps_tls_valid) { 2324 psinfo->ps_tls = 2325 ns2ms(pr_statp->pr_hrtime_ackproc); 2326 psinfo->ps_tls_valid = _B_TRUE; 2327 } 2328 break; 2329 2330 case PR_LOST: 2331 /* 2332 * We hit a failure. Latch the total number of 2333 * recent consecutive successes. 2334 */ 2335 pi_found_failure = _B_TRUE; 2336 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2337 /* 2338 * We hit a failure for the desired target. 2339 * Latch the number of recent consecutive 2340 * successes for this target 2341 */ 2342 tg_found_failure = _B_TRUE; 2343 } 2344 break; 2345 2346 default: 2347 return; 2348 2349 } 2350 } 2351 } 2352 2353 /* 2354 * Return the information associated with consecutive probe failures 2355 * starting with the most recent probe. Only the last 2 probes can be in the 2356 * unacknowledged state. All previous probes have either failed or succeeded. 2357 */ 2358 static void 2359 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2360 struct probe_fail_count *pfinfo) 2361 { 2362 int i; 2363 struct probe_stats *pr_statp; 2364 boolean_t tg_found_success = _B_FALSE; 2365 boolean_t pi_found_success = _B_FALSE; 2366 int most_recent; 2367 int second_most_recent; 2368 uint_t now; 2369 uint_t timeout; 2370 struct target *tg; 2371 2372 if (debug & D_FAILREP) 2373 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2374 2375 bzero(pfinfo, sizeof (*pfinfo)); 2376 now = getcurrenttime(); 2377 2378 /* 2379 * Start with the most recent probe, and count the number 2380 * of consecutive probe failures. Latch the number of failures 2381 * on hitting a probe success. 
2382 */ 2383 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2384 second_most_recent = PROBE_INDEX_PREV(most_recent); 2385 2386 for (i = most_recent; i != pii->pii_probe_next; 2387 i = PROBE_INDEX_PREV(i)) { 2388 pr_statp = &pii->pii_probes[i]; 2389 2390 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2391 2392 switch (pr_statp->pr_status) { 2393 case PR_UNACKED: 2394 /* 2395 * Only the most recent 2 probes can be unacknowledged 2396 */ 2397 assert(i == most_recent || i == second_most_recent); 2398 2399 tg = pr_statp->pr_target; 2400 /* 2401 * Target is guaranteed to exist in the unack. state 2402 */ 2403 assert(tg != NULL); 2404 /* 2405 * The crtt could be zero for some reason, 2406 * Eg. the phyint could be failed. If the crtt is 2407 * not available use the group's probe interval, 2408 * which is a worst case estimate. 2409 */ 2410 timeout = ns2ms(pr_statp->pr_hrtime_start); 2411 if (tg->tg_crtt != 0) { 2412 timeout += tg->tg_crtt; 2413 } else { 2414 timeout += 2415 pii->pii_phyint->pi_group->pg_probeint; 2416 } 2417 2418 if (TIME_GT(timeout, now)) 2419 break; 2420 2421 pr_statp->pr_time_lost = timeout; 2422 probe_chstate(pr_statp, pii, PR_LOST); 2423 /* FALLTHRU */ 2424 2425 case PR_LOST: 2426 if (!pi_found_success) { 2427 pfinfo->pf_nfail++; 2428 pfinfo->pf_tff = pr_statp->pr_time_lost; 2429 } 2430 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2431 !tg_found_success) { 2432 pfinfo->pf_nfail_tg++; 2433 } 2434 break; 2435 2436 default: 2437 /* 2438 * We hit a success or unused slot. Latch the 2439 * total number of recent consecutive failures. 2440 */ 2441 pi_found_success = _B_TRUE; 2442 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2443 /* 2444 * We hit a success for the desired target. 2445 * Latch the number of recent consecutive 2446 * failures for this target 2447 */ 2448 tg_found_success = _B_TRUE; 2449 } 2450 } 2451 } 2452 } 2453 2454 /* 2455 * Change the state of probe `pr' on phyint_instance `pii' to state `state'. 
2456 */ 2457 void 2458 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) 2459 { 2460 if (pr->pr_status == state) 2461 return; 2462 2463 pr->pr_status = state; 2464 (void) probe_state_event(pr, pii); 2465 } 2466 2467 /* 2468 * Check if the phyint has been repaired. If no test address has been 2469 * configured, then consider the interface repaired if the link is up (unless 2470 * the link is flapping; see below). Otherwise, look for proof of probes 2471 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2472 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 2473 */ 2474 static boolean_t 2475 phyint_repaired(struct phyint *pi) 2476 { 2477 struct probe_success_count psinfo; 2478 struct phyint_instance *pii; 2479 struct target *cur_tg; 2480 int pr_ndx; 2481 uint_t cur_time; 2482 2483 if (debug & D_FAILREP) 2484 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2485 2486 if (LINK_DOWN(pi)) 2487 return (_B_FALSE); 2488 2489 /* 2490 * If we don't have any test addresses and the link is up, then 2491 * consider the interface repaired, unless we've received more than 2492 * LINK_UP_PERMIN link up notifications in the last minute, in 2493 * which case we keep the link down until we drop back below 2494 * the threshold. 
2495 */ 2496 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2497 cur_time = getcurrenttime(); 2498 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2499 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2500 pi->pi_lfmsg_printed = 0; 2501 return (_B_TRUE); 2502 } 2503 if (!pi->pi_lfmsg_printed) { 2504 logerr("The link has come up on %s more than %d times " 2505 "in the last minute; disabling repair until it " 2506 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2507 pi->pi_lfmsg_printed = 1; 2508 } 2509 2510 return (_B_FALSE); 2511 } 2512 2513 pii = pi->pi_v4; 2514 if (PROBE_CAPABLE(pii)) { 2515 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2516 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2517 probe_success_info(pii, cur_tg, &psinfo); 2518 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2519 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2520 return (_B_TRUE); 2521 } 2522 2523 pii = pi->pi_v6; 2524 if (PROBE_CAPABLE(pii)) { 2525 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2526 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2527 probe_success_info(pii, cur_tg, &psinfo); 2528 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2529 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2530 return (_B_TRUE); 2531 } 2532 2533 return (_B_FALSE); 2534 } 2535 2536 /* 2537 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2538 */ 2539 boolean_t 2540 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) 2541 { 2542 int ifsock; 2543 struct lifreq lifr; 2544 uint64_t old_flags; 2545 2546 if (debug & D_FAILREP) { 2547 logdebug("change_pif_flags(%s): set %llx clear %llx\n", 2548 pi->pi_name, set, clear); 2549 } 2550 2551 if (pi->pi_v4 != NULL) 2552 ifsock = ifsock_v4; 2553 else 2554 ifsock = ifsock_v6; 2555 2556 /* 2557 * Get the current flags from the kernel, and set/clear the 2558 * desired phyint flags. Since we set only phyint flags, we can 2559 * do it on either IPv4 or IPv6 instance. 
2560 */ 2561 (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2562 2563 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2564 if (errno != ENXIO) 2565 logperror("change_pif_flags: ioctl (get flags)"); 2566 return (_B_FALSE); 2567 } 2568 2569 old_flags = lifr.lifr_flags; 2570 lifr.lifr_flags |= set; 2571 lifr.lifr_flags &= ~clear; 2572 2573 if (old_flags == lifr.lifr_flags) { 2574 /* No change in the flags. No need to send ioctl */ 2575 return (_B_TRUE); 2576 } 2577 2578 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2579 if (errno != ENXIO) 2580 logperror("change_pif_flags: ioctl (set flags)"); 2581 return (_B_FALSE); 2582 } 2583 2584 /* 2585 * Keep pi_flags in synch. with actual flags. Assumes flags are 2586 * phyint flags. 2587 */ 2588 pi->pi_flags |= set; 2589 pi->pi_flags &= ~clear; 2590 2591 if (pi->pi_v4 != NULL) 2592 pi->pi_v4->pii_flags = pi->pi_flags; 2593 2594 if (pi->pi_v6 != NULL) 2595 pi->pi_v6->pii_flags = pi->pi_flags; 2596 2597 return (_B_TRUE); 2598 } 2599 2600 /* 2601 * icmp cksum computation for IPv4. 2602 */ 2603 static int 2604 in_cksum(ushort_t *addr, int len) 2605 { 2606 register int nleft = len; 2607 register ushort_t *w = addr; 2608 register ushort_t answer; 2609 ushort_t odd_byte = 0; 2610 register int sum = 0; 2611 2612 /* 2613 * Our algorithm is simple, using a 32 bit accumulator (sum), 2614 * we add sequential 16 bit words to it, and at the end, fold 2615 * back all the carry bits from the top 16 bits into the lower 2616 * 16 bits. 
2617 */ 2618 while (nleft > 1) { 2619 sum += *w++; 2620 nleft -= 2; 2621 } 2622 2623 /* mop up an odd byte, if necessary */ 2624 if (nleft == 1) { 2625 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2626 sum += odd_byte; 2627 } 2628 2629 /* 2630 * add back carry outs from top 16 bits to low 16 bits 2631 */ 2632 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2633 sum += (sum >> 16); /* add carry */ 2634 answer = ~sum; /* truncate to 16 bits */ 2635 return (answer); 2636 } 2637 2638 static void 2639 reset_snxt_basetimes(void) 2640 { 2641 struct phyint_instance *pii; 2642 2643 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2644 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2645 } 2646 } 2647 2648 /* 2649 * Is the address one of our own addresses? Unfortunately, 2650 * we cannot check our phyint tables to determine if the address 2651 * is our own. This is because, we don't track interfaces that 2652 * are not part of any group. We have to either use a 'bind' or 2653 * get the complete list of all interfaces using SIOCGLIFCONF, 2654 * to do this check. We could also use SIOCTMYADDR. 2655 * Bind fails for the local zone address, so we might include local zone 2656 * address as target address. If local zone address is a target address 2657 * and it is up, it is not possible to detect the interface failure. 2658 * SIOCTMYADDR also doesn't consider local zone address as own address. 2659 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2660 * are stored in `localaddrs' 2661 */ 2662 boolean_t 2663 own_address(struct in6_addr addr) 2664 { 2665 addrlist_t *addrp; 2666 struct sockaddr_storage ss; 2667 int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; 2668 2669 addr2storage(af, &addr, &ss); 2670 for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { 2671 if (sockaddrcmp(&ss, &addrp->al_addr)) 2672 return (_B_TRUE); 2673 } 2674 return (_B_FALSE); 2675 } 2676 2677 static int 2678 ns2ms(int64_t ns) 2679 { 2680 return (ns / (NANOSEC / MILLISEC)); 2681 } 2682 2683 static int64_t 2684 tv2ns(struct timeval *tvp) 2685 { 2686 return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); 2687 } 2688