1 /* 2 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 /* 7 * Copyright (c) 1987 Regents of the University of California. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms are permitted 11 * provided that the above copyright notice and this paragraph are 12 * duplicated in all such forms and that any documentation, 13 * advertising materials, and other materials related to such 14 * distribution and use acknowledge that the software was developed 15 * by the University of California, Berkeley. The name of the 16 * University may not be used to endorse or promote products derived 17 * from this software without specific prior written permission. 18 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 20 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 21 */ 22 23 #include "mpd_defs.h" 24 #include "mpd_tables.h" 25 26 /* 27 * Probe types for probe() 28 */ 29 #define PROBE_UNI 0x1234 /* Unicast probe packet */ 30 #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 31 #define PROBE_RTT 0x9abc /* RTT only probe packet */ 32 33 #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 34 35 /* 36 * Format of probe / probe response packets. This is an ICMP Echo request 37 * or ICMP Echo reply. 
Packet format is same for both IPv4 and IPv6 38 */ 39 struct pr_icmp 40 { 41 uint8_t pr_icmp_type; /* type field */ 42 uint8_t pr_icmp_code; /* code field */ 43 uint16_t pr_icmp_cksum; /* checksum field */ 44 uint16_t pr_icmp_id; /* Identification */ 45 uint16_t pr_icmp_seq; /* sequence number */ 46 uint64_t pr_icmp_timestamp; /* Time stamp (in ns) */ 47 uint32_t pr_icmp_mtype; /* Message type */ 48 }; 49 50 static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 51 0x0, 0x0, 0x0, 0x0, 52 0x0, 0x0, 0x0, 0x0, 53 0x0, 0x0, 0x0, 0x1 } }; 54 55 static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; 56 57 static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ 58 59 static void *find_ancillary(struct msghdr *msg, int cmsg_level, 60 int cmsg_type); 61 static void pi_set_crtt(struct target *tg, int64_t m, 62 boolean_t is_probe_uni); 63 static void incoming_echo_reply(struct phyint_instance *pii, 64 struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp); 65 static void incoming_rtt_reply(struct phyint_instance *pii, 66 struct pr_icmp *reply, struct in6_addr fromaddr); 67 static void incoming_mcast_reply(struct phyint_instance *pii, 68 struct pr_icmp *reply, struct in6_addr fromaddr); 69 70 static boolean_t check_pg_crtt_improved(struct phyint_group *pg); 71 static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); 72 static boolean_t check_exception_target(struct phyint_instance *pii, 73 struct target *target); 74 static void probe_fail_info(struct phyint_instance *pii, 75 struct target *cur_tg, struct probe_fail_count *pfinfo); 76 static void probe_success_info(struct phyint_instance *pii, 77 struct target *cur_tg, struct probe_success_count *psinfo); 78 static boolean_t phyint_repaired(struct phyint *pi); 79 80 static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); 81 static int in_cksum(ushort_t *addr, int len); 82 static void reset_snxt_basetimes(void); 83 static int 
ns2ms(int64_t ns); 84 static int64_t tv2ns(struct timeval *); 85 86 /* 87 * CRTT - Conservative Round Trip Time Estimate 88 * Probe success - A matching probe reply received before CRTT ms has elapsed 89 * after sending the probe. 90 * Probe failure - No probe reply received and more than CRTT ms has elapsed 91 * after sending the probe. 92 * 93 * TLS - Time last success. Most recent probe ack received at this time. 94 * TFF - Time first fail. The time of the earliest probe failure in 95 * a consecutive series of probe failures. 96 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required 97 * before declaring phyint repair. 98 * NUM_PROBE_FAILS - Number of consecutive probe failures required to 99 * declare a phyint failure. 100 * 101 * Phyint state diagram 102 * 103 * The state of a phyint that is capable of being probed, is completely 104 * specified by the 3-tuple <pi_state, pg_state, I>. 105 * 106 * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether 107 * IFF_OFFLINE is set. If the phyint is also configured with a test address 108 * (the common case) and probe targets, then a phyint must also successfully 109 * be able to send and receive probes in order to remain in the PI_RUNNING 110 * state (otherwise, it transitions to PI_FAILED). 111 * 112 * Further, if a PI_RUNNING phyint is configured with a test address but is 113 * unable to find any probe targets, it will transition to the PI_NOTARGETS 114 * state, which indicates that the link is apparently functional but that 115 * in.mpathd is unable to send probes to verify functionality (in this case, 116 * in.mpathd makes the optimistic assumption that the interface is working 117 * correctly and thus does not mark the interface FAILED, but reports it as 118 * IPMP_IF_UNKNOWN through the async events and query interfaces). 119 * 120 * At any point, a phyint may be administratively marked offline via if_mpadm. 
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state.  When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state -  PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
 *	PG_OK: All interfaces in the group are OK.
 *	PG_DEGRADED: Some interfaces in the group are unusable.
 *	PG_FAILED: All interfaces in the group are unusable.
 *
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
 *
 * I -	value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: This phyint will not send or receive packets.
 *	Usually, inactive is tied to standby interfaces that are not yet
 *	needed (e.g., no non-standby interfaces in the group have failed).
 *	When failback has been disabled (FAILBACK=no configured), phyint can
 *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
 *	subsequently recovers after a failure.
 *
 * Not all 9 possible combinations of the above 3-tuple are possible.
 *
 * I is tracked by IP. pi_state is tracked by mpathd.
 *
 *			pi_state state machine
 * ---------------------------------------------------------------------------
 * Event                  State                   New State
 *                        Action:
 * ---------------------------------------------------------------------------
 * IP interface failure   (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
 * detection              : set IFF_FAILED on this phyint
 *
 * IP interface failure   (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
 * detection              : set IFF_FAILED on this phyint
 *
 * IP interface repair    (PI_FAILED, I == 0, FAILBACK=yes)
 * detection                                   -> (PI_RUNNING, I == 0)
 *                        : clear IFF_FAILED on this phyint
 *
 * IP interface repair    (PI_FAILED, I == 0, FAILBACK=no)
 * detection                                   -> (PI_RUNNING, I == 1)
 *                        : clear IFF_FAILED on this phyint
 *                        : if failback is disabled set I == 1
 *
 * Group failure          (perform on all phyints in the group)
 * detection              PI_RUNNING              PI_FAILED
 * (Router targets)       : set IFF_FAILED
 *
 * Group failure          (perform on all phyints in the group)
 * detection              PI_RUNNING              PI_NOTARGETS
 * (Host targets)         : set IFF_FAILED
 *                        : delete the target list on all phyints
 * ---------------------------------------------------------------------------
 */

struct probes_missed probes_missed;

/*
 * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
 * will be added on by the kernel.  The id field identifies this phyint,
 * and the sequence number is an increasing (modulo 2^16) integer. The data
 * portion holds the time value when the packet is sent. On echo this is
 * extracted to compute the round-trip time. Three different types of
 * probe packets are used.
 *
 * PROBE_UNI: This type is used to do failure detection / failure recovery
 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *	not less than the current CRTT. pii_probes[] stores data
 *	about these probes. These packets consume sequence number space.
199 * 200 * PROBE_RTT: This type is used to make only rtt measurements. Normally these 201 * are not used. Under heavy network load, the rtt may go up very high, 202 * due to a spike, or may appear to go high, due to extreme scheduling 203 * delays. Once the network stress is removed, mpathd takes long time to 204 * recover, because the probe_interval is already high, and it takes 205 * a long time to send out sufficient number of probes to bring down the 206 * rtt. To avoid this problem, PROBE_RTT probes are sent out every 207 * user_probe_interval ms. and will cause only rtt updates. These packets 208 * do not consume sequence number space nor is information about these 209 * packets stored in the pii_probes[] 210 * 211 * PROBE_MULTI: This type is only used to construct a list of targets, when 212 * no targets are known. The packet is multicast to the all hosts addr. 213 */ 214 static void 215 probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime) 216 { 217 hrtime_t sent_hrtime; 218 struct timeval sent_tv; 219 struct pr_icmp probe_pkt; /* Probe packet */ 220 struct sockaddr_storage targ; /* target address */ 221 uint_t targaddrlen; /* targed address length */ 222 int pr_ndx; /* probe index in pii->pii_probes[] */ 223 boolean_t sent = _B_FALSE; 224 int rval; 225 226 if (debug & D_TARGET) { 227 logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af), 228 pii->pii_name, probe_type, start_hrtime); 229 } 230 231 assert(pii->pii_probe_sock != -1); 232 assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 233 probe_type == PROBE_RTT); 234 235 probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 236 ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 237 probe_pkt.pr_icmp_code = 0; 238 probe_pkt.pr_icmp_cksum = 0; 239 probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 240 241 /* 242 * Since there is no need to do arithmetic on the icmpid, 243 * (only equality check is done) pii_icmpid is stored in 244 * network byte order at initialization itself. 
245 */ 246 probe_pkt.pr_icmp_id = pii->pii_icmpid; 247 probe_pkt.pr_icmp_timestamp = htonll(start_hrtime); 248 probe_pkt.pr_icmp_mtype = htonl(probe_type); 249 250 /* 251 * If probe_type is PROBE_MULTI, this packet will be multicast to 252 * the all hosts address. Otherwise it is unicast to the next target. 253 */ 254 assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 255 pii->pii_rtt_target_next != NULL)); 256 257 bzero(&targ, sizeof (targ)); 258 targ.ss_family = pii->pii_af; 259 260 if (pii->pii_af == AF_INET6) { 261 struct in6_addr *addr6; 262 263 addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr; 264 targaddrlen = sizeof (struct sockaddr_in6); 265 if (probe_type == PROBE_MULTI) { 266 *addr6 = all_nodes_mcast_v6; 267 } else if (probe_type == PROBE_UNI) { 268 *addr6 = pii->pii_target_next->tg_address; 269 } else { /* type is PROBE_RTT */ 270 *addr6 = pii->pii_rtt_target_next->tg_address; 271 } 272 } else { 273 struct in_addr *addr4; 274 275 addr4 = &((struct sockaddr_in *)&targ)->sin_addr; 276 targaddrlen = sizeof (struct sockaddr_in); 277 if (probe_type == PROBE_MULTI) { 278 *addr4 = all_nodes_mcast_v4; 279 } else if (probe_type == PROBE_UNI) { 280 IN6_V4MAPPED_TO_INADDR( 281 &pii->pii_target_next->tg_address, addr4); 282 } else { /* type is PROBE_RTT */ 283 IN6_V4MAPPED_TO_INADDR( 284 &pii->pii_rtt_target_next->tg_address, addr4); 285 } 286 287 /* 288 * Compute the IPv4 icmp checksum. Does not cover the IP header. 289 */ 290 probe_pkt.pr_icmp_cksum = 291 in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 292 } 293 294 /* 295 * Use the current time as the time we sent. Not atomic, but the best 296 * we can do from here. 297 */ 298 sent_hrtime = gethrtime(); 299 (void) gettimeofday(&sent_tv, NULL); 300 rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0, 301 (struct sockaddr *)&targ, targaddrlen); 302 /* 303 * If the send would block, this may either be transient or a hang in a 304 * lower layer. 
We pretend the probe was actually sent, the daemon will 305 * not see a reply to the probe and will fail the interface if normal 306 * failure detection criteria are met. 307 */ 308 if (rval == sizeof (probe_pkt) || 309 (rval == -1 && errno == EWOULDBLOCK)) { 310 sent = _B_TRUE; 311 } else { 312 logperror_pii(pii, "probe: probe sendto"); 313 } 314 315 /* 316 * If this is a PROBE_UNI probe packet being unicast to a target, then 317 * update our tables. We will need this info in processing the probe 318 * response. PROBE_MULTI and PROBE_RTT packets are not used for 319 * the purpose of failure or recovery detection. PROBE_MULTI packets 320 * are only used to construct a list of targets. PROBE_RTT packets are 321 * used only for updating the rtt and not for failure detection. 322 */ 323 if (probe_type == PROBE_UNI && sent) { 324 pr_ndx = pii->pii_probe_next; 325 assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 326 327 /* Collect statistics, before we reuse the last slot. */ 328 if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 329 pii->pii_cum_stats.lost++; 330 else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 331 pii->pii_cum_stats.acked++; 332 pii->pii_cum_stats.sent++; 333 334 pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt; 335 pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv; 336 pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime; 337 pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime; 338 pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 339 probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED); 340 341 pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 342 pii->pii_target_next = target_next(pii->pii_target_next); 343 assert(pii->pii_target_next != NULL); 344 /* 345 * If we have a single variable to denote the next target to 346 * probe for both rtt probes and failure detection probes, we 347 * could end up with a situation where the failure detection 348 * probe targets become disjoint from the rtt probe targets. 349 * Eg. 
if 2 targets and the actual fdt is double the user 350 * specified fdt. So we have 2 variables. In this scheme 351 * we also reset pii_rtt_target_next for every fdt probe, 352 * though that may not be necessary. 353 */ 354 pii->pii_rtt_target_next = pii->pii_target_next; 355 pii->pii_snxt++; 356 } else if (probe_type == PROBE_RTT) { 357 pii->pii_rtt_target_next = 358 target_next(pii->pii_rtt_target_next); 359 assert(pii->pii_rtt_target_next != NULL); 360 } 361 } 362 363 /* 364 * Incoming IPv4 data from wire, is received here. Called from main. 365 */ 366 void 367 in_data(struct phyint_instance *pii) 368 { 369 struct sockaddr_in from; 370 struct in6_addr fromaddr; 371 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 372 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 373 struct ip *ip; 374 int iphlen; 375 int len; 376 char abuf[INET_ADDRSTRLEN]; 377 struct msghdr msg; 378 struct iovec iov; 379 struct pr_icmp *reply; 380 struct timeval *recv_tvp; 381 382 if (debug & D_PROBE) { 383 logdebug("in_data(%s %s)\n", 384 AF_STR(pii->pii_af), pii->pii_name); 385 } 386 387 iov.iov_base = (char *)in_packet; 388 iov.iov_len = sizeof (in_packet); 389 msg.msg_iov = &iov; 390 msg.msg_iovlen = 1; 391 msg.msg_name = (struct sockaddr *)&from; 392 msg.msg_namelen = sizeof (from); 393 msg.msg_control = ancillary_data; 394 msg.msg_controllen = sizeof (ancillary_data); 395 396 /* 397 * Poll has already told us that a message is waiting, 398 * on this socket. Read it now. We should not block. 399 */ 400 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 401 logperror_pii(pii, "in_data: recvmsg"); 402 return; 403 } 404 405 /* 406 * If the datalink has indicated the link is down, don't go 407 * any further. 
408 */ 409 if (LINK_DOWN(pii->pii_phyint)) 410 return; 411 412 /* Get the printable address for error reporting */ 413 (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 414 415 /* Ignore packets > 64k or control buffers that don't fit */ 416 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 417 if (debug & D_PKTBAD) { 418 logdebug("Truncated message: msg_flags 0x%x from %s\n", 419 msg.msg_flags, abuf); 420 } 421 return; 422 } 423 424 /* Make sure packet contains at least minimum ICMP header */ 425 ip = (struct ip *)in_packet; 426 iphlen = ip->ip_hl << 2; 427 if (len < iphlen + ICMP_MINLEN) { 428 if (debug & D_PKTBAD) { 429 logdebug("in_data: packet too short (%d bytes)" 430 " from %s\n", len, abuf); 431 } 432 return; 433 } 434 435 /* 436 * Subtract the IP hdr length, 'len' will be length of the probe 437 * reply, starting from the icmp hdr. 438 */ 439 len -= iphlen; 440 /* LINTED */ 441 reply = (struct pr_icmp *)((char *)in_packet + iphlen); 442 443 /* Probe replies are icmp echo replies. Ignore anything else */ 444 if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 445 return; 446 447 /* 448 * The icmp id should match what we sent, which is stored 449 * in pi_icmpid. The icmp code for reply must be 0. 
450 * The reply content must be a struct pr_icmp 451 */ 452 if (reply->pr_icmp_id != pii->pii_icmpid) { 453 /* Not in response to our probe */ 454 return; 455 } 456 457 if (reply->pr_icmp_code != 0) { 458 logtrace("probe reply code %d from %s on %s\n", 459 reply->pr_icmp_code, abuf, pii->pii_name); 460 return; 461 } 462 463 if (len < sizeof (struct pr_icmp)) { 464 logtrace("probe reply too short: %d bytes from %s on %s\n", 465 len, abuf, pii->pii_name); 466 return; 467 } 468 469 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 470 if (recv_tvp == NULL) { 471 logtrace("message without timestamp from %s on %s\n", 472 abuf, pii->pii_name); 473 return; 474 } 475 476 IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 477 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 478 /* Unicast probe reply */ 479 incoming_echo_reply(pii, reply, fromaddr, recv_tvp); 480 else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 481 /* Multicast reply */ 482 incoming_mcast_reply(pii, reply, fromaddr); 483 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 484 incoming_rtt_reply(pii, reply, fromaddr); 485 } else { 486 /* Probably not in response to our probe */ 487 logtrace("probe reply type: %d from %s on %s\n", 488 reply->pr_icmp_mtype, abuf, pii->pii_name); 489 return; 490 } 491 } 492 493 /* 494 * Incoming IPv6 data from wire is received here. Called from main. 
495 */ 496 void 497 in6_data(struct phyint_instance *pii) 498 { 499 struct sockaddr_in6 from; 500 static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 501 static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 502 int len; 503 char abuf[INET6_ADDRSTRLEN]; 504 struct msghdr msg; 505 struct iovec iov; 506 void *opt; 507 struct pr_icmp *reply; 508 struct timeval *recv_tvp; 509 510 if (debug & D_PROBE) { 511 logdebug("in6_data(%s %s)\n", 512 AF_STR(pii->pii_af), pii->pii_name); 513 } 514 515 iov.iov_base = (char *)in_packet; 516 iov.iov_len = sizeof (in_packet); 517 msg.msg_iov = &iov; 518 msg.msg_iovlen = 1; 519 msg.msg_name = (struct sockaddr *)&from; 520 msg.msg_namelen = sizeof (from); 521 msg.msg_control = ancillary_data; 522 msg.msg_controllen = sizeof (ancillary_data); 523 524 if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 525 logperror_pii(pii, "in6_data: recvmsg"); 526 return; 527 } 528 529 /* 530 * If the datalink has indicated that the link is down, don't go 531 * any further. 532 */ 533 if (LINK_DOWN(pii->pii_phyint)) 534 return; 535 536 /* Get the printable address for error reporting */ 537 (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 538 if (len < ICMP_MINLEN) { 539 if (debug & D_PKTBAD) { 540 logdebug("Truncated message: msg_flags 0x%x from %s\n", 541 msg.msg_flags, abuf); 542 } 543 return; 544 } 545 /* Ignore packets > 64k or control buffers that don't fit */ 546 if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 547 if (debug & D_PKTBAD) { 548 logdebug("Truncated message: msg_flags 0x%x from %s\n", 549 msg.msg_flags, abuf); 550 } 551 return; 552 } 553 554 reply = (struct pr_icmp *)in_packet; 555 if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 556 return; 557 558 if (reply->pr_icmp_id != pii->pii_icmpid) { 559 /* Not in response to our probe */ 560 return; 561 } 562 563 /* 564 * The kernel has already verified the the ICMP checksum. 
565 */ 566 if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 567 logtrace("ICMPv6 echo reply source address not linklocal from " 568 "%s on %s\n", abuf, pii->pii_name); 569 return; 570 } 571 opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR); 572 if (opt != NULL) { 573 /* Can't allow routing headers in probe replies */ 574 logtrace("message with routing header from %s on %s\n", 575 abuf, pii->pii_name); 576 return; 577 } 578 579 if (reply->pr_icmp_code != 0) { 580 logtrace("probe reply code: %d from %s on %s\n", 581 reply->pr_icmp_code, abuf, pii->pii_name); 582 return; 583 } 584 if (len < (sizeof (struct pr_icmp))) { 585 logtrace("probe reply too short: %d bytes from %s on %s\n", 586 len, abuf, pii->pii_name); 587 return; 588 } 589 590 recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP); 591 if (recv_tvp == NULL) { 592 logtrace("message without timestamp from %s on %s\n", 593 abuf, pii->pii_name); 594 return; 595 } 596 597 if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 598 incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp); 599 } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 600 incoming_mcast_reply(pii, reply, from.sin6_addr); 601 } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 602 incoming_rtt_reply(pii, reply, from.sin6_addr); 603 } else { 604 /* Probably not in response to our probe */ 605 logtrace("probe reply type: %d from %s on %s\n", 606 reply->pr_icmp_mtype, abuf, pii->pii_name); 607 } 608 } 609 610 /* 611 * Process the incoming rtt reply, in response to our rtt probe. 612 * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 613 * have any stored information about the probe we sent. So we don't log 614 * any errors if we receive bad replies. 
615 */ 616 static void 617 incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 618 struct in6_addr fromaddr) 619 { 620 int64_t m; /* rtt measurement in ns */ 621 char abuf[INET6_ADDRSTRLEN]; 622 struct target *target; 623 struct phyint_group *pg; 624 625 /* Get the printable address for error reporting */ 626 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 627 628 if (debug & D_PROBE) { 629 logdebug("incoming_rtt_reply: %s %s %s\n", 630 AF_STR(pii->pii_af), pii->pii_name, abuf); 631 } 632 633 /* Do we know this target ? */ 634 target = target_lookup(pii, fromaddr); 635 if (target == NULL) 636 return; 637 638 m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp)); 639 /* Invalid rtt. It has wrapped around */ 640 if (m < 0) 641 return; 642 643 /* 644 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 645 * The initial few responses after the interface is repaired may 646 * contain high rtt's because they could have been queued up waiting 647 * for ARP/NDP resolution on a failed interface. 648 */ 649 pg = pii->pii_phyint->pi_group; 650 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 651 return; 652 653 /* 654 * Update rtt only if the new rtt is lower than the current rtt. 655 * (specified by the 3rd parameter to pi_set_crtt). 656 * If a spike has caused the current probe_interval to be > 657 * user_probe_interval, then this mechanism is used to bring down 658 * the rtt rapidly once the network stress is removed. 659 * If the new rtt is higher than the current rtt, we don't want to 660 * update the rtt. We are having more than 1 outstanding probe and 661 * the increase in rtt we are seeing is being unnecessarily weighted 662 * many times. The regular rtt update will be handled by 663 * incoming_echo_reply() and will take care of any rtt increase. 
664 */ 665 pi_set_crtt(target, m, _B_FALSE); 666 if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 667 (user_failure_detection_time < pg->pg_fdt) && 668 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 669 /* 670 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 671 * investigate if we can improve the failure detection time to 672 * meet whatever the user specified. 673 */ 674 if (check_pg_crtt_improved(pg)) { 675 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 676 user_failure_detection_time); 677 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 678 if (pii->pii_phyint->pi_group != phyint_anongroup) { 679 logerr("Improved failure detection time %d ms " 680 "on (%s %s) for group \"%s\"\n", 681 pg->pg_fdt, AF_STR(pii->pii_af), 682 pii->pii_name, 683 pii->pii_phyint->pi_group->pg_name); 684 } 685 if (user_failure_detection_time == pg->pg_fdt) { 686 /* Avoid any truncation or rounding errors */ 687 pg->pg_probeint = user_probe_interval; 688 /* 689 * No more rtt probes will be sent. The actual 690 * fdt has dropped to the user specified value. 691 * pii_fd_snxt_basetime and pii_snxt_basetime 692 * will be in sync henceforth. 693 */ 694 reset_snxt_basetimes(); 695 } 696 } 697 } 698 } 699 700 /* 701 * Process the incoming echo reply, in response to our unicast probe. 
702 * Common for both IPv4 and IPv6 703 */ 704 static void 705 incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 706 struct in6_addr fromaddr, struct timeval *recv_tvp) 707 { 708 int64_t m; /* rtt measurement in ns */ 709 hrtime_t cur_hrtime; /* in ns from some arbitrary point */ 710 char abuf[INET6_ADDRSTRLEN]; 711 int pr_ndx; 712 struct target *target; 713 boolean_t exception; 714 uint64_t pr_icmp_timestamp; 715 uint16_t pr_icmp_seq; 716 struct probe_stats *pr_statp; 717 struct phyint_group *pg = pii->pii_phyint->pi_group; 718 719 /* Get the printable address for error reporting */ 720 (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 721 722 if (debug & D_PROBE) { 723 logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n", 724 AF_STR(pii->pii_af), pii->pii_name, abuf, 725 ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp)); 726 } 727 728 pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp); 729 pr_icmp_seq = ntohs(reply->pr_icmp_seq); 730 731 /* Reject out of window probe replies */ 732 if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 733 SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 734 logtrace("out of window probe seq %u snxt %u on %s from %s\n", 735 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 736 pii->pii_cum_stats.unknown++; 737 return; 738 } 739 740 cur_hrtime = gethrtime(); 741 m = (int64_t)(cur_hrtime - pr_icmp_timestamp); 742 if (m < 0) { 743 /* 744 * This is a ridiculously high value of rtt. rtt has wrapped 745 * around. Log a message, and ignore the rtt. 746 */ 747 logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld " 748 "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp); 749 } 750 751 /* 752 * Get the probe index pr_ndx corresponding to the received icmp seq. 753 * number in our pii->pii_probes[] array. 
The icmp sequence number 754 * pii_snxt corresponds to the probe index pii->pii_probe_next 755 */ 756 pr_ndx = MOD_SUB(pii->pii_probe_next, 757 (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 758 759 assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 760 761 target = pii->pii_probes[pr_ndx].pr_target; 762 763 /* 764 * Perform sanity checks, whether this probe reply that we 765 * have received is genuine 766 */ 767 if (target != NULL) { 768 /* 769 * Compare the src. addr of the received ICMP or ICMPv6 770 * probe reply with the target address in our tables. 771 */ 772 if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 773 /* 774 * We don't have any record of having sent a probe to 775 * this target. This is a fake probe reply. Log an error 776 */ 777 logtrace("probe status %d Fake probe reply seq %u " 778 "snxt %u on %s from %s\n", 779 pii->pii_probes[pr_ndx].pr_status, 780 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 781 pii->pii_cum_stats.unknown++; 782 return; 783 } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 784 /* 785 * The address matches, but our tables indicate that 786 * this probe reply has been acked already. So this 787 * is a duplicate probe reply. Log an error 788 */ 789 logtrace("probe status %d Duplicate probe reply seq %u " 790 "snxt %u on %s from %s\n", 791 pii->pii_probes[pr_ndx].pr_status, 792 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 793 pii->pii_cum_stats.unknown++; 794 return; 795 } 796 } else { 797 /* 798 * Target must not be NULL in the PR_UNACKED state 799 */ 800 assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 801 if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 802 /* 803 * The probe stats slot is unused. So we didn't 804 * send out any probe to this target. This is a fake. 805 * Log an error. 
806 */ 807 logtrace("probe status %d Fake probe reply seq %u " 808 "snxt %u on %s from %s\n", 809 pii->pii_probes[pr_ndx].pr_status, 810 pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 811 } 812 pii->pii_cum_stats.unknown++; 813 return; 814 } 815 816 /* 817 * If the rtt does not appear to be right, don't update the 818 * rtt stats. This can happen if the system dropped into the 819 * debugger, or the system was hung or too busy for a 820 * substantial time that we didn't get a chance to run. 821 */ 822 if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) { 823 /* 824 * If the probe corresponding to this received response 825 * was truly sent 'm' ns. ago, then this response must 826 * have been rejected by the sequence number checks. The 827 * fact that it has passed the sequence number checks 828 * means that the measured rtt is wrong. We were probably 829 * scheduled long after the packet was received. 830 */ 831 goto out; 832 } 833 834 /* 835 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 836 * The initial few responses after the interface is repaired may 837 * contain high rtt's because they could have been queued up waiting 838 * for ARP/NDP resolution on a failed interface. 839 */ 840 if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 841 goto out; 842 843 /* 844 * Don't update the Conservative Round Trip Time estimate for this 845 * (phint, target) pair if this is the not the highest ack seq seen 846 * thus far on this target. 847 */ 848 if (!highest_ack_tg(pr_icmp_seq, target)) 849 goto out; 850 851 /* 852 * Always update the rtt. This is a failure detection probe 853 * and we want to measure both increase / decrease in rtt. 854 */ 855 pi_set_crtt(target, m, _B_TRUE); 856 857 /* 858 * If the crtt exceeds the average time between probes, 859 * investigate if this slow target is an exception. If so we 860 * can avoid this target and still meet the failure detection 861 * time. 
Otherwise we can't meet the failure detection time. 862 */ 863 if (target->tg_crtt > pg->pg_probeint) { 864 exception = check_exception_target(pii, target); 865 if (exception) { 866 /* 867 * This target is exceptionally slow. Don't use it 868 * for future probes. check_exception_target() has 869 * made sure that we have at least MIN_PROBE_TARGETS 870 * other active targets 871 */ 872 if (pii->pii_targets_are_routers) { 873 /* 874 * This is a slow router, mark it as slow 875 * and don't use it for further probes. We 876 * don't delete it, since it will be populated 877 * again when we do a router scan. Hence we 878 * need to maintain extra state (unlike the 879 * host case below). Mark it as TG_SLOW. 880 */ 881 if (target->tg_status == TG_ACTIVE) 882 pii->pii_ntargets--; 883 target->tg_status = TG_SLOW; 884 target->tg_latime = gethrtime(); 885 target->tg_rtt_sa = -1; 886 target->tg_crtt = 0; 887 target->tg_rtt_sd = 0; 888 if (pii->pii_target_next == target) { 889 pii->pii_target_next = 890 target_next(target); 891 } 892 } else { 893 /* 894 * the slow target is not a router, we can 895 * just delete it. Send an icmp multicast and 896 * pick the fastest responder that is not 897 * already an active target. target_delete() 898 * adjusts pii->pii_target_next 899 */ 900 target_delete(target); 901 probe(pii, PROBE_MULTI, cur_hrtime); 902 } 903 } else { 904 /* 905 * We can't meet the failure detection time. 906 * Log a message, and update the detection time to 907 * whatever we can achieve. 
908 */ 909 pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 910 pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 911 last_fdt_bumpup_time = gethrtime(); 912 if (pg != phyint_anongroup) { 913 logerr("Cannot meet requested failure detection" 914 " time of %d ms on (%s %s) new failure" 915 " detection time for group \"%s\" is %d" 916 " ms\n", user_failure_detection_time, 917 AF_STR(pii->pii_af), pii->pii_name, 918 pg->pg_name, pg->pg_fdt); 919 } 920 } 921 } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 922 (user_failure_detection_time < pg->pg_fdt) && 923 (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 924 /* 925 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 926 * investigate if we can improve the failure detection time to 927 * meet whatever the user specified. 928 */ 929 if (check_pg_crtt_improved(pg)) { 930 pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 931 user_failure_detection_time); 932 pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 933 if (pg != phyint_anongroup) { 934 logerr("Improved failure detection time %d ms " 935 "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 936 AF_STR(pii->pii_af), pii->pii_name, 937 pg->pg_name); 938 } 939 if (user_failure_detection_time == pg->pg_fdt) { 940 /* Avoid any truncation or rounding errors */ 941 pg->pg_probeint = user_probe_interval; 942 /* 943 * No more rtt probes will be sent. The actual 944 * fdt has dropped to the user specified value. 945 * pii_fd_snxt_basetime and pii_snxt_basetime 946 * will be in sync henceforth. 947 */ 948 reset_snxt_basetimes(); 949 } 950 } 951 } 952 out: 953 pr_statp = &pii->pii_probes[pr_ndx]; 954 pr_statp->pr_hrtime_ackproc = cur_hrtime; 955 pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent + 956 (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent)); 957 958 probe_chstate(pr_statp, pii, PR_ACKED); 959 960 /* 961 * Update pii->pii_rack, i.e. 
the sequence number of the last received 962 * probe response, based on the echo reply we have received now, if 963 * either of the following conditions are satisfied. 964 * a. pii_rack is outside the current receive window of 965 * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 966 * This means we have not received probe responses for a 967 * long time, and the sequence number has wrapped around. 968 * b. pii_rack is within the current receive window and this echo 969 * reply corresponds to the highest sequence number we have seen 970 * so far. 971 */ 972 if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 973 SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 974 SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 975 pii->pii_rack = pr_icmp_seq; 976 } 977 } 978 979 /* 980 * Returns true if seq is the highest unacknowledged seq for target tg 981 * else returns false 982 */ 983 static boolean_t 984 highest_ack_tg(uint16_t seq, struct target *tg) 985 { 986 struct phyint_instance *pii; 987 int pr_ndx; 988 uint16_t pr_seq; 989 990 pii = tg->tg_phyint_inst; 991 992 /* 993 * Get the seq number of the most recent probe sent so far, 994 * and also get the corresponding probe index in the probe stats 995 * array. 996 */ 997 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 998 pr_seq = pii->pii_snxt; 999 pr_seq--; 1000 1001 /* 1002 * Start from the most recent probe and walk back, trying to find 1003 * an acked probe corresponding to target tg. 1004 */ 1005 for (; pr_ndx != pii->pii_probe_next; 1006 pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 1007 if (pii->pii_probes[pr_ndx].pr_target == tg && 1008 pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 1009 if (SEQ_GT(pr_seq, seq)) 1010 return (_B_FALSE); 1011 } 1012 } 1013 return (_B_TRUE); 1014 } 1015 1016 /* 1017 * Check whether the crtt for the group has improved by a factor of 1018 * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1019 * detection time flapping in the face of small crtt changes. 
1020 */ 1021 static boolean_t 1022 check_pg_crtt_improved(struct phyint_group *pg) 1023 { 1024 struct phyint *pi; 1025 1026 if (debug & D_PROBE) 1027 logdebug("check_pg_crtt_improved()\n"); 1028 1029 /* 1030 * The crtt for the group is only improved if each phyint_instance 1031 * for both ipv4 and ipv6 is improved. 1032 */ 1033 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1034 if (!check_pii_crtt_improved(pi->pi_v4) || 1035 !check_pii_crtt_improved(pi->pi_v6)) 1036 return (_B_FALSE); 1037 } 1038 1039 return (_B_TRUE); 1040 } 1041 1042 /* 1043 * Check whether the crtt has improved substantially on this phyint_instance. 1044 * Returns _B_TRUE if there's no crtt information available, because pii 1045 * is NULL or the phyint_instance is not capable of probing. 1046 */ 1047 boolean_t 1048 check_pii_crtt_improved(struct phyint_instance *pii) { 1049 struct target *tg; 1050 1051 if (pii == NULL) 1052 return (_B_TRUE); 1053 1054 if (!PROBE_CAPABLE(pii) || 1055 pii->pii_phyint->pi_state == PI_FAILED) 1056 return (_B_TRUE); 1057 1058 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1059 if (tg->tg_status != TG_ACTIVE) 1060 continue; 1061 if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1062 LOWER_FDT_TRIGGER)) { 1063 return (_B_FALSE); 1064 } 1065 } 1066 1067 return (_B_TRUE); 1068 } 1069 1070 /* 1071 * This target responds very slowly to probes. The target's crtt exceeds 1072 * the probe interval of its group. 
Compare against other targets 1073 * and determine if this target is an exception, if so return true, else false 1074 */ 1075 static boolean_t 1076 check_exception_target(struct phyint_instance *pii, struct target *target) 1077 { 1078 struct target *tg; 1079 char abuf[INET6_ADDRSTRLEN]; 1080 1081 if (debug & D_PROBE) { 1082 logdebug("check_exception_target(%s %s target %s)\n", 1083 AF_STR(pii->pii_af), pii->pii_name, 1084 pr_addr(pii->pii_af, target->tg_address, 1085 abuf, sizeof (abuf))); 1086 } 1087 1088 /* 1089 * We should have at least MIN_PROBE_TARGETS + 1 good targets now, 1090 * to make a good judgement. Otherwise don't drop this target. 1091 */ 1092 if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) 1093 return (_B_FALSE); 1094 1095 /* 1096 * Determine whether only this particular target is slow. 1097 * We know that this target's crtt exceeds the group's probe interval. 1098 * If all other active targets have a 1099 * crtt < (this group's probe interval) / EXCEPTION_FACTOR, 1100 * then this target is considered slow. 1101 */ 1102 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1103 if (tg != target && tg->tg_status == TG_ACTIVE) { 1104 if (tg->tg_crtt > 1105 pii->pii_phyint->pi_group->pg_probeint / 1106 EXCEPTION_FACTOR) { 1107 return (_B_FALSE); 1108 } 1109 } 1110 } 1111 1112 return (_B_TRUE); 1113 } 1114 1115 /* 1116 * Update the target list. The icmp all hosts multicast has given us 1117 * some host to which we can send probes. If we already have sufficient 1118 * targets, discard it. 
1119 */ 1120 static void 1121 incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1122 struct in6_addr fromaddr) 1123 /* ARGSUSED */ 1124 { 1125 int af; 1126 char abuf[INET6_ADDRSTRLEN]; 1127 struct phyint *pi; 1128 1129 if (debug & D_PROBE) { 1130 logdebug("incoming_mcast_reply(%s %s %s)\n", 1131 AF_STR(pii->pii_af), pii->pii_name, 1132 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1133 } 1134 1135 /* 1136 * Using host targets is a fallback mechanism. If we have 1137 * found a router, don't add this host target. If we already 1138 * know MAX_PROBE_TARGETS, don't add another target. 1139 */ 1140 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1141 if (pii->pii_targets != NULL) { 1142 if (pii->pii_targets_are_routers || 1143 (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1144 return; 1145 } 1146 } 1147 1148 if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1149 IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1150 /* 1151 * Guard against response from 0.0.0.0 1152 * and ::. Log a trace message 1153 */ 1154 logtrace("probe response from %s on %s\n", 1155 pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1156 pii->pii_name); 1157 return; 1158 } 1159 1160 /* 1161 * This address is one of our own, so reject this address as a 1162 * valid probe target. 1163 */ 1164 af = pii->pii_af; 1165 if (own_address(fromaddr)) 1166 return; 1167 1168 /* 1169 * If the phyint is part a named group, then add the address to all 1170 * members of the group. Otherwise, add the address only to the 1171 * phyint itself, since other phyints in the anongroup may not be on 1172 * the same subnet. 
1173 */ 1174 pi = pii->pii_phyint; 1175 if (pi->pi_group == phyint_anongroup) { 1176 target_add(pii, fromaddr, _B_FALSE); 1177 } else { 1178 pi = pi->pi_group->pg_phyint; 1179 for (; pi != NULL; pi = pi->pi_pgnext) 1180 target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1181 } 1182 } 1183 1184 /* 1185 * Compute CRTT given an existing scaled average, scaled deviation estimate 1186 * and a new rtt time. The formula is from Jacobson and Karels' 1187 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1188 * are the same as those in Appendix A.2 of that paper. 1189 * 1190 * m = new measurement 1191 * sa = scaled RTT average (8 * average estimates) 1192 * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1193 * crtt = Conservative round trip time. Used to determine whether probe 1194 * has timed out. 1195 * 1196 * New scaled average and deviation are passed back via sap and svp 1197 */ 1198 static int64_t 1199 compute_crtt(int64_t *sap, int64_t *svp, int64_t m) 1200 { 1201 int64_t sa = *sap; 1202 int64_t sv = *svp; 1203 int64_t crtt; 1204 int64_t saved_m = m; 1205 1206 assert(*sap >= -1); 1207 assert(*svp >= 0); 1208 1209 if (sa != -1) { 1210 /* 1211 * Update average estimator: 1212 * new rtt = old rtt + 1/8 Error 1213 * where Error = m - old rtt 1214 * i.e. 8 * new rtt = 8 * old rtt + Error 1215 * i.e. new sa = old sa + Error 1216 */ 1217 m -= sa >> 3; /* m is now Error in estimate. */ 1218 if ((sa += m) < 0) { 1219 /* Don't allow the smoothed average to be negative. */ 1220 sa = 0; 1221 } 1222 1223 /* 1224 * Update deviation estimator: 1225 * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1226 * i.e. 4 * new mdev = 4 * old mdev + 1227 * (abs(Error) - old mdev) 1228 * i.e. new sv = old sv + (abs(Error) - old mdev) 1229 */ 1230 if (m < 0) 1231 m = -m; 1232 m -= sv >> 2; 1233 sv += m; 1234 } else { 1235 /* Initialization. This is the first response received. 
*/ 1236 sa = (m << 3); 1237 sv = (m << 1); 1238 } 1239 1240 crtt = (sa >> 3) + sv; 1241 1242 if (debug & D_PROBE) { 1243 logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> " 1244 "crtt = %lld\n", saved_m, sa, sv, crtt); 1245 } 1246 1247 *sap = sa; 1248 *svp = sv; 1249 1250 /* 1251 * CRTT = average estimates + 4 * deviation estimates 1252 * = sa / 8 + sv 1253 */ 1254 return (crtt); 1255 } 1256 1257 static void 1258 pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni) 1259 { 1260 struct phyint_instance *pii = tg->tg_phyint_inst; 1261 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1262 int64_t sa = tg->tg_rtt_sa; 1263 int64_t sv = tg->tg_rtt_sd; 1264 int new_crtt; 1265 int i; 1266 1267 if (debug & D_PROBE) 1268 logdebug("pi_set_crtt: target - m %lld\n", m); 1269 1270 /* store the round trip time, in case we need to defer computation */ 1271 tg->tg_deferred[tg->tg_num_deferred] = m; 1272 1273 new_crtt = ns2ms(compute_crtt(&sa, &sv, m)); 1274 1275 /* 1276 * If this probe's round trip time would singlehandedly cause an 1277 * increase in the group's probe interval consider it suspect. 1278 */ 1279 if ((new_crtt > probe_interval) && is_probe_uni) { 1280 if (debug & D_PROBE) { 1281 logdebug("Received a suspect probe on %s, new_crtt =" 1282 " %d, probe_interval = %d, num_deferred = %d\n", 1283 pii->pii_probe_logint->li_name, new_crtt, 1284 probe_interval, tg->tg_num_deferred); 1285 } 1286 1287 /* 1288 * If we've deferred as many rtts as we plan on deferring, then 1289 * assume the link really did slow down and process all queued 1290 * rtts 1291 */ 1292 if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1293 if (debug & D_PROBE) { 1294 logdebug("Received MAXDEFERREDRTT probes which " 1295 "would cause an increased probe_interval. 
" 1296 "Integrating queued rtt data points.\n"); 1297 } 1298 1299 for (i = 0; i <= tg->tg_num_deferred; i++) { 1300 tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa, 1301 &tg->tg_rtt_sd, tg->tg_deferred[i])); 1302 } 1303 1304 tg->tg_num_deferred = 0; 1305 } else { 1306 tg->tg_num_deferred++; 1307 } 1308 return; 1309 } 1310 1311 /* 1312 * If this is a normal probe, or an RTT probe that would lead to a 1313 * reduced CRTT, then update our CRTT data. Further, if this was 1314 * a normal probe, pitch any deferred probes since our probes are 1315 * again being answered within our CRTT estimates. 1316 */ 1317 if (is_probe_uni || new_crtt < tg->tg_crtt) { 1318 tg->tg_rtt_sa = sa; 1319 tg->tg_rtt_sd = sv; 1320 tg->tg_crtt = new_crtt; 1321 if (is_probe_uni) 1322 tg->tg_num_deferred = 0; 1323 } 1324 } 1325 1326 /* 1327 * Return a pointer to the specified option buffer. 1328 * If not found return NULL. 1329 */ 1330 static void * 1331 find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type) 1332 { 1333 struct cmsghdr *cmsg; 1334 1335 for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1336 cmsg = CMSG_NXTHDR(msg, cmsg)) { 1337 if (cmsg->cmsg_level == cmsg_level && 1338 cmsg->cmsg_type == cmsg_type) { 1339 return (CMSG_DATA(cmsg)); 1340 } 1341 } 1342 return (NULL); 1343 } 1344 1345 /* 1346 * Try to activate another INACTIVE interface in the same group as `pi'. 1347 * Prefer STANDBY INACTIVE to just INACTIVE. 
1348 */ 1349 void 1350 phyint_activate_another(struct phyint *pi) 1351 { 1352 struct phyint *pi2; 1353 struct phyint *inactivepi = NULL; 1354 1355 if (pi->pi_group == phyint_anongroup) 1356 return; 1357 1358 for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1359 if (pi == pi2 || pi2->pi_state != PI_RUNNING || 1360 !(pi2->pi_flags & IFF_INACTIVE)) 1361 continue; 1362 1363 inactivepi = pi2; 1364 if (pi2->pi_flags & IFF_STANDBY) 1365 break; 1366 } 1367 1368 if (inactivepi != NULL) 1369 (void) change_pif_flags(inactivepi, 0, IFF_INACTIVE); 1370 } 1371 1372 /* 1373 * Transition a phyint to PI_RUNNING. The caller must ensure that the 1374 * transition is appropriate. Clears IFF_OFFLINE or IFF_FAILED if 1375 * appropriate. Also sets IFF_INACTIVE on this or other interfaces as 1376 * appropriate (see comment below). Finally, also updates the phyint's group 1377 * state to account for the change. 1378 */ 1379 void 1380 phyint_transition_to_running(struct phyint *pi) 1381 { 1382 struct phyint *pi2; 1383 struct phyint *actstandbypi = NULL; 1384 uint_t nactive = 0, nnonstandby = 0; 1385 boolean_t onlining = (pi->pi_state == PI_OFFLINE); 1386 boolean_t initial = (pi->pi_state == PI_INIT); 1387 uint64_t set, clear; 1388 1389 /* 1390 * The interface is running again, but should it or another interface 1391 * in the group end up INACTIVE? There are three cases: 1392 * 1393 * 1. If it's a STANDBY interface, it should be end up INACTIVE if 1394 * the group is operating at capacity (i.e., there are at least as 1395 * many active interfaces as non-STANDBY interfaces in the group). 1396 * No other interfaces should be changed. 1397 * 1398 * 2. If it's a non-STANDBY interface and we're onlining it or 1399 * FAILBACK is enabled, then it should *not* end up INACTIVE. 1400 * Further, if the group is above capacity as a result of this 1401 * interface, then an active STANDBY interface in the group should 1402 * end up INACTIVE. 1403 * 1404 * 3. 
If it's a non-STANDBY interface, we're repairing it, and 1405 * FAILBACK is disabled, then it should end up INACTIVE *unless* 1406 * the group was failed (in which case we have no choice but to 1407 * use it). No other interfaces should be changed. 1408 */ 1409 if (pi->pi_group != phyint_anongroup) { 1410 pi2 = pi->pi_group->pg_phyint; 1411 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1412 if (!(pi2->pi_flags & IFF_STANDBY)) 1413 nnonstandby++; 1414 1415 if (pi2->pi_state == PI_RUNNING) { 1416 if (!(pi2->pi_flags & IFF_INACTIVE)) { 1417 nactive++; 1418 if (pi2->pi_flags & IFF_STANDBY) 1419 actstandbypi = pi2; 1420 } 1421 } 1422 } 1423 } 1424 1425 set = 0; 1426 clear = (onlining ? IFF_OFFLINE : IFF_FAILED); 1427 1428 if (pi->pi_flags & IFF_STANDBY) { /* case 1 */ 1429 if (nactive >= nnonstandby) 1430 set |= IFF_INACTIVE; 1431 else 1432 clear |= IFF_INACTIVE; 1433 } else if (onlining || failback_enabled) { /* case 2 */ 1434 if (nactive >= nnonstandby && actstandbypi != NULL) 1435 (void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0); 1436 } else if (!initial && !GROUP_FAILED(pi->pi_group)) { /* case 3 */ 1437 set |= IFF_INACTIVE; 1438 } 1439 (void) change_pif_flags(pi, set, clear); 1440 1441 phyint_chstate(pi, PI_RUNNING); 1442 1443 /* 1444 * Update the group state to account for the change. 1445 */ 1446 phyint_group_refresh_state(pi->pi_group); 1447 } 1448 1449 /* 1450 * See if a previously failed interface has started working again. 1451 */ 1452 void 1453 phyint_check_for_repair(struct phyint *pi) 1454 { 1455 if (!phyint_repaired(pi)) 1456 return; 1457 1458 if (pi->pi_group == phyint_anongroup) { 1459 logerr("IP interface repair detected on %s\n", pi->pi_name); 1460 } else { 1461 logerr("IP interface repair detected on %s of group %s\n", 1462 pi->pi_name, pi->pi_group->pg_name); 1463 } 1464 1465 /* 1466 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet. 
1467 * So just clear IFF_OFFLINE and defer phyint_transition_to_running() 1468 * until it is brought back online. 1469 */ 1470 if (pi->pi_state == PI_OFFLINE) { 1471 (void) change_pif_flags(pi, 0, IFF_FAILED); 1472 return; 1473 } 1474 1475 phyint_transition_to_running(pi); /* calls phyint_chstate() */ 1476 } 1477 1478 /* 1479 * See if an interface has failed, or if the whole group of interfaces has 1480 * failed. 1481 */ 1482 static void 1483 phyint_inst_check_for_failure(struct phyint_instance *pii) 1484 { 1485 struct phyint *pi = pii->pii_phyint; 1486 struct phyint *pi2; 1487 boolean_t was_active; 1488 1489 switch (failure_state(pii)) { 1490 case PHYINT_FAILURE: 1491 was_active = ((pi->pi_flags & IFF_INACTIVE) == 0); 1492 1493 (void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE); 1494 if (pi->pi_group == phyint_anongroup) { 1495 logerr("IP interface failure detected on %s\n", 1496 pii->pii_name); 1497 } else { 1498 logerr("IP interface failure detected on %s of group" 1499 " %s\n", pii->pii_name, pi->pi_group->pg_name); 1500 } 1501 1502 /* 1503 * If the failed interface was active, activate another 1504 * INACTIVE interface in the group if possible. 1505 */ 1506 if (was_active) 1507 phyint_activate_another(pi); 1508 1509 /* 1510 * If the interface is offline, the state change will be 1511 * noted when it comes back online. 1512 */ 1513 if (pi->pi_state != PI_OFFLINE) { 1514 phyint_chstate(pi, PI_FAILED); 1515 reset_crtt_all(pi); 1516 } 1517 break; 1518 1519 case GROUP_FAILURE: 1520 pi2 = pi->pi_group->pg_phyint; 1521 for (; pi2 != NULL; pi2 = pi2->pi_pgnext) { 1522 (void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE); 1523 if (pi2->pi_state == PI_OFFLINE) /* see comment above */ 1524 continue; 1525 1526 reset_crtt_all(pi2); 1527 /* 1528 * In the case of host targets, we would have flushed 1529 * the targets, and gone to PI_NOTARGETS state. 
1530 */ 1531 if (pi2->pi_state == PI_RUNNING) 1532 phyint_chstate(pi2, PI_FAILED); 1533 } 1534 break; 1535 1536 default: 1537 break; 1538 } 1539 } 1540 1541 /* 1542 * Determines if any timeout event has occurred and returns the number of 1543 * milliseconds until the next timeout event for the phyint. Returns 1544 * TIMER_INFINITY for "never". 1545 */ 1546 uint_t 1547 phyint_inst_timer(struct phyint_instance *pii) 1548 { 1549 int pr_ndx; 1550 uint_t timeout; 1551 struct target *cur_tg; 1552 struct probe_stats *pr_statp; 1553 struct phyint_instance *pii_other; 1554 struct phyint *pi; 1555 int valid_unack_count; 1556 int i; 1557 int interval; 1558 uint_t check_time; 1559 uint_t cur_time; 1560 hrtime_t cur_hrtime; 1561 int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1562 1563 cur_hrtime = gethrtime(); 1564 cur_time = ns2ms(cur_hrtime); 1565 1566 if (debug & D_TIMER) { 1567 logdebug("phyint_inst_timer(%s %s)\n", 1568 AF_STR(pii->pii_af), pii->pii_name); 1569 } 1570 1571 pii_other = phyint_inst_other(pii); 1572 if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1573 /* 1574 * Check to see if we're here due to link up/down flapping; If 1575 * enough time has passed, then try to bring the interface 1576 * back up; otherwise, schedule a timer to bring it back up 1577 * when enough time *has* elapsed. 1578 */ 1579 pi = pii->pii_phyint; 1580 if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1581 check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1582 if (check_time > cur_time) 1583 return (check_time - cur_time); 1584 1585 phyint_check_for_repair(pi); 1586 } 1587 } 1588 1589 /* 1590 * If probing is not enabled on this phyint instance, don't proceed. 
1591 */ 1592 if (!PROBE_ENABLED(pii)) 1593 return (TIMER_INFINITY); 1594 1595 /* 1596 * If the timer has fired too soon, probably triggered 1597 * by some other phyint instance, return the remaining 1598 * time 1599 */ 1600 if (TIME_LT(cur_time, pii->pii_snxt_time)) 1601 return (pii->pii_snxt_time - cur_time); 1602 1603 /* 1604 * If the link is down, don't send any probes for now. 1605 */ 1606 if (LINK_DOWN(pii->pii_phyint)) 1607 return (TIMER_INFINITY); 1608 1609 /* 1610 * Randomize the next probe time, between MIN_RANDOM_FACTOR 1611 * and MAX_RANDOM_FACTOR with respect to the base probe time. 1612 * Base probe time is strictly periodic. 1613 */ 1614 interval = GET_RANDOM( 1615 (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1616 (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1617 pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1618 1619 /* 1620 * Check if the current time > next time to probe. If so, we missed 1621 * sending 1 or more probes, probably due to heavy system load. At least 1622 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1623 * were scheduled. Make adjustments to the times, in multiples of 1624 * user_probe_interval. 
1625 */ 1626 if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1627 int n; 1628 1629 n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1630 pii->pii_snxt_time += (n + 1) * user_probe_interval; 1631 pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1632 logtrace("missed sending %d probes cur_time %u snxt_time %u" 1633 " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1634 pii->pii_snxt_basetime); 1635 1636 /* Collect statistics about missed probes */ 1637 probes_missed.pm_nprobes += n + 1; 1638 probes_missed.pm_ntimes++; 1639 } 1640 pii->pii_snxt_basetime += user_probe_interval; 1641 interval = pii->pii_snxt_time - cur_time; 1642 if (debug & D_TARGET) { 1643 logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1644 " interval %u\n", cur_time, pii->pii_snxt_time, 1645 pii->pii_snxt_basetime, interval); 1646 } 1647 1648 /* 1649 * If no targets are known, we need to send an ICMP multicast. The 1650 * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1651 * to see if we found a target. 1652 */ 1653 if (pii->pii_target_next == NULL) { 1654 assert(pii->pii_ntargets == 0); 1655 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1656 probe(pii, PROBE_MULTI, cur_time); 1657 return (interval); 1658 } 1659 1660 if ((user_probe_interval != probe_interval) && 1661 TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1662 /* 1663 * the failure detection (fd) probe timer has not yet fired. 1664 * Need to send only an rtt probe. The probe type is PROBE_RTT. 1665 */ 1666 probe(pii, PROBE_RTT, cur_hrtime); 1667 return (interval); 1668 } 1669 /* 1670 * the fd probe timer has fired. Need to do all failure 1671 * detection / recovery calculations, and then send an fd probe 1672 * of type PROBE_UNI. 1673 */ 1674 if (user_probe_interval == probe_interval) { 1675 /* 1676 * We could have missed some probes, and then adjusted 1677 * pii_snxt_basetime above. Otherwise we could have 1678 * blindly added probe_interval to pii_fd_snxt_basetime. 
1679 */ 1680 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1681 } else { 1682 pii->pii_fd_snxt_basetime += probe_interval; 1683 if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1684 int n; 1685 1686 n = (cur_time - pii->pii_fd_snxt_basetime) / 1687 probe_interval; 1688 pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1689 } 1690 } 1691 1692 /* 1693 * We can have at most, the latest 2 probes that we sent, in 1694 * the PR_UNACKED state. All previous probes sent, are either 1695 * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1696 * timed out if the probe's time_start + the CRTT < currenttime. 1697 * For each of the last 2 probes, examine whether it has timed 1698 * out. If so, mark it PR_LOST. The probe stats is a circular array. 1699 */ 1700 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1701 valid_unack_count = 0; 1702 1703 for (i = 0; i < 2; i++) { 1704 pr_statp = &pii->pii_probes[pr_ndx]; 1705 cur_tg = pii->pii_probes[pr_ndx].pr_target; 1706 switch (pr_statp->pr_status) { 1707 case PR_ACKED: 1708 /* 1709 * We received back an ACK, so the switch clearly 1710 * is not dropping our traffic, and thus we can 1711 * enable failure detection immediately. 1712 */ 1713 if (pii->pii_fd_hrtime > gethrtime()) { 1714 if (debug & D_PROBE) { 1715 logdebug("successful probe on %s; " 1716 "ending quiet period\n", 1717 pii->pii_phyint->pi_name); 1718 } 1719 pii->pii_fd_hrtime = gethrtime(); 1720 } 1721 break; 1722 1723 case PR_UNACKED: 1724 assert(cur_tg != NULL); 1725 /* 1726 * The crtt could be zero for some reason, 1727 * Eg. the phyint could be failed. If the crtt is 1728 * not available use group's probe interval, 1729 * which is a worst case estimate. 
1730 */ 1731 timeout = ns2ms(pr_statp->pr_hrtime_start); 1732 if (cur_tg->tg_crtt != 0) { 1733 timeout += cur_tg->tg_crtt; 1734 } else { 1735 timeout += probe_interval; 1736 } 1737 if (TIME_LT(timeout, cur_time)) { 1738 pr_statp->pr_time_lost = timeout; 1739 probe_chstate(pr_statp, pii, PR_LOST); 1740 } else if (i == 1) { 1741 /* 1742 * We are forced to consider this probe 1743 * lost, as we can have at most 2 unack. 1744 * probes any time, and we will be sending a 1745 * probe at the end of this function. 1746 * Normally, we should not be here, but 1747 * this can happen if an incoming response 1748 * that was considered lost has increased 1749 * the crtt for this target, and also bumped 1750 * up the FDT. Note that we never cancel or 1751 * increase the current pii_time_left, so 1752 * when the timer fires, we find 2 valid 1753 * unacked probes, and they are yet to timeout 1754 */ 1755 pr_statp->pr_time_lost = cur_time; 1756 probe_chstate(pr_statp, pii, PR_LOST); 1757 } else { 1758 /* 1759 * Only the most recent probe can enter 1760 * this 'else' arm. The second most recent 1761 * probe must take either of the above arms, 1762 * if it is unacked. 1763 */ 1764 valid_unack_count++; 1765 } 1766 break; 1767 } 1768 pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1769 } 1770 1771 /* 1772 * We send out 1 probe randomly in the interval between one half 1773 * and one probe interval for the group. Given that the CRTT is always 1774 * less than the group's probe interval, we can have at most 1 1775 * unacknowledged probe now. All previous probes are either lost or 1776 * acked. 1777 */ 1778 assert(valid_unack_count == 0 || valid_unack_count == 1); 1779 1780 /* 1781 * The timer has fired. Take appropriate action depending 1782 * on the current state of the phyint. 
1783 * 1784 * PI_RUNNING state - Failure detection 1785 * PI_FAILED state - Repair detection 1786 */ 1787 switch (pii->pii_phyint->pi_state) { 1788 case PI_FAILED: 1789 /* 1790 * If the most recent probe (excluding unacked probes that 1791 * are yet to time out) has been acked, check whether the 1792 * phyint is now repaired. 1793 */ 1794 if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1795 phyint_check_for_repair(pii->pii_phyint); 1796 } 1797 break; 1798 1799 case PI_RUNNING: 1800 /* 1801 * It's possible our probes have been lost because of a 1802 * spanning-tree mandated quiet period on the switch. If so, 1803 * ignore the lost probes. 1804 */ 1805 if (pii->pii_fd_hrtime - cur_hrtime > 0) 1806 break; 1807 1808 if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1809 /* 1810 * We have 1 or more failed probes (excluding unacked 1811 * probes that are yet to time out). Determine if the 1812 * phyint has failed. 1813 */ 1814 phyint_inst_check_for_failure(pii); 1815 } 1816 break; 1817 1818 default: 1819 logerr("phyint_inst_timer: invalid state %d\n", 1820 pii->pii_phyint->pi_state); 1821 abort(); 1822 } 1823 1824 /* 1825 * Start the next probe. probe() will also set pii->pii_probe_time_left 1826 * to the group's probe interval. If phyint_failed -> target_flush_hosts 1827 * was called, the target list may be empty. 1828 */ 1829 if (pii->pii_target_next != NULL) { 1830 probe(pii, PROBE_UNI, cur_hrtime); 1831 /* 1832 * If we have just the one probe target, and we're not using 1833 * router targets, try to find another as we presently have 1834 * no resilience. 1835 */ 1836 if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1837 probe(pii, PROBE_MULTI, cur_hrtime); 1838 } else { 1839 probe(pii, PROBE_MULTI, cur_hrtime); 1840 } 1841 return (interval); 1842 } 1843 1844 /* 1845 * Start the probe timer for an interface instance. 
1846 */ 1847 void 1848 start_timer(struct phyint_instance *pii) 1849 { 1850 uint32_t interval; 1851 1852 /* 1853 * Spread the base probe times (pi_snxt_basetime) across phyints 1854 * uniformly over the (curtime..curtime + the group's probe_interval). 1855 * pi_snxt_basetime is strictly periodic with a frequency of 1856 * the group's probe interval. The actual probe time pi_snxt_time 1857 * adds some randomness to pi_snxt_basetime and happens in probe(). 1858 * For the 1st probe on each phyint after the timer is started, 1859 * pi_snxt_time and pi_snxt_basetime are the same. 1860 */ 1861 interval = GET_RANDOM(0, 1862 (int)pii->pii_phyint->pi_group->pg_probeint); 1863 1864 pii->pii_snxt_basetime = getcurrenttime() + interval; 1865 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1866 pii->pii_snxt_time = pii->pii_snxt_basetime; 1867 timer_schedule(interval); 1868 } 1869 1870 /* 1871 * Restart the probe timer on an interface instance. 1872 */ 1873 static void 1874 restart_timer(struct phyint_instance *pii) 1875 { 1876 /* 1877 * We don't need to restart the timer if it was never started in 1878 * the first place (pii->pii_basetime_inited not set), as the timer 1879 * won't have gone off yet. 1880 */ 1881 if (pii->pii_basetime_inited != 0) { 1882 1883 if (debug & D_LINKNOTE) 1884 logdebug("restart timer: restarting timer on %s, " 1885 "address family %s\n", pii->pii_phyint->pi_name, 1886 AF_STR(pii->pii_af)); 1887 1888 start_timer(pii); 1889 } 1890 } 1891 1892 static void 1893 process_link_state_down(struct phyint *pi) 1894 { 1895 logerr("The link has gone down on %s\n", pi->pi_name); 1896 1897 /* 1898 * Clear the probe statistics arrays, we don't want the repair 1899 * detection logic relying on probes that were successful prior 1900 * to the link going down. 1901 */ 1902 if (PROBE_CAPABLE(pi->pi_v4)) 1903 clear_pii_probe_stats(pi->pi_v4); 1904 if (PROBE_CAPABLE(pi->pi_v6)) 1905 clear_pii_probe_stats(pi->pi_v6); 1906 /* 1907 * Check for interface failure. 
Although we know the interface 1908 * has failed, we don't know if all the other interfaces in the 1909 * group have failed as well. 1910 */ 1911 if ((pi->pi_state == PI_RUNNING) || 1912 (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1913 if (debug & D_LINKNOTE) { 1914 logdebug("process_link_state_down:" 1915 " checking for failure on %s\n", pi->pi_name); 1916 } 1917 1918 if (pi->pi_v4 != NULL) 1919 phyint_inst_check_for_failure(pi->pi_v4); 1920 else if (pi->pi_v6 != NULL) 1921 phyint_inst_check_for_failure(pi->pi_v6); 1922 } 1923 } 1924 1925 static void 1926 process_link_state_up(struct phyint *pi) 1927 { 1928 logerr("The link has come up on %s\n", pi->pi_name); 1929 1930 /* 1931 * We stopped any running timers on each instance when the link 1932 * went down, so restart them. 1933 */ 1934 if (pi->pi_v4) 1935 restart_timer(pi->pi_v4); 1936 if (pi->pi_v6) 1937 restart_timer(pi->pi_v6); 1938 1939 phyint_check_for_repair(pi); 1940 1941 pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1942 if (pi->pi_whendx == LINK_UP_PERMIN) 1943 pi->pi_whendx = 0; 1944 } 1945 1946 /* 1947 * Process any changes in link state passed up from the interfaces. 1948 */ 1949 void 1950 process_link_state_changes(void) 1951 { 1952 struct phyint *pi; 1953 1954 /* Look for interfaces where the link state has just changed */ 1955 1956 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1957 boolean_t old_link_state_up = LINK_UP(pi); 1958 1959 /* 1960 * Except when the "phyint" structure is created, this is 1961 * the only place the link state is updated. This allows 1962 * this routine to detect changes in link state, rather 1963 * than just the current state. 1964 */ 1965 UPDATE_LINK_STATE(pi); 1966 1967 if (LINK_DOWN(pi)) { 1968 /* 1969 * Has link just gone down? 1970 */ 1971 if (old_link_state_up) 1972 process_link_state_down(pi); 1973 } else { 1974 /* 1975 * Has link just gone back up? 
1976 */ 1977 if (!old_link_state_up) 1978 process_link_state_up(pi); 1979 } 1980 } 1981 } 1982 1983 void 1984 reset_crtt_all(struct phyint *pi) 1985 { 1986 struct phyint_instance *pii; 1987 struct target *tg; 1988 1989 pii = pi->pi_v4; 1990 if (pii != NULL) { 1991 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1992 tg->tg_crtt = 0; 1993 tg->tg_rtt_sa = -1; 1994 tg->tg_rtt_sd = 0; 1995 } 1996 } 1997 1998 pii = pi->pi_v6; 1999 if (pii != NULL) { 2000 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2001 tg->tg_crtt = 0; 2002 tg->tg_rtt_sa = -1; 2003 tg->tg_rtt_sd = 0; 2004 } 2005 } 2006 } 2007 2008 /* 2009 * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 2010 * probes on both instances IPv4 and IPv6. 2011 * If the interface has failed, return the time of the first probe failure 2012 * in "tff". 2013 */ 2014 static int 2015 phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 2016 { 2017 uint_t pi_tff; 2018 struct target *cur_tg; 2019 struct probe_fail_count pfinfo; 2020 struct phyint_instance *pii_other; 2021 int pr_ndx; 2022 2023 /* 2024 * Get the number of consecutive failed probes on 2025 * this phyint across all targets. Also get the number 2026 * of consecutive failed probes on this target only 2027 */ 2028 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2029 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2030 probe_fail_info(pii, cur_tg, &pfinfo); 2031 2032 /* Get the time of first failure, for later use */ 2033 pi_tff = pfinfo.pf_tff; 2034 2035 /* 2036 * If the current target has not responded to the 2037 * last NUM_PROBE_FAILS probes, and other targets are 2038 * responding delete this target. Dead gateway detection 2039 * will eventually remove this target (if router) from the 2040 * routing tables. If that does not occur, we may end 2041 * up adding this to our list again. 
2042 */ 2043 if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2044 pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2045 if (pii->pii_targets_are_routers) { 2046 if (cur_tg->tg_status == TG_ACTIVE) 2047 pii->pii_ntargets--; 2048 cur_tg->tg_status = TG_DEAD; 2049 cur_tg->tg_crtt = 0; 2050 cur_tg->tg_rtt_sa = -1; 2051 cur_tg->tg_rtt_sd = 0; 2052 if (pii->pii_target_next == cur_tg) 2053 pii->pii_target_next = target_next(cur_tg); 2054 } else { 2055 target_delete(cur_tg); 2056 probe(pii, PROBE_MULTI, gethrtime()); 2057 } 2058 return (PHYINT_OK); 2059 } 2060 2061 /* 2062 * If the phyint has lost NUM_PROBE_FAILS or more 2063 * consecutive probes, on both IPv4 and IPv6 protocol 2064 * instances of the phyint, then trigger failure 2065 * detection, else return false 2066 */ 2067 if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2068 return (PHYINT_OK); 2069 2070 pii_other = phyint_inst_other(pii); 2071 if (PROBE_CAPABLE(pii_other)) { 2072 probe_fail_info(pii_other, NULL, &pfinfo); 2073 if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2074 /* 2075 * We have NUM_PROBE_FAILS or more failures 2076 * on both IPv4 and IPv6. Get the earliest 2077 * time when failure was detected on this 2078 * phyint across IPv4 and IPv6. 2079 */ 2080 if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2081 pi_tff = pfinfo.pf_tff; 2082 } else { 2083 /* 2084 * This instance has < NUM_PROBE_FAILS failure. 2085 * So return false 2086 */ 2087 return (PHYINT_OK); 2088 } 2089 } 2090 *tff = pi_tff; 2091 return (PHYINT_FAILURE); 2092 } 2093 2094 /* 2095 * Check if the link has gone down on this phyint, or it has failed the 2096 * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2097 * Also look at other phyints of this group, for group failures. 
2098 */ 2099 int 2100 failure_state(struct phyint_instance *pii) 2101 { 2102 struct probe_success_count psinfo; 2103 uint_t pi2_tls; /* time last success */ 2104 uint_t pi_tff; /* time first fail */ 2105 struct phyint *pi2; 2106 struct phyint *pi; 2107 struct phyint_instance *pii2; 2108 struct phyint_group *pg; 2109 int retval; 2110 2111 if (debug & D_FAILREP) 2112 logdebug("phyint_failed(%s)\n", pii->pii_name); 2113 2114 pi = pii->pii_phyint; 2115 pg = pi->pi_group; 2116 2117 if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2118 PHYINT_OK) 2119 return (PHYINT_OK); 2120 2121 /* 2122 * At this point, the link is down, or the phyint is suspect, as it 2123 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not 2124 * belong to any group, this is a PHYINT_FAILURE. Otherwise, continue 2125 * on to determine whether this should be considered a PHYINT_FAILURE 2126 * or GROUP_FAILURE. 2127 */ 2128 if (pg == phyint_anongroup) 2129 return (PHYINT_FAILURE); 2130 2131 /* 2132 * Need to compare against other phyints of the same group 2133 * to exclude group failures. If the failure was detected via 2134 * probing, then if the time of last success (tls) of any 2135 * phyint is more recent than the time of first fail (tff) of the 2136 * phyint in question, and the link is up on the phyint, 2137 * then it is a phyint failure. Otherwise it is a group failure. 2138 * If failure was detected via a link down notification sent from 2139 * the driver to IP, we see if any phyints in the group are still 2140 * running and haven't received a link down notification. We 2141 * will usually be processing the link down notification shortly 2142 * after it was received, so there is no point looking at the tls 2143 * of other phyints. 
2144 */ 2145 retval = GROUP_FAILURE; 2146 for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2147 /* Exclude ourself from comparison */ 2148 if (pi2 == pi) 2149 continue; 2150 2151 if (LINK_DOWN(pi)) { 2152 /* 2153 * We use FLAGS_TO_LINK_STATE() to test the flags 2154 * directly, rather then LINK_UP() or LINK_DOWN(), as 2155 * we may not have got round to processing the link 2156 * state for the other phyints in the group yet. 2157 * 2158 * The check for PI_RUNNING and group failure handles 2159 * the case when the group begins to recover. 2160 * PI_RUNNING will be set, and group failure cleared 2161 * only after receipt of NUM_PROBE_REPAIRS, by which 2162 * time the other phyints should have received at 2163 * least 1 packet, and so will not have NUM_PROBE_FAILS. 2164 */ 2165 if ((pi2->pi_state == PI_RUNNING) && 2166 !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) { 2167 retval = PHYINT_FAILURE; 2168 break; 2169 } 2170 continue; 2171 } 2172 2173 if (LINK_DOWN(pi2)) 2174 continue; 2175 2176 /* 2177 * If there's no probe-based failure detection on this 2178 * interface, and its link is still up, then it's still 2179 * working and thus the group has not failed. 2180 */ 2181 if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) { 2182 retval = PHYINT_FAILURE; 2183 break; 2184 } 2185 2186 /* 2187 * Need to compare against both IPv4 and IPv6 instances. 2188 */ 2189 pii2 = pi2->pi_v4; 2190 if (pii2 != NULL) { 2191 probe_success_info(pii2, NULL, &psinfo); 2192 if (psinfo.ps_tls_valid) { 2193 pi2_tls = psinfo.ps_tls; 2194 /* 2195 * See comment above regarding check 2196 * for PI_RUNNING and group failure. 
2197 */ 2198 if (TIME_GT(pi2_tls, pi_tff) && 2199 (pi2->pi_state == PI_RUNNING) && 2200 !GROUP_FAILED(pg) && 2201 FLAGS_TO_LINK_STATE(pi2)) { 2202 retval = PHYINT_FAILURE; 2203 break; 2204 } 2205 } 2206 } 2207 2208 pii2 = pi2->pi_v6; 2209 if (pii2 != NULL) { 2210 probe_success_info(pii2, NULL, &psinfo); 2211 if (psinfo.ps_tls_valid) { 2212 pi2_tls = psinfo.ps_tls; 2213 /* 2214 * See comment above regarding check 2215 * for PI_RUNNING and group failure. 2216 */ 2217 if (TIME_GT(pi2_tls, pi_tff) && 2218 (pi2->pi_state == PI_RUNNING) && 2219 !GROUP_FAILED(pg) && 2220 FLAGS_TO_LINK_STATE(pi2)) { 2221 retval = PHYINT_FAILURE; 2222 break; 2223 } 2224 } 2225 } 2226 } 2227 2228 /* 2229 * Update the group state to account for the changes. 2230 */ 2231 phyint_group_refresh_state(pg); 2232 return (retval); 2233 } 2234 2235 /* 2236 * Return the information associated with consecutive probe successes 2237 * starting with the most recent probe. At most the last 2 probes can be 2238 * in the unacknowledged state. All previous probes have either failed 2239 * or succeeded. 2240 */ 2241 static void 2242 probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2243 struct probe_success_count *psinfo) 2244 { 2245 uint_t i; 2246 struct probe_stats *pr_statp; 2247 uint_t most_recent; 2248 uint_t second_most_recent; 2249 boolean_t pi_found_failure = _B_FALSE; 2250 boolean_t tg_found_failure = _B_FALSE; 2251 uint_t now; 2252 uint_t timeout; 2253 struct target *tg; 2254 2255 if (debug & D_FAILREP) 2256 logdebug("probe_success_info(%s)\n", pii->pii_name); 2257 2258 bzero(psinfo, sizeof (*psinfo)); 2259 now = getcurrenttime(); 2260 2261 /* 2262 * Start with the most recent probe, and count the number 2263 * of consecutive probe successes. Latch the number of successes 2264 * on hitting a failure. 
2265 */ 2266 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2267 second_most_recent = PROBE_INDEX_PREV(most_recent); 2268 2269 for (i = most_recent; i != pii->pii_probe_next; 2270 i = PROBE_INDEX_PREV(i)) { 2271 pr_statp = &pii->pii_probes[i]; 2272 2273 switch (pr_statp->pr_status) { 2274 case PR_UNACKED: 2275 /* 2276 * Only the most recent 2 probes can be unacknowledged 2277 */ 2278 assert(i == most_recent || i == second_most_recent); 2279 2280 tg = pr_statp->pr_target; 2281 assert(tg != NULL); 2282 /* 2283 * The crtt could be zero for some reason, 2284 * Eg. the phyint could be failed. If the crtt is 2285 * not available use the value of the group's probe 2286 * interval which is a worst case estimate. 2287 */ 2288 timeout = ns2ms(pr_statp->pr_hrtime_start); 2289 if (tg->tg_crtt != 0) { 2290 timeout += tg->tg_crtt; 2291 } else { 2292 timeout += 2293 pii->pii_phyint->pi_group->pg_probeint; 2294 } 2295 2296 if (TIME_LT(timeout, now)) { 2297 /* 2298 * We hit a failure. Latch the total number of 2299 * recent consecutive successes. 2300 */ 2301 pr_statp->pr_time_lost = timeout; 2302 probe_chstate(pr_statp, pii, PR_LOST); 2303 pi_found_failure = _B_TRUE; 2304 if (cur_tg != NULL && tg == cur_tg) { 2305 /* 2306 * We hit a failure for the desired 2307 * target. Latch the number of recent 2308 * consecutive successes for this target 2309 */ 2310 tg_found_failure = _B_TRUE; 2311 } 2312 } 2313 break; 2314 2315 case PR_ACKED: 2316 /* 2317 * Bump up the count of probe successes, if we 2318 * have not seen any failure so far. 2319 */ 2320 if (!pi_found_failure) 2321 psinfo->ps_nsucc++; 2322 2323 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2324 !tg_found_failure) { 2325 psinfo->ps_nsucc_tg++; 2326 } 2327 2328 /* 2329 * Record the time of last success, if this is 2330 * the most recent probe success. 
2331 */ 2332 if (!psinfo->ps_tls_valid) { 2333 psinfo->ps_tls = 2334 ns2ms(pr_statp->pr_hrtime_ackproc); 2335 psinfo->ps_tls_valid = _B_TRUE; 2336 } 2337 break; 2338 2339 case PR_LOST: 2340 /* 2341 * We hit a failure. Latch the total number of 2342 * recent consecutive successes. 2343 */ 2344 pi_found_failure = _B_TRUE; 2345 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2346 /* 2347 * We hit a failure for the desired target. 2348 * Latch the number of recent consecutive 2349 * successes for this target 2350 */ 2351 tg_found_failure = _B_TRUE; 2352 } 2353 break; 2354 2355 default: 2356 return; 2357 2358 } 2359 } 2360 } 2361 2362 /* 2363 * Return the information associated with consecutive probe failures 2364 * starting with the most recent probe. Only the last 2 probes can be in the 2365 * unacknowledged state. All previous probes have either failed or succeeded. 2366 */ 2367 static void 2368 probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2369 struct probe_fail_count *pfinfo) 2370 { 2371 int i; 2372 struct probe_stats *pr_statp; 2373 boolean_t tg_found_success = _B_FALSE; 2374 boolean_t pi_found_success = _B_FALSE; 2375 int most_recent; 2376 int second_most_recent; 2377 uint_t now; 2378 uint_t timeout; 2379 struct target *tg; 2380 2381 if (debug & D_FAILREP) 2382 logdebug("probe_fail_info(%s)\n", pii->pii_name); 2383 2384 bzero(pfinfo, sizeof (*pfinfo)); 2385 now = getcurrenttime(); 2386 2387 /* 2388 * Start with the most recent probe, and count the number 2389 * of consecutive probe failures. Latch the number of failures 2390 * on hitting a probe success. 
2391 */ 2392 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2393 second_most_recent = PROBE_INDEX_PREV(most_recent); 2394 2395 for (i = most_recent; i != pii->pii_probe_next; 2396 i = PROBE_INDEX_PREV(i)) { 2397 pr_statp = &pii->pii_probes[i]; 2398 2399 assert(PR_STATUS_VALID(pr_statp->pr_status)); 2400 2401 switch (pr_statp->pr_status) { 2402 case PR_UNACKED: 2403 /* 2404 * Only the most recent 2 probes can be unacknowledged 2405 */ 2406 assert(i == most_recent || i == second_most_recent); 2407 2408 tg = pr_statp->pr_target; 2409 /* 2410 * Target is guaranteed to exist in the unack. state 2411 */ 2412 assert(tg != NULL); 2413 /* 2414 * The crtt could be zero for some reason, 2415 * Eg. the phyint could be failed. If the crtt is 2416 * not available use the group's probe interval, 2417 * which is a worst case estimate. 2418 */ 2419 timeout = ns2ms(pr_statp->pr_hrtime_start); 2420 if (tg->tg_crtt != 0) { 2421 timeout += tg->tg_crtt; 2422 } else { 2423 timeout += 2424 pii->pii_phyint->pi_group->pg_probeint; 2425 } 2426 2427 if (TIME_GT(timeout, now)) 2428 break; 2429 2430 pr_statp->pr_time_lost = timeout; 2431 probe_chstate(pr_statp, pii, PR_LOST); 2432 /* FALLTHRU */ 2433 2434 case PR_LOST: 2435 if (!pi_found_success) { 2436 pfinfo->pf_nfail++; 2437 pfinfo->pf_tff = pr_statp->pr_time_lost; 2438 } 2439 if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2440 !tg_found_success) { 2441 pfinfo->pf_nfail_tg++; 2442 } 2443 break; 2444 2445 default: 2446 /* 2447 * We hit a success or unused slot. Latch the 2448 * total number of recent consecutive failures. 2449 */ 2450 pi_found_success = _B_TRUE; 2451 if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2452 /* 2453 * We hit a success for the desired target. 2454 * Latch the number of recent consecutive 2455 * failures for this target 2456 */ 2457 tg_found_success = _B_TRUE; 2458 } 2459 } 2460 } 2461 } 2462 2463 /* 2464 * Change the state of probe `pr' on phyint_instance `pii' to state `state'. 
2465 */ 2466 void 2467 probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state) 2468 { 2469 if (pr->pr_status == state) 2470 return; 2471 2472 pr->pr_status = state; 2473 (void) probe_state_event(pr, pii); 2474 } 2475 2476 /* 2477 * Check if the phyint has been repaired. If no test address has been 2478 * configured, then consider the interface repaired if the link is up (unless 2479 * the link is flapping; see below). Otherwise, look for proof of probes 2480 * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2481 * either IPv4 or IPv6 instance, the phyint can be considered repaired. 2482 */ 2483 static boolean_t 2484 phyint_repaired(struct phyint *pi) 2485 { 2486 struct probe_success_count psinfo; 2487 struct phyint_instance *pii; 2488 struct target *cur_tg; 2489 int pr_ndx; 2490 uint_t cur_time; 2491 2492 if (debug & D_FAILREP) 2493 logdebug("phyint_repaired(%s)\n", pi->pi_name); 2494 2495 if (LINK_DOWN(pi)) 2496 return (_B_FALSE); 2497 2498 /* 2499 * If we don't have any test addresses and the link is up, then 2500 * consider the interface repaired, unless we've received more than 2501 * LINK_UP_PERMIN link up notifications in the last minute, in 2502 * which case we keep the link down until we drop back below 2503 * the threshold. 
2504 */ 2505 if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2506 cur_time = getcurrenttime(); 2507 if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2508 (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2509 pi->pi_lfmsg_printed = 0; 2510 return (_B_TRUE); 2511 } 2512 if (!pi->pi_lfmsg_printed) { 2513 logerr("The link has come up on %s more than %d times " 2514 "in the last minute; disabling repair until it " 2515 "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2516 pi->pi_lfmsg_printed = 1; 2517 } 2518 2519 return (_B_FALSE); 2520 } 2521 2522 pii = pi->pi_v4; 2523 if (PROBE_CAPABLE(pii)) { 2524 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2525 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2526 probe_success_info(pii, cur_tg, &psinfo); 2527 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2528 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2529 return (_B_TRUE); 2530 } 2531 2532 pii = pi->pi_v6; 2533 if (PROBE_CAPABLE(pii)) { 2534 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2535 cur_tg = pii->pii_probes[pr_ndx].pr_target; 2536 probe_success_info(pii, cur_tg, &psinfo); 2537 if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2538 psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2539 return (_B_TRUE); 2540 } 2541 2542 return (_B_FALSE); 2543 } 2544 2545 /* 2546 * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 2547 */ 2548 boolean_t 2549 change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear) 2550 { 2551 int ifsock; 2552 struct lifreq lifr; 2553 uint64_t old_flags; 2554 2555 if (debug & D_FAILREP) { 2556 logdebug("change_pif_flags(%s): set %llx clear %llx\n", 2557 pi->pi_name, set, clear); 2558 } 2559 2560 if (pi->pi_v4 != NULL) 2561 ifsock = ifsock_v4; 2562 else 2563 ifsock = ifsock_v6; 2564 2565 /* 2566 * Get the current flags from the kernel, and set/clear the 2567 * desired phyint flags. Since we set only phyint flags, we can 2568 * do it on either IPv4 or IPv6 instance. 
2569 */ 2570 (void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2571 2572 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2573 if (errno != ENXIO) 2574 logperror("change_pif_flags: ioctl (get flags)"); 2575 return (_B_FALSE); 2576 } 2577 2578 old_flags = lifr.lifr_flags; 2579 lifr.lifr_flags |= set; 2580 lifr.lifr_flags &= ~clear; 2581 2582 if (old_flags == lifr.lifr_flags) { 2583 /* No change in the flags. No need to send ioctl */ 2584 return (_B_TRUE); 2585 } 2586 2587 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2588 if (errno != ENXIO) 2589 logperror("change_pif_flags: ioctl (set flags)"); 2590 return (_B_FALSE); 2591 } 2592 2593 /* 2594 * Keep pi_flags in synch. with actual flags. Assumes flags are 2595 * phyint flags. 2596 */ 2597 pi->pi_flags |= set; 2598 pi->pi_flags &= ~clear; 2599 2600 if (pi->pi_v4 != NULL) 2601 pi->pi_v4->pii_flags = pi->pi_flags; 2602 2603 if (pi->pi_v6 != NULL) 2604 pi->pi_v6->pii_flags = pi->pi_flags; 2605 2606 return (_B_TRUE); 2607 } 2608 2609 /* 2610 * icmp cksum computation for IPv4. 2611 */ 2612 static int 2613 in_cksum(ushort_t *addr, int len) 2614 { 2615 register int nleft = len; 2616 register ushort_t *w = addr; 2617 register ushort_t answer; 2618 ushort_t odd_byte = 0; 2619 register int sum = 0; 2620 2621 /* 2622 * Our algorithm is simple, using a 32 bit accumulator (sum), 2623 * we add sequential 16 bit words to it, and at the end, fold 2624 * back all the carry bits from the top 16 bits into the lower 2625 * 16 bits. 
2626 */ 2627 while (nleft > 1) { 2628 sum += *w++; 2629 nleft -= 2; 2630 } 2631 2632 /* mop up an odd byte, if necessary */ 2633 if (nleft == 1) { 2634 *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2635 sum += odd_byte; 2636 } 2637 2638 /* 2639 * add back carry outs from top 16 bits to low 16 bits 2640 */ 2641 sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2642 sum += (sum >> 16); /* add carry */ 2643 answer = ~sum; /* truncate to 16 bits */ 2644 return (answer); 2645 } 2646 2647 static void 2648 reset_snxt_basetimes(void) 2649 { 2650 struct phyint_instance *pii; 2651 2652 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2653 pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 2654 } 2655 } 2656 2657 /* 2658 * Is the address one of our own addresses? Unfortunately, 2659 * we cannot check our phyint tables to determine if the address 2660 * is our own. This is because, we don't track interfaces that 2661 * are not part of any group. We have to either use a 'bind' or 2662 * get the complete list of all interfaces using SIOCGLIFCONF, 2663 * to do this check. We could also use SIOCTMYADDR. 2664 * Bind fails for the local zone address, so we might include local zone 2665 * address as target address. If local zone address is a target address 2666 * and it is up, it is not possible to detect the interface failure. 2667 * SIOCTMYADDR also doesn't consider local zone address as own address. 2668 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they 2669 * are stored in `localaddrs' 2670 */ 2671 boolean_t 2672 own_address(struct in6_addr addr) 2673 { 2674 addrlist_t *addrp; 2675 struct sockaddr_storage ss; 2676 int af = IN6_IS_ADDR_V4MAPPED(&addr) ? 
AF_INET : AF_INET6; 2677 2678 addr2storage(af, &addr, &ss); 2679 for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) { 2680 if (sockaddrcmp(&ss, &addrp->al_addr)) 2681 return (_B_TRUE); 2682 } 2683 return (_B_FALSE); 2684 } 2685 2686 static int 2687 ns2ms(int64_t ns) 2688 { 2689 return (ns / (NANOSEC / MILLISEC)); 2690 } 2691 2692 static int64_t 2693 tv2ns(struct timeval *tvp) 2694 { 2695 return (tvp->tv_sec * NANOSEC + tvp->tv_usec * 1000); 2696 } 2697