1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include "mpd_defs.h" 27 #include "mpd_tables.h" 28 29 int debug = 0; /* Debug flag */ 30 static int pollfd_num = 0; /* Num. of poll descriptors */ 31 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 32 /* All times below in ms */ 33 int user_failure_detection_time; /* user specified failure detection */ 34 /* time (fdt) */ 35 int user_probe_interval; /* derived from user specified fdt */ 36 37 /* 38 * Structure to store mib2 information returned by the kernel. 39 * This is used to process routing table information. 40 */ 41 typedef struct mib_item_s { 42 struct mib_item_s *mi_next; 43 struct opthdr mi_opthdr; 44 void *mi_valp; 45 } mib_item_t; 46 47 static int rtsock_v4; /* AF_INET routing socket */ 48 static int rtsock_v6; /* AF_INET6 routing socket */ 49 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 50 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 51 static int lsock_v4; /* Listen socket to detect mpathd */ 52 static int lsock_v6; /* Listen socket to detect mpathd */ 53 static int mibfd = -1; /* fd to get mib info */ 54 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 55 56 static uint_t last_initifs_time; /* Time when initifs was last run */ 57 static char **argv0; /* Saved for re-exec on SIGHUP */ 58 boolean_t handle_link_notifications = _B_TRUE; 59 static int ipRouteEntrySize; /* Size of IPv4 route entry */ 60 static int ipv6RouteEntrySize; /* Size of IPv6 route entry */ 61 62 static void initlog(void); 63 static void run_timeouts(void); 64 static void initifs(void); 65 static void check_if_removed(struct phyint_instance *pii); 66 static void select_test_ifs(void); 67 static void update_router_list(mib_item_t *item); 68 static void mib_get_constants(mib_item_t *item); 69 static int mibwalk(void (*proc)(mib_item_t *)); 70 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 71 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 72 static void router_add_common(int af, char *ifname, 73 struct in6_addr nexthop); 74 static void init_router_targets(); 75 static void cleanup(void); 76 static int setup_listener(int af); 77 static void check_config(void); 78 static void check_testconfig(void); 79 static void check_addr_unique(struct phyint_instance *, 80 struct sockaddr_storage *); 81 static void init_host_targets(void); 82 static void dup_host_targets(struct phyint_instance *desired_pii); 83 static void loopback_cmd(int sock, int family); 84 static boolean_t daemonize(void); 85 static int closefunc(void *, int); 86 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 87 static unsigned int process_query(int fd, mi_query_t *miq); 88 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); 89 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 90 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 91 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 92 static unsigned int send_result(int fd, unsigned int error, int syserror); 93 94 addrlist_t *localaddrs; 95 96 /* 97 * Return the current time in milliseconds (from an arbitrary reference) 98 * truncated to fit into an int. Truncation is ok since we are interested 99 * only in differences and not the absolute values. 100 */ 101 uint_t 102 getcurrenttime(void) 103 { 104 uint_t cur_time; /* In ms */ 105 106 /* 107 * Use of a non-user-adjustable source of time is 108 * required. However millisecond precision is sufficient. 109 * divide by 10^6 110 */ 111 cur_time = (uint_t)(gethrtime() / 1000000LL); 112 return (cur_time); 113 } 114 115 uint64_t 116 getcurrentsec(void) 117 { 118 return (gethrtime() / NANOSEC); 119 } 120 121 /* 122 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 123 */ 124 int 125 poll_add(int fd) 126 { 127 int i; 128 int new_num; 129 struct pollfd *newfds; 130 retry: 131 /* Check if already present */ 132 for (i = 0; i < pollfd_num; i++) { 133 if (pollfds[i].fd == fd) 134 return (0); 135 } 136 /* Check for empty spot already present */ 137 for (i = 0; i < pollfd_num; i++) { 138 if (pollfds[i].fd == -1) { 139 pollfds[i].fd = fd; 140 return (0); 141 } 142 } 143 144 /* Allocate space for 32 more fds and initialize to -1 */ 145 new_num = pollfd_num + 32; 146 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 147 if (newfds == NULL) { 148 logperror("poll_add: realloc"); 149 return (-1); 150 } 151 for (i = pollfd_num; i < new_num; i++) { 152 newfds[i].fd = -1; 153 newfds[i].events = POLLIN; 154 } 155 pollfd_num = new_num; 156 pollfds = newfds; 157 goto retry; 158 } 159 160 /* 161 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 162 */ 163 int 164 poll_remove(int fd) 165 { 166 int i; 167 168 /* Check if already present */ 169 for (i = 0; i < pollfd_num; i++) { 170 if (pollfds[i].fd == fd) { 171 pollfds[i].fd = -1; 172 return (0); 173 } 174 } 175 return (-1); 176 } 177 178 /* 179 * Extract information about the phyint instance. If the phyint instance still 180 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 181 * will use it to detect phyint instances that don't exist any longer and 182 * remove them, from our database of phyint instances. 183 * Return value: 184 * returns true if the phyint instance exists in the kernel, 185 * returns false otherwise 186 */ 187 static boolean_t 188 pii_process(int af, char *name, struct phyint_instance **pii_p) 189 { 190 int err; 191 struct phyint_instance *pii; 192 struct phyint_instance *pii_other; 193 194 if (debug & D_PHYINT) 195 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 196 197 pii = phyint_inst_lookup(af, name); 198 if (pii == NULL) { 199 /* 200 * Phyint instance does not exist in our tables, 201 * create new phyint instance 202 */ 203 pii = phyint_inst_init_from_k(af, name); 204 } else { 205 /* Phyint exists in our tables */ 206 err = phyint_inst_update_from_k(pii); 207 208 switch (err) { 209 case PI_IOCTL_ERROR: 210 /* Some ioctl error. don't change anything */ 211 pii->pii_in_use = 1; 212 break; 213 214 case PI_GROUP_CHANGED: 215 case PI_IFINDEX_CHANGED: 216 /* 217 * Interface index or group membership has changed. 218 * Delete the old state and recreate based on the new 219 * state (it may no longer be in a group). 220 */ 221 pii_other = phyint_inst_other(pii); 222 if (pii_other != NULL) 223 phyint_inst_delete(pii_other); 224 phyint_inst_delete(pii); 225 pii = phyint_inst_init_from_k(af, name); 226 break; 227 228 case PI_DELETED: 229 /* Phyint instance has disappeared from kernel */ 230 pii->pii_in_use = 0; 231 break; 232 233 case PI_OK: 234 /* Phyint instance exists and is fine */ 235 pii->pii_in_use = 1; 236 break; 237 238 default: 239 /* Unknown status */ 240 logerr("pii_process: Unknown status %d\n", err); 241 break; 242 } 243 } 244 245 *pii_p = pii; 246 if (pii != NULL) 247 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 248 else 249 return (_B_FALSE); 250 } 251 252 /* 253 * Scan all interfaces to detect changes as well as new and deleted interfaces 254 */ 255 static void 256 initifs() 257 { 258 int i, nlifr; 259 int af; 260 char *cp; 261 char *buf; 262 int sockfd; 263 uint64_t flags; 264 struct lifnum lifn; 265 struct lifconf lifc; 266 struct lifreq lifreq; 267 struct lifreq *lifr; 268 struct logint *li; 269 struct phyint_instance *pii; 270 struct phyint_instance *next_pii; 271 struct phyint_group *pg, *next_pg; 272 char pi_name[LIFNAMSIZ + 1]; 273 274 if (debug & D_PHYINT) 275 logdebug("initifs: Scanning interfaces\n"); 276 277 last_initifs_time = getcurrenttime(); 278 279 /* 280 * Free the existing local address list; we'll build a new list below. 281 */ 282 addrlist_free(&localaddrs); 283 284 /* 285 * Mark the interfaces so that we can find phyints and logints 286 * which have disappeared from the kernel. pii_process() and 287 * logint_init_from_k() will set {pii,li}_in_use when they find 288 * the interface in the kernel. Also, clear dupaddr bit on probe 289 * logint. check_addr_unique() will set the dupaddr bit on the 290 * probe logint, if the testaddress is not unique. 291 */ 292 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 293 pii->pii_in_use = 0; 294 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 295 li->li_in_use = 0; 296 if (pii->pii_probe_logint == li) 297 li->li_dupaddr = 0; 298 } 299 } 300 301 /* 302 * As above, mark groups so that we can detect IPMP interfaces which 303 * have been removed from the kernel. Also, delete the group address 304 * list since we'll iteratively recreate it below. 305 */ 306 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 307 pg->pg_in_use = _B_FALSE; 308 addrlist_free(&pg->pg_addrs); 309 } 310 311 lifn.lifn_family = AF_UNSPEC; 312 lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 313 again: 314 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 315 logperror("initifs: ioctl (get interface count)"); 316 return; 317 } 318 /* 319 * Pad the interface count to detect when additional interfaces have 320 * been configured between SIOCGLIFNUM and SIOCGLIFCONF. 321 */ 322 lifn.lifn_count += 4; 323 324 if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { 325 logperror("initifs: calloc"); 326 return; 327 } 328 329 lifc.lifc_family = AF_UNSPEC; 330 lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 331 lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); 332 lifc.lifc_buf = buf; 333 334 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 335 logperror("initifs: ioctl (get interface configuration)"); 336 free(buf); 337 return; 338 } 339 340 /* 341 * If every lifr_req slot is taken, then additional interfaces must 342 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. 343 * Recalculate to make sure we didn't miss any interfaces. 344 */ 345 nlifr = lifc.lifc_len / sizeof (struct lifreq); 346 if (nlifr >= lifn.lifn_count) { 347 free(buf); 348 goto again; 349 } 350 351 /* 352 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the 353 * global list of addresses, phyint groups, phyints, and logints. 354 */ 355 for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { 356 af = lifr->lifr_addr.ss_family; 357 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 358 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); 359 360 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 361 if (errno != ENXIO) 362 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 363 continue; 364 } 365 flags = lifreq.lifr_flags; 366 367 /* 368 * If the address is IFF_UP, add it to the local address list. 369 * (We ignore addresses that aren't IFF_UP since another node 370 * might legitimately have that address IFF_UP.) 371 */ 372 if (flags & IFF_UP) { 373 (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, 374 &lifr->lifr_addr); 375 } 376 377 /* 378 * If this address is on an IPMP meta-interface, update our 379 * phyint_group information (either by recording that group 380 * still exists or creating a new group), and track what 381 * group the address is part of. 382 */ 383 if (flags & IFF_IPMP) { 384 if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { 385 if (errno != ENXIO) 386 logperror("initifs: ioctl " 387 "(SIOCGLIFGROUPNAME)"); 388 continue; 389 } 390 391 pg = phyint_group_lookup(lifreq.lifr_groupname); 392 if (pg == NULL) { 393 pg = phyint_group_create(lifreq.lifr_groupname); 394 if (pg == NULL) { 395 logerr("initifs: cannot create group " 396 "%s\n", lifreq.lifr_groupname); 397 continue; 398 } 399 phyint_group_insert(pg); 400 } 401 pg->pg_in_use = _B_TRUE; 402 403 /* 404 * Add this to the group's list of data addresses. 405 */ 406 if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, 407 &lifr->lifr_addr)) { 408 logerr("initifs: insufficient memory to track " 409 "data address information for %s\n", 410 lifr->lifr_name); 411 } 412 continue; 413 } 414 415 /* 416 * This isn't an address on an IPMP meta-interface, so it's 417 * either on an underlying interface or not related to any 418 * group. Update our phyint and logint information (via 419 * pii_process() and logint_init_from_k()) -- but first, 420 * convert the logint name to a phyint name so we can call 421 * pii_process(). 422 */ 423 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 424 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 425 *cp = '\0'; 426 427 if (pii_process(af, pi_name, &pii)) { 428 /* The phyint is fine. So process the logint */ 429 logint_init_from_k(pii, lifr->lifr_name); 430 check_addr_unique(pii, &lifr->lifr_addr); 431 } 432 } 433 free(buf); 434 435 /* 436 * Scan for groups, phyints and logints that have disappeared from the 437 * kernel, and delete them. 438 */ 439 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 440 next_pii = pii->pii_next; 441 check_if_removed(pii); 442 } 443 444 for (pg = phyint_groups; pg != NULL; pg = next_pg) { 445 next_pg = pg->pg_next; 446 if (!pg->pg_in_use) { 447 phyint_group_delete(pg); 448 continue; 449 } 450 /* 451 * Refresh the group's state. This is necessary since the 452 * group's state is defined by the set of usable interfaces in 453 * the group, and an interface is considered unusable if all 454 * of its addresses are down. When an address goes down/up, 455 * the RTM_DELADDR/RTM_NEWADDR brings us through here. 456 */ 457 phyint_group_refresh_state(pg); 458 } 459 460 /* 461 * Select a test address for sending probes on each phyint instance 462 */ 463 select_test_ifs(); 464 465 /* 466 * Handle link up/down notifications. 467 */ 468 process_link_state_changes(); 469 } 470 471 /* 472 * Check that a given test address is unique across all of the interfaces in a 473 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding 474 * to such an (IFF_NOFAILOVER) address can produce unexpected results.) 475 * Any issues will be reported by check_testconfig(). 476 */ 477 static void 478 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 479 { 480 struct phyint *pi; 481 struct phyint_group *pg; 482 struct in6_addr addr; 483 struct phyint_instance *pii; 484 struct sockaddr_in *sin; 485 486 if (ss->ss_family == AF_INET) { 487 sin = (struct sockaddr_in *)ss; 488 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 489 } else { 490 assert(ss->ss_family == AF_INET6); 491 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 492 } 493 494 /* 495 * For anonymous groups, every interface is assumed to be on its own 496 * link, so there is no chance of overlapping addresses. 497 */ 498 pg = ourpii->pii_phyint->pi_group; 499 if (pg == phyint_anongroup) 500 return; 501 502 /* 503 * Walk the list of phyint instances in the group and check for test 504 * addresses matching ours. Of course, we skip ourself. 505 */ 506 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 507 pii = PHYINT_INSTANCE(pi, ss->ss_family); 508 if (pii == NULL || pii == ourpii || 509 pii->pii_probe_logint == NULL) 510 continue; 511 512 /* 513 * If this test address is not unique, set the dupaddr bit. 514 */ 515 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr)) 516 pii->pii_probe_logint->li_dupaddr = 1; 517 } 518 } 519 520 /* 521 * Stop probing an interface. Called when an interface is offlined. 522 * The probe socket is closed on each interface instance, and the 523 * interface state set to PI_OFFLINE. 524 */ 525 void 526 stop_probing(struct phyint *pi) 527 { 528 struct phyint_instance *pii; 529 530 pii = pi->pi_v4; 531 if (pii != NULL) { 532 if (pii->pii_probe_sock != -1) 533 close_probe_socket(pii, _B_TRUE); 534 pii->pii_probe_logint = NULL; 535 } 536 537 pii = pi->pi_v6; 538 if (pii != NULL) { 539 if (pii->pii_probe_sock != -1) 540 close_probe_socket(pii, _B_TRUE); 541 pii->pii_probe_logint = NULL; 542 } 543 544 phyint_chstate(pi, PI_OFFLINE); 545 } 546 547 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 548 549 /* 550 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 551 * IFF_UP must also be set so that the associated address can be used as a 552 * source address. Further, we must be able to exchange packets with local 553 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 554 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 555 */ 556 static int 557 rate_testflags(uint64_t flags) 558 { 559 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 560 return (BAD_TESTFLAGS); 561 562 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 563 return (BAD_TESTFLAGS); 564 565 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 566 return (BEST_TESTFLAGS); 567 568 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 569 return (BEST_TESTFLAGS); 570 571 return (OK_TESTFLAGS); 572 } 573 574 /* 575 * Attempt to select a test address for each phyint instance. 576 * Call phyint_inst_sockinit() to complete the initializations. 577 */ 578 static void 579 select_test_ifs(void) 580 { 581 struct phyint *pi; 582 struct phyint_instance *pii; 583 struct phyint_instance *next_pii; 584 struct logint *li; 585 struct logint *probe_logint; 586 boolean_t target_scan_reqd = _B_FALSE; 587 int rating; 588 589 if (debug & D_PHYINT) 590 logdebug("select_test_ifs\n"); 591 592 /* 593 * For each phyint instance, do the test address selection 594 */ 595 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 596 next_pii = pii->pii_next; 597 probe_logint = NULL; 598 599 /* 600 * An interface that is offline should not be probed. 601 * IFF_OFFLINE interfaces should always be PI_OFFLINE 602 * unless some other entity has set the offline flag. 603 */ 604 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 605 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 606 logerr("shouldn't be probing offline" 607 " interface %s (state is: %u)." 608 " Stopping probes.\n", 609 pii->pii_phyint->pi_name, 610 pii->pii_phyint->pi_state); 611 stop_probing(pii->pii_phyint); 612 } 613 continue; 614 } else { 615 /* 616 * If something cleared IFF_OFFLINE (e.g., by accident 617 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is 618 * inherently racy), the phyint may still be offline. 619 * Just ignore it. 620 */ 621 if (pii->pii_phyint->pi_state == PI_OFFLINE) 622 continue; 623 } 624 625 li = pii->pii_probe_logint; 626 if (li != NULL) { 627 /* 628 * We've already got a test address; only proceed 629 * if it's suboptimal. 630 */ 631 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 632 continue; 633 } 634 635 /* 636 * Walk the logints of this phyint instance, and select 637 * the best available test address 638 */ 639 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 640 /* 641 * Skip 0.0.0.0 addresses, as those are never 642 * actually usable. 643 */ 644 if (pii->pii_af == AF_INET && 645 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr)) 646 continue; 647 648 /* 649 * Skip any IPv6 logints that are not link-local, 650 * since we should always have a link-local address 651 * anyway and in6_data() expects link-local replies. 652 */ 653 if (pii->pii_af == AF_INET6 && 654 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 655 continue; 656 657 /* 658 * Rate the testflags. If we've found an optimal 659 * match, then break out; otherwise, record the most 660 * recent OK one. 661 */ 662 rating = rate_testflags(li->li_flags); 663 if (rating == BAD_TESTFLAGS) 664 continue; 665 666 probe_logint = li; 667 if (rating == BEST_TESTFLAGS) 668 break; 669 } 670 671 /* 672 * If the probe logint has changed, ditch the old one. 673 */ 674 if (pii->pii_probe_logint != NULL && 675 pii->pii_probe_logint != probe_logint) { 676 if (pii->pii_probe_sock != -1) 677 close_probe_socket(pii, _B_TRUE); 678 pii->pii_probe_logint = NULL; 679 } 680 681 if (probe_logint == NULL) { 682 /* 683 * We don't have a test address; zero out the probe 684 * stats array since it is no longer relevant. 685 * Optimize by checking if it is already zeroed out. 686 */ 687 int pr_ndx; 688 689 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 690 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 691 clear_pii_probe_stats(pii); 692 reset_crtt_all(pii->pii_phyint); 693 } 694 continue; 695 } else if (probe_logint == pii->pii_probe_logint) { 696 /* 697 * If we didn't find any new test addr, go to the 698 * next phyint. 699 */ 700 continue; 701 } 702 703 /* 704 * The phyint is either being assigned a new testaddr 705 * or is being assigned a testaddr for the 1st time. 706 * Need to initialize the phyint socket 707 */ 708 pii->pii_probe_logint = probe_logint; 709 if (!phyint_inst_sockinit(pii)) { 710 if (debug & D_PHYINT) { 711 logdebug("select_test_ifs: " 712 "phyint_sockinit failed\n"); 713 } 714 phyint_inst_delete(pii); 715 continue; 716 } 717 718 /* 719 * This phyint instance is now enabled for probes; this 720 * impacts our state machine in two ways: 721 * 722 * 1. If we're probe *capable* as well (i.e., we have 723 * probe targets) and the interface is in PI_NOTARGETS, 724 * then transition to PI_RUNNING. 725 * 726 * 2. If we're not probe capable, and the other phyint 727 * instance is also not probe capable, and we were in 728 * PI_RUNNING, then transition to PI_NOTARGETS. 729 * 730 * Also see the state diagram in mpd_probe.c. 731 */ 732 if (PROBE_CAPABLE(pii)) { 733 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 734 phyint_chstate(pii->pii_phyint, PI_RUNNING); 735 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 736 if (pii->pii_phyint->pi_state == PI_RUNNING) 737 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 738 } 739 740 /* 741 * If no targets are currently known for this phyint 742 * we need to call init_router_targets. Since 743 * init_router_targets() initializes the list of targets 744 * for all phyints it is done below the loop. 745 */ 746 if (pii->pii_targets == NULL) 747 target_scan_reqd = _B_TRUE; 748 749 /* 750 * Start the probe timer for this instance. 751 */ 752 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) { 753 start_timer(pii); 754 pii->pii_basetime_inited = 1; 755 } 756 } 757 758 /* 759 * Scan the interface list for any interfaces that are PI_FAILED or 760 * PI_NOTARGETS but no longer enabled to send probes, and call 761 * phyint_check_for_repair() to see if the link state indicates that 762 * the interface should be repaired. Also see the state diagram in 763 * mpd_probe.c. 764 */ 765 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 766 if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && 767 (pi->pi_state == PI_FAILED || 768 pi->pi_state == PI_NOTARGETS)) { 769 phyint_check_for_repair(pi); 770 } 771 } 772 773 check_testconfig(); 774 775 /* 776 * Try to populate the target list. init_router_targets populates 777 * the target list from the routing table. If our target list is 778 * still empty, init_host_targets adds host targets based on the 779 * host target list of other phyints in the group. 780 */ 781 if (target_scan_reqd) { 782 init_router_targets(); 783 init_host_targets(); 784 } 785 } 786 787 /* 788 * Check test address configuration, and log notices/errors if appropriate. 789 * Note that this function only logs pre-existing conditions (e.g., that 790 * probe-based failure detection is disabled). 791 */ 792 static void 793 check_testconfig(void) 794 { 795 struct phyint *pi; 796 struct logint *li; 797 char abuf[INET6_ADDRSTRLEN]; 798 int pri; 799 800 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 801 if (pi->pi_flags & IFF_OFFLINE) 802 continue; 803 804 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) { 805 if (pi->pi_taddrmsg_printed || 806 pi->pi_duptaddrmsg_printed) { 807 if (pi->pi_duptaddrmsg_printed) 808 pri = LOG_ERR; 809 else 810 pri = LOG_INFO; 811 logmsg(pri, "Test address now configured on " 812 "interface %s; enabling probe-based " 813 "failure detection on it\n", pi->pi_name); 814 pi->pi_taddrmsg_printed = 0; 815 pi->pi_duptaddrmsg_printed = 0; 816 } 817 continue; 818 } 819 820 li = NULL; 821 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL && 822 pi->pi_v4->pii_probe_logint->li_dupaddr) 823 li = pi->pi_v4->pii_probe_logint; 824 825 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL && 826 pi->pi_v6->pii_probe_logint->li_dupaddr) 827 li = pi->pi_v6->pii_probe_logint; 828 829 if (li != NULL && li->li_dupaddr) { 830 if (pi->pi_duptaddrmsg_printed) 831 continue; 832 logerr("Test address %s is not unique in group; " 833 "disabling probe-based failure detection on %s\n", 834 pr_addr(li->li_phyint_inst->pii_af, 835 li->li_addr, abuf, sizeof (abuf)), pi->pi_name); 836 pi->pi_duptaddrmsg_printed = 1; 837 continue; 838 } 839 840 if (getcurrentsec() < pi->pi_taddrthresh) 841 continue; 842 843 if (!pi->pi_taddrmsg_printed) { 844 logtrace("No test address configured on interface %s; " 845 "disabling probe-based failure detection on it\n", 846 pi->pi_name); 847 pi->pi_taddrmsg_printed = 1; 848 } 849 } 850 } 851 852 /* 853 * Check phyint group configuration, to detect any inconsistencies, 854 * and log an error message. This is called from runtimeouts every 855 * 20 secs. But the error message is displayed once. If the 856 * consistency is resolved by the admin, a recovery message is displayed 857 * once. 858 */ 859 static void 860 check_config(void) 861 { 862 struct phyint_group *pg; 863 struct phyint *pi; 864 boolean_t v4_in_group; 865 boolean_t v6_in_group; 866 867 /* 868 * All phyints of a group must be homogeneous to ensure that they can 869 * take over for one another. If any phyint in a group has IPv4 870 * plumbed, check that all phyints have IPv4 plumbed. Do a similar 871 * check for IPv6. 872 */ 873 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 874 if (pg == phyint_anongroup) 875 continue; 876 877 v4_in_group = _B_FALSE; 878 v6_in_group = _B_FALSE; 879 /* 880 * 1st pass. Determine if at least 1 phyint in the group 881 * has IPv4 plumbed and if so set v4_in_group to true. 882 * Repeat similarly for IPv6. 883 */ 884 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 885 if (pi->pi_v4 != NULL) 886 v4_in_group = _B_TRUE; 887 if (pi->pi_v6 != NULL) 888 v6_in_group = _B_TRUE; 889 } 890 891 /* 892 * 2nd pass. If v4_in_group is true, check that phyint 893 * has IPv4 plumbed. Repeat similarly for IPv6. Print 894 * out a message the 1st time only. 895 */ 896 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 897 if (pi->pi_flags & IFF_OFFLINE) 898 continue; 899 900 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 901 if (!pi->pi_cfgmsg_printed) { 902 logerr("IP interface %s in group %s is" 903 " not plumbed for IPv4, affecting" 904 " IPv4 connectivity\n", 905 pi->pi_name, 906 pi->pi_group->pg_name); 907 pi->pi_cfgmsg_printed = 1; 908 } 909 } else if (v6_in_group == _B_TRUE && 910 pi->pi_v6 == NULL) { 911 if (!pi->pi_cfgmsg_printed) { 912 logerr("IP interface %s in group %s is" 913 " not plumbed for IPv6, affecting" 914 " IPv6 connectivity\n", 915 pi->pi_name, 916 pi->pi_group->pg_name); 917 pi->pi_cfgmsg_printed = 1; 918 } 919 } else { 920 /* 921 * The phyint matches the group configuration, 922 * if we have reached this point. If it was 923 * improperly configured earlier, log an 924 * error recovery message 925 */ 926 if (pi->pi_cfgmsg_printed) { 927 logerr("IP interface %s is now" 928 " consistent with group %s " 929 " and connectivity is restored\n", 930 pi->pi_name, pi->pi_group->pg_name); 931 pi->pi_cfgmsg_printed = 0; 932 } 933 } 934 935 } 936 } 937 } 938 939 /* 940 * Timer mechanism using relative time (in milliseconds) from the 941 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 942 * will fire after TIMER_INFINITY milliseconds. 943 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 944 * time values. Hence 2 consecutive timer events cannot be spaced farther 945 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value 946 * that can be passed for the delay parameter of timer_schedule() 947 */ 948 static uint_t timer_next; /* Currently scheduled timeout */ 949 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 950 951 static void 952 timer_init(void) 953 { 954 timer_next = getcurrenttime() + TIMER_INFINITY; 955 /* 956 * The call to run_timeouts() will get the timer started 957 * Since there are no phyints at this point, the timer will 958 * be set for IF_SCAN_INTERVAL ms. 959 */ 960 run_timeouts(); 961 } 962 963 /* 964 * Make sure the next SIGALRM occurs delay milliseconds from the current 965 * time if not earlier. We are interested only in time differences. 966 */ 967 void 968 timer_schedule(uint_t delay) 969 { 970 uint_t now; 971 struct itimerval itimerval; 972 973 if (debug & D_TIMER) 974 logdebug("timer_schedule(%u)\n", delay); 975 976 assert(delay <= TIMER_INFINITY); 977 978 now = getcurrenttime(); 979 if (delay == 0) { 980 /* Minimum allowed delay */ 981 delay = 1; 982 } 983 /* Will this timer occur before the currently scheduled SIGALRM? */ 984 if (timer_active && TIME_GE(now + delay, timer_next)) { 985 if (debug & D_TIMER) { 986 logdebug("timer_schedule(%u) - no action: " 987 "now %u next %u\n", delay, now, timer_next); 988 } 989 return; 990 } 991 timer_next = now + delay; 992 993 itimerval.it_value.tv_sec = delay / 1000; 994 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 995 itimerval.it_interval.tv_sec = 0; 996 itimerval.it_interval.tv_usec = 0; 997 if (debug & D_TIMER) { 998 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 999 delay, itimerval.it_value.tv_sec, 1000 itimerval.it_value.tv_usec); 1001 } 1002 timer_active = _B_TRUE; 1003 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1004 logperror("timer_schedule: setitimer"); 1005 exit(2); 1006 } 1007 } 1008 1009 /* 1010 * Timer has fired. Determine when the next timer event will occur by asking 1011 * all the timer routines. Should not be called from a timer routine. 1012 */ 1013 static void 1014 run_timeouts(void) 1015 { 1016 uint_t next; 1017 uint_t next_event_time; 1018 struct phyint_instance *pii; 1019 struct phyint_instance *next_pii; 1020 static boolean_t timeout_running; 1021 1022 /* assert that recursive timeouts don't happen. */ 1023 assert(!timeout_running); 1024 1025 timeout_running = _B_TRUE; 1026 1027 if (debug & D_TIMER) 1028 logdebug("run_timeouts()\n"); 1029 1030 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1031 initifs(); 1032 check_config(); 1033 } 1034 1035 next = TIMER_INFINITY; 1036 1037 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1038 next_pii = pii->pii_next; 1039 next_event_time = phyint_inst_timer(pii); 1040 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1041 next = next_event_time; 1042 1043 if (debug & D_TIMER) { 1044 logdebug("run_timeouts(%s %s): next scheduled for" 1045 " this phyint inst %u, next scheduled global" 1046 " %u ms\n", 1047 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1048 next_event_time, next); 1049 } 1050 } 1051 1052 /* 1053 * Make sure initifs() is called at least once every 1054 * IF_SCAN_INTERVAL, to make sure that we are in sync 1055 * with the kernel, in case we have missed any routing 1056 * socket messages. 1057 */ 1058 if (next > IF_SCAN_INTERVAL) 1059 next = IF_SCAN_INTERVAL; 1060 1061 if (debug & D_TIMER) 1062 logdebug("run_timeouts: %u ms\n", next); 1063 1064 timer_schedule(next); 1065 timeout_running = _B_FALSE; 1066 } 1067 1068 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1069 static int eventpipe_write = -1; 1070 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ 1071 1072 /* 1073 * Ensure that signals are processed synchronously with the rest of 1074 * the code by just writing a one character signal number on the pipe. 1075 * The poll loop will pick this up and process the signal event. 1076 */ 1077 static void 1078 sig_handler(int signo) 1079 { 1080 uchar_t buf = (uchar_t)signo; 1081 1082 /* 1083 * Don't write to pipe if cleanup has already begun. cleanup() 1084 * might have closed the pipe already 1085 */ 1086 if (cleanup_started) 1087 return; 1088 1089 if (eventpipe_write == -1) { 1090 logerr("sig_handler: no pipe found\n"); 1091 return; 1092 } 1093 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1094 logperror("sig_handler: write"); 1095 } 1096 1097 extern struct probes_missed probes_missed; 1098 1099 /* 1100 * Pick up a signal "byte" from the pipe and process it. 1101 */ 1102 static void 1103 in_signal(int fd) 1104 { 1105 uchar_t buf; 1106 uint64_t sent, acked, lost, unacked, unknown; 1107 struct phyint_instance *pii; 1108 int pr_ndx; 1109 1110 switch (read(fd, &buf, sizeof (buf))) { 1111 case -1: 1112 logperror("in_signal: read"); 1113 exit(1); 1114 /* NOTREACHED */ 1115 case 1: 1116 break; 1117 case 0: 1118 logerr("in_signal: read end of file\n"); 1119 exit(1); 1120 /* NOTREACHED */ 1121 default: 1122 logerr("in_signal: read > 1\n"); 1123 exit(1); 1124 } 1125 1126 if (debug & D_TIMER) 1127 logdebug("in_signal() got %d\n", buf); 1128 1129 switch (buf) { 1130 case SIGALRM: 1131 if (debug & D_TIMER) { 1132 uint_t now = getcurrenttime(); 1133 1134 logdebug("in_signal(SIGALRM) delta %u\n", 1135 now - timer_next); 1136 } 1137 timer_active = _B_FALSE; 1138 run_timeouts(); 1139 break; 1140 case SIGUSR1: 1141 logdebug("Printing configuration:\n"); 1142 /* Print out the internal tables */ 1143 phyint_inst_print_all(); 1144 1145 /* 1146 * Print out the accumulated statistics about missed 1147 * probes (happens due to scheduling delay). 1148 */ 1149 logerr("Missed sending total of %d probes spread over" 1150 " %d occurrences\n", probes_missed.pm_nprobes, 1151 probes_missed.pm_ntimes); 1152 1153 /* 1154 * Print out the accumulated statistics about probes 1155 * that were sent. 1156 */ 1157 for (pii = phyint_instances; pii != NULL; 1158 pii = pii->pii_next) { 1159 unacked = 0; 1160 acked = pii->pii_cum_stats.acked; 1161 lost = pii->pii_cum_stats.lost; 1162 sent = pii->pii_cum_stats.sent; 1163 unknown = pii->pii_cum_stats.unknown; 1164 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1165 switch (pii->pii_probes[pr_ndx].pr_status) { 1166 case PR_ACKED: 1167 acked++; 1168 break; 1169 case PR_LOST: 1170 lost++; 1171 break; 1172 case PR_UNACKED: 1173 unacked++; 1174 break; 1175 } 1176 } 1177 logerr("\nProbe stats on (%s %s)\n" 1178 "Number of probes sent %lld\n" 1179 "Number of probe acks received %lld\n" 1180 "Number of probes/acks lost %lld\n" 1181 "Number of valid unacknowledged probes %lld\n" 1182 "Number of ambiguous probe acks received %lld\n", 1183 AF_STR(pii->pii_af), pii->pii_name, 1184 sent, acked, lost, unacked, unknown); 1185 } 1186 break; 1187 case SIGHUP: 1188 logerr("SIGHUP: restart and reread config file\n"); 1189 cleanup(); 1190 (void) execv(argv0[0], argv0); 1191 _exit(0177); 1192 /* NOTREACHED */ 1193 case SIGINT: 1194 case SIGTERM: 1195 case SIGQUIT: 1196 cleanup(); 1197 exit(0); 1198 /* NOTREACHED */ 1199 default: 1200 logerr("in_signal: unknown signal: %d\n", buf); 1201 } 1202 } 1203 1204 static void 1205 cleanup(void) 1206 { 1207 struct phyint_instance *pii; 1208 struct phyint_instance *next_pii; 1209 1210 /* 1211 * Make sure that we don't write to eventpipe in 1212 * sig_handler() if any signal notably SIGALRM, 1213 * occurs after we close the eventpipe descriptor below 1214 */ 1215 cleanup_started = _B_TRUE; 1216 1217 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1218 next_pii = pii->pii_next; 1219 phyint_inst_delete(pii); 1220 } 1221 1222 (void) close(ifsock_v4); 1223 (void) close(ifsock_v6); 1224 (void) close(rtsock_v4); 1225 (void) close(rtsock_v6); 1226 (void) close(lsock_v4); 1227 (void) close(lsock_v6); 1228 (void) close(0); 1229 (void) close(1); 1230 (void) close(2); 1231 (void) close(mibfd); 1232 (void) close(eventpipe_read); 1233 (void) close(eventpipe_write); 1234 } 1235 1236 /* 1237 * Create pipe for signal delivery and set up signal handlers. 1238 */ 1239 static void 1240 setup_eventpipe(void) 1241 { 1242 int fds[2]; 1243 struct sigaction act; 1244 1245 if ((pipe(fds)) < 0) { 1246 logperror("setup_eventpipe: pipe"); 1247 exit(1); 1248 } 1249 eventpipe_read = fds[0]; 1250 eventpipe_write = fds[1]; 1251 if (poll_add(eventpipe_read) == -1) { 1252 exit(1); 1253 } 1254 1255 act.sa_handler = sig_handler; 1256 act.sa_flags = SA_RESTART; 1257 (void) sigaction(SIGALRM, &act, NULL); 1258 1259 (void) sigset(SIGHUP, sig_handler); 1260 (void) sigset(SIGUSR1, sig_handler); 1261 (void) sigset(SIGTERM, sig_handler); 1262 (void) sigset(SIGINT, sig_handler); 1263 (void) sigset(SIGQUIT, sig_handler); 1264 } 1265 1266 /* 1267 * Create a routing socket for receiving RTM_IFINFO messages. 1268 */ 1269 static int 1270 setup_rtsock(int af) 1271 { 1272 int s; 1273 int flags; 1274 int aware = RTAW_UNDER_IPMP; 1275 1276 s = socket(PF_ROUTE, SOCK_RAW, af); 1277 if (s == -1) { 1278 logperror("setup_rtsock: socket PF_ROUTE"); 1279 exit(1); 1280 } 1281 1282 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { 1283 logperror("setup_rtsock: setsockopt RT_AWARE"); 1284 (void) close(s); 1285 exit(1); 1286 } 1287 1288 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1289 logperror("setup_rtsock: fcntl F_GETFL"); 1290 (void) close(s); 1291 exit(1); 1292 } 1293 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1294 logperror("setup_rtsock: fcntl F_SETFL"); 1295 (void) close(s); 1296 exit(1); 1297 } 1298 if (poll_add(s) == -1) { 1299 (void) close(s); 1300 exit(1); 1301 } 1302 return (s); 1303 } 1304 1305 /* 1306 * Process an RTM_IFINFO message received on a routing socket. 1307 * The return value indicates whether a full interface scan is required. 1308 * Link up/down notifications are reflected in the IFF_RUNNING flag. 1309 * If just the state of the IFF_RUNNING interface flag has changed, a 1310 * a full interface scan isn't required. 1311 */ 1312 static boolean_t 1313 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1314 { 1315 struct sockaddr_dl *sdl; 1316 struct phyint *pi; 1317 uint64_t old_flags; 1318 struct phyint_instance *pii; 1319 1320 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1321 1322 /* 1323 * Although the sockaddr_dl structure is directly after the 1324 * if_msghdr_t structure. At the time of writing, the size of the 1325 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1326 * to the presence of a timeval structure, which contains longs, 1327 * in the if_data structure. Anyway, we know where the message ends, 1328 * so we work backwards to get the start of the sockaddr_dl structure. 1329 */ 1330 /*LINTED*/ 1331 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1332 sizeof (struct sockaddr_dl)); 1333 1334 assert(sdl->sdl_family == AF_LINK); 1335 1336 /* 1337 * The interface name is in sdl_data. 1338 * RTM_IFINFO messages are only generated for logical interface 1339 * zero, so there is no colon and logical interface number to 1340 * strip from the name. The name is not null terminated, but 1341 * there should be enough space in sdl_data to add the null. 1342 */ 1343 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1344 if (debug & D_LINKNOTE) 1345 logdebug("process_rtm_ifinfo: phyint name too long\n"); 1346 return (_B_TRUE); 1347 } 1348 sdl->sdl_data[sdl->sdl_nlen] = 0; 1349 1350 pi = phyint_lookup(sdl->sdl_data); 1351 if (pi == NULL) { 1352 if (debug & D_LINKNOTE) 1353 logdebug("process_rtm_ifinfo: phyint lookup failed" 1354 " for %s\n", sdl->sdl_data); 1355 return (_B_TRUE); 1356 } 1357 1358 /* 1359 * We want to try and avoid doing a full interface scan for 1360 * link state notifications from the datalink layer, as indicated 1361 * by the state of the IFF_RUNNING flag. If just the 1362 * IFF_RUNNING flag has changed state, the link state changes 1363 * are processed without a full scan. 1364 * If there is both an IPv4 and IPv6 instance associated with 1365 * the physical interface, we will get an RTM_IFINFO message 1366 * for each instance. If we just maintained a single copy of 1367 * the physical interface flags, it would appear that no flags 1368 * had changed when the second message is processed, leading us 1369 * to believe that the message wasn't generated by a flags change, 1370 * and that a full interface scan is required. 1371 * To get around this problem, two additional copies of the flags 1372 * are kept, one copy for each instance. These are only used in 1373 * this routine. At any one time, all three copies of the flags 1374 * should be identical except for the IFF_RUNNING flag. The 1375 * copy of the flags in the "phyint" structure is always up to 1376 * date. 1377 */ 1378 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1379 if (pii == NULL) { 1380 if (debug & D_LINKNOTE) 1381 logdebug("process_rtm_ifinfo: no instance of address " 1382 "family %s for %s\n", AF_STR(type), pi->pi_name); 1383 return (_B_TRUE); 1384 } 1385 1386 old_flags = pii->pii_flags; 1387 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1388 pi->pi_flags = pii->pii_flags; 1389 1390 if (debug & D_LINKNOTE) { 1391 logdebug("process_rtm_ifinfo: %s address family: %s, " 1392 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1393 AF_STR(type), old_flags, pi->pi_flags); 1394 } 1395 1396 /* 1397 * If IFF_STANDBY has changed, indicate that the interface has changed 1398 * types. 1399 */ 1400 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1401 phyint_changed(pi); 1402 1403 /* Has just the IFF_RUNNING flag changed state ? */ 1404 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1405 struct phyint_instance *pii_other; 1406 /* 1407 * It wasn't just a link state change. Update 1408 * the other instance's copy of the flags. 1409 */ 1410 pii_other = phyint_inst_other(pii); 1411 if (pii_other != NULL) 1412 pii_other->pii_flags = pii->pii_flags; 1413 return (_B_TRUE); 1414 } 1415 1416 return (_B_FALSE); 1417 } 1418 1419 /* 1420 * Retrieve as many routing socket messages as possible, and try to 1421 * empty the routing sockets. Initiate full scan of targets or interfaces 1422 * as needed. 1423 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1424 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1425 */ 1426 static void 1427 process_rtsock(int rtsock_v4, int rtsock_v6) 1428 { 1429 int nbytes; 1430 int64_t msg[2048 / 8]; 1431 struct rt_msghdr *rtm; 1432 boolean_t need_if_scan = _B_FALSE; 1433 boolean_t need_rt_scan = _B_FALSE; 1434 boolean_t rtm_ifinfo_seen = _B_FALSE; 1435 int type; 1436 1437 /* Read as many messages as possible and try to empty the sockets */ 1438 for (type = AF_INET; ; type = AF_INET6) { 1439 for (;;) { 1440 nbytes = read((type == AF_INET) ? rtsock_v4 : 1441 rtsock_v6, msg, sizeof (msg)); 1442 if (nbytes <= 0) { 1443 /* No more messages */ 1444 break; 1445 } 1446 rtm = (struct rt_msghdr *)msg; 1447 if (rtm->rtm_version != RTM_VERSION) { 1448 logerr("process_rtsock: version %d " 1449 "not understood\n", rtm->rtm_version); 1450 break; 1451 } 1452 1453 if (debug & D_PHYINT) { 1454 logdebug("process_rtsock: message %d\n", 1455 rtm->rtm_type); 1456 } 1457 1458 switch (rtm->rtm_type) { 1459 case RTM_NEWADDR: 1460 case RTM_DELADDR: 1461 /* 1462 * Some logical interface has changed, 1463 * have to scan everything to determine 1464 * what actually changed. 1465 */ 1466 need_if_scan = _B_TRUE; 1467 break; 1468 1469 case RTM_IFINFO: 1470 rtm_ifinfo_seen = _B_TRUE; 1471 need_if_scan |= process_rtm_ifinfo( 1472 (if_msghdr_t *)rtm, type); 1473 break; 1474 1475 case RTM_ADD: 1476 case RTM_DELETE: 1477 case RTM_CHANGE: 1478 case RTM_OLDADD: 1479 case RTM_OLDDEL: 1480 need_rt_scan = _B_TRUE; 1481 break; 1482 1483 default: 1484 /* Not interesting */ 1485 break; 1486 } 1487 } 1488 if (type == AF_INET6) 1489 break; 1490 } 1491 1492 if (need_if_scan) { 1493 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1494 logdebug("process_rtsock: synchronizing with kernel\n"); 1495 initifs(); 1496 } else if (rtm_ifinfo_seen) { 1497 if (debug & D_LINKNOTE) 1498 logdebug("process_rtsock: " 1499 "link up/down notification(s) seen\n"); 1500 process_link_state_changes(); 1501 } 1502 1503 if (need_rt_scan) 1504 init_router_targets(); 1505 } 1506 1507 /* 1508 * Look if the phyint instance or one of its logints have been removed from 1509 * the kernel and take appropriate action. 1510 * Uses {pii,li}_in_use. 1511 */ 1512 static void 1513 check_if_removed(struct phyint_instance *pii) 1514 { 1515 struct logint *li; 1516 struct logint *next_li; 1517 1518 /* Detect phyints that have been removed from the kernel. */ 1519 if (!pii->pii_in_use) { 1520 logtrace("%s %s has been removed from kernel\n", 1521 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1522 phyint_inst_delete(pii); 1523 } else { 1524 /* Detect logints that have been removed. */ 1525 for (li = pii->pii_logint; li != NULL; li = next_li) { 1526 next_li = li->li_next; 1527 if (!li->li_in_use) { 1528 logint_delete(li); 1529 } 1530 } 1531 } 1532 } 1533 1534 /* 1535 * Parse the supplied mib2 information to extract the routing information 1536 * table. Process the routing table to get the list of known onlink routers 1537 * and update our database. These onlink routers will serve as probe 1538 * targets. 1539 */ 1540 static void 1541 update_router_list(mib_item_t *item) 1542 { 1543 for (; item != NULL; item = item->mi_next) { 1544 if (item->mi_opthdr.name == 0) 1545 continue; 1546 if (item->mi_opthdr.level == MIB2_IP && 1547 item->mi_opthdr.name == MIB2_IP_ROUTE) { 1548 ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp, 1549 item->mi_opthdr.len); 1550 } else if (item->mi_opthdr.level == MIB2_IP6 && 1551 item->mi_opthdr.name == MIB2_IP6_ROUTE) { 1552 ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp, 1553 item->mi_opthdr.len); 1554 } 1555 } 1556 } 1557 1558 1559 /* 1560 * Convert octet `octp' to a phyint name and store in `ifname' 1561 */ 1562 static void 1563 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) 1564 { 1565 char *cp; 1566 size_t len = MIN(octp->o_length, ifsize - 1); 1567 1568 (void) strncpy(ifname, octp->o_bytes, len); 1569 ifname[len] = '\0'; 1570 1571 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) 1572 *cp = '\0'; 1573 } 1574 1575 /* 1576 * Examine the IPv4 routing table `buf' for possible targets. For each 1577 * possible target, if it's on the same subnet an interface route, pass 1578 * it to router_add_common() for further consideration. 1579 */ 1580 static void 1581 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1582 { 1583 char ifname[LIFNAMSIZ]; 1584 mib2_ipRouteEntry_t *rp, *rp1, *endp; 1585 struct in_addr nexthop_v4; 1586 struct in6_addr nexthop; 1587 1588 if (debug & D_TARGET) 1589 logdebug("ire_process_v4(len %d)\n", len); 1590 1591 if (len == 0) 1592 return; 1593 1594 assert((len % ipRouteEntrySize) == 0); 1595 endp = buf + (len / ipRouteEntrySize); 1596 1597 /* 1598 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1599 * cross-reference them with the interface routes to determine if 1600 * they're possible probe targets. 1601 */ 1602 for (rp = buf; rp < endp; rp++) { 1603 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1604 continue; 1605 1606 /* Get the nexthop address. */ 1607 nexthop_v4.s_addr = rp->ipRouteNextHop; 1608 1609 /* 1610 * Rescan the routing table looking for interface routes that 1611 * are on the same subnet, and try to add them. If they're 1612 * not relevant (e.g., the interface route isn't part of an 1613 * IPMP group, router_add_common() will discard). 1614 */ 1615 for (rp1 = buf; rp1 < endp; rp1++) { 1616 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || 1617 rp1->ipRouteIfIndex.o_length == 0) 1618 continue; 1619 1620 if ((rp1->ipRouteDest & rp1->ipRouteMask) != 1621 (nexthop_v4.s_addr & rp1->ipRouteMask)) 1622 continue; 1623 1624 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); 1625 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1626 router_add_common(AF_INET, ifname, nexthop); 1627 } 1628 } 1629 } 1630 1631 void 1632 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1633 { 1634 struct phyint_instance *pii; 1635 struct phyint *pi; 1636 1637 if (debug & D_TARGET) 1638 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1639 1640 /* 1641 * Retrieve the phyint instance; bail if it's not known to us yet. 1642 */ 1643 pii = phyint_inst_lookup(af, ifname); 1644 if (pii == NULL) 1645 return; 1646 1647 /* 1648 * Don't use our own addresses as targets. 1649 */ 1650 if (own_address(nexthop)) 1651 return; 1652 1653 /* 1654 * If the phyint is part a named group, then add the address to all 1655 * members of the group; note that this is suboptimal in the IPv4 case 1656 * as it has already been added to all matching interfaces in 1657 * ire_process_v4(). Otherwise, add the address only to the phyint 1658 * itself, since other phyints in the anongroup may not be on the same 1659 * subnet. 1660 */ 1661 pi = pii->pii_phyint; 1662 if (pi->pi_group == phyint_anongroup) { 1663 target_add(pii, nexthop, _B_TRUE); 1664 } else { 1665 pi = pi->pi_group->pg_phyint; 1666 for (; pi != NULL; pi = pi->pi_pgnext) 1667 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1668 } 1669 } 1670 1671 /* 1672 * Examine the IPv6 routing table `buf' for possible link-local targets, and 1673 * pass any contenders to router_add_common() for further consideration. 1674 */ 1675 static void 1676 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1677 { 1678 struct lifreq lifr; 1679 char ifname[LIFNAMSIZ]; 1680 char grname[LIFGRNAMSIZ]; 1681 mib2_ipv6RouteEntry_t *rp, *rp1, *endp; 1682 struct in6_addr nexthop_v6; 1683 1684 if (debug & D_TARGET) 1685 logdebug("ire_process_v6(len %d)\n", len); 1686 1687 if (len == 0) 1688 return; 1689 1690 assert((len % ipv6RouteEntrySize) == 0); 1691 endp = buf + (len / ipv6RouteEntrySize); 1692 1693 /* 1694 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1695 * cross-reference them with the interface routes to determine if 1696 * they're possible probe targets. 1697 */ 1698 for (rp = buf; rp < endp; rp++) { 1699 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || 1700 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) 1701 continue; 1702 1703 /* Get the nexthop address. */ 1704 nexthop_v6 = rp->ipv6RouteNextHop; 1705 1706 /* 1707 * The interface name should always exist for link-locals; 1708 * we use it to map this entry to an IPMP group name. 1709 */ 1710 if (rp->ipv6RouteIfIndex.o_length == 0) 1711 continue; 1712 1713 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); 1714 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || 1715 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { 1716 continue; 1717 } 1718 1719 /* 1720 * Rescan the list of routes for interface routes, and add the 1721 * above target to any interfaces in the same IPMP group. 1722 */ 1723 for (rp1 = buf; rp1 < endp; rp1++) { 1724 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || 1725 rp1->ipv6RouteIfIndex.o_length == 0) { 1726 continue; 1727 } 1728 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); 1729 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); 1730 1731 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && 1732 strcmp(lifr.lifr_groupname, grname) == 0) { 1733 router_add_common(AF_INET6, ifname, nexthop_v6); 1734 } 1735 } 1736 } 1737 } 1738 1739 /* 1740 * Build a list of target routers, by scanning the routing tables. 1741 * It is assumed that interface routes exist, to reach the routers. 1742 */ 1743 static void 1744 init_router_targets(void) 1745 { 1746 struct target *tg; 1747 struct target *next_tg; 1748 struct phyint_instance *pii; 1749 struct phyint *pi; 1750 1751 if (force_mcast) 1752 return; 1753 1754 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1755 pi = pii->pii_phyint; 1756 /* 1757 * Set tg_in_use to false only for router targets. 1758 */ 1759 if (!pii->pii_targets_are_routers) 1760 continue; 1761 1762 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1763 tg->tg_in_use = 0; 1764 } 1765 1766 if (mibwalk(update_router_list) == -1) 1767 exit(1); 1768 1769 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1770 pi = pii->pii_phyint; 1771 if (!pii->pii_targets_are_routers) 1772 continue; 1773 1774 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1775 next_tg = tg->tg_next; 1776 /* 1777 * If the group has failed, it's likely the route was 1778 * removed by an application affected by that failure. 1779 * In that case, we keep the target so that we can 1780 * reliably repair, at which point we'll refresh the 1781 * target list again. 1782 */ 1783 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) 1784 target_delete(tg); 1785 } 1786 } 1787 } 1788 1789 /* 1790 * Attempt to assign host targets to any interfaces that do not currently 1791 * have probe targets by sharing targets with other interfaces in the group. 1792 */ 1793 static void 1794 init_host_targets(void) 1795 { 1796 struct phyint_instance *pii; 1797 struct phyint_group *pg; 1798 1799 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1800 pg = pii->pii_phyint->pi_group; 1801 if (pg != phyint_anongroup && pii->pii_targets == NULL) 1802 dup_host_targets(pii); 1803 } 1804 } 1805 1806 /* 1807 * Duplicate host targets from other phyints of the group to 1808 * the phyint instance 'desired_pii'. 1809 */ 1810 static void 1811 dup_host_targets(struct phyint_instance *desired_pii) 1812 { 1813 int af; 1814 struct phyint *pi; 1815 struct phyint_instance *pii; 1816 struct target *tg; 1817 1818 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 1819 1820 af = desired_pii->pii_af; 1821 1822 /* 1823 * For every phyint in the same group as desired_pii, check if 1824 * it has any host targets. If so add them to desired_pii. 1825 */ 1826 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 1827 pii = PHYINT_INSTANCE(pi, af); 1828 /* 1829 * We know that we don't have targets on this phyint instance 1830 * since we have been called. But we still check for 1831 * pii_targets_are_routers because another phyint instance 1832 * could have router targets, since IFF_NOFAILOVER addresses 1833 * on different phyint instances may belong to different 1834 * subnets. 1835 */ 1836 if ((pii == NULL) || (pii == desired_pii) || 1837 pii->pii_targets_are_routers) 1838 continue; 1839 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1840 target_create(desired_pii, tg->tg_address, _B_FALSE); 1841 } 1842 } 1843 } 1844 1845 static void 1846 usage(char *cmd) 1847 { 1848 (void) fprintf(stderr, "usage: %s\n", cmd); 1849 } 1850 1851 1852 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 1853 1854 /* Get an option from the /etc/default/mpathd file */ 1855 static char * 1856 getdefault(char *name) 1857 { 1858 char namebuf[BUFSIZ]; 1859 char *value = NULL; 1860 1861 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 1862 char *cp; 1863 int flags; 1864 1865 /* 1866 * ignore case 1867 */ 1868 flags = defcntl(DC_GETFLAGS, 0); 1869 TURNOFF(flags, DC_CASE); 1870 (void) defcntl(DC_SETFLAGS, flags); 1871 1872 /* Add "=" to the name */ 1873 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 1874 (void) strncat(namebuf, "=", 2); 1875 1876 if ((cp = defread(namebuf)) != NULL) 1877 value = strdup(cp); 1878 1879 /* close */ 1880 (void) defopen((char *)NULL); 1881 } 1882 return (value); 1883 } 1884 1885 1886 /* 1887 * Command line options below 1888 */ 1889 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 1890 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ 1891 static boolean_t adopt = _B_FALSE; 1892 static boolean_t foreground = _B_FALSE; 1893 1894 int 1895 main(int argc, char *argv[]) 1896 { 1897 int i; 1898 int c; 1899 struct phyint *pi; 1900 struct phyint_instance *pii; 1901 char *value; 1902 1903 argv0 = argv; /* Saved for re-exec on SIGHUP */ 1904 srandom(gethostid()); /* Initialize the random number generator */ 1905 1906 /* 1907 * NOTE: The messages output by in.mpathd are not suitable for 1908 * translation, so we do not call textdomain(). 1909 */ 1910 (void) setlocale(LC_ALL, ""); 1911 1912 /* 1913 * Get the user specified value of 'failure detection time' 1914 * from /etc/default/mpathd 1915 */ 1916 value = getdefault("FAILURE_DETECTION_TIME"); 1917 if (value != NULL) { 1918 user_failure_detection_time = 1919 (int)strtol((char *)value, NULL, 0); 1920 1921 if (user_failure_detection_time <= 0) { 1922 user_failure_detection_time = FAILURE_DETECTION_TIME; 1923 logerr("Invalid failure detection time %s, assuming " 1924 "default of %d ms\n", value, 1925 user_failure_detection_time); 1926 1927 } else if (user_failure_detection_time < 1928 MIN_FAILURE_DETECTION_TIME) { 1929 user_failure_detection_time = 1930 MIN_FAILURE_DETECTION_TIME; 1931 logerr("Too small failure detection time of %s, " 1932 "assuming minimum of %d ms\n", value, 1933 user_failure_detection_time); 1934 } 1935 free(value); 1936 } else { 1937 /* User has not specified the parameter, Use default value */ 1938 user_failure_detection_time = FAILURE_DETECTION_TIME; 1939 } 1940 1941 /* 1942 * This gives the frequency at which probes will be sent. 1943 * When fdt ms elapses, we should be able to determine 1944 * whether 5 consecutive probes have failed or not. 1945 * 1 probe will be sent in every user_probe_interval ms, 1946 * randomly anytime in the (0.5 - 1.0) 2nd half of every 1947 * user_probe_interval. Thus when we send out probe 'n' we 1948 * can be sure that probe 'n - 2' is lost, if we have not 1949 * got the ack. (since the probe interval is > crtt). But 1950 * probe 'n - 1' may be a valid unacked probe, since the 1951 * time between 2 successive probes could be as small as 1952 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 1953 */ 1954 user_probe_interval = user_failure_detection_time / 1955 (NUM_PROBE_FAILS + 2); 1956 1957 /* 1958 * Get the user specified value of failback_enabled from 1959 * /etc/default/mpathd 1960 */ 1961 value = getdefault("FAILBACK"); 1962 if (value != NULL) { 1963 if (strcasecmp(value, "yes") == 0) 1964 failback_enabled = _B_TRUE; 1965 else if (strcasecmp(value, "no") == 0) 1966 failback_enabled = _B_FALSE; 1967 else 1968 logerr("Invalid value for FAILBACK %s\n", value); 1969 free(value); 1970 } else { 1971 failback_enabled = _B_TRUE; 1972 } 1973 1974 /* 1975 * Get the user specified value of track_all_phyints from 1976 * /etc/default/mpathd. The sense is reversed in 1977 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 1978 */ 1979 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 1980 if (value != NULL) { 1981 if (strcasecmp(value, "yes") == 0) 1982 track_all_phyints = _B_FALSE; 1983 else if (strcasecmp(value, "no") == 0) 1984 track_all_phyints = _B_TRUE; 1985 else 1986 logerr("Invalid value for " 1987 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 1988 free(value); 1989 } else { 1990 track_all_phyints = _B_FALSE; 1991 } 1992 1993 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 1994 switch (c) { 1995 case 'a': 1996 adopt = _B_TRUE; 1997 break; 1998 case 'm': 1999 force_mcast = _B_TRUE; 2000 break; 2001 case 'd': 2002 debug = D_ALL; 2003 foreground = _B_TRUE; 2004 break; 2005 case 'D': 2006 i = (int)strtol(optarg, NULL, 0); 2007 if (i == 0) { 2008 (void) fprintf(stderr, "Bad debug flags: %s\n", 2009 optarg); 2010 exit(1); 2011 } 2012 debug |= i; 2013 foreground = _B_TRUE; 2014 break; 2015 case 'l': 2016 /* 2017 * Turn off link state notification handling. 2018 * Undocumented command line flag, for debugging 2019 * purposes. 2020 */ 2021 handle_link_notifications = _B_FALSE; 2022 break; 2023 default: 2024 usage(argv[0]); 2025 exit(1); 2026 } 2027 } 2028 2029 /* 2030 * The sockets for the loopback command interface should be listening 2031 * before we fork and exit in daemonize(). This way, whoever started us 2032 * can use the loopback interface as soon as they get a zero exit 2033 * status. 2034 */ 2035 lsock_v4 = setup_listener(AF_INET); 2036 lsock_v6 = setup_listener(AF_INET6); 2037 2038 if (lsock_v4 < 0 && lsock_v6 < 0) { 2039 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2040 exit(1); 2041 } 2042 2043 if (!foreground) { 2044 if (!daemonize()) { 2045 logerr("cannot daemonize\n"); 2046 exit(EXIT_FAILURE); 2047 } 2048 initlog(); 2049 } 2050 2051 /* 2052 * Initializations: 2053 * 1. Create ifsock* sockets. These are used for performing SIOC* 2054 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2055 * 2. Initialize a pipe for handling/recording signal events. 2056 * 3. Create the routing sockets, used for listening 2057 * to routing / interface changes. 2058 * 4. phyint_init() - Initialize physical interface state 2059 * (in mpd_tables.c). Must be done before creating interfaces, 2060 * which timer_init() does indirectly. 2061 * 5. Query kernel for route entry sizes (v4 and v6). 2062 * 6. timer_init() - Initialize timer related stuff 2063 * 7. initifs() - Initialize our database of all known interfaces 2064 * 8. init_router_targets() - Initialize our database of all known 2065 * router targets. 2066 */ 2067 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2068 if (ifsock_v4 < 0) { 2069 logperror("main: IPv4 socket open"); 2070 exit(1); 2071 } 2072 2073 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2074 if (ifsock_v6 < 0) { 2075 logperror("main: IPv6 socket open"); 2076 exit(1); 2077 } 2078 2079 setup_eventpipe(); 2080 2081 rtsock_v4 = setup_rtsock(AF_INET); 2082 rtsock_v6 = setup_rtsock(AF_INET6); 2083 2084 if (phyint_init() == -1) { 2085 logerr("cannot initialize physical interface structures"); 2086 exit(1); 2087 } 2088 2089 if (mibwalk(mib_get_constants) == -1) 2090 exit(1); 2091 2092 timer_init(); 2093 2094 initifs(); 2095 2096 /* 2097 * If we're operating in "adopt" mode and no interfaces need to be 2098 * tracked, shut down (ifconfig(1M) will restart us on demand if 2099 * interfaces are subsequently put into multipathing groups). 2100 */ 2101 if (adopt && phyint_instances == NULL) 2102 exit(0); 2103 2104 /* 2105 * Main body. Keep listening for activity on any of the sockets 2106 * that we are monitoring and take appropriate action as necessary. 2107 * signals are also handled synchronously. 2108 */ 2109 for (;;) { 2110 if (poll(pollfds, pollfd_num, -1) < 0) { 2111 if (errno == EINTR) 2112 continue; 2113 logperror("main: poll"); 2114 exit(1); 2115 } 2116 for (i = 0; i < pollfd_num; i++) { 2117 if ((pollfds[i].fd == -1) || 2118 !(pollfds[i].revents & POLLIN)) 2119 continue; 2120 if (pollfds[i].fd == eventpipe_read) { 2121 in_signal(eventpipe_read); 2122 break; 2123 } 2124 if (pollfds[i].fd == rtsock_v4 || 2125 pollfds[i].fd == rtsock_v6) { 2126 process_rtsock(rtsock_v4, rtsock_v6); 2127 break; 2128 } 2129 2130 for (pii = phyint_instances; pii != NULL; 2131 pii = pii->pii_next) { 2132 if (pollfds[i].fd == pii->pii_probe_sock) { 2133 if (pii->pii_af == AF_INET) 2134 in_data(pii); 2135 else 2136 in6_data(pii); 2137 break; 2138 } 2139 } 2140 2141 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 2142 if (pi->pi_notes != 0 && 2143 pollfds[i].fd == dlpi_fd(pi->pi_dh)) { 2144 (void) dlpi_recv(pi->pi_dh, NULL, NULL, 2145 NULL, NULL, 0, NULL); 2146 break; 2147 } 2148 } 2149 2150 if (pollfds[i].fd == lsock_v4) 2151 loopback_cmd(lsock_v4, AF_INET); 2152 else if (pollfds[i].fd == lsock_v6) 2153 loopback_cmd(lsock_v6, AF_INET6); 2154 } 2155 } 2156 /* NOTREACHED */ 2157 return (EXIT_SUCCESS); 2158 } 2159 2160 static int 2161 setup_listener(int af) 2162 { 2163 int sock; 2164 int on; 2165 int len; 2166 int ret; 2167 struct sockaddr_storage laddr; 2168 struct sockaddr_in *sin; 2169 struct sockaddr_in6 *sin6; 2170 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2171 2172 assert(af == AF_INET || af == AF_INET6); 2173 2174 sock = socket(af, SOCK_STREAM, 0); 2175 if (sock < 0) { 2176 logperror("setup_listener: socket"); 2177 exit(1); 2178 } 2179 2180 on = 1; 2181 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2182 sizeof (on)) < 0) { 2183 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2184 exit(1); 2185 } 2186 2187 bzero(&laddr, sizeof (laddr)); 2188 laddr.ss_family = af; 2189 2190 if (af == AF_INET) { 2191 sin = (struct sockaddr_in *)&laddr; 2192 sin->sin_port = htons(MPATHD_PORT); 2193 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2194 len = sizeof (struct sockaddr_in); 2195 } else { 2196 sin6 = (struct sockaddr_in6 *)&laddr; 2197 sin6->sin6_port = htons(MPATHD_PORT); 2198 sin6->sin6_addr = loopback_addr; 2199 len = sizeof (struct sockaddr_in6); 2200 } 2201 2202 ret = bind(sock, (struct sockaddr *)&laddr, len); 2203 if (ret < 0) { 2204 if (errno == EADDRINUSE) { 2205 /* 2206 * Another instance of mpathd may be already active. 2207 */ 2208 logerr("main: is another instance of in.mpathd " 2209 "already active?\n"); 2210 exit(1); 2211 } else { 2212 (void) close(sock); 2213 return (-1); 2214 } 2215 } 2216 if (listen(sock, 30) < 0) { 2217 logperror("main: listen"); 2218 exit(1); 2219 } 2220 if (poll_add(sock) == -1) { 2221 (void) close(sock); 2222 exit(1); 2223 } 2224 2225 return (sock); 2226 } 2227 2228 /* 2229 * Table of commands and their expected size; used by loopback_cmd(). 2230 */ 2231 static struct { 2232 const char *name; 2233 unsigned int size; 2234 } commands[] = { 2235 { "MI_PING", sizeof (uint32_t) }, 2236 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2237 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2238 { "MI_QUERY", sizeof (mi_query_t) } 2239 }; 2240 2241 /* 2242 * Commands received over the loopback interface come here (via libipmp). 2243 */ 2244 static void 2245 loopback_cmd(int sock, int family) 2246 { 2247 int newfd; 2248 ssize_t len; 2249 boolean_t is_priv = _B_FALSE; 2250 struct sockaddr_storage peer; 2251 struct sockaddr_in *peer_sin; 2252 struct sockaddr_in6 *peer_sin6; 2253 socklen_t peerlen; 2254 union mi_commands mpi; 2255 char abuf[INET6_ADDRSTRLEN]; 2256 uint_t cmd; 2257 int retval; 2258 2259 peerlen = sizeof (peer); 2260 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2261 if (newfd < 0) { 2262 logperror("loopback_cmd: accept"); 2263 return; 2264 } 2265 2266 switch (family) { 2267 case AF_INET: 2268 /* 2269 * Validate the address and port to make sure that 2270 * non privileged processes don't connect and start 2271 * talking to us. 2272 */ 2273 if (peerlen != sizeof (struct sockaddr_in)) { 2274 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2275 (void) close(newfd); 2276 return; 2277 } 2278 peer_sin = (struct sockaddr_in *)&peer; 2279 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; 2280 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2281 abuf, sizeof (abuf)); 2282 2283 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { 2284 logerr("Attempt to connect from addr %s port %d\n", 2285 abuf, ntohs(peer_sin->sin_port)); 2286 (void) close(newfd); 2287 return; 2288 } 2289 break; 2290 2291 case AF_INET6: 2292 if (peerlen != sizeof (struct sockaddr_in6)) { 2293 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2294 (void) close(newfd); 2295 return; 2296 } 2297 /* 2298 * Validate the address and port to make sure that 2299 * non privileged processes don't connect and start 2300 * talking to us. 2301 */ 2302 peer_sin6 = (struct sockaddr_in6 *)&peer; 2303 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; 2304 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2305 sizeof (abuf)); 2306 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { 2307 logerr("Attempt to connect from addr %s port %d\n", 2308 abuf, ntohs(peer_sin6->sin6_port)); 2309 (void) close(newfd); 2310 return; 2311 } 2312 2313 default: 2314 logdebug("loopback_cmd: family %d\n", family); 2315 (void) close(newfd); 2316 return; 2317 } 2318 2319 /* 2320 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2321 * all supported commands 2322 */ 2323 len = read(newfd, &mpi, sizeof (mpi)); 2324 2325 /* 2326 * In theory, we can receive any sized message for a stream socket, 2327 * but we don't expect that to happen for a small message over a 2328 * loopback connection. 2329 */ 2330 if (len < sizeof (uint32_t)) { 2331 logerr("loopback_cmd: bad command format or read returns " 2332 "partial data %d\n", len); 2333 (void) close(newfd); 2334 return; 2335 } 2336 2337 cmd = mpi.mi_command; 2338 if (cmd >= MI_NCMD) { 2339 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2340 (void) close(newfd); 2341 return; 2342 } 2343 2344 /* 2345 * Only MI_PING and MI_QUERY can come from unprivileged sources. 2346 */ 2347 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { 2348 logerr("Unprivileged request from %s for privileged " 2349 "command %s\n", abuf, commands[cmd].name); 2350 (void) close(newfd); 2351 return; 2352 } 2353 2354 if (len < commands[cmd].size) { 2355 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2356 commands[cmd].name, commands[cmd].size, len); 2357 (void) close(newfd); 2358 return; 2359 } 2360 2361 retval = process_cmd(newfd, &mpi); 2362 if (retval != IPMP_SUCCESS) { 2363 logerr("failed processing %s: %s\n", commands[cmd].name, 2364 ipmp_errmsg(retval)); 2365 } 2366 (void) close(newfd); 2367 } 2368 2369 /* 2370 * Process the commands received via libipmp. 2371 */ 2372 static unsigned int 2373 process_cmd(int newfd, union mi_commands *mpi) 2374 { 2375 struct phyint *pi; 2376 struct mi_offline *mio; 2377 struct mi_undo_offline *miu; 2378 unsigned int retval; 2379 2380 switch (mpi->mi_command) { 2381 case MI_PING: 2382 return (send_result(newfd, IPMP_SUCCESS, 0)); 2383 2384 case MI_OFFLINE: 2385 mio = &mpi->mi_ocmd; 2386 2387 pi = phyint_lookup(mio->mio_ifname); 2388 if (pi == NULL) 2389 return (send_result(newfd, IPMP_EUNKIF, 0)); 2390 2391 retval = phyint_offline(pi, mio->mio_min_redundancy); 2392 if (retval == IPMP_FAILURE) 2393 return (send_result(newfd, IPMP_FAILURE, errno)); 2394 2395 return (send_result(newfd, retval, 0)); 2396 2397 case MI_UNDO_OFFLINE: 2398 miu = &mpi->mi_ucmd; 2399 2400 pi = phyint_lookup(miu->miu_ifname); 2401 if (pi == NULL) 2402 return (send_result(newfd, IPMP_EUNKIF, 0)); 2403 2404 retval = phyint_undo_offline(pi); 2405 if (retval == IPMP_FAILURE) 2406 return (send_result(newfd, IPMP_FAILURE, errno)); 2407 2408 return (send_result(newfd, retval, 0)); 2409 2410 case MI_QUERY: 2411 return (process_query(newfd, &mpi->mi_qcmd)); 2412 2413 default: 2414 break; 2415 } 2416 2417 return (send_result(newfd, IPMP_EPROTO, 0)); 2418 } 2419 2420 /* 2421 * Process the query request pointed to by `miq' and send a reply on file 2422 * descriptor `fd'. Returns an IPMP error code. 2423 */ 2424 static unsigned int 2425 process_query(int fd, mi_query_t *miq) 2426 { 2427 ipmp_addrinfo_t *adinfop; 2428 ipmp_addrinfolist_t *adlp; 2429 ipmp_groupinfo_t *grinfop; 2430 ipmp_groupinfolist_t *grlp; 2431 ipmp_grouplist_t *grlistp; 2432 ipmp_ifinfo_t *ifinfop; 2433 ipmp_ifinfolist_t *iflp; 2434 ipmp_snap_t *snap; 2435 unsigned int retval; 2436 2437 switch (miq->miq_inforeq) { 2438 case IPMP_ADDRINFO: 2439 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, 2440 &adinfop); 2441 if (retval != IPMP_SUCCESS) 2442 return (send_result(fd, retval, errno)); 2443 2444 retval = send_result(fd, IPMP_SUCCESS, 0); 2445 if (retval == IPMP_SUCCESS) 2446 retval = send_addrinfo(fd, adinfop); 2447 2448 ipmp_freeaddrinfo(adinfop); 2449 return (retval); 2450 2451 case IPMP_GROUPLIST: 2452 retval = getgrouplist(&grlistp); 2453 if (retval != IPMP_SUCCESS) 2454 return (send_result(fd, retval, errno)); 2455 2456 retval = send_result(fd, IPMP_SUCCESS, 0); 2457 if (retval == IPMP_SUCCESS) 2458 retval = send_grouplist(fd, grlistp); 2459 2460 ipmp_freegrouplist(grlistp); 2461 return (retval); 2462 2463 case IPMP_GROUPINFO: 2464 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2465 retval = getgroupinfo(miq->miq_grname, &grinfop); 2466 if (retval != IPMP_SUCCESS) 2467 return (send_result(fd, retval, errno)); 2468 2469 retval = send_result(fd, IPMP_SUCCESS, 0); 2470 if (retval == IPMP_SUCCESS) 2471 retval = send_groupinfo(fd, grinfop); 2472 2473 ipmp_freegroupinfo(grinfop); 2474 return (retval); 2475 2476 case IPMP_IFINFO: 2477 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2478 retval = getifinfo(miq->miq_ifname, &ifinfop); 2479 if (retval != IPMP_SUCCESS) 2480 return (send_result(fd, retval, errno)); 2481 2482 retval = send_result(fd, IPMP_SUCCESS, 0); 2483 if (retval == IPMP_SUCCESS) 2484 retval = send_ifinfo(fd, ifinfop); 2485 2486 ipmp_freeifinfo(ifinfop); 2487 return (retval); 2488 2489 case IPMP_SNAP: 2490 /* 2491 * Before taking the snapshot, sync with the kernel. 2492 */ 2493 initifs(); 2494 2495 retval = getsnap(&snap); 2496 if (retval != IPMP_SUCCESS) 2497 return (send_result(fd, retval, errno)); 2498 2499 retval = send_result(fd, IPMP_SUCCESS, 0); 2500 if (retval != IPMP_SUCCESS) 2501 goto out; 2502 2503 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2504 if (retval != IPMP_SUCCESS) 2505 goto out; 2506 2507 retval = send_grouplist(fd, snap->sn_grlistp); 2508 if (retval != IPMP_SUCCESS) 2509 goto out; 2510 2511 iflp = snap->sn_ifinfolistp; 2512 for (; iflp != NULL; iflp = iflp->ifl_next) { 2513 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2514 if (retval != IPMP_SUCCESS) 2515 goto out; 2516 } 2517 2518 grlp = snap->sn_grinfolistp; 2519 for (; grlp != NULL; grlp = grlp->grl_next) { 2520 retval = send_groupinfo(fd, grlp->grl_grinfop); 2521 if (retval != IPMP_SUCCESS) 2522 goto out; 2523 } 2524 2525 adlp = snap->sn_adinfolistp; 2526 for (; adlp != NULL; adlp = adlp->adl_next) { 2527 retval = send_addrinfo(fd, adlp->adl_adinfop); 2528 if (retval != IPMP_SUCCESS) 2529 goto out; 2530 } 2531 out: 2532 ipmp_snap_free(snap); 2533 return (retval); 2534 2535 default: 2536 break; 2537 2538 } 2539 return (send_result(fd, IPMP_EPROTO, 0)); 2540 } 2541 2542 /* 2543 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2544 * Returns an IPMP error code. 2545 */ 2546 static unsigned int 2547 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2548 { 2549 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2550 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; 2551 unsigned int retval; 2552 2553 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2554 if (retval != IPMP_SUCCESS) 2555 return (retval); 2556 2557 retval = ipmp_writetlv(fd, IPMP_IFLIST, 2558 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); 2559 if (retval != IPMP_SUCCESS) 2560 return (retval); 2561 2562 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2563 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); 2564 } 2565 2566 /* 2567 * Send the interface information pointed to by `ifinfop' on file descriptor 2568 * `fd'. Returns an IPMP error code. 2569 */ 2570 static unsigned int 2571 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2572 { 2573 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; 2574 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; 2575 unsigned int retval; 2576 2577 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); 2578 if (retval != IPMP_SUCCESS) 2579 return (retval); 2580 2581 retval = ipmp_writetlv(fd, IPMP_ADDRLIST, 2582 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); 2583 if (retval != IPMP_SUCCESS) 2584 return (retval); 2585 2586 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2587 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); 2588 } 2589 2590 /* 2591 * Send the address information pointed to by `adinfop' on file descriptor 2592 * `fd'. Returns an IPMP error code. 2593 */ 2594 static unsigned int 2595 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) 2596 { 2597 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); 2598 } 2599 2600 /* 2601 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2602 * Returns an IPMP error code. 2603 */ 2604 static unsigned int 2605 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2606 { 2607 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2608 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2609 } 2610 2611 /* 2612 * Initialize an mi_result_t structure using `error' and `syserror' and 2613 * send it on file descriptor `fd'. Returns an IPMP error code. 2614 */ 2615 static unsigned int 2616 send_result(int fd, unsigned int error, int syserror) 2617 { 2618 mi_result_t me; 2619 2620 me.me_mpathd_error = error; 2621 if (error == IPMP_FAILURE) 2622 me.me_sys_error = syserror; 2623 else 2624 me.me_sys_error = 0; 2625 2626 return (ipmp_write(fd, &me, sizeof (me))); 2627 } 2628 2629 /* 2630 * Daemonize the process. 2631 */ 2632 static boolean_t 2633 daemonize(void) 2634 { 2635 switch (fork()) { 2636 case -1: 2637 return (_B_FALSE); 2638 2639 case 0: 2640 /* 2641 * Lose our controlling terminal, and become both a session 2642 * leader and a process group leader. 2643 */ 2644 if (setsid() == -1) 2645 return (_B_FALSE); 2646 2647 /* 2648 * Under POSIX, a session leader can accidentally (through 2649 * open(2)) acquire a controlling terminal if it does not 2650 * have one. Just to be safe, fork() again so we are not a 2651 * session leader. 2652 */ 2653 switch (fork()) { 2654 case -1: 2655 return (_B_FALSE); 2656 2657 case 0: 2658 (void) chdir("/"); 2659 (void) umask(022); 2660 (void) fdwalk(closefunc, NULL); 2661 break; 2662 2663 default: 2664 _exit(EXIT_SUCCESS); 2665 } 2666 break; 2667 2668 default: 2669 _exit(EXIT_SUCCESS); 2670 } 2671 2672 return (_B_TRUE); 2673 } 2674 2675 /* 2676 * The parent has created some fds before forking on purpose, keep them open. 2677 */ 2678 static int 2679 closefunc(void *not_used, int fd) 2680 /* ARGSUSED */ 2681 { 2682 if (fd != lsock_v4 && fd != lsock_v6) 2683 (void) close(fd); 2684 return (0); 2685 } 2686 2687 /* LOGGER */ 2688 2689 #include <syslog.h> 2690 2691 /* 2692 * Logging routines. All routines log to syslog, unless the daemon is 2693 * running in the foreground, in which case the logging goes to stderr. 2694 * 2695 * The following routines are available: 2696 * 2697 * logdebug(): A printf-like function for outputting debug messages 2698 * (messages at LOG_DEBUG) that are only of use to developers. 2699 * 2700 * logtrace(): A printf-like function for outputting tracing messages 2701 * (messages at LOG_INFO) from the daemon. This is typically used 2702 * to log the receipt of interesting network-related conditions. 2703 * 2704 * logerr(): A printf-like function for outputting error messages 2705 * (messages at LOG_ERR) from the daemon. 2706 * 2707 * logperror*(): A set of functions used to output error messages 2708 * (messages at LOG_ERR); these automatically append strerror(errno) 2709 * and a newline to the message passed to them. 2710 * 2711 * NOTE: since the logging functions write to syslog, the messages passed 2712 * to them are not eligible for localization. Thus, gettext() must 2713 * *not* be used. 2714 */ 2715 2716 static int logging = 0; 2717 2718 static void 2719 initlog(void) 2720 { 2721 logging++; 2722 openlog("in.mpathd", LOG_PID, LOG_DAEMON); 2723 } 2724 2725 /* PRINTFLIKE2 */ 2726 void 2727 logmsg(int pri, const char *fmt, ...) 2728 { 2729 va_list ap; 2730 2731 va_start(ap, fmt); 2732 2733 if (logging) 2734 vsyslog(pri, fmt, ap); 2735 else 2736 (void) vfprintf(stderr, fmt, ap); 2737 va_end(ap); 2738 } 2739 2740 /* PRINTFLIKE1 */ 2741 void 2742 logperror(const char *str) 2743 { 2744 if (logging) 2745 syslog(LOG_ERR, "%s: %m\n", str); 2746 else 2747 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 2748 } 2749 2750 void 2751 logperror_pii(struct phyint_instance *pii, const char *str) 2752 { 2753 if (logging) { 2754 syslog(LOG_ERR, "%s (%s %s): %m\n", 2755 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 2756 } else { 2757 (void) fprintf(stderr, "%s (%s %s): %s\n", 2758 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 2759 strerror(errno)); 2760 } 2761 } 2762 2763 void 2764 logperror_li(struct logint *li, const char *str) 2765 { 2766 struct phyint_instance *pii = li->li_phyint_inst; 2767 2768 if (logging) { 2769 syslog(LOG_ERR, "%s (%s %s): %m\n", 2770 str, AF_STR(pii->pii_af), li->li_name); 2771 } else { 2772 (void) fprintf(stderr, "%s (%s %s): %s\n", 2773 str, AF_STR(pii->pii_af), li->li_name, 2774 strerror(errno)); 2775 } 2776 } 2777 2778 void 2779 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 2780 { 2781 if (polled) 2782 (void) poll_remove(pii->pii_probe_sock); 2783 (void) close(pii->pii_probe_sock); 2784 pii->pii_probe_sock = -1; 2785 pii->pii_basetime_inited = 0; 2786 } 2787 2788 boolean_t 2789 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, 2790 struct sockaddr_storage *ssp) 2791 { 2792 addrlist_t *addrp; 2793 2794 if ((addrp = malloc(sizeof (addrlist_t))) == NULL) 2795 return (_B_FALSE); 2796 2797 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); 2798 addrp->al_flags = flags; 2799 addrp->al_addr = *ssp; 2800 addrp->al_next = *addrsp; 2801 *addrsp = addrp; 2802 return (_B_TRUE); 2803 } 2804 2805 void 2806 addrlist_free(addrlist_t **addrsp) 2807 { 2808 addrlist_t *addrp, *next_addrp; 2809 2810 for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) { 2811 next_addrp = addrp->al_next; 2812 free(addrp); 2813 } 2814 *addrsp = NULL; 2815 } 2816 2817 /* 2818 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 2819 * tables defined by mib2.h. Pass the table information returned to the 2820 * supplied function. 2821 */ 2822 static int 2823 mibwalk(void (*proc)(mib_item_t *)) 2824 { 2825 mib_item_t *head_item = NULL; 2826 mib_item_t *last_item = NULL; 2827 mib_item_t *tmp; 2828 struct strbuf ctlbuf, databuf; 2829 int flags; 2830 int rval; 2831 uintptr_t buf[512 / sizeof (uintptr_t)]; 2832 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)buf; 2833 struct T_optmgmt_ack *toa = (struct T_optmgmt_ack *)buf; 2834 struct T_error_ack *tea = (struct T_error_ack *)buf; 2835 struct opthdr *req, *optp; 2836 int status = -1; 2837 2838 if (mibfd == -1) { 2839 if ((mibfd = open("/dev/ip", O_RDWR)) < 0) { 2840 logperror("mibwalk(): ip open"); 2841 return (status); 2842 } 2843 } 2844 2845 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2846 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2847 tor->OPT_length = sizeof (struct opthdr); 2848 tor->MGMT_flags = T_CURRENT; 2849 2850 /* 2851 * Note: we use the special level value below so that IP will return 2852 * us information concerning IRE_MARK_TESTHIDDEN routes. 2853 */ 2854 req = (struct opthdr *)&tor[1]; 2855 req->level = EXPER_IP_AND_TESTHIDDEN; 2856 req->name = 0; 2857 req->len = 0; 2858 2859 ctlbuf.buf = (char *)&buf; 2860 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 2861 2862 if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) { 2863 logperror("mibwalk(): putmsg(ctl)"); 2864 return (status); 2865 } 2866 2867 /* 2868 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 2869 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 2870 * a control and data part. The control part contains a struct 2871 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 2872 * the level, name and length of the data in the data part. The 2873 * data part contains the actual table data. The last message 2874 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 2875 * single option with zero optlen. 2876 */ 2877 for (;;) { 2878 errno = flags = 0; 2879 ctlbuf.maxlen = sizeof (buf); 2880 rval = getmsg(mibfd, &ctlbuf, NULL, &flags); 2881 if (rval & MORECTL || rval < 0) { 2882 if (errno == EINTR) 2883 continue; 2884 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n", 2885 rval, errno); 2886 goto error; 2887 } 2888 if (ctlbuf.len < sizeof (t_scalar_t)) { 2889 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len); 2890 goto error; 2891 } 2892 2893 switch (toa->PRIM_type) { 2894 case T_ERROR_ACK: 2895 if (ctlbuf.len < sizeof (struct T_error_ack)) { 2896 logerr("mibwalk(): T_ERROR_ACK ctlbuf " 2897 "too short: %d\n", ctlbuf.len); 2898 goto error; 2899 } 2900 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n" 2901 " UNIX_err = 0x%lx\n", tea->TLI_error, 2902 t_strerror(tea->TLI_error), tea->UNIX_error); 2903 goto error; 2904 2905 case T_OPTMGMT_ACK: 2906 optp = (struct opthdr *)&toa[1]; 2907 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + 2908 sizeof (struct opthdr))) { 2909 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too " 2910 "short: %d\n", ctlbuf.len); 2911 goto error; 2912 } 2913 if (toa->MGMT_flags != T_SUCCESS) { 2914 logerr("mibwalk(): MGMT_flags != T_SUCCESS: " 2915 "0x%lx\n", toa->MGMT_flags); 2916 goto error; 2917 } 2918 break; 2919 2920 default: 2921 goto error; 2922 } 2923 /* The following assert also implies MGMT_flags == T_SUCCESS */ 2924 assert(toa->PRIM_type == T_OPTMGMT_ACK); 2925 2926 /* 2927 * We have reached the end of this T_OPTMGMT_ACK 2928 * message. If this is the last message i.e EOD, 2929 * break, else process the next T_OPTMGMT_ACK msg. 2930 */ 2931 if (rval == 0) { 2932 if (optp->len == 0 && optp->name == 0 && 2933 optp->level == 0) { 2934 /* This is the EOD message. */ 2935 break; 2936 } 2937 /* Not EOD but no data to retrieve */ 2938 continue; 2939 } 2940 2941 /* 2942 * We should only be here if MOREDATA was set. 2943 * Allocate an empty mib_item_t and link into the list 2944 * of MIB items. 2945 */ 2946 if ((tmp = malloc(sizeof (*tmp))) == NULL) { 2947 logperror("mibwalk(): malloc() failed."); 2948 goto error; 2949 } 2950 if (last_item != NULL) 2951 last_item->mi_next = tmp; 2952 else 2953 head_item = tmp; 2954 last_item = tmp; 2955 last_item->mi_next = NULL; 2956 last_item->mi_opthdr = *optp; 2957 last_item->mi_valp = malloc(optp->len); 2958 if (last_item->mi_valp == NULL) { 2959 logperror("mibwalk(): malloc() failed."); 2960 goto error; 2961 } 2962 2963 databuf.maxlen = last_item->mi_opthdr.len; 2964 databuf.buf = (char *)last_item->mi_valp; 2965 databuf.len = 0; 2966 2967 /* Retrieve the actual MIB data */ 2968 for (;;) { 2969 flags = 0; 2970 if ((rval = getmsg(mibfd, NULL, &databuf, 2971 &flags)) != 0) { 2972 if (rval < 0 && errno == EINTR) 2973 continue; 2974 /* 2975 * We shouldn't get MOREDATA here so treat that 2976 * as an error. 2977 */ 2978 logperror("mibwalk(): getmsg(data)"); 2979 goto error; 2980 } 2981 break; 2982 } 2983 } 2984 status = 0; 2985 /* Pass the accumulated MIB data to the supplied function pointer */ 2986 (*proc)(head_item); 2987 error: 2988 while (head_item != NULL) { 2989 tmp = head_item; 2990 head_item = tmp->mi_next; 2991 free(tmp->mi_valp); 2992 free(tmp); 2993 } 2994 return (status); 2995 } 2996 2997 /* 2998 * Parse the supplied mib2 information to get the size of routing table 2999 * entries. This is needed when running in a branded zone where the 3000 * Solaris application environment and the Solaris kernel may not be the 3001 * the same release version. 3002 */ 3003 static void 3004 mib_get_constants(mib_item_t *item) 3005 { 3006 mib2_ip_t *ipv4; 3007 mib2_ipv6IfStatsEntry_t *ipv6; 3008 3009 for (; item != NULL; item = item->mi_next) { 3010 if (item->mi_opthdr.name != 0) 3011 continue; 3012 if (item->mi_opthdr.level == MIB2_IP) { 3013 ipv4 = (mib2_ip_t *)item->mi_valp; 3014 ipRouteEntrySize = ipv4->ipRouteEntrySize; 3015 } else if (item->mi_opthdr.level == MIB2_IP6) { 3016 ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp; 3017 ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize; 3018 } 3019 } 3020 } 3021