1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28
/*
 * Debug flag.  Tested as a bitmask against the D_* categories
 * (e.g. D_PHYINT, D_TIMER) before emitting debug output.
 */
int debug = 0;

/*
 * Set of descriptors being polled, maintained by poll_add() and
 * poll_remove().  A slot whose fd is -1 is free for reuse.
 */
static int pollfd_num = 0;		/* Num. of poll descriptors */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */

/* All times below in ms */
int user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int user_probe_interval;		/* derived from user specified fdt */
36
/*
 * Structure to store mib2 information returned by the kernel.
 * This is used to process routing table information.
 *
 * NOTE(review): field roles below are inferred from their names and types;
 * confirm against the code that builds and walks these items.
 */
typedef struct mib_item_s {
	struct mib_item_s	*mi_next;	/* presumably next item in list */
	struct opthdr		mi_opthdr;	/* option header for this item */
	void			*mi_valp;	/* presumably the item's data */
} mib_item_t;
46
static int rtsock_v4;		/* AF_INET routing socket */
static int rtsock_v6;		/* AF_INET6 routing socket */
int ifsock_v4 = -1;		/* IPv4 socket for ioctls */
int ifsock_v6 = -1;		/* IPv6 socket for ioctls */
static int lsock_v4;		/* Listen socket to detect mpathd */
static int lsock_v6;		/* Listen socket to detect mpathd */
static int mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

/*
 * Time when initifs was last run, in ms as returned by getcurrenttime().
 * run_timeouts() compares against this to force a periodic rescan.
 */
static uint_t last_initifs_time;
static char **argv0;		/* Saved for re-exec on SIGHUP */
boolean_t handle_link_notifications = _B_TRUE;
static int ipRouteEntrySize;	/* Size of IPv4 route entry */
static int ipv6RouteEntrySize;	/* Size of IPv6 route entry */
61
62 static void initlog(void);
63 static void run_timeouts(void);
64 static void initifs(void);
65 static void check_if_removed(struct phyint_instance *pii);
66 static void select_test_ifs(void);
67 static void update_router_list(mib_item_t *item);
68 static void mib_get_constants(mib_item_t *item);
69 static int mibwalk(void (*proc)(mib_item_t *));
70 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
71 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
72 static void router_add_common(int af, char *ifname,
73 struct in6_addr nexthop);
74 static void init_router_targets();
75 static void cleanup(void);
76 static int setup_listener(int af);
77 static void check_config(void);
78 static void check_testconfig(void);
79 static void check_addr_unique(struct phyint_instance *,
80 struct sockaddr_storage *);
81 static void init_host_targets(void);
82 static void dup_host_targets(struct phyint_instance *desired_pii);
83 static void loopback_cmd(int sock, int family);
84 static boolean_t daemonize(void);
85 static int closefunc(void *, int);
86 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
87 static unsigned int process_query(int fd, mi_query_t *miq);
88 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
89 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
90 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
91 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
92 static unsigned int send_result(int fd, unsigned int error, int syserror);
93
94 addrlist_t *localaddrs;
95
96 /*
97 * Return the current time in milliseconds (from an arbitrary reference)
98 * truncated to fit into an int. Truncation is ok since we are interested
99 * only in differences and not the absolute values.
100 */
101 uint_t
getcurrenttime(void)102 getcurrenttime(void)
103 {
104 uint_t cur_time; /* In ms */
105
106 /*
107 * Use of a non-user-adjustable source of time is
108 * required. However millisecond precision is sufficient.
109 * divide by 10^6
110 */
111 cur_time = (uint_t)(gethrtime() / 1000000LL);
112 return (cur_time);
113 }
114
115 uint64_t
getcurrentsec(void)116 getcurrentsec(void)
117 {
118 return (gethrtime() / NANOSEC);
119 }
120
121 /*
122 * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
123 */
124 int
poll_add(int fd)125 poll_add(int fd)
126 {
127 int i;
128 int new_num;
129 struct pollfd *newfds;
130 retry:
131 /* Check if already present */
132 for (i = 0; i < pollfd_num; i++) {
133 if (pollfds[i].fd == fd)
134 return (0);
135 }
136 /* Check for empty spot already present */
137 for (i = 0; i < pollfd_num; i++) {
138 if (pollfds[i].fd == -1) {
139 pollfds[i].fd = fd;
140 return (0);
141 }
142 }
143
144 /* Allocate space for 32 more fds and initialize to -1 */
145 new_num = pollfd_num + 32;
146 newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
147 if (newfds == NULL) {
148 logperror("poll_add: realloc");
149 return (-1);
150 }
151 for (i = pollfd_num; i < new_num; i++) {
152 newfds[i].fd = -1;
153 newfds[i].events = POLLIN;
154 }
155 pollfd_num = new_num;
156 pollfds = newfds;
157 goto retry;
158 }
159
160 /*
161 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
162 */
163 int
poll_remove(int fd)164 poll_remove(int fd)
165 {
166 int i;
167
168 /* Check if already present */
169 for (i = 0; i < pollfd_num; i++) {
170 if (pollfds[i].fd == fd) {
171 pollfds[i].fd = -1;
172 return (0);
173 }
174 }
175 return (-1);
176 }
177
178 /*
179 * Extract information about the phyint instance. If the phyint instance still
180 * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
181 * will use it to detect phyint instances that don't exist any longer and
182 * remove them, from our database of phyint instances.
183 * Return value:
184 * returns true if the phyint instance exists in the kernel,
185 * returns false otherwise
186 */
187 static boolean_t
pii_process(int af,char * name,struct phyint_instance ** pii_p)188 pii_process(int af, char *name, struct phyint_instance **pii_p)
189 {
190 int err;
191 struct phyint_instance *pii;
192 struct phyint_instance *pii_other;
193
194 if (debug & D_PHYINT)
195 logdebug("pii_process(%s %s)\n", AF_STR(af), name);
196
197 pii = phyint_inst_lookup(af, name);
198 if (pii == NULL) {
199 /*
200 * Phyint instance does not exist in our tables,
201 * create new phyint instance
202 */
203 pii = phyint_inst_init_from_k(af, name);
204 } else {
205 /* Phyint exists in our tables */
206 err = phyint_inst_update_from_k(pii);
207
208 switch (err) {
209 case PI_IOCTL_ERROR:
210 /* Some ioctl error. don't change anything */
211 pii->pii_in_use = 1;
212 break;
213
214 case PI_GROUP_CHANGED:
215 case PI_IFINDEX_CHANGED:
216 /*
217 * Interface index or group membership has changed.
218 * Delete the old state and recreate based on the new
219 * state (it may no longer be in a group).
220 */
221 pii_other = phyint_inst_other(pii);
222 if (pii_other != NULL)
223 phyint_inst_delete(pii_other);
224 phyint_inst_delete(pii);
225 pii = phyint_inst_init_from_k(af, name);
226 break;
227
228 case PI_DELETED:
229 /* Phyint instance has disappeared from kernel */
230 pii->pii_in_use = 0;
231 break;
232
233 case PI_OK:
234 /* Phyint instance exists and is fine */
235 pii->pii_in_use = 1;
236 break;
237
238 default:
239 /* Unknown status */
240 logerr("pii_process: Unknown status %d\n", err);
241 break;
242 }
243 }
244
245 *pii_p = pii;
246 if (pii != NULL)
247 return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
248 else
249 return (_B_FALSE);
250 }
251
/*
 * Scan all interfaces to detect changes as well as new and deleted interfaces.
 *
 * The scan proceeds as follows:
 *   1. Record the scan time, free the local address list, and clear the
 *	in-use marks on every phyint instance, logint and group.
 *   2. Fetch the kernel's list of logical interfaces (SIOCGLIFNUM followed
 *	by SIOCGLIFCONF, retried if the list grew in between).
 *   3. For each returned address, update localaddrs, the owning group's
 *	data address list, or the phyint/logint tables as appropriate.
 *   4. Delete whatever was not seen, refresh group state, reselect test
 *	addresses and process link state changes.
 *
 * If fetching the interface list fails, the function returns after step 1
 * without running steps 3 and 4.
 */
static void
initifs()
{
	int i, nlifr;
	int af;
	char *cp;
	char *buf;
	int sockfd;
	uint64_t flags;
	struct lifnum lifn;
	struct lifconf lifc;
	struct lifreq lifreq;
	struct lifreq *lifr;
	struct logint *li;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	struct phyint_group *pg, *next_pg;
	char pi_name[LIFNAMSIZ + 1];

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	last_initifs_time = getcurrenttime();

	/*
	 * Free the existing local address list; we'll build a new list below.
	 */
	addrlist_free(&localaddrs);

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the testaddress is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/*
	 * As above, mark groups so that we can detect IPMP interfaces which
	 * have been removed from the kernel.  Also, delete the group address
	 * list since we'll iteratively recreate it below.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		pg->pg_in_use = _B_FALSE;
		addrlist_free(&pg->pg_addrs);
	}

	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
again:
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface count)");
		return;
	}
	/*
	 * Pad the interface count to detect when additional interfaces have
	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
		logperror("initifs: calloc");
		return;
	}

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		logperror("initifs: ioctl (get interface configuration)");
		free(buf);
		return;
	}

	/*
	 * If every lifr_req slot is taken, then additional interfaces must
	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
	 * Recalculate to make sure we didn't miss any interfaces.
	 */
	nlifr = lifc.lifc_len / sizeof (struct lifreq);
	if (nlifr >= lifn.lifn_count) {
		free(buf);
		goto again;
	}

	/*
	 * Walk through the lifreqs returned by SIOCGLIFCONF, and refresh the
	 * global list of addresses, phyint groups, phyints, and logints.
	 */
	for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
		af = lifr->lifr_addr.ss_family;
		/* Anything that is not AF_INET is queried on the IPv6 socket */
		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);

		/* ENXIO is not logged; any failure skips this address */
		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
			if (errno != ENXIO)
				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
			continue;
		}
		flags = lifreq.lifr_flags;

		/*
		 * If the address is IFF_UP, add it to the local address list.
		 * (We ignore addresses that aren't IFF_UP since another node
		 * might legitimately have that address IFF_UP.)
		 */
		if (flags & IFF_UP) {
			(void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
			    &lifr->lifr_addr);
		}

		/*
		 * If this address is on an IPMP meta-interface, update our
		 * phyint_group information (either by recording that group
		 * still exists or creating a new group), and track what
		 * group the address is part of.
		 */
		if (flags & IFF_IPMP) {
			if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
				if (errno != ENXIO)
					logperror("initifs: ioctl "
					    "(SIOCGLIFGROUPNAME)");
				continue;
			}

			pg = phyint_group_lookup(lifreq.lifr_groupname);
			if (pg == NULL) {
				pg = phyint_group_create(lifreq.lifr_groupname);
				if (pg == NULL) {
					logerr("initifs: cannot create group "
					    "%s\n", lifreq.lifr_groupname);
					continue;
				}
				phyint_group_insert(pg);
			}
			pg->pg_in_use = _B_TRUE;

			/*
			 * Add this to the group's list of data addresses.
			 */
			if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
			    &lifr->lifr_addr)) {
				logerr("initifs: insufficient memory to track "
				    "data address information for %s\n",
				    lifr->lifr_name);
			}
			continue;
		}

		/*
		 * This isn't an address on an IPMP meta-interface, so it's
		 * either on an underlying interface or not related to any
		 * group.  Update our phyint and logint information (via
		 * pii_process() and logint_init_from_k()) -- but first,
		 * convert the logint name to a phyint name so we can call
		 * pii_process().
		 */
		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		if (pii_process(af, pi_name, &pii)) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
			check_addr_unique(pii, &lifr->lifr_addr);
		}
	}
	free(buf);

	/*
	 * Scan for groups, phyints and logints that have disappeared from the
	 * kernel, and delete them.  The next pointer is saved before each
	 * call since the current entry may be removed.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		check_if_removed(pii);
	}

	for (pg = phyint_groups; pg != NULL; pg = next_pg) {
		next_pg = pg->pg_next;
		if (!pg->pg_in_use) {
			phyint_group_delete(pg);
			continue;
		}
		/*
		 * Refresh the group's state.  This is necessary since the
		 * group's state is defined by the set of usable interfaces in
		 * the group, and an interface is considered unusable if all
		 * of its addresses are down.  When an address goes down/up,
		 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
		 */
		phyint_group_refresh_state(pg);
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications.
	 */
	process_link_state_changes();
}
470
471 /*
472 * Check that a given test address is unique across all of the interfaces in a
473 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding
474 * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
475 * Any issues will be reported by check_testconfig().
476 */
477 static void
check_addr_unique(struct phyint_instance * ourpii,struct sockaddr_storage * ss)478 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
479 {
480 struct phyint *pi;
481 struct phyint_group *pg;
482 struct in6_addr addr;
483 struct phyint_instance *pii;
484 struct sockaddr_in *sin;
485
486 if (ss->ss_family == AF_INET) {
487 sin = (struct sockaddr_in *)ss;
488 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
489 } else {
490 assert(ss->ss_family == AF_INET6);
491 addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
492 }
493
494 /*
495 * For anonymous groups, every interface is assumed to be on its own
496 * link, so there is no chance of overlapping addresses.
497 */
498 pg = ourpii->pii_phyint->pi_group;
499 if (pg == phyint_anongroup)
500 return;
501
502 /*
503 * Walk the list of phyint instances in the group and check for test
504 * addresses matching ours. Of course, we skip ourself.
505 */
506 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
507 pii = PHYINT_INSTANCE(pi, ss->ss_family);
508 if (pii == NULL || pii == ourpii ||
509 pii->pii_probe_logint == NULL)
510 continue;
511
512 /*
513 * If this test address is not unique, set the dupaddr bit.
514 */
515 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
516 pii->pii_probe_logint->li_dupaddr = 1;
517 }
518 }
519
520 /*
521 * Stop probing an interface. Called when an interface is offlined.
522 * The probe socket is closed on each interface instance, and the
523 * interface state set to PI_OFFLINE.
524 */
525 void
stop_probing(struct phyint * pi)526 stop_probing(struct phyint *pi)
527 {
528 struct phyint_instance *pii;
529
530 pii = pi->pi_v4;
531 if (pii != NULL) {
532 if (pii->pii_probe_sock != -1)
533 close_probe_socket(pii, _B_TRUE);
534 pii->pii_probe_logint = NULL;
535 }
536
537 pii = pi->pi_v6;
538 if (pii != NULL) {
539 if (pii->pii_probe_sock != -1)
540 close_probe_socket(pii, _B_TRUE);
541 pii->pii_probe_logint = NULL;
542 }
543
544 phyint_chstate(pi, PI_OFFLINE);
545 }
546
547 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
548
549 /*
550 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set.
551 * IFF_UP must also be set so that the associated address can be used as a
552 * source address. Further, we must be able to exchange packets with local
553 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical
554 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
555 */
556 static int
rate_testflags(uint64_t flags)557 rate_testflags(uint64_t flags)
558 {
559 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
560 return (BAD_TESTFLAGS);
561
562 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
563 return (BAD_TESTFLAGS);
564
565 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
566 return (BEST_TESTFLAGS);
567
568 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
569 return (BEST_TESTFLAGS);
570
571 return (OK_TESTFLAGS);
572 }
573
/*
 * Attempt to select a test address for each phyint instance.
 * Call phyint_inst_sockinit() to complete the initializations.
 *
 * After the per-instance selection, interfaces that are PI_FAILED or
 * PI_NOTARGETS but have no probe-enabled instance are checked for repair,
 * the test address configuration is checked (check_testconfig()), and, if
 * any newly enabled instance had no targets, the target lists are
 * populated.
 */
static void
select_test_ifs(void)
{
	struct phyint *pi;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	struct logint *li;
	struct logint *probe_logint;
	boolean_t target_scan_reqd = _B_FALSE;
	int rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection.
	 * The next pointer is saved up front because the current instance
	 * may be deleted below if its socket cannot be initialized.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline should not be probed.
		 * IFF_OFFLINE interfaces should always be PI_OFFLINE
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		} else {
			/*
			 * If something cleared IFF_OFFLINE (e.g., by accident
			 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
			 * inherently racy), the phyint may still be offline.
			 * Just ignore it.
			 */
			if (pii->pii_phyint->pi_state == PI_OFFLINE)
				continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip 0.0.0.0 addresses, as those are never
			 * actually usable.
			 */
			if (pii->pii_af == AF_INET &&
			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
				continue;

			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 * (This also covers the case where no usable test address
		 * was found at all, i.e. probe_logint is NULL.)
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address; zero out the probe
			 * stats array since it is no longer relevant.
			 * Optimize by checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			/* Socket setup failed: drop this instance entirely */
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 */
		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Scan the interface list for any interfaces that are PI_FAILED or
	 * PI_NOTARGETS but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link state indicates that
	 * the interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
		    (pi->pi_state == PI_FAILED ||
		    pi->pi_state == PI_NOTARGETS)) {
			phyint_check_for_repair(pi);
		}
	}

	check_testconfig();

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}
786
787 /*
788 * Check test address configuration, and log notices/errors if appropriate.
789 * Note that this function only logs pre-existing conditions (e.g., that
790 * probe-based failure detection is disabled).
791 */
792 static void
check_testconfig(void)793 check_testconfig(void)
794 {
795 struct phyint *pi;
796 struct logint *li;
797 char abuf[INET6_ADDRSTRLEN];
798 int pri;
799
800 for (pi = phyints; pi != NULL; pi = pi->pi_next) {
801 if (pi->pi_flags & IFF_OFFLINE)
802 continue;
803
804 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
805 if (pi->pi_taddrmsg_printed ||
806 pi->pi_duptaddrmsg_printed) {
807 if (pi->pi_duptaddrmsg_printed)
808 pri = LOG_ERR;
809 else
810 pri = LOG_INFO;
811 logmsg(pri, "Test address now configured on "
812 "interface %s; enabling probe-based "
813 "failure detection on it\n", pi->pi_name);
814 pi->pi_taddrmsg_printed = 0;
815 pi->pi_duptaddrmsg_printed = 0;
816 }
817 continue;
818 }
819
820 li = NULL;
821 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
822 pi->pi_v4->pii_probe_logint->li_dupaddr)
823 li = pi->pi_v4->pii_probe_logint;
824
825 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
826 pi->pi_v6->pii_probe_logint->li_dupaddr)
827 li = pi->pi_v6->pii_probe_logint;
828
829 if (li != NULL && li->li_dupaddr) {
830 if (pi->pi_duptaddrmsg_printed)
831 continue;
832 logerr("Test address %s is not unique in group; "
833 "disabling probe-based failure detection on %s\n",
834 pr_addr(li->li_phyint_inst->pii_af,
835 li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
836 pi->pi_duptaddrmsg_printed = 1;
837 continue;
838 }
839
840 if (getcurrentsec() < pi->pi_taddrthresh)
841 continue;
842
843 if (!pi->pi_taddrmsg_printed) {
844 logtrace("No test address configured on interface %s; "
845 "disabling probe-based failure detection on it\n",
846 pi->pi_name);
847 pi->pi_taddrmsg_printed = 1;
848 }
849 }
850 }
851
852 /*
853 * Check phyint group configuration, to detect any inconsistencies,
854 * and log an error message. This is called from runtimeouts every
855 * 20 secs. But the error message is displayed once. If the
856 * consistency is resolved by the admin, a recovery message is displayed
857 * once.
858 */
859 static void
check_config(void)860 check_config(void)
861 {
862 struct phyint_group *pg;
863 struct phyint *pi;
864 boolean_t v4_in_group;
865 boolean_t v6_in_group;
866
867 /*
868 * All phyints of a group must be homogeneous to ensure that they can
869 * take over for one another. If any phyint in a group has IPv4
870 * plumbed, check that all phyints have IPv4 plumbed. Do a similar
871 * check for IPv6.
872 */
873 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
874 if (pg == phyint_anongroup)
875 continue;
876
877 v4_in_group = _B_FALSE;
878 v6_in_group = _B_FALSE;
879 /*
880 * 1st pass. Determine if at least 1 phyint in the group
881 * has IPv4 plumbed and if so set v4_in_group to true.
882 * Repeat similarly for IPv6.
883 */
884 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
885 if (pi->pi_v4 != NULL)
886 v4_in_group = _B_TRUE;
887 if (pi->pi_v6 != NULL)
888 v6_in_group = _B_TRUE;
889 }
890
891 /*
892 * 2nd pass. If v4_in_group is true, check that phyint
893 * has IPv4 plumbed. Repeat similarly for IPv6. Print
894 * out a message the 1st time only.
895 */
896 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
897 if (pi->pi_flags & IFF_OFFLINE)
898 continue;
899
900 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
901 if (!pi->pi_cfgmsg_printed) {
902 logerr("IP interface %s in group %s is"
903 " not plumbed for IPv4, affecting"
904 " IPv4 connectivity\n",
905 pi->pi_name,
906 pi->pi_group->pg_name);
907 pi->pi_cfgmsg_printed = 1;
908 }
909 } else if (v6_in_group == _B_TRUE &&
910 pi->pi_v6 == NULL) {
911 if (!pi->pi_cfgmsg_printed) {
912 logerr("IP interface %s in group %s is"
913 " not plumbed for IPv6, affecting"
914 " IPv6 connectivity\n",
915 pi->pi_name,
916 pi->pi_group->pg_name);
917 pi->pi_cfgmsg_printed = 1;
918 }
919 } else {
920 /*
921 * The phyint matches the group configuration,
922 * if we have reached this point. If it was
923 * improperly configured earlier, log an
924 * error recovery message
925 */
926 if (pi->pi_cfgmsg_printed) {
927 logerr("IP interface %s is now"
928 " consistent with group %s "
929 " and connectivity is restored\n",
930 pi->pi_name, pi->pi_group->pg_name);
931 pi->pi_cfgmsg_printed = 0;
932 }
933 }
934
935 }
936 }
937 }
938
/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
 * that can be passed for the delay parameter of timer_schedule().
 */
/* Currently scheduled timeout, in ms on the getcurrenttime() clock */
static uint_t timer_next;
/* SIGALRM has not yet occurred; set by timer_schedule() when arming */
static boolean_t timer_active = _B_FALSE;
950
951 static void
timer_init(void)952 timer_init(void)
953 {
954 timer_next = getcurrenttime() + TIMER_INFINITY;
955 /*
956 * The call to run_timeouts() will get the timer started
957 * Since there are no phyints at this point, the timer will
958 * be set for IF_SCAN_INTERVAL ms.
959 */
960 run_timeouts();
961 }
962
963 /*
964 * Make sure the next SIGALRM occurs delay milliseconds from the current
965 * time if not earlier. We are interested only in time differences.
966 */
967 void
timer_schedule(uint_t delay)968 timer_schedule(uint_t delay)
969 {
970 uint_t now;
971 struct itimerval itimerval;
972
973 if (debug & D_TIMER)
974 logdebug("timer_schedule(%u)\n", delay);
975
976 assert(delay <= TIMER_INFINITY);
977
978 now = getcurrenttime();
979 if (delay == 0) {
980 /* Minimum allowed delay */
981 delay = 1;
982 }
983 /* Will this timer occur before the currently scheduled SIGALRM? */
984 if (timer_active && TIME_GE(now + delay, timer_next)) {
985 if (debug & D_TIMER) {
986 logdebug("timer_schedule(%u) - no action: "
987 "now %u next %u\n", delay, now, timer_next);
988 }
989 return;
990 }
991 timer_next = now + delay;
992
993 itimerval.it_value.tv_sec = delay / 1000;
994 itimerval.it_value.tv_usec = (delay % 1000) * 1000;
995 itimerval.it_interval.tv_sec = 0;
996 itimerval.it_interval.tv_usec = 0;
997 if (debug & D_TIMER) {
998 logdebug("timer_schedule(%u): sec %ld usec %ld\n",
999 delay, itimerval.it_value.tv_sec,
1000 itimerval.it_value.tv_usec);
1001 }
1002 timer_active = _B_TRUE;
1003 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1004 logperror("timer_schedule: setitimer");
1005 exit(2);
1006 }
1007 }
1008
1009 static void
timer_cancel(void)1010 timer_cancel(void)
1011 {
1012 struct itimerval itimerval;
1013
1014 if (debug & D_TIMER)
1015 logdebug("timer_cancel()\n");
1016
1017 bzero(&itimerval, sizeof (itimerval));
1018 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0)
1019 logperror("timer_cancel: setitimer");
1020 }
1021
1022 /*
1023 * Timer has fired. Determine when the next timer event will occur by asking
1024 * all the timer routines. Should not be called from a timer routine.
1025 */
1026 static void
run_timeouts(void)1027 run_timeouts(void)
1028 {
1029 uint_t next;
1030 uint_t next_event_time;
1031 struct phyint_instance *pii;
1032 struct phyint_instance *next_pii;
1033 static boolean_t timeout_running;
1034
1035 /* assert that recursive timeouts don't happen. */
1036 assert(!timeout_running);
1037
1038 timeout_running = _B_TRUE;
1039
1040 if (debug & D_TIMER)
1041 logdebug("run_timeouts()\n");
1042
1043 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1044 initifs();
1045 check_config();
1046 }
1047
1048 next = TIMER_INFINITY;
1049
1050 for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1051 next_pii = pii->pii_next;
1052 next_event_time = phyint_inst_timer(pii);
1053 if (next_event_time != TIMER_INFINITY && next_event_time < next)
1054 next = next_event_time;
1055
1056 if (debug & D_TIMER) {
1057 logdebug("run_timeouts(%s %s): next scheduled for"
1058 " this phyint inst %u, next scheduled global"
1059 " %u ms\n",
1060 AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1061 next_event_time, next);
1062 }
1063 }
1064
1065 /*
1066 * Make sure initifs() is called at least once every
1067 * IF_SCAN_INTERVAL, to make sure that we are in sync
1068 * with the kernel, in case we have missed any routing
1069 * socket messages.
1070 */
1071 if (next > IF_SCAN_INTERVAL)
1072 next = IF_SCAN_INTERVAL;
1073
1074 if (debug & D_TIMER)
1075 logdebug("run_timeouts: %u ms\n", next);
1076
1077 timer_schedule(next);
1078 timeout_running = _B_FALSE;
1079 }
1080
1081 static int eventpipe_read = -1; /* Used for synchronous signal delivery */
1082 static int eventpipe_write = -1;
1083 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */
1084
1085 /*
1086 * Ensure that signals are processed synchronously with the rest of
1087 * the code by just writing a one character signal number on the pipe.
1088 * The poll loop will pick this up and process the signal event.
1089 */
1090 static void
sig_handler(int signo)1091 sig_handler(int signo)
1092 {
1093 uchar_t buf = (uchar_t)signo;
1094
1095 /*
1096 * Don't write to pipe if cleanup has already begun. cleanup()
1097 * might have closed the pipe already
1098 */
1099 if (cleanup_started)
1100 return;
1101
1102 if (eventpipe_write == -1) {
1103 logerr("sig_handler: no pipe found\n");
1104 return;
1105 }
1106 if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1107 logperror("sig_handler: write");
1108 }
1109
1110 extern struct probes_missed probes_missed;
1111
1112 /*
1113 * Pick up a signal "byte" from the pipe and process it.
1114 */
1115 static void
in_signal(int fd)1116 in_signal(int fd)
1117 {
1118 uchar_t buf;
1119 uint64_t sent, acked, lost, unacked, unknown;
1120 struct phyint_instance *pii;
1121 int pr_ndx;
1122
1123 switch (read(fd, &buf, sizeof (buf))) {
1124 case -1:
1125 logperror("in_signal: read");
1126 exit(1);
1127 /* NOTREACHED */
1128 case 1:
1129 break;
1130 case 0:
1131 logerr("in_signal: read end of file\n");
1132 exit(1);
1133 /* NOTREACHED */
1134 default:
1135 logerr("in_signal: read > 1\n");
1136 exit(1);
1137 }
1138
1139 if (debug & D_TIMER)
1140 logdebug("in_signal() got %d\n", buf);
1141
1142 switch (buf) {
1143 case SIGALRM:
1144 if (debug & D_TIMER) {
1145 uint_t now = getcurrenttime();
1146
1147 logdebug("in_signal(SIGALRM) delta %u\n",
1148 now - timer_next);
1149 }
1150 timer_active = _B_FALSE;
1151 run_timeouts();
1152 break;
1153 case SIGUSR1:
1154 logdebug("Printing configuration:\n");
1155 /* Print out the internal tables */
1156 phyint_inst_print_all();
1157
1158 /*
1159 * Print out the accumulated statistics about missed
1160 * probes (happens due to scheduling delay).
1161 */
1162 logerr("Missed sending total of %d probes spread over"
1163 " %d occurrences\n", probes_missed.pm_nprobes,
1164 probes_missed.pm_ntimes);
1165
1166 /*
1167 * Print out the accumulated statistics about probes
1168 * that were sent.
1169 */
1170 for (pii = phyint_instances; pii != NULL;
1171 pii = pii->pii_next) {
1172 unacked = 0;
1173 acked = pii->pii_cum_stats.acked;
1174 lost = pii->pii_cum_stats.lost;
1175 sent = pii->pii_cum_stats.sent;
1176 unknown = pii->pii_cum_stats.unknown;
1177 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1178 switch (pii->pii_probes[pr_ndx].pr_status) {
1179 case PR_ACKED:
1180 acked++;
1181 break;
1182 case PR_LOST:
1183 lost++;
1184 break;
1185 case PR_UNACKED:
1186 unacked++;
1187 break;
1188 }
1189 }
1190 logerr("\nProbe stats on (%s %s)\n"
1191 "Number of probes sent %lld\n"
1192 "Number of probe acks received %lld\n"
1193 "Number of probes/acks lost %lld\n"
1194 "Number of valid unacknowledged probes %lld\n"
1195 "Number of ambiguous probe acks received %lld\n",
1196 AF_STR(pii->pii_af), pii->pii_name,
1197 sent, acked, lost, unacked, unknown);
1198 }
1199 break;
1200 case SIGHUP:
1201 logerr("SIGHUP: restart and reread config file\n");
1202 /*
1203 * Cancel the interval timer. Needed since setitimer() uses
1204 * alarm() and the time left is inherited across exec(), and
1205 * thus the SIGALRM may be delivered before a handler has been
1206 * setup, causing in.mpathd to erroneously exit.
1207 */
1208 timer_cancel();
1209 cleanup();
1210 (void) execv(argv0[0], argv0);
1211 _exit(0177);
1212 /* NOTREACHED */
1213 case SIGINT:
1214 case SIGTERM:
1215 case SIGQUIT:
1216 cleanup();
1217 exit(0);
1218 /* NOTREACHED */
1219 default:
1220 logerr("in_signal: unknown signal: %d\n", buf);
1221 }
1222 }
1223
1224 static void
cleanup(void)1225 cleanup(void)
1226 {
1227 struct phyint_instance *pii;
1228 struct phyint_instance *next_pii;
1229
1230 /*
1231 * Make sure that we don't write to eventpipe in
1232 * sig_handler() if any signal notably SIGALRM,
1233 * occurs after we close the eventpipe descriptor below
1234 */
1235 cleanup_started = _B_TRUE;
1236
1237 for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1238 next_pii = pii->pii_next;
1239 phyint_inst_delete(pii);
1240 }
1241
1242 (void) close(ifsock_v4);
1243 (void) close(ifsock_v6);
1244 (void) close(rtsock_v4);
1245 (void) close(rtsock_v6);
1246 (void) close(lsock_v4);
1247 (void) close(lsock_v6);
1248 (void) close(0);
1249 (void) close(1);
1250 (void) close(2);
1251 (void) close(mibfd);
1252 (void) close(eventpipe_read);
1253 (void) close(eventpipe_write);
1254 }
1255
1256 /*
1257 * Create pipe for signal delivery and set up signal handlers.
1258 */
1259 static void
setup_eventpipe(void)1260 setup_eventpipe(void)
1261 {
1262 int fds[2];
1263 struct sigaction act;
1264
1265 if ((pipe(fds)) < 0) {
1266 logperror("setup_eventpipe: pipe");
1267 exit(1);
1268 }
1269 eventpipe_read = fds[0];
1270 eventpipe_write = fds[1];
1271 if (poll_add(eventpipe_read) == -1) {
1272 exit(1);
1273 }
1274
1275 act.sa_handler = sig_handler;
1276 act.sa_flags = SA_RESTART;
1277 (void) sigaction(SIGALRM, &act, NULL);
1278
1279 (void) sigset(SIGHUP, sig_handler);
1280 (void) sigset(SIGUSR1, sig_handler);
1281 (void) sigset(SIGTERM, sig_handler);
1282 (void) sigset(SIGINT, sig_handler);
1283 (void) sigset(SIGQUIT, sig_handler);
1284 }
1285
1286 /*
1287 * Create a routing socket for receiving RTM_IFINFO messages.
1288 */
1289 static int
setup_rtsock(int af)1290 setup_rtsock(int af)
1291 {
1292 int s;
1293 int flags;
1294 int aware = RTAW_UNDER_IPMP;
1295
1296 s = socket(PF_ROUTE, SOCK_RAW, af);
1297 if (s == -1) {
1298 logperror("setup_rtsock: socket PF_ROUTE");
1299 exit(1);
1300 }
1301
1302 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
1303 logperror("setup_rtsock: setsockopt RT_AWARE");
1304 (void) close(s);
1305 exit(1);
1306 }
1307
1308 if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
1309 logperror("setup_rtsock: fcntl F_GETFL");
1310 (void) close(s);
1311 exit(1);
1312 }
1313 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) {
1314 logperror("setup_rtsock: fcntl F_SETFL");
1315 (void) close(s);
1316 exit(1);
1317 }
1318 if (poll_add(s) == -1) {
1319 (void) close(s);
1320 exit(1);
1321 }
1322 return (s);
1323 }
1324
1325 /*
1326 * Process an RTM_IFINFO message received on a routing socket.
1327 * The return value indicates whether a full interface scan is required.
1328 * Link up/down notifications are reflected in the IFF_RUNNING flag.
1329 * If just the state of the IFF_RUNNING interface flag has changed, a
1330 * a full interface scan isn't required.
1331 */
1332 static boolean_t
process_rtm_ifinfo(if_msghdr_t * ifm,int type)1333 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1334 {
1335 struct sockaddr_dl *sdl;
1336 struct phyint *pi;
1337 uint64_t old_flags;
1338 struct phyint_instance *pii;
1339
1340 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1341
1342 /*
1343 * Although the sockaddr_dl structure is directly after the
1344 * if_msghdr_t structure. At the time of writing, the size of the
1345 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1346 * to the presence of a timeval structure, which contains longs,
1347 * in the if_data structure. Anyway, we know where the message ends,
1348 * so we work backwards to get the start of the sockaddr_dl structure.
1349 */
1350 /*LINTED*/
1351 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1352 sizeof (struct sockaddr_dl));
1353
1354 assert(sdl->sdl_family == AF_LINK);
1355
1356 /*
1357 * The interface name is in sdl_data.
1358 * RTM_IFINFO messages are only generated for logical interface
1359 * zero, so there is no colon and logical interface number to
1360 * strip from the name. The name is not null terminated, but
1361 * there should be enough space in sdl_data to add the null.
1362 */
1363 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1364 if (debug & D_LINKNOTE)
1365 logdebug("process_rtm_ifinfo: phyint name too long\n");
1366 return (_B_TRUE);
1367 }
1368 sdl->sdl_data[sdl->sdl_nlen] = 0;
1369
1370 pi = phyint_lookup(sdl->sdl_data);
1371 if (pi == NULL) {
1372 if (debug & D_LINKNOTE)
1373 logdebug("process_rtm_ifinfo: phyint lookup failed"
1374 " for %s\n", sdl->sdl_data);
1375 return (_B_TRUE);
1376 }
1377
1378 /*
1379 * We want to try and avoid doing a full interface scan for
1380 * link state notifications from the datalink layer, as indicated
1381 * by the state of the IFF_RUNNING flag. If just the
1382 * IFF_RUNNING flag has changed state, the link state changes
1383 * are processed without a full scan.
1384 * If there is both an IPv4 and IPv6 instance associated with
1385 * the physical interface, we will get an RTM_IFINFO message
1386 * for each instance. If we just maintained a single copy of
1387 * the physical interface flags, it would appear that no flags
1388 * had changed when the second message is processed, leading us
1389 * to believe that the message wasn't generated by a flags change,
1390 * and that a full interface scan is required.
1391 * To get around this problem, two additional copies of the flags
1392 * are kept, one copy for each instance. These are only used in
1393 * this routine. At any one time, all three copies of the flags
1394 * should be identical except for the IFF_RUNNING flag. The
1395 * copy of the flags in the "phyint" structure is always up to
1396 * date.
1397 */
1398 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1399 if (pii == NULL) {
1400 if (debug & D_LINKNOTE)
1401 logdebug("process_rtm_ifinfo: no instance of address "
1402 "family %s for %s\n", AF_STR(type), pi->pi_name);
1403 return (_B_TRUE);
1404 }
1405
1406 old_flags = pii->pii_flags;
1407 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1408 pi->pi_flags = pii->pii_flags;
1409
1410 if (debug & D_LINKNOTE) {
1411 logdebug("process_rtm_ifinfo: %s address family: %s, "
1412 "old flags: %llx, new flags: %llx\n", pi->pi_name,
1413 AF_STR(type), old_flags, pi->pi_flags);
1414 }
1415
1416 /*
1417 * If IFF_STANDBY has changed, indicate that the interface has changed
1418 * types and refresh IFF_INACTIVE if need be.
1419 */
1420 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) {
1421 phyint_changed(pi);
1422 if (pii->pii_flags & IFF_STANDBY)
1423 phyint_standby_refresh_inactive(pi);
1424 }
1425
1426 /* Has just the IFF_RUNNING flag changed state ? */
1427 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1428 struct phyint_instance *pii_other;
1429 /*
1430 * It wasn't just a link state change. Update
1431 * the other instance's copy of the flags.
1432 */
1433 pii_other = phyint_inst_other(pii);
1434 if (pii_other != NULL)
1435 pii_other->pii_flags = pii->pii_flags;
1436 return (_B_TRUE);
1437 }
1438
1439 return (_B_FALSE);
1440 }
1441
1442 /*
1443 * Retrieve as many routing socket messages as possible, and try to
1444 * empty the routing sockets. Initiate full scan of targets or interfaces
1445 * as needed.
1446 * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1447 * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1448 */
1449 static void
process_rtsock(int rtsock_v4,int rtsock_v6)1450 process_rtsock(int rtsock_v4, int rtsock_v6)
1451 {
1452 int nbytes;
1453 int64_t msg[2048 / 8];
1454 struct rt_msghdr *rtm;
1455 boolean_t need_if_scan = _B_FALSE;
1456 boolean_t need_rt_scan = _B_FALSE;
1457 boolean_t rtm_ifinfo_seen = _B_FALSE;
1458 int type;
1459
1460 /* Read as many messages as possible and try to empty the sockets */
1461 for (type = AF_INET; ; type = AF_INET6) {
1462 for (;;) {
1463 nbytes = read((type == AF_INET) ? rtsock_v4 :
1464 rtsock_v6, msg, sizeof (msg));
1465 if (nbytes <= 0) {
1466 /* No more messages */
1467 break;
1468 }
1469 rtm = (struct rt_msghdr *)msg;
1470 if (rtm->rtm_version != RTM_VERSION) {
1471 logerr("process_rtsock: version %d "
1472 "not understood\n", rtm->rtm_version);
1473 break;
1474 }
1475
1476 if (debug & D_PHYINT) {
1477 logdebug("process_rtsock: message %d\n",
1478 rtm->rtm_type);
1479 }
1480
1481 switch (rtm->rtm_type) {
1482 case RTM_NEWADDR:
1483 case RTM_DELADDR:
1484 /*
1485 * Some logical interface has changed,
1486 * have to scan everything to determine
1487 * what actually changed.
1488 */
1489 need_if_scan = _B_TRUE;
1490 break;
1491
1492 case RTM_IFINFO:
1493 rtm_ifinfo_seen = _B_TRUE;
1494 need_if_scan |= process_rtm_ifinfo(
1495 (if_msghdr_t *)rtm, type);
1496 break;
1497
1498 case RTM_ADD:
1499 case RTM_DELETE:
1500 case RTM_CHANGE:
1501 case RTM_OLDADD:
1502 case RTM_OLDDEL:
1503 need_rt_scan = _B_TRUE;
1504 break;
1505
1506 default:
1507 /* Not interesting */
1508 break;
1509 }
1510 }
1511 if (type == AF_INET6)
1512 break;
1513 }
1514
1515 if (need_if_scan) {
1516 if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1517 logdebug("process_rtsock: synchronizing with kernel\n");
1518 initifs();
1519 } else if (rtm_ifinfo_seen) {
1520 if (debug & D_LINKNOTE)
1521 logdebug("process_rtsock: "
1522 "link up/down notification(s) seen\n");
1523 process_link_state_changes();
1524 }
1525
1526 if (need_rt_scan)
1527 init_router_targets();
1528 }
1529
1530 /*
1531 * Look if the phyint instance or one of its logints have been removed from
1532 * the kernel and take appropriate action.
1533 * Uses {pii,li}_in_use.
1534 */
1535 static void
check_if_removed(struct phyint_instance * pii)1536 check_if_removed(struct phyint_instance *pii)
1537 {
1538 struct logint *li;
1539 struct logint *next_li;
1540
1541 /* Detect phyints that have been removed from the kernel. */
1542 if (!pii->pii_in_use) {
1543 logtrace("%s %s has been removed from kernel\n",
1544 AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1545 phyint_inst_delete(pii);
1546 } else {
1547 /* Detect logints that have been removed. */
1548 for (li = pii->pii_logint; li != NULL; li = next_li) {
1549 next_li = li->li_next;
1550 if (!li->li_in_use) {
1551 logint_delete(li);
1552 }
1553 }
1554 }
1555 }
1556
1557 /*
1558 * Parse the supplied mib2 information to extract the routing information
1559 * table. Process the routing table to get the list of known onlink routers
1560 * and update our database. These onlink routers will serve as probe
1561 * targets.
1562 */
1563 static void
update_router_list(mib_item_t * item)1564 update_router_list(mib_item_t *item)
1565 {
1566 for (; item != NULL; item = item->mi_next) {
1567 if (item->mi_opthdr.name == 0)
1568 continue;
1569 if (item->mi_opthdr.level == MIB2_IP &&
1570 item->mi_opthdr.name == MIB2_IP_ROUTE) {
1571 ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp,
1572 item->mi_opthdr.len);
1573 } else if (item->mi_opthdr.level == MIB2_IP6 &&
1574 item->mi_opthdr.name == MIB2_IP6_ROUTE) {
1575 ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp,
1576 item->mi_opthdr.len);
1577 }
1578 }
1579 }
1580
1581
1582 /*
1583 * Convert octet `octp' to a phyint name and store in `ifname'
1584 */
1585 static void
oct2ifname(const Octet_t * octp,char * ifname,size_t ifsize)1586 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
1587 {
1588 char *cp;
1589 size_t len = MIN(octp->o_length, ifsize - 1);
1590
1591 (void) strncpy(ifname, octp->o_bytes, len);
1592 ifname[len] = '\0';
1593
1594 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
1595 *cp = '\0';
1596 }
1597
1598 /*
1599 * Examine the IPv4 routing table `buf' for possible targets. For each
1600 * possible target, if it's on the same subnet an interface route, pass
1601 * it to router_add_common() for further consideration.
1602 */
1603 static void
ire_process_v4(mib2_ipRouteEntry_t * buf,size_t len)1604 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1605 {
1606 char ifname[LIFNAMSIZ];
1607 mib2_ipRouteEntry_t *rp, *rp1, *endp;
1608 struct in_addr nexthop_v4;
1609 struct in6_addr nexthop;
1610
1611 if (debug & D_TARGET)
1612 logdebug("ire_process_v4(len %d)\n", len);
1613
1614 if (len == 0)
1615 return;
1616
1617 assert((len % ipRouteEntrySize) == 0);
1618 endp = buf + (len / ipRouteEntrySize);
1619
1620 /*
1621 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1622 * cross-reference them with the interface routes to determine if
1623 * they're possible probe targets.
1624 */
1625 for (rp = buf; rp < endp; rp++) {
1626 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1627 continue;
1628
1629 /* Get the nexthop address. */
1630 nexthop_v4.s_addr = rp->ipRouteNextHop;
1631
1632 /*
1633 * Rescan the routing table looking for interface routes that
1634 * are on the same subnet, and try to add them. If they're
1635 * not relevant (e.g., the interface route isn't part of an
1636 * IPMP group, router_add_common() will discard).
1637 */
1638 for (rp1 = buf; rp1 < endp; rp1++) {
1639 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
1640 rp1->ipRouteIfIndex.o_length == 0)
1641 continue;
1642
1643 if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
1644 (nexthop_v4.s_addr & rp1->ipRouteMask))
1645 continue;
1646
1647 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
1648 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1649 router_add_common(AF_INET, ifname, nexthop);
1650 }
1651 }
1652 }
1653
1654 void
router_add_common(int af,char * ifname,struct in6_addr nexthop)1655 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1656 {
1657 struct phyint_instance *pii;
1658 struct phyint *pi;
1659
1660 if (debug & D_TARGET)
1661 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1662
1663 /*
1664 * Retrieve the phyint instance; bail if it's not known to us yet.
1665 */
1666 pii = phyint_inst_lookup(af, ifname);
1667 if (pii == NULL)
1668 return;
1669
1670 /*
1671 * Don't use our own addresses as targets.
1672 */
1673 if (own_address(nexthop))
1674 return;
1675
1676 /*
1677 * If the phyint is part a named group, then add the address to all
1678 * members of the group; note that this is suboptimal in the IPv4 case
1679 * as it has already been added to all matching interfaces in
1680 * ire_process_v4(). Otherwise, add the address only to the phyint
1681 * itself, since other phyints in the anongroup may not be on the same
1682 * subnet.
1683 */
1684 pi = pii->pii_phyint;
1685 if (pi->pi_group == phyint_anongroup) {
1686 target_add(pii, nexthop, _B_TRUE);
1687 } else {
1688 pi = pi->pi_group->pg_phyint;
1689 for (; pi != NULL; pi = pi->pi_pgnext)
1690 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1691 }
1692 }
1693
1694 /*
1695 * Examine the IPv6 routing table `buf' for possible link-local targets, and
1696 * pass any contenders to router_add_common() for further consideration.
1697 */
1698 static void
ire_process_v6(mib2_ipv6RouteEntry_t * buf,size_t len)1699 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1700 {
1701 struct lifreq lifr;
1702 char ifname[LIFNAMSIZ];
1703 char grname[LIFGRNAMSIZ];
1704 mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
1705 struct in6_addr nexthop_v6;
1706
1707 if (debug & D_TARGET)
1708 logdebug("ire_process_v6(len %d)\n", len);
1709
1710 if (len == 0)
1711 return;
1712
1713 assert((len % ipv6RouteEntrySize) == 0);
1714 endp = buf + (len / ipv6RouteEntrySize);
1715
1716 /*
1717 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1718 * cross-reference them with the interface routes to determine if
1719 * they're possible probe targets.
1720 */
1721 for (rp = buf; rp < endp; rp++) {
1722 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
1723 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
1724 continue;
1725
1726 /* Get the nexthop address. */
1727 nexthop_v6 = rp->ipv6RouteNextHop;
1728
1729 /*
1730 * The interface name should always exist for link-locals;
1731 * we use it to map this entry to an IPMP group name.
1732 */
1733 if (rp->ipv6RouteIfIndex.o_length == 0)
1734 continue;
1735
1736 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
1737 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
1738 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
1739 continue;
1740 }
1741
1742 /*
1743 * Rescan the list of routes for interface routes, and add the
1744 * above target to any interfaces in the same IPMP group.
1745 */
1746 for (rp1 = buf; rp1 < endp; rp1++) {
1747 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
1748 rp1->ipv6RouteIfIndex.o_length == 0) {
1749 continue;
1750 }
1751 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
1752 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
1753
1754 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
1755 strcmp(lifr.lifr_groupname, grname) == 0) {
1756 router_add_common(AF_INET6, ifname, nexthop_v6);
1757 }
1758 }
1759 }
1760 }
1761
1762 /*
1763 * Build a list of target routers, by scanning the routing tables.
1764 * It is assumed that interface routes exist, to reach the routers.
1765 */
1766 static void
init_router_targets(void)1767 init_router_targets(void)
1768 {
1769 struct target *tg;
1770 struct target *next_tg;
1771 struct phyint_instance *pii;
1772 struct phyint *pi;
1773
1774 if (force_mcast)
1775 return;
1776
1777 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1778 pi = pii->pii_phyint;
1779 /*
1780 * Set tg_in_use to false only for router targets.
1781 */
1782 if (!pii->pii_targets_are_routers)
1783 continue;
1784
1785 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1786 tg->tg_in_use = 0;
1787 }
1788
1789 if (mibwalk(update_router_list) == -1)
1790 exit(1);
1791
1792 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1793 pi = pii->pii_phyint;
1794 if (!pii->pii_targets_are_routers)
1795 continue;
1796
1797 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1798 next_tg = tg->tg_next;
1799 /*
1800 * If the group has failed, it's likely the route was
1801 * removed by an application affected by that failure.
1802 * In that case, we keep the target so that we can
1803 * reliably repair, at which point we'll refresh the
1804 * target list again.
1805 */
1806 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
1807 target_delete(tg);
1808 }
1809 }
1810 }
1811
1812 /*
1813 * Attempt to assign host targets to any interfaces that do not currently
1814 * have probe targets by sharing targets with other interfaces in the group.
1815 */
1816 static void
init_host_targets(void)1817 init_host_targets(void)
1818 {
1819 struct phyint_instance *pii;
1820 struct phyint_group *pg;
1821
1822 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1823 pg = pii->pii_phyint->pi_group;
1824 if (pg != phyint_anongroup && pii->pii_targets == NULL)
1825 dup_host_targets(pii);
1826 }
1827 }
1828
1829 /*
1830 * Duplicate host targets from other phyints of the group to
1831 * the phyint instance 'desired_pii'.
1832 */
1833 static void
dup_host_targets(struct phyint_instance * desired_pii)1834 dup_host_targets(struct phyint_instance *desired_pii)
1835 {
1836 int af;
1837 struct phyint *pi;
1838 struct phyint_instance *pii;
1839 struct target *tg;
1840
1841 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
1842
1843 af = desired_pii->pii_af;
1844
1845 /*
1846 * For every phyint in the same group as desired_pii, check if
1847 * it has any host targets. If so add them to desired_pii.
1848 */
1849 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
1850 pii = PHYINT_INSTANCE(pi, af);
1851 /*
1852 * We know that we don't have targets on this phyint instance
1853 * since we have been called. But we still check for
1854 * pii_targets_are_routers because another phyint instance
1855 * could have router targets, since IFF_NOFAILOVER addresses
1856 * on different phyint instances may belong to different
1857 * subnets.
1858 */
1859 if ((pii == NULL) || (pii == desired_pii) ||
1860 pii->pii_targets_are_routers)
1861 continue;
1862 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1863 target_create(desired_pii, tg->tg_address, _B_FALSE);
1864 }
1865 }
1866 }
1867
/*
 * Write the one-line usage message for command `cmd' to stderr.
 */
static void
usage(char *cmd)
{
	FILE *errstream = stderr;

	(void) fprintf(errstream, "usage: %s\n", cmd);
}
1873
1874
1875 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd"
1876
1877 /* Get an option from the /etc/default/mpathd file */
1878 static char *
getdefault(char * name)1879 getdefault(char *name)
1880 {
1881 char namebuf[BUFSIZ];
1882 char *value = NULL;
1883
1884 if (defopen(MPATHD_DEFAULT_FILE) == 0) {
1885 char *cp;
1886 int flags;
1887
1888 /*
1889 * ignore case
1890 */
1891 flags = defcntl(DC_GETFLAGS, 0);
1892 TURNOFF(flags, DC_CASE);
1893 (void) defcntl(DC_SETFLAGS, flags);
1894
1895 /* Add "=" to the name */
1896 (void) strncpy(namebuf, name, sizeof (namebuf) - 2);
1897 (void) strncat(namebuf, "=", 2);
1898
1899 if ((cp = defread(namebuf)) != NULL)
1900 value = strdup(cp);
1901
1902 /* close */
1903 (void) defopen((char *)NULL);
1904 }
1905 return (value);
1906 }
1907
1908
1909 /*
1910 * Command line options below
1911 */
1912 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */
1913 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */
1914 static boolean_t adopt = _B_FALSE;
1915 static boolean_t foreground = _B_FALSE;
1916
1917 int
main(int argc,char * argv[])1918 main(int argc, char *argv[])
1919 {
1920 int i;
1921 int c;
1922 struct phyint *pi;
1923 struct phyint_instance *pii;
1924 char *value;
1925
1926 argv0 = argv; /* Saved for re-exec on SIGHUP */
1927 srandom(gethostid()); /* Initialize the random number generator */
1928
1929 /*
1930 * NOTE: The messages output by in.mpathd are not suitable for
1931 * translation, so we do not call textdomain().
1932 */
1933 (void) setlocale(LC_ALL, "");
1934
1935 /*
1936 * Get the user specified value of 'failure detection time'
1937 * from /etc/default/mpathd
1938 */
1939 value = getdefault("FAILURE_DETECTION_TIME");
1940 if (value != NULL) {
1941 user_failure_detection_time =
1942 (int)strtol((char *)value, NULL, 0);
1943
1944 if (user_failure_detection_time <= 0) {
1945 user_failure_detection_time = FAILURE_DETECTION_TIME;
1946 logerr("Invalid failure detection time %s, assuming "
1947 "default of %d ms\n", value,
1948 user_failure_detection_time);
1949
1950 } else if (user_failure_detection_time <
1951 MIN_FAILURE_DETECTION_TIME) {
1952 user_failure_detection_time =
1953 MIN_FAILURE_DETECTION_TIME;
1954 logerr("Too small failure detection time of %s, "
1955 "assuming minimum of %d ms\n", value,
1956 user_failure_detection_time);
1957 }
1958 free(value);
1959 } else {
1960 /* User has not specified the parameter, Use default value */
1961 user_failure_detection_time = FAILURE_DETECTION_TIME;
1962 }
1963
1964 /*
1965 * This gives the frequency at which probes will be sent.
1966 * When fdt ms elapses, we should be able to determine
1967 * whether 5 consecutive probes have failed or not.
1968 * 1 probe will be sent in every user_probe_interval ms,
1969 * randomly anytime in the (0.5 - 1.0) 2nd half of every
1970 * user_probe_interval. Thus when we send out probe 'n' we
1971 * can be sure that probe 'n - 2' is lost, if we have not
1972 * got the ack. (since the probe interval is > crtt). But
1973 * probe 'n - 1' may be a valid unacked probe, since the
1974 * time between 2 successive probes could be as small as
1975 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2
1976 */
1977 user_probe_interval = user_failure_detection_time /
1978 (NUM_PROBE_FAILS + 2);
1979
1980 /*
1981 * Get the user specified value of failback_enabled from
1982 * /etc/default/mpathd
1983 */
1984 value = getdefault("FAILBACK");
1985 if (value != NULL) {
1986 if (strcasecmp(value, "yes") == 0)
1987 failback_enabled = _B_TRUE;
1988 else if (strcasecmp(value, "no") == 0)
1989 failback_enabled = _B_FALSE;
1990 else
1991 logerr("Invalid value for FAILBACK %s\n", value);
1992 free(value);
1993 } else {
1994 failback_enabled = _B_TRUE;
1995 }
1996
1997 /*
1998 * Get the user specified value of track_all_phyints from
1999 * /etc/default/mpathd. The sense is reversed in
2000 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2001 */
2002 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2003 if (value != NULL) {
2004 if (strcasecmp(value, "yes") == 0)
2005 track_all_phyints = _B_FALSE;
2006 else if (strcasecmp(value, "no") == 0)
2007 track_all_phyints = _B_TRUE;
2008 else
2009 logerr("Invalid value for "
2010 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2011 free(value);
2012 } else {
2013 track_all_phyints = _B_FALSE;
2014 }
2015
2016 while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2017 switch (c) {
2018 case 'a':
2019 adopt = _B_TRUE;
2020 break;
2021 case 'm':
2022 force_mcast = _B_TRUE;
2023 break;
2024 case 'd':
2025 debug = D_ALL;
2026 foreground = _B_TRUE;
2027 break;
2028 case 'D':
2029 i = (int)strtol(optarg, NULL, 0);
2030 if (i == 0) {
2031 (void) fprintf(stderr, "Bad debug flags: %s\n",
2032 optarg);
2033 exit(1);
2034 }
2035 debug |= i;
2036 foreground = _B_TRUE;
2037 break;
2038 case 'l':
2039 /*
2040 * Turn off link state notification handling.
2041 * Undocumented command line flag, for debugging
2042 * purposes.
2043 */
2044 handle_link_notifications = _B_FALSE;
2045 break;
2046 default:
2047 usage(argv[0]);
2048 exit(1);
2049 }
2050 }
2051
2052 /*
2053 * The sockets for the loopback command interface should be listening
2054 * before we fork and exit in daemonize(). This way, whoever started us
2055 * can use the loopback interface as soon as they get a zero exit
2056 * status.
2057 */
2058 lsock_v4 = setup_listener(AF_INET);
2059 lsock_v6 = setup_listener(AF_INET6);
2060
2061 if (lsock_v4 < 0 && lsock_v6 < 0) {
2062 logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2063 exit(1);
2064 }
2065
2066 if (!foreground) {
2067 if (!daemonize()) {
2068 logerr("cannot daemonize\n");
2069 exit(EXIT_FAILURE);
2070 }
2071 initlog();
2072 }
2073
2074 /*
2075 * Initializations:
2076 * 1. Create ifsock* sockets. These are used for performing SIOC*
2077 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2078 * 2. Initialize a pipe for handling/recording signal events.
2079 * 3. Create the routing sockets, used for listening
2080 * to routing / interface changes.
2081 * 4. phyint_init() - Initialize physical interface state
2082 * (in mpd_tables.c). Must be done before creating interfaces,
2083 * which timer_init() does indirectly.
2084 * 5. Query kernel for route entry sizes (v4 and v6).
2085 * 6. timer_init() - Initialize timer related stuff
2086 * 7. initifs() - Initialize our database of all known interfaces
2087 * 8. init_router_targets() - Initialize our database of all known
2088 * router targets.
2089 */
2090 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2091 if (ifsock_v4 < 0) {
2092 logperror("main: IPv4 socket open");
2093 exit(1);
2094 }
2095
2096 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2097 if (ifsock_v6 < 0) {
2098 logperror("main: IPv6 socket open");
2099 exit(1);
2100 }
2101
2102 setup_eventpipe();
2103
2104 rtsock_v4 = setup_rtsock(AF_INET);
2105 rtsock_v6 = setup_rtsock(AF_INET6);
2106
2107 if (phyint_init() == -1) {
2108 logerr("cannot initialize physical interface structures");
2109 exit(1);
2110 }
2111
2112 if (mibwalk(mib_get_constants) == -1)
2113 exit(1);
2114
2115 timer_init();
2116
2117 initifs();
2118
2119 /*
2120 * If we're operating in "adopt" mode and no interfaces need to be
2121 * tracked, shut down (ifconfig(1M) will restart us on demand if
2122 * interfaces are subsequently put into multipathing groups).
2123 */
2124 if (adopt && phyint_instances == NULL)
2125 exit(0);
2126
2127 /*
2128 * Main body. Keep listening for activity on any of the sockets
2129 * that we are monitoring and take appropriate action as necessary.
2130 * signals are also handled synchronously.
2131 */
2132 for (;;) {
2133 if (poll(pollfds, pollfd_num, -1) < 0) {
2134 if (errno == EINTR)
2135 continue;
2136 logperror("main: poll");
2137 exit(1);
2138 }
2139 for (i = 0; i < pollfd_num; i++) {
2140 if ((pollfds[i].fd == -1) ||
2141 !(pollfds[i].revents & POLLIN))
2142 continue;
2143 if (pollfds[i].fd == eventpipe_read) {
2144 in_signal(eventpipe_read);
2145 break;
2146 }
2147 if (pollfds[i].fd == rtsock_v4 ||
2148 pollfds[i].fd == rtsock_v6) {
2149 process_rtsock(rtsock_v4, rtsock_v6);
2150 break;
2151 }
2152
2153 for (pii = phyint_instances; pii != NULL;
2154 pii = pii->pii_next) {
2155 if (pollfds[i].fd == pii->pii_probe_sock) {
2156 if (pii->pii_af == AF_INET)
2157 in_data(pii);
2158 else
2159 in6_data(pii);
2160 break;
2161 }
2162 }
2163
2164 for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2165 if (pi->pi_notes != 0 &&
2166 pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
2167 (void) dlpi_recv(pi->pi_dh, NULL, NULL,
2168 NULL, NULL, 0, NULL);
2169 break;
2170 }
2171 }
2172
2173 if (pollfds[i].fd == lsock_v4)
2174 loopback_cmd(lsock_v4, AF_INET);
2175 else if (pollfds[i].fd == lsock_v6)
2176 loopback_cmd(lsock_v6, AF_INET6);
2177 }
2178 }
2179 /* NOTREACHED */
2180 return (EXIT_SUCCESS);
2181 }
2182
2183 static int
setup_listener(int af)2184 setup_listener(int af)
2185 {
2186 int sock;
2187 int on;
2188 int len;
2189 int ret;
2190 struct sockaddr_storage laddr;
2191 struct sockaddr_in *sin;
2192 struct sockaddr_in6 *sin6;
2193 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2194
2195 assert(af == AF_INET || af == AF_INET6);
2196
2197 sock = socket(af, SOCK_STREAM, 0);
2198 if (sock < 0) {
2199 logperror("setup_listener: socket");
2200 exit(1);
2201 }
2202
2203 on = 1;
2204 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2205 sizeof (on)) < 0) {
2206 logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2207 exit(1);
2208 }
2209
2210 bzero(&laddr, sizeof (laddr));
2211 laddr.ss_family = af;
2212
2213 if (af == AF_INET) {
2214 sin = (struct sockaddr_in *)&laddr;
2215 sin->sin_port = htons(MPATHD_PORT);
2216 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2217 len = sizeof (struct sockaddr_in);
2218 } else {
2219 sin6 = (struct sockaddr_in6 *)&laddr;
2220 sin6->sin6_port = htons(MPATHD_PORT);
2221 sin6->sin6_addr = loopback_addr;
2222 len = sizeof (struct sockaddr_in6);
2223 }
2224
2225 ret = bind(sock, (struct sockaddr *)&laddr, len);
2226 if (ret < 0) {
2227 if (errno == EADDRINUSE) {
2228 /*
2229 * Another instance of mpathd may be already active.
2230 */
2231 logerr("main: is another instance of in.mpathd "
2232 "already active?\n");
2233 exit(1);
2234 } else {
2235 (void) close(sock);
2236 return (-1);
2237 }
2238 }
2239 if (listen(sock, 30) < 0) {
2240 logperror("main: listen");
2241 exit(1);
2242 }
2243 if (poll_add(sock) == -1) {
2244 (void) close(sock);
2245 exit(1);
2246 }
2247
2248 return (sock);
2249 }
2250
/*
 * Table of commands and their expected size; used by loopback_cmd().
 *
 * loopback_cmd() indexes this table directly with the command id it read
 * from the peer (after rejecting ids >= MI_NCMD), so the position of each
 * entry is significant.
 * NOTE(review): the entry order presumably mirrors the numeric values of the
 * MI_* command ids; confirm against their definition before reordering.
 */
static struct {
	const char	*name;	/* command name, used in log messages */
	unsigned int	size;	/* fewest bytes a valid command must have */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2263
2264 /*
2265 * Commands received over the loopback interface come here (via libipmp).
2266 */
2267 static void
loopback_cmd(int sock,int family)2268 loopback_cmd(int sock, int family)
2269 {
2270 int newfd;
2271 ssize_t len;
2272 boolean_t is_priv = _B_FALSE;
2273 struct sockaddr_storage peer;
2274 struct sockaddr_in *peer_sin;
2275 struct sockaddr_in6 *peer_sin6;
2276 socklen_t peerlen;
2277 union mi_commands mpi;
2278 char abuf[INET6_ADDRSTRLEN];
2279 uint_t cmd;
2280 int retval;
2281
2282 peerlen = sizeof (peer);
2283 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2284 if (newfd < 0) {
2285 logperror("loopback_cmd: accept");
2286 return;
2287 }
2288
2289 switch (family) {
2290 case AF_INET:
2291 /*
2292 * Validate the address and port to make sure that
2293 * non privileged processes don't connect and start
2294 * talking to us.
2295 */
2296 if (peerlen != sizeof (struct sockaddr_in)) {
2297 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2298 (void) close(newfd);
2299 return;
2300 }
2301 peer_sin = (struct sockaddr_in *)&peer;
2302 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
2303 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2304 abuf, sizeof (abuf));
2305
2306 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
2307 logerr("Attempt to connect from addr %s port %d\n",
2308 abuf, ntohs(peer_sin->sin_port));
2309 (void) close(newfd);
2310 return;
2311 }
2312 break;
2313
2314 case AF_INET6:
2315 if (peerlen != sizeof (struct sockaddr_in6)) {
2316 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2317 (void) close(newfd);
2318 return;
2319 }
2320 /*
2321 * Validate the address and port to make sure that
2322 * non privileged processes don't connect and start
2323 * talking to us.
2324 */
2325 peer_sin6 = (struct sockaddr_in6 *)&peer;
2326 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
2327 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2328 sizeof (abuf));
2329 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
2330 logerr("Attempt to connect from addr %s port %d\n",
2331 abuf, ntohs(peer_sin6->sin6_port));
2332 (void) close(newfd);
2333 return;
2334 }
2335
2336 default:
2337 logdebug("loopback_cmd: family %d\n", family);
2338 (void) close(newfd);
2339 return;
2340 }
2341
2342 /*
2343 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2344 * all supported commands
2345 */
2346 len = read(newfd, &mpi, sizeof (mpi));
2347
2348 /*
2349 * In theory, we can receive any sized message for a stream socket,
2350 * but we don't expect that to happen for a small message over a
2351 * loopback connection.
2352 */
2353 if (len < sizeof (uint32_t)) {
2354 logerr("loopback_cmd: bad command format or read returns "
2355 "partial data %d\n", len);
2356 (void) close(newfd);
2357 return;
2358 }
2359
2360 cmd = mpi.mi_command;
2361 if (cmd >= MI_NCMD) {
2362 logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2363 (void) close(newfd);
2364 return;
2365 }
2366
2367 /*
2368 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2369 */
2370 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
2371 logerr("Unprivileged request from %s for privileged "
2372 "command %s\n", abuf, commands[cmd].name);
2373 (void) close(newfd);
2374 return;
2375 }
2376
2377 if (len < commands[cmd].size) {
2378 logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2379 commands[cmd].name, commands[cmd].size, len);
2380 (void) close(newfd);
2381 return;
2382 }
2383
2384 retval = process_cmd(newfd, &mpi);
2385 if (retval != IPMP_SUCCESS) {
2386 logerr("failed processing %s: %s\n", commands[cmd].name,
2387 ipmp_errmsg(retval));
2388 }
2389 (void) close(newfd);
2390 }
2391
2392 /*
2393 * Process the commands received via libipmp.
2394 */
2395 static unsigned int
process_cmd(int newfd,union mi_commands * mpi)2396 process_cmd(int newfd, union mi_commands *mpi)
2397 {
2398 struct phyint *pi;
2399 struct mi_offline *mio;
2400 struct mi_undo_offline *miu;
2401 unsigned int retval;
2402
2403 switch (mpi->mi_command) {
2404 case MI_PING:
2405 return (send_result(newfd, IPMP_SUCCESS, 0));
2406
2407 case MI_OFFLINE:
2408 mio = &mpi->mi_ocmd;
2409
2410 pi = phyint_lookup(mio->mio_ifname);
2411 if (pi == NULL)
2412 return (send_result(newfd, IPMP_EUNKIF, 0));
2413
2414 retval = phyint_offline(pi, mio->mio_min_redundancy);
2415 if (retval == IPMP_FAILURE)
2416 return (send_result(newfd, IPMP_FAILURE, errno));
2417
2418 return (send_result(newfd, retval, 0));
2419
2420 case MI_UNDO_OFFLINE:
2421 miu = &mpi->mi_ucmd;
2422
2423 pi = phyint_lookup(miu->miu_ifname);
2424 if (pi == NULL)
2425 return (send_result(newfd, IPMP_EUNKIF, 0));
2426
2427 retval = phyint_undo_offline(pi);
2428 if (retval == IPMP_FAILURE)
2429 return (send_result(newfd, IPMP_FAILURE, errno));
2430
2431 return (send_result(newfd, retval, 0));
2432
2433 case MI_QUERY:
2434 return (process_query(newfd, &mpi->mi_qcmd));
2435
2436 default:
2437 break;
2438 }
2439
2440 return (send_result(newfd, IPMP_EPROTO, 0));
2441 }
2442
2443 /*
2444 * Process the query request pointed to by `miq' and send a reply on file
2445 * descriptor `fd'. Returns an IPMP error code.
2446 */
2447 static unsigned int
process_query(int fd,mi_query_t * miq)2448 process_query(int fd, mi_query_t *miq)
2449 {
2450 ipmp_addrinfo_t *adinfop;
2451 ipmp_addrinfolist_t *adlp;
2452 ipmp_groupinfo_t *grinfop;
2453 ipmp_groupinfolist_t *grlp;
2454 ipmp_grouplist_t *grlistp;
2455 ipmp_ifinfo_t *ifinfop;
2456 ipmp_ifinfolist_t *iflp;
2457 ipmp_snap_t *snap;
2458 unsigned int retval;
2459
2460 switch (miq->miq_inforeq) {
2461 case IPMP_ADDRINFO:
2462 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
2463 &adinfop);
2464 if (retval != IPMP_SUCCESS)
2465 return (send_result(fd, retval, errno));
2466
2467 retval = send_result(fd, IPMP_SUCCESS, 0);
2468 if (retval == IPMP_SUCCESS)
2469 retval = send_addrinfo(fd, adinfop);
2470
2471 ipmp_freeaddrinfo(adinfop);
2472 return (retval);
2473
2474 case IPMP_GROUPLIST:
2475 retval = getgrouplist(&grlistp);
2476 if (retval != IPMP_SUCCESS)
2477 return (send_result(fd, retval, errno));
2478
2479 retval = send_result(fd, IPMP_SUCCESS, 0);
2480 if (retval == IPMP_SUCCESS)
2481 retval = send_grouplist(fd, grlistp);
2482
2483 ipmp_freegrouplist(grlistp);
2484 return (retval);
2485
2486 case IPMP_GROUPINFO:
2487 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2488 retval = getgroupinfo(miq->miq_grname, &grinfop);
2489 if (retval != IPMP_SUCCESS)
2490 return (send_result(fd, retval, errno));
2491
2492 retval = send_result(fd, IPMP_SUCCESS, 0);
2493 if (retval == IPMP_SUCCESS)
2494 retval = send_groupinfo(fd, grinfop);
2495
2496 ipmp_freegroupinfo(grinfop);
2497 return (retval);
2498
2499 case IPMP_IFINFO:
2500 miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2501 retval = getifinfo(miq->miq_ifname, &ifinfop);
2502 if (retval != IPMP_SUCCESS)
2503 return (send_result(fd, retval, errno));
2504
2505 retval = send_result(fd, IPMP_SUCCESS, 0);
2506 if (retval == IPMP_SUCCESS)
2507 retval = send_ifinfo(fd, ifinfop);
2508
2509 ipmp_freeifinfo(ifinfop);
2510 return (retval);
2511
2512 case IPMP_SNAP:
2513 /*
2514 * Before taking the snapshot, sync with the kernel.
2515 */
2516 initifs();
2517
2518 retval = getsnap(&snap);
2519 if (retval != IPMP_SUCCESS)
2520 return (send_result(fd, retval, errno));
2521
2522 retval = send_result(fd, IPMP_SUCCESS, 0);
2523 if (retval != IPMP_SUCCESS)
2524 goto out;
2525
2526 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2527 if (retval != IPMP_SUCCESS)
2528 goto out;
2529
2530 retval = send_grouplist(fd, snap->sn_grlistp);
2531 if (retval != IPMP_SUCCESS)
2532 goto out;
2533
2534 iflp = snap->sn_ifinfolistp;
2535 for (; iflp != NULL; iflp = iflp->ifl_next) {
2536 retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2537 if (retval != IPMP_SUCCESS)
2538 goto out;
2539 }
2540
2541 grlp = snap->sn_grinfolistp;
2542 for (; grlp != NULL; grlp = grlp->grl_next) {
2543 retval = send_groupinfo(fd, grlp->grl_grinfop);
2544 if (retval != IPMP_SUCCESS)
2545 goto out;
2546 }
2547
2548 adlp = snap->sn_adinfolistp;
2549 for (; adlp != NULL; adlp = adlp->adl_next) {
2550 retval = send_addrinfo(fd, adlp->adl_adinfop);
2551 if (retval != IPMP_SUCCESS)
2552 goto out;
2553 }
2554 out:
2555 ipmp_snap_free(snap);
2556 return (retval);
2557
2558 default:
2559 break;
2560
2561 }
2562 return (send_result(fd, IPMP_EPROTO, 0));
2563 }
2564
2565 /*
2566 * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2567 * Returns an IPMP error code.
2568 */
2569 static unsigned int
send_groupinfo(int fd,ipmp_groupinfo_t * grinfop)2570 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2571 {
2572 ipmp_iflist_t *iflistp = grinfop->gr_iflistp;
2573 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp;
2574 unsigned int retval;
2575
2576 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2577 if (retval != IPMP_SUCCESS)
2578 return (retval);
2579
2580 retval = ipmp_writetlv(fd, IPMP_IFLIST,
2581 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
2582 if (retval != IPMP_SUCCESS)
2583 return (retval);
2584
2585 return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2586 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
2587 }
2588
2589 /*
2590 * Send the interface information pointed to by `ifinfop' on file descriptor
2591 * `fd'. Returns an IPMP error code.
2592 */
2593 static unsigned int
send_ifinfo(int fd,ipmp_ifinfo_t * ifinfop)2594 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2595 {
2596 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp;
2597 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp;
2598 unsigned int retval;
2599
2600 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop);
2601 if (retval != IPMP_SUCCESS)
2602 return (retval);
2603
2604 retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
2605 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
2606 if (retval != IPMP_SUCCESS)
2607 return (retval);
2608
2609 return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2610 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
2611 }
2612
2613 /*
2614 * Send the address information pointed to by `adinfop' on file descriptor
2615 * `fd'. Returns an IPMP error code.
2616 */
2617 static unsigned int
send_addrinfo(int fd,ipmp_addrinfo_t * adinfop)2618 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
2619 {
2620 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
2621 }
2622
2623 /*
2624 * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2625 * Returns an IPMP error code.
2626 */
2627 static unsigned int
send_grouplist(int fd,ipmp_grouplist_t * grlistp)2628 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2629 {
2630 return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2631 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2632 }
2633
2634 /*
2635 * Initialize an mi_result_t structure using `error' and `syserror' and
2636 * send it on file descriptor `fd'. Returns an IPMP error code.
2637 */
2638 static unsigned int
send_result(int fd,unsigned int error,int syserror)2639 send_result(int fd, unsigned int error, int syserror)
2640 {
2641 mi_result_t me;
2642
2643 me.me_mpathd_error = error;
2644 if (error == IPMP_FAILURE)
2645 me.me_sys_error = syserror;
2646 else
2647 me.me_sys_error = 0;
2648
2649 return (ipmp_write(fd, &me, sizeof (me)));
2650 }
2651
2652 /*
2653 * Daemonize the process.
2654 */
2655 static boolean_t
daemonize(void)2656 daemonize(void)
2657 {
2658 switch (fork()) {
2659 case -1:
2660 return (_B_FALSE);
2661
2662 case 0:
2663 /*
2664 * Lose our controlling terminal, and become both a session
2665 * leader and a process group leader.
2666 */
2667 if (setsid() == -1)
2668 return (_B_FALSE);
2669
2670 /*
2671 * Under POSIX, a session leader can accidentally (through
2672 * open(2)) acquire a controlling terminal if it does not
2673 * have one. Just to be safe, fork() again so we are not a
2674 * session leader.
2675 */
2676 switch (fork()) {
2677 case -1:
2678 return (_B_FALSE);
2679
2680 case 0:
2681 (void) chdir("/");
2682 (void) umask(022);
2683 (void) fdwalk(closefunc, NULL);
2684 break;
2685
2686 default:
2687 _exit(EXIT_SUCCESS);
2688 }
2689 break;
2690
2691 default:
2692 _exit(EXIT_SUCCESS);
2693 }
2694
2695 return (_B_TRUE);
2696 }
2697
2698 /*
2699 * The parent has created some fds before forking on purpose, keep them open.
2700 */
2701 static int
closefunc(void * not_used,int fd)2702 closefunc(void *not_used, int fd)
2703 /* ARGSUSED */
2704 {
2705 if (fd != lsock_v4 && fd != lsock_v6)
2706 (void) close(fd);
2707 return (0);
2708 }
2709
2710 /* LOGGER */
2711
2712 #include <syslog.h>
2713
2714 /*
2715 * Logging routines. All routines log to syslog, unless the daemon is
2716 * running in the foreground, in which case the logging goes to stderr.
2717 *
2718 * The following routines are available:
2719 *
2720 * logdebug(): A printf-like function for outputting debug messages
2721 * (messages at LOG_DEBUG) that are only of use to developers.
2722 *
2723 * logtrace(): A printf-like function for outputting tracing messages
2724 * (messages at LOG_INFO) from the daemon. This is typically used
2725 * to log the receipt of interesting network-related conditions.
2726 *
2727 * logerr(): A printf-like function for outputting error messages
2728 * (messages at LOG_ERR) from the daemon.
2729 *
2730 * logperror*(): A set of functions used to output error messages
2731 * (messages at LOG_ERR); these automatically append strerror(errno)
2732 * and a newline to the message passed to them.
2733 *
2734 * NOTE: since the logging functions write to syslog, the messages passed
2735 * to them are not eligible for localization. Thus, gettext() must
2736 * *not* be used.
2737 */
2738
/*
 * Nonzero once initlog() has been called; the logging routines below then
 * write to syslog instead of stderr.
 */
static int logging = 0;
2740
2741 static void
initlog(void)2742 initlog(void)
2743 {
2744 logging++;
2745 openlog("in.mpathd", LOG_PID, LOG_DAEMON);
2746 }
2747
2748 /* PRINTFLIKE2 */
2749 void
logmsg(int pri,const char * fmt,...)2750 logmsg(int pri, const char *fmt, ...)
2751 {
2752 va_list ap;
2753
2754 va_start(ap, fmt);
2755
2756 if (logging)
2757 vsyslog(pri, fmt, ap);
2758 else
2759 (void) vfprintf(stderr, fmt, ap);
2760 va_end(ap);
2761 }
2762
2763 /* PRINTFLIKE1 */
2764 void
logperror(const char * str)2765 logperror(const char *str)
2766 {
2767 if (logging)
2768 syslog(LOG_ERR, "%s: %m\n", str);
2769 else
2770 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
2771 }
2772
2773 void
logperror_pii(struct phyint_instance * pii,const char * str)2774 logperror_pii(struct phyint_instance *pii, const char *str)
2775 {
2776 if (logging) {
2777 syslog(LOG_ERR, "%s (%s %s): %m\n",
2778 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
2779 } else {
2780 (void) fprintf(stderr, "%s (%s %s): %s\n",
2781 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
2782 strerror(errno));
2783 }
2784 }
2785
2786 void
logperror_li(struct logint * li,const char * str)2787 logperror_li(struct logint *li, const char *str)
2788 {
2789 struct phyint_instance *pii = li->li_phyint_inst;
2790
2791 if (logging) {
2792 syslog(LOG_ERR, "%s (%s %s): %m\n",
2793 str, AF_STR(pii->pii_af), li->li_name);
2794 } else {
2795 (void) fprintf(stderr, "%s (%s %s): %s\n",
2796 str, AF_STR(pii->pii_af), li->li_name,
2797 strerror(errno));
2798 }
2799 }
2800
2801 void
close_probe_socket(struct phyint_instance * pii,boolean_t polled)2802 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
2803 {
2804 if (polled)
2805 (void) poll_remove(pii->pii_probe_sock);
2806 (void) close(pii->pii_probe_sock);
2807 pii->pii_probe_sock = -1;
2808 pii->pii_basetime_inited = 0;
2809 }
2810
2811 boolean_t
addrlist_add(addrlist_t ** addrsp,const char * name,uint64_t flags,struct sockaddr_storage * ssp)2812 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
2813 struct sockaddr_storage *ssp)
2814 {
2815 addrlist_t *addrp;
2816
2817 if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
2818 return (_B_FALSE);
2819
2820 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
2821 addrp->al_flags = flags;
2822 addrp->al_addr = *ssp;
2823 addrp->al_next = *addrsp;
2824 *addrsp = addrp;
2825 return (_B_TRUE);
2826 }
2827
2828 void
addrlist_free(addrlist_t ** addrsp)2829 addrlist_free(addrlist_t **addrsp)
2830 {
2831 addrlist_t *addrp, *next_addrp;
2832
2833 for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
2834 next_addrp = addrp->al_next;
2835 free(addrp);
2836 }
2837 *addrsp = NULL;
2838 }
2839
2840 /*
2841 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
2842 * tables defined by mib2.h. Pass the table information returned to the
2843 * supplied function.
2844 */
2845 static int
mibwalk(void (* proc)(mib_item_t *))2846 mibwalk(void (*proc)(mib_item_t *))
2847 {
2848 mib_item_t *head_item = NULL;
2849 mib_item_t *last_item = NULL;
2850 mib_item_t *tmp;
2851 struct strbuf ctlbuf, databuf;
2852 int flags;
2853 int rval;
2854 uintptr_t buf[512 / sizeof (uintptr_t)];
2855 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)buf;
2856 struct T_optmgmt_ack *toa = (struct T_optmgmt_ack *)buf;
2857 struct T_error_ack *tea = (struct T_error_ack *)buf;
2858 struct opthdr *req, *optp;
2859 int status = -1;
2860
2861 if (mibfd == -1) {
2862 if ((mibfd = open("/dev/ip", O_RDWR)) < 0) {
2863 logperror("mibwalk(): ip open");
2864 return (status);
2865 }
2866 }
2867
2868 tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2869 tor->OPT_offset = sizeof (struct T_optmgmt_req);
2870 tor->OPT_length = sizeof (struct opthdr);
2871 tor->MGMT_flags = T_CURRENT;
2872
2873 /*
2874 * Note: we use the special level value below so that IP will return
2875 * us information concerning IRE_MARK_TESTHIDDEN routes.
2876 */
2877 req = (struct opthdr *)&tor[1];
2878 req->level = EXPER_IP_AND_ALL_IRES;
2879 req->name = 0;
2880 req->len = 0;
2881
2882 ctlbuf.buf = (char *)&buf;
2883 ctlbuf.len = tor->OPT_length + tor->OPT_offset;
2884
2885 if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) {
2886 logperror("mibwalk(): putmsg(ctl)");
2887 return (status);
2888 }
2889
2890 /*
2891 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
2892 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains
2893 * a control and data part. The control part contains a struct
2894 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
2895 * the level, name and length of the data in the data part. The
2896 * data part contains the actual table data. The last message
2897 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
2898 * single option with zero optlen.
2899 */
2900 for (;;) {
2901 errno = flags = 0;
2902 ctlbuf.maxlen = sizeof (buf);
2903 rval = getmsg(mibfd, &ctlbuf, NULL, &flags);
2904 if (rval & MORECTL || rval < 0) {
2905 if (errno == EINTR)
2906 continue;
2907 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
2908 rval, errno);
2909 goto error;
2910 }
2911 if (ctlbuf.len < sizeof (t_scalar_t)) {
2912 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len);
2913 goto error;
2914 }
2915
2916 switch (toa->PRIM_type) {
2917 case T_ERROR_ACK:
2918 if (ctlbuf.len < sizeof (struct T_error_ack)) {
2919 logerr("mibwalk(): T_ERROR_ACK ctlbuf "
2920 "too short: %d\n", ctlbuf.len);
2921 goto error;
2922 }
2923 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
2924 " UNIX_err = 0x%lx\n", tea->TLI_error,
2925 t_strerror(tea->TLI_error), tea->UNIX_error);
2926 goto error;
2927
2928 case T_OPTMGMT_ACK:
2929 optp = (struct opthdr *)&toa[1];
2930 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
2931 sizeof (struct opthdr))) {
2932 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
2933 "short: %d\n", ctlbuf.len);
2934 goto error;
2935 }
2936 if (toa->MGMT_flags != T_SUCCESS) {
2937 logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
2938 "0x%lx\n", toa->MGMT_flags);
2939 goto error;
2940 }
2941 break;
2942
2943 default:
2944 goto error;
2945 }
2946 /* The following assert also implies MGMT_flags == T_SUCCESS */
2947 assert(toa->PRIM_type == T_OPTMGMT_ACK);
2948
2949 /*
2950 * We have reached the end of this T_OPTMGMT_ACK
2951 * message. If this is the last message i.e EOD,
2952 * break, else process the next T_OPTMGMT_ACK msg.
2953 */
2954 if (rval == 0) {
2955 if (optp->len == 0 && optp->name == 0 &&
2956 optp->level == 0) {
2957 /* This is the EOD message. */
2958 break;
2959 }
2960 /* Not EOD but no data to retrieve */
2961 continue;
2962 }
2963
2964 /*
2965 * We should only be here if MOREDATA was set.
2966 * Allocate an empty mib_item_t and link into the list
2967 * of MIB items.
2968 */
2969 if ((tmp = malloc(sizeof (*tmp))) == NULL) {
2970 logperror("mibwalk(): malloc() failed.");
2971 goto error;
2972 }
2973 if (last_item != NULL)
2974 last_item->mi_next = tmp;
2975 else
2976 head_item = tmp;
2977 last_item = tmp;
2978 last_item->mi_next = NULL;
2979 last_item->mi_opthdr = *optp;
2980 last_item->mi_valp = malloc(optp->len);
2981 if (last_item->mi_valp == NULL) {
2982 logperror("mibwalk(): malloc() failed.");
2983 goto error;
2984 }
2985
2986 databuf.maxlen = last_item->mi_opthdr.len;
2987 databuf.buf = (char *)last_item->mi_valp;
2988 databuf.len = 0;
2989
2990 /* Retrieve the actual MIB data */
2991 for (;;) {
2992 flags = 0;
2993 if ((rval = getmsg(mibfd, NULL, &databuf,
2994 &flags)) != 0) {
2995 if (rval < 0 && errno == EINTR)
2996 continue;
2997 /*
2998 * We shouldn't get MOREDATA here so treat that
2999 * as an error.
3000 */
3001 logperror("mibwalk(): getmsg(data)");
3002 goto error;
3003 }
3004 break;
3005 }
3006 }
3007 status = 0;
3008 /* Pass the accumulated MIB data to the supplied function pointer */
3009 (*proc)(head_item);
3010 error:
3011 while (head_item != NULL) {
3012 tmp = head_item;
3013 head_item = tmp->mi_next;
3014 free(tmp->mi_valp);
3015 free(tmp);
3016 }
3017 return (status);
3018 }
3019
3020 /*
3021 * Parse the supplied mib2 information to get the size of routing table
3022 * entries. This is needed when running in a branded zone where the
3023 * Solaris application environment and the Solaris kernel may not be the
3024 * the same release version.
3025 */
3026 static void
mib_get_constants(mib_item_t * item)3027 mib_get_constants(mib_item_t *item)
3028 {
3029 mib2_ip_t *ipv4;
3030 mib2_ipv6IfStatsEntry_t *ipv6;
3031
3032 for (; item != NULL; item = item->mi_next) {
3033 if (item->mi_opthdr.name != 0)
3034 continue;
3035 if (item->mi_opthdr.level == MIB2_IP) {
3036 ipv4 = (mib2_ip_t *)item->mi_valp;
3037 ipRouteEntrySize = ipv4->ipRouteEntrySize;
3038 } else if (item->mi_opthdr.level == MIB2_IP6) {
3039 ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp;
3040 ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize;
3041 }
3042 }
3043 }
3044