1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
26 */
27
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30
int debug = 0;				/* Debug flag; tested as a bitmask of D_* values */
static int pollfd_num = 0;		/* Num. of poll descriptors (slots in pollfds[]) */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors; fd == -1 marks a free slot */
/* All times below in ms */
int user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int user_probe_interval;		/* derived from user specified fdt */
38
/*
 * Structure to store mib2 information returned by the kernel.
 * This is used to process routing table information.
 * Items are chained into a singly linked list through mi_next.
 */
typedef struct mib_item_s {
	struct mib_item_s	*mi_next;	/* next item in the list */
	struct opthdr		mi_opthdr;	/* option header for this item */
	void			*mi_valp;	/* presumably the data described */
						/* by mi_opthdr -- TODO confirm */
} mib_item_t;
48
static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

static uint_t	last_initifs_time;	/* Time when initifs was last run */
					/* (a getcurrenttime() value, in ms) */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
boolean_t handle_link_notifications = _B_TRUE;	/* NOTE(review): presumably */
					/* enables link up/down handling; */
					/* confirm against its users */
static int	ipRouteEntrySize;	/* Size of IPv4 route entry */
static int	ipv6RouteEntrySize;	/* Size of IPv6 route entry */
63
64 static void initlog(void);
65 static void run_timeouts(void);
66 static void initifs(void);
67 static void check_if_removed(struct phyint_instance *pii);
68 static void select_test_ifs(void);
69 static void update_router_list(mib_item_t *item);
70 static void mib_get_constants(mib_item_t *item);
71 static int mibwalk(void (*proc)(mib_item_t *));
72 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
73 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
74 static void router_add_common(int af, char *ifname,
75 struct in6_addr nexthop);
76 static void init_router_targets();
77 static void cleanup(void);
78 static int setup_listener(int af);
79 static void check_config(void);
80 static void check_testconfig(void);
81 static void check_addr_unique(struct phyint_instance *,
82 struct sockaddr_storage *);
83 static void init_host_targets(void);
84 static void dup_host_targets(struct phyint_instance *desired_pii);
85 static void loopback_cmd(int sock, int family);
86 static boolean_t daemonize(void);
87 static int closefunc(void *, int);
88 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
89 static unsigned int process_query(int fd, mi_query_t *miq);
90 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
91 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
92 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
93 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
94 static unsigned int send_result(int fd, unsigned int error, int syserror);
95
/*
 * List of local addresses that were IFF_UP at the last interface scan;
 * initifs() frees and rebuilds it on every pass.
 */
addrlist_t	*localaddrs;
97
98 /*
99 * Return the current time in milliseconds (from an arbitrary reference)
100 * truncated to fit into an int. Truncation is ok since we are interested
101 * only in differences and not the absolute values.
102 */
103 uint_t
getcurrenttime(void)104 getcurrenttime(void)
105 {
106 uint_t cur_time; /* In ms */
107
108 /*
109 * Use of a non-user-adjustable source of time is
110 * required. However millisecond precision is sufficient.
111 * divide by 10^6
112 */
113 cur_time = (uint_t)(gethrtime() / 1000000LL);
114 return (cur_time);
115 }
116
117 uint64_t
getcurrentsec(void)118 getcurrentsec(void)
119 {
120 return (gethrtime() / NANOSEC);
121 }
122
123 /*
124 * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
125 */
126 int
poll_add(int fd)127 poll_add(int fd)
128 {
129 int i;
130 int new_num;
131 struct pollfd *newfds;
132 retry:
133 /* Check if already present */
134 for (i = 0; i < pollfd_num; i++) {
135 if (pollfds[i].fd == fd)
136 return (0);
137 }
138 /* Check for empty spot already present */
139 for (i = 0; i < pollfd_num; i++) {
140 if (pollfds[i].fd == -1) {
141 pollfds[i].fd = fd;
142 return (0);
143 }
144 }
145
146 /* Allocate space for 32 more fds and initialize to -1 */
147 new_num = pollfd_num + 32;
148 newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
149 if (newfds == NULL) {
150 logperror("poll_add: realloc");
151 return (-1);
152 }
153 for (i = pollfd_num; i < new_num; i++) {
154 newfds[i].fd = -1;
155 newfds[i].events = POLLIN;
156 }
157 pollfd_num = new_num;
158 pollfds = newfds;
159 goto retry;
160 }
161
162 /*
163 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
164 */
165 int
poll_remove(int fd)166 poll_remove(int fd)
167 {
168 int i;
169
170 /* Check if already present */
171 for (i = 0; i < pollfd_num; i++) {
172 if (pollfds[i].fd == fd) {
173 pollfds[i].fd = -1;
174 return (0);
175 }
176 }
177 return (-1);
178 }
179
180 /*
181 * Extract information about the phyint instance. If the phyint instance still
182 * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
183 * will use it to detect phyint instances that don't exist any longer and
184 * remove them, from our database of phyint instances.
185 * Return value:
186 * returns true if the phyint instance exists in the kernel,
187 * returns false otherwise
188 */
189 static boolean_t
pii_process(int af,char * name,struct phyint_instance ** pii_p)190 pii_process(int af, char *name, struct phyint_instance **pii_p)
191 {
192 int err;
193 struct phyint_instance *pii;
194 struct phyint_instance *pii_other;
195
196 if (debug & D_PHYINT)
197 logdebug("pii_process(%s %s)\n", AF_STR(af), name);
198
199 pii = phyint_inst_lookup(af, name);
200 if (pii == NULL) {
201 /*
202 * Phyint instance does not exist in our tables,
203 * create new phyint instance
204 */
205 pii = phyint_inst_init_from_k(af, name);
206 } else {
207 /* Phyint exists in our tables */
208 err = phyint_inst_update_from_k(pii);
209
210 switch (err) {
211 case PI_IOCTL_ERROR:
212 /* Some ioctl error. don't change anything */
213 pii->pii_in_use = 1;
214 break;
215
216 case PI_GROUP_CHANGED:
217 case PI_IFINDEX_CHANGED:
218 /*
219 * Interface index or group membership has changed.
220 * Delete the old state and recreate based on the new
221 * state (it may no longer be in a group).
222 */
223 pii_other = phyint_inst_other(pii);
224 if (pii_other != NULL)
225 phyint_inst_delete(pii_other);
226 phyint_inst_delete(pii);
227 pii = phyint_inst_init_from_k(af, name);
228 break;
229
230 case PI_DELETED:
231 /* Phyint instance has disappeared from kernel */
232 pii->pii_in_use = 0;
233 break;
234
235 case PI_OK:
236 /* Phyint instance exists and is fine */
237 pii->pii_in_use = 1;
238 break;
239
240 default:
241 /* Unknown status */
242 logerr("pii_process: Unknown status %d\n", err);
243 break;
244 }
245 }
246
247 *pii_p = pii;
248 if (pii != NULL)
249 return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
250 else
251 return (_B_FALSE);
252 }
253
254 /*
255 * Scan all interfaces to detect changes as well as new and deleted interfaces
256 */
257 static void
initifs()258 initifs()
259 {
260 int i, nlifr;
261 int af;
262 char *cp;
263 char *buf;
264 int sockfd;
265 uint64_t flags;
266 struct lifnum lifn;
267 struct lifconf lifc;
268 struct lifreq lifreq;
269 struct lifreq *lifr;
270 struct logint *li;
271 struct phyint_instance *pii;
272 struct phyint_instance *next_pii;
273 struct phyint_group *pg, *next_pg;
274 char pi_name[LIFNAMSIZ + 1];
275
276 if (debug & D_PHYINT)
277 logdebug("initifs: Scanning interfaces\n");
278
279 last_initifs_time = getcurrenttime();
280
281 /*
282 * Free the existing local address list; we'll build a new list below.
283 */
284 addrlist_free(&localaddrs);
285
286 /*
287 * Mark the interfaces so that we can find phyints and logints
288 * which have disappeared from the kernel. pii_process() and
289 * logint_init_from_k() will set {pii,li}_in_use when they find
290 * the interface in the kernel. Also, clear dupaddr bit on probe
291 * logint. check_addr_unique() will set the dupaddr bit on the
292 * probe logint, if the testaddress is not unique.
293 */
294 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
295 pii->pii_in_use = 0;
296 for (li = pii->pii_logint; li != NULL; li = li->li_next) {
297 li->li_in_use = 0;
298 if (pii->pii_probe_logint == li)
299 li->li_dupaddr = 0;
300 }
301 }
302
303 /*
304 * As above, mark groups so that we can detect IPMP interfaces which
305 * have been removed from the kernel. Also, delete the group address
306 * list since we'll iteratively recreate it below.
307 */
308 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
309 pg->pg_in_use = _B_FALSE;
310 addrlist_free(&pg->pg_addrs);
311 }
312
313 lifn.lifn_family = AF_UNSPEC;
314 lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
315 again:
316 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
317 logperror("initifs: ioctl (get interface count)");
318 return;
319 }
320 /*
321 * Pad the interface count to detect when additional interfaces have
322 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
323 */
324 lifn.lifn_count += 4;
325
326 if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
327 logperror("initifs: calloc");
328 return;
329 }
330
331 lifc.lifc_family = AF_UNSPEC;
332 lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
333 lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
334 lifc.lifc_buf = buf;
335
336 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
337 logperror("initifs: ioctl (get interface configuration)");
338 free(buf);
339 return;
340 }
341
342 /*
343 * If every lifr_req slot is taken, then additional interfaces must
344 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
345 * Recalculate to make sure we didn't miss any interfaces.
346 */
347 nlifr = lifc.lifc_len / sizeof (struct lifreq);
348 if (nlifr >= lifn.lifn_count) {
349 free(buf);
350 goto again;
351 }
352
353 /*
354 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the
355 * global list of addresses, phyint groups, phyints, and logints.
356 */
357 for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
358 af = lifr->lifr_addr.ss_family;
359 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
360 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
361
362 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
363 if (errno != ENXIO)
364 logperror("initifs: ioctl (SIOCGLIFFLAGS)");
365 continue;
366 }
367 flags = lifreq.lifr_flags;
368
369 /*
370 * If the address is IFF_UP, add it to the local address list.
371 * (We ignore addresses that aren't IFF_UP since another node
372 * might legitimately have that address IFF_UP.)
373 */
374 if (flags & IFF_UP) {
375 (void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
376 &lifr->lifr_addr);
377 }
378
379 /*
380 * If this address is on an IPMP meta-interface, update our
381 * phyint_group information (either by recording that group
382 * still exists or creating a new group), and track what
383 * group the address is part of.
384 */
385 if (flags & IFF_IPMP) {
386 if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
387 if (errno != ENXIO)
388 logperror("initifs: ioctl "
389 "(SIOCGLIFGROUPNAME)");
390 continue;
391 }
392
393 pg = phyint_group_lookup(lifreq.lifr_groupname);
394 if (pg == NULL) {
395 pg = phyint_group_create(lifreq.lifr_groupname);
396 if (pg == NULL) {
397 logerr("initifs: cannot create group "
398 "%s\n", lifreq.lifr_groupname);
399 continue;
400 }
401 phyint_group_insert(pg);
402 }
403 pg->pg_in_use = _B_TRUE;
404
405 /*
406 * Add this to the group's list of data addresses.
407 */
408 if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
409 &lifr->lifr_addr)) {
410 logerr("initifs: insufficient memory to track "
411 "data address information for %s\n",
412 lifr->lifr_name);
413 }
414 continue;
415 }
416
417 /*
418 * This isn't an address on an IPMP meta-interface, so it's
419 * either on an underlying interface or not related to any
420 * group. Update our phyint and logint information (via
421 * pii_process() and logint_init_from_k()) -- but first,
422 * convert the logint name to a phyint name so we can call
423 * pii_process().
424 */
425 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
426 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
427 *cp = '\0';
428
429 if (pii_process(af, pi_name, &pii)) {
430 /* The phyint is fine. So process the logint */
431 logint_init_from_k(pii, lifr->lifr_name);
432 check_addr_unique(pii, &lifr->lifr_addr);
433 }
434 }
435 free(buf);
436
437 /*
438 * Scan for groups, phyints and logints that have disappeared from the
439 * kernel, and delete them.
440 */
441 for (pii = phyint_instances; pii != NULL; pii = next_pii) {
442 next_pii = pii->pii_next;
443 check_if_removed(pii);
444 }
445
446 for (pg = phyint_groups; pg != NULL; pg = next_pg) {
447 next_pg = pg->pg_next;
448 if (!pg->pg_in_use) {
449 phyint_group_delete(pg);
450 continue;
451 }
452 /*
453 * Refresh the group's state. This is necessary since the
454 * group's state is defined by the set of usable interfaces in
455 * the group, and an interface is considered unusable if all
456 * of its addresses are down. When an address goes down/up,
457 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
458 */
459 phyint_group_refresh_state(pg);
460 }
461
462 /*
463 * Select a test address for sending probes on each phyint instance
464 */
465 select_test_ifs();
466
467 /*
468 * Handle link up/down notifications.
469 */
470 process_link_state_changes();
471 }
472
473 /*
474 * Check that a given test address is unique across all of the interfaces in a
475 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding
476 * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
477 * Any issues will be reported by check_testconfig().
478 */
479 static void
check_addr_unique(struct phyint_instance * ourpii,struct sockaddr_storage * ss)480 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
481 {
482 struct phyint *pi;
483 struct phyint_group *pg;
484 struct in6_addr addr;
485 struct phyint_instance *pii;
486 struct sockaddr_in *sin;
487
488 if (ss->ss_family == AF_INET) {
489 sin = (struct sockaddr_in *)ss;
490 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
491 } else {
492 assert(ss->ss_family == AF_INET6);
493 addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
494 }
495
496 /*
497 * For anonymous groups, every interface is assumed to be on its own
498 * link, so there is no chance of overlapping addresses.
499 */
500 pg = ourpii->pii_phyint->pi_group;
501 if (pg == phyint_anongroup)
502 return;
503
504 /*
505 * Walk the list of phyint instances in the group and check for test
506 * addresses matching ours. Of course, we skip ourself.
507 */
508 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
509 pii = PHYINT_INSTANCE(pi, ss->ss_family);
510 if (pii == NULL || pii == ourpii ||
511 pii->pii_probe_logint == NULL)
512 continue;
513
514 /*
515 * If this test address is not unique, set the dupaddr bit.
516 */
517 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
518 pii->pii_probe_logint->li_dupaddr = 1;
519 }
520 }
521
522 /*
523 * Stop probing an interface. Called when an interface is offlined.
524 * The probe socket is closed on each interface instance, and the
525 * interface state set to PI_OFFLINE.
526 */
527 void
stop_probing(struct phyint * pi)528 stop_probing(struct phyint *pi)
529 {
530 struct phyint_instance *pii;
531
532 pii = pi->pi_v4;
533 if (pii != NULL) {
534 if (pii->pii_probe_sock != -1)
535 close_probe_socket(pii, _B_TRUE);
536 pii->pii_probe_logint = NULL;
537 }
538
539 pii = pi->pi_v6;
540 if (pii != NULL) {
541 if (pii->pii_probe_sock != -1)
542 close_probe_socket(pii, _B_TRUE);
543 pii->pii_probe_logint = NULL;
544 }
545
546 phyint_chstate(pi, PI_OFFLINE);
547 }
548
549 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
550
551 /*
552 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set.
553 * IFF_UP must also be set so that the associated address can be used as a
554 * source address. Further, we must be able to exchange packets with local
555 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical
556 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
557 */
558 static int
rate_testflags(uint64_t flags)559 rate_testflags(uint64_t flags)
560 {
561 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
562 return (BAD_TESTFLAGS);
563
564 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
565 return (BAD_TESTFLAGS);
566
567 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
568 return (BEST_TESTFLAGS);
569
570 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
571 return (BEST_TESTFLAGS);
572
573 return (OK_TESTFLAGS);
574 }
575
576 /*
577 * Attempt to select a test address for each phyint instance.
578 * Call phyint_inst_sockinit() to complete the initializations.
579 */
580 static void
select_test_ifs(void)581 select_test_ifs(void)
582 {
583 struct phyint *pi;
584 struct phyint_instance *pii;
585 struct phyint_instance *next_pii;
586 struct logint *li;
587 struct logint *probe_logint;
588 boolean_t target_scan_reqd = _B_FALSE;
589 int rating;
590
591 if (debug & D_PHYINT)
592 logdebug("select_test_ifs\n");
593
594 /*
595 * For each phyint instance, do the test address selection
596 */
597 for (pii = phyint_instances; pii != NULL; pii = next_pii) {
598 next_pii = pii->pii_next;
599 probe_logint = NULL;
600
601 /*
602 * An interface that is offline should not be probed.
603 * IFF_OFFLINE interfaces should always be PI_OFFLINE
604 * unless some other entity has set the offline flag.
605 */
606 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
607 if (pii->pii_phyint->pi_state != PI_OFFLINE) {
608 logerr("shouldn't be probing offline"
609 " interface %s (state is: %u)."
610 " Stopping probes.\n",
611 pii->pii_phyint->pi_name,
612 pii->pii_phyint->pi_state);
613 stop_probing(pii->pii_phyint);
614 }
615 continue;
616 } else {
617 /*
618 * If something cleared IFF_OFFLINE (e.g., by accident
619 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
620 * inherently racy), the phyint may still be offline.
621 * Just ignore it.
622 */
623 if (pii->pii_phyint->pi_state == PI_OFFLINE)
624 continue;
625 }
626
627 li = pii->pii_probe_logint;
628 if (li != NULL) {
629 /*
630 * We've already got a test address; only proceed
631 * if it's suboptimal.
632 */
633 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
634 continue;
635 }
636
637 /*
638 * Walk the logints of this phyint instance, and select
639 * the best available test address
640 */
641 for (li = pii->pii_logint; li != NULL; li = li->li_next) {
642 /*
643 * Skip 0.0.0.0 addresses, as those are never
644 * actually usable.
645 */
646 if (pii->pii_af == AF_INET &&
647 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
648 continue;
649
650 /*
651 * Skip any IPv6 logints that are not link-local,
652 * since we should always have a link-local address
653 * anyway and in6_data() expects link-local replies.
654 */
655 if (pii->pii_af == AF_INET6 &&
656 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
657 continue;
658
659 /*
660 * Rate the testflags. If we've found an optimal
661 * match, then break out; otherwise, record the most
662 * recent OK one.
663 */
664 rating = rate_testflags(li->li_flags);
665 if (rating == BAD_TESTFLAGS)
666 continue;
667
668 probe_logint = li;
669 if (rating == BEST_TESTFLAGS)
670 break;
671 }
672
673 /*
674 * If the probe logint has changed, ditch the old one.
675 */
676 if (pii->pii_probe_logint != NULL &&
677 pii->pii_probe_logint != probe_logint) {
678 if (pii->pii_probe_sock != -1)
679 close_probe_socket(pii, _B_TRUE);
680 pii->pii_probe_logint = NULL;
681 }
682
683 if (probe_logint == NULL) {
684 /*
685 * We don't have a test address; zero out the probe
686 * stats array since it is no longer relevant.
687 * Optimize by checking if it is already zeroed out.
688 */
689 int pr_ndx;
690
691 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
692 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
693 clear_pii_probe_stats(pii);
694 reset_crtt_all(pii->pii_phyint);
695 }
696 continue;
697 } else if (probe_logint == pii->pii_probe_logint) {
698 /*
699 * If we didn't find any new test addr, go to the
700 * next phyint.
701 */
702 continue;
703 }
704
705 /*
706 * The phyint is either being assigned a new testaddr
707 * or is being assigned a testaddr for the 1st time.
708 * Need to initialize the phyint socket
709 */
710 pii->pii_probe_logint = probe_logint;
711 if (!phyint_inst_sockinit(pii)) {
712 if (debug & D_PHYINT) {
713 logdebug("select_test_ifs: "
714 "phyint_sockinit failed\n");
715 }
716 phyint_inst_delete(pii);
717 continue;
718 }
719
720 /*
721 * This phyint instance is now enabled for probes; this
722 * impacts our state machine in two ways:
723 *
724 * 1. If we're probe *capable* as well (i.e., we have
725 * probe targets) and the interface is in PI_NOTARGETS,
726 * then transition to PI_RUNNING.
727 *
728 * 2. If we're not probe capable, and the other phyint
729 * instance is also not probe capable, and we were in
730 * PI_RUNNING, then transition to PI_NOTARGETS.
731 *
732 * Also see the state diagram in mpd_probe.c.
733 */
734 if (PROBE_CAPABLE(pii)) {
735 if (pii->pii_phyint->pi_state == PI_NOTARGETS)
736 phyint_chstate(pii->pii_phyint, PI_RUNNING);
737 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
738 if (pii->pii_phyint->pi_state == PI_RUNNING)
739 phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
740 }
741
742 /*
743 * If no targets are currently known for this phyint
744 * we need to call init_router_targets. Since
745 * init_router_targets() initializes the list of targets
746 * for all phyints it is done below the loop.
747 */
748 if (pii->pii_targets == NULL)
749 target_scan_reqd = _B_TRUE;
750
751 /*
752 * Start the probe timer for this instance.
753 */
754 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
755 start_timer(pii);
756 pii->pii_basetime_inited = 1;
757 }
758 }
759
760 /*
761 * Scan the interface list for any interfaces that are PI_FAILED or
762 * PI_NOTARGETS but no longer enabled to send probes, and call
763 * phyint_check_for_repair() to see if the link state indicates that
764 * the interface should be repaired. Also see the state diagram in
765 * mpd_probe.c.
766 */
767 for (pi = phyints; pi != NULL; pi = pi->pi_next) {
768 if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
769 (pi->pi_state == PI_FAILED ||
770 pi->pi_state == PI_NOTARGETS)) {
771 phyint_check_for_repair(pi);
772 }
773 }
774
775 check_testconfig();
776
777 /*
778 * Try to populate the target list. init_router_targets populates
779 * the target list from the routing table. If our target list is
780 * still empty, init_host_targets adds host targets based on the
781 * host target list of other phyints in the group.
782 */
783 if (target_scan_reqd) {
784 init_router_targets();
785 init_host_targets();
786 }
787 }
788
789 /*
790 * Check test address configuration, and log notices/errors if appropriate.
791 * Note that this function only logs pre-existing conditions (e.g., that
792 * probe-based failure detection is disabled).
793 */
794 static void
check_testconfig(void)795 check_testconfig(void)
796 {
797 struct phyint *pi;
798 struct logint *li;
799 char abuf[INET6_ADDRSTRLEN];
800 int pri;
801
802 for (pi = phyints; pi != NULL; pi = pi->pi_next) {
803 if (pi->pi_flags & IFF_OFFLINE)
804 continue;
805
806 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
807 if (pi->pi_taddrmsg_printed ||
808 pi->pi_duptaddrmsg_printed) {
809 if (pi->pi_duptaddrmsg_printed)
810 pri = LOG_ERR;
811 else
812 pri = LOG_INFO;
813 logmsg(pri, "Test address now configured on "
814 "interface %s; enabling probe-based "
815 "failure detection on it\n", pi->pi_name);
816 pi->pi_taddrmsg_printed = 0;
817 pi->pi_duptaddrmsg_printed = 0;
818 }
819 continue;
820 }
821
822 li = NULL;
823 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
824 pi->pi_v4->pii_probe_logint->li_dupaddr)
825 li = pi->pi_v4->pii_probe_logint;
826
827 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
828 pi->pi_v6->pii_probe_logint->li_dupaddr)
829 li = pi->pi_v6->pii_probe_logint;
830
831 if (li != NULL && li->li_dupaddr) {
832 if (pi->pi_duptaddrmsg_printed)
833 continue;
834 logerr("Test address %s is not unique in group; "
835 "disabling probe-based failure detection on %s\n",
836 pr_addr(li->li_phyint_inst->pii_af,
837 li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
838 pi->pi_duptaddrmsg_printed = 1;
839 continue;
840 }
841
842 if (getcurrentsec() < pi->pi_taddrthresh)
843 continue;
844
845 if (!pi->pi_taddrmsg_printed) {
846 logtrace("No test address configured on interface %s; "
847 "disabling probe-based failure detection on it\n",
848 pi->pi_name);
849 pi->pi_taddrmsg_printed = 1;
850 }
851 }
852 }
853
854 /*
855 * Check phyint group configuration, to detect any inconsistencies,
856 * and log an error message. This is called from runtimeouts every
857 * 20 secs. But the error message is displayed once. If the
858 * consistency is resolved by the admin, a recovery message is displayed
859 * once.
860 */
861 static void
check_config(void)862 check_config(void)
863 {
864 struct phyint_group *pg;
865 struct phyint *pi;
866 boolean_t v4_in_group;
867 boolean_t v6_in_group;
868
869 /*
870 * All phyints of a group must be homogeneous to ensure that they can
871 * take over for one another. If any phyint in a group has IPv4
872 * plumbed, check that all phyints have IPv4 plumbed. Do a similar
873 * check for IPv6.
874 */
875 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
876 if (pg == phyint_anongroup)
877 continue;
878
879 v4_in_group = _B_FALSE;
880 v6_in_group = _B_FALSE;
881 /*
882 * 1st pass. Determine if at least 1 phyint in the group
883 * has IPv4 plumbed and if so set v4_in_group to true.
884 * Repeat similarly for IPv6.
885 */
886 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
887 if (pi->pi_v4 != NULL)
888 v4_in_group = _B_TRUE;
889 if (pi->pi_v6 != NULL)
890 v6_in_group = _B_TRUE;
891 }
892
893 /*
894 * 2nd pass. If v4_in_group is true, check that phyint
895 * has IPv4 plumbed. Repeat similarly for IPv6. Print
896 * out a message the 1st time only.
897 */
898 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
899 if (pi->pi_flags & IFF_OFFLINE)
900 continue;
901
902 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
903 if (!pi->pi_cfgmsg_printed) {
904 logerr("IP interface %s in group %s is"
905 " not plumbed for IPv4, affecting"
906 " IPv4 connectivity\n",
907 pi->pi_name,
908 pi->pi_group->pg_name);
909 pi->pi_cfgmsg_printed = 1;
910 }
911 } else if (v6_in_group == _B_TRUE &&
912 pi->pi_v6 == NULL) {
913 if (!pi->pi_cfgmsg_printed) {
914 logerr("IP interface %s in group %s is"
915 " not plumbed for IPv6, affecting"
916 " IPv6 connectivity\n",
917 pi->pi_name,
918 pi->pi_group->pg_name);
919 pi->pi_cfgmsg_printed = 1;
920 }
921 } else {
922 /*
923 * The phyint matches the group configuration,
924 * if we have reached this point. If it was
925 * improperly configured earlier, log an
926 * error recovery message
927 */
928 if (pi->pi_cfgmsg_printed) {
929 logerr("IP interface %s is now"
930 " consistent with group %s "
931 " and connectivity is restored\n",
932 pi->pi_name, pi->pi_group->pg_name);
933 pi->pi_cfgmsg_printed = 0;
934 }
935 }
936
937 }
938 }
939 }
940
/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
 * that can be passed for the delay parameter of timer_schedule().
 */
static uint_t timer_next;	/* Currently scheduled timeout; an absolute */
				/* getcurrenttime() value in ms */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
952
953 static void
timer_init(void)954 timer_init(void)
955 {
956 timer_next = getcurrenttime() + TIMER_INFINITY;
957 /*
958 * The call to run_timeouts() will get the timer started
959 * Since there are no phyints at this point, the timer will
960 * be set for IF_SCAN_INTERVAL ms.
961 */
962 run_timeouts();
963 }
964
965 /*
966 * Make sure the next SIGALRM occurs delay milliseconds from the current
967 * time if not earlier. We are interested only in time differences.
968 */
969 void
timer_schedule(uint_t delay)970 timer_schedule(uint_t delay)
971 {
972 uint_t now;
973 struct itimerval itimerval;
974
975 if (debug & D_TIMER)
976 logdebug("timer_schedule(%u)\n", delay);
977
978 assert(delay <= TIMER_INFINITY);
979
980 now = getcurrenttime();
981 if (delay == 0) {
982 /* Minimum allowed delay */
983 delay = 1;
984 }
985 /* Will this timer occur before the currently scheduled SIGALRM? */
986 if (timer_active && TIME_GE(now + delay, timer_next)) {
987 if (debug & D_TIMER) {
988 logdebug("timer_schedule(%u) - no action: "
989 "now %u next %u\n", delay, now, timer_next);
990 }
991 return;
992 }
993 timer_next = now + delay;
994
995 itimerval.it_value.tv_sec = delay / 1000;
996 itimerval.it_value.tv_usec = (delay % 1000) * 1000;
997 itimerval.it_interval.tv_sec = 0;
998 itimerval.it_interval.tv_usec = 0;
999 if (debug & D_TIMER) {
1000 logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1001 delay, itimerval.it_value.tv_sec,
1002 itimerval.it_value.tv_usec);
1003 }
1004 timer_active = _B_TRUE;
1005 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1006 logperror("timer_schedule: setitimer");
1007 exit(2);
1008 }
1009 }
1010
1011 static void
timer_cancel(void)1012 timer_cancel(void)
1013 {
1014 struct itimerval itimerval;
1015
1016 if (debug & D_TIMER)
1017 logdebug("timer_cancel()\n");
1018
1019 bzero(&itimerval, sizeof (itimerval));
1020 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0)
1021 logperror("timer_cancel: setitimer");
1022 }
1023
1024 /*
1025 * Timer has fired. Determine when the next timer event will occur by asking
1026 * all the timer routines. Should not be called from a timer routine.
1027 */
1028 static void
run_timeouts(void)1029 run_timeouts(void)
1030 {
1031 uint_t next;
1032 uint_t next_event_time;
1033 struct phyint_instance *pii;
1034 struct phyint_instance *next_pii;
1035 static boolean_t timeout_running;
1036
1037 /* assert that recursive timeouts don't happen. */
1038 assert(!timeout_running);
1039
1040 timeout_running = _B_TRUE;
1041
1042 if (debug & D_TIMER)
1043 logdebug("run_timeouts()\n");
1044
1045 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1046 initifs();
1047 check_config();
1048 }
1049
1050 next = TIMER_INFINITY;
1051
1052 for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1053 next_pii = pii->pii_next;
1054 next_event_time = phyint_inst_timer(pii);
1055 if (next_event_time != TIMER_INFINITY && next_event_time < next)
1056 next = next_event_time;
1057
1058 if (debug & D_TIMER) {
1059 logdebug("run_timeouts(%s %s): next scheduled for"
1060 " this phyint inst %u, next scheduled global"
1061 " %u ms\n",
1062 AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1063 next_event_time, next);
1064 }
1065 }
1066
1067 /*
1068 * Make sure initifs() is called at least once every
1069 * IF_SCAN_INTERVAL, to make sure that we are in sync
1070 * with the kernel, in case we have missed any routing
1071 * socket messages.
1072 */
1073 if (next > IF_SCAN_INTERVAL)
1074 next = IF_SCAN_INTERVAL;
1075
1076 if (debug & D_TIMER)
1077 logdebug("run_timeouts: %u ms\n", next);
1078
1079 timer_schedule(next);
1080 timeout_running = _B_FALSE;
1081 }
1082
1083 static int eventpipe_read = -1; /* Used for synchronous signal delivery */
1084 static int eventpipe_write = -1;
1085 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */
1086
1087 /*
1088 * Ensure that signals are processed synchronously with the rest of
1089 * the code by just writing a one character signal number on the pipe.
1090 * The poll loop will pick this up and process the signal event.
1091 */
1092 static void
sig_handler(int signo)1093 sig_handler(int signo)
1094 {
1095 uchar_t buf = (uchar_t)signo;
1096
1097 /*
1098 * Don't write to pipe if cleanup has already begun. cleanup()
1099 * might have closed the pipe already
1100 */
1101 if (cleanup_started)
1102 return;
1103
1104 if (eventpipe_write == -1) {
1105 logerr("sig_handler: no pipe found\n");
1106 return;
1107 }
1108 if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1109 logperror("sig_handler: write");
1110 }
1111
1112 extern struct probes_missed probes_missed;
1113
1114 /*
1115 * Pick up a signal "byte" from the pipe and process it.
1116 */
1117 static void
in_signal(int fd)1118 in_signal(int fd)
1119 {
1120 uchar_t buf;
1121 uint64_t sent, acked, lost, unacked, unknown;
1122 struct phyint_instance *pii;
1123 int pr_ndx;
1124
1125 switch (read(fd, &buf, sizeof (buf))) {
1126 case -1:
1127 logperror("in_signal: read");
1128 exit(1);
1129 /* NOTREACHED */
1130 case 1:
1131 break;
1132 case 0:
1133 logerr("in_signal: read end of file\n");
1134 exit(1);
1135 /* NOTREACHED */
1136 default:
1137 logerr("in_signal: read > 1\n");
1138 exit(1);
1139 }
1140
1141 if (debug & D_TIMER)
1142 logdebug("in_signal() got %d\n", buf);
1143
1144 switch (buf) {
1145 case SIGALRM:
1146 if (debug & D_TIMER) {
1147 uint_t now = getcurrenttime();
1148
1149 logdebug("in_signal(SIGALRM) delta %u\n",
1150 now - timer_next);
1151 }
1152 timer_active = _B_FALSE;
1153 run_timeouts();
1154 break;
1155 case SIGUSR1:
1156 logdebug("Printing configuration:\n");
1157 /* Print out the internal tables */
1158 phyint_inst_print_all();
1159
1160 /*
1161 * Print out the accumulated statistics about missed
1162 * probes (happens due to scheduling delay).
1163 */
1164 logerr("Missed sending total of %d probes spread over"
1165 " %d occurrences\n", probes_missed.pm_nprobes,
1166 probes_missed.pm_ntimes);
1167
1168 /*
1169 * Print out the accumulated statistics about probes
1170 * that were sent.
1171 */
1172 for (pii = phyint_instances; pii != NULL;
1173 pii = pii->pii_next) {
1174 unacked = 0;
1175 acked = pii->pii_cum_stats.acked;
1176 lost = pii->pii_cum_stats.lost;
1177 sent = pii->pii_cum_stats.sent;
1178 unknown = pii->pii_cum_stats.unknown;
1179 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1180 switch (pii->pii_probes[pr_ndx].pr_status) {
1181 case PR_ACKED:
1182 acked++;
1183 break;
1184 case PR_LOST:
1185 lost++;
1186 break;
1187 case PR_UNACKED:
1188 unacked++;
1189 break;
1190 }
1191 }
1192 logerr("\nProbe stats on (%s %s)\n"
1193 "Number of probes sent %lld\n"
1194 "Number of probe acks received %lld\n"
1195 "Number of probes/acks lost %lld\n"
1196 "Number of valid unacknowledged probes %lld\n"
1197 "Number of ambiguous probe acks received %lld\n",
1198 AF_STR(pii->pii_af), pii->pii_name,
1199 sent, acked, lost, unacked, unknown);
1200 }
1201 break;
1202 case SIGHUP:
1203 logerr("SIGHUP: restart and reread config file\n");
1204 /*
1205 * Cancel the interval timer. Needed since setitimer() uses
1206 * alarm() and the time left is inherited across exec(), and
1207 * thus the SIGALRM may be delivered before a handler has been
1208 * setup, causing in.mpathd to erroneously exit.
1209 */
1210 timer_cancel();
1211 cleanup();
1212 (void) execv(argv0[0], argv0);
1213 _exit(0177);
1214 /* NOTREACHED */
1215 case SIGINT:
1216 case SIGTERM:
1217 case SIGQUIT:
1218 cleanup();
1219 exit(0);
1220 /* NOTREACHED */
1221 default:
1222 logerr("in_signal: unknown signal: %d\n", buf);
1223 }
1224 }
1225
1226 static void
cleanup(void)1227 cleanup(void)
1228 {
1229 struct phyint_instance *pii;
1230 struct phyint_instance *next_pii;
1231
1232 /*
1233 * Make sure that we don't write to eventpipe in
1234 * sig_handler() if any signal notably SIGALRM,
1235 * occurs after we close the eventpipe descriptor below
1236 */
1237 cleanup_started = _B_TRUE;
1238
1239 for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1240 next_pii = pii->pii_next;
1241 phyint_inst_delete(pii);
1242 }
1243
1244 (void) close(ifsock_v4);
1245 (void) close(ifsock_v6);
1246 (void) close(rtsock_v4);
1247 (void) close(rtsock_v6);
1248 (void) close(lsock_v4);
1249 (void) close(lsock_v6);
1250 (void) close(0);
1251 (void) close(1);
1252 (void) close(2);
1253 (void) close(mibfd);
1254 (void) close(eventpipe_read);
1255 (void) close(eventpipe_write);
1256 }
1257
1258 /*
1259 * Create pipe for signal delivery and set up signal handlers.
1260 */
1261 static void
setup_eventpipe(void)1262 setup_eventpipe(void)
1263 {
1264 int fds[2];
1265 struct sigaction act;
1266
1267 if ((pipe(fds)) < 0) {
1268 logperror("setup_eventpipe: pipe");
1269 exit(1);
1270 }
1271 eventpipe_read = fds[0];
1272 eventpipe_write = fds[1];
1273 if (poll_add(eventpipe_read) == -1) {
1274 exit(1);
1275 }
1276
1277 act.sa_handler = sig_handler;
1278 act.sa_flags = SA_RESTART;
1279 (void) sigaction(SIGALRM, &act, NULL);
1280
1281 (void) sigset(SIGHUP, sig_handler);
1282 (void) sigset(SIGUSR1, sig_handler);
1283 (void) sigset(SIGTERM, sig_handler);
1284 (void) sigset(SIGINT, sig_handler);
1285 (void) sigset(SIGQUIT, sig_handler);
1286 }
1287
1288 /*
1289 * Create a routing socket for receiving RTM_IFINFO messages.
1290 */
1291 static int
setup_rtsock(int af)1292 setup_rtsock(int af)
1293 {
1294 int s;
1295 int flags;
1296 int aware = RTAW_UNDER_IPMP;
1297
1298 s = socket(PF_ROUTE, SOCK_RAW, af);
1299 if (s == -1) {
1300 logperror("setup_rtsock: socket PF_ROUTE");
1301 exit(1);
1302 }
1303
1304 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
1305 logperror("setup_rtsock: setsockopt RT_AWARE");
1306 (void) close(s);
1307 exit(1);
1308 }
1309
1310 if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
1311 logperror("setup_rtsock: fcntl F_GETFL");
1312 (void) close(s);
1313 exit(1);
1314 }
1315 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) {
1316 logperror("setup_rtsock: fcntl F_SETFL");
1317 (void) close(s);
1318 exit(1);
1319 }
1320 if (poll_add(s) == -1) {
1321 (void) close(s);
1322 exit(1);
1323 }
1324 return (s);
1325 }
1326
1327 /*
1328 * Process an RTM_IFINFO message received on a routing socket.
1329 * The return value indicates whether a full interface scan is required.
1330 * Link up/down notifications are reflected in the IFF_RUNNING flag.
1331 * If just the state of the IFF_RUNNING interface flag has changed, a
1332 * a full interface scan isn't required.
1333 */
1334 static boolean_t
process_rtm_ifinfo(if_msghdr_t * ifm,int type)1335 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1336 {
1337 struct sockaddr_dl *sdl;
1338 struct phyint *pi;
1339 uint64_t old_flags;
1340 struct phyint_instance *pii;
1341
1342 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1343
1344 /*
1345 * Although the sockaddr_dl structure is directly after the
1346 * if_msghdr_t structure. At the time of writing, the size of the
1347 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1348 * to the presence of a timeval structure, which contains longs,
1349 * in the if_data structure. Anyway, we know where the message ends,
1350 * so we work backwards to get the start of the sockaddr_dl structure.
1351 */
1352 /*LINTED*/
1353 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1354 sizeof (struct sockaddr_dl));
1355
1356 assert(sdl->sdl_family == AF_LINK);
1357
1358 /*
1359 * The interface name is in sdl_data.
1360 * RTM_IFINFO messages are only generated for logical interface
1361 * zero, so there is no colon and logical interface number to
1362 * strip from the name. The name is not null terminated, but
1363 * there should be enough space in sdl_data to add the null.
1364 */
1365 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1366 if (debug & D_LINKNOTE)
1367 logdebug("process_rtm_ifinfo: phyint name too long\n");
1368 return (_B_TRUE);
1369 }
1370 sdl->sdl_data[sdl->sdl_nlen] = 0;
1371
1372 pi = phyint_lookup(sdl->sdl_data);
1373 if (pi == NULL) {
1374 if (debug & D_LINKNOTE)
1375 logdebug("process_rtm_ifinfo: phyint lookup failed"
1376 " for %s\n", sdl->sdl_data);
1377 return (_B_TRUE);
1378 }
1379
1380 /*
1381 * We want to try and avoid doing a full interface scan for
1382 * link state notifications from the datalink layer, as indicated
1383 * by the state of the IFF_RUNNING flag. If just the
1384 * IFF_RUNNING flag has changed state, the link state changes
1385 * are processed without a full scan.
1386 * If there is both an IPv4 and IPv6 instance associated with
1387 * the physical interface, we will get an RTM_IFINFO message
1388 * for each instance. If we just maintained a single copy of
1389 * the physical interface flags, it would appear that no flags
1390 * had changed when the second message is processed, leading us
1391 * to believe that the message wasn't generated by a flags change,
1392 * and that a full interface scan is required.
1393 * To get around this problem, two additional copies of the flags
1394 * are kept, one copy for each instance. These are only used in
1395 * this routine. At any one time, all three copies of the flags
1396 * should be identical except for the IFF_RUNNING flag. The
1397 * copy of the flags in the "phyint" structure is always up to
1398 * date.
1399 */
1400 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1401 if (pii == NULL) {
1402 if (debug & D_LINKNOTE)
1403 logdebug("process_rtm_ifinfo: no instance of address "
1404 "family %s for %s\n", AF_STR(type), pi->pi_name);
1405 return (_B_TRUE);
1406 }
1407
1408 old_flags = pii->pii_flags;
1409 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1410 pi->pi_flags = pii->pii_flags;
1411
1412 if (debug & D_LINKNOTE) {
1413 logdebug("process_rtm_ifinfo: %s address family: %s, "
1414 "old flags: %llx, new flags: %llx\n", pi->pi_name,
1415 AF_STR(type), old_flags, pi->pi_flags);
1416 }
1417
1418 /*
1419 * If IFF_STANDBY has changed, indicate that the interface has changed
1420 * types and refresh IFF_INACTIVE if need be.
1421 */
1422 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) {
1423 phyint_changed(pi);
1424 if (pii->pii_flags & IFF_STANDBY)
1425 phyint_standby_refresh_inactive(pi);
1426 }
1427
1428 /* Has just the IFF_RUNNING flag changed state ? */
1429 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1430 struct phyint_instance *pii_other;
1431 /*
1432 * It wasn't just a link state change. Update
1433 * the other instance's copy of the flags.
1434 */
1435 pii_other = phyint_inst_other(pii);
1436 if (pii_other != NULL)
1437 pii_other->pii_flags = pii->pii_flags;
1438 return (_B_TRUE);
1439 }
1440
1441 return (_B_FALSE);
1442 }
1443
1444 /*
1445 * Retrieve as many routing socket messages as possible, and try to
1446 * empty the routing sockets. Initiate full scan of targets or interfaces
1447 * as needed.
1448 * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1449 * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1450 */
1451 static void
process_rtsock(int rtsock_v4,int rtsock_v6)1452 process_rtsock(int rtsock_v4, int rtsock_v6)
1453 {
1454 int nbytes;
1455 int64_t msg[2048 / 8];
1456 struct rt_msghdr *rtm;
1457 boolean_t need_if_scan = _B_FALSE;
1458 boolean_t need_rt_scan = _B_FALSE;
1459 boolean_t rtm_ifinfo_seen = _B_FALSE;
1460 int type;
1461
1462 /* Read as many messages as possible and try to empty the sockets */
1463 for (type = AF_INET; ; type = AF_INET6) {
1464 for (;;) {
1465 nbytes = read((type == AF_INET) ? rtsock_v4 :
1466 rtsock_v6, msg, sizeof (msg));
1467 if (nbytes <= 0) {
1468 /* No more messages */
1469 break;
1470 }
1471 rtm = (struct rt_msghdr *)msg;
1472 if (rtm->rtm_version != RTM_VERSION) {
1473 logerr("process_rtsock: version %d "
1474 "not understood\n", rtm->rtm_version);
1475 break;
1476 }
1477
1478 if (debug & D_PHYINT) {
1479 logdebug("process_rtsock: message %d\n",
1480 rtm->rtm_type);
1481 }
1482
1483 switch (rtm->rtm_type) {
1484 case RTM_NEWADDR:
1485 case RTM_DELADDR:
1486 /*
1487 * Some logical interface has changed,
1488 * have to scan everything to determine
1489 * what actually changed.
1490 */
1491 need_if_scan = _B_TRUE;
1492 break;
1493
1494 case RTM_IFINFO:
1495 rtm_ifinfo_seen = _B_TRUE;
1496 need_if_scan |= process_rtm_ifinfo(
1497 (if_msghdr_t *)rtm, type);
1498 break;
1499
1500 case RTM_ADD:
1501 case RTM_DELETE:
1502 case RTM_CHANGE:
1503 case RTM_OLDADD:
1504 case RTM_OLDDEL:
1505 need_rt_scan = _B_TRUE;
1506 break;
1507
1508 default:
1509 /* Not interesting */
1510 break;
1511 }
1512 }
1513 if (type == AF_INET6)
1514 break;
1515 }
1516
1517 if (need_if_scan) {
1518 if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1519 logdebug("process_rtsock: synchronizing with kernel\n");
1520 initifs();
1521 } else if (rtm_ifinfo_seen) {
1522 if (debug & D_LINKNOTE)
1523 logdebug("process_rtsock: "
1524 "link up/down notification(s) seen\n");
1525 process_link_state_changes();
1526 }
1527
1528 if (need_rt_scan)
1529 init_router_targets();
1530 }
1531
1532 /*
1533 * Look if the phyint instance or one of its logints have been removed from
1534 * the kernel and take appropriate action.
1535 * Uses {pii,li}_in_use.
1536 */
1537 static void
check_if_removed(struct phyint_instance * pii)1538 check_if_removed(struct phyint_instance *pii)
1539 {
1540 struct logint *li;
1541 struct logint *next_li;
1542
1543 /* Detect phyints that have been removed from the kernel. */
1544 if (!pii->pii_in_use) {
1545 logtrace("%s %s has been removed from kernel\n",
1546 AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1547 phyint_inst_delete(pii);
1548 } else {
1549 /* Detect logints that have been removed. */
1550 for (li = pii->pii_logint; li != NULL; li = next_li) {
1551 next_li = li->li_next;
1552 if (!li->li_in_use) {
1553 logint_delete(li);
1554 }
1555 }
1556 }
1557 }
1558
1559 /*
1560 * Parse the supplied mib2 information to extract the routing information
1561 * table. Process the routing table to get the list of known onlink routers
1562 * and update our database. These onlink routers will serve as probe
1563 * targets.
1564 */
1565 static void
update_router_list(mib_item_t * item)1566 update_router_list(mib_item_t *item)
1567 {
1568 for (; item != NULL; item = item->mi_next) {
1569 if (item->mi_opthdr.name == 0)
1570 continue;
1571 if (item->mi_opthdr.level == MIB2_IP &&
1572 item->mi_opthdr.name == MIB2_IP_ROUTE) {
1573 ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp,
1574 item->mi_opthdr.len);
1575 } else if (item->mi_opthdr.level == MIB2_IP6 &&
1576 item->mi_opthdr.name == MIB2_IP6_ROUTE) {
1577 ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp,
1578 item->mi_opthdr.len);
1579 }
1580 }
1581 }
1582
1583
1584 /*
1585 * Convert octet `octp' to a phyint name and store in `ifname'
1586 */
1587 static void
oct2ifname(const Octet_t * octp,char * ifname,size_t ifsize)1588 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
1589 {
1590 char *cp;
1591 size_t len = MIN(octp->o_length, ifsize - 1);
1592
1593 (void) strncpy(ifname, octp->o_bytes, len);
1594 ifname[len] = '\0';
1595
1596 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
1597 *cp = '\0';
1598 }
1599
1600 /*
1601 * Examine the IPv4 routing table `buf' for possible targets. For each
1602 * possible target, if it's on the same subnet an interface route, pass
1603 * it to router_add_common() for further consideration.
1604 */
1605 static void
ire_process_v4(mib2_ipRouteEntry_t * buf,size_t len)1606 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1607 {
1608 char ifname[LIFNAMSIZ];
1609 mib2_ipRouteEntry_t *rp, *rp1, *endp;
1610 struct in_addr nexthop_v4;
1611 struct in6_addr nexthop;
1612
1613 if (debug & D_TARGET)
1614 logdebug("ire_process_v4(len %d)\n", len);
1615
1616 if (len == 0)
1617 return;
1618
1619 assert((len % ipRouteEntrySize) == 0);
1620 endp = buf + (len / ipRouteEntrySize);
1621
1622 /*
1623 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1624 * cross-reference them with the interface routes to determine if
1625 * they're possible probe targets.
1626 */
1627 for (rp = buf; rp < endp; rp++) {
1628 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1629 continue;
1630
1631 /* Get the nexthop address. */
1632 nexthop_v4.s_addr = rp->ipRouteNextHop;
1633
1634 /*
1635 * Rescan the routing table looking for interface routes that
1636 * are on the same subnet, and try to add them. If they're
1637 * not relevant (e.g., the interface route isn't part of an
1638 * IPMP group, router_add_common() will discard).
1639 */
1640 for (rp1 = buf; rp1 < endp; rp1++) {
1641 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
1642 rp1->ipRouteIfIndex.o_length == 0)
1643 continue;
1644
1645 if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
1646 (nexthop_v4.s_addr & rp1->ipRouteMask))
1647 continue;
1648
1649 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
1650 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1651 router_add_common(AF_INET, ifname, nexthop);
1652 }
1653 }
1654 }
1655
1656 void
router_add_common(int af,char * ifname,struct in6_addr nexthop)1657 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1658 {
1659 struct phyint_instance *pii;
1660 struct phyint *pi;
1661
1662 if (debug & D_TARGET)
1663 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1664
1665 /*
1666 * Retrieve the phyint instance; bail if it's not known to us yet.
1667 */
1668 pii = phyint_inst_lookup(af, ifname);
1669 if (pii == NULL)
1670 return;
1671
1672 /*
1673 * Don't use our own addresses as targets.
1674 */
1675 if (own_address(nexthop))
1676 return;
1677
1678 /*
1679 * If the phyint is part a named group, then add the address to all
1680 * members of the group; note that this is suboptimal in the IPv4 case
1681 * as it has already been added to all matching interfaces in
1682 * ire_process_v4(). Otherwise, add the address only to the phyint
1683 * itself, since other phyints in the anongroup may not be on the same
1684 * subnet.
1685 */
1686 pi = pii->pii_phyint;
1687 if (pi->pi_group == phyint_anongroup) {
1688 target_add(pii, nexthop, _B_TRUE);
1689 } else {
1690 pi = pi->pi_group->pg_phyint;
1691 for (; pi != NULL; pi = pi->pi_pgnext)
1692 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1693 }
1694 }
1695
1696 /*
1697 * Examine the IPv6 routing table `buf' for possible link-local targets, and
1698 * pass any contenders to router_add_common() for further consideration.
1699 */
1700 static void
ire_process_v6(mib2_ipv6RouteEntry_t * buf,size_t len)1701 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1702 {
1703 struct lifreq lifr;
1704 char ifname[LIFNAMSIZ];
1705 char grname[LIFGRNAMSIZ];
1706 mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
1707 struct in6_addr nexthop_v6;
1708
1709 if (debug & D_TARGET)
1710 logdebug("ire_process_v6(len %d)\n", len);
1711
1712 if (len == 0)
1713 return;
1714
1715 assert((len % ipv6RouteEntrySize) == 0);
1716 endp = buf + (len / ipv6RouteEntrySize);
1717
1718 /*
1719 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1720 * cross-reference them with the interface routes to determine if
1721 * they're possible probe targets.
1722 */
1723 for (rp = buf; rp < endp; rp++) {
1724 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
1725 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
1726 continue;
1727
1728 /* Get the nexthop address. */
1729 nexthop_v6 = rp->ipv6RouteNextHop;
1730
1731 /*
1732 * The interface name should always exist for link-locals;
1733 * we use it to map this entry to an IPMP group name.
1734 */
1735 if (rp->ipv6RouteIfIndex.o_length == 0)
1736 continue;
1737
1738 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
1739 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
1740 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
1741 continue;
1742 }
1743
1744 /*
1745 * Rescan the list of routes for interface routes, and add the
1746 * above target to any interfaces in the same IPMP group.
1747 */
1748 for (rp1 = buf; rp1 < endp; rp1++) {
1749 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
1750 rp1->ipv6RouteIfIndex.o_length == 0) {
1751 continue;
1752 }
1753 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
1754 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
1755
1756 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
1757 strcmp(lifr.lifr_groupname, grname) == 0) {
1758 router_add_common(AF_INET6, ifname, nexthop_v6);
1759 }
1760 }
1761 }
1762 }
1763
1764 /*
1765 * Build a list of target routers, by scanning the routing tables.
1766 * It is assumed that interface routes exist, to reach the routers.
1767 */
1768 static void
init_router_targets(void)1769 init_router_targets(void)
1770 {
1771 struct target *tg;
1772 struct target *next_tg;
1773 struct phyint_instance *pii;
1774 struct phyint *pi;
1775
1776 if (force_mcast)
1777 return;
1778
1779 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1780 pi = pii->pii_phyint;
1781 /*
1782 * Set tg_in_use to false only for router targets.
1783 */
1784 if (!pii->pii_targets_are_routers)
1785 continue;
1786
1787 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1788 tg->tg_in_use = 0;
1789 }
1790
1791 if (mibwalk(update_router_list) == -1)
1792 exit(1);
1793
1794 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1795 pi = pii->pii_phyint;
1796 if (!pii->pii_targets_are_routers)
1797 continue;
1798
1799 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1800 next_tg = tg->tg_next;
1801 /*
1802 * If the group has failed, it's likely the route was
1803 * removed by an application affected by that failure.
1804 * In that case, we keep the target so that we can
1805 * reliably repair, at which point we'll refresh the
1806 * target list again.
1807 */
1808 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
1809 target_delete(tg);
1810 }
1811 }
1812 }
1813
1814 /*
1815 * Attempt to assign host targets to any interfaces that do not currently
1816 * have probe targets by sharing targets with other interfaces in the group.
1817 */
1818 static void
init_host_targets(void)1819 init_host_targets(void)
1820 {
1821 struct phyint_instance *pii;
1822 struct phyint_group *pg;
1823
1824 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1825 pg = pii->pii_phyint->pi_group;
1826 if (pg != phyint_anongroup && pii->pii_targets == NULL)
1827 dup_host_targets(pii);
1828 }
1829 }
1830
1831 /*
1832 * Duplicate host targets from other phyints of the group to
1833 * the phyint instance 'desired_pii'.
1834 */
1835 static void
dup_host_targets(struct phyint_instance * desired_pii)1836 dup_host_targets(struct phyint_instance *desired_pii)
1837 {
1838 int af;
1839 struct phyint *pi;
1840 struct phyint_instance *pii;
1841 struct target *tg;
1842
1843 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
1844
1845 af = desired_pii->pii_af;
1846
1847 /*
1848 * For every phyint in the same group as desired_pii, check if
1849 * it has any host targets. If so add them to desired_pii.
1850 */
1851 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
1852 pii = PHYINT_INSTANCE(pi, af);
1853 /*
1854 * We know that we don't have targets on this phyint instance
1855 * since we have been called. But we still check for
1856 * pii_targets_are_routers because another phyint instance
1857 * could have router targets, since IFF_NOFAILOVER addresses
1858 * on different phyint instances may belong to different
1859 * subnets.
1860 */
1861 if ((pii == NULL) || (pii == desired_pii) ||
1862 pii->pii_targets_are_routers)
1863 continue;
1864 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1865 target_create(desired_pii, tg->tg_address, _B_FALSE);
1866 }
1867 }
1868 }
1869
/*
 * Print a one line usage message for the program name `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	FILE	*out = stderr;

	(void) fprintf(out, "usage: %s\n", cmd);
}
1875
1876
1877 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd"
1878
1879 /* Get an option from the /etc/default/mpathd file */
1880 static char *
getdefault(char * name)1881 getdefault(char *name)
1882 {
1883 char namebuf[BUFSIZ];
1884 char *value = NULL;
1885
1886 if (defopen(MPATHD_DEFAULT_FILE) == 0) {
1887 char *cp;
1888 int flags;
1889
1890 /*
1891 * ignore case
1892 */
1893 flags = defcntl(DC_GETFLAGS, 0);
1894 TURNOFF(flags, DC_CASE);
1895 (void) defcntl(DC_SETFLAGS, flags);
1896
1897 /* Add "=" to the name */
1898 (void) strncpy(namebuf, name, sizeof (namebuf) - 2);
1899 (void) strncat(namebuf, "=", 2);
1900
1901 if ((cp = defread(namebuf)) != NULL)
1902 value = strdup(cp);
1903
1904 /* close */
1905 (void) defopen((char *)NULL);
1906 }
1907 return (value);
1908 }
1909
1910
1911 /*
1912 * Command line options below
1913 */
1914 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */
1915 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */
1916 static boolean_t adopt = _B_FALSE;
1917 static boolean_t foreground = _B_FALSE;
1918
1919 int
main(int argc,char * argv[])1920 main(int argc, char *argv[])
1921 {
1922 int i;
1923 int c;
1924 struct phyint *pi;
1925 struct phyint_instance *pii;
1926 char *value;
1927
1928 argv0 = argv; /* Saved for re-exec on SIGHUP */
1929 srandom(gethostid()); /* Initialize the random number generator */
1930
1931 /*
1932 * NOTE: The messages output by in.mpathd are not suitable for
1933 * translation, so we do not call textdomain().
1934 */
1935 (void) setlocale(LC_ALL, "");
1936
1937 /*
1938 * Get the user specified value of 'failure detection time'
1939 * from /etc/default/mpathd
1940 */
1941 value = getdefault("FAILURE_DETECTION_TIME");
1942 if (value != NULL) {
1943 user_failure_detection_time =
1944 (int)strtol((char *)value, NULL, 0);
1945
1946 if (user_failure_detection_time <= 0) {
1947 user_failure_detection_time = FAILURE_DETECTION_TIME;
1948 logerr("Invalid failure detection time %s, assuming "
1949 "default of %d ms\n", value,
1950 user_failure_detection_time);
1951
1952 } else if (user_failure_detection_time <
1953 MIN_FAILURE_DETECTION_TIME) {
1954 user_failure_detection_time =
1955 MIN_FAILURE_DETECTION_TIME;
1956 logerr("Too small failure detection time of %s, "
1957 "assuming minimum of %d ms\n", value,
1958 user_failure_detection_time);
1959 }
1960 free(value);
1961 } else {
1962 /* User has not specified the parameter, Use default value */
1963 user_failure_detection_time = FAILURE_DETECTION_TIME;
1964 }
1965
1966 /*
1967 * This gives the frequency at which probes will be sent.
1968 * When fdt ms elapses, we should be able to determine
1969 * whether 5 consecutive probes have failed or not.
1970 * 1 probe will be sent in every user_probe_interval ms,
1971 * randomly anytime in the (0.5 - 1.0) 2nd half of every
1972 * user_probe_interval. Thus when we send out probe 'n' we
1973 * can be sure that probe 'n - 2' is lost, if we have not
1974 * got the ack. (since the probe interval is > crtt). But
1975 * probe 'n - 1' may be a valid unacked probe, since the
1976 * time between 2 successive probes could be as small as
1977 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2
1978 */
1979 user_probe_interval = user_failure_detection_time /
1980 (NUM_PROBE_FAILS + 2);
1981
1982 /*
1983 * Get the user specified value of failback_enabled from
1984 * /etc/default/mpathd
1985 */
1986 value = getdefault("FAILBACK");
1987 if (value != NULL) {
1988 if (strcasecmp(value, "yes") == 0)
1989 failback_enabled = _B_TRUE;
1990 else if (strcasecmp(value, "no") == 0)
1991 failback_enabled = _B_FALSE;
1992 else
1993 logerr("Invalid value for FAILBACK %s\n", value);
1994 free(value);
1995 } else {
1996 failback_enabled = _B_TRUE;
1997 }
1998
1999 /*
2000 * Get the user specified value of track_all_phyints from
2001 * /etc/default/mpathd. The sense is reversed in
2002 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2003 */
2004 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2005 if (value != NULL) {
2006 if (strcasecmp(value, "yes") == 0)
2007 track_all_phyints = _B_FALSE;
2008 else if (strcasecmp(value, "no") == 0)
2009 track_all_phyints = _B_TRUE;
2010 else
2011 logerr("Invalid value for "
2012 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2013 free(value);
2014 } else {
2015 track_all_phyints = _B_FALSE;
2016 }
2017
2018 while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2019 switch (c) {
2020 case 'a':
2021 adopt = _B_TRUE;
2022 break;
2023 case 'm':
2024 force_mcast = _B_TRUE;
2025 break;
2026 case 'd':
2027 debug = D_ALL;
2028 foreground = _B_TRUE;
2029 break;
2030 case 'D':
2031 i = (int)strtol(optarg, NULL, 0);
2032 if (i == 0) {
2033 (void) fprintf(stderr, "Bad debug flags: %s\n",
2034 optarg);
2035 exit(1);
2036 }
2037 debug |= i;
2038 foreground = _B_TRUE;
2039 break;
2040 case 'l':
2041 /*
2042 * Turn off link state notification handling.
2043 * Undocumented command line flag, for debugging
2044 * purposes.
2045 */
2046 handle_link_notifications = _B_FALSE;
2047 break;
2048 default:
2049 usage(argv[0]);
2050 exit(1);
2051 }
2052 }
2053
2054 /*
2055 * The sockets for the loopback command interface should be listening
2056 * before we fork and exit in daemonize(). This way, whoever started us
2057 * can use the loopback interface as soon as they get a zero exit
2058 * status.
2059 */
2060 lsock_v4 = setup_listener(AF_INET);
2061 lsock_v6 = setup_listener(AF_INET6);
2062
2063 if (lsock_v4 < 0 && lsock_v6 < 0) {
2064 logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2065 exit(1);
2066 }
2067
2068 if (!foreground) {
2069 if (!daemonize()) {
2070 logerr("cannot daemonize\n");
2071 exit(EXIT_FAILURE);
2072 }
2073 initlog();
2074 }
2075
2076 /*
2077 * Initializations:
2078 * 1. Create ifsock* sockets. These are used for performing SIOC*
2079 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2080 * 2. Initialize a pipe for handling/recording signal events.
2081 * 3. Create the routing sockets, used for listening
2082 * to routing / interface changes.
2083 * 4. phyint_init() - Initialize physical interface state
2084 * (in mpd_tables.c). Must be done before creating interfaces,
2085 * which timer_init() does indirectly.
2086 * 5. Query kernel for route entry sizes (v4 and v6).
2087 * 6. timer_init() - Initialize timer related stuff
2088 * 7. initifs() - Initialize our database of all known interfaces
2089 * 8. init_router_targets() - Initialize our database of all known
2090 * router targets.
2091 */
2092 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2093 if (ifsock_v4 < 0) {
2094 logperror("main: IPv4 socket open");
2095 exit(1);
2096 }
2097
2098 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2099 if (ifsock_v6 < 0) {
2100 logperror("main: IPv6 socket open");
2101 exit(1);
2102 }
2103
2104 setup_eventpipe();
2105
2106 rtsock_v4 = setup_rtsock(AF_INET);
2107 rtsock_v6 = setup_rtsock(AF_INET6);
2108
2109 if (phyint_init() == -1) {
2110 logerr("cannot initialize physical interface structures");
2111 exit(1);
2112 }
2113
2114 if (mibwalk(mib_get_constants) == -1)
2115 exit(1);
2116
2117 timer_init();
2118
2119 initifs();
2120
2121 /*
2122 * If we're operating in "adopt" mode and no interfaces need to be
2123 * tracked, shut down (ifconfig(8) will restart us on demand if
2124 * interfaces are subsequently put into multipathing groups).
2125 */
2126 if (adopt && phyint_instances == NULL)
2127 exit(0);
2128
2129 /*
2130 * Main body. Keep listening for activity on any of the sockets
2131 * that we are monitoring and take appropriate action as necessary.
2132 * signals are also handled synchronously.
2133 */
2134 for (;;) {
2135 if (poll(pollfds, pollfd_num, -1) < 0) {
2136 if (errno == EINTR)
2137 continue;
2138 logperror("main: poll");
2139 exit(1);
2140 }
2141 for (i = 0; i < pollfd_num; i++) {
2142 if ((pollfds[i].fd == -1) ||
2143 !(pollfds[i].revents & POLLIN))
2144 continue;
2145 if (pollfds[i].fd == eventpipe_read) {
2146 in_signal(eventpipe_read);
2147 break;
2148 }
2149 if (pollfds[i].fd == rtsock_v4 ||
2150 pollfds[i].fd == rtsock_v6) {
2151 process_rtsock(rtsock_v4, rtsock_v6);
2152 break;
2153 }
2154
2155 for (pii = phyint_instances; pii != NULL;
2156 pii = pii->pii_next) {
2157 if (pollfds[i].fd == pii->pii_probe_sock) {
2158 if (pii->pii_af == AF_INET)
2159 in_data(pii);
2160 else
2161 in6_data(pii);
2162 break;
2163 }
2164 }
2165
2166 for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2167 if (pi->pi_notes != 0 &&
2168 pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
2169 (void) dlpi_recv(pi->pi_dh, NULL, NULL,
2170 NULL, NULL, 0, NULL);
2171 break;
2172 }
2173 }
2174
2175 if (pollfds[i].fd == lsock_v4)
2176 loopback_cmd(lsock_v4, AF_INET);
2177 else if (pollfds[i].fd == lsock_v6)
2178 loopback_cmd(lsock_v6, AF_INET6);
2179 }
2180 }
2181 /* NOTREACHED */
2182 return (EXIT_SUCCESS);
2183 }
2184
2185 static int
setup_listener(int af)2186 setup_listener(int af)
2187 {
2188 int sock;
2189 int on;
2190 int len;
2191 int ret;
2192 struct sockaddr_storage laddr;
2193 struct sockaddr_in *sin;
2194 struct sockaddr_in6 *sin6;
2195 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2196
2197 assert(af == AF_INET || af == AF_INET6);
2198
2199 sock = socket(af, SOCK_STREAM, 0);
2200 if (sock < 0) {
2201 logperror("setup_listener: socket");
2202 exit(1);
2203 }
2204
2205 on = 1;
2206 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2207 sizeof (on)) < 0) {
2208 logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2209 exit(1);
2210 }
2211
2212 bzero(&laddr, sizeof (laddr));
2213 laddr.ss_family = af;
2214
2215 if (af == AF_INET) {
2216 sin = (struct sockaddr_in *)&laddr;
2217 sin->sin_port = htons(MPATHD_PORT);
2218 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2219 len = sizeof (struct sockaddr_in);
2220 } else {
2221 sin6 = (struct sockaddr_in6 *)&laddr;
2222 sin6->sin6_port = htons(MPATHD_PORT);
2223 sin6->sin6_addr = loopback_addr;
2224 len = sizeof (struct sockaddr_in6);
2225 }
2226
2227 ret = bind(sock, (struct sockaddr *)&laddr, len);
2228 if (ret < 0) {
2229 if (errno == EADDRINUSE) {
2230 /*
2231 * Another instance of mpathd may be already active.
2232 */
2233 logerr("main: is another instance of in.mpathd "
2234 "already active?\n");
2235 exit(1);
2236 } else {
2237 (void) close(sock);
2238 return (-1);
2239 }
2240 }
2241 if (listen(sock, 30) < 0) {
2242 logperror("main: listen");
2243 exit(1);
2244 }
2245 if (poll_add(sock) == -1) {
2246 (void) close(sock);
2247 exit(1);
2248 }
2249
2250 return (sock);
2251 }
2252
2253 /*
2254 * Table of commands and their expected size; used by loopback_cmd().
2255 */
2256 static struct {
2257 const char *name;
2258 unsigned int size;
2259 } commands[] = {
2260 { "MI_PING", sizeof (uint32_t) },
2261 { "MI_OFFLINE", sizeof (mi_offline_t) },
2262 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) },
2263 { "MI_QUERY", sizeof (mi_query_t) }
2264 };
2265
2266 /*
2267 * Commands received over the loopback interface come here (via libipmp).
2268 */
2269 static void
loopback_cmd(int sock,int family)2270 loopback_cmd(int sock, int family)
2271 {
2272 int newfd;
2273 ssize_t len;
2274 boolean_t is_priv = _B_FALSE;
2275 struct sockaddr_storage peer;
2276 struct sockaddr_in *peer_sin;
2277 struct sockaddr_in6 *peer_sin6;
2278 socklen_t peerlen;
2279 union mi_commands mpi;
2280 char abuf[INET6_ADDRSTRLEN];
2281 uint_t cmd;
2282 int retval;
2283
2284 peerlen = sizeof (peer);
2285 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2286 if (newfd < 0) {
2287 logperror("loopback_cmd: accept");
2288 return;
2289 }
2290
2291 switch (family) {
2292 case AF_INET:
2293 /*
2294 * Validate the address and port to make sure that
2295 * non privileged processes don't connect and start
2296 * talking to us.
2297 */
2298 if (peerlen != sizeof (struct sockaddr_in)) {
2299 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2300 (void) close(newfd);
2301 return;
2302 }
2303 peer_sin = (struct sockaddr_in *)&peer;
2304 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
2305 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2306 abuf, sizeof (abuf));
2307
2308 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
2309 logerr("Attempt to connect from addr %s port %d\n",
2310 abuf, ntohs(peer_sin->sin_port));
2311 (void) close(newfd);
2312 return;
2313 }
2314 break;
2315
2316 case AF_INET6:
2317 if (peerlen != sizeof (struct sockaddr_in6)) {
2318 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2319 (void) close(newfd);
2320 return;
2321 }
2322 /*
2323 * Validate the address and port to make sure that
2324 * non privileged processes don't connect and start
2325 * talking to us.
2326 */
2327 peer_sin6 = (struct sockaddr_in6 *)&peer;
2328 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
2329 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2330 sizeof (abuf));
2331 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
2332 logerr("Attempt to connect from addr %s port %d\n",
2333 abuf, ntohs(peer_sin6->sin6_port));
2334 (void) close(newfd);
2335 return;
2336 }
2337 break;
2338
2339 default:
2340 logdebug("loopback_cmd: family %d\n", family);
2341 (void) close(newfd);
2342 return;
2343 }
2344
2345 /*
2346 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2347 * all supported commands
2348 */
2349 len = read(newfd, &mpi, sizeof (mpi));
2350
2351 /*
2352 * In theory, we can receive any sized message for a stream socket,
2353 * but we don't expect that to happen for a small message over a
2354 * loopback connection.
2355 */
2356 if (len < sizeof (uint32_t)) {
2357 logerr("loopback_cmd: bad command format or read returns "
2358 "partial data %d\n", len);
2359 (void) close(newfd);
2360 return;
2361 }
2362
2363 cmd = mpi.mi_command;
2364 if (cmd >= MI_NCMD) {
2365 logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2366 (void) close(newfd);
2367 return;
2368 }
2369
2370 /*
2371 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2372 */
2373 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
2374 logerr("Unprivileged request from %s for privileged "
2375 "command %s\n", abuf, commands[cmd].name);
2376 (void) close(newfd);
2377 return;
2378 }
2379
2380 if (len < commands[cmd].size) {
2381 logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2382 commands[cmd].name, commands[cmd].size, len);
2383 (void) close(newfd);
2384 return;
2385 }
2386
2387 retval = process_cmd(newfd, &mpi);
2388 if (retval != IPMP_SUCCESS) {
2389 logerr("failed processing %s: %s\n", commands[cmd].name,
2390 ipmp_errmsg(retval));
2391 }
2392 (void) close(newfd);
2393 }
2394
2395 /*
2396 * Process the commands received via libipmp.
2397 */
2398 static unsigned int
process_cmd(int newfd,union mi_commands * mpi)2399 process_cmd(int newfd, union mi_commands *mpi)
2400 {
2401 struct phyint *pi;
2402 struct mi_offline *mio;
2403 struct mi_undo_offline *miu;
2404 unsigned int retval;
2405
2406 switch (mpi->mi_command) {
2407 case MI_PING:
2408 return (send_result(newfd, IPMP_SUCCESS, 0));
2409
2410 case MI_OFFLINE:
2411 mio = &mpi->mi_ocmd;
2412
2413 pi = phyint_lookup(mio->mio_ifname);
2414 if (pi == NULL)
2415 return (send_result(newfd, IPMP_EUNKIF, 0));
2416
2417 retval = phyint_offline(pi, mio->mio_min_redundancy);
2418 if (retval == IPMP_FAILURE)
2419 return (send_result(newfd, IPMP_FAILURE, errno));
2420
2421 return (send_result(newfd, retval, 0));
2422
2423 case MI_UNDO_OFFLINE:
2424 miu = &mpi->mi_ucmd;
2425
2426 pi = phyint_lookup(miu->miu_ifname);
2427 if (pi == NULL)
2428 return (send_result(newfd, IPMP_EUNKIF, 0));
2429
2430 retval = phyint_undo_offline(pi);
2431 if (retval == IPMP_FAILURE)
2432 return (send_result(newfd, IPMP_FAILURE, errno));
2433
2434 return (send_result(newfd, retval, 0));
2435
2436 case MI_QUERY:
2437 return (process_query(newfd, &mpi->mi_qcmd));
2438
2439 default:
2440 break;
2441 }
2442
2443 return (send_result(newfd, IPMP_EPROTO, 0));
2444 }
2445
2446 /*
2447 * Process the query request pointed to by `miq' and send a reply on file
2448 * descriptor `fd'. Returns an IPMP error code.
2449 */
2450 static unsigned int
process_query(int fd,mi_query_t * miq)2451 process_query(int fd, mi_query_t *miq)
2452 {
2453 ipmp_addrinfo_t *adinfop;
2454 ipmp_addrinfolist_t *adlp;
2455 ipmp_groupinfo_t *grinfop;
2456 ipmp_groupinfolist_t *grlp;
2457 ipmp_grouplist_t *grlistp;
2458 ipmp_ifinfo_t *ifinfop;
2459 ipmp_ifinfolist_t *iflp;
2460 ipmp_snap_t *snap;
2461 unsigned int retval;
2462
2463 switch (miq->miq_inforeq) {
2464 case IPMP_ADDRINFO:
2465 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
2466 &adinfop);
2467 if (retval != IPMP_SUCCESS)
2468 return (send_result(fd, retval, errno));
2469
2470 retval = send_result(fd, IPMP_SUCCESS, 0);
2471 if (retval == IPMP_SUCCESS)
2472 retval = send_addrinfo(fd, adinfop);
2473
2474 ipmp_freeaddrinfo(adinfop);
2475 return (retval);
2476
2477 case IPMP_GROUPLIST:
2478 retval = getgrouplist(&grlistp);
2479 if (retval != IPMP_SUCCESS)
2480 return (send_result(fd, retval, errno));
2481
2482 retval = send_result(fd, IPMP_SUCCESS, 0);
2483 if (retval == IPMP_SUCCESS)
2484 retval = send_grouplist(fd, grlistp);
2485
2486 ipmp_freegrouplist(grlistp);
2487 return (retval);
2488
2489 case IPMP_GROUPINFO:
2490 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2491 retval = getgroupinfo(miq->miq_grname, &grinfop);
2492 if (retval != IPMP_SUCCESS)
2493 return (send_result(fd, retval, errno));
2494
2495 retval = send_result(fd, IPMP_SUCCESS, 0);
2496 if (retval == IPMP_SUCCESS)
2497 retval = send_groupinfo(fd, grinfop);
2498
2499 ipmp_freegroupinfo(grinfop);
2500 return (retval);
2501
2502 case IPMP_IFINFO:
2503 miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2504 retval = getifinfo(miq->miq_ifname, &ifinfop);
2505 if (retval != IPMP_SUCCESS)
2506 return (send_result(fd, retval, errno));
2507
2508 retval = send_result(fd, IPMP_SUCCESS, 0);
2509 if (retval == IPMP_SUCCESS)
2510 retval = send_ifinfo(fd, ifinfop);
2511
2512 ipmp_freeifinfo(ifinfop);
2513 return (retval);
2514
2515 case IPMP_SNAP:
2516 /*
2517 * Before taking the snapshot, sync with the kernel.
2518 */
2519 initifs();
2520
2521 retval = getsnap(&snap);
2522 if (retval != IPMP_SUCCESS)
2523 return (send_result(fd, retval, errno));
2524
2525 retval = send_result(fd, IPMP_SUCCESS, 0);
2526 if (retval != IPMP_SUCCESS)
2527 goto out;
2528
2529 retval = send_grouplist(fd, snap->sn_grlistp);
2530 if (retval != IPMP_SUCCESS)
2531 goto out;
2532
2533 retval = ipmp_writetlv(fd, IPMP_IFCNT, sizeof (uint32_t),
2534 &snap->sn_nif);
2535 if (retval != IPMP_SUCCESS)
2536 goto out;
2537
2538 iflp = snap->sn_ifinfolistp;
2539 for (; iflp != NULL; iflp = iflp->ifl_next) {
2540 retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2541 if (retval != IPMP_SUCCESS)
2542 goto out;
2543 }
2544
2545 retval = ipmp_writetlv(fd, IPMP_GROUPCNT, sizeof (uint32_t),
2546 &snap->sn_ngroup);
2547 if (retval != IPMP_SUCCESS)
2548 goto out;
2549
2550 grlp = snap->sn_grinfolistp;
2551 for (; grlp != NULL; grlp = grlp->grl_next) {
2552 retval = send_groupinfo(fd, grlp->grl_grinfop);
2553 if (retval != IPMP_SUCCESS)
2554 goto out;
2555 }
2556
2557 retval = ipmp_writetlv(fd, IPMP_ADDRCNT, sizeof (uint32_t),
2558 &snap->sn_naddr);
2559 if (retval != IPMP_SUCCESS)
2560 goto out;
2561
2562 adlp = snap->sn_adinfolistp;
2563 for (; adlp != NULL; adlp = adlp->adl_next) {
2564 retval = send_addrinfo(fd, adlp->adl_adinfop);
2565 if (retval != IPMP_SUCCESS)
2566 goto out;
2567 }
2568 out:
2569 ipmp_snap_free(snap);
2570 return (retval);
2571
2572 default:
2573 break;
2574
2575 }
2576 return (send_result(fd, IPMP_EPROTO, 0));
2577 }
2578
2579 /*
2580 * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2581 * Returns an IPMP error code.
2582 */
2583 static unsigned int
send_groupinfo(int fd,ipmp_groupinfo_t * grinfop)2584 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2585 {
2586 ipmp_iflist_t *iflistp = grinfop->gr_iflistp;
2587 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp;
2588 ipmp_groupinfo_xfer_t grxfer;
2589 unsigned int retval;
2590
2591 /*
2592 * We can't directly transfer an ipmp_groupinfo_t due to the embedded
2593 * pointers to ipmp_iflist_t and ipmp_addr_list_t. Copy the data over
2594 * to a temporary transfer structure that doesn't have these embedded
2595 * pointers.
2596 */
2597 memset(&grxfer, 0, sizeof (grxfer));
2598
2599 grxfer.grx_sig = grinfop->gr_sig;
2600 grxfer.grx_state = grinfop->gr_state;
2601 grxfer.grx_fdt = grinfop->gr_fdt;
2602
2603 memcpy(grxfer.grx_name, grinfop->gr_name, sizeof (grxfer.grx_name));
2604 memcpy(grxfer.grx_ifname, grinfop->gr_ifname,
2605 sizeof (grxfer.grx_ifname));
2606 memcpy(grxfer.grx_m4ifname, grinfop->gr_m4ifname,
2607 sizeof (grxfer.grx_m4ifname));
2608 memcpy(grxfer.grx_m6ifname, grinfop->gr_m6ifname,
2609 sizeof (grxfer.grx_m6ifname));
2610 memcpy(grxfer.grx_bcifname, grinfop->gr_bcifname,
2611 sizeof (grxfer.grx_bcifname));
2612
2613 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (grxfer), &grxfer);
2614 if (retval != IPMP_SUCCESS)
2615 return (retval);
2616
2617 retval = ipmp_writetlv(fd, IPMP_IFLIST,
2618 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
2619 if (retval != IPMP_SUCCESS)
2620 return (retval);
2621
2622 return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2623 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
2624 }
2625
2626 /*
2627 * Send the interface information pointed to by `ifinfop' on file descriptor
2628 * `fd'. Returns an IPMP error code.
2629 */
2630 static unsigned int
send_ifinfo(int fd,ipmp_ifinfo_t * ifinfop)2631 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2632 {
2633 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp;
2634 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp;
2635 ipmp_ifinfo_xfer_t ifxfer;
2636 unsigned int retval;
2637
2638 /*
2639 * We can't directly tranfer an ipmp_ifinfo_t due to the embedded
2640 * ipmp_addrlist_t pointer in if_targinfo_t. Copy the data over to
2641 * a temporary transfer structure that doesn't have that embedded
2642 * pointer.
2643 */
2644 memset(&ifxfer, 0, sizeof (ifxfer));
2645
2646 ifxfer.ifx_state = ifinfop->if_state;
2647 ifxfer.ifx_type = ifinfop->if_type;
2648 ifxfer.ifx_linkstate = ifinfop->if_linkstate;
2649 ifxfer.ifx_probestate = ifinfop->if_probestate;
2650 ifxfer.ifx_flags = ifinfop->if_flags;
2651 ifxfer.ifx_targinfo4.itx_testaddr = ifinfop->if_targinfo4.it_testaddr;
2652 ifxfer.ifx_targinfo4.itx_targmode = ifinfop->if_targinfo4.it_targmode;
2653 ifxfer.ifx_targinfo6.itx_testaddr = ifinfop->if_targinfo6.it_testaddr;
2654 ifxfer.ifx_targinfo6.itx_targmode = ifinfop->if_targinfo6.it_targmode;
2655
2656 memcpy(ifxfer.ifx_name, ifinfop->if_name, sizeof (ifxfer.ifx_name));
2657 memcpy(ifxfer.ifx_group, ifinfop->if_group, sizeof (ifxfer.ifx_group));
2658 memcpy(ifxfer.ifx_targinfo4.itx_name, ifinfop->if_targinfo4.it_name,
2659 sizeof (ifxfer.ifx_targinfo4.itx_name));
2660 memcpy(ifxfer.ifx_targinfo6.itx_name, ifinfop->if_targinfo6.it_name,
2661 sizeof (ifxfer.ifx_targinfo6.itx_name));
2662
2663 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (ifxfer), &ifxfer);
2664 if (retval != IPMP_SUCCESS)
2665 return (retval);
2666
2667 retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
2668 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
2669 if (retval != IPMP_SUCCESS)
2670 return (retval);
2671
2672 return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2673 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
2674 }
2675
2676 /*
2677 * Send the address information pointed to by `adinfop' on file descriptor
2678 * `fd'. Returns an IPMP error code.
2679 */
2680 static unsigned int
send_addrinfo(int fd,ipmp_addrinfo_t * adinfop)2681 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
2682 {
2683 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
2684 }
2685
2686 /*
2687 * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2688 * Returns an IPMP error code.
2689 */
2690 static unsigned int
send_grouplist(int fd,ipmp_grouplist_t * grlistp)2691 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2692 {
2693 return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2694 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2695 }
2696
2697 /*
2698 * Initialize an mi_result_t structure using `error' and `syserror' and
2699 * send it on file descriptor `fd'. Returns an IPMP error code.
2700 */
2701 static unsigned int
send_result(int fd,unsigned int error,int syserror)2702 send_result(int fd, unsigned int error, int syserror)
2703 {
2704 mi_result_t me;
2705
2706 me.me_mpathd_error = error;
2707 if (error == IPMP_FAILURE)
2708 me.me_sys_error = syserror;
2709 else
2710 me.me_sys_error = 0;
2711
2712 return (ipmp_write(fd, &me, sizeof (me)));
2713 }
2714
2715 /*
2716 * Daemonize the process.
2717 */
2718 static boolean_t
daemonize(void)2719 daemonize(void)
2720 {
2721 switch (fork()) {
2722 case -1:
2723 return (_B_FALSE);
2724
2725 case 0:
2726 /*
2727 * Lose our controlling terminal, and become both a session
2728 * leader and a process group leader.
2729 */
2730 if (setsid() == -1)
2731 return (_B_FALSE);
2732
2733 /*
2734 * Under POSIX, a session leader can accidentally (through
2735 * open(2)) acquire a controlling terminal if it does not
2736 * have one. Just to be safe, fork() again so we are not a
2737 * session leader.
2738 */
2739 switch (fork()) {
2740 case -1:
2741 return (_B_FALSE);
2742
2743 case 0:
2744 (void) chdir("/");
2745 (void) umask(022);
2746 (void) fdwalk(closefunc, NULL);
2747 break;
2748
2749 default:
2750 _exit(EXIT_SUCCESS);
2751 }
2752 break;
2753
2754 default:
2755 _exit(EXIT_SUCCESS);
2756 }
2757
2758 return (_B_TRUE);
2759 }
2760
2761 /*
2762 * The parent has created some fds before forking on purpose, keep them open.
2763 */
2764 static int
closefunc(void * not_used,int fd)2765 closefunc(void *not_used, int fd)
2766 {
2767 if (fd != lsock_v4 && fd != lsock_v6)
2768 (void) close(fd);
2769 return (0);
2770 }
2771
2772 /* LOGGER */
2773
2774 #include <syslog.h>
2775
2776 /*
2777 * Logging routines. All routines log to syslog, unless the daemon is
2778 * running in the foreground, in which case the logging goes to stderr.
2779 *
2780 * The following routines are available:
2781 *
2782 * logdebug(): A printf-like function for outputting debug messages
2783 * (messages at LOG_DEBUG) that are only of use to developers.
2784 *
2785 * logtrace(): A printf-like function for outputting tracing messages
2786 * (messages at LOG_INFO) from the daemon. This is typically used
2787 * to log the receipt of interesting network-related conditions.
2788 *
2789 * logerr(): A printf-like function for outputting error messages
2790 * (messages at LOG_ERR) from the daemon.
2791 *
2792 * logperror*(): A set of functions used to output error messages
2793 * (messages at LOG_ERR); these automatically append strerror(errno)
2794 * and a newline to the message passed to them.
2795 *
2796 * NOTE: since the logging functions write to syslog, the messages passed
2797 * to them are not eligible for localization. Thus, gettext() must
2798 * *not* be used.
2799 */
2800
/* Non-zero once initlog() has run; the log routines then use syslog. */
static int logging = 0;

/*
 * Open the syslog connection for the daemon and mark logging as
 * initialized so that subsequent messages go to syslog.
 */
static void
initlog(void)
{
	openlog("in.mpathd", LOG_PID, LOG_DAEMON);
	logging++;
}
2809
2810 /* PRINTFLIKE2 */
2811 void
logmsg(int pri,const char * fmt,...)2812 logmsg(int pri, const char *fmt, ...)
2813 {
2814 va_list ap;
2815
2816 va_start(ap, fmt);
2817
2818 if (logging)
2819 vsyslog(pri, fmt, ap);
2820 else
2821 (void) vfprintf(stderr, fmt, ap);
2822 va_end(ap);
2823 }
2824
2825 /* PRINTFLIKE1 */
2826 void
logperror(const char * str)2827 logperror(const char *str)
2828 {
2829 if (logging)
2830 syslog(LOG_ERR, "%s: %m\n", str);
2831 else
2832 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
2833 }
2834
2835 void
logperror_pii(struct phyint_instance * pii,const char * str)2836 logperror_pii(struct phyint_instance *pii, const char *str)
2837 {
2838 if (logging) {
2839 syslog(LOG_ERR, "%s (%s %s): %m\n",
2840 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
2841 } else {
2842 (void) fprintf(stderr, "%s (%s %s): %s\n",
2843 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
2844 strerror(errno));
2845 }
2846 }
2847
2848 void
logperror_li(struct logint * li,const char * str)2849 logperror_li(struct logint *li, const char *str)
2850 {
2851 struct phyint_instance *pii = li->li_phyint_inst;
2852
2853 if (logging) {
2854 syslog(LOG_ERR, "%s (%s %s): %m\n",
2855 str, AF_STR(pii->pii_af), li->li_name);
2856 } else {
2857 (void) fprintf(stderr, "%s (%s %s): %s\n",
2858 str, AF_STR(pii->pii_af), li->li_name,
2859 strerror(errno));
2860 }
2861 }
2862
2863 void
close_probe_socket(struct phyint_instance * pii,boolean_t polled)2864 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
2865 {
2866 if (polled)
2867 (void) poll_remove(pii->pii_probe_sock);
2868 (void) close(pii->pii_probe_sock);
2869 pii->pii_probe_sock = -1;
2870 pii->pii_basetime_inited = 0;
2871 }
2872
2873 boolean_t
addrlist_add(addrlist_t ** addrsp,const char * name,uint64_t flags,struct sockaddr_storage * ssp)2874 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
2875 struct sockaddr_storage *ssp)
2876 {
2877 addrlist_t *addrp;
2878
2879 if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
2880 return (_B_FALSE);
2881
2882 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
2883 addrp->al_flags = flags;
2884 addrp->al_addr = *ssp;
2885 addrp->al_next = *addrsp;
2886 *addrsp = addrp;
2887 return (_B_TRUE);
2888 }
2889
2890 void
addrlist_free(addrlist_t ** addrsp)2891 addrlist_free(addrlist_t **addrsp)
2892 {
2893 addrlist_t *addrp, *next_addrp;
2894
2895 for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
2896 next_addrp = addrp->al_next;
2897 free(addrp);
2898 }
2899 *addrsp = NULL;
2900 }
2901
2902 /*
2903 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
2904 * tables defined by mib2.h. Pass the table information returned to the
2905 * supplied function.
2906 */
2907 static int
mibwalk(void (* proc)(mib_item_t *))2908 mibwalk(void (*proc)(mib_item_t *))
2909 {
2910 mib_item_t *head_item = NULL;
2911 mib_item_t *last_item = NULL;
2912 mib_item_t *tmp;
2913 struct strbuf ctlbuf, databuf;
2914 int flags;
2915 int rval;
2916 uintptr_t buf[512 / sizeof (uintptr_t)];
2917 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)buf;
2918 struct T_optmgmt_ack *toa = (struct T_optmgmt_ack *)buf;
2919 struct T_error_ack *tea = (struct T_error_ack *)buf;
2920 struct opthdr *req, *optp;
2921 int status = -1;
2922
2923 if (mibfd == -1) {
2924 if ((mibfd = open("/dev/ip", O_RDWR)) < 0) {
2925 logperror("mibwalk(): ip open");
2926 return (status);
2927 }
2928 }
2929
2930 tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2931 tor->OPT_offset = sizeof (struct T_optmgmt_req);
2932 tor->OPT_length = sizeof (struct opthdr);
2933 tor->MGMT_flags = T_CURRENT;
2934
2935 /*
2936 * Note: we use the special level value below so that IP will return
2937 * us information concerning IRE_MARK_TESTHIDDEN routes.
2938 */
2939 req = (struct opthdr *)&tor[1];
2940 req->level = EXPER_IP_AND_ALL_IRES;
2941 req->name = 0;
2942 req->len = 0;
2943
2944 ctlbuf.buf = (char *)&buf;
2945 ctlbuf.len = tor->OPT_length + tor->OPT_offset;
2946
2947 if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) {
2948 logperror("mibwalk(): putmsg(ctl)");
2949 return (status);
2950 }
2951
2952 /*
2953 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
2954 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains
2955 * a control and data part. The control part contains a struct
2956 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
2957 * the level, name and length of the data in the data part. The
2958 * data part contains the actual table data. The last message
2959 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
2960 * single option with zero optlen.
2961 */
2962 for (;;) {
2963 errno = flags = 0;
2964 ctlbuf.maxlen = sizeof (buf);
2965 rval = getmsg(mibfd, &ctlbuf, NULL, &flags);
2966 if (rval & MORECTL || rval < 0) {
2967 if (errno == EINTR)
2968 continue;
2969 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
2970 rval, errno);
2971 goto error;
2972 }
2973 if (ctlbuf.len < sizeof (t_scalar_t)) {
2974 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len);
2975 goto error;
2976 }
2977
2978 switch (toa->PRIM_type) {
2979 case T_ERROR_ACK:
2980 if (ctlbuf.len < sizeof (struct T_error_ack)) {
2981 logerr("mibwalk(): T_ERROR_ACK ctlbuf "
2982 "too short: %d\n", ctlbuf.len);
2983 goto error;
2984 }
2985 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
2986 " UNIX_err = 0x%lx\n", tea->TLI_error,
2987 t_strerror(tea->TLI_error), tea->UNIX_error);
2988 goto error;
2989
2990 case T_OPTMGMT_ACK:
2991 optp = (struct opthdr *)&toa[1];
2992 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
2993 sizeof (struct opthdr))) {
2994 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
2995 "short: %d\n", ctlbuf.len);
2996 goto error;
2997 }
2998 if (toa->MGMT_flags != T_SUCCESS) {
2999 logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
3000 "0x%lx\n", toa->MGMT_flags);
3001 goto error;
3002 }
3003 break;
3004
3005 default:
3006 goto error;
3007 }
3008 /* The following assert also implies MGMT_flags == T_SUCCESS */
3009 assert(toa->PRIM_type == T_OPTMGMT_ACK);
3010
3011 /*
3012 * We have reached the end of this T_OPTMGMT_ACK
3013 * message. If this is the last message i.e EOD,
3014 * break, else process the next T_OPTMGMT_ACK msg.
3015 */
3016 if (rval == 0) {
3017 if (optp->len == 0 && optp->name == 0 &&
3018 optp->level == 0) {
3019 /* This is the EOD message. */
3020 break;
3021 }
3022 /* Not EOD but no data to retrieve */
3023 continue;
3024 }
3025
3026 /*
3027 * We should only be here if MOREDATA was set.
3028 * Allocate an empty mib_item_t and link into the list
3029 * of MIB items.
3030 */
3031 if ((tmp = malloc(sizeof (*tmp))) == NULL) {
3032 logperror("mibwalk(): malloc() failed.");
3033 goto error;
3034 }
3035 if (last_item != NULL)
3036 last_item->mi_next = tmp;
3037 else
3038 head_item = tmp;
3039 last_item = tmp;
3040 last_item->mi_next = NULL;
3041 last_item->mi_opthdr = *optp;
3042 last_item->mi_valp = malloc(optp->len);
3043 if (last_item->mi_valp == NULL) {
3044 logperror("mibwalk(): malloc() failed.");
3045 goto error;
3046 }
3047
3048 databuf.maxlen = last_item->mi_opthdr.len;
3049 databuf.buf = (char *)last_item->mi_valp;
3050 databuf.len = 0;
3051
3052 /* Retrieve the actual MIB data */
3053 for (;;) {
3054 flags = 0;
3055 if ((rval = getmsg(mibfd, NULL, &databuf,
3056 &flags)) != 0) {
3057 if (rval < 0 && errno == EINTR)
3058 continue;
3059 /*
3060 * We shouldn't get MOREDATA here so treat that
3061 * as an error.
3062 */
3063 logperror("mibwalk(): getmsg(data)");
3064 goto error;
3065 }
3066 break;
3067 }
3068 }
3069 status = 0;
3070 /* Pass the accumulated MIB data to the supplied function pointer */
3071 (*proc)(head_item);
3072 error:
3073 while (head_item != NULL) {
3074 tmp = head_item;
3075 head_item = tmp->mi_next;
3076 free(tmp->mi_valp);
3077 free(tmp);
3078 }
3079 return (status);
3080 }
3081
3082 /*
3083 * Parse the supplied mib2 information to get the size of routing table
3084 * entries. This is needed when running in a branded zone where the
3085 * Solaris application environment and the Solaris kernel may not be the
3086 * the same release version.
3087 */
3088 static void
mib_get_constants(mib_item_t * item)3089 mib_get_constants(mib_item_t *item)
3090 {
3091 mib2_ip_t *ipv4;
3092 mib2_ipv6IfStatsEntry_t *ipv6;
3093
3094 for (; item != NULL; item = item->mi_next) {
3095 if (item->mi_opthdr.name != 0)
3096 continue;
3097 if (item->mi_opthdr.level == MIB2_IP) {
3098 ipv4 = (mib2_ip_t *)item->mi_valp;
3099 ipRouteEntrySize = ipv4->ipRouteEntrySize;
3100 } else if (item->mi_opthdr.level == MIB2_IP6) {
3101 ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp;
3102 ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize;
3103 }
3104 }
3105 }
3106