xref: /illumos-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 45744051679350ee063cdc366b66bee5223a11ea)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
26  */
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
/*
 * Debug flag.  Tested as a bitmask against the D_* categories (for example
 * D_PHYINT and D_TIMER) before emitting logdebug() output.
 */
int debug = 0;				/* Debug flag */
/*
 * Poll table maintained by poll_add()/poll_remove().  pollfd_num counts
 * allocated slots, not active descriptors: a slot whose fd is -1 is free.
 */
static int pollfd_num = 0;		/* Num. of poll descriptors */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */
					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */
/*
 * Structure to store mib2 information returned by the kernel.
 * This is used to process routing table information.
 *
 * Items are chained through mi_next into a singly linked list; the walker
 * callbacks declared in this file (update_router_list(), mib_get_constants())
 * each receive a pointer to such an item.
 */
typedef struct mib_item_s {
	struct mib_item_s	*mi_next;	/* next item in the list */
	struct opthdr		mi_opthdr;	/* option header for item */
	/* NOTE(review): presumably the payload described by mi_opthdr */
	void			*mi_valp;
} mib_item_t;
48 
static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
/*
 * Sockets used for interface ioctls; initifs() issues SIOCGLIFNUM and
 * SIOCGLIFCONF on ifsock_v4 and per-address ioctls on the socket matching
 * the address family.  -1 until opened.
 */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

/* In ms, as returned by getcurrenttime(); compared in run_timeouts() */
static uint_t	last_initifs_time;	/* Time when initifs was last run */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
/*
 * NOTE(review): presumably selects whether link up/down notifications are
 * acted upon -- confirm at the point of use, which is outside this section.
 */
boolean_t handle_link_notifications = _B_TRUE;
static int	ipRouteEntrySize;	/* Size of IPv4 route entry */
static int	ipv6RouteEntrySize;	/* Size of IPv6 route entry */
63 
64 static void	initlog(void);
65 static void	run_timeouts(void);
66 static void	initifs(void);
67 static void	check_if_removed(struct phyint_instance *pii);
68 static void	select_test_ifs(void);
69 static void	update_router_list(mib_item_t *item);
70 static void	mib_get_constants(mib_item_t *item);
71 static int	mibwalk(void (*proc)(mib_item_t *));
72 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
73 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
74 static void	router_add_common(int af, char *ifname,
75     struct in6_addr nexthop);
76 static void	init_router_targets();
77 static void	cleanup(void);
78 static int	setup_listener(int af);
79 static void	check_config(void);
80 static void	check_testconfig(void);
81 static void	check_addr_unique(struct phyint_instance *,
82     struct sockaddr_storage *);
83 static void	init_host_targets(void);
84 static void	dup_host_targets(struct phyint_instance *desired_pii);
85 static void	loopback_cmd(int sock, int family);
86 static boolean_t daemonize(void);
87 static int	closefunc(void *, int);
88 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
89 static unsigned int process_query(int fd, mi_query_t *miq);
90 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
91 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
92 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
93 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
94 static unsigned int send_result(int fd, unsigned int error, int syserror);
95 
96 addrlist_t *localaddrs;
97 
98 /*
99  * Return the current time in milliseconds (from an arbitrary reference)
100  * truncated to fit into an int. Truncation is ok since we are interested
101  * only in differences and not the absolute values.
102  */
103 uint_t
104 getcurrenttime(void)
105 {
106 	uint_t	cur_time;	/* In ms */
107 
108 	/*
109 	 * Use of a non-user-adjustable source of time is
110 	 * required. However millisecond precision is sufficient.
111 	 * divide by 10^6
112 	 */
113 	cur_time = (uint_t)(gethrtime() / 1000000LL);
114 	return (cur_time);
115 }
116 
117 uint64_t
118 getcurrentsec(void)
119 {
120 	return (gethrtime() / NANOSEC);
121 }
122 
123 /*
124  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
125  */
126 int
127 poll_add(int fd)
128 {
129 	int i;
130 	int new_num;
131 	struct pollfd *newfds;
132 retry:
133 	/* Check if already present */
134 	for (i = 0; i < pollfd_num; i++) {
135 		if (pollfds[i].fd == fd)
136 			return (0);
137 	}
138 	/* Check for empty spot already present */
139 	for (i = 0; i < pollfd_num; i++) {
140 		if (pollfds[i].fd == -1) {
141 			pollfds[i].fd = fd;
142 			return (0);
143 		}
144 	}
145 
146 	/* Allocate space for 32 more fds and initialize to -1 */
147 	new_num = pollfd_num + 32;
148 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
149 	if (newfds == NULL) {
150 		logperror("poll_add: realloc");
151 		return (-1);
152 	}
153 	for (i = pollfd_num; i < new_num; i++) {
154 		newfds[i].fd = -1;
155 		newfds[i].events = POLLIN;
156 	}
157 	pollfd_num = new_num;
158 	pollfds = newfds;
159 	goto retry;
160 }
161 
162 /*
163  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
164  */
165 int
166 poll_remove(int fd)
167 {
168 	int i;
169 
170 	/* Check if already present */
171 	for (i = 0; i < pollfd_num; i++) {
172 		if (pollfds[i].fd == fd) {
173 			pollfds[i].fd = -1;
174 			return (0);
175 		}
176 	}
177 	return (-1);
178 }
179 
180 /*
181  * Extract information about the phyint instance. If the phyint instance still
182  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
183  * will use it to detect phyint instances that don't exist any longer and
184  * remove them, from our database of phyint instances.
185  * Return value:
186  *	returns true if the phyint instance exists in the kernel,
187  *	returns false otherwise
188  */
189 static boolean_t
190 pii_process(int af, char *name, struct phyint_instance **pii_p)
191 {
192 	int err;
193 	struct phyint_instance *pii;
194 	struct phyint_instance *pii_other;
195 
196 	if (debug & D_PHYINT)
197 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
198 
199 	pii = phyint_inst_lookup(af, name);
200 	if (pii == NULL) {
201 		/*
202 		 * Phyint instance does not exist in our tables,
203 		 * create new phyint instance
204 		 */
205 		pii = phyint_inst_init_from_k(af, name);
206 	} else {
207 		/* Phyint exists in our tables */
208 		err = phyint_inst_update_from_k(pii);
209 
210 		switch (err) {
211 		case PI_IOCTL_ERROR:
212 			/* Some ioctl error. don't change anything */
213 			pii->pii_in_use = 1;
214 			break;
215 
216 		case PI_GROUP_CHANGED:
217 		case PI_IFINDEX_CHANGED:
218 			/*
219 			 * Interface index or group membership has changed.
220 			 * Delete the old state and recreate based on the new
221 			 * state (it may no longer be in a group).
222 			 */
223 			pii_other = phyint_inst_other(pii);
224 			if (pii_other != NULL)
225 				phyint_inst_delete(pii_other);
226 			phyint_inst_delete(pii);
227 			pii = phyint_inst_init_from_k(af, name);
228 			break;
229 
230 		case PI_DELETED:
231 			/* Phyint instance has disappeared from kernel */
232 			pii->pii_in_use = 0;
233 			break;
234 
235 		case PI_OK:
236 			/* Phyint instance exists and is fine */
237 			pii->pii_in_use = 1;
238 			break;
239 
240 		default:
241 			/* Unknown status */
242 			logerr("pii_process: Unknown status %d\n", err);
243 			break;
244 		}
245 	}
246 
247 	*pii_p = pii;
248 	if (pii != NULL)
249 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
250 	else
251 		return (_B_FALSE);
252 }
253 
254 /*
255  * Scan all interfaces to detect changes as well as new and deleted interfaces
256  */
257 static void
258 initifs()
259 {
260 	int	i, nlifr;
261 	int	af;
262 	char	*cp;
263 	char	*buf;
264 	int	sockfd;
265 	uint64_t	flags;
266 	struct lifnum	lifn;
267 	struct lifconf	lifc;
268 	struct lifreq	lifreq;
269 	struct lifreq	*lifr;
270 	struct logint	*li;
271 	struct phyint_instance *pii;
272 	struct phyint_instance *next_pii;
273 	struct phyint_group *pg, *next_pg;
274 	char		pi_name[LIFNAMSIZ + 1];
275 
276 	if (debug & D_PHYINT)
277 		logdebug("initifs: Scanning interfaces\n");
278 
279 	last_initifs_time = getcurrenttime();
280 
281 	/*
282 	 * Free the existing local address list; we'll build a new list below.
283 	 */
284 	addrlist_free(&localaddrs);
285 
286 	/*
287 	 * Mark the interfaces so that we can find phyints and logints
288 	 * which have disappeared from the kernel. pii_process() and
289 	 * logint_init_from_k() will set {pii,li}_in_use when they find
290 	 * the interface in the kernel. Also, clear dupaddr bit on probe
291 	 * logint. check_addr_unique() will set the dupaddr bit on the
292 	 * probe logint, if the testaddress is not unique.
293 	 */
294 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
295 		pii->pii_in_use = 0;
296 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
297 			li->li_in_use = 0;
298 			if (pii->pii_probe_logint == li)
299 				li->li_dupaddr = 0;
300 		}
301 	}
302 
303 	/*
304 	 * As above, mark groups so that we can detect IPMP interfaces which
305 	 * have been removed from the kernel.  Also, delete the group address
306 	 * list since we'll iteratively recreate it below.
307 	 */
308 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
309 		pg->pg_in_use = _B_FALSE;
310 		addrlist_free(&pg->pg_addrs);
311 	}
312 
313 	lifn.lifn_family = AF_UNSPEC;
314 	lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
315 again:
316 	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
317 		logperror("initifs: ioctl (get interface count)");
318 		return;
319 	}
320 	/*
321 	 * Pad the interface count to detect when additional interfaces have
322 	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
323 	 */
324 	lifn.lifn_count += 4;
325 
326 	if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
327 		logperror("initifs: calloc");
328 		return;
329 	}
330 
331 	lifc.lifc_family = AF_UNSPEC;
332 	lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
333 	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
334 	lifc.lifc_buf = buf;
335 
336 	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
337 		logperror("initifs: ioctl (get interface configuration)");
338 		free(buf);
339 		return;
340 	}
341 
342 	/*
343 	 * If every lifr_req slot is taken, then additional interfaces must
344 	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
345 	 * Recalculate to make sure we didn't miss any interfaces.
346 	 */
347 	nlifr = lifc.lifc_len / sizeof (struct lifreq);
348 	if (nlifr >= lifn.lifn_count) {
349 		free(buf);
350 		goto again;
351 	}
352 
353 	/*
354 	 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the
355 	 * global list of addresses, phyint groups, phyints, and logints.
356 	 */
357 	for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
358 		af = lifr->lifr_addr.ss_family;
359 		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
360 		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
361 
362 		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
363 			if (errno != ENXIO)
364 				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
365 			continue;
366 		}
367 		flags = lifreq.lifr_flags;
368 
369 		/*
370 		 * If the address is IFF_UP, add it to the local address list.
371 		 * (We ignore addresses that aren't IFF_UP since another node
372 		 * might legitimately have that address IFF_UP.)
373 		 */
374 		if (flags & IFF_UP) {
375 			(void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
376 			    &lifr->lifr_addr);
377 		}
378 
379 		/*
380 		 * If this address is on an IPMP meta-interface, update our
381 		 * phyint_group information (either by recording that group
382 		 * still exists or creating a new group), and track what
383 		 * group the address is part of.
384 		 */
385 		if (flags & IFF_IPMP) {
386 			if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
387 				if (errno != ENXIO)
388 					logperror("initifs: ioctl "
389 					    "(SIOCGLIFGROUPNAME)");
390 				continue;
391 			}
392 
393 			pg = phyint_group_lookup(lifreq.lifr_groupname);
394 			if (pg == NULL) {
395 				pg = phyint_group_create(lifreq.lifr_groupname);
396 				if (pg == NULL) {
397 					logerr("initifs: cannot create group "
398 					    "%s\n", lifreq.lifr_groupname);
399 					continue;
400 				}
401 				phyint_group_insert(pg);
402 			}
403 			pg->pg_in_use = _B_TRUE;
404 
405 			/*
406 			 * Add this to the group's list of data addresses.
407 			 */
408 			if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
409 			    &lifr->lifr_addr)) {
410 				logerr("initifs: insufficient memory to track "
411 				    "data address information for %s\n",
412 				    lifr->lifr_name);
413 			}
414 			continue;
415 		}
416 
417 		/*
418 		 * This isn't an address on an IPMP meta-interface, so it's
419 		 * either on an underlying interface or not related to any
420 		 * group.  Update our phyint and logint information (via
421 		 * pii_process() and logint_init_from_k()) -- but first,
422 		 * convert the logint name to a phyint name so we can call
423 		 * pii_process().
424 		 */
425 		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
426 		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
427 			*cp = '\0';
428 
429 		if (pii_process(af, pi_name, &pii)) {
430 			/* The phyint is fine. So process the logint */
431 			logint_init_from_k(pii, lifr->lifr_name);
432 			check_addr_unique(pii, &lifr->lifr_addr);
433 		}
434 	}
435 	free(buf);
436 
437 	/*
438 	 * Scan for groups, phyints and logints that have disappeared from the
439 	 * kernel, and delete them.
440 	 */
441 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
442 		next_pii = pii->pii_next;
443 		check_if_removed(pii);
444 	}
445 
446 	for (pg = phyint_groups; pg != NULL; pg = next_pg) {
447 		next_pg = pg->pg_next;
448 		if (!pg->pg_in_use) {
449 			phyint_group_delete(pg);
450 			continue;
451 		}
452 		/*
453 		 * Refresh the group's state.  This is necessary since the
454 		 * group's state is defined by the set of usable interfaces in
455 		 * the group, and an interface is considered unusable if all
456 		 * of its addresses are down.  When an address goes down/up,
457 		 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
458 		 */
459 		phyint_group_refresh_state(pg);
460 	}
461 
462 	/*
463 	 * Select a test address for sending probes on each phyint instance
464 	 */
465 	select_test_ifs();
466 
467 	/*
468 	 * Handle link up/down notifications.
469 	 */
470 	process_link_state_changes();
471 }
472 
473 /*
474  * Check that a given test address is unique across all of the interfaces in a
475  * group.  (e.g., IPv6 link-locals may not be inherently unique, and binding
476  * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
477  * Any issues will be reported by check_testconfig().
478  */
479 static void
480 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
481 {
482 	struct phyint		*pi;
483 	struct phyint_group	*pg;
484 	struct in6_addr		addr;
485 	struct phyint_instance	*pii;
486 	struct sockaddr_in	*sin;
487 
488 	if (ss->ss_family == AF_INET) {
489 		sin = (struct sockaddr_in *)ss;
490 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
491 	} else {
492 		assert(ss->ss_family == AF_INET6);
493 		addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
494 	}
495 
496 	/*
497 	 * For anonymous groups, every interface is assumed to be on its own
498 	 * link, so there is no chance of overlapping addresses.
499 	 */
500 	pg = ourpii->pii_phyint->pi_group;
501 	if (pg == phyint_anongroup)
502 		return;
503 
504 	/*
505 	 * Walk the list of phyint instances in the group and check for test
506 	 * addresses matching ours.  Of course, we skip ourself.
507 	 */
508 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
509 		pii = PHYINT_INSTANCE(pi, ss->ss_family);
510 		if (pii == NULL || pii == ourpii ||
511 		    pii->pii_probe_logint == NULL)
512 			continue;
513 
514 		/*
515 		 * If this test address is not unique, set the dupaddr bit.
516 		 */
517 		if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
518 			pii->pii_probe_logint->li_dupaddr = 1;
519 	}
520 }
521 
522 /*
523  * Stop probing an interface.  Called when an interface is offlined.
524  * The probe socket is closed on each interface instance, and the
525  * interface state set to PI_OFFLINE.
526  */
527 void
528 stop_probing(struct phyint *pi)
529 {
530 	struct phyint_instance *pii;
531 
532 	pii = pi->pi_v4;
533 	if (pii != NULL) {
534 		if (pii->pii_probe_sock != -1)
535 			close_probe_socket(pii, _B_TRUE);
536 		pii->pii_probe_logint = NULL;
537 	}
538 
539 	pii = pi->pi_v6;
540 	if (pii != NULL) {
541 		if (pii->pii_probe_sock != -1)
542 			close_probe_socket(pii, _B_TRUE);
543 		pii->pii_probe_logint = NULL;
544 	}
545 
546 	phyint_chstate(pi, PI_OFFLINE);
547 }
548 
549 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
550 
551 /*
552  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
553  * IFF_UP must also be set so that the associated address can be used as a
554  * source address.  Further, we must be able to exchange packets with local
555  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
556  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
557  */
558 static int
559 rate_testflags(uint64_t flags)
560 {
561 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
562 		return (BAD_TESTFLAGS);
563 
564 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
565 		return (BAD_TESTFLAGS);
566 
567 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
568 		return (BEST_TESTFLAGS);
569 
570 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
571 		return (BEST_TESTFLAGS);
572 
573 	return (OK_TESTFLAGS);
574 }
575 
576 /*
577  * Attempt to select a test address for each phyint instance.
578  * Call phyint_inst_sockinit() to complete the initializations.
579  */
580 static void
581 select_test_ifs(void)
582 {
583 	struct phyint		*pi;
584 	struct phyint_instance	*pii;
585 	struct phyint_instance	*next_pii;
586 	struct logint		*li;
587 	struct logint		*probe_logint;
588 	boolean_t		target_scan_reqd = _B_FALSE;
589 	int			rating;
590 
591 	if (debug & D_PHYINT)
592 		logdebug("select_test_ifs\n");
593 
594 	/*
595 	 * For each phyint instance, do the test address selection
596 	 */
597 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
598 		next_pii = pii->pii_next;
599 		probe_logint = NULL;
600 
601 		/*
602 		 * An interface that is offline should not be probed.
603 		 * IFF_OFFLINE interfaces should always be PI_OFFLINE
604 		 * unless some other entity has set the offline flag.
605 		 */
606 		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
607 			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
608 				logerr("shouldn't be probing offline"
609 				    " interface %s (state is: %u)."
610 				    " Stopping probes.\n",
611 				    pii->pii_phyint->pi_name,
612 				    pii->pii_phyint->pi_state);
613 				stop_probing(pii->pii_phyint);
614 			}
615 			continue;
616 		} else {
617 			/*
618 			 * If something cleared IFF_OFFLINE (e.g., by accident
619 			 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
620 			 * inherently racy), the phyint may still be offline.
621 			 * Just ignore it.
622 			 */
623 			if (pii->pii_phyint->pi_state == PI_OFFLINE)
624 				continue;
625 		}
626 
627 		li = pii->pii_probe_logint;
628 		if (li != NULL) {
629 			/*
630 			 * We've already got a test address; only proceed
631 			 * if it's suboptimal.
632 			 */
633 			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
634 				continue;
635 		}
636 
637 		/*
638 		 * Walk the logints of this phyint instance, and select
639 		 * the best available test address
640 		 */
641 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
642 			/*
643 			 * Skip 0.0.0.0 addresses, as those are never
644 			 * actually usable.
645 			 */
646 			if (pii->pii_af == AF_INET &&
647 			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
648 				continue;
649 
650 			/*
651 			 * Skip any IPv6 logints that are not link-local,
652 			 * since we should always have a link-local address
653 			 * anyway and in6_data() expects link-local replies.
654 			 */
655 			if (pii->pii_af == AF_INET6 &&
656 			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
657 				continue;
658 
659 			/*
660 			 * Rate the testflags. If we've found an optimal
661 			 * match, then break out; otherwise, record the most
662 			 * recent OK one.
663 			 */
664 			rating = rate_testflags(li->li_flags);
665 			if (rating == BAD_TESTFLAGS)
666 				continue;
667 
668 			probe_logint = li;
669 			if (rating == BEST_TESTFLAGS)
670 				break;
671 		}
672 
673 		/*
674 		 * If the probe logint has changed, ditch the old one.
675 		 */
676 		if (pii->pii_probe_logint != NULL &&
677 		    pii->pii_probe_logint != probe_logint) {
678 			if (pii->pii_probe_sock != -1)
679 				close_probe_socket(pii, _B_TRUE);
680 			pii->pii_probe_logint = NULL;
681 		}
682 
683 		if (probe_logint == NULL) {
684 			/*
685 			 * We don't have a test address; zero out the probe
686 			 * stats array since it is no longer relevant.
687 			 * Optimize by checking if it is already zeroed out.
688 			 */
689 			int pr_ndx;
690 
691 			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
692 			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
693 				clear_pii_probe_stats(pii);
694 				reset_crtt_all(pii->pii_phyint);
695 			}
696 			continue;
697 		} else if (probe_logint == pii->pii_probe_logint) {
698 			/*
699 			 * If we didn't find any new test addr, go to the
700 			 * next phyint.
701 			 */
702 			continue;
703 		}
704 
705 		/*
706 		 * The phyint is either being assigned a new testaddr
707 		 * or is being assigned a testaddr for the 1st time.
708 		 * Need to initialize the phyint socket
709 		 */
710 		pii->pii_probe_logint = probe_logint;
711 		if (!phyint_inst_sockinit(pii)) {
712 			if (debug & D_PHYINT) {
713 				logdebug("select_test_ifs: "
714 				    "phyint_sockinit failed\n");
715 			}
716 			phyint_inst_delete(pii);
717 			continue;
718 		}
719 
720 		/*
721 		 * This phyint instance is now enabled for probes; this
722 		 * impacts our state machine in two ways:
723 		 *
724 		 * 1. If we're probe *capable* as well (i.e., we have
725 		 *    probe targets) and the interface is in PI_NOTARGETS,
726 		 *    then transition to PI_RUNNING.
727 		 *
728 		 * 2. If we're not probe capable, and the other phyint
729 		 *    instance is also not probe capable, and we were in
730 		 *    PI_RUNNING, then transition to PI_NOTARGETS.
731 		 *
732 		 * Also see the state diagram in mpd_probe.c.
733 		 */
734 		if (PROBE_CAPABLE(pii)) {
735 			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
736 				phyint_chstate(pii->pii_phyint, PI_RUNNING);
737 		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
738 			if (pii->pii_phyint->pi_state == PI_RUNNING)
739 				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
740 		}
741 
742 		/*
743 		 * If no targets are currently known for this phyint
744 		 * we need to call init_router_targets. Since
745 		 * init_router_targets() initializes the list of targets
746 		 * for all phyints it is done below the loop.
747 		 */
748 		if (pii->pii_targets == NULL)
749 			target_scan_reqd = _B_TRUE;
750 
751 		/*
752 		 * Start the probe timer for this instance.
753 		 */
754 		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
755 			start_timer(pii);
756 			pii->pii_basetime_inited = 1;
757 		}
758 	}
759 
760 	/*
761 	 * Scan the interface list for any interfaces that are PI_FAILED or
762 	 * PI_NOTARGETS but no longer enabled to send probes, and call
763 	 * phyint_check_for_repair() to see if the link state indicates that
764 	 * the interface should be repaired.  Also see the state diagram in
765 	 * mpd_probe.c.
766 	 */
767 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
768 		if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
769 		    (pi->pi_state == PI_FAILED ||
770 		    pi->pi_state == PI_NOTARGETS)) {
771 			phyint_check_for_repair(pi);
772 		}
773 	}
774 
775 	check_testconfig();
776 
777 	/*
778 	 * Try to populate the target list. init_router_targets populates
779 	 * the target list from the routing table. If our target list is
780 	 * still empty, init_host_targets adds host targets based on the
781 	 * host target list of other phyints in the group.
782 	 */
783 	if (target_scan_reqd) {
784 		init_router_targets();
785 		init_host_targets();
786 	}
787 }
788 
789 /*
790  * Check test address configuration, and log notices/errors if appropriate.
791  * Note that this function only logs pre-existing conditions (e.g., that
792  * probe-based failure detection is disabled).
793  */
794 static void
795 check_testconfig(void)
796 {
797 	struct phyint	*pi;
798 	struct logint	*li;
799 	char		abuf[INET6_ADDRSTRLEN];
800 	int		pri;
801 
802 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
803 		if (pi->pi_flags & IFF_OFFLINE)
804 			continue;
805 
806 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
807 			if (pi->pi_taddrmsg_printed ||
808 			    pi->pi_duptaddrmsg_printed) {
809 				if (pi->pi_duptaddrmsg_printed)
810 					pri = LOG_ERR;
811 				else
812 					pri = LOG_INFO;
813 				logmsg(pri, "Test address now configured on "
814 				    "interface %s; enabling probe-based "
815 				    "failure detection on it\n", pi->pi_name);
816 				pi->pi_taddrmsg_printed = 0;
817 				pi->pi_duptaddrmsg_printed = 0;
818 			}
819 			continue;
820 		}
821 
822 		li = NULL;
823 		if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
824 		    pi->pi_v4->pii_probe_logint->li_dupaddr)
825 			li = pi->pi_v4->pii_probe_logint;
826 
827 		if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
828 		    pi->pi_v6->pii_probe_logint->li_dupaddr)
829 			li = pi->pi_v6->pii_probe_logint;
830 
831 		if (li != NULL && li->li_dupaddr) {
832 			if (pi->pi_duptaddrmsg_printed)
833 				continue;
834 			logerr("Test address %s is not unique in group; "
835 			    "disabling probe-based failure detection on %s\n",
836 			    pr_addr(li->li_phyint_inst->pii_af,
837 			    li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
838 			pi->pi_duptaddrmsg_printed = 1;
839 			continue;
840 		}
841 
842 		if (getcurrentsec() < pi->pi_taddrthresh)
843 			continue;
844 
845 		if (!pi->pi_taddrmsg_printed) {
846 			logtrace("No test address configured on interface %s; "
847 			    "disabling probe-based failure detection on it\n",
848 			    pi->pi_name);
849 			pi->pi_taddrmsg_printed = 1;
850 		}
851 	}
852 }
853 
854 /*
855  * Check phyint group configuration, to detect any inconsistencies,
856  * and log an error message. This is called from runtimeouts every
857  * 20 secs. But the error message is displayed once. If the
858  * consistency is resolved by the admin, a recovery message is displayed
859  * once.
860  */
861 static void
862 check_config(void)
863 {
864 	struct phyint_group *pg;
865 	struct phyint *pi;
866 	boolean_t v4_in_group;
867 	boolean_t v6_in_group;
868 
869 	/*
870 	 * All phyints of a group must be homogeneous to ensure that they can
871 	 * take over for one another.  If any phyint in a group has IPv4
872 	 * plumbed, check that all phyints have IPv4 plumbed.  Do a similar
873 	 * check for IPv6.
874 	 */
875 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
876 		if (pg == phyint_anongroup)
877 			continue;
878 
879 		v4_in_group = _B_FALSE;
880 		v6_in_group = _B_FALSE;
881 		/*
882 		 * 1st pass. Determine if at least 1 phyint in the group
883 		 * has IPv4 plumbed and if so set v4_in_group to true.
884 		 * Repeat similarly for IPv6.
885 		 */
886 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
887 			if (pi->pi_v4 != NULL)
888 				v4_in_group = _B_TRUE;
889 			if (pi->pi_v6 != NULL)
890 				v6_in_group = _B_TRUE;
891 		}
892 
893 		/*
894 		 * 2nd pass. If v4_in_group is true, check that phyint
895 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
896 		 * out a message the 1st time only.
897 		 */
898 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
899 			if (pi->pi_flags & IFF_OFFLINE)
900 				continue;
901 
902 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
903 				if (!pi->pi_cfgmsg_printed) {
904 					logerr("IP interface %s in group %s is"
905 					    " not plumbed for IPv4, affecting"
906 					    " IPv4 connectivity\n",
907 					    pi->pi_name,
908 					    pi->pi_group->pg_name);
909 					pi->pi_cfgmsg_printed = 1;
910 				}
911 			} else if (v6_in_group == _B_TRUE &&
912 			    pi->pi_v6 == NULL) {
913 				if (!pi->pi_cfgmsg_printed) {
914 					logerr("IP interface %s in group %s is"
915 					    " not plumbed for IPv6, affecting"
916 					    " IPv6 connectivity\n",
917 					    pi->pi_name,
918 					    pi->pi_group->pg_name);
919 					pi->pi_cfgmsg_printed = 1;
920 				}
921 			} else {
922 				/*
923 				 * The phyint matches the group configuration,
924 				 * if we have reached this point. If it was
925 				 * improperly configured earlier, log an
926 				 * error recovery message
927 				 */
928 				if (pi->pi_cfgmsg_printed) {
929 					logerr("IP interface %s is now"
930 					    " consistent with group %s "
931 					    " and connectivity is restored\n",
932 					    pi->pi_name, pi->pi_group->pg_name);
933 					pi->pi_cfgmsg_printed = 0;
934 				}
935 			}
936 
937 		}
938 	}
939 }
940 
941 /*
942  * Timer mechanism using relative time (in milliseconds) from the
943  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
944  * will fire after TIMER_INFINITY milliseconds.
945  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
946  * time values. Hence 2 consecutive timer events cannot be spaced farther
947  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
948  * that can be passed for the delay parameter of timer_schedule()
949  */
950 static uint_t timer_next;	/* Currently scheduled timeout */
951 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
952 
953 static void
954 timer_init(void)
955 {
956 	timer_next = getcurrenttime() + TIMER_INFINITY;
957 	/*
958 	 * The call to run_timeouts() will get the timer started
959 	 * Since there are no phyints at this point, the timer will
960 	 * be set for IF_SCAN_INTERVAL ms.
961 	 */
962 	run_timeouts();
963 }
964 
965 /*
966  * Make sure the next SIGALRM occurs delay milliseconds from the current
967  * time if not earlier. We are interested only in time differences.
968  */
969 void
970 timer_schedule(uint_t delay)
971 {
972 	uint_t now;
973 	struct itimerval itimerval;
974 
975 	if (debug & D_TIMER)
976 		logdebug("timer_schedule(%u)\n", delay);
977 
978 	assert(delay <= TIMER_INFINITY);
979 
980 	now = getcurrenttime();
981 	if (delay == 0) {
982 		/* Minimum allowed delay */
983 		delay = 1;
984 	}
985 	/* Will this timer occur before the currently scheduled SIGALRM? */
986 	if (timer_active && TIME_GE(now + delay, timer_next)) {
987 		if (debug & D_TIMER) {
988 			logdebug("timer_schedule(%u) - no action: "
989 			    "now %u next %u\n", delay, now, timer_next);
990 		}
991 		return;
992 	}
993 	timer_next = now + delay;
994 
995 	itimerval.it_value.tv_sec = delay / 1000;
996 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
997 	itimerval.it_interval.tv_sec = 0;
998 	itimerval.it_interval.tv_usec = 0;
999 	if (debug & D_TIMER) {
1000 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1001 		    delay, itimerval.it_value.tv_sec,
1002 		    itimerval.it_value.tv_usec);
1003 	}
1004 	timer_active = _B_TRUE;
1005 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1006 		logperror("timer_schedule: setitimer");
1007 		exit(2);
1008 	}
1009 }
1010 
1011 static void
1012 timer_cancel(void)
1013 {
1014 	struct itimerval itimerval;
1015 
1016 	if (debug & D_TIMER)
1017 		logdebug("timer_cancel()\n");
1018 
1019 	bzero(&itimerval, sizeof (itimerval));
1020 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0)
1021 		logperror("timer_cancel: setitimer");
1022 }
1023 
1024 /*
1025  * Timer has fired. Determine when the next timer event will occur by asking
1026  * all the timer routines. Should not be called from a timer routine.
1027  */
1028 static void
1029 run_timeouts(void)
1030 {
1031 	uint_t next;
1032 	uint_t next_event_time;
1033 	struct phyint_instance *pii;
1034 	struct phyint_instance *next_pii;
1035 	static boolean_t timeout_running;
1036 
1037 	/* assert that recursive timeouts don't happen. */
1038 	assert(!timeout_running);
1039 
1040 	timeout_running = _B_TRUE;
1041 
1042 	if (debug & D_TIMER)
1043 		logdebug("run_timeouts()\n");
1044 
1045 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1046 		initifs();
1047 		check_config();
1048 	}
1049 
1050 	next = TIMER_INFINITY;
1051 
1052 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1053 		next_pii = pii->pii_next;
1054 		next_event_time = phyint_inst_timer(pii);
1055 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1056 			next = next_event_time;
1057 
1058 		if (debug & D_TIMER) {
1059 			logdebug("run_timeouts(%s %s): next scheduled for"
1060 			    " this phyint inst %u, next scheduled global"
1061 			    " %u ms\n",
1062 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1063 			    next_event_time, next);
1064 		}
1065 	}
1066 
1067 	/*
1068 	 * Make sure initifs() is called at least once every
1069 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1070 	 * with the kernel, in case we have missed any routing
1071 	 * socket messages.
1072 	 */
1073 	if (next > IF_SCAN_INTERVAL)
1074 		next = IF_SCAN_INTERVAL;
1075 
1076 	if (debug & D_TIMER)
1077 		logdebug("run_timeouts: %u ms\n", next);
1078 
1079 	timer_schedule(next);
1080 	timeout_running = _B_FALSE;
1081 }
1082 
/*
 * Descriptors of the signal event pipe.  sig_handler() writes the signal
 * number to eventpipe_write and in_signal() reads it back from the read
 * end.  Both remain -1 until setup_eventpipe() has created the pipe.
 */
static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
static int eventpipe_write = -1;
boolean_t cleanup_started = _B_FALSE;	/* true if we're going away */
1086 
1087 /*
1088  * Ensure that signals are processed synchronously with the rest of
1089  * the code by just writing a one character signal number on the pipe.
1090  * The poll loop will pick this up and process the signal event.
1091  */
1092 static void
1093 sig_handler(int signo)
1094 {
1095 	uchar_t buf = (uchar_t)signo;
1096 
1097 	/*
1098 	 * Don't write to pipe if cleanup has already begun. cleanup()
1099 	 * might have closed the pipe already
1100 	 */
1101 	if (cleanup_started)
1102 		return;
1103 
1104 	if (eventpipe_write == -1) {
1105 		logerr("sig_handler: no pipe found\n");
1106 		return;
1107 	}
1108 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1109 		logperror("sig_handler: write");
1110 }
1111 
1112 extern struct probes_missed probes_missed;
1113 
1114 /*
1115  * Pick up a signal "byte" from the pipe and process it.
1116  */
1117 static void
1118 in_signal(int fd)
1119 {
1120 	uchar_t buf;
1121 	uint64_t  sent, acked, lost, unacked, unknown;
1122 	struct phyint_instance *pii;
1123 	int pr_ndx;
1124 
1125 	switch (read(fd, &buf, sizeof (buf))) {
1126 	case -1:
1127 		logperror("in_signal: read");
1128 		exit(1);
1129 		/* NOTREACHED */
1130 	case 1:
1131 		break;
1132 	case 0:
1133 		logerr("in_signal: read end of file\n");
1134 		exit(1);
1135 		/* NOTREACHED */
1136 	default:
1137 		logerr("in_signal: read > 1\n");
1138 		exit(1);
1139 	}
1140 
1141 	if (debug & D_TIMER)
1142 		logdebug("in_signal() got %d\n", buf);
1143 
1144 	switch (buf) {
1145 	case SIGALRM:
1146 		if (debug & D_TIMER) {
1147 			uint_t now = getcurrenttime();
1148 
1149 			logdebug("in_signal(SIGALRM) delta %u\n",
1150 			    now - timer_next);
1151 		}
1152 		timer_active = _B_FALSE;
1153 		run_timeouts();
1154 		break;
1155 	case SIGUSR1:
1156 		logdebug("Printing configuration:\n");
1157 		/* Print out the internal tables */
1158 		phyint_inst_print_all();
1159 
1160 		/*
1161 		 * Print out the accumulated statistics about missed
1162 		 * probes (happens due to scheduling delay).
1163 		 */
1164 		logerr("Missed sending total of %d probes spread over"
1165 		    " %d occurrences\n", probes_missed.pm_nprobes,
1166 		    probes_missed.pm_ntimes);
1167 
1168 		/*
1169 		 * Print out the accumulated statistics about probes
1170 		 * that were sent.
1171 		 */
1172 		for (pii = phyint_instances; pii != NULL;
1173 		    pii = pii->pii_next) {
1174 			unacked = 0;
1175 			acked = pii->pii_cum_stats.acked;
1176 			lost = pii->pii_cum_stats.lost;
1177 			sent = pii->pii_cum_stats.sent;
1178 			unknown = pii->pii_cum_stats.unknown;
1179 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1180 				switch (pii->pii_probes[pr_ndx].pr_status) {
1181 				case PR_ACKED:
1182 					acked++;
1183 					break;
1184 				case PR_LOST:
1185 					lost++;
1186 					break;
1187 				case PR_UNACKED:
1188 					unacked++;
1189 					break;
1190 				}
1191 			}
1192 			logerr("\nProbe stats on (%s %s)\n"
1193 			    "Number of probes sent %lld\n"
1194 			    "Number of probe acks received %lld\n"
1195 			    "Number of probes/acks lost %lld\n"
1196 			    "Number of valid unacknowledged probes %lld\n"
1197 			    "Number of ambiguous probe acks received %lld\n",
1198 			    AF_STR(pii->pii_af), pii->pii_name,
1199 			    sent, acked, lost, unacked, unknown);
1200 		}
1201 		break;
1202 	case SIGHUP:
1203 		logerr("SIGHUP: restart and reread config file\n");
1204 		/*
1205 		 * Cancel the interval timer.  Needed since setitimer() uses
1206 		 * alarm() and the time left is inherited across exec(), and
1207 		 * thus the SIGALRM may be delivered before a handler has been
1208 		 * setup, causing in.mpathd to erroneously exit.
1209 		 */
1210 		timer_cancel();
1211 		cleanup();
1212 		(void) execv(argv0[0], argv0);
1213 		_exit(0177);
1214 		/* NOTREACHED */
1215 	case SIGINT:
1216 	case SIGTERM:
1217 	case SIGQUIT:
1218 		cleanup();
1219 		exit(0);
1220 		/* NOTREACHED */
1221 	default:
1222 		logerr("in_signal: unknown signal: %d\n", buf);
1223 	}
1224 }
1225 
1226 static void
1227 cleanup(void)
1228 {
1229 	struct phyint_instance *pii;
1230 	struct phyint_instance *next_pii;
1231 
1232 	/*
1233 	 * Make sure that we don't write to eventpipe in
1234 	 * sig_handler() if any signal notably SIGALRM,
1235 	 * occurs after we close the eventpipe descriptor below
1236 	 */
1237 	cleanup_started = _B_TRUE;
1238 
1239 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1240 		next_pii = pii->pii_next;
1241 		phyint_inst_delete(pii);
1242 	}
1243 
1244 	(void) close(ifsock_v4);
1245 	(void) close(ifsock_v6);
1246 	(void) close(rtsock_v4);
1247 	(void) close(rtsock_v6);
1248 	(void) close(lsock_v4);
1249 	(void) close(lsock_v6);
1250 	(void) close(0);
1251 	(void) close(1);
1252 	(void) close(2);
1253 	(void) close(mibfd);
1254 	(void) close(eventpipe_read);
1255 	(void) close(eventpipe_write);
1256 }
1257 
1258 /*
1259  * Create pipe for signal delivery and set up signal handlers.
1260  */
1261 static void
1262 setup_eventpipe(void)
1263 {
1264 	int fds[2];
1265 	struct sigaction act;
1266 
1267 	if ((pipe(fds)) < 0) {
1268 		logperror("setup_eventpipe: pipe");
1269 		exit(1);
1270 	}
1271 	eventpipe_read = fds[0];
1272 	eventpipe_write = fds[1];
1273 	if (poll_add(eventpipe_read) == -1) {
1274 		exit(1);
1275 	}
1276 
1277 	act.sa_handler = sig_handler;
1278 	act.sa_flags = SA_RESTART;
1279 	(void) sigaction(SIGALRM, &act, NULL);
1280 
1281 	(void) sigset(SIGHUP, sig_handler);
1282 	(void) sigset(SIGUSR1, sig_handler);
1283 	(void) sigset(SIGTERM, sig_handler);
1284 	(void) sigset(SIGINT, sig_handler);
1285 	(void) sigset(SIGQUIT, sig_handler);
1286 }
1287 
1288 /*
1289  * Create a routing socket for receiving RTM_IFINFO messages.
1290  */
1291 static int
1292 setup_rtsock(int af)
1293 {
1294 	int	s;
1295 	int	flags;
1296 	int	aware = RTAW_UNDER_IPMP;
1297 
1298 	s = socket(PF_ROUTE, SOCK_RAW, af);
1299 	if (s == -1) {
1300 		logperror("setup_rtsock: socket PF_ROUTE");
1301 		exit(1);
1302 	}
1303 
1304 	if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
1305 		logperror("setup_rtsock: setsockopt RT_AWARE");
1306 		(void) close(s);
1307 		exit(1);
1308 	}
1309 
1310 	if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
1311 		logperror("setup_rtsock: fcntl F_GETFL");
1312 		(void) close(s);
1313 		exit(1);
1314 	}
1315 	if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) {
1316 		logperror("setup_rtsock: fcntl F_SETFL");
1317 		(void) close(s);
1318 		exit(1);
1319 	}
1320 	if (poll_add(s) == -1) {
1321 		(void) close(s);
1322 		exit(1);
1323 	}
1324 	return (s);
1325 }
1326 
1327 /*
1328  * Process an RTM_IFINFO message received on a routing socket.
1329  * The return value indicates whether a full interface scan is required.
1330  * Link up/down notifications are reflected in the IFF_RUNNING flag.
1331  * If just the state of the IFF_RUNNING interface flag has changed, a
1332  * a full interface scan isn't required.
1333  */
1334 static boolean_t
1335 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1336 {
1337 	struct sockaddr_dl *sdl;
1338 	struct phyint *pi;
1339 	uint64_t old_flags;
1340 	struct phyint_instance *pii;
1341 
1342 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1343 
1344 	/*
1345 	 * Although the sockaddr_dl structure is directly after the
1346 	 * if_msghdr_t structure. At the time of writing, the size of the
1347 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1348 	 * to the presence of a timeval structure, which contains longs,
1349 	 * in the if_data structure.  Anyway, we know where the message ends,
1350 	 * so we work backwards to get the start of the sockaddr_dl structure.
1351 	 */
1352 	/*LINTED*/
1353 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1354 	    sizeof (struct sockaddr_dl));
1355 
1356 	assert(sdl->sdl_family == AF_LINK);
1357 
1358 	/*
1359 	 * The interface name is in sdl_data.
1360 	 * RTM_IFINFO messages are only generated for logical interface
1361 	 * zero, so there is no colon and logical interface number to
1362 	 * strip from the name.	 The name is not null terminated, but
1363 	 * there should be enough space in sdl_data to add the null.
1364 	 */
1365 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1366 		if (debug & D_LINKNOTE)
1367 			logdebug("process_rtm_ifinfo: phyint name too long\n");
1368 		return (_B_TRUE);
1369 	}
1370 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1371 
1372 	pi = phyint_lookup(sdl->sdl_data);
1373 	if (pi == NULL) {
1374 		if (debug & D_LINKNOTE)
1375 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1376 			    " for %s\n", sdl->sdl_data);
1377 		return (_B_TRUE);
1378 	}
1379 
1380 	/*
1381 	 * We want to try and avoid doing a full interface scan for
1382 	 * link state notifications from the datalink layer, as indicated
1383 	 * by the state of the IFF_RUNNING flag.  If just the
1384 	 * IFF_RUNNING flag has changed state, the link state changes
1385 	 * are processed without a full scan.
1386 	 * If there is both an IPv4 and IPv6 instance associated with
1387 	 * the physical interface, we will get an RTM_IFINFO message
1388 	 * for each instance.  If we just maintained a single copy of
1389 	 * the physical interface flags, it would appear that no flags
1390 	 * had changed when the second message is processed, leading us
1391 	 * to believe that the message wasn't generated by a flags change,
1392 	 * and that a full interface scan is required.
1393 	 * To get around this problem, two additional copies of the flags
1394 	 * are kept, one copy for each instance.  These are only used in
1395 	 * this routine.  At any one time, all three copies of the flags
1396 	 * should be identical except for the IFF_RUNNING flag.	 The
1397 	 * copy of the flags in the "phyint" structure is always up to
1398 	 * date.
1399 	 */
1400 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1401 	if (pii == NULL) {
1402 		if (debug & D_LINKNOTE)
1403 			logdebug("process_rtm_ifinfo: no instance of address "
1404 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1405 		return (_B_TRUE);
1406 	}
1407 
1408 	old_flags = pii->pii_flags;
1409 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1410 	pi->pi_flags = pii->pii_flags;
1411 
1412 	if (debug & D_LINKNOTE) {
1413 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1414 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1415 		    AF_STR(type), old_flags, pi->pi_flags);
1416 	}
1417 
1418 	/*
1419 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1420 	 * types and refresh IFF_INACTIVE if need be.
1421 	 */
1422 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) {
1423 		phyint_changed(pi);
1424 		if (pii->pii_flags & IFF_STANDBY)
1425 			phyint_standby_refresh_inactive(pi);
1426 	}
1427 
1428 	/* Has just the IFF_RUNNING flag changed state ? */
1429 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1430 		struct phyint_instance *pii_other;
1431 		/*
1432 		 * It wasn't just a link state change.	Update
1433 		 * the other instance's copy of the flags.
1434 		 */
1435 		pii_other = phyint_inst_other(pii);
1436 		if (pii_other != NULL)
1437 			pii_other->pii_flags = pii->pii_flags;
1438 		return (_B_TRUE);
1439 	}
1440 
1441 	return (_B_FALSE);
1442 }
1443 
1444 /*
1445  * Retrieve as many routing socket messages as possible, and try to
1446  * empty the routing sockets. Initiate full scan of targets or interfaces
1447  * as needed.
1448  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1449  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1450  */
1451 static void
1452 process_rtsock(int rtsock_v4, int rtsock_v6)
1453 {
1454 	int	nbytes;
1455 	int64_t msg[2048 / 8];
1456 	struct rt_msghdr *rtm;
1457 	boolean_t need_if_scan = _B_FALSE;
1458 	boolean_t need_rt_scan = _B_FALSE;
1459 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1460 	int type;
1461 
1462 	/* Read as many messages as possible and try to empty the sockets */
1463 	for (type = AF_INET; ; type = AF_INET6) {
1464 		for (;;) {
1465 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1466 			    rtsock_v6, msg, sizeof (msg));
1467 			if (nbytes <= 0) {
1468 				/* No more messages */
1469 				break;
1470 			}
1471 			rtm = (struct rt_msghdr *)msg;
1472 			if (rtm->rtm_version != RTM_VERSION) {
1473 				logerr("process_rtsock: version %d "
1474 				    "not understood\n", rtm->rtm_version);
1475 				break;
1476 			}
1477 
1478 			if (debug & D_PHYINT) {
1479 				logdebug("process_rtsock: message %d\n",
1480 				    rtm->rtm_type);
1481 			}
1482 
1483 			switch (rtm->rtm_type) {
1484 			case RTM_NEWADDR:
1485 			case RTM_DELADDR:
1486 				/*
1487 				 * Some logical interface has changed,
1488 				 * have to scan everything to determine
1489 				 * what actually changed.
1490 				 */
1491 				need_if_scan = _B_TRUE;
1492 				break;
1493 
1494 			case RTM_IFINFO:
1495 				rtm_ifinfo_seen = _B_TRUE;
1496 				need_if_scan |= process_rtm_ifinfo(
1497 				    (if_msghdr_t *)rtm, type);
1498 				break;
1499 
1500 			case RTM_ADD:
1501 			case RTM_DELETE:
1502 			case RTM_CHANGE:
1503 			case RTM_OLDADD:
1504 			case RTM_OLDDEL:
1505 				need_rt_scan = _B_TRUE;
1506 				break;
1507 
1508 			default:
1509 				/* Not interesting */
1510 				break;
1511 			}
1512 		}
1513 		if (type == AF_INET6)
1514 			break;
1515 	}
1516 
1517 	if (need_if_scan) {
1518 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1519 			logdebug("process_rtsock: synchronizing with kernel\n");
1520 		initifs();
1521 	} else if (rtm_ifinfo_seen) {
1522 		if (debug & D_LINKNOTE)
1523 			logdebug("process_rtsock: "
1524 			    "link up/down notification(s) seen\n");
1525 		process_link_state_changes();
1526 	}
1527 
1528 	if (need_rt_scan)
1529 		init_router_targets();
1530 }
1531 
1532 /*
1533  * Look if the phyint instance or one of its logints have been removed from
1534  * the kernel and take appropriate action.
1535  * Uses {pii,li}_in_use.
1536  */
1537 static void
1538 check_if_removed(struct phyint_instance *pii)
1539 {
1540 	struct logint *li;
1541 	struct logint *next_li;
1542 
1543 	/* Detect phyints that have been removed from the kernel. */
1544 	if (!pii->pii_in_use) {
1545 		logtrace("%s %s has been removed from kernel\n",
1546 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1547 		phyint_inst_delete(pii);
1548 	} else {
1549 		/* Detect logints that have been removed. */
1550 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1551 			next_li = li->li_next;
1552 			if (!li->li_in_use) {
1553 				logint_delete(li);
1554 			}
1555 		}
1556 	}
1557 }
1558 
1559 /*
1560  * Parse the supplied mib2 information to extract the routing information
1561  * table. Process the routing table to get the list of known onlink routers
1562  * and update our database. These onlink routers will serve as probe
1563  * targets.
1564  */
1565 static void
1566 update_router_list(mib_item_t *item)
1567 {
1568 	for (; item != NULL; item = item->mi_next) {
1569 		if (item->mi_opthdr.name == 0)
1570 			continue;
1571 		if (item->mi_opthdr.level == MIB2_IP &&
1572 		    item->mi_opthdr.name == MIB2_IP_ROUTE) {
1573 			ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp,
1574 			    item->mi_opthdr.len);
1575 		} else if (item->mi_opthdr.level == MIB2_IP6 &&
1576 		    item->mi_opthdr.name == MIB2_IP6_ROUTE) {
1577 			ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp,
1578 			    item->mi_opthdr.len);
1579 		}
1580 	}
1581 }
1582 
1583 
1584 /*
1585  * Convert octet `octp' to a phyint name and store in `ifname'
1586  */
1587 static void
1588 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
1589 {
1590 	char *cp;
1591 	size_t len = MIN(octp->o_length, ifsize - 1);
1592 
1593 	(void) strncpy(ifname, octp->o_bytes, len);
1594 	ifname[len] = '\0';
1595 
1596 	if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
1597 		*cp = '\0';
1598 }
1599 
1600 /*
1601  * Examine the IPv4 routing table `buf' for possible targets.  For each
1602  * possible target, if it's on the same subnet an interface route, pass
1603  * it to router_add_common() for further consideration.
1604  */
1605 static void
1606 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1607 {
1608 	char ifname[LIFNAMSIZ];
1609 	mib2_ipRouteEntry_t	*rp, *rp1, *endp;
1610 	struct in_addr		nexthop_v4;
1611 	struct in6_addr		nexthop;
1612 
1613 	if (debug & D_TARGET)
1614 		logdebug("ire_process_v4(len %d)\n", len);
1615 
1616 	if (len == 0)
1617 		return;
1618 
1619 	assert((len % ipRouteEntrySize) == 0);
1620 	endp = buf + (len / ipRouteEntrySize);
1621 
1622 	/*
1623 	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1624 	 * cross-reference them with the interface routes to determine if
1625 	 * they're possible probe targets.
1626 	 */
1627 	for (rp = buf; rp < endp; rp++) {
1628 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1629 			continue;
1630 
1631 		/* Get the nexthop address. */
1632 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1633 
1634 		/*
1635 		 * Rescan the routing table looking for interface routes that
1636 		 * are on the same subnet, and try to add them.  If they're
1637 		 * not relevant (e.g., the interface route isn't part of an
1638 		 * IPMP group, router_add_common() will discard).
1639 		 */
1640 		for (rp1 = buf; rp1 < endp; rp1++) {
1641 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
1642 			    rp1->ipRouteIfIndex.o_length == 0)
1643 				continue;
1644 
1645 			if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
1646 			    (nexthop_v4.s_addr & rp1->ipRouteMask))
1647 				continue;
1648 
1649 			oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
1650 			IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1651 			router_add_common(AF_INET, ifname, nexthop);
1652 		}
1653 	}
1654 }
1655 
1656 void
1657 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1658 {
1659 	struct phyint_instance *pii;
1660 	struct phyint *pi;
1661 
1662 	if (debug & D_TARGET)
1663 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1664 
1665 	/*
1666 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1667 	 */
1668 	pii = phyint_inst_lookup(af, ifname);
1669 	if (pii == NULL)
1670 		return;
1671 
1672 	/*
1673 	 * Don't use our own addresses as targets.
1674 	 */
1675 	if (own_address(nexthop))
1676 		return;
1677 
1678 	/*
1679 	 * If the phyint is part a named group, then add the address to all
1680 	 * members of the group; note that this is suboptimal in the IPv4 case
1681 	 * as it has already been added to all matching interfaces in
1682 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1683 	 * itself, since other phyints in the anongroup may not be on the same
1684 	 * subnet.
1685 	 */
1686 	pi = pii->pii_phyint;
1687 	if (pi->pi_group == phyint_anongroup) {
1688 		target_add(pii, nexthop, _B_TRUE);
1689 	} else {
1690 		pi = pi->pi_group->pg_phyint;
1691 		for (; pi != NULL; pi = pi->pi_pgnext)
1692 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1693 	}
1694 }
1695 
1696 /*
1697  * Examine the IPv6 routing table `buf' for possible link-local targets, and
1698  * pass any contenders to router_add_common() for further consideration.
1699  */
1700 static void
1701 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1702 {
1703 	struct lifreq lifr;
1704 	char ifname[LIFNAMSIZ];
1705 	char grname[LIFGRNAMSIZ];
1706 	mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
1707 	struct in6_addr nexthop_v6;
1708 
1709 	if (debug & D_TARGET)
1710 		logdebug("ire_process_v6(len %d)\n", len);
1711 
1712 	if (len == 0)
1713 		return;
1714 
1715 	assert((len % ipv6RouteEntrySize) == 0);
1716 	endp = buf + (len / ipv6RouteEntrySize);
1717 
1718 	/*
1719 	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1720 	 * cross-reference them with the interface routes to determine if
1721 	 * they're possible probe targets.
1722 	 */
1723 	for (rp = buf; rp < endp; rp++) {
1724 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
1725 		    !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
1726 			continue;
1727 
1728 		/* Get the nexthop address. */
1729 		nexthop_v6 = rp->ipv6RouteNextHop;
1730 
1731 		/*
1732 		 * The interface name should always exist for link-locals;
1733 		 * we use it to map this entry to an IPMP group name.
1734 		 */
1735 		if (rp->ipv6RouteIfIndex.o_length == 0)
1736 			continue;
1737 
1738 		oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
1739 		if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
1740 		    strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
1741 			continue;
1742 		}
1743 
1744 		/*
1745 		 * Rescan the list of routes for interface routes, and add the
1746 		 * above target to any interfaces in the same IPMP group.
1747 		 */
1748 		for (rp1 = buf; rp1 < endp; rp1++) {
1749 			if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
1750 			    rp1->ipv6RouteIfIndex.o_length == 0) {
1751 				continue;
1752 			}
1753 			oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
1754 			(void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
1755 
1756 			if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
1757 			    strcmp(lifr.lifr_groupname, grname) == 0) {
1758 				router_add_common(AF_INET6, ifname, nexthop_v6);
1759 			}
1760 		}
1761 	}
1762 }
1763 
1764 /*
1765  * Build a list of target routers, by scanning the routing tables.
1766  * It is assumed that interface routes exist, to reach the routers.
1767  */
1768 static void
1769 init_router_targets(void)
1770 {
1771 	struct	target *tg;
1772 	struct	target *next_tg;
1773 	struct	phyint_instance *pii;
1774 	struct	phyint *pi;
1775 
1776 	if (force_mcast)
1777 		return;
1778 
1779 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1780 		pi = pii->pii_phyint;
1781 		/*
1782 		 * Set tg_in_use to false only for router targets.
1783 		 */
1784 		if (!pii->pii_targets_are_routers)
1785 			continue;
1786 
1787 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1788 			tg->tg_in_use = 0;
1789 	}
1790 
1791 	if (mibwalk(update_router_list) == -1)
1792 		exit(1);
1793 
1794 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1795 		pi = pii->pii_phyint;
1796 		if (!pii->pii_targets_are_routers)
1797 			continue;
1798 
1799 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1800 			next_tg = tg->tg_next;
1801 			/*
1802 			 * If the group has failed, it's likely the route was
1803 			 * removed by an application affected by that failure.
1804 			 * In that case, we keep the target so that we can
1805 			 * reliably repair, at which point we'll refresh the
1806 			 * target list again.
1807 			 */
1808 			if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
1809 				target_delete(tg);
1810 		}
1811 	}
1812 }
1813 
1814 /*
1815  * Attempt to assign host targets to any interfaces that do not currently
1816  * have probe targets by sharing targets with other interfaces in the group.
1817  */
1818 static void
1819 init_host_targets(void)
1820 {
1821 	struct phyint_instance *pii;
1822 	struct phyint_group *pg;
1823 
1824 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1825 		pg = pii->pii_phyint->pi_group;
1826 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
1827 			dup_host_targets(pii);
1828 	}
1829 }
1830 
1831 /*
1832  * Duplicate host targets from other phyints of the group to
1833  * the phyint instance 'desired_pii'.
1834  */
1835 static void
1836 dup_host_targets(struct phyint_instance	 *desired_pii)
1837 {
1838 	int af;
1839 	struct phyint *pi;
1840 	struct phyint_instance *pii;
1841 	struct target *tg;
1842 
1843 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
1844 
1845 	af = desired_pii->pii_af;
1846 
1847 	/*
1848 	 * For every phyint in the same group as desired_pii, check if
1849 	 * it has any host targets. If so add them to desired_pii.
1850 	 */
1851 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
1852 		pii = PHYINT_INSTANCE(pi, af);
1853 		/*
1854 		 * We know that we don't have targets on this phyint instance
1855 		 * since we have been called. But we still check for
1856 		 * pii_targets_are_routers because another phyint instance
1857 		 * could have router targets, since IFF_NOFAILOVER addresses
1858 		 * on different phyint instances may belong to different
1859 		 * subnets.
1860 		 */
1861 		if ((pii == NULL) || (pii == desired_pii) ||
1862 		    pii->pii_targets_are_routers)
1863 			continue;
1864 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1865 			target_create(desired_pii, tg->tg_address, _B_FALSE);
1866 		}
1867 	}
1868 }
1869 
/*
 * Print a one line usage message for the command `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	FILE *out = stderr;

	(void) fprintf(out, "usage: %s\n", cmd);
}
1875 
1876 
1877 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
1878 
1879 /* Get an option from the /etc/default/mpathd file */
1880 static char *
1881 getdefault(char *name)
1882 {
1883 	char namebuf[BUFSIZ];
1884 	char *value = NULL;
1885 
1886 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
1887 		char	*cp;
1888 		int	flags;
1889 
1890 		/*
1891 		 * ignore case
1892 		 */
1893 		flags = defcntl(DC_GETFLAGS, 0);
1894 		TURNOFF(flags, DC_CASE);
1895 		(void) defcntl(DC_SETFLAGS, flags);
1896 
1897 		/* Add "=" to the name */
1898 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
1899 		(void) strncat(namebuf, "=", 2);
1900 
1901 		if ((cp = defread(namebuf)) != NULL)
1902 			value = strdup(cp);
1903 
1904 		/* close */
1905 		(void) defopen((char *)NULL);
1906 	}
1907 	return (value);
1908 }
1909 
1910 
1911 /*
1912  * Command line options below
1913  */
1914 boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
1915 boolean_t	track_all_phyints = _B_FALSE;	/* track all IP interfaces */
1916 static boolean_t adopt = _B_FALSE;
1917 static boolean_t foreground = _B_FALSE;
1918 
1919 int
1920 main(int argc, char *argv[])
1921 {
1922 	int i;
1923 	int c;
1924 	struct phyint *pi;
1925 	struct phyint_instance *pii;
1926 	char *value;
1927 
1928 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
1929 	srandom(gethostid());	/* Initialize the random number generator */
1930 
1931 	/*
1932 	 * NOTE: The messages output by in.mpathd are not suitable for
1933 	 * translation, so we do not call textdomain().
1934 	 */
1935 	(void) setlocale(LC_ALL, "");
1936 
1937 	/*
1938 	 * Get the user specified value of 'failure detection time'
1939 	 * from /etc/default/mpathd
1940 	 */
1941 	value = getdefault("FAILURE_DETECTION_TIME");
1942 	if (value != NULL) {
1943 		user_failure_detection_time =
1944 		    (int)strtol((char *)value, NULL, 0);
1945 
1946 		if (user_failure_detection_time <= 0) {
1947 			user_failure_detection_time = FAILURE_DETECTION_TIME;
1948 			logerr("Invalid failure detection time %s, assuming "
1949 			    "default of %d ms\n", value,
1950 			    user_failure_detection_time);
1951 
1952 		} else if (user_failure_detection_time <
1953 		    MIN_FAILURE_DETECTION_TIME) {
1954 			user_failure_detection_time =
1955 			    MIN_FAILURE_DETECTION_TIME;
1956 			logerr("Too small failure detection time of %s, "
1957 			    "assuming minimum of %d ms\n", value,
1958 			    user_failure_detection_time);
1959 		}
1960 		free(value);
1961 	} else {
1962 		/* User has not specified the parameter, Use default value */
1963 		user_failure_detection_time = FAILURE_DETECTION_TIME;
1964 	}
1965 
1966 	/*
1967 	 * This gives the frequency at which probes will be sent.
1968 	 * When fdt ms elapses, we should be able to determine
1969 	 * whether 5 consecutive probes have failed or not.
1970 	 * 1 probe will be sent in every user_probe_interval ms,
1971 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
1972 	 * user_probe_interval. Thus when we send out probe 'n' we
1973 	 * can be sure that probe 'n - 2' is lost, if we have not
1974 	 * got the ack. (since the probe interval is > crtt). But
1975 	 * probe 'n - 1' may be a valid unacked probe, since the
1976 	 * time between 2 successive probes could be as small as
1977 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
1978 	 */
1979 	user_probe_interval = user_failure_detection_time /
1980 	    (NUM_PROBE_FAILS + 2);
1981 
1982 	/*
1983 	 * Get the user specified value of failback_enabled from
1984 	 * /etc/default/mpathd
1985 	 */
1986 	value = getdefault("FAILBACK");
1987 	if (value != NULL) {
1988 		if (strcasecmp(value, "yes") == 0)
1989 			failback_enabled = _B_TRUE;
1990 		else if (strcasecmp(value, "no") == 0)
1991 			failback_enabled = _B_FALSE;
1992 		else
1993 			logerr("Invalid value for FAILBACK %s\n", value);
1994 		free(value);
1995 	} else {
1996 		failback_enabled = _B_TRUE;
1997 	}
1998 
1999 	/*
2000 	 * Get the user specified value of track_all_phyints from
2001 	 * /etc/default/mpathd. The sense is reversed in
2002 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2003 	 */
2004 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2005 	if (value != NULL) {
2006 		if (strcasecmp(value, "yes") == 0)
2007 			track_all_phyints = _B_FALSE;
2008 		else if (strcasecmp(value, "no") == 0)
2009 			track_all_phyints = _B_TRUE;
2010 		else
2011 			logerr("Invalid value for "
2012 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2013 		free(value);
2014 	} else {
2015 		track_all_phyints = _B_FALSE;
2016 	}
2017 
2018 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2019 		switch (c) {
2020 		case 'a':
2021 			adopt = _B_TRUE;
2022 			break;
2023 		case 'm':
2024 			force_mcast = _B_TRUE;
2025 			break;
2026 		case 'd':
2027 			debug = D_ALL;
2028 			foreground = _B_TRUE;
2029 			break;
2030 		case 'D':
2031 			i = (int)strtol(optarg, NULL, 0);
2032 			if (i == 0) {
2033 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2034 				    optarg);
2035 				exit(1);
2036 			}
2037 			debug |= i;
2038 			foreground = _B_TRUE;
2039 			break;
2040 		case 'l':
2041 			/*
2042 			 * Turn off link state notification handling.
2043 			 * Undocumented command line flag, for debugging
2044 			 * purposes.
2045 			 */
2046 			handle_link_notifications = _B_FALSE;
2047 			break;
2048 		default:
2049 			usage(argv[0]);
2050 			exit(1);
2051 		}
2052 	}
2053 
2054 	/*
2055 	 * The sockets for the loopback command interface should be listening
2056 	 * before we fork and exit in daemonize(). This way, whoever started us
2057 	 * can use the loopback interface as soon as they get a zero exit
2058 	 * status.
2059 	 */
2060 	lsock_v4 = setup_listener(AF_INET);
2061 	lsock_v6 = setup_listener(AF_INET6);
2062 
2063 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2064 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2065 		exit(1);
2066 	}
2067 
2068 	if (!foreground) {
2069 		if (!daemonize()) {
2070 			logerr("cannot daemonize\n");
2071 			exit(EXIT_FAILURE);
2072 		}
2073 		initlog();
2074 	}
2075 
2076 	/*
2077 	 * Initializations:
2078 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2079 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2080 	 * 2. Initialize a pipe for handling/recording signal events.
2081 	 * 3. Create the routing sockets,  used for listening
2082 	 *    to routing / interface changes.
2083 	 * 4. phyint_init() - Initialize physical interface state
2084 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2085 	 *    which timer_init() does indirectly.
2086 	 * 5. Query kernel for route entry sizes (v4 and v6).
2087 	 * 6. timer_init()  - Initialize timer related stuff
2088 	 * 7. initifs() - Initialize our database of all known interfaces
2089 	 * 8. init_router_targets() - Initialize our database of all known
2090 	 *    router targets.
2091 	 */
2092 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2093 	if (ifsock_v4 < 0) {
2094 		logperror("main: IPv4 socket open");
2095 		exit(1);
2096 	}
2097 
2098 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2099 	if (ifsock_v6 < 0) {
2100 		logperror("main: IPv6 socket open");
2101 		exit(1);
2102 	}
2103 
2104 	setup_eventpipe();
2105 
2106 	rtsock_v4 = setup_rtsock(AF_INET);
2107 	rtsock_v6 = setup_rtsock(AF_INET6);
2108 
2109 	if (phyint_init() == -1) {
2110 		logerr("cannot initialize physical interface structures");
2111 		exit(1);
2112 	}
2113 
2114 	if (mibwalk(mib_get_constants) == -1)
2115 		exit(1);
2116 
2117 	timer_init();
2118 
2119 	initifs();
2120 
2121 	/*
2122 	 * If we're operating in "adopt" mode and no interfaces need to be
2123 	 * tracked, shut down (ifconfig(8) will restart us on demand if
2124 	 * interfaces are subsequently put into multipathing groups).
2125 	 */
2126 	if (adopt && phyint_instances == NULL)
2127 		exit(0);
2128 
2129 	/*
2130 	 * Main body. Keep listening for activity on any of the sockets
2131 	 * that we are monitoring and take appropriate action as necessary.
2132 	 * signals are also handled synchronously.
2133 	 */
2134 	for (;;) {
2135 		if (poll(pollfds, pollfd_num, -1) < 0) {
2136 			if (errno == EINTR)
2137 				continue;
2138 			logperror("main: poll");
2139 			exit(1);
2140 		}
2141 		for (i = 0; i < pollfd_num; i++) {
2142 			if ((pollfds[i].fd == -1) ||
2143 			    !(pollfds[i].revents & POLLIN))
2144 				continue;
2145 			if (pollfds[i].fd == eventpipe_read) {
2146 				in_signal(eventpipe_read);
2147 				break;
2148 			}
2149 			if (pollfds[i].fd == rtsock_v4 ||
2150 			    pollfds[i].fd == rtsock_v6) {
2151 				process_rtsock(rtsock_v4, rtsock_v6);
2152 				break;
2153 			}
2154 
2155 			for (pii = phyint_instances; pii != NULL;
2156 			    pii = pii->pii_next) {
2157 				if (pollfds[i].fd == pii->pii_probe_sock) {
2158 					if (pii->pii_af == AF_INET)
2159 						in_data(pii);
2160 					else
2161 						in6_data(pii);
2162 					break;
2163 				}
2164 			}
2165 
2166 			for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2167 				if (pi->pi_notes != 0 &&
2168 				    pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
2169 					(void) dlpi_recv(pi->pi_dh, NULL, NULL,
2170 					    NULL, NULL, 0, NULL);
2171 					break;
2172 				}
2173 			}
2174 
2175 			if (pollfds[i].fd == lsock_v4)
2176 				loopback_cmd(lsock_v4, AF_INET);
2177 			else if (pollfds[i].fd == lsock_v6)
2178 				loopback_cmd(lsock_v6, AF_INET6);
2179 		}
2180 	}
2181 	/* NOTREACHED */
2182 	return (EXIT_SUCCESS);
2183 }
2184 
2185 static int
2186 setup_listener(int af)
2187 {
2188 	int sock;
2189 	int on;
2190 	int len;
2191 	int ret;
2192 	struct sockaddr_storage laddr;
2193 	struct sockaddr_in  *sin;
2194 	struct sockaddr_in6 *sin6;
2195 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2196 
2197 	assert(af == AF_INET || af == AF_INET6);
2198 
2199 	sock = socket(af, SOCK_STREAM, 0);
2200 	if (sock < 0) {
2201 		logperror("setup_listener: socket");
2202 		exit(1);
2203 	}
2204 
2205 	on = 1;
2206 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2207 	    sizeof (on)) < 0) {
2208 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2209 		exit(1);
2210 	}
2211 
2212 	bzero(&laddr, sizeof (laddr));
2213 	laddr.ss_family = af;
2214 
2215 	if (af == AF_INET) {
2216 		sin = (struct sockaddr_in *)&laddr;
2217 		sin->sin_port = htons(MPATHD_PORT);
2218 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2219 		len = sizeof (struct sockaddr_in);
2220 	} else {
2221 		sin6 = (struct sockaddr_in6 *)&laddr;
2222 		sin6->sin6_port = htons(MPATHD_PORT);
2223 		sin6->sin6_addr = loopback_addr;
2224 		len = sizeof (struct sockaddr_in6);
2225 	}
2226 
2227 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2228 	if (ret < 0) {
2229 		if (errno == EADDRINUSE) {
2230 			/*
2231 			 * Another instance of mpathd may be already active.
2232 			 */
2233 			logerr("main: is another instance of in.mpathd "
2234 			    "already active?\n");
2235 			exit(1);
2236 		} else {
2237 			(void) close(sock);
2238 			return (-1);
2239 		}
2240 	}
2241 	if (listen(sock, 30) < 0) {
2242 		logperror("main: listen");
2243 		exit(1);
2244 	}
2245 	if (poll_add(sock) == -1) {
2246 		(void) close(sock);
2247 		exit(1);
2248 	}
2249 
2250 	return (sock);
2251 }
2252 
/*
 * Table of commands and their expected size; used by loopback_cmd().
 *
 * loopback_cmd() indexes this table directly with the command id read from
 * the request (after checking it against MI_NCMD), both to name the command
 * in log messages and to reject requests shorter than `size' bytes.
 * NOTE(review): this presumes the entries appear in the numeric order of the
 * MI_* command ids -- confirm against the header that defines them.
 */
static struct {
	const char	*name;		/* command name used in log messages */
	unsigned int	size;		/* minimum acceptable request length */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2265 
2266 /*
2267  * Commands received over the loopback interface come here (via libipmp).
2268  */
2269 static void
2270 loopback_cmd(int sock, int family)
2271 {
2272 	int newfd;
2273 	ssize_t len;
2274 	boolean_t is_priv = _B_FALSE;
2275 	struct sockaddr_storage	peer;
2276 	struct sockaddr_in	*peer_sin;
2277 	struct sockaddr_in6	*peer_sin6;
2278 	socklen_t peerlen;
2279 	union mi_commands mpi;
2280 	char abuf[INET6_ADDRSTRLEN];
2281 	uint_t cmd;
2282 	int retval;
2283 
2284 	peerlen = sizeof (peer);
2285 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2286 	if (newfd < 0) {
2287 		logperror("loopback_cmd: accept");
2288 		return;
2289 	}
2290 
2291 	switch (family) {
2292 	case AF_INET:
2293 		/*
2294 		 * Validate the address and port to make sure that
2295 		 * non privileged processes don't connect and start
2296 		 * talking to us.
2297 		 */
2298 		if (peerlen != sizeof (struct sockaddr_in)) {
2299 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2300 			(void) close(newfd);
2301 			return;
2302 		}
2303 		peer_sin = (struct sockaddr_in *)&peer;
2304 		is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
2305 		(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2306 		    abuf, sizeof (abuf));
2307 
2308 		if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
2309 			logerr("Attempt to connect from addr %s port %d\n",
2310 			    abuf, ntohs(peer_sin->sin_port));
2311 			(void) close(newfd);
2312 			return;
2313 		}
2314 		break;
2315 
2316 	case AF_INET6:
2317 		if (peerlen != sizeof (struct sockaddr_in6)) {
2318 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2319 			(void) close(newfd);
2320 			return;
2321 		}
2322 		/*
2323 		 * Validate the address and port to make sure that
2324 		 * non privileged processes don't connect and start
2325 		 * talking to us.
2326 		 */
2327 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2328 		is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
2329 		(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2330 		    sizeof (abuf));
2331 		if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
2332 			logerr("Attempt to connect from addr %s port %d\n",
2333 			    abuf, ntohs(peer_sin6->sin6_port));
2334 			(void) close(newfd);
2335 			return;
2336 		}
2337 		break;
2338 
2339 	default:
2340 		logdebug("loopback_cmd: family %d\n", family);
2341 		(void) close(newfd);
2342 		return;
2343 	}
2344 
2345 	/*
2346 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2347 	 * all supported commands
2348 	 */
2349 	len = read(newfd, &mpi, sizeof (mpi));
2350 
2351 	/*
2352 	 * In theory, we can receive any sized message for a stream socket,
2353 	 * but we don't expect that to happen for a small message over a
2354 	 * loopback connection.
2355 	 */
2356 	if (len < sizeof (uint32_t)) {
2357 		logerr("loopback_cmd: bad command format or read returns "
2358 		    "partial data %d\n", len);
2359 		(void) close(newfd);
2360 		return;
2361 	}
2362 
2363 	cmd = mpi.mi_command;
2364 	if (cmd >= MI_NCMD) {
2365 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2366 		(void) close(newfd);
2367 		return;
2368 	}
2369 
2370 	/*
2371 	 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2372 	 */
2373 	if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
2374 		logerr("Unprivileged request from %s for privileged "
2375 		    "command %s\n", abuf, commands[cmd].name);
2376 		(void) close(newfd);
2377 		return;
2378 	}
2379 
2380 	if (len < commands[cmd].size) {
2381 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2382 		    commands[cmd].name, commands[cmd].size, len);
2383 		(void) close(newfd);
2384 		return;
2385 	}
2386 
2387 	retval = process_cmd(newfd, &mpi);
2388 	if (retval != IPMP_SUCCESS) {
2389 		logerr("failed processing %s: %s\n", commands[cmd].name,
2390 		    ipmp_errmsg(retval));
2391 	}
2392 	(void) close(newfd);
2393 }
2394 
2395 /*
2396  * Process the commands received via libipmp.
2397  */
2398 static unsigned int
2399 process_cmd(int newfd, union mi_commands *mpi)
2400 {
2401 	struct phyint *pi;
2402 	struct mi_offline *mio;
2403 	struct mi_undo_offline *miu;
2404 	unsigned int retval;
2405 
2406 	switch (mpi->mi_command) {
2407 	case MI_PING:
2408 		return (send_result(newfd, IPMP_SUCCESS, 0));
2409 
2410 	case MI_OFFLINE:
2411 		mio = &mpi->mi_ocmd;
2412 
2413 		pi = phyint_lookup(mio->mio_ifname);
2414 		if (pi == NULL)
2415 			return (send_result(newfd, IPMP_EUNKIF, 0));
2416 
2417 		retval = phyint_offline(pi, mio->mio_min_redundancy);
2418 		if (retval == IPMP_FAILURE)
2419 			return (send_result(newfd, IPMP_FAILURE, errno));
2420 
2421 		return (send_result(newfd, retval, 0));
2422 
2423 	case MI_UNDO_OFFLINE:
2424 		miu = &mpi->mi_ucmd;
2425 
2426 		pi = phyint_lookup(miu->miu_ifname);
2427 		if (pi == NULL)
2428 			return (send_result(newfd, IPMP_EUNKIF, 0));
2429 
2430 		retval = phyint_undo_offline(pi);
2431 		if (retval == IPMP_FAILURE)
2432 			return (send_result(newfd, IPMP_FAILURE, errno));
2433 
2434 		return (send_result(newfd, retval, 0));
2435 
2436 	case MI_QUERY:
2437 		return (process_query(newfd, &mpi->mi_qcmd));
2438 
2439 	default:
2440 		break;
2441 	}
2442 
2443 	return (send_result(newfd, IPMP_EPROTO, 0));
2444 }
2445 
2446 /*
2447  * Process the query request pointed to by `miq' and send a reply on file
2448  * descriptor `fd'.  Returns an IPMP error code.
2449  */
2450 static unsigned int
2451 process_query(int fd, mi_query_t *miq)
2452 {
2453 	ipmp_addrinfo_t		*adinfop;
2454 	ipmp_addrinfolist_t	*adlp;
2455 	ipmp_groupinfo_t	*grinfop;
2456 	ipmp_groupinfolist_t	*grlp;
2457 	ipmp_grouplist_t	*grlistp;
2458 	ipmp_ifinfo_t		*ifinfop;
2459 	ipmp_ifinfolist_t	*iflp;
2460 	ipmp_snap_t		*snap;
2461 	unsigned int		retval;
2462 
2463 	switch (miq->miq_inforeq) {
2464 	case IPMP_ADDRINFO:
2465 		retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
2466 		    &adinfop);
2467 		if (retval != IPMP_SUCCESS)
2468 			return (send_result(fd, retval, errno));
2469 
2470 		retval = send_result(fd, IPMP_SUCCESS, 0);
2471 		if (retval == IPMP_SUCCESS)
2472 			retval = send_addrinfo(fd, adinfop);
2473 
2474 		ipmp_freeaddrinfo(adinfop);
2475 		return (retval);
2476 
2477 	case IPMP_GROUPLIST:
2478 		retval = getgrouplist(&grlistp);
2479 		if (retval != IPMP_SUCCESS)
2480 			return (send_result(fd, retval, errno));
2481 
2482 		retval = send_result(fd, IPMP_SUCCESS, 0);
2483 		if (retval == IPMP_SUCCESS)
2484 			retval = send_grouplist(fd, grlistp);
2485 
2486 		ipmp_freegrouplist(grlistp);
2487 		return (retval);
2488 
2489 	case IPMP_GROUPINFO:
2490 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2491 		retval = getgroupinfo(miq->miq_grname, &grinfop);
2492 		if (retval != IPMP_SUCCESS)
2493 			return (send_result(fd, retval, errno));
2494 
2495 		retval = send_result(fd, IPMP_SUCCESS, 0);
2496 		if (retval == IPMP_SUCCESS)
2497 			retval = send_groupinfo(fd, grinfop);
2498 
2499 		ipmp_freegroupinfo(grinfop);
2500 		return (retval);
2501 
2502 	case IPMP_IFINFO:
2503 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2504 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2505 		if (retval != IPMP_SUCCESS)
2506 			return (send_result(fd, retval, errno));
2507 
2508 		retval = send_result(fd, IPMP_SUCCESS, 0);
2509 		if (retval == IPMP_SUCCESS)
2510 			retval = send_ifinfo(fd, ifinfop);
2511 
2512 		ipmp_freeifinfo(ifinfop);
2513 		return (retval);
2514 
2515 	case IPMP_SNAP:
2516 		/*
2517 		 * Before taking the snapshot, sync with the kernel.
2518 		 */
2519 		initifs();
2520 
2521 		retval = getsnap(&snap);
2522 		if (retval != IPMP_SUCCESS)
2523 			return (send_result(fd, retval, errno));
2524 
2525 		retval = send_result(fd, IPMP_SUCCESS, 0);
2526 		if (retval != IPMP_SUCCESS)
2527 			goto out;
2528 
2529 		retval = send_grouplist(fd, snap->sn_grlistp);
2530 		if (retval != IPMP_SUCCESS)
2531 			goto out;
2532 
2533 		retval = ipmp_writetlv(fd, IPMP_IFCNT, sizeof (uint32_t),
2534 		    &snap->sn_nif);
2535 		if (retval != IPMP_SUCCESS)
2536 			goto out;
2537 
2538 		iflp = snap->sn_ifinfolistp;
2539 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2540 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2541 			if (retval != IPMP_SUCCESS)
2542 				goto out;
2543 		}
2544 
2545 		retval = ipmp_writetlv(fd, IPMP_GROUPCNT, sizeof (uint32_t),
2546 		    &snap->sn_ngroup);
2547 		if (retval != IPMP_SUCCESS)
2548 			goto out;
2549 
2550 		grlp = snap->sn_grinfolistp;
2551 		for (; grlp != NULL; grlp = grlp->grl_next) {
2552 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2553 			if (retval != IPMP_SUCCESS)
2554 				goto out;
2555 		}
2556 
2557 		retval = ipmp_writetlv(fd, IPMP_ADDRCNT, sizeof (uint32_t),
2558 		    &snap->sn_naddr);
2559 		if (retval != IPMP_SUCCESS)
2560 			goto out;
2561 
2562 		adlp = snap->sn_adinfolistp;
2563 		for (; adlp != NULL; adlp = adlp->adl_next) {
2564 			retval = send_addrinfo(fd, adlp->adl_adinfop);
2565 			if (retval != IPMP_SUCCESS)
2566 				goto out;
2567 		}
2568 	out:
2569 		ipmp_snap_free(snap);
2570 		return (retval);
2571 
2572 	default:
2573 		break;
2574 
2575 	}
2576 	return (send_result(fd, IPMP_EPROTO, 0));
2577 }
2578 
2579 /*
2580  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2581  * Returns an IPMP error code.
2582  */
2583 static unsigned int
2584 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2585 {
2586 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2587 	ipmp_addrlist_t	*adlistp = grinfop->gr_adlistp;
2588 	ipmp_groupinfo_xfer_t grxfer;
2589 	unsigned int	retval;
2590 
2591 	/*
2592 	 * We can't directly transfer an ipmp_groupinfo_t due to the embedded
2593 	 * pointers to ipmp_iflist_t and ipmp_addr_list_t. Copy the data over
2594 	 * to a temporary transfer structure that doesn't have these embedded
2595 	 * pointers.
2596 	 */
2597 	memset(&grxfer, 0, sizeof (grxfer));
2598 
2599 	grxfer.grx_sig = grinfop->gr_sig;
2600 	grxfer.grx_state = grinfop->gr_state;
2601 	grxfer.grx_fdt = grinfop->gr_fdt;
2602 
2603 	memcpy(grxfer.grx_name, grinfop->gr_name, sizeof (grxfer.grx_name));
2604 	memcpy(grxfer.grx_ifname, grinfop->gr_ifname,
2605 	    sizeof (grxfer.grx_ifname));
2606 	memcpy(grxfer.grx_m4ifname, grinfop->gr_m4ifname,
2607 	    sizeof (grxfer.grx_m4ifname));
2608 	memcpy(grxfer.grx_m6ifname, grinfop->gr_m6ifname,
2609 	    sizeof (grxfer.grx_m6ifname));
2610 	memcpy(grxfer.grx_bcifname, grinfop->gr_bcifname,
2611 	    sizeof (grxfer.grx_bcifname));
2612 
2613 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (grxfer), &grxfer);
2614 	if (retval != IPMP_SUCCESS)
2615 		return (retval);
2616 
2617 	retval = ipmp_writetlv(fd, IPMP_IFLIST,
2618 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
2619 	if (retval != IPMP_SUCCESS)
2620 		return (retval);
2621 
2622 	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2623 	    IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
2624 }
2625 
2626 /*
2627  * Send the interface information pointed to by `ifinfop' on file descriptor
2628  * `fd'.  Returns an IPMP error code.
2629  */
2630 static unsigned int
2631 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2632 {
2633 	ipmp_addrlist_t	*adlist4p = ifinfop->if_targinfo4.it_targlistp;
2634 	ipmp_addrlist_t	*adlist6p = ifinfop->if_targinfo6.it_targlistp;
2635 	ipmp_ifinfo_xfer_t ifxfer;
2636 	unsigned int	retval;
2637 
2638 	/*
2639 	 * We can't directly tranfer an ipmp_ifinfo_t due to the embedded
2640 	 * ipmp_addrlist_t pointer in if_targinfo_t. Copy the data over to
2641 	 * a temporary transfer structure that doesn't have that embedded
2642 	 * pointer.
2643 	 */
2644 	memset(&ifxfer, 0, sizeof (ifxfer));
2645 
2646 	ifxfer.ifx_state = ifinfop->if_state;
2647 	ifxfer.ifx_type = ifinfop->if_type;
2648 	ifxfer.ifx_linkstate = ifinfop->if_linkstate;
2649 	ifxfer.ifx_probestate = ifinfop->if_probestate;
2650 	ifxfer.ifx_flags = ifinfop->if_flags;
2651 	ifxfer.ifx_targinfo4.itx_testaddr = ifinfop->if_targinfo4.it_testaddr;
2652 	ifxfer.ifx_targinfo4.itx_targmode = ifinfop->if_targinfo4.it_targmode;
2653 	ifxfer.ifx_targinfo6.itx_testaddr = ifinfop->if_targinfo6.it_testaddr;
2654 	ifxfer.ifx_targinfo6.itx_targmode = ifinfop->if_targinfo6.it_targmode;
2655 
2656 	memcpy(ifxfer.ifx_name, ifinfop->if_name, sizeof (ifxfer.ifx_name));
2657 	memcpy(ifxfer.ifx_group, ifinfop->if_group, sizeof (ifxfer.ifx_group));
2658 	memcpy(ifxfer.ifx_targinfo4.itx_name, ifinfop->if_targinfo4.it_name,
2659 	    sizeof (ifxfer.ifx_targinfo4.itx_name));
2660 	memcpy(ifxfer.ifx_targinfo6.itx_name, ifinfop->if_targinfo6.it_name,
2661 	    sizeof (ifxfer.ifx_targinfo6.itx_name));
2662 
2663 	retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (ifxfer), &ifxfer);
2664 	if (retval != IPMP_SUCCESS)
2665 		return (retval);
2666 
2667 	retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
2668 	    IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
2669 	if (retval != IPMP_SUCCESS)
2670 		return (retval);
2671 
2672 	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2673 	    IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
2674 }
2675 
2676 /*
2677  * Send the address information pointed to by `adinfop' on file descriptor
2678  * `fd'.  Returns an IPMP error code.
2679  */
2680 static unsigned int
2681 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
2682 {
2683 	return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
2684 }
2685 
2686 /*
2687  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2688  * Returns an IPMP error code.
2689  */
2690 static unsigned int
2691 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2692 {
2693 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2694 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2695 }
2696 
2697 /*
2698  * Initialize an mi_result_t structure using `error' and `syserror' and
2699  * send it on file descriptor `fd'.  Returns an IPMP error code.
2700  */
2701 static unsigned int
2702 send_result(int fd, unsigned int error, int syserror)
2703 {
2704 	mi_result_t me;
2705 
2706 	me.me_mpathd_error = error;
2707 	if (error == IPMP_FAILURE)
2708 		me.me_sys_error = syserror;
2709 	else
2710 		me.me_sys_error = 0;
2711 
2712 	return (ipmp_write(fd, &me, sizeof (me)));
2713 }
2714 
2715 /*
2716  * Daemonize the process.
2717  */
2718 static boolean_t
2719 daemonize(void)
2720 {
2721 	switch (fork()) {
2722 	case -1:
2723 		return (_B_FALSE);
2724 
2725 	case  0:
2726 		/*
2727 		 * Lose our controlling terminal, and become both a session
2728 		 * leader and a process group leader.
2729 		 */
2730 		if (setsid() == -1)
2731 			return (_B_FALSE);
2732 
2733 		/*
2734 		 * Under POSIX, a session leader can accidentally (through
2735 		 * open(2)) acquire a controlling terminal if it does not
2736 		 * have one.  Just to be safe, fork() again so we are not a
2737 		 * session leader.
2738 		 */
2739 		switch (fork()) {
2740 		case -1:
2741 			return (_B_FALSE);
2742 
2743 		case 0:
2744 			(void) chdir("/");
2745 			(void) umask(022);
2746 			(void) fdwalk(closefunc, NULL);
2747 			break;
2748 
2749 		default:
2750 			_exit(EXIT_SUCCESS);
2751 		}
2752 		break;
2753 
2754 	default:
2755 		_exit(EXIT_SUCCESS);
2756 	}
2757 
2758 	return (_B_TRUE);
2759 }
2760 
2761 /*
2762  * The parent has created some fds before forking on purpose, keep them open.
2763  */
2764 static int
2765 closefunc(void *not_used, int fd)
2766 {
2767 	if (fd != lsock_v4 && fd != lsock_v6)
2768 		(void) close(fd);
2769 	return (0);
2770 }
2771 
2772 /* LOGGER */
2773 
2774 #include <syslog.h>
2775 
2776 /*
2777  * Logging routines.  All routines log to syslog, unless the daemon is
2778  * running in the foreground, in which case the logging goes to stderr.
2779  *
2780  * The following routines are available:
2781  *
2782  *	logdebug(): A printf-like function for outputting debug messages
2783  *	(messages at LOG_DEBUG) that are only of use to developers.
2784  *
2785  *	logtrace(): A printf-like function for outputting tracing messages
2786  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2787  *	to log the receipt of interesting network-related conditions.
2788  *
2789  *	logerr(): A printf-like function for outputting error messages
2790  *	(messages at LOG_ERR) from the daemon.
2791  *
2792  *	logperror*(): A set of functions used to output error messages
2793  *	(messages at LOG_ERR); these automatically append strerror(errno)
2794  *	and a newline to the message passed to them.
2795  *
2796  * NOTE: since the logging functions write to syslog, the messages passed
2797  *	 to them are not eligible for localization.  Thus, gettext() must
2798  *	 *not* be used.
2799  */
2800 
/* Non-zero once initlog() has run; the log routines then use syslog. */
static int logging = 0;

/*
 * Open the syslog connection for "in.mpathd" and switch the logging
 * routines from stderr to syslog.
 */
static void
initlog(void)
{
	openlog("in.mpathd", LOG_PID, LOG_DAEMON);
	logging += 1;
}
2809 
2810 /* PRINTFLIKE2 */
2811 void
2812 logmsg(int pri, const char *fmt, ...)
2813 {
2814 	va_list ap;
2815 
2816 	va_start(ap, fmt);
2817 
2818 	if (logging)
2819 		vsyslog(pri, fmt, ap);
2820 	else
2821 		(void) vfprintf(stderr, fmt, ap);
2822 	va_end(ap);
2823 }
2824 
2825 /* PRINTFLIKE1 */
2826 void
2827 logperror(const char *str)
2828 {
2829 	if (logging)
2830 		syslog(LOG_ERR, "%s: %m\n", str);
2831 	else
2832 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
2833 }
2834 
2835 void
2836 logperror_pii(struct phyint_instance *pii, const char *str)
2837 {
2838 	if (logging) {
2839 		syslog(LOG_ERR, "%s (%s %s): %m\n",
2840 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
2841 	} else {
2842 		(void) fprintf(stderr, "%s (%s %s): %s\n",
2843 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
2844 		    strerror(errno));
2845 	}
2846 }
2847 
2848 void
2849 logperror_li(struct logint *li, const char *str)
2850 {
2851 	struct	phyint_instance	*pii = li->li_phyint_inst;
2852 
2853 	if (logging) {
2854 		syslog(LOG_ERR, "%s (%s %s): %m\n",
2855 		    str, AF_STR(pii->pii_af), li->li_name);
2856 	} else {
2857 		(void) fprintf(stderr, "%s (%s %s): %s\n",
2858 		    str, AF_STR(pii->pii_af), li->li_name,
2859 		    strerror(errno));
2860 	}
2861 }
2862 
2863 void
2864 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
2865 {
2866 	if (polled)
2867 		(void) poll_remove(pii->pii_probe_sock);
2868 	(void) close(pii->pii_probe_sock);
2869 	pii->pii_probe_sock = -1;
2870 	pii->pii_basetime_inited = 0;
2871 }
2872 
2873 boolean_t
2874 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
2875     struct sockaddr_storage *ssp)
2876 {
2877 	addrlist_t *addrp;
2878 
2879 	if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
2880 		return (_B_FALSE);
2881 
2882 	(void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
2883 	addrp->al_flags = flags;
2884 	addrp->al_addr = *ssp;
2885 	addrp->al_next = *addrsp;
2886 	*addrsp = addrp;
2887 	return (_B_TRUE);
2888 }
2889 
2890 void
2891 addrlist_free(addrlist_t **addrsp)
2892 {
2893 	addrlist_t *addrp, *next_addrp;
2894 
2895 	for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
2896 		next_addrp = addrp->al_next;
2897 		free(addrp);
2898 	}
2899 	*addrsp = NULL;
2900 }
2901 
2902 /*
2903  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
2904  * tables defined by mib2.h. Pass the table information returned to the
2905  * supplied function.
2906  */
2907 static int
2908 mibwalk(void (*proc)(mib_item_t *))
2909 {
2910 	mib_item_t		*head_item = NULL;
2911 	mib_item_t		*last_item = NULL;
2912 	mib_item_t		*tmp;
2913 	struct strbuf		ctlbuf, databuf;
2914 	int			flags;
2915 	int			rval;
2916 	uintptr_t		buf[512 / sizeof (uintptr_t)];
2917 	struct T_optmgmt_req	*tor = (struct T_optmgmt_req *)buf;
2918 	struct T_optmgmt_ack	*toa = (struct T_optmgmt_ack *)buf;
2919 	struct T_error_ack	*tea = (struct T_error_ack *)buf;
2920 	struct opthdr		*req, *optp;
2921 	int			status = -1;
2922 
2923 	if (mibfd == -1) {
2924 		if ((mibfd = open("/dev/ip", O_RDWR)) < 0) {
2925 			logperror("mibwalk(): ip open");
2926 			return (status);
2927 		}
2928 	}
2929 
2930 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2931 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
2932 	tor->OPT_length = sizeof (struct opthdr);
2933 	tor->MGMT_flags = T_CURRENT;
2934 
2935 	/*
2936 	 * Note: we use the special level value below so that IP will return
2937 	 * us information concerning IRE_MARK_TESTHIDDEN routes.
2938 	 */
2939 	req = (struct opthdr *)&tor[1];
2940 	req->level = EXPER_IP_AND_ALL_IRES;
2941 	req->name  = 0;
2942 	req->len   = 0;
2943 
2944 	ctlbuf.buf = (char *)&buf;
2945 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
2946 
2947 	if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) {
2948 		logperror("mibwalk(): putmsg(ctl)");
2949 		return (status);
2950 	}
2951 
2952 	/*
2953 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
2954 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
2955 	 * a control and data part. The control part contains a struct
2956 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
2957 	 * the level, name and length of the data in the data part. The
2958 	 * data part contains the actual table data. The last message
2959 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
2960 	 * single option with zero optlen.
2961 	 */
2962 	for (;;) {
2963 		errno = flags = 0;
2964 		ctlbuf.maxlen = sizeof (buf);
2965 		rval = getmsg(mibfd, &ctlbuf, NULL, &flags);
2966 		if (rval & MORECTL || rval < 0) {
2967 			if (errno == EINTR)
2968 				continue;
2969 			logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
2970 			    rval, errno);
2971 			goto error;
2972 		}
2973 		if (ctlbuf.len < sizeof (t_scalar_t)) {
2974 			logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len);
2975 			goto error;
2976 		}
2977 
2978 		switch (toa->PRIM_type) {
2979 		case T_ERROR_ACK:
2980 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
2981 				logerr("mibwalk(): T_ERROR_ACK ctlbuf "
2982 				    "too short: %d\n", ctlbuf.len);
2983 				goto error;
2984 			}
2985 			logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
2986 			    " UNIX_err = 0x%lx\n", tea->TLI_error,
2987 			    t_strerror(tea->TLI_error), tea->UNIX_error);
2988 			goto error;
2989 
2990 		case T_OPTMGMT_ACK:
2991 			optp = (struct opthdr *)&toa[1];
2992 			if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
2993 			    sizeof (struct opthdr))) {
2994 				logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
2995 				    "short: %d\n", ctlbuf.len);
2996 				goto error;
2997 			}
2998 			if (toa->MGMT_flags != T_SUCCESS) {
2999 				logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
3000 				    "0x%lx\n", toa->MGMT_flags);
3001 				goto error;
3002 			}
3003 			break;
3004 
3005 		default:
3006 			goto error;
3007 		}
3008 		/* The following assert also implies MGMT_flags == T_SUCCESS */
3009 		assert(toa->PRIM_type == T_OPTMGMT_ACK);
3010 
3011 		/*
3012 		 * We have reached the end of this T_OPTMGMT_ACK
3013 		 * message. If this is the last message i.e EOD,
3014 		 * break, else process the next T_OPTMGMT_ACK msg.
3015 		 */
3016 		if (rval == 0) {
3017 			if (optp->len == 0 && optp->name == 0 &&
3018 			    optp->level == 0) {
3019 				/* This is the EOD message. */
3020 				break;
3021 			}
3022 			/* Not EOD but no data to retrieve */
3023 			continue;
3024 		}
3025 
3026 		/*
3027 		 * We should only be here if MOREDATA was set.
3028 		 * Allocate an empty mib_item_t and link into the list
3029 		 * of MIB items.
3030 		 */
3031 		if ((tmp = malloc(sizeof (*tmp))) == NULL) {
3032 			logperror("mibwalk(): malloc() failed.");
3033 			goto error;
3034 		}
3035 		if (last_item != NULL)
3036 			last_item->mi_next = tmp;
3037 		else
3038 			head_item = tmp;
3039 		last_item = tmp;
3040 		last_item->mi_next = NULL;
3041 		last_item->mi_opthdr = *optp;
3042 		last_item->mi_valp = malloc(optp->len);
3043 		if (last_item->mi_valp == NULL) {
3044 			logperror("mibwalk(): malloc() failed.");
3045 			goto error;
3046 		}
3047 
3048 		databuf.maxlen = last_item->mi_opthdr.len;
3049 		databuf.buf = (char *)last_item->mi_valp;
3050 		databuf.len = 0;
3051 
3052 		/* Retrieve the actual MIB data */
3053 		for (;;) {
3054 			flags = 0;
3055 			if ((rval = getmsg(mibfd, NULL, &databuf,
3056 			    &flags)) != 0) {
3057 				if (rval < 0 && errno == EINTR)
3058 					continue;
3059 				/*
3060 				 * We shouldn't get MOREDATA here so treat that
3061 				 * as an error.
3062 				 */
3063 				logperror("mibwalk(): getmsg(data)");
3064 				goto error;
3065 			}
3066 			break;
3067 		}
3068 	}
3069 	status = 0;
3070 	/* Pass the accumulated MIB data to the supplied function pointer */
3071 	(*proc)(head_item);
3072 error:
3073 	while (head_item != NULL) {
3074 		tmp = head_item;
3075 		head_item = tmp->mi_next;
3076 		free(tmp->mi_valp);
3077 		free(tmp);
3078 	}
3079 	return (status);
3080 }
3081 
3082 /*
3083  * Parse the supplied mib2 information to get the size of routing table
3084  * entries. This is needed when running in a branded zone where the
3085  * Solaris application environment and the Solaris kernel may not be the
3086  * the same release version.
3087  */
3088 static void
3089 mib_get_constants(mib_item_t *item)
3090 {
3091 	mib2_ip_t		*ipv4;
3092 	mib2_ipv6IfStatsEntry_t	*ipv6;
3093 
3094 	for (; item != NULL; item = item->mi_next) {
3095 		if (item->mi_opthdr.name != 0)
3096 			continue;
3097 		if (item->mi_opthdr.level == MIB2_IP) {
3098 			ipv4 = (mib2_ip_t *)item->mi_valp;
3099 			ipRouteEntrySize = ipv4->ipRouteEntrySize;
3100 		} else if (item->mi_opthdr.level == MIB2_IP6) {
3101 			ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp;
3102 			ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize;
3103 		}
3104 	}
3105 }
3106