xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision bdb9230ac765cb7af3fc1f4119caf2c5720dceb3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
int debug = 0;				/* Debug flag; bitmask tested */
					/* against D_* (e.g. D_PHYINT) */
static int pollfd_num = 0;		/* Num. of slots in pollfds, */
					/* including free (-1) slots */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors, */
					/* grown on demand by poll_add() */
					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */

/*
 * Structure to store mib2 information returned by the kernel.
 * This is used to process routing table information.  Items are chained
 * into a singly linked list through mi_next.
 */
typedef struct mib_item_s {
	struct mib_item_s	*mi_next;
	struct opthdr		mi_opthdr;
	void			*mi_valp;
} mib_item_t;

static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

static uint_t	last_initifs_time;	/* Time (ms, from getcurrenttime()) */
					/* when initifs was last run */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
boolean_t handle_link_notifications = _B_TRUE;
static int	ipRouteEntrySize;	/* Size of IPv4 route entry */
static int	ipv6RouteEntrySize;	/* Size of IPv6 route entry */
61 
62 static void	initlog(void);
63 static void	run_timeouts(void);
64 static void	initifs(void);
65 static void	check_if_removed(struct phyint_instance *pii);
66 static void	select_test_ifs(void);
67 static void	update_router_list(mib_item_t *item);
68 static void	mib_get_constants(mib_item_t *item);
69 static int	mibwalk(void (*proc)(mib_item_t *));
70 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
71 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
72 static void	router_add_common(int af, char *ifname,
73     struct in6_addr nexthop);
74 static void	init_router_targets();
75 static void	cleanup(void);
76 static int	setup_listener(int af);
77 static void	check_config(void);
78 static void	check_testconfig(void);
79 static void	check_addr_unique(struct phyint_instance *,
80     struct sockaddr_storage *);
81 static void	init_host_targets(void);
82 static void	dup_host_targets(struct phyint_instance *desired_pii);
83 static void	loopback_cmd(int sock, int family);
84 static boolean_t daemonize(void);
85 static int	closefunc(void *, int);
86 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
87 static unsigned int process_query(int fd, mi_query_t *miq);
88 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
89 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
90 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
91 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
92 static unsigned int send_result(int fd, unsigned int error, int syserror);
93 
/*
 * List of local addresses that are IFF_UP.  initifs() frees this list and
 * rebuilds it from the kernel's interface configuration on every scan.
 */
addrlist_t *localaddrs;
95 
96 /*
97  * Return the current time in milliseconds (from an arbitrary reference)
98  * truncated to fit into an int. Truncation is ok since we are interested
99  * only in differences and not the absolute values.
100  */
101 uint_t
102 getcurrenttime(void)
103 {
104 	uint_t	cur_time;	/* In ms */
105 
106 	/*
107 	 * Use of a non-user-adjustable source of time is
108 	 * required. However millisecond precision is sufficient.
109 	 * divide by 10^6
110 	 */
111 	cur_time = (uint_t)(gethrtime() / 1000000LL);
112 	return (cur_time);
113 }
114 
115 uint64_t
116 getcurrentsec(void)
117 {
118 	return (gethrtime() / NANOSEC);
119 }
120 
121 /*
122  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
123  */
124 int
125 poll_add(int fd)
126 {
127 	int i;
128 	int new_num;
129 	struct pollfd *newfds;
130 retry:
131 	/* Check if already present */
132 	for (i = 0; i < pollfd_num; i++) {
133 		if (pollfds[i].fd == fd)
134 			return (0);
135 	}
136 	/* Check for empty spot already present */
137 	for (i = 0; i < pollfd_num; i++) {
138 		if (pollfds[i].fd == -1) {
139 			pollfds[i].fd = fd;
140 			return (0);
141 		}
142 	}
143 
144 	/* Allocate space for 32 more fds and initialize to -1 */
145 	new_num = pollfd_num + 32;
146 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
147 	if (newfds == NULL) {
148 		logperror("poll_add: realloc");
149 		return (-1);
150 	}
151 	for (i = pollfd_num; i < new_num; i++) {
152 		newfds[i].fd = -1;
153 		newfds[i].events = POLLIN;
154 	}
155 	pollfd_num = new_num;
156 	pollfds = newfds;
157 	goto retry;
158 }
159 
160 /*
161  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
162  */
163 int
164 poll_remove(int fd)
165 {
166 	int i;
167 
168 	/* Check if already present */
169 	for (i = 0; i < pollfd_num; i++) {
170 		if (pollfds[i].fd == fd) {
171 			pollfds[i].fd = -1;
172 			return (0);
173 		}
174 	}
175 	return (-1);
176 }
177 
178 /*
179  * Extract information about the phyint instance. If the phyint instance still
180  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
181  * will use it to detect phyint instances that don't exist any longer and
182  * remove them, from our database of phyint instances.
183  * Return value:
184  *	returns true if the phyint instance exists in the kernel,
185  *	returns false otherwise
186  */
187 static boolean_t
188 pii_process(int af, char *name, struct phyint_instance **pii_p)
189 {
190 	int err;
191 	struct phyint_instance *pii;
192 	struct phyint_instance *pii_other;
193 
194 	if (debug & D_PHYINT)
195 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
196 
197 	pii = phyint_inst_lookup(af, name);
198 	if (pii == NULL) {
199 		/*
200 		 * Phyint instance does not exist in our tables,
201 		 * create new phyint instance
202 		 */
203 		pii = phyint_inst_init_from_k(af, name);
204 	} else {
205 		/* Phyint exists in our tables */
206 		err = phyint_inst_update_from_k(pii);
207 
208 		switch (err) {
209 		case PI_IOCTL_ERROR:
210 			/* Some ioctl error. don't change anything */
211 			pii->pii_in_use = 1;
212 			break;
213 
214 		case PI_GROUP_CHANGED:
215 		case PI_IFINDEX_CHANGED:
216 			/*
217 			 * Interface index or group membership has changed.
218 			 * Delete the old state and recreate based on the new
219 			 * state (it may no longer be in a group).
220 			 */
221 			pii_other = phyint_inst_other(pii);
222 			if (pii_other != NULL)
223 				phyint_inst_delete(pii_other);
224 			phyint_inst_delete(pii);
225 			pii = phyint_inst_init_from_k(af, name);
226 			break;
227 
228 		case PI_DELETED:
229 			/* Phyint instance has disappeared from kernel */
230 			pii->pii_in_use = 0;
231 			break;
232 
233 		case PI_OK:
234 			/* Phyint instance exists and is fine */
235 			pii->pii_in_use = 1;
236 			break;
237 
238 		default:
239 			/* Unknown status */
240 			logerr("pii_process: Unknown status %d\n", err);
241 			break;
242 		}
243 	}
244 
245 	*pii_p = pii;
246 	if (pii != NULL)
247 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
248 	else
249 		return (_B_FALSE);
250 }
251 
252 /*
253  * Scan all interfaces to detect changes as well as new and deleted interfaces
254  */
255 static void
256 initifs()
257 {
258 	int	i, nlifr;
259 	int	af;
260 	char	*cp;
261 	char	*buf;
262 	int	sockfd;
263 	uint64_t	flags;
264 	struct lifnum	lifn;
265 	struct lifconf	lifc;
266 	struct lifreq	lifreq;
267 	struct lifreq	*lifr;
268 	struct logint	*li;
269 	struct phyint_instance *pii;
270 	struct phyint_instance *next_pii;
271 	struct phyint_group *pg, *next_pg;
272 	char		pi_name[LIFNAMSIZ + 1];
273 
274 	if (debug & D_PHYINT)
275 		logdebug("initifs: Scanning interfaces\n");
276 
277 	last_initifs_time = getcurrenttime();
278 
279 	/*
280 	 * Free the existing local address list; we'll build a new list below.
281 	 */
282 	addrlist_free(&localaddrs);
283 
284 	/*
285 	 * Mark the interfaces so that we can find phyints and logints
286 	 * which have disappeared from the kernel. pii_process() and
287 	 * logint_init_from_k() will set {pii,li}_in_use when they find
288 	 * the interface in the kernel. Also, clear dupaddr bit on probe
289 	 * logint. check_addr_unique() will set the dupaddr bit on the
290 	 * probe logint, if the testaddress is not unique.
291 	 */
292 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
293 		pii->pii_in_use = 0;
294 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
295 			li->li_in_use = 0;
296 			if (pii->pii_probe_logint == li)
297 				li->li_dupaddr = 0;
298 		}
299 	}
300 
301 	/*
302 	 * As above, mark groups so that we can detect IPMP interfaces which
303 	 * have been removed from the kernel.  Also, delete the group address
304 	 * list since we'll iteratively recreate it below.
305 	 */
306 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
307 		pg->pg_in_use = _B_FALSE;
308 		addrlist_free(&pg->pg_addrs);
309 	}
310 
311 	lifn.lifn_family = AF_UNSPEC;
312 	lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
313 again:
314 	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
315 		logperror("initifs: ioctl (get interface count)");
316 		return;
317 	}
318 	/*
319 	 * Pad the interface count to detect when additional interfaces have
320 	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
321 	 */
322 	lifn.lifn_count += 4;
323 
324 	if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
325 		logperror("initifs: calloc");
326 		return;
327 	}
328 
329 	lifc.lifc_family = AF_UNSPEC;
330 	lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
331 	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
332 	lifc.lifc_buf = buf;
333 
334 	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
335 		logperror("initifs: ioctl (get interface configuration)");
336 		free(buf);
337 		return;
338 	}
339 
340 	/*
341 	 * If every lifr_req slot is taken, then additional interfaces must
342 	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
343 	 * Recalculate to make sure we didn't miss any interfaces.
344 	 */
345 	nlifr = lifc.lifc_len / sizeof (struct lifreq);
346 	if (nlifr >= lifn.lifn_count) {
347 		free(buf);
348 		goto again;
349 	}
350 
351 	/*
352 	 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the
353 	 * global list of addresses, phyint groups, phyints, and logints.
354 	 */
355 	for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
356 		af = lifr->lifr_addr.ss_family;
357 		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
358 		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
359 
360 		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
361 			if (errno != ENXIO)
362 				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
363 			continue;
364 		}
365 		flags = lifreq.lifr_flags;
366 
367 		/*
368 		 * If the address is IFF_UP, add it to the local address list.
369 		 * (We ignore addresses that aren't IFF_UP since another node
370 		 * might legitimately have that address IFF_UP.)
371 		 */
372 		if (flags & IFF_UP) {
373 			(void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
374 			    &lifr->lifr_addr);
375 		}
376 
377 		/*
378 		 * If this address is on an IPMP meta-interface, update our
379 		 * phyint_group information (either by recording that group
380 		 * still exists or creating a new group), and track what
381 		 * group the address is part of.
382 		 */
383 		if (flags & IFF_IPMP) {
384 			if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
385 				if (errno != ENXIO)
386 					logperror("initifs: ioctl "
387 					    "(SIOCGLIFGROUPNAME)");
388 				continue;
389 			}
390 
391 			pg = phyint_group_lookup(lifreq.lifr_groupname);
392 			if (pg == NULL) {
393 				pg = phyint_group_create(lifreq.lifr_groupname);
394 				if (pg == NULL) {
395 					logerr("initifs: cannot create group "
396 					    "%s\n", lifreq.lifr_groupname);
397 					continue;
398 				}
399 				phyint_group_insert(pg);
400 			}
401 			pg->pg_in_use = _B_TRUE;
402 
403 			/*
404 			 * Add this to the group's list of data addresses.
405 			 */
406 			if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
407 			    &lifr->lifr_addr)) {
408 				logerr("initifs: insufficient memory to track "
409 				    "data address information for %s\n",
410 				    lifr->lifr_name);
411 			}
412 			continue;
413 		}
414 
415 		/*
416 		 * This isn't an address on an IPMP meta-interface, so it's
417 		 * either on an underlying interface or not related to any
418 		 * group.  Update our phyint and logint information (via
419 		 * pii_process() and logint_init_from_k()) -- but first,
420 		 * convert the logint name to a phyint name so we can call
421 		 * pii_process().
422 		 */
423 		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
424 		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
425 			*cp = '\0';
426 
427 		if (pii_process(af, pi_name, &pii)) {
428 			/* The phyint is fine. So process the logint */
429 			logint_init_from_k(pii, lifr->lifr_name);
430 			check_addr_unique(pii, &lifr->lifr_addr);
431 		}
432 	}
433 	free(buf);
434 
435 	/*
436 	 * Scan for groups, phyints and logints that have disappeared from the
437 	 * kernel, and delete them.
438 	 */
439 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
440 		next_pii = pii->pii_next;
441 		check_if_removed(pii);
442 	}
443 
444 	for (pg = phyint_groups; pg != NULL; pg = next_pg) {
445 		next_pg = pg->pg_next;
446 		if (!pg->pg_in_use) {
447 			phyint_group_delete(pg);
448 			continue;
449 		}
450 		/*
451 		 * Refresh the group's state.  This is necessary since the
452 		 * group's state is defined by the set of usable interfaces in
453 		 * the group, and an interface is considered unusable if all
454 		 * of its addresses are down.  When an address goes down/up,
455 		 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
456 		 */
457 		phyint_group_refresh_state(pg);
458 	}
459 
460 	/*
461 	 * Select a test address for sending probes on each phyint instance
462 	 */
463 	select_test_ifs();
464 
465 	/*
466 	 * Handle link up/down notifications.
467 	 */
468 	process_link_state_changes();
469 }
470 
471 /*
472  * Check that a given test address is unique across all of the interfaces in a
473  * group.  (e.g., IPv6 link-locals may not be inherently unique, and binding
474  * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
475  * Any issues will be reported by check_testconfig().
476  */
477 static void
478 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
479 {
480 	struct phyint		*pi;
481 	struct phyint_group	*pg;
482 	struct in6_addr		addr;
483 	struct phyint_instance	*pii;
484 	struct sockaddr_in	*sin;
485 
486 	if (ss->ss_family == AF_INET) {
487 		sin = (struct sockaddr_in *)ss;
488 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
489 	} else {
490 		assert(ss->ss_family == AF_INET6);
491 		addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
492 	}
493 
494 	/*
495 	 * For anonymous groups, every interface is assumed to be on its own
496 	 * link, so there is no chance of overlapping addresses.
497 	 */
498 	pg = ourpii->pii_phyint->pi_group;
499 	if (pg == phyint_anongroup)
500 		return;
501 
502 	/*
503 	 * Walk the list of phyint instances in the group and check for test
504 	 * addresses matching ours.  Of course, we skip ourself.
505 	 */
506 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
507 		pii = PHYINT_INSTANCE(pi, ss->ss_family);
508 		if (pii == NULL || pii == ourpii ||
509 		    pii->pii_probe_logint == NULL)
510 			continue;
511 
512 		/*
513 		 * If this test address is not unique, set the dupaddr bit.
514 		 */
515 		if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
516 			pii->pii_probe_logint->li_dupaddr = 1;
517 	}
518 }
519 
520 /*
521  * Stop probing an interface.  Called when an interface is offlined.
522  * The probe socket is closed on each interface instance, and the
523  * interface state set to PI_OFFLINE.
524  */
525 void
526 stop_probing(struct phyint *pi)
527 {
528 	struct phyint_instance *pii;
529 
530 	pii = pi->pi_v4;
531 	if (pii != NULL) {
532 		if (pii->pii_probe_sock != -1)
533 			close_probe_socket(pii, _B_TRUE);
534 		pii->pii_probe_logint = NULL;
535 	}
536 
537 	pii = pi->pi_v6;
538 	if (pii != NULL) {
539 		if (pii->pii_probe_sock != -1)
540 			close_probe_socket(pii, _B_TRUE);
541 		pii->pii_probe_logint = NULL;
542 	}
543 
544 	phyint_chstate(pi, PI_OFFLINE);
545 }
546 
547 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
548 
549 /*
550  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
551  * IFF_UP must also be set so that the associated address can be used as a
552  * source address.  Further, we must be able to exchange packets with local
553  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
554  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
555  */
556 static int
557 rate_testflags(uint64_t flags)
558 {
559 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
560 		return (BAD_TESTFLAGS);
561 
562 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
563 		return (BAD_TESTFLAGS);
564 
565 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
566 		return (BEST_TESTFLAGS);
567 
568 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
569 		return (BEST_TESTFLAGS);
570 
571 	return (OK_TESTFLAGS);
572 }
573 
574 /*
575  * Attempt to select a test address for each phyint instance.
576  * Call phyint_inst_sockinit() to complete the initializations.
577  */
578 static void
579 select_test_ifs(void)
580 {
581 	struct phyint		*pi;
582 	struct phyint_instance	*pii;
583 	struct phyint_instance	*next_pii;
584 	struct logint		*li;
585 	struct logint  		*probe_logint;
586 	boolean_t		target_scan_reqd = _B_FALSE;
587 	int			rating;
588 
589 	if (debug & D_PHYINT)
590 		logdebug("select_test_ifs\n");
591 
592 	/*
593 	 * For each phyint instance, do the test address selection
594 	 */
595 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
596 		next_pii = pii->pii_next;
597 		probe_logint = NULL;
598 
599 		/*
600 		 * An interface that is offline should not be probed.
601 		 * IFF_OFFLINE interfaces should always be PI_OFFLINE
602 		 * unless some other entity has set the offline flag.
603 		 */
604 		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
605 			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
606 				logerr("shouldn't be probing offline"
607 				    " interface %s (state is: %u)."
608 				    " Stopping probes.\n",
609 				    pii->pii_phyint->pi_name,
610 				    pii->pii_phyint->pi_state);
611 				stop_probing(pii->pii_phyint);
612 			}
613 			continue;
614 		} else {
615 			/*
616 			 * If something cleared IFF_OFFLINE (e.g., by accident
617 			 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
618 			 * inherently racy), the phyint may still be offline.
619 			 * Just ignore it.
620 			 */
621 			if (pii->pii_phyint->pi_state == PI_OFFLINE)
622 				continue;
623 		}
624 
625 		li = pii->pii_probe_logint;
626 		if (li != NULL) {
627 			/*
628 			 * We've already got a test address; only proceed
629 			 * if it's suboptimal.
630 			 */
631 			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
632 				continue;
633 		}
634 
635 		/*
636 		 * Walk the logints of this phyint instance, and select
637 		 * the best available test address
638 		 */
639 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
640 			/*
641 			 * Skip 0.0.0.0 addresses, as those are never
642 			 * actually usable.
643 			 */
644 			if (pii->pii_af == AF_INET &&
645 			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
646 				continue;
647 
648 			/*
649 			 * Skip any IPv6 logints that are not link-local,
650 			 * since we should always have a link-local address
651 			 * anyway and in6_data() expects link-local replies.
652 			 */
653 			if (pii->pii_af == AF_INET6 &&
654 			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
655 				continue;
656 
657 			/*
658 			 * Rate the testflags. If we've found an optimal
659 			 * match, then break out; otherwise, record the most
660 			 * recent OK one.
661 			 */
662 			rating = rate_testflags(li->li_flags);
663 			if (rating == BAD_TESTFLAGS)
664 				continue;
665 
666 			probe_logint = li;
667 			if (rating == BEST_TESTFLAGS)
668 				break;
669 		}
670 
671 		/*
672 		 * If the probe logint has changed, ditch the old one.
673 		 */
674 		if (pii->pii_probe_logint != NULL &&
675 		    pii->pii_probe_logint != probe_logint) {
676 			if (pii->pii_probe_sock != -1)
677 				close_probe_socket(pii, _B_TRUE);
678 			pii->pii_probe_logint = NULL;
679 		}
680 
681 		if (probe_logint == NULL) {
682 			/*
683 			 * We don't have a test address; zero out the probe
684 			 * stats array since it is no longer relevant.
685 			 * Optimize by checking if it is already zeroed out.
686 			 */
687 			int pr_ndx;
688 
689 			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
690 			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
691 				clear_pii_probe_stats(pii);
692 				reset_crtt_all(pii->pii_phyint);
693 			}
694 			continue;
695 		} else if (probe_logint == pii->pii_probe_logint) {
696 			/*
697 			 * If we didn't find any new test addr, go to the
698 			 * next phyint.
699 			 */
700 			continue;
701 		}
702 
703 		/*
704 		 * The phyint is either being assigned a new testaddr
705 		 * or is being assigned a testaddr for the 1st time.
706 		 * Need to initialize the phyint socket
707 		 */
708 		pii->pii_probe_logint = probe_logint;
709 		if (!phyint_inst_sockinit(pii)) {
710 			if (debug & D_PHYINT) {
711 				logdebug("select_test_ifs: "
712 				    "phyint_sockinit failed\n");
713 			}
714 			phyint_inst_delete(pii);
715 			continue;
716 		}
717 
718 		/*
719 		 * This phyint instance is now enabled for probes; this
720 		 * impacts our state machine in two ways:
721 		 *
722 		 * 1. If we're probe *capable* as well (i.e., we have
723 		 *    probe targets) and the interface is in PI_NOTARGETS,
724 		 *    then transition to PI_RUNNING.
725 		 *
726 		 * 2. If we're not probe capable, and the other phyint
727 		 *    instance is also not probe capable, and we were in
728 		 *    PI_RUNNING, then transition to PI_NOTARGETS.
729 		 *
730 		 * Also see the state diagram in mpd_probe.c.
731 		 */
732 		if (PROBE_CAPABLE(pii)) {
733 			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
734 				phyint_chstate(pii->pii_phyint, PI_RUNNING);
735 		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
736 			if (pii->pii_phyint->pi_state == PI_RUNNING)
737 				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
738 		}
739 
740 		/*
741 		 * If no targets are currently known for this phyint
742 		 * we need to call init_router_targets. Since
743 		 * init_router_targets() initializes the list of targets
744 		 * for all phyints it is done below the loop.
745 		 */
746 		if (pii->pii_targets == NULL)
747 			target_scan_reqd = _B_TRUE;
748 
749 		/*
750 		 * Start the probe timer for this instance.
751 		 */
752 		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
753 			start_timer(pii);
754 			pii->pii_basetime_inited = 1;
755 		}
756 	}
757 
758 	/*
759 	 * Scan the interface list for any interfaces that are PI_FAILED or
760 	 * PI_NOTARGETS but no longer enabled to send probes, and call
761 	 * phyint_check_for_repair() to see if the link state indicates that
762 	 * the interface should be repaired.  Also see the state diagram in
763 	 * mpd_probe.c.
764 	 */
765 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
766 		if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
767 		    (pi->pi_state == PI_FAILED ||
768 		    pi->pi_state == PI_NOTARGETS)) {
769 			phyint_check_for_repair(pi);
770 		}
771 	}
772 
773 	check_testconfig();
774 
775 	/*
776 	 * Try to populate the target list. init_router_targets populates
777 	 * the target list from the routing table. If our target list is
778 	 * still empty, init_host_targets adds host targets based on the
779 	 * host target list of other phyints in the group.
780 	 */
781 	if (target_scan_reqd) {
782 		init_router_targets();
783 		init_host_targets();
784 	}
785 }
786 
787 /*
788  * Check test address configuration, and log notices/errors if appropriate.
789  * Note that this function only logs pre-existing conditions (e.g., that
790  * probe-based failure detection is disabled).
791  */
792 static void
793 check_testconfig(void)
794 {
795 	struct phyint	*pi;
796 	struct logint  	*li;
797 	char		abuf[INET6_ADDRSTRLEN];
798 	int		pri;
799 
800 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
801 		if (pi->pi_flags & IFF_OFFLINE)
802 			continue;
803 
804 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
805 			if (pi->pi_taddrmsg_printed ||
806 			    pi->pi_duptaddrmsg_printed) {
807 				if (pi->pi_duptaddrmsg_printed)
808 					pri = LOG_ERR;
809 				else
810 					pri = LOG_INFO;
811 				logmsg(pri, "Test address now configured on "
812 				    "interface %s; enabling probe-based "
813 				    "failure detection on it\n", pi->pi_name);
814 				pi->pi_taddrmsg_printed = 0;
815 				pi->pi_duptaddrmsg_printed = 0;
816 			}
817 			continue;
818 		}
819 
820 		li = NULL;
821 		if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
822 		    pi->pi_v4->pii_probe_logint->li_dupaddr)
823 			li = pi->pi_v4->pii_probe_logint;
824 
825 		if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
826 		    pi->pi_v6->pii_probe_logint->li_dupaddr)
827 			li = pi->pi_v6->pii_probe_logint;
828 
829 		if (li != NULL && li->li_dupaddr) {
830 			if (pi->pi_duptaddrmsg_printed)
831 				continue;
832 			logerr("Test address %s is not unique in group; "
833 			    "disabling probe-based failure detection on %s\n",
834 			    pr_addr(li->li_phyint_inst->pii_af,
835 			    li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
836 			pi->pi_duptaddrmsg_printed = 1;
837 			continue;
838 		}
839 
840 		if (getcurrentsec() < pi->pi_taddrthresh)
841 			continue;
842 
843 		if (!pi->pi_taddrmsg_printed) {
844 			logtrace("No test address configured on interface %s; "
845 			    "disabling probe-based failure detection on it\n",
846 			    pi->pi_name);
847 			pi->pi_taddrmsg_printed = 1;
848 		}
849 	}
850 }
851 
852 /*
853  * Check phyint group configuration, to detect any inconsistencies,
854  * and log an error message. This is called from runtimeouts every
855  * 20 secs. But the error message is displayed once. If the
856  * consistency is resolved by the admin, a recovery message is displayed
857  * once.
858  */
859 static void
860 check_config(void)
861 {
862 	struct phyint_group *pg;
863 	struct phyint *pi;
864 	boolean_t v4_in_group;
865 	boolean_t v6_in_group;
866 
867 	/*
868 	 * All phyints of a group must be homogeneous to ensure that they can
869 	 * take over for one another.  If any phyint in a group has IPv4
870 	 * plumbed, check that all phyints have IPv4 plumbed.  Do a similar
871 	 * check for IPv6.
872 	 */
873 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
874 		if (pg == phyint_anongroup)
875 			continue;
876 
877 		v4_in_group = _B_FALSE;
878 		v6_in_group = _B_FALSE;
879 		/*
880 		 * 1st pass. Determine if at least 1 phyint in the group
881 		 * has IPv4 plumbed and if so set v4_in_group to true.
882 		 * Repeat similarly for IPv6.
883 		 */
884 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
885 			if (pi->pi_v4 != NULL)
886 				v4_in_group = _B_TRUE;
887 			if (pi->pi_v6 != NULL)
888 				v6_in_group = _B_TRUE;
889 		}
890 
891 		/*
892 		 * 2nd pass. If v4_in_group is true, check that phyint
893 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
894 		 * out a message the 1st time only.
895 		 */
896 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
897 			if (pi->pi_flags & IFF_OFFLINE)
898 				continue;
899 
900 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
901 				if (!pi->pi_cfgmsg_printed) {
902 					logerr("IP interface %s in group %s is"
903 					    " not plumbed for IPv4, affecting"
904 					    " IPv4 connectivity\n",
905 					    pi->pi_name,
906 					    pi->pi_group->pg_name);
907 					pi->pi_cfgmsg_printed = 1;
908 				}
909 			} else if (v6_in_group == _B_TRUE &&
910 			    pi->pi_v6 == NULL) {
911 				if (!pi->pi_cfgmsg_printed) {
912 					logerr("IP interface %s in group %s is"
913 					    " not plumbed for IPv6, affecting"
914 					    " IPv6 connectivity\n",
915 					    pi->pi_name,
916 					    pi->pi_group->pg_name);
917 					pi->pi_cfgmsg_printed = 1;
918 				}
919 			} else {
920 				/*
921 				 * The phyint matches the group configuration,
922 				 * if we have reached this point. If it was
923 				 * improperly configured earlier, log an
924 				 * error recovery message
925 				 */
926 				if (pi->pi_cfgmsg_printed) {
927 					logerr("IP interface %s is now"
928 					    " consistent with group %s "
929 					    " and connectivity is restored\n",
930 					    pi->pi_name, pi->pi_group->pg_name);
931 					pi->pi_cfgmsg_printed = 0;
932 				}
933 			}
934 
935 		}
936 	}
937 }
938 
/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * apart than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum
 * value that can be passed for the delay parameter of timer_schedule().
 */
static uint_t timer_next;	/* Currently scheduled timeout, in */
				/* getcurrenttime() milliseconds */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred; */
				/* set by timer_schedule() on arming */
950 
951 static void
952 timer_init(void)
953 {
954 	timer_next = getcurrenttime() + TIMER_INFINITY;
955 	/*
956 	 * The call to run_timeouts() will get the timer started
957 	 * Since there are no phyints at this point, the timer will
958 	 * be set for IF_SCAN_INTERVAL ms.
959 	 */
960 	run_timeouts();
961 }
962 
963 /*
964  * Make sure the next SIGALRM occurs delay milliseconds from the current
965  * time if not earlier. We are interested only in time differences.
966  */
967 void
968 timer_schedule(uint_t delay)
969 {
970 	uint_t now;
971 	struct itimerval itimerval;
972 
973 	if (debug & D_TIMER)
974 		logdebug("timer_schedule(%u)\n", delay);
975 
976 	assert(delay <= TIMER_INFINITY);
977 
978 	now = getcurrenttime();
979 	if (delay == 0) {
980 		/* Minimum allowed delay */
981 		delay = 1;
982 	}
983 	/* Will this timer occur before the currently scheduled SIGALRM? */
984 	if (timer_active && TIME_GE(now + delay, timer_next)) {
985 		if (debug & D_TIMER) {
986 			logdebug("timer_schedule(%u) - no action: "
987 			    "now %u next %u\n", delay, now, timer_next);
988 		}
989 		return;
990 	}
991 	timer_next = now + delay;
992 
993 	itimerval.it_value.tv_sec = delay / 1000;
994 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
995 	itimerval.it_interval.tv_sec = 0;
996 	itimerval.it_interval.tv_usec = 0;
997 	if (debug & D_TIMER) {
998 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
999 		    delay, itimerval.it_value.tv_sec,
1000 		    itimerval.it_value.tv_usec);
1001 	}
1002 	timer_active = _B_TRUE;
1003 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1004 		logperror("timer_schedule: setitimer");
1005 		exit(2);
1006 	}
1007 }
1008 
1009 static void
1010 timer_cancel(void)
1011 {
1012 	struct itimerval itimerval;
1013 
1014 	if (debug & D_TIMER)
1015 		logdebug("timer_cancel()\n");
1016 
1017 	bzero(&itimerval, sizeof (itimerval));
1018 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0)
1019 		logperror("timer_cancel: setitimer");
1020 }
1021 
1022 /*
1023  * Timer has fired. Determine when the next timer event will occur by asking
1024  * all the timer routines. Should not be called from a timer routine.
1025  */
1026 static void
1027 run_timeouts(void)
1028 {
1029 	uint_t next;
1030 	uint_t next_event_time;
1031 	struct phyint_instance *pii;
1032 	struct phyint_instance *next_pii;
1033 	static boolean_t timeout_running;
1034 
1035 	/* assert that recursive timeouts don't happen. */
1036 	assert(!timeout_running);
1037 
1038 	timeout_running = _B_TRUE;
1039 
1040 	if (debug & D_TIMER)
1041 		logdebug("run_timeouts()\n");
1042 
1043 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1044 		initifs();
1045 		check_config();
1046 	}
1047 
1048 	next = TIMER_INFINITY;
1049 
1050 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1051 		next_pii = pii->pii_next;
1052 		next_event_time = phyint_inst_timer(pii);
1053 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1054 			next = next_event_time;
1055 
1056 		if (debug & D_TIMER) {
1057 			logdebug("run_timeouts(%s %s): next scheduled for"
1058 			    " this phyint inst %u, next scheduled global"
1059 			    " %u ms\n",
1060 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1061 			    next_event_time, next);
1062 		}
1063 	}
1064 
1065 	/*
1066 	 * Make sure initifs() is called at least once every
1067 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1068 	 * with the kernel, in case we have missed any routing
1069 	 * socket messages.
1070 	 */
1071 	if (next > IF_SCAN_INTERVAL)
1072 		next = IF_SCAN_INTERVAL;
1073 
1074 	if (debug & D_TIMER)
1075 		logdebug("run_timeouts: %u ms\n", next);
1076 
1077 	timer_schedule(next);
1078 	timeout_running = _B_FALSE;
1079 }
1080 
1081 static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1082 static int eventpipe_write = -1;
1083 boolean_t cleanup_started = _B_FALSE;	/* true if we're going away */
1084 
1085 /*
1086  * Ensure that signals are processed synchronously with the rest of
1087  * the code by just writing a one character signal number on the pipe.
1088  * The poll loop will pick this up and process the signal event.
1089  */
1090 static void
1091 sig_handler(int signo)
1092 {
1093 	uchar_t buf = (uchar_t)signo;
1094 
1095 	/*
1096 	 * Don't write to pipe if cleanup has already begun. cleanup()
1097 	 * might have closed the pipe already
1098 	 */
1099 	if (cleanup_started)
1100 		return;
1101 
1102 	if (eventpipe_write == -1) {
1103 		logerr("sig_handler: no pipe found\n");
1104 		return;
1105 	}
1106 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1107 		logperror("sig_handler: write");
1108 }
1109 
1110 extern struct probes_missed probes_missed;
1111 
1112 /*
1113  * Pick up a signal "byte" from the pipe and process it.
1114  */
1115 static void
1116 in_signal(int fd)
1117 {
1118 	uchar_t buf;
1119 	uint64_t  sent, acked, lost, unacked, unknown;
1120 	struct phyint_instance *pii;
1121 	int pr_ndx;
1122 
1123 	switch (read(fd, &buf, sizeof (buf))) {
1124 	case -1:
1125 		logperror("in_signal: read");
1126 		exit(1);
1127 		/* NOTREACHED */
1128 	case 1:
1129 		break;
1130 	case 0:
1131 		logerr("in_signal: read end of file\n");
1132 		exit(1);
1133 		/* NOTREACHED */
1134 	default:
1135 		logerr("in_signal: read > 1\n");
1136 		exit(1);
1137 	}
1138 
1139 	if (debug & D_TIMER)
1140 		logdebug("in_signal() got %d\n", buf);
1141 
1142 	switch (buf) {
1143 	case SIGALRM:
1144 		if (debug & D_TIMER) {
1145 			uint_t now = getcurrenttime();
1146 
1147 			logdebug("in_signal(SIGALRM) delta %u\n",
1148 			    now - timer_next);
1149 		}
1150 		timer_active = _B_FALSE;
1151 		run_timeouts();
1152 		break;
1153 	case SIGUSR1:
1154 		logdebug("Printing configuration:\n");
1155 		/* Print out the internal tables */
1156 		phyint_inst_print_all();
1157 
1158 		/*
1159 		 * Print out the accumulated statistics about missed
1160 		 * probes (happens due to scheduling delay).
1161 		 */
1162 		logerr("Missed sending total of %d probes spread over"
1163 		    " %d occurrences\n", probes_missed.pm_nprobes,
1164 		    probes_missed.pm_ntimes);
1165 
1166 		/*
1167 		 * Print out the accumulated statistics about probes
1168 		 * that were sent.
1169 		 */
1170 		for (pii = phyint_instances; pii != NULL;
1171 		    pii = pii->pii_next) {
1172 			unacked = 0;
1173 			acked = pii->pii_cum_stats.acked;
1174 			lost = pii->pii_cum_stats.lost;
1175 			sent = pii->pii_cum_stats.sent;
1176 			unknown = pii->pii_cum_stats.unknown;
1177 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1178 				switch (pii->pii_probes[pr_ndx].pr_status) {
1179 				case PR_ACKED:
1180 					acked++;
1181 					break;
1182 				case PR_LOST:
1183 					lost++;
1184 					break;
1185 				case PR_UNACKED:
1186 					unacked++;
1187 					break;
1188 				}
1189 			}
1190 			logerr("\nProbe stats on (%s %s)\n"
1191 			    "Number of probes sent %lld\n"
1192 			    "Number of probe acks received %lld\n"
1193 			    "Number of probes/acks lost %lld\n"
1194 			    "Number of valid unacknowledged probes %lld\n"
1195 			    "Number of ambiguous probe acks received %lld\n",
1196 			    AF_STR(pii->pii_af), pii->pii_name,
1197 			    sent, acked, lost, unacked, unknown);
1198 		}
1199 		break;
1200 	case SIGHUP:
1201 		logerr("SIGHUP: restart and reread config file\n");
1202 		/*
1203 		 * Cancel the interval timer.  Needed since setitimer() uses
1204 		 * alarm() and the time left is inherited across exec(), and
1205 		 * thus the SIGALRM may be delivered before a handler has been
1206 		 * setup, causing in.mpathd to erroneously exit.
1207 		 */
1208 		timer_cancel();
1209 		cleanup();
1210 		(void) execv(argv0[0], argv0);
1211 		_exit(0177);
1212 		/* NOTREACHED */
1213 	case SIGINT:
1214 	case SIGTERM:
1215 	case SIGQUIT:
1216 		cleanup();
1217 		exit(0);
1218 		/* NOTREACHED */
1219 	default:
1220 		logerr("in_signal: unknown signal: %d\n", buf);
1221 	}
1222 }
1223 
1224 static void
1225 cleanup(void)
1226 {
1227 	struct phyint_instance *pii;
1228 	struct phyint_instance *next_pii;
1229 
1230 	/*
1231 	 * Make sure that we don't write to eventpipe in
1232 	 * sig_handler() if any signal notably SIGALRM,
1233 	 * occurs after we close the eventpipe descriptor below
1234 	 */
1235 	cleanup_started = _B_TRUE;
1236 
1237 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1238 		next_pii = pii->pii_next;
1239 		phyint_inst_delete(pii);
1240 	}
1241 
1242 	(void) close(ifsock_v4);
1243 	(void) close(ifsock_v6);
1244 	(void) close(rtsock_v4);
1245 	(void) close(rtsock_v6);
1246 	(void) close(lsock_v4);
1247 	(void) close(lsock_v6);
1248 	(void) close(0);
1249 	(void) close(1);
1250 	(void) close(2);
1251 	(void) close(mibfd);
1252 	(void) close(eventpipe_read);
1253 	(void) close(eventpipe_write);
1254 }
1255 
1256 /*
1257  * Create pipe for signal delivery and set up signal handlers.
1258  */
1259 static void
1260 setup_eventpipe(void)
1261 {
1262 	int fds[2];
1263 	struct sigaction act;
1264 
1265 	if ((pipe(fds)) < 0) {
1266 		logperror("setup_eventpipe: pipe");
1267 		exit(1);
1268 	}
1269 	eventpipe_read = fds[0];
1270 	eventpipe_write = fds[1];
1271 	if (poll_add(eventpipe_read) == -1) {
1272 		exit(1);
1273 	}
1274 
1275 	act.sa_handler = sig_handler;
1276 	act.sa_flags = SA_RESTART;
1277 	(void) sigaction(SIGALRM, &act, NULL);
1278 
1279 	(void) sigset(SIGHUP, sig_handler);
1280 	(void) sigset(SIGUSR1, sig_handler);
1281 	(void) sigset(SIGTERM, sig_handler);
1282 	(void) sigset(SIGINT, sig_handler);
1283 	(void) sigset(SIGQUIT, sig_handler);
1284 }
1285 
1286 /*
1287  * Create a routing socket for receiving RTM_IFINFO messages.
1288  */
1289 static int
1290 setup_rtsock(int af)
1291 {
1292 	int	s;
1293 	int	flags;
1294 	int	aware = RTAW_UNDER_IPMP;
1295 
1296 	s = socket(PF_ROUTE, SOCK_RAW, af);
1297 	if (s == -1) {
1298 		logperror("setup_rtsock: socket PF_ROUTE");
1299 		exit(1);
1300 	}
1301 
1302 	if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
1303 		logperror("setup_rtsock: setsockopt RT_AWARE");
1304 		(void) close(s);
1305 		exit(1);
1306 	}
1307 
1308 	if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
1309 		logperror("setup_rtsock: fcntl F_GETFL");
1310 		(void) close(s);
1311 		exit(1);
1312 	}
1313 	if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) {
1314 		logperror("setup_rtsock: fcntl F_SETFL");
1315 		(void) close(s);
1316 		exit(1);
1317 	}
1318 	if (poll_add(s) == -1) {
1319 		(void) close(s);
1320 		exit(1);
1321 	}
1322 	return (s);
1323 }
1324 
1325 /*
1326  * Process an RTM_IFINFO message received on a routing socket.
1327  * The return value indicates whether a full interface scan is required.
1328  * Link up/down notifications are reflected in the IFF_RUNNING flag.
1329  * If just the state of the IFF_RUNNING interface flag has changed, a
1330  * a full interface scan isn't required.
1331  */
1332 static boolean_t
1333 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1334 {
1335 	struct sockaddr_dl *sdl;
1336 	struct phyint *pi;
1337 	uint64_t old_flags;
1338 	struct phyint_instance *pii;
1339 
1340 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1341 
1342 	/*
1343 	 * Although the sockaddr_dl structure is directly after the
1344 	 * if_msghdr_t structure. At the time of writing, the size of the
1345 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1346 	 * to the presence of a timeval structure, which contains longs,
1347 	 * in the if_data structure.  Anyway, we know where the message ends,
1348 	 * so we work backwards to get the start of the sockaddr_dl structure.
1349 	 */
1350 	/*LINTED*/
1351 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1352 	    sizeof (struct sockaddr_dl));
1353 
1354 	assert(sdl->sdl_family == AF_LINK);
1355 
1356 	/*
1357 	 * The interface name is in sdl_data.
1358 	 * RTM_IFINFO messages are only generated for logical interface
1359 	 * zero, so there is no colon and logical interface number to
1360 	 * strip from the name.	 The name is not null terminated, but
1361 	 * there should be enough space in sdl_data to add the null.
1362 	 */
1363 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1364 		if (debug & D_LINKNOTE)
1365 			logdebug("process_rtm_ifinfo: phyint name too long\n");
1366 		return (_B_TRUE);
1367 	}
1368 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1369 
1370 	pi = phyint_lookup(sdl->sdl_data);
1371 	if (pi == NULL) {
1372 		if (debug & D_LINKNOTE)
1373 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1374 			    " for %s\n", sdl->sdl_data);
1375 		return (_B_TRUE);
1376 	}
1377 
1378 	/*
1379 	 * We want to try and avoid doing a full interface scan for
1380 	 * link state notifications from the datalink layer, as indicated
1381 	 * by the state of the IFF_RUNNING flag.  If just the
1382 	 * IFF_RUNNING flag has changed state, the link state changes
1383 	 * are processed without a full scan.
1384 	 * If there is both an IPv4 and IPv6 instance associated with
1385 	 * the physical interface, we will get an RTM_IFINFO message
1386 	 * for each instance.  If we just maintained a single copy of
1387 	 * the physical interface flags, it would appear that no flags
1388 	 * had changed when the second message is processed, leading us
1389 	 * to believe that the message wasn't generated by a flags change,
1390 	 * and that a full interface scan is required.
1391 	 * To get around this problem, two additional copies of the flags
1392 	 * are kept, one copy for each instance.  These are only used in
1393 	 * this routine.  At any one time, all three copies of the flags
1394 	 * should be identical except for the IFF_RUNNING flag.	 The
1395 	 * copy of the flags in the "phyint" structure is always up to
1396 	 * date.
1397 	 */
1398 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1399 	if (pii == NULL) {
1400 		if (debug & D_LINKNOTE)
1401 			logdebug("process_rtm_ifinfo: no instance of address "
1402 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1403 		return (_B_TRUE);
1404 	}
1405 
1406 	old_flags = pii->pii_flags;
1407 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1408 	pi->pi_flags = pii->pii_flags;
1409 
1410 	if (debug & D_LINKNOTE) {
1411 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1412 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1413 		    AF_STR(type), old_flags, pi->pi_flags);
1414 	}
1415 
1416 	/*
1417 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1418 	 * types.
1419 	 */
1420 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
1421 		phyint_changed(pi);
1422 
1423 	/* Has just the IFF_RUNNING flag changed state ? */
1424 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1425 		struct phyint_instance *pii_other;
1426 		/*
1427 		 * It wasn't just a link state change.	Update
1428 		 * the other instance's copy of the flags.
1429 		 */
1430 		pii_other = phyint_inst_other(pii);
1431 		if (pii_other != NULL)
1432 			pii_other->pii_flags = pii->pii_flags;
1433 		return (_B_TRUE);
1434 	}
1435 
1436 	return (_B_FALSE);
1437 }
1438 
1439 /*
1440  * Retrieve as many routing socket messages as possible, and try to
1441  * empty the routing sockets. Initiate full scan of targets or interfaces
1442  * as needed.
1443  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1444  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1445  */
1446 static void
1447 process_rtsock(int rtsock_v4, int rtsock_v6)
1448 {
1449 	int	nbytes;
1450 	int64_t msg[2048 / 8];
1451 	struct rt_msghdr *rtm;
1452 	boolean_t need_if_scan = _B_FALSE;
1453 	boolean_t need_rt_scan = _B_FALSE;
1454 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1455 	int type;
1456 
1457 	/* Read as many messages as possible and try to empty the sockets */
1458 	for (type = AF_INET; ; type = AF_INET6) {
1459 		for (;;) {
1460 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1461 			    rtsock_v6, msg, sizeof (msg));
1462 			if (nbytes <= 0) {
1463 				/* No more messages */
1464 				break;
1465 			}
1466 			rtm = (struct rt_msghdr *)msg;
1467 			if (rtm->rtm_version != RTM_VERSION) {
1468 				logerr("process_rtsock: version %d "
1469 				    "not understood\n", rtm->rtm_version);
1470 				break;
1471 			}
1472 
1473 			if (debug & D_PHYINT) {
1474 				logdebug("process_rtsock: message %d\n",
1475 				    rtm->rtm_type);
1476 			}
1477 
1478 			switch (rtm->rtm_type) {
1479 			case RTM_NEWADDR:
1480 			case RTM_DELADDR:
1481 				/*
1482 				 * Some logical interface has changed,
1483 				 * have to scan everything to determine
1484 				 * what actually changed.
1485 				 */
1486 				need_if_scan = _B_TRUE;
1487 				break;
1488 
1489 			case RTM_IFINFO:
1490 				rtm_ifinfo_seen = _B_TRUE;
1491 				need_if_scan |= process_rtm_ifinfo(
1492 				    (if_msghdr_t *)rtm, type);
1493 				break;
1494 
1495 			case RTM_ADD:
1496 			case RTM_DELETE:
1497 			case RTM_CHANGE:
1498 			case RTM_OLDADD:
1499 			case RTM_OLDDEL:
1500 				need_rt_scan = _B_TRUE;
1501 				break;
1502 
1503 			default:
1504 				/* Not interesting */
1505 				break;
1506 			}
1507 		}
1508 		if (type == AF_INET6)
1509 			break;
1510 	}
1511 
1512 	if (need_if_scan) {
1513 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1514 			logdebug("process_rtsock: synchronizing with kernel\n");
1515 		initifs();
1516 	} else if (rtm_ifinfo_seen) {
1517 		if (debug & D_LINKNOTE)
1518 			logdebug("process_rtsock: "
1519 			    "link up/down notification(s) seen\n");
1520 		process_link_state_changes();
1521 	}
1522 
1523 	if (need_rt_scan)
1524 		init_router_targets();
1525 }
1526 
1527 /*
1528  * Look if the phyint instance or one of its logints have been removed from
1529  * the kernel and take appropriate action.
1530  * Uses {pii,li}_in_use.
1531  */
1532 static void
1533 check_if_removed(struct phyint_instance *pii)
1534 {
1535 	struct logint *li;
1536 	struct logint *next_li;
1537 
1538 	/* Detect phyints that have been removed from the kernel. */
1539 	if (!pii->pii_in_use) {
1540 		logtrace("%s %s has been removed from kernel\n",
1541 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1542 		phyint_inst_delete(pii);
1543 	} else {
1544 		/* Detect logints that have been removed. */
1545 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1546 			next_li = li->li_next;
1547 			if (!li->li_in_use) {
1548 				logint_delete(li);
1549 			}
1550 		}
1551 	}
1552 }
1553 
1554 /*
1555  * Parse the supplied mib2 information to extract the routing information
1556  * table. Process the routing table to get the list of known onlink routers
1557  * and update our database. These onlink routers will serve as probe
1558  * targets.
1559  */
1560 static void
1561 update_router_list(mib_item_t *item)
1562 {
1563 	for (; item != NULL; item = item->mi_next) {
1564 		if (item->mi_opthdr.name == 0)
1565 			continue;
1566 		if (item->mi_opthdr.level == MIB2_IP &&
1567 		    item->mi_opthdr.name == MIB2_IP_ROUTE) {
1568 			ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp,
1569 			    item->mi_opthdr.len);
1570 		} else if (item->mi_opthdr.level == MIB2_IP6 &&
1571 		    item->mi_opthdr.name == MIB2_IP6_ROUTE) {
1572 			ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp,
1573 			    item->mi_opthdr.len);
1574 		}
1575 	}
1576 }
1577 
1578 
1579 /*
1580  * Convert octet `octp' to a phyint name and store in `ifname'
1581  */
1582 static void
1583 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
1584 {
1585 	char *cp;
1586 	size_t len = MIN(octp->o_length, ifsize - 1);
1587 
1588 	(void) strncpy(ifname, octp->o_bytes, len);
1589 	ifname[len] = '\0';
1590 
1591 	if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
1592 		*cp = '\0';
1593 }
1594 
1595 /*
1596  * Examine the IPv4 routing table `buf' for possible targets.  For each
1597  * possible target, if it's on the same subnet an interface route, pass
1598  * it to router_add_common() for further consideration.
1599  */
1600 static void
1601 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1602 {
1603 	char ifname[LIFNAMSIZ];
1604 	mib2_ipRouteEntry_t	*rp, *rp1, *endp;
1605 	struct in_addr		nexthop_v4;
1606 	struct in6_addr		nexthop;
1607 
1608 	if (debug & D_TARGET)
1609 		logdebug("ire_process_v4(len %d)\n", len);
1610 
1611 	if (len == 0)
1612 		return;
1613 
1614 	assert((len % ipRouteEntrySize) == 0);
1615 	endp = buf + (len / ipRouteEntrySize);
1616 
1617 	/*
1618 	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1619 	 * cross-reference them with the interface routes to determine if
1620 	 * they're possible probe targets.
1621 	 */
1622 	for (rp = buf; rp < endp; rp++) {
1623 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1624 			continue;
1625 
1626 		/* Get the nexthop address. */
1627 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1628 
1629 		/*
1630 		 * Rescan the routing table looking for interface routes that
1631 		 * are on the same subnet, and try to add them.  If they're
1632 		 * not relevant (e.g., the interface route isn't part of an
1633 		 * IPMP group, router_add_common() will discard).
1634 		 */
1635 		for (rp1 = buf; rp1 < endp; rp1++) {
1636 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
1637 			    rp1->ipRouteIfIndex.o_length == 0)
1638 				continue;
1639 
1640 			if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
1641 			    (nexthop_v4.s_addr & rp1->ipRouteMask))
1642 				continue;
1643 
1644 			oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
1645 			IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1646 			router_add_common(AF_INET, ifname, nexthop);
1647 		}
1648 	}
1649 }
1650 
1651 void
1652 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1653 {
1654 	struct phyint_instance *pii;
1655 	struct phyint *pi;
1656 
1657 	if (debug & D_TARGET)
1658 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1659 
1660 	/*
1661 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1662 	 */
1663 	pii = phyint_inst_lookup(af, ifname);
1664 	if (pii == NULL)
1665 		return;
1666 
1667 	/*
1668 	 * Don't use our own addresses as targets.
1669 	 */
1670 	if (own_address(nexthop))
1671 		return;
1672 
1673 	/*
1674 	 * If the phyint is part a named group, then add the address to all
1675 	 * members of the group; note that this is suboptimal in the IPv4 case
1676 	 * as it has already been added to all matching interfaces in
1677 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1678 	 * itself, since other phyints in the anongroup may not be on the same
1679 	 * subnet.
1680 	 */
1681 	pi = pii->pii_phyint;
1682 	if (pi->pi_group == phyint_anongroup) {
1683 		target_add(pii, nexthop, _B_TRUE);
1684 	} else {
1685 		pi = pi->pi_group->pg_phyint;
1686 		for (; pi != NULL; pi = pi->pi_pgnext)
1687 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1688 	}
1689 }
1690 
1691 /*
1692  * Examine the IPv6 routing table `buf' for possible link-local targets, and
1693  * pass any contenders to router_add_common() for further consideration.
1694  */
1695 static void
1696 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1697 {
1698 	struct lifreq lifr;
1699 	char ifname[LIFNAMSIZ];
1700 	char grname[LIFGRNAMSIZ];
1701 	mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
1702 	struct in6_addr nexthop_v6;
1703 
1704 	if (debug & D_TARGET)
1705 		logdebug("ire_process_v6(len %d)\n", len);
1706 
1707 	if (len == 0)
1708 		return;
1709 
1710 	assert((len % ipv6RouteEntrySize) == 0);
1711 	endp = buf + (len / ipv6RouteEntrySize);
1712 
1713 	/*
1714 	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1715 	 * cross-reference them with the interface routes to determine if
1716 	 * they're possible probe targets.
1717 	 */
1718 	for (rp = buf; rp < endp; rp++) {
1719 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
1720 		    !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
1721 			continue;
1722 
1723 		/* Get the nexthop address. */
1724 		nexthop_v6 = rp->ipv6RouteNextHop;
1725 
1726 		/*
1727 		 * The interface name should always exist for link-locals;
1728 		 * we use it to map this entry to an IPMP group name.
1729 		 */
1730 		if (rp->ipv6RouteIfIndex.o_length == 0)
1731 			continue;
1732 
1733 		oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
1734 		if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
1735 		    strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
1736 			continue;
1737 		}
1738 
1739 		/*
1740 		 * Rescan the list of routes for interface routes, and add the
1741 		 * above target to any interfaces in the same IPMP group.
1742 		 */
1743 		for (rp1 = buf; rp1 < endp; rp1++) {
1744 			if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
1745 			    rp1->ipv6RouteIfIndex.o_length == 0) {
1746 				continue;
1747 			}
1748 			oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
1749 			(void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
1750 
1751 			if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
1752 			    strcmp(lifr.lifr_groupname, grname) == 0) {
1753 				router_add_common(AF_INET6, ifname, nexthop_v6);
1754 			}
1755 		}
1756 	}
1757 }
1758 
1759 /*
1760  * Build a list of target routers, by scanning the routing tables.
1761  * It is assumed that interface routes exist, to reach the routers.
1762  */
1763 static void
1764 init_router_targets(void)
1765 {
1766 	struct	target *tg;
1767 	struct	target *next_tg;
1768 	struct	phyint_instance *pii;
1769 	struct	phyint *pi;
1770 
1771 	if (force_mcast)
1772 		return;
1773 
1774 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1775 		pi = pii->pii_phyint;
1776 		/*
1777 		 * Set tg_in_use to false only for router targets.
1778 		 */
1779 		if (!pii->pii_targets_are_routers)
1780 			continue;
1781 
1782 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1783 			tg->tg_in_use = 0;
1784 	}
1785 
1786 	if (mibwalk(update_router_list) == -1)
1787 		exit(1);
1788 
1789 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1790 		pi = pii->pii_phyint;
1791 		if (!pii->pii_targets_are_routers)
1792 			continue;
1793 
1794 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1795 			next_tg = tg->tg_next;
1796 			/*
1797 			 * If the group has failed, it's likely the route was
1798 			 * removed by an application affected by that failure.
1799 			 * In that case, we keep the target so that we can
1800 			 * reliably repair, at which point we'll refresh the
1801 			 * target list again.
1802 			 */
1803 			if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
1804 				target_delete(tg);
1805 		}
1806 	}
1807 }
1808 
1809 /*
1810  * Attempt to assign host targets to any interfaces that do not currently
1811  * have probe targets by sharing targets with other interfaces in the group.
1812  */
1813 static void
1814 init_host_targets(void)
1815 {
1816 	struct phyint_instance *pii;
1817 	struct phyint_group *pg;
1818 
1819 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1820 		pg = pii->pii_phyint->pi_group;
1821 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
1822 			dup_host_targets(pii);
1823 	}
1824 }
1825 
1826 /*
1827  * Duplicate host targets from other phyints of the group to
1828  * the phyint instance 'desired_pii'.
1829  */
1830 static void
1831 dup_host_targets(struct phyint_instance	 *desired_pii)
1832 {
1833 	int af;
1834 	struct phyint *pi;
1835 	struct phyint_instance *pii;
1836 	struct target *tg;
1837 
1838 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
1839 
1840 	af = desired_pii->pii_af;
1841 
1842 	/*
1843 	 * For every phyint in the same group as desired_pii, check if
1844 	 * it has any host targets. If so add them to desired_pii.
1845 	 */
1846 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
1847 		pii = PHYINT_INSTANCE(pi, af);
1848 		/*
1849 		 * We know that we don't have targets on this phyint instance
1850 		 * since we have been called. But we still check for
1851 		 * pii_targets_are_routers because another phyint instance
1852 		 * could have router targets, since IFF_NOFAILOVER addresses
1853 		 * on different phyint instances may belong to different
1854 		 * subnets.
1855 		 */
1856 		if ((pii == NULL) || (pii == desired_pii) ||
1857 		    pii->pii_targets_are_routers)
1858 			continue;
1859 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1860 			target_create(desired_pii, tg->tg_address, _B_FALSE);
1861 		}
1862 	}
1863 }
1864 
/*
 * Print a one-line usage message for the command `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	(void) fprintf(stderr, "usage: %s\n", cmd);
}
1870 
1871 
1872 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
1873 
1874 /* Get an option from the /etc/default/mpathd file */
1875 static char *
1876 getdefault(char *name)
1877 {
1878 	char namebuf[BUFSIZ];
1879 	char *value = NULL;
1880 
1881 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
1882 		char	*cp;
1883 		int	flags;
1884 
1885 		/*
1886 		 * ignore case
1887 		 */
1888 		flags = defcntl(DC_GETFLAGS, 0);
1889 		TURNOFF(flags, DC_CASE);
1890 		(void) defcntl(DC_SETFLAGS, flags);
1891 
1892 		/* Add "=" to the name */
1893 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
1894 		(void) strncat(namebuf, "=", 2);
1895 
1896 		if ((cp = defread(namebuf)) != NULL)
1897 			value = strdup(cp);
1898 
1899 		/* close */
1900 		(void) defopen((char *)NULL);
1901 	}
1902 	return (value);
1903 }
1904 
1905 
1906 /*
1907  * Command line options below
1908  */
1909 boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
1910 boolean_t	track_all_phyints = _B_FALSE;	/* track all IP interfaces */
1911 static boolean_t adopt = _B_FALSE;
1912 static boolean_t foreground = _B_FALSE;
1913 
1914 int
1915 main(int argc, char *argv[])
1916 {
1917 	int i;
1918 	int c;
1919 	struct phyint *pi;
1920 	struct phyint_instance *pii;
1921 	char *value;
1922 
1923 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
1924 	srandom(gethostid());	/* Initialize the random number generator */
1925 
1926 	/*
1927 	 * NOTE: The messages output by in.mpathd are not suitable for
1928 	 * translation, so we do not call textdomain().
1929 	 */
1930 	(void) setlocale(LC_ALL, "");
1931 
1932 	/*
1933 	 * Get the user specified value of 'failure detection time'
1934 	 * from /etc/default/mpathd
1935 	 */
1936 	value = getdefault("FAILURE_DETECTION_TIME");
1937 	if (value != NULL) {
1938 		user_failure_detection_time =
1939 		    (int)strtol((char *)value, NULL, 0);
1940 
1941 		if (user_failure_detection_time <= 0) {
1942 			user_failure_detection_time = FAILURE_DETECTION_TIME;
1943 			logerr("Invalid failure detection time %s, assuming "
1944 			    "default of %d ms\n", value,
1945 			    user_failure_detection_time);
1946 
1947 		} else if (user_failure_detection_time <
1948 		    MIN_FAILURE_DETECTION_TIME) {
1949 			user_failure_detection_time =
1950 			    MIN_FAILURE_DETECTION_TIME;
1951 			logerr("Too small failure detection time of %s, "
1952 			    "assuming minimum of %d ms\n", value,
1953 			    user_failure_detection_time);
1954 		}
1955 		free(value);
1956 	} else {
1957 		/* User has not specified the parameter, Use default value */
1958 		user_failure_detection_time = FAILURE_DETECTION_TIME;
1959 	}
1960 
1961 	/*
1962 	 * This gives the frequency at which probes will be sent.
1963 	 * When fdt ms elapses, we should be able to determine
1964 	 * whether 5 consecutive probes have failed or not.
1965 	 * 1 probe will be sent in every user_probe_interval ms,
1966 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
1967 	 * user_probe_interval. Thus when we send out probe 'n' we
1968 	 * can be sure that probe 'n - 2' is lost, if we have not
1969 	 * got the ack. (since the probe interval is > crtt). But
1970 	 * probe 'n - 1' may be a valid unacked probe, since the
1971 	 * time between 2 successive probes could be as small as
1972 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
1973 	 */
1974 	user_probe_interval = user_failure_detection_time /
1975 	    (NUM_PROBE_FAILS + 2);
1976 
1977 	/*
1978 	 * Get the user specified value of failback_enabled from
1979 	 * /etc/default/mpathd
1980 	 */
1981 	value = getdefault("FAILBACK");
1982 	if (value != NULL) {
1983 		if (strcasecmp(value, "yes") == 0)
1984 			failback_enabled = _B_TRUE;
1985 		else if (strcasecmp(value, "no") == 0)
1986 			failback_enabled = _B_FALSE;
1987 		else
1988 			logerr("Invalid value for FAILBACK %s\n", value);
1989 		free(value);
1990 	} else {
1991 		failback_enabled = _B_TRUE;
1992 	}
1993 
1994 	/*
1995 	 * Get the user specified value of track_all_phyints from
1996 	 * /etc/default/mpathd. The sense is reversed in
1997 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
1998 	 */
1999 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2000 	if (value != NULL) {
2001 		if (strcasecmp(value, "yes") == 0)
2002 			track_all_phyints = _B_FALSE;
2003 		else if (strcasecmp(value, "no") == 0)
2004 			track_all_phyints = _B_TRUE;
2005 		else
2006 			logerr("Invalid value for "
2007 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2008 		free(value);
2009 	} else {
2010 		track_all_phyints = _B_FALSE;
2011 	}
2012 
2013 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2014 		switch (c) {
2015 		case 'a':
2016 			adopt = _B_TRUE;
2017 			break;
2018 		case 'm':
2019 			force_mcast = _B_TRUE;
2020 			break;
2021 		case 'd':
2022 			debug = D_ALL;
2023 			foreground = _B_TRUE;
2024 			break;
2025 		case 'D':
2026 			i = (int)strtol(optarg, NULL, 0);
2027 			if (i == 0) {
2028 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2029 				    optarg);
2030 				exit(1);
2031 			}
2032 			debug |= i;
2033 			foreground = _B_TRUE;
2034 			break;
2035 		case 'l':
2036 			/*
2037 			 * Turn off link state notification handling.
2038 			 * Undocumented command line flag, for debugging
2039 			 * purposes.
2040 			 */
2041 			handle_link_notifications = _B_FALSE;
2042 			break;
2043 		default:
2044 			usage(argv[0]);
2045 			exit(1);
2046 		}
2047 	}
2048 
2049 	/*
2050 	 * The sockets for the loopback command interface should be listening
2051 	 * before we fork and exit in daemonize(). This way, whoever started us
2052 	 * can use the loopback interface as soon as they get a zero exit
2053 	 * status.
2054 	 */
2055 	lsock_v4 = setup_listener(AF_INET);
2056 	lsock_v6 = setup_listener(AF_INET6);
2057 
2058 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2059 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2060 		exit(1);
2061 	}
2062 
2063 	if (!foreground) {
2064 		if (!daemonize()) {
2065 			logerr("cannot daemonize\n");
2066 			exit(EXIT_FAILURE);
2067 		}
2068 		initlog();
2069 	}
2070 
2071 	/*
2072 	 * Initializations:
2073 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2074 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2075 	 * 2. Initialize a pipe for handling/recording signal events.
2076 	 * 3. Create the routing sockets,  used for listening
2077 	 *    to routing / interface changes.
2078 	 * 4. phyint_init() - Initialize physical interface state
2079 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2080 	 *    which timer_init() does indirectly.
2081 	 * 5. Query kernel for route entry sizes (v4 and v6).
2082 	 * 6. timer_init()  - Initialize timer related stuff
2083 	 * 7. initifs() - Initialize our database of all known interfaces
2084 	 * 8. init_router_targets() - Initialize our database of all known
2085 	 *    router targets.
2086 	 */
2087 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2088 	if (ifsock_v4 < 0) {
2089 		logperror("main: IPv4 socket open");
2090 		exit(1);
2091 	}
2092 
2093 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2094 	if (ifsock_v6 < 0) {
2095 		logperror("main: IPv6 socket open");
2096 		exit(1);
2097 	}
2098 
2099 	setup_eventpipe();
2100 
2101 	rtsock_v4 = setup_rtsock(AF_INET);
2102 	rtsock_v6 = setup_rtsock(AF_INET6);
2103 
2104 	if (phyint_init() == -1) {
2105 		logerr("cannot initialize physical interface structures");
2106 		exit(1);
2107 	}
2108 
2109 	if (mibwalk(mib_get_constants) == -1)
2110 		exit(1);
2111 
2112 	timer_init();
2113 
2114 	initifs();
2115 
2116 	/*
2117 	 * If we're operating in "adopt" mode and no interfaces need to be
2118 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2119 	 * interfaces are subsequently put into multipathing groups).
2120 	 */
2121 	if (adopt && phyint_instances == NULL)
2122 		exit(0);
2123 
2124 	/*
2125 	 * Main body. Keep listening for activity on any of the sockets
2126 	 * that we are monitoring and take appropriate action as necessary.
2127 	 * signals are also handled synchronously.
2128 	 */
2129 	for (;;) {
2130 		if (poll(pollfds, pollfd_num, -1) < 0) {
2131 			if (errno == EINTR)
2132 				continue;
2133 			logperror("main: poll");
2134 			exit(1);
2135 		}
2136 		for (i = 0; i < pollfd_num; i++) {
2137 			if ((pollfds[i].fd == -1) ||
2138 			    !(pollfds[i].revents & POLLIN))
2139 				continue;
2140 			if (pollfds[i].fd == eventpipe_read) {
2141 				in_signal(eventpipe_read);
2142 				break;
2143 			}
2144 			if (pollfds[i].fd == rtsock_v4 ||
2145 			    pollfds[i].fd == rtsock_v6) {
2146 				process_rtsock(rtsock_v4, rtsock_v6);
2147 				break;
2148 			}
2149 
2150 			for (pii = phyint_instances; pii != NULL;
2151 			    pii = pii->pii_next) {
2152 				if (pollfds[i].fd == pii->pii_probe_sock) {
2153 					if (pii->pii_af == AF_INET)
2154 						in_data(pii);
2155 					else
2156 						in6_data(pii);
2157 					break;
2158 				}
2159 			}
2160 
2161 			for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2162 				if (pi->pi_notes != 0 &&
2163 				    pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
2164 					(void) dlpi_recv(pi->pi_dh, NULL, NULL,
2165 					    NULL, NULL, 0, NULL);
2166 					break;
2167 				}
2168 			}
2169 
2170 			if (pollfds[i].fd == lsock_v4)
2171 				loopback_cmd(lsock_v4, AF_INET);
2172 			else if (pollfds[i].fd == lsock_v6)
2173 				loopback_cmd(lsock_v6, AF_INET6);
2174 		}
2175 	}
2176 	/* NOTREACHED */
2177 	return (EXIT_SUCCESS);
2178 }
2179 
2180 static int
2181 setup_listener(int af)
2182 {
2183 	int sock;
2184 	int on;
2185 	int len;
2186 	int ret;
2187 	struct sockaddr_storage laddr;
2188 	struct sockaddr_in  *sin;
2189 	struct sockaddr_in6 *sin6;
2190 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2191 
2192 	assert(af == AF_INET || af == AF_INET6);
2193 
2194 	sock = socket(af, SOCK_STREAM, 0);
2195 	if (sock < 0) {
2196 		logperror("setup_listener: socket");
2197 		exit(1);
2198 	}
2199 
2200 	on = 1;
2201 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2202 	    sizeof (on)) < 0) {
2203 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2204 		exit(1);
2205 	}
2206 
2207 	bzero(&laddr, sizeof (laddr));
2208 	laddr.ss_family = af;
2209 
2210 	if (af == AF_INET) {
2211 		sin = (struct sockaddr_in *)&laddr;
2212 		sin->sin_port = htons(MPATHD_PORT);
2213 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2214 		len = sizeof (struct sockaddr_in);
2215 	} else {
2216 		sin6 = (struct sockaddr_in6 *)&laddr;
2217 		sin6->sin6_port = htons(MPATHD_PORT);
2218 		sin6->sin6_addr = loopback_addr;
2219 		len = sizeof (struct sockaddr_in6);
2220 	}
2221 
2222 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2223 	if (ret < 0) {
2224 		if (errno == EADDRINUSE) {
2225 			/*
2226 			 * Another instance of mpathd may be already active.
2227 			 */
2228 			logerr("main: is another instance of in.mpathd "
2229 			    "already active?\n");
2230 			exit(1);
2231 		} else {
2232 			(void) close(sock);
2233 			return (-1);
2234 		}
2235 	}
2236 	if (listen(sock, 30) < 0) {
2237 		logperror("main: listen");
2238 		exit(1);
2239 	}
2240 	if (poll_add(sock) == -1) {
2241 		(void) close(sock);
2242 		exit(1);
2243 	}
2244 
2245 	return (sock);
2246 }
2247 
2248 /*
2249  * Table of commands and their expected size; used by loopback_cmd().
2250  */
2251 static struct {
2252 	const char	*name;
2253 	unsigned int	size;
2254 } commands[] = {
2255 	{ "MI_PING",		sizeof (uint32_t)	},
2256 	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
2257 	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
2258 	{ "MI_QUERY",		sizeof (mi_query_t)	}
2259 };
2260 
2261 /*
2262  * Commands received over the loopback interface come here (via libipmp).
2263  */
2264 static void
2265 loopback_cmd(int sock, int family)
2266 {
2267 	int newfd;
2268 	ssize_t len;
2269 	boolean_t is_priv = _B_FALSE;
2270 	struct sockaddr_storage	peer;
2271 	struct sockaddr_in	*peer_sin;
2272 	struct sockaddr_in6	*peer_sin6;
2273 	socklen_t peerlen;
2274 	union mi_commands mpi;
2275 	char abuf[INET6_ADDRSTRLEN];
2276 	uint_t cmd;
2277 	int retval;
2278 
2279 	peerlen = sizeof (peer);
2280 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2281 	if (newfd < 0) {
2282 		logperror("loopback_cmd: accept");
2283 		return;
2284 	}
2285 
2286 	switch (family) {
2287 	case AF_INET:
2288 		/*
2289 		 * Validate the address and port to make sure that
2290 		 * non privileged processes don't connect and start
2291 		 * talking to us.
2292 		 */
2293 		if (peerlen != sizeof (struct sockaddr_in)) {
2294 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2295 			(void) close(newfd);
2296 			return;
2297 		}
2298 		peer_sin = (struct sockaddr_in *)&peer;
2299 		is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
2300 		(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2301 		    abuf, sizeof (abuf));
2302 
2303 		if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
2304 			logerr("Attempt to connect from addr %s port %d\n",
2305 			    abuf, ntohs(peer_sin->sin_port));
2306 			(void) close(newfd);
2307 			return;
2308 		}
2309 		break;
2310 
2311 	case AF_INET6:
2312 		if (peerlen != sizeof (struct sockaddr_in6)) {
2313 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2314 			(void) close(newfd);
2315 			return;
2316 		}
2317 		/*
2318 		 * Validate the address and port to make sure that
2319 		 * non privileged processes don't connect and start
2320 		 * talking to us.
2321 		 */
2322 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2323 		is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
2324 		(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2325 		    sizeof (abuf));
2326 		if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
2327 			logerr("Attempt to connect from addr %s port %d\n",
2328 			    abuf, ntohs(peer_sin6->sin6_port));
2329 			(void) close(newfd);
2330 			return;
2331 		}
2332 
2333 	default:
2334 		logdebug("loopback_cmd: family %d\n", family);
2335 		(void) close(newfd);
2336 		return;
2337 	}
2338 
2339 	/*
2340 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2341 	 * all supported commands
2342 	 */
2343 	len = read(newfd, &mpi, sizeof (mpi));
2344 
2345 	/*
2346 	 * In theory, we can receive any sized message for a stream socket,
2347 	 * but we don't expect that to happen for a small message over a
2348 	 * loopback connection.
2349 	 */
2350 	if (len < sizeof (uint32_t)) {
2351 		logerr("loopback_cmd: bad command format or read returns "
2352 		    "partial data %d\n", len);
2353 		(void) close(newfd);
2354 		return;
2355 	}
2356 
2357 	cmd = mpi.mi_command;
2358 	if (cmd >= MI_NCMD) {
2359 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2360 		(void) close(newfd);
2361 		return;
2362 	}
2363 
2364 	/*
2365 	 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2366 	 */
2367 	if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
2368 		logerr("Unprivileged request from %s for privileged "
2369 		    "command %s\n", abuf, commands[cmd].name);
2370 		(void) close(newfd);
2371 		return;
2372 	}
2373 
2374 	if (len < commands[cmd].size) {
2375 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2376 		    commands[cmd].name, commands[cmd].size, len);
2377 		(void) close(newfd);
2378 		return;
2379 	}
2380 
2381 	retval = process_cmd(newfd, &mpi);
2382 	if (retval != IPMP_SUCCESS) {
2383 		logerr("failed processing %s: %s\n", commands[cmd].name,
2384 		    ipmp_errmsg(retval));
2385 	}
2386 	(void) close(newfd);
2387 }
2388 
2389 /*
2390  * Process the commands received via libipmp.
2391  */
2392 static unsigned int
2393 process_cmd(int newfd, union mi_commands *mpi)
2394 {
2395 	struct phyint *pi;
2396 	struct mi_offline *mio;
2397 	struct mi_undo_offline *miu;
2398 	unsigned int retval;
2399 
2400 	switch (mpi->mi_command) {
2401 	case MI_PING:
2402 		return (send_result(newfd, IPMP_SUCCESS, 0));
2403 
2404 	case MI_OFFLINE:
2405 		mio = &mpi->mi_ocmd;
2406 
2407 		pi = phyint_lookup(mio->mio_ifname);
2408 		if (pi == NULL)
2409 			return (send_result(newfd, IPMP_EUNKIF, 0));
2410 
2411 		retval = phyint_offline(pi, mio->mio_min_redundancy);
2412 		if (retval == IPMP_FAILURE)
2413 			return (send_result(newfd, IPMP_FAILURE, errno));
2414 
2415 		return (send_result(newfd, retval, 0));
2416 
2417 	case MI_UNDO_OFFLINE:
2418 		miu = &mpi->mi_ucmd;
2419 
2420 		pi = phyint_lookup(miu->miu_ifname);
2421 		if (pi == NULL)
2422 			return (send_result(newfd, IPMP_EUNKIF, 0));
2423 
2424 		retval = phyint_undo_offline(pi);
2425 		if (retval == IPMP_FAILURE)
2426 			return (send_result(newfd, IPMP_FAILURE, errno));
2427 
2428 		return (send_result(newfd, retval, 0));
2429 
2430 	case MI_QUERY:
2431 		return (process_query(newfd, &mpi->mi_qcmd));
2432 
2433 	default:
2434 		break;
2435 	}
2436 
2437 	return (send_result(newfd, IPMP_EPROTO, 0));
2438 }
2439 
2440 /*
2441  * Process the query request pointed to by `miq' and send a reply on file
2442  * descriptor `fd'.  Returns an IPMP error code.
2443  */
2444 static unsigned int
2445 process_query(int fd, mi_query_t *miq)
2446 {
2447 	ipmp_addrinfo_t		*adinfop;
2448 	ipmp_addrinfolist_t	*adlp;
2449 	ipmp_groupinfo_t	*grinfop;
2450 	ipmp_groupinfolist_t	*grlp;
2451 	ipmp_grouplist_t	*grlistp;
2452 	ipmp_ifinfo_t		*ifinfop;
2453 	ipmp_ifinfolist_t	*iflp;
2454 	ipmp_snap_t		*snap;
2455 	unsigned int		retval;
2456 
2457 	switch (miq->miq_inforeq) {
2458 	case IPMP_ADDRINFO:
2459 		retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
2460 		    &adinfop);
2461 		if (retval != IPMP_SUCCESS)
2462 			return (send_result(fd, retval, errno));
2463 
2464 		retval = send_result(fd, IPMP_SUCCESS, 0);
2465 		if (retval == IPMP_SUCCESS)
2466 			retval = send_addrinfo(fd, adinfop);
2467 
2468 		ipmp_freeaddrinfo(adinfop);
2469 		return (retval);
2470 
2471 	case IPMP_GROUPLIST:
2472 		retval = getgrouplist(&grlistp);
2473 		if (retval != IPMP_SUCCESS)
2474 			return (send_result(fd, retval, errno));
2475 
2476 		retval = send_result(fd, IPMP_SUCCESS, 0);
2477 		if (retval == IPMP_SUCCESS)
2478 			retval = send_grouplist(fd, grlistp);
2479 
2480 		ipmp_freegrouplist(grlistp);
2481 		return (retval);
2482 
2483 	case IPMP_GROUPINFO:
2484 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2485 		retval = getgroupinfo(miq->miq_grname, &grinfop);
2486 		if (retval != IPMP_SUCCESS)
2487 			return (send_result(fd, retval, errno));
2488 
2489 		retval = send_result(fd, IPMP_SUCCESS, 0);
2490 		if (retval == IPMP_SUCCESS)
2491 			retval = send_groupinfo(fd, grinfop);
2492 
2493 		ipmp_freegroupinfo(grinfop);
2494 		return (retval);
2495 
2496 	case IPMP_IFINFO:
2497 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2498 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2499 		if (retval != IPMP_SUCCESS)
2500 			return (send_result(fd, retval, errno));
2501 
2502 		retval = send_result(fd, IPMP_SUCCESS, 0);
2503 		if (retval == IPMP_SUCCESS)
2504 			retval = send_ifinfo(fd, ifinfop);
2505 
2506 		ipmp_freeifinfo(ifinfop);
2507 		return (retval);
2508 
2509 	case IPMP_SNAP:
2510 		/*
2511 		 * Before taking the snapshot, sync with the kernel.
2512 		 */
2513 		initifs();
2514 
2515 		retval = getsnap(&snap);
2516 		if (retval != IPMP_SUCCESS)
2517 			return (send_result(fd, retval, errno));
2518 
2519 		retval = send_result(fd, IPMP_SUCCESS, 0);
2520 		if (retval != IPMP_SUCCESS)
2521 			goto out;
2522 
2523 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2524 		if (retval != IPMP_SUCCESS)
2525 			goto out;
2526 
2527 		retval = send_grouplist(fd, snap->sn_grlistp);
2528 		if (retval != IPMP_SUCCESS)
2529 			goto out;
2530 
2531 		iflp = snap->sn_ifinfolistp;
2532 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2533 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2534 			if (retval != IPMP_SUCCESS)
2535 				goto out;
2536 		}
2537 
2538 		grlp = snap->sn_grinfolistp;
2539 		for (; grlp != NULL; grlp = grlp->grl_next) {
2540 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2541 			if (retval != IPMP_SUCCESS)
2542 				goto out;
2543 		}
2544 
2545 		adlp = snap->sn_adinfolistp;
2546 		for (; adlp != NULL; adlp = adlp->adl_next) {
2547 			retval = send_addrinfo(fd, adlp->adl_adinfop);
2548 			if (retval != IPMP_SUCCESS)
2549 				goto out;
2550 		}
2551 	out:
2552 		ipmp_snap_free(snap);
2553 		return (retval);
2554 
2555 	default:
2556 		break;
2557 
2558 	}
2559 	return (send_result(fd, IPMP_EPROTO, 0));
2560 }
2561 
2562 /*
2563  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2564  * Returns an IPMP error code.
2565  */
2566 static unsigned int
2567 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2568 {
2569 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2570 	ipmp_addrlist_t	*adlistp = grinfop->gr_adlistp;
2571 	unsigned int	retval;
2572 
2573 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2574 	if (retval != IPMP_SUCCESS)
2575 		return (retval);
2576 
2577 	retval = ipmp_writetlv(fd, IPMP_IFLIST,
2578 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
2579 	if (retval != IPMP_SUCCESS)
2580 		return (retval);
2581 
2582 	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2583 	    IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
2584 }
2585 
2586 /*
2587  * Send the interface information pointed to by `ifinfop' on file descriptor
2588  * `fd'.  Returns an IPMP error code.
2589  */
2590 static unsigned int
2591 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2592 {
2593 	ipmp_addrlist_t	*adlist4p = ifinfop->if_targinfo4.it_targlistp;
2594 	ipmp_addrlist_t	*adlist6p = ifinfop->if_targinfo6.it_targlistp;
2595 	unsigned int	retval;
2596 
2597 	retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop);
2598 	if (retval != IPMP_SUCCESS)
2599 		return (retval);
2600 
2601 	retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
2602 	    IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
2603 	if (retval != IPMP_SUCCESS)
2604 		return (retval);
2605 
2606 	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2607 	    IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
2608 }
2609 
2610 /*
2611  * Send the address information pointed to by `adinfop' on file descriptor
2612  * `fd'.  Returns an IPMP error code.
2613  */
2614 static unsigned int
2615 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
2616 {
2617 	return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
2618 }
2619 
2620 /*
2621  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2622  * Returns an IPMP error code.
2623  */
2624 static unsigned int
2625 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2626 {
2627 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2628 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2629 }
2630 
2631 /*
2632  * Initialize an mi_result_t structure using `error' and `syserror' and
2633  * send it on file descriptor `fd'.  Returns an IPMP error code.
2634  */
2635 static unsigned int
2636 send_result(int fd, unsigned int error, int syserror)
2637 {
2638 	mi_result_t me;
2639 
2640 	me.me_mpathd_error = error;
2641 	if (error == IPMP_FAILURE)
2642 		me.me_sys_error = syserror;
2643 	else
2644 		me.me_sys_error = 0;
2645 
2646 	return (ipmp_write(fd, &me, sizeof (me)));
2647 }
2648 
2649 /*
2650  * Daemonize the process.
2651  */
2652 static boolean_t
2653 daemonize(void)
2654 {
2655 	switch (fork()) {
2656 	case -1:
2657 		return (_B_FALSE);
2658 
2659 	case  0:
2660 		/*
2661 		 * Lose our controlling terminal, and become both a session
2662 		 * leader and a process group leader.
2663 		 */
2664 		if (setsid() == -1)
2665 			return (_B_FALSE);
2666 
2667 		/*
2668 		 * Under POSIX, a session leader can accidentally (through
2669 		 * open(2)) acquire a controlling terminal if it does not
2670 		 * have one.  Just to be safe, fork() again so we are not a
2671 		 * session leader.
2672 		 */
2673 		switch (fork()) {
2674 		case -1:
2675 			return (_B_FALSE);
2676 
2677 		case 0:
2678 			(void) chdir("/");
2679 			(void) umask(022);
2680 			(void) fdwalk(closefunc, NULL);
2681 			break;
2682 
2683 		default:
2684 			_exit(EXIT_SUCCESS);
2685 		}
2686 		break;
2687 
2688 	default:
2689 		_exit(EXIT_SUCCESS);
2690 	}
2691 
2692 	return (_B_TRUE);
2693 }
2694 
2695 /*
2696  * The parent has created some fds before forking on purpose, keep them open.
2697  */
2698 static int
2699 closefunc(void *not_used, int fd)
2700 /* ARGSUSED */
2701 {
2702 	if (fd != lsock_v4 && fd != lsock_v6)
2703 		(void) close(fd);
2704 	return (0);
2705 }
2706 
2707 /* LOGGER */
2708 
2709 #include <syslog.h>
2710 
2711 /*
2712  * Logging routines.  All routines log to syslog, unless the daemon is
2713  * running in the foreground, in which case the logging goes to stderr.
2714  *
2715  * The following routines are available:
2716  *
2717  *	logdebug(): A printf-like function for outputting debug messages
2718  *	(messages at LOG_DEBUG) that are only of use to developers.
2719  *
2720  *	logtrace(): A printf-like function for outputting tracing messages
2721  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2722  *	to log the receipt of interesting network-related conditions.
2723  *
2724  *	logerr(): A printf-like function for outputting error messages
2725  *	(messages at LOG_ERR) from the daemon.
2726  *
2727  *	logperror*(): A set of functions used to output error messages
2728  *	(messages at LOG_ERR); these automatically append strerror(errno)
2729  *	and a newline to the message passed to them.
2730  *
2731  * NOTE: since the logging functions write to syslog, the messages passed
2732  *	 to them are not eligible for localization.  Thus, gettext() must
2733  *	 *not* be used.
2734  */
2735 
/*
 * Nonzero once initlog() has been called.  The logging routines below send
 * their output to syslog when it is set and to stderr otherwise.
 */
static int logging = 0;

/*
 * Open the syslog connection (identity "in.mpathd", daemon facility, pid
 * included in each message) and switch the logging routines over to it.
 */
static void
initlog(void)
{
	++logging;
	openlog("in.mpathd", LOG_PID, LOG_DAEMON);
}
2745 /* PRINTFLIKE2 */
2746 void
2747 logmsg(int pri, const char *fmt, ...)
2748 {
2749 	va_list ap;
2750 
2751 	va_start(ap, fmt);
2752 
2753 	if (logging)
2754 		vsyslog(pri, fmt, ap);
2755 	else
2756 		(void) vfprintf(stderr, fmt, ap);
2757 	va_end(ap);
2758 }
2759 
2760 /* PRINTFLIKE1 */
2761 void
2762 logperror(const char *str)
2763 {
2764 	if (logging)
2765 		syslog(LOG_ERR, "%s: %m\n", str);
2766 	else
2767 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
2768 }
2769 
2770 void
2771 logperror_pii(struct phyint_instance *pii, const char *str)
2772 {
2773 	if (logging) {
2774 		syslog(LOG_ERR, "%s (%s %s): %m\n",
2775 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
2776 	} else {
2777 		(void) fprintf(stderr, "%s (%s %s): %s\n",
2778 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
2779 		    strerror(errno));
2780 	}
2781 }
2782 
2783 void
2784 logperror_li(struct logint *li, const char *str)
2785 {
2786 	struct	phyint_instance	*pii = li->li_phyint_inst;
2787 
2788 	if (logging) {
2789 		syslog(LOG_ERR, "%s (%s %s): %m\n",
2790 		    str, AF_STR(pii->pii_af), li->li_name);
2791 	} else {
2792 		(void) fprintf(stderr, "%s (%s %s): %s\n",
2793 		    str, AF_STR(pii->pii_af), li->li_name,
2794 		    strerror(errno));
2795 	}
2796 }
2797 
2798 void
2799 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
2800 {
2801 	if (polled)
2802 		(void) poll_remove(pii->pii_probe_sock);
2803 	(void) close(pii->pii_probe_sock);
2804 	pii->pii_probe_sock = -1;
2805 	pii->pii_basetime_inited = 0;
2806 }
2807 
2808 boolean_t
2809 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
2810     struct sockaddr_storage *ssp)
2811 {
2812 	addrlist_t *addrp;
2813 
2814 	if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
2815 		return (_B_FALSE);
2816 
2817 	(void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
2818 	addrp->al_flags = flags;
2819 	addrp->al_addr = *ssp;
2820 	addrp->al_next = *addrsp;
2821 	*addrsp = addrp;
2822 	return (_B_TRUE);
2823 }
2824 
2825 void
2826 addrlist_free(addrlist_t **addrsp)
2827 {
2828 	addrlist_t *addrp, *next_addrp;
2829 
2830 	for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
2831 		next_addrp = addrp->al_next;
2832 		free(addrp);
2833 	}
2834 	*addrsp = NULL;
2835 }
2836 
2837 /*
2838  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
2839  * tables defined by mib2.h. Pass the table information returned to the
2840  * supplied function.
2841  */
2842 static int
2843 mibwalk(void (*proc)(mib_item_t *))
2844 {
2845 	mib_item_t		*head_item = NULL;
2846 	mib_item_t		*last_item = NULL;
2847 	mib_item_t		*tmp;
2848 	struct strbuf		ctlbuf, databuf;
2849 	int			flags;
2850 	int			rval;
2851 	uintptr_t		buf[512 / sizeof (uintptr_t)];
2852 	struct T_optmgmt_req	*tor = (struct T_optmgmt_req *)buf;
2853 	struct T_optmgmt_ack	*toa = (struct T_optmgmt_ack *)buf;
2854 	struct T_error_ack	*tea = (struct T_error_ack *)buf;
2855 	struct opthdr		*req, *optp;
2856 	int			status = -1;
2857 
2858 	if (mibfd == -1) {
2859 		if ((mibfd = open("/dev/ip", O_RDWR)) < 0) {
2860 			logperror("mibwalk(): ip open");
2861 			return (status);
2862 		}
2863 	}
2864 
2865 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2866 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
2867 	tor->OPT_length = sizeof (struct opthdr);
2868 	tor->MGMT_flags = T_CURRENT;
2869 
2870 	/*
2871 	 * Note: we use the special level value below so that IP will return
2872 	 * us information concerning IRE_MARK_TESTHIDDEN routes.
2873 	 */
2874 	req = (struct opthdr *)&tor[1];
2875 	req->level = EXPER_IP_AND_TESTHIDDEN;
2876 	req->name  = 0;
2877 	req->len   = 0;
2878 
2879 	ctlbuf.buf = (char *)&buf;
2880 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
2881 
2882 	if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) {
2883 		logperror("mibwalk(): putmsg(ctl)");
2884 		return (status);
2885 	}
2886 
2887 	/*
2888 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
2889 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
2890 	 * a control and data part. The control part contains a struct
2891 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
2892 	 * the level, name and length of the data in the data part. The
2893 	 * data part contains the actual table data. The last message
2894 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
2895 	 * single option with zero optlen.
2896 	 */
2897 	for (;;) {
2898 		errno = flags = 0;
2899 		ctlbuf.maxlen = sizeof (buf);
2900 		rval = getmsg(mibfd, &ctlbuf, NULL, &flags);
2901 		if (rval & MORECTL || rval < 0) {
2902 			if (errno == EINTR)
2903 				continue;
2904 			logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n",
2905 			    rval, errno);
2906 			goto error;
2907 		}
2908 		if (ctlbuf.len < sizeof (t_scalar_t)) {
2909 			logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len);
2910 			goto error;
2911 		}
2912 
2913 		switch (toa->PRIM_type) {
2914 		case T_ERROR_ACK:
2915 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
2916 				logerr("mibwalk(): T_ERROR_ACK ctlbuf "
2917 				    "too short: %d\n", ctlbuf.len);
2918 				goto error;
2919 			}
2920 			logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n"
2921 			    " UNIX_err = 0x%lx\n", tea->TLI_error,
2922 			    t_strerror(tea->TLI_error), tea->UNIX_error);
2923 			goto error;
2924 
2925 		case T_OPTMGMT_ACK:
2926 			optp = (struct opthdr *)&toa[1];
2927 			if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
2928 			    sizeof (struct opthdr))) {
2929 				logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too "
2930 				    "short: %d\n", ctlbuf.len);
2931 				goto error;
2932 			}
2933 			if (toa->MGMT_flags != T_SUCCESS) {
2934 				logerr("mibwalk(): MGMT_flags != T_SUCCESS: "
2935 				    "0x%lx\n", toa->MGMT_flags);
2936 				goto error;
2937 			}
2938 			break;
2939 
2940 		default:
2941 			goto error;
2942 		}
2943 		/* The following assert also implies MGMT_flags == T_SUCCESS */
2944 		assert(toa->PRIM_type == T_OPTMGMT_ACK);
2945 
2946 		/*
2947 		 * We have reached the end of this T_OPTMGMT_ACK
2948 		 * message. If this is the last message i.e EOD,
2949 		 * break, else process the next T_OPTMGMT_ACK msg.
2950 		 */
2951 		if (rval == 0) {
2952 			if (optp->len == 0 && optp->name == 0 &&
2953 			    optp->level == 0) {
2954 				/* This is the EOD message. */
2955 				break;
2956 			}
2957 			/* Not EOD but no data to retrieve */
2958 			continue;
2959 		}
2960 
2961 		/*
2962 		 * We should only be here if MOREDATA was set.
2963 		 * Allocate an empty mib_item_t and link into the list
2964 		 * of MIB items.
2965 		 */
2966 		if ((tmp = malloc(sizeof (*tmp))) == NULL) {
2967 			logperror("mibwalk(): malloc() failed.");
2968 			goto error;
2969 		}
2970 		if (last_item != NULL)
2971 			last_item->mi_next = tmp;
2972 		else
2973 			head_item = tmp;
2974 		last_item = tmp;
2975 		last_item->mi_next = NULL;
2976 		last_item->mi_opthdr = *optp;
2977 		last_item->mi_valp = malloc(optp->len);
2978 		if (last_item->mi_valp == NULL) {
2979 			logperror("mibwalk(): malloc() failed.");
2980 			goto error;
2981 		}
2982 
2983 		databuf.maxlen = last_item->mi_opthdr.len;
2984 		databuf.buf = (char *)last_item->mi_valp;
2985 		databuf.len = 0;
2986 
2987 		/* Retrieve the actual MIB data */
2988 		for (;;) {
2989 			flags = 0;
2990 			if ((rval = getmsg(mibfd, NULL, &databuf,
2991 			    &flags)) != 0) {
2992 				if (rval < 0 && errno == EINTR)
2993 					continue;
2994 				/*
2995 				 * We shouldn't get MOREDATA here so treat that
2996 				 * as an error.
2997 				 */
2998 				logperror("mibwalk(): getmsg(data)");
2999 				goto error;
3000 			}
3001 			break;
3002 		}
3003 	}
3004 	status = 0;
3005 	/* Pass the accumulated MIB data to the supplied function pointer */
3006 	(*proc)(head_item);
3007 error:
3008 	while (head_item != NULL) {
3009 		tmp = head_item;
3010 		head_item = tmp->mi_next;
3011 		free(tmp->mi_valp);
3012 		free(tmp);
3013 	}
3014 	return (status);
3015 }
3016 
3017 /*
3018  * Parse the supplied mib2 information to get the size of routing table
3019  * entries. This is needed when running in a branded zone where the
3020  * Solaris application environment and the Solaris kernel may not be the
3021  * the same release version.
3022  */
3023 static void
3024 mib_get_constants(mib_item_t *item)
3025 {
3026 	mib2_ip_t		*ipv4;
3027 	mib2_ipv6IfStatsEntry_t	*ipv6;
3028 
3029 	for (; item != NULL; item = item->mi_next) {
3030 		if (item->mi_opthdr.name != 0)
3031 			continue;
3032 		if (item->mi_opthdr.level == MIB2_IP) {
3033 			ipv4 = (mib2_ip_t *)item->mi_valp;
3034 			ipRouteEntrySize = ipv4->ipRouteEntrySize;
3035 		} else if (item->mi_opthdr.level == MIB2_IP6) {
3036 			ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp;
3037 			ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize;
3038 		}
3039 	}
3040 }
3041