xref: /titanic_51/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 4fb0018bf832424363cfcc05b23323c48ab7a076)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
29 int debug = 0;				/* Debug flag */
30 static int pollfd_num = 0;		/* Num. of poll descriptors */
31 static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */
32 
33 					/* All times below in ms */
34 int	user_failure_detection_time;	/* user specified failure detection */
35 					/* time (fdt) */
36 int	user_probe_interval;		/* derived from user specified fdt */
37 
38 static int	rtsock_v4;		/* AF_INET routing socket */
39 static int	rtsock_v6;		/* AF_INET6 routing socket */
40 int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
41 int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
42 static int	lsock_v4;		/* Listen socket to detect mpathd */
43 static int	lsock_v6;		/* Listen socket to detect mpathd */
44 static int	mibfd = -1;		/* fd to get mib info */
45 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */
46 
47 static uint_t	last_initifs_time;	/* Time when initifs was last run */
48 static	char **argv0;			/* Saved for re-exec on SIGHUP */
49 boolean_t handle_link_notifications = _B_TRUE;
50 
51 static void	initlog(void);
52 static void	run_timeouts(void);
53 static void	initifs(void);
54 static void	check_if_removed(struct phyint_instance *pii);
55 static void	select_test_ifs(void);
56 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
57 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
58 static void	router_add_common(int af, char *ifname,
59     struct in6_addr nexthop);
60 static void	init_router_targets();
61 static void	cleanup(void);
62 static int	setup_listener(int af);
63 static void	check_config(void);
64 static void	check_testconfig(void);
65 static void	check_addr_unique(struct phyint_instance *,
66     struct sockaddr_storage *);
67 static void	init_host_targets(void);
68 static void	dup_host_targets(struct phyint_instance *desired_pii);
69 static void	loopback_cmd(int sock, int family);
70 static boolean_t daemonize(void);
71 static int	closefunc(void *, int);
72 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
73 static unsigned int process_query(int fd, mi_query_t *miq);
74 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop);
75 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
76 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
77 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
78 static unsigned int send_result(int fd, unsigned int error, int syserror);
79 
80 addrlist_t *localaddrs;
81 
82 /*
83  * Return the current time in milliseconds (from an arbitrary reference)
84  * truncated to fit into an int. Truncation is ok since we are interested
85  * only in differences and not the absolute values.
86  */
87 uint_t
88 getcurrenttime(void)
89 {
90 	uint_t	cur_time;	/* In ms */
91 
92 	/*
93 	 * Use of a non-user-adjustable source of time is
94 	 * required. However millisecond precision is sufficient.
95 	 * divide by 10^6
96 	 */
97 	cur_time = (uint_t)(gethrtime() / 1000000LL);
98 	return (cur_time);
99 }
100 
101 uint64_t
102 getcurrentsec(void)
103 {
104 	return (gethrtime() / NANOSEC);
105 }
106 
107 /*
108  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
109  */
110 int
111 poll_add(int fd)
112 {
113 	int i;
114 	int new_num;
115 	struct pollfd *newfds;
116 retry:
117 	/* Check if already present */
118 	for (i = 0; i < pollfd_num; i++) {
119 		if (pollfds[i].fd == fd)
120 			return (0);
121 	}
122 	/* Check for empty spot already present */
123 	for (i = 0; i < pollfd_num; i++) {
124 		if (pollfds[i].fd == -1) {
125 			pollfds[i].fd = fd;
126 			return (0);
127 		}
128 	}
129 
130 	/* Allocate space for 32 more fds and initialize to -1 */
131 	new_num = pollfd_num + 32;
132 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
133 	if (newfds == NULL) {
134 		logperror("poll_add: realloc");
135 		return (-1);
136 	}
137 	for (i = pollfd_num; i < new_num; i++) {
138 		newfds[i].fd = -1;
139 		newfds[i].events = POLLIN;
140 	}
141 	pollfd_num = new_num;
142 	pollfds = newfds;
143 	goto retry;
144 }
145 
146 /*
147  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
148  */
149 int
150 poll_remove(int fd)
151 {
152 	int i;
153 
154 	/* Check if already present */
155 	for (i = 0; i < pollfd_num; i++) {
156 		if (pollfds[i].fd == fd) {
157 			pollfds[i].fd = -1;
158 			return (0);
159 		}
160 	}
161 	return (-1);
162 }
163 
164 /*
165  * Extract information about the phyint instance. If the phyint instance still
166  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
167  * will use it to detect phyint instances that don't exist any longer and
168  * remove them, from our database of phyint instances.
169  * Return value:
170  *	returns true if the phyint instance exists in the kernel,
171  *	returns false otherwise
172  */
173 static boolean_t
174 pii_process(int af, char *name, struct phyint_instance **pii_p)
175 {
176 	int err;
177 	struct phyint_instance *pii;
178 	struct phyint_instance *pii_other;
179 
180 	if (debug & D_PHYINT)
181 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
182 
183 	pii = phyint_inst_lookup(af, name);
184 	if (pii == NULL) {
185 		/*
186 		 * Phyint instance does not exist in our tables,
187 		 * create new phyint instance
188 		 */
189 		pii = phyint_inst_init_from_k(af, name);
190 	} else {
191 		/* Phyint exists in our tables */
192 		err = phyint_inst_update_from_k(pii);
193 
194 		switch (err) {
195 		case PI_IOCTL_ERROR:
196 			/* Some ioctl error. don't change anything */
197 			pii->pii_in_use = 1;
198 			break;
199 
200 		case PI_GROUP_CHANGED:
201 		case PI_IFINDEX_CHANGED:
202 			/*
203 			 * Interface index or group membership has changed.
204 			 * Delete the old state and recreate based on the new
205 			 * state (it may no longer be in a group).
206 			 */
207 			pii_other = phyint_inst_other(pii);
208 			if (pii_other != NULL)
209 				phyint_inst_delete(pii_other);
210 			phyint_inst_delete(pii);
211 			pii = phyint_inst_init_from_k(af, name);
212 			break;
213 
214 		case PI_DELETED:
215 			/* Phyint instance has disappeared from kernel */
216 			pii->pii_in_use = 0;
217 			break;
218 
219 		case PI_OK:
220 			/* Phyint instance exists and is fine */
221 			pii->pii_in_use = 1;
222 			break;
223 
224 		default:
225 			/* Unknown status */
226 			logerr("pii_process: Unknown status %d\n", err);
227 			break;
228 		}
229 	}
230 
231 	*pii_p = pii;
232 	if (pii != NULL)
233 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
234 	else
235 		return (_B_FALSE);
236 }
237 
238 /*
239  * Scan all interfaces to detect changes as well as new and deleted interfaces
240  */
241 static void
242 initifs()
243 {
244 	int	i, nlifr;
245 	int	af;
246 	char	*cp;
247 	char	*buf;
248 	int	sockfd;
249 	uint64_t	flags;
250 	struct lifnum	lifn;
251 	struct lifconf	lifc;
252 	struct lifreq	lifreq;
253 	struct lifreq	*lifr;
254 	struct logint	*li;
255 	struct phyint_instance *pii;
256 	struct phyint_instance *next_pii;
257 	struct phyint_group *pg, *next_pg;
258 	char		pi_name[LIFNAMSIZ + 1];
259 
260 	if (debug & D_PHYINT)
261 		logdebug("initifs: Scanning interfaces\n");
262 
263 	last_initifs_time = getcurrenttime();
264 
265 	/*
266 	 * Free the existing local address list; we'll build a new list below.
267 	 */
268 	addrlist_free(&localaddrs);
269 
270 	/*
271 	 * Mark the interfaces so that we can find phyints and logints
272 	 * which have disappeared from the kernel. pii_process() and
273 	 * logint_init_from_k() will set {pii,li}_in_use when they find
274 	 * the interface in the kernel. Also, clear dupaddr bit on probe
275 	 * logint. check_addr_unique() will set the dupaddr bit on the
276 	 * probe logint, if the testaddress is not unique.
277 	 */
278 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
279 		pii->pii_in_use = 0;
280 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
281 			li->li_in_use = 0;
282 			if (pii->pii_probe_logint == li)
283 				li->li_dupaddr = 0;
284 		}
285 	}
286 
287 	/*
288 	 * As above, mark groups so that we can detect IPMP interfaces which
289 	 * have been removed from the kernel.  Also, delete the group address
290 	 * list since we'll iteratively recreate it below.
291 	 */
292 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
293 		pg->pg_in_use = _B_FALSE;
294 		addrlist_free(&pg->pg_addrs);
295 	}
296 
297 	lifn.lifn_family = AF_UNSPEC;
298 	lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
299 again:
300 	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
301 		logperror("initifs: ioctl (get interface count)");
302 		return;
303 	}
304 	/*
305 	 * Pad the interface count to detect when additional interfaces have
306 	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
307 	 */
308 	lifn.lifn_count += 4;
309 
310 	if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
311 		logperror("initifs: calloc");
312 		return;
313 	}
314 
315 	lifc.lifc_family = AF_UNSPEC;
316 	lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
317 	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
318 	lifc.lifc_buf = buf;
319 
320 	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
321 		logperror("initifs: ioctl (get interface configuration)");
322 		free(buf);
323 		return;
324 	}
325 
326 	/*
327 	 * If every lifr_req slot is taken, then additional interfaces must
328 	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
329 	 * Recalculate to make sure we didn't miss any interfaces.
330 	 */
331 	nlifr = lifc.lifc_len / sizeof (struct lifreq);
332 	if (nlifr >= lifn.lifn_count) {
333 		free(buf);
334 		goto again;
335 	}
336 
337 	/*
338 	 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the
339 	 * global list of addresses, phyint groups, phyints, and logints.
340 	 */
341 	for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
342 		af = lifr->lifr_addr.ss_family;
343 		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
344 		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);
345 
346 		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
347 			if (errno != ENXIO)
348 				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
349 			continue;
350 		}
351 		flags = lifreq.lifr_flags;
352 
353 		/*
354 		 * If the address is IFF_UP, add it to the local address list.
355 		 * (We ignore addresses that aren't IFF_UP since another node
356 		 * might legitimately have that address IFF_UP.)
357 		 */
358 		if (flags & IFF_UP) {
359 			(void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
360 			    &lifr->lifr_addr);
361 		}
362 
363 		/*
364 		 * If this address is on an IPMP meta-interface, update our
365 		 * phyint_group information (either by recording that group
366 		 * still exists or creating a new group), and track what
367 		 * group the address is part of.
368 		 */
369 		if (flags & IFF_IPMP) {
370 			if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
371 				if (errno != ENXIO)
372 					logperror("initifs: ioctl "
373 					    "(SIOCGLIFGROUPNAME)");
374 				continue;
375 			}
376 
377 			pg = phyint_group_lookup(lifreq.lifr_groupname);
378 			if (pg == NULL) {
379 				pg = phyint_group_create(lifreq.lifr_groupname);
380 				if (pg == NULL) {
381 					logerr("initifs: cannot create group "
382 					    "%s\n", lifreq.lifr_groupname);
383 					continue;
384 				}
385 				phyint_group_insert(pg);
386 			}
387 			pg->pg_in_use = _B_TRUE;
388 
389 			/*
390 			 * Add this to the group's list of data addresses.
391 			 */
392 			if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
393 			    &lifr->lifr_addr)) {
394 				logerr("initifs: insufficient memory to track "
395 				    "data address information for %s\n",
396 				    lifr->lifr_name);
397 			}
398 			continue;
399 		}
400 
401 		/*
402 		 * This isn't an address on an IPMP meta-interface, so it's
403 		 * either on an underlying interface or not related to any
404 		 * group.  Update our phyint and logint information (via
405 		 * pii_process() and logint_init_from_k()) -- but first,
406 		 * convert the logint name to a phyint name so we can call
407 		 * pii_process().
408 		 */
409 		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
410 		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
411 			*cp = '\0';
412 
413 		if (pii_process(af, pi_name, &pii)) {
414 			/* The phyint is fine. So process the logint */
415 			logint_init_from_k(pii, lifr->lifr_name);
416 			check_addr_unique(pii, &lifr->lifr_addr);
417 		}
418 	}
419 	free(buf);
420 
421 	/*
422 	 * Scan for groups, phyints and logints that have disappeared from the
423 	 * kernel, and delete them.
424 	 */
425 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
426 		next_pii = pii->pii_next;
427 		check_if_removed(pii);
428 	}
429 
430 	for (pg = phyint_groups; pg != NULL; pg = next_pg) {
431 		next_pg = pg->pg_next;
432 		if (!pg->pg_in_use) {
433 			phyint_group_delete(pg);
434 			continue;
435 		}
436 		/*
437 		 * Refresh the group's state.  This is necessary since the
438 		 * group's state is defined by the set of usable interfaces in
439 		 * the group, and an interface is considered unusable if all
440 		 * of its addresses are down.  When an address goes down/up,
441 		 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
442 		 */
443 		phyint_group_refresh_state(pg);
444 	}
445 
446 	/*
447 	 * Select a test address for sending probes on each phyint instance
448 	 */
449 	select_test_ifs();
450 
451 	/*
452 	 * Handle link up/down notifications.
453 	 */
454 	process_link_state_changes();
455 }
456 
457 /*
458  * Check that a given test address is unique across all of the interfaces in a
459  * group.  (e.g., IPv6 link-locals may not be inherently unique, and binding
460  * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
461  * Any issues will be reported by check_testconfig().
462  */
463 static void
464 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
465 {
466 	struct phyint		*pi;
467 	struct phyint_group	*pg;
468 	struct in6_addr		addr;
469 	struct phyint_instance	*pii;
470 	struct sockaddr_in	*sin;
471 
472 	if (ss->ss_family == AF_INET) {
473 		sin = (struct sockaddr_in *)ss;
474 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
475 	} else {
476 		assert(ss->ss_family == AF_INET6);
477 		addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
478 	}
479 
480 	/*
481 	 * For anonymous groups, every interface is assumed to be on its own
482 	 * link, so there is no chance of overlapping addresses.
483 	 */
484 	pg = ourpii->pii_phyint->pi_group;
485 	if (pg == phyint_anongroup)
486 		return;
487 
488 	/*
489 	 * Walk the list of phyint instances in the group and check for test
490 	 * addresses matching ours.  Of course, we skip ourself.
491 	 */
492 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
493 		pii = PHYINT_INSTANCE(pi, ss->ss_family);
494 		if (pii == NULL || pii == ourpii ||
495 		    pii->pii_probe_logint == NULL)
496 			continue;
497 
498 		/*
499 		 * If this test address is not unique, set the dupaddr bit.
500 		 */
501 		if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
502 			pii->pii_probe_logint->li_dupaddr = 1;
503 	}
504 }
505 
506 /*
507  * Stop probing an interface.  Called when an interface is offlined.
508  * The probe socket is closed on each interface instance, and the
509  * interface state set to PI_OFFLINE.
510  */
511 void
512 stop_probing(struct phyint *pi)
513 {
514 	struct phyint_instance *pii;
515 
516 	pii = pi->pi_v4;
517 	if (pii != NULL) {
518 		if (pii->pii_probe_sock != -1)
519 			close_probe_socket(pii, _B_TRUE);
520 		pii->pii_probe_logint = NULL;
521 	}
522 
523 	pii = pi->pi_v6;
524 	if (pii != NULL) {
525 		if (pii->pii_probe_sock != -1)
526 			close_probe_socket(pii, _B_TRUE);
527 		pii->pii_probe_logint = NULL;
528 	}
529 
530 	phyint_chstate(pi, PI_OFFLINE);
531 }
532 
533 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
534 
535 /*
536  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
537  * IFF_UP must also be set so that the associated address can be used as a
538  * source address.  Further, we must be able to exchange packets with local
539  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
540  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
541  */
542 static int
543 rate_testflags(uint64_t flags)
544 {
545 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
546 		return (BAD_TESTFLAGS);
547 
548 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
549 		return (BAD_TESTFLAGS);
550 
551 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
552 		return (BEST_TESTFLAGS);
553 
554 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
555 		return (BEST_TESTFLAGS);
556 
557 	return (OK_TESTFLAGS);
558 }
559 
560 /*
561  * Attempt to select a test address for each phyint instance.
562  * Call phyint_inst_sockinit() to complete the initializations.
563  */
564 static void
565 select_test_ifs(void)
566 {
567 	struct phyint		*pi;
568 	struct phyint_instance	*pii;
569 	struct phyint_instance	*next_pii;
570 	struct logint		*li;
571 	struct logint  		*probe_logint;
572 	boolean_t		target_scan_reqd = _B_FALSE;
573 	int			rating;
574 
575 	if (debug & D_PHYINT)
576 		logdebug("select_test_ifs\n");
577 
578 	/*
579 	 * For each phyint instance, do the test address selection
580 	 */
581 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
582 		next_pii = pii->pii_next;
583 		probe_logint = NULL;
584 
585 		/*
586 		 * An interface that is offline should not be probed.
587 		 * IFF_OFFLINE interfaces should always be PI_OFFLINE
588 		 * unless some other entity has set the offline flag.
589 		 */
590 		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
591 			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
592 				logerr("shouldn't be probing offline"
593 				    " interface %s (state is: %u)."
594 				    " Stopping probes.\n",
595 				    pii->pii_phyint->pi_name,
596 				    pii->pii_phyint->pi_state);
597 				stop_probing(pii->pii_phyint);
598 			}
599 			continue;
600 		} else {
601 			/*
602 			 * If something cleared IFF_OFFLINE (e.g., by accident
603 			 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
604 			 * inherently racy), the phyint may still be offline.
605 			 * Just ignore it.
606 			 */
607 			if (pii->pii_phyint->pi_state == PI_OFFLINE)
608 				continue;
609 		}
610 
611 		li = pii->pii_probe_logint;
612 		if (li != NULL) {
613 			/*
614 			 * We've already got a test address; only proceed
615 			 * if it's suboptimal.
616 			 */
617 			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
618 				continue;
619 		}
620 
621 		/*
622 		 * Walk the logints of this phyint instance, and select
623 		 * the best available test address
624 		 */
625 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
626 			/*
627 			 * Skip 0.0.0.0 addresses, as those are never
628 			 * actually usable.
629 			 */
630 			if (pii->pii_af == AF_INET &&
631 			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
632 				continue;
633 
634 			/*
635 			 * Skip any IPv6 logints that are not link-local,
636 			 * since we should always have a link-local address
637 			 * anyway and in6_data() expects link-local replies.
638 			 */
639 			if (pii->pii_af == AF_INET6 &&
640 			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
641 				continue;
642 
643 			/*
644 			 * Rate the testflags. If we've found an optimal
645 			 * match, then break out; otherwise, record the most
646 			 * recent OK one.
647 			 */
648 			rating = rate_testflags(li->li_flags);
649 			if (rating == BAD_TESTFLAGS)
650 				continue;
651 
652 			probe_logint = li;
653 			if (rating == BEST_TESTFLAGS)
654 				break;
655 		}
656 
657 		/*
658 		 * If the probe logint has changed, ditch the old one.
659 		 */
660 		if (pii->pii_probe_logint != NULL &&
661 		    pii->pii_probe_logint != probe_logint) {
662 			if (pii->pii_probe_sock != -1)
663 				close_probe_socket(pii, _B_TRUE);
664 			pii->pii_probe_logint = NULL;
665 		}
666 
667 		if (probe_logint == NULL) {
668 			/*
669 			 * We don't have a test address; zero out the probe
670 			 * stats array since it is no longer relevant.
671 			 * Optimize by checking if it is already zeroed out.
672 			 */
673 			int pr_ndx;
674 
675 			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
676 			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
677 				clear_pii_probe_stats(pii);
678 				reset_crtt_all(pii->pii_phyint);
679 			}
680 			continue;
681 		} else if (probe_logint == pii->pii_probe_logint) {
682 			/*
683 			 * If we didn't find any new test addr, go to the
684 			 * next phyint.
685 			 */
686 			continue;
687 		}
688 
689 		/*
690 		 * The phyint is either being assigned a new testaddr
691 		 * or is being assigned a testaddr for the 1st time.
692 		 * Need to initialize the phyint socket
693 		 */
694 		pii->pii_probe_logint = probe_logint;
695 		if (!phyint_inst_sockinit(pii)) {
696 			if (debug & D_PHYINT) {
697 				logdebug("select_test_ifs: "
698 				    "phyint_sockinit failed\n");
699 			}
700 			phyint_inst_delete(pii);
701 			continue;
702 		}
703 
704 		/*
705 		 * This phyint instance is now enabled for probes; this
706 		 * impacts our state machine in two ways:
707 		 *
708 		 * 1. If we're probe *capable* as well (i.e., we have
709 		 *    probe targets) and the interface is in PI_NOTARGETS,
710 		 *    then transition to PI_RUNNING.
711 		 *
712 		 * 2. If we're not probe capable, and the other phyint
713 		 *    instance is also not probe capable, and we were in
714 		 *    PI_RUNNING, then transition to PI_NOTARGETS.
715 		 *
716 		 * Also see the state diagram in mpd_probe.c.
717 		 */
718 		if (PROBE_CAPABLE(pii)) {
719 			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
720 				phyint_chstate(pii->pii_phyint, PI_RUNNING);
721 		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
722 			if (pii->pii_phyint->pi_state == PI_RUNNING)
723 				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
724 		}
725 
726 		/*
727 		 * If no targets are currently known for this phyint
728 		 * we need to call init_router_targets. Since
729 		 * init_router_targets() initializes the list of targets
730 		 * for all phyints it is done below the loop.
731 		 */
732 		if (pii->pii_targets == NULL)
733 			target_scan_reqd = _B_TRUE;
734 
735 		/*
736 		 * Start the probe timer for this instance.
737 		 */
738 		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
739 			start_timer(pii);
740 			pii->pii_basetime_inited = 1;
741 		}
742 	}
743 
744 	/*
745 	 * Scan the interface list for any interfaces that are PI_FAILED or
746 	 * PI_NOTARGETS but no longer enabled to send probes, and call
747 	 * phyint_check_for_repair() to see if the link state indicates that
748 	 * the interface should be repaired.  Also see the state diagram in
749 	 * mpd_probe.c.
750 	 */
751 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
752 		if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
753 		    (pi->pi_state == PI_FAILED ||
754 		    pi->pi_state == PI_NOTARGETS)) {
755 			phyint_check_for_repair(pi);
756 		}
757 	}
758 
759 	check_testconfig();
760 
761 	/*
762 	 * Try to populate the target list. init_router_targets populates
763 	 * the target list from the routing table. If our target list is
764 	 * still empty, init_host_targets adds host targets based on the
765 	 * host target list of other phyints in the group.
766 	 */
767 	if (target_scan_reqd) {
768 		init_router_targets();
769 		init_host_targets();
770 	}
771 }
772 
773 /*
774  * Check test address configuration, and log notices/errors if appropriate.
775  * Note that this function only logs pre-existing conditions (e.g., that
776  * probe-based failure detection is disabled).
777  */
778 static void
779 check_testconfig(void)
780 {
781 	struct phyint	*pi;
782 	struct logint  	*li;
783 	char		abuf[INET6_ADDRSTRLEN];
784 	int		pri;
785 
786 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
787 		if (pi->pi_flags & IFF_OFFLINE)
788 			continue;
789 
790 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
791 			if (pi->pi_taddrmsg_printed ||
792 			    pi->pi_duptaddrmsg_printed) {
793 				if (pi->pi_duptaddrmsg_printed)
794 					pri = LOG_ERR;
795 				else
796 					pri = LOG_INFO;
797 				logmsg(pri, "Test address now configured on "
798 				    "interface %s; enabling probe-based "
799 				    "failure detection on it\n", pi->pi_name);
800 				pi->pi_taddrmsg_printed = 0;
801 				pi->pi_duptaddrmsg_printed = 0;
802 			}
803 			continue;
804 		}
805 
806 		li = NULL;
807 		if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
808 		    pi->pi_v4->pii_probe_logint->li_dupaddr)
809 			li = pi->pi_v4->pii_probe_logint;
810 
811 		if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
812 		    pi->pi_v6->pii_probe_logint->li_dupaddr)
813 			li = pi->pi_v6->pii_probe_logint;
814 
815 		if (li != NULL && li->li_dupaddr) {
816 			if (pi->pi_duptaddrmsg_printed)
817 				continue;
818 			logerr("Test address %s is not unique in group; "
819 			    "disabling probe-based failure detection on %s\n",
820 			    pr_addr(li->li_phyint_inst->pii_af,
821 			    li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
822 			pi->pi_duptaddrmsg_printed = 1;
823 			continue;
824 		}
825 
826 		if (getcurrentsec() < pi->pi_taddrthresh)
827 			continue;
828 
829 		if (!pi->pi_taddrmsg_printed) {
830 			logtrace("No test address configured on interface %s; "
831 			    "disabling probe-based failure detection on it\n",
832 			    pi->pi_name);
833 			pi->pi_taddrmsg_printed = 1;
834 		}
835 	}
836 }
837 
838 /*
839  * Check phyint group configuration, to detect any inconsistencies,
840  * and log an error message. This is called from runtimeouts every
841  * 20 secs. But the error message is displayed once. If the
842  * consistency is resolved by the admin, a recovery message is displayed
843  * once.
844  */
845 static void
846 check_config(void)
847 {
848 	struct phyint_group *pg;
849 	struct phyint *pi;
850 	boolean_t v4_in_group;
851 	boolean_t v6_in_group;
852 
853 	/*
854 	 * All phyints of a group must be homogeneous to ensure that they can
855 	 * take over for one another.  If any phyint in a group has IPv4
856 	 * plumbed, check that all phyints have IPv4 plumbed.  Do a similar
857 	 * check for IPv6.
858 	 */
859 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
860 		if (pg == phyint_anongroup)
861 			continue;
862 
863 		v4_in_group = _B_FALSE;
864 		v6_in_group = _B_FALSE;
865 		/*
866 		 * 1st pass. Determine if at least 1 phyint in the group
867 		 * has IPv4 plumbed and if so set v4_in_group to true.
868 		 * Repeat similarly for IPv6.
869 		 */
870 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
871 			if (pi->pi_v4 != NULL)
872 				v4_in_group = _B_TRUE;
873 			if (pi->pi_v6 != NULL)
874 				v6_in_group = _B_TRUE;
875 		}
876 
877 		/*
878 		 * 2nd pass. If v4_in_group is true, check that phyint
879 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
880 		 * out a message the 1st time only.
881 		 */
882 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
883 			if (pi->pi_flags & IFF_OFFLINE)
884 				continue;
885 
886 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
887 				if (!pi->pi_cfgmsg_printed) {
888 					logerr("IP interface %s in group %s is"
889 					    " not plumbed for IPv4, affecting"
890 					    " IPv4 connectivity\n",
891 					    pi->pi_name,
892 					    pi->pi_group->pg_name);
893 					pi->pi_cfgmsg_printed = 1;
894 				}
895 			} else if (v6_in_group == _B_TRUE &&
896 			    pi->pi_v6 == NULL) {
897 				if (!pi->pi_cfgmsg_printed) {
898 					logerr("IP interface %s in group %s is"
899 					    " not plumbed for IPv6, affecting"
900 					    " IPv6 connectivity\n",
901 					    pi->pi_name,
902 					    pi->pi_group->pg_name);
903 					pi->pi_cfgmsg_printed = 1;
904 				}
905 			} else {
906 				/*
907 				 * The phyint matches the group configuration,
908 				 * if we have reached this point. If it was
909 				 * improperly configured earlier, log an
910 				 * error recovery message
911 				 */
912 				if (pi->pi_cfgmsg_printed) {
913 					logerr("IP interface %s is now"
914 					    " consistent with group %s "
915 					    " and connectivity is restored\n",
916 					    pi->pi_name, pi->pi_group->pg_name);
917 					pi->pi_cfgmsg_printed = 0;
918 				}
919 			}
920 
921 		}
922 	}
923 }
924 
925 /*
926  * Timer mechanism using relative time (in milliseconds) from the
927  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
928  * will fire after TIMER_INFINITY milliseconds.
929  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
930  * time values. Hence 2 consecutive timer events cannot be spaced farther
931  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
932  * that can be passed for the delay parameter of timer_schedule()
933  */
934 static uint_t timer_next;	/* Currently scheduled timeout */
935 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
936 
937 static void
938 timer_init(void)
939 {
940 	timer_next = getcurrenttime() + TIMER_INFINITY;
941 	/*
942 	 * The call to run_timeouts() will get the timer started
943 	 * Since there are no phyints at this point, the timer will
944 	 * be set for IF_SCAN_INTERVAL ms.
945 	 */
946 	run_timeouts();
947 }
948 
949 /*
950  * Make sure the next SIGALRM occurs delay milliseconds from the current
951  * time if not earlier. We are interested only in time differences.
952  */
953 void
954 timer_schedule(uint_t delay)
955 {
956 	uint_t now;
957 	struct itimerval itimerval;
958 
959 	if (debug & D_TIMER)
960 		logdebug("timer_schedule(%u)\n", delay);
961 
962 	assert(delay <= TIMER_INFINITY);
963 
964 	now = getcurrenttime();
965 	if (delay == 0) {
966 		/* Minimum allowed delay */
967 		delay = 1;
968 	}
969 	/* Will this timer occur before the currently scheduled SIGALRM? */
970 	if (timer_active && TIME_GE(now + delay, timer_next)) {
971 		if (debug & D_TIMER) {
972 			logdebug("timer_schedule(%u) - no action: "
973 			    "now %u next %u\n", delay, now, timer_next);
974 		}
975 		return;
976 	}
977 	timer_next = now + delay;
978 
979 	itimerval.it_value.tv_sec = delay / 1000;
980 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
981 	itimerval.it_interval.tv_sec = 0;
982 	itimerval.it_interval.tv_usec = 0;
983 	if (debug & D_TIMER) {
984 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
985 		    delay, itimerval.it_value.tv_sec,
986 		    itimerval.it_value.tv_usec);
987 	}
988 	timer_active = _B_TRUE;
989 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
990 		logperror("timer_schedule: setitimer");
991 		exit(2);
992 	}
993 }
994 
995 /*
996  * Timer has fired. Determine when the next timer event will occur by asking
997  * all the timer routines. Should not be called from a timer routine.
998  */
999 static void
1000 run_timeouts(void)
1001 {
1002 	uint_t next;
1003 	uint_t next_event_time;
1004 	struct phyint_instance *pii;
1005 	struct phyint_instance *next_pii;
1006 	static boolean_t timeout_running;
1007 
1008 	/* assert that recursive timeouts don't happen. */
1009 	assert(!timeout_running);
1010 
1011 	timeout_running = _B_TRUE;
1012 
1013 	if (debug & D_TIMER)
1014 		logdebug("run_timeouts()\n");
1015 
1016 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1017 		initifs();
1018 		check_config();
1019 	}
1020 
1021 	next = TIMER_INFINITY;
1022 
1023 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1024 		next_pii = pii->pii_next;
1025 		next_event_time = phyint_inst_timer(pii);
1026 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1027 			next = next_event_time;
1028 
1029 		if (debug & D_TIMER) {
1030 			logdebug("run_timeouts(%s %s): next scheduled for"
1031 			    " this phyint inst %u, next scheduled global"
1032 			    " %u ms\n",
1033 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1034 			    next_event_time, next);
1035 		}
1036 	}
1037 
1038 	/*
1039 	 * Make sure initifs() is called at least once every
1040 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1041 	 * with the kernel, in case we have missed any routing
1042 	 * socket messages.
1043 	 */
1044 	if (next > IF_SCAN_INTERVAL)
1045 		next = IF_SCAN_INTERVAL;
1046 
1047 	if (debug & D_TIMER)
1048 		logdebug("run_timeouts: %u ms\n", next);
1049 
1050 	timer_schedule(next);
1051 	timeout_running = _B_FALSE;
1052 }
1053 
1054 static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1055 static int eventpipe_write = -1;
1056 boolean_t cleanup_started = _B_FALSE;	/* true if we're going away */
1057 
1058 /*
1059  * Ensure that signals are processed synchronously with the rest of
1060  * the code by just writing a one character signal number on the pipe.
1061  * The poll loop will pick this up and process the signal event.
1062  */
1063 static void
1064 sig_handler(int signo)
1065 {
1066 	uchar_t buf = (uchar_t)signo;
1067 
1068 	/*
1069 	 * Don't write to pipe if cleanup has already begun. cleanup()
1070 	 * might have closed the pipe already
1071 	 */
1072 	if (cleanup_started)
1073 		return;
1074 
1075 	if (eventpipe_write == -1) {
1076 		logerr("sig_handler: no pipe found\n");
1077 		return;
1078 	}
1079 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1080 		logperror("sig_handler: write");
1081 }
1082 
1083 extern struct probes_missed probes_missed;
1084 
1085 /*
1086  * Pick up a signal "byte" from the pipe and process it.
1087  */
1088 static void
1089 in_signal(int fd)
1090 {
1091 	uchar_t buf;
1092 	uint64_t  sent, acked, lost, unacked, unknown;
1093 	struct phyint_instance *pii;
1094 	int pr_ndx;
1095 
1096 	switch (read(fd, &buf, sizeof (buf))) {
1097 	case -1:
1098 		logperror("in_signal: read");
1099 		exit(1);
1100 		/* NOTREACHED */
1101 	case 1:
1102 		break;
1103 	case 0:
1104 		logerr("in_signal: read end of file\n");
1105 		exit(1);
1106 		/* NOTREACHED */
1107 	default:
1108 		logerr("in_signal: read > 1\n");
1109 		exit(1);
1110 	}
1111 
1112 	if (debug & D_TIMER)
1113 		logdebug("in_signal() got %d\n", buf);
1114 
1115 	switch (buf) {
1116 	case SIGALRM:
1117 		if (debug & D_TIMER) {
1118 			uint_t now = getcurrenttime();
1119 
1120 			logdebug("in_signal(SIGALRM) delta %u\n",
1121 			    now - timer_next);
1122 		}
1123 		timer_active = _B_FALSE;
1124 		run_timeouts();
1125 		break;
1126 	case SIGUSR1:
1127 		logdebug("Printing configuration:\n");
1128 		/* Print out the internal tables */
1129 		phyint_inst_print_all();
1130 
1131 		/*
1132 		 * Print out the accumulated statistics about missed
1133 		 * probes (happens due to scheduling delay).
1134 		 */
1135 		logerr("Missed sending total of %d probes spread over"
1136 		    " %d occurrences\n", probes_missed.pm_nprobes,
1137 		    probes_missed.pm_ntimes);
1138 
1139 		/*
1140 		 * Print out the accumulated statistics about probes
1141 		 * that were sent.
1142 		 */
1143 		for (pii = phyint_instances; pii != NULL;
1144 		    pii = pii->pii_next) {
1145 			unacked = 0;
1146 			acked = pii->pii_cum_stats.acked;
1147 			lost = pii->pii_cum_stats.lost;
1148 			sent = pii->pii_cum_stats.sent;
1149 			unknown = pii->pii_cum_stats.unknown;
1150 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1151 				switch (pii->pii_probes[pr_ndx].pr_status) {
1152 				case PR_ACKED:
1153 					acked++;
1154 					break;
1155 				case PR_LOST:
1156 					lost++;
1157 					break;
1158 				case PR_UNACKED:
1159 					unacked++;
1160 					break;
1161 				}
1162 			}
1163 			logerr("\nProbe stats on (%s %s)\n"
1164 			    "Number of probes sent %lld\n"
1165 			    "Number of probe acks received %lld\n"
1166 			    "Number of probes/acks lost %lld\n"
1167 			    "Number of valid unacknowledged probes %lld\n"
1168 			    "Number of ambiguous probe acks received %lld\n",
1169 			    AF_STR(pii->pii_af), pii->pii_name,
1170 			    sent, acked, lost, unacked, unknown);
1171 		}
1172 		break;
1173 	case SIGHUP:
1174 		logerr("SIGHUP: restart and reread config file\n");
1175 		cleanup();
1176 		(void) execv(argv0[0], argv0);
1177 		_exit(0177);
1178 		/* NOTREACHED */
1179 	case SIGINT:
1180 	case SIGTERM:
1181 	case SIGQUIT:
1182 		cleanup();
1183 		exit(0);
1184 		/* NOTREACHED */
1185 	default:
1186 		logerr("in_signal: unknown signal: %d\n", buf);
1187 	}
1188 }
1189 
1190 static void
1191 cleanup(void)
1192 {
1193 	struct phyint_instance *pii;
1194 	struct phyint_instance *next_pii;
1195 
1196 	/*
1197 	 * Make sure that we don't write to eventpipe in
1198 	 * sig_handler() if any signal notably SIGALRM,
1199 	 * occurs after we close the eventpipe descriptor below
1200 	 */
1201 	cleanup_started = _B_TRUE;
1202 
1203 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1204 		next_pii = pii->pii_next;
1205 		phyint_inst_delete(pii);
1206 	}
1207 
1208 	(void) close(ifsock_v4);
1209 	(void) close(ifsock_v6);
1210 	(void) close(rtsock_v4);
1211 	(void) close(rtsock_v6);
1212 	(void) close(lsock_v4);
1213 	(void) close(lsock_v6);
1214 	(void) close(0);
1215 	(void) close(1);
1216 	(void) close(2);
1217 	(void) close(mibfd);
1218 	(void) close(eventpipe_read);
1219 	(void) close(eventpipe_write);
1220 }
1221 
1222 /*
1223  * Create pipe for signal delivery and set up signal handlers.
1224  */
1225 static void
1226 setup_eventpipe(void)
1227 {
1228 	int fds[2];
1229 	struct sigaction act;
1230 
1231 	if ((pipe(fds)) < 0) {
1232 		logperror("setup_eventpipe: pipe");
1233 		exit(1);
1234 	}
1235 	eventpipe_read = fds[0];
1236 	eventpipe_write = fds[1];
1237 	if (poll_add(eventpipe_read) == -1) {
1238 		exit(1);
1239 	}
1240 
1241 	act.sa_handler = sig_handler;
1242 	act.sa_flags = SA_RESTART;
1243 	(void) sigaction(SIGALRM, &act, NULL);
1244 
1245 	(void) sigset(SIGHUP, sig_handler);
1246 	(void) sigset(SIGUSR1, sig_handler);
1247 	(void) sigset(SIGTERM, sig_handler);
1248 	(void) sigset(SIGINT, sig_handler);
1249 	(void) sigset(SIGQUIT, sig_handler);
1250 }
1251 
1252 /*
1253  * Create a routing socket for receiving RTM_IFINFO messages.
1254  */
1255 static int
1256 setup_rtsock(int af)
1257 {
1258 	int	s;
1259 	int	flags;
1260 	int	aware = RTAW_UNDER_IPMP;
1261 
1262 	s = socket(PF_ROUTE, SOCK_RAW, af);
1263 	if (s == -1) {
1264 		logperror("setup_rtsock: socket PF_ROUTE");
1265 		exit(1);
1266 	}
1267 
1268 	if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) {
1269 		logperror("setup_rtsock: setsockopt RT_AWARE");
1270 		(void) close(s);
1271 		exit(1);
1272 	}
1273 
1274 	if ((flags = fcntl(s, F_GETFL, 0)) < 0) {
1275 		logperror("setup_rtsock: fcntl F_GETFL");
1276 		(void) close(s);
1277 		exit(1);
1278 	}
1279 	if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) {
1280 		logperror("setup_rtsock: fcntl F_SETFL");
1281 		(void) close(s);
1282 		exit(1);
1283 	}
1284 	if (poll_add(s) == -1) {
1285 		(void) close(s);
1286 		exit(1);
1287 	}
1288 	return (s);
1289 }
1290 
1291 /*
1292  * Process an RTM_IFINFO message received on a routing socket.
1293  * The return value indicates whether a full interface scan is required.
1294  * Link up/down notifications are reflected in the IFF_RUNNING flag.
1295  * If just the state of the IFF_RUNNING interface flag has changed, a
1296  * a full interface scan isn't required.
1297  */
1298 static boolean_t
1299 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1300 {
1301 	struct sockaddr_dl *sdl;
1302 	struct phyint *pi;
1303 	uint64_t old_flags;
1304 	struct phyint_instance *pii;
1305 
1306 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1307 
1308 	/*
1309 	 * Although the sockaddr_dl structure is directly after the
1310 	 * if_msghdr_t structure. At the time of writing, the size of the
1311 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1312 	 * to the presence of a timeval structure, which contains longs,
1313 	 * in the if_data structure.  Anyway, we know where the message ends,
1314 	 * so we work backwards to get the start of the sockaddr_dl structure.
1315 	 */
1316 	/*LINTED*/
1317 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1318 	    sizeof (struct sockaddr_dl));
1319 
1320 	assert(sdl->sdl_family == AF_LINK);
1321 
1322 	/*
1323 	 * The interface name is in sdl_data.
1324 	 * RTM_IFINFO messages are only generated for logical interface
1325 	 * zero, so there is no colon and logical interface number to
1326 	 * strip from the name.	 The name is not null terminated, but
1327 	 * there should be enough space in sdl_data to add the null.
1328 	 */
1329 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1330 		if (debug & D_LINKNOTE)
1331 			logdebug("process_rtm_ifinfo: phyint name too long\n");
1332 		return (_B_TRUE);
1333 	}
1334 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1335 
1336 	pi = phyint_lookup(sdl->sdl_data);
1337 	if (pi == NULL) {
1338 		if (debug & D_LINKNOTE)
1339 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1340 			    " for %s\n", sdl->sdl_data);
1341 		return (_B_TRUE);
1342 	}
1343 
1344 	/*
1345 	 * We want to try and avoid doing a full interface scan for
1346 	 * link state notifications from the datalink layer, as indicated
1347 	 * by the state of the IFF_RUNNING flag.  If just the
1348 	 * IFF_RUNNING flag has changed state, the link state changes
1349 	 * are processed without a full scan.
1350 	 * If there is both an IPv4 and IPv6 instance associated with
1351 	 * the physical interface, we will get an RTM_IFINFO message
1352 	 * for each instance.  If we just maintained a single copy of
1353 	 * the physical interface flags, it would appear that no flags
1354 	 * had changed when the second message is processed, leading us
1355 	 * to believe that the message wasn't generated by a flags change,
1356 	 * and that a full interface scan is required.
1357 	 * To get around this problem, two additional copies of the flags
1358 	 * are kept, one copy for each instance.  These are only used in
1359 	 * this routine.  At any one time, all three copies of the flags
1360 	 * should be identical except for the IFF_RUNNING flag.	 The
1361 	 * copy of the flags in the "phyint" structure is always up to
1362 	 * date.
1363 	 */
1364 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1365 	if (pii == NULL) {
1366 		if (debug & D_LINKNOTE)
1367 			logdebug("process_rtm_ifinfo: no instance of address "
1368 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1369 		return (_B_TRUE);
1370 	}
1371 
1372 	old_flags = pii->pii_flags;
1373 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1374 	pi->pi_flags = pii->pii_flags;
1375 
1376 	if (debug & D_LINKNOTE) {
1377 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1378 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1379 		    AF_STR(type), old_flags, pi->pi_flags);
1380 	}
1381 
1382 	/*
1383 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1384 	 * types.
1385 	 */
1386 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
1387 		phyint_changed(pi);
1388 
1389 	/* Has just the IFF_RUNNING flag changed state ? */
1390 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1391 		struct phyint_instance *pii_other;
1392 		/*
1393 		 * It wasn't just a link state change.	Update
1394 		 * the other instance's copy of the flags.
1395 		 */
1396 		pii_other = phyint_inst_other(pii);
1397 		if (pii_other != NULL)
1398 			pii_other->pii_flags = pii->pii_flags;
1399 		return (_B_TRUE);
1400 	}
1401 
1402 	return (_B_FALSE);
1403 }
1404 
1405 /*
1406  * Retrieve as many routing socket messages as possible, and try to
1407  * empty the routing sockets. Initiate full scan of targets or interfaces
1408  * as needed.
1409  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1410  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1411  */
1412 static void
1413 process_rtsock(int rtsock_v4, int rtsock_v6)
1414 {
1415 	int	nbytes;
1416 	int64_t msg[2048 / 8];
1417 	struct rt_msghdr *rtm;
1418 	boolean_t need_if_scan = _B_FALSE;
1419 	boolean_t need_rt_scan = _B_FALSE;
1420 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1421 	int type;
1422 
1423 	/* Read as many messages as possible and try to empty the sockets */
1424 	for (type = AF_INET; ; type = AF_INET6) {
1425 		for (;;) {
1426 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1427 			    rtsock_v6, msg, sizeof (msg));
1428 			if (nbytes <= 0) {
1429 				/* No more messages */
1430 				break;
1431 			}
1432 			rtm = (struct rt_msghdr *)msg;
1433 			if (rtm->rtm_version != RTM_VERSION) {
1434 				logerr("process_rtsock: version %d "
1435 				    "not understood\n", rtm->rtm_version);
1436 				break;
1437 			}
1438 
1439 			if (debug & D_PHYINT) {
1440 				logdebug("process_rtsock: message %d\n",
1441 				    rtm->rtm_type);
1442 			}
1443 
1444 			switch (rtm->rtm_type) {
1445 			case RTM_NEWADDR:
1446 			case RTM_DELADDR:
1447 				/*
1448 				 * Some logical interface has changed,
1449 				 * have to scan everything to determine
1450 				 * what actually changed.
1451 				 */
1452 				need_if_scan = _B_TRUE;
1453 				break;
1454 
1455 			case RTM_IFINFO:
1456 				rtm_ifinfo_seen = _B_TRUE;
1457 				need_if_scan |= process_rtm_ifinfo(
1458 				    (if_msghdr_t *)rtm, type);
1459 				break;
1460 
1461 			case RTM_ADD:
1462 			case RTM_DELETE:
1463 			case RTM_CHANGE:
1464 			case RTM_OLDADD:
1465 			case RTM_OLDDEL:
1466 				need_rt_scan = _B_TRUE;
1467 				break;
1468 
1469 			default:
1470 				/* Not interesting */
1471 				break;
1472 			}
1473 		}
1474 		if (type == AF_INET6)
1475 			break;
1476 	}
1477 
1478 	if (need_if_scan) {
1479 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1480 			logdebug("process_rtsock: synchronizing with kernel\n");
1481 		initifs();
1482 	} else if (rtm_ifinfo_seen) {
1483 		if (debug & D_LINKNOTE)
1484 			logdebug("process_rtsock: "
1485 			    "link up/down notification(s) seen\n");
1486 		process_link_state_changes();
1487 	}
1488 
1489 	if (need_rt_scan)
1490 		init_router_targets();
1491 }
1492 
1493 /*
1494  * Look if the phyint instance or one of its logints have been removed from
1495  * the kernel and take appropriate action.
1496  * Uses {pii,li}_in_use.
1497  */
1498 static void
1499 check_if_removed(struct phyint_instance *pii)
1500 {
1501 	struct logint *li;
1502 	struct logint *next_li;
1503 
1504 	/* Detect phyints that have been removed from the kernel. */
1505 	if (!pii->pii_in_use) {
1506 		logtrace("%s %s has been removed from kernel\n",
1507 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1508 		phyint_inst_delete(pii);
1509 	} else {
1510 		/* Detect logints that have been removed. */
1511 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1512 			next_li = li->li_next;
1513 			if (!li->li_in_use) {
1514 				logint_delete(li);
1515 			}
1516 		}
1517 	}
1518 }
1519 
1520 /*
1521  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1522  * tables defined by mib2.h. Parse the returned data and extract
1523  * the 'routing' information table. Process the 'routing' table
1524  * to get the list of known onlink routers, and update our database.
1525  * These onlink routers will serve as our probe targets.
1526  * Returns false, if any system calls resulted in errors, true otherwise.
1527  */
1528 static boolean_t
1529 update_router_list(int fd)
1530 {
1531 	union {
1532 		char	ubuf[1024];
1533 		union T_primitives uprim;
1534 	} buf;
1535 
1536 	int			flags;
1537 	struct strbuf		ctlbuf;
1538 	struct strbuf		databuf;
1539 	struct T_optmgmt_req	*tor;
1540 	struct T_optmgmt_ack	*toa;
1541 	struct T_error_ack	*tea;
1542 	struct opthdr		*optp;
1543 	struct opthdr		*req;
1544 	int			status;
1545 	t_scalar_t		prim;
1546 
1547 	tor = (struct T_optmgmt_req *)&buf;
1548 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1549 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1550 	tor->OPT_length = sizeof (struct opthdr);
1551 	tor->MGMT_flags = T_CURRENT;
1552 
1553 	/*
1554 	 * Note: we use the special level value below so that IP will return
1555 	 * us information concerning IRE_MARK_TESTHIDDEN routes.
1556 	 */
1557 	req = (struct opthdr *)&tor[1];
1558 	req->level = EXPER_IP_AND_TESTHIDDEN;
1559 	req->name  = 0;
1560 	req->len   = 0;
1561 
1562 	ctlbuf.buf = (char *)&buf;
1563 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1564 	ctlbuf.maxlen = sizeof (buf);
1565 	if (putmsg(fd, &ctlbuf, NULL, 0) == -1) {
1566 		logperror("update_router_list: putmsg(ctl)");
1567 		return (_B_FALSE);
1568 	}
1569 
1570 	/*
1571 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1572 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1573 	 * a control and data part. The control part contains a struct
1574 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1575 	 * the level, name and length of the data in the data part. The
1576 	 * data part contains the actual table data. The last message
1577 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1578 	 * single option with zero optlen.
1579 	 */
1580 
1581 	for (;;) {
1582 		/*
1583 		 * Go around this loop once for each table. Ignore
1584 		 * all tables except the routing information table.
1585 		 */
1586 		flags = 0;
1587 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1588 		if (status < 0) {
1589 			if (errno == EINTR)
1590 				continue;
1591 			logperror("update_router_list: getmsg(ctl)");
1592 			return (_B_FALSE);
1593 		}
1594 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1595 			logerr("update_router_list: ctlbuf.len %d\n",
1596 			    ctlbuf.len);
1597 			return (_B_FALSE);
1598 		}
1599 
1600 		prim = buf.uprim.type;
1601 
1602 		switch (prim) {
1603 
1604 		case T_ERROR_ACK:
1605 			tea = &buf.uprim.error_ack;
1606 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1607 				logerr("update_router_list: T_ERROR_ACK"
1608 				    " ctlbuf.len %d\n", ctlbuf.len);
1609 				return (_B_FALSE);
1610 			}
1611 			logerr("update_router_list: T_ERROR_ACK:"
1612 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1613 			    tea->TLI_error, tea->UNIX_error);
1614 			return (_B_FALSE);
1615 
1616 		case T_OPTMGMT_ACK:
1617 			toa = &buf.uprim.optmgmt_ack;
1618 			optp = (struct opthdr *)&toa[1];
1619 			if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) +
1620 			    sizeof (struct opthdr))) {
1621 				logerr("update_router_list: ctlbuf.len %d\n",
1622 				    ctlbuf.len);
1623 				return (_B_FALSE);
1624 			}
1625 			if (toa->MGMT_flags != T_SUCCESS) {
1626 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1627 				    toa->MGMT_flags);
1628 				return (_B_FALSE);
1629 			}
1630 			break;
1631 
1632 		default:
1633 			logerr("update_router_list: unknown primitive %ld\n",
1634 			    prim);
1635 			return (_B_FALSE);
1636 		}
1637 
1638 		/* Process the T_OPTMGMT_ACK below */
1639 		assert(prim == T_OPTMGMT_ACK);
1640 
1641 		switch (status) {
1642 		case 0:
1643 			/*
1644 			 * We have reached the end of this T_OPTMGMT_ACK
1645 			 * message. If this is the last message i.e EOD,
1646 			 * return, else process the next T_OPTMGMT_ACK msg.
1647 			 */
1648 			if (optp->len == 0 && optp->name == 0 &&
1649 			    optp->level == 0) {
1650 				/*
1651 				 * This is the EOD message. Return
1652 				 */
1653 				return (_B_TRUE);
1654 			}
1655 			continue;
1656 
1657 		case MORECTL:
1658 		case MORECTL | MOREDATA:
1659 			/*
1660 			 * This should not happen. We should be able to read
1661 			 * the control portion in a single getmsg.
1662 			 */
1663 			logerr("update_router_list: MORECTL\n");
1664 			return (_B_FALSE);
1665 
1666 		case MOREDATA:
1667 			databuf.maxlen = optp->len;
1668 			/* malloc of 0 bytes is ok */
1669 			databuf.buf = malloc((size_t)optp->len);
1670 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1671 				logperror("update_router_list: malloc");
1672 				return (_B_FALSE);
1673 			}
1674 			databuf.len = 0;
1675 			flags = 0;
1676 			for (;;) {
1677 				if (getmsg(fd, NULL, &databuf, &flags) >= 0)
1678 					break;
1679 				if (errno == EINTR)
1680 					continue;
1681 
1682 				logperror("update_router_list: getmsg(data)");
1683 				free(databuf.buf);
1684 				return (_B_FALSE);
1685 			}
1686 
1687 			if (optp->level == MIB2_IP &&
1688 			    optp->name == MIB2_IP_ROUTE) {
1689 				/* LINTED */
1690 				ire_process_v4((mib2_ipRouteEntry_t *)
1691 				    databuf.buf, databuf.len);
1692 			} else if (optp->level == MIB2_IP6 &&
1693 			    optp->name == MIB2_IP6_ROUTE) {
1694 				/* LINTED */
1695 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1696 				    databuf.buf, databuf.len);
1697 			}
1698 			free(databuf.buf);
1699 		}
1700 	}
1701 	/* NOTREACHED */
1702 }
1703 
1704 
1705 /*
1706  * Convert octet `octp' to a phyint name and store in `ifname'
1707  */
1708 static void
1709 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize)
1710 {
1711 	char *cp;
1712 	size_t len = MIN(octp->o_length, ifsize - 1);
1713 
1714 	(void) strncpy(ifname, octp->o_bytes, len);
1715 	ifname[len] = '\0';
1716 
1717 	if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL)
1718 		*cp = '\0';
1719 }
1720 
1721 /*
1722  * Examine the IPv4 routing table `buf' for possible targets.  For each
1723  * possible target, if it's on the same subnet an interface route, pass
1724  * it to router_add_common() for further consideration.
1725  */
1726 static void
1727 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1728 {
1729 	char ifname[LIFNAMSIZ];
1730 	mib2_ipRouteEntry_t	*rp, *rp1, *endp;
1731 	struct in_addr		nexthop_v4;
1732 	struct in6_addr		nexthop;
1733 
1734 	if (len == 0)
1735 		return;
1736 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1737 
1738 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1739 
1740 	/*
1741 	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1742 	 * cross-reference them with the interface routes to determine if
1743 	 * they're possible probe targets.
1744 	 */
1745 	for (rp = buf; rp < endp; rp++) {
1746 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1747 			continue;
1748 
1749 		/* Get the nexthop address. */
1750 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1751 
1752 		/*
1753 		 * Rescan the routing table looking for interface routes that
1754 		 * are on the same subnet, and try to add them.  If they're
1755 		 * not relevant (e.g., the interface route isn't part of an
1756 		 * IPMP group, router_add_common() will discard).
1757 		 */
1758 		for (rp1 = buf; rp1 < endp; rp1++) {
1759 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) ||
1760 			    rp1->ipRouteIfIndex.o_length == 0)
1761 				continue;
1762 
1763 			if ((rp1->ipRouteDest & rp1->ipRouteMask) !=
1764 			    (nexthop_v4.s_addr & rp1->ipRouteMask))
1765 				continue;
1766 
1767 			oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ);
1768 			IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1769 			router_add_common(AF_INET, ifname, nexthop);
1770 		}
1771 	}
1772 }
1773 
1774 void
1775 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1776 {
1777 	struct phyint_instance *pii;
1778 	struct phyint *pi;
1779 
1780 	if (debug & D_TARGET)
1781 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1782 
1783 	/*
1784 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1785 	 */
1786 	pii = phyint_inst_lookup(af, ifname);
1787 	if (pii == NULL)
1788 		return;
1789 
1790 	/*
1791 	 * Don't use our own addresses as targets.
1792 	 */
1793 	if (own_address(nexthop))
1794 		return;
1795 
1796 	/*
1797 	 * If the phyint is part a named group, then add the address to all
1798 	 * members of the group; note that this is suboptimal in the IPv4 case
1799 	 * as it has already been added to all matching interfaces in
1800 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1801 	 * itself, since other phyints in the anongroup may not be on the same
1802 	 * subnet.
1803 	 */
1804 	pi = pii->pii_phyint;
1805 	if (pi->pi_group == phyint_anongroup) {
1806 		target_add(pii, nexthop, _B_TRUE);
1807 	} else {
1808 		pi = pi->pi_group->pg_phyint;
1809 		for (; pi != NULL; pi = pi->pi_pgnext)
1810 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1811 	}
1812 }
1813 
1814 /*
1815  * Examine the IPv6 routing table `buf' for possible link-local targets, and
1816  * pass any contenders to router_add_common() for further consideration.
1817  */
1818 static void
1819 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1820 {
1821 	struct lifreq lifr;
1822 	char ifname[LIFNAMSIZ];
1823 	char grname[LIFGRNAMSIZ];
1824 	mib2_ipv6RouteEntry_t *rp, *rp1, *endp;
1825 	struct in6_addr nexthop_v6;
1826 
1827 	if (debug & D_TARGET)
1828 		logdebug("ire_process_v6(len %d)\n", len);
1829 
1830 	if (len == 0)
1831 		return;
1832 
1833 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1834 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1835 
1836 	/*
1837 	 * Scan the routing table entries for any IRE_OFFSUBNET entries, and
1838 	 * cross-reference them with the interface routes to determine if
1839 	 * they're possible probe targets.
1840 	 */
1841 	for (rp = buf; rp < endp; rp++) {
1842 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) ||
1843 		    !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop))
1844 			continue;
1845 
1846 		/* Get the nexthop address. */
1847 		nexthop_v6 = rp->ipv6RouteNextHop;
1848 
1849 		/*
1850 		 * The interface name should always exist for link-locals;
1851 		 * we use it to map this entry to an IPMP group name.
1852 		 */
1853 		if (rp->ipv6RouteIfIndex.o_length == 0)
1854 			continue;
1855 
1856 		oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ);
1857 		if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 ||
1858 		    strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) {
1859 			continue;
1860 		}
1861 
1862 		/*
1863 		 * Rescan the list of routes for interface routes, and add the
1864 		 * above target to any interfaces in the same IPMP group.
1865 		 */
1866 		for (rp1 = buf; rp1 < endp; rp1++) {
1867 			if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) ||
1868 			    rp1->ipv6RouteIfIndex.o_length == 0) {
1869 				continue;
1870 			}
1871 			oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ);
1872 			(void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ);
1873 
1874 			if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 &&
1875 			    strcmp(lifr.lifr_groupname, grname) == 0) {
1876 				router_add_common(AF_INET6, ifname, nexthop_v6);
1877 			}
1878 		}
1879 	}
1880 }
1881 
1882 /*
1883  * Build a list of target routers, by scanning the routing tables.
1884  * It is assumed that interface routes exist, to reach the routers.
1885  */
1886 static void
1887 init_router_targets(void)
1888 {
1889 	struct	target *tg;
1890 	struct	target *next_tg;
1891 	struct	phyint_instance *pii;
1892 	struct	phyint *pi;
1893 
1894 	if (force_mcast)
1895 		return;
1896 
1897 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1898 		pi = pii->pii_phyint;
1899 		/*
1900 		 * Set tg_in_use to false only for router targets.
1901 		 */
1902 		if (!pii->pii_targets_are_routers)
1903 			continue;
1904 
1905 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1906 			tg->tg_in_use = 0;
1907 	}
1908 
1909 	if (mibfd < 0) {
1910 		mibfd = open("/dev/ip", O_RDWR);
1911 		if (mibfd < 0) {
1912 			logperror("mibopen: ip open");
1913 			exit(1);
1914 		}
1915 	}
1916 
1917 	if (!update_router_list(mibfd)) {
1918 		(void) close(mibfd);
1919 		mibfd = -1;
1920 	}
1921 
1922 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1923 		pi = pii->pii_phyint;
1924 		if (!pii->pii_targets_are_routers)
1925 			continue;
1926 
1927 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1928 			next_tg = tg->tg_next;
1929 			/*
1930 			 * If the group has failed, it's likely the route was
1931 			 * removed by an application affected by that failure.
1932 			 * In that case, we keep the target so that we can
1933 			 * reliably repair, at which point we'll refresh the
1934 			 * target list again.
1935 			 */
1936 			if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group))
1937 				target_delete(tg);
1938 		}
1939 	}
1940 }
1941 
1942 /*
1943  * Attempt to assign host targets to any interfaces that do not currently
1944  * have probe targets by sharing targets with other interfaces in the group.
1945  */
1946 static void
1947 init_host_targets(void)
1948 {
1949 	struct phyint_instance *pii;
1950 	struct phyint_group *pg;
1951 
1952 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1953 		pg = pii->pii_phyint->pi_group;
1954 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
1955 			dup_host_targets(pii);
1956 	}
1957 }
1958 
1959 /*
1960  * Duplicate host targets from other phyints of the group to
1961  * the phyint instance 'desired_pii'.
1962  */
1963 static void
1964 dup_host_targets(struct phyint_instance	 *desired_pii)
1965 {
1966 	int af;
1967 	struct phyint *pi;
1968 	struct phyint_instance *pii;
1969 	struct target *tg;
1970 
1971 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
1972 
1973 	af = desired_pii->pii_af;
1974 
1975 	/*
1976 	 * For every phyint in the same group as desired_pii, check if
1977 	 * it has any host targets. If so add them to desired_pii.
1978 	 */
1979 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
1980 		pii = PHYINT_INSTANCE(pi, af);
1981 		/*
1982 		 * We know that we don't have targets on this phyint instance
1983 		 * since we have been called. But we still check for
1984 		 * pii_targets_are_routers because another phyint instance
1985 		 * could have router targets, since IFF_NOFAILOVER addresses
1986 		 * on different phyint instances may belong to different
1987 		 * subnets.
1988 		 */
1989 		if ((pii == NULL) || (pii == desired_pii) ||
1990 		    pii->pii_targets_are_routers)
1991 			continue;
1992 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1993 			target_create(desired_pii, tg->tg_address, _B_FALSE);
1994 		}
1995 	}
1996 }
1997 
/*
 * Print a one-line usage message for command `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	(void) fprintf(stderr, "usage: %s\n", cmd);
}
2003 
2004 
2005 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2006 
2007 /* Get an option from the /etc/default/mpathd file */
2008 static char *
2009 getdefault(char *name)
2010 {
2011 	char namebuf[BUFSIZ];
2012 	char *value = NULL;
2013 
2014 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2015 		char	*cp;
2016 		int	flags;
2017 
2018 		/*
2019 		 * ignore case
2020 		 */
2021 		flags = defcntl(DC_GETFLAGS, 0);
2022 		TURNOFF(flags, DC_CASE);
2023 		(void) defcntl(DC_SETFLAGS, flags);
2024 
2025 		/* Add "=" to the name */
2026 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2027 		(void) strncat(namebuf, "=", 2);
2028 
2029 		if ((cp = defread(namebuf)) != NULL)
2030 			value = strdup(cp);
2031 
2032 		/* close */
2033 		(void) defopen((char *)NULL);
2034 	}
2035 	return (value);
2036 }
2037 
2038 
/*
 * Run-time options.  failback_enabled and track_all_phyints are set by
 * main() from /etc/default/mpathd (FAILBACK and
 * TRACK_INTERFACES_ONLY_WITH_GROUPS); adopt and foreground are set by
 * main() from the command line.
 */
boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
boolean_t	track_all_phyints = _B_FALSE;	/* track all IP interfaces */
/* -a: exit after initialization if there are no phyint instances to track */
static boolean_t adopt = _B_FALSE;
/* -d / -D: do not daemonize; log to stderr instead of syslog */
static boolean_t foreground = _B_FALSE;
2046 
2047 int
2048 main(int argc, char *argv[])
2049 {
2050 	int i;
2051 	int c;
2052 	struct phyint *pi;
2053 	struct phyint_instance *pii;
2054 	char *value;
2055 
2056 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2057 	srandom(gethostid());	/* Initialize the random number generator */
2058 
2059 	/*
2060 	 * NOTE: The messages output by in.mpathd are not suitable for
2061 	 * translation, so we do not call textdomain().
2062 	 */
2063 	(void) setlocale(LC_ALL, "");
2064 
2065 	/*
2066 	 * Get the user specified value of 'failure detection time'
2067 	 * from /etc/default/mpathd
2068 	 */
2069 	value = getdefault("FAILURE_DETECTION_TIME");
2070 	if (value != NULL) {
2071 		user_failure_detection_time =
2072 		    (int)strtol((char *)value, NULL, 0);
2073 
2074 		if (user_failure_detection_time <= 0) {
2075 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2076 			logerr("Invalid failure detection time %s, assuming "
2077 			    "default of %d ms\n", value,
2078 			    user_failure_detection_time);
2079 
2080 		} else if (user_failure_detection_time <
2081 		    MIN_FAILURE_DETECTION_TIME) {
2082 			user_failure_detection_time =
2083 			    MIN_FAILURE_DETECTION_TIME;
2084 			logerr("Too small failure detection time of %s, "
2085 			    "assuming minimum of %d ms\n", value,
2086 			    user_failure_detection_time);
2087 		}
2088 		free(value);
2089 	} else {
2090 		/* User has not specified the parameter, Use default value */
2091 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2092 	}
2093 
2094 	/*
2095 	 * This gives the frequency at which probes will be sent.
2096 	 * When fdt ms elapses, we should be able to determine
2097 	 * whether 5 consecutive probes have failed or not.
2098 	 * 1 probe will be sent in every user_probe_interval ms,
2099 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2100 	 * user_probe_interval. Thus when we send out probe 'n' we
2101 	 * can be sure that probe 'n - 2' is lost, if we have not
2102 	 * got the ack. (since the probe interval is > crtt). But
2103 	 * probe 'n - 1' may be a valid unacked probe, since the
2104 	 * time between 2 successive probes could be as small as
2105 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2106 	 */
2107 	user_probe_interval = user_failure_detection_time /
2108 	    (NUM_PROBE_FAILS + 2);
2109 
2110 	/*
2111 	 * Get the user specified value of failback_enabled from
2112 	 * /etc/default/mpathd
2113 	 */
2114 	value = getdefault("FAILBACK");
2115 	if (value != NULL) {
2116 		if (strcasecmp(value, "yes") == 0)
2117 			failback_enabled = _B_TRUE;
2118 		else if (strcasecmp(value, "no") == 0)
2119 			failback_enabled = _B_FALSE;
2120 		else
2121 			logerr("Invalid value for FAILBACK %s\n", value);
2122 		free(value);
2123 	} else {
2124 		failback_enabled = _B_TRUE;
2125 	}
2126 
2127 	/*
2128 	 * Get the user specified value of track_all_phyints from
2129 	 * /etc/default/mpathd. The sense is reversed in
2130 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2131 	 */
2132 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2133 	if (value != NULL) {
2134 		if (strcasecmp(value, "yes") == 0)
2135 			track_all_phyints = _B_FALSE;
2136 		else if (strcasecmp(value, "no") == 0)
2137 			track_all_phyints = _B_TRUE;
2138 		else
2139 			logerr("Invalid value for "
2140 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2141 		free(value);
2142 	} else {
2143 		track_all_phyints = _B_FALSE;
2144 	}
2145 
2146 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2147 		switch (c) {
2148 		case 'a':
2149 			adopt = _B_TRUE;
2150 			break;
2151 		case 'm':
2152 			force_mcast = _B_TRUE;
2153 			break;
2154 		case 'd':
2155 			debug = D_ALL;
2156 			foreground = _B_TRUE;
2157 			break;
2158 		case 'D':
2159 			i = (int)strtol(optarg, NULL, 0);
2160 			if (i == 0) {
2161 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2162 				    optarg);
2163 				exit(1);
2164 			}
2165 			debug |= i;
2166 			foreground = _B_TRUE;
2167 			break;
2168 		case 'l':
2169 			/*
2170 			 * Turn off link state notification handling.
2171 			 * Undocumented command line flag, for debugging
2172 			 * purposes.
2173 			 */
2174 			handle_link_notifications = _B_FALSE;
2175 			break;
2176 		default:
2177 			usage(argv[0]);
2178 			exit(1);
2179 		}
2180 	}
2181 
2182 	/*
2183 	 * The sockets for the loopback command interface should be listening
2184 	 * before we fork and exit in daemonize(). This way, whoever started us
2185 	 * can use the loopback interface as soon as they get a zero exit
2186 	 * status.
2187 	 */
2188 	lsock_v4 = setup_listener(AF_INET);
2189 	lsock_v6 = setup_listener(AF_INET6);
2190 
2191 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2192 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2193 		exit(1);
2194 	}
2195 
2196 	if (!foreground) {
2197 		if (!daemonize()) {
2198 			logerr("cannot daemonize\n");
2199 			exit(EXIT_FAILURE);
2200 		}
2201 		initlog();
2202 	}
2203 
2204 	/*
2205 	 * Initializations:
2206 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2207 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2208 	 * 2. Initialize a pipe for handling/recording signal events.
2209 	 * 3. Create the routing sockets,  used for listening
2210 	 *    to routing / interface changes.
2211 	 * 4. phyint_init() - Initialize physical interface state
2212 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2213 	 *    which timer_init() does indirectly.
2214 	 * 5. timer_init()  - Initialize timer related stuff
2215 	 * 6. initifs() - Initialize our database of all known interfaces
2216 	 * 7. init_router_targets() - Initialize our database of all known
2217 	 *    router targets.
2218 	 */
2219 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2220 	if (ifsock_v4 < 0) {
2221 		logperror("main: IPv4 socket open");
2222 		exit(1);
2223 	}
2224 
2225 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2226 	if (ifsock_v6 < 0) {
2227 		logperror("main: IPv6 socket open");
2228 		exit(1);
2229 	}
2230 
2231 	setup_eventpipe();
2232 
2233 	rtsock_v4 = setup_rtsock(AF_INET);
2234 	rtsock_v6 = setup_rtsock(AF_INET6);
2235 
2236 	if (phyint_init() == -1) {
2237 		logerr("cannot initialize physical interface structures");
2238 		exit(1);
2239 	}
2240 
2241 	timer_init();
2242 
2243 	initifs();
2244 
2245 	/*
2246 	 * If we're operating in "adopt" mode and no interfaces need to be
2247 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2248 	 * interfaces are subsequently put into multipathing groups).
2249 	 */
2250 	if (adopt && phyint_instances == NULL)
2251 		exit(0);
2252 
2253 	/*
2254 	 * Main body. Keep listening for activity on any of the sockets
2255 	 * that we are monitoring and take appropriate action as necessary.
2256 	 * signals are also handled synchronously.
2257 	 */
2258 	for (;;) {
2259 		if (poll(pollfds, pollfd_num, -1) < 0) {
2260 			if (errno == EINTR)
2261 				continue;
2262 			logperror("main: poll");
2263 			exit(1);
2264 		}
2265 		for (i = 0; i < pollfd_num; i++) {
2266 			if ((pollfds[i].fd == -1) ||
2267 			    !(pollfds[i].revents & POLLIN))
2268 				continue;
2269 			if (pollfds[i].fd == eventpipe_read) {
2270 				in_signal(eventpipe_read);
2271 				break;
2272 			}
2273 			if (pollfds[i].fd == rtsock_v4 ||
2274 			    pollfds[i].fd == rtsock_v6) {
2275 				process_rtsock(rtsock_v4, rtsock_v6);
2276 				break;
2277 			}
2278 
2279 			for (pii = phyint_instances; pii != NULL;
2280 			    pii = pii->pii_next) {
2281 				if (pollfds[i].fd == pii->pii_probe_sock) {
2282 					if (pii->pii_af == AF_INET)
2283 						in_data(pii);
2284 					else
2285 						in6_data(pii);
2286 					break;
2287 				}
2288 			}
2289 
2290 			for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2291 				if (pi->pi_notes != 0 &&
2292 				    pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
2293 					(void) dlpi_recv(pi->pi_dh, NULL, NULL,
2294 					    NULL, NULL, 0, NULL);
2295 					break;
2296 				}
2297 			}
2298 
2299 			if (pollfds[i].fd == lsock_v4)
2300 				loopback_cmd(lsock_v4, AF_INET);
2301 			else if (pollfds[i].fd == lsock_v6)
2302 				loopback_cmd(lsock_v6, AF_INET6);
2303 		}
2304 	}
2305 	/* NOTREACHED */
2306 	return (EXIT_SUCCESS);
2307 }
2308 
2309 static int
2310 setup_listener(int af)
2311 {
2312 	int sock;
2313 	int on;
2314 	int len;
2315 	int ret;
2316 	struct sockaddr_storage laddr;
2317 	struct sockaddr_in  *sin;
2318 	struct sockaddr_in6 *sin6;
2319 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2320 
2321 	assert(af == AF_INET || af == AF_INET6);
2322 
2323 	sock = socket(af, SOCK_STREAM, 0);
2324 	if (sock < 0) {
2325 		logperror("setup_listener: socket");
2326 		exit(1);
2327 	}
2328 
2329 	on = 1;
2330 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2331 	    sizeof (on)) < 0) {
2332 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2333 		exit(1);
2334 	}
2335 
2336 	bzero(&laddr, sizeof (laddr));
2337 	laddr.ss_family = af;
2338 
2339 	if (af == AF_INET) {
2340 		sin = (struct sockaddr_in *)&laddr;
2341 		sin->sin_port = htons(MPATHD_PORT);
2342 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2343 		len = sizeof (struct sockaddr_in);
2344 	} else {
2345 		sin6 = (struct sockaddr_in6 *)&laddr;
2346 		sin6->sin6_port = htons(MPATHD_PORT);
2347 		sin6->sin6_addr = loopback_addr;
2348 		len = sizeof (struct sockaddr_in6);
2349 	}
2350 
2351 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2352 	if (ret < 0) {
2353 		if (errno == EADDRINUSE) {
2354 			/*
2355 			 * Another instance of mpathd may be already active.
2356 			 */
2357 			logerr("main: is another instance of in.mpathd "
2358 			    "already active?\n");
2359 			exit(1);
2360 		} else {
2361 			(void) close(sock);
2362 			return (-1);
2363 		}
2364 	}
2365 	if (listen(sock, 30) < 0) {
2366 		logperror("main: listen");
2367 		exit(1);
2368 	}
2369 	if (poll_add(sock) == -1) {
2370 		(void) close(sock);
2371 		exit(1);
2372 	}
2373 
2374 	return (sock);
2375 }
2376 
/*
 * Table of commands and their expected size; used by loopback_cmd().
 *
 * loopback_cmd() indexes this table directly with the command id received
 * from the client (after checking that it is below MI_NCMD), so the entries
 * are presumably required to be in the same order as the MI_* command
 * values -- verify against the header that defines them.
 */
static struct {
	const char	*name;	/* command name, used in log messages */
	unsigned int	size;	/* minimum number of bytes that must be read */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2389 
2390 /*
2391  * Commands received over the loopback interface come here (via libipmp).
2392  */
2393 static void
2394 loopback_cmd(int sock, int family)
2395 {
2396 	int newfd;
2397 	ssize_t len;
2398 	boolean_t is_priv = _B_FALSE;
2399 	struct sockaddr_storage	peer;
2400 	struct sockaddr_in	*peer_sin;
2401 	struct sockaddr_in6	*peer_sin6;
2402 	socklen_t peerlen;
2403 	union mi_commands mpi;
2404 	char abuf[INET6_ADDRSTRLEN];
2405 	uint_t cmd;
2406 	int retval;
2407 
2408 	peerlen = sizeof (peer);
2409 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2410 	if (newfd < 0) {
2411 		logperror("loopback_cmd: accept");
2412 		return;
2413 	}
2414 
2415 	switch (family) {
2416 	case AF_INET:
2417 		/*
2418 		 * Validate the address and port to make sure that
2419 		 * non privileged processes don't connect and start
2420 		 * talking to us.
2421 		 */
2422 		if (peerlen != sizeof (struct sockaddr_in)) {
2423 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2424 			(void) close(newfd);
2425 			return;
2426 		}
2427 		peer_sin = (struct sockaddr_in *)&peer;
2428 		is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED;
2429 		(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2430 		    abuf, sizeof (abuf));
2431 
2432 		if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) {
2433 			logerr("Attempt to connect from addr %s port %d\n",
2434 			    abuf, ntohs(peer_sin->sin_port));
2435 			(void) close(newfd);
2436 			return;
2437 		}
2438 		break;
2439 
2440 	case AF_INET6:
2441 		if (peerlen != sizeof (struct sockaddr_in6)) {
2442 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2443 			(void) close(newfd);
2444 			return;
2445 		}
2446 		/*
2447 		 * Validate the address and port to make sure that
2448 		 * non privileged processes don't connect and start
2449 		 * talking to us.
2450 		 */
2451 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2452 		is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED;
2453 		(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2454 		    sizeof (abuf));
2455 		if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) {
2456 			logerr("Attempt to connect from addr %s port %d\n",
2457 			    abuf, ntohs(peer_sin6->sin6_port));
2458 			(void) close(newfd);
2459 			return;
2460 		}
2461 
2462 	default:
2463 		logdebug("loopback_cmd: family %d\n", family);
2464 		(void) close(newfd);
2465 		return;
2466 	}
2467 
2468 	/*
2469 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2470 	 * all supported commands
2471 	 */
2472 	len = read(newfd, &mpi, sizeof (mpi));
2473 
2474 	/*
2475 	 * In theory, we can receive any sized message for a stream socket,
2476 	 * but we don't expect that to happen for a small message over a
2477 	 * loopback connection.
2478 	 */
2479 	if (len < sizeof (uint32_t)) {
2480 		logerr("loopback_cmd: bad command format or read returns "
2481 		    "partial data %d\n", len);
2482 		(void) close(newfd);
2483 		return;
2484 	}
2485 
2486 	cmd = mpi.mi_command;
2487 	if (cmd >= MI_NCMD) {
2488 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2489 		(void) close(newfd);
2490 		return;
2491 	}
2492 
2493 	/*
2494 	 * Only MI_PING and MI_QUERY can come from unprivileged sources.
2495 	 */
2496 	if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) {
2497 		logerr("Unprivileged request from %s for privileged "
2498 		    "command %s\n", abuf, commands[cmd].name);
2499 		(void) close(newfd);
2500 		return;
2501 	}
2502 
2503 	if (len < commands[cmd].size) {
2504 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2505 		    commands[cmd].name, commands[cmd].size, len);
2506 		(void) close(newfd);
2507 		return;
2508 	}
2509 
2510 	retval = process_cmd(newfd, &mpi);
2511 	if (retval != IPMP_SUCCESS) {
2512 		logerr("failed processing %s: %s\n", commands[cmd].name,
2513 		    ipmp_errmsg(retval));
2514 	}
2515 	(void) close(newfd);
2516 }
2517 
2518 /*
2519  * Process the commands received via libipmp.
2520  */
2521 static unsigned int
2522 process_cmd(int newfd, union mi_commands *mpi)
2523 {
2524 	struct phyint *pi;
2525 	struct mi_offline *mio;
2526 	struct mi_undo_offline *miu;
2527 	unsigned int retval;
2528 
2529 	switch (mpi->mi_command) {
2530 	case MI_PING:
2531 		return (send_result(newfd, IPMP_SUCCESS, 0));
2532 
2533 	case MI_OFFLINE:
2534 		mio = &mpi->mi_ocmd;
2535 
2536 		pi = phyint_lookup(mio->mio_ifname);
2537 		if (pi == NULL)
2538 			return (send_result(newfd, IPMP_EUNKIF, 0));
2539 
2540 		retval = phyint_offline(pi, mio->mio_min_redundancy);
2541 		if (retval == IPMP_FAILURE)
2542 			return (send_result(newfd, IPMP_FAILURE, errno));
2543 
2544 		return (send_result(newfd, retval, 0));
2545 
2546 	case MI_UNDO_OFFLINE:
2547 		miu = &mpi->mi_ucmd;
2548 
2549 		pi = phyint_lookup(miu->miu_ifname);
2550 		if (pi == NULL)
2551 			return (send_result(newfd, IPMP_EUNKIF, 0));
2552 
2553 		retval = phyint_undo_offline(pi);
2554 		if (retval == IPMP_FAILURE)
2555 			return (send_result(newfd, IPMP_FAILURE, errno));
2556 
2557 		return (send_result(newfd, retval, 0));
2558 
2559 	case MI_QUERY:
2560 		return (process_query(newfd, &mpi->mi_qcmd));
2561 
2562 	default:
2563 		break;
2564 	}
2565 
2566 	return (send_result(newfd, IPMP_EPROTO, 0));
2567 }
2568 
2569 /*
2570  * Process the query request pointed to by `miq' and send a reply on file
2571  * descriptor `fd'.  Returns an IPMP error code.
2572  */
2573 static unsigned int
2574 process_query(int fd, mi_query_t *miq)
2575 {
2576 	ipmp_addrinfo_t		*adinfop;
2577 	ipmp_addrinfolist_t	*adlp;
2578 	ipmp_groupinfo_t	*grinfop;
2579 	ipmp_groupinfolist_t	*grlp;
2580 	ipmp_grouplist_t	*grlistp;
2581 	ipmp_ifinfo_t		*ifinfop;
2582 	ipmp_ifinfolist_t	*iflp;
2583 	ipmp_snap_t		*snap;
2584 	unsigned int		retval;
2585 
2586 	switch (miq->miq_inforeq) {
2587 	case IPMP_ADDRINFO:
2588 		retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr,
2589 		    &adinfop);
2590 		if (retval != IPMP_SUCCESS)
2591 			return (send_result(fd, retval, errno));
2592 
2593 		retval = send_result(fd, IPMP_SUCCESS, 0);
2594 		if (retval == IPMP_SUCCESS)
2595 			retval = send_addrinfo(fd, adinfop);
2596 
2597 		ipmp_freeaddrinfo(adinfop);
2598 		return (retval);
2599 
2600 	case IPMP_GROUPLIST:
2601 		retval = getgrouplist(&grlistp);
2602 		if (retval != IPMP_SUCCESS)
2603 			return (send_result(fd, retval, errno));
2604 
2605 		retval = send_result(fd, IPMP_SUCCESS, 0);
2606 		if (retval == IPMP_SUCCESS)
2607 			retval = send_grouplist(fd, grlistp);
2608 
2609 		ipmp_freegrouplist(grlistp);
2610 		return (retval);
2611 
2612 	case IPMP_GROUPINFO:
2613 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2614 		retval = getgroupinfo(miq->miq_grname, &grinfop);
2615 		if (retval != IPMP_SUCCESS)
2616 			return (send_result(fd, retval, errno));
2617 
2618 		retval = send_result(fd, IPMP_SUCCESS, 0);
2619 		if (retval == IPMP_SUCCESS)
2620 			retval = send_groupinfo(fd, grinfop);
2621 
2622 		ipmp_freegroupinfo(grinfop);
2623 		return (retval);
2624 
2625 	case IPMP_IFINFO:
2626 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2627 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2628 		if (retval != IPMP_SUCCESS)
2629 			return (send_result(fd, retval, errno));
2630 
2631 		retval = send_result(fd, IPMP_SUCCESS, 0);
2632 		if (retval == IPMP_SUCCESS)
2633 			retval = send_ifinfo(fd, ifinfop);
2634 
2635 		ipmp_freeifinfo(ifinfop);
2636 		return (retval);
2637 
2638 	case IPMP_SNAP:
2639 		/*
2640 		 * Before taking the snapshot, sync with the kernel.
2641 		 */
2642 		initifs();
2643 
2644 		retval = getsnap(&snap);
2645 		if (retval != IPMP_SUCCESS)
2646 			return (send_result(fd, retval, errno));
2647 
2648 		retval = send_result(fd, IPMP_SUCCESS, 0);
2649 		if (retval != IPMP_SUCCESS)
2650 			goto out;
2651 
2652 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2653 		if (retval != IPMP_SUCCESS)
2654 			goto out;
2655 
2656 		retval = send_grouplist(fd, snap->sn_grlistp);
2657 		if (retval != IPMP_SUCCESS)
2658 			goto out;
2659 
2660 		iflp = snap->sn_ifinfolistp;
2661 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2662 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2663 			if (retval != IPMP_SUCCESS)
2664 				goto out;
2665 		}
2666 
2667 		grlp = snap->sn_grinfolistp;
2668 		for (; grlp != NULL; grlp = grlp->grl_next) {
2669 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2670 			if (retval != IPMP_SUCCESS)
2671 				goto out;
2672 		}
2673 
2674 		adlp = snap->sn_adinfolistp;
2675 		for (; adlp != NULL; adlp = adlp->adl_next) {
2676 			retval = send_addrinfo(fd, adlp->adl_adinfop);
2677 			if (retval != IPMP_SUCCESS)
2678 				goto out;
2679 		}
2680 	out:
2681 		ipmp_snap_free(snap);
2682 		return (retval);
2683 
2684 	default:
2685 		break;
2686 
2687 	}
2688 	return (send_result(fd, IPMP_EPROTO, 0));
2689 }
2690 
2691 /*
2692  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2693  * Returns an IPMP error code.
2694  */
2695 static unsigned int
2696 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2697 {
2698 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2699 	ipmp_addrlist_t	*adlistp = grinfop->gr_adlistp;
2700 	unsigned int	retval;
2701 
2702 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2703 	if (retval != IPMP_SUCCESS)
2704 		return (retval);
2705 
2706 	retval = ipmp_writetlv(fd, IPMP_IFLIST,
2707 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp);
2708 	if (retval != IPMP_SUCCESS)
2709 		return (retval);
2710 
2711 	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2712 	    IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp));
2713 }
2714 
2715 /*
2716  * Send the interface information pointed to by `ifinfop' on file descriptor
2717  * `fd'.  Returns an IPMP error code.
2718  */
2719 static unsigned int
2720 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2721 {
2722 	ipmp_addrlist_t	*adlist4p = ifinfop->if_targinfo4.it_targlistp;
2723 	ipmp_addrlist_t	*adlist6p = ifinfop->if_targinfo6.it_targlistp;
2724 	unsigned int	retval;
2725 
2726 	retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop);
2727 	if (retval != IPMP_SUCCESS)
2728 		return (retval);
2729 
2730 	retval = ipmp_writetlv(fd, IPMP_ADDRLIST,
2731 	    IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p);
2732 	if (retval != IPMP_SUCCESS)
2733 		return (retval);
2734 
2735 	return (ipmp_writetlv(fd, IPMP_ADDRLIST,
2736 	    IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p));
2737 }
2738 
2739 /*
2740  * Send the address information pointed to by `adinfop' on file descriptor
2741  * `fd'.  Returns an IPMP error code.
2742  */
2743 static unsigned int
2744 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop)
2745 {
2746 	return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop));
2747 }
2748 
2749 /*
2750  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2751  * Returns an IPMP error code.
2752  */
2753 static unsigned int
2754 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2755 {
2756 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2757 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2758 }
2759 
2760 /*
2761  * Initialize an mi_result_t structure using `error' and `syserror' and
2762  * send it on file descriptor `fd'.  Returns an IPMP error code.
2763  */
2764 static unsigned int
2765 send_result(int fd, unsigned int error, int syserror)
2766 {
2767 	mi_result_t me;
2768 
2769 	me.me_mpathd_error = error;
2770 	if (error == IPMP_FAILURE)
2771 		me.me_sys_error = syserror;
2772 	else
2773 		me.me_sys_error = 0;
2774 
2775 	return (ipmp_write(fd, &me, sizeof (me)));
2776 }
2777 
2778 /*
2779  * Daemonize the process.
2780  */
2781 static boolean_t
2782 daemonize(void)
2783 {
2784 	switch (fork()) {
2785 	case -1:
2786 		return (_B_FALSE);
2787 
2788 	case  0:
2789 		/*
2790 		 * Lose our controlling terminal, and become both a session
2791 		 * leader and a process group leader.
2792 		 */
2793 		if (setsid() == -1)
2794 			return (_B_FALSE);
2795 
2796 		/*
2797 		 * Under POSIX, a session leader can accidentally (through
2798 		 * open(2)) acquire a controlling terminal if it does not
2799 		 * have one.  Just to be safe, fork() again so we are not a
2800 		 * session leader.
2801 		 */
2802 		switch (fork()) {
2803 		case -1:
2804 			return (_B_FALSE);
2805 
2806 		case 0:
2807 			(void) chdir("/");
2808 			(void) umask(022);
2809 			(void) fdwalk(closefunc, NULL);
2810 			break;
2811 
2812 		default:
2813 			_exit(EXIT_SUCCESS);
2814 		}
2815 		break;
2816 
2817 	default:
2818 		_exit(EXIT_SUCCESS);
2819 	}
2820 
2821 	return (_B_TRUE);
2822 }
2823 
2824 /*
2825  * The parent has created some fds before forking on purpose, keep them open.
2826  */
2827 static int
2828 closefunc(void *not_used, int fd)
2829 /* ARGSUSED */
2830 {
2831 	if (fd != lsock_v4 && fd != lsock_v6)
2832 		(void) close(fd);
2833 	return (0);
2834 }
2835 
2836 /* LOGGER */
2837 
2838 #include <syslog.h>
2839 
2840 /*
2841  * Logging routines.  All routines log to syslog, unless the daemon is
2842  * running in the foreground, in which case the logging goes to stderr.
2843  *
2844  * The following routines are available:
2845  *
2846  *	logdebug(): A printf-like function for outputting debug messages
2847  *	(messages at LOG_DEBUG) that are only of use to developers.
2848  *
2849  *	logtrace(): A printf-like function for outputting tracing messages
2850  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2851  *	to log the receipt of interesting network-related conditions.
2852  *
2853  *	logerr(): A printf-like function for outputting error messages
2854  *	(messages at LOG_ERR) from the daemon.
2855  *
2856  *	logperror*(): A set of functions used to output error messages
2857  *	(messages at LOG_ERR); these automatically append strerror(errno)
2858  *	and a newline to the message passed to them.
2859  *
2860  * NOTE: since the logging functions write to syslog, the messages passed
2861  *	 to them are not eligible for localization.  Thus, gettext() must
2862  *	 *not* be used.
2863  */
2864 
/* Non-zero once initlog() has run; the log routines then use syslog. */
static int logging = 0;

/*
 * Switch the logging routines from stderr to syslog, logging as
 * "in.mpathd" to the daemon facility with the process id included.
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID, LOG_DAEMON);
}
2873 
2874 /* PRINTFLIKE2 */
2875 void
2876 logmsg(int pri, const char *fmt, ...)
2877 {
2878 	va_list ap;
2879 
2880 	va_start(ap, fmt);
2881 
2882 	if (logging)
2883 		vsyslog(pri, fmt, ap);
2884 	else
2885 		(void) vfprintf(stderr, fmt, ap);
2886 	va_end(ap);
2887 }
2888 
2889 /* PRINTFLIKE1 */
2890 void
2891 logperror(const char *str)
2892 {
2893 	if (logging)
2894 		syslog(LOG_ERR, "%s: %m\n", str);
2895 	else
2896 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
2897 }
2898 
2899 void
2900 logperror_pii(struct phyint_instance *pii, const char *str)
2901 {
2902 	if (logging) {
2903 		syslog(LOG_ERR, "%s (%s %s): %m\n",
2904 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
2905 	} else {
2906 		(void) fprintf(stderr, "%s (%s %s): %s\n",
2907 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
2908 		    strerror(errno));
2909 	}
2910 }
2911 
2912 void
2913 logperror_li(struct logint *li, const char *str)
2914 {
2915 	struct	phyint_instance	*pii = li->li_phyint_inst;
2916 
2917 	if (logging) {
2918 		syslog(LOG_ERR, "%s (%s %s): %m\n",
2919 		    str, AF_STR(pii->pii_af), li->li_name);
2920 	} else {
2921 		(void) fprintf(stderr, "%s (%s %s): %s\n",
2922 		    str, AF_STR(pii->pii_af), li->li_name,
2923 		    strerror(errno));
2924 	}
2925 }
2926 
2927 void
2928 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
2929 {
2930 	if (polled)
2931 		(void) poll_remove(pii->pii_probe_sock);
2932 	(void) close(pii->pii_probe_sock);
2933 	pii->pii_probe_sock = -1;
2934 	pii->pii_basetime_inited = 0;
2935 }
2936 
2937 boolean_t
2938 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags,
2939     struct sockaddr_storage *ssp)
2940 {
2941 	addrlist_t *addrp;
2942 
2943 	if ((addrp = malloc(sizeof (addrlist_t))) == NULL)
2944 		return (_B_FALSE);
2945 
2946 	(void) strlcpy(addrp->al_name, name, LIFNAMSIZ);
2947 	addrp->al_flags = flags;
2948 	addrp->al_addr = *ssp;
2949 	addrp->al_next = *addrsp;
2950 	*addrsp = addrp;
2951 	return (_B_TRUE);
2952 }
2953 
2954 void
2955 addrlist_free(addrlist_t **addrsp)
2956 {
2957 	addrlist_t *addrp, *next_addrp;
2958 
2959 	for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) {
2960 		next_addrp = addrp->al_next;
2961 		free(addrp);
2962 	}
2963 	*addrsp = NULL;
2964 }
2965