xref: /titanic_52/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 3c4993fb5a74112f361d71dab20997bdc749a7fb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
/*
 * Daemon-wide state.  Variables without "static" are visible to the other
 * in.mpathd source files.
 */
int debug = 0;				/* Debug flag; bitmask of D_* values */
					/* Allocated slots in pollfds[], */
					/* including free ones (fd == -1) */
static int pollfd_num = 0;		/* Num. of poll descriptors */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */

					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */

static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

/*
 * NOTE(review): presumably set when a complete interface rescan (initifs())
 * must be forced; the consumers are outside this part of the file - confirm.
 */
boolean_t	full_scan_required = _B_FALSE;
static uint_t	last_initifs_time;	/* Time when initifs was last run */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
/*
 * NOTE(review): presumably gates whether link up/down notifications from
 * the NICs are acted upon; not referenced in this part of the file - confirm.
 */
boolean_t handle_link_notifications = _B_TRUE;
53 
54 static void	initlog(void);
55 static void	run_timeouts(void);
56 static void	initifs(void);
57 static void	check_if_removed(struct phyint_instance *pii);
58 static void	select_test_ifs(void);
59 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
60 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
61 static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
62     struct in_addr nexthop_v4);
63 static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
64     struct in6_addr nexthop_v6);
65 static void	router_add_common(int af, char *ifname,
66     struct in6_addr nexthop);
67 static void	init_router_targets();
68 static void	cleanup(void);
69 static int	setup_listener(int af);
70 static void	check_config(void);
71 static void	check_testconfig(void);
72 static void	check_addr_unique(struct phyint_instance *,
73     struct sockaddr_storage *);
74 static void	init_host_targets(void);
75 static void	dup_host_targets(struct phyint_instance *desired_pii);
76 static void	loopback_cmd(int sock, int family);
77 static int	poll_remove(int fd);
78 static boolean_t daemonize(void);
79 static int	closefunc(void *, int);
80 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
81 static unsigned int process_query(int fd, mi_query_t *miq);
82 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
83 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
84 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
85 static unsigned int send_result(int fd, unsigned int error, int syserror);
86 
87 struct local_addr *laddr_list = NULL;
88 
89 /*
90  * Return the current time in milliseconds (from an arbitrary reference)
91  * truncated to fit into an int. Truncation is ok since we are interested
92  * only in differences and not the absolute values.
93  */
94 uint_t
95 getcurrenttime(void)
96 {
97 	uint_t	cur_time;	/* In ms */
98 
99 	/*
100 	 * Use of a non-user-adjustable source of time is
101 	 * required. However millisecond precision is sufficient.
102 	 * divide by 10^6
103 	 */
104 	cur_time = (uint_t)(gethrtime() / 1000000LL);
105 	return (cur_time);
106 }
107 
108 uint64_t
109 getcurrentsec(void)
110 {
111 	return (gethrtime() / NANOSEC);
112 }
113 
114 /*
115  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
116  */
117 int
118 poll_add(int fd)
119 {
120 	int i;
121 	int new_num;
122 	struct pollfd *newfds;
123 retry:
124 	/* Check if already present */
125 	for (i = 0; i < pollfd_num; i++) {
126 		if (pollfds[i].fd == fd)
127 			return (0);
128 	}
129 	/* Check for empty spot already present */
130 	for (i = 0; i < pollfd_num; i++) {
131 		if (pollfds[i].fd == -1) {
132 			pollfds[i].fd = fd;
133 			return (0);
134 		}
135 	}
136 
137 	/* Allocate space for 32 more fds and initialize to -1 */
138 	new_num = pollfd_num + 32;
139 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
140 	if (newfds == NULL) {
141 		logperror("poll_add: realloc");
142 		return (-1);
143 	}
144 	for (i = pollfd_num; i < new_num; i++) {
145 		newfds[i].fd = -1;
146 		newfds[i].events = POLLIN;
147 	}
148 	pollfd_num = new_num;
149 	pollfds = newfds;
150 	goto retry;
151 }
152 
153 /*
154  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
155  */
156 static int
157 poll_remove(int fd)
158 {
159 	int i;
160 
161 	/* Check if already present */
162 	for (i = 0; i < pollfd_num; i++) {
163 		if (pollfds[i].fd == fd) {
164 			pollfds[i].fd = -1;
165 			return (0);
166 		}
167 	}
168 	return (-1);
169 }
170 
171 /*
172  * Extract information about the phyint instance. If the phyint instance still
173  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
174  * will use it to detect phyint instances that don't exist any longer and
175  * remove them, from our database of phyint instances.
176  * Return value:
177  *	returns true if the phyint instance exists in the kernel,
178  *	returns false otherwise
179  */
180 static boolean_t
181 pii_process(int af, char *name, struct phyint_instance **pii_p)
182 {
183 	int err;
184 	struct phyint_instance *pii;
185 	struct phyint_instance *pii_other;
186 
187 	if (debug & D_PHYINT)
188 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
189 
190 	pii = phyint_inst_lookup(af, name);
191 	if (pii == NULL) {
192 		/*
193 		 * Phyint instance does not exist in our tables,
194 		 * create new phyint instance
195 		 */
196 		pii = phyint_inst_init_from_k(af, name);
197 	} else {
198 		/* Phyint exists in our tables */
199 		err = phyint_inst_update_from_k(pii);
200 
201 		switch (err) {
202 		case PI_IOCTL_ERROR:
203 			/* Some ioctl error. don't change anything */
204 			pii->pii_in_use = 1;
205 			break;
206 
207 		case PI_GROUP_CHANGED:
208 			/*
209 			 * The phyint has changed group.
210 			 */
211 			restore_phyint(pii->pii_phyint);
212 			/* FALLTHRU */
213 
214 		case PI_IFINDEX_CHANGED:
215 			/*
216 			 * Interface index has changed. Delete and
217 			 * recreate the phyint as it is quite likely
218 			 * the interface has been unplumbed and replumbed.
219 			 */
220 			pii_other = phyint_inst_other(pii);
221 			if (pii_other != NULL)
222 				phyint_inst_delete(pii_other);
223 			phyint_inst_delete(pii);
224 			pii = phyint_inst_init_from_k(af, name);
225 			break;
226 
227 		case PI_DELETED:
228 			/* Phyint instance has disappeared from kernel */
229 			pii->pii_in_use = 0;
230 			break;
231 
232 		case PI_OK:
233 			/* Phyint instance exists and is fine */
234 			pii->pii_in_use = 1;
235 			break;
236 
237 		default:
238 			/* Unknown status */
239 			logerr("pii_process: Unknown status %d\n", err);
240 			break;
241 		}
242 	}
243 
244 	*pii_p = pii;
245 	if (pii != NULL)
246 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
247 	else
248 		return (_B_FALSE);
249 }
250 
251 /*
252  * This phyint is leaving the group. Try to restore the phyint to its
253  * initial state. Return the addresses that belong to other group members,
254  * to the group, and take back any addresses owned by this phyint
255  */
256 void
257 restore_phyint(struct phyint *pi)
258 {
259 	if (pi->pi_group == phyint_anongroup)
260 		return;
261 
262 	/*
263 	 * Move everthing to some other member in the group.
264 	 * The phyint has changed group in the kernel. But we
265 	 * have yet to do it in our tables.
266 	 */
267 	if (!pi->pi_empty)
268 		(void) try_failover(pi, FAILOVER_TO_ANY);
269 	/*
270 	 * Move all addresses owned by 'pi' back to pi, from each
271 	 * of the other members of the group
272 	 */
273 	(void) try_failback(pi);
274 }
275 
/*
 * Scan all interfaces to detect changes as well as new and deleted interfaces
 *
 * The scan proceeds in these steps:
 *   1. Record the scan time and discard the old laddr_list.
 *   2. Clear the in-use marks on every known phyint instance and logint.
 *   3. Fetch the kernel's interface list (SIOCGLIFNUM + SIOCGLIFCONF).
 *   4. For every entry, add its address to laddr_list if it is IFF_UP,
 *      sync the phyint instance via pii_process() and, if it exists,
 *      the logint via logint_init_from_k(); then check the address for
 *      uniqueness within the group.
 *   5. Run check_if_removed() on every phyint instance.
 *   6. Select test addresses and process link state changes.
 *   7. Retry any failover/failback that is still pending.
 *
 * If the interface list cannot be obtained the function returns early,
 * after steps 1 and 2 have already been carried out.
 */
static void
initifs()
{
	int	n;
	int	af;
	char	*cp;
	char	*buf;
	int	numifs;
	struct lifnum	lifn;
	struct lifconf	lifc;
	struct lifreq	*lifr;
	struct logint	*li;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	char	pi_name[LIFNAMSIZ + 1];
	boolean_t exists;
	struct phyint	*pi;
	struct local_addr *next;

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	last_initifs_time = getcurrenttime();

	/*
	 * Free the laddr_list before collecting the local addresses.
	 */
	while (laddr_list != NULL) {
		next = laddr_list->next;
		free(laddr_list);
		laddr_list = next;
	}

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the testaddress is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/*
	 * Ask for the number of interfaces of both address families so
	 * that the SIOCGLIFCONF buffer can be sized.  The IPv4 ioctl socket
	 * is used for both family-independent requests.
	 */
	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = LIFC_ALLZONES;
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface numbers)");
		return;
	}
	numifs = lifn.lifn_count;

	buf = (char *)calloc(numifs, sizeof (struct lifreq));
	if (buf == NULL) {
		logperror("initifs: calloc");
		return;
	}

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = LIFC_ALLZONES;
	lifc.lifc_len = numifs * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		/*
		 * EINVAL is commonly encountered, when things change
		 * underneath us rapidly, (eg. at boot, when new interfaces
		 * are plumbed successively) and the kernel finds the buffer
		 * size we passed as too small. We will retry again
		 * when we see the next routing socket msg, or at worst after
		 * IF_SCAN_INTERVAL ms.
		 */
		if (errno != EINVAL) {
			logperror("initifs: ioctl"
			    " (get interface configuration)");
		}
		free(buf);
		return;
	}

	lifr = (struct lifreq *)lifc.lifc_req;

	/*
	 * For each lifreq returned by SIOCGLIFCONF, call pii_process()
	 * and get the state of the corresponding phyint_instance. If it is
	 * successful, then call logint_init_from_k() to get the state of the
	 * logint.
	 */
	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
		int	sockfd;
		struct local_addr	*taddr;
		struct sockaddr_in	*sin;
		struct sockaddr_in6	*sin6;
		struct lifreq	lifreq;

		af = lifr->lifr_addr.ss_family;

		/*
		 * Collect all local addresses.
		 */
		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
		(void) memset(&lifreq, 0, sizeof (lifreq));
		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name,
		    sizeof (lifreq.lifr_name));

		/*
		 * ENXIO is not reported; any other failure is logged.
		 * Either way this entry is skipped entirely.
		 */
		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
			if (errno != ENXIO)
				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
			continue;
		}

		/*
		 * Add the interface address to laddr_list.
		 * Another node might have the same IP address which is up.
		 * In that case, it is appropriate  to use the address as a
		 * target, even though it is also configured (but not up) on
		 * the local system.
		 * Hence,the interface address is not added to laddr_list
		 * unless it is IFF_UP.
		 */
		if (lifreq.lifr_flags & IFF_UP) {
			taddr = malloc(sizeof (struct local_addr));
			if (taddr == NULL) {
				/*
				 * NOTE(review): this also skips the
				 * pii_process() call below, so the entry is
				 * not marked in use during this scan;
				 * presumably check_if_removed() will then
				 * treat it as gone - confirm that this is
				 * intended.
				 */
				logperror("initifs: malloc");
				continue;
			}
			/* Addresses are kept as in6_addr; IPv4 is mapped */
			if (af == AF_INET) {
				sin = (struct sockaddr_in *)&lifr->lifr_addr;
				IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
				    &taddr->addr);
			} else {
				sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
				taddr->addr = sin6->sin6_addr;
			}
			/* Push onto the head of the list */
			taddr->next = laddr_list;
			laddr_list = taddr;
		}

		/*
		 * Need to pass a phyint name to pii_process. Insert the
		 * null where the ':' IF_SEPARATOR is found in the logical
		 * name.
		 */
		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		exists = pii_process(af, pi_name, &pii);
		if (exists) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
			check_addr_unique(pii, &lifr->lifr_addr);
		}

	}

	free(buf);

	/*
	 * Scan for phyints and logints that have disappeared from the
	 * kernel, and delete them.
	 * The successor is fetched before the call because the call may
	 * remove pii from the list.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		check_if_removed(pii);
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications from the NICs.
	 */
	process_link_state_changes();

	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		/*
		 * If this is a case of group failure, we don't have much
		 * to do until the group recovers again.
		 */
		if (GROUP_FAILED(pi->pi_group))
			continue;

		/*
		 * Try/Retry any pending failovers / failbacks, that did
		 * not complete, or that could not be initiated previously.
		 * This implements the 3 invariants described in the big block
		 * comment at the beginning of probe.c
		 */
		if (pi->pi_flags & IFF_INACTIVE) {
			/*
			 * An inactive standby that still hosts addresses
			 * pushes them to a non-standby interface.
			 */
			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			/* Note: shadows the function-level 'pii' */
			struct phyint_instance *pii;

			/*
			 * Skip LINK UP interfaces which are not capable
			 * of probing.
			 * The IPv4 instance is preferred; the IPv6 instance
			 * is the fallback.
			 */
			pii = pi->pi_v4;
			if (pii == NULL ||
			    (LINK_UP(pi) && !PROBE_CAPABLE(pii))) {
				pii = pi->pi_v6;
				if (pii == NULL ||
				    (LINK_UP(pi) && !PROBE_CAPABLE(pii)))
					continue;
			}

			/*
			 * It is possible that the phyint has started
			 * receiving packets, after it has been marked
			 * PI_FAILED. Don't initiate failover, if the
			 * phyint has started recovering. failure_state()
			 * captures this check. A similar logic is used
			 * for failback/repair case.
			 */
			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
			    (failure_state(pii) == PHYINT_FAILURE)) {
				(void) try_failover(pi, FAILOVER_NORMAL);
			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				/*
				 * Only when the failback did not fail is
				 * IFF_FAILED cleared and the phyint marked
				 * as no longer empty.
				 */
				if (try_failback(pi) != IPMP_FAILURE) {
					(void) change_lif_flags(pi, IFF_FAILED,
					    _B_FALSE);
					/* Per state diagram */
					pi->pi_empty = 0;
				}
			}
		}
	}
}
517 
518 /*
519  * Check that a given test address is unique across all of the interfaces in a
520  * group.  (e.g., IPv6 link-locals may not be inherently unique, and binding
521  * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
522  * Any issues will be reported by check_testconfig().
523  */
524 static void
525 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
526 {
527 	struct phyint		*pi;
528 	struct phyint_group	*pg;
529 	struct in6_addr		addr;
530 	struct phyint_instance	*pii;
531 	struct sockaddr_in	*sin;
532 
533 	if (ss->ss_family == AF_INET) {
534 		sin = (struct sockaddr_in *)ss;
535 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
536 	} else {
537 		assert(ss->ss_family == AF_INET6);
538 		addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
539 	}
540 
541 	/*
542 	 * For anonymous groups, every interface is assumed to be on its own
543 	 * link, so there is no chance of overlapping addresses.
544 	 */
545 	pg = ourpii->pii_phyint->pi_group;
546 	if (pg == phyint_anongroup)
547 		return;
548 
549 	/*
550 	 * Walk the list of phyint instances in the group and check for test
551 	 * addresses matching ours.  Of course, we skip ourself.
552 	 */
553 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
554 		pii = PHYINT_INSTANCE(pi, ss->ss_family);
555 		if (pii == NULL || pii == ourpii ||
556 		    pii->pii_probe_logint == NULL)
557 			continue;
558 
559 		/*
560 		 * If this test address is not unique, set the dupaddr bit.
561 		 */
562 		if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr))
563 			pii->pii_probe_logint->li_dupaddr = 1;
564 	}
565 }
566 
567 /*
568  * Stop probing an interface.  Called when an interface is offlined.
569  * The probe socket is closed on each interface instance, and the
570  * interface state set to PI_OFFLINE.
571  */
572 static void
573 stop_probing(struct phyint *pi)
574 {
575 	struct phyint_instance *pii;
576 
577 	pii = pi->pi_v4;
578 	if (pii != NULL) {
579 		if (pii->pii_probe_sock != -1)
580 			close_probe_socket(pii, _B_TRUE);
581 		pii->pii_probe_logint = NULL;
582 	}
583 
584 	pii = pi->pi_v6;
585 	if (pii != NULL) {
586 		if (pii->pii_probe_sock != -1)
587 			close_probe_socket(pii, _B_TRUE);
588 		pii->pii_probe_logint = NULL;
589 	}
590 
591 	phyint_chstate(pi, PI_OFFLINE);
592 }
593 
594 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
595 
596 /*
597  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
598  * IFF_UP must also be set so that the associated address can be used as a
599  * source address.  Further, we must be able to exchange packets with local
600  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
601  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
602  */
603 static int
604 rate_testflags(uint64_t flags)
605 {
606 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
607 		return (BAD_TESTFLAGS);
608 
609 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
610 		return (BAD_TESTFLAGS);
611 
612 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
613 		return (BEST_TESTFLAGS);
614 
615 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
616 		return (BEST_TESTFLAGS);
617 
618 	return (OK_TESTFLAGS);
619 }
620 
/*
 * Attempt to select a test address for each phyint instance.
 * Call phyint_inst_sockinit() to complete the initializations.
 *
 * For every phyint instance that is not offline and does not already have
 * an optimal test address, the logints are rated with rate_testflags() and
 * the best candidate becomes the probe logint.  When the probe logint
 * changes, the old probe socket is closed and a new one is set up.
 * Afterwards, failed interfaces that can no longer probe are checked for
 * repair, the test address configuration is reported via
 * check_testconfig(), and, if some instance was left without targets, the
 * target lists are populated.
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint		*li;
	struct logint  		*probe_logint;
	boolean_t		target_scan_reqd = _B_FALSE;
	struct target		*tg;
	int			rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		/*
		 * Fetch the successor up front; pii may be deleted below
		 * if its probe socket cannot be initialized.
		 */
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip 0.0.0.0 addresses, as those are never
			 * actually usable.
			 */
			if (pii->pii_af == AF_INET &&
			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
				continue;

			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			/*
			 * Every usable logint overwrites the candidate, so
			 * without an optimal match the last usable one in
			 * the list wins.
			 */
			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 * This also covers the case where no candidate was found
		 * at all (probe_logint == NULL).
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address; zero out the probe
			 * stats array since it is no longer relevant.
			 * Optimize by checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			/* Give up on this instance altogether */
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface the destination address
		 * of the test address is the target: discard the current
		 * first target, if any, and create a target for it.  The
		 * asserts require the target list to be empty at that
		 * point.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, probe_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 * pii_basetime_inited ensures this is done only once per
		 * instance.
		 */
		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/* Log any test address configuration problems */
	check_testconfig();

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}
835 
836 /*
837  * Check test address configuration, and log warnings if appropriate.  Note
838  * that this function only logs pre-existing conditions (e.g., that probe-
839  * based failure detection is disabled).
840  */
841 static void
842 check_testconfig(void)
843 {
844 	struct phyint	*pi;
845 	struct logint  	*li;
846 	char		abuf[INET6_ADDRSTRLEN];
847 
848 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
849 		if (pi->pi_flags & IFF_OFFLINE)
850 			continue;
851 
852 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
853 			if (pi->pi_taddrmsg_printed ||
854 			    pi->pi_duptaddrmsg_printed) {
855 				logerr("Test address now configured on "
856 				    "interface %s; enabling probe-based "
857 				    "failure detection on it\n", pi->pi_name);
858 				pi->pi_taddrmsg_printed = 0;
859 				pi->pi_duptaddrmsg_printed = 0;
860 			}
861 			continue;
862 		}
863 
864 		li = NULL;
865 		if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
866 		    pi->pi_v4->pii_probe_logint->li_dupaddr)
867 			li = pi->pi_v4->pii_probe_logint;
868 
869 		if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
870 		    pi->pi_v6->pii_probe_logint->li_dupaddr)
871 			li = pi->pi_v6->pii_probe_logint;
872 
873 		if (li != NULL) {
874 			if (!pi->pi_duptaddrmsg_printed) {
875 				(void) pr_addr(li->li_phyint_inst->pii_af,
876 				    li->li_addr, abuf, sizeof (abuf));
877 				logerr("Test address %s is not unique in "
878 				    "group; disabling probe-based failure "
879 				    "detection on %s\n", abuf, pi->pi_name);
880 				pi->pi_duptaddrmsg_printed = 1;
881 			}
882 			continue;
883 		}
884 
885 		if (getcurrentsec() < pi->pi_taddrthresh)
886 			continue;
887 
888 		if (!pi->pi_taddrmsg_printed) {
889 			logerr("No test address configured on interface %s; "
890 			    "disabling probe-based failure detection on it\n",
891 			    pi->pi_name);
892 			pi->pi_taddrmsg_printed = 1;
893 		}
894 	}
895 }
896 
897 /*
898  * Check phyint group configuration, to detect any inconsistencies,
899  * and log an error message. This is called from runtimeouts every
900  * 20 secs. But the error message is displayed once. If the
901  * consistency is resolved by the admin, a recovery message is displayed
902  * once.
903  */
904 static void
905 check_config(void)
906 {
907 	struct phyint_group *pg;
908 	struct phyint *pi;
909 	boolean_t v4_in_group;
910 	boolean_t v6_in_group;
911 
912 	/*
913 	 * All phyints of a group must be homogenous to ensure that
914 	 * failover or failback can be done. If any phyint in a group
915 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
916 	 * Do a similar check for IPv6.
917 	 */
918 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
919 		if (pg == phyint_anongroup)
920 			continue;
921 
922 		v4_in_group = _B_FALSE;
923 		v6_in_group = _B_FALSE;
924 		/*
925 		 * 1st pass. Determine if at least 1 phyint in the group
926 		 * has IPv4 plumbed and if so set v4_in_group to true.
927 		 * Repeat similarly for IPv6.
928 		 */
929 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
930 			if (pi->pi_v4 != NULL)
931 				v4_in_group = _B_TRUE;
932 			if (pi->pi_v6 != NULL)
933 				v6_in_group = _B_TRUE;
934 		}
935 
936 		/*
937 		 * 2nd pass. If v4_in_group is true, check that phyint
938 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
939 		 * out a message the 1st time only.
940 		 */
941 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
942 			if (pi->pi_flags & IFF_OFFLINE)
943 				continue;
944 
945 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
946 				if (!pi->pi_cfgmsg_printed) {
947 					logerr("NIC %s of group %s is"
948 					    " not plumbed for IPv4 and may"
949 					    " affect failover capability\n",
950 					    pi->pi_name,
951 					    pi->pi_group->pg_name);
952 					pi->pi_cfgmsg_printed = 1;
953 				}
954 			} else if (v6_in_group == _B_TRUE &&
955 			    pi->pi_v6 == NULL) {
956 				if (!pi->pi_cfgmsg_printed) {
957 					logerr("NIC %s of group %s is"
958 					    " not plumbed for IPv6 and may"
959 					    " affect failover capability\n",
960 					    pi->pi_name,
961 					    pi->pi_group->pg_name);
962 					pi->pi_cfgmsg_printed = 1;
963 				}
964 			} else {
965 				/*
966 				 * The phyint matches the group configuration,
967 				 * if we have reached this point. If it was
968 				 * improperly configured earlier, log an
969 				 * error recovery message
970 				 */
971 				if (pi->pi_cfgmsg_printed) {
972 					logerr("NIC %s is now consistent with "
973 					    "group %s and failover capability "
974 					    "is restored\n", pi->pi_name,
975 					    pi->pi_group->pg_name);
976 					pi->pi_cfgmsg_printed = 0;
977 				}
978 			}
979 
980 		}
981 	}
982 }
983 
984 /*
985  * Timer mechanism using relative time (in milliseconds) from the
986  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
987  * will fire after TIMER_INFINITY milliseconds.
988  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
989  * time values. Hence 2 consecutive timer events cannot be spaced farther
990  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
991  * that can be passed for the delay parameter of timer_schedule()
992  */
993 static uint_t timer_next;	/* Currently scheduled timeout */
994 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
995 
996 static void
997 timer_init(void)
998 {
999 	timer_next = getcurrenttime() + TIMER_INFINITY;
1000 	/*
1001 	 * The call to run_timeouts() will get the timer started
1002 	 * Since there are no phyints at this point, the timer will
1003 	 * be set for IF_SCAN_INTERVAL ms.
1004 	 */
1005 	run_timeouts();
1006 }
1007 
1008 /*
1009  * Make sure the next SIGALRM occurs delay milliseconds from the current
1010  * time if not earlier. We are interested only in time differences.
1011  */
1012 void
1013 timer_schedule(uint_t delay)
1014 {
1015 	uint_t now;
1016 	struct itimerval itimerval;
1017 
1018 	if (debug & D_TIMER)
1019 		logdebug("timer_schedule(%u)\n", delay);
1020 
1021 	assert(delay <= TIMER_INFINITY);
1022 
1023 	now = getcurrenttime();
1024 	if (delay == 0) {
1025 		/* Minimum allowed delay */
1026 		delay = 1;
1027 	}
1028 	/* Will this timer occur before the currently scheduled SIGALRM? */
1029 	if (timer_active && TIME_GE(now + delay, timer_next)) {
1030 		if (debug & D_TIMER) {
1031 			logdebug("timer_schedule(%u) - no action: "
1032 			    "now %u next %u\n", delay, now, timer_next);
1033 		}
1034 		return;
1035 	}
1036 	timer_next = now + delay;
1037 
1038 	itimerval.it_value.tv_sec = delay / 1000;
1039 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
1040 	itimerval.it_interval.tv_sec = 0;
1041 	itimerval.it_interval.tv_usec = 0;
1042 	if (debug & D_TIMER) {
1043 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1044 		    delay, itimerval.it_value.tv_sec,
1045 		    itimerval.it_value.tv_usec);
1046 	}
1047 	timer_active = _B_TRUE;
1048 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1049 		logperror("timer_schedule: setitimer");
1050 		exit(2);
1051 	}
1052 }
1053 
1054 /*
1055  * Timer has fired. Determine when the next timer event will occur by asking
1056  * all the timer routines. Should not be called from a timer routine.
1057  */
1058 static void
1059 run_timeouts(void)
1060 {
1061 	uint_t next;
1062 	uint_t next_event_time;
1063 	struct phyint_instance *pii;
1064 	struct phyint_instance *next_pii;
1065 	static boolean_t timeout_running;
1066 
1067 	/* assert that recursive timeouts don't happen. */
1068 	assert(!timeout_running);
1069 
1070 	timeout_running = _B_TRUE;
1071 
1072 	if (debug & D_TIMER)
1073 		logdebug("run_timeouts()\n");
1074 
1075 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1076 		initifs();
1077 		check_config();
1078 	}
1079 
1080 	next = TIMER_INFINITY;
1081 
1082 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1083 		next_pii = pii->pii_next;
1084 		next_event_time = phyint_inst_timer(pii);
1085 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1086 			next = next_event_time;
1087 
1088 		if (debug & D_TIMER) {
1089 			logdebug("run_timeouts(%s %s): next scheduled for"
1090 			    " this phyint inst %u, next scheduled global"
1091 			    " %u ms\n",
1092 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1093 			    next_event_time, next);
1094 		}
1095 	}
1096 
1097 	/*
1098 	 * Make sure initifs() is called at least once every
1099 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1100 	 * with the kernel, in case we have missed any routing
1101 	 * socket messages.
1102 	 */
1103 	if (next > IF_SCAN_INTERVAL)
1104 		next = IF_SCAN_INTERVAL;
1105 
1106 	if (debug & D_TIMER)
1107 		logdebug("run_timeouts: %u ms\n", next);
1108 
1109 	timer_schedule(next);
1110 	timeout_running = _B_FALSE;
1111 }
1112 
1113 static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1114 static int eventpipe_write = -1;
1115 static boolean_t cleanup_started = _B_FALSE;
1116 				/* Don't write to eventpipe if in cleanup */
1117 /*
1118  * Ensure that signals are processed synchronously with the rest of
1119  * the code by just writing a one character signal number on the pipe.
1120  * The poll loop will pick this up and process the signal event.
1121  */
1122 static void
1123 sig_handler(int signo)
1124 {
1125 	uchar_t buf = (uchar_t)signo;
1126 
1127 	/*
1128 	 * Don't write to pipe if cleanup has already begun. cleanup()
1129 	 * might have closed the pipe already
1130 	 */
1131 	if (cleanup_started)
1132 		return;
1133 
1134 	if (eventpipe_write == -1) {
1135 		logerr("sig_handler: no pipe found\n");
1136 		return;
1137 	}
1138 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1139 		logperror("sig_handler: write");
1140 }
1141 
1142 extern struct probes_missed probes_missed;
1143 
1144 /*
1145  * Pick up a signal "byte" from the pipe and process it.
1146  */
1147 static void
1148 in_signal(int fd)
1149 {
1150 	uchar_t buf;
1151 	uint64_t  sent, acked, lost, unacked, unknown;
1152 	struct phyint_instance *pii;
1153 	int pr_ndx;
1154 
1155 	switch (read(fd, &buf, sizeof (buf))) {
1156 	case -1:
1157 		logperror("in_signal: read");
1158 		exit(1);
1159 		/* NOTREACHED */
1160 	case 1:
1161 		break;
1162 	case 0:
1163 		logerr("in_signal: read end of file\n");
1164 		exit(1);
1165 		/* NOTREACHED */
1166 	default:
1167 		logerr("in_signal: read > 1\n");
1168 		exit(1);
1169 	}
1170 
1171 	if (debug & D_TIMER)
1172 		logdebug("in_signal() got %d\n", buf);
1173 
1174 	switch (buf) {
1175 	case SIGALRM:
1176 		if (debug & D_TIMER) {
1177 			uint_t now = getcurrenttime();
1178 
1179 			logdebug("in_signal(SIGALRM) delta %u\n",
1180 			    now - timer_next);
1181 		}
1182 		timer_active = _B_FALSE;
1183 		run_timeouts();
1184 		break;
1185 	case SIGUSR1:
1186 		logdebug("Printing configuration:\n");
1187 		/* Print out the internal tables */
1188 		phyint_inst_print_all();
1189 
1190 		/*
1191 		 * Print out the accumulated statistics about missed
1192 		 * probes (happens due to scheduling delay).
1193 		 */
1194 		logerr("Missed sending total of %d probes spread over"
1195 		    " %d occurrences\n", probes_missed.pm_nprobes,
1196 		    probes_missed.pm_ntimes);
1197 
1198 		/*
1199 		 * Print out the accumulated statistics about probes
1200 		 * that were sent.
1201 		 */
1202 		for (pii = phyint_instances; pii != NULL;
1203 		    pii = pii->pii_next) {
1204 			unacked = 0;
1205 			acked = pii->pii_cum_stats.acked;
1206 			lost = pii->pii_cum_stats.lost;
1207 			sent = pii->pii_cum_stats.sent;
1208 			unknown = pii->pii_cum_stats.unknown;
1209 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1210 				switch (pii->pii_probes[pr_ndx].pr_status) {
1211 				case PR_ACKED:
1212 					acked++;
1213 					break;
1214 				case PR_LOST:
1215 					lost++;
1216 					break;
1217 				case PR_UNACKED:
1218 					unacked++;
1219 					break;
1220 				}
1221 			}
1222 			logerr("\nProbe stats on (%s %s)\n"
1223 			    "Number of probes sent %lld\n"
1224 			    "Number of probe acks received %lld\n"
1225 			    "Number of probes/acks lost %lld\n"
1226 			    "Number of valid unacknowled probes %lld\n"
1227 			    "Number of ambiguous probe acks received %lld\n",
1228 			    AF_STR(pii->pii_af), pii->pii_name,
1229 			    sent, acked, lost, unacked, unknown);
1230 		}
1231 		break;
1232 	case SIGHUP:
1233 		logerr("SIGHUP: restart and reread config file\n");
1234 		cleanup();
1235 		(void) execv(argv0[0], argv0);
1236 		_exit(0177);
1237 		/* NOTREACHED */
1238 	case SIGINT:
1239 	case SIGTERM:
1240 	case SIGQUIT:
1241 		cleanup();
1242 		exit(0);
1243 		/* NOTREACHED */
1244 	default:
1245 		logerr("in_signal: unknown signal: %d\n", buf);
1246 	}
1247 }
1248 
1249 static void
1250 cleanup(void)
1251 {
1252 	struct phyint_instance *pii;
1253 	struct phyint_instance *next_pii;
1254 
1255 	/*
1256 	 * Make sure that we don't write to eventpipe in
1257 	 * sig_handler() if any signal notably SIGALRM,
1258 	 * occurs after we close the eventpipe descriptor below
1259 	 */
1260 	cleanup_started = _B_TRUE;
1261 
1262 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1263 		next_pii = pii->pii_next;
1264 		phyint_inst_delete(pii);
1265 	}
1266 
1267 	(void) close(ifsock_v4);
1268 	(void) close(ifsock_v6);
1269 	(void) close(rtsock_v4);
1270 	(void) close(rtsock_v6);
1271 	(void) close(lsock_v4);
1272 	(void) close(lsock_v6);
1273 	(void) close(0);
1274 	(void) close(1);
1275 	(void) close(2);
1276 	(void) close(mibfd);
1277 	(void) close(eventpipe_read);
1278 	(void) close(eventpipe_write);
1279 }
1280 
1281 /*
1282  * Create pipe for signal delivery and set up signal handlers.
1283  */
1284 static void
1285 setup_eventpipe(void)
1286 {
1287 	int fds[2];
1288 	struct sigaction act;
1289 
1290 	if ((pipe(fds)) < 0) {
1291 		logperror("setup_eventpipe: pipe");
1292 		exit(1);
1293 	}
1294 	eventpipe_read = fds[0];
1295 	eventpipe_write = fds[1];
1296 	if (poll_add(eventpipe_read) == -1) {
1297 		exit(1);
1298 	}
1299 
1300 	act.sa_handler = sig_handler;
1301 	act.sa_flags = SA_RESTART;
1302 	(void) sigaction(SIGALRM, &act, NULL);
1303 
1304 	(void) sigset(SIGHUP, sig_handler);
1305 	(void) sigset(SIGUSR1, sig_handler);
1306 	(void) sigset(SIGTERM, sig_handler);
1307 	(void) sigset(SIGINT, sig_handler);
1308 	(void) sigset(SIGQUIT, sig_handler);
1309 }
1310 
/*
 * Open a routing socket for address family `af' (used to receive
 * RTM_IFINFO messages), switch it to non-blocking mode and register it
 * with the poll loop.  Returns the socket descriptor; any failure is fatal.
 */
static int
setup_rtsock(int af)
{
	int	rtfd;
	int	fl;

	rtfd = socket(PF_ROUTE, SOCK_RAW, af);
	if (rtfd == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	fl = fcntl(rtfd, F_GETFL, 0);
	if (fl < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		goto fail;
	}

	if (fcntl(rtfd, F_SETFL, fl | O_NONBLOCK) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		goto fail;
	}

	if (poll_add(rtfd) == -1)
		goto fail;

	return (rtfd);

fail:
	(void) close(rtfd);
	exit(1);
	/* NOTREACHED */
}
1341 
1342 /*
1343  * Process an RTM_IFINFO message received on a routing socket.
1344  * The return value indicates whether a full interface scan is required.
1345  * Link up/down notifications from the NICs are reflected in the
1346  * IFF_RUNNING flag.
1347  * If just the state of the IFF_RUNNING interface flag has changed, a
1348  * a full interface scan isn't required.
1349  */
1350 static boolean_t
1351 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1352 {
1353 	struct sockaddr_dl *sdl;
1354 	struct phyint *pi;
1355 	uint64_t old_flags;
1356 	struct phyint_instance *pii;
1357 
1358 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1359 
1360 	/*
1361 	 * Although the sockaddr_dl structure is directly after the
1362 	 * if_msghdr_t structure. At the time of writing, the size of the
1363 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1364 	 * to the presence of a timeval structure, which contains longs,
1365 	 * in the if_data structure.  Anyway, we know where the message ends,
1366 	 * so we work backwards to get the start of the sockaddr_dl structure.
1367 	 */
1368 	/*LINTED*/
1369 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1370 	    sizeof (struct sockaddr_dl));
1371 
1372 	assert(sdl->sdl_family == AF_LINK);
1373 
1374 	/*
1375 	 * The interface name is in sdl_data.
1376 	 * RTM_IFINFO messages are only generated for logical interface
1377 	 * zero, so there is no colon and logical interface number to
1378 	 * strip from the name.	 The name is not null terminated, but
1379 	 * there should be enough space in sdl_data to add the null.
1380 	 */
1381 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1382 		if (debug & D_LINKNOTE)
1383 			logdebug("process_rtm_ifinfo: phyint name too long\n");
1384 		return (_B_TRUE);
1385 	}
1386 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1387 
1388 	pi = phyint_lookup(sdl->sdl_data);
1389 	if (pi == NULL) {
1390 		if (debug & D_LINKNOTE)
1391 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1392 			    " for %s\n", sdl->sdl_data);
1393 		return (_B_TRUE);
1394 	}
1395 
1396 	/*
1397 	 * We want to try and avoid doing a full interface scan for
1398 	 * link state notifications from the NICs, as indicated
1399 	 * by the state of the IFF_RUNNING flag.  If just the
1400 	 * IFF_RUNNING flag has changed state, the link state changes
1401 	 * are processed without a full scan.
1402 	 * If there is both an IPv4 and IPv6 instance associated with
1403 	 * the physical interface, we will get an RTM_IFINFO message
1404 	 * for each instance.  If we just maintained a single copy of
1405 	 * the physical interface flags, it would appear that no flags
1406 	 * had changed when the second message is processed, leading us
1407 	 * to believe that the message wasn't generated by a flags change,
1408 	 * and that a full interface scan is required.
1409 	 * To get around this problem, two additional copies of the flags
1410 	 * are kept, one copy for each instance.  These are only used in
1411 	 * this routine.  At any one time, all three copies of the flags
1412 	 * should be identical except for the IFF_RUNNING flag.	 The
1413 	 * copy of the flags in the "phyint" structure is always up to
1414 	 * date.
1415 	 */
1416 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1417 	if (pii == NULL) {
1418 		if (debug & D_LINKNOTE)
1419 			logdebug("process_rtm_ifinfo: no instance of address "
1420 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1421 		return (_B_TRUE);
1422 	}
1423 
1424 	old_flags = pii->pii_flags;
1425 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1426 	pi->pi_flags = pii->pii_flags;
1427 
1428 	if (debug & D_LINKNOTE) {
1429 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1430 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1431 		    AF_STR(type), old_flags, pi->pi_flags);
1432 	}
1433 
1434 	/*
1435 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1436 	 * types.
1437 	 */
1438 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
1439 		phyint_newtype(pi);
1440 
1441 	/*
1442 	 * If IFF_INACTIVE has been set, then no data addresses should be
1443 	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
1444 	 * move previously failed-over addresses back to it, provided it is
1445 	 * not failed.	For details, see the state diagram in mpd_probe.c.
1446 	 */
1447 	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
1448 		if (pii->pii_flags & IFF_INACTIVE) {
1449 			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
1450 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
1451 		} else {
1452 			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
1453 				pi->pi_empty = 0;
1454 				(void) try_failback(pi);
1455 			}
1456 		}
1457 	}
1458 
1459 	/* Has just the IFF_RUNNING flag changed state ? */
1460 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1461 		struct phyint_instance *pii_other;
1462 		/*
1463 		 * It wasn't just a link state change.	Update
1464 		 * the other instance's copy of the flags.
1465 		 */
1466 		pii_other = phyint_inst_other(pii);
1467 		if (pii_other != NULL)
1468 			pii_other->pii_flags = pii->pii_flags;
1469 		return (_B_TRUE);
1470 	}
1471 
1472 	return (_B_FALSE);
1473 }
1474 
1475 /*
1476  * Retrieve as many routing socket messages as possible, and try to
1477  * empty the routing sockets. Initiate full scan of targets or interfaces
1478  * as needed.
1479  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1480  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1481  */
1482 static void
1483 process_rtsock(int rtsock_v4, int rtsock_v6)
1484 {
1485 	int	nbytes;
1486 	int64_t msg[2048 / 8];
1487 	struct rt_msghdr *rtm;
1488 	boolean_t need_if_scan = _B_FALSE;
1489 	boolean_t need_rt_scan = _B_FALSE;
1490 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1491 	int type;
1492 
1493 	/* Read as many messages as possible and try to empty the sockets */
1494 	for (type = AF_INET; ; type = AF_INET6) {
1495 		for (;;) {
1496 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1497 			    rtsock_v6, msg, sizeof (msg));
1498 			if (nbytes <= 0) {
1499 				/* No more messages */
1500 				break;
1501 			}
1502 			rtm = (struct rt_msghdr *)msg;
1503 			if (rtm->rtm_version != RTM_VERSION) {
1504 				logerr("process_rtsock: version %d "
1505 				    "not understood\n", rtm->rtm_version);
1506 				break;
1507 			}
1508 
1509 			if (debug & D_PHYINT) {
1510 				logdebug("process_rtsock: message %d\n",
1511 				    rtm->rtm_type);
1512 			}
1513 
1514 			switch (rtm->rtm_type) {
1515 			case RTM_NEWADDR:
1516 			case RTM_DELADDR:
1517 				/*
1518 				 * Some logical interface has changed,
1519 				 * have to scan everything to determine
1520 				 * what actually changed.
1521 				 */
1522 				need_if_scan = _B_TRUE;
1523 				break;
1524 
1525 			case RTM_IFINFO:
1526 				rtm_ifinfo_seen = _B_TRUE;
1527 				need_if_scan |= process_rtm_ifinfo(
1528 				    (if_msghdr_t *)rtm, type);
1529 				break;
1530 
1531 			case RTM_ADD:
1532 			case RTM_DELETE:
1533 			case RTM_CHANGE:
1534 			case RTM_OLDADD:
1535 			case RTM_OLDDEL:
1536 				need_rt_scan = _B_TRUE;
1537 				break;
1538 
1539 			default:
1540 				/* Not interesting */
1541 				break;
1542 			}
1543 		}
1544 		if (type == AF_INET6)
1545 			break;
1546 	}
1547 
1548 	if (need_if_scan) {
1549 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1550 			logdebug("process_rtsock: synchronizing with kernel\n");
1551 		initifs();
1552 	} else if (rtm_ifinfo_seen) {
1553 		if (debug & D_LINKNOTE)
1554 			logdebug("process_rtsock: "
1555 			    "link up/down notification(s) seen\n");
1556 		process_link_state_changes();
1557 	}
1558 
1559 	if (need_rt_scan)
1560 		init_router_targets();
1561 }
1562 
1563 /*
1564  * Look if the phyint instance or one of its logints have been removed from
1565  * the kernel and take appropriate action.
1566  * Uses {pii,li}_in_use.
1567  */
1568 static void
1569 check_if_removed(struct phyint_instance *pii)
1570 {
1571 	struct logint *li;
1572 	struct logint *next_li;
1573 
1574 	/* Detect phyints that have been removed from the kernel. */
1575 	if (!pii->pii_in_use) {
1576 		logtrace("%s %s has been removed from kernel\n",
1577 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1578 		phyint_inst_delete(pii);
1579 	} else {
1580 		/* Detect logints that have been removed. */
1581 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1582 			next_li = li->li_next;
1583 			if (!li->li_in_use) {
1584 				logint_delete(li);
1585 			}
1586 		}
1587 	}
1588 }
1589 
1590 /*
1591  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1592  * tables defined by mib2.h. Parse the returned data and extract
1593  * the 'routing' information table. Process the 'routing' table
1594  * to get the list of known onlink routers, and update our database.
1595  * These onlink routers will serve as our probe targets.
1596  * Returns false, if any system calls resulted in errors, true otherwise.
1597  */
1598 static boolean_t
1599 update_router_list(int fd)
1600 {
1601 	union {
1602 		char	ubuf[1024];
1603 		union T_primitives uprim;
1604 	} buf;
1605 
1606 	int			flags;
1607 	struct strbuf		ctlbuf;
1608 	struct strbuf		databuf;
1609 	struct T_optmgmt_req	*tor;
1610 	struct T_optmgmt_ack	*toa;
1611 	struct T_error_ack	*tea;
1612 	struct opthdr		*optp;
1613 	struct opthdr		*req;
1614 	int			status;
1615 	t_scalar_t		prim;
1616 
1617 	tor = (struct T_optmgmt_req *)&buf;
1618 
1619 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1620 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1621 	tor->OPT_length = sizeof (struct opthdr);
1622 	tor->MGMT_flags = T_CURRENT;
1623 
1624 	req = (struct opthdr *)&tor[1];
1625 	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
1626 	req->name  = 0;
1627 	req->len   = 0;
1628 
1629 	ctlbuf.buf = (char *)&buf;
1630 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1631 	ctlbuf.maxlen = sizeof (buf);
1632 	flags = 0;
1633 	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
1634 		logperror("update_router_list: putmsg(ctl)");
1635 		return (_B_FALSE);
1636 	}
1637 
1638 	/*
1639 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1640 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1641 	 * a control and data part. The control part contains a struct
1642 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1643 	 * the level, name and length of the data in the data part. The
1644 	 * data part contains the actual table data. The last message
1645 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1646 	 * single option with zero optlen.
1647 	 */
1648 
1649 	for (;;) {
1650 		/*
1651 		 * Go around this loop once for each table. Ignore
1652 		 * all tables except the routing information table.
1653 		 */
1654 		flags = 0;
1655 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1656 		if (status < 0) {
1657 			if (errno == EINTR)
1658 				continue;
1659 			logperror("update_router_list: getmsg(ctl)");
1660 			return (_B_FALSE);
1661 		}
1662 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1663 			logerr("update_router_list: ctlbuf.len %d\n",
1664 			    ctlbuf.len);
1665 			return (_B_FALSE);
1666 		}
1667 
1668 		prim = buf.uprim.type;
1669 
1670 		switch (prim) {
1671 
1672 		case T_ERROR_ACK:
1673 			tea = &buf.uprim.error_ack;
1674 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1675 				logerr("update_router_list: T_ERROR_ACK"
1676 				    " ctlbuf.len %d\n", ctlbuf.len);
1677 				return (_B_FALSE);
1678 			}
1679 			logerr("update_router_list: T_ERROR_ACK:"
1680 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1681 			    tea->TLI_error, tea->UNIX_error);
1682 			return (_B_FALSE);
1683 
1684 		case T_OPTMGMT_ACK:
1685 			toa = &buf.uprim.optmgmt_ack;
1686 			optp = (struct opthdr *)&toa[1];
1687 			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
1688 				logerr("update_router_list: ctlbuf.len %d\n",
1689 				    ctlbuf.len);
1690 				return (_B_FALSE);
1691 			}
1692 			if (toa->MGMT_flags != T_SUCCESS) {
1693 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1694 				    toa->MGMT_flags);
1695 				return (_B_FALSE);
1696 			}
1697 			break;
1698 
1699 		default:
1700 			logerr("update_router_list: unknown primitive %ld\n",
1701 			    prim);
1702 			return (_B_FALSE);
1703 		}
1704 
1705 		/* Process the T_OPGMGMT_ACK below */
1706 		assert(prim == T_OPTMGMT_ACK);
1707 
1708 		switch (status) {
1709 		case 0:
1710 			/*
1711 			 * We have reached the end of this T_OPTMGMT_ACK
1712 			 * message. If this is the last message i.e EOD,
1713 			 * return, else process the next T_OPTMGMT_ACK msg.
1714 			 */
1715 			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
1716 			    sizeof (struct opthdr)) && optp->len == 0 &&
1717 			    optp->name == 0 && optp->level == 0) {
1718 				/*
1719 				 * This is the EOD message. Return
1720 				 */
1721 				return (_B_TRUE);
1722 			}
1723 			continue;
1724 
1725 		case MORECTL:
1726 		case MORECTL | MOREDATA:
1727 			/*
1728 			 * This should not happen. We should be able to read
1729 			 * the control portion in a single getmsg.
1730 			 */
1731 			logerr("update_router_list: MORECTL\n");
1732 			return (_B_FALSE);
1733 
1734 		case MOREDATA:
1735 			databuf.maxlen = optp->len;
1736 			/* malloc of 0 bytes is ok */
1737 			databuf.buf = malloc((size_t)optp->len);
1738 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1739 				logperror("update_router_list: malloc");
1740 				return (_B_FALSE);
1741 			}
1742 			databuf.len = 0;
1743 			flags = 0;
1744 			for (;;) {
1745 				status = getmsg(fd, NULL, &databuf, &flags);
1746 				if (status >= 0) {
1747 					break;
1748 				} else if (errno == EINTR) {
1749 					continue;
1750 				} else {
1751 					logperror("update_router_list:"
1752 					    " getmsg(data)");
1753 					free(databuf.buf);
1754 					return (_B_FALSE);
1755 				}
1756 			}
1757 
1758 			if (optp->level == MIB2_IP &&
1759 			    optp->name == MIB2_IP_ROUTE) {
1760 				/* LINTED */
1761 				ire_process_v4((mib2_ipRouteEntry_t *)
1762 				    databuf.buf, databuf.len);
1763 			} else if (optp->level == MIB2_IP6 &&
1764 			    optp->name == MIB2_IP6_ROUTE) {
1765 				/* LINTED */
1766 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1767 				    databuf.buf, databuf.len);
1768 			}
1769 			free(databuf.buf);
1770 		}
1771 	}
1772 	/* NOTREACHED */
1773 }
1774 
1775 /*
1776  * Examine the IPv4 routing table, for default routers. For each default
1777  * router, populate the list of targets of each phyint that is on the same
1778  * link as the default router
1779  */
1780 static void
1781 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1782 {
1783 	mib2_ipRouteEntry_t	*rp;
1784 	mib2_ipRouteEntry_t	*rp1;
1785 	struct	in_addr		nexthop_v4;
1786 	mib2_ipRouteEntry_t	*endp;
1787 
1788 	if (len == 0)
1789 		return;
1790 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1791 
1792 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1793 
1794 	/*
1795 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1796 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1797 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1798 	 * This is a potential target for probing, which we try to add
1799 	 * to the list of probe targets.
1800 	 */
1801 	for (rp = buf; rp < endp; rp++) {
1802 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1803 			continue;
1804 
1805 		/*  Get the nexthop address. */
1806 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1807 
1808 		/*
1809 		 * Get the nexthop address. Then determine the outgoing
1810 		 * interface, by examining all interface IREs, and picking the
1811 		 * match. We don't look at the interface specified in the route
1812 		 * because we need to add the router target on all matching
1813 		 * interfaces anyway; the goal is to avoid falling back to
1814 		 * multicast when some interfaces are in the same subnet but
1815 		 * not in the same group.
1816 		 */
1817 		for (rp1 = buf; rp1 < endp; rp1++) {
1818 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1819 				continue;
1820 			}
1821 
1822 			/*
1823 			 * Determine the interface IRE that matches the nexthop.
1824 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1825 			 */
1826 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1827 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1828 				/*
1829 				 * We found the interface ire
1830 				 */
1831 				router_add_v4(rp1, nexthop_v4);
1832 			}
1833 		}
1834 	}
1835 }
1836 
1837 void
1838 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1839 {
1840 	char *cp;
1841 	char ifname[LIFNAMSIZ + 1];
1842 	struct in6_addr	nexthop;
1843 	int len;
1844 
1845 	if (debug & D_TARGET)
1846 		logdebug("router_add_v4()\n");
1847 
1848 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1849 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1850 	ifname[len] = '\0';
1851 
1852 	if (ifname[0] == '\0')
1853 		return;
1854 
1855 	cp = strchr(ifname, IF_SEPARATOR);
1856 	if (cp != NULL)
1857 		*cp = '\0';
1858 
1859 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1860 	router_add_common(AF_INET, ifname, nexthop);
1861 }
1862 
1863 void
1864 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1865 {
1866 	struct phyint_instance *pii;
1867 	struct phyint *pi;
1868 
1869 	if (debug & D_TARGET)
1870 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1871 
1872 	/*
1873 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1874 	 */
1875 	pii = phyint_inst_lookup(af, ifname);
1876 	if (pii == NULL)
1877 		return;
1878 
1879 	/*
1880 	 * Don't use our own addresses as targets.
1881 	 */
1882 	if (own_address(nexthop))
1883 		return;
1884 
1885 	/*
1886 	 * If the phyint is part a named group, then add the address to all
1887 	 * members of the group; note that this is suboptimal in the IPv4 case
1888 	 * as it has already been added to all matching interfaces in
1889 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1890 	 * itself, since other phyints in the anongroup may not be on the same
1891 	 * subnet.
1892 	 */
1893 	pi = pii->pii_phyint;
1894 	if (pi->pi_group == phyint_anongroup) {
1895 		target_add(pii, nexthop, _B_TRUE);
1896 	} else {
1897 		pi = pi->pi_group->pg_phyint;
1898 		for (; pi != NULL; pi = pi->pi_pgnext)
1899 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1900 	}
1901 }
1902 
1903 /*
1904  * Examine the IPv6 routing table, for default routers. For each default
1905  * router, populate the list of targets of each phyint that is on the same
1906  * link as the default router
1907  */
1908 static void
1909 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1910 {
1911 	mib2_ipv6RouteEntry_t	*rp;
1912 	mib2_ipv6RouteEntry_t	*endp;
1913 	struct	in6_addr nexthop_v6;
1914 
1915 	if (debug & D_TARGET)
1916 		logdebug("ire_process_v6(len %d)\n", len);
1917 
1918 	if (len == 0)
1919 		return;
1920 
1921 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1922 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1923 
1924 	/*
1925 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1926 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1927 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1928 	 * This is a potential target for probing, which we try to add
1929 	 * to the list of probe targets.
1930 	 */
1931 	for (rp = buf; rp < endp; rp++) {
1932 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1933 			continue;
1934 
1935 		/*
1936 		 * We have the outgoing interface in ipv6RouteIfIndex
1937 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1938 		 * interface must be present for link-local addresses. Since
1939 		 * we use only link-local addreses for probing, we don't
1940 		 * consider the case when the outgoing interface is not
1941 		 * known and we need to scan interface ires
1942 		 */
1943 		nexthop_v6 = rp->ipv6RouteNextHop;
1944 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1945 			/*
1946 			 * We already have the outgoing interface
1947 			 * in ipv6RouteIfIndex.
1948 			 */
1949 			router_add_v6(rp, nexthop_v6);
1950 		}
1951 	}
1952 }
1953 
1954 
1955 void
1956 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1957 {
1958 	char ifname[LIFNAMSIZ + 1];
1959 	char *cp;
1960 	int  len;
1961 
1962 	if (debug & D_TARGET)
1963 		logdebug("router_add_v6()\n");
1964 
1965 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1966 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1967 	ifname[len] = '\0';
1968 
1969 	if (ifname[0] == '\0')
1970 		return;
1971 
1972 	cp = strchr(ifname, IF_SEPARATOR);
1973 	if (cp != NULL)
1974 		*cp = '\0';
1975 
1976 	router_add_common(AF_INET6, ifname, nexthop_v6);
1977 }
1978 
1979 
1980 
1981 /*
1982  * Build a list of target routers, by scanning the routing tables.
1983  * It is assumed that interface routes exist, to reach the routers.
1984  */
1985 static void
1986 init_router_targets(void)
1987 {
1988 	struct	target *tg;
1989 	struct	target *next_tg;
1990 	struct	phyint_instance *pii;
1991 	struct	phyint *pi;
1992 
1993 	if (force_mcast)
1994 		return;
1995 
1996 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1997 		pi = pii->pii_phyint;
1998 		/*
1999 		 * Exclude ptp and host targets. Set tg_in_use to false,
2000 		 * only for router targets.
2001 		 */
2002 		if (!pii->pii_targets_are_routers ||
2003 		    (pi->pi_flags & IFF_POINTOPOINT))
2004 			continue;
2005 
2006 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
2007 			tg->tg_in_use = 0;
2008 	}
2009 
2010 	if (mibfd < 0) {
2011 		mibfd = open("/dev/ip", O_RDWR);
2012 		if (mibfd < 0) {
2013 			logperror("mibopen: ip open");
2014 			exit(1);
2015 		}
2016 	}
2017 
2018 	if (!update_router_list(mibfd)) {
2019 		(void) close(mibfd);
2020 		mibfd = -1;
2021 	}
2022 
2023 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2024 		if (!pii->pii_targets_are_routers ||
2025 		    (pi->pi_flags & IFF_POINTOPOINT))
2026 			continue;
2027 
2028 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
2029 			next_tg = tg->tg_next;
2030 			if (!tg->tg_in_use) {
2031 				target_delete(tg);
2032 			}
2033 		}
2034 	}
2035 }
2036 
2037 /*
2038  * Attempt to assign host targets to any interfaces that do not currently
2039  * have probe targets by sharing targets with other interfaces in the group.
2040  */
2041 static void
2042 init_host_targets(void)
2043 {
2044 	struct phyint_instance *pii;
2045 	struct phyint_group *pg;
2046 
2047 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2048 		pg = pii->pii_phyint->pi_group;
2049 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
2050 			dup_host_targets(pii);
2051 	}
2052 }
2053 
2054 /*
2055  * Duplicate host targets from other phyints of the group to
2056  * the phyint instance 'desired_pii'.
2057  */
2058 static void
2059 dup_host_targets(struct phyint_instance	 *desired_pii)
2060 {
2061 	int af;
2062 	struct phyint *pi;
2063 	struct phyint_instance *pii;
2064 	struct target *tg;
2065 
2066 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2067 
2068 	af = desired_pii->pii_af;
2069 
2070 	/*
2071 	 * For every phyint in the same group as desired_pii, check if
2072 	 * it has any host targets. If so add them to desired_pii.
2073 	 */
2074 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2075 		pii = PHYINT_INSTANCE(pi, af);
2076 		/*
2077 		 * We know that we don't have targets on this phyint instance
2078 		 * since we have been called. But we still check for
2079 		 * pii_targets_are_routers because another phyint instance
2080 		 * could have router targets, since IFF_NOFAILOVER addresses
2081 		 * on different phyint instances may belong to different
2082 		 * subnets.
2083 		 */
2084 		if ((pii == NULL) || (pii == desired_pii) ||
2085 		    pii->pii_targets_are_routers)
2086 			continue;
2087 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2088 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2089 		}
2090 	}
2091 }
2092 
/*
 * Print a one-line usage message for program name `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	static const char usage_fmt[] = "usage: %s\n";

	(void) fprintf(stderr, usage_fmt, cmd);
}
2098 
2099 
2100 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2101 
2102 /* Get an option from the /etc/default/mpathd file */
2103 static char *
2104 getdefault(char *name)
2105 {
2106 	char namebuf[BUFSIZ];
2107 	char *value = NULL;
2108 
2109 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2110 		char	*cp;
2111 		int	flags;
2112 
2113 		/*
2114 		 * ignore case
2115 		 */
2116 		flags = defcntl(DC_GETFLAGS, 0);
2117 		TURNOFF(flags, DC_CASE);
2118 		(void) defcntl(DC_SETFLAGS, flags);
2119 
2120 		/* Add "=" to the name */
2121 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2122 		(void) strncat(namebuf, "=", 2);
2123 
2124 		if ((cp = defread(namebuf)) != NULL)
2125 			value = strdup(cp);
2126 
2127 		/* close */
2128 		(void) defopen((char *)NULL);
2129 	}
2130 	return (value);
2131 }
2132 
2133 
2134 /*
2135  * Command line options below
2136  */
2137 boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
2138 boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
2139 static boolean_t adopt = _B_FALSE;
2140 static boolean_t foreground = _B_FALSE;
2141 
2142 int
2143 main(int argc, char *argv[])
2144 {
2145 	int i;
2146 	int c;
2147 	struct phyint_instance *pii;
2148 	char *value;
2149 
2150 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2151 	srandom(gethostid());	/* Initialize the random number generator */
2152 
2153 	/*
2154 	 * NOTE: The messages output by in.mpathd are not suitable for
2155 	 * translation, so we do not call textdomain().
2156 	 */
2157 	(void) setlocale(LC_ALL, "");
2158 
2159 	/*
2160 	 * Get the user specified value of 'failure detection time'
2161 	 * from /etc/default/mpathd
2162 	 */
2163 	value = getdefault("FAILURE_DETECTION_TIME");
2164 	if (value != NULL) {
2165 		user_failure_detection_time =
2166 		    (int)strtol((char *)value, NULL, 0);
2167 
2168 		if (user_failure_detection_time <= 0) {
2169 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2170 			logerr("Invalid failure detection time %s, assuming "
2171 			    "default %d\n", value, user_failure_detection_time);
2172 
2173 		} else if (user_failure_detection_time <
2174 		    MIN_FAILURE_DETECTION_TIME) {
2175 			user_failure_detection_time =
2176 			    MIN_FAILURE_DETECTION_TIME;
2177 			logerr("Too small failure detection time of %s, "
2178 			    "assuming minimum %d\n", value,
2179 			    user_failure_detection_time);
2180 		}
2181 		free(value);
2182 	} else {
2183 		/* User has not specified the parameter, Use default value */
2184 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2185 	}
2186 
2187 	/*
2188 	 * This gives the frequency at which probes will be sent.
2189 	 * When fdt ms elapses, we should be able to determine
2190 	 * whether 5 consecutive probes have failed or not.
2191 	 * 1 probe will be sent in every user_probe_interval ms,
2192 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2193 	 * user_probe_interval. Thus when we send out probe 'n' we
2194 	 * can be sure that probe 'n - 2' is lost, if we have not
2195 	 * got the ack. (since the probe interval is > crtt). But
2196 	 * probe 'n - 1' may be a valid unacked probe, since the
2197 	 * time between 2 successive probes could be as small as
2198 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2199 	 */
2200 	user_probe_interval = user_failure_detection_time /
2201 	    (NUM_PROBE_FAILS + 2);
2202 
2203 	/*
2204 	 * Get the user specified value of failback_enabled from
2205 	 * /etc/default/mpathd
2206 	 */
2207 	value = getdefault("FAILBACK");
2208 	if (value != NULL) {
2209 		if (strncasecmp(value, "yes", 3) == 0)
2210 			failback_enabled = _B_TRUE;
2211 		else if (strncasecmp(value, "no", 2) == 0)
2212 			failback_enabled = _B_FALSE;
2213 		else
2214 			logerr("Invalid value for FAILBACK %s\n", value);
2215 		free(value);
2216 	} else {
2217 		failback_enabled = _B_TRUE;
2218 	}
2219 
2220 	/*
2221 	 * Get the user specified value of track_all_phyints from
2222 	 * /etc/default/mpathd. The sense is reversed in
2223 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2224 	 */
2225 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2226 	if (value != NULL) {
2227 		if (strncasecmp(value, "yes", 3) == 0)
2228 			track_all_phyints = _B_FALSE;
2229 		else if (strncasecmp(value, "no", 2) == 0)
2230 			track_all_phyints = _B_TRUE;
2231 		else
2232 			logerr("Invalid value for "
2233 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2234 		free(value);
2235 	} else {
2236 		track_all_phyints = _B_FALSE;
2237 	}
2238 
2239 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2240 		switch (c) {
2241 		case 'a':
2242 			adopt = _B_TRUE;
2243 			break;
2244 		case 'm':
2245 			force_mcast = _B_TRUE;
2246 			break;
2247 		case 'd':
2248 			debug = D_ALL;
2249 			foreground = _B_TRUE;
2250 			break;
2251 		case 'D':
2252 			i = (int)strtol(optarg, NULL, 0);
2253 			if (i == 0) {
2254 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2255 				    optarg);
2256 				exit(1);
2257 			}
2258 			debug |= i;
2259 			foreground = _B_TRUE;
2260 			break;
2261 		case 'l':
2262 			/*
2263 			 * Turn off link state notification handling.
2264 			 * Undocumented command line flag, for debugging
2265 			 * purposes.
2266 			 */
2267 			handle_link_notifications = _B_FALSE;
2268 			break;
2269 		default:
2270 			usage(argv[0]);
2271 			exit(1);
2272 		}
2273 	}
2274 
2275 	/*
2276 	 * The sockets for the loopback command interface should be listening
2277 	 * before we fork and exit in daemonize(). This way, whoever started us
2278 	 * can use the loopback interface as soon as they get a zero exit
2279 	 * status.
2280 	 */
2281 	lsock_v4 = setup_listener(AF_INET);
2282 	lsock_v6 = setup_listener(AF_INET6);
2283 
2284 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2285 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2286 		exit(1);
2287 	}
2288 
2289 	if (!foreground) {
2290 		if (!daemonize()) {
2291 			logerr("cannot daemonize\n");
2292 			exit(EXIT_FAILURE);
2293 		}
2294 		initlog();
2295 	}
2296 
2297 	/*
2298 	 * Initializations:
2299 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2300 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2301 	 * 2. Initialize a pipe for handling/recording signal events.
2302 	 * 3. Create the routing sockets,  used for listening
2303 	 *    to routing / interface changes.
2304 	 * 4. phyint_init() - Initialize physical interface state
2305 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2306 	 *    which timer_init() does indirectly.
2307 	 * 5. timer_init()  - Initialize timer related stuff
2308 	 * 6. initifs() - Initialize our database of all known interfaces
2309 	 * 7. init_router_targets() - Initialize our database of all known
2310 	 *    router targets.
2311 	 */
2312 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2313 	if (ifsock_v4 < 0) {
2314 		logperror("main: IPv4 socket open");
2315 		exit(1);
2316 	}
2317 
2318 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2319 	if (ifsock_v6 < 0) {
2320 		logperror("main: IPv6 socket open");
2321 		exit(1);
2322 	}
2323 
2324 	setup_eventpipe();
2325 
2326 	rtsock_v4 = setup_rtsock(AF_INET);
2327 	rtsock_v6 = setup_rtsock(AF_INET6);
2328 
2329 	if (phyint_init() == -1) {
2330 		logerr("cannot initialize physical interface structures");
2331 		exit(1);
2332 	}
2333 
2334 	timer_init();
2335 
2336 	initifs();
2337 
2338 	/* Inform kernel whether failback is enabled or disabled */
2339 	if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
2340 		logperror("main: ioctl (SIOCSIPMPFAILBACK)");
2341 		exit(1);
2342 	}
2343 
2344 	/*
2345 	 * If we're operating in "adopt" mode and no interfaces need to be
2346 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2347 	 * interfaces are subsequently put into multipathing groups).
2348 	 */
2349 	if (adopt && phyint_instances == NULL)
2350 		exit(0);
2351 
2352 	/*
2353 	 * Main body. Keep listening for activity on any of the sockets
2354 	 * that we are monitoring and take appropriate action as necessary.
2355 	 * signals are also handled synchronously.
2356 	 */
2357 	for (;;) {
2358 		if (poll(pollfds, pollfd_num, -1) < 0) {
2359 			if (errno == EINTR)
2360 				continue;
2361 			logperror("main: poll");
2362 			exit(1);
2363 		}
2364 		for (i = 0; i < pollfd_num; i++) {
2365 			if ((pollfds[i].fd == -1) ||
2366 			    !(pollfds[i].revents & POLLIN))
2367 				continue;
2368 			if (pollfds[i].fd == eventpipe_read) {
2369 				in_signal(eventpipe_read);
2370 				break;
2371 			}
2372 			if (pollfds[i].fd == rtsock_v4 ||
2373 			    pollfds[i].fd == rtsock_v6) {
2374 				process_rtsock(rtsock_v4, rtsock_v6);
2375 				break;
2376 			}
2377 			for (pii = phyint_instances; pii != NULL;
2378 			    pii = pii->pii_next) {
2379 				if (pollfds[i].fd == pii->pii_probe_sock) {
2380 					if (pii->pii_af == AF_INET)
2381 						in_data(pii);
2382 					else
2383 						in6_data(pii);
2384 					break;
2385 				}
2386 			}
2387 			if (pollfds[i].fd == lsock_v4)
2388 				loopback_cmd(lsock_v4, AF_INET);
2389 			else if (pollfds[i].fd == lsock_v6)
2390 				loopback_cmd(lsock_v6, AF_INET6);
2391 		}
2392 		if (full_scan_required) {
2393 			initifs();
2394 			full_scan_required = _B_FALSE;
2395 		}
2396 	}
2397 	/* NOTREACHED */
2398 	return (EXIT_SUCCESS);
2399 }
2400 
2401 static int
2402 setup_listener(int af)
2403 {
2404 	int sock;
2405 	int on;
2406 	int len;
2407 	int ret;
2408 	struct sockaddr_storage laddr;
2409 	struct sockaddr_in  *sin;
2410 	struct sockaddr_in6 *sin6;
2411 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2412 
2413 	assert(af == AF_INET || af == AF_INET6);
2414 
2415 	sock = socket(af, SOCK_STREAM, 0);
2416 	if (sock < 0) {
2417 		logperror("setup_listener: socket");
2418 		exit(1);
2419 	}
2420 
2421 	on = 1;
2422 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2423 	    sizeof (on)) < 0) {
2424 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2425 		exit(1);
2426 	}
2427 
2428 	bzero(&laddr, sizeof (laddr));
2429 	laddr.ss_family = af;
2430 
2431 	if (af == AF_INET) {
2432 		sin = (struct sockaddr_in *)&laddr;
2433 		sin->sin_port = htons(MPATHD_PORT);
2434 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2435 		len = sizeof (struct sockaddr_in);
2436 	} else {
2437 		sin6 = (struct sockaddr_in6 *)&laddr;
2438 		sin6->sin6_port = htons(MPATHD_PORT);
2439 		sin6->sin6_addr = loopback_addr;
2440 		len = sizeof (struct sockaddr_in6);
2441 	}
2442 
2443 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2444 	if (ret < 0) {
2445 		if (errno == EADDRINUSE) {
2446 			/*
2447 			 * Another instance of mpathd may be already active.
2448 			 */
2449 			logerr("main: is another instance of in.mpathd "
2450 			    "already active?\n");
2451 			exit(1);
2452 		} else {
2453 			(void) close(sock);
2454 			return (-1);
2455 		}
2456 	}
2457 	if (listen(sock, 30) < 0) {
2458 		logperror("main: listen");
2459 		exit(1);
2460 	}
2461 	if (poll_add(sock) == -1) {
2462 		(void) close(sock);
2463 		exit(1);
2464 	}
2465 
2466 	return (sock);
2467 }
2468 
2469 /*
2470  * Table of commands and their expected size; used by loopback_cmd().
2471  */
2472 static struct {
2473 	const char	*name;
2474 	unsigned int	size;
2475 } commands[] = {
2476 	{ "MI_PING",		sizeof (uint32_t)	},
2477 	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
2478 	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
2479 	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
2480 	{ "MI_QUERY",		sizeof (mi_query_t)	}
2481 };
2482 
2483 /*
2484  * Commands received over the loopback interface come here. Currently
2485  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2486  * module. ifconfig only makes a connection, and closes it to check if
2487  * in.mpathd is running.
2488  * if_mpadm sends commands in the format specified by the mpathd_interface
2489  * structure.
2490  */
2491 static void
2492 loopback_cmd(int sock, int family)
2493 {
2494 	int newfd;
2495 	ssize_t len;
2496 	struct sockaddr_storage	peer;
2497 	struct sockaddr_in	*peer_sin;
2498 	struct sockaddr_in6	*peer_sin6;
2499 	socklen_t peerlen;
2500 	union mi_commands mpi;
2501 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2502 	char abuf[INET6_ADDRSTRLEN];
2503 	uint_t cmd;
2504 	int retval;
2505 
2506 	peerlen = sizeof (peer);
2507 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2508 	if (newfd < 0) {
2509 		logperror("loopback_cmd: accept");
2510 		return;
2511 	}
2512 
2513 	switch (family) {
2514 	case AF_INET:
2515 		/*
2516 		 * Validate the address and port to make sure that
2517 		 * non privileged processes don't connect and start
2518 		 * talking to us.
2519 		 */
2520 		if (peerlen != sizeof (struct sockaddr_in)) {
2521 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2522 			(void) close(newfd);
2523 			return;
2524 		}
2525 		peer_sin = (struct sockaddr_in *)&peer;
2526 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2527 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2528 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2529 			    abuf, sizeof (abuf));
2530 			logerr("Attempt to connect from addr %s port %d\n",
2531 			    abuf, ntohs(peer_sin->sin_port));
2532 			(void) close(newfd);
2533 			return;
2534 		}
2535 		break;
2536 
2537 	case AF_INET6:
2538 		if (peerlen != sizeof (struct sockaddr_in6)) {
2539 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2540 			(void) close(newfd);
2541 			return;
2542 		}
2543 		/*
2544 		 * Validate the address and port to make sure that
2545 		 * non privileged processes don't connect and start
2546 		 * talking to us.
2547 		 */
2548 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2549 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2550 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2551 		    &loopback_addr))) {
2552 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2553 			    sizeof (abuf));
2554 			logerr("Attempt to connect from addr %s port %d\n",
2555 			    abuf, ntohs(peer_sin6->sin6_port));
2556 			(void) close(newfd);
2557 			return;
2558 		}
2559 
2560 	default:
2561 		logdebug("loopback_cmd: family %d\n", family);
2562 		(void) close(newfd);
2563 		return;
2564 	}
2565 
2566 	/*
2567 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2568 	 * all supported commands
2569 	 */
2570 	len = read(newfd, &mpi, sizeof (mpi));
2571 
2572 	/*
2573 	 * ifconfig does not send any data. Just tests to see if mpathd
2574 	 * is already running.
2575 	 */
2576 	if (len <= 0) {
2577 		(void) close(newfd);
2578 		return;
2579 	}
2580 
2581 	/*
2582 	 * In theory, we can receive any sized message for a stream socket,
2583 	 * but we don't expect that to happen for a small message over a
2584 	 * loopback connection.
2585 	 */
2586 	if (len < sizeof (uint32_t)) {
2587 		logerr("loopback_cmd: bad command format or read returns "
2588 		    "partial data %d\n", len);
2589 	}
2590 
2591 	cmd = mpi.mi_command;
2592 	if (cmd >= MI_NCMD) {
2593 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2594 		(void) close(newfd);
2595 		return;
2596 	}
2597 
2598 	if (len < commands[cmd].size) {
2599 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2600 		    commands[cmd].name, commands[cmd].size, len);
2601 		(void) close(newfd);
2602 		return;
2603 	}
2604 
2605 	retval = process_cmd(newfd, &mpi);
2606 	if (retval != IPMP_SUCCESS) {
2607 		logerr("failed processing %s: %s\n", commands[cmd].name,
2608 		    ipmp_errmsg(retval));
2609 	}
2610 	(void) close(newfd);
2611 }
2612 
2613 extern int global_errno;	/* set by failover() or failback() */
2614 
2615 /*
2616  * Process the offline, undo offline and set original index commands,
2617  * received from if_mpadm(1M)
2618  */
2619 static unsigned int
2620 process_cmd(int newfd, union mi_commands *mpi)
2621 {
2622 	uint_t	nif = 0;
2623 	uint32_t cmd;
2624 	struct phyint *pi;
2625 	struct phyint *pi2;
2626 	struct phyint_group *pg;
2627 	boolean_t success;
2628 	int error;
2629 	struct mi_offline *mio;
2630 	struct mi_undo_offline *miu;
2631 	struct lifreq lifr;
2632 	int ifsock;
2633 	struct mi_setoindex *mis;
2634 
2635 	cmd = mpi->mi_command;
2636 
2637 	switch (cmd) {
2638 	case MI_OFFLINE:
2639 		mio = &mpi->mi_ocmd;
2640 		/*
2641 		 * Lookup the interface that needs to be offlined.
2642 		 * If it does not exist, return a suitable error.
2643 		 */
2644 		pi = phyint_lookup(mio->mio_ifname);
2645 		if (pi == NULL)
2646 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2647 
2648 		/*
2649 		 * Verify that the minimum redundancy requirements are met.
2650 		 * The multipathing group must have at least the specified
2651 		 * number of functional interfaces after offlining the
2652 		 * requested interface. Otherwise return a suitable error.
2653 		 */
2654 		pg = pi->pi_group;
2655 		nif = 0;
2656 		if (pg != phyint_anongroup) {
2657 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2658 			    pi2 = pi2->pi_pgnext) {
2659 				if ((pi2->pi_state == PI_RUNNING) ||
2660 				    (pg->pg_groupfailed &&
2661 				    !(pi2->pi_flags & IFF_OFFLINE)))
2662 					nif++;
2663 			}
2664 		}
2665 		if (nif < mio->mio_min_redundancy)
2666 			return (send_result(newfd, IPMP_EMINRED, 0));
2667 
2668 		/*
2669 		 * The order of operation is to set IFF_OFFLINE, followed by
2670 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2671 		 * can be created. Subsequent failover moves everything on
2672 		 * the OFFLINE interface to some other functional interface.
2673 		 */
2674 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2675 		if (success) {
2676 			if (!pi->pi_empty) {
2677 				error = try_failover(pi, FAILOVER_NORMAL);
2678 				if (error != 0) {
2679 					if (!change_lif_flags(pi, IFF_OFFLINE,
2680 					    _B_FALSE)) {
2681 						logerr("process_cmd: couldn't"
2682 						    " clear OFFLINE flag on"
2683 						    " %s\n", pi->pi_name);
2684 						/*
2685 						 * Offline interfaces should
2686 						 * not be probed.
2687 						 */
2688 						stop_probing(pi);
2689 					}
2690 					return (send_result(newfd, error,
2691 					    global_errno));
2692 				}
2693 			}
2694 		} else {
2695 			return (send_result(newfd, IPMP_FAILURE, errno));
2696 		}
2697 
2698 		/*
2699 		 * The interface is now Offline, so stop probing it.
2700 		 * Note that if_mpadm(1M) will down the test addresses,
2701 		 * after receiving a success reply from us. The routing
2702 		 * socket message will then make us close the socket used
2703 		 * for sending probes. But it is more logical that an
2704 		 * offlined interface must not be probed, even if it has
2705 		 * test addresses.
2706 		 */
2707 		stop_probing(pi);
2708 		return (send_result(newfd, IPMP_SUCCESS, 0));
2709 
2710 	case MI_UNDO_OFFLINE:
2711 		miu = &mpi->mi_ucmd;
2712 		/*
2713 		 * Undo the offline command. As usual lookup the interface.
2714 		 * Send an error if it does not exist or is not offline.
2715 		 */
2716 		pi = phyint_lookup(miu->miu_ifname);
2717 		if (pi == NULL || pi->pi_state != PI_OFFLINE)
2718 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2719 
2720 		/*
2721 		 * Reset the state of the interface based on the current link
2722 		 * state; if this phyint subsequently acquires a test address,
2723 		 * the state will be updated later as a result of the probes.
2724 		 */
2725 		if (LINK_UP(pi))
2726 			phyint_chstate(pi, PI_RUNNING);
2727 		else
2728 			phyint_chstate(pi, PI_FAILED);
2729 
2730 		if (pi->pi_state == PI_RUNNING) {
2731 			/*
2732 			 * Note that the success of MI_UNDO_OFFLINE is not
2733 			 * contingent on actually failing back; in the odd
2734 			 * case where we cannot do it here, we will try again
2735 			 * in initifs() since pi->pi_full will still be zero.
2736 			 */
2737 			if (do_failback(pi) != IPMP_SUCCESS) {
2738 				logdebug("process_cmd: cannot failback from "
2739 				    "%s during MI_UNDO_OFFLINE\n", pi->pi_name);
2740 			}
2741 		}
2742 
2743 		/*
2744 		 * Clear the IFF_OFFLINE flag.  We have to do this last
2745 		 * because do_failback() relies on it being set to decide
2746 		 * when to display messages.
2747 		 */
2748 		(void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE);
2749 
2750 		/*
2751 		 * Give the requestor time to configure test addresses
2752 		 * before complaining that they're missing.
2753 		 */
2754 		pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
2755 
2756 		return (send_result(newfd, IPMP_SUCCESS, 0));
2757 
2758 	case MI_SETOINDEX:
2759 		mis = &mpi->mi_scmd;
2760 
2761 		/* Get the socket for doing ioctls */
2762 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2763 
2764 		/*
2765 		 * Get index of new original interface.
2766 		 * The index is returned in lifr.lifr_index.
2767 		 */
2768 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2769 		    sizeof (lifr.lifr_name));
2770 
2771 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2772 			return (send_result(newfd, IPMP_FAILURE, errno));
2773 
2774 		/*
2775 		 * Set new original interface index.
2776 		 * The new index was put into lifr.lifr_index by the
2777 		 * SIOCGLIFINDEX ioctl.
2778 		 */
2779 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2780 		    sizeof (lifr.lifr_name));
2781 
2782 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2783 			return (send_result(newfd, IPMP_FAILURE, errno));
2784 
2785 		return (send_result(newfd, IPMP_SUCCESS, 0));
2786 
2787 	case MI_QUERY:
2788 		return (process_query(newfd, &mpi->mi_qcmd));
2789 
2790 	default:
2791 		break;
2792 	}
2793 
2794 	return (send_result(newfd, IPMP_EPROTO, 0));
2795 }
2796 
2797 /*
2798  * Process the query request pointed to by `miq' and send a reply on file
2799  * descriptor `fd'.  Returns an IPMP error code.
2800  */
2801 static unsigned int
2802 process_query(int fd, mi_query_t *miq)
2803 {
2804 	ipmp_groupinfo_t	*grinfop;
2805 	ipmp_groupinfolist_t	*grlp;
2806 	ipmp_grouplist_t	*grlistp;
2807 	ipmp_ifinfo_t		*ifinfop;
2808 	ipmp_ifinfolist_t	*iflp;
2809 	ipmp_snap_t		*snap;
2810 	unsigned int		retval;
2811 
2812 	switch (miq->miq_inforeq) {
2813 	case IPMP_GROUPLIST:
2814 		retval = getgrouplist(&grlistp);
2815 		if (retval != IPMP_SUCCESS)
2816 			return (send_result(fd, retval, errno));
2817 
2818 		retval = send_result(fd, IPMP_SUCCESS, 0);
2819 		if (retval == IPMP_SUCCESS)
2820 			retval = send_grouplist(fd, grlistp);
2821 
2822 		ipmp_freegrouplist(grlistp);
2823 		return (retval);
2824 
2825 	case IPMP_GROUPINFO:
2826 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2827 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2828 		if (retval != IPMP_SUCCESS)
2829 			return (send_result(fd, retval, errno));
2830 
2831 		retval = send_result(fd, IPMP_SUCCESS, 0);
2832 		if (retval == IPMP_SUCCESS)
2833 			retval = send_groupinfo(fd, grinfop);
2834 
2835 		ipmp_freegroupinfo(grinfop);
2836 		return (retval);
2837 
2838 	case IPMP_IFINFO:
2839 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2840 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2841 		if (retval != IPMP_SUCCESS)
2842 			return (send_result(fd, retval, errno));
2843 
2844 		retval = send_result(fd, IPMP_SUCCESS, 0);
2845 		if (retval == IPMP_SUCCESS)
2846 			retval = send_ifinfo(fd, ifinfop);
2847 
2848 		ipmp_freeifinfo(ifinfop);
2849 		return (retval);
2850 
2851 	case IPMP_SNAP:
2852 		retval = getsnap(&snap);
2853 		if (retval != IPMP_SUCCESS)
2854 			return (send_result(fd, retval, errno));
2855 
2856 		retval = send_result(fd, IPMP_SUCCESS, 0);
2857 		if (retval != IPMP_SUCCESS)
2858 			goto out;
2859 
2860 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2861 		if (retval != IPMP_SUCCESS)
2862 			goto out;
2863 
2864 		retval = send_grouplist(fd, snap->sn_grlistp);
2865 		if (retval != IPMP_SUCCESS)
2866 			goto out;
2867 
2868 		iflp = snap->sn_ifinfolistp;
2869 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2870 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2871 			if (retval != IPMP_SUCCESS)
2872 				goto out;
2873 		}
2874 
2875 		grlp = snap->sn_grinfolistp;
2876 		for (; grlp != NULL; grlp = grlp->grl_next) {
2877 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2878 			if (retval != IPMP_SUCCESS)
2879 				goto out;
2880 		}
2881 	out:
2882 		ipmp_snap_free(snap);
2883 		return (retval);
2884 
2885 	default:
2886 		break;
2887 
2888 	}
2889 	return (send_result(fd, IPMP_EPROTO, 0));
2890 }
2891 
2892 /*
2893  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2894  * Returns an IPMP error code.
2895  */
2896 static unsigned int
2897 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2898 {
2899 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2900 	unsigned int	retval;
2901 
2902 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2903 	if (retval != IPMP_SUCCESS)
2904 		return (retval);
2905 
2906 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2907 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2908 }
2909 
2910 /*
2911  * Send the interface information pointed to by `ifinfop' on file descriptor
2912  * `fd'.  Returns an IPMP error code.
2913  */
2914 static unsigned int
2915 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2916 {
2917 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2918 }
2919 
2920 /*
2921  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2922  * Returns an IPMP error code.
2923  */
2924 static unsigned int
2925 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2926 {
2927 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2928 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2929 }
2930 
2931 /*
2932  * Initialize an mi_result_t structure using `error' and `syserror' and
2933  * send it on file descriptor `fd'.  Returns an IPMP error code.
2934  */
2935 static unsigned int
2936 send_result(int fd, unsigned int error, int syserror)
2937 {
2938 	mi_result_t me;
2939 
2940 	me.me_mpathd_error = error;
2941 	if (error == IPMP_FAILURE)
2942 		me.me_sys_error = syserror;
2943 	else
2944 		me.me_sys_error = 0;
2945 
2946 	return (ipmp_write(fd, &me, sizeof (me)));
2947 }
2948 
2949 /*
2950  * Daemonize the process.
2951  */
2952 static boolean_t
2953 daemonize(void)
2954 {
2955 	switch (fork()) {
2956 	case -1:
2957 		return (_B_FALSE);
2958 
2959 	case  0:
2960 		/*
2961 		 * Lose our controlling terminal, and become both a session
2962 		 * leader and a process group leader.
2963 		 */
2964 		if (setsid() == -1)
2965 			return (_B_FALSE);
2966 
2967 		/*
2968 		 * Under POSIX, a session leader can accidentally (through
2969 		 * open(2)) acquire a controlling terminal if it does not
2970 		 * have one.  Just to be safe, fork() again so we are not a
2971 		 * session leader.
2972 		 */
2973 		switch (fork()) {
2974 		case -1:
2975 			return (_B_FALSE);
2976 
2977 		case 0:
2978 			(void) chdir("/");
2979 			(void) umask(022);
2980 			(void) fdwalk(closefunc, NULL);
2981 			break;
2982 
2983 		default:
2984 			_exit(EXIT_SUCCESS);
2985 		}
2986 		break;
2987 
2988 	default:
2989 		_exit(EXIT_SUCCESS);
2990 	}
2991 
2992 	return (_B_TRUE);
2993 }
2994 
2995 /*
2996  * The parent has created some fds before forking on purpose, keep them open.
2997  */
2998 static int
2999 closefunc(void *not_used, int fd)
3000 /* ARGSUSED */
3001 {
3002 	if (fd != lsock_v4 && fd != lsock_v6)
3003 		(void) close(fd);
3004 	return (0);
3005 }
3006 
3007 /* LOGGER */
3008 
3009 #include <syslog.h>
3010 
3011 /*
3012  * Logging routines.  All routines log to syslog, unless the daemon is
3013  * running in the foreground, in which case the logging goes to stderr.
3014  *
3015  * The following routines are available:
3016  *
3017  *	logdebug(): A printf-like function for outputting debug messages
3018  *	(messages at LOG_DEBUG) that are only of use to developers.
3019  *
3020  *	logtrace(): A printf-like function for outputting tracing messages
3021  *	(messages at LOG_INFO) from the daemon.	 This is typically used
3022  *	to log the receipt of interesting network-related conditions.
3023  *
3024  *	logerr(): A printf-like function for outputting error messages
3025  *	(messages at LOG_ERR) from the daemon.
3026  *
3027  *	logperror*(): A set of functions used to output error messages
3028  *	(messages at LOG_ERR); these automatically append strerror(errno)
3029  *	and a newline to the message passed to them.
3030  *
3031  * NOTE: since the logging functions write to syslog, the messages passed
3032  *	 to them are not eligible for localization.  Thus, gettext() must
3033  *	 *not* be used.
3034  */
3035 
/* Non-zero once initlog() has run; selects syslog over stderr. */
static int logging = 0;

/*
 * Switch the logging routines over to syslog and open the log.
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
}
3044 
3045 /* PRINTFLIKE1 */
3046 void
3047 logerr(char *fmt, ...)
3048 {
3049 	va_list ap;
3050 
3051 	va_start(ap, fmt);
3052 
3053 	if (logging)
3054 		vsyslog(LOG_ERR, fmt, ap);
3055 	else
3056 		(void) vfprintf(stderr, fmt, ap);
3057 	va_end(ap);
3058 }
3059 
3060 /* PRINTFLIKE1 */
3061 void
3062 logtrace(char *fmt, ...)
3063 {
3064 	va_list ap;
3065 
3066 	va_start(ap, fmt);
3067 
3068 	if (logging)
3069 		vsyslog(LOG_INFO, fmt, ap);
3070 	else
3071 		(void) vfprintf(stderr, fmt, ap);
3072 	va_end(ap);
3073 }
3074 
3075 /* PRINTFLIKE1 */
3076 void
3077 logdebug(char *fmt, ...)
3078 {
3079 	va_list ap;
3080 
3081 	va_start(ap, fmt);
3082 
3083 	if (logging)
3084 		vsyslog(LOG_DEBUG, fmt, ap);
3085 	else
3086 		(void) vfprintf(stderr, fmt, ap);
3087 	va_end(ap);
3088 }
3089 
3090 /* PRINTFLIKE1 */
3091 void
3092 logperror(char *str)
3093 {
3094 	if (logging)
3095 		syslog(LOG_ERR, "%s: %m\n", str);
3096 	else
3097 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3098 }
3099 
3100 void
3101 logperror_pii(struct phyint_instance *pii, char *str)
3102 {
3103 	if (logging) {
3104 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3105 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3106 	} else {
3107 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3108 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3109 		    strerror(errno));
3110 	}
3111 }
3112 
3113 void
3114 logperror_li(struct logint *li, char *str)
3115 {
3116 	struct	phyint_instance	*pii = li->li_phyint_inst;
3117 
3118 	if (logging) {
3119 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3120 		    str, AF_STR(pii->pii_af), li->li_name);
3121 	} else {
3122 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3123 		    str, AF_STR(pii->pii_af), li->li_name,
3124 		    strerror(errno));
3125 	}
3126 }
3127 
3128 void
3129 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3130 {
3131 	if (polled)
3132 		(void) poll_remove(pii->pii_probe_sock);
3133 	(void) close(pii->pii_probe_sock);
3134 	pii->pii_probe_sock = -1;
3135 	pii->pii_basetime_inited = 0;
3136 }
3137