xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 4088bb40326b75ef60834a6c2a92e29e25474b68)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
int debug = 0;				/* Debug flag */
/*
 * pollfds is grown 32 slots at a time by poll_add(); pollfd_num counts the
 * allocated slots, not the fds in use.  A slot whose fd is -1 is free.
 */
static int pollfd_num = 0;		/* Num. of poll descriptors */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */

					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */

static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

/*
 * NOTE(review): presumably set when a complete interface rescan (initifs)
 * is wanted; the readers/writers are outside this part of the file -- confirm.
 */
boolean_t	full_scan_required = _B_FALSE;
static uint_t	last_initifs_time;	/* Time when initifs was last run */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
/*
 * NOTE(review): looks like a switch for acting on NIC link up/down
 * notifications; it is not referenced in this part of the file -- confirm.
 */
boolean_t handle_link_notifications = _B_TRUE;
53 
54 static void	initlog(void);
55 static void	run_timeouts(void);
56 static void	initifs(void);
57 static void	check_if_removed(struct phyint_instance *pii);
58 static void	select_test_ifs(void);
59 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
60 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
61 static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
62     struct in_addr nexthop_v4);
63 static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
64     struct in6_addr nexthop_v6);
65 static void	router_add_common(int af, char *ifname,
66     struct in6_addr nexthop);
67 static void	init_router_targets();
68 static void	cleanup(void);
69 static int	setup_listener(int af);
70 static void	check_config(void);
71 static void	check_addr_unique(struct phyint_instance *,
72     struct sockaddr_storage *);
73 static void	init_host_targets(void);
74 static void	dup_host_targets(struct phyint_instance *desired_pii);
75 static void	loopback_cmd(int sock, int family);
76 static int	poll_remove(int fd);
77 static boolean_t daemonize(void);
78 static int	closefunc(void *, int);
79 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
80 static unsigned int process_query(int fd, mi_query_t *miq);
81 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
82 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
83 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
84 static unsigned int send_result(int fd, unsigned int error, int syserror);
85 
86 struct local_addr *laddr_list = NULL;
87 
88 /*
89  * Return the current time in milliseconds (from an arbitrary reference)
90  * truncated to fit into an int. Truncation is ok since we are interested
91  * only in differences and not the absolute values.
92  */
93 uint_t
94 getcurrenttime(void)
95 {
96 	uint_t	cur_time;	/* In ms */
97 
98 	/*
99 	 * Use of a non-user-adjustable source of time is
100 	 * required. However millisecond precision is sufficient.
101 	 * divide by 10^6
102 	 */
103 	cur_time = (uint_t)(gethrtime() / 1000000LL);
104 	return (cur_time);
105 }
106 
107 /*
108  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
109  */
110 int
111 poll_add(int fd)
112 {
113 	int i;
114 	int new_num;
115 	struct pollfd *newfds;
116 retry:
117 	/* Check if already present */
118 	for (i = 0; i < pollfd_num; i++) {
119 		if (pollfds[i].fd == fd)
120 			return (0);
121 	}
122 	/* Check for empty spot already present */
123 	for (i = 0; i < pollfd_num; i++) {
124 		if (pollfds[i].fd == -1) {
125 			pollfds[i].fd = fd;
126 			return (0);
127 		}
128 	}
129 
130 	/* Allocate space for 32 more fds and initialize to -1 */
131 	new_num = pollfd_num + 32;
132 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
133 	if (newfds == NULL) {
134 		logperror("poll_add: realloc");
135 		return (-1);
136 	}
137 	for (i = pollfd_num; i < new_num; i++) {
138 		newfds[i].fd = -1;
139 		newfds[i].events = POLLIN;
140 	}
141 	pollfd_num = new_num;
142 	pollfds = newfds;
143 	goto retry;
144 }
145 
146 /*
147  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
148  */
149 static int
150 poll_remove(int fd)
151 {
152 	int i;
153 
154 	/* Check if already present */
155 	for (i = 0; i < pollfd_num; i++) {
156 		if (pollfds[i].fd == fd) {
157 			pollfds[i].fd = -1;
158 			return (0);
159 		}
160 	}
161 	return (-1);
162 }
163 
164 /*
165  * Extract information about the phyint instance. If the phyint instance still
166  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
167  * will use it to detect phyint instances that don't exist any longer and
168  * remove them, from our database of phyint instances.
169  * Return value:
170  *	returns true if the phyint instance exists in the kernel,
171  *	returns false otherwise
172  */
173 static boolean_t
174 pii_process(int af, char *name, struct phyint_instance **pii_p)
175 {
176 	int err;
177 	struct phyint_instance *pii;
178 	struct phyint_instance *pii_other;
179 
180 	if (debug & D_PHYINT)
181 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
182 
183 	pii = phyint_inst_lookup(af, name);
184 	if (pii == NULL) {
185 		/*
186 		 * Phyint instance does not exist in our tables,
187 		 * create new phyint instance
188 		 */
189 		pii = phyint_inst_init_from_k(af, name);
190 	} else {
191 		/* Phyint exists in our tables */
192 		err = phyint_inst_update_from_k(pii);
193 
194 		switch (err) {
195 		case PI_IOCTL_ERROR:
196 			/* Some ioctl error. don't change anything */
197 			pii->pii_in_use = 1;
198 			break;
199 
200 		case PI_GROUP_CHANGED:
201 			/*
202 			 * The phyint has changed group.
203 			 */
204 			restore_phyint(pii->pii_phyint);
205 			/* FALLTHRU */
206 
207 		case PI_IFINDEX_CHANGED:
208 			/*
209 			 * Interface index has changed. Delete and
210 			 * recreate the phyint as it is quite likely
211 			 * the interface has been unplumbed and replumbed.
212 			 */
213 			pii_other = phyint_inst_other(pii);
214 			if (pii_other != NULL)
215 				phyint_inst_delete(pii_other);
216 			phyint_inst_delete(pii);
217 			pii = phyint_inst_init_from_k(af, name);
218 			break;
219 
220 		case PI_DELETED:
221 			/* Phyint instance has disappeared from kernel */
222 			pii->pii_in_use = 0;
223 			break;
224 
225 		case PI_OK:
226 			/* Phyint instance exists and is fine */
227 			pii->pii_in_use = 1;
228 			break;
229 
230 		default:
231 			/* Unknown status */
232 			logerr("pii_process: Unknown status %d\n", err);
233 			break;
234 		}
235 	}
236 
237 	*pii_p = pii;
238 	if (pii != NULL)
239 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
240 	else
241 		return (_B_FALSE);
242 }
243 
244 /*
245  * This phyint is leaving the group. Try to restore the phyint to its
246  * initial state. Return the addresses that belong to other group members,
247  * to the group, and take back any addresses owned by this phyint
248  */
249 void
250 restore_phyint(struct phyint *pi)
251 {
252 	if (pi->pi_group == phyint_anongroup)
253 		return;
254 
255 	/*
256 	 * Move everthing to some other member in the group.
257 	 * The phyint has changed group in the kernel. But we
258 	 * have yet to do it in our tables.
259 	 */
260 	if (!pi->pi_empty)
261 		(void) try_failover(pi, FAILOVER_TO_ANY);
262 	/*
263 	 * Move all addresses owned by 'pi' back to pi, from each
264 	 * of the other members of the group
265 	 */
266 	(void) try_failback(pi);
267 }
268 
/*
 * Scan all interfaces to detect changes as well as new and deleted interfaces.
 *
 * The scan is done in the following order:
 *   1. Record the scan time in last_initifs_time and free laddr_list.
 *   2. Clear pii_in_use on every phyint instance and li_in_use on every
 *	logint (and li_dupaddr on each probe logint).
 *   3. Get the list of logical interfaces from the kernel (SIOCGLIFNUM,
 *	SIOCGLIFCONF).  For each entry, add its address to laddr_list if
 *	the interface is IFF_UP, call pii_process() on the phyint name, and
 *	if the phyint instance exists call logint_init_from_k() and
 *	check_addr_unique().
 *   4. Log a recovery message for each test address that is no longer
 *	a duplicate.
 *   5. Call check_if_removed() on every phyint instance.
 *   6. Call select_test_ifs() and process_link_state_changes().
 *   7. For each phyint whose group has not failed, retry any pending
 *	failover or failback.
 *
 * If SIOCGLIFNUM, calloc or SIOCGLIFCONF fails, we return right after step 2,
 * i.e. with laddr_list empty and the in_use flags cleared; nothing is deleted
 * in that case since step 5 is not reached.
 */
static void
initifs()
{
	int	n;
	int	af;
	char	*cp;
	char	*buf;
	int	numifs;
	struct lifnum	lifn;
	struct lifconf	lifc;
	struct lifreq	*lifr;
	struct logint	*li;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	char	pi_name[LIFNAMSIZ + 1];
	boolean_t exists;
	struct phyint	*pi;
	struct local_addr *next;

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	last_initifs_time = getcurrenttime();

	/*
	 * Free the laddr_list before collecting the local addresses.
	 */
	while (laddr_list != NULL) {
		next = laddr_list->next;
		free(laddr_list);
		laddr_list = next;
	}

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the testaddress is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/* Ask the kernel how many logical interfaces there are (all zones) */
	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = LIFC_ALLZONES;
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface numbers)");
		return;
	}
	numifs = lifn.lifn_count;

	buf = (char *)calloc(numifs, sizeof (struct lifreq));
	if (buf == NULL) {
		logperror("initifs: calloc");
		return;
	}

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = LIFC_ALLZONES;
	lifc.lifc_len = numifs * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		/*
		 * EINVAL is commonly encountered, when things change
		 * underneath us rapidly, (eg. at boot, when new interfaces
		 * are plumbed successively) and the kernel finds the buffer
		 * size we passed as too small. We will retry again
		 * when we see the next routing socket msg, or at worst after
		 * IF_SCAN_INTERVAL ms.
		 */
		if (errno != EINVAL) {
			logperror("initifs: ioctl"
			    " (get interface configuration)");
		}
		free(buf);
		return;
	}

	lifr = (struct lifreq *)lifc.lifc_req;

	/*
	 * For each lifreq returned by SIOCGLIFCONF, call pii_process()
	 * and get the state of the corresponding phyint_instance. If it is
	 * successful, then call logint_init_from_k() to get the state of the
	 * logint.
	 */
	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
		int	sockfd;
		struct local_addr	*taddr;
		struct sockaddr_in	*sin;
		struct sockaddr_in6	*sin6;
		struct lifreq	lifreq;

		af = lifr->lifr_addr.ss_family;

		/*
		 * Collect all local addresses.
		 */
		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
		(void) memset(&lifreq, 0, sizeof (lifreq));
		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name,
		    sizeof (lifreq.lifr_name));

		/*
		 * A failure here skips this logical interface altogether;
		 * ENXIO is not logged.
		 */
		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
			if (errno != ENXIO)
				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
			continue;
		}

		/*
		 * Add the interface address to laddr_list.
		 * Another node might have the same IP address which is up.
		 * In that case, it is appropriate to use the address as a
		 * target, even though it is also configured (but not up) on
		 * the local system.
		 * Hence, the interface address is not added to laddr_list
		 * unless it is IFF_UP.
		 */
		if (lifreq.lifr_flags & IFF_UP) {
			taddr = malloc(sizeof (struct local_addr));
			if (taddr == NULL) {
				/*
				 * Note that this also skips pii_process()
				 * for this logical interface.
				 */
				logperror("initifs: malloc");
				continue;
			}
			if (af == AF_INET) {
				/* IPv4 addresses are kept in v4-mapped form */
				sin = (struct sockaddr_in *)&lifr->lifr_addr;
				IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
				    &taddr->addr);
			} else {
				sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
				taddr->addr = sin6->sin6_addr;
			}
			/* Insert at the head of the list */
			taddr->next = laddr_list;
			laddr_list = taddr;
		}

		/*
		 * Need to pass a phyint name to pii_process. Insert the
		 * null where the ':' IF_SEPARATOR is found in the logical
		 * name.
		 */
		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		exists = pii_process(af, pi_name, &pii);
		if (exists) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
			check_addr_unique(pii, &lifr->lifr_addr);
		}

	}

	free(buf);

	/*
	 * If the test address is now unique, and if it was not unique
	 * previously, clear the li_dupaddrmsg_printed flag and log a
	 * recovery message
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		struct logint *li;	/* shadows the function-scope 'li' */
		char abuf[INET6_ADDRSTRLEN];

		li = pii->pii_probe_logint;
		if ((li != NULL) && !li->li_dupaddr &&
		    li->li_dupaddrmsg_printed) {
			logerr("Test address %s is unique in group; enabling "
			    "probe-based failure detection on %s\n",
			    pr_addr(pii->pii_af, li->li_addr, abuf,
				sizeof (abuf)), pii->pii_phyint->pi_name);
			li->li_dupaddrmsg_printed = 0;
		}
	}

	/*
	 * Scan for phyints and logints that have disappeared from the
	 * kernel, and delete them.
	 */
	pii = phyint_instances;

	while (pii != NULL) {
		/* Fetch the successor before check_if_removed() is called */
		next_pii = pii->pii_next;
		check_if_removed(pii);
		pii = next_pii;
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications from the NICs.
	 */
	process_link_state_changes();

	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		/*
		 * If this is a case of group failure, we don't have much
		 * to do until the group recovers again.
		 */
		if (GROUP_FAILED(pi->pi_group))
			continue;

		/*
		 * Try/Retry any pending failovers / failbacks, that did
		 * not complete, or that could not be initiated previously.
		 * This implements the 3 invariants described in the big block
		 * comment at the beginning of probe.c
		 */
		if (pi->pi_flags & IFF_INACTIVE) {
			/* An inactive, non-empty standby gives up what it hosts */
			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			/* shadows the function-scope 'pii' */
			struct phyint_instance *pii;

			/*
			 * Skip interfaces which are not capable of probing,
			 * and interfaces that have downed links (as we will
			 * not get any response).
			 */
			if (LINK_DOWN(pi))
				continue;

			/* Use the IPv4 instance if it can probe, else IPv6 */
			pii = pi->pi_v4;
			if (!PROBE_CAPABLE(pii)) {
				pii = pi->pi_v6;
				if (!PROBE_CAPABLE(pii))
					continue;
			}

			/*
			 * It is possible that the phyint has started
			 * receiving packets, after it has been marked
			 * PI_FAILED. Don't initiate failover, if the
			 * phyint has started recovering. failure_state()
			 * captures this check. A similar logic is used
			 * for failback/repair case.
			 */
			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
			    (failure_state(pii) == PHYINT_FAILURE)) {
				(void) try_failover(pi, FAILOVER_NORMAL);
			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				if (try_failback(pi) != IPMP_FAILURE) {
					(void) change_lif_flags(pi, IFF_FAILED,
					    _B_FALSE);
					/* Per state diagram */
					pi->pi_empty = 0;
				}
			}
		}
	}
}
535 
536 /*
537  * Check that a given test address is unique across all of the interfaces in a
538  * group.  (e.g., IPv6 link-locals may not be inherently unique, and binding
539  * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
540  * Log an error and alert the user.
541  */
542 static void
543 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss)
544 {
545 	struct phyint		*pi;
546 	struct phyint_group	*pg;
547 	struct in6_addr		addr;
548 	struct phyint_instance	*pii;
549 	struct sockaddr_in	*sin;
550 	char			abuf[INET6_ADDRSTRLEN];
551 
552 	if (ss->ss_family == AF_INET) {
553 		sin = (struct sockaddr_in *)ss;
554 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
555 	} else {
556 		assert(ss->ss_family == AF_INET6);
557 		addr = ((struct sockaddr_in6 *)ss)->sin6_addr;
558 	}
559 
560 	/*
561 	 * For anonymous groups, every interface is assumed to be on its own
562 	 * link, so there is no chance of overlapping addresses.
563 	 */
564 	pg = ourpii->pii_phyint->pi_group;
565 	if (pg == phyint_anongroup)
566 		return;
567 
568 	/*
569 	 * Walk the list of phyint instances in the group and check for test
570 	 * addresses matching ours.  Of course, we skip ourself.
571 	 */
572 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
573 		pii = PHYINT_INSTANCE(pi, ss->ss_family);
574 		if (pii == NULL || pii == ourpii ||
575 		    pii->pii_probe_logint == NULL)
576 			continue;
577 
578 		if (!IN6_ARE_ADDR_EQUAL(&addr,
579 		    &pii->pii_probe_logint->li_addr)) {
580 			continue;
581 		}
582 
583 		/*
584 		 * This test address is not unique. Set the dupaddr bit
585 		 * and log an error message if not already logged.
586 		 */
587 		pii->pii_probe_logint->li_dupaddr = 1;
588 		if (!pii->pii_probe_logint->li_dupaddrmsg_printed) {
589 			logerr("Test address %s is not unique in group; "
590 			    "disabling probe-based failure detection on %s\n",
591 			    pr_addr(ss->ss_family, addr, abuf, sizeof (abuf)),
592 			    pii->pii_phyint->pi_name);
593 			pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
594 		}
595 	}
596 }
597 
598 /*
599  * Stop probing an interface.  Called when an interface is offlined.
600  * The probe socket is closed on each interface instance, and the
601  * interface state set to PI_OFFLINE.
602  */
603 static void
604 stop_probing(struct phyint *pi)
605 {
606 	struct phyint_instance *pii;
607 
608 	pii = pi->pi_v4;
609 	if (pii != NULL) {
610 		if (pii->pii_probe_sock != -1)
611 			close_probe_socket(pii, _B_TRUE);
612 		pii->pii_probe_logint = NULL;
613 	}
614 
615 	pii = pi->pi_v6;
616 	if (pii != NULL) {
617 		if (pii->pii_probe_sock != -1)
618 			close_probe_socket(pii, _B_TRUE);
619 		pii->pii_probe_logint = NULL;
620 	}
621 
622 	phyint_chstate(pi, PI_OFFLINE);
623 }
624 
625 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
626 
627 /*
628  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
629  * IFF_UP must also be set so that the associated address can be used as a
630  * source address.  Further, we must be able to exchange packets with local
631  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
632  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
633  */
634 static int
635 rate_testflags(uint64_t flags)
636 {
637 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
638 		return (BAD_TESTFLAGS);
639 
640 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
641 		return (BAD_TESTFLAGS);
642 
643 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
644 		return (BEST_TESTFLAGS);
645 
646 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
647 		return (BEST_TESTFLAGS);
648 
649 	return (OK_TESTFLAGS);
650 }
651 
/*
 * Attempt to select a test address for each phyint instance.
 * Call phyint_inst_sockinit() to complete the initializations.
 *
 * For every phyint instance that is not offline and does not already have
 * a BEST_TESTFLAGS test address, the logints are walked and the best rated
 * one (see rate_testflags()) becomes the probe logint. If that is a change,
 * the old probe socket is closed, a new one is set up, the phyint state is
 * adjusted and the probe timer is started.
 *
 * Once all instances have been looked at, failed phyints that can no longer
 * send probes are checked for repair, and if some instance with a new test
 * address has no targets, the target lists are (re)built.
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint		*li;
	struct logint  		*probe_logint;
	boolean_t		target_scan_reqd = _B_FALSE;
	struct target		*tg;
	int			rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		/*
		 * Pick up the successor now, since pii may be deleted
		 * below if phyint_inst_sockinit() fails.
		 */
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
					" interface %s (state is: %u)."
					" Stopping probes.\n",
					pii->pii_phyint->pi_name,
					pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip 0.0.0.0 addresses, as those are never
			 * actually usable.
			 */
			if (pii->pii_af == AF_INET &&
			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
				continue;

			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 * (This includes the case where no usable test address
		 * was found at all, i.e. probe_logint is NULL.)
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			/* Give up on this instance; next_pii is still valid */
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface, replace the existing
		 * target (if any) with the destination address of the new
		 * test address. The asserts below require that the target
		 * list is empty once the existing target has been deleted.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, probe_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 */
		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}
866 
867 /*
868  * Check phyint group configuration, to detect any inconsistencies,
869  * and log an error message. This is called from runtimeouts every
870  * 20 secs. But the error message is displayed once. If the
871  * consistency is resolved by the admin, a recovery message is displayed
872  * once.
873  */
874 static void
875 check_config(void)
876 {
877 	struct phyint_group *pg;
878 	struct phyint *pi;
879 	boolean_t v4_in_group;
880 	boolean_t v6_in_group;
881 
882 	/*
883 	 * All phyints of a group must be homogenous to ensure that
884 	 * failover or failback can be done. If any phyint in a group
885 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
886 	 * Do a similar check for IPv6.
887 	 */
888 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
889 		if (pg == phyint_anongroup)
890 			continue;
891 
892 		v4_in_group = _B_FALSE;
893 		v6_in_group = _B_FALSE;
894 		/*
895 		 * 1st pass. Determine if at least 1 phyint in the group
896 		 * has IPv4 plumbed and if so set v4_in_group to true.
897 		 * Repeat similarly for IPv6.
898 		 */
899 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
900 			if (pi->pi_v4 != NULL)
901 				v4_in_group = _B_TRUE;
902 			if (pi->pi_v6 != NULL)
903 				v6_in_group = _B_TRUE;
904 		}
905 
906 		/*
907 		 * 2nd pass. If v4_in_group is true, check that phyint
908 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
909 		 * out a message the 1st time only.
910 		 */
911 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
912 			if (pi->pi_flags & IFF_OFFLINE)
913 				continue;
914 
915 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
916 				if (!pi->pi_cfgmsg_printed) {
917 					logerr("NIC %s of group %s is"
918 					    " not plumbed for IPv4 and may"
919 					    " affect failover capability\n",
920 					    pi->pi_name,
921 					    pi->pi_group->pg_name);
922 					pi->pi_cfgmsg_printed = 1;
923 				}
924 			} else if (v6_in_group == _B_TRUE &&
925 			    pi->pi_v6 == NULL) {
926 				if (!pi->pi_cfgmsg_printed) {
927 					logerr("NIC %s of group %s is"
928 					    " not plumbed for IPv6 and may"
929 					    " affect failover capability\n",
930 					    pi->pi_name,
931 					    pi->pi_group->pg_name);
932 					pi->pi_cfgmsg_printed = 1;
933 				}
934 			} else {
935 				/*
936 				 * The phyint matches the group configuration,
937 				 * if we have reached this point. If it was
938 				 * improperly configured earlier, log an
939 				 * error recovery message
940 				 */
941 				if (pi->pi_cfgmsg_printed) {
942 					logerr("NIC %s is now consistent with "
943 					    "group %s and failover capability "
944 					    "is restored\n", pi->pi_name,
945 					    pi->pi_group->pg_name);
946 					pi->pi_cfgmsg_printed = 0;
947 				}
948 			}
949 
950 		}
951 	}
952 
953 	/*
954 	 * In order to perform probe-based failure detection, a phyint must
955 	 * have at least 1 test/probe address for sending and receiving probes
956 	 * (either on IPv4 or IPv6 instance or both).  If no test address has
957 	 * been configured, notify the administrator, but continue on since we
958 	 * can still perform load spreading, along with "link up/down" based
959 	 * failure detection.
960 	 */
961 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
962 		if (pi->pi_flags & IFF_OFFLINE)
963 			continue;
964 
965 		if ((pi->pi_v4 == NULL ||
966 		    pi->pi_v4->pii_probe_logint == NULL) &&
967 		    (pi->pi_v6 == NULL ||
968 		    pi->pi_v6->pii_probe_logint == NULL)) {
969 			if (!pi->pi_taddrmsg_printed) {
970 				logerr("No test address configured on "
971 				    "interface %s; disabling probe-based "
972 				    "failure detection on it\n", pi->pi_name);
973 				pi->pi_taddrmsg_printed = 1;
974 			}
975 		} else if (pi->pi_taddrmsg_printed) {
976 			logerr("Test address now configured on interface %s; "
977 			    "enabling probe-based failure detection on it\n",
978 			    pi->pi_name);
979 			pi->pi_taddrmsg_printed = 0;
980 		}
981 
982 	}
983 }
984 
985 /*
986  * Timer mechanism using relative time (in milliseconds) from the
987  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
988  * will fire after TIMER_INFINITY milliseconds.
989  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
990  * time values. Hence 2 consecutive timer events cannot be spaced farther
991  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
992  * that can be passed for the delay parameter of timer_schedule()
993  */
994 static uint_t timer_next;	/* Currently scheduled timeout */
995 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
996 
997 static void
998 timer_init(void)
999 {
1000 	timer_next = getcurrenttime() + TIMER_INFINITY;
1001 	/*
1002 	 * The call to run_timeouts() will get the timer started
1003 	 * Since there are no phyints at this point, the timer will
1004 	 * be set for IF_SCAN_INTERVAL ms.
1005 	 */
1006 	run_timeouts();
1007 }
1008 
1009 /*
1010  * Make sure the next SIGALRM occurs delay milliseconds from the current
1011  * time if not earlier. We are interested only in time differences.
1012  */
1013 void
1014 timer_schedule(uint_t delay)
1015 {
1016 	uint_t now;
1017 	struct itimerval itimerval;
1018 
1019 	if (debug & D_TIMER)
1020 		logdebug("timer_schedule(%u)\n", delay);
1021 
1022 	assert(delay <= TIMER_INFINITY);
1023 
1024 	now = getcurrenttime();
1025 	if (delay == 0) {
1026 		/* Minimum allowed delay */
1027 		delay = 1;
1028 	}
1029 	/* Will this timer occur before the currently scheduled SIGALRM? */
1030 	if (timer_active && TIME_GE(now + delay, timer_next)) {
1031 		if (debug & D_TIMER) {
1032 			logdebug("timer_schedule(%u) - no action: "
1033 			    "now %u next %u\n", delay, now, timer_next);
1034 		}
1035 		return;
1036 	}
1037 	timer_next = now + delay;
1038 
1039 	itimerval.it_value.tv_sec = delay / 1000;
1040 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
1041 	itimerval.it_interval.tv_sec = 0;
1042 	itimerval.it_interval.tv_usec = 0;
1043 	if (debug & D_TIMER) {
1044 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1045 		    delay, itimerval.it_value.tv_sec,
1046 		    itimerval.it_value.tv_usec);
1047 	}
1048 	timer_active = _B_TRUE;
1049 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1050 		logperror("timer_schedule: setitimer");
1051 		exit(2);
1052 	}
1053 }
1054 
1055 /*
1056  * Timer has fired. Determine when the next timer event will occur by asking
1057  * all the timer routines. Should not be called from a timer routine.
1058  */
1059 static void
1060 run_timeouts(void)
1061 {
1062 	uint_t next;
1063 	uint_t next_event_time;
1064 	struct phyint_instance *pii;
1065 	struct phyint_instance *next_pii;
1066 	static boolean_t timeout_running;
1067 
1068 	/* assert that recursive timeouts don't happen. */
1069 	assert(!timeout_running);
1070 
1071 	timeout_running = _B_TRUE;
1072 
1073 	if (debug & D_TIMER)
1074 		logdebug("run_timeouts()\n");
1075 
1076 	next = TIMER_INFINITY;
1077 
1078 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1079 		next_pii = pii->pii_next;
1080 		next_event_time = phyint_inst_timer(pii);
1081 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1082 			next = next_event_time;
1083 
1084 		if (debug & D_TIMER) {
1085 			logdebug("run_timeouts(%s %s): next scheduled for"
1086 			    " this phyint inst %u, next scheduled global"
1087 			    " %u ms\n",
1088 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1089 			    next_event_time, next);
1090 		}
1091 	}
1092 
1093 	/*
1094 	 * Make sure initifs() is called at least once every
1095 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1096 	 * with the kernel, in case we have missed any routing
1097 	 * socket messages.
1098 	 */
1099 	if (next > IF_SCAN_INTERVAL)
1100 		next = IF_SCAN_INTERVAL;
1101 
1102 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1103 		initifs();
1104 		check_config();
1105 	}
1106 
1107 	if (debug & D_TIMER)
1108 		logdebug("run_timeouts: %u ms\n", next);
1109 
1110 	timer_schedule(next);
1111 	timeout_running = _B_FALSE;
1112 }
1113 
1114 static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1115 static int eventpipe_write = -1;
1116 static boolean_t cleanup_started = _B_FALSE;
1117 				/* Don't write to eventpipe if in cleanup */
1118 /*
1119  * Ensure that signals are processed synchronously with the rest of
1120  * the code by just writing a one character signal number on the pipe.
1121  * The poll loop will pick this up and process the signal event.
1122  */
1123 static void
1124 sig_handler(int signo)
1125 {
1126 	uchar_t buf = (uchar_t)signo;
1127 
1128 	/*
1129 	 * Don't write to pipe if cleanup has already begun. cleanup()
1130 	 * might have closed the pipe already
1131 	 */
1132 	if (cleanup_started)
1133 		return;
1134 
1135 	if (eventpipe_write == -1) {
1136 		logerr("sig_handler: no pipe found\n");
1137 		return;
1138 	}
1139 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1140 		logperror("sig_handler: write");
1141 }
1142 
1143 extern struct probes_missed probes_missed;
1144 
1145 /*
1146  * Pick up a signal "byte" from the pipe and process it.
1147  */
1148 static void
1149 in_signal(int fd)
1150 {
1151 	uchar_t buf;
1152 	uint64_t  sent, acked, lost, unacked, unknown;
1153 	struct phyint_instance *pii;
1154 	int pr_ndx;
1155 
1156 	switch (read(fd, &buf, sizeof (buf))) {
1157 	case -1:
1158 		logperror("in_signal: read");
1159 		exit(1);
1160 		/* NOTREACHED */
1161 	case 1:
1162 		break;
1163 	case 0:
1164 		logerr("in_signal: read end of file\n");
1165 		exit(1);
1166 		/* NOTREACHED */
1167 	default:
1168 		logerr("in_signal: read > 1\n");
1169 		exit(1);
1170 	}
1171 
1172 	if (debug & D_TIMER)
1173 		logdebug("in_signal() got %d\n", buf);
1174 
1175 	switch (buf) {
1176 	case SIGALRM:
1177 		if (debug & D_TIMER) {
1178 			uint_t now = getcurrenttime();
1179 
1180 			logdebug("in_signal(SIGALRM) delta %u\n",
1181 			    now - timer_next);
1182 		}
1183 		timer_active = _B_FALSE;
1184 		run_timeouts();
1185 		break;
1186 	case SIGUSR1:
1187 		logdebug("Printing configuration:\n");
1188 		/* Print out the internal tables */
1189 		phyint_inst_print_all();
1190 
1191 		/*
1192 		 * Print out the accumulated statistics about missed
1193 		 * probes (happens due to scheduling delay).
1194 		 */
1195 		logerr("Missed sending total of %d probes spread over"
1196 		    " %d occurrences\n", probes_missed.pm_nprobes,
1197 		    probes_missed.pm_ntimes);
1198 
1199 		/*
1200 		 * Print out the accumulated statistics about probes
1201 		 * that were sent.
1202 		 */
1203 		for (pii = phyint_instances; pii != NULL;
1204 		    pii = pii->pii_next) {
1205 			unacked = 0;
1206 			acked = pii->pii_cum_stats.acked;
1207 			lost = pii->pii_cum_stats.lost;
1208 			sent = pii->pii_cum_stats.sent;
1209 			unknown = pii->pii_cum_stats.unknown;
1210 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1211 				switch (pii->pii_probes[pr_ndx].pr_status) {
1212 				case PR_ACKED:
1213 					acked++;
1214 					break;
1215 				case PR_LOST:
1216 					lost++;
1217 					break;
1218 				case PR_UNACKED:
1219 					unacked++;
1220 					break;
1221 				}
1222 			}
1223 			logerr("\nProbe stats on (%s %s)\n"
1224 			    "Number of probes sent %lld\n"
1225 			    "Number of probe acks received %lld\n"
1226 			    "Number of probes/acks lost %lld\n"
1227 			    "Number of valid unacknowled probes %lld\n"
1228 			    "Number of ambiguous probe acks received %lld\n",
1229 			    AF_STR(pii->pii_af), pii->pii_name,
1230 			    sent, acked, lost, unacked, unknown);
1231 		}
1232 		break;
1233 	case SIGHUP:
1234 		logerr("SIGHUP: restart and reread config file\n");
1235 		cleanup();
1236 		(void) execv(argv0[0], argv0);
1237 		_exit(0177);
1238 		/* NOTREACHED */
1239 	case SIGINT:
1240 	case SIGTERM:
1241 	case SIGQUIT:
1242 		cleanup();
1243 		exit(0);
1244 		/* NOTREACHED */
1245 	default:
1246 		logerr("in_signal: unknown signal: %d\n", buf);
1247 	}
1248 }
1249 
1250 static void
1251 cleanup(void)
1252 {
1253 	struct phyint_instance *pii;
1254 	struct phyint_instance *next_pii;
1255 
1256 	/*
1257 	 * Make sure that we don't write to eventpipe in
1258 	 * sig_handler() if any signal notably SIGALRM,
1259 	 * occurs after we close the eventpipe descriptor below
1260 	 */
1261 	cleanup_started = _B_TRUE;
1262 
1263 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1264 		next_pii = pii->pii_next;
1265 		phyint_inst_delete(pii);
1266 	}
1267 
1268 	(void) close(ifsock_v4);
1269 	(void) close(ifsock_v6);
1270 	(void) close(rtsock_v4);
1271 	(void) close(rtsock_v6);
1272 	(void) close(lsock_v4);
1273 	(void) close(lsock_v6);
1274 	(void) close(0);
1275 	(void) close(1);
1276 	(void) close(2);
1277 	(void) close(mibfd);
1278 	(void) close(eventpipe_read);
1279 	(void) close(eventpipe_write);
1280 }
1281 
1282 /*
1283  * Create pipe for signal delivery and set up signal handlers.
1284  */
1285 static void
1286 setup_eventpipe(void)
1287 {
1288 	int fds[2];
1289 	struct sigaction act;
1290 
1291 	if ((pipe(fds)) < 0) {
1292 		logperror("setup_eventpipe: pipe");
1293 		exit(1);
1294 	}
1295 	eventpipe_read = fds[0];
1296 	eventpipe_write = fds[1];
1297 	if (poll_add(eventpipe_read) == -1) {
1298 		exit(1);
1299 	}
1300 
1301 	act.sa_handler = sig_handler;
1302 	act.sa_flags = SA_RESTART;
1303 	(void) sigaction(SIGALRM, &act, NULL);
1304 
1305 	(void) sigset(SIGHUP, sig_handler);
1306 	(void) sigset(SIGUSR1, sig_handler);
1307 	(void) sigset(SIGTERM, sig_handler);
1308 	(void) sigset(SIGINT, sig_handler);
1309 	(void) sigset(SIGQUIT, sig_handler);
1310 }
1311 
/*
 * Open a routing socket for address family `af', on which RTM_IFINFO
 * messages are received.  The socket is switched to non-blocking mode and
 * added to the set of polled descriptors.  Returns the descriptor; any
 * failure along the way is fatal (exit status 1).
 */
static int
setup_rtsock(int af)
{
	int	rtsock;
	int	fl;

	rtsock = socket(PF_ROUTE, SOCK_RAW, af);
	if (rtsock == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	/* Add O_NONBLOCK to whatever file status flags are already set. */
	fl = fcntl(rtsock, F_GETFL, 0);
	if (fl < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		(void) close(rtsock);
		exit(1);
	}
	fl |= O_NONBLOCK;
	if (fcntl(rtsock, F_SETFL, fl) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		(void) close(rtsock);
		exit(1);
	}

	if (poll_add(rtsock) != -1)
		return (rtsock);

	(void) close(rtsock);
	exit(1);
	/* NOTREACHED */
}
1342 
1343 /*
1344  * Process an RTM_IFINFO message received on a routing socket.
1345  * The return value indicates whether a full interface scan is required.
1346  * Link up/down notifications from the NICs are reflected in the
1347  * IFF_RUNNING flag.
1348  * If just the state of the IFF_RUNNING interface flag has changed, a
1349  * a full interface scan isn't required.
1350  */
1351 static boolean_t
1352 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1353 {
1354 	struct sockaddr_dl *sdl;
1355 	struct phyint *pi;
1356 	uint64_t old_flags;
1357 	struct phyint_instance *pii;
1358 
1359 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1360 
1361 	/*
1362 	 * Although the sockaddr_dl structure is directly after the
1363 	 * if_msghdr_t structure. At the time of writing, the size of the
1364 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1365 	 * to the presence of a timeval structure, which contains longs,
1366 	 * in the if_data structure.  Anyway, we know where the message ends,
1367 	 * so we work backwards to get the start of the sockaddr_dl structure.
1368 	 */
1369 	/*LINTED*/
1370 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1371 		sizeof (struct sockaddr_dl));
1372 
1373 	assert(sdl->sdl_family == AF_LINK);
1374 
1375 	/*
1376 	 * The interface name is in sdl_data.
1377 	 * RTM_IFINFO messages are only generated for logical interface
1378 	 * zero, so there is no colon and logical interface number to
1379 	 * strip from the name.	 The name is not null terminated, but
1380 	 * there should be enough space in sdl_data to add the null.
1381 	 */
1382 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1383 		if (debug & D_LINKNOTE)
1384 			logdebug("process_rtm_ifinfo: "
1385 				"phyint name too long\n");
1386 		return (_B_TRUE);
1387 	}
1388 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1389 
1390 	pi = phyint_lookup(sdl->sdl_data);
1391 	if (pi == NULL) {
1392 		if (debug & D_LINKNOTE)
1393 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1394 				" for %s\n", sdl->sdl_data);
1395 		return (_B_TRUE);
1396 	}
1397 
1398 	/*
1399 	 * We want to try and avoid doing a full interface scan for
1400 	 * link state notifications from the NICs, as indicated
1401 	 * by the state of the IFF_RUNNING flag.  If just the
1402 	 * IFF_RUNNING flag has changed state, the link state changes
1403 	 * are processed without a full scan.
1404 	 * If there is both an IPv4 and IPv6 instance associated with
1405 	 * the physical interface, we will get an RTM_IFINFO message
1406 	 * for each instance.  If we just maintained a single copy of
1407 	 * the physical interface flags, it would appear that no flags
1408 	 * had changed when the second message is processed, leading us
1409 	 * to believe that the message wasn't generated by a flags change,
1410 	 * and that a full interface scan is required.
1411 	 * To get around this problem, two additional copies of the flags
1412 	 * are kept, one copy for each instance.  These are only used in
1413 	 * this routine.  At any one time, all three copies of the flags
1414 	 * should be identical except for the IFF_RUNNING flag.	 The
1415 	 * copy of the flags in the "phyint" structure is always up to
1416 	 * date.
1417 	 */
1418 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1419 	if (pii == NULL) {
1420 		if (debug & D_LINKNOTE)
1421 			logdebug("process_rtm_ifinfo: no instance of address "
1422 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1423 		return (_B_TRUE);
1424 	}
1425 
1426 	old_flags = pii->pii_flags;
1427 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1428 	pi->pi_flags = pii->pii_flags;
1429 
1430 	if (debug & D_LINKNOTE) {
1431 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1432 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1433 		    AF_STR(type), old_flags, pi->pi_flags);
1434 	}
1435 
1436 	/*
1437 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1438 	 * types.
1439 	 */
1440 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
1441 		phyint_newtype(pi);
1442 
1443 	/*
1444 	 * If IFF_INACTIVE has been set, then no data addresses should be
1445 	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
1446 	 * move previously failed-over addresses back to it, provided it is
1447 	 * not failed.	For details, see the state diagram in mpd_probe.c.
1448 	 */
1449 	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
1450 		if (pii->pii_flags & IFF_INACTIVE) {
1451 			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
1452 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
1453 		} else {
1454 			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
1455 				pi->pi_empty = 0;
1456 				(void) try_failback(pi);
1457 			}
1458 		}
1459 	}
1460 
1461 	/* Has just the IFF_RUNNING flag changed state ? */
1462 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1463 		struct phyint_instance *pii_other;
1464 		/*
1465 		 * It wasn't just a link state change.	Update
1466 		 * the other instance's copy of the flags.
1467 		 */
1468 		pii_other = phyint_inst_other(pii);
1469 		if (pii_other != NULL)
1470 			pii_other->pii_flags = pii->pii_flags;
1471 		return (_B_TRUE);
1472 	}
1473 
1474 	return (_B_FALSE);
1475 }
1476 
1477 /*
1478  * Retrieve as many routing socket messages as possible, and try to
1479  * empty the routing sockets. Initiate full scan of targets or interfaces
1480  * as needed.
1481  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1482  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1483  */
1484 static void
1485 process_rtsock(int rtsock_v4, int rtsock_v6)
1486 {
1487 	int	nbytes;
1488 	int64_t msg[2048 / 8];
1489 	struct rt_msghdr *rtm;
1490 	boolean_t need_if_scan = _B_FALSE;
1491 	boolean_t need_rt_scan = _B_FALSE;
1492 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1493 	int type;
1494 
1495 	/* Read as many messages as possible and try to empty the sockets */
1496 	for (type = AF_INET; ; type = AF_INET6) {
1497 		for (;;) {
1498 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1499 				rtsock_v6, msg, sizeof (msg));
1500 			if (nbytes <= 0) {
1501 				/* No more messages */
1502 				break;
1503 			}
1504 			rtm = (struct rt_msghdr *)msg;
1505 			if (rtm->rtm_version != RTM_VERSION) {
1506 				logerr("process_rtsock: version %d "
1507 				    "not understood\n", rtm->rtm_version);
1508 				break;
1509 			}
1510 
1511 			if (debug & D_PHYINT) {
1512 				logdebug("process_rtsock: message %d\n",
1513 				    rtm->rtm_type);
1514 			}
1515 
1516 			switch (rtm->rtm_type) {
1517 			case RTM_NEWADDR:
1518 			case RTM_DELADDR:
1519 				/*
1520 				 * Some logical interface has changed,
1521 				 * have to scan everything to determine
1522 				 * what actually changed.
1523 				 */
1524 				need_if_scan = _B_TRUE;
1525 				break;
1526 
1527 			case RTM_IFINFO:
1528 				rtm_ifinfo_seen = _B_TRUE;
1529 				need_if_scan |=
1530 					process_rtm_ifinfo((if_msghdr_t *)rtm,
1531 					type);
1532 				break;
1533 
1534 			case RTM_ADD:
1535 			case RTM_DELETE:
1536 			case RTM_CHANGE:
1537 			case RTM_OLDADD:
1538 			case RTM_OLDDEL:
1539 				need_rt_scan = _B_TRUE;
1540 				break;
1541 
1542 			default:
1543 				/* Not interesting */
1544 				break;
1545 			}
1546 		}
1547 		if (type == AF_INET6)
1548 			break;
1549 	}
1550 
1551 	if (need_if_scan) {
1552 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1553 			logdebug("process_rtsock: synchronizing with kernel\n");
1554 		initifs();
1555 	} else if (rtm_ifinfo_seen) {
1556 		if (debug & D_LINKNOTE)
1557 			logdebug("process_rtsock: "
1558 			    "link up/down notification(s) seen\n");
1559 		process_link_state_changes();
1560 	}
1561 
1562 	if (need_rt_scan)
1563 		init_router_targets();
1564 }
1565 
1566 /*
1567  * Look if the phyint instance or one of its logints have been removed from
1568  * the kernel and take appropriate action.
1569  * Uses {pii,li}_in_use.
1570  */
1571 static void
1572 check_if_removed(struct phyint_instance *pii)
1573 {
1574 	struct logint *li;
1575 	struct logint *next_li;
1576 
1577 	/* Detect phyints that have been removed from the kernel. */
1578 	if (!pii->pii_in_use) {
1579 		logtrace("%s %s has been removed from kernel\n",
1580 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1581 		phyint_inst_delete(pii);
1582 	} else {
1583 		/* Detect logints that have been removed. */
1584 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1585 			next_li = li->li_next;
1586 			if (!li->li_in_use) {
1587 				logint_delete(li);
1588 			}
1589 		}
1590 	}
1591 }
1592 
1593 /*
1594  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1595  * tables defined by mib2.h. Parse the returned data and extract
1596  * the 'routing' information table. Process the 'routing' table
1597  * to get the list of known onlink routers, and update our database.
1598  * These onlink routers will serve as our probe targets.
1599  * Returns false, if any system calls resulted in errors, true otherwise.
1600  */
1601 static boolean_t
1602 update_router_list(int fd)
1603 {
1604 	union {
1605 		char	ubuf[1024];
1606 		union T_primitives uprim;
1607 	} buf;
1608 
1609 	int			flags;
1610 	struct strbuf		ctlbuf;
1611 	struct strbuf		databuf;
1612 	struct T_optmgmt_req	*tor;
1613 	struct T_optmgmt_ack	*toa;
1614 	struct T_error_ack	*tea;
1615 	struct opthdr		*optp;
1616 	struct opthdr		*req;
1617 	int			status;
1618 	t_scalar_t		prim;
1619 
1620 	tor = (struct T_optmgmt_req *)&buf;
1621 
1622 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1623 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1624 	tor->OPT_length = sizeof (struct opthdr);
1625 	tor->MGMT_flags = T_CURRENT;
1626 
1627 	req = (struct opthdr *)&tor[1];
1628 	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
1629 	req->name  = 0;
1630 	req->len   = 0;
1631 
1632 	ctlbuf.buf = (char *)&buf;
1633 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1634 	ctlbuf.maxlen = sizeof (buf);
1635 	flags = 0;
1636 	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
1637 		logperror("update_router_list: putmsg(ctl)");
1638 		return (_B_FALSE);
1639 	}
1640 
1641 	/*
1642 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1643 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1644 	 * a control and data part. The control part contains a struct
1645 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1646 	 * the level, name and length of the data in the data part. The
1647 	 * data part contains the actual table data. The last message
1648 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1649 	 * single option with zero optlen.
1650 	 */
1651 
1652 	for (;;) {
1653 		/*
1654 		 * Go around this loop once for each table. Ignore
1655 		 * all tables except the routing information table.
1656 		 */
1657 		flags = 0;
1658 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1659 		if (status < 0) {
1660 			if (errno == EINTR)
1661 				continue;
1662 			logperror("update_router_list: getmsg(ctl)");
1663 			return (_B_FALSE);
1664 		}
1665 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1666 			logerr("update_router_list: ctlbuf.len %d\n",
1667 			    ctlbuf.len);
1668 			return (_B_FALSE);
1669 		}
1670 
1671 		prim = buf.uprim.type;
1672 
1673 		switch (prim) {
1674 
1675 		case T_ERROR_ACK:
1676 			tea = &buf.uprim.error_ack;
1677 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1678 				logerr("update_router_list: T_ERROR_ACK"
1679 				    " ctlbuf.len %d\n", ctlbuf.len);
1680 				return (_B_FALSE);
1681 			}
1682 			logerr("update_router_list: T_ERROR_ACK:"
1683 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1684 			    tea->TLI_error, tea->UNIX_error);
1685 			return (_B_FALSE);
1686 
1687 		case T_OPTMGMT_ACK:
1688 			toa = &buf.uprim.optmgmt_ack;
1689 			optp = (struct opthdr *)&toa[1];
1690 			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
1691 				logerr("update_router_list: ctlbuf.len %d\n",
1692 				    ctlbuf.len);
1693 				return (_B_FALSE);
1694 			}
1695 			if (toa->MGMT_flags != T_SUCCESS) {
1696 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1697 				    toa->MGMT_flags);
1698 				return (_B_FALSE);
1699 			}
1700 			break;
1701 
1702 		default:
1703 			logerr("update_router_list: unknown primitive %ld\n",
1704 			    prim);
1705 			return (_B_FALSE);
1706 		}
1707 
1708 		/* Process the T_OPGMGMT_ACK below */
1709 		assert(prim == T_OPTMGMT_ACK);
1710 
1711 		switch (status) {
1712 		case 0:
1713 			/*
1714 			 * We have reached the end of this T_OPTMGMT_ACK
1715 			 * message. If this is the last message i.e EOD,
1716 			 * return, else process the next T_OPTMGMT_ACK msg.
1717 			 */
1718 			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
1719 			    sizeof (struct opthdr)) && optp->len == 0 &&
1720 			    optp->name == 0 && optp->level == 0) {
1721 				/*
1722 				 * This is the EOD message. Return
1723 				 */
1724 				return (_B_TRUE);
1725 			}
1726 			continue;
1727 
1728 		case MORECTL:
1729 		case MORECTL | MOREDATA:
1730 			/*
1731 			 * This should not happen. We should be able to read
1732 			 * the control portion in a single getmsg.
1733 			 */
1734 			logerr("update_router_list: MORECTL\n");
1735 			return (_B_FALSE);
1736 
1737 		case MOREDATA:
1738 			databuf.maxlen = optp->len;
1739 			/* malloc of 0 bytes is ok */
1740 			databuf.buf = malloc((size_t)optp->len);
1741 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1742 				logperror("update_router_list: malloc");
1743 				return (_B_FALSE);
1744 			}
1745 			databuf.len = 0;
1746 			flags = 0;
1747 			for (;;) {
1748 				status = getmsg(fd, NULL, &databuf, &flags);
1749 				if (status >= 0) {
1750 					break;
1751 				} else if (errno == EINTR) {
1752 					continue;
1753 				} else {
1754 					logperror("update_router_list:"
1755 					    " getmsg(data)");
1756 					free(databuf.buf);
1757 					return (_B_FALSE);
1758 				}
1759 			}
1760 
1761 			if (optp->level == MIB2_IP &&
1762 			    optp->name == MIB2_IP_ROUTE) {
1763 				/* LINTED */
1764 				ire_process_v4((mib2_ipRouteEntry_t *)
1765 				    databuf.buf, databuf.len);
1766 			} else if (optp->level == MIB2_IP6 &&
1767 			    optp->name == MIB2_IP6_ROUTE) {
1768 				/* LINTED */
1769 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1770 				    databuf.buf, databuf.len);
1771 			}
1772 			free(databuf.buf);
1773 		}
1774 	}
1775 	/* NOTREACHED */
1776 }
1777 
1778 /*
1779  * Examine the IPv4 routing table, for default routers. For each default
1780  * router, populate the list of targets of each phyint that is on the same
1781  * link as the default router
1782  */
1783 static void
1784 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1785 {
1786 	mib2_ipRouteEntry_t	*rp;
1787 	mib2_ipRouteEntry_t	*rp1;
1788 	struct	in_addr		nexthop_v4;
1789 	mib2_ipRouteEntry_t	*endp;
1790 
1791 	if (len == 0)
1792 		return;
1793 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1794 
1795 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1796 
1797 	/*
1798 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1799 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1800 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1801 	 * This is a potential target for probing, which we try to add
1802 	 * to the list of probe targets.
1803 	 */
1804 	for (rp = buf; rp < endp; rp++) {
1805 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1806 			continue;
1807 
1808 		/*  Get the nexthop address. */
1809 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1810 
1811 		/*
1812 		 * Get the nexthop address. Then determine the outgoing
1813 		 * interface, by examining all interface IREs, and picking the
1814 		 * match. We don't look at the interface specified in the route
1815 		 * because we need to add the router target on all matching
1816 		 * interfaces anyway; the goal is to avoid falling back to
1817 		 * multicast when some interfaces are in the same subnet but
1818 		 * not in the same group.
1819 		 */
1820 		for (rp1 = buf; rp1 < endp; rp1++) {
1821 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1822 				continue;
1823 			}
1824 
1825 			/*
1826 			 * Determine the interface IRE that matches the nexthop.
1827 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1828 			 */
1829 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1830 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1831 				/*
1832 				 * We found the interface ire
1833 				 */
1834 				router_add_v4(rp1, nexthop_v4);
1835 			}
1836 		}
1837 	}
1838 }
1839 
1840 void
1841 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1842 {
1843 	char *cp;
1844 	char ifname[LIFNAMSIZ + 1];
1845 	struct in6_addr	nexthop;
1846 	int len;
1847 
1848 	if (debug & D_TARGET)
1849 		logdebug("router_add_v4()\n");
1850 
1851 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1852 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1853 	ifname[len] = '\0';
1854 
1855 	if (ifname[0] == '\0')
1856 		return;
1857 
1858 	cp = strchr(ifname, IF_SEPARATOR);
1859 	if (cp != NULL)
1860 		*cp = '\0';
1861 
1862 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1863 	router_add_common(AF_INET, ifname, nexthop);
1864 }
1865 
1866 void
1867 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1868 {
1869 	struct phyint_instance *pii;
1870 	struct phyint *pi;
1871 
1872 	if (debug & D_TARGET)
1873 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1874 
1875 	/*
1876 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1877 	 */
1878 	pii = phyint_inst_lookup(af, ifname);
1879 	if (pii == NULL)
1880 		return;
1881 
1882 	/*
1883 	 * Don't use our own addresses as targets.
1884 	 */
1885 	if (own_address(nexthop))
1886 		return;
1887 
1888 	/*
1889 	 * If the phyint is part a named group, then add the address to all
1890 	 * members of the group; note that this is suboptimal in the IPv4 case
1891 	 * as it has already been added to all matching interfaces in
1892 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1893 	 * itself, since other phyints in the anongroup may not be on the same
1894 	 * subnet.
1895 	 */
1896 	pi = pii->pii_phyint;
1897 	if (pi->pi_group == phyint_anongroup) {
1898 		target_add(pii, nexthop, _B_TRUE);
1899 	} else {
1900 		pi = pi->pi_group->pg_phyint;
1901 		for (; pi != NULL; pi = pi->pi_pgnext)
1902 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1903 	}
1904 }
1905 
1906 /*
1907  * Examine the IPv6 routing table, for default routers. For each default
1908  * router, populate the list of targets of each phyint that is on the same
1909  * link as the default router
1910  */
1911 static void
1912 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1913 {
1914 	mib2_ipv6RouteEntry_t	*rp;
1915 	mib2_ipv6RouteEntry_t	*endp;
1916 	struct	in6_addr nexthop_v6;
1917 
1918 	if (debug & D_TARGET)
1919 		logdebug("ire_process_v6(len %d)\n", len);
1920 
1921 	if (len == 0)
1922 		return;
1923 
1924 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1925 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1926 
1927 	/*
1928 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1929 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1930 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1931 	 * This is a potential target for probing, which we try to add
1932 	 * to the list of probe targets.
1933 	 */
1934 	for (rp = buf; rp < endp; rp++) {
1935 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1936 			continue;
1937 
1938 		/*
1939 		 * We have the outgoing interface in ipv6RouteIfIndex
1940 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1941 		 * interface must be present for link-local addresses. Since
1942 		 * we use only link-local addreses for probing, we don't
1943 		 * consider the case when the outgoing interface is not
1944 		 * known and we need to scan interface ires
1945 		 */
1946 		nexthop_v6 = rp->ipv6RouteNextHop;
1947 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1948 			/*
1949 			 * We already have the outgoing interface
1950 			 * in ipv6RouteIfIndex.
1951 			 */
1952 			router_add_v6(rp, nexthop_v6);
1953 		}
1954 	}
1955 }
1956 
1957 
1958 void
1959 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1960 {
1961 	char ifname[LIFNAMSIZ + 1];
1962 	char *cp;
1963 	int  len;
1964 
1965 	if (debug & D_TARGET)
1966 		logdebug("router_add_v6()\n");
1967 
1968 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1969 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1970 	ifname[len] = '\0';
1971 
1972 	if (ifname[0] == '\0')
1973 		return;
1974 
1975 	cp = strchr(ifname, IF_SEPARATOR);
1976 	if (cp != NULL)
1977 		*cp = '\0';
1978 
1979 	router_add_common(AF_INET6, ifname, nexthop_v6);
1980 }
1981 
1982 
1983 
1984 /*
1985  * Build a list of target routers, by scanning the routing tables.
1986  * It is assumed that interface routes exist, to reach the routers.
1987  */
1988 static void
1989 init_router_targets(void)
1990 {
1991 	struct	target *tg;
1992 	struct	target *next_tg;
1993 	struct	phyint_instance *pii;
1994 	struct	phyint *pi;
1995 
1996 	if (force_mcast)
1997 		return;
1998 
1999 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2000 		pi = pii->pii_phyint;
2001 		/*
2002 		 * Exclude ptp and host targets. Set tg_in_use to false,
2003 		 * only for router targets.
2004 		 */
2005 		if (!pii->pii_targets_are_routers ||
2006 		    (pi->pi_flags & IFF_POINTOPOINT))
2007 			continue;
2008 
2009 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
2010 			tg->tg_in_use = 0;
2011 	}
2012 
2013 	if (mibfd < 0) {
2014 		mibfd = open("/dev/ip", O_RDWR);
2015 		if (mibfd < 0) {
2016 			logperror("mibopen: ip open");
2017 			exit(1);
2018 		}
2019 	}
2020 
2021 	if (!update_router_list(mibfd)) {
2022 		(void) close(mibfd);
2023 		mibfd = -1;
2024 	}
2025 
2026 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2027 		if (!pii->pii_targets_are_routers ||
2028 		    (pi->pi_flags & IFF_POINTOPOINT))
2029 			continue;
2030 
2031 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
2032 			next_tg = tg->tg_next;
2033 			if (!tg->tg_in_use) {
2034 				target_delete(tg);
2035 			}
2036 		}
2037 	}
2038 }
2039 
2040 /*
2041  * Attempt to assign host targets to any interfaces that do not currently
2042  * have probe targets by sharing targets with other interfaces in the group.
2043  */
2044 static void
2045 init_host_targets(void)
2046 {
2047 	struct phyint_instance *pii;
2048 	struct phyint_group *pg;
2049 
2050 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2051 		pg = pii->pii_phyint->pi_group;
2052 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
2053 			dup_host_targets(pii);
2054 	}
2055 }
2056 
2057 /*
2058  * Duplicate host targets from other phyints of the group to
2059  * the phyint instance 'desired_pii'.
2060  */
2061 static void
2062 dup_host_targets(struct phyint_instance	 *desired_pii)
2063 {
2064 	int af;
2065 	struct phyint *pi;
2066 	struct phyint_instance *pii;
2067 	struct target *tg;
2068 
2069 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2070 
2071 	af = desired_pii->pii_af;
2072 
2073 	/*
2074 	 * For every phyint in the same group as desired_pii, check if
2075 	 * it has any host targets. If so add them to desired_pii.
2076 	 */
2077 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2078 		pii = PHYINT_INSTANCE(pi, af);
2079 		/*
2080 		 * We know that we don't have targets on this phyint instance
2081 		 * since we have been called. But we still check for
2082 		 * pii_targets_are_routers because another phyint instance
2083 		 * could have router targets, since IFF_NOFAILOVER addresses
2084 		 * on different phyint instances may belong to different
2085 		 * subnets.
2086 		 */
2087 		if ((pii == NULL) || (pii == desired_pii) ||
2088 		    pii->pii_targets_are_routers)
2089 			continue;
2090 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2091 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2092 		}
2093 	}
2094 }
2095 
/*
 * Print a one-line usage message for the program name `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	FILE *out = stderr;

	(void) fprintf(out, "usage: %s\n", cmd);
}
2101 
2102 
2103 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2104 
2105 /* Get an option from the /etc/default/mpathd file */
2106 static char *
2107 getdefault(char *name)
2108 {
2109 	char namebuf[BUFSIZ];
2110 	char *value = NULL;
2111 
2112 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2113 		char	*cp;
2114 		int	flags;
2115 
2116 		/*
2117 		 * ignore case
2118 		 */
2119 		flags = defcntl(DC_GETFLAGS, 0);
2120 		TURNOFF(flags, DC_CASE);
2121 		(void) defcntl(DC_SETFLAGS, flags);
2122 
2123 		/* Add "=" to the name */
2124 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2125 		(void) strncat(namebuf, "=", 2);
2126 
2127 		if ((cp = defread(namebuf)) != NULL)
2128 			value = strdup(cp);
2129 
2130 		/* close */
2131 		(void) defopen((char *)NULL);
2132 	}
2133 	return (value);
2134 }
2135 
2136 
2137 /*
2138  * Command line options below
2139  */
2140 boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
2141 boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
2142 static boolean_t adopt = _B_FALSE;
2143 static boolean_t foreground = _B_FALSE;
2144 
2145 int
2146 main(int argc, char *argv[])
2147 {
2148 	int i;
2149 	int c;
2150 	struct phyint_instance *pii;
2151 	char *value;
2152 
2153 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2154 	srandom(gethostid());	/* Initialize the random number generator */
2155 
2156 	/*
2157 	 * NOTE: The messages output by in.mpathd are not suitable for
2158 	 * translation, so we do not call textdomain().
2159 	 */
2160 	(void) setlocale(LC_ALL, "");
2161 
2162 	/*
2163 	 * Get the user specified value of 'failure detection time'
2164 	 * from /etc/default/mpathd
2165 	 */
2166 	value = getdefault("FAILURE_DETECTION_TIME");
2167 	if (value != NULL) {
2168 		user_failure_detection_time =
2169 		    (int)strtol((char *)value, NULL, 0);
2170 
2171 		if (user_failure_detection_time <= 0) {
2172 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2173 			logerr("Invalid failure detection time %s, assuming "
2174 			    "default %d\n", value, user_failure_detection_time);
2175 
2176 		} else if (user_failure_detection_time <
2177 		    MIN_FAILURE_DETECTION_TIME) {
2178 			user_failure_detection_time =
2179 			    MIN_FAILURE_DETECTION_TIME;
2180 			logerr("Too small failure detection time of %s, "
2181 			    "assuming minimum %d\n", value,
2182 			    user_failure_detection_time);
2183 		}
2184 		free(value);
2185 	} else {
2186 		/* User has not specified the parameter, Use default value */
2187 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2188 	}
2189 
2190 	/*
2191 	 * This gives the frequency at which probes will be sent.
2192 	 * When fdt ms elapses, we should be able to determine
2193 	 * whether 5 consecutive probes have failed or not.
2194 	 * 1 probe will be sent in every user_probe_interval ms,
2195 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2196 	 * user_probe_interval. Thus when we send out probe 'n' we
2197 	 * can be sure that probe 'n - 2' is lost, if we have not
2198 	 * got the ack. (since the probe interval is > crtt). But
2199 	 * probe 'n - 1' may be a valid unacked probe, since the
2200 	 * time between 2 successive probes could be as small as
2201 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2202 	 */
2203 	user_probe_interval = user_failure_detection_time /
2204 	    (NUM_PROBE_FAILS + 2);
2205 
2206 	/*
2207 	 * Get the user specified value of failback_enabled from
2208 	 * /etc/default/mpathd
2209 	 */
2210 	value = getdefault("FAILBACK");
2211 	if (value != NULL) {
2212 		if (strncasecmp(value, "yes", 3) == 0)
2213 			failback_enabled = _B_TRUE;
2214 		else if (strncasecmp(value, "no", 2) == 0)
2215 			failback_enabled = _B_FALSE;
2216 		else
2217 			logerr("Invalid value for FAILBACK %s\n", value);
2218 		free(value);
2219 	} else {
2220 		failback_enabled = _B_TRUE;
2221 	}
2222 
2223 	/*
2224 	 * Get the user specified value of track_all_phyints from
2225 	 * /etc/default/mpathd. The sense is reversed in
2226 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2227 	 */
2228 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2229 	if (value != NULL) {
2230 		if (strncasecmp(value, "yes", 3) == 0)
2231 			track_all_phyints = _B_FALSE;
2232 		else if (strncasecmp(value, "no", 2) == 0)
2233 			track_all_phyints = _B_TRUE;
2234 		else
2235 			logerr("Invalid value for "
2236 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2237 		free(value);
2238 	} else {
2239 		track_all_phyints = _B_FALSE;
2240 	}
2241 
2242 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2243 		switch (c) {
2244 		case 'a':
2245 			adopt = _B_TRUE;
2246 			break;
2247 		case 'm':
2248 			force_mcast = _B_TRUE;
2249 			break;
2250 		case 'd':
2251 			debug = D_ALL;
2252 			foreground = _B_TRUE;
2253 			break;
2254 		case 'D':
2255 			i = (int)strtol(optarg, NULL, 0);
2256 			if (i == 0) {
2257 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2258 				    optarg);
2259 				exit(1);
2260 			}
2261 			debug |= i;
2262 			foreground = _B_TRUE;
2263 			break;
2264 		case 'l':
2265 			/*
2266 			 * Turn off link state notification handling.
2267 			 * Undocumented command line flag, for debugging
2268 			 * purposes.
2269 			 */
2270 			handle_link_notifications = _B_FALSE;
2271 			break;
2272 		default:
2273 			usage(argv[0]);
2274 			exit(1);
2275 		}
2276 	}
2277 
2278 	/*
2279 	 * The sockets for the loopback command interface should be listening
2280 	 * before we fork and exit in daemonize(). This way, whoever started us
2281 	 * can use the loopback interface as soon as they get a zero exit
2282 	 * status.
2283 	 */
2284 	lsock_v4 = setup_listener(AF_INET);
2285 	lsock_v6 = setup_listener(AF_INET6);
2286 
2287 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2288 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2289 		exit(1);
2290 	}
2291 
2292 	if (!foreground) {
2293 		if (!daemonize()) {
2294 			logerr("cannot daemonize\n");
2295 			exit(EXIT_FAILURE);
2296 		}
2297 		initlog();
2298 	}
2299 
2300 	/*
2301 	 * Initializations:
2302 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2303 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2304 	 * 2. Initialize a pipe for handling/recording signal events.
2305 	 * 3. Create the routing sockets,  used for listening
2306 	 *    to routing / interface changes.
2307 	 * 4. phyint_init() - Initialize physical interface state
2308 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2309 	 *    which timer_init() does indirectly.
2310 	 * 5. timer_init()  - Initialize timer related stuff
2311 	 * 6. initifs() - Initialize our database of all known interfaces
2312 	 * 7. init_router_targets() - Initialize our database of all known
2313 	 *    router targets.
2314 	 */
2315 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2316 	if (ifsock_v4 < 0) {
2317 		logperror("main: IPv4 socket open");
2318 		exit(1);
2319 	}
2320 
2321 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2322 	if (ifsock_v6 < 0) {
2323 		logperror("main: IPv6 socket open");
2324 		exit(1);
2325 	}
2326 
2327 	setup_eventpipe();
2328 
2329 	rtsock_v4 = setup_rtsock(AF_INET);
2330 	rtsock_v6 = setup_rtsock(AF_INET6);
2331 
2332 	if (phyint_init() == -1) {
2333 		logerr("cannot initialize physical interface structures");
2334 		exit(1);
2335 	}
2336 
2337 	timer_init();
2338 
2339 	initifs();
2340 
2341 	/* Inform kernel whether failback is enabled or disabled */
2342 	if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
2343 		logperror("main: ioctl (SIOCSIPMPFAILBACK)");
2344 		exit(1);
2345 	}
2346 
2347 	/*
2348 	 * If we're operating in "adopt" mode and no interfaces need to be
2349 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2350 	 * interfaces are subsequently put into multipathing groups).
2351 	 */
2352 	if (adopt && phyint_instances == NULL)
2353 		exit(0);
2354 
2355 	/*
2356 	 * Main body. Keep listening for activity on any of the sockets
2357 	 * that we are monitoring and take appropriate action as necessary.
2358 	 * signals are also handled synchronously.
2359 	 */
2360 	for (;;) {
2361 		if (poll(pollfds, pollfd_num, -1) < 0) {
2362 			if (errno == EINTR)
2363 				continue;
2364 			logperror("main: poll");
2365 			exit(1);
2366 		}
2367 		for (i = 0; i < pollfd_num; i++) {
2368 			if ((pollfds[i].fd == -1) ||
2369 			    !(pollfds[i].revents & POLLIN))
2370 				continue;
2371 			if (pollfds[i].fd == eventpipe_read) {
2372 				in_signal(eventpipe_read);
2373 				break;
2374 			}
2375 			if (pollfds[i].fd == rtsock_v4 ||
2376 			    pollfds[i].fd == rtsock_v6) {
2377 				process_rtsock(rtsock_v4, rtsock_v6);
2378 				break;
2379 			}
2380 			for (pii = phyint_instances; pii != NULL;
2381 			    pii = pii->pii_next) {
2382 				if (pollfds[i].fd == pii->pii_probe_sock) {
2383 					if (pii->pii_af == AF_INET)
2384 						in_data(pii);
2385 					else
2386 						in6_data(pii);
2387 					break;
2388 				}
2389 			}
2390 			if (pollfds[i].fd == lsock_v4)
2391 				loopback_cmd(lsock_v4, AF_INET);
2392 			else if (pollfds[i].fd == lsock_v6)
2393 				loopback_cmd(lsock_v6, AF_INET6);
2394 		}
2395 		if (full_scan_required) {
2396 			initifs();
2397 			full_scan_required = _B_FALSE;
2398 		}
2399 	}
2400 	/* NOTREACHED */
2401 	return (EXIT_SUCCESS);
2402 }
2403 
2404 static int
2405 setup_listener(int af)
2406 {
2407 	int sock;
2408 	int on;
2409 	int len;
2410 	int ret;
2411 	struct sockaddr_storage laddr;
2412 	struct sockaddr_in  *sin;
2413 	struct sockaddr_in6 *sin6;
2414 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2415 
2416 	assert(af == AF_INET || af == AF_INET6);
2417 
2418 	sock = socket(af, SOCK_STREAM, 0);
2419 	if (sock < 0) {
2420 		logperror("setup_listener: socket");
2421 		exit(1);
2422 	}
2423 
2424 	on = 1;
2425 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2426 	    sizeof (on)) < 0) {
2427 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2428 		exit(1);
2429 	}
2430 
2431 	bzero(&laddr, sizeof (laddr));
2432 	laddr.ss_family = af;
2433 
2434 	if (af == AF_INET) {
2435 		sin = (struct sockaddr_in *)&laddr;
2436 		sin->sin_port = htons(MPATHD_PORT);
2437 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2438 		len = sizeof (struct sockaddr_in);
2439 	} else {
2440 		sin6 = (struct sockaddr_in6 *)&laddr;
2441 		sin6->sin6_port = htons(MPATHD_PORT);
2442 		sin6->sin6_addr = loopback_addr;
2443 		len = sizeof (struct sockaddr_in6);
2444 	}
2445 
2446 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2447 	if (ret < 0) {
2448 		if (errno == EADDRINUSE) {
2449 			/*
2450 			 * Another instance of mpathd may be already active.
2451 			 */
2452 			logerr("main: is another instance of in.mpathd "
2453 			    "already active?\n");
2454 			exit(1);
2455 		} else {
2456 			(void) close(sock);
2457 			return (-1);
2458 		}
2459 	}
2460 	if (listen(sock, 30) < 0) {
2461 		logperror("main: listen");
2462 		exit(1);
2463 	}
2464 	if (poll_add(sock) == -1) {
2465 		(void) close(sock);
2466 		exit(1);
2467 	}
2468 
2469 	return (sock);
2470 }
2471 
2472 /*
2473  * Table of commands and their expected size; used by loopback_cmd().
2474  */
2475 static struct {
2476 	const char	*name;
2477 	unsigned int	size;
2478 } commands[] = {
2479 	{ "MI_PING",		sizeof (uint32_t)	},
2480 	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
2481 	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
2482 	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
2483 	{ "MI_QUERY",		sizeof (mi_query_t)	}
2484 };
2485 
2486 /*
2487  * Commands received over the loopback interface come here. Currently
2488  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2489  * module. ifconfig only makes a connection, and closes it to check if
2490  * in.mpathd is running.
2491  * if_mpadm sends commands in the format specified by the mpathd_interface
2492  * structure.
2493  */
2494 static void
2495 loopback_cmd(int sock, int family)
2496 {
2497 	int newfd;
2498 	ssize_t len;
2499 	struct sockaddr_storage	peer;
2500 	struct sockaddr_in	*peer_sin;
2501 	struct sockaddr_in6	*peer_sin6;
2502 	socklen_t peerlen;
2503 	union mi_commands mpi;
2504 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2505 	char abuf[INET6_ADDRSTRLEN];
2506 	uint_t cmd;
2507 	int retval;
2508 
2509 	peerlen = sizeof (peer);
2510 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2511 	if (newfd < 0) {
2512 		logperror("loopback_cmd: accept");
2513 		return;
2514 	}
2515 
2516 	switch (family) {
2517 	case AF_INET:
2518 		/*
2519 		 * Validate the address and port to make sure that
2520 		 * non privileged processes don't connect and start
2521 		 * talking to us.
2522 		 */
2523 		if (peerlen != sizeof (struct sockaddr_in)) {
2524 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2525 			(void) close(newfd);
2526 			return;
2527 		}
2528 		peer_sin = (struct sockaddr_in *)&peer;
2529 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2530 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2531 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2532 			    abuf, sizeof (abuf));
2533 			logerr("Attempt to connect from addr %s port %d\n",
2534 			    abuf, ntohs(peer_sin->sin_port));
2535 			(void) close(newfd);
2536 			return;
2537 		}
2538 		break;
2539 
2540 	case AF_INET6:
2541 		if (peerlen != sizeof (struct sockaddr_in6)) {
2542 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2543 			(void) close(newfd);
2544 			return;
2545 		}
2546 		/*
2547 		 * Validate the address and port to make sure that
2548 		 * non privileged processes don't connect and start
2549 		 * talking to us.
2550 		 */
2551 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2552 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2553 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2554 		    &loopback_addr))) {
2555 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2556 			    sizeof (abuf));
2557 			logerr("Attempt to connect from addr %s port %d\n",
2558 			    abuf, ntohs(peer_sin6->sin6_port));
2559 			(void) close(newfd);
2560 			return;
2561 		}
2562 
2563 	default:
2564 		logdebug("loopback_cmd: family %d\n", family);
2565 		(void) close(newfd);
2566 		return;
2567 	}
2568 
2569 	/*
2570 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2571 	 * all supported commands
2572 	 */
2573 	len = read(newfd, &mpi, sizeof (mpi));
2574 
2575 	/*
2576 	 * ifconfig does not send any data. Just tests to see if mpathd
2577 	 * is already running.
2578 	 */
2579 	if (len <= 0) {
2580 		(void) close(newfd);
2581 		return;
2582 	}
2583 
2584 	/*
2585 	 * In theory, we can receive any sized message for a stream socket,
2586 	 * but we don't expect that to happen for a small message over a
2587 	 * loopback connection.
2588 	 */
2589 	if (len < sizeof (uint32_t)) {
2590 		logerr("loopback_cmd: bad command format or read returns "
2591 		    "partial data %d\n", len);
2592 	}
2593 
2594 	cmd = mpi.mi_command;
2595 	if (cmd >= MI_NCMD) {
2596 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2597 		(void) close(newfd);
2598 		return;
2599 	}
2600 
2601 	if (len < commands[cmd].size) {
2602 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2603 		    commands[cmd].name, commands[cmd].size, len);
2604 		(void) close(newfd);
2605 		return;
2606 	}
2607 
2608 	retval = process_cmd(newfd, &mpi);
2609 	if (retval != IPMP_SUCCESS) {
2610 		logerr("failed processing %s: %s\n", commands[cmd].name,
2611 		    ipmp_errmsg(retval));
2612 	}
2613 	(void) close(newfd);
2614 }
2615 
2616 extern int global_errno;	/* set by failover() or failback() */
2617 
2618 /*
2619  * Process the offline, undo offline and set original index commands,
2620  * received from if_mpadm(1M)
2621  */
2622 static unsigned int
2623 process_cmd(int newfd, union mi_commands *mpi)
2624 {
2625 	uint_t	nif = 0;
2626 	uint32_t cmd;
2627 	struct phyint *pi;
2628 	struct phyint *pi2;
2629 	struct phyint_group *pg;
2630 	boolean_t success;
2631 	int error;
2632 	struct mi_offline *mio;
2633 	struct mi_undo_offline *miu;
2634 	struct lifreq lifr;
2635 	int ifsock;
2636 	struct mi_setoindex *mis;
2637 
2638 	cmd = mpi->mi_command;
2639 
2640 	switch (cmd) {
2641 	case MI_OFFLINE:
2642 		mio = &mpi->mi_ocmd;
2643 		/*
2644 		 * Lookup the interface that needs to be offlined.
2645 		 * If it does not exist, return a suitable error.
2646 		 */
2647 		pi = phyint_lookup(mio->mio_ifname);
2648 		if (pi == NULL)
2649 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2650 
2651 		/*
2652 		 * Verify that the minimum redundancy requirements are met.
2653 		 * The multipathing group must have at least the specified
2654 		 * number of functional interfaces after offlining the
2655 		 * requested interface. Otherwise return a suitable error.
2656 		 */
2657 		pg = pi->pi_group;
2658 		nif = 0;
2659 		if (pg != phyint_anongroup) {
2660 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2661 			    pi2 = pi2->pi_pgnext) {
2662 				if ((pi2->pi_state == PI_RUNNING) ||
2663 				    (pg->pg_groupfailed &&
2664 				    !(pi2->pi_flags & IFF_OFFLINE)))
2665 					nif++;
2666 			}
2667 		}
2668 		if (nif < mio->mio_min_redundancy)
2669 			return (send_result(newfd, IPMP_EMINRED, 0));
2670 
2671 		/*
2672 		 * The order of operation is to set IFF_OFFLINE, followed by
2673 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2674 		 * can be created. Subsequent failover moves everything on
2675 		 * the OFFLINE interface to some other functional interface.
2676 		 */
2677 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2678 		if (success) {
2679 			if (!pi->pi_empty) {
2680 				error = try_failover(pi, FAILOVER_NORMAL);
2681 				if (error != 0) {
2682 					if (!change_lif_flags(pi, IFF_OFFLINE,
2683 					    _B_FALSE)) {
2684 						logerr("process_cmd: couldn't"
2685 						    " clear OFFLINE flag on"
2686 						    " %s\n", pi->pi_name);
2687 						/*
2688 						 * Offline interfaces should
2689 						 * not be probed.
2690 						 */
2691 						stop_probing(pi);
2692 					}
2693 					return (send_result(newfd, error,
2694 					    global_errno));
2695 				}
2696 			}
2697 		} else {
2698 			return (send_result(newfd, IPMP_FAILURE, errno));
2699 		}
2700 
2701 		/*
2702 		 * The interface is now Offline, so stop probing it.
2703 		 * Note that if_mpadm(1M) will down the test addresses,
2704 		 * after receiving a success reply from us. The routing
2705 		 * socket message will then make us close the socket used
2706 		 * for sending probes. But it is more logical that an
2707 		 * offlined interface must not be probed, even if it has
2708 		 * test addresses.
2709 		 */
2710 		stop_probing(pi);
2711 		return (send_result(newfd, IPMP_SUCCESS, 0));
2712 
2713 	case MI_UNDO_OFFLINE:
2714 		miu = &mpi->mi_ucmd;
2715 		/*
2716 		 * Undo the offline command. As usual lookup the interface.
2717 		 * Send an error if it does not exist or is not offline.
2718 		 */
2719 		pi = phyint_lookup(miu->miu_ifname);
2720 		if (pi == NULL || pi->pi_state != PI_OFFLINE)
2721 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2722 
2723 		/*
2724 		 * Reset the state of the interface based on the current link
2725 		 * state; if this phyint subsequently acquires a test address,
2726 		 * the state will be updated later as a result of the probes.
2727 		 */
2728 		if (LINK_UP(pi))
2729 			phyint_chstate(pi, PI_RUNNING);
2730 		else
2731 			phyint_chstate(pi, PI_FAILED);
2732 
2733 		if (pi->pi_state == PI_RUNNING) {
2734 			/*
2735 			 * Note that the success of MI_UNDO_OFFLINE is not
2736 			 * contingent on actually failing back; in the odd
2737 			 * case where we cannot do it here, we will try again
2738 			 * in initifs() since pi->pi_full will still be zero.
2739 			 */
2740 			if (do_failback(pi) != IPMP_SUCCESS) {
2741 				logdebug("process_cmd: cannot failback from "
2742 				    "%s during MI_UNDO_OFFLINE\n", pi->pi_name);
2743 			}
2744 		}
2745 
2746 		/*
2747 		 * Clear the IFF_OFFLINE flag.  We have to do this last
2748 		 * because do_failback() relies on it being set to decide
2749 		 * when to display messages.
2750 		 */
2751 		(void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE);
2752 
2753 		return (send_result(newfd, IPMP_SUCCESS, 0));
2754 
2755 	case MI_SETOINDEX:
2756 		mis = &mpi->mi_scmd;
2757 
2758 		/* Get the socket for doing ioctls */
2759 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2760 
2761 		/*
2762 		 * Get index of new original interface.
2763 		 * The index is returned in lifr.lifr_index.
2764 		 */
2765 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2766 		    sizeof (lifr.lifr_name));
2767 
2768 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2769 			return (send_result(newfd, IPMP_FAILURE, errno));
2770 
2771 		/*
2772 		 * Set new original interface index.
2773 		 * The new index was put into lifr.lifr_index by the
2774 		 * SIOCGLIFINDEX ioctl.
2775 		 */
2776 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2777 		    sizeof (lifr.lifr_name));
2778 
2779 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2780 			return (send_result(newfd, IPMP_FAILURE, errno));
2781 
2782 		return (send_result(newfd, IPMP_SUCCESS, 0));
2783 
2784 	case MI_QUERY:
2785 		return (process_query(newfd, &mpi->mi_qcmd));
2786 
2787 	default:
2788 		break;
2789 	}
2790 
2791 	return (send_result(newfd, IPMP_EPROTO, 0));
2792 }
2793 
2794 /*
2795  * Process the query request pointed to by `miq' and send a reply on file
2796  * descriptor `fd'.  Returns an IPMP error code.
2797  */
2798 static unsigned int
2799 process_query(int fd, mi_query_t *miq)
2800 {
2801 	ipmp_groupinfo_t	*grinfop;
2802 	ipmp_groupinfolist_t	*grlp;
2803 	ipmp_grouplist_t	*grlistp;
2804 	ipmp_ifinfo_t		*ifinfop;
2805 	ipmp_ifinfolist_t	*iflp;
2806 	ipmp_snap_t		*snap;
2807 	unsigned int		retval;
2808 
2809 	switch (miq->miq_inforeq) {
2810 	case IPMP_GROUPLIST:
2811 		retval = getgrouplist(&grlistp);
2812 		if (retval != IPMP_SUCCESS)
2813 			return (send_result(fd, retval, errno));
2814 
2815 		retval = send_result(fd, IPMP_SUCCESS, 0);
2816 		if (retval == IPMP_SUCCESS)
2817 			retval = send_grouplist(fd, grlistp);
2818 
2819 		ipmp_freegrouplist(grlistp);
2820 		return (retval);
2821 
2822 	case IPMP_GROUPINFO:
2823 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2824 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2825 		if (retval != IPMP_SUCCESS)
2826 			return (send_result(fd, retval, errno));
2827 
2828 		retval = send_result(fd, IPMP_SUCCESS, 0);
2829 		if (retval == IPMP_SUCCESS)
2830 			retval = send_groupinfo(fd, grinfop);
2831 
2832 		ipmp_freegroupinfo(grinfop);
2833 		return (retval);
2834 
2835 	case IPMP_IFINFO:
2836 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2837 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2838 		if (retval != IPMP_SUCCESS)
2839 			return (send_result(fd, retval, errno));
2840 
2841 		retval = send_result(fd, IPMP_SUCCESS, 0);
2842 		if (retval == IPMP_SUCCESS)
2843 			retval = send_ifinfo(fd, ifinfop);
2844 
2845 		ipmp_freeifinfo(ifinfop);
2846 		return (retval);
2847 
2848 	case IPMP_SNAP:
2849 		retval = getsnap(&snap);
2850 		if (retval != IPMP_SUCCESS)
2851 			return (send_result(fd, retval, errno));
2852 
2853 		retval = send_result(fd, IPMP_SUCCESS, 0);
2854 		if (retval != IPMP_SUCCESS)
2855 			goto out;
2856 
2857 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2858 		if (retval != IPMP_SUCCESS)
2859 			goto out;
2860 
2861 		retval = send_grouplist(fd, snap->sn_grlistp);
2862 		if (retval != IPMP_SUCCESS)
2863 			goto out;
2864 
2865 		iflp = snap->sn_ifinfolistp;
2866 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2867 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2868 			if (retval != IPMP_SUCCESS)
2869 				goto out;
2870 		}
2871 
2872 		grlp = snap->sn_grinfolistp;
2873 		for (; grlp != NULL; grlp = grlp->grl_next) {
2874 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2875 			if (retval != IPMP_SUCCESS)
2876 				goto out;
2877 		}
2878 	out:
2879 		ipmp_snap_free(snap);
2880 		return (retval);
2881 
2882 	default:
2883 		break;
2884 
2885 	}
2886 	return (send_result(fd, IPMP_EPROTO, 0));
2887 }
2888 
2889 /*
2890  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2891  * Returns an IPMP error code.
2892  */
2893 static unsigned int
2894 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2895 {
2896 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2897 	unsigned int	retval;
2898 
2899 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2900 	if (retval != IPMP_SUCCESS)
2901 		return (retval);
2902 
2903 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2904 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2905 }
2906 
2907 /*
2908  * Send the interface information pointed to by `ifinfop' on file descriptor
2909  * `fd'.  Returns an IPMP error code.
2910  */
2911 static unsigned int
2912 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2913 {
2914 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2915 }
2916 
2917 /*
2918  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2919  * Returns an IPMP error code.
2920  */
2921 static unsigned int
2922 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2923 {
2924 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2925 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2926 }
2927 
2928 /*
2929  * Initialize an mi_result_t structure using `error' and `syserror' and
2930  * send it on file descriptor `fd'.  Returns an IPMP error code.
2931  */
2932 static unsigned int
2933 send_result(int fd, unsigned int error, int syserror)
2934 {
2935 	mi_result_t me;
2936 
2937 	me.me_mpathd_error = error;
2938 	if (error == IPMP_FAILURE)
2939 		me.me_sys_error = syserror;
2940 	else
2941 		me.me_sys_error = 0;
2942 
2943 	return (ipmp_write(fd, &me, sizeof (me)));
2944 }
2945 
2946 /*
2947  * Daemonize the process.
2948  */
2949 static boolean_t
2950 daemonize(void)
2951 {
2952 	switch (fork()) {
2953 	case -1:
2954 		return (_B_FALSE);
2955 
2956 	case  0:
2957 		/*
2958 		 * Lose our controlling terminal, and become both a session
2959 		 * leader and a process group leader.
2960 		 */
2961 		if (setsid() == -1)
2962 			return (_B_FALSE);
2963 
2964 		/*
2965 		 * Under POSIX, a session leader can accidentally (through
2966 		 * open(2)) acquire a controlling terminal if it does not
2967 		 * have one.  Just to be safe, fork() again so we are not a
2968 		 * session leader.
2969 		 */
2970 		switch (fork()) {
2971 		case -1:
2972 			return (_B_FALSE);
2973 
2974 		case 0:
2975 			(void) chdir("/");
2976 			(void) umask(022);
2977 			(void) fdwalk(closefunc, NULL);
2978 			break;
2979 
2980 		default:
2981 			_exit(EXIT_SUCCESS);
2982 		}
2983 		break;
2984 
2985 	default:
2986 		_exit(EXIT_SUCCESS);
2987 	}
2988 
2989 	return (_B_TRUE);
2990 }
2991 
2992 /*
2993  * The parent has created some fds before forking on purpose, keep them open.
2994  */
2995 static int
2996 closefunc(void *not_used, int fd)
2997 /* ARGSUSED */
2998 {
2999 	if (fd != lsock_v4 && fd != lsock_v6)
3000 		(void) close(fd);
3001 	return (0);
3002 }
3003 
3004 /* LOGGER */
3005 
3006 #include <syslog.h>
3007 
3008 /*
3009  * Logging routines.  All routines log to syslog, unless the daemon is
3010  * running in the foreground, in which case the logging goes to stderr.
3011  *
3012  * The following routines are available:
3013  *
3014  *	logdebug(): A printf-like function for outputting debug messages
3015  *	(messages at LOG_DEBUG) that are only of use to developers.
3016  *
3017  *	logtrace(): A printf-like function for outputting tracing messages
3018  *	(messages at LOG_INFO) from the daemon.	 This is typically used
3019  *	to log the receipt of interesting network-related conditions.
3020  *
3021  *	logerr(): A printf-like function for outputting error messages
3022  *	(messages at LOG_ERR) from the daemon.
3023  *
3024  *	logperror*(): A set of functions used to output error messages
3025  *	(messages at LOG_ERR); these automatically append strerror(errno)
3026  *	and a newline to the message passed to them.
3027  *
3028  * NOTE: since the logging functions write to syslog, the messages passed
3029  *	 to them are not eligible for localization.  Thus, gettext() must
3030  *	 *not* be used.
3031  */
3032 
/* Non-zero once initlog() has run; selects syslog over stderr output. */
static int logging = 0;

/*
 * Switch the logging routines from stderr to syslog and open the syslog
 * connection with the "in.mpathd" identity.
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
}
3041 
3042 /* PRINTFLIKE1 */
3043 void
3044 logerr(char *fmt, ...)
3045 {
3046 	va_list ap;
3047 
3048 	va_start(ap, fmt);
3049 
3050 	if (logging)
3051 		vsyslog(LOG_ERR, fmt, ap);
3052 	else
3053 		(void) vfprintf(stderr, fmt, ap);
3054 	va_end(ap);
3055 }
3056 
3057 /* PRINTFLIKE1 */
3058 void
3059 logtrace(char *fmt, ...)
3060 {
3061 	va_list ap;
3062 
3063 	va_start(ap, fmt);
3064 
3065 	if (logging)
3066 		vsyslog(LOG_INFO, fmt, ap);
3067 	else
3068 		(void) vfprintf(stderr, fmt, ap);
3069 	va_end(ap);
3070 }
3071 
3072 /* PRINTFLIKE1 */
3073 void
3074 logdebug(char *fmt, ...)
3075 {
3076 	va_list ap;
3077 
3078 	va_start(ap, fmt);
3079 
3080 	if (logging)
3081 		vsyslog(LOG_DEBUG, fmt, ap);
3082 	else
3083 		(void) vfprintf(stderr, fmt, ap);
3084 	va_end(ap);
3085 }
3086 
3087 /* PRINTFLIKE1 */
3088 void
3089 logperror(char *str)
3090 {
3091 	if (logging)
3092 		syslog(LOG_ERR, "%s: %m\n", str);
3093 	else
3094 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3095 }
3096 
3097 void
3098 logperror_pii(struct phyint_instance *pii, char *str)
3099 {
3100 	if (logging) {
3101 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3102 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3103 	} else {
3104 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3105 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3106 		    strerror(errno));
3107 	}
3108 }
3109 
3110 void
3111 logperror_li(struct logint *li, char *str)
3112 {
3113 	struct	phyint_instance	*pii = li->li_phyint_inst;
3114 
3115 	if (logging) {
3116 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3117 		    str, AF_STR(pii->pii_af), li->li_name);
3118 	} else {
3119 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3120 		    str, AF_STR(pii->pii_af), li->li_name,
3121 		    strerror(errno));
3122 	}
3123 }
3124 
3125 void
3126 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3127 {
3128 	if (polled)
3129 		(void) poll_remove(pii->pii_probe_sock);
3130 	(void) close(pii->pii_probe_sock);
3131 	pii->pii_probe_sock = -1;
3132 	pii->pii_basetime_inited = 0;
3133 }
3134