xref: /titanic_41/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 8c74a1f9477c04aa8539a84a49aa2bf629c7a14d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
int debug = 0;				/* Debug flag (bitmask of D_* values) */
static int pollfd_num = 0;		/* Num. of poll descriptors */
					/* allocated in pollfds[], whether */
					/* in use or free */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors; */
					/* a slot with fd == -1 is free */

					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */

static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

/* NOTE(review): presumably requests a complete initifs() pass - confirm */
boolean_t	full_scan_required = _B_FALSE;
static uint_t	last_initifs_time;	/* Time when initifs was last run, */
					/* as returned by getcurrenttime() */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
/* NOTE(review): presumably enables link up/down handling - confirm */
boolean_t handle_link_notifications = _B_TRUE;
53 
54 static void	initlog(void);
55 static void	run_timeouts(void);
56 static void	initifs(void);
57 static void	check_if_removed(struct phyint_instance *pii);
58 static void	select_test_ifs(void);
59 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
60 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
61 static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
62     struct in_addr nexthop_v4);
63 static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
64     struct in6_addr nexthop_v6);
65 static void	router_add_common(int af, char *ifname,
66     struct in6_addr nexthop);
67 static void	init_router_targets();
68 static void	cleanup(void);
69 static int	setup_listener(int af);
70 static void	check_config(void);
71 static void	check_addr_unique(int af, char *name);
72 static void	init_host_targets(void);
73 static void	dup_host_targets(struct phyint_instance *desired_pii);
74 static void	loopback_cmd(int sock, int family);
75 static int	poll_remove(int fd);
76 static boolean_t daemonize(void);
77 static int	closefunc(void *, int);
78 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
79 static unsigned int process_query(int fd, mi_query_t *miq);
80 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
81 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
82 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
83 static unsigned int send_result(int fd, unsigned int error, int syserror);
84 
85 struct local_addr *laddr_list = NULL;
86 
87 /*
88  * Return the current time in milliseconds (from an arbitrary reference)
89  * truncated to fit into an int. Truncation is ok since we are interested
90  * only in differences and not the absolute values.
91  */
92 uint_t
93 getcurrenttime(void)
94 {
95 	uint_t	cur_time;	/* In ms */
96 
97 	/*
98 	 * Use of a non-user-adjustable source of time is
99 	 * required. However millisecond precision is sufficient.
100 	 * divide by 10^6
101 	 */
102 	cur_time = (uint_t)(gethrtime() / 1000000LL);
103 	return (cur_time);
104 }
105 
106 /*
107  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
108  */
109 int
110 poll_add(int fd)
111 {
112 	int i;
113 	int new_num;
114 	struct pollfd *newfds;
115 retry:
116 	/* Check if already present */
117 	for (i = 0; i < pollfd_num; i++) {
118 		if (pollfds[i].fd == fd)
119 			return (0);
120 	}
121 	/* Check for empty spot already present */
122 	for (i = 0; i < pollfd_num; i++) {
123 		if (pollfds[i].fd == -1) {
124 			pollfds[i].fd = fd;
125 			return (0);
126 		}
127 	}
128 
129 	/* Allocate space for 32 more fds and initialize to -1 */
130 	new_num = pollfd_num + 32;
131 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
132 	if (newfds == NULL) {
133 		logperror("poll_add: realloc");
134 		return (-1);
135 	}
136 	for (i = pollfd_num; i < new_num; i++) {
137 		newfds[i].fd = -1;
138 		newfds[i].events = POLLIN;
139 	}
140 	pollfd_num = new_num;
141 	pollfds = newfds;
142 	goto retry;
143 }
144 
145 /*
146  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
147  */
148 static int
149 poll_remove(int fd)
150 {
151 	int i;
152 
153 	/* Check if already present */
154 	for (i = 0; i < pollfd_num; i++) {
155 		if (pollfds[i].fd == fd) {
156 			pollfds[i].fd = -1;
157 			return (0);
158 		}
159 	}
160 	return (-1);
161 }
162 
163 /*
164  * Extract information about the phyint instance. If the phyint instance still
165  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
166  * will use it to detect phyint instances that don't exist any longer and
167  * remove them, from our database of phyint instances.
168  * Return value:
169  *	returns true if the phyint instance exists in the kernel,
170  *	returns false otherwise
171  */
172 static boolean_t
173 pii_process(int af, char *name, struct phyint_instance **pii_p)
174 {
175 	int err;
176 	struct phyint_instance *pii;
177 	struct phyint_instance *pii_other;
178 
179 	if (debug & D_PHYINT)
180 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
181 
182 	pii = phyint_inst_lookup(af, name);
183 	if (pii == NULL) {
184 		/*
185 		 * Phyint instance does not exist in our tables,
186 		 * create new phyint instance
187 		 */
188 		pii = phyint_inst_init_from_k(af, name);
189 	} else {
190 		/* Phyint exists in our tables */
191 		err = phyint_inst_update_from_k(pii);
192 
193 		switch (err) {
194 		case PI_IOCTL_ERROR:
195 			/* Some ioctl error. don't change anything */
196 			pii->pii_in_use = 1;
197 			break;
198 
199 		case PI_GROUP_CHANGED:
200 			/*
201 			 * The phyint has changed group.
202 			 */
203 			restore_phyint(pii->pii_phyint);
204 			/* FALLTHRU */
205 
206 		case PI_IFINDEX_CHANGED:
207 			/*
208 			 * Interface index has changed. Delete and
209 			 * recreate the phyint as it is quite likely
210 			 * the interface has been unplumbed and replumbed.
211 			 */
212 			pii_other = phyint_inst_other(pii);
213 			if (pii_other != NULL)
214 				phyint_inst_delete(pii_other);
215 			phyint_inst_delete(pii);
216 			pii = phyint_inst_init_from_k(af, name);
217 			break;
218 
219 		case PI_DELETED:
220 			/* Phyint instance has disappeared from kernel */
221 			pii->pii_in_use = 0;
222 			break;
223 
224 		case PI_OK:
225 			/* Phyint instance exists and is fine */
226 			pii->pii_in_use = 1;
227 			break;
228 
229 		default:
230 			/* Unknown status */
231 			logerr("pii_process: Unknown status %d\n", err);
232 			break;
233 		}
234 	}
235 
236 	*pii_p = pii;
237 	if (pii != NULL)
238 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
239 	else
240 		return (_B_FALSE);
241 }
242 
243 /*
244  * This phyint is leaving the group. Try to restore the phyint to its
245  * initial state. Return the addresses that belong to other group members,
246  * to the group, and take back any addresses owned by this phyint
247  */
248 void
249 restore_phyint(struct phyint *pi)
250 {
251 	if (pi->pi_group == phyint_anongroup)
252 		return;
253 
254 	/*
255 	 * Move everthing to some other member in the group.
256 	 * The phyint has changed group in the kernel. But we
257 	 * have yet to do it in our tables.
258 	 */
259 	if (!pi->pi_empty)
260 		(void) try_failover(pi, FAILOVER_TO_ANY);
261 	/*
262 	 * Move all addresses owned by 'pi' back to pi, from each
263 	 * of the other members of the group
264 	 */
265 	(void) try_failback(pi, _B_FALSE);
266 }
267 
/*
 * Scan all interfaces to detect changes as well as new and deleted
 * interfaces.
 *
 * The scan runs in this order:
 *  1. Record the scan time in last_initifs_time and free laddr_list.
 *  2. Clear pii_in_use / li_in_use on everything in our tables, and
 *     li_dupaddr on every probe logint.
 *  3. Fetch the kernel's interface list (SIOCGLIFNUM, SIOCGLIFCONF) and,
 *     for each entry, add IFF_UP addresses to laddr_list and call
 *     pii_process(), logint_init_from_k() and check_addr_unique().
 *  4. Log a recovery message for test addresses that are unique again.
 *  5. Call check_if_removed() on every phyint instance.
 *  6. Call select_test_ifs() and process_link_state_changes(), then
 *     retry pending failovers / failbacks on each phyint.
 *
 * If either ioctl or the buffer allocation in step 3 fails, the function
 * returns at that point: steps 4-6 are skipped and laddr_list stays empty
 * until the next scan.
 */
static void
initifs()
{
	int	n;
	int	af;
	char	*cp;
	char	*buf;
	int	numifs;
	struct lifnum	lifn;
	struct lifconf	lifc;
	struct lifreq	*lifr;
	struct logint	*li;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	char	pi_name[LIFNAMSIZ + 1];
	boolean_t exists;
	struct phyint	*pi;
	struct local_addr *next;

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	last_initifs_time = getcurrenttime();

	/*
	 * Free the laddr_list before collecting the local addresses.
	 */
	while (laddr_list != NULL) {
		next = laddr_list->next;
		free(laddr_list);
		laddr_list = next;
	}

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the testaddress is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/* Ask the kernel how many lifreq entries the buffer must hold */
	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = LIFC_ALLZONES;
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface numbers)");
		return;
	}
	numifs = lifn.lifn_count;

	buf = (char *)calloc(numifs, sizeof (struct lifreq));
	if (buf == NULL) {
		logperror("initifs: calloc");
		return;
	}

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = LIFC_ALLZONES;
	lifc.lifc_len = numifs * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		/*
		 * EINVAL is commonly encountered, when things change
		 * underneath us rapidly, (eg. at boot, when new interfaces
		 * are plumbed successively) and the kernel finds the buffer
		 * size we passed as too small. We will retry again
		 * when we see the next routing socket msg, or at worst after
		 * IF_SCAN_INTERVAL ms.
		 */
		if (errno != EINVAL) {
			logperror("initifs: ioctl"
			    " (get interface configuration)");
		}
		free(buf);
		return;
	}

	lifr = (struct lifreq *)lifc.lifc_req;

	/*
	 * For each lifreq returned by SIOCGLIFCONF, call pii_process()
	 * and get the state of the corresponding phyint_instance. If it is
	 * successful, then call logint_init_from_k() to get the state of the
	 * logint.
	 */
	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
		int	sockfd;
		struct local_addr	*taddr;
		struct sockaddr_in	*sin;
		struct sockaddr_in6	*sin6;
		struct lifreq	lifreq;

		af = lifr->lifr_addr.ss_family;

		/*
		 * Collect all local addresses.
		 */
		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
		(void) memset(&lifreq, 0, sizeof (lifreq));
		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name,
		    sizeof (lifreq.lifr_name));

		/* ENXIO is not logged; the entry is skipped either way */
		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
			if (errno != ENXIO)
				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
			continue;
		}

		/*
		 * Add the interface address to laddr_list.
		 * Another node might have the same IP address which is up.
		 * In that case, it is appropriate to use the address as a
		 * target, even though it is also configured (but not up) on
		 * the local system.
		 * Hence, the interface address is not added to laddr_list
		 * unless it is IFF_UP.
		 */
		if (lifreq.lifr_flags & IFF_UP) {
			taddr = malloc(sizeof (struct local_addr));
			if (taddr == NULL) {
				/*
				 * NOTE(review): this skips pii_process() for
				 * the entry as well, so its in_use marks
				 * stay cleared for this scan.
				 */
				logperror("initifs: malloc");
				continue;
			}
			/* IPv4 addresses are stored in v4-mapped form */
			if (af == AF_INET) {
				sin = (struct sockaddr_in *)&lifr->lifr_addr;
				IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
				    &taddr->addr);
			} else {
				sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr;
				taddr->addr = sin6->sin6_addr;
			}
			taddr->next = laddr_list;
			laddr_list = taddr;
		}

		/*
		 * Need to pass a phyint name to pii_process. Insert the
		 * null where the ':' IF_SEPARATOR is found in the logical
		 * name.
		 */
		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		exists = pii_process(af, pi_name, &pii);
		if (exists) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
		}
		check_addr_unique(af, lifr->lifr_name);
	}

	free(buf);

	/*
	 * If the test address is now unique, and if it was not unique
	 * previously, clear the li_dupaddrmsg_printed flag and log a
	 * recovery message
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		/* Note: this 'li' shadows the function-level 'li' */
		struct logint *li;
		char abuf[INET6_ADDRSTRLEN];

		li = pii->pii_probe_logint;
		if ((li != NULL) && !li->li_dupaddr &&
		    li->li_dupaddrmsg_printed) {
			logerr("Test address %s is unique; enabling probe-"
			    "based failure detection\n",
			    pr_addr(pii->pii_af, li->li_addr, abuf,
				sizeof (abuf)));
			li->li_dupaddrmsg_printed = 0;
		}
	}

	/*
	 * Scan for phyints and logints that have disappeared from the
	 * kernel, and delete them. The next pointer is saved before each
	 * call because check_if_removed() may delete the instance.
	 */
	pii = phyint_instances;

	while (pii != NULL) {
		next_pii = pii->pii_next;
		check_if_removed(pii);
		pii = next_pii;
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications from the NICs.
	 */
	process_link_state_changes();

	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		/*
		 * If this is a case of group failure, we don't have much
		 * to do until the group recovers again.
		 */
		if (GROUP_FAILED(pi->pi_group))
			continue;

		/*
		 * Try/Retry any pending failovers / failbacks, that did
		 * not complete, or that could not be initiated previously.
		 * This implements the 3 invariants described in the big block
		 * comment at the beginning of probe.c
		 */
		if (pi->pi_flags & IFF_INACTIVE) {
			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			/* Note: this 'pii' shadows the function-level 'pii' */
			struct phyint_instance *pii;

			/*
			 * Skip interfaces which are not capable of probing,
			 * and interfaces that have downed links (as we will
			 * not get any response).
			 */
			if (LINK_DOWN(pi))
				continue;

			/* Prefer the IPv4 instance; fall back to IPv6 */
			pii = pi->pi_v4;
			if (!PROBE_CAPABLE(pii)) {
				pii = pi->pi_v6;
				if (!PROBE_CAPABLE(pii))
					continue;
			}

			/*
			 * It is possible that the phyint has started
			 * receiving packets, after it has been marked
			 * PI_FAILED. Don't initiate failover, if the
			 * phyint has started recovering. failure_state()
			 * captures this check. A similar logic is used
			 * for failback/repair case.
			 */
			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
			    (failure_state(pii) == PHYINT_FAILURE)) {
				(void) try_failover(pi, FAILOVER_NORMAL);
			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				if (try_failback(pi, _B_FALSE) !=
				    IPMP_FAILURE) {
					(void) change_lif_flags(pi, IFF_FAILED,
					    _B_FALSE);
					/* Per state diagram */
					pi->pi_empty = 0;
				}
			}
		}
	}
}
534 
535 /*
536  * Check that test/probe addresses are always unique. link-locals and
537  * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER)
538  * address can produce unexpected results. Log an error and alert the user.
539  */
540 static void
541 check_addr_unique(int af, char *name)
542 {
543 	struct lifreq	lifr;
544 	struct phyint	*pi;
545 	struct in6_addr	addr;
546 	struct phyint_instance	*pii;
547 	struct sockaddr_in	*sin;
548 	struct sockaddr_in6	*sin6;
549 	int ifsock;
550 	char abuf[INET6_ADDRSTRLEN];
551 
552 	/* Get the socket for doing ioctls */
553 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
554 
555 	(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
556 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
557 	/*
558 	 * Get the address corresponding to 'name'. We cannot
559 	 * do a logint lookup in our tables, because, not all logints
560 	 * in the system are tracked by mpathd. (eg. things not in a group)
561 	 */
562 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
563 		if (errno == ENXIO) {
564 			/* Interface has vanished */
565 			return;
566 		} else {
567 			logperror("ioctl (get addr)");
568 			return;
569 		}
570 	}
571 
572 	if (af == AF_INET) {
573 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
574 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
575 	} else {
576 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
577 		addr = sin6->sin6_addr;
578 	}
579 
580 	/*
581 	 * Does the address 'addr' match any known test address ? If so
582 	 * it is a duplicate, unless we are looking at the same logint
583 	 */
584 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
585 		pii = PHYINT_INSTANCE(pi, af);
586 		if (pii == NULL || pii->pii_probe_logint == NULL)
587 			continue;
588 
589 		if (!IN6_ARE_ADDR_EQUAL(&addr,
590 		    &pii->pii_probe_logint->li_addr)) {
591 			continue;
592 		}
593 
594 		if (strncmp(pii->pii_probe_logint->li_name, name,
595 		    sizeof (pii->pii_probe_logint->li_name)) == 0) {
596 			continue;
597 		}
598 
599 		/*
600 		 * This test address is not unique. Set the dupaddr bit
601 		 */
602 		pii->pii_probe_logint->li_dupaddr = 1;
603 
604 		/*
605 		 * Log an error message if not already logged
606 		 */
607 		if (pii->pii_probe_logint->li_dupaddrmsg_printed)
608 			continue;
609 
610 		logerr("Test address %s is not unique; disabling "
611 		    "probe-based failure detection\n",
612 		    pr_addr(af, addr, abuf, sizeof (abuf)));
613 
614 		pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
615 	}
616 }
617 
618 /*
619  * Stop probing an interface.  Called when an interface is offlined.
620  * The probe socket is closed on each interface instance, and the
621  * interface state set to PI_OFFLINE.
622  */
623 static void
624 stop_probing(struct phyint *pi)
625 {
626 	struct phyint_instance *pii;
627 
628 	pii = pi->pi_v4;
629 	if (pii != NULL) {
630 		if (pii->pii_probe_sock != -1)
631 			close_probe_socket(pii, _B_TRUE);
632 		pii->pii_probe_logint = NULL;
633 	}
634 
635 	pii = pi->pi_v6;
636 	if (pii != NULL) {
637 		if (pii->pii_probe_sock != -1)
638 			close_probe_socket(pii, _B_TRUE);
639 		pii->pii_probe_logint = NULL;
640 	}
641 
642 	phyint_chstate(pi, PI_OFFLINE);
643 }
644 
645 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
646 
647 /*
648  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
649  * IFF_UP must also be set so that the associated address can be used as a
650  * source address.  Further, we must be able to exchange packets with local
651  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
652  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
653  */
654 static int
655 rate_testflags(uint64_t flags)
656 {
657 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
658 		return (BAD_TESTFLAGS);
659 
660 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
661 		return (BAD_TESTFLAGS);
662 
663 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
664 		return (BEST_TESTFLAGS);
665 
666 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
667 		return (BEST_TESTFLAGS);
668 
669 	return (OK_TESTFLAGS);
670 }
671 
/*
 * Attempt to select a test address for each phyint instance.
 * Call phyint_inst_sockinit() to complete the initializations.
 *
 * For every phyint instance that is not offline and does not already
 * have a BEST_TESTFLAGS test address, the logints are rated with
 * rate_testflags() and the best candidate becomes pii_probe_logint.
 * When the test address changes, the old probe socket is closed and a new
 * one is set up; an instance whose socket setup fails is deleted.
 * Afterwards, PI_FAILED phyints with probing disabled on both instances
 * are checked for repair, and the target lists are repopulated if any
 * newly enabled instance had no targets.
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint		*li;
	struct logint  		*probe_logint;
	boolean_t		target_scan_reqd = _B_FALSE;
	struct target		*tg;
	int			rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection.
	 * next_pii is saved up front because the instance may be deleted
	 * below when phyint_inst_sockinit() fails.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
					" interface %s (state is: %u)."
					" Stopping probes.\n",
					pii->pii_phyint->pi_name,
					pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 * This also covers the case where no candidate was found
		 * at all (probe_logint == NULL).
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			/* pii must not be touched again after this call */
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface the existing target (if
		 * any) is dropped and replaced by one created from the new
		 * test address's destination address. The asserts document
		 * that at most one target is expected here.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, probe_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance, once only and
		 * only if a probe socket is actually open.
		 */
		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}
878 
879 /*
880  * Check phyint group configuration, to detect any inconsistencies,
881  * and log an error message. This is called from runtimeouts every
882  * 20 secs. But the error message is displayed once. If the
883  * consistency is resolved by the admin, a recovery message is displayed
884  * once.
885  */
886 static void
887 check_config(void)
888 {
889 	struct phyint_group *pg;
890 	struct phyint *pi;
891 	boolean_t v4_in_group;
892 	boolean_t v6_in_group;
893 
894 	/*
895 	 * All phyints of a group must be homogenous to ensure that
896 	 * failover or failback can be done. If any phyint in a group
897 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
898 	 * Do a similar check for IPv6.
899 	 */
900 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
901 		if (pg == phyint_anongroup)
902 			continue;
903 
904 		v4_in_group = _B_FALSE;
905 		v6_in_group = _B_FALSE;
906 		/*
907 		 * 1st pass. Determine if at least 1 phyint in the group
908 		 * has IPv4 plumbed and if so set v4_in_group to true.
909 		 * Repeat similarly for IPv6.
910 		 */
911 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
912 			if (pi->pi_v4 != NULL)
913 				v4_in_group = _B_TRUE;
914 			if (pi->pi_v6 != NULL)
915 				v6_in_group = _B_TRUE;
916 		}
917 
918 		/*
919 		 * 2nd pass. If v4_in_group is true, check that phyint
920 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
921 		 * out a message the 1st time only.
922 		 */
923 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
924 			if (pi->pi_flags & IFF_OFFLINE)
925 				continue;
926 
927 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
928 				if (!pi->pi_cfgmsg_printed) {
929 					logerr("NIC %s of group %s is"
930 					    " not plumbed for IPv4 and may"
931 					    " affect failover capability\n",
932 					    pi->pi_name,
933 					    pi->pi_group->pg_name);
934 					pi->pi_cfgmsg_printed = 1;
935 				}
936 			} else if (v6_in_group == _B_TRUE &&
937 			    pi->pi_v6 == NULL) {
938 				if (!pi->pi_cfgmsg_printed) {
939 					logerr("NIC %s of group %s is"
940 					    " not plumbed for IPv6 and may"
941 					    " affect failover capability\n",
942 					    pi->pi_name,
943 					    pi->pi_group->pg_name);
944 					pi->pi_cfgmsg_printed = 1;
945 				}
946 			} else {
947 				/*
948 				 * The phyint matches the group configuration,
949 				 * if we have reached this point. If it was
950 				 * improperly configured earlier, log an
951 				 * error recovery message
952 				 */
953 				if (pi->pi_cfgmsg_printed) {
954 					logerr("NIC %s is now consistent with "
955 					    "group %s and failover capability "
956 					    "is restored\n", pi->pi_name,
957 					    pi->pi_group->pg_name);
958 					pi->pi_cfgmsg_printed = 0;
959 				}
960 			}
961 
962 		}
963 	}
964 
965 	/*
966 	 * In order to perform probe-based failure detection, a phyint must
967 	 * have at least 1 test/probe address for sending and receiving probes
968 	 * (either on IPv4 or IPv6 instance or both).  If no test address has
969 	 * been configured, notify the administrator, but continue on since we
970 	 * can still perform load spreading, along with "link up/down" based
971 	 * failure detection.
972 	 */
973 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
974 		if (pi->pi_flags & IFF_OFFLINE)
975 			continue;
976 
977 		if ((pi->pi_v4 == NULL ||
978 		    pi->pi_v4->pii_probe_logint == NULL) &&
979 		    (pi->pi_v6 == NULL ||
980 		    pi->pi_v6->pii_probe_logint == NULL)) {
981 			if (!pi->pi_taddrmsg_printed) {
982 				logerr("No test address configured on "
983 				    "interface %s; disabling probe-based "
984 				    "failure detection on it\n", pi->pi_name);
985 				pi->pi_taddrmsg_printed = 1;
986 			}
987 		} else if (pi->pi_taddrmsg_printed) {
988 			logerr("Test address now configured on interface %s; "
989 			    "enabling probe-based failure detection on it\n",
990 			    pi->pi_name);
991 			pi->pi_taddrmsg_printed = 0;
992 		}
993 
994 	}
995 }
996 
997 /*
998  * Timer mechanism using relative time (in milliseconds) from the
999  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
1000  * will fire after TIMER_INFINITY milliseconds.
1001  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
1002  * time values. Hence 2 consecutive timer events cannot be spaced farther
1003  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
1004  * that can be passed for the delay parameter of timer_schedule()
1005  */
1006 static uint_t timer_next;	/* Currently scheduled timeout */
1007 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
1008 
1009 static void
1010 timer_init(void)
1011 {
1012 	timer_next = getcurrenttime() + TIMER_INFINITY;
1013 	/*
1014 	 * The call to run_timeouts() will get the timer started
1015 	 * Since there are no phyints at this point, the timer will
1016 	 * be set for IF_SCAN_INTERVAL ms.
1017 	 */
1018 	run_timeouts();
1019 }
1020 
1021 /*
1022  * Make sure the next SIGALRM occurs delay milliseconds from the current
1023  * time if not earlier. We are interested only in time differences.
1024  */
1025 void
1026 timer_schedule(uint_t delay)
1027 {
1028 	uint_t now;
1029 	struct itimerval itimerval;
1030 
1031 	if (debug & D_TIMER)
1032 		logdebug("timer_schedule(%u)\n", delay);
1033 
1034 	assert(delay <= TIMER_INFINITY);
1035 
1036 	now = getcurrenttime();
1037 	if (delay == 0) {
1038 		/* Minimum allowed delay */
1039 		delay = 1;
1040 	}
1041 	/* Will this timer occur before the currently scheduled SIGALRM? */
1042 	if (timer_active && TIME_GE(now + delay, timer_next)) {
1043 		if (debug & D_TIMER) {
1044 			logdebug("timer_schedule(%u) - no action: "
1045 			    "now %u next %u\n", delay, now, timer_next);
1046 		}
1047 		return;
1048 	}
1049 	timer_next = now + delay;
1050 
1051 	itimerval.it_value.tv_sec = delay / 1000;
1052 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
1053 	itimerval.it_interval.tv_sec = 0;
1054 	itimerval.it_interval.tv_usec = 0;
1055 	if (debug & D_TIMER) {
1056 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1057 		    delay, itimerval.it_value.tv_sec,
1058 		    itimerval.it_value.tv_usec);
1059 	}
1060 	timer_active = _B_TRUE;
1061 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1062 		logperror("timer_schedule: setitimer");
1063 		exit(2);
1064 	}
1065 }
1066 
1067 /*
1068  * Timer has fired. Determine when the next timer event will occur by asking
1069  * all the timer routines. Should not be called from a timer routine.
1070  */
1071 static void
1072 run_timeouts(void)
1073 {
1074 	uint_t next;
1075 	uint_t next_event_time;
1076 	struct phyint_instance *pii;
1077 	struct phyint_instance *next_pii;
1078 	static boolean_t timeout_running;
1079 
1080 	/* assert that recursive timeouts don't happen. */
1081 	assert(!timeout_running);
1082 
1083 	timeout_running = _B_TRUE;
1084 
1085 	if (debug & D_TIMER)
1086 		logdebug("run_timeouts()\n");
1087 
1088 	next = TIMER_INFINITY;
1089 
1090 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1091 		next_pii = pii->pii_next;
1092 		next_event_time = phyint_inst_timer(pii);
1093 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1094 			next = next_event_time;
1095 
1096 		if (debug & D_TIMER) {
1097 			logdebug("run_timeouts(%s %s): next scheduled for"
1098 			    " this phyint inst %u, next scheduled global"
1099 			    " %u ms\n",
1100 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1101 			    next_event_time, next);
1102 		}
1103 	}
1104 
1105 	/*
1106 	 * Make sure initifs() is called at least once every
1107 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1108 	 * with the kernel, in case we have missed any routing
1109 	 * socket messages.
1110 	 */
1111 	if (next > IF_SCAN_INTERVAL)
1112 		next = IF_SCAN_INTERVAL;
1113 
1114 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1115 		initifs();
1116 		check_config();
1117 	}
1118 
1119 	if (debug & D_TIMER)
1120 		logdebug("run_timeouts: %u ms\n", next);
1121 
1122 	timer_schedule(next);
1123 	timeout_running = _B_FALSE;
1124 }
1125 
1126 static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
1127 static int eventpipe_write = -1;
1128 static boolean_t cleanup_started = _B_FALSE;
1129 				/* Don't write to eventpipe if in cleanup */
1130 /*
1131  * Ensure that signals are processed synchronously with the rest of
1132  * the code by just writing a one character signal number on the pipe.
1133  * The poll loop will pick this up and process the signal event.
1134  */
1135 static void
1136 sig_handler(int signo)
1137 {
1138 	uchar_t buf = (uchar_t)signo;
1139 
1140 	/*
1141 	 * Don't write to pipe if cleanup has already begun. cleanup()
1142 	 * might have closed the pipe already
1143 	 */
1144 	if (cleanup_started)
1145 		return;
1146 
1147 	if (eventpipe_write == -1) {
1148 		logerr("sig_handler: no pipe found\n");
1149 		return;
1150 	}
1151 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1152 		logperror("sig_handler: write");
1153 }
1154 
1155 extern struct probes_missed probes_missed;
1156 
1157 /*
1158  * Pick up a signal "byte" from the pipe and process it.
1159  */
1160 static void
1161 in_signal(int fd)
1162 {
1163 	uchar_t buf;
1164 	uint64_t  sent, acked, lost, unacked, unknown;
1165 	struct phyint_instance *pii;
1166 	int pr_ndx;
1167 
1168 	switch (read(fd, &buf, sizeof (buf))) {
1169 	case -1:
1170 		logperror("in_signal: read");
1171 		exit(1);
1172 		/* NOTREACHED */
1173 	case 1:
1174 		break;
1175 	case 0:
1176 		logerr("in_signal: read end of file\n");
1177 		exit(1);
1178 		/* NOTREACHED */
1179 	default:
1180 		logerr("in_signal: read > 1\n");
1181 		exit(1);
1182 	}
1183 
1184 	if (debug & D_TIMER)
1185 		logdebug("in_signal() got %d\n", buf);
1186 
1187 	switch (buf) {
1188 	case SIGALRM:
1189 		if (debug & D_TIMER) {
1190 			uint_t now = getcurrenttime();
1191 
1192 			logdebug("in_signal(SIGALRM) delta %u\n",
1193 			    now - timer_next);
1194 		}
1195 		timer_active = _B_FALSE;
1196 		run_timeouts();
1197 		break;
1198 	case SIGUSR1:
1199 		logdebug("Printing configuration:\n");
1200 		/* Print out the internal tables */
1201 		phyint_inst_print_all();
1202 
1203 		/*
1204 		 * Print out the accumulated statistics about missed
1205 		 * probes (happens due to scheduling delay).
1206 		 */
1207 		logerr("Missed sending total of %d probes spread over"
1208 		    " %d occurrences\n", probes_missed.pm_nprobes,
1209 		    probes_missed.pm_ntimes);
1210 
1211 		/*
1212 		 * Print out the accumulated statistics about probes
1213 		 * that were sent.
1214 		 */
1215 		for (pii = phyint_instances; pii != NULL;
1216 		    pii = pii->pii_next) {
1217 			unacked = 0;
1218 			acked = pii->pii_cum_stats.acked;
1219 			lost = pii->pii_cum_stats.lost;
1220 			sent = pii->pii_cum_stats.sent;
1221 			unknown = pii->pii_cum_stats.unknown;
1222 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1223 				switch (pii->pii_probes[pr_ndx].pr_status) {
1224 				case PR_ACKED:
1225 					acked++;
1226 					break;
1227 				case PR_LOST:
1228 					lost++;
1229 					break;
1230 				case PR_UNACKED:
1231 					unacked++;
1232 					break;
1233 				}
1234 			}
1235 			logerr("\nProbe stats on (%s %s)\n"
1236 			    "Number of probes sent %lld\n"
1237 			    "Number of probe acks received %lld\n"
1238 			    "Number of probes/acks lost %lld\n"
1239 			    "Number of valid unacknowled probes %lld\n"
1240 			    "Number of ambiguous probe acks received %lld\n",
1241 			    AF_STR(pii->pii_af), pii->pii_name,
1242 			    sent, acked, lost, unacked, unknown);
1243 		}
1244 		break;
1245 	case SIGHUP:
1246 		logerr("SIGHUP: restart and reread config file\n");
1247 		cleanup();
1248 		(void) execv(argv0[0], argv0);
1249 		_exit(0177);
1250 		/* NOTREACHED */
1251 	case SIGINT:
1252 	case SIGTERM:
1253 	case SIGQUIT:
1254 		cleanup();
1255 		exit(0);
1256 		/* NOTREACHED */
1257 	default:
1258 		logerr("in_signal: unknown signal: %d\n", buf);
1259 	}
1260 }
1261 
1262 static void
1263 cleanup(void)
1264 {
1265 	struct phyint_instance *pii;
1266 	struct phyint_instance *next_pii;
1267 
1268 	/*
1269 	 * Make sure that we don't write to eventpipe in
1270 	 * sig_handler() if any signal notably SIGALRM,
1271 	 * occurs after we close the eventpipe descriptor below
1272 	 */
1273 	cleanup_started = _B_TRUE;
1274 
1275 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1276 		next_pii = pii->pii_next;
1277 		phyint_inst_delete(pii);
1278 	}
1279 
1280 	(void) close(ifsock_v4);
1281 	(void) close(ifsock_v6);
1282 	(void) close(rtsock_v4);
1283 	(void) close(rtsock_v6);
1284 	(void) close(lsock_v4);
1285 	(void) close(lsock_v6);
1286 	(void) close(0);
1287 	(void) close(1);
1288 	(void) close(2);
1289 	(void) close(mibfd);
1290 	(void) close(eventpipe_read);
1291 	(void) close(eventpipe_write);
1292 }
1293 
1294 /*
1295  * Create pipe for signal delivery and set up signal handlers.
1296  */
1297 static void
1298 setup_eventpipe(void)
1299 {
1300 	int fds[2];
1301 	struct sigaction act;
1302 
1303 	if ((pipe(fds)) < 0) {
1304 		logperror("setup_eventpipe: pipe");
1305 		exit(1);
1306 	}
1307 	eventpipe_read = fds[0];
1308 	eventpipe_write = fds[1];
1309 	if (poll_add(eventpipe_read) == -1) {
1310 		exit(1);
1311 	}
1312 
1313 	act.sa_handler = sig_handler;
1314 	act.sa_flags = SA_RESTART;
1315 	(void) sigaction(SIGALRM, &act, NULL);
1316 
1317 	(void) sigset(SIGHUP, sig_handler);
1318 	(void) sigset(SIGUSR1, sig_handler);
1319 	(void) sigset(SIGTERM, sig_handler);
1320 	(void) sigset(SIGINT, sig_handler);
1321 	(void) sigset(SIGQUIT, sig_handler);
1322 }
1323 
/*
 * Create a routing socket for receiving RTM_IFINFO messages.
 *
 * Opens a raw PF_ROUTE socket for address family `af', switches it to
 * non-blocking mode, adds it to the poll set and returns the descriptor.
 * Any failure terminates the process with exit status 1.
 */
static int
setup_rtsock(int af)
{
	int	sock;
	int	oflags;

	if ((sock = socket(PF_ROUTE, SOCK_RAW, af)) == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	oflags = fcntl(sock, F_GETFL, 0);
	if (oflags < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		goto fail;
	}

	if (fcntl(sock, F_SETFL, oflags | O_NONBLOCK) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		goto fail;
	}

	if (poll_add(sock) == -1)
		goto fail;

	return (sock);

fail:
	(void) close(sock);
	exit(1);
	/* NOTREACHED */
}
1354 
1355 /*
1356  * Process an RTM_IFINFO message received on a routing socket.
1357  * The return value indicates whether a full interface scan is required.
1358  * Link up/down notifications from the NICs are reflected in the
1359  * IFF_RUNNING flag.
1360  * If just the state of the IFF_RUNNING interface flag has changed, a
1361  * a full interface scan isn't required.
1362  */
1363 static boolean_t
1364 process_rtm_ifinfo(if_msghdr_t *ifm, int type)
1365 {
1366 	struct sockaddr_dl *sdl;
1367 	struct phyint *pi;
1368 	uint64_t old_flags;
1369 	struct phyint_instance *pii;
1370 
1371 	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);
1372 
1373 	/*
1374 	 * Although the sockaddr_dl structure is directly after the
1375 	 * if_msghdr_t structure. At the time of writing, the size of the
1376 	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
1377 	 * to the presence of a timeval structure, which contains longs,
1378 	 * in the if_data structure.  Anyway, we know where the message ends,
1379 	 * so we work backwards to get the start of the sockaddr_dl structure.
1380 	 */
1381 	/*LINTED*/
1382 	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
1383 		sizeof (struct sockaddr_dl));
1384 
1385 	assert(sdl->sdl_family == AF_LINK);
1386 
1387 	/*
1388 	 * The interface name is in sdl_data.
1389 	 * RTM_IFINFO messages are only generated for logical interface
1390 	 * zero, so there is no colon and logical interface number to
1391 	 * strip from the name.	 The name is not null terminated, but
1392 	 * there should be enough space in sdl_data to add the null.
1393 	 */
1394 	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
1395 		if (debug & D_LINKNOTE)
1396 			logdebug("process_rtm_ifinfo: "
1397 				"phyint name too long\n");
1398 		return (_B_TRUE);
1399 	}
1400 	sdl->sdl_data[sdl->sdl_nlen] = 0;
1401 
1402 	pi = phyint_lookup(sdl->sdl_data);
1403 	if (pi == NULL) {
1404 		if (debug & D_LINKNOTE)
1405 			logdebug("process_rtm_ifinfo: phyint lookup failed"
1406 				" for %s\n", sdl->sdl_data);
1407 		return (_B_TRUE);
1408 	}
1409 
1410 	/*
1411 	 * We want to try and avoid doing a full interface scan for
1412 	 * link state notifications from the NICs, as indicated
1413 	 * by the state of the IFF_RUNNING flag.  If just the
1414 	 * IFF_RUNNING flag has changed state, the link state changes
1415 	 * are processed without a full scan.
1416 	 * If there is both an IPv4 and IPv6 instance associated with
1417 	 * the physical interface, we will get an RTM_IFINFO message
1418 	 * for each instance.  If we just maintained a single copy of
1419 	 * the physical interface flags, it would appear that no flags
1420 	 * had changed when the second message is processed, leading us
1421 	 * to believe that the message wasn't generated by a flags change,
1422 	 * and that a full interface scan is required.
1423 	 * To get around this problem, two additional copies of the flags
1424 	 * are kept, one copy for each instance.  These are only used in
1425 	 * this routine.  At any one time, all three copies of the flags
1426 	 * should be identical except for the IFF_RUNNING flag.	 The
1427 	 * copy of the flags in the "phyint" structure is always up to
1428 	 * date.
1429 	 */
1430 	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
1431 	if (pii == NULL) {
1432 		if (debug & D_LINKNOTE)
1433 			logdebug("process_rtm_ifinfo: no instance of address "
1434 			    "family %s for %s\n", AF_STR(type), pi->pi_name);
1435 		return (_B_TRUE);
1436 	}
1437 
1438 	old_flags = pii->pii_flags;
1439 	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
1440 	pi->pi_flags = pii->pii_flags;
1441 
1442 	if (debug & D_LINKNOTE) {
1443 		logdebug("process_rtm_ifinfo: %s address family: %s, "
1444 		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
1445 		    AF_STR(type), old_flags, pi->pi_flags);
1446 	}
1447 
1448 	/*
1449 	 * If IFF_STANDBY has changed, indicate that the interface has changed
1450 	 * types.
1451 	 */
1452 	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
1453 		phyint_newtype(pi);
1454 
1455 	/*
1456 	 * If IFF_INACTIVE has been set, then no data addresses should be
1457 	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
1458 	 * move previously failed-over addresses back to it, provided it is
1459 	 * not failed.	For details, see the state diagram in mpd_probe.c.
1460 	 */
1461 	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
1462 		if (pii->pii_flags & IFF_INACTIVE) {
1463 			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
1464 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
1465 		} else {
1466 			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
1467 				pi->pi_empty = 0;
1468 				(void) try_failback(pi, _B_FALSE);
1469 			}
1470 		}
1471 	}
1472 
1473 	/* Has just the IFF_RUNNING flag changed state ? */
1474 	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
1475 		struct phyint_instance *pii_other;
1476 		/*
1477 		 * It wasn't just a link state change.	Update
1478 		 * the other instance's copy of the flags.
1479 		 */
1480 		pii_other = phyint_inst_other(pii);
1481 		if (pii_other != NULL)
1482 			pii_other->pii_flags = pii->pii_flags;
1483 		return (_B_TRUE);
1484 	}
1485 
1486 	return (_B_FALSE);
1487 }
1488 
1489 /*
1490  * Retrieve as many routing socket messages as possible, and try to
1491  * empty the routing sockets. Initiate full scan of targets or interfaces
1492  * as needed.
1493  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1494  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1495  */
1496 static void
1497 process_rtsock(int rtsock_v4, int rtsock_v6)
1498 {
1499 	int	nbytes;
1500 	int64_t msg[2048 / 8];
1501 	struct rt_msghdr *rtm;
1502 	boolean_t need_if_scan = _B_FALSE;
1503 	boolean_t need_rt_scan = _B_FALSE;
1504 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1505 	int type;
1506 
1507 	/* Read as many messages as possible and try to empty the sockets */
1508 	for (type = AF_INET; ; type = AF_INET6) {
1509 		for (;;) {
1510 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1511 				rtsock_v6, msg, sizeof (msg));
1512 			if (nbytes <= 0) {
1513 				/* No more messages */
1514 				break;
1515 			}
1516 			rtm = (struct rt_msghdr *)msg;
1517 			if (rtm->rtm_version != RTM_VERSION) {
1518 				logerr("process_rtsock: version %d "
1519 				    "not understood\n", rtm->rtm_version);
1520 				break;
1521 			}
1522 
1523 			if (debug & D_PHYINT) {
1524 				logdebug("process_rtsock: message %d\n",
1525 				    rtm->rtm_type);
1526 			}
1527 
1528 			switch (rtm->rtm_type) {
1529 			case RTM_NEWADDR:
1530 			case RTM_DELADDR:
1531 				/*
1532 				 * Some logical interface has changed,
1533 				 * have to scan everything to determine
1534 				 * what actually changed.
1535 				 */
1536 				need_if_scan = _B_TRUE;
1537 				break;
1538 
1539 			case RTM_IFINFO:
1540 				rtm_ifinfo_seen = _B_TRUE;
1541 				need_if_scan |=
1542 					process_rtm_ifinfo((if_msghdr_t *)rtm,
1543 					type);
1544 				break;
1545 
1546 			case RTM_ADD:
1547 			case RTM_DELETE:
1548 			case RTM_CHANGE:
1549 			case RTM_OLDADD:
1550 			case RTM_OLDDEL:
1551 				need_rt_scan = _B_TRUE;
1552 				break;
1553 
1554 			default:
1555 				/* Not interesting */
1556 				break;
1557 			}
1558 		}
1559 		if (type == AF_INET6)
1560 			break;
1561 	}
1562 
1563 	if (need_if_scan) {
1564 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1565 			logdebug("process_rtsock: synchronizing with kernel\n");
1566 		initifs();
1567 	} else if (rtm_ifinfo_seen) {
1568 		if (debug & D_LINKNOTE)
1569 			logdebug("process_rtsock: "
1570 			    "link up/down notification(s) seen\n");
1571 		process_link_state_changes();
1572 	}
1573 
1574 	if (need_rt_scan)
1575 		init_router_targets();
1576 }
1577 
1578 /*
1579  * Look if the phyint instance or one of its logints have been removed from
1580  * the kernel and take appropriate action.
1581  * Uses {pii,li}_in_use.
1582  */
1583 static void
1584 check_if_removed(struct phyint_instance *pii)
1585 {
1586 	struct logint *li;
1587 	struct logint *next_li;
1588 
1589 	/* Detect phyints that have been removed from the kernel. */
1590 	if (!pii->pii_in_use) {
1591 		logtrace("%s %s has been removed from kernel\n",
1592 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1593 		phyint_inst_delete(pii);
1594 	} else {
1595 		/* Detect logints that have been removed. */
1596 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1597 			next_li = li->li_next;
1598 			if (!li->li_in_use) {
1599 				logint_delete(li);
1600 			}
1601 		}
1602 	}
1603 }
1604 
1605 /*
1606  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1607  * tables defined by mib2.h. Parse the returned data and extract
1608  * the 'routing' information table. Process the 'routing' table
1609  * to get the list of known onlink routers, and update our database.
1610  * These onlink routers will serve as our probe targets.
1611  * Returns false, if any system calls resulted in errors, true otherwise.
1612  */
1613 static boolean_t
1614 update_router_list(int fd)
1615 {
1616 	union {
1617 		char	ubuf[1024];
1618 		union T_primitives uprim;
1619 	} buf;
1620 
1621 	int			flags;
1622 	struct strbuf		ctlbuf;
1623 	struct strbuf		databuf;
1624 	struct T_optmgmt_req	*tor;
1625 	struct T_optmgmt_ack	*toa;
1626 	struct T_error_ack	*tea;
1627 	struct opthdr		*optp;
1628 	struct opthdr		*req;
1629 	int			status;
1630 	t_scalar_t		prim;
1631 
1632 	tor = (struct T_optmgmt_req *)&buf;
1633 
1634 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1635 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1636 	tor->OPT_length = sizeof (struct opthdr);
1637 	tor->MGMT_flags = T_CURRENT;
1638 
1639 	req = (struct opthdr *)&tor[1];
1640 	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
1641 	req->name  = 0;
1642 	req->len   = 0;
1643 
1644 	ctlbuf.buf = (char *)&buf;
1645 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1646 	ctlbuf.maxlen = sizeof (buf);
1647 	flags = 0;
1648 	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
1649 		logperror("update_router_list: putmsg(ctl)");
1650 		return (_B_FALSE);
1651 	}
1652 
1653 	/*
1654 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1655 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1656 	 * a control and data part. The control part contains a struct
1657 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1658 	 * the level, name and length of the data in the data part. The
1659 	 * data part contains the actual table data. The last message
1660 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1661 	 * single option with zero optlen.
1662 	 */
1663 
1664 	for (;;) {
1665 		/*
1666 		 * Go around this loop once for each table. Ignore
1667 		 * all tables except the routing information table.
1668 		 */
1669 		flags = 0;
1670 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1671 		if (status < 0) {
1672 			if (errno == EINTR)
1673 				continue;
1674 			logperror("update_router_list: getmsg(ctl)");
1675 			return (_B_FALSE);
1676 		}
1677 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1678 			logerr("update_router_list: ctlbuf.len %d\n",
1679 			    ctlbuf.len);
1680 			return (_B_FALSE);
1681 		}
1682 
1683 		prim = buf.uprim.type;
1684 
1685 		switch (prim) {
1686 
1687 		case T_ERROR_ACK:
1688 			tea = &buf.uprim.error_ack;
1689 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1690 				logerr("update_router_list: T_ERROR_ACK"
1691 				    " ctlbuf.len %d\n", ctlbuf.len);
1692 				return (_B_FALSE);
1693 			}
1694 			logerr("update_router_list: T_ERROR_ACK:"
1695 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1696 			    tea->TLI_error, tea->UNIX_error);
1697 			return (_B_FALSE);
1698 
1699 		case T_OPTMGMT_ACK:
1700 			toa = &buf.uprim.optmgmt_ack;
1701 			optp = (struct opthdr *)&toa[1];
1702 			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
1703 				logerr("update_router_list: ctlbuf.len %d\n",
1704 				    ctlbuf.len);
1705 				return (_B_FALSE);
1706 			}
1707 			if (toa->MGMT_flags != T_SUCCESS) {
1708 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1709 				    toa->MGMT_flags);
1710 				return (_B_FALSE);
1711 			}
1712 			break;
1713 
1714 		default:
1715 			logerr("update_router_list: unknown primitive %ld\n",
1716 			    prim);
1717 			return (_B_FALSE);
1718 		}
1719 
1720 		/* Process the T_OPGMGMT_ACK below */
1721 		assert(prim == T_OPTMGMT_ACK);
1722 
1723 		switch (status) {
1724 		case 0:
1725 			/*
1726 			 * We have reached the end of this T_OPTMGMT_ACK
1727 			 * message. If this is the last message i.e EOD,
1728 			 * return, else process the next T_OPTMGMT_ACK msg.
1729 			 */
1730 			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
1731 			    sizeof (struct opthdr)) && optp->len == 0 &&
1732 			    optp->name == 0 && optp->level == 0) {
1733 				/*
1734 				 * This is the EOD message. Return
1735 				 */
1736 				return (_B_TRUE);
1737 			}
1738 			continue;
1739 
1740 		case MORECTL:
1741 		case MORECTL | MOREDATA:
1742 			/*
1743 			 * This should not happen. We should be able to read
1744 			 * the control portion in a single getmsg.
1745 			 */
1746 			logerr("update_router_list: MORECTL\n");
1747 			return (_B_FALSE);
1748 
1749 		case MOREDATA:
1750 			databuf.maxlen = optp->len;
1751 			/* malloc of 0 bytes is ok */
1752 			databuf.buf = malloc((size_t)optp->len);
1753 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1754 				logperror("update_router_list: malloc");
1755 				return (_B_FALSE);
1756 			}
1757 			databuf.len = 0;
1758 			flags = 0;
1759 			for (;;) {
1760 				status = getmsg(fd, NULL, &databuf, &flags);
1761 				if (status >= 0) {
1762 					break;
1763 				} else if (errno == EINTR) {
1764 					continue;
1765 				} else {
1766 					logperror("update_router_list:"
1767 					    " getmsg(data)");
1768 					free(databuf.buf);
1769 					return (_B_FALSE);
1770 				}
1771 			}
1772 
1773 			if (optp->level == MIB2_IP &&
1774 			    optp->name == MIB2_IP_ROUTE) {
1775 				/* LINTED */
1776 				ire_process_v4((mib2_ipRouteEntry_t *)
1777 				    databuf.buf, databuf.len);
1778 			} else if (optp->level == MIB2_IP6 &&
1779 			    optp->name == MIB2_IP6_ROUTE) {
1780 				/* LINTED */
1781 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1782 				    databuf.buf, databuf.len);
1783 			}
1784 			free(databuf.buf);
1785 		}
1786 	}
1787 	/* NOTREACHED */
1788 }
1789 
1790 /*
1791  * Examine the IPv4 routing table, for default routers. For each default
1792  * router, populate the list of targets of each phyint that is on the same
1793  * link as the default router
1794  */
1795 static void
1796 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1797 {
1798 	mib2_ipRouteEntry_t	*rp;
1799 	mib2_ipRouteEntry_t	*rp1;
1800 	struct	in_addr		nexthop_v4;
1801 	mib2_ipRouteEntry_t	*endp;
1802 
1803 	if (len == 0)
1804 		return;
1805 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1806 
1807 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1808 
1809 	/*
1810 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1811 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1812 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1813 	 * This is a potential target for probing, which we try to add
1814 	 * to the list of probe targets.
1815 	 */
1816 	for (rp = buf; rp < endp; rp++) {
1817 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1818 			continue;
1819 
1820 		/*  Get the nexthop address. */
1821 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1822 
1823 		/*
1824 		 * Get the nexthop address. Then determine the outgoing
1825 		 * interface, by examining all interface IREs, and picking the
1826 		 * match. We don't look at the interface specified in the route
1827 		 * because we need to add the router target on all matching
1828 		 * interfaces anyway; the goal is to avoid falling back to
1829 		 * multicast when some interfaces are in the same subnet but
1830 		 * not in the same group.
1831 		 */
1832 		for (rp1 = buf; rp1 < endp; rp1++) {
1833 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1834 				continue;
1835 			}
1836 
1837 			/*
1838 			 * Determine the interface IRE that matches the nexthop.
1839 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1840 			 */
1841 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1842 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1843 				/*
1844 				 * We found the interface ire
1845 				 */
1846 				router_add_v4(rp1, nexthop_v4);
1847 			}
1848 		}
1849 	}
1850 }
1851 
1852 void
1853 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1854 {
1855 	char *cp;
1856 	char ifname[LIFNAMSIZ + 1];
1857 	struct in6_addr	nexthop;
1858 	int len;
1859 
1860 	if (debug & D_TARGET)
1861 		logdebug("router_add_v4()\n");
1862 
1863 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1864 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1865 	ifname[len] = '\0';
1866 
1867 	if (ifname[0] == '\0')
1868 		return;
1869 
1870 	cp = strchr(ifname, IF_SEPARATOR);
1871 	if (cp != NULL)
1872 		*cp = '\0';
1873 
1874 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1875 	router_add_common(AF_INET, ifname, nexthop);
1876 }
1877 
1878 void
1879 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1880 {
1881 	struct phyint_instance *pii;
1882 	struct phyint *pi;
1883 
1884 	if (debug & D_TARGET)
1885 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1886 
1887 	/*
1888 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1889 	 */
1890 	pii = phyint_inst_lookup(af, ifname);
1891 	if (pii == NULL)
1892 		return;
1893 
1894 	/*
1895 	 * Don't use our own addresses as targets.
1896 	 */
1897 	if (own_address(nexthop))
1898 		return;
1899 
1900 	/*
1901 	 * If the phyint is part a named group, then add the address to all
1902 	 * members of the group; note that this is suboptimal in the IPv4 case
1903 	 * as it has already been added to all matching interfaces in
1904 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1905 	 * itself, since other phyints in the anongroup may not be on the same
1906 	 * subnet.
1907 	 */
1908 	pi = pii->pii_phyint;
1909 	if (pi->pi_group == phyint_anongroup) {
1910 		target_add(pii, nexthop, _B_TRUE);
1911 	} else {
1912 		pi = pi->pi_group->pg_phyint;
1913 		for (; pi != NULL; pi = pi->pi_pgnext)
1914 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1915 	}
1916 }
1917 
1918 /*
1919  * Examine the IPv6 routing table, for default routers. For each default
1920  * router, populate the list of targets of each phyint that is on the same
1921  * link as the default router
1922  */
1923 static void
1924 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1925 {
1926 	mib2_ipv6RouteEntry_t	*rp;
1927 	mib2_ipv6RouteEntry_t	*endp;
1928 	struct	in6_addr nexthop_v6;
1929 
1930 	if (debug & D_TARGET)
1931 		logdebug("ire_process_v6(len %d)\n", len);
1932 
1933 	if (len == 0)
1934 		return;
1935 
1936 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1937 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1938 
1939 	/*
1940 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1941 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1942 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1943 	 * This is a potential target for probing, which we try to add
1944 	 * to the list of probe targets.
1945 	 */
1946 	for (rp = buf; rp < endp; rp++) {
1947 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1948 			continue;
1949 
1950 		/*
1951 		 * We have the outgoing interface in ipv6RouteIfIndex
1952 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1953 		 * interface must be present for link-local addresses. Since
1954 		 * we use only link-local addreses for probing, we don't
1955 		 * consider the case when the outgoing interface is not
1956 		 * known and we need to scan interface ires
1957 		 */
1958 		nexthop_v6 = rp->ipv6RouteNextHop;
1959 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1960 			/*
1961 			 * We already have the outgoing interface
1962 			 * in ipv6RouteIfIndex.
1963 			 */
1964 			router_add_v6(rp, nexthop_v6);
1965 		}
1966 	}
1967 }
1968 
1969 
1970 void
1971 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1972 {
1973 	char ifname[LIFNAMSIZ + 1];
1974 	char *cp;
1975 	int  len;
1976 
1977 	if (debug & D_TARGET)
1978 		logdebug("router_add_v6()\n");
1979 
1980 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1981 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1982 	ifname[len] = '\0';
1983 
1984 	if (ifname[0] == '\0')
1985 		return;
1986 
1987 	cp = strchr(ifname, IF_SEPARATOR);
1988 	if (cp != NULL)
1989 		*cp = '\0';
1990 
1991 	router_add_common(AF_INET6, ifname, nexthop_v6);
1992 }
1993 
1994 
1995 
1996 /*
1997  * Build a list of target routers, by scanning the routing tables.
1998  * It is assumed that interface routes exist, to reach the routers.
1999  */
2000 static void
2001 init_router_targets(void)
2002 {
2003 	struct	target *tg;
2004 	struct	target *next_tg;
2005 	struct	phyint_instance *pii;
2006 	struct	phyint *pi;
2007 
2008 	if (force_mcast)
2009 		return;
2010 
2011 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2012 		pi = pii->pii_phyint;
2013 		/*
2014 		 * Exclude ptp and host targets. Set tg_in_use to false,
2015 		 * only for router targets.
2016 		 */
2017 		if (!pii->pii_targets_are_routers ||
2018 		    (pi->pi_flags & IFF_POINTOPOINT))
2019 			continue;
2020 
2021 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
2022 			tg->tg_in_use = 0;
2023 	}
2024 
2025 	if (mibfd < 0) {
2026 		mibfd = open("/dev/ip", O_RDWR);
2027 		if (mibfd < 0) {
2028 			logperror("mibopen: ip open");
2029 			exit(1);
2030 		}
2031 	}
2032 
2033 	if (!update_router_list(mibfd)) {
2034 		(void) close(mibfd);
2035 		mibfd = -1;
2036 	}
2037 
2038 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2039 		if (!pii->pii_targets_are_routers ||
2040 		    (pi->pi_flags & IFF_POINTOPOINT))
2041 			continue;
2042 
2043 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
2044 			next_tg = tg->tg_next;
2045 			if (!tg->tg_in_use) {
2046 				target_delete(tg);
2047 			}
2048 		}
2049 	}
2050 }
2051 
2052 /*
2053  * Attempt to assign host targets to any interfaces that do not currently
2054  * have probe targets by sharing targets with other interfaces in the group.
2055  */
2056 static void
2057 init_host_targets(void)
2058 {
2059 	struct phyint_instance *pii;
2060 	struct phyint_group *pg;
2061 
2062 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2063 		pg = pii->pii_phyint->pi_group;
2064 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
2065 			dup_host_targets(pii);
2066 	}
2067 }
2068 
2069 /*
2070  * Duplicate host targets from other phyints of the group to
2071  * the phyint instance 'desired_pii'.
2072  */
2073 static void
2074 dup_host_targets(struct phyint_instance	 *desired_pii)
2075 {
2076 	int af;
2077 	struct phyint *pi;
2078 	struct phyint_instance *pii;
2079 	struct target *tg;
2080 
2081 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2082 
2083 	af = desired_pii->pii_af;
2084 
2085 	/*
2086 	 * For every phyint in the same group as desired_pii, check if
2087 	 * it has any host targets. If so add them to desired_pii.
2088 	 */
2089 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2090 		pii = PHYINT_INSTANCE(pi, af);
2091 		/*
2092 		 * We know that we don't have targets on this phyint instance
2093 		 * since we have been called. But we still check for
2094 		 * pii_targets_are_routers because another phyint instance
2095 		 * could have router targets, since IFF_NOFAILOVER addresses
2096 		 * on different phyint instances may belong to different
2097 		 * subnets.
2098 		 */
2099 		if ((pii == NULL) || (pii == desired_pii) ||
2100 		    pii->pii_targets_are_routers)
2101 			continue;
2102 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2103 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2104 		}
2105 	}
2106 }
2107 
/*
 * Write a one-line usage message for the program named `cmd' to stderr.
 */
static void
usage(char *cmd)
{
	FILE	*out = stderr;

	(void) fprintf(out, "usage: %s\n", cmd);
}
2113 
2114 
2115 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2116 
2117 /* Get an option from the /etc/default/mpathd file */
2118 static char *
2119 getdefault(char *name)
2120 {
2121 	char namebuf[BUFSIZ];
2122 	char *value = NULL;
2123 
2124 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2125 		char	*cp;
2126 		int	flags;
2127 
2128 		/*
2129 		 * ignore case
2130 		 */
2131 		flags = defcntl(DC_GETFLAGS, 0);
2132 		TURNOFF(flags, DC_CASE);
2133 		(void) defcntl(DC_SETFLAGS, flags);
2134 
2135 		/* Add "=" to the name */
2136 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2137 		(void) strncat(namebuf, "=", 2);
2138 
2139 		if ((cp = defread(namebuf)) != NULL)
2140 			value = strdup(cp);
2141 
2142 		/* close */
2143 		(void) defopen((char *)NULL);
2144 	}
2145 	return (value);
2146 }
2147 
2148 
/*
 * Run-time options below.  failback_enabled and track_all_phyints are set
 * by main() from /etc/default/mpathd (FAILBACK and
 * TRACK_INTERFACES_ONLY_WITH_GROUPS); adopt and foreground are set from
 * the command line.
 */
boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
static boolean_t adopt = _B_FALSE;	/* -a: exit if nothing to track */
static boolean_t foreground = _B_FALSE;	/* -d / -D: do not daemonize */
2156 
2157 int
2158 main(int argc, char *argv[])
2159 {
2160 	int i;
2161 	int c;
2162 	struct phyint_instance *pii;
2163 	char *value;
2164 
2165 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2166 	srandom(gethostid());	/* Initialize the random number generator */
2167 
2168 	/*
2169 	 * NOTE: The messages output by in.mpathd are not suitable for
2170 	 * translation, so we do not call textdomain().
2171 	 */
2172 	(void) setlocale(LC_ALL, "");
2173 
2174 	/*
2175 	 * Get the user specified value of 'failure detection time'
2176 	 * from /etc/default/mpathd
2177 	 */
2178 	value = getdefault("FAILURE_DETECTION_TIME");
2179 	if (value != NULL) {
2180 		user_failure_detection_time =
2181 		    (int)strtol((char *)value, NULL, 0);
2182 
2183 		if (user_failure_detection_time <= 0) {
2184 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2185 			logerr("Invalid failure detection time %s, assuming "
2186 			    "default %d\n", value, user_failure_detection_time);
2187 
2188 		} else if (user_failure_detection_time <
2189 		    MIN_FAILURE_DETECTION_TIME) {
2190 			user_failure_detection_time =
2191 			    MIN_FAILURE_DETECTION_TIME;
2192 			logerr("Too small failure detection time of %s, "
2193 			    "assuming minimum %d\n", value,
2194 			    user_failure_detection_time);
2195 		}
2196 		free(value);
2197 	} else {
2198 		/* User has not specified the parameter, Use default value */
2199 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2200 	}
2201 
2202 	/*
2203 	 * This gives the frequency at which probes will be sent.
2204 	 * When fdt ms elapses, we should be able to determine
2205 	 * whether 5 consecutive probes have failed or not.
2206 	 * 1 probe will be sent in every user_probe_interval ms,
2207 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2208 	 * user_probe_interval. Thus when we send out probe 'n' we
2209 	 * can be sure that probe 'n - 2' is lost, if we have not
2210 	 * got the ack. (since the probe interval is > crtt). But
2211 	 * probe 'n - 1' may be a valid unacked probe, since the
2212 	 * time between 2 successive probes could be as small as
2213 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2214 	 */
2215 	user_probe_interval = user_failure_detection_time /
2216 	    (NUM_PROBE_FAILS + 2);
2217 
2218 	/*
2219 	 * Get the user specified value of failback_enabled from
2220 	 * /etc/default/mpathd
2221 	 */
2222 	value = getdefault("FAILBACK");
2223 	if (value != NULL) {
2224 		if (strncasecmp(value, "yes", 3) == 0)
2225 			failback_enabled = _B_TRUE;
2226 		else if (strncasecmp(value, "no", 2) == 0)
2227 			failback_enabled = _B_FALSE;
2228 		else
2229 			logerr("Invalid value for FAILBACK %s\n", value);
2230 		free(value);
2231 	} else {
2232 		failback_enabled = _B_TRUE;
2233 	}
2234 
2235 	/*
2236 	 * Get the user specified value of track_all_phyints from
2237 	 * /etc/default/mpathd. The sense is reversed in
2238 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2239 	 */
2240 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2241 	if (value != NULL) {
2242 		if (strncasecmp(value, "yes", 3) == 0)
2243 			track_all_phyints = _B_FALSE;
2244 		else if (strncasecmp(value, "no", 2) == 0)
2245 			track_all_phyints = _B_TRUE;
2246 		else
2247 			logerr("Invalid value for "
2248 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2249 		free(value);
2250 	} else {
2251 		track_all_phyints = _B_FALSE;
2252 	}
2253 
2254 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2255 		switch (c) {
2256 		case 'a':
2257 			adopt = _B_TRUE;
2258 			break;
2259 		case 'm':
2260 			force_mcast = _B_TRUE;
2261 			break;
2262 		case 'd':
2263 			debug = D_ALL;
2264 			foreground = _B_TRUE;
2265 			break;
2266 		case 'D':
2267 			i = (int)strtol(optarg, NULL, 0);
2268 			if (i == 0) {
2269 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2270 				    optarg);
2271 				exit(1);
2272 			}
2273 			debug |= i;
2274 			foreground = _B_TRUE;
2275 			break;
2276 		case 'l':
2277 			/*
2278 			 * Turn off link state notification handling.
2279 			 * Undocumented command line flag, for debugging
2280 			 * purposes.
2281 			 */
2282 			handle_link_notifications = _B_FALSE;
2283 			break;
2284 		default:
2285 			usage(argv[0]);
2286 			exit(1);
2287 		}
2288 	}
2289 
2290 	/*
2291 	 * The sockets for the loopback command interface should be listening
2292 	 * before we fork and exit in daemonize(). This way, whoever started us
2293 	 * can use the loopback interface as soon as they get a zero exit
2294 	 * status.
2295 	 */
2296 	lsock_v4 = setup_listener(AF_INET);
2297 	lsock_v6 = setup_listener(AF_INET6);
2298 
2299 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2300 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2301 		exit(1);
2302 	}
2303 
2304 	if (!foreground) {
2305 		if (!daemonize()) {
2306 			logerr("cannot daemonize\n");
2307 			exit(EXIT_FAILURE);
2308 		}
2309 		initlog();
2310 	}
2311 
2312 	/*
2313 	 * Initializations:
2314 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2315 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2316 	 * 2. Initialize a pipe for handling/recording signal events.
2317 	 * 3. Create the routing sockets,  used for listening
2318 	 *    to routing / interface changes.
2319 	 * 4. phyint_init() - Initialize physical interface state
2320 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2321 	 *    which timer_init() does indirectly.
2322 	 * 5. timer_init()  - Initialize timer related stuff
2323 	 * 6. initifs() - Initialize our database of all known interfaces
2324 	 * 7. init_router_targets() - Initialize our database of all known
2325 	 *    router targets.
2326 	 */
2327 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2328 	if (ifsock_v4 < 0) {
2329 		logperror("main: IPv4 socket open");
2330 		exit(1);
2331 	}
2332 
2333 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2334 	if (ifsock_v6 < 0) {
2335 		logperror("main: IPv6 socket open");
2336 		exit(1);
2337 	}
2338 
2339 	setup_eventpipe();
2340 
2341 	rtsock_v4 = setup_rtsock(AF_INET);
2342 	rtsock_v6 = setup_rtsock(AF_INET6);
2343 
2344 	if (phyint_init() == -1) {
2345 		logerr("cannot initialize physical interface structures");
2346 		exit(1);
2347 	}
2348 
2349 	timer_init();
2350 
2351 	initifs();
2352 
2353 	/* Inform kernel whether failback is enabled or disabled */
2354 	if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
2355 		logperror("main: ioctl (SIOCSIPMPFAILBACK)");
2356 		exit(1);
2357 	}
2358 
2359 	/*
2360 	 * If we're operating in "adopt" mode and no interfaces need to be
2361 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2362 	 * interfaces are subsequently put into multipathing groups).
2363 	 */
2364 	if (adopt && phyint_instances == NULL)
2365 		exit(0);
2366 
2367 	/*
2368 	 * Main body. Keep listening for activity on any of the sockets
2369 	 * that we are monitoring and take appropriate action as necessary.
2370 	 * signals are also handled synchronously.
2371 	 */
2372 	for (;;) {
2373 		if (poll(pollfds, pollfd_num, -1) < 0) {
2374 			if (errno == EINTR)
2375 				continue;
2376 			logperror("main: poll");
2377 			exit(1);
2378 		}
2379 		for (i = 0; i < pollfd_num; i++) {
2380 			if ((pollfds[i].fd == -1) ||
2381 			    !(pollfds[i].revents & POLLIN))
2382 				continue;
2383 			if (pollfds[i].fd == eventpipe_read) {
2384 				in_signal(eventpipe_read);
2385 				break;
2386 			}
2387 			if (pollfds[i].fd == rtsock_v4 ||
2388 				pollfds[i].fd == rtsock_v6) {
2389 				process_rtsock(rtsock_v4, rtsock_v6);
2390 				break;
2391 			}
2392 			for (pii = phyint_instances; pii != NULL;
2393 			    pii = pii->pii_next) {
2394 				if (pollfds[i].fd == pii->pii_probe_sock) {
2395 					if (pii->pii_af == AF_INET)
2396 						in_data(pii);
2397 					else
2398 						in6_data(pii);
2399 					break;
2400 				}
2401 			}
2402 			if (pollfds[i].fd == lsock_v4)
2403 				loopback_cmd(lsock_v4, AF_INET);
2404 			else if (pollfds[i].fd == lsock_v6)
2405 				loopback_cmd(lsock_v6, AF_INET6);
2406 		}
2407 		if (full_scan_required) {
2408 			initifs();
2409 			full_scan_required = _B_FALSE;
2410 		}
2411 	}
2412 	/* NOTREACHED */
2413 	return (EXIT_SUCCESS);
2414 }
2415 
2416 static int
2417 setup_listener(int af)
2418 {
2419 	int sock;
2420 	int on;
2421 	int len;
2422 	int ret;
2423 	struct sockaddr_storage laddr;
2424 	struct sockaddr_in  *sin;
2425 	struct sockaddr_in6 *sin6;
2426 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2427 
2428 	assert(af == AF_INET || af == AF_INET6);
2429 
2430 	sock = socket(af, SOCK_STREAM, 0);
2431 	if (sock < 0) {
2432 		logperror("setup_listener: socket");
2433 		exit(1);
2434 	}
2435 
2436 	on = 1;
2437 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2438 	    sizeof (on)) < 0) {
2439 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2440 		exit(1);
2441 	}
2442 
2443 	bzero(&laddr, sizeof (laddr));
2444 	laddr.ss_family = af;
2445 
2446 	if (af == AF_INET) {
2447 		sin = (struct sockaddr_in *)&laddr;
2448 		sin->sin_port = htons(MPATHD_PORT);
2449 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2450 		len = sizeof (struct sockaddr_in);
2451 	} else {
2452 		sin6 = (struct sockaddr_in6 *)&laddr;
2453 		sin6->sin6_port = htons(MPATHD_PORT);
2454 		sin6->sin6_addr = loopback_addr;
2455 		len = sizeof (struct sockaddr_in6);
2456 	}
2457 
2458 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2459 	if (ret < 0) {
2460 		if (errno == EADDRINUSE) {
2461 			/*
2462 			 * Another instance of mpathd may be already active.
2463 			 */
2464 			logerr("main: is another instance of in.mpathd "
2465 			    "already active?\n");
2466 			exit(1);
2467 		} else {
2468 			(void) close(sock);
2469 			return (-1);
2470 		}
2471 	}
2472 	if (listen(sock, 30) < 0) {
2473 		logperror("main: listen");
2474 		exit(1);
2475 	}
2476 	if (poll_add(sock) == -1) {
2477 		(void) close(sock);
2478 		exit(1);
2479 	}
2480 
2481 	return (sock);
2482 }
2483 
/*
 * Table of commands and their expected size; used by loopback_cmd().
 *
 * loopback_cmd() indexes this table directly with the received command id
 * (after checking it against MI_NCMD), so the entries presumably have to
 * stay in the same order as the MI_* command values and there must be
 * MI_NCMD of them -- confirm against the header that defines the MI_*
 * constants before adding or reordering entries.
 */
static struct {
	const char	*name;		/* command name, for log messages */
	unsigned int	size;		/* minimum number of bytes expected */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2497 
2498 /*
2499  * Commands received over the loopback interface come here. Currently
2500  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2501  * module. ifconfig only makes a connection, and closes it to check if
2502  * in.mpathd is running.
2503  * if_mpadm sends commands in the format specified by the mpathd_interface
2504  * structure.
2505  */
2506 static void
2507 loopback_cmd(int sock, int family)
2508 {
2509 	int newfd;
2510 	ssize_t len;
2511 	struct sockaddr_storage	peer;
2512 	struct sockaddr_in	*peer_sin;
2513 	struct sockaddr_in6	*peer_sin6;
2514 	socklen_t peerlen;
2515 	union mi_commands mpi;
2516 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2517 	char abuf[INET6_ADDRSTRLEN];
2518 	uint_t cmd;
2519 	int retval;
2520 
2521 	peerlen = sizeof (peer);
2522 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2523 	if (newfd < 0) {
2524 		logperror("loopback_cmd: accept");
2525 		return;
2526 	}
2527 
2528 	switch (family) {
2529 	case AF_INET:
2530 		/*
2531 		 * Validate the address and port to make sure that
2532 		 * non privileged processes don't connect and start
2533 		 * talking to us.
2534 		 */
2535 		if (peerlen != sizeof (struct sockaddr_in)) {
2536 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2537 			(void) close(newfd);
2538 			return;
2539 		}
2540 		peer_sin = (struct sockaddr_in *)&peer;
2541 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2542 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2543 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2544 			    abuf, sizeof (abuf));
2545 			logerr("Attempt to connect from addr %s port %d\n",
2546 			    abuf, ntohs(peer_sin->sin_port));
2547 			(void) close(newfd);
2548 			return;
2549 		}
2550 		break;
2551 
2552 	case AF_INET6:
2553 		if (peerlen != sizeof (struct sockaddr_in6)) {
2554 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2555 			(void) close(newfd);
2556 			return;
2557 		}
2558 		/*
2559 		 * Validate the address and port to make sure that
2560 		 * non privileged processes don't connect and start
2561 		 * talking to us.
2562 		 */
2563 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2564 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2565 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2566 		    &loopback_addr))) {
2567 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2568 			    sizeof (abuf));
2569 			logerr("Attempt to connect from addr %s port %d\n",
2570 			    abuf, ntohs(peer_sin6->sin6_port));
2571 			(void) close(newfd);
2572 			return;
2573 		}
2574 
2575 	default:
2576 		logdebug("loopback_cmd: family %d\n", family);
2577 		(void) close(newfd);
2578 		return;
2579 	}
2580 
2581 	/*
2582 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2583 	 * all supported commands
2584 	 */
2585 	len = read(newfd, &mpi, sizeof (mpi));
2586 
2587 	/*
2588 	 * ifconfig does not send any data. Just tests to see if mpathd
2589 	 * is already running.
2590 	 */
2591 	if (len <= 0) {
2592 		(void) close(newfd);
2593 		return;
2594 	}
2595 
2596 	/*
2597 	 * In theory, we can receive any sized message for a stream socket,
2598 	 * but we don't expect that to happen for a small message over a
2599 	 * loopback connection.
2600 	 */
2601 	if (len < sizeof (uint32_t)) {
2602 		logerr("loopback_cmd: bad command format or read returns "
2603 		    "partial data %d\n", len);
2604 	}
2605 
2606 	cmd = mpi.mi_command;
2607 	if (cmd >= MI_NCMD) {
2608 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2609 		(void) close(newfd);
2610 		return;
2611 	}
2612 
2613 	if (len < commands[cmd].size) {
2614 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2615 		    commands[cmd].name, commands[cmd].size, len);
2616 		(void) close(newfd);
2617 		return;
2618 	}
2619 
2620 	retval = process_cmd(newfd, &mpi);
2621 	if (retval != IPMP_SUCCESS) {
2622 		logerr("failed processing %s: %s\n", commands[cmd].name,
2623 		    ipmp_errmsg(retval));
2624 	}
2625 	(void) close(newfd);
2626 }
2627 
2628 extern int global_errno;	/* set by failover() or failback() */
2629 
2630 /*
2631  * Process the offline, undo offline and set original index commands,
2632  * received from if_mpadm(1M)
2633  */
2634 static unsigned int
2635 process_cmd(int newfd, union mi_commands *mpi)
2636 {
2637 	uint_t	nif = 0;
2638 	uint32_t cmd;
2639 	struct phyint *pi;
2640 	struct phyint *pi2;
2641 	struct phyint_group *pg;
2642 	boolean_t success;
2643 	int error;
2644 	struct mi_offline *mio;
2645 	struct mi_undo_offline *miu;
2646 	struct lifreq lifr;
2647 	int ifsock;
2648 	struct mi_setoindex *mis;
2649 
2650 	cmd = mpi->mi_command;
2651 
2652 	switch (cmd) {
2653 	case MI_OFFLINE:
2654 		mio = &mpi->mi_ocmd;
2655 		/*
2656 		 * Lookup the interface that needs to be offlined.
2657 		 * If it does not exist, return a suitable error.
2658 		 */
2659 		pi = phyint_lookup(mio->mio_ifname);
2660 		if (pi == NULL)
2661 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2662 
2663 		/*
2664 		 * Verify that the minimum redundancy requirements are met.
2665 		 * The multipathing group must have at least the specified
2666 		 * number of functional interfaces after offlining the
2667 		 * requested interface. Otherwise return a suitable error.
2668 		 */
2669 		pg = pi->pi_group;
2670 		nif = 0;
2671 		if (pg != phyint_anongroup) {
2672 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2673 			    pi2 = pi2->pi_pgnext) {
2674 				if ((pi2->pi_state == PI_RUNNING) ||
2675 				    (pg->pg_groupfailed &&
2676 				    !(pi2->pi_flags & IFF_OFFLINE)))
2677 					nif++;
2678 			}
2679 		}
2680 		if (nif < mio->mio_min_redundancy)
2681 			return (send_result(newfd, IPMP_EMINRED, 0));
2682 
2683 		/*
2684 		 * The order of operation is to set IFF_OFFLINE, followed by
2685 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2686 		 * can be created. Subsequent failover moves everything on
2687 		 * the OFFLINE interface to some other functional interface.
2688 		 */
2689 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2690 		if (success) {
2691 			if (!pi->pi_empty) {
2692 				error = try_failover(pi, FAILOVER_NORMAL);
2693 				if (error != 0) {
2694 					if (!change_lif_flags(pi, IFF_OFFLINE,
2695 					    _B_FALSE)) {
2696 						logerr("process_cmd: couldn't"
2697 						    " clear OFFLINE flag on"
2698 						    " %s\n", pi->pi_name);
2699 						/*
2700 						 * Offline interfaces should
2701 						 * not be probed.
2702 						 */
2703 						stop_probing(pi);
2704 					}
2705 					return (send_result(newfd, error,
2706 					    global_errno));
2707 				}
2708 			}
2709 		} else {
2710 			return (send_result(newfd, IPMP_FAILURE, errno));
2711 		}
2712 
2713 		/*
2714 		 * The interface is now Offline, so stop probing it.
2715 		 * Note that if_mpadm(1M) will down the test addresses,
2716 		 * after receiving a success reply from us. The routing
2717 		 * socket message will then make us close the socket used
2718 		 * for sending probes. But it is more logical that an
2719 		 * offlined interface must not be probed, even if it has
2720 		 * test addresses.
2721 		 */
2722 		stop_probing(pi);
2723 		return (send_result(newfd, IPMP_SUCCESS, 0));
2724 
2725 	case MI_UNDO_OFFLINE:
2726 		miu = &mpi->mi_ucmd;
2727 		/*
2728 		 * Undo the offline command. As usual lookup the interface.
2729 		 * Send an error if it does not exist.
2730 		 */
2731 		pi = phyint_lookup(miu->miu_ifname);
2732 		if (pi == NULL)
2733 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2734 
2735 		/*
2736 		 * Inverse of the offline operation. Do a failback, and then
2737 		 * clear the IFF_OFFLINE flag.
2738 		 */
2739 		error = do_failback(pi, _B_TRUE);
2740 		if (error == IPMP_EFBPARTIAL)
2741 			return (send_result(newfd, IPMP_EFBPARTIAL, 0));
2742 		error = do_failback(pi, _B_FALSE);
2743 
2744 		switch (error) {
2745 		case IPMP_SUCCESS:
2746 			if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) {
2747 				logdebug("undo error %X\n", global_errno);
2748 				error = IPMP_FAILURE;
2749 				break;
2750 			}
2751 			/* FALLTHROUGH */
2752 
2753 		case IPMP_EFBPARTIAL:
2754 			/*
2755 			 * Reset the state of the interface based on the
2756 			 * current link state; if this phyint subsequently
2757 			 * acquires a test address, the state will be changed
2758 			 * again later as a result of the probes.
2759 			 */
2760 			if (LINK_UP(pi))
2761 				phyint_chstate(pi, PI_RUNNING);
2762 			else
2763 				phyint_chstate(pi, PI_FAILED);
2764 			break;
2765 
2766 		case IPMP_FAILURE:
2767 			break;
2768 
2769 		default:
2770 			logdebug("do_failback: unexpected return value\n");
2771 			break;
2772 		}
2773 		return (send_result(newfd, error, global_errno));
2774 
2775 	case MI_SETOINDEX:
2776 		mis = &mpi->mi_scmd;
2777 
2778 		/* Get the socket for doing ioctls */
2779 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2780 
2781 		/*
2782 		 * Get index of new original interface.
2783 		 * The index is returned in lifr.lifr_index.
2784 		 */
2785 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2786 		    sizeof (lifr.lifr_name));
2787 
2788 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2789 			return (send_result(newfd, IPMP_FAILURE, errno));
2790 
2791 		/*
2792 		 * Set new original interface index.
2793 		 * The new index was put into lifr.lifr_index by the
2794 		 * SIOCGLIFINDEX ioctl.
2795 		 */
2796 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2797 		    sizeof (lifr.lifr_name));
2798 
2799 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2800 			return (send_result(newfd, IPMP_FAILURE, errno));
2801 
2802 		return (send_result(newfd, IPMP_SUCCESS, 0));
2803 
2804 	case MI_QUERY:
2805 		return (process_query(newfd, &mpi->mi_qcmd));
2806 
2807 	default:
2808 		break;
2809 	}
2810 
2811 	return (send_result(newfd, IPMP_EPROTO, 0));
2812 }
2813 
2814 /*
2815  * Process the query request pointed to by `miq' and send a reply on file
2816  * descriptor `fd'.  Returns an IPMP error code.
2817  */
2818 static unsigned int
2819 process_query(int fd, mi_query_t *miq)
2820 {
2821 	ipmp_groupinfo_t	*grinfop;
2822 	ipmp_groupinfolist_t	*grlp;
2823 	ipmp_grouplist_t	*grlistp;
2824 	ipmp_ifinfo_t		*ifinfop;
2825 	ipmp_ifinfolist_t	*iflp;
2826 	ipmp_snap_t		*snap;
2827 	unsigned int		retval;
2828 
2829 	switch (miq->miq_inforeq) {
2830 	case IPMP_GROUPLIST:
2831 		retval = getgrouplist(&grlistp);
2832 		if (retval != IPMP_SUCCESS)
2833 			return (send_result(fd, retval, errno));
2834 
2835 		retval = send_result(fd, IPMP_SUCCESS, 0);
2836 		if (retval == IPMP_SUCCESS)
2837 			retval = send_grouplist(fd, grlistp);
2838 
2839 		ipmp_freegrouplist(grlistp);
2840 		return (retval);
2841 
2842 	case IPMP_GROUPINFO:
2843 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2844 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2845 		if (retval != IPMP_SUCCESS)
2846 			return (send_result(fd, retval, errno));
2847 
2848 		retval = send_result(fd, IPMP_SUCCESS, 0);
2849 		if (retval == IPMP_SUCCESS)
2850 			retval = send_groupinfo(fd, grinfop);
2851 
2852 		ipmp_freegroupinfo(grinfop);
2853 		return (retval);
2854 
2855 	case IPMP_IFINFO:
2856 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2857 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2858 		if (retval != IPMP_SUCCESS)
2859 			return (send_result(fd, retval, errno));
2860 
2861 		retval = send_result(fd, IPMP_SUCCESS, 0);
2862 		if (retval == IPMP_SUCCESS)
2863 			retval = send_ifinfo(fd, ifinfop);
2864 
2865 		ipmp_freeifinfo(ifinfop);
2866 		return (retval);
2867 
2868 	case IPMP_SNAP:
2869 		retval = getsnap(&snap);
2870 		if (retval != IPMP_SUCCESS)
2871 			return (send_result(fd, retval, errno));
2872 
2873 		retval = send_result(fd, IPMP_SUCCESS, 0);
2874 		if (retval != IPMP_SUCCESS)
2875 			goto out;
2876 
2877 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2878 		if (retval != IPMP_SUCCESS)
2879 			goto out;
2880 
2881 		retval = send_grouplist(fd, snap->sn_grlistp);
2882 		if (retval != IPMP_SUCCESS)
2883 			goto out;
2884 
2885 		iflp = snap->sn_ifinfolistp;
2886 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2887 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2888 			if (retval != IPMP_SUCCESS)
2889 				goto out;
2890 		}
2891 
2892 		grlp = snap->sn_grinfolistp;
2893 		for (; grlp != NULL; grlp = grlp->grl_next) {
2894 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2895 			if (retval != IPMP_SUCCESS)
2896 				goto out;
2897 		}
2898 	out:
2899 		ipmp_snap_free(snap);
2900 		return (retval);
2901 
2902 	default:
2903 		break;
2904 
2905 	}
2906 	return (send_result(fd, IPMP_EPROTO, 0));
2907 }
2908 
2909 /*
2910  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2911  * Returns an IPMP error code.
2912  */
2913 static unsigned int
2914 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2915 {
2916 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2917 	unsigned int	retval;
2918 
2919 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2920 	if (retval != IPMP_SUCCESS)
2921 		return (retval);
2922 
2923 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2924 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2925 }
2926 
2927 /*
2928  * Send the interface information pointed to by `ifinfop' on file descriptor
2929  * `fd'.  Returns an IPMP error code.
2930  */
2931 static unsigned int
2932 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2933 {
2934 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2935 }
2936 
2937 /*
2938  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2939  * Returns an IPMP error code.
2940  */
2941 static unsigned int
2942 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2943 {
2944 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2945 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2946 }
2947 
2948 /*
2949  * Initialize an mi_result_t structure using `error' and `syserror' and
2950  * send it on file descriptor `fd'.  Returns an IPMP error code.
2951  */
2952 static unsigned int
2953 send_result(int fd, unsigned int error, int syserror)
2954 {
2955 	mi_result_t me;
2956 
2957 	me.me_mpathd_error = error;
2958 	if (error == IPMP_FAILURE)
2959 		me.me_sys_error = syserror;
2960 	else
2961 		me.me_sys_error = 0;
2962 
2963 	return (ipmp_write(fd, &me, sizeof (me)));
2964 }
2965 
2966 /*
2967  * Daemonize the process.
2968  */
2969 static boolean_t
2970 daemonize(void)
2971 {
2972 	switch (fork()) {
2973 	case -1:
2974 		return (_B_FALSE);
2975 
2976 	case  0:
2977 		/*
2978 		 * Lose our controlling terminal, and become both a session
2979 		 * leader and a process group leader.
2980 		 */
2981 		if (setsid() == -1)
2982 			return (_B_FALSE);
2983 
2984 		/*
2985 		 * Under POSIX, a session leader can accidentally (through
2986 		 * open(2)) acquire a controlling terminal if it does not
2987 		 * have one.  Just to be safe, fork() again so we are not a
2988 		 * session leader.
2989 		 */
2990 		switch (fork()) {
2991 		case -1:
2992 			return (_B_FALSE);
2993 
2994 		case 0:
2995 			(void) chdir("/");
2996 			(void) umask(022);
2997 			(void) fdwalk(closefunc, NULL);
2998 			break;
2999 
3000 		default:
3001 			_exit(EXIT_SUCCESS);
3002 		}
3003 		break;
3004 
3005 	default:
3006 		_exit(EXIT_SUCCESS);
3007 	}
3008 
3009 	return (_B_TRUE);
3010 }
3011 
3012 /*
3013  * The parent has created some fds before forking on purpose, keep them open.
3014  */
3015 static int
3016 closefunc(void *not_used, int fd)
3017 /* ARGSUSED */
3018 {
3019 	if (fd != lsock_v4 && fd != lsock_v6)
3020 		(void) close(fd);
3021 	return (0);
3022 }
3023 
3024 /* LOGGER */
3025 
3026 #include <syslog.h>
3027 
3028 /*
3029  * Logging routines.  All routines log to syslog, unless the daemon is
3030  * running in the foreground, in which case the logging goes to stderr.
3031  *
3032  * The following routines are available:
3033  *
3034  *	logdebug(): A printf-like function for outputting debug messages
3035  *	(messages at LOG_DEBUG) that are only of use to developers.
3036  *
3037  *	logtrace(): A printf-like function for outputting tracing messages
3038  *	(messages at LOG_INFO) from the daemon.	 This is typically used
3039  *	to log the receipt of interesting network-related conditions.
3040  *
3041  *	logerr(): A printf-like function for outputting error messages
3042  *	(messages at LOG_ERR) from the daemon.
3043  *
3044  *	logperror*(): A set of functions used to output error messages
3045  *	(messages at LOG_ERR); these automatically append strerror(errno)
3046  *	and a newline to the message passed to them.
3047  *
3048  * NOTE: since the logging functions write to syslog, the messages passed
3049  *	 to them are not eligible for localization.  Thus, gettext() must
3050  *	 *not* be used.
3051  */
3052 
/* Non-zero once initlog() has run; the log routines then use syslog */
static int logging = 0;

/*
 * Switch the logging routines from stderr to syslog and open the syslog
 * connection under the "in.mpathd" identity.
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
}
3061 
3062 /* PRINTFLIKE1 */
3063 void
3064 logerr(char *fmt, ...)
3065 {
3066 	va_list ap;
3067 
3068 	va_start(ap, fmt);
3069 
3070 	if (logging)
3071 		vsyslog(LOG_ERR, fmt, ap);
3072 	else
3073 		(void) vfprintf(stderr, fmt, ap);
3074 	va_end(ap);
3075 }
3076 
3077 /* PRINTFLIKE1 */
3078 void
3079 logtrace(char *fmt, ...)
3080 {
3081 	va_list ap;
3082 
3083 	va_start(ap, fmt);
3084 
3085 	if (logging)
3086 		vsyslog(LOG_INFO, fmt, ap);
3087 	else
3088 		(void) vfprintf(stderr, fmt, ap);
3089 	va_end(ap);
3090 }
3091 
3092 /* PRINTFLIKE1 */
3093 void
3094 logdebug(char *fmt, ...)
3095 {
3096 	va_list ap;
3097 
3098 	va_start(ap, fmt);
3099 
3100 	if (logging)
3101 		vsyslog(LOG_DEBUG, fmt, ap);
3102 	else
3103 		(void) vfprintf(stderr, fmt, ap);
3104 	va_end(ap);
3105 }
3106 
3107 /* PRINTFLIKE1 */
3108 void
3109 logperror(char *str)
3110 {
3111 	if (logging)
3112 		syslog(LOG_ERR, "%s: %m\n", str);
3113 	else
3114 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3115 }
3116 
3117 void
3118 logperror_pii(struct phyint_instance *pii, char *str)
3119 {
3120 	if (logging) {
3121 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3122 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3123 	} else {
3124 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3125 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3126 		    strerror(errno));
3127 	}
3128 }
3129 
3130 void
3131 logperror_li(struct logint *li, char *str)
3132 {
3133 	struct	phyint_instance	*pii = li->li_phyint_inst;
3134 
3135 	if (logging) {
3136 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3137 		    str, AF_STR(pii->pii_af), li->li_name);
3138 	} else {
3139 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3140 		    str, AF_STR(pii->pii_af), li->li_name,
3141 		    strerror(errno));
3142 	}
3143 }
3144 
3145 void
3146 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3147 {
3148 	if (polled)
3149 		(void) poll_remove(pii->pii_probe_sock);
3150 	(void) close(pii->pii_probe_sock);
3151 	pii->pii_probe_sock = -1;
3152 	pii->pii_basetime_inited = 0;
3153 }
3154