xref: /illumos-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 560f878bce5cdf0661659001415019ca5c8a01b4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
31 int debug = 0;				/* Debug flag */
32 static int pollfd_num = 0;		/* Num. of poll descriptors */
33 static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */
34 
35 					/* All times below in ms */
36 int	user_failure_detection_time;	/* user specified failure detection */
37 					/* time (fdt) */
38 int	user_probe_interval;		/* derived from user specified fdt */
39 
40 static int	rtsock_v4;		/* AF_INET routing socket */
41 static int	rtsock_v6;		/* AF_INET6 routing socket */
42 int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
43 int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
44 static int	lsock_v4;		/* Listen socket to detect mpathd */
45 static int	lsock_v6;		/* Listen socket to detect mpathd */
46 static int	mibfd = -1;		/* fd to get mib info */
47 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */
48 
49 boolean_t	full_scan_required = _B_FALSE;
50 static uint_t	last_initifs_time;	/* Time when initifs was last run */
51 static	char **argv0;			/* Saved for re-exec on SIGHUP */
52 boolean_t handle_link_notifications = _B_TRUE;
53 
54 static void	initlog(void);
55 static void	run_timeouts(void);
56 static void	initifs(void);
57 static void	check_if_removed(struct phyint_instance *pii);
58 static void	select_test_ifs(void);
59 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
60 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
61 static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
62     struct in_addr nexthop_v4);
63 static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
64     struct in6_addr nexthop_v6);
65 static void	router_add_common(int af, char *ifname,
66     struct in6_addr nexthop);
67 static void	init_router_targets();
68 static void	cleanup(void);
69 static int	setup_listener(int af);
70 static void	check_config(void);
71 static void	check_addr_unique(int af, char *name);
72 static void	init_host_targets(void);
73 static void	dup_host_targets(struct phyint_instance *desired_pii);
74 static void	loopback_cmd(int sock, int family);
75 static int	poll_remove(int fd);
76 static boolean_t daemonize(void);
77 static int	closefunc(void *, int);
78 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
79 static unsigned int process_query(int fd, mi_query_t *miq);
80 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
81 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
82 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
83 static unsigned int send_result(int fd, unsigned int error, int syserror);
84 
85 /*
86  * Return the current time in milliseconds (from an arbitrary reference)
87  * truncated to fit into an int. Truncation is ok since we are interested
88  * only in differences and not the absolute values.
89  */
90 uint_t
91 getcurrenttime(void)
92 {
93 	uint_t	cur_time;	/* In ms */
94 
95 	/*
96 	 * Use of a non-user-adjustable source of time is
97 	 * required. However millisecond precision is sufficient.
98 	 * divide by 10^6
99 	 */
100 	cur_time = (uint_t)(gethrtime() / 1000000LL);
101 	return (cur_time);
102 }
103 
104 /*
105  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
106  */
107 int
108 poll_add(int fd)
109 {
110 	int i;
111 	int new_num;
112 	struct pollfd *newfds;
113 retry:
114 	/* Check if already present */
115 	for (i = 0; i < pollfd_num; i++) {
116 		if (pollfds[i].fd == fd)
117 			return (0);
118 	}
119 	/* Check for empty spot already present */
120 	for (i = 0; i < pollfd_num; i++) {
121 		if (pollfds[i].fd == -1) {
122 			pollfds[i].fd = fd;
123 			return (0);
124 		}
125 	}
126 
127 	/* Allocate space for 32 more fds and initialize to -1 */
128 	new_num = pollfd_num + 32;
129 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
130 	if (newfds == NULL) {
131 		logperror("poll_add: realloc");
132 		return (-1);
133 	}
134 	for (i = pollfd_num; i < new_num; i++) {
135 		newfds[i].fd = -1;
136 		newfds[i].events = POLLIN;
137 	}
138 	pollfd_num = new_num;
139 	pollfds = newfds;
140 	goto retry;
141 }
142 
143 /*
144  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
145  */
146 static int
147 poll_remove(int fd)
148 {
149 	int i;
150 
151 	/* Check if already present */
152 	for (i = 0; i < pollfd_num; i++) {
153 		if (pollfds[i].fd == fd) {
154 			pollfds[i].fd = -1;
155 			return (0);
156 		}
157 	}
158 	return (-1);
159 }
160 
161 /*
162  * Extract information about the phyint instance. If the phyint instance still
163  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
164  * will use it to detect phyint instances that don't exist any longer and
165  * remove them, from our database of phyint instances.
166  * Return value:
167  *	returns true if the phyint instance exists in the kernel,
168  *	returns false otherwise
169  */
170 static boolean_t
171 pii_process(int af, char *name, struct phyint_instance **pii_p)
172 {
173 	int err;
174 	struct phyint_instance *pii;
175 	struct phyint_instance *pii_other;
176 
177 	if (debug & D_PHYINT)
178 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
179 
180 	pii = phyint_inst_lookup(af, name);
181 	if (pii == NULL) {
182 		/*
183 		 * Phyint instance does not exist in our tables,
184 		 * create new phyint instance
185 		 */
186 		pii = phyint_inst_init_from_k(af, name);
187 	} else {
188 		/* Phyint exists in our tables */
189 		err = phyint_inst_update_from_k(pii);
190 
191 		switch (err) {
192 		case PI_IOCTL_ERROR:
193 			/* Some ioctl error. don't change anything */
194 			pii->pii_in_use = 1;
195 			break;
196 
197 		case PI_GROUP_CHANGED:
198 			/*
199 			 * The phyint has changed group.
200 			 */
201 			restore_phyint(pii->pii_phyint);
202 			/* FALLTHRU */
203 
204 		case PI_IFINDEX_CHANGED:
205 			/*
206 			 * Interface index has changed. Delete and
207 			 * recreate the phyint as it is quite likely
208 			 * the interface has been unplumbed and replumbed.
209 			 */
210 			pii_other = phyint_inst_other(pii);
211 			if (pii_other != NULL)
212 				phyint_inst_delete(pii_other);
213 			phyint_inst_delete(pii);
214 			pii = phyint_inst_init_from_k(af, name);
215 			break;
216 
217 		case PI_DELETED:
218 			/* Phyint instance has disappeared from kernel */
219 			pii->pii_in_use = 0;
220 			break;
221 
222 		case PI_OK:
223 			/* Phyint instance exists and is fine */
224 			pii->pii_in_use = 1;
225 			break;
226 
227 		default:
228 			/* Unknown status */
229 			logerr("pii_process: Unknown status %d\n", err);
230 			break;
231 		}
232 	}
233 
234 	*pii_p = pii;
235 	if (pii != NULL)
236 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
237 	else
238 		return (_B_FALSE);
239 }
240 
241 /*
242  * This phyint is leaving the group. Try to restore the phyint to its
243  * initial state. Return the addresses that belong to other group members,
244  * to the group, and take back any addresses owned by this phyint
245  */
246 void
247 restore_phyint(struct phyint *pi)
248 {
249 	if (pi->pi_group == phyint_anongroup)
250 		return;
251 
252 	/*
253 	 * Move everthing to some other member in the group.
254 	 * The phyint has changed group in the kernel. But we
255 	 * have yet to do it in our tables.
256 	 */
257 	if (!pi->pi_empty)
258 		(void) try_failover(pi, FAILOVER_TO_ANY);
259 	/*
260 	 * Move all addresses owned by 'pi' back to pi, from each
261 	 * of the other members of the group
262 	 */
263 	(void) try_failback(pi, _B_FALSE);
264 }
265 
266 /*
267  * Scan all interfaces to detect changes as well as new and deleted interfaces
268  */
269 static void
270 initifs()
271 {
272 	int	n;
273 	int	af;
274 	char	*cp;
275 	char	*buf;
276 	int	numifs;
277 	struct lifnum	lifn;
278 	struct lifconf	lifc;
279 	struct lifreq	*lifr;
280 	struct logint	*li;
281 	struct phyint_instance *pii;
282 	struct phyint_instance *next_pii;
283 	char	pi_name[LIFNAMSIZ + 1];
284 	boolean_t exists;
285 	struct phyint	*pi;
286 
287 	if (debug & D_PHYINT)
288 		logdebug("initifs: Scanning interfaces\n");
289 
290 	last_initifs_time = getcurrenttime();
291 
292 	/*
293 	 * Mark the interfaces so that we can find phyints and logints
294 	 * which have disappeared from the kernel. pii_process() and
295 	 * logint_init_from_k() will set {pii,li}_in_use when they find
296 	 * the interface in the kernel. Also, clear dupaddr bit on probe
297 	 * logint. check_addr_unique() will set the dupaddr bit on the
298 	 * probe logint, if the testaddress is not unique.
299 	 */
300 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
301 		pii->pii_in_use = 0;
302 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
303 			li->li_in_use = 0;
304 			if (pii->pii_probe_logint == li)
305 				li->li_dupaddr = 0;
306 		}
307 	}
308 
309 	lifn.lifn_family = AF_UNSPEC;
310 	lifn.lifn_flags = 0;
311 	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
312 		logperror("initifs: ioctl (get interface numbers)");
313 		return;
314 	}
315 	numifs = lifn.lifn_count;
316 
317 	buf = (char *)calloc(numifs, sizeof (struct lifreq));
318 	if (buf == NULL) {
319 		logperror("initifs: calloc");
320 		return;
321 	}
322 
323 	lifc.lifc_family = AF_UNSPEC;
324 	lifc.lifc_flags = 0;
325 	lifc.lifc_len = numifs * sizeof (struct lifreq);
326 	lifc.lifc_buf = buf;
327 
328 	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
329 		/*
330 		 * EINVAL is commonly encountered, when things change
331 		 * underneath us rapidly, (eg. at boot, when new interfaces
332 		 * are plumbed successively) and the kernel finds the buffer
333 		 * size we passed as too small. We will retry again
334 		 * when we see the next routing socket msg, or at worst after
335 		 * IF_SCAN_INTERVAL ms.
336 		 */
337 		if (errno != EINVAL) {
338 			logperror("initifs: ioctl"
339 			    " (get interface configuration)");
340 		}
341 		free(buf);
342 		return;
343 	}
344 
345 	lifr = (struct lifreq *)lifc.lifc_req;
346 
347 	/*
348 	 * For each lifreq returned by SIOGGLIFCONF, call pii_process()
349 	 * and get the state of the corresponding phyint_instance. If it is
350 	 * successful, then call logint_init_from_k() to get the state of the
351 	 * logint.
352 	 */
353 	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
354 		af = lifr->lifr_addr.ss_family;
355 
356 		/*
357 		 * Need to pass a phyint name to pii_process. Insert the
358 		 * null where the ':' IF_SEPARATOR is found in the logical
359 		 * name.
360 		 */
361 		(void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name));
362 		pi_name[sizeof (pi_name) - 1] = '\0';
363 		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
364 			*cp = '\0';
365 
366 		exists = pii_process(af, pi_name, &pii);
367 		if (exists) {
368 			/* The phyint is fine. So process the logint */
369 			logint_init_from_k(pii, lifr->lifr_name);
370 		}
371 		check_addr_unique(af, lifr->lifr_name);
372 	}
373 
374 	free(buf);
375 
376 	/*
377 	 * If the test address is now unique, and if it was not unique
378 	 * previously,	clear the li_dupaddrmsg_printed flag and log a
379 	 * recovery message
380 	 */
381 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
382 		struct logint *li;
383 		char abuf[INET6_ADDRSTRLEN];
384 
385 		li = pii->pii_probe_logint;
386 		if ((li != NULL) && !li->li_dupaddr &&
387 		    li->li_dupaddrmsg_printed) {
388 			logerr("Test address %s is unique; enabling probe-"
389 			    "based failure detection\n",
390 			    pr_addr(pii->pii_af, li->li_addr, abuf,
391 				sizeof (abuf)));
392 			li->li_dupaddrmsg_printed = 0;
393 		}
394 	}
395 
396 	/*
397 	 * Scan for phyints and logints that have disappeared from the
398 	 * kernel, and delete them.
399 	 */
400 	pii = phyint_instances;
401 
402 	while (pii != NULL) {
403 		next_pii = pii->pii_next;
404 		check_if_removed(pii);
405 		pii = next_pii;
406 	}
407 
408 	/*
409 	 * Select a test address for sending probes on each phyint instance
410 	 */
411 	select_test_ifs();
412 
413 	/*
414 	 * Handle link up/down notifications from the NICs.
415 	 */
416 	process_link_state_changes();
417 
418 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
419 		/*
420 		 * If this is a case of group failure, we don't have much
421 		 * to do until the group recovers again.
422 		 */
423 		if (GROUP_FAILED(pi->pi_group))
424 			continue;
425 
426 		/*
427 		 * Try/Retry any pending failovers / failbacks, that did not
428 		 * not complete, or that could not be initiated previously.
429 		 * This implements the 3 invariants described in the big block
430 		 * comment at the beginning of probe.c
431 		 */
432 		if (pi->pi_flags & IFF_INACTIVE) {
433 			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
434 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
435 		} else {
436 			struct phyint_instance *pii;
437 
438 			pii = pi->pi_v4;
439 			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
440 				pii = pi->pi_v6;
441 			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
442 				continue;
443 			/*
444 			 * It is possible that the phyint has started
445 			 * receiving packets, after it has been marked
446 			 * PI_FAILED. Don't initiate failover, if the
447 			 * phyint has started recovering. failure_state()
448 			 * captures this check. A similar logic is used
449 			 * for failback/repair case.
450 			 */
451 			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
452 			    (failure_state(pii) == PHYINT_FAILURE)) {
453 				(void) try_failover(pi, FAILOVER_NORMAL);
454 			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
455 				if (try_failback(pi, _B_FALSE) !=
456 				    IPMP_FAILURE) {
457 					(void) change_lif_flags(pi, IFF_FAILED,
458 					    _B_FALSE);
459 					/* Per state diagram */
460 					pi->pi_empty = 0;
461 				}
462 			}
463 		}
464 	}
465 }
466 
467 /*
468  * Check that test/probe addresses are always unique. link-locals and
469  * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER)
470  * address can produce unexpected results. Log an error and alert the user.
471  */
472 static void
473 check_addr_unique(int af, char *name)
474 {
475 	struct lifreq	lifr;
476 	struct phyint	*pi;
477 	struct in6_addr	addr;
478 	struct phyint_instance	*pii;
479 	struct sockaddr_in	*sin;
480 	struct sockaddr_in6	*sin6;
481 	int ifsock;
482 	char abuf[INET6_ADDRSTRLEN];
483 
484 	/* Get the socket for doing ioctls */
485 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
486 
487 	(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
488 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
489 	/*
490 	 * Get the address corresponding to 'name'. We cannot
491 	 * do a logint lookup in our tables, because, not all logints
492 	 * in the system are tracked by mpathd. (eg. things not in a group)
493 	 */
494 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
495 		if (errno == ENXIO) {
496 			/* Interface has vanished */
497 			return;
498 		} else {
499 			logperror("ioctl (get addr)");
500 			return;
501 		}
502 	}
503 
504 	if (af == AF_INET) {
505 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
506 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
507 	} else {
508 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
509 		addr = sin6->sin6_addr;
510 	}
511 
512 	/*
513 	 * Does the address 'addr' match any known test address ? If so
514 	 * it is a duplicate, unless we are looking at the same logint
515 	 */
516 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
517 		pii = PHYINT_INSTANCE(pi, af);
518 		if (pii == NULL || pii->pii_probe_logint == NULL)
519 			continue;
520 
521 		if (!IN6_ARE_ADDR_EQUAL(&addr,
522 		    &pii->pii_probe_logint->li_addr)) {
523 			continue;
524 		}
525 
526 		if (strncmp(pii->pii_probe_logint->li_name, name,
527 		    sizeof (pii->pii_probe_logint->li_name)) == 0) {
528 			continue;
529 		}
530 
531 		/*
532 		 * This test address is not unique. Set the dupaddr bit
533 		 */
534 		pii->pii_probe_logint->li_dupaddr = 1;
535 
536 		/*
537 		 * Log an error message if not already logged
538 		 */
539 		if (pii->pii_probe_logint->li_dupaddrmsg_printed)
540 			continue;
541 
542 		logerr("Test address %s is not unique; disabling "
543 		    "probe-based failure detection\n",
544 		    pr_addr(af, addr, abuf, sizeof (abuf)));
545 
546 		pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
547 	}
548 }
549 
550 /*
551  * Stop probing an interface.  Called when an interface is offlined.
552  * The probe socket is closed on each interface instance, and the
553  * interface state set to PI_OFFLINE.
554  */
555 static void
556 stop_probing(struct phyint *pi)
557 {
558 	struct phyint_instance *pii;
559 
560 	pii = pi->pi_v4;
561 	if (pii != NULL) {
562 		if (pii->pii_probe_sock != -1)
563 			close_probe_socket(pii, _B_TRUE);
564 		pii->pii_probe_logint = NULL;
565 	}
566 
567 	pii = pi->pi_v6;
568 	if (pii != NULL) {
569 		if (pii->pii_probe_sock != -1)
570 			close_probe_socket(pii, _B_TRUE);
571 		pii->pii_probe_logint = NULL;
572 	}
573 
574 	phyint_chstate(pi, PI_OFFLINE);
575 }
576 
577 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS };
578 
579 /*
580  * Rate the provided test flags.  By definition, IFF_NOFAILOVER must be set.
581  * IFF_UP must also be set so that the associated address can be used as a
582  * source address.  Further, we must be able to exchange packets with local
583  * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear.  For historical
584  * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses.
585  */
586 static int
587 rate_testflags(uint64_t flags)
588 {
589 	if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP))
590 		return (BAD_TESTFLAGS);
591 
592 	if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0)
593 		return (BAD_TESTFLAGS);
594 
595 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED)
596 		return (BEST_TESTFLAGS);
597 
598 	if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6)
599 		return (BEST_TESTFLAGS);
600 
601 	return (OK_TESTFLAGS);
602 }
603 
604 /*
605  * Attempt to select a test address for each phyint instance.
606  * Call phyint_inst_sockinit() to complete the initializations.
607  */
608 static void
609 select_test_ifs(void)
610 {
611 	struct phyint		*pi;
612 	struct phyint_instance	*pii;
613 	struct phyint_instance	*next_pii;
614 	struct logint		*li;
615 	struct logint  		*probe_logint;
616 	boolean_t		target_scan_reqd = _B_FALSE;
617 	struct target		*tg;
618 	int			rating;
619 
620 	if (debug & D_PHYINT)
621 		logdebug("select_test_ifs\n");
622 
623 	/*
624 	 * For each phyint instance, do the test address selection
625 	 */
626 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
627 		next_pii = pii->pii_next;
628 		probe_logint = NULL;
629 
630 		/*
631 		 * An interface that is offline, should not be probed.
632 		 * Offline interfaces should always in PI_OFFLINE state,
633 		 * unless some other entity has set the offline flag.
634 		 */
635 		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
636 			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
637 				logerr("shouldn't be probing offline"
638 					" interface %s (state is: %u)."
639 					" Stopping probes.\n",
640 					pii->pii_phyint->pi_name,
641 					pii->pii_phyint->pi_state);
642 				stop_probing(pii->pii_phyint);
643 			}
644 			continue;
645 		}
646 
647 		li = pii->pii_probe_logint;
648 		if (li != NULL) {
649 			/*
650 			 * We've already got a test address; only proceed
651 			 * if it's suboptimal.
652 			 */
653 			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
654 				continue;
655 		}
656 
657 		/*
658 		 * Walk the logints of this phyint instance, and select
659 		 * the best available test address
660 		 */
661 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
662 			/*
663 			 * Skip any IPv6 logints that are not link-local,
664 			 * since we should always have a link-local address
665 			 * anyway and in6_data() expects link-local replies.
666 			 */
667 			if (pii->pii_af == AF_INET6 &&
668 			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
669 				continue;
670 
671 			/*
672 			 * Rate the testflags. If we've found an optimal
673 			 * match, then break out; otherwise, record the most
674 			 * recent OK one.
675 			 */
676 			rating = rate_testflags(li->li_flags);
677 			if (rating == BAD_TESTFLAGS)
678 				continue;
679 
680 			probe_logint = li;
681 			if (rating == BEST_TESTFLAGS)
682 				break;
683 		}
684 
685 		/*
686 		 * If the probe logint has changed, ditch the old one.
687 		 */
688 		if (pii->pii_probe_logint != NULL &&
689 		    pii->pii_probe_logint != probe_logint) {
690 			if (pii->pii_probe_sock != -1)
691 				close_probe_socket(pii, _B_TRUE);
692 			pii->pii_probe_logint = NULL;
693 		}
694 
695 		if (probe_logint == NULL) {
696 			/*
697 			 * We don't have a test address. Don't print an
698 			 * error message immediately. check_config() will
699 			 * take care of it. Zero out the probe stats array
700 			 * since it is no longer relevant. Optimize by
701 			 * checking if it is already zeroed out.
702 			 */
703 			int pr_ndx;
704 
705 			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
706 			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
707 				clear_pii_probe_stats(pii);
708 				reset_crtt_all(pii->pii_phyint);
709 			}
710 			continue;
711 		} else if (probe_logint == pii->pii_probe_logint) {
712 			/*
713 			 * If we didn't find any new test addr, go to the
714 			 * next phyint.
715 			 */
716 			continue;
717 		}
718 
719 		/*
720 		 * The phyint is either being assigned a new testaddr
721 		 * or is being assigned a testaddr for the 1st time.
722 		 * Need to initialize the phyint socket
723 		 */
724 		pii->pii_probe_logint = probe_logint;
725 		if (!phyint_inst_sockinit(pii)) {
726 			if (debug & D_PHYINT) {
727 				logdebug("select_test_ifs: "
728 				    "phyint_sockinit failed\n");
729 			}
730 			phyint_inst_delete(pii);
731 			continue;
732 		}
733 
734 		/*
735 		 * This phyint instance is now enabled for probes; this
736 		 * impacts our state machine in two ways:
737 		 *
738 		 * 1. If we're probe *capable* as well (i.e., we have
739 		 *    probe targets) and the interface is in PI_NOTARGETS,
740 		 *    then transition to PI_RUNNING.
741 		 *
742 		 * 2. If we're not probe capable, and the other phyint
743 		 *    instance is also not probe capable, and we were in
744 		 *    PI_RUNNING, then transition to PI_NOTARGETS.
745 		 *
746 		 * Also see the state diagram in mpd_probe.c.
747 		 */
748 		if (PROBE_CAPABLE(pii)) {
749 			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
750 				phyint_chstate(pii->pii_phyint, PI_RUNNING);
751 		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
752 			if (pii->pii_phyint->pi_state == PI_RUNNING)
753 				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
754 		}
755 
756 		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
757 			tg = pii->pii_targets;
758 			if (tg != NULL)
759 				target_delete(tg);
760 			assert(pii->pii_targets == NULL);
761 			assert(pii->pii_target_next == NULL);
762 			assert(pii->pii_ntargets == 0);
763 			target_create(pii, probe_logint->li_dstaddr,
764 			    _B_TRUE);
765 		}
766 
767 		/*
768 		 * If no targets are currently known for this phyint
769 		 * we need to call init_router_targets. Since
770 		 * init_router_targets() initializes the list of targets
771 		 * for all phyints it is done below the loop.
772 		 */
773 		if (pii->pii_targets == NULL)
774 			target_scan_reqd = _B_TRUE;
775 
776 		/*
777 		 * Start the probe timer for this instance.
778 		 */
779 		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
780 			start_timer(pii);
781 			pii->pii_basetime_inited = 1;
782 		}
783 	}
784 
785 	/*
786 	 * Check the interface list for any interfaces that are marked
787 	 * PI_FAILED but no longer enabled to send probes, and call
788 	 * phyint_check_for_repair() to see if the link now indicates that the
789 	 * interface should be repaired.  Also see the state diagram in
790 	 * mpd_probe.c.
791 	 */
792 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
793 		if (pi->pi_state == PI_FAILED &&
794 		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
795 			phyint_check_for_repair(pi);
796 		}
797 	}
798 
799 	/*
800 	 * Try to populate the target list. init_router_targets populates
801 	 * the target list from the routing table. If our target list is
802 	 * still empty, init_host_targets adds host targets based on the
803 	 * host target list of other phyints in the group.
804 	 */
805 	if (target_scan_reqd) {
806 		init_router_targets();
807 		init_host_targets();
808 	}
809 }
810 
811 /*
812  * Check phyint group configuration, to detect any inconsistencies,
813  * and log an error message. This is called from runtimeouts every
814  * 20 secs. But the error message is displayed once. If the
815  * consistency is resolved by the admin, a recovery message is displayed
816  * once.
817  */
818 static void
819 check_config(void)
820 {
821 	struct phyint_group *pg;
822 	struct phyint *pi;
823 	boolean_t v4_in_group;
824 	boolean_t v6_in_group;
825 
826 	/*
827 	 * All phyints of a group must be homogenous to ensure that
828 	 * failover or failback can be done. If any phyint in a group
829 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
830 	 * Do a similar check for IPv6.
831 	 */
832 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
833 		if (pg == phyint_anongroup)
834 			continue;
835 
836 		v4_in_group = _B_FALSE;
837 		v6_in_group = _B_FALSE;
838 		/*
839 		 * 1st pass. Determine if at least 1 phyint in the group
840 		 * has IPv4 plumbed and if so set v4_in_group to true.
841 		 * Repeat similarly for IPv6.
842 		 */
843 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
844 			if (pi->pi_v4 != NULL)
845 				v4_in_group = _B_TRUE;
846 			if (pi->pi_v6 != NULL)
847 				v6_in_group = _B_TRUE;
848 		}
849 
850 		/*
851 		 * 2nd pass. If v4_in_group is true, check that phyint
852 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
853 		 * out a message the 1st time only.
854 		 */
855 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
856 			if (pi->pi_flags & IFF_OFFLINE)
857 				continue;
858 
859 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
860 				if (!pi->pi_cfgmsg_printed) {
861 					logerr("NIC %s of group %s is"
862 					    " not plumbed for IPv4 and may"
863 					    " affect failover capability\n",
864 					    pi->pi_name,
865 					    pi->pi_group->pg_name);
866 					pi->pi_cfgmsg_printed = 1;
867 				}
868 			} else if (v6_in_group == _B_TRUE &&
869 			    pi->pi_v6 == NULL) {
870 				if (!pi->pi_cfgmsg_printed) {
871 					logerr("NIC %s of group %s is"
872 					    " not plumbed for IPv6 and may"
873 					    " affect failover capability\n",
874 					    pi->pi_name,
875 					    pi->pi_group->pg_name);
876 					pi->pi_cfgmsg_printed = 1;
877 				}
878 			} else {
879 				/*
880 				 * The phyint matches the group configuration,
881 				 * if we have reached this point. If it was
882 				 * improperly configured earlier, log an
883 				 * error recovery message
884 				 */
885 				if (pi->pi_cfgmsg_printed) {
886 					logerr("NIC %s is now consistent with "
887 					    "group %s and failover capability "
888 					    "is restored\n", pi->pi_name,
889 					    pi->pi_group->pg_name);
890 					pi->pi_cfgmsg_printed = 0;
891 				}
892 			}
893 
894 		}
895 	}
896 
897 	/*
898 	 * In order to perform probe-based failure detection, a phyint must
899 	 * have at least 1 test/probe address for sending and receiving probes
900 	 * (either on IPv4 or IPv6 instance or both).  If no test address has
901 	 * been configured, notify the administrator, but continue on since we
902 	 * can still perform load spreading, along with "link up/down" based
903 	 * failure detection.
904 	 */
905 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
906 		if (pi->pi_flags & IFF_OFFLINE)
907 			continue;
908 
909 		if ((pi->pi_v4 == NULL ||
910 		    pi->pi_v4->pii_probe_logint == NULL) &&
911 		    (pi->pi_v6 == NULL ||
912 		    pi->pi_v6->pii_probe_logint == NULL)) {
913 			if (!pi->pi_taddrmsg_printed) {
914 				logerr("No test address configured on "
915 				    "interface %s; disabling probe-based "
916 				    "failure detection on it\n", pi->pi_name);
917 				pi->pi_taddrmsg_printed = 1;
918 			}
919 		} else if (pi->pi_taddrmsg_printed) {
920 			logerr("Test address now configured on interface %s; "
921 			    "enabling probe-based failure detection on it\n",
922 			    pi->pi_name);
923 			pi->pi_taddrmsg_printed = 0;
924 		}
925 
926 	}
927 }
928 
929 /*
930  * Timer mechanism using relative time (in milliseconds) from the
931  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
932  * will fire after TIMER_INFINITY milliseconds.
933  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
934  * time values. Hence 2 consecutive timer events cannot be spaced farther
935  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
936  * that can be passed for the delay parameter of timer_schedule()
937  */
938 static uint_t timer_next;	/* Currently scheduled timeout */
939 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
940 
941 static void
942 timer_init(void)
943 {
944 	timer_next = getcurrenttime() + TIMER_INFINITY;
945 	/*
946 	 * The call to run_timeouts() will get the timer started
947 	 * Since there are no phyints at this point, the timer will
948 	 * be set for IF_SCAN_INTERVAL ms.
949 	 */
950 	run_timeouts();
951 }
952 
953 /*
954  * Make sure the next SIGALRM occurs delay milliseconds from the current
955  * time if not earlier. We are interested only in time differences.
956  */
957 void
958 timer_schedule(uint_t delay)
959 {
960 	uint_t now;
961 	struct itimerval itimerval;
962 
963 	if (debug & D_TIMER)
964 		logdebug("timer_schedule(%u)\n", delay);
965 
966 	assert(delay <= TIMER_INFINITY);
967 
968 	now = getcurrenttime();
969 	if (delay == 0) {
970 		/* Minimum allowed delay */
971 		delay = 1;
972 	}
973 	/* Will this timer occur before the currently scheduled SIGALRM? */
974 	if (timer_active && TIME_GE(now + delay, timer_next)) {
975 		if (debug & D_TIMER) {
976 			logdebug("timer_schedule(%u) - no action: "
977 			    "now %u next %u\n", delay, now, timer_next);
978 		}
979 		return;
980 	}
981 	timer_next = now + delay;
982 
983 	itimerval.it_value.tv_sec = delay / 1000;
984 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
985 	itimerval.it_interval.tv_sec = 0;
986 	itimerval.it_interval.tv_usec = 0;
987 	if (debug & D_TIMER) {
988 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
989 		    delay, itimerval.it_value.tv_sec,
990 		    itimerval.it_value.tv_usec);
991 	}
992 	timer_active = _B_TRUE;
993 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
994 		logperror("timer_schedule: setitimer");
995 		exit(2);
996 	}
997 }
998 
999 /*
1000  * Timer has fired. Determine when the next timer event will occur by asking
1001  * all the timer routines. Should not be called from a timer routine.
1002  */
1003 static void
1004 run_timeouts(void)
1005 {
1006 	uint_t next;
1007 	uint_t next_event_time;
1008 	struct phyint_instance *pii;
1009 	struct phyint_instance *next_pii;
1010 	static boolean_t timeout_running;
1011 
1012 	/* assert that recursive timeouts don't happen. */
1013 	assert(!timeout_running);
1014 
1015 	timeout_running = _B_TRUE;
1016 
1017 	if (debug & D_TIMER)
1018 		logdebug("run_timeouts()\n");
1019 
1020 	next = TIMER_INFINITY;
1021 
1022 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1023 		next_pii = pii->pii_next;
1024 		next_event_time = phyint_inst_timer(pii);
1025 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1026 			next = next_event_time;
1027 
1028 		if (debug & D_TIMER) {
1029 			logdebug("run_timeouts(%s %s): next scheduled for"
1030 			    " this phyint inst %u, next scheduled global"
1031 			    " %u ms\n",
1032 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1033 			    next_event_time, next);
1034 		}
1035 	}
1036 
1037 	/*
1038 	 * Make sure initifs() is called at least once every
1039 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1040 	 * with the kernel, in case we have missed any routing
1041 	 * socket messages.
1042 	 */
1043 	if (next > IF_SCAN_INTERVAL)
1044 		next = IF_SCAN_INTERVAL;
1045 
1046 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1047 		initifs();
1048 		check_config();
1049 	}
1050 
1051 	if (debug & D_TIMER)
1052 		logdebug("run_timeouts: %u ms\n", next);
1053 
1054 	timer_schedule(next);
1055 	timeout_running = _B_FALSE;
1056 }
1057 
/*
 * Pipe used for synchronous signal delivery. setup_eventpipe() creates it;
 * sig_handler() writes signal numbers to eventpipe_write. Both ends are -1
 * until the pipe exists.
 */
static int eventpipe_read = -1;
static int eventpipe_write = -1;
/*
 * Set by cleanup() before it closes the pipe; sig_handler() checks it and
 * doesn't write to the eventpipe once cleanup has begun.
 */
static boolean_t cleanup_started = _B_FALSE;
1062 /*
1063  * Ensure that signals are processed synchronously with the rest of
1064  * the code by just writing a one character signal number on the pipe.
1065  * The poll loop will pick this up and process the signal event.
1066  */
1067 static void
1068 sig_handler(int signo)
1069 {
1070 	uchar_t buf = (uchar_t)signo;
1071 
1072 	/*
1073 	 * Don't write to pipe if cleanup has already begun. cleanup()
1074 	 * might have closed the pipe already
1075 	 */
1076 	if (cleanup_started)
1077 		return;
1078 
1079 	if (eventpipe_write == -1) {
1080 		logerr("sig_handler: no pipe found\n");
1081 		return;
1082 	}
1083 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1084 		logperror("sig_handler: write");
1085 }
1086 
1087 extern struct probes_missed probes_missed;
1088 
1089 /*
1090  * Pick up a signal "byte" from the pipe and process it.
1091  */
1092 static void
1093 in_signal(int fd)
1094 {
1095 	uchar_t buf;
1096 	uint64_t  sent, acked, lost, unacked, unknown;
1097 	struct phyint_instance *pii;
1098 	int pr_ndx;
1099 
1100 	switch (read(fd, &buf, sizeof (buf))) {
1101 	case -1:
1102 		logperror("in_signal: read");
1103 		exit(1);
1104 		/* NOTREACHED */
1105 	case 1:
1106 		break;
1107 	case 0:
1108 		logerr("in_signal: read end of file\n");
1109 		exit(1);
1110 		/* NOTREACHED */
1111 	default:
1112 		logerr("in_signal: read > 1\n");
1113 		exit(1);
1114 	}
1115 
1116 	if (debug & D_TIMER)
1117 		logdebug("in_signal() got %d\n", buf);
1118 
1119 	switch (buf) {
1120 	case SIGALRM:
1121 		if (debug & D_TIMER) {
1122 			uint_t now = getcurrenttime();
1123 
1124 			logdebug("in_signal(SIGALRM) delta %u\n",
1125 			    now - timer_next);
1126 		}
1127 		timer_active = _B_FALSE;
1128 		run_timeouts();
1129 		break;
1130 	case SIGUSR1:
1131 		logdebug("Printing configuration:\n");
1132 		/* Print out the internal tables */
1133 		phyint_inst_print_all();
1134 
1135 		/*
1136 		 * Print out the accumulated statistics about missed
1137 		 * probes (happens due to scheduling delay).
1138 		 */
1139 		logerr("Missed sending total of %d probes spread over"
1140 		    " %d occurrences\n", probes_missed.pm_nprobes,
1141 		    probes_missed.pm_ntimes);
1142 
1143 		/*
1144 		 * Print out the accumulated statistics about probes
1145 		 * that were sent.
1146 		 */
1147 		for (pii = phyint_instances; pii != NULL;
1148 		    pii = pii->pii_next) {
1149 			unacked = 0;
1150 			acked = pii->pii_cum_stats.acked;
1151 			lost = pii->pii_cum_stats.lost;
1152 			sent = pii->pii_cum_stats.sent;
1153 			unknown = pii->pii_cum_stats.unknown;
1154 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1155 				switch (pii->pii_probes[pr_ndx].pr_status) {
1156 				case PR_ACKED:
1157 					acked++;
1158 					break;
1159 				case PR_LOST:
1160 					lost++;
1161 					break;
1162 				case PR_UNACKED:
1163 					unacked++;
1164 					break;
1165 				}
1166 			}
1167 			logerr("\nProbe stats on (%s %s)\n"
1168 			    "Number of probes sent %lld\n"
1169 			    "Number of probe acks received %lld\n"
1170 			    "Number of probes/acks lost %lld\n"
1171 			    "Number of valid unacknowled probes %lld\n"
1172 			    "Number of ambiguous probe acks received %lld\n",
1173 			    AF_STR(pii->pii_af), pii->pii_name,
1174 			    sent, acked, lost, unacked, unknown);
1175 		}
1176 		break;
1177 	case SIGHUP:
1178 		logerr("SIGHUP: restart and reread config file\n");
1179 		cleanup();
1180 		(void) execv(argv0[0], argv0);
1181 		_exit(0177);
1182 		/* NOTREACHED */
1183 	case SIGINT:
1184 	case SIGTERM:
1185 	case SIGQUIT:
1186 		cleanup();
1187 		exit(0);
1188 		/* NOTREACHED */
1189 	default:
1190 		logerr("in_signal: unknown signal: %d\n", buf);
1191 	}
1192 }
1193 
1194 static void
1195 cleanup(void)
1196 {
1197 	struct phyint_instance *pii;
1198 	struct phyint_instance *next_pii;
1199 
1200 	/*
1201 	 * Make sure that we don't write to eventpipe in
1202 	 * sig_handler() if any signal notably SIGALRM,
1203 	 * occurs after we close the eventpipe descriptor below
1204 	 */
1205 	cleanup_started = _B_TRUE;
1206 
1207 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1208 		next_pii = pii->pii_next;
1209 		phyint_inst_delete(pii);
1210 	}
1211 
1212 	(void) close(ifsock_v4);
1213 	(void) close(ifsock_v6);
1214 	(void) close(rtsock_v4);
1215 	(void) close(rtsock_v6);
1216 	(void) close(lsock_v4);
1217 	(void) close(lsock_v6);
1218 	(void) close(0);
1219 	(void) close(1);
1220 	(void) close(2);
1221 	(void) close(mibfd);
1222 	(void) close(eventpipe_read);
1223 	(void) close(eventpipe_write);
1224 }
1225 
1226 /*
1227  * Create pipe for signal delivery and set up signal handlers.
1228  */
1229 static void
1230 setup_eventpipe(void)
1231 {
1232 	int fds[2];
1233 	struct sigaction act;
1234 
1235 	if ((pipe(fds)) < 0) {
1236 		logperror("setup_eventpipe: pipe");
1237 		exit(1);
1238 	}
1239 	eventpipe_read = fds[0];
1240 	eventpipe_write = fds[1];
1241 	if (poll_add(eventpipe_read) == -1) {
1242 		exit(1);
1243 	}
1244 
1245 	act.sa_handler = sig_handler;
1246 	act.sa_flags = SA_RESTART;
1247 	(void) sigaction(SIGALRM, &act, NULL);
1248 
1249 	(void) sigset(SIGHUP, sig_handler);
1250 	(void) sigset(SIGUSR1, sig_handler);
1251 	(void) sigset(SIGTERM, sig_handler);
1252 	(void) sigset(SIGINT, sig_handler);
1253 	(void) sigset(SIGQUIT, sig_handler);
1254 }
1255 
/*
 * Create a routing socket for receiving RTM_IFINFO messages.
 *
 * The socket is opened for address family `af', switched to non-blocking
 * mode and added to the poll set. The new descriptor is returned; any
 * failure along the way is fatal (exit 1).
 */
static int
setup_rtsock(int af)
{
	int	rtfd;
	int	fl;

	rtfd = socket(PF_ROUTE, SOCK_RAW, af);
	if (rtfd == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	/* Fetch the current file status flags and add O_NONBLOCK. */
	fl = fcntl(rtfd, F_GETFL, 0);
	if (fl < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		goto fatal;
	}
	if (fcntl(rtfd, F_SETFL, fl | O_NONBLOCK) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		goto fatal;
	}

	if (poll_add(rtfd) == -1)
		goto fatal;

	return (rtfd);

fatal:
	/* The error, if any, has been logged already. */
	(void) close(rtfd);
	exit(1);
	/* NOTREACHED */
}
1286 
/*
 * Process an RTM_IFINFO message received on a routing socket.
 *
 * `ifm' is the message and `type' is the address family (AF_INET or
 * AF_INET6) that selects which instance of the phyint the message is
 * accounted to.
 *
 * The return value indicates whether a full interface scan is required.
 * Link up/down notifications from the NICs are reflected in the
 * IFF_RUNNING flag.
 * If just the state of the IFF_RUNNING interface flag has changed, a
 * full interface scan isn't required and _B_FALSE is returned. In every
 * other case, including an unknown phyint or a missing instance, _B_TRUE
 * is returned.
 */
static boolean_t
process_rtm_ifinfo(if_msghdr_t *ifm, int type)
{
	struct sockaddr_dl *sdl;
	struct phyint *pi;
	uint64_t old_flags;
	struct phyint_instance *pii;

	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);

	/*
	 * The sockaddr_dl structure comes directly after the if_msghdr_t
	 * structure. However, at the time of writing, the size of the
	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
	 * to the presence of a timeval structure, which contains longs,
	 * in the if_data structure.  Anyway, we know where the message ends,
	 * so we work backwards to get the start of the sockaddr_dl structure.
	 */
	/*LINTED*/
	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
		sizeof (struct sockaddr_dl));

	assert(sdl->sdl_family == AF_LINK);

	/*
	 * The interface name is in sdl_data.
	 * RTM_IFINFO messages are only generated for logical interface
	 * zero, so there is no colon and logical interface number to
	 * strip from the name.	 The name is not null terminated, but
	 * there should be enough space in sdl_data to add the null.
	 * If there isn't, give up and ask for a full scan.
	 */
	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: "
				"phyint name too long\n");
		return (_B_TRUE);
	}
	sdl->sdl_data[sdl->sdl_nlen] = 0;

	pi = phyint_lookup(sdl->sdl_data);
	if (pi == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint lookup failed"
				" for %s\n", sdl->sdl_data);
		return (_B_TRUE);
	}

	/*
	 * We want to try and avoid doing a full interface scan for
	 * link state notifications from the NICs, as indicated
	 * by the state of the IFF_RUNNING flag.  If just the
	 * IFF_RUNNING flag has changed state, the link state changes
	 * are processed without a full scan.
	 * If there is both an IPv4 and IPv6 instance associated with
	 * the physical interface, we will get an RTM_IFINFO message
	 * for each instance.  If we just maintained a single copy of
	 * the physical interface flags, it would appear that no flags
	 * had changed when the second message is processed, leading us
	 * to believe that the message wasn't generated by a flags change,
	 * and that a full interface scan is required.
	 * To get around this problem, two additional copies of the flags
	 * are kept, one copy for each instance.  These are only used in
	 * this routine.  At any one time, all three copies of the flags
	 * should be identical except for the IFF_RUNNING flag.	 The
	 * copy of the flags in the "phyint" structure is always up to
	 * date.
	 */
	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
	if (pii == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: no instance of address "
			    "family %s for %s\n", AF_STR(type), pi->pi_name);
		return (_B_TRUE);
	}

	/*
	 * Remember this instance's previous view of the flags, then bring
	 * both the instance copy and the phyint copy up to date.
	 */
	old_flags = pii->pii_flags;
	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
	pi->pi_flags = pii->pii_flags;

	if (debug & D_LINKNOTE) {
		logdebug("process_rtm_ifinfo: %s address family: %s, "
		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
		    AF_STR(type), old_flags, pi->pi_flags);
	}

	/*
	 * If IFF_STANDBY has changed, indicate that the interface has changed
	 * types.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
		phyint_newtype(pi);

	/*
	 * If IFF_INACTIVE has been set, then no data addresses should be
	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
	 * move previously failed-over addresses back to it, provided it is
	 * not failed.	For details, see the state diagram in mpd_probe.c.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
		if (pii->pii_flags & IFF_INACTIVE) {
			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				pi->pi_empty = 0;
				(void) try_failback(pi, _B_FALSE);
			}
		}
	}

	/*
	 * Has just the IFF_RUNNING flag changed state ?
	 * Note that no change at all also takes the full scan path below.
	 */
	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
		struct phyint_instance *pii_other;
		/*
		 * It wasn't just a link state change.	Update
		 * the other instance's copy of the flags.
		 */
		pii_other = phyint_inst_other(pii);
		if (pii_other != NULL)
			pii_other->pii_flags = pii->pii_flags;
		return (_B_TRUE);
	}

	return (_B_FALSE);
}
1420 
1421 /*
1422  * Retrieve as many routing socket messages as possible, and try to
1423  * empty the routing sockets. Initiate full scan of targets or interfaces
1424  * as needed.
1425  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1426  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1427  */
1428 static void
1429 process_rtsock(int rtsock_v4, int rtsock_v6)
1430 {
1431 	int	nbytes;
1432 	int64_t msg[2048 / 8];
1433 	struct rt_msghdr *rtm;
1434 	boolean_t need_if_scan = _B_FALSE;
1435 	boolean_t need_rt_scan = _B_FALSE;
1436 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1437 	int type;
1438 
1439 	/* Read as many messages as possible and try to empty the sockets */
1440 	for (type = AF_INET; ; type = AF_INET6) {
1441 		for (;;) {
1442 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1443 				rtsock_v6, msg, sizeof (msg));
1444 			if (nbytes <= 0) {
1445 				/* No more messages */
1446 				break;
1447 			}
1448 			rtm = (struct rt_msghdr *)msg;
1449 			if (rtm->rtm_version != RTM_VERSION) {
1450 				logerr("process_rtsock: version %d "
1451 				    "not understood\n", rtm->rtm_version);
1452 				break;
1453 			}
1454 
1455 			if (debug & D_PHYINT) {
1456 				logdebug("process_rtsock: message %d\n",
1457 				    rtm->rtm_type);
1458 			}
1459 
1460 			switch (rtm->rtm_type) {
1461 			case RTM_NEWADDR:
1462 			case RTM_DELADDR:
1463 				/*
1464 				 * Some logical interface has changed,
1465 				 * have to scan everything to determine
1466 				 * what actually changed.
1467 				 */
1468 				need_if_scan = _B_TRUE;
1469 				break;
1470 
1471 			case RTM_IFINFO:
1472 				rtm_ifinfo_seen = _B_TRUE;
1473 				need_if_scan |=
1474 					process_rtm_ifinfo((if_msghdr_t *)rtm,
1475 					type);
1476 				break;
1477 
1478 			case RTM_ADD:
1479 			case RTM_DELETE:
1480 			case RTM_CHANGE:
1481 			case RTM_OLDADD:
1482 			case RTM_OLDDEL:
1483 				need_rt_scan = _B_TRUE;
1484 				break;
1485 
1486 			default:
1487 				/* Not interesting */
1488 				break;
1489 			}
1490 		}
1491 		if (type == AF_INET6)
1492 			break;
1493 	}
1494 
1495 	if (need_if_scan) {
1496 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1497 			logdebug("process_rtsock: synchronizing with kernel\n");
1498 		initifs();
1499 	} else if (rtm_ifinfo_seen) {
1500 		if (debug & D_LINKNOTE)
1501 			logdebug("process_rtsock: "
1502 			    "link up/down notification(s) seen\n");
1503 		process_link_state_changes();
1504 	}
1505 
1506 	if (need_rt_scan)
1507 		init_router_targets();
1508 }
1509 
1510 /*
1511  * Look if the phyint instance or one of its logints have been removed from
1512  * the kernel and take appropriate action.
1513  * Uses {pii,li}_in_use.
1514  */
1515 static void
1516 check_if_removed(struct phyint_instance *pii)
1517 {
1518 	struct logint *li;
1519 	struct logint *next_li;
1520 
1521 	/* Detect phyints that have been removed from the kernel. */
1522 	if (!pii->pii_in_use) {
1523 		logtrace("%s %s has been removed from kernel\n",
1524 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1525 		phyint_inst_delete(pii);
1526 	} else {
1527 		/* Detect logints that have been removed. */
1528 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1529 			next_li = li->li_next;
1530 			if (!li->li_in_use) {
1531 				logint_delete(li);
1532 			}
1533 		}
1534 	}
1535 }
1536 
/*
 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
 * tables defined by mib2.h. Parse the returned data and extract
 * the 'routing' information table. Process the 'routing' table
 * to get the list of known onlink routers, and update our database.
 * These onlink routers will serve as our probe targets.
 *
 * `fd' is the descriptor the request is sent on and the replies are read
 * from.
 *
 * Returns false, if any system calls resulted in errors, true otherwise.
 * False is also returned for a short, malformed or unexpected reply and
 * when memory for a table cannot be allocated.
 */
static boolean_t
update_router_list(int fd)
{
	/* Control buffer, shared by the request and all of the replies */
	union {
		char	ubuf[1024];
		union T_primitives uprim;
	} buf;

	int			flags;
	struct strbuf		ctlbuf;
	struct strbuf		databuf;
	struct T_optmgmt_req	*tor;
	struct T_optmgmt_ack	*toa;
	struct T_error_ack	*tea;
	struct opthdr		*optp;
	struct opthdr		*req;
	int			status;
	t_scalar_t		prim;

	/* Build the request: a T_optmgmt_req followed by one opthdr */
	tor = (struct T_optmgmt_req *)&buf;

	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
	tor->OPT_offset = sizeof (struct T_optmgmt_req);
	tor->OPT_length = sizeof (struct opthdr);
	tor->MGMT_flags = T_CURRENT;

	req = (struct opthdr *)&tor[1];
	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
	req->name  = 0;
	req->len   = 0;

	ctlbuf.buf = (char *)&buf;
	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
	ctlbuf.maxlen = sizeof (buf);
	flags = 0;
	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
		logperror("update_router_list: putmsg(ctl)");
		return (_B_FALSE);
	}

	/*
	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
	 * a control and data part. The control part contains a struct
	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
	 * the level, name and length of the data in the data part. The
	 * data part contains the actual table data. The last message
	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
	 * single option with zero optlen.
	 */

	for (;;) {
		/*
		 * Go around this loop once for each table. Ignore
		 * all tables except the routing information table.
		 * Only the control part is read here; the data part, if
		 * any, is fetched separately further down.
		 */
		flags = 0;
		status = getmsg(fd, &ctlbuf, NULL, &flags);
		if (status < 0) {
			/* Interrupted: simply try the read again */
			if (errno == EINTR)
				continue;
			logperror("update_router_list: getmsg(ctl)");
			return (_B_FALSE);
		}
		/* Need at least the primitive type to make sense of it */
		if (ctlbuf.len < sizeof (t_scalar_t)) {
			logerr("update_router_list: ctlbuf.len %d\n",
			    ctlbuf.len);
			return (_B_FALSE);
		}

		prim = buf.uprim.type;

		switch (prim) {

		case T_ERROR_ACK:
			tea = &buf.uprim.error_ack;
			if (ctlbuf.len < sizeof (struct T_error_ack)) {
				logerr("update_router_list: T_ERROR_ACK"
				    " ctlbuf.len %d\n", ctlbuf.len);
				return (_B_FALSE);
			}
			logerr("update_router_list: T_ERROR_ACK:"
			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
			    tea->TLI_error, tea->UNIX_error);
			return (_B_FALSE);

		case T_OPTMGMT_ACK:
			toa = &buf.uprim.optmgmt_ack;
			/* The opthdr directly follows the T_optmgmt_ack */
			optp = (struct opthdr *)&toa[1];
			/*
			 * NOTE(review): only the T_optmgmt_ack itself is
			 * length checked here, yet optp is dereferenced in
			 * the MOREDATA case below -- confirm the opthdr is
			 * always present in that case.
			 */
			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
				logerr("update_router_list: ctlbuf.len %d\n",
				    ctlbuf.len);
				return (_B_FALSE);
			}
			if (toa->MGMT_flags != T_SUCCESS) {
				logerr("update_router_list: MGMT_flags 0x%lx\n",
				    toa->MGMT_flags);
				return (_B_FALSE);
			}
			break;

		default:
			logerr("update_router_list: unknown primitive %ld\n",
			    prim);
			return (_B_FALSE);
		}

		/* Process the T_OPTMGMT_ACK below */
		assert(prim == T_OPTMGMT_ACK);

		switch (status) {
		case 0:
			/*
			 * We have reached the end of this T_OPTMGMT_ACK
			 * message. If this is the last message i.e EOD,
			 * return, else process the next T_OPTMGMT_ACK msg.
			 */
			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
			    sizeof (struct opthdr)) && optp->len == 0 &&
			    optp->name == 0 && optp->level == 0) {
				/*
				 * This is the EOD message. Return
				 */
				return (_B_TRUE);
			}
			continue;

		case MORECTL:
		case MORECTL | MOREDATA:
			/*
			 * This should not happen. We should be able to read
			 * the control portion in a single getmsg.
			 */
			logerr("update_router_list: MORECTL\n");
			return (_B_FALSE);

		case MOREDATA:
			/*
			 * The table itself is still waiting to be read.
			 * Size the data buffer from the length announced
			 * in the opthdr.
			 */
			databuf.maxlen = optp->len;
			/* malloc of 0 bytes is ok */
			databuf.buf = malloc((size_t)optp->len);
			if (databuf.maxlen != 0 && databuf.buf == NULL) {
				logperror("update_router_list: malloc");
				return (_B_FALSE);
			}
			databuf.len = 0;
			flags = 0;
			/* Read the data part, retrying if interrupted */
			for (;;) {
				status = getmsg(fd, NULL, &databuf, &flags);
				if (status >= 0) {
					break;
				} else if (errno == EINTR) {
					continue;
				} else {
					logperror("update_router_list:"
					    " getmsg(data)");
					free(databuf.buf);
					return (_B_FALSE);
				}
			}

			/* Only the two routing tables are of interest */
			if (optp->level == MIB2_IP &&
			    optp->name == MIB2_IP_ROUTE) {
				/* LINTED */
				ire_process_v4((mib2_ipRouteEntry_t *)
				    databuf.buf, databuf.len);
			} else if (optp->level == MIB2_IP6 &&
			    optp->name == MIB2_IP6_ROUTE) {
				/* LINTED */
				ire_process_v6((mib2_ipv6RouteEntry_t *)
				    databuf.buf, databuf.len);
			}
			free(databuf.buf);
		}
	}
	/* NOTREACHED */
}
1721 
1722 /*
1723  * Examine the IPv4 routing table, for default routers. For each default
1724  * router, populate the list of targets of each phyint that is on the same
1725  * link as the default router
1726  */
1727 static void
1728 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1729 {
1730 	mib2_ipRouteEntry_t	*rp;
1731 	mib2_ipRouteEntry_t	*rp1;
1732 	struct	in_addr		nexthop_v4;
1733 	mib2_ipRouteEntry_t	*endp;
1734 
1735 	if (len == 0)
1736 		return;
1737 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1738 
1739 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1740 
1741 	/*
1742 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1743 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1744 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1745 	 * This is a potential target for probing, which we try to add
1746 	 * to the list of probe targets.
1747 	 */
1748 	for (rp = buf; rp < endp; rp++) {
1749 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1750 			continue;
1751 
1752 		/*  Get the nexthop address. */
1753 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1754 
1755 		/*
1756 		 * Get the nexthop address. Then determine the outgoing
1757 		 * interface, by examining all interface IREs, and picking the
1758 		 * match. We don't look at the interface specified in the route
1759 		 * because we need to add the router target on all matching
1760 		 * interfaces anyway; the goal is to avoid falling back to
1761 		 * multicast when some interfaces are in the same subnet but
1762 		 * not in the same group.
1763 		 */
1764 		for (rp1 = buf; rp1 < endp; rp1++) {
1765 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1766 				continue;
1767 			}
1768 
1769 			/*
1770 			 * Determine the interface IRE that matches the nexthop.
1771 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1772 			 */
1773 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1774 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1775 				/*
1776 				 * We found the interface ire
1777 				 */
1778 				router_add_v4(rp1, nexthop_v4);
1779 			}
1780 		}
1781 	}
1782 }
1783 
1784 void
1785 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1786 {
1787 	char *cp;
1788 	char ifname[LIFNAMSIZ + 1];
1789 	struct in6_addr	nexthop;
1790 	int len;
1791 
1792 	if (debug & D_TARGET)
1793 		logdebug("router_add_v4()\n");
1794 
1795 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1796 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1797 	ifname[len] = '\0';
1798 
1799 	if (ifname[0] == '\0')
1800 		return;
1801 
1802 	cp = strchr(ifname, IF_SEPARATOR);
1803 	if (cp != NULL)
1804 		*cp = '\0';
1805 
1806 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1807 	router_add_common(AF_INET, ifname, nexthop);
1808 }
1809 
1810 void
1811 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1812 {
1813 	struct phyint_instance *pii;
1814 	struct phyint *pi;
1815 
1816 	if (debug & D_TARGET)
1817 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1818 
1819 	/*
1820 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1821 	 */
1822 	pii = phyint_inst_lookup(af, ifname);
1823 	if (pii == NULL)
1824 		return;
1825 
1826 	/*
1827 	 * Don't use our own addresses as targets.
1828 	 */
1829 	if (own_address(pii->pii_af, nexthop))
1830 		return;
1831 
1832 	/*
1833 	 * If the phyint is part a named group, then add the address to all
1834 	 * members of the group; note that this is suboptimal in the IPv4 case
1835 	 * as it has already been added to all matching interfaces in
1836 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1837 	 * itself, since other phyints in the anongroup may not be on the same
1838 	 * subnet.
1839 	 */
1840 	pi = pii->pii_phyint;
1841 	if (pi->pi_group == phyint_anongroup) {
1842 		target_add(pii, nexthop, _B_TRUE);
1843 	} else {
1844 		pi = pi->pi_group->pg_phyint;
1845 		for (; pi != NULL; pi = pi->pi_pgnext)
1846 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1847 	}
1848 }
1849 
1850 /*
1851  * Examine the IPv6 routing table, for default routers. For each default
1852  * router, populate the list of targets of each phyint that is on the same
1853  * link as the default router
1854  */
1855 static void
1856 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1857 {
1858 	mib2_ipv6RouteEntry_t	*rp;
1859 	mib2_ipv6RouteEntry_t	*endp;
1860 	struct	in6_addr nexthop_v6;
1861 
1862 	if (debug & D_TARGET)
1863 		logdebug("ire_process_v6(len %d)\n", len);
1864 
1865 	if (len == 0)
1866 		return;
1867 
1868 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1869 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1870 
1871 	/*
1872 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1873 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1874 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1875 	 * This is a potential target for probing, which we try to add
1876 	 * to the list of probe targets.
1877 	 */
1878 	for (rp = buf; rp < endp; rp++) {
1879 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1880 			continue;
1881 
1882 		/*
1883 		 * We have the outgoing interface in ipv6RouteIfIndex
1884 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1885 		 * interface must be present for link-local addresses. Since
1886 		 * we use only link-local addreses for probing, we don't
1887 		 * consider the case when the outgoing interface is not
1888 		 * known and we need to scan interface ires
1889 		 */
1890 		nexthop_v6 = rp->ipv6RouteNextHop;
1891 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1892 			/*
1893 			 * We already have the outgoing interface
1894 			 * in ipv6RouteIfIndex.
1895 			 */
1896 			router_add_v6(rp, nexthop_v6);
1897 		}
1898 	}
1899 }
1900 
1901 
1902 void
1903 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1904 {
1905 	char ifname[LIFNAMSIZ + 1];
1906 	char *cp;
1907 	int  len;
1908 
1909 	if (debug & D_TARGET)
1910 		logdebug("router_add_v6()\n");
1911 
1912 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1913 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1914 	ifname[len] = '\0';
1915 
1916 	if (ifname[0] == '\0')
1917 		return;
1918 
1919 	cp = strchr(ifname, IF_SEPARATOR);
1920 	if (cp != NULL)
1921 		*cp = '\0';
1922 
1923 	router_add_common(AF_INET6, ifname, nexthop_v6);
1924 }
1925 
1926 
1927 
1928 /*
1929  * Build a list of target routers, by scanning the routing tables.
1930  * It is assumed that interface routes exist, to reach the routers.
1931  */
1932 static void
1933 init_router_targets(void)
1934 {
1935 	struct	target *tg;
1936 	struct	target *next_tg;
1937 	struct	phyint_instance *pii;
1938 	struct	phyint *pi;
1939 
1940 	if (force_mcast)
1941 		return;
1942 
1943 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1944 		pi = pii->pii_phyint;
1945 		/*
1946 		 * Exclude ptp and host targets. Set tg_in_use to false,
1947 		 * only for router targets.
1948 		 */
1949 		if (!pii->pii_targets_are_routers ||
1950 		    (pi->pi_flags & IFF_POINTOPOINT))
1951 			continue;
1952 
1953 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1954 			tg->tg_in_use = 0;
1955 	}
1956 
1957 	if (mibfd < 0) {
1958 		mibfd = open("/dev/ip", O_RDWR);
1959 		if (mibfd < 0) {
1960 			logperror("mibopen: ip open");
1961 			exit(1);
1962 		}
1963 	}
1964 
1965 	if (!update_router_list(mibfd)) {
1966 		(void) close(mibfd);
1967 		mibfd = -1;
1968 	}
1969 
1970 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1971 		if (!pii->pii_targets_are_routers ||
1972 		    (pi->pi_flags & IFF_POINTOPOINT))
1973 			continue;
1974 
1975 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1976 			next_tg = tg->tg_next;
1977 			if (!tg->tg_in_use) {
1978 				target_delete(tg);
1979 			}
1980 		}
1981 	}
1982 }
1983 
1984 /*
1985  * Attempt to assign host targets to any interfaces that do not currently
1986  * have probe targets by sharing targets with other interfaces in the group.
1987  */
1988 static void
1989 init_host_targets(void)
1990 {
1991 	struct phyint_instance *pii;
1992 	struct phyint_group *pg;
1993 
1994 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1995 		pg = pii->pii_phyint->pi_group;
1996 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
1997 			dup_host_targets(pii);
1998 	}
1999 }
2000 
2001 /*
2002  * Duplicate host targets from other phyints of the group to
2003  * the phyint instance 'desired_pii'.
2004  */
2005 static void
2006 dup_host_targets(struct phyint_instance	 *desired_pii)
2007 {
2008 	int af;
2009 	struct phyint *pi;
2010 	struct phyint_instance *pii;
2011 	struct target *tg;
2012 
2013 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2014 
2015 	af = desired_pii->pii_af;
2016 
2017 	/*
2018 	 * For every phyint in the same group as desired_pii, check if
2019 	 * it has any host targets. If so add them to desired_pii.
2020 	 */
2021 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2022 		pii = PHYINT_INSTANCE(pi, af);
2023 		/*
2024 		 * We know that we don't have targets on this phyint instance
2025 		 * since we have been called. But we still check for
2026 		 * pii_targets_are_routers because another phyint instance
2027 		 * could have router targets, since IFF_NOFAILOVER addresses
2028 		 * on different phyint instances may belong to different
2029 		 * subnets.
2030 		 */
2031 		if ((pii == NULL) || (pii == desired_pii) ||
2032 		    pii->pii_targets_are_routers)
2033 			continue;
2034 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2035 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2036 		}
2037 	}
2038 }
2039 
/*
 * Print a one-line usage message for the program named `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	FILE *out = stderr;

	(void) fprintf(out, "usage: %s\n", cmd);
}
2045 
2046 
2047 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2048 
2049 /* Get an option from the /etc/default/mpathd file */
2050 static char *
2051 getdefault(char *name)
2052 {
2053 	char namebuf[BUFSIZ];
2054 	char *value = NULL;
2055 
2056 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2057 		char	*cp;
2058 		int	flags;
2059 
2060 		/*
2061 		 * ignore case
2062 		 */
2063 		flags = defcntl(DC_GETFLAGS, 0);
2064 		TURNOFF(flags, DC_CASE);
2065 		(void) defcntl(DC_SETFLAGS, flags);
2066 
2067 		/* Add "=" to the name */
2068 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2069 		(void) strncat(namebuf, "=", 2);
2070 
2071 		if ((cp = defread(namebuf)) != NULL)
2072 			value = strdup(cp);
2073 
2074 		/* close */
2075 		(void) defopen((char *)NULL);
2076 	}
2077 	return (value);
2078 }
2079 
2080 
/*
 * Options set by main(): the first two come from /etc/default/mpathd
 * (FAILBACK and TRACK_INTERFACES_ONLY_WITH_GROUPS), the last two from
 * the command line.
 */
boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
/* -a: exit after initialization if there are no phyint instances */
static boolean_t adopt = _B_FALSE;
/* -d / -D: stay in the foreground (skip daemonize() and initlog()) */
static boolean_t foreground = _B_FALSE;
2088 
2089 int
2090 main(int argc, char *argv[])
2091 {
2092 	int i;
2093 	int c;
2094 	struct phyint_instance *pii;
2095 	char *value;
2096 
2097 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2098 	srandom(gethostid());	/* Initialize the random number generator */
2099 
2100 	/*
2101 	 * NOTE: The messages output by in.mpathd are not suitable for
2102 	 * translation, so we do not call textdomain().
2103 	 */
2104 	(void) setlocale(LC_ALL, "");
2105 
2106 	/*
2107 	 * Get the user specified value of 'failure detection time'
2108 	 * from /etc/default/mpathd
2109 	 */
2110 	value = getdefault("FAILURE_DETECTION_TIME");
2111 	if (value != NULL) {
2112 		user_failure_detection_time =
2113 		    (int)strtol((char *)value, NULL, 0);
2114 
2115 		if (user_failure_detection_time <= 0) {
2116 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2117 			logerr("Invalid failure detection time %s, assuming "
2118 			    "default %d\n", value, user_failure_detection_time);
2119 
2120 		} else if (user_failure_detection_time <
2121 		    MIN_FAILURE_DETECTION_TIME) {
2122 			user_failure_detection_time =
2123 			    MIN_FAILURE_DETECTION_TIME;
2124 			logerr("Too small failure detection time of %s, "
2125 			    "assuming minimum %d\n", value,
2126 			    user_failure_detection_time);
2127 		}
2128 		free(value);
2129 	} else {
2130 		/* User has not specified the parameter, Use default value */
2131 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2132 	}
2133 
2134 	/*
2135 	 * This gives the frequency at which probes will be sent.
2136 	 * When fdt ms elapses, we should be able to determine
2137 	 * whether 5 consecutive probes have failed or not.
2138 	 * 1 probe will be sent in every user_probe_interval ms,
2139 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2140 	 * user_probe_interval. Thus when we send out probe 'n' we
2141 	 * can be sure that probe 'n - 2' is lost, if we have not
2142 	 * got the ack. (since the probe interval is > crtt). But
2143 	 * probe 'n - 1' may be a valid unacked probe, since the
2144 	 * time between 2 successive probes could be as small as
2145 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2146 	 */
2147 	user_probe_interval = user_failure_detection_time /
2148 	    (NUM_PROBE_FAILS + 2);
2149 
2150 	/*
2151 	 * Get the user specified value of failback_enabled from
2152 	 * /etc/default/mpathd
2153 	 */
2154 	value = getdefault("FAILBACK");
2155 	if (value != NULL) {
2156 		if (strncasecmp(value, "yes", 3) == 0)
2157 			failback_enabled = _B_TRUE;
2158 		else if (strncasecmp(value, "no", 2) == 0)
2159 			failback_enabled = _B_FALSE;
2160 		else
2161 			logerr("Invalid value for FAILBACK %s\n", value);
2162 		free(value);
2163 	} else {
2164 		failback_enabled = _B_TRUE;
2165 	}
2166 
2167 	/*
2168 	 * Get the user specified value of track_all_phyints from
2169 	 * /etc/default/mpathd. The sense is reversed in
2170 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2171 	 */
2172 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2173 	if (value != NULL) {
2174 		if (strncasecmp(value, "yes", 3) == 0)
2175 			track_all_phyints = _B_FALSE;
2176 		else if (strncasecmp(value, "no", 2) == 0)
2177 			track_all_phyints = _B_TRUE;
2178 		else
2179 			logerr("Invalid value for "
2180 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2181 		free(value);
2182 	} else {
2183 		track_all_phyints = _B_FALSE;
2184 	}
2185 
2186 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2187 		switch (c) {
2188 		case 'a':
2189 			adopt = _B_TRUE;
2190 			break;
2191 		case 'm':
2192 			force_mcast = _B_TRUE;
2193 			break;
2194 		case 'd':
2195 			debug = D_ALL;
2196 			foreground = _B_TRUE;
2197 			break;
2198 		case 'D':
2199 			i = (int)strtol(optarg, NULL, 0);
2200 			if (i == 0) {
2201 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2202 				    optarg);
2203 				exit(1);
2204 			}
2205 			debug |= i;
2206 			foreground = _B_TRUE;
2207 			break;
2208 		case 'l':
2209 			/*
2210 			 * Turn off link state notification handling.
2211 			 * Undocumented command line flag, for debugging
2212 			 * purposes.
2213 			 */
2214 			handle_link_notifications = _B_FALSE;
2215 			break;
2216 		default:
2217 			usage(argv[0]);
2218 			exit(1);
2219 		}
2220 	}
2221 
2222 	/*
2223 	 * The sockets for the loopback command interface should be listening
2224 	 * before we fork and exit in daemonize(). This way, whoever started us
2225 	 * can use the loopback interface as soon as they get a zero exit
2226 	 * status.
2227 	 */
2228 	lsock_v4 = setup_listener(AF_INET);
2229 	lsock_v6 = setup_listener(AF_INET6);
2230 
2231 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2232 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2233 		exit(1);
2234 	}
2235 
2236 	if (!foreground) {
2237 		if (!daemonize()) {
2238 			logerr("cannot daemonize\n");
2239 			exit(EXIT_FAILURE);
2240 		}
2241 		initlog();
2242 	}
2243 
2244 	/*
2245 	 * Initializations:
2246 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2247 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2248 	 * 2. Initialize a pipe for handling/recording signal events.
2249 	 * 3. Create the routing sockets,  used for listening
2250 	 *    to routing / interface changes.
2251 	 * 4. phyint_init() - Initialize physical interface state
2252 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2253 	 *    which timer_init() does indirectly.
2254 	 * 5. timer_init()  - Initialize timer related stuff
2255 	 * 6. initifs() - Initialize our database of all known interfaces
2256 	 * 7. init_router_targets() - Initialize our database of all known
2257 	 *    router targets.
2258 	 */
2259 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2260 	if (ifsock_v4 < 0) {
2261 		logperror("main: IPv4 socket open");
2262 		exit(1);
2263 	}
2264 
2265 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2266 	if (ifsock_v6 < 0) {
2267 		logperror("main: IPv6 socket open");
2268 		exit(1);
2269 	}
2270 
2271 	setup_eventpipe();
2272 
2273 	rtsock_v4 = setup_rtsock(AF_INET);
2274 	rtsock_v6 = setup_rtsock(AF_INET6);
2275 
2276 	if (phyint_init() == -1) {
2277 		logerr("cannot initialize physical interface structures");
2278 		exit(1);
2279 	}
2280 
2281 	timer_init();
2282 
2283 	initifs();
2284 
2285 	/* Inform kernel whether failback is enabled or disabled */
2286 	if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
2287 		logperror("main: ioctl (SIOCSIPMPFAILBACK)");
2288 		exit(1);
2289 	}
2290 
2291 	/*
2292 	 * If we're operating in "adopt" mode and no interfaces need to be
2293 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2294 	 * interfaces are subsequently put into multipathing groups).
2295 	 */
2296 	if (adopt && phyint_instances == NULL)
2297 		exit(0);
2298 
2299 	/*
2300 	 * Main body. Keep listening for activity on any of the sockets
2301 	 * that we are monitoring and take appropriate action as necessary.
2302 	 * signals are also handled synchronously.
2303 	 */
2304 	for (;;) {
2305 		if (poll(pollfds, pollfd_num, -1) < 0) {
2306 			if (errno == EINTR)
2307 				continue;
2308 			logperror("main: poll");
2309 			exit(1);
2310 		}
2311 		for (i = 0; i < pollfd_num; i++) {
2312 			if ((pollfds[i].fd == -1) ||
2313 			    !(pollfds[i].revents & POLLIN))
2314 				continue;
2315 			if (pollfds[i].fd == eventpipe_read) {
2316 				in_signal(eventpipe_read);
2317 				break;
2318 			}
2319 			if (pollfds[i].fd == rtsock_v4 ||
2320 				pollfds[i].fd == rtsock_v6) {
2321 				process_rtsock(rtsock_v4, rtsock_v6);
2322 				break;
2323 			}
2324 			for (pii = phyint_instances; pii != NULL;
2325 			    pii = pii->pii_next) {
2326 				if (pollfds[i].fd == pii->pii_probe_sock) {
2327 					if (pii->pii_af == AF_INET)
2328 						in_data(pii);
2329 					else
2330 						in6_data(pii);
2331 					break;
2332 				}
2333 			}
2334 			if (pollfds[i].fd == lsock_v4)
2335 				loopback_cmd(lsock_v4, AF_INET);
2336 			else if (pollfds[i].fd == lsock_v6)
2337 				loopback_cmd(lsock_v6, AF_INET6);
2338 		}
2339 		if (full_scan_required) {
2340 			initifs();
2341 			full_scan_required = _B_FALSE;
2342 		}
2343 	}
2344 	/* NOTREACHED */
2345 	return (EXIT_SUCCESS);
2346 }
2347 
2348 static int
2349 setup_listener(int af)
2350 {
2351 	int sock;
2352 	int on;
2353 	int len;
2354 	int ret;
2355 	struct sockaddr_storage laddr;
2356 	struct sockaddr_in  *sin;
2357 	struct sockaddr_in6 *sin6;
2358 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2359 
2360 	assert(af == AF_INET || af == AF_INET6);
2361 
2362 	sock = socket(af, SOCK_STREAM, 0);
2363 	if (sock < 0) {
2364 		logperror("setup_listener: socket");
2365 		exit(1);
2366 	}
2367 
2368 	on = 1;
2369 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2370 	    sizeof (on)) < 0) {
2371 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2372 		exit(1);
2373 	}
2374 
2375 	bzero(&laddr, sizeof (laddr));
2376 	laddr.ss_family = af;
2377 
2378 	if (af == AF_INET) {
2379 		sin = (struct sockaddr_in *)&laddr;
2380 		sin->sin_port = htons(MPATHD_PORT);
2381 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2382 		len = sizeof (struct sockaddr_in);
2383 	} else {
2384 		sin6 = (struct sockaddr_in6 *)&laddr;
2385 		sin6->sin6_port = htons(MPATHD_PORT);
2386 		sin6->sin6_addr = loopback_addr;
2387 		len = sizeof (struct sockaddr_in6);
2388 	}
2389 
2390 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2391 	if (ret < 0) {
2392 		if (errno == EADDRINUSE) {
2393 			/*
2394 			 * Another instance of mpathd may be already active.
2395 			 */
2396 			logerr("main: is another instance of in.mpathd "
2397 			    "already active?\n");
2398 			exit(1);
2399 		} else {
2400 			(void) close(sock);
2401 			return (-1);
2402 		}
2403 	}
2404 	if (listen(sock, 30) < 0) {
2405 		logperror("main: listen");
2406 		exit(1);
2407 	}
2408 	if (poll_add(sock) == -1) {
2409 		(void) close(sock);
2410 		exit(1);
2411 	}
2412 
2413 	return (sock);
2414 }
2415 
/*
 * Table of commands and their expected size; used by loopback_cmd().
 *
 * loopback_cmd() indexes this table directly with the command id received
 * from the client (after rejecting ids >= MI_NCMD), uses `name' in its log
 * messages, and rejects any message shorter than `size' bytes.
 *
 * NOTE(review): the entries are presumably listed in MI_* command id order
 * and MI_NCMD presumably equals the number of entries -- confirm against
 * the header that defines the MI_* ids before adding or reordering entries.
 */
static struct {
	const char	*name;	/* command name, for log messages */
	unsigned int	size;	/* minimum message length in bytes */
} commands[] = {
	{ "MI_PING",		sizeof (uint32_t)	},
	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
	{ "MI_QUERY",		sizeof (mi_query_t)	}
};
2429 
2430 /*
2431  * Commands received over the loopback interface come here. Currently
2432  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2433  * module. ifconfig only makes a connection, and closes it to check if
2434  * in.mpathd is running.
2435  * if_mpadm sends commands in the format specified by the mpathd_interface
2436  * structure.
2437  */
2438 static void
2439 loopback_cmd(int sock, int family)
2440 {
2441 	int newfd;
2442 	ssize_t len;
2443 	struct sockaddr_storage	peer;
2444 	struct sockaddr_in	*peer_sin;
2445 	struct sockaddr_in6	*peer_sin6;
2446 	socklen_t peerlen;
2447 	union mi_commands mpi;
2448 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2449 	char abuf[INET6_ADDRSTRLEN];
2450 	uint_t cmd;
2451 	int retval;
2452 
2453 	peerlen = sizeof (peer);
2454 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2455 	if (newfd < 0) {
2456 		logperror("loopback_cmd: accept");
2457 		return;
2458 	}
2459 
2460 	switch (family) {
2461 	case AF_INET:
2462 		/*
2463 		 * Validate the address and port to make sure that
2464 		 * non privileged processes don't connect and start
2465 		 * talking to us.
2466 		 */
2467 		if (peerlen != sizeof (struct sockaddr_in)) {
2468 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2469 			(void) close(newfd);
2470 			return;
2471 		}
2472 		peer_sin = (struct sockaddr_in *)&peer;
2473 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2474 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2475 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2476 			    abuf, sizeof (abuf));
2477 			logerr("Attempt to connect from addr %s port %d\n",
2478 			    abuf, ntohs(peer_sin->sin_port));
2479 			(void) close(newfd);
2480 			return;
2481 		}
2482 		break;
2483 
2484 	case AF_INET6:
2485 		if (peerlen != sizeof (struct sockaddr_in6)) {
2486 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2487 			(void) close(newfd);
2488 			return;
2489 		}
2490 		/*
2491 		 * Validate the address and port to make sure that
2492 		 * non privileged processes don't connect and start
2493 		 * talking to us.
2494 		 */
2495 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2496 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2497 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2498 		    &loopback_addr))) {
2499 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2500 			    sizeof (abuf));
2501 			logerr("Attempt to connect from addr %s port %d\n",
2502 			    abuf, ntohs(peer_sin6->sin6_port));
2503 			(void) close(newfd);
2504 			return;
2505 		}
2506 
2507 	default:
2508 		logdebug("loopback_cmd: family %d\n", family);
2509 		(void) close(newfd);
2510 		return;
2511 	}
2512 
2513 	/*
2514 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2515 	 * all supported commands
2516 	 */
2517 	len = read(newfd, &mpi, sizeof (mpi));
2518 
2519 	/*
2520 	 * ifconfig does not send any data. Just tests to see if mpathd
2521 	 * is already running.
2522 	 */
2523 	if (len <= 0) {
2524 		(void) close(newfd);
2525 		return;
2526 	}
2527 
2528 	/*
2529 	 * In theory, we can receive any sized message for a stream socket,
2530 	 * but we don't expect that to happen for a small message over a
2531 	 * loopback connection.
2532 	 */
2533 	if (len < sizeof (uint32_t)) {
2534 		logerr("loopback_cmd: bad command format or read returns "
2535 		    "partial data %d\n", len);
2536 	}
2537 
2538 	cmd = mpi.mi_command;
2539 	if (cmd >= MI_NCMD) {
2540 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2541 		(void) close(newfd);
2542 		return;
2543 	}
2544 
2545 	if (len < commands[cmd].size) {
2546 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2547 		    commands[cmd].name, commands[cmd].size, len);
2548 		(void) close(newfd);
2549 		return;
2550 	}
2551 
2552 	retval = process_cmd(newfd, &mpi);
2553 	if (retval != IPMP_SUCCESS) {
2554 		logerr("failed processing %s: %s\n", commands[cmd].name,
2555 		    ipmp_errmsg(retval));
2556 	}
2557 	(void) close(newfd);
2558 }
2559 
2560 extern int global_errno;	/* set by failover() or failback() */
2561 
2562 /*
2563  * Process the offline, undo offline and set original index commands,
2564  * received from if_mpadm(1M)
2565  */
2566 static unsigned int
2567 process_cmd(int newfd, union mi_commands *mpi)
2568 {
2569 	uint_t	nif = 0;
2570 	uint32_t cmd;
2571 	struct phyint *pi;
2572 	struct phyint *pi2;
2573 	struct phyint_group *pg;
2574 	boolean_t success;
2575 	int error;
2576 	struct mi_offline *mio;
2577 	struct mi_undo_offline *miu;
2578 	struct lifreq lifr;
2579 	int ifsock;
2580 	struct mi_setoindex *mis;
2581 
2582 	cmd = mpi->mi_command;
2583 
2584 	switch (cmd) {
2585 	case MI_OFFLINE:
2586 		mio = &mpi->mi_ocmd;
2587 		/*
2588 		 * Lookup the interface that needs to be offlined.
2589 		 * If it does not exist, return a suitable error.
2590 		 */
2591 		pi = phyint_lookup(mio->mio_ifname);
2592 		if (pi == NULL)
2593 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2594 
2595 		/*
2596 		 * Verify that the minimum redundancy requirements are met.
2597 		 * The multipathing group must have at least the specified
2598 		 * number of functional interfaces after offlining the
2599 		 * requested interface. Otherwise return a suitable error.
2600 		 */
2601 		pg = pi->pi_group;
2602 		nif = 0;
2603 		if (pg != phyint_anongroup) {
2604 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2605 			    pi2 = pi2->pi_pgnext) {
2606 				if ((pi2->pi_state == PI_RUNNING) ||
2607 				    (pg->pg_groupfailed &&
2608 				    !(pi2->pi_flags & IFF_OFFLINE)))
2609 					nif++;
2610 			}
2611 		}
2612 		if (nif < mio->mio_min_redundancy)
2613 			return (send_result(newfd, IPMP_EMINRED, 0));
2614 
2615 		/*
2616 		 * The order of operation is to set IFF_OFFLINE, followed by
2617 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2618 		 * can be created. Subsequent failover moves everything on
2619 		 * the OFFLINE interface to some other functional interface.
2620 		 */
2621 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2622 		if (success) {
2623 			if (!pi->pi_empty) {
2624 				error = try_failover(pi, FAILOVER_NORMAL);
2625 				if (error != 0) {
2626 					if (!change_lif_flags(pi, IFF_OFFLINE,
2627 					    _B_FALSE)) {
2628 						logerr("process_cmd: couldn't"
2629 						    " clear OFFLINE flag on"
2630 						    " %s\n", pi->pi_name);
2631 						/*
2632 						 * Offline interfaces should
2633 						 * not be probed.
2634 						 */
2635 						stop_probing(pi);
2636 					}
2637 					return (send_result(newfd, error,
2638 					    global_errno));
2639 				}
2640 			}
2641 		} else {
2642 			return (send_result(newfd, IPMP_FAILURE, errno));
2643 		}
2644 
2645 		/*
2646 		 * The interface is now Offline, so stop probing it.
2647 		 * Note that if_mpadm(1M) will down the test addresses,
2648 		 * after receiving a success reply from us. The routing
2649 		 * socket message will then make us close the socket used
2650 		 * for sending probes. But it is more logical that an
2651 		 * offlined interface must not be probed, even if it has
2652 		 * test addresses.
2653 		 */
2654 		stop_probing(pi);
2655 		return (send_result(newfd, IPMP_SUCCESS, 0));
2656 
2657 	case MI_UNDO_OFFLINE:
2658 		miu = &mpi->mi_ucmd;
2659 		/*
2660 		 * Undo the offline command. As usual lookup the interface.
2661 		 * Send an error if it does not exist.
2662 		 */
2663 		pi = phyint_lookup(miu->miu_ifname);
2664 		if (pi == NULL)
2665 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2666 
2667 		/*
2668 		 * Inverse of the offline operation. Do a failback, and then
2669 		 * clear the IFF_OFFLINE flag.
2670 		 */
2671 		error = do_failback(pi, _B_TRUE);
2672 		if (error == IPMP_EFBPARTIAL)
2673 			return (send_result(newfd, IPMP_EFBPARTIAL, 0));
2674 		error = do_failback(pi, _B_FALSE);
2675 
2676 		switch (error) {
2677 		case IPMP_SUCCESS:
2678 			if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) {
2679 				logdebug("undo error %X\n", global_errno);
2680 				error = IPMP_FAILURE;
2681 				break;
2682 			}
2683 			/* FALLTHROUGH */
2684 
2685 		case IPMP_EFBPARTIAL:
2686 			/*
2687 			 * Reset the state of the interface based on the
2688 			 * current link state; if this phyint subsequently
2689 			 * acquires a test address, the state will be changed
2690 			 * again later as a result of the probes.
2691 			 */
2692 			if (LINK_UP(pi))
2693 				phyint_chstate(pi, PI_RUNNING);
2694 			else
2695 				phyint_chstate(pi, PI_FAILED);
2696 			break;
2697 
2698 		case IPMP_FAILURE:
2699 			break;
2700 
2701 		default:
2702 			logdebug("do_failback: unexpected return value\n");
2703 			break;
2704 		}
2705 		return (send_result(newfd, error, global_errno));
2706 
2707 	case MI_SETOINDEX:
2708 		mis = &mpi->mi_scmd;
2709 
2710 		/* Get the socket for doing ioctls */
2711 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2712 
2713 		/*
2714 		 * Get index of new original interface.
2715 		 * The index is returned in lifr.lifr_index.
2716 		 */
2717 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2718 		    sizeof (lifr.lifr_name));
2719 
2720 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2721 			return (send_result(newfd, IPMP_FAILURE, errno));
2722 
2723 		/*
2724 		 * Set new original interface index.
2725 		 * The new index was put into lifr.lifr_index by the
2726 		 * SIOCGLIFINDEX ioctl.
2727 		 */
2728 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2729 		    sizeof (lifr.lifr_name));
2730 
2731 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2732 			return (send_result(newfd, IPMP_FAILURE, errno));
2733 
2734 		return (send_result(newfd, IPMP_SUCCESS, 0));
2735 
2736 	case MI_QUERY:
2737 		return (process_query(newfd, &mpi->mi_qcmd));
2738 
2739 	default:
2740 		break;
2741 	}
2742 
2743 	return (send_result(newfd, IPMP_EPROTO, 0));
2744 }
2745 
2746 /*
2747  * Process the query request pointed to by `miq' and send a reply on file
2748  * descriptor `fd'.  Returns an IPMP error code.
2749  */
2750 static unsigned int
2751 process_query(int fd, mi_query_t *miq)
2752 {
2753 	ipmp_groupinfo_t	*grinfop;
2754 	ipmp_groupinfolist_t	*grlp;
2755 	ipmp_grouplist_t	*grlistp;
2756 	ipmp_ifinfo_t		*ifinfop;
2757 	ipmp_ifinfolist_t	*iflp;
2758 	ipmp_snap_t		*snap;
2759 	unsigned int		retval;
2760 
2761 	switch (miq->miq_inforeq) {
2762 	case IPMP_GROUPLIST:
2763 		retval = getgrouplist(&grlistp);
2764 		if (retval != IPMP_SUCCESS)
2765 			return (send_result(fd, retval, errno));
2766 
2767 		retval = send_result(fd, IPMP_SUCCESS, 0);
2768 		if (retval == IPMP_SUCCESS)
2769 			retval = send_grouplist(fd, grlistp);
2770 
2771 		ipmp_freegrouplist(grlistp);
2772 		return (retval);
2773 
2774 	case IPMP_GROUPINFO:
2775 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2776 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2777 		if (retval != IPMP_SUCCESS)
2778 			return (send_result(fd, retval, errno));
2779 
2780 		retval = send_result(fd, IPMP_SUCCESS, 0);
2781 		if (retval == IPMP_SUCCESS)
2782 			retval = send_groupinfo(fd, grinfop);
2783 
2784 		ipmp_freegroupinfo(grinfop);
2785 		return (retval);
2786 
2787 	case IPMP_IFINFO:
2788 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2789 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2790 		if (retval != IPMP_SUCCESS)
2791 			return (send_result(fd, retval, errno));
2792 
2793 		retval = send_result(fd, IPMP_SUCCESS, 0);
2794 		if (retval == IPMP_SUCCESS)
2795 			retval = send_ifinfo(fd, ifinfop);
2796 
2797 		ipmp_freeifinfo(ifinfop);
2798 		return (retval);
2799 
2800 	case IPMP_SNAP:
2801 		retval = getsnap(&snap);
2802 		if (retval != IPMP_SUCCESS)
2803 			return (send_result(fd, retval, errno));
2804 
2805 		retval = send_result(fd, IPMP_SUCCESS, 0);
2806 		if (retval != IPMP_SUCCESS)
2807 			goto out;
2808 
2809 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2810 		if (retval != IPMP_SUCCESS)
2811 			goto out;
2812 
2813 		retval = send_grouplist(fd, snap->sn_grlistp);
2814 		if (retval != IPMP_SUCCESS)
2815 			goto out;
2816 
2817 		iflp = snap->sn_ifinfolistp;
2818 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2819 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2820 			if (retval != IPMP_SUCCESS)
2821 				goto out;
2822 		}
2823 
2824 		grlp = snap->sn_grinfolistp;
2825 		for (; grlp != NULL; grlp = grlp->grl_next) {
2826 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2827 			if (retval != IPMP_SUCCESS)
2828 				goto out;
2829 		}
2830 	out:
2831 		ipmp_snap_free(snap);
2832 		return (retval);
2833 
2834 	default:
2835 		break;
2836 
2837 	}
2838 	return (send_result(fd, IPMP_EPROTO, 0));
2839 }
2840 
2841 /*
2842  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2843  * Returns an IPMP error code.
2844  */
2845 static unsigned int
2846 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2847 {
2848 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2849 	unsigned int	retval;
2850 
2851 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2852 	if (retval != IPMP_SUCCESS)
2853 		return (retval);
2854 
2855 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2856 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2857 }
2858 
2859 /*
2860  * Send the interface information pointed to by `ifinfop' on file descriptor
2861  * `fd'.  Returns an IPMP error code.
2862  */
2863 static unsigned int
2864 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2865 {
2866 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2867 }
2868 
2869 /*
2870  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2871  * Returns an IPMP error code.
2872  */
2873 static unsigned int
2874 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2875 {
2876 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2877 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2878 }
2879 
2880 /*
2881  * Initialize an mi_result_t structure using `error' and `syserror' and
2882  * send it on file descriptor `fd'.  Returns an IPMP error code.
2883  */
2884 static unsigned int
2885 send_result(int fd, unsigned int error, int syserror)
2886 {
2887 	mi_result_t me;
2888 
2889 	me.me_mpathd_error = error;
2890 	if (error == IPMP_FAILURE)
2891 		me.me_sys_error = syserror;
2892 	else
2893 		me.me_sys_error = 0;
2894 
2895 	return (ipmp_write(fd, &me, sizeof (me)));
2896 }
2897 
2898 /*
2899  * Daemonize the process.
2900  */
2901 static boolean_t
2902 daemonize(void)
2903 {
2904 	switch (fork()) {
2905 	case -1:
2906 		return (_B_FALSE);
2907 
2908 	case  0:
2909 		/*
2910 		 * Lose our controlling terminal, and become both a session
2911 		 * leader and a process group leader.
2912 		 */
2913 		if (setsid() == -1)
2914 			return (_B_FALSE);
2915 
2916 		/*
2917 		 * Under POSIX, a session leader can accidentally (through
2918 		 * open(2)) acquire a controlling terminal if it does not
2919 		 * have one.  Just to be safe, fork() again so we are not a
2920 		 * session leader.
2921 		 */
2922 		switch (fork()) {
2923 		case -1:
2924 			return (_B_FALSE);
2925 
2926 		case 0:
2927 			(void) chdir("/");
2928 			(void) umask(022);
2929 			(void) fdwalk(closefunc, NULL);
2930 			break;
2931 
2932 		default:
2933 			_exit(EXIT_SUCCESS);
2934 		}
2935 		break;
2936 
2937 	default:
2938 		_exit(EXIT_SUCCESS);
2939 	}
2940 
2941 	return (_B_TRUE);
2942 }
2943 
2944 /*
2945  * The parent has created some fds before forking on purpose, keep them open.
2946  */
2947 static int
2948 closefunc(void *not_used, int fd)
2949 /* ARGSUSED */
2950 {
2951 	if (fd != lsock_v4 && fd != lsock_v6)
2952 		(void) close(fd);
2953 	return (0);
2954 }
2955 
2956 /* LOGGER */
2957 
2958 #include <syslog.h>
2959 
2960 /*
2961  * Logging routines.  All routines log to syslog, unless the daemon is
2962  * running in the foreground, in which case the logging goes to stderr.
2963  *
2964  * The following routines are available:
2965  *
2966  *	logdebug(): A printf-like function for outputting debug messages
2967  *	(messages at LOG_DEBUG) that are only of use to developers.
2968  *
2969  *	logtrace(): A printf-like function for outputting tracing messages
2970  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2971  *	to log the receipt of interesting network-related conditions.
2972  *
2973  *	logerr(): A printf-like function for outputting error messages
2974  *	(messages at LOG_ERR) from the daemon.
2975  *
2976  *	logperror*(): A set of functions used to output error messages
2977  *	(messages at LOG_ERR); these automatically append strerror(errno)
2978  *	and a newline to the message passed to them.
2979  *
2980  * NOTE: since the logging functions write to syslog, the messages passed
2981  *	 to them are not eligible for localization.  Thus, gettext() must
2982  *	 *not* be used.
2983  */
2984 
/* Nonzero once initlog() has run; the log routines then use syslog */
static int logging = 0;

/*
 * Switch the logging routines over to syslog and open the log as
 * "in.mpathd" under the LOG_DAEMON facility.
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
}
2993 
2994 /* PRINTFLIKE1 */
2995 void
2996 logerr(char *fmt, ...)
2997 {
2998 	va_list ap;
2999 
3000 	va_start(ap, fmt);
3001 
3002 	if (logging)
3003 		vsyslog(LOG_ERR, fmt, ap);
3004 	else
3005 		(void) vfprintf(stderr, fmt, ap);
3006 	va_end(ap);
3007 }
3008 
3009 /* PRINTFLIKE1 */
3010 void
3011 logtrace(char *fmt, ...)
3012 {
3013 	va_list ap;
3014 
3015 	va_start(ap, fmt);
3016 
3017 	if (logging)
3018 		vsyslog(LOG_INFO, fmt, ap);
3019 	else
3020 		(void) vfprintf(stderr, fmt, ap);
3021 	va_end(ap);
3022 }
3023 
3024 /* PRINTFLIKE1 */
3025 void
3026 logdebug(char *fmt, ...)
3027 {
3028 	va_list ap;
3029 
3030 	va_start(ap, fmt);
3031 
3032 	if (logging)
3033 		vsyslog(LOG_DEBUG, fmt, ap);
3034 	else
3035 		(void) vfprintf(stderr, fmt, ap);
3036 	va_end(ap);
3037 }
3038 
3039 /* PRINTFLIKE1 */
3040 void
3041 logperror(char *str)
3042 {
3043 	if (logging)
3044 		syslog(LOG_ERR, "%s: %m\n", str);
3045 	else
3046 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3047 }
3048 
3049 void
3050 logperror_pii(struct phyint_instance *pii, char *str)
3051 {
3052 	if (logging) {
3053 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3054 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3055 	} else {
3056 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3057 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3058 		    strerror(errno));
3059 	}
3060 }
3061 
3062 void
3063 logperror_li(struct logint *li, char *str)
3064 {
3065 	struct	phyint_instance	*pii = li->li_phyint_inst;
3066 
3067 	if (logging) {
3068 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3069 		    str, AF_STR(pii->pii_af), li->li_name);
3070 	} else {
3071 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3072 		    str, AF_STR(pii->pii_af), li->li_name,
3073 		    strerror(errno));
3074 	}
3075 }
3076 
3077 void
3078 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3079 {
3080 	if (polled)
3081 		(void) poll_remove(pii->pii_probe_sock);
3082 	(void) close(pii->pii_probe_sock);
3083 	pii->pii_probe_sock = -1;
3084 	pii->pii_basetime_inited = 0;
3085 }
3086