xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "mpd_defs.h"
30 #include "mpd_tables.h"
31 
32 int debug = 0;				/* Debug flag */
33 static int pollfd_num = 0;		/* Num. of poll descriptors */
34 static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */
35 
36 					/* All times below in ms */
37 int	user_failure_detection_time;	/* user specified failure detection */
38 					/* time (fdt) */
39 int	user_probe_interval;		/* derived from user specified fdt */
40 
41 static int	rtsock_v4;		/* AF_INET routing socket */
42 static int	rtsock_v6;		/* AF_INET6 routing socket */
43 int	ifsock_v4 = -1;			/* IPv4 socket for ioctls  */
44 int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
45 static int	lsock_v4;		/* Listen socket to detect mpathd */
46 static int	lsock_v6;		/* Listen socket to detect mpathd */
47 static int	mibfd = -1;		/* fd to get mib info */
48 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */
49 
50 boolean_t	full_scan_required = _B_FALSE;
51 static uint_t	last_initifs_time;	/* Time when initifs was last run */
52 static	char **argv0;			/* Saved for re-exec on SIGHUP */
53 boolean_t handle_link_notifications = _B_TRUE;
54 
55 static void	initlog(void);
56 static void	run_timeouts(void);
57 static void	initifs(void);
58 static void	check_if_removed(struct phyint_instance *pii);
59 static void	select_test_ifs(void);
60 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
61 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
62 static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
63     struct in_addr nexthop_v4);
64 static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
65     struct in6_addr nexthop_v6);
66 static void	router_add_common(int af, char *ifname,
67     struct in6_addr nexthop);
68 static void	init_router_targets();
69 static void	cleanup(void);
70 static int	setup_listener(int af);
71 static void	check_config(void);
72 static void	check_addr_unique(int af, char *name);
73 static void	init_host_targets(void);
74 static void	dup_host_targets(struct phyint_instance *desired_pii);
75 static void	loopback_cmd(int sock, int family);
76 static int	poll_remove(int fd);
77 static boolean_t daemonize(void);
78 static int	closefunc(void *, int);
79 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
80 static unsigned int process_query(int fd, mi_query_t *miq);
81 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
82 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
83 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
84 static unsigned int send_result(int fd, unsigned int error, int syserror);
85 
86 /*
87  * Return the current time in milliseconds (from an arbitrary reference)
88  * truncated to fit into an int. Truncation is ok since we are interested
89  * only in differences and not the absolute values.
90  */
91 uint_t
92 getcurrenttime(void)
93 {
94 	uint_t	cur_time;	/* In ms */
95 
96 	/*
97 	 * Use of a non-user-adjustable source of time is
98 	 * required. However millisecond precision is sufficient.
99 	 * divide by 10^6
100 	 */
101 	cur_time = (uint_t)(gethrtime() / 1000000LL);
102 	return (cur_time);
103 }
104 
105 /*
106  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
107  */
108 int
109 poll_add(int fd)
110 {
111 	int i;
112 	int new_num;
113 	struct pollfd *newfds;
114 retry:
115 	/* Check if already present */
116 	for (i = 0; i < pollfd_num; i++) {
117 		if (pollfds[i].fd == fd)
118 			return (0);
119 	}
120 	/* Check for empty spot already present */
121 	for (i = 0; i < pollfd_num; i++) {
122 		if (pollfds[i].fd == -1) {
123 			pollfds[i].fd = fd;
124 			return (0);
125 		}
126 	}
127 
128 	/* Allocate space for 32 more fds and initialize to -1 */
129 	new_num = pollfd_num + 32;
130 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
131 	if (newfds == NULL) {
132 		logperror("poll_add: realloc");
133 		return (-1);
134 	}
135 	for (i = pollfd_num; i < new_num; i++) {
136 		newfds[i].fd = -1;
137 		newfds[i].events = POLLIN;
138 	}
139 	pollfd_num = new_num;
140 	pollfds = newfds;
141 	goto retry;
142 }
143 
144 /*
145  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
146  */
147 static int
148 poll_remove(int fd)
149 {
150 	int i;
151 
152 	/* Check if already present */
153 	for (i = 0; i < pollfd_num; i++) {
154 		if (pollfds[i].fd == fd) {
155 			pollfds[i].fd = -1;
156 			return (0);
157 		}
158 	}
159 	return (-1);
160 }
161 
162 /*
163  * Extract information about the phyint instance. If the phyint instance still
164  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
165  * will use it to detect phyint instances that don't exist any longer and
166  * remove them, from our database of phyint instances.
167  * Return value:
168  *	returns true if the phyint instance exists in the kernel,
169  *	returns false otherwise
170  */
171 static boolean_t
172 pii_process(int af, char *name, struct phyint_instance **pii_p)
173 {
174 	int err;
175 	struct phyint_instance *pii;
176 	struct phyint_instance *pii_other;
177 
178 	if (debug & D_PHYINT)
179 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
180 
181 	pii = phyint_inst_lookup(af, name);
182 	if (pii == NULL) {
183 		/*
184 		 * Phyint instance does not exist in our tables,
185 		 * create new phyint instance
186 		 */
187 		pii = phyint_inst_init_from_k(af, name);
188 	} else {
189 		/* Phyint exists in our tables */
190 		err = phyint_inst_update_from_k(pii);
191 
192 		switch (err) {
193 		case PI_IOCTL_ERROR:
194 			/* Some ioctl error. don't change anything */
195 			pii->pii_in_use = 1;
196 			break;
197 
198 		case PI_GROUP_CHANGED:
199 			/*
200 			 * The phyint has changed group.
201 			 */
202 			restore_phyint(pii->pii_phyint);
203 			/* FALLTHRU */
204 
205 		case PI_IFINDEX_CHANGED:
206 			/*
207 			 * Interface index has changed. Delete and
208 			 * recreate the phyint as it is quite likely
209 			 * the interface has been unplumbed and replumbed.
210 			 */
211 			pii_other = phyint_inst_other(pii);
212 			if (pii_other != NULL)
213 				phyint_inst_delete(pii_other);
214 			phyint_inst_delete(pii);
215 			pii = phyint_inst_init_from_k(af, name);
216 			break;
217 
218 		case PI_DELETED:
219 			/* Phyint instance has disappeared from kernel */
220 			pii->pii_in_use = 0;
221 			break;
222 
223 		case PI_OK:
224 			/* Phyint instance exists and is fine */
225 			pii->pii_in_use = 1;
226 			break;
227 
228 		default:
229 			/* Unknown status */
230 			logerr("pii_process: Unknown status %d\n", err);
231 			break;
232 		}
233 	}
234 
235 	*pii_p = pii;
236 	if (pii != NULL)
237 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
238 	else
239 		return (_B_FALSE);
240 }
241 
242 /*
243  * This phyint is leaving the group. Try to restore the phyint to its
244  * initial state. Return the addresses that belong to other group members,
245  * to the group, and take back any addresses owned by this phyint
246  */
247 void
248 restore_phyint(struct phyint *pi)
249 {
250 	if (pi->pi_group == phyint_anongroup)
251 		return;
252 
253 	/*
254 	 * Move everthing to some other member in the group.
255 	 * The phyint has changed group in the kernel. But we
256 	 * have yet to do it in our tables.
257 	 */
258 	if (!pi->pi_empty)
259 		(void) try_failover(pi, FAILOVER_TO_ANY);
260 	/*
261 	 * Move all addresses owned by 'pi' back to pi, from each
262 	 * of the other members of the group
263 	 */
264 	(void) try_failback(pi, _B_FALSE);
265 }
266 
267 /*
268  * Scan all interfaces to detect changes as well as new and deleted interfaces
269  */
270 static void
271 initifs()
272 {
273 	int	n;
274 	int	af;
275 	char	*cp;
276 	char	*buf;
277 	int	numifs;
278 	struct lifnum	lifn;
279 	struct lifconf	lifc;
280 	struct lifreq	*lifr;
281 	struct logint	*li;
282 	struct phyint_instance *pii;
283 	struct phyint_instance *next_pii;
284 	char	pi_name[LIFNAMSIZ + 1];
285 	boolean_t exists;
286 	struct phyint	*pi;
287 
288 	if (debug & D_PHYINT)
289 		logdebug("initifs: Scanning interfaces\n");
290 
291 	last_initifs_time = getcurrenttime();
292 
293 	/*
294 	 * Mark the interfaces so that we can find phyints and logints
295 	 * which have disappeared from the kernel. pii_process() and
296 	 * logint_init_from_k() will set {pii,li}_in_use when they find
297 	 * the interface in the kernel. Also, clear dupaddr bit on probe
298 	 * logint. check_addr_unique() will set the dupaddr bit on the
299 	 * probe logint, if the testaddress is not unique.
300 	 */
301 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
302 		pii->pii_in_use = 0;
303 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
304 			li->li_in_use = 0;
305 			if (pii->pii_probe_logint == li)
306 				li->li_dupaddr = 0;
307 		}
308 	}
309 
310 	lifn.lifn_family = AF_UNSPEC;
311 	lifn.lifn_flags = 0;
312 	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
313 		logperror("initifs: ioctl (get interface numbers)");
314 		return;
315 	}
316 	numifs = lifn.lifn_count;
317 
318 	buf = (char *)calloc(numifs, sizeof (struct lifreq));
319 	if (buf == NULL) {
320 		logperror("initifs: calloc");
321 		return;
322 	}
323 
324 	lifc.lifc_family = AF_UNSPEC;
325 	lifc.lifc_flags = 0;
326 	lifc.lifc_len = numifs * sizeof (struct lifreq);
327 	lifc.lifc_buf = buf;
328 
329 	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
330 		/*
331 		 * EINVAL is commonly encountered, when things change
332 		 * underneath us rapidly, (eg. at boot, when new interfaces
333 		 * are plumbed successively) and the kernel finds the buffer
334 		 * size we passed as too small. We will retry again
335 		 * when we see the next routing socket msg, or at worst after
336 		 * IF_SCAN_INTERVAL ms.
337 		 */
338 		if (errno != EINVAL) {
339 			logperror("initifs: ioctl"
340 			    " (get interface configuration)");
341 		}
342 		free(buf);
343 		return;
344 	}
345 
346 	lifr = (struct lifreq *)lifc.lifc_req;
347 
348 	/*
349 	 * For each lifreq returned by SIOGGLIFCONF, call pii_process()
350 	 * and get the state of the corresponding phyint_instance. If it is
351 	 * successful, then call logint_init_from_k() to get the state of the
352 	 * logint.
353 	 */
354 	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
355 		af = lifr->lifr_addr.ss_family;
356 
357 		/*
358 		 * Need to pass a phyint name to pii_process. Insert the
359 		 * null where the ':' IF_SEPARATOR is found in the logical
360 		 * name.
361 		 */
362 		(void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name));
363 		pi_name[sizeof (pi_name) - 1] = '\0';
364 		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
365 			*cp = '\0';
366 
367 		exists = pii_process(af, pi_name, &pii);
368 		if (exists) {
369 			/* The phyint is fine. So process the logint */
370 			logint_init_from_k(pii, lifr->lifr_name);
371 		}
372 		check_addr_unique(af, lifr->lifr_name);
373 	}
374 
375 	free(buf);
376 
377 	/*
378 	 * If the test address is now unique, and if it was not unique
379 	 * previously,	clear the li_dupaddrmsg_printed flag and log a
380 	 * recovery message
381 	 */
382 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
383 		struct logint *li;
384 		char abuf[INET6_ADDRSTRLEN];
385 
386 		li = pii->pii_probe_logint;
387 		if ((li != NULL) && !li->li_dupaddr &&
388 		    li->li_dupaddrmsg_printed) {
389 			logerr("Test address %s is unique; enabling probe-"
390 			    "based failure detection\n",
391 			    pr_addr(pii->pii_af, li->li_addr, abuf,
392 				sizeof (abuf)));
393 			li->li_dupaddrmsg_printed = 0;
394 		}
395 	}
396 
397 	/*
398 	 * Scan for phyints and logints that have disappeared from the
399 	 * kernel, and delete them.
400 	 */
401 	pii = phyint_instances;
402 
403 	while (pii != NULL) {
404 		next_pii = pii->pii_next;
405 		check_if_removed(pii);
406 		pii = next_pii;
407 	}
408 
409 	/*
410 	 * Select a test address for sending probes on each phyint instance
411 	 */
412 	select_test_ifs();
413 
414 	/*
415 	 * Handle link up/down notifications from the NICs.
416 	 */
417 	process_link_state_changes();
418 
419 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
420 		/*
421 		 * If this is a case of group failure, we don't have much
422 		 * to do until the group recovers again.
423 		 */
424 		if (GROUP_FAILED(pi->pi_group))
425 			continue;
426 
427 		/*
428 		 * Try/Retry any pending failovers / failbacks, that did not
429 		 * not complete, or that could not be initiated previously.
430 		 * This implements the 3 invariants described in the big block
431 		 * comment at the beginning of probe.c
432 		 */
433 		if (pi->pi_flags & IFF_INACTIVE) {
434 			if (!pi->pi_empty)
435 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
436 		} else {
437 			struct phyint_instance *pii;
438 
439 			pii = pi->pi_v4;
440 			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
441 				pii = pi->pi_v6;
442 			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
443 				continue;
444 			/*
445 			 * It is possible that the phyint has started
446 			 * receiving packets, after it has been marked
447 			 * PI_FAILED. Don't initiate failover, if the
448 			 * phyint has started recovering. failure_state()
449 			 * captures this check. A similar logic is used
450 			 * for failback/repair case.
451 			 */
452 			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
453 			    (failure_state(pii) == PHYINT_FAILURE)) {
454 				(void) try_failover(pi, FAILOVER_NORMAL);
455 			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
456 				if (try_failback(pi, _B_FALSE) !=
457 				    IPMP_FAILURE) {
458 					(void) change_lif_flags(pi, IFF_FAILED,
459 					    _B_FALSE);
460 					/* Per state diagram */
461 					pi->pi_empty = 0;
462 				}
463 			}
464 		}
465 	}
466 }
467 
468 /*
469  * Check that test/probe addresses are always unique. link-locals and
470  * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER)
471  * address can produce unexpected results. Log an error and alert the user.
472  */
473 static void
474 check_addr_unique(int af, char *name)
475 {
476 	struct lifreq	lifr;
477 	struct phyint	*pi;
478 	struct in6_addr	addr;
479 	struct phyint_instance	*pii;
480 	struct sockaddr_in	*sin;
481 	struct sockaddr_in6	*sin6;
482 	int ifsock;
483 	char abuf[INET6_ADDRSTRLEN];
484 
485 	/* Get the socket for doing ioctls */
486 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
487 
488 	(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
489 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
490 	/*
491 	 * Get the address corresponding to 'name'. We cannot
492 	 * do a logint lookup in our tables, because, not all logints
493 	 * in the system are tracked by mpathd. (eg. things not in a group)
494 	 */
495 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
496 		if (errno == ENXIO) {
497 			/* Interface has vanished */
498 			return;
499 		} else {
500 			logperror("ioctl (get addr)");
501 			return;
502 		}
503 	}
504 
505 	if (af == AF_INET) {
506 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
507 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
508 	} else {
509 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
510 		addr = sin6->sin6_addr;
511 	}
512 
513 	/*
514 	 * Does the address 'addr' match any known test address ? If so
515 	 * it is a duplicate, unless we are looking at the same logint
516 	 */
517 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
518 		pii = PHYINT_INSTANCE(pi, af);
519 		if (pii == NULL || pii->pii_probe_logint == NULL)
520 			continue;
521 
522 		if (!IN6_ARE_ADDR_EQUAL(&addr,
523 		    &pii->pii_probe_logint->li_addr)) {
524 			continue;
525 		}
526 
527 		if (strncmp(pii->pii_probe_logint->li_name, name,
528 		    sizeof (pii->pii_probe_logint->li_name)) == 0) {
529 			continue;
530 		}
531 
532 		/*
533 		 * This test address is not unique. Set the dupaddr bit
534 		 */
535 		pii->pii_probe_logint->li_dupaddr = 1;
536 
537 		/*
538 		 * Log an error message if not already logged
539 		 */
540 		if (pii->pii_probe_logint->li_dupaddrmsg_printed)
541 			continue;
542 
543 		logerr("Test address %s is not unique; disabling "
544 		    "probe-based failure detection\n",
545 		    pr_addr(af, addr, abuf, sizeof (abuf)));
546 
547 		pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
548 	}
549 }
550 
/*
 * Flag requirements for the logint used for probing (pii_probe_logint),
 * expressed in terms of its li_flags:
 *
 *	IFF_NOFAILOVER	- must be set (except in singleton group case)
 *	IFF_UP		- must be set
 *	IFF_NOXMIT	- must be clear
 *	IFF_NOLOCAL	- must be clear
 *	IFF_DEPRECATED	- preferably set (for IPv4)
 *
 * BEST_FLAG_SET and CLEAR_FLAG_SET name the flags that should be set and
 * the flags that must be clear. The TEST_* macros are the masks applied
 * to li_flags before comparing:
 *
 *	(flags & TEST_CLEAR_FLAG_SET) == 0		no forbidden flag
 *	(flags & TEST_MINIMAL_FLAG_SET) == IFF_UP	usable test address
 *	(flags & TEST_BEST_FLAG_SET) == BEST_FLAG_SET	ideal test address
 */
#define	BEST_FLAG_SET	(IFF_NOFAILOVER | IFF_UP | IFF_DEPRECATED)
#define	CLEAR_FLAG_SET	(IFF_NOXMIT | IFF_NOLOCAL)
#define	TEST_CLEAR_FLAG_SET	CLEAR_FLAG_SET
#define	TEST_MINIMAL_FLAG_SET	(IFF_UP | CLEAR_FLAG_SET)
#define	TEST_BEST_FLAG_SET	(BEST_FLAG_SET | CLEAR_FLAG_SET)
565 
566 /*
567  * Stop probing an interface.  Called when an interface is offlined.
568  * The probe socket is closed on each interface instance, and the
569  * interface state set to PI_OFFLINE.
570  */
571 static void
572 stop_probing(struct phyint *pi)
573 {
574 	struct phyint_instance *pii;
575 
576 	pii = pi->pi_v4;
577 	if (pii != NULL) {
578 		if (pii->pii_probe_sock != -1)
579 			close_probe_socket(pii, _B_TRUE);
580 		pii->pii_probe_logint = NULL;
581 	}
582 
583 	pii = pi->pi_v6;
584 	if (pii != NULL) {
585 		if (pii->pii_probe_sock != -1)
586 			close_probe_socket(pii, _B_TRUE);
587 		pii->pii_probe_logint = NULL;
588 	}
589 
590 	phyint_chstate(pi, PI_OFFLINE);
591 }
592 
593 /*
594  * Do the test address selection for each phyint instance. Pick an
595  * IFF_NOFAILOVER address as test address. For singleton case,
596  * if user didn't configure an IFF_NOFAILOVER address, we will pick a
597  * normal address as test address. For (multiple adapter) groups,
598  * user is required to configure IFF_NOFAILOVER test address. Call
599  * phyint_inst_sockinit() to complete the initializations.
600  */
601 static void
602 select_test_ifs(void)
603 {
604 	struct phyint		*pi;
605 	struct phyint_instance	*pii;
606 	struct phyint_instance	*next_pii;
607 	struct logint	*li;
608 	struct logint	*test_logint;
609 	boolean_t target_scan_reqd = _B_FALSE;
610 	struct target *tg;
611 
612 	if (debug & D_PHYINT)
613 		logdebug("select_test_ifs\n");
614 
615 	/*
616 	 * For each phyint instance, do the test address selection
617 	 */
618 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
619 		next_pii = pii->pii_next;
620 		/*
621 		 * An interface that is offline, should not be probed.
622 		 * Offline interfaces should always in PI_OFFLINE state,
623 		 * unless some other entity has set the offline flag.
624 		 */
625 		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
626 			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
627 				logerr("shouldn't be probing offline"
628 					" interface %s (state is: %u)."
629 					" Stopping probes.\n",
630 					pii->pii_phyint->pi_name,
631 					pii->pii_phyint->pi_state);
632 				stop_probing(pii->pii_phyint);
633 			}
634 			continue;
635 		}
636 
637 		test_logint = pii->pii_probe_logint;
638 
639 		if (test_logint != NULL) {
640 			if ((test_logint->li_flags & TEST_BEST_FLAG_SET)
641 			    == BEST_FLAG_SET)
642 				continue;
643 
644 			/*
645 			 * If user configures IFF_NOXMIT or IFF_NOLOCAL
646 			 * flags on test addresses after in.mpathd has
647 			 * has started, the daemon aborts. In future
648 			 * this can be better handling, i.e. instead
649 			 * of abort the daemon, a more appropriate
650 			 * action may be issuing a warning and choose
651 			 * a different test address.
652 			 */
653 			assert((test_logint->li_flags & TEST_CLEAR_FLAG_SET)
654 			    == 0);
655 		}
656 
657 		/*
658 		 * Walk the logints of this phyint instance, and select
659 		 * the best available test address
660 		 */
661 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
662 			/*
663 			 * Skip any IPv6 logints that are not link-local,
664 			 * since we should always have a link-local address
665 			 * anyway and in6_data() expects link-local replies.
666 			 */
667 			if (pii->pii_af == AF_INET6 &&
668 			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
669 				continue;
670 
671 			if ((li->li_flags & TEST_MINIMAL_FLAG_SET) == IFF_UP) {
672 				/*
673 				 * Now we have a testaddress, that satisfies
674 				 * the minimal properties.
675 				 */
676 				if ((li->li_flags & TEST_BEST_FLAG_SET)
677 				    == BEST_FLAG_SET) {
678 					/*
679 					 * This is the best possible address.
680 					 * So break, and continue to the
681 					 * next phyint
682 					 */
683 					test_logint = li;
684 					break;
685 				}
686 				if ((test_logint == NULL) ||
687 				    (!(test_logint->li_flags &
688 				    IFF_NOFAILOVER) &&
689 				    (li->li_flags & IFF_NOFAILOVER)))
690 					/*
691 					 * This is a possible candidate,
692 					 * unless we find a better one.
693 					 */
694 					test_logint = li;
695 			}
696 		}
697 
698 		/*
699 		 * If we've gone from a singleton group to a multiple adapter
700 		 * group, and we haven't found an IFF_NOFAILOVER test address
701 		 * by now, the old test address is no longer valid. If we are
702 		 * not dealing with a singleton group, and the above test
703 		 * address selection loop has selected a non IFF_NOFAILOVER
704 		 * address as a candidate, we will correct that here.
705 		 */
706 		if ((test_logint != NULL) &&
707 		    !SINGLETON_GROUP(pii->pii_phyint) &&
708 		    !(test_logint->li_flags & IFF_NOFAILOVER)) {
709 			test_logint = NULL;
710 			if (pii->pii_probe_sock != -1)
711 				close_probe_socket(pii, _B_TRUE);
712 			pii->pii_probe_logint = NULL;
713 		}
714 
715 		if (test_logint == NULL) {
716 			/*
717 			 * We don't have a test address. Don't print an
718 			 * error message immediately. check_config() will
719 			 * take care of it. Zero out the probe stats array
720 			 * since it is no longer relevant. Optimize by
721 			 * checking if it is already zeroed out.
722 			 */
723 			int pr_ndx;
724 
725 			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
726 			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
727 				clear_pii_probe_stats(pii);
728 				reset_crtt_all(pii->pii_phyint);
729 			}
730 			continue;
731 		} else if (test_logint == pii->pii_probe_logint) {
732 			/*
733 			 * If we didn't find any new test addr, go to the
734 			 * next phyint.
735 			 */
736 			continue;
737 		}
738 
739 		/*
740 		 * The phyint is either being assigned a new testaddr
741 		 * or is being assigned a testaddr for the 1st time.
742 		 * Need to initialize the phyint socket
743 		 */
744 		pii->pii_probe_logint = test_logint;
745 		if (!phyint_inst_sockinit(pii)) {
746 			if (debug & D_PHYINT) {
747 				logdebug("select_test_ifs: "
748 				    "phyint_sockinit failed\n");
749 			}
750 			phyint_inst_delete(pii);
751 			continue;
752 		}
753 
754 		/*
755 		 * This phyint instance is now enabled for probes; this
756 		 * impacts our state machine in two ways:
757 		 *
758 		 * 1. If we're probe *capable* as well (i.e., we have
759 		 *    probe targets) and the interface is in PI_NOTARGETS,
760 		 *    then transition to PI_RUNNING.
761 		 *
762 		 * 2. If we're not probe capable, and the other phyint
763 		 *    instance is also not probe capable, and we were in
764 		 *    PI_RUNNING, then transition to PI_NOTARGETS.
765 		 *
766 		 * Also see the state diagram in mpd_probe.c.
767 		 */
768 		if (PROBE_CAPABLE(pii)) {
769 			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
770 				phyint_chstate(pii->pii_phyint, PI_RUNNING);
771 		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
772 			if (pii->pii_phyint->pi_state == PI_RUNNING)
773 				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
774 		}
775 
776 		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
777 			tg = pii->pii_targets;
778 			if (tg != NULL)
779 				target_delete(tg);
780 			assert(pii->pii_targets == NULL);
781 			assert(pii->pii_target_next == NULL);
782 			assert(pii->pii_ntargets == 0);
783 			target_create(pii, test_logint->li_dstaddr,
784 			    _B_TRUE);
785 		}
786 
787 		/*
788 		 * If no targets are currently known for this phyint
789 		 * we need to call init_router_targets. Since
790 		 * init_router_targets() initializes the list of targets
791 		 * for all phyints it is done below the loop.
792 		 */
793 		if (pii->pii_targets == NULL)
794 			target_scan_reqd = _B_TRUE;
795 
796 		/*
797 		 * Start the probe timer for this instance.
798 		 */
799 		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
800 			start_timer(pii);
801 			pii->pii_basetime_inited = 1;
802 		}
803 	}
804 
805 	/*
806 	 * Check the interface list for any interfaces that are marked
807 	 * PI_FAILED but no longer enabled to send probes, and call
808 	 * phyint_check_for_repair() to see if the link now indicates that the
809 	 * interface should be repaired.  Also see the state diagram in
810 	 * mpd_probe.c.
811 	 */
812 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
813 		if (pi->pi_state == PI_FAILED &&
814 		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
815 			phyint_check_for_repair(pi);
816 		}
817 	}
818 
819 	/*
820 	 * Try to populate the target list. init_router_targets populates
821 	 * the target list from the routing table. If our target list is
822 	 * still empty, init_host_targets adds host targets based on the
823 	 * host target list of other phyints in the group.
824 	 */
825 	if (target_scan_reqd) {
826 		init_router_targets();
827 		init_host_targets();
828 	}
829 }
830 
831 /*
832  * Check phyint group configuration, to detect any inconsistencies,
833  * and log an error message. This is called from runtimeouts every
834  * 20 secs. But the error message is displayed once. If the
835  * consistency is resolved by the admin, a recovery message is displayed
836  * once.
837  */
838 static void
839 check_config(void)
840 {
841 	struct phyint_group *pg;
842 	struct phyint *pi;
843 	boolean_t v4_in_group;
844 	boolean_t v6_in_group;
845 
846 	/*
847 	 * All phyints of a group must be homogenous to ensure that
848 	 * failover or failback can be done. If any phyint in a group
849 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
850 	 * Do a similar check for IPv6.
851 	 */
852 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
853 		if (pg == phyint_anongroup)
854 			continue;
855 
856 		v4_in_group = _B_FALSE;
857 		v6_in_group = _B_FALSE;
858 		/*
859 		 * 1st pass. Determine if at least 1 phyint in the group
860 		 * has IPv4 plumbed and if so set v4_in_group to true.
861 		 * Repeat similarly for IPv6.
862 		 */
863 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
864 			if (pi->pi_v4 != NULL)
865 				v4_in_group = _B_TRUE;
866 			if (pi->pi_v6 != NULL)
867 				v6_in_group = _B_TRUE;
868 		}
869 
870 		/*
871 		 * 2nd pass. If v4_in_group is true, check that phyint
872 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
873 		 * out a message the 1st time only.
874 		 */
875 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
876 			if (pi->pi_flags & IFF_OFFLINE)
877 				continue;
878 
879 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
880 				if (!pi->pi_cfgmsg_printed) {
881 					logerr("NIC %s of group %s is"
882 					    " not plumbed for IPv4 and may"
883 					    " affect failover capability\n",
884 					    pi->pi_name,
885 					    pi->pi_group->pg_name);
886 					pi->pi_cfgmsg_printed = 1;
887 				}
888 			} else if (v6_in_group == _B_TRUE &&
889 			    pi->pi_v6 == NULL) {
890 				if (!pi->pi_cfgmsg_printed) {
891 					logerr("NIC %s of group %s is"
892 					    " not plumbed for IPv6 and may"
893 					    " affect failover capability\n",
894 					    pi->pi_name,
895 					    pi->pi_group->pg_name);
896 					pi->pi_cfgmsg_printed = 1;
897 				}
898 			} else {
899 				/*
900 				 * The phyint matches the group configuration,
901 				 * if we have reached this point. If it was
902 				 * improperly configured earlier, log an
903 				 * error recovery message
904 				 */
905 				if (pi->pi_cfgmsg_printed) {
906 					logerr("NIC %s is now consistent with "
907 					    "group %s and failover capability "
908 					    "is restored\n", pi->pi_name,
909 					    pi->pi_group->pg_name);
910 					pi->pi_cfgmsg_printed = 0;
911 				}
912 			}
913 
914 		}
915 	}
916 
917 	/*
918 	 * In order to perform probe-based failure detection, a phyint must
919 	 * have at least 1 test/probe address for sending and receiving probes
920 	 * (either on IPv4 or IPv6 instance or both).  If no test address has
921 	 * been configured, notify the administrator, but continue on since we
922 	 * can still perform load spreading, along with "link up/down" based
923 	 * failure detection.
924 	 *
925 	 * Note: In the singleton group case, when user didn't configure
926 	 * a test address, the probe address is picked by this daemon.
927 	 */
928 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
929 		if (pi->pi_flags & IFF_OFFLINE)
930 			continue;
931 
932 		if ((pi->pi_v4 == NULL ||
933 		    pi->pi_v4->pii_probe_logint == NULL) &&
934 		    (pi->pi_v6 == NULL ||
935 		    pi->pi_v6->pii_probe_logint == NULL)) {
936 			if (!pi->pi_taddrmsg_printed) {
937 				logerr("No test address configured on "
938 				    "interface %s; disabling probe-based "
939 				    "failure detection on it\n", pi->pi_name);
940 				pi->pi_taddrmsg_printed = 1;
941 			}
942 		} else if (pi->pi_taddrmsg_printed) {
943 			logerr("Test address now configured on interface %s; "
944 			    "enabling probe-based failure detection on it\n",
945 			    pi->pi_name);
946 			pi->pi_taddrmsg_printed = 0;
947 		}
948 
949 	}
950 }
951 
952 /*
953  * Timer mechanism using relative time (in milliseconds) from the
954  * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
955  * will fire after TIMER_INFINITY milliseconds.
956  * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
957  * time values. Hence 2 consecutive timer events cannot be spaced farther
958  * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
959  * that can be passed for the delay parameter of timer_schedule()
960  */
961 static uint_t timer_next;	/* Currently scheduled timeout */
962 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
963 
964 static void
965 timer_init(void)
966 {
967 	timer_next = getcurrenttime() + TIMER_INFINITY;
968 	/*
969 	 * The call to run_timeouts() will get the timer started
970 	 * Since there are no phyints at this point, the timer will
971 	 * be set for IF_SCAN_INTERVAL ms.
972 	 */
973 	run_timeouts();
974 }
975 
976 /*
977  * Make sure the next SIGALRM occurs delay milliseconds from the current
978  * time if not earlier. We are interested only in time differences.
979  */
980 void
981 timer_schedule(uint_t delay)
982 {
983 	uint_t now;
984 	struct itimerval itimerval;
985 
986 	if (debug & D_TIMER)
987 		logdebug("timer_schedule(%u)\n", delay);
988 
989 	assert(delay <= TIMER_INFINITY);
990 
991 	now = getcurrenttime();
992 	if (delay == 0) {
993 		/* Minimum allowed delay */
994 		delay = 1;
995 	}
996 	/* Will this timer occur before the currently scheduled SIGALRM? */
997 	if (timer_active && TIME_GE(now + delay, timer_next)) {
998 		if (debug & D_TIMER) {
999 			logdebug("timer_schedule(%u) - no action: "
1000 			    "now %u next %u\n", delay, now, timer_next);
1001 		}
1002 		return;
1003 	}
1004 	timer_next = now + delay;
1005 
1006 	itimerval.it_value.tv_sec = delay / 1000;
1007 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
1008 	itimerval.it_interval.tv_sec = 0;
1009 	itimerval.it_interval.tv_usec = 0;
1010 	if (debug & D_TIMER) {
1011 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1012 		    delay, itimerval.it_value.tv_sec,
1013 		    itimerval.it_value.tv_usec);
1014 	}
1015 	timer_active = _B_TRUE;
1016 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1017 		logperror("timer_schedule: setitimer");
1018 		exit(2);
1019 	}
1020 }
1021 
1022 /*
1023  * Timer has fired. Determine when the next timer event will occur by asking
1024  * all the timer routines. Should not be called from a timer routine.
1025  */
1026 static void
1027 run_timeouts(void)
1028 {
1029 	uint_t next;
1030 	uint_t next_event_time;
1031 	struct phyint_instance *pii;
1032 	struct phyint_instance *next_pii;
1033 	static boolean_t timeout_running;
1034 
1035 	/* assert that recursive timeouts don't happen. */
1036 	assert(!timeout_running);
1037 
1038 	timeout_running = _B_TRUE;
1039 
1040 	if (debug & D_TIMER)
1041 		logdebug("run_timeouts()\n");
1042 
1043 	next = TIMER_INFINITY;
1044 
1045 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1046 		next_pii = pii->pii_next;
1047 		next_event_time = phyint_inst_timer(pii);
1048 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1049 			next = next_event_time;
1050 
1051 		if (debug & D_TIMER) {
1052 			logdebug("run_timeouts(%s %s): next scheduled for"
1053 			    " this phyint inst %u, next scheduled global"
1054 			    " %u ms\n",
1055 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1056 			    next_event_time, next);
1057 		}
1058 	}
1059 
1060 	/*
1061 	 * Make sure initifs() is called at least once every
1062 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1063 	 * with the kernel, in case we have missed any routing
1064 	 * socket messages.
1065 	 */
1066 	if (next > IF_SCAN_INTERVAL)
1067 		next = IF_SCAN_INTERVAL;
1068 
1069 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1070 		initifs();
1071 		check_config();
1072 	}
1073 
1074 	if (debug & D_TIMER)
1075 		logdebug("run_timeouts: %u ms\n", next);
1076 
1077 	timer_schedule(next);
1078 	timeout_running = _B_FALSE;
1079 }
1080 
/*
 * Pipe used to turn asynchronous signals into events: sig_handler()
 * writes the signal number to eventpipe_write, and setup_eventpipe()
 * registers eventpipe_read with poll_add().  Both are -1 until the pipe
 * has been created.
 */
static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
static int eventpipe_write = -1;
/* Set at the start of cleanup(); sig_handler() then ignores all signals. */
static boolean_t cleanup_started = _B_FALSE;
				/* Don't write to eventpipe if in cleanup */
1085 /*
1086  * Ensure that signals are processed synchronously with the rest of
1087  * the code by just writing a one character signal number on the pipe.
1088  * The poll loop will pick this up and process the signal event.
1089  */
1090 static void
1091 sig_handler(int signo)
1092 {
1093 	uchar_t buf = (uchar_t)signo;
1094 
1095 	/*
1096 	 * Don't write to pipe if cleanup has already begun. cleanup()
1097 	 * might have closed the pipe already
1098 	 */
1099 	if (cleanup_started)
1100 		return;
1101 
1102 	if (eventpipe_write == -1) {
1103 		logerr("sig_handler: no pipe found\n");
1104 		return;
1105 	}
1106 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1107 		logperror("sig_handler: write");
1108 }
1109 
1110 extern struct probes_missed probes_missed;
1111 
1112 /*
1113  * Pick up a signal "byte" from the pipe and process it.
1114  */
1115 static void
1116 in_signal(int fd)
1117 {
1118 	uchar_t buf;
1119 	uint64_t  sent, acked, lost, unacked, unknown;
1120 	struct phyint_instance *pii;
1121 	int pr_ndx;
1122 
1123 	switch (read(fd, &buf, sizeof (buf))) {
1124 	case -1:
1125 		logperror("in_signal: read");
1126 		exit(1);
1127 		/* NOTREACHED */
1128 	case 1:
1129 		break;
1130 	case 0:
1131 		logerr("in_signal: read end of file\n");
1132 		exit(1);
1133 		/* NOTREACHED */
1134 	default:
1135 		logerr("in_signal: read > 1\n");
1136 		exit(1);
1137 	}
1138 
1139 	if (debug & D_TIMER)
1140 		logdebug("in_signal() got %d\n", buf);
1141 
1142 	switch (buf) {
1143 	case SIGALRM:
1144 		if (debug & D_TIMER) {
1145 			uint_t now = getcurrenttime();
1146 
1147 			logdebug("in_signal(SIGALRM) delta %u\n",
1148 			    now - timer_next);
1149 		}
1150 		timer_active = _B_FALSE;
1151 		run_timeouts();
1152 		break;
1153 	case SIGUSR1:
1154 		logdebug("Printing configuration:\n");
1155 		/* Print out the internal tables */
1156 		phyint_inst_print_all();
1157 
1158 		/*
1159 		 * Print out the accumulated statistics about missed
1160 		 * probes (happens due to scheduling delay).
1161 		 */
1162 		logerr("Missed sending total of %d probes spread over"
1163 		    " %d occurrences\n", probes_missed.pm_nprobes,
1164 		    probes_missed.pm_ntimes);
1165 
1166 		/*
1167 		 * Print out the accumulated statistics about probes
1168 		 * that were sent.
1169 		 */
1170 		for (pii = phyint_instances; pii != NULL;
1171 		    pii = pii->pii_next) {
1172 			unacked = 0;
1173 			acked = pii->pii_cum_stats.acked;
1174 			lost = pii->pii_cum_stats.lost;
1175 			sent = pii->pii_cum_stats.sent;
1176 			unknown = pii->pii_cum_stats.unknown;
1177 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1178 				switch (pii->pii_probes[pr_ndx].pr_status) {
1179 				case PR_ACKED:
1180 					acked++;
1181 					break;
1182 				case PR_LOST:
1183 					lost++;
1184 					break;
1185 				case PR_UNACKED:
1186 					unacked++;
1187 					break;
1188 				}
1189 			}
1190 			logerr("\nProbe stats on (%s %s)\n"
1191 			    "Number of probes sent %lld\n"
1192 			    "Number of probe acks received %lld\n"
1193 			    "Number of probes/acks lost %lld\n"
1194 			    "Number of valid unacknowled probes %lld\n"
1195 			    "Number of ambiguous probe acks received %lld\n",
1196 			    AF_STR(pii->pii_af), pii->pii_name,
1197 			    sent, acked, lost, unacked, unknown);
1198 		}
1199 		break;
1200 	case SIGHUP:
1201 		logerr("SIGHUP: restart and reread config file\n");
1202 		cleanup();
1203 		(void) execv(argv0[0], argv0);
1204 		_exit(0177);
1205 		/* NOTREACHED */
1206 	case SIGINT:
1207 	case SIGTERM:
1208 	case SIGQUIT:
1209 		cleanup();
1210 		exit(0);
1211 		/* NOTREACHED */
1212 	default:
1213 		logerr("in_signal: unknown signal: %d\n", buf);
1214 	}
1215 }
1216 
1217 static void
1218 cleanup(void)
1219 {
1220 	struct phyint_instance *pii;
1221 	struct phyint_instance *next_pii;
1222 
1223 	/*
1224 	 * Make sure that we don't write to eventpipe in
1225 	 * sig_handler() if any signal notably SIGALRM,
1226 	 * occurs after we close the eventpipe descriptor below
1227 	 */
1228 	cleanup_started = _B_TRUE;
1229 
1230 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1231 		next_pii = pii->pii_next;
1232 		phyint_inst_delete(pii);
1233 	}
1234 
1235 	(void) close(ifsock_v4);
1236 	(void) close(ifsock_v6);
1237 	(void) close(rtsock_v4);
1238 	(void) close(rtsock_v6);
1239 	(void) close(lsock_v4);
1240 	(void) close(lsock_v6);
1241 	(void) close(0);
1242 	(void) close(1);
1243 	(void) close(2);
1244 	(void) close(mibfd);
1245 	(void) close(eventpipe_read);
1246 	(void) close(eventpipe_write);
1247 }
1248 
1249 /*
1250  * Create pipe for signal delivery and set up signal handlers.
1251  */
1252 static void
1253 setup_eventpipe(void)
1254 {
1255 	int fds[2];
1256 	struct sigaction act;
1257 
1258 	if ((pipe(fds)) < 0) {
1259 		logperror("setup_eventpipe: pipe");
1260 		exit(1);
1261 	}
1262 	eventpipe_read = fds[0];
1263 	eventpipe_write = fds[1];
1264 	if (poll_add(eventpipe_read) == -1) {
1265 		exit(1);
1266 	}
1267 
1268 	act.sa_handler = sig_handler;
1269 	act.sa_flags = SA_RESTART;
1270 	(void) sigaction(SIGALRM, &act, NULL);
1271 
1272 	(void) sigset(SIGHUP, sig_handler);
1273 	(void) sigset(SIGUSR1, sig_handler);
1274 	(void) sigset(SIGTERM, sig_handler);
1275 	(void) sigset(SIGINT, sig_handler);
1276 	(void) sigset(SIGQUIT, sig_handler);
1277 }
1278 
/*
 * Open a non-blocking routing socket for address family `af' and register
 * it with poll_add().  The socket is used to receive RTM_IFINFO messages.
 *
 * Returns the socket descriptor.  Any failure is fatal: the daemon exits
 * with status 1.
 */
static int
setup_rtsock(int af)
{
	int	sock;
	int	oflags;

	sock = socket(PF_ROUTE, SOCK_RAW, af);
	if (sock == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	/* Switch the socket to non-blocking mode. */
	oflags = fcntl(sock, F_GETFL, 0);
	if (oflags < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		goto fail;
	}
	if (fcntl(sock, F_SETFL, oflags | O_NONBLOCK) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		goto fail;
	}

	if (poll_add(sock) == -1)
		goto fail;

	return (sock);

fail:
	(void) close(sock);
	exit(1);
	/* NOTREACHED */
}
1309 
/*
 * Process an RTM_IFINFO message received on a routing socket.
 *
 * ifm	- the message; its trailing sockaddr_dl is modified in place (the
 *	  interface name is null terminated).
 * type	- AF_INET or AF_INET6, according to the routing socket on which
 *	  the message arrived; selects which phyint instance is updated.
 *
 * The return value indicates whether a full interface scan is required.
 * Link up/down notifications from the NICs are reflected in the
 * IFF_RUNNING flag.  If just the state of the IFF_RUNNING interface flag
 * has changed, a full interface scan isn't required and _B_FALSE is
 * returned; in every other case (including an unknown interface or a
 * missing instance) _B_TRUE is returned.
 */
static boolean_t
process_rtm_ifinfo(if_msghdr_t *ifm, int type)
{
	struct sockaddr_dl *sdl;
	struct phyint *pi;
	uint64_t old_flags;
	struct phyint_instance *pii;

	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);

	/*
	 * The sockaddr_dl structure is located directly after the
	 * if_msghdr_t structure.  However, at the time of writing, the size
	 * of the if_msghdr_t structure is different on 32 and 64 bit
	 * kernels, due to the presence of a timeval structure, which
	 * contains longs, in the if_data structure.  Since we know where the
	 * message ends, we work backwards from the end to get the start of
	 * the sockaddr_dl structure.
	 */
	/*LINTED*/
	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
		sizeof (struct sockaddr_dl));

	assert(sdl->sdl_family == AF_LINK);

	/*
	 * The interface name is in sdl_data.
	 * RTM_IFINFO messages are only generated for logical interface
	 * zero, so there is no colon and logical interface number to
	 * strip from the name.  The name is not null terminated, but
	 * there should be enough space in sdl_data to add the null.
	 * If there isn't, give up on this message and ask for a full scan.
	 */
	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: "
				"phyint name too long\n");
		return (_B_TRUE);
	}
	sdl->sdl_data[sdl->sdl_nlen] = 0;

	/* An interface we don't know about yet: a full scan will find it. */
	pi = phyint_lookup(sdl->sdl_data);
	if (pi == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint lookup failed"
				" for %s\n", sdl->sdl_data);
		return (_B_TRUE);
	}

	/*
	 * We want to try and avoid doing a full interface scan for
	 * link state notifications from the NICs, as indicated
	 * by the state of the IFF_RUNNING flag.  If just the
	 * IFF_RUNNING flag has changed state, the link state changes
	 * are processed without a full scan.
	 * If there is both an IPv4 and IPv6 instance associated with
	 * the physical interface, we will get an RTM_IFINFO message
	 * for each instance.  If we just maintained a single copy of
	 * the physical interface flags, it would appear that no flags
	 * had changed when the second message is processed, leading us
	 * to believe that the message wasn't generated by a flags change,
	 * and that a full interface scan is required.
	 * To get around this problem, two additional copies of the flags
	 * are kept, one copy for each instance.  These are only used in
	 * this routine.  At any one time, all three copies of the flags
	 * should be identical except for the IFF_RUNNING flag.  The
	 * copy of the flags in the "phyint" structure is always up to
	 * date.
	 */
	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
	if (pii == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: no instance of address "
			    "family %s for %s\n", AF_STR(type), pi->pi_name);
		return (_B_TRUE);
	}

	/*
	 * Remember this instance's previous view of the flags, then record
	 * the new flags in both the instance and the phyint.
	 */
	old_flags = pii->pii_flags;
	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
	pi->pi_flags = pii->pii_flags;

	if (debug & D_LINKNOTE) {
		logdebug("process_rtm_ifinfo: %s address family: %s, "
		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
		    AF_STR(type), old_flags, pi->pi_flags);
	}

	/*
	 * If IFF_STANDBY has changed, indicate that the interface has changed
	 * types.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
		phyint_newtype(pi);

	/*
	 * If IFF_INACTIVE has been set, then no data addresses should be
	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
	 * move previously failed-over addresses back to it, provided it is
	 * not failed.  For details, see the state diagram in mpd_probe.c.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
		if (pii->pii_flags & IFF_INACTIVE) {
			/* IFF_INACTIVE is only expected on standbys. */
			assert(pii->pii_flags & IFF_STANDBY);
			if (!pi->pi_empty) {
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
			}
		} else {
			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				pi->pi_empty = 0;
				(void) try_failback(pi, _B_FALSE);
			}
		}
	}

	/*
	 * Has just the IFF_RUNNING flag changed state ?  Note that the
	 * comparison is for exact equality, so a message that carries no
	 * flag change at all also takes the full scan path below.
	 */
	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
		struct phyint_instance *pii_other;
		/*
		 * It wasn't just a link state change.  Update
		 * the other instance's copy of the flags.
		 */
		pii_other = phyint_inst_other(pii);
		if (pii_other != NULL)
			pii_other->pii_flags = pii->pii_flags;
		return (_B_TRUE);
	}

	return (_B_FALSE);
}
1445 
1446 /*
1447  * Retrieve as many routing socket messages as possible, and try to
1448  * empty the routing sockets. Initiate full scan of targets or interfaces
1449  * as needed.
1450  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1451  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1452  */
1453 static void
1454 process_rtsock(int rtsock_v4, int rtsock_v6)
1455 {
1456 	int	nbytes;
1457 	int64_t msg[2048 / 8];
1458 	struct rt_msghdr *rtm;
1459 	boolean_t need_if_scan = _B_FALSE;
1460 	boolean_t need_rt_scan = _B_FALSE;
1461 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1462 	int type;
1463 
1464 	/* Read as many messages as possible and try to empty the sockets */
1465 	for (type = AF_INET; ; type = AF_INET6) {
1466 		for (;;) {
1467 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1468 				rtsock_v6, msg, sizeof (msg));
1469 			if (nbytes <= 0) {
1470 				/* No more messages */
1471 				break;
1472 			}
1473 			rtm = (struct rt_msghdr *)msg;
1474 			if (rtm->rtm_version != RTM_VERSION) {
1475 				logerr("process_rtsock: version %d "
1476 				    "not understood\n", rtm->rtm_version);
1477 				break;
1478 			}
1479 
1480 			if (debug & D_PHYINT) {
1481 				logdebug("process_rtsock: message %d\n",
1482 				    rtm->rtm_type);
1483 			}
1484 
1485 			switch (rtm->rtm_type) {
1486 			case RTM_NEWADDR:
1487 			case RTM_DELADDR:
1488 				/*
1489 				 * Some logical interface has changed,
1490 				 * have to scan everything to determine
1491 				 * what actually changed.
1492 				 */
1493 				need_if_scan = _B_TRUE;
1494 				break;
1495 
1496 			case RTM_IFINFO:
1497 				rtm_ifinfo_seen = _B_TRUE;
1498 				need_if_scan |=
1499 					process_rtm_ifinfo((if_msghdr_t *)rtm,
1500 					type);
1501 				break;
1502 
1503 			case RTM_ADD:
1504 			case RTM_DELETE:
1505 			case RTM_CHANGE:
1506 			case RTM_OLDADD:
1507 			case RTM_OLDDEL:
1508 				need_rt_scan = _B_TRUE;
1509 				break;
1510 
1511 			default:
1512 				/* Not interesting */
1513 				break;
1514 			}
1515 		}
1516 		if (type == AF_INET6)
1517 			break;
1518 	}
1519 
1520 	if (need_if_scan) {
1521 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1522 			logdebug("process_rtsock: synchronizing with kernel\n");
1523 		initifs();
1524 	} else if (rtm_ifinfo_seen) {
1525 		if (debug & D_LINKNOTE)
1526 			logdebug("process_rtsock: "
1527 			    "link up/down notification(s) seen\n");
1528 		process_link_state_changes();
1529 	}
1530 
1531 	if (need_rt_scan)
1532 		init_router_targets();
1533 }
1534 
1535 /*
1536  * Look if the phyint instance or one of its logints have been removed from
1537  * the kernel and take appropriate action.
1538  * Uses {pii,li}_in_use.
1539  */
1540 static void
1541 check_if_removed(struct phyint_instance *pii)
1542 {
1543 	struct logint *li;
1544 	struct logint *next_li;
1545 
1546 	/* Detect phyints that have been removed from the kernel. */
1547 	if (!pii->pii_in_use) {
1548 		logtrace("%s %s has been removed from kernel\n",
1549 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1550 		phyint_inst_delete(pii);
1551 	} else {
1552 		/* Detect logints that have been removed. */
1553 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1554 			next_li = li->li_next;
1555 			if (!li->li_in_use) {
1556 				logint_delete(li);
1557 			}
1558 		}
1559 	}
1560 }
1561 
1562 /*
1563  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1564  * tables defined by mib2.h. Parse the returned data and extract
1565  * the 'routing' information table. Process the 'routing' table
1566  * to get the list of known onlink routers, and update our database.
1567  * These onlink routers will serve as our probe targets.
1568  * Returns false, if any system calls resulted in errors, true otherwise.
1569  */
1570 static boolean_t
1571 update_router_list(int fd)
1572 {
1573 	union {
1574 		char	ubuf[1024];
1575 		union T_primitives uprim;
1576 	} buf;
1577 
1578 	int			flags;
1579 	struct strbuf		ctlbuf;
1580 	struct strbuf		databuf;
1581 	struct T_optmgmt_req	*tor;
1582 	struct T_optmgmt_ack	*toa;
1583 	struct T_error_ack	*tea;
1584 	struct opthdr		*optp;
1585 	struct opthdr		*req;
1586 	int			status;
1587 	t_scalar_t		prim;
1588 
1589 	tor = (struct T_optmgmt_req *)&buf;
1590 
1591 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1592 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1593 	tor->OPT_length = sizeof (struct opthdr);
1594 	tor->MGMT_flags = T_CURRENT;
1595 
1596 	req = (struct opthdr *)&tor[1];
1597 	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
1598 	req->name  = 0;
1599 	req->len   = 0;
1600 
1601 	ctlbuf.buf = (char *)&buf;
1602 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1603 	ctlbuf.maxlen = sizeof (buf);
1604 	flags = 0;
1605 	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
1606 		logperror("update_router_list: putmsg(ctl)");
1607 		return (_B_FALSE);
1608 	}
1609 
1610 	/*
1611 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1612 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1613 	 * a control and data part. The control part contains a struct
1614 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1615 	 * the level, name and length of the data in the data part. The
1616 	 * data part contains the actual table data. The last message
1617 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1618 	 * single option with zero optlen.
1619 	 */
1620 
1621 	for (;;) {
1622 		/*
1623 		 * Go around this loop once for each table. Ignore
1624 		 * all tables except the routing information table.
1625 		 */
1626 		flags = 0;
1627 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1628 		if (status < 0) {
1629 			if (errno == EINTR)
1630 				continue;
1631 			logperror("update_router_list: getmsg(ctl)");
1632 			return (_B_FALSE);
1633 		}
1634 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1635 			logerr("update_router_list: ctlbuf.len %d\n",
1636 			    ctlbuf.len);
1637 			return (_B_FALSE);
1638 		}
1639 
1640 		prim = buf.uprim.type;
1641 
1642 		switch (prim) {
1643 
1644 		case T_ERROR_ACK:
1645 			tea = &buf.uprim.error_ack;
1646 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1647 				logerr("update_router_list: T_ERROR_ACK"
1648 				    " ctlbuf.len %d\n", ctlbuf.len);
1649 				return (_B_FALSE);
1650 			}
1651 			logerr("update_router_list: T_ERROR_ACK:"
1652 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1653 			    tea->TLI_error, tea->UNIX_error);
1654 			return (_B_FALSE);
1655 
1656 		case T_OPTMGMT_ACK:
1657 			toa = &buf.uprim.optmgmt_ack;
1658 			optp = (struct opthdr *)&toa[1];
1659 			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
1660 				logerr("update_router_list: ctlbuf.len %d\n",
1661 				    ctlbuf.len);
1662 				return (_B_FALSE);
1663 			}
1664 			if (toa->MGMT_flags != T_SUCCESS) {
1665 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1666 				    toa->MGMT_flags);
1667 				return (_B_FALSE);
1668 			}
1669 			break;
1670 
1671 		default:
1672 			logerr("update_router_list: unknown primitive %ld\n",
1673 			    prim);
1674 			return (_B_FALSE);
1675 		}
1676 
1677 		/* Process the T_OPGMGMT_ACK below */
1678 		assert(prim == T_OPTMGMT_ACK);
1679 
1680 		switch (status) {
1681 		case 0:
1682 			/*
1683 			 * We have reached the end of this T_OPTMGMT_ACK
1684 			 * message. If this is the last message i.e EOD,
1685 			 * return, else process the next T_OPTMGMT_ACK msg.
1686 			 */
1687 			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
1688 			    sizeof (struct opthdr)) && optp->len == 0 &&
1689 			    optp->name == 0 && optp->level == 0) {
1690 				/*
1691 				 * This is the EOD message. Return
1692 				 */
1693 				return (_B_TRUE);
1694 			}
1695 			continue;
1696 
1697 		case MORECTL:
1698 		case MORECTL | MOREDATA:
1699 			/*
1700 			 * This should not happen. We should be able to read
1701 			 * the control portion in a single getmsg.
1702 			 */
1703 			logerr("update_router_list: MORECTL\n");
1704 			return (_B_FALSE);
1705 
1706 		case MOREDATA:
1707 			databuf.maxlen = optp->len;
1708 			/* malloc of 0 bytes is ok */
1709 			databuf.buf = malloc((size_t)optp->len);
1710 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1711 				logperror("update_router_list: malloc");
1712 				return (_B_FALSE);
1713 			}
1714 			databuf.len = 0;
1715 			flags = 0;
1716 			for (;;) {
1717 				status = getmsg(fd, NULL, &databuf, &flags);
1718 				if (status >= 0) {
1719 					break;
1720 				} else if (errno == EINTR) {
1721 					continue;
1722 				} else {
1723 					logperror("update_router_list:"
1724 					    " getmsg(data)");
1725 					free(databuf.buf);
1726 					return (_B_FALSE);
1727 				}
1728 			}
1729 
1730 			if (optp->level == MIB2_IP &&
1731 			    optp->name == MIB2_IP_ROUTE) {
1732 				/* LINTED */
1733 				ire_process_v4((mib2_ipRouteEntry_t *)
1734 				    databuf.buf, databuf.len);
1735 			} else if (optp->level == MIB2_IP6 &&
1736 			    optp->name == MIB2_IP6_ROUTE) {
1737 				/* LINTED */
1738 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1739 				    databuf.buf, databuf.len);
1740 			}
1741 			free(databuf.buf);
1742 		}
1743 	}
1744 	/* NOTREACHED */
1745 }
1746 
1747 /*
1748  * Examine the IPv4 routing table, for default routers. For each default
1749  * router, populate the list of targets of each phyint that is on the same
1750  * link as the default router
1751  */
1752 static void
1753 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1754 {
1755 	mib2_ipRouteEntry_t	*rp;
1756 	mib2_ipRouteEntry_t	*rp1;
1757 	struct	in_addr		nexthop_v4;
1758 	mib2_ipRouteEntry_t	*endp;
1759 
1760 	if (len == 0)
1761 		return;
1762 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1763 
1764 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1765 
1766 	/*
1767 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1768 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1769 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1770 	 * This is a potential target for probing, which we try to add
1771 	 * to the list of probe targets.
1772 	 */
1773 	for (rp = buf; rp < endp; rp++) {
1774 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1775 			continue;
1776 
1777 		/*  Get the nexthop address. */
1778 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1779 
1780 		/*
1781 		 * Get the nexthop address. Then determine the outgoing
1782 		 * interface, by examining all interface IREs, and picking the
1783 		 * match. We don't look at the interface specified in the route
1784 		 * because we need to add the router target on all matching
1785 		 * interfaces anyway; the goal is to avoid falling back to
1786 		 * multicast when some interfaces are in the same subnet but
1787 		 * not in the same group.
1788 		 */
1789 		for (rp1 = buf; rp1 < endp; rp1++) {
1790 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1791 				continue;
1792 			}
1793 
1794 			/*
1795 			 * Determine the interface IRE that matches the nexthop.
1796 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1797 			 */
1798 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1799 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1800 				/*
1801 				 * We found the interface ire
1802 				 */
1803 				router_add_v4(rp1, nexthop_v4);
1804 			}
1805 		}
1806 	}
1807 }
1808 
1809 void
1810 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1811 {
1812 	char *cp;
1813 	char ifname[LIFNAMSIZ + 1];
1814 	struct in6_addr	nexthop;
1815 	int len;
1816 
1817 	if (debug & D_TARGET)
1818 		logdebug("router_add_v4()\n");
1819 
1820 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1821 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1822 	ifname[len] = '\0';
1823 
1824 	if (ifname[0] == '\0')
1825 		return;
1826 
1827 	cp = strchr(ifname, IF_SEPARATOR);
1828 	if (cp != NULL)
1829 		*cp = '\0';
1830 
1831 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1832 	router_add_common(AF_INET, ifname, nexthop);
1833 }
1834 
1835 void
1836 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1837 {
1838 	struct phyint_instance *pii;
1839 	struct phyint *pi;
1840 
1841 	if (debug & D_TARGET)
1842 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1843 
1844 	/*
1845 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1846 	 */
1847 	pii = phyint_inst_lookup(af, ifname);
1848 	if (pii == NULL)
1849 		return;
1850 
1851 	/*
1852 	 * Don't use our own addresses as targets.
1853 	 */
1854 	if (own_address(pii->pii_af, nexthop))
1855 		return;
1856 
1857 	/*
1858 	 * If the phyint is part a named group, then add the address to all
1859 	 * members of the group; note that this is suboptimal in the IPv4 case
1860 	 * as it has already been added to all matching interfaces in
1861 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1862 	 * itself, since other phyints in the anongroup may not be on the same
1863 	 * subnet.
1864 	 */
1865 	pi = pii->pii_phyint;
1866 	if (pi->pi_group == phyint_anongroup) {
1867 		target_add(pii, nexthop, _B_TRUE);
1868 	} else {
1869 		pi = pi->pi_group->pg_phyint;
1870 		for (; pi != NULL; pi = pi->pi_pgnext)
1871 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1872 	}
1873 }
1874 
1875 /*
1876  * Examine the IPv6 routing table, for default routers. For each default
1877  * router, populate the list of targets of each phyint that is on the same
1878  * link as the default router
1879  */
1880 static void
1881 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1882 {
1883 	mib2_ipv6RouteEntry_t	*rp;
1884 	mib2_ipv6RouteEntry_t	*endp;
1885 	struct	in6_addr nexthop_v6;
1886 
1887 	if (debug & D_TARGET)
1888 		logdebug("ire_process_v6(len %d)\n", len);
1889 
1890 	if (len == 0)
1891 		return;
1892 
1893 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1894 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1895 
1896 	/*
1897 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1898 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1899 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1900 	 * This is a potential target for probing, which we try to add
1901 	 * to the list of probe targets.
1902 	 */
1903 	for (rp = buf; rp < endp; rp++) {
1904 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1905 			continue;
1906 
1907 		/*
1908 		 * We have the outgoing interface in ipv6RouteIfIndex
1909 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1910 		 * interface must be present for link-local addresses. Since
1911 		 * we use only link-local addreses for probing, we don't
1912 		 * consider the case when the outgoing interface is not
1913 		 * known and we need to scan interface ires
1914 		 */
1915 		nexthop_v6 = rp->ipv6RouteNextHop;
1916 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1917 			/*
1918 			 * We already have the outgoing interface
1919 			 * in ipv6RouteIfIndex.
1920 			 */
1921 			router_add_v6(rp, nexthop_v6);
1922 		}
1923 	}
1924 }
1925 
1926 
1927 void
1928 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1929 {
1930 	char ifname[LIFNAMSIZ + 1];
1931 	char *cp;
1932 	int  len;
1933 
1934 	if (debug & D_TARGET)
1935 		logdebug("router_add_v6()\n");
1936 
1937 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1938 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1939 	ifname[len] = '\0';
1940 
1941 	if (ifname[0] == '\0')
1942 		return;
1943 
1944 	cp = strchr(ifname, IF_SEPARATOR);
1945 	if (cp != NULL)
1946 		*cp = '\0';
1947 
1948 	router_add_common(AF_INET6, ifname, nexthop_v6);
1949 }
1950 
1951 
1952 
1953 /*
1954  * Build a list of target routers, by scanning the routing tables.
1955  * It is assumed that interface routes exist, to reach the routers.
1956  */
1957 static void
1958 init_router_targets(void)
1959 {
1960 	struct	target *tg;
1961 	struct	target *next_tg;
1962 	struct	phyint_instance *pii;
1963 	struct	phyint *pi;
1964 
1965 	if (force_mcast)
1966 		return;
1967 
1968 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1969 		pi = pii->pii_phyint;
1970 		/*
1971 		 * Exclude ptp and host targets. Set tg_in_use to false,
1972 		 * only for router targets.
1973 		 */
1974 		if (!pii->pii_targets_are_routers ||
1975 		    (pi->pi_flags & IFF_POINTOPOINT))
1976 			continue;
1977 
1978 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1979 			tg->tg_in_use = 0;
1980 	}
1981 
1982 	if (mibfd < 0) {
1983 		mibfd = open("/dev/ip", O_RDWR);
1984 		if (mibfd < 0) {
1985 			logperror("mibopen: ip open");
1986 			exit(1);
1987 		}
1988 	}
1989 
1990 	if (!update_router_list(mibfd)) {
1991 		(void) close(mibfd);
1992 		mibfd = -1;
1993 	}
1994 
1995 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1996 		if (!pii->pii_targets_are_routers ||
1997 		    (pi->pi_flags & IFF_POINTOPOINT))
1998 			continue;
1999 
2000 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
2001 			next_tg = tg->tg_next;
2002 			if (!tg->tg_in_use) {
2003 				target_delete(tg);
2004 			}
2005 		}
2006 	}
2007 }
2008 
2009 /*
2010  * Attempt to assign host targets to any interfaces that do not currently
2011  * have probe targets by sharing targets with other interfaces in the group.
2012  */
2013 static void
2014 init_host_targets(void)
2015 {
2016 	struct phyint_instance *pii;
2017 	struct phyint_group *pg;
2018 
2019 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2020 		pg = pii->pii_phyint->pi_group;
2021 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
2022 			dup_host_targets(pii);
2023 	}
2024 }
2025 
2026 /*
2027  * Duplicate host targets from other phyints of the group to
2028  * the phyint instance 'desired_pii'.
2029  */
2030 static void
2031 dup_host_targets(struct phyint_instance	 *desired_pii)
2032 {
2033 	int af;
2034 	struct phyint *pi;
2035 	struct phyint_instance *pii;
2036 	struct target *tg;
2037 
2038 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2039 
2040 	af = desired_pii->pii_af;
2041 
2042 	/*
2043 	 * For every phyint in the same group as desired_pii, check if
2044 	 * it has any host targets. If so add them to desired_pii.
2045 	 */
2046 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2047 		pii = PHYINT_INSTANCE(pi, af);
2048 		/*
2049 		 * We know that we don't have targets on this phyint instance
2050 		 * since we have been called. But we still check for
2051 		 * pii_targets_are_routers because another phyint instance
2052 		 * could have router targets, since IFF_NOFAILOVER addresses
2053 		 * on different phyint instances may belong to different
2054 		 * subnets.
2055 		 */
2056 		if ((pii == NULL) || (pii == desired_pii) ||
2057 		    pii->pii_targets_are_routers)
2058 			continue;
2059 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2060 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2061 		}
2062 	}
2063 }
2064 
/*
 * Print a one-line usage message naming `cmd' on stderr.
 */
static void
usage(char *cmd)
{
	FILE *out = stderr;

	(void) fprintf(out, "usage: %s\n", cmd);
}
2070 
2071 
2072 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2073 
2074 /* Get an option from the /etc/default/mpathd file */
2075 static char *
2076 getdefault(char *name)
2077 {
2078 	char namebuf[BUFSIZ];
2079 	char *value = NULL;
2080 
2081 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2082 		char	*cp;
2083 		int	flags;
2084 
2085 		/*
2086 		 * ignore case
2087 		 */
2088 		flags = defcntl(DC_GETFLAGS, 0);
2089 		TURNOFF(flags, DC_CASE);
2090 		(void) defcntl(DC_SETFLAGS, flags);
2091 
2092 		/* Add "=" to the name */
2093 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2094 		(void) strncat(namebuf, "=", 2);
2095 
2096 		if ((cp = defread(namebuf)) != NULL)
2097 			value = strdup(cp);
2098 
2099 		/* close */
2100 		(void) defopen((char *)NULL);
2101 	}
2102 	return (value);
2103 }
2104 
2105 
2106 /*
2107  * Command line options below
2108  */
2109 boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
2110 boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
2111 static boolean_t adopt = _B_FALSE;
2112 static boolean_t foreground = _B_FALSE;
2113 
2114 int
2115 main(int argc, char *argv[])
2116 {
2117 	int i;
2118 	int c;
2119 	struct phyint_instance *pii;
2120 	char *value;
2121 
2122 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2123 	srandom(gethostid());	/* Initialize the random number generator */
2124 
2125 	/*
2126 	 * NOTE: The messages output by in.mpathd are not suitable for
2127 	 * translation, so we do not call textdomain().
2128 	 */
2129 	(void) setlocale(LC_ALL, "");
2130 
2131 	/*
2132 	 * Get the user specified value of 'failure detection time'
2133 	 * from /etc/default/mpathd
2134 	 */
2135 	value = getdefault("FAILURE_DETECTION_TIME");
2136 	if (value != NULL) {
2137 		user_failure_detection_time =
2138 		    (int)strtol((char *)value, NULL, 0);
2139 
2140 		if (user_failure_detection_time <= 0) {
2141 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2142 			logerr("Invalid failure detection time %s, assuming "
2143 			    "default %d\n", value, user_failure_detection_time);
2144 
2145 		} else if (user_failure_detection_time <
2146 		    MIN_FAILURE_DETECTION_TIME) {
2147 			user_failure_detection_time =
2148 			    MIN_FAILURE_DETECTION_TIME;
2149 			logerr("Too small failure detection time of %s, "
2150 			    "assuming minimum %d\n", value,
2151 			    user_failure_detection_time);
2152 		}
2153 		free(value);
2154 	} else {
2155 		/* User has not specified the parameter, Use default value */
2156 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2157 	}
2158 
2159 	/*
2160 	 * This gives the frequency at which probes will be sent.
2161 	 * When fdt ms elapses, we should be able to determine
2162 	 * whether 5 consecutive probes have failed or not.
2163 	 * 1 probe will be sent in every user_probe_interval ms,
2164 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2165 	 * user_probe_interval. Thus when we send out probe 'n' we
2166 	 * can be sure that probe 'n - 2' is lost, if we have not
2167 	 * got the ack. (since the probe interval is > crtt). But
2168 	 * probe 'n - 1' may be a valid unacked probe, since the
2169 	 * time between 2 successive probes could be as small as
2170 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2171 	 */
2172 	user_probe_interval = user_failure_detection_time /
2173 	    (NUM_PROBE_FAILS + 2);
2174 
2175 	/*
2176 	 * Get the user specified value of failback_enabled from
2177 	 * /etc/default/mpathd
2178 	 */
2179 	value = getdefault("FAILBACK");
2180 	if (value != NULL) {
2181 		if (strncasecmp(value, "yes", 3) == 0)
2182 			failback_enabled = _B_TRUE;
2183 		else if (strncasecmp(value, "no", 2) == 0)
2184 			failback_enabled = _B_FALSE;
2185 		else
2186 			logerr("Invalid value for FAILBACK %s\n", value);
2187 		free(value);
2188 	} else {
2189 		failback_enabled = _B_TRUE;
2190 	}
2191 
2192 	/*
2193 	 * Get the user specified value of track_all_phyints from
2194 	 * /etc/default/mpathd. The sense is reversed in
2195 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2196 	 */
2197 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2198 	if (value != NULL) {
2199 		if (strncasecmp(value, "yes", 3) == 0)
2200 			track_all_phyints = _B_FALSE;
2201 		else if (strncasecmp(value, "no", 2) == 0)
2202 			track_all_phyints = _B_TRUE;
2203 		else
2204 			logerr("Invalid value for "
2205 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2206 		free(value);
2207 	} else {
2208 		track_all_phyints = _B_FALSE;
2209 	}
2210 
2211 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2212 		switch (c) {
2213 		case 'a':
2214 			adopt = _B_TRUE;
2215 			break;
2216 		case 'm':
2217 			force_mcast = _B_TRUE;
2218 			break;
2219 		case 'd':
2220 			debug = D_ALL;
2221 			foreground = _B_TRUE;
2222 			break;
2223 		case 'D':
2224 			i = (int)strtol(optarg, NULL, 0);
2225 			if (i == 0) {
2226 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2227 				    optarg);
2228 				exit(1);
2229 			}
2230 			debug |= i;
2231 			foreground = _B_TRUE;
2232 			break;
2233 		case 'l':
2234 			/*
2235 			 * Turn off link state notification handling.
2236 			 * Undocumented command line flag, for debugging
2237 			 * purposes.
2238 			 */
2239 			handle_link_notifications = _B_FALSE;
2240 			break;
2241 		default:
2242 			usage(argv[0]);
2243 			exit(1);
2244 		}
2245 	}
2246 
2247 	/*
2248 	 * The sockets for the loopback command interface should be listening
2249 	 * before we fork and exit in daemonize(). This way, whoever started us
2250 	 * can use the loopback interface as soon as they get a zero exit
2251 	 * status.
2252 	 */
2253 	lsock_v4 = setup_listener(AF_INET);
2254 	lsock_v6 = setup_listener(AF_INET6);
2255 
2256 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2257 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2258 		exit(1);
2259 	}
2260 
2261 	if (!foreground) {
2262 		if (!daemonize()) {
2263 			logerr("cannot daemonize\n");
2264 			exit(EXIT_FAILURE);
2265 		}
2266 		initlog();
2267 	}
2268 
2269 	/*
2270 	 * Initializations:
2271 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2272 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2273 	 * 2. Initialize a pipe for handling/recording signal events.
2274 	 * 3. Create the routing sockets,  used for listening
2275 	 *    to routing / interface changes.
2276 	 * 4. phyint_init() - Initialize physical interface state
2277 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2278 	 *    which timer_init() does indirectly.
2279 	 * 5. timer_init()  - Initialize timer related stuff
2280 	 * 6. initifs() - Initialize our database of all known interfaces
2281 	 * 7. init_router_targets() - Initialize our database of all known
2282 	 *    router targets.
2283 	 */
2284 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2285 	if (ifsock_v4 < 0) {
2286 		logperror("main: IPv4 socket open");
2287 		exit(1);
2288 	}
2289 
2290 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2291 	if (ifsock_v6 < 0) {
2292 		logperror("main: IPv6 socket open");
2293 		exit(1);
2294 	}
2295 
2296 	setup_eventpipe();
2297 
2298 	rtsock_v4 = setup_rtsock(AF_INET);
2299 	rtsock_v6 = setup_rtsock(AF_INET6);
2300 
2301 	if (phyint_init() == -1) {
2302 		logerr("cannot initialize physical interface structures");
2303 		exit(1);
2304 	}
2305 
2306 	timer_init();
2307 
2308 	initifs();
2309 
2310 	/*
2311 	 * If we're operating in "adopt" mode and no interfaces need to be
2312 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2313 	 * interfaces are subsequently put into multipathing groups).
2314 	 */
2315 	if (adopt && phyint_instances == NULL)
2316 		exit(0);
2317 
2318 	/*
2319 	 * Main body. Keep listening for activity on any of the sockets
2320 	 * that we are monitoring and take appropriate action as necessary.
2321 	 * signals are also handled synchronously.
2322 	 */
2323 	for (;;) {
2324 		if (poll(pollfds, pollfd_num, -1) < 0) {
2325 			if (errno == EINTR)
2326 				continue;
2327 			logperror("main: poll");
2328 			exit(1);
2329 		}
2330 		for (i = 0; i < pollfd_num; i++) {
2331 			if ((pollfds[i].fd == -1) ||
2332 			    !(pollfds[i].revents & POLLIN))
2333 				continue;
2334 			if (pollfds[i].fd == eventpipe_read) {
2335 				in_signal(eventpipe_read);
2336 				break;
2337 			}
2338 			if (pollfds[i].fd == rtsock_v4 ||
2339 				pollfds[i].fd == rtsock_v6) {
2340 				process_rtsock(rtsock_v4, rtsock_v6);
2341 				break;
2342 			}
2343 			for (pii = phyint_instances; pii != NULL;
2344 			    pii = pii->pii_next) {
2345 				if (pollfds[i].fd == pii->pii_probe_sock) {
2346 					if (pii->pii_af == AF_INET)
2347 						in_data(pii);
2348 					else
2349 						in6_data(pii);
2350 					break;
2351 				}
2352 			}
2353 			if (pollfds[i].fd == lsock_v4)
2354 				loopback_cmd(lsock_v4, AF_INET);
2355 			else if (pollfds[i].fd == lsock_v6)
2356 				loopback_cmd(lsock_v6, AF_INET6);
2357 		}
2358 		if (full_scan_required) {
2359 			initifs();
2360 			full_scan_required = _B_FALSE;
2361 		}
2362 	}
2363 	/* NOTREACHED */
2364 	return (EXIT_SUCCESS);
2365 }
2366 
2367 static int
2368 setup_listener(int af)
2369 {
2370 	int sock;
2371 	int on;
2372 	int len;
2373 	int ret;
2374 	struct sockaddr_storage laddr;
2375 	struct sockaddr_in  *sin;
2376 	struct sockaddr_in6 *sin6;
2377 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2378 
2379 	assert(af == AF_INET || af == AF_INET6);
2380 
2381 	sock = socket(af, SOCK_STREAM, 0);
2382 	if (sock < 0) {
2383 		logperror("setup_listener: socket");
2384 		exit(1);
2385 	}
2386 
2387 	on = 1;
2388 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2389 	    sizeof (on)) < 0) {
2390 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2391 		exit(1);
2392 	}
2393 
2394 	bzero(&laddr, sizeof (laddr));
2395 	laddr.ss_family = af;
2396 
2397 	if (af == AF_INET) {
2398 		sin = (struct sockaddr_in *)&laddr;
2399 		sin->sin_port = htons(MPATHD_PORT);
2400 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2401 		len = sizeof (struct sockaddr_in);
2402 	} else {
2403 		sin6 = (struct sockaddr_in6 *)&laddr;
2404 		sin6->sin6_port = htons(MPATHD_PORT);
2405 		sin6->sin6_addr = loopback_addr;
2406 		len = sizeof (struct sockaddr_in6);
2407 	}
2408 
2409 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2410 	if (ret < 0) {
2411 		if (errno == EADDRINUSE) {
2412 			/*
2413 			 * Another instance of mpathd may be already active.
2414 			 */
2415 			logerr("main: is another instance of in.mpathd "
2416 			    "already active?\n");
2417 			exit(1);
2418 		} else {
2419 			(void) close(sock);
2420 			return (-1);
2421 		}
2422 	}
2423 	if (listen(sock, 30) < 0) {
2424 		logperror("main: listen");
2425 		exit(1);
2426 	}
2427 	if (poll_add(sock) == -1) {
2428 		(void) close(sock);
2429 		exit(1);
2430 	}
2431 
2432 	return (sock);
2433 }
2434 
2435 /*
2436  * Table of commands and their expected size; used by loopback_cmd().
2437  */
2438 static struct {
2439 	const char	*name;
2440 	unsigned int	size;
2441 } commands[] = {
2442 	{ "MI_PING",		sizeof (uint32_t)	},
2443 	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
2444 	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
2445 	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
2446 	{ "MI_QUERY",		sizeof (mi_query_t)	}
2447 };
2448 
2449 /*
2450  * Commands received over the loopback interface come here. Currently
2451  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2452  * module. ifconfig only makes a connection, and closes it to check if
2453  * in.mpathd is running.
2454  * if_mpadm sends commands in the format specified by the mpathd_interface
2455  * structure.
2456  */
2457 static void
2458 loopback_cmd(int sock, int family)
2459 {
2460 	int newfd;
2461 	ssize_t len;
2462 	struct sockaddr_storage	peer;
2463 	struct sockaddr_in	*peer_sin;
2464 	struct sockaddr_in6	*peer_sin6;
2465 	socklen_t peerlen;
2466 	union mi_commands mpi;
2467 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2468 	char abuf[INET6_ADDRSTRLEN];
2469 	uint_t cmd;
2470 	int retval;
2471 
2472 	peerlen = sizeof (peer);
2473 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2474 	if (newfd < 0) {
2475 		logperror("loopback_cmd: accept");
2476 		return;
2477 	}
2478 
2479 	switch (family) {
2480 	case AF_INET:
2481 		/*
2482 		 * Validate the address and port to make sure that
2483 		 * non privileged processes don't connect and start
2484 		 * talking to us.
2485 		 */
2486 		if (peerlen != sizeof (struct sockaddr_in)) {
2487 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2488 			(void) close(newfd);
2489 			return;
2490 		}
2491 		peer_sin = (struct sockaddr_in *)&peer;
2492 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2493 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2494 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2495 			    abuf, sizeof (abuf));
2496 			logerr("Attempt to connect from addr %s port %d\n",
2497 			    abuf, ntohs(peer_sin->sin_port));
2498 			(void) close(newfd);
2499 			return;
2500 		}
2501 		break;
2502 
2503 	case AF_INET6:
2504 		if (peerlen != sizeof (struct sockaddr_in6)) {
2505 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2506 			(void) close(newfd);
2507 			return;
2508 		}
2509 		/*
2510 		 * Validate the address and port to make sure that
2511 		 * non privileged processes don't connect and start
2512 		 * talking to us.
2513 		 */
2514 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2515 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2516 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2517 		    &loopback_addr))) {
2518 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2519 			    sizeof (abuf));
2520 			logerr("Attempt to connect from addr %s port %d\n",
2521 			    abuf, ntohs(peer_sin6->sin6_port));
2522 			(void) close(newfd);
2523 			return;
2524 		}
2525 
2526 	default:
2527 		logdebug("loopback_cmd: family %d\n", family);
2528 		(void) close(newfd);
2529 		return;
2530 	}
2531 
2532 	/*
2533 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2534 	 * all supported commands
2535 	 */
2536 	len = read(newfd, &mpi, sizeof (mpi));
2537 
2538 	/*
2539 	 * ifconfig does not send any data. Just tests to see if mpathd
2540 	 * is already running.
2541 	 */
2542 	if (len <= 0) {
2543 		(void) close(newfd);
2544 		return;
2545 	}
2546 
2547 	/*
2548 	 * In theory, we can receive any sized message for a stream socket,
2549 	 * but we don't expect that to happen for a small message over a
2550 	 * loopback connection.
2551 	 */
2552 	if (len < sizeof (uint32_t)) {
2553 		logerr("loopback_cmd: bad command format or read returns "
2554 		    "partial data %d\n", len);
2555 	}
2556 
2557 	cmd = mpi.mi_command;
2558 	if (cmd >= MI_NCMD) {
2559 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2560 		(void) close(newfd);
2561 		return;
2562 	}
2563 
2564 	if (len < commands[cmd].size) {
2565 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2566 		    commands[cmd].name, commands[cmd].size, len);
2567 		(void) close(newfd);
2568 		return;
2569 	}
2570 
2571 	retval = process_cmd(newfd, &mpi);
2572 	if (retval != IPMP_SUCCESS) {
2573 		logerr("failed processing %s: %s\n", commands[cmd].name,
2574 		    ipmp_errmsg(retval));
2575 	}
2576 	(void) close(newfd);
2577 }
2578 
2579 extern int global_errno;	/* set by failover() or failback() */
2580 
2581 /*
2582  * Process the offline, undo offline and set original index commands,
2583  * received from if_mpadm(1M)
2584  */
2585 static unsigned int
2586 process_cmd(int newfd, union mi_commands *mpi)
2587 {
2588 	uint_t	nif = 0;
2589 	uint32_t cmd;
2590 	struct phyint *pi;
2591 	struct phyint *pi2;
2592 	struct phyint_group *pg;
2593 	boolean_t success;
2594 	int error;
2595 	struct mi_offline *mio;
2596 	struct mi_undo_offline *miu;
2597 	struct lifreq lifr;
2598 	int ifsock;
2599 	struct mi_setoindex *mis;
2600 
2601 	cmd = mpi->mi_command;
2602 
2603 	switch (cmd) {
2604 	case MI_OFFLINE:
2605 		mio = &mpi->mi_ocmd;
2606 		/*
2607 		 * Lookup the interface that needs to be offlined.
2608 		 * If it does not exist, return a suitable error.
2609 		 */
2610 		pi = phyint_lookup(mio->mio_ifname);
2611 		if (pi == NULL)
2612 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2613 
2614 		/*
2615 		 * Verify that the minimum redundancy requirements are met.
2616 		 * The multipathing group must have at least the specified
2617 		 * number of functional interfaces after offlining the
2618 		 * requested interface. Otherwise return a suitable error.
2619 		 */
2620 		pg = pi->pi_group;
2621 		nif = 0;
2622 		if (pg != phyint_anongroup) {
2623 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2624 			    pi2 = pi2->pi_pgnext) {
2625 				if ((pi2->pi_state == PI_RUNNING) ||
2626 				    (pg->pg_groupfailed &&
2627 				    !(pi2->pi_flags & IFF_OFFLINE)))
2628 					nif++;
2629 			}
2630 		}
2631 		if (nif < mio->mio_min_redundancy)
2632 			return (send_result(newfd, IPMP_EMINRED, 0));
2633 
2634 		/*
2635 		 * The order of operation is to set IFF_OFFLINE, followed by
2636 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2637 		 * can be created. Subsequent failover moves everything on
2638 		 * the OFFLINE interface to some other functional interface.
2639 		 */
2640 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2641 		if (success) {
2642 			if (!pi->pi_empty) {
2643 				error = try_failover(pi, FAILOVER_NORMAL);
2644 				if (error != 0) {
2645 					if (!change_lif_flags(pi, IFF_OFFLINE,
2646 					    _B_FALSE)) {
2647 						logerr("process_cmd: couldn't"
2648 						    " clear OFFLINE flag on"
2649 						    " %s\n", pi->pi_name);
2650 						/*
2651 						 * Offline interfaces should
2652 						 * not be probed.
2653 						 */
2654 						stop_probing(pi);
2655 					}
2656 					return (send_result(newfd, error,
2657 					    global_errno));
2658 				}
2659 			}
2660 		} else {
2661 			return (send_result(newfd, IPMP_FAILURE, errno));
2662 		}
2663 
2664 		/*
2665 		 * The interface is now Offline, so stop probing it.
2666 		 * Note that if_mpadm(1M) will down the test addresses,
2667 		 * after receiving a success reply from us. The routing
2668 		 * socket message will then make us close the socket used
2669 		 * for sending probes. But it is more logical that an
2670 		 * offlined interface must not be probed, even if it has
2671 		 * test addresses.
2672 		 */
2673 		stop_probing(pi);
2674 		return (send_result(newfd, IPMP_SUCCESS, 0));
2675 
2676 	case MI_UNDO_OFFLINE:
2677 		miu = &mpi->mi_ucmd;
2678 		/*
2679 		 * Undo the offline command. As usual lookup the interface.
2680 		 * Send an error if it does not exist.
2681 		 */
2682 		pi = phyint_lookup(miu->miu_ifname);
2683 		if (pi == NULL)
2684 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2685 
2686 		/*
2687 		 * Inverse of the offline operation. Do a failback, and then
2688 		 * clear the IFF_OFFLINE flag.
2689 		 */
2690 		error = do_failback(pi, _B_TRUE);
2691 		if (error == IPMP_EFBPARTIAL)
2692 			return (send_result(newfd, IPMP_EFBPARTIAL, 0));
2693 		error = do_failback(pi, _B_FALSE);
2694 
2695 		switch (error) {
2696 		case IPMP_SUCCESS:
2697 			if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) {
2698 				logdebug("undo error %X\n", global_errno);
2699 				error = IPMP_FAILURE;
2700 				break;
2701 			}
2702 			/* FALLTHROUGH */
2703 
2704 		case IPMP_EFBPARTIAL:
2705 			/*
2706 			 * Reset the state of the interface based on the
2707 			 * current link state; if this phyint subsequently
2708 			 * acquires a test address, the state will be changed
2709 			 * again later as a result of the probes.
2710 			 */
2711 			if (LINK_UP(pi))
2712 				phyint_chstate(pi, PI_RUNNING);
2713 			else
2714 				phyint_chstate(pi, PI_FAILED);
2715 			break;
2716 
2717 		case IPMP_FAILURE:
2718 			break;
2719 
2720 		default:
2721 			logdebug("do_failback: unexpected return value\n");
2722 			break;
2723 		}
2724 		return (send_result(newfd, error, global_errno));
2725 
2726 	case MI_SETOINDEX:
2727 		mis = &mpi->mi_scmd;
2728 
2729 		/* Get the socket for doing ioctls */
2730 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2731 
2732 		/*
2733 		 * Get index of new original interface.
2734 		 * The index is returned in lifr.lifr_index.
2735 		 */
2736 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2737 		    sizeof (lifr.lifr_name));
2738 
2739 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2740 			return (send_result(newfd, IPMP_FAILURE, errno));
2741 
2742 		/*
2743 		 * Set new original interface index.
2744 		 * The new index was put into lifr.lifr_index by the
2745 		 * SIOCGLIFINDEX ioctl.
2746 		 */
2747 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2748 		    sizeof (lifr.lifr_name));
2749 
2750 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2751 			return (send_result(newfd, IPMP_FAILURE, errno));
2752 
2753 		return (send_result(newfd, IPMP_SUCCESS, 0));
2754 
2755 	case MI_QUERY:
2756 		return (process_query(newfd, &mpi->mi_qcmd));
2757 
2758 	default:
2759 		break;
2760 	}
2761 
2762 	return (send_result(newfd, IPMP_EPROTO, 0));
2763 }
2764 
2765 /*
2766  * Process the query request pointed to by `miq' and send a reply on file
2767  * descriptor `fd'.  Returns an IPMP error code.
2768  */
2769 static unsigned int
2770 process_query(int fd, mi_query_t *miq)
2771 {
2772 	ipmp_groupinfo_t	*grinfop;
2773 	ipmp_groupinfolist_t	*grlp;
2774 	ipmp_grouplist_t	*grlistp;
2775 	ipmp_ifinfo_t		*ifinfop;
2776 	ipmp_ifinfolist_t	*iflp;
2777 	ipmp_snap_t		*snap;
2778 	unsigned int		retval;
2779 
2780 	switch (miq->miq_inforeq) {
2781 	case IPMP_GROUPLIST:
2782 		retval = getgrouplist(&grlistp);
2783 		if (retval != IPMP_SUCCESS)
2784 			return (send_result(fd, retval, errno));
2785 
2786 		retval = send_result(fd, IPMP_SUCCESS, 0);
2787 		if (retval == IPMP_SUCCESS)
2788 			retval = send_grouplist(fd, grlistp);
2789 
2790 		ipmp_freegrouplist(grlistp);
2791 		return (retval);
2792 
2793 	case IPMP_GROUPINFO:
2794 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2795 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2796 		if (retval != IPMP_SUCCESS)
2797 			return (send_result(fd, retval, errno));
2798 
2799 		retval = send_result(fd, IPMP_SUCCESS, 0);
2800 		if (retval == IPMP_SUCCESS)
2801 			retval = send_groupinfo(fd, grinfop);
2802 
2803 		ipmp_freegroupinfo(grinfop);
2804 		return (retval);
2805 
2806 	case IPMP_IFINFO:
2807 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2808 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2809 		if (retval != IPMP_SUCCESS)
2810 			return (send_result(fd, retval, errno));
2811 
2812 		retval = send_result(fd, IPMP_SUCCESS, 0);
2813 		if (retval == IPMP_SUCCESS)
2814 			retval = send_ifinfo(fd, ifinfop);
2815 
2816 		ipmp_freeifinfo(ifinfop);
2817 		return (retval);
2818 
2819 	case IPMP_SNAP:
2820 		retval = getsnap(&snap);
2821 		if (retval != IPMP_SUCCESS)
2822 			return (send_result(fd, retval, errno));
2823 
2824 		retval = send_result(fd, IPMP_SUCCESS, 0);
2825 		if (retval != IPMP_SUCCESS)
2826 			goto out;
2827 
2828 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2829 		if (retval != IPMP_SUCCESS)
2830 			goto out;
2831 
2832 		retval = send_grouplist(fd, snap->sn_grlistp);
2833 		if (retval != IPMP_SUCCESS)
2834 			goto out;
2835 
2836 		iflp = snap->sn_ifinfolistp;
2837 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2838 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2839 			if (retval != IPMP_SUCCESS)
2840 				goto out;
2841 		}
2842 
2843 		grlp = snap->sn_grinfolistp;
2844 		for (; grlp != NULL; grlp = grlp->grl_next) {
2845 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2846 			if (retval != IPMP_SUCCESS)
2847 				goto out;
2848 		}
2849 	out:
2850 		ipmp_snap_free(snap);
2851 		return (retval);
2852 
2853 	default:
2854 		break;
2855 
2856 	}
2857 	return (send_result(fd, IPMP_EPROTO, 0));
2858 }
2859 
2860 /*
2861  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2862  * Returns an IPMP error code.
2863  */
2864 static unsigned int
2865 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2866 {
2867 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2868 	unsigned int	retval;
2869 
2870 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2871 	if (retval != IPMP_SUCCESS)
2872 		return (retval);
2873 
2874 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2875 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2876 }
2877 
2878 /*
2879  * Send the interface information pointed to by `ifinfop' on file descriptor
2880  * `fd'.  Returns an IPMP error code.
2881  */
2882 static unsigned int
2883 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2884 {
2885 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2886 }
2887 
2888 /*
2889  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2890  * Returns an IPMP error code.
2891  */
2892 static unsigned int
2893 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2894 {
2895 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2896 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2897 }
2898 
2899 /*
2900  * Initialize an mi_result_t structure using `error' and `syserror' and
2901  * send it on file descriptor `fd'.  Returns an IPMP error code.
2902  */
2903 static unsigned int
2904 send_result(int fd, unsigned int error, int syserror)
2905 {
2906 	mi_result_t me;
2907 
2908 	me.me_mpathd_error = error;
2909 	if (error == IPMP_FAILURE)
2910 		me.me_sys_error = syserror;
2911 	else
2912 		me.me_sys_error = 0;
2913 
2914 	return (ipmp_write(fd, &me, sizeof (me)));
2915 }
2916 
2917 /*
2918  * Daemonize the process.
2919  */
2920 static boolean_t
2921 daemonize(void)
2922 {
2923 	switch (fork()) {
2924 	case -1:
2925 		return (_B_FALSE);
2926 
2927 	case  0:
2928 		/*
2929 		 * Lose our controlling terminal, and become both a session
2930 		 * leader and a process group leader.
2931 		 */
2932 		if (setsid() == -1)
2933 			return (_B_FALSE);
2934 
2935 		/*
2936 		 * Under POSIX, a session leader can accidentally (through
2937 		 * open(2)) acquire a controlling terminal if it does not
2938 		 * have one.  Just to be safe, fork() again so we are not a
2939 		 * session leader.
2940 		 */
2941 		switch (fork()) {
2942 		case -1:
2943 			return (_B_FALSE);
2944 
2945 		case 0:
2946 			(void) chdir("/");
2947 			(void) umask(022);
2948 			(void) fdwalk(closefunc, NULL);
2949 			break;
2950 
2951 		default:
2952 			_exit(EXIT_SUCCESS);
2953 		}
2954 		break;
2955 
2956 	default:
2957 		_exit(EXIT_SUCCESS);
2958 	}
2959 
2960 	return (_B_TRUE);
2961 }
2962 
2963 /*
2964  * The parent has created some fds before forking on purpose, keep them open.
2965  */
2966 static int
2967 closefunc(void *not_used, int fd)
2968 /* ARGSUSED */
2969 {
2970 	if (fd != lsock_v4 && fd != lsock_v6)
2971 		(void) close(fd);
2972 	return (0);
2973 }
2974 
2975 /* LOGGER */
2976 
2977 #include <syslog.h>
2978 
2979 /*
2980  * Logging routines.  All routines log to syslog, unless the daemon is
2981  * running in the foreground, in which case the logging goes to stderr.
2982  *
2983  * The following routines are available:
2984  *
2985  *	logdebug(): A printf-like function for outputting debug messages
2986  *	(messages at LOG_DEBUG) that are only of use to developers.
2987  *
2988  *	logtrace(): A printf-like function for outputting tracing messages
2989  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2990  *	to log the receipt of interesting network-related conditions.
2991  *
2992  *	logerr(): A printf-like function for outputting error messages
2993  *	(messages at LOG_ERR) from the daemon.
2994  *
2995  *	logperror*(): A set of functions used to output error messages
2996  *	(messages at LOG_ERR); these automatically append strerror(errno)
2997  *	and a newline to the message passed to them.
2998  *
2999  * NOTE: since the logging functions write to syslog, the messages passed
3000  *	 to them are not eligible for localization.  Thus, gettext() must
3001  *	 *not* be used.
3002  */
3003 
/* Non-zero once initlog() has run; the log*() routines then use syslog */
static int logging = 0;

/*
 * Switch the logging routines over to syslog and open the log under the
 * name "in.mpathd".
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
}
3012 
3013 /* PRINTFLIKE1 */
3014 void
3015 logerr(char *fmt, ...)
3016 {
3017 	va_list ap;
3018 
3019 	va_start(ap, fmt);
3020 
3021 	if (logging)
3022 		vsyslog(LOG_ERR, fmt, ap);
3023 	else
3024 		(void) vfprintf(stderr, fmt, ap);
3025 	va_end(ap);
3026 }
3027 
3028 /* PRINTFLIKE1 */
3029 void
3030 logtrace(char *fmt, ...)
3031 {
3032 	va_list ap;
3033 
3034 	va_start(ap, fmt);
3035 
3036 	if (logging)
3037 		vsyslog(LOG_INFO, fmt, ap);
3038 	else
3039 		(void) vfprintf(stderr, fmt, ap);
3040 	va_end(ap);
3041 }
3042 
3043 /* PRINTFLIKE1 */
3044 void
3045 logdebug(char *fmt, ...)
3046 {
3047 	va_list ap;
3048 
3049 	va_start(ap, fmt);
3050 
3051 	if (logging)
3052 		vsyslog(LOG_DEBUG, fmt, ap);
3053 	else
3054 		(void) vfprintf(stderr, fmt, ap);
3055 	va_end(ap);
3056 }
3057 
3058 /* PRINTFLIKE1 */
3059 void
3060 logperror(char *str)
3061 {
3062 	if (logging)
3063 		syslog(LOG_ERR, "%s: %m\n", str);
3064 	else
3065 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3066 }
3067 
3068 void
3069 logperror_pii(struct phyint_instance *pii, char *str)
3070 {
3071 	if (logging) {
3072 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3073 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3074 	} else {
3075 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3076 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3077 		    strerror(errno));
3078 	}
3079 }
3080 
3081 void
3082 logperror_li(struct logint *li, char *str)
3083 {
3084 	struct	phyint_instance	*pii = li->li_phyint_inst;
3085 
3086 	if (logging) {
3087 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3088 		    str, AF_STR(pii->pii_af), li->li_name);
3089 	} else {
3090 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3091 		    str, AF_STR(pii->pii_af), li->li_name,
3092 		    strerror(errno));
3093 	}
3094 }
3095 
3096 void
3097 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3098 {
3099 	if (polled)
3100 		(void) poll_remove(pii->pii_probe_sock);
3101 	(void) close(pii->pii_probe_sock);
3102 	pii->pii_probe_sock = -1;
3103 	pii->pii_basetime_inited = 0;
3104 }
3105