xref: /titanic_41/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c (revision c1d6ec86828a11bb71d265a10d7dad531001727d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "mpd_defs.h"
30 #include "mpd_tables.h"
31 
int debug = 0;				/* Debug flag; tested as a bitmask */
					/* (e.g. D_PHYINT, D_TIMER) */

/*
 * Descriptor table maintained by poll_add() and poll_remove(). A slot whose
 * fd is -1 is unused and may be handed out again by poll_add().
 */
static int pollfd_num = 0;		/* Num. of slots allocated in pollfds */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */

					/* All times below in ms */
int	user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int	user_probe_interval;		/* derived from user specified fdt */

static int	rtsock_v4;		/* AF_INET routing socket */
static int	rtsock_v6;		/* AF_INET6 routing socket */
int	ifsock_v4 = -1;			/* IPv4 socket for ioctls; also used */
					/* by initifs() to list interfaces */
int	ifsock_v6 = -1;			/* IPv6 socket for ioctls  */
static int	lsock_v4;		/* Listen socket to detect mpathd */
static int	lsock_v6;		/* Listen socket to detect mpathd */
static int	mibfd = -1;		/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

/*
 * NOTE(review): neither full_scan_required nor handle_link_notifications is
 * referenced in this part of the file; presumably the former requests a
 * complete interface rescan and the latter enables link up/down handling --
 * confirm against their users.
 */
boolean_t	full_scan_required = _B_FALSE;
static uint_t	last_initifs_time;	/* Time (ms, per getcurrenttime()) */
					/* when initifs was last run */
static	char **argv0;			/* Saved for re-exec on SIGHUP */
boolean_t handle_link_notifications = _B_TRUE;
54 
55 static void	initlog(void);
56 static void	run_timeouts(void);
57 static void	initifs(void);
58 static void	check_if_removed(struct phyint_instance *pii);
59 static void	select_test_ifs(void);
60 static void	ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
61 static void	ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
62 static void	router_add_v4(mib2_ipRouteEntry_t *rp1,
63     struct in_addr nexthop_v4);
64 static void	router_add_v6(mib2_ipv6RouteEntry_t *rp1,
65     struct in6_addr nexthop_v6);
66 static void	router_add_common(int af, char *ifname,
67     struct in6_addr nexthop);
68 static void	init_router_targets();
69 static void	cleanup(void);
70 static int	setup_listener(int af);
71 static void	check_config(void);
72 static void	check_addr_unique(int af, char *name);
73 static void	init_host_targets(void);
74 static void	dup_host_targets(struct phyint_instance *desired_pii);
75 static void	loopback_cmd(int sock, int family);
76 static int	poll_remove(int fd);
77 static boolean_t daemonize(void);
78 static int	closefunc(void *, int);
79 static unsigned int process_cmd(int newfd, union mi_commands *mpi);
80 static unsigned int process_query(int fd, mi_query_t *miq);
81 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop);
82 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp);
83 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop);
84 static unsigned int send_result(int fd, unsigned int error, int syserror);
85 
86 /*
87  * Return the current time in milliseconds (from an arbitrary reference)
88  * truncated to fit into an int. Truncation is ok since we are interested
89  * only in differences and not the absolute values.
90  */
91 uint_t
92 getcurrenttime(void)
93 {
94 	uint_t	cur_time;	/* In ms */
95 
96 	/*
97 	 * Use of a non-user-adjustable source of time is
98 	 * required. However millisecond precision is sufficient.
99 	 * divide by 10^6
100 	 */
101 	cur_time = (uint_t)(gethrtime() / 1000000LL);
102 	return (cur_time);
103 }
104 
105 /*
106  * Add fd to the set being polled. Returns 0 if ok; -1 if failed.
107  */
108 int
109 poll_add(int fd)
110 {
111 	int i;
112 	int new_num;
113 	struct pollfd *newfds;
114 retry:
115 	/* Check if already present */
116 	for (i = 0; i < pollfd_num; i++) {
117 		if (pollfds[i].fd == fd)
118 			return (0);
119 	}
120 	/* Check for empty spot already present */
121 	for (i = 0; i < pollfd_num; i++) {
122 		if (pollfds[i].fd == -1) {
123 			pollfds[i].fd = fd;
124 			return (0);
125 		}
126 	}
127 
128 	/* Allocate space for 32 more fds and initialize to -1 */
129 	new_num = pollfd_num + 32;
130 	newfds = realloc(pollfds, new_num * sizeof (struct pollfd));
131 	if (newfds == NULL) {
132 		logperror("poll_add: realloc");
133 		return (-1);
134 	}
135 	for (i = pollfd_num; i < new_num; i++) {
136 		newfds[i].fd = -1;
137 		newfds[i].events = POLLIN;
138 	}
139 	pollfd_num = new_num;
140 	pollfds = newfds;
141 	goto retry;
142 }
143 
144 /*
145  * Remove fd from the set being polled. Returns 0 if ok; -1 if failed.
146  */
147 static int
148 poll_remove(int fd)
149 {
150 	int i;
151 
152 	/* Check if already present */
153 	for (i = 0; i < pollfd_num; i++) {
154 		if (pollfds[i].fd == fd) {
155 			pollfds[i].fd = -1;
156 			return (0);
157 		}
158 	}
159 	return (-1);
160 }
161 
162 /*
163  * Extract information about the phyint instance. If the phyint instance still
164  * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
165  * will use it to detect phyint instances that don't exist any longer and
166  * remove them, from our database of phyint instances.
167  * Return value:
168  *	returns true if the phyint instance exists in the kernel,
169  *	returns false otherwise
170  */
171 static boolean_t
172 pii_process(int af, char *name, struct phyint_instance **pii_p)
173 {
174 	int err;
175 	struct phyint_instance *pii;
176 	struct phyint_instance *pii_other;
177 
178 	if (debug & D_PHYINT)
179 		logdebug("pii_process(%s %s)\n", AF_STR(af), name);
180 
181 	pii = phyint_inst_lookup(af, name);
182 	if (pii == NULL) {
183 		/*
184 		 * Phyint instance does not exist in our tables,
185 		 * create new phyint instance
186 		 */
187 		pii = phyint_inst_init_from_k(af, name);
188 	} else {
189 		/* Phyint exists in our tables */
190 		err = phyint_inst_update_from_k(pii);
191 
192 		switch (err) {
193 		case PI_IOCTL_ERROR:
194 			/* Some ioctl error. don't change anything */
195 			pii->pii_in_use = 1;
196 			break;
197 
198 		case PI_GROUP_CHANGED:
199 			/*
200 			 * The phyint has changed group.
201 			 */
202 			restore_phyint(pii->pii_phyint);
203 			/* FALLTHRU */
204 
205 		case PI_IFINDEX_CHANGED:
206 			/*
207 			 * Interface index has changed. Delete and
208 			 * recreate the phyint as it is quite likely
209 			 * the interface has been unplumbed and replumbed.
210 			 */
211 			pii_other = phyint_inst_other(pii);
212 			if (pii_other != NULL)
213 				phyint_inst_delete(pii_other);
214 			phyint_inst_delete(pii);
215 			pii = phyint_inst_init_from_k(af, name);
216 			break;
217 
218 		case PI_DELETED:
219 			/* Phyint instance has disappeared from kernel */
220 			pii->pii_in_use = 0;
221 			break;
222 
223 		case PI_OK:
224 			/* Phyint instance exists and is fine */
225 			pii->pii_in_use = 1;
226 			break;
227 
228 		default:
229 			/* Unknown status */
230 			logerr("pii_process: Unknown status %d\n", err);
231 			break;
232 		}
233 	}
234 
235 	*pii_p = pii;
236 	if (pii != NULL)
237 		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
238 	else
239 		return (_B_FALSE);
240 }
241 
242 /*
243  * This phyint is leaving the group. Try to restore the phyint to its
244  * initial state. Return the addresses that belong to other group members,
245  * to the group, and take back any addresses owned by this phyint
246  */
247 void
248 restore_phyint(struct phyint *pi)
249 {
250 	if (pi->pi_group == phyint_anongroup)
251 		return;
252 
253 	/*
254 	 * Move everthing to some other member in the group.
255 	 * The phyint has changed group in the kernel. But we
256 	 * have yet to do it in our tables.
257 	 */
258 	if (!pi->pi_empty)
259 		(void) try_failover(pi, FAILOVER_TO_ANY);
260 	/*
261 	 * Move all addresses owned by 'pi' back to pi, from each
262 	 * of the other members of the group
263 	 */
264 	(void) try_failback(pi, _B_FALSE);
265 }
266 
267 /*
268  * Scan all interfaces to detect changes as well as new and deleted interfaces
269  */
270 static void
271 initifs()
272 {
273 	int	n;
274 	int	af;
275 	char	*cp;
276 	char	*buf;
277 	int	numifs;
278 	struct lifnum	lifn;
279 	struct lifconf	lifc;
280 	struct lifreq	*lifr;
281 	struct logint	*li;
282 	struct phyint_instance *pii;
283 	struct phyint_instance *next_pii;
284 	char	pi_name[LIFNAMSIZ + 1];
285 	boolean_t exists;
286 	struct phyint	*pi;
287 
288 	if (debug & D_PHYINT)
289 		logdebug("initifs: Scanning interfaces\n");
290 
291 	last_initifs_time = getcurrenttime();
292 
293 	/*
294 	 * Mark the interfaces so that we can find phyints and logints
295 	 * which have disappeared from the kernel. pii_process() and
296 	 * logint_init_from_k() will set {pii,li}_in_use when they find
297 	 * the interface in the kernel. Also, clear dupaddr bit on probe
298 	 * logint. check_addr_unique() will set the dupaddr bit on the
299 	 * probe logint, if the testaddress is not unique.
300 	 */
301 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
302 		pii->pii_in_use = 0;
303 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
304 			li->li_in_use = 0;
305 			if (pii->pii_probe_logint == li)
306 				li->li_dupaddr = 0;
307 		}
308 	}
309 
310 	lifn.lifn_family = AF_UNSPEC;
311 	lifn.lifn_flags = 0;
312 	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
313 		logperror("initifs: ioctl (get interface numbers)");
314 		return;
315 	}
316 	numifs = lifn.lifn_count;
317 
318 	buf = (char *)calloc(numifs, sizeof (struct lifreq));
319 	if (buf == NULL) {
320 		logperror("initifs: calloc");
321 		return;
322 	}
323 
324 	lifc.lifc_family = AF_UNSPEC;
325 	lifc.lifc_flags = 0;
326 	lifc.lifc_len = numifs * sizeof (struct lifreq);
327 	lifc.lifc_buf = buf;
328 
329 	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
330 		/*
331 		 * EINVAL is commonly encountered, when things change
332 		 * underneath us rapidly, (eg. at boot, when new interfaces
333 		 * are plumbed successively) and the kernel finds the buffer
334 		 * size we passed as too small. We will retry again
335 		 * when we see the next routing socket msg, or at worst after
336 		 * IF_SCAN_INTERVAL ms.
337 		 */
338 		if (errno != EINVAL) {
339 			logperror("initifs: ioctl"
340 			    " (get interface configuration)");
341 		}
342 		free(buf);
343 		return;
344 	}
345 
346 	lifr = (struct lifreq *)lifc.lifc_req;
347 
348 	/*
349 	 * For each lifreq returned by SIOGGLIFCONF, call pii_process()
350 	 * and get the state of the corresponding phyint_instance. If it is
351 	 * successful, then call logint_init_from_k() to get the state of the
352 	 * logint.
353 	 */
354 	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
355 		af = lifr->lifr_addr.ss_family;
356 
357 		/*
358 		 * Need to pass a phyint name to pii_process. Insert the
359 		 * null where the ':' IF_SEPARATOR is found in the logical
360 		 * name.
361 		 */
362 		(void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name));
363 		pi_name[sizeof (pi_name) - 1] = '\0';
364 		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
365 			*cp = '\0';
366 
367 		exists = pii_process(af, pi_name, &pii);
368 		if (exists) {
369 			/* The phyint is fine. So process the logint */
370 			logint_init_from_k(pii, lifr->lifr_name);
371 		}
372 		check_addr_unique(af, lifr->lifr_name);
373 	}
374 
375 	free(buf);
376 
377 	/*
378 	 * If the test address is now unique, and if it was not unique
379 	 * previously,	clear the li_dupaddrmsg_printed flag and log a
380 	 * recovery message
381 	 */
382 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
383 		struct logint *li;
384 		char abuf[INET6_ADDRSTRLEN];
385 
386 		li = pii->pii_probe_logint;
387 		if ((li != NULL) && !li->li_dupaddr &&
388 		    li->li_dupaddrmsg_printed) {
389 			logerr("Test address %s is unique; enabling probe-"
390 			    "based failure detection\n",
391 			    pr_addr(pii->pii_af, li->li_addr, abuf,
392 				sizeof (abuf)));
393 			li->li_dupaddrmsg_printed = 0;
394 		}
395 	}
396 
397 	/*
398 	 * Scan for phyints and logints that have disappeared from the
399 	 * kernel, and delete them.
400 	 */
401 	pii = phyint_instances;
402 
403 	while (pii != NULL) {
404 		next_pii = pii->pii_next;
405 		check_if_removed(pii);
406 		pii = next_pii;
407 	}
408 
409 	/*
410 	 * Select a test address for sending probes on each phyint instance
411 	 */
412 	select_test_ifs();
413 
414 	/*
415 	 * Handle link up/down notifications from the NICs.
416 	 */
417 	process_link_state_changes();
418 
419 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
420 		/*
421 		 * If this is a case of group failure, we don't have much
422 		 * to do until the group recovers again.
423 		 */
424 		if (GROUP_FAILED(pi->pi_group))
425 			continue;
426 
427 		/*
428 		 * Try/Retry any pending failovers / failbacks, that did not
429 		 * not complete, or that could not be initiated previously.
430 		 * This implements the 3 invariants described in the big block
431 		 * comment at the beginning of probe.c
432 		 */
433 		if (pi->pi_flags & IFF_INACTIVE) {
434 			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
435 				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
436 		} else {
437 			struct phyint_instance *pii;
438 
439 			pii = pi->pi_v4;
440 			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
441 				pii = pi->pi_v6;
442 			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
443 				continue;
444 			/*
445 			 * It is possible that the phyint has started
446 			 * receiving packets, after it has been marked
447 			 * PI_FAILED. Don't initiate failover, if the
448 			 * phyint has started recovering. failure_state()
449 			 * captures this check. A similar logic is used
450 			 * for failback/repair case.
451 			 */
452 			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
453 			    (failure_state(pii) == PHYINT_FAILURE)) {
454 				(void) try_failover(pi, FAILOVER_NORMAL);
455 			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
456 				if (try_failback(pi, _B_FALSE) !=
457 				    IPMP_FAILURE) {
458 					(void) change_lif_flags(pi, IFF_FAILED,
459 					    _B_FALSE);
460 					/* Per state diagram */
461 					pi->pi_empty = 0;
462 				}
463 			}
464 		}
465 	}
466 }
467 
468 /*
469  * Check that test/probe addresses are always unique. link-locals and
470  * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER)
471  * address can produce unexpected results. Log an error and alert the user.
472  */
473 static void
474 check_addr_unique(int af, char *name)
475 {
476 	struct lifreq	lifr;
477 	struct phyint	*pi;
478 	struct in6_addr	addr;
479 	struct phyint_instance	*pii;
480 	struct sockaddr_in	*sin;
481 	struct sockaddr_in6	*sin6;
482 	int ifsock;
483 	char abuf[INET6_ADDRSTRLEN];
484 
485 	/* Get the socket for doing ioctls */
486 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
487 
488 	(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
489 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
490 	/*
491 	 * Get the address corresponding to 'name'. We cannot
492 	 * do a logint lookup in our tables, because, not all logints
493 	 * in the system are tracked by mpathd. (eg. things not in a group)
494 	 */
495 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
496 		if (errno == ENXIO) {
497 			/* Interface has vanished */
498 			return;
499 		} else {
500 			logperror("ioctl (get addr)");
501 			return;
502 		}
503 	}
504 
505 	if (af == AF_INET) {
506 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
507 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
508 	} else {
509 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
510 		addr = sin6->sin6_addr;
511 	}
512 
513 	/*
514 	 * Does the address 'addr' match any known test address ? If so
515 	 * it is a duplicate, unless we are looking at the same logint
516 	 */
517 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
518 		pii = PHYINT_INSTANCE(pi, af);
519 		if (pii == NULL || pii->pii_probe_logint == NULL)
520 			continue;
521 
522 		if (!IN6_ARE_ADDR_EQUAL(&addr,
523 		    &pii->pii_probe_logint->li_addr)) {
524 			continue;
525 		}
526 
527 		if (strncmp(pii->pii_probe_logint->li_name, name,
528 		    sizeof (pii->pii_probe_logint->li_name)) == 0) {
529 			continue;
530 		}
531 
532 		/*
533 		 * This test address is not unique. Set the dupaddr bit
534 		 */
535 		pii->pii_probe_logint->li_dupaddr = 1;
536 
537 		/*
538 		 * Log an error message if not already logged
539 		 */
540 		if (pii->pii_probe_logint->li_dupaddrmsg_printed)
541 			continue;
542 
543 		logerr("Test address %s is not unique; disabling "
544 		    "probe-based failure detection\n",
545 		    pr_addr(af, addr, abuf, sizeof (abuf)));
546 
547 		pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
548 	}
549 }
550 
/*
 * The pii_probe_logint used for probing must satisfy the following properties
 * with respect to its li_flags.
 * IFF_NOFAILOVER - must be set (except in singleton group case)
 * IFF_UP	  - must be set
 * IFF_NOXMIT	  - must be clear
 * IFF_NOLOCAL	  - must be clear
 * IFF_DEPRECATED - preferably set (for IPv4)
 *
 * BEST_FLAG_SET collects the flags that should be set and CLEAR_FLAG_SET the
 * flags that must be clear. The TEST_* macros are masks: select_test_ifs()
 * ANDs li_flags with a mask and compares the result against the flags that
 * are expected to be set, e.g.
 *	(li_flags & TEST_MINIMAL_FLAG_SET) == IFF_UP
 *	(li_flags & TEST_BEST_FLAG_SET) == BEST_FLAG_SET
 * so a match also proves that the CLEAR_FLAG_SET bits are off.
 */
#define	BEST_FLAG_SET	(IFF_NOFAILOVER | IFF_UP | IFF_DEPRECATED)
#define	CLEAR_FLAG_SET	(IFF_NOXMIT | IFF_NOLOCAL)
#define	TEST_CLEAR_FLAG_SET	CLEAR_FLAG_SET
#define	TEST_MINIMAL_FLAG_SET	(IFF_UP | CLEAR_FLAG_SET)
#define	TEST_BEST_FLAG_SET	(BEST_FLAG_SET | CLEAR_FLAG_SET)
565 
566 /*
567  * Stop probing an interface.  Called when an interface is offlined.
568  * The probe socket is closed on each interface instance, and the
569  * interface state set to PI_OFFLINE.
570  */
571 static void
572 stop_probing(struct phyint *pi)
573 {
574 	struct phyint_instance *pii;
575 
576 	pii = pi->pi_v4;
577 	if (pii != NULL) {
578 		if (pii->pii_probe_sock != -1)
579 			close_probe_socket(pii, _B_TRUE);
580 		pii->pii_probe_logint = NULL;
581 	}
582 
583 	pii = pi->pi_v6;
584 	if (pii != NULL) {
585 		if (pii->pii_probe_sock != -1)
586 			close_probe_socket(pii, _B_TRUE);
587 		pii->pii_probe_logint = NULL;
588 	}
589 
590 	phyint_chstate(pi, PI_OFFLINE);
591 }
592 
/*
 * Do the test address selection for each phyint instance. Pick an
 * IFF_NOFAILOVER address as test address. For singleton case,
 * if user didn't configure an IFF_NOFAILOVER address, we will pick a
 * normal address as test address. For (multiple adapter) groups,
 * user is required to configure IFF_NOFAILOVER test address. Call
 * phyint_inst_sockinit() to complete the initializations.
 *
 * After the per-instance pass, phyints that are PI_FAILED but no longer
 * enabled for probing are checked for repair, and the target lists are
 * (re)populated if any newly enabled instance had no targets.
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint	*li;
	struct logint	*test_logint;
	boolean_t target_scan_reqd = _B_FALSE;
	struct target *tg;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection. The
	 * successor is saved up front because the current instance may be
	 * deleted further down if its probe socket cannot be set up.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		/*
		 * An interface that is offline should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
					" interface %s (state is: %u)."
					" Stopping probes.\n",
					pii->pii_phyint->pi_name,
					pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		test_logint = pii->pii_probe_logint;

		if (test_logint != NULL) {
			/*
			 * The current test address already has every
			 * preferred flag set and no forbidden flag; nothing
			 * better can be found, so keep it.
			 */
			if ((test_logint->li_flags & TEST_BEST_FLAG_SET)
			    == BEST_FLAG_SET)
				continue;

			/*
			 * If user configures IFF_NOXMIT or IFF_NOLOCAL
			 * flags on test addresses after in.mpathd has
			 * started, the daemon aborts. In future this could
			 * be handled better, i.e. instead of aborting the
			 * daemon, a more appropriate action may be to issue
			 * a warning and choose a different test address.
			 */
			assert((test_logint->li_flags & TEST_CLEAR_FLAG_SET)
			    == 0);
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			if ((li->li_flags & TEST_MINIMAL_FLAG_SET) == IFF_UP) {
				/*
				 * Now we have a test address that satisfies
				 * the minimal properties: it is up, and
				 * neither IFF_NOXMIT nor IFF_NOLOCAL is set.
				 */
				if ((li->li_flags & TEST_BEST_FLAG_SET)
				    == BEST_FLAG_SET) {
					/*
					 * This is the best possible address.
					 * So break, and continue to the
					 * next phyint
					 */
					test_logint = li;
					break;
				}
				if ((test_logint == NULL) ||
				    (!(test_logint->li_flags &
				    IFF_NOFAILOVER) &&
				    (li->li_flags & IFF_NOFAILOVER)))
					/*
					 * This is a possible candidate,
					 * unless we find a better one: it is
					 * either the first candidate, or an
					 * IFF_NOFAILOVER address replacing a
					 * candidate that lacks the flag.
					 */
					test_logint = li;
			}
		}

		/*
		 * If we've gone from a singleton group to a multiple adapter
		 * group, and we haven't found an IFF_NOFAILOVER test address
		 * by now, the old test address is no longer valid. If we are
		 * not dealing with a singleton group, and the above test
		 * address selection loop has selected a non IFF_NOFAILOVER
		 * address as a candidate, we will correct that here.
		 */
		if ((test_logint != NULL) &&
		    !SINGLETON_GROUP(pii->pii_phyint) &&
		    !(test_logint->li_flags & IFF_NOFAILOVER)) {
			test_logint = NULL;
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (test_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (test_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new test address
		 * or is being assigned a test address for the 1st time.
		 * Need to initialize the phyint socket. If that fails the
		 * instance is deleted; next_pii was saved above for this
		 * reason.
		 */
		pii->pii_probe_logint = test_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface the target is the
		 * destination address of the test logint: drop any existing
		 * target (the asserts check that none remains afterwards)
		 * and create one for li_dstaddr.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, test_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance, once only, and
		 * only if a probe socket is open.
		 */
		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}
830 
831 /*
832  * Check phyint group configuration, to detect any inconsistencies,
833  * and log an error message. This is called from runtimeouts every
834  * 20 secs. But the error message is displayed once. If the
835  * consistency is resolved by the admin, a recovery message is displayed
836  * once.
837  */
838 static void
839 check_config(void)
840 {
841 	struct phyint_group *pg;
842 	struct phyint *pi;
843 	boolean_t v4_in_group;
844 	boolean_t v6_in_group;
845 
846 	/*
847 	 * All phyints of a group must be homogenous to ensure that
848 	 * failover or failback can be done. If any phyint in a group
849 	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
850 	 * Do a similar check for IPv6.
851 	 */
852 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
853 		if (pg == phyint_anongroup)
854 			continue;
855 
856 		v4_in_group = _B_FALSE;
857 		v6_in_group = _B_FALSE;
858 		/*
859 		 * 1st pass. Determine if at least 1 phyint in the group
860 		 * has IPv4 plumbed and if so set v4_in_group to true.
861 		 * Repeat similarly for IPv6.
862 		 */
863 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
864 			if (pi->pi_v4 != NULL)
865 				v4_in_group = _B_TRUE;
866 			if (pi->pi_v6 != NULL)
867 				v6_in_group = _B_TRUE;
868 		}
869 
870 		/*
871 		 * 2nd pass. If v4_in_group is true, check that phyint
872 		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
873 		 * out a message the 1st time only.
874 		 */
875 		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
876 			if (pi->pi_flags & IFF_OFFLINE)
877 				continue;
878 
879 			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
880 				if (!pi->pi_cfgmsg_printed) {
881 					logerr("NIC %s of group %s is"
882 					    " not plumbed for IPv4 and may"
883 					    " affect failover capability\n",
884 					    pi->pi_name,
885 					    pi->pi_group->pg_name);
886 					pi->pi_cfgmsg_printed = 1;
887 				}
888 			} else if (v6_in_group == _B_TRUE &&
889 			    pi->pi_v6 == NULL) {
890 				if (!pi->pi_cfgmsg_printed) {
891 					logerr("NIC %s of group %s is"
892 					    " not plumbed for IPv6 and may"
893 					    " affect failover capability\n",
894 					    pi->pi_name,
895 					    pi->pi_group->pg_name);
896 					pi->pi_cfgmsg_printed = 1;
897 				}
898 			} else {
899 				/*
900 				 * The phyint matches the group configuration,
901 				 * if we have reached this point. If it was
902 				 * improperly configured earlier, log an
903 				 * error recovery message
904 				 */
905 				if (pi->pi_cfgmsg_printed) {
906 					logerr("NIC %s is now consistent with "
907 					    "group %s and failover capability "
908 					    "is restored\n", pi->pi_name,
909 					    pi->pi_group->pg_name);
910 					pi->pi_cfgmsg_printed = 0;
911 				}
912 			}
913 
914 		}
915 	}
916 
917 	/*
918 	 * In order to perform probe-based failure detection, a phyint must
919 	 * have at least 1 test/probe address for sending and receiving probes
920 	 * (either on IPv4 or IPv6 instance or both).  If no test address has
921 	 * been configured, notify the administrator, but continue on since we
922 	 * can still perform load spreading, along with "link up/down" based
923 	 * failure detection.
924 	 *
925 	 * Note: In the singleton group case, when user didn't configure
926 	 * a test address, the probe address is picked by this daemon.
927 	 */
928 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
929 		if (pi->pi_flags & IFF_OFFLINE)
930 			continue;
931 
932 		if ((pi->pi_v4 == NULL ||
933 		    pi->pi_v4->pii_probe_logint == NULL) &&
934 		    (pi->pi_v6 == NULL ||
935 		    pi->pi_v6->pii_probe_logint == NULL)) {
936 			if (!pi->pi_taddrmsg_printed) {
937 				logerr("No test address configured on "
938 				    "interface %s; disabling probe-based "
939 				    "failure detection on it\n", pi->pi_name);
940 				pi->pi_taddrmsg_printed = 1;
941 			}
942 		} else if (pi->pi_taddrmsg_printed) {
943 			logerr("Test address now configured on interface %s; "
944 			    "enabling probe-based failure detection on it\n",
945 			    pi->pi_name);
946 			pi->pi_taddrmsg_printed = 0;
947 		}
948 
949 	}
950 }
951 
/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * apart than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum
 * value that can be passed for the delay parameter of timer_schedule().
 *
 * timer_next holds the time, on the getcurrenttime() scale, at which the
 * pending SIGALRM is due. timer_active is set by timer_schedule() when it
 * arms the interval timer.
 * NOTE(review): the code that clears timer_active is not in this part of
 * the file; presumably it is reset when the SIGALRM is handled -- confirm.
 */
static uint_t timer_next;	/* Currently scheduled timeout */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */
963 
/*
 * Initialize the timer state and start the first timeout.
 */
static void
timer_init(void)
{
	/* Nothing is scheduled yet: place the next timeout as far out as allowed */
	timer_next = getcurrenttime() + TIMER_INFINITY;
	/*
	 * The call to run_timeouts() will get the timer started
	 * Since there are no phyints at this point, the timer will
	 * be set for IF_SCAN_INTERVAL ms.
	 */
	run_timeouts();
}
975 
976 /*
977  * Make sure the next SIGALRM occurs delay milliseconds from the current
978  * time if not earlier. We are interested only in time differences.
979  */
980 void
981 timer_schedule(uint_t delay)
982 {
983 	uint_t now;
984 	struct itimerval itimerval;
985 
986 	if (debug & D_TIMER)
987 		logdebug("timer_schedule(%u)\n", delay);
988 
989 	assert(delay <= TIMER_INFINITY);
990 
991 	now = getcurrenttime();
992 	if (delay == 0) {
993 		/* Minimum allowed delay */
994 		delay = 1;
995 	}
996 	/* Will this timer occur before the currently scheduled SIGALRM? */
997 	if (timer_active && TIME_GE(now + delay, timer_next)) {
998 		if (debug & D_TIMER) {
999 			logdebug("timer_schedule(%u) - no action: "
1000 			    "now %u next %u\n", delay, now, timer_next);
1001 		}
1002 		return;
1003 	}
1004 	timer_next = now + delay;
1005 
1006 	itimerval.it_value.tv_sec = delay / 1000;
1007 	itimerval.it_value.tv_usec = (delay % 1000) * 1000;
1008 	itimerval.it_interval.tv_sec = 0;
1009 	itimerval.it_interval.tv_usec = 0;
1010 	if (debug & D_TIMER) {
1011 		logdebug("timer_schedule(%u): sec %ld usec %ld\n",
1012 		    delay, itimerval.it_value.tv_sec,
1013 		    itimerval.it_value.tv_usec);
1014 	}
1015 	timer_active = _B_TRUE;
1016 	if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) {
1017 		logperror("timer_schedule: setitimer");
1018 		exit(2);
1019 	}
1020 }
1021 
1022 /*
1023  * Timer has fired. Determine when the next timer event will occur by asking
1024  * all the timer routines. Should not be called from a timer routine.
1025  */
1026 static void
1027 run_timeouts(void)
1028 {
1029 	uint_t next;
1030 	uint_t next_event_time;
1031 	struct phyint_instance *pii;
1032 	struct phyint_instance *next_pii;
1033 	static boolean_t timeout_running;
1034 
1035 	/* assert that recursive timeouts don't happen. */
1036 	assert(!timeout_running);
1037 
1038 	timeout_running = _B_TRUE;
1039 
1040 	if (debug & D_TIMER)
1041 		logdebug("run_timeouts()\n");
1042 
1043 	next = TIMER_INFINITY;
1044 
1045 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1046 		next_pii = pii->pii_next;
1047 		next_event_time = phyint_inst_timer(pii);
1048 		if (next_event_time != TIMER_INFINITY && next_event_time < next)
1049 			next = next_event_time;
1050 
1051 		if (debug & D_TIMER) {
1052 			logdebug("run_timeouts(%s %s): next scheduled for"
1053 			    " this phyint inst %u, next scheduled global"
1054 			    " %u ms\n",
1055 			    AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
1056 			    next_event_time, next);
1057 		}
1058 	}
1059 
1060 	/*
1061 	 * Make sure initifs() is called at least once every
1062 	 * IF_SCAN_INTERVAL, to make sure that we are in sync
1063 	 * with the kernel, in case we have missed any routing
1064 	 * socket messages.
1065 	 */
1066 	if (next > IF_SCAN_INTERVAL)
1067 		next = IF_SCAN_INTERVAL;
1068 
1069 	if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) {
1070 		initifs();
1071 		check_config();
1072 	}
1073 
1074 	if (debug & D_TIMER)
1075 		logdebug("run_timeouts: %u ms\n", next);
1076 
1077 	timer_schedule(next);
1078 	timeout_running = _B_FALSE;
1079 }
1080 
/*
 * Pipe through which sig_handler() hands signal numbers to the poll loop:
 * the handler writes to eventpipe_write, in_signal() reads the other end.
 * Both stay at -1 until setup_eventpipe() has created the pipe.
 */
static int eventpipe_read = -1;	/* Used for synchronous signal delivery */
static int eventpipe_write = -1;
/* Set by cleanup() so that sig_handler() stops using the pipe. */
static boolean_t cleanup_started = _B_FALSE;
				/* Don't write to eventpipe if in cleanup */
1085 /*
1086  * Ensure that signals are processed synchronously with the rest of
1087  * the code by just writing a one character signal number on the pipe.
1088  * The poll loop will pick this up and process the signal event.
1089  */
1090 static void
1091 sig_handler(int signo)
1092 {
1093 	uchar_t buf = (uchar_t)signo;
1094 
1095 	/*
1096 	 * Don't write to pipe if cleanup has already begun. cleanup()
1097 	 * might have closed the pipe already
1098 	 */
1099 	if (cleanup_started)
1100 		return;
1101 
1102 	if (eventpipe_write == -1) {
1103 		logerr("sig_handler: no pipe found\n");
1104 		return;
1105 	}
1106 	if (write(eventpipe_write, &buf, sizeof (buf)) < 0)
1107 		logperror("sig_handler: write");
1108 }
1109 
1110 extern struct probes_missed probes_missed;
1111 
1112 /*
1113  * Pick up a signal "byte" from the pipe and process it.
1114  */
1115 static void
1116 in_signal(int fd)
1117 {
1118 	uchar_t buf;
1119 	uint64_t  sent, acked, lost, unacked, unknown;
1120 	struct phyint_instance *pii;
1121 	int pr_ndx;
1122 
1123 	switch (read(fd, &buf, sizeof (buf))) {
1124 	case -1:
1125 		logperror("in_signal: read");
1126 		exit(1);
1127 		/* NOTREACHED */
1128 	case 1:
1129 		break;
1130 	case 0:
1131 		logerr("in_signal: read end of file\n");
1132 		exit(1);
1133 		/* NOTREACHED */
1134 	default:
1135 		logerr("in_signal: read > 1\n");
1136 		exit(1);
1137 	}
1138 
1139 	if (debug & D_TIMER)
1140 		logdebug("in_signal() got %d\n", buf);
1141 
1142 	switch (buf) {
1143 	case SIGALRM:
1144 		if (debug & D_TIMER) {
1145 			uint_t now = getcurrenttime();
1146 
1147 			logdebug("in_signal(SIGALRM) delta %u\n",
1148 			    now - timer_next);
1149 		}
1150 		timer_active = _B_FALSE;
1151 		run_timeouts();
1152 		break;
1153 	case SIGUSR1:
1154 		logdebug("Printing configuration:\n");
1155 		/* Print out the internal tables */
1156 		phyint_inst_print_all();
1157 
1158 		/*
1159 		 * Print out the accumulated statistics about missed
1160 		 * probes (happens due to scheduling delay).
1161 		 */
1162 		logerr("Missed sending total of %d probes spread over"
1163 		    " %d occurrences\n", probes_missed.pm_nprobes,
1164 		    probes_missed.pm_ntimes);
1165 
1166 		/*
1167 		 * Print out the accumulated statistics about probes
1168 		 * that were sent.
1169 		 */
1170 		for (pii = phyint_instances; pii != NULL;
1171 		    pii = pii->pii_next) {
1172 			unacked = 0;
1173 			acked = pii->pii_cum_stats.acked;
1174 			lost = pii->pii_cum_stats.lost;
1175 			sent = pii->pii_cum_stats.sent;
1176 			unknown = pii->pii_cum_stats.unknown;
1177 			for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) {
1178 				switch (pii->pii_probes[pr_ndx].pr_status) {
1179 				case PR_ACKED:
1180 					acked++;
1181 					break;
1182 				case PR_LOST:
1183 					lost++;
1184 					break;
1185 				case PR_UNACKED:
1186 					unacked++;
1187 					break;
1188 				}
1189 			}
1190 			logerr("\nProbe stats on (%s %s)\n"
1191 			    "Number of probes sent %lld\n"
1192 			    "Number of probe acks received %lld\n"
1193 			    "Number of probes/acks lost %lld\n"
1194 			    "Number of valid unacknowled probes %lld\n"
1195 			    "Number of ambiguous probe acks received %lld\n",
1196 			    AF_STR(pii->pii_af), pii->pii_name,
1197 			    sent, acked, lost, unacked, unknown);
1198 		}
1199 		break;
1200 	case SIGHUP:
1201 		logerr("SIGHUP: restart and reread config file\n");
1202 		cleanup();
1203 		(void) execv(argv0[0], argv0);
1204 		_exit(0177);
1205 		/* NOTREACHED */
1206 	case SIGINT:
1207 	case SIGTERM:
1208 	case SIGQUIT:
1209 		cleanup();
1210 		exit(0);
1211 		/* NOTREACHED */
1212 	default:
1213 		logerr("in_signal: unknown signal: %d\n", buf);
1214 	}
1215 }
1216 
1217 static void
1218 cleanup(void)
1219 {
1220 	struct phyint_instance *pii;
1221 	struct phyint_instance *next_pii;
1222 
1223 	/*
1224 	 * Make sure that we don't write to eventpipe in
1225 	 * sig_handler() if any signal notably SIGALRM,
1226 	 * occurs after we close the eventpipe descriptor below
1227 	 */
1228 	cleanup_started = _B_TRUE;
1229 
1230 	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
1231 		next_pii = pii->pii_next;
1232 		phyint_inst_delete(pii);
1233 	}
1234 
1235 	(void) close(ifsock_v4);
1236 	(void) close(ifsock_v6);
1237 	(void) close(rtsock_v4);
1238 	(void) close(rtsock_v6);
1239 	(void) close(lsock_v4);
1240 	(void) close(lsock_v6);
1241 	(void) close(0);
1242 	(void) close(1);
1243 	(void) close(2);
1244 	(void) close(mibfd);
1245 	(void) close(eventpipe_read);
1246 	(void) close(eventpipe_write);
1247 }
1248 
1249 /*
1250  * Create pipe for signal delivery and set up signal handlers.
1251  */
1252 static void
1253 setup_eventpipe(void)
1254 {
1255 	int fds[2];
1256 	struct sigaction act;
1257 
1258 	if ((pipe(fds)) < 0) {
1259 		logperror("setup_eventpipe: pipe");
1260 		exit(1);
1261 	}
1262 	eventpipe_read = fds[0];
1263 	eventpipe_write = fds[1];
1264 	if (poll_add(eventpipe_read) == -1) {
1265 		exit(1);
1266 	}
1267 
1268 	act.sa_handler = sig_handler;
1269 	act.sa_flags = SA_RESTART;
1270 	(void) sigaction(SIGALRM, &act, NULL);
1271 
1272 	(void) sigset(SIGHUP, sig_handler);
1273 	(void) sigset(SIGUSR1, sig_handler);
1274 	(void) sigset(SIGTERM, sig_handler);
1275 	(void) sigset(SIGINT, sig_handler);
1276 	(void) sigset(SIGQUIT, sig_handler);
1277 }
1278 
/*
 * Open a non-blocking routing socket for address family 'af', on which
 * RTM_IFINFO messages are received, and register it with the poll set.
 * Returns the socket descriptor; any failure terminates the daemon.
 */
static int
setup_rtsock(int af)
{
	int	rtfd;
	int	fl;

	if ((rtfd = socket(PF_ROUTE, SOCK_RAW, af)) == -1) {
		logperror("setup_rtsock: socket PF_ROUTE");
		exit(1);
	}

	/* Switch the socket to non-blocking mode. */
	fl = fcntl(rtfd, F_GETFL, 0);
	if (fl < 0) {
		logperror("setup_rtsock: fcntl F_GETFL");
		(void) close(rtfd);
		exit(1);
	}
	if (fcntl(rtfd, F_SETFL, fl | O_NONBLOCK) < 0) {
		logperror("setup_rtsock: fcntl F_SETFL");
		(void) close(rtfd);
		exit(1);
	}

	if (poll_add(rtfd) == -1) {
		(void) close(rtfd);
		exit(1);
	}

	return (rtfd);
}
1309 
/*
 * Process an RTM_IFINFO message received on a routing socket.
 * The return value indicates whether a full interface scan is required.
 * Link up/down notifications from the NICs are reflected in the
 * IFF_RUNNING flag.
 * If just the state of the IFF_RUNNING interface flag has changed,
 * a full interface scan isn't required.
 *
 * 'ifm' points to the received message; the interface name inside it is
 * null terminated in place.  'type' is the address family (AF_INET or
 * AF_INET6) of the routing socket the message was read from, and selects
 * which phyint instance's copy of the flags is updated.
 *
 * Returns _B_TRUE (full scan needed) when the name is too long, when the
 * phyint or the phyint instance is unknown, or when anything other than
 * exactly the IFF_RUNNING flag differs from the instance's previous flags.
 */
static boolean_t
process_rtm_ifinfo(if_msghdr_t *ifm, int type)
{
	struct sockaddr_dl *sdl;
	struct phyint *pi;
	uint64_t old_flags;
	struct phyint_instance *pii;

	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);

	/*
	 * The sockaddr_dl structure is directly after the if_msghdr_t
	 * structure.  However, at the time of writing, the size of the
	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
	 * to the presence of a timeval structure, which contains longs,
	 * in the if_data structure.  Anyway, we know where the message ends,
	 * so we work backwards to get the start of the sockaddr_dl structure.
	 */
	/*LINTED*/
	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
		sizeof (struct sockaddr_dl));

	assert(sdl->sdl_family == AF_LINK);

	/*
	 * The interface name is in sdl_data.
	 * RTM_IFINFO messages are only generated for logical interface
	 * zero, so there is no colon and logical interface number to
	 * strip from the name.  The name is not null terminated, but
	 * there should be enough space in sdl_data to add the null.
	 */
	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: "
				"phyint name too long\n");
		return (_B_TRUE);
	}
	sdl->sdl_data[sdl->sdl_nlen] = 0;

	pi = phyint_lookup(sdl->sdl_data);
	if (pi == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint lookup failed"
				" for %s\n", sdl->sdl_data);
		return (_B_TRUE);
	}

	/*
	 * We want to try and avoid doing a full interface scan for
	 * link state notifications from the NICs, as indicated
	 * by the state of the IFF_RUNNING flag.  If just the
	 * IFF_RUNNING flag has changed state, the link state changes
	 * are processed without a full scan.
	 * If there is both an IPv4 and IPv6 instance associated with
	 * the physical interface, we will get an RTM_IFINFO message
	 * for each instance.  If we just maintained a single copy of
	 * the physical interface flags, it would appear that no flags
	 * had changed when the second message is processed, leading us
	 * to believe that the message wasn't generated by a flags change,
	 * and that a full interface scan is required.
	 * To get around this problem, two additional copies of the flags
	 * are kept, one copy for each instance.  These are only used in
	 * this routine.  At any one time, all three copies of the flags
	 * should be identical except for the IFF_RUNNING flag.  The
	 * copy of the flags in the "phyint" structure is always up to
	 * date.
	 */
	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
	if (pii == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: no instance of address "
			    "family %s for %s\n", AF_STR(type), pi->pi_name);
		return (_B_TRUE);
	}

	/* Remember this instance's previous flags, then record the new ones */
	old_flags = pii->pii_flags;
	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
	pi->pi_flags = pii->pii_flags;

	if (debug & D_LINKNOTE) {
		logdebug("process_rtm_ifinfo: %s address family: %s, "
		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
		    AF_STR(type), old_flags, pi->pi_flags);
	}

	/*
	 * If IFF_STANDBY has changed, indicate that the interface has changed
	 * types.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
		phyint_newtype(pi);

	/*
	 * If IFF_INACTIVE has been set, then no data addresses should be
	 * hosted on the interface.  If IFF_INACTIVE has been cleared, then
	 * move previously failed-over addresses back to it, provided it is
	 * not failed.  For details, see the state diagram in mpd_probe.c.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
		if (pii->pii_flags & IFF_INACTIVE) {
			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				pi->pi_empty = 0;
				(void) try_failback(pi, _B_FALSE);
			}
		}
	}

	/* Has just the IFF_RUNNING flag changed state ? */
	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
		struct phyint_instance *pii_other;
		/*
		 * It wasn't just a link state change.  Update
		 * the other instance's copy of the flags.
		 */
		pii_other = phyint_inst_other(pii);
		if (pii_other != NULL)
			pii_other->pii_flags = pii->pii_flags;
		return (_B_TRUE);
	}

	/* Only IFF_RUNNING toggled: a link state change, no full scan needed */
	return (_B_FALSE);
}
1443 
1444 /*
1445  * Retrieve as many routing socket messages as possible, and try to
1446  * empty the routing sockets. Initiate full scan of targets or interfaces
1447  * as needed.
1448  * We listen on separate IPv4 an IPv6 sockets so that we can accurately
1449  * detect changes in certain flags (see "process_rtm_ifinfo()" above).
1450  */
1451 static void
1452 process_rtsock(int rtsock_v4, int rtsock_v6)
1453 {
1454 	int	nbytes;
1455 	int64_t msg[2048 / 8];
1456 	struct rt_msghdr *rtm;
1457 	boolean_t need_if_scan = _B_FALSE;
1458 	boolean_t need_rt_scan = _B_FALSE;
1459 	boolean_t rtm_ifinfo_seen = _B_FALSE;
1460 	int type;
1461 
1462 	/* Read as many messages as possible and try to empty the sockets */
1463 	for (type = AF_INET; ; type = AF_INET6) {
1464 		for (;;) {
1465 			nbytes = read((type == AF_INET) ? rtsock_v4 :
1466 				rtsock_v6, msg, sizeof (msg));
1467 			if (nbytes <= 0) {
1468 				/* No more messages */
1469 				break;
1470 			}
1471 			rtm = (struct rt_msghdr *)msg;
1472 			if (rtm->rtm_version != RTM_VERSION) {
1473 				logerr("process_rtsock: version %d "
1474 				    "not understood\n", rtm->rtm_version);
1475 				break;
1476 			}
1477 
1478 			if (debug & D_PHYINT) {
1479 				logdebug("process_rtsock: message %d\n",
1480 				    rtm->rtm_type);
1481 			}
1482 
1483 			switch (rtm->rtm_type) {
1484 			case RTM_NEWADDR:
1485 			case RTM_DELADDR:
1486 				/*
1487 				 * Some logical interface has changed,
1488 				 * have to scan everything to determine
1489 				 * what actually changed.
1490 				 */
1491 				need_if_scan = _B_TRUE;
1492 				break;
1493 
1494 			case RTM_IFINFO:
1495 				rtm_ifinfo_seen = _B_TRUE;
1496 				need_if_scan |=
1497 					process_rtm_ifinfo((if_msghdr_t *)rtm,
1498 					type);
1499 				break;
1500 
1501 			case RTM_ADD:
1502 			case RTM_DELETE:
1503 			case RTM_CHANGE:
1504 			case RTM_OLDADD:
1505 			case RTM_OLDDEL:
1506 				need_rt_scan = _B_TRUE;
1507 				break;
1508 
1509 			default:
1510 				/* Not interesting */
1511 				break;
1512 			}
1513 		}
1514 		if (type == AF_INET6)
1515 			break;
1516 	}
1517 
1518 	if (need_if_scan) {
1519 		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
1520 			logdebug("process_rtsock: synchronizing with kernel\n");
1521 		initifs();
1522 	} else if (rtm_ifinfo_seen) {
1523 		if (debug & D_LINKNOTE)
1524 			logdebug("process_rtsock: "
1525 			    "link up/down notification(s) seen\n");
1526 		process_link_state_changes();
1527 	}
1528 
1529 	if (need_rt_scan)
1530 		init_router_targets();
1531 }
1532 
1533 /*
1534  * Look if the phyint instance or one of its logints have been removed from
1535  * the kernel and take appropriate action.
1536  * Uses {pii,li}_in_use.
1537  */
1538 static void
1539 check_if_removed(struct phyint_instance *pii)
1540 {
1541 	struct logint *li;
1542 	struct logint *next_li;
1543 
1544 	/* Detect phyints that have been removed from the kernel. */
1545 	if (!pii->pii_in_use) {
1546 		logtrace("%s %s has been removed from kernel\n",
1547 		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
1548 		phyint_inst_delete(pii);
1549 	} else {
1550 		/* Detect logints that have been removed. */
1551 		for (li = pii->pii_logint; li != NULL; li = next_li) {
1552 			next_li = li->li_next;
1553 			if (!li->li_in_use) {
1554 				logint_delete(li);
1555 			}
1556 		}
1557 	}
1558 }
1559 
1560 /*
1561  * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
1562  * tables defined by mib2.h. Parse the returned data and extract
1563  * the 'routing' information table. Process the 'routing' table
1564  * to get the list of known onlink routers, and update our database.
1565  * These onlink routers will serve as our probe targets.
1566  * Returns false, if any system calls resulted in errors, true otherwise.
1567  */
1568 static boolean_t
1569 update_router_list(int fd)
1570 {
1571 	union {
1572 		char	ubuf[1024];
1573 		union T_primitives uprim;
1574 	} buf;
1575 
1576 	int			flags;
1577 	struct strbuf		ctlbuf;
1578 	struct strbuf		databuf;
1579 	struct T_optmgmt_req	*tor;
1580 	struct T_optmgmt_ack	*toa;
1581 	struct T_error_ack	*tea;
1582 	struct opthdr		*optp;
1583 	struct opthdr		*req;
1584 	int			status;
1585 	t_scalar_t		prim;
1586 
1587 	tor = (struct T_optmgmt_req *)&buf;
1588 
1589 	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
1590 	tor->OPT_offset = sizeof (struct T_optmgmt_req);
1591 	tor->OPT_length = sizeof (struct opthdr);
1592 	tor->MGMT_flags = T_CURRENT;
1593 
1594 	req = (struct opthdr *)&tor[1];
1595 	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
1596 	req->name  = 0;
1597 	req->len   = 0;
1598 
1599 	ctlbuf.buf = (char *)&buf;
1600 	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
1601 	ctlbuf.maxlen = sizeof (buf);
1602 	flags = 0;
1603 	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
1604 		logperror("update_router_list: putmsg(ctl)");
1605 		return (_B_FALSE);
1606 	}
1607 
1608 	/*
1609 	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
1610 	 * each table defined in mib2.h.  Each T_OPTMGMT_ACK msg contains
1611 	 * a control and data part. The control part contains a struct
1612 	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
1613 	 * the level, name and length of the data in the data part. The
1614 	 * data part contains the actual table data. The last message
1615 	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
1616 	 * single option with zero optlen.
1617 	 */
1618 
1619 	for (;;) {
1620 		/*
1621 		 * Go around this loop once for each table. Ignore
1622 		 * all tables except the routing information table.
1623 		 */
1624 		flags = 0;
1625 		status = getmsg(fd, &ctlbuf, NULL, &flags);
1626 		if (status < 0) {
1627 			if (errno == EINTR)
1628 				continue;
1629 			logperror("update_router_list: getmsg(ctl)");
1630 			return (_B_FALSE);
1631 		}
1632 		if (ctlbuf.len < sizeof (t_scalar_t)) {
1633 			logerr("update_router_list: ctlbuf.len %d\n",
1634 			    ctlbuf.len);
1635 			return (_B_FALSE);
1636 		}
1637 
1638 		prim = buf.uprim.type;
1639 
1640 		switch (prim) {
1641 
1642 		case T_ERROR_ACK:
1643 			tea = &buf.uprim.error_ack;
1644 			if (ctlbuf.len < sizeof (struct T_error_ack)) {
1645 				logerr("update_router_list: T_ERROR_ACK"
1646 				    " ctlbuf.len %d\n", ctlbuf.len);
1647 				return (_B_FALSE);
1648 			}
1649 			logerr("update_router_list: T_ERROR_ACK:"
1650 			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
1651 			    tea->TLI_error, tea->UNIX_error);
1652 			return (_B_FALSE);
1653 
1654 		case T_OPTMGMT_ACK:
1655 			toa = &buf.uprim.optmgmt_ack;
1656 			optp = (struct opthdr *)&toa[1];
1657 			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
1658 				logerr("update_router_list: ctlbuf.len %d\n",
1659 				    ctlbuf.len);
1660 				return (_B_FALSE);
1661 			}
1662 			if (toa->MGMT_flags != T_SUCCESS) {
1663 				logerr("update_router_list: MGMT_flags 0x%lx\n",
1664 				    toa->MGMT_flags);
1665 				return (_B_FALSE);
1666 			}
1667 			break;
1668 
1669 		default:
1670 			logerr("update_router_list: unknown primitive %ld\n",
1671 			    prim);
1672 			return (_B_FALSE);
1673 		}
1674 
1675 		/* Process the T_OPGMGMT_ACK below */
1676 		assert(prim == T_OPTMGMT_ACK);
1677 
1678 		switch (status) {
1679 		case 0:
1680 			/*
1681 			 * We have reached the end of this T_OPTMGMT_ACK
1682 			 * message. If this is the last message i.e EOD,
1683 			 * return, else process the next T_OPTMGMT_ACK msg.
1684 			 */
1685 			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
1686 			    sizeof (struct opthdr)) && optp->len == 0 &&
1687 			    optp->name == 0 && optp->level == 0) {
1688 				/*
1689 				 * This is the EOD message. Return
1690 				 */
1691 				return (_B_TRUE);
1692 			}
1693 			continue;
1694 
1695 		case MORECTL:
1696 		case MORECTL | MOREDATA:
1697 			/*
1698 			 * This should not happen. We should be able to read
1699 			 * the control portion in a single getmsg.
1700 			 */
1701 			logerr("update_router_list: MORECTL\n");
1702 			return (_B_FALSE);
1703 
1704 		case MOREDATA:
1705 			databuf.maxlen = optp->len;
1706 			/* malloc of 0 bytes is ok */
1707 			databuf.buf = malloc((size_t)optp->len);
1708 			if (databuf.maxlen != 0 && databuf.buf == NULL) {
1709 				logperror("update_router_list: malloc");
1710 				return (_B_FALSE);
1711 			}
1712 			databuf.len = 0;
1713 			flags = 0;
1714 			for (;;) {
1715 				status = getmsg(fd, NULL, &databuf, &flags);
1716 				if (status >= 0) {
1717 					break;
1718 				} else if (errno == EINTR) {
1719 					continue;
1720 				} else {
1721 					logperror("update_router_list:"
1722 					    " getmsg(data)");
1723 					free(databuf.buf);
1724 					return (_B_FALSE);
1725 				}
1726 			}
1727 
1728 			if (optp->level == MIB2_IP &&
1729 			    optp->name == MIB2_IP_ROUTE) {
1730 				/* LINTED */
1731 				ire_process_v4((mib2_ipRouteEntry_t *)
1732 				    databuf.buf, databuf.len);
1733 			} else if (optp->level == MIB2_IP6 &&
1734 			    optp->name == MIB2_IP6_ROUTE) {
1735 				/* LINTED */
1736 				ire_process_v6((mib2_ipv6RouteEntry_t *)
1737 				    databuf.buf, databuf.len);
1738 			}
1739 			free(databuf.buf);
1740 		}
1741 	}
1742 	/* NOTREACHED */
1743 }
1744 
1745 /*
1746  * Examine the IPv4 routing table, for default routers. For each default
1747  * router, populate the list of targets of each phyint that is on the same
1748  * link as the default router
1749  */
1750 static void
1751 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len)
1752 {
1753 	mib2_ipRouteEntry_t	*rp;
1754 	mib2_ipRouteEntry_t	*rp1;
1755 	struct	in_addr		nexthop_v4;
1756 	mib2_ipRouteEntry_t	*endp;
1757 
1758 	if (len == 0)
1759 		return;
1760 	assert((len % sizeof (mib2_ipRouteEntry_t)) == 0);
1761 
1762 	endp = buf + (len / sizeof (mib2_ipRouteEntry_t));
1763 
1764 	/*
1765 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1766 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1767 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1768 	 * This is a potential target for probing, which we try to add
1769 	 * to the list of probe targets.
1770 	 */
1771 	for (rp = buf; rp < endp; rp++) {
1772 		if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET))
1773 			continue;
1774 
1775 		/*  Get the nexthop address. */
1776 		nexthop_v4.s_addr = rp->ipRouteNextHop;
1777 
1778 		/*
1779 		 * Get the nexthop address. Then determine the outgoing
1780 		 * interface, by examining all interface IREs, and picking the
1781 		 * match. We don't look at the interface specified in the route
1782 		 * because we need to add the router target on all matching
1783 		 * interfaces anyway; the goal is to avoid falling back to
1784 		 * multicast when some interfaces are in the same subnet but
1785 		 * not in the same group.
1786 		 */
1787 		for (rp1 = buf; rp1 < endp; rp1++) {
1788 			if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) {
1789 				continue;
1790 			}
1791 
1792 			/*
1793 			 * Determine the interface IRE that matches the nexthop.
1794 			 * i.e.	 (IRE addr & IRE mask) == (nexthop & IRE mask)
1795 			 */
1796 			if ((rp1->ipRouteDest & rp1->ipRouteMask) ==
1797 			    (nexthop_v4.s_addr & rp1->ipRouteMask)) {
1798 				/*
1799 				 * We found the interface ire
1800 				 */
1801 				router_add_v4(rp1, nexthop_v4);
1802 			}
1803 		}
1804 	}
1805 }
1806 
1807 void
1808 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4)
1809 {
1810 	char *cp;
1811 	char ifname[LIFNAMSIZ + 1];
1812 	struct in6_addr	nexthop;
1813 	int len;
1814 
1815 	if (debug & D_TARGET)
1816 		logdebug("router_add_v4()\n");
1817 
1818 	len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1);
1819 	(void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len);
1820 	ifname[len] = '\0';
1821 
1822 	if (ifname[0] == '\0')
1823 		return;
1824 
1825 	cp = strchr(ifname, IF_SEPARATOR);
1826 	if (cp != NULL)
1827 		*cp = '\0';
1828 
1829 	IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop);
1830 	router_add_common(AF_INET, ifname, nexthop);
1831 }
1832 
1833 void
1834 router_add_common(int af, char *ifname, struct in6_addr nexthop)
1835 {
1836 	struct phyint_instance *pii;
1837 	struct phyint *pi;
1838 
1839 	if (debug & D_TARGET)
1840 		logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname);
1841 
1842 	/*
1843 	 * Retrieve the phyint instance; bail if it's not known to us yet.
1844 	 */
1845 	pii = phyint_inst_lookup(af, ifname);
1846 	if (pii == NULL)
1847 		return;
1848 
1849 	/*
1850 	 * Don't use our own addresses as targets.
1851 	 */
1852 	if (own_address(pii->pii_af, nexthop))
1853 		return;
1854 
1855 	/*
1856 	 * If the phyint is part a named group, then add the address to all
1857 	 * members of the group; note that this is suboptimal in the IPv4 case
1858 	 * as it has already been added to all matching interfaces in
1859 	 * ire_process_v4(). Otherwise, add the address only to the phyint
1860 	 * itself, since other phyints in the anongroup may not be on the same
1861 	 * subnet.
1862 	 */
1863 	pi = pii->pii_phyint;
1864 	if (pi->pi_group == phyint_anongroup) {
1865 		target_add(pii, nexthop, _B_TRUE);
1866 	} else {
1867 		pi = pi->pi_group->pg_phyint;
1868 		for (; pi != NULL; pi = pi->pi_pgnext)
1869 			target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE);
1870 	}
1871 }
1872 
1873 /*
1874  * Examine the IPv6 routing table, for default routers. For each default
1875  * router, populate the list of targets of each phyint that is on the same
1876  * link as the default router
1877  */
1878 static void
1879 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len)
1880 {
1881 	mib2_ipv6RouteEntry_t	*rp;
1882 	mib2_ipv6RouteEntry_t	*endp;
1883 	struct	in6_addr nexthop_v6;
1884 
1885 	if (debug & D_TARGET)
1886 		logdebug("ire_process_v6(len %d)\n", len);
1887 
1888 	if (len == 0)
1889 		return;
1890 
1891 	assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0);
1892 	endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t));
1893 
1894 	/*
1895 	 * Loop thru the routing table entries. Process any IRE_DEFAULT,
1896 	 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others.
1897 	 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address.
1898 	 * This is a potential target for probing, which we try to add
1899 	 * to the list of probe targets.
1900 	 */
1901 	for (rp = buf; rp < endp; rp++) {
1902 		if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET))
1903 			continue;
1904 
1905 		/*
1906 		 * We have the outgoing interface in ipv6RouteIfIndex
1907 		 * if ipv6RouteIfindex.o_length is non-zero. The outgoing
1908 		 * interface must be present for link-local addresses. Since
1909 		 * we use only link-local addreses for probing, we don't
1910 		 * consider the case when the outgoing interface is not
1911 		 * known and we need to scan interface ires
1912 		 */
1913 		nexthop_v6 = rp->ipv6RouteNextHop;
1914 		if (rp->ipv6RouteIfIndex.o_length != 0) {
1915 			/*
1916 			 * We already have the outgoing interface
1917 			 * in ipv6RouteIfIndex.
1918 			 */
1919 			router_add_v6(rp, nexthop_v6);
1920 		}
1921 	}
1922 }
1923 
1924 
1925 void
1926 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6)
1927 {
1928 	char ifname[LIFNAMSIZ + 1];
1929 	char *cp;
1930 	int  len;
1931 
1932 	if (debug & D_TARGET)
1933 		logdebug("router_add_v6()\n");
1934 
1935 	len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1);
1936 	(void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len);
1937 	ifname[len] = '\0';
1938 
1939 	if (ifname[0] == '\0')
1940 		return;
1941 
1942 	cp = strchr(ifname, IF_SEPARATOR);
1943 	if (cp != NULL)
1944 		*cp = '\0';
1945 
1946 	router_add_common(AF_INET6, ifname, nexthop_v6);
1947 }
1948 
1949 
1950 
1951 /*
1952  * Build a list of target routers, by scanning the routing tables.
1953  * It is assumed that interface routes exist, to reach the routers.
1954  */
1955 static void
1956 init_router_targets(void)
1957 {
1958 	struct	target *tg;
1959 	struct	target *next_tg;
1960 	struct	phyint_instance *pii;
1961 	struct	phyint *pi;
1962 
1963 	if (force_mcast)
1964 		return;
1965 
1966 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1967 		pi = pii->pii_phyint;
1968 		/*
1969 		 * Exclude ptp and host targets. Set tg_in_use to false,
1970 		 * only for router targets.
1971 		 */
1972 		if (!pii->pii_targets_are_routers ||
1973 		    (pi->pi_flags & IFF_POINTOPOINT))
1974 			continue;
1975 
1976 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1977 			tg->tg_in_use = 0;
1978 	}
1979 
1980 	if (mibfd < 0) {
1981 		mibfd = open("/dev/ip", O_RDWR);
1982 		if (mibfd < 0) {
1983 			logperror("mibopen: ip open");
1984 			exit(1);
1985 		}
1986 	}
1987 
1988 	if (!update_router_list(mibfd)) {
1989 		(void) close(mibfd);
1990 		mibfd = -1;
1991 	}
1992 
1993 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
1994 		if (!pii->pii_targets_are_routers ||
1995 		    (pi->pi_flags & IFF_POINTOPOINT))
1996 			continue;
1997 
1998 		for (tg = pii->pii_targets; tg != NULL; tg = next_tg) {
1999 			next_tg = tg->tg_next;
2000 			if (!tg->tg_in_use) {
2001 				target_delete(tg);
2002 			}
2003 		}
2004 	}
2005 }
2006 
2007 /*
2008  * Attempt to assign host targets to any interfaces that do not currently
2009  * have probe targets by sharing targets with other interfaces in the group.
2010  */
2011 static void
2012 init_host_targets(void)
2013 {
2014 	struct phyint_instance *pii;
2015 	struct phyint_group *pg;
2016 
2017 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2018 		pg = pii->pii_phyint->pi_group;
2019 		if (pg != phyint_anongroup && pii->pii_targets == NULL)
2020 			dup_host_targets(pii);
2021 	}
2022 }
2023 
2024 /*
2025  * Duplicate host targets from other phyints of the group to
2026  * the phyint instance 'desired_pii'.
2027  */
2028 static void
2029 dup_host_targets(struct phyint_instance	 *desired_pii)
2030 {
2031 	int af;
2032 	struct phyint *pi;
2033 	struct phyint_instance *pii;
2034 	struct target *tg;
2035 
2036 	assert(desired_pii->pii_phyint->pi_group != phyint_anongroup);
2037 
2038 	af = desired_pii->pii_af;
2039 
2040 	/*
2041 	 * For every phyint in the same group as desired_pii, check if
2042 	 * it has any host targets. If so add them to desired_pii.
2043 	 */
2044 	for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) {
2045 		pii = PHYINT_INSTANCE(pi, af);
2046 		/*
2047 		 * We know that we don't have targets on this phyint instance
2048 		 * since we have been called. But we still check for
2049 		 * pii_targets_are_routers because another phyint instance
2050 		 * could have router targets, since IFF_NOFAILOVER addresses
2051 		 * on different phyint instances may belong to different
2052 		 * subnets.
2053 		 */
2054 		if ((pii == NULL) || (pii == desired_pii) ||
2055 		    pii->pii_targets_are_routers)
2056 			continue;
2057 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2058 			target_create(desired_pii, tg->tg_address, _B_FALSE);
2059 		}
2060 	}
2061 }
2062 
/*
 * Print a one-line usage message on stderr, naming the program as `cmd'.
 */
static void
usage(char *cmd)
{
	(void) fprintf(stderr, "usage: %s\n", cmd);
}
2068 
2069 
2070 #define	MPATHD_DEFAULT_FILE	"/etc/default/mpathd"
2071 
2072 /* Get an option from the /etc/default/mpathd file */
2073 static char *
2074 getdefault(char *name)
2075 {
2076 	char namebuf[BUFSIZ];
2077 	char *value = NULL;
2078 
2079 	if (defopen(MPATHD_DEFAULT_FILE) == 0) {
2080 		char	*cp;
2081 		int	flags;
2082 
2083 		/*
2084 		 * ignore case
2085 		 */
2086 		flags = defcntl(DC_GETFLAGS, 0);
2087 		TURNOFF(flags, DC_CASE);
2088 		(void) defcntl(DC_SETFLAGS, flags);
2089 
2090 		/* Add "=" to the name */
2091 		(void) strncpy(namebuf, name, sizeof (namebuf) - 2);
2092 		(void) strncat(namebuf, "=", 2);
2093 
2094 		if ((cp = defread(namebuf)) != NULL)
2095 			value = strdup(cp);
2096 
2097 		/* close */
2098 		(void) defopen((char *)NULL);
2099 	}
2100 	return (value);
2101 }
2102 
2103 
2104 /*
2105  * Command line options below
2106  */
2107 boolean_t	failback_enabled = _B_TRUE;	/* failback enabled/disabled */
2108 boolean_t	track_all_phyints = _B_FALSE;	/* option to track all NICs */
2109 static boolean_t adopt = _B_FALSE;
2110 static boolean_t foreground = _B_FALSE;
2111 
2112 int
2113 main(int argc, char *argv[])
2114 {
2115 	int i;
2116 	int c;
2117 	struct phyint_instance *pii;
2118 	char *value;
2119 
2120 	argv0 = argv;		/* Saved for re-exec on SIGHUP */
2121 	srandom(gethostid());	/* Initialize the random number generator */
2122 
2123 	/*
2124 	 * NOTE: The messages output by in.mpathd are not suitable for
2125 	 * translation, so we do not call textdomain().
2126 	 */
2127 	(void) setlocale(LC_ALL, "");
2128 
2129 	/*
2130 	 * Get the user specified value of 'failure detection time'
2131 	 * from /etc/default/mpathd
2132 	 */
2133 	value = getdefault("FAILURE_DETECTION_TIME");
2134 	if (value != NULL) {
2135 		user_failure_detection_time =
2136 		    (int)strtol((char *)value, NULL, 0);
2137 
2138 		if (user_failure_detection_time <= 0) {
2139 			user_failure_detection_time = FAILURE_DETECTION_TIME;
2140 			logerr("Invalid failure detection time %s, assuming "
2141 			    "default %d\n", value, user_failure_detection_time);
2142 
2143 		} else if (user_failure_detection_time <
2144 		    MIN_FAILURE_DETECTION_TIME) {
2145 			user_failure_detection_time =
2146 			    MIN_FAILURE_DETECTION_TIME;
2147 			logerr("Too small failure detection time of %s, "
2148 			    "assuming minimum %d\n", value,
2149 			    user_failure_detection_time);
2150 		}
2151 		free(value);
2152 	} else {
2153 		/* User has not specified the parameter, Use default value */
2154 		user_failure_detection_time = FAILURE_DETECTION_TIME;
2155 	}
2156 
2157 	/*
2158 	 * This gives the frequency at which probes will be sent.
2159 	 * When fdt ms elapses, we should be able to determine
2160 	 * whether 5 consecutive probes have failed or not.
2161 	 * 1 probe will be sent in every user_probe_interval ms,
2162 	 * randomly anytime in the (0.5  - 1.0) 2nd half of every
2163 	 * user_probe_interval. Thus when we send out probe 'n' we
2164 	 * can be sure that probe 'n - 2' is lost, if we have not
2165 	 * got the ack. (since the probe interval is > crtt). But
2166 	 * probe 'n - 1' may be a valid unacked probe, since the
2167 	 * time between 2 successive probes could be as small as
2168 	 * 0.5 * user_probe_interval.  Hence the NUM_PROBE_FAILS + 2
2169 	 */
2170 	user_probe_interval = user_failure_detection_time /
2171 	    (NUM_PROBE_FAILS + 2);
2172 
2173 	/*
2174 	 * Get the user specified value of failback_enabled from
2175 	 * /etc/default/mpathd
2176 	 */
2177 	value = getdefault("FAILBACK");
2178 	if (value != NULL) {
2179 		if (strncasecmp(value, "yes", 3) == 0)
2180 			failback_enabled = _B_TRUE;
2181 		else if (strncasecmp(value, "no", 2) == 0)
2182 			failback_enabled = _B_FALSE;
2183 		else
2184 			logerr("Invalid value for FAILBACK %s\n", value);
2185 		free(value);
2186 	} else {
2187 		failback_enabled = _B_TRUE;
2188 	}
2189 
2190 	/*
2191 	 * Get the user specified value of track_all_phyints from
2192 	 * /etc/default/mpathd. The sense is reversed in
2193 	 * TRACK_INTERFACES_ONLY_WITH_GROUPS.
2194 	 */
2195 	value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS");
2196 	if (value != NULL) {
2197 		if (strncasecmp(value, "yes", 3) == 0)
2198 			track_all_phyints = _B_FALSE;
2199 		else if (strncasecmp(value, "no", 2) == 0)
2200 			track_all_phyints = _B_TRUE;
2201 		else
2202 			logerr("Invalid value for "
2203 			    "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value);
2204 		free(value);
2205 	} else {
2206 		track_all_phyints = _B_FALSE;
2207 	}
2208 
2209 	while ((c = getopt(argc, argv, "adD:ml")) != EOF) {
2210 		switch (c) {
2211 		case 'a':
2212 			adopt = _B_TRUE;
2213 			break;
2214 		case 'm':
2215 			force_mcast = _B_TRUE;
2216 			break;
2217 		case 'd':
2218 			debug = D_ALL;
2219 			foreground = _B_TRUE;
2220 			break;
2221 		case 'D':
2222 			i = (int)strtol(optarg, NULL, 0);
2223 			if (i == 0) {
2224 				(void) fprintf(stderr, "Bad debug flags: %s\n",
2225 				    optarg);
2226 				exit(1);
2227 			}
2228 			debug |= i;
2229 			foreground = _B_TRUE;
2230 			break;
2231 		case 'l':
2232 			/*
2233 			 * Turn off link state notification handling.
2234 			 * Undocumented command line flag, for debugging
2235 			 * purposes.
2236 			 */
2237 			handle_link_notifications = _B_FALSE;
2238 			break;
2239 		default:
2240 			usage(argv[0]);
2241 			exit(1);
2242 		}
2243 	}
2244 
2245 	/*
2246 	 * The sockets for the loopback command interface should be listening
2247 	 * before we fork and exit in daemonize(). This way, whoever started us
2248 	 * can use the loopback interface as soon as they get a zero exit
2249 	 * status.
2250 	 */
2251 	lsock_v4 = setup_listener(AF_INET);
2252 	lsock_v6 = setup_listener(AF_INET6);
2253 
2254 	if (lsock_v4 < 0 && lsock_v6 < 0) {
2255 		logerr("main: setup_listener failed for both IPv4 and IPv6\n");
2256 		exit(1);
2257 	}
2258 
2259 	if (!foreground) {
2260 		if (!daemonize()) {
2261 			logerr("cannot daemonize\n");
2262 			exit(EXIT_FAILURE);
2263 		}
2264 		initlog();
2265 	}
2266 
2267 	/*
2268 	 * Initializations:
2269 	 * 1. Create ifsock* sockets. These are used for performing SIOC*
2270 	 *    ioctls. We have 2 sockets 1 each for IPv4 and IPv6.
2271 	 * 2. Initialize a pipe for handling/recording signal events.
2272 	 * 3. Create the routing sockets,  used for listening
2273 	 *    to routing / interface changes.
2274 	 * 4. phyint_init() - Initialize physical interface state
2275 	 *    (in mpd_tables.c).  Must be done before creating interfaces,
2276 	 *    which timer_init() does indirectly.
2277 	 * 5. timer_init()  - Initialize timer related stuff
2278 	 * 6. initifs() - Initialize our database of all known interfaces
2279 	 * 7. init_router_targets() - Initialize our database of all known
2280 	 *    router targets.
2281 	 */
2282 	ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0);
2283 	if (ifsock_v4 < 0) {
2284 		logperror("main: IPv4 socket open");
2285 		exit(1);
2286 	}
2287 
2288 	ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0);
2289 	if (ifsock_v6 < 0) {
2290 		logperror("main: IPv6 socket open");
2291 		exit(1);
2292 	}
2293 
2294 	setup_eventpipe();
2295 
2296 	rtsock_v4 = setup_rtsock(AF_INET);
2297 	rtsock_v6 = setup_rtsock(AF_INET6);
2298 
2299 	if (phyint_init() == -1) {
2300 		logerr("cannot initialize physical interface structures");
2301 		exit(1);
2302 	}
2303 
2304 	timer_init();
2305 
2306 	initifs();
2307 
2308 	/* Inform kernel whether failback is enabled or disabled */
2309 	if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) {
2310 		logperror("main: ioctl (SIOCSIPMPFAILBACK)");
2311 		exit(1);
2312 	}
2313 
2314 	/*
2315 	 * If we're operating in "adopt" mode and no interfaces need to be
2316 	 * tracked, shut down (ifconfig(1M) will restart us on demand if
2317 	 * interfaces are subsequently put into multipathing groups).
2318 	 */
2319 	if (adopt && phyint_instances == NULL)
2320 		exit(0);
2321 
2322 	/*
2323 	 * Main body. Keep listening for activity on any of the sockets
2324 	 * that we are monitoring and take appropriate action as necessary.
2325 	 * signals are also handled synchronously.
2326 	 */
2327 	for (;;) {
2328 		if (poll(pollfds, pollfd_num, -1) < 0) {
2329 			if (errno == EINTR)
2330 				continue;
2331 			logperror("main: poll");
2332 			exit(1);
2333 		}
2334 		for (i = 0; i < pollfd_num; i++) {
2335 			if ((pollfds[i].fd == -1) ||
2336 			    !(pollfds[i].revents & POLLIN))
2337 				continue;
2338 			if (pollfds[i].fd == eventpipe_read) {
2339 				in_signal(eventpipe_read);
2340 				break;
2341 			}
2342 			if (pollfds[i].fd == rtsock_v4 ||
2343 				pollfds[i].fd == rtsock_v6) {
2344 				process_rtsock(rtsock_v4, rtsock_v6);
2345 				break;
2346 			}
2347 			for (pii = phyint_instances; pii != NULL;
2348 			    pii = pii->pii_next) {
2349 				if (pollfds[i].fd == pii->pii_probe_sock) {
2350 					if (pii->pii_af == AF_INET)
2351 						in_data(pii);
2352 					else
2353 						in6_data(pii);
2354 					break;
2355 				}
2356 			}
2357 			if (pollfds[i].fd == lsock_v4)
2358 				loopback_cmd(lsock_v4, AF_INET);
2359 			else if (pollfds[i].fd == lsock_v6)
2360 				loopback_cmd(lsock_v6, AF_INET6);
2361 		}
2362 		if (full_scan_required) {
2363 			initifs();
2364 			full_scan_required = _B_FALSE;
2365 		}
2366 	}
2367 	/* NOTREACHED */
2368 	return (EXIT_SUCCESS);
2369 }
2370 
2371 static int
2372 setup_listener(int af)
2373 {
2374 	int sock;
2375 	int on;
2376 	int len;
2377 	int ret;
2378 	struct sockaddr_storage laddr;
2379 	struct sockaddr_in  *sin;
2380 	struct sockaddr_in6 *sin6;
2381 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2382 
2383 	assert(af == AF_INET || af == AF_INET6);
2384 
2385 	sock = socket(af, SOCK_STREAM, 0);
2386 	if (sock < 0) {
2387 		logperror("setup_listener: socket");
2388 		exit(1);
2389 	}
2390 
2391 	on = 1;
2392 	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
2393 	    sizeof (on)) < 0) {
2394 		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
2395 		exit(1);
2396 	}
2397 
2398 	bzero(&laddr, sizeof (laddr));
2399 	laddr.ss_family = af;
2400 
2401 	if (af == AF_INET) {
2402 		sin = (struct sockaddr_in *)&laddr;
2403 		sin->sin_port = htons(MPATHD_PORT);
2404 		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
2405 		len = sizeof (struct sockaddr_in);
2406 	} else {
2407 		sin6 = (struct sockaddr_in6 *)&laddr;
2408 		sin6->sin6_port = htons(MPATHD_PORT);
2409 		sin6->sin6_addr = loopback_addr;
2410 		len = sizeof (struct sockaddr_in6);
2411 	}
2412 
2413 	ret = bind(sock, (struct sockaddr *)&laddr, len);
2414 	if (ret < 0) {
2415 		if (errno == EADDRINUSE) {
2416 			/*
2417 			 * Another instance of mpathd may be already active.
2418 			 */
2419 			logerr("main: is another instance of in.mpathd "
2420 			    "already active?\n");
2421 			exit(1);
2422 		} else {
2423 			(void) close(sock);
2424 			return (-1);
2425 		}
2426 	}
2427 	if (listen(sock, 30) < 0) {
2428 		logperror("main: listen");
2429 		exit(1);
2430 	}
2431 	if (poll_add(sock) == -1) {
2432 		(void) close(sock);
2433 		exit(1);
2434 	}
2435 
2436 	return (sock);
2437 }
2438 
2439 /*
2440  * Table of commands and their expected size; used by loopback_cmd().
2441  */
2442 static struct {
2443 	const char	*name;
2444 	unsigned int	size;
2445 } commands[] = {
2446 	{ "MI_PING",		sizeof (uint32_t)	},
2447 	{ "MI_OFFLINE",		sizeof (mi_offline_t)	},
2448 	{ "MI_UNDO_OFFLINE",	sizeof (mi_undo_offline_t) },
2449 	{ "MI_SETOINDEX",	sizeof (mi_setoindex_t) },
2450 	{ "MI_QUERY",		sizeof (mi_query_t)	}
2451 };
2452 
2453 /*
2454  * Commands received over the loopback interface come here. Currently
2455  * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP
2456  * module. ifconfig only makes a connection, and closes it to check if
2457  * in.mpathd is running.
2458  * if_mpadm sends commands in the format specified by the mpathd_interface
2459  * structure.
2460  */
2461 static void
2462 loopback_cmd(int sock, int family)
2463 {
2464 	int newfd;
2465 	ssize_t len;
2466 	struct sockaddr_storage	peer;
2467 	struct sockaddr_in	*peer_sin;
2468 	struct sockaddr_in6	*peer_sin6;
2469 	socklen_t peerlen;
2470 	union mi_commands mpi;
2471 	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
2472 	char abuf[INET6_ADDRSTRLEN];
2473 	uint_t cmd;
2474 	int retval;
2475 
2476 	peerlen = sizeof (peer);
2477 	newfd = accept(sock, (struct sockaddr *)&peer, &peerlen);
2478 	if (newfd < 0) {
2479 		logperror("loopback_cmd: accept");
2480 		return;
2481 	}
2482 
2483 	switch (family) {
2484 	case AF_INET:
2485 		/*
2486 		 * Validate the address and port to make sure that
2487 		 * non privileged processes don't connect and start
2488 		 * talking to us.
2489 		 */
2490 		if (peerlen != sizeof (struct sockaddr_in)) {
2491 			logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen);
2492 			(void) close(newfd);
2493 			return;
2494 		}
2495 		peer_sin = (struct sockaddr_in *)&peer;
2496 		if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) ||
2497 		    (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) {
2498 			(void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr,
2499 			    abuf, sizeof (abuf));
2500 			logerr("Attempt to connect from addr %s port %d\n",
2501 			    abuf, ntohs(peer_sin->sin_port));
2502 			(void) close(newfd);
2503 			return;
2504 		}
2505 		break;
2506 
2507 	case AF_INET6:
2508 		if (peerlen != sizeof (struct sockaddr_in6)) {
2509 			logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen);
2510 			(void) close(newfd);
2511 			return;
2512 		}
2513 		/*
2514 		 * Validate the address and port to make sure that
2515 		 * non privileged processes don't connect and start
2516 		 * talking to us.
2517 		 */
2518 		peer_sin6 = (struct sockaddr_in6 *)&peer;
2519 		if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) ||
2520 		    (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr,
2521 		    &loopback_addr))) {
2522 			(void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf,
2523 			    sizeof (abuf));
2524 			logerr("Attempt to connect from addr %s port %d\n",
2525 			    abuf, ntohs(peer_sin6->sin6_port));
2526 			(void) close(newfd);
2527 			return;
2528 		}
2529 
2530 	default:
2531 		logdebug("loopback_cmd: family %d\n", family);
2532 		(void) close(newfd);
2533 		return;
2534 	}
2535 
2536 	/*
2537 	 * The sizeof the 'mpi' buffer corresponds to the maximum size of
2538 	 * all supported commands
2539 	 */
2540 	len = read(newfd, &mpi, sizeof (mpi));
2541 
2542 	/*
2543 	 * ifconfig does not send any data. Just tests to see if mpathd
2544 	 * is already running.
2545 	 */
2546 	if (len <= 0) {
2547 		(void) close(newfd);
2548 		return;
2549 	}
2550 
2551 	/*
2552 	 * In theory, we can receive any sized message for a stream socket,
2553 	 * but we don't expect that to happen for a small message over a
2554 	 * loopback connection.
2555 	 */
2556 	if (len < sizeof (uint32_t)) {
2557 		logerr("loopback_cmd: bad command format or read returns "
2558 		    "partial data %d\n", len);
2559 	}
2560 
2561 	cmd = mpi.mi_command;
2562 	if (cmd >= MI_NCMD) {
2563 		logerr("loopback_cmd: unknown command id `%d'\n", cmd);
2564 		(void) close(newfd);
2565 		return;
2566 	}
2567 
2568 	if (len < commands[cmd].size) {
2569 		logerr("loopback_cmd: short %s command (expected %d, got %d)\n",
2570 		    commands[cmd].name, commands[cmd].size, len);
2571 		(void) close(newfd);
2572 		return;
2573 	}
2574 
2575 	retval = process_cmd(newfd, &mpi);
2576 	if (retval != IPMP_SUCCESS) {
2577 		logerr("failed processing %s: %s\n", commands[cmd].name,
2578 		    ipmp_errmsg(retval));
2579 	}
2580 	(void) close(newfd);
2581 }
2582 
2583 extern int global_errno;	/* set by failover() or failback() */
2584 
2585 /*
2586  * Process the offline, undo offline and set original index commands,
2587  * received from if_mpadm(1M)
2588  */
2589 static unsigned int
2590 process_cmd(int newfd, union mi_commands *mpi)
2591 {
2592 	uint_t	nif = 0;
2593 	uint32_t cmd;
2594 	struct phyint *pi;
2595 	struct phyint *pi2;
2596 	struct phyint_group *pg;
2597 	boolean_t success;
2598 	int error;
2599 	struct mi_offline *mio;
2600 	struct mi_undo_offline *miu;
2601 	struct lifreq lifr;
2602 	int ifsock;
2603 	struct mi_setoindex *mis;
2604 
2605 	cmd = mpi->mi_command;
2606 
2607 	switch (cmd) {
2608 	case MI_OFFLINE:
2609 		mio = &mpi->mi_ocmd;
2610 		/*
2611 		 * Lookup the interface that needs to be offlined.
2612 		 * If it does not exist, return a suitable error.
2613 		 */
2614 		pi = phyint_lookup(mio->mio_ifname);
2615 		if (pi == NULL)
2616 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2617 
2618 		/*
2619 		 * Verify that the minimum redundancy requirements are met.
2620 		 * The multipathing group must have at least the specified
2621 		 * number of functional interfaces after offlining the
2622 		 * requested interface. Otherwise return a suitable error.
2623 		 */
2624 		pg = pi->pi_group;
2625 		nif = 0;
2626 		if (pg != phyint_anongroup) {
2627 			for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL;
2628 			    pi2 = pi2->pi_pgnext) {
2629 				if ((pi2->pi_state == PI_RUNNING) ||
2630 				    (pg->pg_groupfailed &&
2631 				    !(pi2->pi_flags & IFF_OFFLINE)))
2632 					nif++;
2633 			}
2634 		}
2635 		if (nif < mio->mio_min_redundancy)
2636 			return (send_result(newfd, IPMP_EMINRED, 0));
2637 
2638 		/*
2639 		 * The order of operation is to set IFF_OFFLINE, followed by
2640 		 * failover. Setting IFF_OFFLINE ensures that no new ipif's
2641 		 * can be created. Subsequent failover moves everything on
2642 		 * the OFFLINE interface to some other functional interface.
2643 		 */
2644 		success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE);
2645 		if (success) {
2646 			if (!pi->pi_empty) {
2647 				error = try_failover(pi, FAILOVER_NORMAL);
2648 				if (error != 0) {
2649 					if (!change_lif_flags(pi, IFF_OFFLINE,
2650 					    _B_FALSE)) {
2651 						logerr("process_cmd: couldn't"
2652 						    " clear OFFLINE flag on"
2653 						    " %s\n", pi->pi_name);
2654 						/*
2655 						 * Offline interfaces should
2656 						 * not be probed.
2657 						 */
2658 						stop_probing(pi);
2659 					}
2660 					return (send_result(newfd, error,
2661 					    global_errno));
2662 				}
2663 			}
2664 		} else {
2665 			return (send_result(newfd, IPMP_FAILURE, errno));
2666 		}
2667 
2668 		/*
2669 		 * The interface is now Offline, so stop probing it.
2670 		 * Note that if_mpadm(1M) will down the test addresses,
2671 		 * after receiving a success reply from us. The routing
2672 		 * socket message will then make us close the socket used
2673 		 * for sending probes. But it is more logical that an
2674 		 * offlined interface must not be probed, even if it has
2675 		 * test addresses.
2676 		 */
2677 		stop_probing(pi);
2678 		return (send_result(newfd, IPMP_SUCCESS, 0));
2679 
2680 	case MI_UNDO_OFFLINE:
2681 		miu = &mpi->mi_ucmd;
2682 		/*
2683 		 * Undo the offline command. As usual lookup the interface.
2684 		 * Send an error if it does not exist.
2685 		 */
2686 		pi = phyint_lookup(miu->miu_ifname);
2687 		if (pi == NULL)
2688 			return (send_result(newfd, IPMP_FAILURE, EINVAL));
2689 
2690 		/*
2691 		 * Inverse of the offline operation. Do a failback, and then
2692 		 * clear the IFF_OFFLINE flag.
2693 		 */
2694 		error = do_failback(pi, _B_TRUE);
2695 		if (error == IPMP_EFBPARTIAL)
2696 			return (send_result(newfd, IPMP_EFBPARTIAL, 0));
2697 		error = do_failback(pi, _B_FALSE);
2698 
2699 		switch (error) {
2700 		case IPMP_SUCCESS:
2701 			if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) {
2702 				logdebug("undo error %X\n", global_errno);
2703 				error = IPMP_FAILURE;
2704 				break;
2705 			}
2706 			/* FALLTHROUGH */
2707 
2708 		case IPMP_EFBPARTIAL:
2709 			/*
2710 			 * Reset the state of the interface based on the
2711 			 * current link state; if this phyint subsequently
2712 			 * acquires a test address, the state will be changed
2713 			 * again later as a result of the probes.
2714 			 */
2715 			if (LINK_UP(pi))
2716 				phyint_chstate(pi, PI_RUNNING);
2717 			else
2718 				phyint_chstate(pi, PI_FAILED);
2719 			break;
2720 
2721 		case IPMP_FAILURE:
2722 			break;
2723 
2724 		default:
2725 			logdebug("do_failback: unexpected return value\n");
2726 			break;
2727 		}
2728 		return (send_result(newfd, error, global_errno));
2729 
2730 	case MI_SETOINDEX:
2731 		mis = &mpi->mi_scmd;
2732 
2733 		/* Get the socket for doing ioctls */
2734 		ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6;
2735 
2736 		/*
2737 		 * Get index of new original interface.
2738 		 * The index is returned in lifr.lifr_index.
2739 		 */
2740 		(void) strlcpy(lifr.lifr_name, mis->mis_new_pifname,
2741 		    sizeof (lifr.lifr_name));
2742 
2743 		if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0)
2744 			return (send_result(newfd, IPMP_FAILURE, errno));
2745 
2746 		/*
2747 		 * Set new original interface index.
2748 		 * The new index was put into lifr.lifr_index by the
2749 		 * SIOCGLIFINDEX ioctl.
2750 		 */
2751 		(void) strlcpy(lifr.lifr_name, mis->mis_lifname,
2752 		    sizeof (lifr.lifr_name));
2753 
2754 		if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0)
2755 			return (send_result(newfd, IPMP_FAILURE, errno));
2756 
2757 		return (send_result(newfd, IPMP_SUCCESS, 0));
2758 
2759 	case MI_QUERY:
2760 		return (process_query(newfd, &mpi->mi_qcmd));
2761 
2762 	default:
2763 		break;
2764 	}
2765 
2766 	return (send_result(newfd, IPMP_EPROTO, 0));
2767 }
2768 
2769 /*
2770  * Process the query request pointed to by `miq' and send a reply on file
2771  * descriptor `fd'.  Returns an IPMP error code.
2772  */
2773 static unsigned int
2774 process_query(int fd, mi_query_t *miq)
2775 {
2776 	ipmp_groupinfo_t	*grinfop;
2777 	ipmp_groupinfolist_t	*grlp;
2778 	ipmp_grouplist_t	*grlistp;
2779 	ipmp_ifinfo_t		*ifinfop;
2780 	ipmp_ifinfolist_t	*iflp;
2781 	ipmp_snap_t		*snap;
2782 	unsigned int		retval;
2783 
2784 	switch (miq->miq_inforeq) {
2785 	case IPMP_GROUPLIST:
2786 		retval = getgrouplist(&grlistp);
2787 		if (retval != IPMP_SUCCESS)
2788 			return (send_result(fd, retval, errno));
2789 
2790 		retval = send_result(fd, IPMP_SUCCESS, 0);
2791 		if (retval == IPMP_SUCCESS)
2792 			retval = send_grouplist(fd, grlistp);
2793 
2794 		ipmp_freegrouplist(grlistp);
2795 		return (retval);
2796 
2797 	case IPMP_GROUPINFO:
2798 		miq->miq_grname[LIFGRNAMSIZ - 1] = '\0';
2799 		retval = getgroupinfo(miq->miq_ifname, &grinfop);
2800 		if (retval != IPMP_SUCCESS)
2801 			return (send_result(fd, retval, errno));
2802 
2803 		retval = send_result(fd, IPMP_SUCCESS, 0);
2804 		if (retval == IPMP_SUCCESS)
2805 			retval = send_groupinfo(fd, grinfop);
2806 
2807 		ipmp_freegroupinfo(grinfop);
2808 		return (retval);
2809 
2810 	case IPMP_IFINFO:
2811 		miq->miq_ifname[LIFNAMSIZ - 1] = '\0';
2812 		retval = getifinfo(miq->miq_ifname, &ifinfop);
2813 		if (retval != IPMP_SUCCESS)
2814 			return (send_result(fd, retval, errno));
2815 
2816 		retval = send_result(fd, IPMP_SUCCESS, 0);
2817 		if (retval == IPMP_SUCCESS)
2818 			retval = send_ifinfo(fd, ifinfop);
2819 
2820 		ipmp_freeifinfo(ifinfop);
2821 		return (retval);
2822 
2823 	case IPMP_SNAP:
2824 		retval = getsnap(&snap);
2825 		if (retval != IPMP_SUCCESS)
2826 			return (send_result(fd, retval, errno));
2827 
2828 		retval = send_result(fd, IPMP_SUCCESS, 0);
2829 		if (retval != IPMP_SUCCESS)
2830 			goto out;
2831 
2832 		retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap);
2833 		if (retval != IPMP_SUCCESS)
2834 			goto out;
2835 
2836 		retval = send_grouplist(fd, snap->sn_grlistp);
2837 		if (retval != IPMP_SUCCESS)
2838 			goto out;
2839 
2840 		iflp = snap->sn_ifinfolistp;
2841 		for (; iflp != NULL; iflp = iflp->ifl_next) {
2842 			retval = send_ifinfo(fd, iflp->ifl_ifinfop);
2843 			if (retval != IPMP_SUCCESS)
2844 				goto out;
2845 		}
2846 
2847 		grlp = snap->sn_grinfolistp;
2848 		for (; grlp != NULL; grlp = grlp->grl_next) {
2849 			retval = send_groupinfo(fd, grlp->grl_grinfop);
2850 			if (retval != IPMP_SUCCESS)
2851 				goto out;
2852 		}
2853 	out:
2854 		ipmp_snap_free(snap);
2855 		return (retval);
2856 
2857 	default:
2858 		break;
2859 
2860 	}
2861 	return (send_result(fd, IPMP_EPROTO, 0));
2862 }
2863 
2864 /*
2865  * Send the group information pointed to by `grinfop' on file descriptor `fd'.
2866  * Returns an IPMP error code.
2867  */
2868 static unsigned int
2869 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop)
2870 {
2871 	ipmp_iflist_t	*iflistp = grinfop->gr_iflistp;
2872 	unsigned int	retval;
2873 
2874 	retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop);
2875 	if (retval != IPMP_SUCCESS)
2876 		return (retval);
2877 
2878 	return (ipmp_writetlv(fd, IPMP_IFLIST,
2879 	    IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp));
2880 }
2881 
2882 /*
2883  * Send the interface information pointed to by `ifinfop' on file descriptor
2884  * `fd'.  Returns an IPMP error code.
2885  */
2886 static unsigned int
2887 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop)
2888 {
2889 	return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop));
2890 }
2891 
2892 /*
2893  * Send the group list pointed to by `grlistp' on file descriptor `fd'.
2894  * Returns an IPMP error code.
2895  */
2896 static unsigned int
2897 send_grouplist(int fd, ipmp_grouplist_t *grlistp)
2898 {
2899 	return (ipmp_writetlv(fd, IPMP_GROUPLIST,
2900 	    IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp));
2901 }
2902 
2903 /*
2904  * Initialize an mi_result_t structure using `error' and `syserror' and
2905  * send it on file descriptor `fd'.  Returns an IPMP error code.
2906  */
2907 static unsigned int
2908 send_result(int fd, unsigned int error, int syserror)
2909 {
2910 	mi_result_t me;
2911 
2912 	me.me_mpathd_error = error;
2913 	if (error == IPMP_FAILURE)
2914 		me.me_sys_error = syserror;
2915 	else
2916 		me.me_sys_error = 0;
2917 
2918 	return (ipmp_write(fd, &me, sizeof (me)));
2919 }
2920 
2921 /*
2922  * Daemonize the process.
2923  */
2924 static boolean_t
2925 daemonize(void)
2926 {
2927 	switch (fork()) {
2928 	case -1:
2929 		return (_B_FALSE);
2930 
2931 	case  0:
2932 		/*
2933 		 * Lose our controlling terminal, and become both a session
2934 		 * leader and a process group leader.
2935 		 */
2936 		if (setsid() == -1)
2937 			return (_B_FALSE);
2938 
2939 		/*
2940 		 * Under POSIX, a session leader can accidentally (through
2941 		 * open(2)) acquire a controlling terminal if it does not
2942 		 * have one.  Just to be safe, fork() again so we are not a
2943 		 * session leader.
2944 		 */
2945 		switch (fork()) {
2946 		case -1:
2947 			return (_B_FALSE);
2948 
2949 		case 0:
2950 			(void) chdir("/");
2951 			(void) umask(022);
2952 			(void) fdwalk(closefunc, NULL);
2953 			break;
2954 
2955 		default:
2956 			_exit(EXIT_SUCCESS);
2957 		}
2958 		break;
2959 
2960 	default:
2961 		_exit(EXIT_SUCCESS);
2962 	}
2963 
2964 	return (_B_TRUE);
2965 }
2966 
2967 /*
2968  * The parent has created some fds before forking on purpose, keep them open.
2969  */
2970 static int
2971 closefunc(void *not_used, int fd)
2972 /* ARGSUSED */
2973 {
2974 	if (fd != lsock_v4 && fd != lsock_v6)
2975 		(void) close(fd);
2976 	return (0);
2977 }
2978 
2979 /* LOGGER */
2980 
2981 #include <syslog.h>
2982 
2983 /*
2984  * Logging routines.  All routines log to syslog, unless the daemon is
2985  * running in the foreground, in which case the logging goes to stderr.
2986  *
2987  * The following routines are available:
2988  *
2989  *	logdebug(): A printf-like function for outputting debug messages
2990  *	(messages at LOG_DEBUG) that are only of use to developers.
2991  *
2992  *	logtrace(): A printf-like function for outputting tracing messages
2993  *	(messages at LOG_INFO) from the daemon.	 This is typically used
2994  *	to log the receipt of interesting network-related conditions.
2995  *
2996  *	logerr(): A printf-like function for outputting error messages
2997  *	(messages at LOG_ERR) from the daemon.
2998  *
2999  *	logperror*(): A set of functions used to output error messages
3000  *	(messages at LOG_ERR); these automatically append strerror(errno)
3001  *	and a newline to the message passed to them.
3002  *
3003  * NOTE: since the logging functions write to syslog, the messages passed
3004  *	 to them are not eligible for localization.  Thus, gettext() must
3005  *	 *not* be used.
3006  */
3007 
/* Non-zero once initlog() has run; the log routines then use syslog */
static int logging = 0;

/*
 * Switch the logging routines from stderr to syslog and open the
 * syslog connection under the "in.mpathd" identity.
 */
static void
initlog(void)
{
	logging += 1;
	openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON);
}
3016 
3017 /* PRINTFLIKE1 */
3018 void
3019 logerr(char *fmt, ...)
3020 {
3021 	va_list ap;
3022 
3023 	va_start(ap, fmt);
3024 
3025 	if (logging)
3026 		vsyslog(LOG_ERR, fmt, ap);
3027 	else
3028 		(void) vfprintf(stderr, fmt, ap);
3029 	va_end(ap);
3030 }
3031 
3032 /* PRINTFLIKE1 */
3033 void
3034 logtrace(char *fmt, ...)
3035 {
3036 	va_list ap;
3037 
3038 	va_start(ap, fmt);
3039 
3040 	if (logging)
3041 		vsyslog(LOG_INFO, fmt, ap);
3042 	else
3043 		(void) vfprintf(stderr, fmt, ap);
3044 	va_end(ap);
3045 }
3046 
3047 /* PRINTFLIKE1 */
3048 void
3049 logdebug(char *fmt, ...)
3050 {
3051 	va_list ap;
3052 
3053 	va_start(ap, fmt);
3054 
3055 	if (logging)
3056 		vsyslog(LOG_DEBUG, fmt, ap);
3057 	else
3058 		(void) vfprintf(stderr, fmt, ap);
3059 	va_end(ap);
3060 }
3061 
3062 /* PRINTFLIKE1 */
3063 void
3064 logperror(char *str)
3065 {
3066 	if (logging)
3067 		syslog(LOG_ERR, "%s: %m\n", str);
3068 	else
3069 		(void) fprintf(stderr, "%s: %s\n", str, strerror(errno));
3070 }
3071 
3072 void
3073 logperror_pii(struct phyint_instance *pii, char *str)
3074 {
3075 	if (logging) {
3076 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3077 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
3078 	} else {
3079 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3080 		    str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name,
3081 		    strerror(errno));
3082 	}
3083 }
3084 
3085 void
3086 logperror_li(struct logint *li, char *str)
3087 {
3088 	struct	phyint_instance	*pii = li->li_phyint_inst;
3089 
3090 	if (logging) {
3091 		syslog(LOG_ERR, "%s (%s %s): %m\n",
3092 		    str, AF_STR(pii->pii_af), li->li_name);
3093 	} else {
3094 		(void) fprintf(stderr, "%s (%s %s): %s\n",
3095 		    str, AF_STR(pii->pii_af), li->li_name,
3096 		    strerror(errno));
3097 	}
3098 }
3099 
3100 void
3101 close_probe_socket(struct phyint_instance *pii, boolean_t polled)
3102 {
3103 	if (polled)
3104 		(void) poll_remove(pii->pii_probe_sock);
3105 	(void) close(pii->pii_probe_sock);
3106 	pii->pii_probe_sock = -1;
3107 	pii->pii_basetime_inited = 0;
3108 }
3109