xref: /titanic_51/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision 2cb53ad67f463fb038a7c555bb0611fb6a8acec7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
29 /*
30  * Global list of phyints, phyint instances, phyint groups and the anonymous
31  * group; the latter is initialized in phyint_init().
32  */
33 struct phyint *phyints = NULL;
34 struct phyint_instance	*phyint_instances = NULL;
35 struct phyint_group *phyint_groups = NULL;
36 struct phyint_group *phyint_anongroup;
37 
38 /*
39  * Grouplist signature; initialized in phyint_init().
40  */
41 static uint64_t phyint_grouplistsig;
42 
43 static void phyint_inst_insert(struct phyint_instance *pii);
44 static void phyint_inst_print(struct phyint_instance *pii);
45 
46 static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
47 static void phyint_delete(struct phyint *pi);
48 static boolean_t phyint_is_usable(struct phyint *pi);
49 
50 static void logint_print(struct logint *li);
51 static void logint_insert(struct phyint_instance *pii, struct logint *li);
52 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);
53 
54 static void target_print(struct target *tg);
55 static void target_insert(struct phyint_instance *pii, struct target *tg);
56 static struct target *target_first(struct phyint_instance *pii);
57 static struct target *target_select_best(struct phyint_instance *pii);
58 static void target_flush_hosts(struct phyint_group *pg);
59 
60 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
61 
62 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
63 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
64 
65 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
66 static int phyint_group_state_event(struct phyint_group *pg);
67 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
68 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
69     ipmp_if_op_t op);
70 
71 static int logint_upcount(struct phyint *pi);
72 static uint64_t gensig(void);
73 
74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
75 int
76 phyint_init(void)
77 {
78 	phyint_grouplistsig = gensig();
79 	if (track_all_phyints) {
80 		phyint_anongroup = phyint_group_create("");
81 		if (phyint_anongroup == NULL)
82 			return (-1);
83 		phyint_group_insert(phyint_anongroup);
84 	}
85 	return (0);
86 }
87 
88 /* Return the phyint with the given name */
89 struct phyint *
90 phyint_lookup(const char *name)
91 {
92 	struct phyint *pi;
93 
94 	if (debug & D_PHYINT)
95 		logdebug("phyint_lookup(%s)\n", name);
96 
97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
99 			break;
100 	}
101 	return (pi);
102 }
103 
104 /*
105  * Lookup a phyint in the group that has the same hardware address as `pi', or
106  * NULL if there's none.  If `online_only' is set, then only online phyints
107  * are considered when matching.  Otherwise, phyints that had been offlined
108  * due to a duplicate hardware address will also be considered.
109  */
110 static struct phyint *
111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
112 {
113 	struct phyint *pi2;
114 
115 	if (pi->pi_group == phyint_anongroup)
116 		return (NULL);
117 
118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
119 		if (pi2 == pi)
120 			continue;
121 
122 		/*
123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
124 		 * that are administratively offline (rather than offline
125 		 * because they're dups); when they're brought back online,
126 		 * they'll be flagged as dups if need be.
127 		 */
128 		if (pi2->pi_state == PI_OFFLINE &&
129 		    (online_only || !pi2->pi_hwaddrdup))
130 			continue;
131 
132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
134 			return (pi2);
135 	}
136 	return (NULL);
137 }
138 
139 /*
140  * Respond to DLPI notifications.  Currently, this only processes physical
141  * address changes for the phyint passed via `arg' by onlining or offlining
142  * phyints in the group.
143  */
144 /* ARGSUSED */
145 static void
146 phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
147 {
148 	struct phyint *pi = arg;
149 	struct phyint *oduppi = NULL, *duppi = NULL;
150 
151 	assert((dnip->dni_note & pi->pi_notes) != 0);
152 
153 	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
154 		return;
155 
156 	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
157 
158 	/*
159 	 * If our hardware address hasn't changed, there's nothing to do.
160 	 */
161 	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
162 	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
163 		return;
164 
165 	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
166 	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
167 	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
168 	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
169 
170 	if (oduppi != NULL || pi->pi_hwaddrdup) {
171 		/*
172 		 * Our old hardware address was a duplicate.  If we'd been
173 		 * offlined because of it, and our new hardware address is not
174 		 * a duplicate, then bring us online.  Otherwise, `oduppi'
175 		 * must've been the one brought offline; bring it online.
176 		 */
177 		if (pi->pi_hwaddrdup) {
178 			if (duppi == NULL)
179 				(void) phyint_undo_offline(pi);
180 		} else {
181 			assert(oduppi->pi_hwaddrdup);
182 			(void) phyint_undo_offline(oduppi);
183 		}
184 	}
185 
186 	if (duppi != NULL && !pi->pi_hwaddrdup) {
187 		/*
188 		 * Our new hardware address was a duplicate and we're not
189 		 * yet flagged as a duplicate; bring us offline.
190 		 */
191 		pi->pi_hwaddrdup = _B_TRUE;
192 		(void) phyint_offline(pi, 0);
193 	}
194 }
195 
196 /*
197  * Initialize information about the underlying link for `pi', and set us
198  * up to be notified about future changes.  Returns _B_TRUE on success.
199  */
200 boolean_t
201 phyint_link_init(struct phyint *pi)
202 {
203 	int retval;
204 	uint_t notes;
205 	const char *errmsg;
206 	dlpi_notifyid_t id;
207 
208 	pi->pi_notes = 0;
209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
210 	if (retval != DLPI_SUCCESS) {
211 		pi->pi_dh = NULL;
212 		errmsg = "cannot open";
213 		goto failed;
214 	}
215 
216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
218 	    &pi->pi_hwaddrlen);
219 	if (retval != DLPI_SUCCESS) {
220 		errmsg = "cannot get hardware address";
221 		goto failed;
222 	}
223 
224 	/*
225 	 * Check if the link supports DLPI link state notifications.  For
226 	 * historical reasons, the actual changes are tracked through routing
227 	 * sockets, so we immediately disable the notification upon success.
228 	 */
229 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
230 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
231 	if (retval == DLPI_SUCCESS) {
232 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
233 		pi->pi_notes |= notes;
234 	}
235 
236 	/*
237 	 * Enable notification of hardware address changes to keep pi_hwaddr
238 	 * up-to-date and track if we need to offline/undo-offline phyints.
239 	 */
240 	notes = DL_NOTE_PHYS_ADDR;
241 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
242 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
243 		pi->pi_notes |= notes;
244 
245 	return (_B_TRUE);
246 failed:
247 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
248 	if (pi->pi_dh != NULL) {
249 		dlpi_close(pi->pi_dh);
250 		pi->pi_dh = NULL;
251 	}
252 	return (_B_FALSE);
253 }
254 
255 /*
256  * Close use of link on `pi'.
257  */
258 void
259 phyint_link_close(struct phyint *pi)
260 {
261 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
262 		(void) poll_remove(dlpi_fd(pi->pi_dh));
263 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
264 	}
265 
266 	/*
267 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
268 	 * properly report the link state even when offline (which is possible
269 	 * since we use IFF_RUNNING to track link state).
270 	 */
271 	dlpi_close(pi->pi_dh);
272 	pi->pi_dh = NULL;
273 }
274 
275 /* Return the phyint instance with the given name and the given family */
276 struct phyint_instance *
277 phyint_inst_lookup(int af, char *name)
278 {
279 	struct phyint *pi;
280 
281 	if (debug & D_PHYINT)
282 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
283 
284 	assert(af == AF_INET || af == AF_INET6);
285 
286 	pi = phyint_lookup(name);
287 	if (pi == NULL)
288 		return (NULL);
289 
290 	return (PHYINT_INSTANCE(pi, af));
291 }
292 
293 struct phyint_group *
294 phyint_group_lookup(const char *pg_name)
295 {
296 	struct phyint_group *pg;
297 
298 	if (debug & D_PHYINT)
299 		logdebug("phyint_group_lookup(%s)\n", pg_name);
300 
301 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
302 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
303 			break;
304 	}
305 	return (pg);
306 }
307 
308 /*
309  * Insert the phyint in the linked list of all phyints. If the phyint belongs
310  * to some group, insert it in the phyint group list.
311  */
312 static void
313 phyint_insert(struct phyint *pi, struct phyint_group *pg)
314 {
315 	if (debug & D_PHYINT)
316 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
317 
318 	/* Insert the phyint at the head of the 'all phyints' list */
319 	pi->pi_next = phyints;
320 	pi->pi_prev = NULL;
321 	if (phyints != NULL)
322 		phyints->pi_prev = pi;
323 	phyints = pi;
324 
325 	/*
326 	 * Insert the phyint at the head of the 'phyint_group members' list
327 	 * of the phyint group to which it belongs.
328 	 */
329 	pi->pi_pgnext = NULL;
330 	pi->pi_pgprev = NULL;
331 	pi->pi_group = pg;
332 
333 	pi->pi_pgnext = pg->pg_phyint;
334 	if (pi->pi_pgnext != NULL)
335 		pi->pi_pgnext->pi_pgprev = pi;
336 	pg->pg_phyint = pi;
337 
338 	/* Refresh the group state now that this phyint has been added */
339 	phyint_group_refresh_state(pg);
340 
341 	pg->pg_sig++;
342 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
343 }
344 
345 /* Insert the phyint instance in the linked list of all phyint instances. */
346 static void
347 phyint_inst_insert(struct phyint_instance *pii)
348 {
349 	if (debug & D_PHYINT) {
350 		logdebug("phyint_inst_insert(%s %s)\n",
351 		    AF_STR(pii->pii_af), pii->pii_name);
352 	}
353 
354 	/*
355 	 * Insert the phyint at the head of the 'all phyint instances' list.
356 	 */
357 	pii->pii_next = phyint_instances;
358 	pii->pii_prev = NULL;
359 	if (phyint_instances != NULL)
360 		phyint_instances->pii_prev = pii;
361 	phyint_instances = pii;
362 }
363 
364 /*
365  * Create a new phyint with the given parameters. Also insert it into
366  * the list of all phyints and the list of phyint group members by calling
367  * phyint_insert().
368  */
369 static struct phyint *
370 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
371     uint64_t flags)
372 {
373 	struct phyint *pi;
374 
375 	pi = calloc(1, sizeof (struct phyint));
376 	if (pi == NULL) {
377 		logperror("phyint_create: calloc");
378 		return (NULL);
379 	}
380 
381 	/*
382 	 * Record the phyint values.
383 	 */
384 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
385 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
386 	pi->pi_ifindex = ifindex;
387 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
388 
389 	/*
390 	 * If the interface is offline, we set the state to PI_OFFLINE.
391 	 * Otherwise, we optimistically start in the PI_RUNNING state.  Later
392 	 * (in process_link_state_changes()), we will adjust this to match the
393 	 * current state of the link.  Further, if test addresses are
394 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
395 	 * to either PI_RUNNING or PI_FAILED depending on the probe results.
396 	 */
397 	pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING;
398 	pi->pi_flags = PHYINT_FLAGS(flags);
399 
400 	/*
401 	 * Initialize the link state.  The link state is initialized to
402 	 * up, so that if the link is down when IPMP starts monitoring
403 	 * the interface, it will appear as though there has been a
404 	 * transition from the link up to link down.  This avoids
405 	 * having to treat this situation as a special case.
406 	 */
407 	INIT_LINK_STATE(pi);
408 
409 	if (!phyint_link_init(pi)) {
410 		free(pi);
411 		return (NULL);
412 	}
413 
414 	/*
415 	 * Insert the phyint in the list of all phyints, and the
416 	 * list of phyint group members
417 	 */
418 	phyint_insert(pi, pg);
419 
420 	return (pi);
421 }
422 
423 /*
424  * Create a new phyint instance belonging to the phyint 'pi' and address
425  * family 'af'. Also insert it into the list of all phyint instances by
426  * calling phyint_inst_insert().
427  */
428 static struct phyint_instance *
429 phyint_inst_create(struct phyint *pi, int af)
430 {
431 	struct phyint_instance *pii;
432 
433 	pii = calloc(1, sizeof (struct phyint_instance));
434 	if (pii == NULL) {
435 		logperror("phyint_inst_create: calloc");
436 		return (NULL);
437 	}
438 
439 	/*
440 	 * Attach the phyint instance to the phyint.
441 	 * Set the back pointers as well
442 	 */
443 	pii->pii_phyint = pi;
444 	if (af == AF_INET)
445 		pi->pi_v4 = pii;
446 	else
447 		pi->pi_v6 = pii;
448 
449 	pii->pii_in_use = 1;
450 	pii->pii_probe_sock = -1;
451 	pii->pii_snxt = 1;
452 	pii->pii_af = af;
453 	pii->pii_fd_hrtime = gethrtime() +
454 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
455 	pii->pii_flags = pi->pi_flags;
456 
457 	/* Insert the phyint instance in the list of all phyint instances. */
458 	phyint_inst_insert(pii);
459 	return (pii);
460 }
461 
462 /*
463  * Change the state of phyint `pi' to state `state'.
464  */
465 void
466 phyint_chstate(struct phyint *pi, enum pi_state state)
467 {
468 	/*
469 	 * To simplify things, some callers always set a given state
470 	 * regardless of the previous state of the phyint (e.g., setting
471 	 * PI_RUNNING when it's already set).  We shouldn't bother
472 	 * generating an event or consuming a signature for these, since
473 	 * the actual state of the interface is unchanged.
474 	 */
475 	if (pi->pi_state == state)
476 		return;
477 
478 	pi->pi_state = state;
479 	phyint_changed(pi);
480 }
481 
482 /*
483  * Note that `pi' has changed state.
484  */
485 void
486 phyint_changed(struct phyint *pi)
487 {
488 	pi->pi_group->pg_sig++;
489 	(void) phyint_state_event(pi->pi_group, pi);
490 }
491 
492 /*
493  * Insert the phyint group in the linked list of all phyint groups
494  * at the head of the list
495  */
496 void
497 phyint_group_insert(struct phyint_group *pg)
498 {
499 	pg->pg_next = phyint_groups;
500 	pg->pg_prev = NULL;
501 	if (phyint_groups != NULL)
502 		phyint_groups->pg_prev = pg;
503 	phyint_groups = pg;
504 
505 	phyint_grouplistsig++;
506 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
507 }
508 
509 /*
510  * Create a new phyint group called 'name'.
511  */
512 struct phyint_group *
513 phyint_group_create(const char *name)
514 {
515 	struct	phyint_group *pg;
516 
517 	if (debug & D_PHYINT)
518 		logdebug("phyint_group_create(%s)\n", name);
519 
520 	pg = calloc(1, sizeof (struct phyint_group));
521 	if (pg == NULL) {
522 		logperror("phyint_group_create: calloc");
523 		return (NULL);
524 	}
525 
526 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
527 	pg->pg_sig = gensig();
528 	pg->pg_fdt = user_failure_detection_time;
529 	pg->pg_probeint = user_probe_interval;
530 	pg->pg_in_use = _B_TRUE;
531 
532 	/*
533 	 * Normal groups always start in the PG_FAILED state since they
534 	 * have no active interfaces.  In contrast, anonymous groups are
535 	 * heterogeneous and thus always PG_OK.
536 	 */
537 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
538 
539 	return (pg);
540 }
541 
542 /*
543  * Change the state of the phyint group `pg' to state `state'.
544  */
545 void
546 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
547 {
548 	assert(pg != phyint_anongroup);
549 
550 	/*
551 	 * To simplify things, some callers always set a given state
552 	 * regardless of the previous state of the group (e.g., setting
553 	 * PG_DEGRADED when it's already set).  We shouldn't bother
554 	 * generating an event or consuming a signature for these, since
555 	 * the actual state of the group is unchanged.
556 	 */
557 	if (pg->pg_state == state)
558 		return;
559 
560 	pg->pg_state = state;
561 
562 	switch (state) {
563 	case PG_FAILED:
564 		/*
565 		 * We can never know with certainty that a group has
566 		 * failed.  It is possible that all known targets have
567 		 * failed simultaneously, and new targets have come up
568 		 * instead. If the targets are routers then router
569 		 * discovery will kick in, and we will see the new routers
570 		 * thru routing socket messages. But if the targets are
571 		 * hosts, we have to discover it by multicast.	So flush
572 		 * all the host targets. The next probe will send out a
573 		 * multicast echo request. If this is a group failure, we
574 		 * will still not see any response, otherwise the group
575 		 * will be repaired after we get NUM_PROBE_REPAIRS
576 		 * consecutive unicast replies on any phyint.
577 		 */
578 		target_flush_hosts(pg);
579 		break;
580 
581 	case PG_OK:
582 	case PG_DEGRADED:
583 		break;
584 
585 	default:
586 		logerr("phyint_group_chstate: invalid group state %d; "
587 		    "aborting\n", state);
588 		abort();
589 	}
590 
591 	pg->pg_sig++;
592 	(void) phyint_group_state_event(pg);
593 }
594 
595 /*
596  * Create a new phyint instance and initialize it from the values supplied by
597  * the kernel. Always check for ENXIO before logging any error, because the
598  * interface could have vanished after completion of SIOCGLIFCONF.
599  * Return values:
600  *	pointer to the phyint instance on success
601  *	NULL on failure Eg. if the phyint instance is not found in the kernel
602  */
603 struct phyint_instance *
604 phyint_inst_init_from_k(int af, char *pi_name)
605 {
606 	char	pg_name[LIFNAMSIZ + 1];
607 	int	ifsock;
608 	uint_t	ifindex;
609 	uint64_t	flags;
610 	struct lifreq	lifr;
611 	struct phyint	*pi;
612 	struct phyint_instance	*pii;
613 	boolean_t	pi_created;
614 	struct phyint_group	*pg;
615 
616 retry:
617 	pii = NULL;
618 	pi = NULL;
619 	pg = NULL;
620 	pi_created = _B_FALSE;
621 
622 	if (debug & D_PHYINT) {
623 		logdebug("phyint_inst_init_from_k(%s %s)\n",
624 		    AF_STR(af), pi_name);
625 	}
626 
627 	assert(af == AF_INET || af == AF_INET6);
628 
629 	/* Get the socket for doing ioctls */
630 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
631 
632 	/*
633 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
634 	 * meta-interfaces, point-to-point interfaces, and interfaces
635 	 * that can't support multicast.
636 	 */
637 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
638 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
639 		if (errno != ENXIO) {
640 			logperror("phyint_inst_init_from_k:"
641 			    " ioctl (get flags)");
642 		}
643 		return (NULL);
644 	}
645 	flags = lifr.lifr_flags;
646 	if (!(flags & IFF_MULTICAST) ||
647 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
648 		return (NULL);
649 
650 	/*
651 	 * Get the ifindex for recording later in our tables, in case we need
652 	 * to create a new phyint.
653 	 */
654 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
655 		if (errno != ENXIO) {
656 			logperror("phyint_inst_init_from_k: "
657 			    " ioctl (get lifindex)");
658 		}
659 		return (NULL);
660 	}
661 	ifindex = lifr.lifr_index;
662 
663 	/*
664 	 * Get the phyint group name of this phyint, from the kernel.
665 	 */
666 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
667 		if (errno != ENXIO) {
668 			logperror("phyint_inst_init_from_k: "
669 			    "ioctl (get group name)");
670 		}
671 		return (NULL);
672 	}
673 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
674 
675 	/*
676 	 * If the phyint is not part of any group, pg_name is the
677 	 * null string. If 'track_all_phyints' is false, there is no
678 	 * need to create a phyint.
679 	 */
680 	if (pg_name[0] == '\0' && !track_all_phyints) {
681 		/*
682 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
683 		 * set, reset them. These flags shouldn't be set if in.mpathd
684 		 * isn't tracking the interface.
685 		 */
686 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
687 			lifr.lifr_flags = flags &
688 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
689 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
690 				if (errno != ENXIO) {
691 					logperror("phyint_inst_init_from_k:"
692 					    " ioctl (set flags)");
693 				}
694 			}
695 		}
696 		return (NULL);
697 	}
698 
699 	/*
700 	 * We need to create a new phyint instance.  We may also need to
701 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
702 	 * an underlying interface before it found its IPMP meta-interface.
703 	 * Note that we keep any created groups even if phyint_inst_from_k()
704 	 * fails since a group's existence is not dependent on the ability of
705 	 * in.mpathd to the track the group's interfaces.
706 	 */
707 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
708 		if ((pg = phyint_group_create(pg_name)) == NULL) {
709 			logerr("phyint_inst_init_from_k: cannot create group "
710 			    "%s\n", pg_name);
711 			return (NULL);
712 		}
713 		phyint_group_insert(pg);
714 	}
715 
716 	/*
717 	 * Lookup the phyint. If the phyint does not exist create it.
718 	 */
719 	pi = phyint_lookup(pi_name);
720 	if (pi == NULL) {
721 		pi = phyint_create(pi_name, pg, ifindex, flags);
722 		if (pi == NULL) {
723 			logerr("phyint_inst_init_from_k:"
724 			    " unable to create phyint %s\n", pi_name);
725 			return (NULL);
726 		}
727 		pi_created = _B_TRUE;
728 	} else {
729 		/* The phyint exists already. */
730 		assert(pi_created == _B_FALSE);
731 		/*
732 		 * Normally we should see consistent values for the IPv4 and
733 		 * IPv6 instances, for phyint properties. If we don't, it
734 		 * means things have changed underneath us, and we should
735 		 * resync our tables with the kernel. Check whether the
736 		 * interface index has changed. If so, it is most likely
737 		 * the interface has been unplumbed and replumbed,
738 		 * while we are yet to update our tables. Do it now.
739 		 */
740 		if (pi->pi_ifindex != ifindex) {
741 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
742 			goto retry;
743 		}
744 		assert(PHYINT_INSTANCE(pi, af) == NULL);
745 
746 		/*
747 		 * If the group name seen by the IPv4 and IPv6 instances
748 		 * are different, it is most likely the groupname has
749 		 * changed, while we are yet to update our tables. Do it now.
750 		 */
751 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
752 			phyint_inst_delete(PHYINT_INSTANCE(pi,
753 			    AF_OTHER(af)));
754 			goto retry;
755 		}
756 	}
757 
758 	/*
759 	 * Create a new phyint instance, corresponding to the 'af'
760 	 * passed in.
761 	 */
762 	pii = phyint_inst_create(pi, af);
763 	if (pii == NULL) {
764 		logerr("phyint_inst_init_from_k: unable to create"
765 		    "phyint inst %s\n", pi->pi_name);
766 		if (pi_created)
767 			phyint_delete(pi);
768 
769 		return (NULL);
770 	}
771 
772 	if (pi_created) {
773 		/*
774 		 * If this phyint does not have a unique hardware address in its
775 		 * group, offline it.  (The change_pif_flags() implementation
776 		 * requires that we defer this until after the phyint_instance
777 		 * is created.)
778 		 */
779 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
780 			pi->pi_hwaddrdup = _B_TRUE;
781 			(void) phyint_offline(pi, 0);
782 		}
783 	}
784 
785 	return (pii);
786 }
787 
788 /*
789  * Bind pii_probe_sock to the address associated with pii_probe_logint.
790  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
791  * targets. Do the common part in this function, and complete the
792  * initializations by calling the protocol specific functions
793  * phyint_inst_v{4,6}_sockinit() respectively.
794  *
795  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
796  */
797 boolean_t
798 phyint_inst_sockinit(struct phyint_instance *pii)
799 {
800 	boolean_t success;
801 	struct phyint_group *pg;
802 
803 	if (debug & D_PHYINT) {
804 		logdebug("phyint_inst_sockinit(%s %s)\n",
805 		    AF_STR(pii->pii_af), pii->pii_name);
806 	}
807 
808 	assert(pii->pii_probe_logint != NULL);
809 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
810 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
811 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
812 
813 	/*
814 	 * If the socket is already bound, close pii_probe_sock
815 	 */
816 	if (pii->pii_probe_sock != -1)
817 		close_probe_socket(pii, _B_TRUE);
818 
819 	/*
820 	 * If the phyint is not part of a named group and track_all_phyints is
821 	 * false, simply return.
822 	 */
823 	pg = pii->pii_phyint->pi_group;
824 	if (pg == phyint_anongroup && !track_all_phyints) {
825 		if (debug & D_PHYINT)
826 			logdebug("phyint_inst_sockinit: no group\n");
827 		return (_B_FALSE);
828 	}
829 
830 	/*
831 	 * Initialize the socket by calling the protocol specific function.
832 	 * If it succeeds, add the socket to the poll list.
833 	 */
834 	if (pii->pii_af == AF_INET6)
835 		success = phyint_inst_v6_sockinit(pii);
836 	else
837 		success = phyint_inst_v4_sockinit(pii);
838 
839 	if (success && (poll_add(pii->pii_probe_sock) == 0))
840 		return (_B_TRUE);
841 
842 	/* Something failed, cleanup and return false */
843 	if (pii->pii_probe_sock != -1)
844 		close_probe_socket(pii, _B_FALSE);
845 
846 	return (_B_FALSE);
847 }
848 
849 /*
850  * IPv6 specific part in initializing the pii_probe_sock. This socket is
851  * used to send/receive ICMPv6 probe packets.
852  */
853 static boolean_t
854 phyint_inst_v6_sockinit(struct phyint_instance *pii)
855 {
856 	icmp6_filter_t filter;
857 	int hopcount = 1;
858 	int off = 0;
859 	int on = 1;
860 	struct	sockaddr_in6	testaddr;
861 
862 	/*
863 	 * Open a raw socket with ICMPv6 protocol.
864 	 *
865 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
866 	 * the specified phyint only.  Bind to the test address to ensure that
867 	 * the responses are sent to the specified phyint.
868 	 *
869 	 * Set the hopcount to 1 so that probe packets are not routed.
870 	 * Disable multicast loopback. Set the receive filter to
871 	 * receive only ICMPv6 echo replies.
872 	 */
873 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
874 	if (pii->pii_probe_sock < 0) {
875 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
876 		return (_B_FALSE);
877 	}
878 
879 	bzero(&testaddr, sizeof (testaddr));
880 	testaddr.sin6_family = AF_INET6;
881 	testaddr.sin6_port = 0;
882 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
883 
884 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
885 	    sizeof (testaddr)) < 0) {
886 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
887 		return (_B_FALSE);
888 	}
889 
890 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
891 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
892 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
893 		    " IPV6_MULTICAST_IF");
894 		return (_B_FALSE);
895 	}
896 
897 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
898 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
899 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
900 		    " IPV6_BOUND_IF");
901 		return (_B_FALSE);
902 	}
903 
904 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
905 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
906 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
907 		    " IPV6_UNICAST_HOPS");
908 		return (_B_FALSE);
909 	}
910 
911 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
912 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
913 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
914 		    " IPV6_MULTICAST_HOPS");
915 		return (_B_FALSE);
916 	}
917 
918 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
919 	    (char *)&off, sizeof (off)) < 0) {
920 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
921 		    " IPV6_MULTICAST_LOOP");
922 		return (_B_FALSE);
923 	}
924 
925 	/*
926 	 * Filter out so that we only receive ICMP echo replies
927 	 */
928 	ICMP6_FILTER_SETBLOCKALL(&filter);
929 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
930 
931 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
932 	    (char *)&filter, sizeof (filter)) < 0) {
933 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
934 		    " ICMP6_FILTER");
935 		return (_B_FALSE);
936 	}
937 
938 	/* Enable receipt of hoplimit */
939 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
940 	    &on, sizeof (on)) < 0) {
941 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
942 		    " IPV6_RECVHOPLIMIT");
943 		return (_B_FALSE);
944 	}
945 
946 	/* Enable receipt of timestamp */
947 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
948 	    &on, sizeof (on)) < 0) {
949 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
950 		    " SO_TIMESTAMP");
951 		return (_B_FALSE);
952 	}
953 
954 	return (_B_TRUE);
955 }
956 
957 /*
958  * IPv4 specific part in initializing the pii_probe_sock. This socket is
959  * used to send/receive ICMPv4 probe packets.
960  */
961 static boolean_t
962 phyint_inst_v4_sockinit(struct phyint_instance *pii)
963 {
964 	struct sockaddr_in  testaddr;
965 	char	char_off = 0;
966 	int	ttl = 1;
967 	char	char_ttl = 1;
968 	int	on = 1;
969 
970 	/*
971 	 * Open a raw socket with ICMPv4 protocol.
972 	 *
973 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
974 	 * the specified phyint only.  Bind to the test address to ensure that
975 	 * the responses are sent to the specified phyint.
976 	 *
977 	 * Set the ttl to 1 so that probe packets are not routed.
978 	 * Disable multicast loopback.  Enable receipt of timestamp.
979 	 */
980 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
981 	if (pii->pii_probe_sock < 0) {
982 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
983 		return (_B_FALSE);
984 	}
985 
986 	bzero(&testaddr, sizeof (testaddr));
987 	testaddr.sin_family = AF_INET;
988 	testaddr.sin_port = 0;
989 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
990 	    &testaddr.sin_addr);
991 
992 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
993 	    sizeof (testaddr)) < 0) {
994 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
995 		return (_B_FALSE);
996 	}
997 
998 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
999 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
1000 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1001 		    " IP_BOUND_IF");
1002 		return (_B_FALSE);
1003 	}
1004 
1005 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
1006 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
1007 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1008 		    " IP_MULTICAST_IF");
1009 		return (_B_FALSE);
1010 	}
1011 
1012 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
1013 	    (char *)&ttl, sizeof (ttl)) < 0) {
1014 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1015 		    " IP_TTL");
1016 		return (_B_FALSE);
1017 	}
1018 
1019 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
1020 	    (char *)&char_off, sizeof (char_off)) == -1) {
1021 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1022 		    " IP_MULTICAST_LOOP");
1023 		return (_B_FALSE);
1024 	}
1025 
1026 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
1027 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
1028 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1029 		    " IP_MULTICAST_TTL");
1030 		return (_B_FALSE);
1031 	}
1032 
1033 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
1034 	    sizeof (on)) < 0) {
1035 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1036 		    " SO_TIMESTAMP");
1037 		return (_B_FALSE);
1038 	}
1039 
1040 	return (_B_TRUE);
1041 }
1042 
1043 /*
1044  * Remove the phyint group from the list of 'all phyint groups'
1045  * and free it.
1046  */
1047 void
1048 phyint_group_delete(struct phyint_group *pg)
1049 {
1050 	/*
1051 	 * The anonymous group always exists, even when empty.
1052 	 */
1053 	if (pg == phyint_anongroup)
1054 		return;
1055 
1056 	if (debug & D_PHYINT)
1057 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
1058 
1059 	/*
1060 	 * The phyint group must be empty, and must not have any phyints.
1061 	 * The phyint group must be in the list of all phyint groups
1062 	 */
1063 	assert(pg->pg_phyint == NULL);
1064 	assert(phyint_groups == pg || pg->pg_prev != NULL);
1065 
1066 	if (pg->pg_prev != NULL)
1067 		pg->pg_prev->pg_next = pg->pg_next;
1068 	else
1069 		phyint_groups = pg->pg_next;
1070 
1071 	if (pg->pg_next != NULL)
1072 		pg->pg_next->pg_prev = pg->pg_prev;
1073 
1074 	pg->pg_next = NULL;
1075 	pg->pg_prev = NULL;
1076 
1077 	phyint_grouplistsig++;
1078 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
1079 
1080 	addrlist_free(&pg->pg_addrs);
1081 	free(pg);
1082 }
1083 
1084 /*
1085  * Refresh the state of `pg' based on its current members.
1086  */
1087 void
1088 phyint_group_refresh_state(struct phyint_group *pg)
1089 {
1090 	enum pg_state state;
1091 	enum pg_state origstate = pg->pg_state;
1092 	struct phyint *pi, *usablepi;
1093 	uint_t nif = 0, nusable = 0;
1094 
1095 	/*
1096 	 * Anonymous groups never change state.
1097 	 */
1098 	if (pg == phyint_anongroup)
1099 		return;
1100 
1101 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1102 		nif++;
1103 		if (phyint_is_usable(pi)) {
1104 			nusable++;
1105 			usablepi = pi;
1106 		}
1107 	}
1108 
1109 	if (nusable == 0)
1110 		state = PG_FAILED;
1111 	else if (nif == nusable)
1112 		state = PG_OK;
1113 	else
1114 		state = PG_DEGRADED;
1115 
1116 	phyint_group_chstate(pg, state);
1117 
1118 	/*
1119 	 * If we're shutting down, skip logging messages since otherwise our
1120 	 * shutdown housecleaning will make us report that groups are unusable.
1121 	 */
1122 	if (cleanup_started)
1123 		return;
1124 
1125 	/*
1126 	 * NOTE: We use pg_failmsg_printed rather than origstate since
1127 	 * otherwise at startup we'll log a "now usable" message when the
1128 	 * first usable phyint is added to an empty group.
1129 	 */
1130 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
1131 		assert(origstate == PG_FAILED);
1132 		logerr("At least 1 IP interface (%s) in group %s is now "
1133 		    "usable\n", usablepi->pi_name, pg->pg_name);
1134 		pg->pg_failmsg_printed = _B_FALSE;
1135 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
1136 		logerr("All IP interfaces in group %s are now unusable\n",
1137 		    pg->pg_name);
1138 		pg->pg_failmsg_printed = _B_TRUE;
1139 	}
1140 }
1141 
1142 /*
1143  * Extract information from the kernel about the desired phyint.
1144  * Look only for properties of the phyint and not properties of logints.
1145  * Take appropriate action on the changes.
1146  * Return codes:
1147  *	PI_OK
1148  *		The phyint exists in the kernel and matches our knowledge
1149  *		of the phyint.
1150  *	PI_DELETED
1151  *		The phyint has vanished in the kernel.
1152  *	PI_IFINDEX_CHANGED
1153  *		The phyint's interface index has changed.
1154  *		Ask the caller to delete and recreate the phyint.
1155  *	PI_IOCTL_ERROR
1156  *		Some ioctl error. Don't change anything.
1157  *	PI_GROUP_CHANGED
1158  *		The phyint has changed group.
1159  */
1160 int
1161 phyint_inst_update_from_k(struct phyint_instance *pii)
1162 {
1163 	struct lifreq lifr;
1164 	int	ifsock;
1165 	struct phyint *pi;
1166 
1167 	pi = pii->pii_phyint;
1168 
1169 	if (debug & D_PHYINT) {
1170 		logdebug("phyint_inst_update_from_k(%s %s)\n",
1171 		    AF_STR(pii->pii_af), pi->pi_name);
1172 	}
1173 
1174 	/*
1175 	 * Get the ifindex from the kernel, for comparison with the
1176 	 * value in our tables.
1177 	 */
1178 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
1179 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1180 
1181 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1182 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
1183 		if (errno == ENXIO) {
1184 			return (PI_DELETED);
1185 		} else {
1186 			logperror_pii(pii, "phyint_inst_update_from_k:"
1187 			    " ioctl (get lifindex)");
1188 			return (PI_IOCTL_ERROR);
1189 		}
1190 	}
1191 
1192 	if (lifr.lifr_index != pi->pi_ifindex) {
1193 		/*
1194 		 * The index has changed. Most likely the interface has
1195 		 * been unplumbed and replumbed. Ask the caller to take
1196 		 * appropriate action.
1197 		 */
1198 		if (debug & D_PHYINT) {
1199 			logdebug("phyint_inst_update_from_k:"
1200 			    " old index %d new index %d\n",
1201 			    pi->pi_ifindex, lifr.lifr_index);
1202 		}
1203 		return (PI_IFINDEX_CHANGED);
1204 	}
1205 
1206 	/*
1207 	 * Get the group name from the kernel, for comparison with
1208 	 * the value in our tables.
1209 	 */
1210 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
1211 		if (errno == ENXIO) {
1212 			return (PI_DELETED);
1213 		} else {
1214 			logperror_pii(pii, "phyint_inst_update_from_k:"
1215 			    " ioctl (get groupname)");
1216 			return (PI_IOCTL_ERROR);
1217 		}
1218 	}
1219 
1220 	/*
1221 	 * If the phyint has changed group i.e. if the phyint group name
1222 	 * returned by the kernel is different, ask the caller to delete
1223 	 * and recreate the phyint in the right group
1224 	 */
1225 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
1226 		/* Groupname has changed */
1227 		if (debug & D_PHYINT) {
1228 			logdebug("phyint_inst_update_from_k:"
1229 			    " groupname change\n");
1230 		}
1231 		return (PI_GROUP_CHANGED);
1232 	}
1233 
1234 	/*
1235 	 * Get the current phyint flags from the kernel, and determine what
1236 	 * flags have changed by comparing against our tables.	Note that the
1237 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
1238 	 * that IFF_INACTIVE is really still set on the interface.
1239 	 */
1240 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
1241 		if (errno == ENXIO) {
1242 			return (PI_DELETED);
1243 		} else {
1244 			logperror_pii(pii, "phyint_inst_update_from_k: "
1245 			    " ioctl (get flags)");
1246 			return (PI_IOCTL_ERROR);
1247 		}
1248 	}
1249 
1250 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
1251 	if (pi->pi_v4 != NULL)
1252 		pi->pi_v4->pii_flags = pi->pi_flags;
1253 	if (pi->pi_v6 != NULL)
1254 		pi->pi_v6->pii_flags = pi->pi_flags;
1255 
1256 	/*
1257 	 * Make sure the IFF_FAILED flag is set if and only if we think
1258 	 * the interface should be failed.
1259 	 */
1260 	if (pi->pi_flags & IFF_FAILED) {
1261 		if (pi->pi_state == PI_RUNNING)
1262 			(void) change_pif_flags(pi, 0, IFF_FAILED);
1263 	} else {
1264 		if (pi->pi_state == PI_FAILED)
1265 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1266 	}
1267 
1268 	/* No change in phyint status */
1269 	return (PI_OK);
1270 }
1271 
1272 /*
1273  * Delete the phyint. Remove it from the list of all phyints, and the
1274  * list of phyint group members.
1275  */
1276 static void
1277 phyint_delete(struct phyint *pi)
1278 {
1279 	struct phyint *pi2;
1280 	struct phyint_group *pg = pi->pi_group;
1281 
1282 	if (debug & D_PHYINT)
1283 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1284 
1285 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1286 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1287 
1288 	/*
1289 	 * The phyint must belong to a group.
1290 	 */
1291 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1292 
1293 	/* The phyint must be in the list of all phyints */
1294 	assert(phyints == pi || pi->pi_prev != NULL);
1295 
1296 	/* Remove the phyint from the phyint group list */
1297 	pg->pg_sig++;
1298 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1299 
1300 	if (pi->pi_pgprev == NULL) {
1301 		/* Phyint is the 1st in the phyint group list */
1302 		pg->pg_phyint = pi->pi_pgnext;
1303 	} else {
1304 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1305 	}
1306 	if (pi->pi_pgnext != NULL)
1307 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1308 	pi->pi_pgnext = NULL;
1309 	pi->pi_pgprev = NULL;
1310 
1311 	/* Refresh the group state now that this phyint has been removed */
1312 	phyint_group_refresh_state(pg);
1313 
1314 	/* Remove the phyint from the global list of phyints */
1315 	if (pi->pi_prev == NULL) {
1316 		/* Phyint is the 1st in the list */
1317 		phyints = pi->pi_next;
1318 	} else {
1319 		pi->pi_prev->pi_next = pi->pi_next;
1320 	}
1321 	if (pi->pi_next != NULL)
1322 		pi->pi_next->pi_prev = pi->pi_prev;
1323 	pi->pi_next = NULL;
1324 	pi->pi_prev = NULL;
1325 
1326 	/*
1327 	 * See if another phyint in the group had been offlined because
1328 	 * it was a dup of `pi' -- and if so, online it.
1329 	 */
1330 	if (!pi->pi_hwaddrdup &&
1331 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1332 		assert(pi2->pi_hwaddrdup);
1333 		(void) phyint_undo_offline(pi2);
1334 	}
1335 	phyint_link_close(pi);
1336 	free(pi);
1337 }
1338 
1339 /*
1340  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
1341  * group.  Returns an IPMP error code.
1342  */
1343 int
1344 phyint_offline(struct phyint *pi, uint_t minred)
1345 {
1346 	boolean_t was_active;
1347 	unsigned int nusable = 0;
1348 	struct phyint *pi2;
1349 	struct phyint_group *pg = pi->pi_group;
1350 
1351 	/*
1352 	 * Verify that enough usable interfaces in the group would remain.
1353 	 * As a special case, if the group has failed, allow any non-offline
1354 	 * phyints to be offlined.
1355 	 */
1356 	if (pg != phyint_anongroup) {
1357 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1358 			if (pi2 == pi)
1359 				continue;
1360 			if (phyint_is_usable(pi2) ||
1361 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
1362 				nusable++;
1363 		}
1364 	}
1365 	if (nusable < minred)
1366 		return (IPMP_EMINRED);
1367 
1368 	was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1369 
1370 	if (!change_pif_flags(pi, IFF_OFFLINE, IFF_INACTIVE))
1371 		return (IPMP_FAILURE);
1372 
1373 	/*
1374 	 * The interface is now offline, so stop probing it.  Note that
1375 	 * if_mpadm(1M) will down the test addresses, after receiving a
1376 	 * success reply from us. The routing socket message will then make us
1377 	 * close the socket used for sending probes. But it is more logical
1378 	 * that an offlined interface must not be probed, even if it has test
1379 	 * addresses.
1380 	 *
1381 	 * NOTE: stop_probing() also sets PI_OFFLINE.
1382 	 */
1383 	stop_probing(pi);
1384 
1385 	/*
1386 	 * If we're offlining the phyint because it has a duplicate hardware
1387 	 * address, print a warning -- and leave the link open so that we can
1388 	 * be notified of hardware address changes that make it usable again.
1389 	 * Otherwise, close the link so that we won't prevent a detach.
1390 	 */
1391 	if (pi->pi_hwaddrdup) {
1392 		logerr("IP interface %s has a hardware address which is not "
1393 		    "unique in group %s; offlining\n", pi->pi_name,
1394 		    pg->pg_name);
1395 	} else {
1396 		phyint_link_close(pi);
1397 	}
1398 
1399 	/*
1400 	 * If this phyint was preventing another phyint with a duplicate
1401 	 * hardware address from being online, bring that one online now.
1402 	 */
1403 	if (!pi->pi_hwaddrdup &&
1404 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1405 		assert(pi2->pi_hwaddrdup);
1406 		(void) phyint_undo_offline(pi2);
1407 	}
1408 
1409 	/*
1410 	 * If this interface was active, try to activate another INACTIVE
1411 	 * interface in the group.
1412 	 */
1413 	if (was_active)
1414 		phyint_activate_another(pi);
1415 
1416 	return (IPMP_SUCCESS);
1417 }
1418 
1419 /*
1420  * Undo a previous offline of `pi'.  Returns an IPMP error code.
1421  */
1422 int
1423 phyint_undo_offline(struct phyint *pi)
1424 {
1425 	if (pi->pi_state != PI_OFFLINE) {
1426 		errno = EINVAL;
1427 		return (IPMP_FAILURE);
1428 	}
1429 
1430 	/*
1431 	 * If necessary, reinitialize our link information and verify that its
1432 	 * hardware address is still unique across the group.
1433 	 */
1434 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
1435 		errno = EIO;
1436 		return (IPMP_FAILURE);
1437 	}
1438 
1439 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
1440 		pi->pi_hwaddrdup = _B_TRUE;
1441 		return (IPMP_EHWADDRDUP);
1442 	}
1443 
1444 	if (pi->pi_hwaddrdup) {
1445 		logerr("IP interface %s now has a unique hardware address in "
1446 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
1447 		pi->pi_hwaddrdup = _B_FALSE;
1448 	}
1449 
1450 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
1451 		return (IPMP_FAILURE);
1452 
1453 	/*
1454 	 * While the interface was offline, it may have failed (e.g. the link
1455 	 * may have gone down).  phyint_inst_check_for_failure() will have
1456 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
1457 	 * whether the phyint should transition to running.  Note that after
1458 	 * we transition to running, we will start sending probes again (if
1459 	 * test addresses are configured), which may also reveal that the
1460 	 * interface is in fact failed.
1461 	 */
1462 	if (pi->pi_flags & IFF_FAILED) {
1463 		phyint_chstate(pi, PI_FAILED);
1464 	} else {
1465 		/* calls phyint_chstate() */
1466 		phyint_transition_to_running(pi);
1467 	}
1468 
1469 	/*
1470 	 * Give the requestor time to configure test addresses before
1471 	 * complaining that they're missing.
1472 	 */
1473 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
1474 
1475 	return (IPMP_SUCCESS);
1476 }
1477 
1478 /*
1479  * Delete (unlink and free), the phyint instance.
1480  */
1481 void
1482 phyint_inst_delete(struct phyint_instance *pii)
1483 {
1484 	struct phyint *pi = pii->pii_phyint;
1485 
1486 	assert(pi != NULL);
1487 
1488 	if (debug & D_PHYINT) {
1489 		logdebug("phyint_inst_delete(%s %s)\n",
1490 		    AF_STR(pii->pii_af), pi->pi_name);
1491 	}
1492 
1493 	/*
1494 	 * If the phyint instance has associated probe targets
1495 	 * delete all the targets
1496 	 */
1497 	while (pii->pii_targets != NULL)
1498 		target_delete(pii->pii_targets);
1499 
1500 	/*
1501 	 * Delete all the logints associated with this phyint
1502 	 * instance.
1503 	 */
1504 	while (pii->pii_logint != NULL)
1505 		logint_delete(pii->pii_logint);
1506 
1507 	/*
1508 	 * Close the socket used to send probes to targets from this phyint.
1509 	 */
1510 	if (pii->pii_probe_sock != -1)
1511 		close_probe_socket(pii, _B_TRUE);
1512 
1513 	/*
1514 	 * Phyint instance must be in the list of all phyint instances.
1515 	 * Remove phyint instance from the global list of phyint instances.
1516 	 */
1517 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1518 	if (pii->pii_prev == NULL) {
1519 		/* Phyint is the 1st in the list */
1520 		phyint_instances = pii->pii_next;
1521 	} else {
1522 		pii->pii_prev->pii_next = pii->pii_next;
1523 	}
1524 	if (pii->pii_next != NULL)
1525 		pii->pii_next->pii_prev = pii->pii_prev;
1526 	pii->pii_next = NULL;
1527 	pii->pii_prev = NULL;
1528 
1529 	/*
1530 	 * Reset the phyint instance pointer in the phyint.
1531 	 * If this is the last phyint instance (being deleted) on this
1532 	 * phyint, then delete the phyint.
1533 	 */
1534 	if (pii->pii_af == AF_INET)
1535 		pi->pi_v4 = NULL;
1536 	else
1537 		pi->pi_v6 = NULL;
1538 
1539 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1540 		phyint_delete(pi);
1541 
1542 	free(pii);
1543 }
1544 
1545 static void
1546 phyint_inst_print(struct phyint_instance *pii)
1547 {
1548 	struct logint *li;
1549 	struct target *tg;
1550 	char abuf[INET6_ADDRSTRLEN];
1551 	int most_recent;
1552 	int i;
1553 
1554 	if (pii->pii_phyint == NULL) {
1555 		logdebug("pii->pi_phyint NULL can't print\n");
1556 		return;
1557 	}
1558 
1559 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1560 	    "sock %x in_use %d\n",
1561 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1562 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1563 	    pii->pii_in_use);
1564 
1565 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1566 		logint_print(li);
1567 
1568 	logdebug("\n");
1569 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1570 		target_print(tg);
1571 
1572 	if (pii->pii_targets == NULL)
1573 		logdebug("pi_targets NULL\n");
1574 
1575 	if (pii->pii_target_next != NULL) {
1576 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1577 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1578 		    abuf, sizeof (abuf)));
1579 	} else {
1580 		logdebug("pi_target_next NULL\n");
1581 	}
1582 
1583 	if (pii->pii_rtt_target_next != NULL) {
1584 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1585 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1586 		    abuf, sizeof (abuf)));
1587 	} else {
1588 		logdebug("pi_rtt_target_next NULL\n");
1589 	}
1590 
1591 	if (pii->pii_targets != NULL) {
1592 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1593 
1594 		i = most_recent;
1595 		do {
1596 			if (pii->pii_probes[i].pr_target != NULL) {
1597 				logdebug("#%d target %s ", i,
1598 				    pr_addr(pii->pii_af,
1599 				    pii->pii_probes[i].pr_target->tg_address,
1600 				    abuf, sizeof (abuf)));
1601 			} else {
1602 				logdebug("#%d target NULL ", i);
1603 			}
1604 			logdebug("time_start %lld status %d "
1605 			    "time_ackproc %lld time_lost %u",
1606 			    pii->pii_probes[i].pr_hrtime_start,
1607 			    pii->pii_probes[i].pr_status,
1608 			    pii->pii_probes[i].pr_hrtime_ackproc,
1609 			    pii->pii_probes[i].pr_time_lost);
1610 			i = PROBE_INDEX_PREV(i);
1611 		} while (i != most_recent);
1612 	}
1613 }
1614 
1615 /*
1616  * Lookup a logint based on the logical interface name, on the given
1617  * phyint instance.
1618  */
1619 static struct logint *
1620 logint_lookup(struct phyint_instance *pii, char *name)
1621 {
1622 	struct logint *li;
1623 
1624 	if (debug & D_LOGINT) {
1625 		logdebug("logint_lookup(%s, %s)\n",
1626 		    AF_STR(pii->pii_af), name);
1627 	}
1628 
1629 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1630 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1631 			break;
1632 	}
1633 	return (li);
1634 }
1635 
1636 /*
1637  * Insert a logint at the head of the list of logints of the given
1638  * phyint instance
1639  */
1640 static void
1641 logint_insert(struct phyint_instance *pii, struct logint *li)
1642 {
1643 	li->li_next = pii->pii_logint;
1644 	li->li_prev = NULL;
1645 	if (pii->pii_logint != NULL)
1646 		pii->pii_logint->li_prev = li;
1647 	pii->pii_logint = li;
1648 	li->li_phyint_inst = pii;
1649 }
1650 
1651 /*
1652  * Create a new named logint, on the specified phyint instance.
1653  */
1654 static struct logint *
1655 logint_create(struct phyint_instance *pii, char *name)
1656 {
1657 	struct logint *li;
1658 
1659 	if (debug & D_LOGINT) {
1660 		logdebug("logint_create(%s %s %s)\n",
1661 		    AF_STR(pii->pii_af), pii->pii_name, name);
1662 	}
1663 
1664 	li = calloc(1, sizeof (struct logint));
1665 	if (li == NULL) {
1666 		logperror("logint_create: calloc");
1667 		return (NULL);
1668 	}
1669 
1670 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1671 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1672 	logint_insert(pii, li);
1673 	return (li);
1674 }
1675 
1676 /*
1677  * Initialize the logint based on the data returned by the kernel.
1678  */
1679 void
1680 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1681 {
1682 	int	ifsock;
1683 	uint64_t flags;
1684 	uint64_t saved_flags;
1685 	struct	logint	*li;
1686 	struct lifreq	lifr;
1687 	struct in6_addr	test_subnet;
1688 	struct in6_addr	testaddr;
1689 	int	test_subnet_len;
1690 	struct sockaddr_in6	*sin6;
1691 	struct sockaddr_in	*sin;
1692 	char abuf[INET6_ADDRSTRLEN];
1693 	boolean_t  ptp = _B_FALSE;
1694 	struct in6_addr tgaddr;
1695 
1696 	if (debug & D_LOGINT) {
1697 		logdebug("logint_init_from_k(%s %s)\n",
1698 		    AF_STR(pii->pii_af), li_name);
1699 	}
1700 
1701 	/* Get the socket for doing ioctls */
1702 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1703 
1704 	/*
1705 	 * Get the flags from the kernel. Also serves as a check whether
1706 	 * the logical still exists. If it doesn't exist, no need to proceed
1707 	 * any further. li_in_use will make the caller clean up the logint
1708 	 */
1709 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1710 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1711 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1712 		/* Interface may have vanished */
1713 		if (errno != ENXIO) {
1714 			logperror_pii(pii, "logint_init_from_k: "
1715 			    "ioctl (get flags)");
1716 		}
1717 		return;
1718 	}
1719 
1720 	flags = lifr.lifr_flags;
1721 
1722 	/*
1723 	 * Verified the logint exists. Now lookup the logint in our tables.
1724 	 * If it does not exist, create a new logint.
1725 	 */
1726 	li = logint_lookup(pii, li_name);
1727 	if (li == NULL) {
1728 		li = logint_create(pii, li_name);
1729 		if (li == NULL) {
1730 			/*
1731 			 * Pretend the interface does not exist
1732 			 * in the kernel
1733 			 */
1734 			return;
1735 		}
1736 	}
1737 
1738 	/*
1739 	 * Update li->li_flags with the new flags, after saving the old
1740 	 * value. This is used later to check what flags has changed and
1741 	 * take any action
1742 	 */
1743 	saved_flags = li->li_flags;
1744 	li->li_flags = flags;
1745 
1746 	/*
1747 	 * Get the address, prefix, prefixlength and update the logint.
1748 	 * Check if anything has changed. If the logint used for the
1749 	 * test address has changed, take suitable action.
1750 	 */
1751 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1752 		/* Interface may have vanished */
1753 		if (errno != ENXIO) {
1754 			logperror_li(li, "logint_init_from_k: (get addr)");
1755 		}
1756 		goto error;
1757 	}
1758 
1759 	if (pii->pii_af == AF_INET) {
1760 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1761 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1762 	} else {
1763 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1764 		testaddr = sin6->sin6_addr;
1765 	}
1766 
1767 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1768 		/* Interface may have vanished */
1769 		if (errno != ENXIO)
1770 			logperror_li(li, "logint_init_from_k: (get subnet)");
1771 		goto error;
1772 	}
1773 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
1774 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1775 		test_subnet = sin6->sin6_addr;
1776 		test_subnet_len = lifr.lifr_addrlen;
1777 	} else {
1778 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1779 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1780 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
1781 	}
1782 
1783 	/*
1784 	 * If this is the logint corresponding to the test address used for
1785 	 * sending probes, then if anything significant has changed we need to
1786 	 * determine the test address again.  We ignore changes to the
1787 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1788 	 * course.
1789 	 */
1790 	if (pii->pii_probe_logint == li) {
1791 		if (((li->li_flags ^ saved_flags) &
1792 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1793 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1794 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1795 		    &li->li_subnet)) ||
1796 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1797 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1798 			/*
1799 			 * Something significant that affects the testaddress
1800 			 * has changed. Redo the testaddress selection later on
1801 			 * in select_test_ifs(). For now do the cleanup and
1802 			 * set pii_probe_logint to NULL.
1803 			 */
1804 			if (pii->pii_probe_sock != -1)
1805 				close_probe_socket(pii, _B_TRUE);
1806 			pii->pii_probe_logint = NULL;
1807 		}
1808 	}
1809 
1810 
1811 	/* Update the logint with the values obtained from the kernel.	*/
1812 	li->li_addr = testaddr;
1813 	li->li_in_use = 1;
1814 	if (ptp) {
1815 		li->li_dstaddr = tgaddr;
1816 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1817 		    IP_ABITS : IPV6_ABITS;
1818 	} else {
1819 		li->li_subnet = test_subnet;
1820 		li->li_subnet_len = test_subnet_len;
1821 	}
1822 
1823 	if (debug & D_LOGINT)
1824 		logint_print(li);
1825 
1826 	return;
1827 
1828 error:
1829 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1830 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1831 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1832 	logint_delete(li);
1833 }
1834 
1835 /*
1836  * Delete (unlink and free) a logint.
1837  */
1838 void
1839 logint_delete(struct logint *li)
1840 {
1841 	struct phyint_instance *pii;
1842 
1843 	pii = li->li_phyint_inst;
1844 	assert(pii != NULL);
1845 
1846 	if (debug & D_LOGINT) {
1847 		int af;
1848 		char abuf[INET6_ADDRSTRLEN];
1849 
1850 		af = pii->pii_af;
1851 		logdebug("logint_delete(%s %s %s/%u)\n",
1852 		    AF_STR(af), li->li_name,
1853 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1854 		    li->li_subnet_len);
1855 	}
1856 
1857 	/* logint must be in the list of logints */
1858 	assert(pii->pii_logint == li || li->li_prev != NULL);
1859 
1860 	/* Remove the logint from the list of logints  */
1861 	if (li->li_prev == NULL) {
1862 		/* logint is the 1st in the list */
1863 		pii->pii_logint = li->li_next;
1864 	} else {
1865 		li->li_prev->li_next = li->li_next;
1866 	}
1867 	if (li->li_next != NULL)
1868 		li->li_next->li_prev = li->li_prev;
1869 	li->li_next = NULL;
1870 	li->li_prev = NULL;
1871 
1872 	/*
1873 	 * If this logint is also being used for probing, then close the
1874 	 * associated socket, if it exists.
1875 	 */
1876 	if (pii->pii_probe_logint == li) {
1877 		if (pii->pii_probe_sock != -1)
1878 			close_probe_socket(pii, _B_TRUE);
1879 		pii->pii_probe_logint = NULL;
1880 	}
1881 
1882 	free(li);
1883 }
1884 
1885 static void
1886 logint_print(struct logint *li)
1887 {
1888 	char abuf[INET6_ADDRSTRLEN];
1889 	int af = li->li_phyint_inst->pii_af;
1890 
1891 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1892 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1893 
1894 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
1895 }
1896 
1897 char *
1898 pr_addr(int af, struct in6_addr addr, char *abuf, int len)
1899 {
1900 	struct in_addr	addr_v4;
1901 
1902 	if (af == AF_INET) {
1903 		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
1904 		(void) inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
1905 	} else {
1906 		(void) inet_ntop(AF_INET6, (void *)&addr, abuf, len);
1907 	}
1908 	return (abuf);
1909 }
1910 
/*
 * Store the IP address given by the [`af',`addr'] pair in the
 * sockaddr_storage pointed to by `ssp'.  in.mpathd keeps every address
 * internally as an in6_addr, and this hides that from consumers: for
 * AF_INET the IPv4-mapped `addr' becomes a sockaddr_in, for AF_INET6 it
 * becomes a sockaddr_in6.  Only the leading sockaddr_in/sockaddr_in6
 * portion of `ssp' is cleared and filled in.
 */
void
addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
{
	assert(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ssp;

		(void) memset(v4, 0, sizeof (*v4));
		v4->sin_family = AF_INET;
		IN6_V4MAPPED_TO_INADDR(addr, &v4->sin_addr);
	} else if (af == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ssp;

		(void) memset(v6, 0, sizeof (*v6));
		v6->sin6_family = AF_INET6;
		v6->sin6_addr = *addr;
	}
}
1937 
1938 /* Lookup target on its address */
1939 struct target *
1940 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1941 {
1942 	struct target *tg;
1943 
1944 	if (debug & D_TARGET) {
1945 		char abuf[INET6_ADDRSTRLEN];
1946 
1947 		logdebug("target_lookup(%s %s): addr %s\n",
1948 		    AF_STR(pii->pii_af), pii->pii_name,
1949 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1950 	}
1951 
1952 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1953 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1954 			break;
1955 	}
1956 	return (tg);
1957 }
1958 
1959 /*
1960  * Find and return the next active target, for the next probe.
1961  * If no active targets are available, return NULL.
1962  */
1963 struct target *
1964 target_next(struct target *tg)
1965 {
1966 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1967 	struct	target	*marker = tg;
1968 	hrtime_t now;
1969 
1970 	now = gethrtime();
1971 
1972 	/*
1973 	 * Target must be in the list of targets for this phyint
1974 	 * instance.
1975 	 */
1976 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1977 	assert(pii->pii_targets != NULL);
1978 
1979 	/* Return the next active target */
1980 	do {
1981 		/*
1982 		 * Go to the next target. If we hit the end,
1983 		 * reset the ptr to the head
1984 		 */
1985 		tg = tg->tg_next;
1986 		if (tg == NULL)
1987 			tg = pii->pii_targets;
1988 
1989 		assert(TG_STATUS_VALID(tg->tg_status));
1990 
1991 		switch (tg->tg_status) {
1992 		case TG_ACTIVE:
1993 			return (tg);
1994 
1995 		case TG_UNUSED:
1996 			assert(pii->pii_targets_are_routers);
1997 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
1998 				/*
1999 				 * Bubble up the unused target to active
2000 				 */
2001 				tg->tg_status = TG_ACTIVE;
2002 				pii->pii_ntargets++;
2003 				return (tg);
2004 			}
2005 			break;
2006 
2007 		case TG_SLOW:
2008 			assert(pii->pii_targets_are_routers);
2009 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2010 				/*
2011 				 * Bubble up the slow target to unused
2012 				 */
2013 				tg->tg_status = TG_UNUSED;
2014 			}
2015 			break;
2016 
2017 		case TG_DEAD:
2018 			assert(pii->pii_targets_are_routers);
2019 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2020 				/*
2021 				 * Bubble up the dead target to slow
2022 				 */
2023 				tg->tg_status = TG_SLOW;
2024 				tg->tg_latime = now;
2025 			}
2026 			break;
2027 		}
2028 
2029 	} while (tg != marker);
2030 
2031 	return (NULL);
2032 }
2033 
2034 /*
2035  * Select the best available target, that is not already TG_ACTIVE,
2036  * for the caller. The caller will determine whether it wants to
2037  * make the returned target TG_ACTIVE.
2038  * The selection order is as follows.
2039  * 1. pick a TG_UNSED target, if it exists.
2040  * 2. else pick a TG_SLOW target that has recovered, if it exists
2041  * 3. else pick any TG_SLOW target, if it exists
2042  * 4. else pick a TG_DEAD target that has recovered, if it exists
2043  * 5. else pick any TG_DEAD target, if it exists
2044  * 6. else return null
2045  */
2046 static struct target *
2047 target_select_best(struct phyint_instance *pii)
2048 {
2049 	struct target *tg;
2050 	struct target *slow = NULL;
2051 	struct target *dead = NULL;
2052 	struct target *slow_recovered = NULL;
2053 	struct target *dead_recovered = NULL;
2054 	hrtime_t now;
2055 
2056 	now = gethrtime();
2057 
2058 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2059 		assert(TG_STATUS_VALID(tg->tg_status));
2060 
2061 		switch (tg->tg_status) {
2062 		case TG_UNUSED:
2063 			return (tg);
2064 
2065 		case TG_SLOW:
2066 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2067 				slow_recovered = tg;
2068 				/*
2069 				 * Promote the slow_recovered to unused
2070 				 */
2071 				tg->tg_status = TG_UNUSED;
2072 			} else {
2073 				slow = tg;
2074 			}
2075 			break;
2076 
2077 		case TG_DEAD:
2078 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2079 				dead_recovered = tg;
2080 				/*
2081 				 * Promote the dead_recovered to slow
2082 				 */
2083 				tg->tg_status = TG_SLOW;
2084 				tg->tg_latime = now;
2085 			} else {
2086 				dead = tg;
2087 			}
2088 			break;
2089 
2090 		default:
2091 			break;
2092 		}
2093 	}
2094 
2095 	if (slow_recovered != NULL)
2096 		return (slow_recovered);
2097 	else if (slow != NULL)
2098 		return (slow);
2099 	else if (dead_recovered != NULL)
2100 		return (dead_recovered);
2101 	else
2102 		return (dead);
2103 }
2104 
2105 /*
2106  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
2107  * that are active, pick the next best below.
2108  */
2109 static void
2110 target_activate_all(struct phyint_instance *pii)
2111 {
2112 	struct target *tg;
2113 
2114 	assert(pii->pii_ntargets == 0);
2115 	assert(pii->pii_target_next == NULL);
2116 	assert(pii->pii_rtt_target_next == NULL);
2117 	assert(pii->pii_targets_are_routers);
2118 
2119 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
2120 		tg = target_select_best(pii);
2121 		if (tg == NULL) {
2122 			/* We are out of targets */
2123 			return;
2124 		}
2125 
2126 		assert(TG_STATUS_VALID(tg->tg_status));
2127 		assert(tg->tg_status != TG_ACTIVE);
2128 		tg->tg_status = TG_ACTIVE;
2129 		pii->pii_ntargets++;
2130 		if (pii->pii_target_next == NULL) {
2131 			pii->pii_target_next = tg;
2132 			pii->pii_rtt_target_next = tg;
2133 		}
2134 	}
2135 }
2136 
2137 static struct target *
2138 target_first(struct phyint_instance *pii)
2139 {
2140 	struct target *tg;
2141 
2142 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2143 		assert(TG_STATUS_VALID(tg->tg_status));
2144 		if (tg->tg_status == TG_ACTIVE)
2145 			break;
2146 	}
2147 
2148 	return (tg);
2149 }
2150 
2151 /*
2152  * Create a default target entry.
2153  */
2154 void
2155 target_create(struct phyint_instance *pii, struct in6_addr addr,
2156     boolean_t is_router)
2157 {
2158 	struct target *tg;
2159 	struct phyint *pi;
2160 	struct logint *li;
2161 
2162 	if (debug & D_TARGET) {
2163 		char abuf[INET6_ADDRSTRLEN];
2164 
2165 		logdebug("target_create(%s %s, %s)\n",
2166 		    AF_STR(pii->pii_af), pii->pii_name,
2167 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2168 	}
2169 
2170 	/*
2171 	 * If the test address is not yet initialized, do not add
2172 	 * any target, since we cannot determine whether the target
2173 	 * belongs to the same subnet as the test address.
2174 	 */
2175 	li = pii->pii_probe_logint;
2176 	if (li == NULL)
2177 		return;
2178 
2179 	/*
2180 	 * If there are multiple subnets associated with an interface, then
2181 	 * add the target to this phyint instance only if it belongs to the
2182 	 * same subnet as the test address.  This assures us that we will
2183 	 * be able to reach this target through our routing table.
2184 	 */
2185 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
2186 		return;
2187 
2188 	if (pii->pii_targets != NULL) {
2189 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
2190 		if (is_router) {
2191 			if (!pii->pii_targets_are_routers) {
2192 				/*
2193 				 * Prefer router over hosts. Using hosts is a
2194 				 * fallback mechanism, hence delete all host
2195 				 * targets.
2196 				 */
2197 				while (pii->pii_targets != NULL)
2198 					target_delete(pii->pii_targets);
2199 			}
2200 		} else {
2201 			/*
2202 			 * Routers take precedence over hosts. If this
2203 			 * is a router list and we are trying to add a
2204 			 * host, just return. If this is a host list
2205 			 * and if we have sufficient targets, just return
2206 			 */
2207 			if (pii->pii_targets_are_routers ||
2208 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
2209 				return;
2210 		}
2211 	}
2212 
2213 	tg = calloc(1, sizeof (struct target));
2214 	if (tg == NULL) {
2215 		logperror("target_create: calloc");
2216 		return;
2217 	}
2218 
2219 	tg->tg_phyint_inst = pii;
2220 	tg->tg_address = addr;
2221 	tg->tg_in_use = 1;
2222 	tg->tg_rtt_sa = -1;
2223 	tg->tg_num_deferred = 0;
2224 
2225 	/*
2226 	 * If this is the first target, set 'pii_targets_are_routers'
2227 	 * The list of targets is either a list of hosts or list or
2228 	 * routers, but not a mix.
2229 	 */
2230 	if (pii->pii_targets == NULL) {
2231 		assert(pii->pii_ntargets == 0);
2232 		assert(pii->pii_target_next == NULL);
2233 		assert(pii->pii_rtt_target_next == NULL);
2234 		pii->pii_targets_are_routers = is_router ? 1 : 0;
2235 	}
2236 
2237 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
2238 		assert(pii->pii_targets_are_routers);
2239 		assert(pii->pii_target_next != NULL);
2240 		assert(pii->pii_rtt_target_next != NULL);
2241 		tg->tg_status = TG_UNUSED;
2242 	} else {
2243 		if (pii->pii_ntargets == 0) {
2244 			assert(pii->pii_target_next == NULL);
2245 			pii->pii_target_next = tg;
2246 			pii->pii_rtt_target_next = tg;
2247 		}
2248 		pii->pii_ntargets++;
2249 		tg->tg_status = TG_ACTIVE;
2250 	}
2251 
2252 	target_insert(pii, tg);
2253 
2254 	/*
2255 	 * Change state to PI_RUNNING if this phyint instance is capable of
2256 	 * sending and receiving probes -- that is, if we know of at least 1
2257 	 * target, and this phyint instance is probe-capable.  For more
2258 	 * details, see the phyint state diagram in mpd_probe.c.
2259 	 */
2260 	pi = pii->pii_phyint;
2261 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
2262 		if (pi->pi_flags & IFF_FAILED)
2263 			phyint_chstate(pi, PI_FAILED);
2264 		else
2265 			phyint_chstate(pi, PI_RUNNING);
2266 	}
2267 }
2268 
2269 /*
2270  * Add the target address named by `addr' to phyint instance `pii' if it does
2271  * not already exist.  If the target is a router, `is_router' should be set to
2272  * B_TRUE.
2273  */
2274 void
2275 target_add(struct phyint_instance *pii, struct in6_addr addr,
2276     boolean_t is_router)
2277 {
2278 	struct target *tg;
2279 
2280 	if (pii == NULL)
2281 		return;
2282 
2283 	tg = target_lookup(pii, addr);
2284 
2285 	/*
2286 	 * If the target does not exist, create it; target_create() will set
2287 	 * tg_in_use to true.  Even if it exists already, if it's a router
2288 	 * target and we'd previously learned of it through multicast, then we
2289 	 * need to recreate it as a router target.  Otherwise, just set
2290 	 * tg_in_use to to true so that init_router_targets() won't delete it.
2291 	 */
2292 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
2293 		target_create(pii, addr, is_router);
2294 	else if (is_router)
2295 		tg->tg_in_use = 1;
2296 }
2297 
2298 /*
2299  * Insert target at head of linked list of targets for the associated
2300  * phyint instance
2301  */
2302 static void
2303 target_insert(struct phyint_instance *pii, struct target *tg)
2304 {
2305 	tg->tg_next = pii->pii_targets;
2306 	tg->tg_prev = NULL;
2307 	if (tg->tg_next != NULL)
2308 		tg->tg_next->tg_prev = tg;
2309 	pii->pii_targets = tg;
2310 }
2311 
/*
 * Delete a target (unlink and free).
 *
 * Besides removing `tg' from its phyint instance's target list, this drops
 * any probe references to it, keeps the active-target count and the
 * next-target pointers consistent, and, if no active target is left,
 * either re-activates other router targets or (when the list is empty)
 * clears the probe statistics and may move the phyint to PI_NOTARGETS.
 */
void
target_delete(struct target *tg)
{
	int af;
	struct phyint_instance	*pii;
	struct phyint_instance	*pii_other;

	pii = tg->tg_phyint_inst;
	af = pii->pii_af;

	if (debug & D_TARGET) {
		char abuf[INET6_ADDRSTRLEN];

		logdebug("target_delete(%s %s, %s)\n",
		    AF_STR(af), pii->pii_name,
		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
	}

	/*
	 * Target must be in the list of targets for this phyint
	 * instance.
	 */
	assert(pii->pii_targets == tg || tg->tg_prev != NULL);

	/*
	 * Reset all references to 'tg' in the probe information
	 * for this phyint.
	 */
	reset_pii_probes(pii, tg);

	/*
	 * Remove this target from the list of targets of this
	 * phyint instance.
	 */
	if (tg->tg_prev == NULL) {
		pii->pii_targets = tg->tg_next;
	} else {
		tg->tg_prev->tg_next = tg->tg_next;
	}

	if (tg->tg_next != NULL)
		tg->tg_next->tg_prev = tg->tg_prev;

	tg->tg_next = NULL;
	tg->tg_prev = NULL;

	/* pii_ntargets is only decremented for a TG_ACTIVE target. */
	if (tg->tg_status == TG_ACTIVE)
		pii->pii_ntargets--;

	/*
	 * Adjust the next target to probe, if it points to the
	 * currently deleted target.  `tg' is already unlinked, so
	 * target_first() cannot return it.
	 */
	if (pii->pii_target_next == tg)
		pii->pii_target_next = target_first(pii);

	if (pii->pii_rtt_target_next == tg)
		pii->pii_rtt_target_next = target_first(pii);

	free(tg);

	/*
	 * The number of active targets pii_ntargets == 0 iff
	 * the next active target pii->pii_target_next == NULL
	 */
	if (pii->pii_ntargets != 0) {
		assert(pii->pii_target_next != NULL);
		assert(pii->pii_rtt_target_next != NULL);
		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
		return;
	}

	/* At this point, we don't have any active targets. */
	assert(pii->pii_target_next == NULL);
	assert(pii->pii_rtt_target_next == NULL);

	if (pii->pii_targets_are_routers) {
		/*
		 * Activate any TG_SLOW or TG_DEAD router targets,
		 * since we don't have any other targets
		 */
		target_activate_all(pii);

		if (pii->pii_ntargets != 0) {
			assert(pii->pii_target_next != NULL);
			assert(pii->pii_rtt_target_next != NULL);
			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
			assert(pii->pii_rtt_target_next->tg_status ==
			    TG_ACTIVE);
			return;
		}
	}

	/*
	 * If we still don't have any active targets, the list must
	 * be really empty. There aren't even TG_SLOW or TG_DEAD
	 * targets. Zero out the probe stats since they will not be
	 * relevant any longer.
	 */
	assert(pii->pii_targets == NULL);
	pii->pii_targets_are_routers = _B_FALSE;
	clear_pii_probe_stats(pii);
	pii_other = phyint_inst_other(pii);

	/*
	 * If there are no targets on both instances and the interface would
	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
	 * since we cannot probe this phyint any more.  For more details,
	 * please see phyint state diagram in mpd_probe.c.
	 */
	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
	    pii->pii_phyint->pi_state != PI_OFFLINE)
		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
}
2430 
2431 /*
2432  * Flush the target list of every phyint in the group, if the list
2433  * is a host target list. This is called if group failure is suspected.
2434  * If all targets have failed, multicast will subsequently discover new
2435  * targets. Else it is a group failure.
2436  * Note: This function is a no-op if the list is a router target list.
2437  */
2438 static void
2439 target_flush_hosts(struct phyint_group *pg)
2440 {
2441 	struct phyint *pi;
2442 	struct phyint_instance *pii;
2443 
2444 	if (debug & D_TARGET)
2445 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2446 
2447 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2448 		pii = pi->pi_v4;
2449 		if (pii != NULL && !pii->pii_targets_are_routers) {
2450 			/*
2451 			 * Delete all the targets. When the list becomes
2452 			 * empty, target_delete() will set pii->pii_targets
2453 			 * to NULL.
2454 			 */
2455 			while (pii->pii_targets != NULL)
2456 				target_delete(pii->pii_targets);
2457 		}
2458 		pii = pi->pi_v6;
2459 		if (pii != NULL && !pii->pii_targets_are_routers) {
2460 			/*
2461 			 * Delete all the targets. When the list becomes
2462 			 * empty, target_delete() will set pii->pii_targets
2463 			 * to NULL.
2464 			 */
2465 			while (pii->pii_targets != NULL)
2466 				target_delete(pii->pii_targets);
2467 		}
2468 	}
2469 }
2470 
2471 /*
2472  * Reset all references to 'target' in the probe info, as this target is
2473  * being deleted. The pr_target field is guaranteed to be non-null if
2474  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2475  * pr_target will not be accessed unconditionally.
2476  */
2477 static void
2478 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2479 {
2480 	int i;
2481 
2482 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2483 		if (pii->pii_probes[i].pr_target == tg) {
2484 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
2485 				probe_chstate(&pii->pii_probes[i], pii,
2486 				    PR_LOST);
2487 			}
2488 			pii->pii_probes[i].pr_target = NULL;
2489 		}
2490 	}
2491 
2492 }
2493 
2494 /*
2495  * Clear the probe statistics array.
2496  */
2497 void
2498 clear_pii_probe_stats(struct phyint_instance *pii)
2499 {
2500 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2501 	/* Reset the next probe index in the probe stats array */
2502 	pii->pii_probe_next = 0;
2503 }
2504 
2505 static void
2506 target_print(struct target *tg)
2507 {
2508 	char	abuf[INET6_ADDRSTRLEN];
2509 	char	buf[128];
2510 	char	buf2[128];
2511 	int	af;
2512 	int	i;
2513 
2514 	af = tg->tg_phyint_inst->pii_af;
2515 
2516 	logdebug("Target on %s %s addr %s\n"
2517 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
2518 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2519 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2520 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2521 	    tg->tg_crtt, tg->tg_in_use);
2522 
2523 	buf[0] = '\0';
2524 	for (i = 0; i < tg->tg_num_deferred; i++) {
2525 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2526 		    tg->tg_deferred[i]);
2527 		(void) strlcat(buf, buf2, sizeof (buf));
2528 	}
2529 	logdebug("deferred rtts:%s\n", buf);
2530 }
2531 
2532 void
2533 phyint_inst_print_all(void)
2534 {
2535 	struct phyint_instance *pii;
2536 
2537 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2538 		phyint_inst_print(pii);
2539 	}
2540 }
2541 
2542 /*
2543  * Compare two prefixes that have the same prefix length.
2544  * Fails if the prefix length is unreasonable.
2545  */
2546 boolean_t
2547 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
2548 {
2549 	uchar_t mask;
2550 	int j;
2551 
2552 	if (prefix_len > IPV6_ABITS)
2553 		return (_B_FALSE);
2554 
2555 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2556 		if (p1.s6_addr[j] != p2.s6_addr[j])
2557 			return (_B_FALSE);
2558 
2559 	/* Make the N leftmost bits one */
2560 	mask = 0xff << (8 - prefix_len);
2561 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2562 		return (_B_FALSE);
2563 
2564 	return (_B_TRUE);
2565 }
2566 
2567 /*
2568  * Get the number of UP logints on phyint `pi'.
2569  */
2570 static int
2571 logint_upcount(struct phyint *pi)
2572 {
2573 	struct	logint	*li;
2574 	int count = 0;
2575 
2576 	if (pi->pi_v4 != NULL) {
2577 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
2578 			if (li->li_flags & IFF_UP)
2579 				count++;
2580 		}
2581 	}
2582 
2583 	if (pi->pi_v6 != NULL) {
2584 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
2585 			if (li->li_flags & IFF_UP)
2586 				count++;
2587 		}
2588 	}
2589 
2590 	return (count);
2591 }
2592 
2593 /*
2594  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2595  */
2596 struct phyint_instance *
2597 phyint_inst_other(struct phyint_instance *pii)
2598 {
2599 	if (pii->pii_af == AF_INET)
2600 		return (pii->pii_phyint->pi_v6);
2601 	else
2602 		return (pii->pii_phyint->pi_v4);
2603 }
2604 
2605 /*
2606  * Check whether a phyint is functioning.
2607  */
2608 static boolean_t
2609 phyint_is_functioning(struct phyint *pi)
2610 {
2611 	if (pi->pi_state == PI_RUNNING)
2612 		return (_B_TRUE);
2613 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
2614 }
2615 
2616 /*
2617  * Check whether a phyint is usable.
2618  */
2619 static boolean_t
2620 phyint_is_usable(struct phyint *pi)
2621 {
2622 	if (logint_upcount(pi) == 0)
2623 		return (_B_FALSE);
2624 	return (phyint_is_functioning(pi));
2625 }
2626 
2627 /*
2628  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2629  * Before sending the event, it prepends the current version of the IPMP
2630  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2631  * `nvl' is freed).
2632  */
2633 static int
2634 post_event(const char *subclass, nvlist_t *nvl)
2635 {
2636 	static evchan_t *evchp = NULL;
2637 
2638 	/*
2639 	 * Initialize the event channel if we haven't already done so.
2640 	 */
2641 	if (evchp == NULL) {
2642 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
2643 		if (errno != 0) {
2644 			logerr("cannot create event channel `%s': %s\n",
2645 			    IPMP_EVENT_CHAN, strerror(errno));
2646 			goto failed;
2647 		}
2648 	}
2649 
2650 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2651 	    IPMP_EVENT_CUR_VERSION);
2652 	if (errno != 0) {
2653 		logerr("cannot create `%s' event: %s", subclass,
2654 		    strerror(errno));
2655 		goto failed;
2656 	}
2657 
2658 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
2659 	    "in.mpathd", nvl, EVCH_NOSLEEP);
2660 	if (errno != 0) {
2661 		logerr("cannot send `%s' event: %s\n", subclass,
2662 		    strerror(errno));
2663 		goto failed;
2664 	}
2665 
2666 	nvlist_free(nvl);
2667 	return (0);
2668 failed:
2669 	nvlist_free(nvl);
2670 	return (-1);
2671 }
2672 
2673 /*
2674  * Return the external IPMP state associated with phyint `pi'.
2675  */
2676 static ipmp_if_state_t
2677 ifstate(struct phyint *pi)
2678 {
2679 	switch (pi->pi_state) {
2680 	case PI_NOTARGETS:
2681 		if (pi->pi_flags & IFF_FAILED)
2682 			return (IPMP_IF_FAILED);
2683 		return (IPMP_IF_UNKNOWN);
2684 
2685 	case PI_OFFLINE:
2686 		return (IPMP_IF_OFFLINE);
2687 
2688 	case PI_FAILED:
2689 		return (IPMP_IF_FAILED);
2690 
2691 	case PI_RUNNING:
2692 		return (IPMP_IF_OK);
2693 	}
2694 
2695 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2696 	abort();
2697 	/* NOTREACHED */
2698 }
2699 
2700 /*
2701  * Return the external IPMP interface type associated with phyint `pi'.
2702  */
2703 static ipmp_if_type_t
2704 iftype(struct phyint *pi)
2705 {
2706 	if (pi->pi_flags & IFF_STANDBY)
2707 		return (IPMP_IF_STANDBY);
2708 	else
2709 		return (IPMP_IF_NORMAL);
2710 }
2711 
2712 /*
2713  * Return the external IPMP link state associated with phyint `pi'.
2714  */
2715 static ipmp_if_linkstate_t
2716 iflinkstate(struct phyint *pi)
2717 {
2718 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
2719 		return (IPMP_LINK_UNKNOWN);
2720 
2721 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
2722 }
2723 
2724 /*
2725  * Return the external IPMP probe state associated with phyint `pi'.
2726  */
2727 static ipmp_if_probestate_t
2728 ifprobestate(struct phyint *pi)
2729 {
2730 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
2731 		return (IPMP_PROBE_DISABLED);
2732 
2733 	if (pi->pi_state == PI_FAILED)
2734 		return (IPMP_PROBE_FAILED);
2735 
2736 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
2737 		return (IPMP_PROBE_UNKNOWN);
2738 
2739 	return (IPMP_PROBE_OK);
2740 }
2741 
2742 /*
2743  * Return the external IPMP target mode associated with phyint instance `pii'.
2744  */
2745 static ipmp_if_targmode_t
2746 iftargmode(struct phyint_instance *pii)
2747 {
2748 	if (!PROBE_ENABLED(pii))
2749 		return (IPMP_TARG_DISABLED);
2750 	else if (pii->pii_targets_are_routers)
2751 		return (IPMP_TARG_ROUTES);
2752 	else
2753 		return (IPMP_TARG_MULTICAST);
2754 }
2755 
2756 /*
2757  * Return the external IPMP flags associated with phyint `pi'.
2758  */
2759 static ipmp_if_flags_t
2760 ifflags(struct phyint *pi)
2761 {
2762 	ipmp_if_flags_t flags = 0;
2763 
2764 	if (logint_upcount(pi) == 0)
2765 		flags |= IPMP_IFFLAG_DOWN;
2766 	if (pi->pi_flags & IFF_INACTIVE)
2767 		flags |= IPMP_IFFLAG_INACTIVE;
2768 	if (pi->pi_hwaddrdup)
2769 		flags |= IPMP_IFFLAG_HWADDRDUP;
2770 	if (phyint_is_functioning(pi) && flags == 0)
2771 		flags |= IPMP_IFFLAG_ACTIVE;
2772 
2773 	return (flags);
2774 }
2775 
2776 /*
2777  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
2778  * no test address, 0.0.0.0 is stored.
2779  */
2780 static struct sockaddr_storage *
2781 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
2782 {
2783 	if (PROBE_ENABLED(pii))
2784 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
2785 	else
2786 		addr2storage(AF_INET6, &in6addr_any, ssp);
2787 
2788 	return (ssp);
2789 }
2790 
2791 /*
2792  * Return the external IPMP group state associated with phyint group `pg'.
2793  */
2794 static ipmp_group_state_t
2795 groupstate(struct phyint_group *pg)
2796 {
2797 	switch (pg->pg_state) {
2798 	case PG_FAILED:
2799 		return (IPMP_GROUP_FAILED);
2800 	case PG_DEGRADED:
2801 		return (IPMP_GROUP_DEGRADED);
2802 	case PG_OK:
2803 		return (IPMP_GROUP_OK);
2804 	}
2805 
2806 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
2807 	abort();
2808 	/* NOTREACHED */
2809 }
2810 
2811 /*
2812  * Return the external IPMP probe state associated with probe `ps'.
2813  */
2814 static ipmp_probe_state_t
2815 probestate(struct probe_stats *ps)
2816 {
2817 	switch (ps->pr_status) {
2818 	case PR_UNUSED:
2819 	case PR_LOST:
2820 		return (IPMP_PROBE_LOST);
2821 	case PR_UNACKED:
2822 		return (IPMP_PROBE_SENT);
2823 	case PR_ACKED:
2824 		return (IPMP_PROBE_ACKED);
2825 	}
2826 
2827 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
2828 	abort();
2829 	/* NOTREACHED */
2830 }
2831 
2832 /*
2833  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
2834  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
2835  */
2836 int
2837 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
2838 {
2839 	nvlist_t *nvl;
2840 	hrtime_t proc_time = 0, recv_time = 0;
2841 	struct sockaddr_storage ss;
2842 	struct target *tg = pr->pr_target;
2843 	int64_t rttavg, rttdev;
2844 
2845 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2846 	if (errno != 0) {
2847 		logperror("cannot create `interface change' event");
2848 		return (-1);
2849 	}
2850 
2851 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
2852 	if (errno != 0)
2853 		goto failed;
2854 
2855 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
2856 	if (errno != 0)
2857 		goto failed;
2858 
2859 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
2860 	if (errno != 0)
2861 		goto failed;
2862 
2863 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
2864 	    pr->pr_hrtime_start);
2865 	if (errno != 0)
2866 		goto failed;
2867 
2868 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
2869 	    pr->pr_hrtime_sent);
2870 	if (errno != 0)
2871 		goto failed;
2872 
2873 	if (pr->pr_status == PR_ACKED) {
2874 		recv_time = pr->pr_hrtime_ackrecv;
2875 		proc_time = pr->pr_hrtime_ackproc;
2876 	}
2877 
2878 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
2879 	if (errno != 0)
2880 		goto failed;
2881 
2882 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
2883 	if (errno != 0)
2884 		goto failed;
2885 
2886 	if (tg != NULL)
2887 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
2888 	else
2889 		addr2storage(pii->pii_af, &in6addr_any, &ss);
2890 
2891 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
2892 	    sizeof (ss));
2893 	if (errno != 0)
2894 		goto failed;
2895 
2896 	rttavg = (tg != NULL) ? (tg->tg_rtt_sa / 8) : 0;
2897 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, rttavg);
2898 	if (errno != 0)
2899 		goto failed;
2900 
2901 	rttdev = (tg != NULL) ? (tg->tg_rtt_sd / 4) : 0;
2902 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, rttdev);
2903 	if (errno != 0)
2904 		goto failed;
2905 
2906 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
2907 failed:
2908 	logperror("cannot create `probe state' event");
2909 	nvlist_free(nvl);
2910 	return (-1);
2911 }
2912 
2913 /*
2914  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2915  * Returns 0 on success, -1 on failure.
2916  */
2917 static int
2918 phyint_group_state_event(struct phyint_group *pg)
2919 {
2920 	nvlist_t	*nvl;
2921 
2922 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2923 	if (errno != 0) {
2924 		logperror("cannot create `group state change' event");
2925 		return (-1);
2926 	}
2927 
2928 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2929 	if (errno != 0)
2930 		goto failed;
2931 
2932 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2933 	if (errno != 0)
2934 		goto failed;
2935 
2936 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2937 	if (errno != 0)
2938 		goto failed;
2939 
2940 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2941 failed:
2942 	logperror("cannot create `group state change' event");
2943 	nvlist_free(nvl);
2944 	return (-1);
2945 }
2946 
2947 /*
2948  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2949  * `pg'.  Returns 0 on success, -1 on failure.
2950  */
2951 static int
2952 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2953 {
2954 	nvlist_t *nvl;
2955 
2956 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2957 	if (errno != 0) {
2958 		logperror("cannot create `group change' event");
2959 		return (-1);
2960 	}
2961 
2962 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2963 	if (errno != 0)
2964 		goto failed;
2965 
2966 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2967 	if (errno != 0)
2968 		goto failed;
2969 
2970 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2971 	    phyint_grouplistsig);
2972 	if (errno != 0)
2973 		goto failed;
2974 
2975 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2976 	if (errno != 0)
2977 		goto failed;
2978 
2979 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2980 failed:
2981 	logperror("cannot create `group change' event");
2982 	nvlist_free(nvl);
2983 	return (-1);
2984 }
2985 
2986 /*
2987  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2988  * group `pg'.	Returns 0 on success, -1 on failure.
2989  */
2990 static int
2991 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
2992     ipmp_if_op_t op)
2993 {
2994 	nvlist_t *nvl;
2995 
2996 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2997 	if (errno != 0) {
2998 		logperror("cannot create `group member change' event");
2999 		return (-1);
3000 	}
3001 
3002 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3003 	if (errno != 0)
3004 		goto failed;
3005 
3006 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3007 	if (errno != 0)
3008 		goto failed;
3009 
3010 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
3011 	if (errno != 0)
3012 		goto failed;
3013 
3014 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3015 	if (errno != 0)
3016 		goto failed;
3017 
3018 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3019 	if (errno != 0)
3020 		goto failed;
3021 
3022 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3023 	if (errno != 0)
3024 		goto failed;
3025 
3026 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
3027 failed:
3028 	logperror("cannot create `group member change' event");
3029 	nvlist_free(nvl);
3030 	return (-1);
3031 
3032 }
3033 
3034 /*
3035  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
3036  * Returns 0 on success, -1 on failure.
3037  */
3038 static int
3039 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
3040 {
3041 	nvlist_t *nvl;
3042 
3043 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3044 	if (errno != 0) {
3045 		logperror("cannot create `interface change' event");
3046 		return (-1);
3047 	}
3048 
3049 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3050 	if (errno != 0)
3051 		goto failed;
3052 
3053 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3054 	if (errno != 0)
3055 		goto failed;
3056 
3057 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3058 	if (errno != 0)
3059 		goto failed;
3060 
3061 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3062 	if (errno != 0)
3063 		goto failed;
3064 
3065 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3066 	if (errno != 0)
3067 		goto failed;
3068 
3069 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
3070 failed:
3071 	logperror("cannot create `interface change' event");
3072 	nvlist_free(nvl);
3073 	return (-1);
3074 
3075 }
3076 
/*
 * Generate a fresh signature.  A signature is conceptually split into a
 * random 16-bit "generation number" in the top bits and a 48-bit
 * monotonically increasing integer below it; a new signature starts that
 * integer at 1.  The generation number protects against stale updates to
 * entities (e.g., IPMP groups) that have been deleted and since recreated.
 * The random number generator is seeded from gethrtime() on first use.
 */
static uint64_t
gensig(void)
{
	static int seeded = 0;
	uint64_t generation;

	if (!seeded) {
		srand48((long)gethrtime());
		seeded = 1;
	}

	/* Only the low 16 bits of the random value survive the shift. */
	generation = (uint64_t)lrand48();

	return ((generation << 48) | 1);
}
3096 
3097 /*
3098  * Store the information associated with group `grname' into a dynamically
3099  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
3100  */
3101 unsigned int
3102 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
3103 {
3104 	struct phyint		*pi;
3105 	struct phyint_group	*pg;
3106 	char			(*ifs)[LIFNAMSIZ];
3107 	unsigned int		i, j;
3108 	unsigned int		nif = 0, naddr = 0;
3109 	lifgroupinfo_t		lifgr;
3110 	addrlist_t		*addrp;
3111 	struct sockaddr_storage	*addrs;
3112 	int			fdt = 0;
3113 
3114 	pg = phyint_group_lookup(grname);
3115 	if (pg == NULL)
3116 		return (IPMP_EUNKGROUP);
3117 
3118 	/*
3119 	 * Tally up the number of interfaces, allocate an array to hold them,
3120 	 * and insert their names into the array.  While we're at it, if any
3121 	 * interface is actually enabled to send probes, save the group fdt.
3122 	 */
3123 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
3124 		nif++;
3125 
3126 	ifs = alloca(nif * sizeof (*ifs));
3127 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
3128 		assert(i < nif);
3129 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
3130 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
3131 			fdt = pg->pg_fdt;
3132 	}
3133 	assert(i == nif);
3134 
3135 	/*
3136 	 * If this is the anonymous group, there's no other information to
3137 	 * collect (since there's no IPMP interface).
3138 	 */
3139 	if (pg == phyint_anongroup) {
3140 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3141 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
3142 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3143 	}
3144 
3145 	/*
3146 	 * Grab some additional information about the group from the kernel.
3147 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
3148 	 * we can use ifsock_v4 even for a V6-only group.)
3149 	 */
3150 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
3151 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
3152 		if (errno == ENOENT)
3153 			return (IPMP_EUNKGROUP);
3154 
3155 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
3156 		return (IPMP_FAILURE);
3157 	}
3158 
3159 	/*
3160 	 * Tally up the number of data addresses, allocate an array to hold
3161 	 * them, and insert their values into the array.
3162 	 */
3163 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
3164 		naddr++;
3165 
3166 	addrs = alloca(naddr * sizeof (*addrs));
3167 	i = 0;
3168 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3169 		/*
3170 		 * It's possible to have duplicate addresses (if some are
3171 		 * down).  Weed the dups out to avoid confusing consumers.
3172 		 * (If groups start having tons of addresses, we'll need a
3173 		 * better algorithm here.)
3174 		 */
3175 		for (j = 0; j < i; j++) {
3176 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
3177 				break;
3178 		}
3179 		if (j == i) {
3180 			assert(i < naddr);
3181 			addrs[i++] = addrp->al_addr;
3182 		}
3183 	}
3184 	naddr = i;
3185 
3186 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3187 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
3188 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
3189 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3190 }
3191 
3192 /*
3193  * Store the target information associated with phyint instance `pii' into a
3194  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
3195  * IPMP error code.
3196  */
3197 unsigned int
3198 gettarginfo(struct phyint_instance *pii, const char *name,
3199     ipmp_targinfo_t **targinfopp)
3200 {
3201 	uint_t ntarg = 0;
3202 	struct target *tg;
3203 	struct sockaddr_storage	ss;
3204 	struct sockaddr_storage *targs = NULL;
3205 
3206 	if (PROBE_CAPABLE(pii)) {
3207 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
3208 		tg = pii->pii_target_next;
3209 		do {
3210 			if (tg->tg_status == TG_ACTIVE) {
3211 				assert(ntarg < pii->pii_ntargets);
3212 				addr2storage(pii->pii_af, &tg->tg_address,
3213 				    &targs[ntarg++]);
3214 			}
3215 			if ((tg = tg->tg_next) == NULL)
3216 				tg = pii->pii_targets;
3217 		} while (tg != pii->pii_target_next);
3218 
3219 		assert(ntarg == pii->pii_ntargets);
3220 	}
3221 
3222 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
3223 	    iftargmode(pii), ntarg, targs);
3224 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3225 }
3226 
3227 /*
3228  * Store the information associated with interface `ifname' into a dynamically
3229  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
3230  */
3231 unsigned int
3232 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
3233 {
3234 	int		retval;
3235 	struct phyint	*pi;
3236 	ipmp_targinfo_t	*targinfo4;
3237 	ipmp_targinfo_t	*targinfo6;
3238 
3239 	pi = phyint_lookup(ifname);
3240 	if (pi == NULL)
3241 		return (IPMP_EUNKIF);
3242 
3243 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
3244 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
3245 		goto out;
3246 
3247 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
3248 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
3249 	    ifflags(pi), targinfo4, targinfo6);
3250 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3251 out:
3252 	if (targinfo4 != NULL)
3253 		ipmp_freetarginfo(targinfo4);
3254 	if (targinfo6 != NULL)
3255 		ipmp_freetarginfo(targinfo6);
3256 	return (retval);
3257 }
3258 
3259 /*
3260  * Store the current list of IPMP groups into a dynamically allocated
3261  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
3262  */
3263 unsigned int
3264 getgrouplist(ipmp_grouplist_t **grlistpp)
3265 {
3266 	struct phyint_group	*pg;
3267 	char			(*groups)[LIFGRNAMSIZ];
3268 	unsigned int		i, ngroup;
3269 
3270 	/*
3271 	 * Tally up the number of groups, allocate an array to hold them, and
3272 	 * insert their names into the array.
3273 	 */
3274 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
3275 		ngroup++;
3276 
3277 	groups = alloca(ngroup * sizeof (*groups));
3278 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
3279 		assert(i < ngroup);
3280 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
3281 	}
3282 	assert(i == ngroup);
3283 
3284 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
3285 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3286 }
3287 
3288 /*
3289  * Store the address information for `ssp' (in group `grname') into a
3290  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
3291  * error code.  (We'd call this function getaddrinfo(), but it would conflict
3292  * with getaddrinfo(3SOCKET)).
3293  */
3294 unsigned int
3295 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
3296     ipmp_addrinfo_t **adinfopp)
3297 {
3298 	int ifsock;
3299 	addrlist_t *addrp, *addrmatchp = NULL;
3300 	ipmp_addr_state_t state;
3301 	const char *binding = "";
3302 	struct lifreq lifr;
3303 	struct phyint_group *pg;
3304 
3305 	if ((pg = phyint_group_lookup(grname)) == NULL)
3306 		return (IPMP_EUNKADDR);
3307 
3308 	/*
3309 	 * Walk through the data addresses, and find a match.  Note that since
3310 	 * some of the addresses may be down, more than one may match.  We
3311 	 * prefer an up address (if one exists).
3312 	 */
3313 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3314 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
3315 			addrmatchp = addrp;
3316 			if (addrmatchp->al_flags & IFF_UP)
3317 				break;
3318 		}
3319 	}
3320 
3321 	if (addrmatchp == NULL)
3322 		return (IPMP_EUNKADDR);
3323 
3324 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
3325 	if (state == IPMP_ADDR_UP) {
3326 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
3327 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
3328 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
3329 			binding = lifr.lifr_binding;
3330 	}
3331 
3332 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
3333 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3334 }
3335 
3336 /*
3337  * Store a snapshot of the IPMP subsystem into a dynamically allocated
3338  * structure pointed to by `*snapp'.  Returns an IPMP error code.
3339  */
3340 unsigned int
3341 getsnap(ipmp_snap_t **snapp)
3342 {
3343 	ipmp_grouplist_t	*grlistp;
3344 	ipmp_groupinfo_t	*grinfop;
3345 	ipmp_addrinfo_t		*adinfop;
3346 	ipmp_addrlist_t		*adlistp;
3347 	ipmp_ifinfo_t		*ifinfop;
3348 	ipmp_snap_t		*snap;
3349 	struct phyint		*pi;
3350 	unsigned int		i, j;
3351 	int			retval;
3352 
3353 	snap = ipmp_snap_create();
3354 	if (snap == NULL)
3355 		return (IPMP_ENOMEM);
3356 
3357 	/*
3358 	 * Add group list.
3359 	 */
3360 	retval = getgrouplist(&snap->sn_grlistp);
3361 	if (retval != IPMP_SUCCESS)
3362 		goto failed;
3363 
3364 	/*
3365 	 * Add information for each group in the list, along with all of its
3366 	 * data addresses.
3367 	 */
3368 	grlistp = snap->sn_grlistp;
3369 	for (i = 0; i < grlistp->gl_ngroup; i++) {
3370 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
3371 		if (retval != IPMP_SUCCESS)
3372 			goto failed;
3373 
3374 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
3375 		if (retval != IPMP_SUCCESS) {
3376 			ipmp_freegroupinfo(grinfop);
3377 			goto failed;
3378 		}
3379 
3380 		adlistp = grinfop->gr_adlistp;
3381 		for (j = 0; j < adlistp->al_naddr; j++) {
3382 			retval = getgraddrinfo(grinfop->gr_name,
3383 			    &adlistp->al_addrs[j], &adinfop);
3384 			if (retval != IPMP_SUCCESS)
3385 				goto failed;
3386 
3387 			retval = ipmp_snap_addaddrinfo(snap, adinfop);
3388 			if (retval != IPMP_SUCCESS) {
3389 				ipmp_freeaddrinfo(adinfop);
3390 				goto failed;
3391 			}
3392 		}
3393 	}
3394 
3395 	/*
3396 	 * Add information for each configured phyint.
3397 	 */
3398 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
3399 		retval = getifinfo(pi->pi_name, &ifinfop);
3400 		if (retval != IPMP_SUCCESS)
3401 			goto failed;
3402 
3403 		retval = ipmp_snap_addifinfo(snap, ifinfop);
3404 		if (retval != IPMP_SUCCESS) {
3405 			ipmp_freeifinfo(ifinfop);
3406 			goto failed;
3407 		}
3408 	}
3409 
3410 	*snapp = snap;
3411 	return (IPMP_SUCCESS);
3412 failed:
3413 	ipmp_snap_free(snap);
3414 	return (retval);
3415 }
3416