xref: /illumos-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision 6e375c8351497b82ffa4f33cbf61d712999b4605)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
/*
 * Heads of the global lists of phyint groups, phyints and phyint instances.
 * All three lists start out empty.
 */
struct phyint_group *phyint_groups = NULL;
struct phyint *phyints = NULL;
struct phyint_instance	*phyint_instances = NULL;

/*
 * The anonymous group.  phyint_init() creates it when track_all_phyints
 * is set.
 */
struct phyint_group *phyint_anongroup;

/*
 * Signature of the list of groups.  It is seeded in phyint_init() and
 * incremented by phyint_group_insert() whenever a group is added.
 */
static uint64_t phyint_grouplistsig;
42 
43 static void phyint_inst_insert(struct phyint_instance *pii);
44 static void phyint_inst_print(struct phyint_instance *pii);
45 
46 static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
47 static void phyint_delete(struct phyint *pi);
48 static boolean_t phyint_is_usable(struct phyint *pi);
49 
50 static void logint_print(struct logint *li);
51 static void logint_insert(struct phyint_instance *pii, struct logint *li);
52 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);
53 
54 static void target_print(struct target *tg);
55 static void target_insert(struct phyint_instance *pii, struct target *tg);
56 static struct target *target_first(struct phyint_instance *pii);
57 static struct target *target_select_best(struct phyint_instance *pii);
58 static void target_flush_hosts(struct phyint_group *pg);
59 
60 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
61 
62 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
63 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
64 
65 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
66 static int phyint_group_state_event(struct phyint_group *pg);
67 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
68 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
69     ipmp_if_op_t op);
70 
71 static int logint_upcount(struct phyint *pi);
72 static uint64_t gensig(void);
73 
74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
75 int
76 phyint_init(void)
77 {
78 	phyint_grouplistsig = gensig();
79 	if (track_all_phyints) {
80 		phyint_anongroup = phyint_group_create("");
81 		if (phyint_anongroup == NULL)
82 			return (-1);
83 		phyint_group_insert(phyint_anongroup);
84 	}
85 	return (0);
86 }
87 
88 /* Return the phyint with the given name */
89 struct phyint *
90 phyint_lookup(const char *name)
91 {
92 	struct phyint *pi;
93 
94 	if (debug & D_PHYINT)
95 		logdebug("phyint_lookup(%s)\n", name);
96 
97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
99 			break;
100 	}
101 	return (pi);
102 }
103 
104 /*
105  * Lookup a phyint in the group that has the same hardware address as `pi', or
106  * NULL if there's none.  If `online_only' is set, then only online phyints
107  * are considered when matching.  Otherwise, phyints that had been offlined
108  * due to a duplicate hardware address will also be considered.
109  */
110 static struct phyint *
111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
112 {
113 	struct phyint *pi2;
114 
115 	if (pi->pi_group == phyint_anongroup)
116 		return (NULL);
117 
118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
119 		if (pi2 == pi)
120 			continue;
121 
122 		/*
123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
124 		 * that are administratively offline (rather than offline
125 		 * because they're dups); when they're brought back online,
126 		 * they'll be flagged as dups if need be.
127 		 */
128 		if (pi2->pi_state == PI_OFFLINE &&
129 		    (online_only || !pi2->pi_hwaddrdup))
130 			continue;
131 
132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
134 			return (pi2);
135 	}
136 	return (NULL);
137 }
138 
139 /*
140  * Respond to DLPI notifications.  Currently, this only processes physical
141  * address changes for the phyint passed via `arg' by onlining or offlining
142  * phyints in the group.
143  */
144 /* ARGSUSED */
145 static void
146 phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
147 {
148 	struct phyint *pi = arg;
149 	struct phyint *oduppi = NULL, *duppi = NULL;
150 
151 	assert((dnip->dni_note & pi->pi_notes) != 0);
152 
153 	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
154 		return;
155 
156 	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
157 
158 	/*
159 	 * If our hardware address hasn't changed, there's nothing to do.
160 	 */
161 	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
162 	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
163 		return;
164 
165 	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
166 	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
167 	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
168 	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
169 
170 	if (oduppi != NULL || pi->pi_hwaddrdup) {
171 		/*
172 		 * Our old hardware address was a duplicate.  If we'd been
173 		 * offlined because of it, and our new hardware address is not
174 		 * a duplicate, then bring us online.  Otherwise, `oduppi'
175 		 * must've been the one brought offline; bring it online.
176 		 */
177 		if (pi->pi_hwaddrdup) {
178 			if (duppi == NULL)
179 				(void) phyint_undo_offline(pi);
180 		} else {
181 			assert(oduppi->pi_hwaddrdup);
182 			(void) phyint_undo_offline(oduppi);
183 		}
184 	}
185 
186 	if (duppi != NULL && !pi->pi_hwaddrdup) {
187 		/*
188 		 * Our new hardware address was a duplicate and we're not
189 		 * yet flagged as a duplicate; bring us offline.
190 		 */
191 		pi->pi_hwaddrdup = _B_TRUE;
192 		(void) phyint_offline(pi, 0);
193 	}
194 }
195 
196 /*
197  * Initialize information about the underlying link for `pi', and set us
198  * up to be notified about future changes.  Returns _B_TRUE on success.
199  */
200 boolean_t
201 phyint_link_init(struct phyint *pi)
202 {
203 	int retval;
204 	uint_t notes;
205 	const char *errmsg;
206 	dlpi_notifyid_t id;
207 
208 	pi->pi_notes = 0;
209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
210 	if (retval != DLPI_SUCCESS) {
211 		pi->pi_dh = NULL;
212 		errmsg = "cannot open";
213 		goto failed;
214 	}
215 
216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
218 	    &pi->pi_hwaddrlen);
219 	if (retval != DLPI_SUCCESS) {
220 		errmsg = "cannot get hardware address";
221 		goto failed;
222 	}
223 
224 	retval = dlpi_bind(pi->pi_dh, DLPI_ANY_SAP, NULL);
225 	if (retval != DLPI_SUCCESS) {
226 		errmsg = "cannot bind to DLPI_ANY_SAP";
227 		goto failed;
228 	}
229 
230 	/*
231 	 * Check if the link supports DLPI link state notifications.  For
232 	 * historical reasons, the actual changes are tracked through routing
233 	 * sockets, so we immediately disable the notification upon success.
234 	 */
235 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
236 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
237 	if (retval == DLPI_SUCCESS) {
238 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
239 		pi->pi_notes |= notes;
240 	}
241 
242 	/*
243 	 * Enable notification of hardware address changes to keep pi_hwaddr
244 	 * up-to-date and track if we need to offline/undo-offline phyints.
245 	 */
246 	notes = DL_NOTE_PHYS_ADDR;
247 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
248 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
249 		pi->pi_notes |= notes;
250 
251 	return (_B_TRUE);
252 failed:
253 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
254 	if (pi->pi_dh != NULL) {
255 		dlpi_close(pi->pi_dh);
256 		pi->pi_dh = NULL;
257 	}
258 	return (_B_FALSE);
259 }
260 
261 /*
262  * Close use of link on `pi'.
263  */
264 void
265 phyint_link_close(struct phyint *pi)
266 {
267 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
268 		(void) poll_remove(dlpi_fd(pi->pi_dh));
269 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
270 	}
271 
272 	/*
273 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
274 	 * properly report the link state even when offline (which is possible
275 	 * since we use IFF_RUNNING to track link state).
276 	 */
277 	dlpi_close(pi->pi_dh);
278 	pi->pi_dh = NULL;
279 }
280 
281 /* Return the phyint instance with the given name and the given family */
282 struct phyint_instance *
283 phyint_inst_lookup(int af, char *name)
284 {
285 	struct phyint *pi;
286 
287 	if (debug & D_PHYINT)
288 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
289 
290 	assert(af == AF_INET || af == AF_INET6);
291 
292 	pi = phyint_lookup(name);
293 	if (pi == NULL)
294 		return (NULL);
295 
296 	return (PHYINT_INSTANCE(pi, af));
297 }
298 
299 struct phyint_group *
300 phyint_group_lookup(const char *pg_name)
301 {
302 	struct phyint_group *pg;
303 
304 	if (debug & D_PHYINT)
305 		logdebug("phyint_group_lookup(%s)\n", pg_name);
306 
307 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
308 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
309 			break;
310 	}
311 	return (pg);
312 }
313 
314 /*
315  * Insert the phyint in the linked list of all phyints. If the phyint belongs
316  * to some group, insert it in the phyint group list.
317  */
318 static void
319 phyint_insert(struct phyint *pi, struct phyint_group *pg)
320 {
321 	if (debug & D_PHYINT)
322 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
323 
324 	/* Insert the phyint at the head of the 'all phyints' list */
325 	pi->pi_next = phyints;
326 	pi->pi_prev = NULL;
327 	if (phyints != NULL)
328 		phyints->pi_prev = pi;
329 	phyints = pi;
330 
331 	/*
332 	 * Insert the phyint at the head of the 'phyint_group members' list
333 	 * of the phyint group to which it belongs.
334 	 */
335 	pi->pi_pgnext = NULL;
336 	pi->pi_pgprev = NULL;
337 	pi->pi_group = pg;
338 
339 	pi->pi_pgnext = pg->pg_phyint;
340 	if (pi->pi_pgnext != NULL)
341 		pi->pi_pgnext->pi_pgprev = pi;
342 	pg->pg_phyint = pi;
343 
344 	/* Refresh the group state now that this phyint has been added */
345 	phyint_group_refresh_state(pg);
346 
347 	pg->pg_sig++;
348 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
349 }
350 
351 /* Insert the phyint instance in the linked list of all phyint instances. */
352 static void
353 phyint_inst_insert(struct phyint_instance *pii)
354 {
355 	if (debug & D_PHYINT) {
356 		logdebug("phyint_inst_insert(%s %s)\n",
357 		    AF_STR(pii->pii_af), pii->pii_name);
358 	}
359 
360 	/*
361 	 * Insert the phyint at the head of the 'all phyint instances' list.
362 	 */
363 	pii->pii_next = phyint_instances;
364 	pii->pii_prev = NULL;
365 	if (phyint_instances != NULL)
366 		phyint_instances->pii_prev = pii;
367 	phyint_instances = pii;
368 }
369 
370 /*
371  * Create a new phyint with the given parameters. Also insert it into
372  * the list of all phyints and the list of phyint group members by calling
373  * phyint_insert().
374  */
375 static struct phyint *
376 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
377     uint64_t flags)
378 {
379 	struct phyint *pi;
380 
381 	pi = calloc(1, sizeof (struct phyint));
382 	if (pi == NULL) {
383 		logperror("phyint_create: calloc");
384 		return (NULL);
385 	}
386 
387 	/*
388 	 * Record the phyint values.
389 	 */
390 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
391 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
392 	pi->pi_ifindex = ifindex;
393 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
394 
395 	/*
396 	 * If the interface is offline, we set the state to PI_OFFLINE.
397 	 * Otherwise, we optimistically start in the PI_RUNNING state.  Later
398 	 * (in process_link_state_changes()), we will adjust this to match the
399 	 * current state of the link.  Further, if test addresses are
400 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
401 	 * to either PI_RUNNING or PI_FAILED depending on the probe results.
402 	 */
403 	pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING;
404 	pi->pi_flags = PHYINT_FLAGS(flags);
405 
406 	/*
407 	 * Initialise the link state.  The link state is initialised to
408 	 * up, so that if the link is down when IPMP starts monitoring
409 	 * the interface, it will appear as though there has been a
410 	 * transition from the link up to link down.  This avoids
411 	 * having to treat this situation as a special case.
412 	 */
413 	INIT_LINK_STATE(pi);
414 
415 	if (!phyint_link_init(pi)) {
416 		free(pi);
417 		return (NULL);
418 	}
419 
420 	/*
421 	 * Insert the phyint in the list of all phyints, and the
422 	 * list of phyint group members
423 	 */
424 	phyint_insert(pi, pg);
425 
426 	return (pi);
427 }
428 
429 /*
430  * Create a new phyint instance belonging to the phyint 'pi' and address
431  * family 'af'. Also insert it into the list of all phyint instances by
432  * calling phyint_inst_insert().
433  */
434 static struct phyint_instance *
435 phyint_inst_create(struct phyint *pi, int af)
436 {
437 	struct phyint_instance *pii;
438 
439 	pii = calloc(1, sizeof (struct phyint_instance));
440 	if (pii == NULL) {
441 		logperror("phyint_inst_create: calloc");
442 		return (NULL);
443 	}
444 
445 	/*
446 	 * Attach the phyint instance to the phyint.
447 	 * Set the back pointers as well
448 	 */
449 	pii->pii_phyint = pi;
450 	if (af == AF_INET)
451 		pi->pi_v4 = pii;
452 	else
453 		pi->pi_v6 = pii;
454 
455 	pii->pii_in_use = 1;
456 	pii->pii_probe_sock = -1;
457 	pii->pii_snxt = 1;
458 	pii->pii_af = af;
459 	pii->pii_fd_hrtime = gethrtime() +
460 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
461 	pii->pii_flags = pi->pi_flags;
462 
463 	/* Insert the phyint instance in the list of all phyint instances. */
464 	phyint_inst_insert(pii);
465 	return (pii);
466 }
467 
468 /*
469  * Change the state of phyint `pi' to state `state'.
470  */
471 void
472 phyint_chstate(struct phyint *pi, enum pi_state state)
473 {
474 	/*
475 	 * To simplify things, some callers always set a given state
476 	 * regardless of the previous state of the phyint (e.g., setting
477 	 * PI_RUNNING when it's already set).  We shouldn't bother
478 	 * generating an event or consuming a signature for these, since
479 	 * the actual state of the interface is unchanged.
480 	 */
481 	if (pi->pi_state == state)
482 		return;
483 
484 	pi->pi_state = state;
485 	phyint_changed(pi);
486 }
487 
488 /*
489  * Note that `pi' has changed state.
490  */
491 void
492 phyint_changed(struct phyint *pi)
493 {
494 	pi->pi_group->pg_sig++;
495 	(void) phyint_state_event(pi->pi_group, pi);
496 }
497 
498 /*
499  * Insert the phyint group in the linked list of all phyint groups
500  * at the head of the list
501  */
502 void
503 phyint_group_insert(struct phyint_group *pg)
504 {
505 	pg->pg_next = phyint_groups;
506 	pg->pg_prev = NULL;
507 	if (phyint_groups != NULL)
508 		phyint_groups->pg_prev = pg;
509 	phyint_groups = pg;
510 
511 	phyint_grouplistsig++;
512 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
513 }
514 
515 /*
516  * Create a new phyint group called 'name'.
517  */
518 struct phyint_group *
519 phyint_group_create(const char *name)
520 {
521 	struct	phyint_group *pg;
522 
523 	if (debug & D_PHYINT)
524 		logdebug("phyint_group_create(%s)\n", name);
525 
526 	pg = calloc(1, sizeof (struct phyint_group));
527 	if (pg == NULL) {
528 		logperror("phyint_group_create: calloc");
529 		return (NULL);
530 	}
531 
532 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
533 	pg->pg_sig = gensig();
534 	pg->pg_fdt = user_failure_detection_time;
535 	pg->pg_probeint = user_probe_interval;
536 	pg->pg_in_use = _B_TRUE;
537 
538 	/*
539 	 * Normal groups always start in the PG_FAILED state since they
540 	 * have no active interfaces.  In contrast, anonymous groups are
541 	 * heterogeneous and thus always PG_OK.
542 	 */
543 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
544 
545 	return (pg);
546 }
547 
548 /*
549  * Change the state of the phyint group `pg' to state `state'.
550  */
551 void
552 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
553 {
554 	assert(pg != phyint_anongroup);
555 
556 	/*
557 	 * To simplify things, some callers always set a given state
558 	 * regardless of the previous state of the group (e.g., setting
559 	 * PG_DEGRADED when it's already set).  We shouldn't bother
560 	 * generating an event or consuming a signature for these, since
561 	 * the actual state of the group is unchanged.
562 	 */
563 	if (pg->pg_state == state)
564 		return;
565 
566 	pg->pg_state = state;
567 
568 	switch (state) {
569 	case PG_FAILED:
570 		/*
571 		 * We can never know with certainty that a group has
572 		 * failed.  It is possible that all known targets have
573 		 * failed simultaneously, and new targets have come up
574 		 * instead. If the targets are routers then router
575 		 * discovery will kick in, and we will see the new routers
576 		 * thru routing socket messages. But if the targets are
577 		 * hosts, we have to discover it by multicast.	So flush
578 		 * all the host targets. The next probe will send out a
579 		 * multicast echo request. If this is a group failure, we
580 		 * will still not see any response, otherwise the group
581 		 * will be repaired after we get NUM_PROBE_REPAIRS
582 		 * consecutive unicast replies on any phyint.
583 		 */
584 		target_flush_hosts(pg);
585 		break;
586 
587 	case PG_OK:
588 	case PG_DEGRADED:
589 		break;
590 
591 	default:
592 		logerr("phyint_group_chstate: invalid group state %d; "
593 		    "aborting\n", state);
594 		abort();
595 	}
596 
597 	pg->pg_sig++;
598 	(void) phyint_group_state_event(pg);
599 }
600 
601 /*
602  * Create a new phyint instance and initialize it from the values supplied by
603  * the kernel. Always check for ENXIO before logging any error, because the
604  * interface could have vanished after completion of SIOCGLIFCONF.
605  * Return values:
606  *	pointer to the phyint instance on success
607  *	NULL on failure Eg. if the phyint instance is not found in the kernel
608  */
609 struct phyint_instance *
610 phyint_inst_init_from_k(int af, char *pi_name)
611 {
612 	char	pg_name[LIFNAMSIZ + 1];
613 	int	ifsock;
614 	uint_t	ifindex;
615 	uint64_t	flags;
616 	struct lifreq	lifr;
617 	struct phyint	*pi;
618 	struct phyint_instance	*pii;
619 	boolean_t	pi_created;
620 	struct phyint_group	*pg;
621 
622 retry:
623 	pii = NULL;
624 	pi = NULL;
625 	pg = NULL;
626 	pi_created = _B_FALSE;
627 
628 	if (debug & D_PHYINT) {
629 		logdebug("phyint_inst_init_from_k(%s %s)\n",
630 		    AF_STR(af), pi_name);
631 	}
632 
633 	assert(af == AF_INET || af == AF_INET6);
634 
635 	/* Get the socket for doing ioctls */
636 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
637 
638 	/*
639 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
640 	 * meta-interfaces, point-to-point interfaces, and interfaces
641 	 * that can't support multicast.
642 	 */
643 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
644 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
645 		if (errno != ENXIO) {
646 			logperror("phyint_inst_init_from_k:"
647 			    " ioctl (get flags)");
648 		}
649 		return (NULL);
650 	}
651 	flags = lifr.lifr_flags;
652 	if (!(flags & IFF_MULTICAST) ||
653 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
654 		return (NULL);
655 
656 	/*
657 	 * Get the ifindex for recording later in our tables, in case we need
658 	 * to create a new phyint.
659 	 */
660 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
661 		if (errno != ENXIO) {
662 			logperror("phyint_inst_init_from_k: "
663 			    " ioctl (get lifindex)");
664 		}
665 		return (NULL);
666 	}
667 	ifindex = lifr.lifr_index;
668 
669 	/*
670 	 * Get the phyint group name of this phyint, from the kernel.
671 	 */
672 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
673 		if (errno != ENXIO) {
674 			logperror("phyint_inst_init_from_k: "
675 			    "ioctl (get group name)");
676 		}
677 		return (NULL);
678 	}
679 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
680 
681 	/*
682 	 * If the phyint is not part of any group, pg_name is the
683 	 * null string. If 'track_all_phyints' is false, there is no
684 	 * need to create a phyint.
685 	 */
686 	if (pg_name[0] == '\0' && !track_all_phyints) {
687 		/*
688 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
689 		 * set, reset them. These flags shouldn't be set if in.mpathd
690 		 * isn't tracking the interface.
691 		 */
692 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
693 			lifr.lifr_flags = flags &
694 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
695 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
696 				if (errno != ENXIO) {
697 					logperror("phyint_inst_init_from_k:"
698 					    " ioctl (set flags)");
699 				}
700 			}
701 		}
702 		return (NULL);
703 	}
704 
705 	/*
706 	 * We need to create a new phyint instance.  We may also need to
707 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
708 	 * an underlying interface before it found its IPMP meta-interface.
709 	 * Note that we keep any created groups even if phyint_inst_from_k()
710 	 * fails since a group's existence is not dependent on the ability of
711 	 * in.mpathd to the track the group's interfaces.
712 	 */
713 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
714 		if ((pg = phyint_group_create(pg_name)) == NULL) {
715 			logerr("phyint_inst_init_from_k: cannot create group "
716 			    "%s\n", pg_name);
717 			return (NULL);
718 		}
719 		phyint_group_insert(pg);
720 	}
721 
722 	/*
723 	 * Lookup the phyint. If the phyint does not exist create it.
724 	 */
725 	pi = phyint_lookup(pi_name);
726 	if (pi == NULL) {
727 		pi = phyint_create(pi_name, pg, ifindex, flags);
728 		if (pi == NULL) {
729 			logerr("phyint_inst_init_from_k:"
730 			    " unable to create phyint %s\n", pi_name);
731 			return (NULL);
732 		}
733 		pi_created = _B_TRUE;
734 	} else {
735 		/* The phyint exists already. */
736 		assert(pi_created == _B_FALSE);
737 		/*
738 		 * Normally we should see consistent values for the IPv4 and
739 		 * IPv6 instances, for phyint properties. If we don't, it
740 		 * means things have changed underneath us, and we should
741 		 * resync our tables with the kernel. Check whether the
742 		 * interface index has changed. If so, it is most likely
743 		 * the interface has been unplumbed and replumbed,
744 		 * while we are yet to update our tables. Do it now.
745 		 */
746 		if (pi->pi_ifindex != ifindex) {
747 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
748 			goto retry;
749 		}
750 		assert(PHYINT_INSTANCE(pi, af) == NULL);
751 
752 		/*
753 		 * If the group name seen by the IPv4 and IPv6 instances
754 		 * are different, it is most likely the groupname has
755 		 * changed, while we are yet to update our tables. Do it now.
756 		 */
757 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
758 			phyint_inst_delete(PHYINT_INSTANCE(pi,
759 			    AF_OTHER(af)));
760 			goto retry;
761 		}
762 	}
763 
764 	/*
765 	 * Create a new phyint instance, corresponding to the 'af'
766 	 * passed in.
767 	 */
768 	pii = phyint_inst_create(pi, af);
769 	if (pii == NULL) {
770 		logerr("phyint_inst_init_from_k: unable to create"
771 		    "phyint inst %s\n", pi->pi_name);
772 		if (pi_created)
773 			phyint_delete(pi);
774 
775 		return (NULL);
776 	}
777 
778 	if (pi_created) {
779 		/*
780 		 * If this phyint does not have a unique hardware address in its
781 		 * group, offline it.  (The change_pif_flags() implementation
782 		 * requires that we defer this until after the phyint_instance
783 		 * is created.)
784 		 */
785 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
786 			pi->pi_hwaddrdup = _B_TRUE;
787 			(void) phyint_offline(pi, 0);
788 		}
789 	}
790 
791 	return (pii);
792 }
793 
794 /*
795  * Bind pii_probe_sock to the address associated with pii_probe_logint.
796  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
797  * targets. Do the common part in this function, and complete the
798  * initializations by calling the protocol specific functions
799  * phyint_inst_v{4,6}_sockinit() respectively.
800  *
801  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
802  */
803 boolean_t
804 phyint_inst_sockinit(struct phyint_instance *pii)
805 {
806 	boolean_t success;
807 	struct phyint_group *pg;
808 
809 	if (debug & D_PHYINT) {
810 		logdebug("phyint_inst_sockinit(%s %s)\n",
811 		    AF_STR(pii->pii_af), pii->pii_name);
812 	}
813 
814 	assert(pii->pii_probe_logint != NULL);
815 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
816 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
817 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
818 
819 	/*
820 	 * If the socket is already bound, close pii_probe_sock
821 	 */
822 	if (pii->pii_probe_sock != -1)
823 		close_probe_socket(pii, _B_TRUE);
824 
825 	/*
826 	 * If the phyint is not part of a named group and track_all_phyints is
827 	 * false, simply return.
828 	 */
829 	pg = pii->pii_phyint->pi_group;
830 	if (pg == phyint_anongroup && !track_all_phyints) {
831 		if (debug & D_PHYINT)
832 			logdebug("phyint_inst_sockinit: no group\n");
833 		return (_B_FALSE);
834 	}
835 
836 	/*
837 	 * Initialize the socket by calling the protocol specific function.
838 	 * If it succeeds, add the socket to the poll list.
839 	 */
840 	if (pii->pii_af == AF_INET6)
841 		success = phyint_inst_v6_sockinit(pii);
842 	else
843 		success = phyint_inst_v4_sockinit(pii);
844 
845 	if (success && (poll_add(pii->pii_probe_sock) == 0))
846 		return (_B_TRUE);
847 
848 	/* Something failed, cleanup and return false */
849 	if (pii->pii_probe_sock != -1)
850 		close_probe_socket(pii, _B_FALSE);
851 
852 	return (_B_FALSE);
853 }
854 
855 /*
856  * IPv6 specific part in initializing the pii_probe_sock. This socket is
857  * used to send/receive ICMPv6 probe packets.
858  */
859 static boolean_t
860 phyint_inst_v6_sockinit(struct phyint_instance *pii)
861 {
862 	icmp6_filter_t filter;
863 	int hopcount = 1;
864 	int off = 0;
865 	int on = 1;
866 	struct	sockaddr_in6	testaddr;
867 
868 	/*
869 	 * Open a raw socket with ICMPv6 protocol.
870 	 *
871 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
872 	 * the specified phyint only.  Bind to the test address to ensure that
873 	 * the responses are sent to the specified phyint.
874 	 *
875 	 * Set the hopcount to 1 so that probe packets are not routed.
876 	 * Disable multicast loopback. Set the receive filter to
877 	 * receive only ICMPv6 echo replies.
878 	 */
879 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
880 	if (pii->pii_probe_sock < 0) {
881 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
882 		return (_B_FALSE);
883 	}
884 
885 	bzero(&testaddr, sizeof (testaddr));
886 	testaddr.sin6_family = AF_INET6;
887 	testaddr.sin6_port = 0;
888 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
889 
890 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
891 	    sizeof (testaddr)) < 0) {
892 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
893 		return (_B_FALSE);
894 	}
895 
896 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
897 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
898 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
899 		    " IPV6_MULTICAST_IF");
900 		return (_B_FALSE);
901 	}
902 
903 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
904 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
905 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
906 		    " IPV6_BOUND_IF");
907 		return (_B_FALSE);
908 	}
909 
910 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
911 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
912 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
913 		    " IPV6_UNICAST_HOPS");
914 		return (_B_FALSE);
915 	}
916 
917 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
918 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
919 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
920 		    " IPV6_MULTICAST_HOPS");
921 		return (_B_FALSE);
922 	}
923 
924 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
925 	    (char *)&off, sizeof (off)) < 0) {
926 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
927 		    " IPV6_MULTICAST_LOOP");
928 		return (_B_FALSE);
929 	}
930 
931 	/*
932 	 * Filter out so that we only receive ICMP echo replies
933 	 */
934 	ICMP6_FILTER_SETBLOCKALL(&filter);
935 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
936 
937 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
938 	    (char *)&filter, sizeof (filter)) < 0) {
939 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
940 		    " ICMP6_FILTER");
941 		return (_B_FALSE);
942 	}
943 
944 	/* Enable receipt of hoplimit */
945 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
946 	    &on, sizeof (on)) < 0) {
947 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
948 		    " IPV6_RECVHOPLIMIT");
949 		return (_B_FALSE);
950 	}
951 
952 	/* Enable receipt of timestamp */
953 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
954 	    &on, sizeof (on)) < 0) {
955 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
956 		    " SO_TIMESTAMP");
957 		return (_B_FALSE);
958 	}
959 
960 	return (_B_TRUE);
961 }
962 
963 /*
964  * IPv4 specific part in initializing the pii_probe_sock. This socket is
965  * used to send/receive ICMPv4 probe packets.
966  */
967 static boolean_t
968 phyint_inst_v4_sockinit(struct phyint_instance *pii)
969 {
970 	struct sockaddr_in  testaddr;
971 	char	char_off = 0;
972 	int	ttl = 1;
973 	char	char_ttl = 1;
974 	int	on = 1;
975 
976 	/*
977 	 * Open a raw socket with ICMPv4 protocol.
978 	 *
979 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
980 	 * the specified phyint only.  Bind to the test address to ensure that
981 	 * the responses are sent to the specified phyint.
982 	 *
983 	 * Set the ttl to 1 so that probe packets are not routed.
984 	 * Disable multicast loopback.  Enable receipt of timestamp.
985 	 */
986 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
987 	if (pii->pii_probe_sock < 0) {
988 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
989 		return (_B_FALSE);
990 	}
991 
992 	bzero(&testaddr, sizeof (testaddr));
993 	testaddr.sin_family = AF_INET;
994 	testaddr.sin_port = 0;
995 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
996 	    &testaddr.sin_addr);
997 
998 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
999 	    sizeof (testaddr)) < 0) {
1000 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
1001 		return (_B_FALSE);
1002 	}
1003 
1004 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
1005 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
1006 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1007 		    " IP_BOUND_IF");
1008 		return (_B_FALSE);
1009 	}
1010 
1011 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
1012 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
1013 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1014 		    " IP_MULTICAST_IF");
1015 		return (_B_FALSE);
1016 	}
1017 
1018 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
1019 	    (char *)&ttl, sizeof (ttl)) < 0) {
1020 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1021 		    " IP_TTL");
1022 		return (_B_FALSE);
1023 	}
1024 
1025 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
1026 	    (char *)&char_off, sizeof (char_off)) == -1) {
1027 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1028 		    " IP_MULTICAST_LOOP");
1029 		return (_B_FALSE);
1030 	}
1031 
1032 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
1033 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
1034 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1035 		    " IP_MULTICAST_TTL");
1036 		return (_B_FALSE);
1037 	}
1038 
1039 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
1040 	    sizeof (on)) < 0) {
1041 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1042 		    " SO_TIMESTAMP");
1043 		return (_B_FALSE);
1044 	}
1045 
1046 	return (_B_TRUE);
1047 }
1048 
1049 /*
1050  * Remove the phyint group from the list of 'all phyint groups'
1051  * and free it.
1052  */
1053 void
1054 phyint_group_delete(struct phyint_group *pg)
1055 {
1056 	/*
1057 	 * The anonymous group always exists, even when empty.
1058 	 */
1059 	if (pg == phyint_anongroup)
1060 		return;
1061 
1062 	if (debug & D_PHYINT)
1063 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
1064 
1065 	/*
1066 	 * The phyint group must be empty, and must not have any phyints.
1067 	 * The phyint group must be in the list of all phyint groups
1068 	 */
1069 	assert(pg->pg_phyint == NULL);
1070 	assert(phyint_groups == pg || pg->pg_prev != NULL);
1071 
1072 	if (pg->pg_prev != NULL)
1073 		pg->pg_prev->pg_next = pg->pg_next;
1074 	else
1075 		phyint_groups = pg->pg_next;
1076 
1077 	if (pg->pg_next != NULL)
1078 		pg->pg_next->pg_prev = pg->pg_prev;
1079 
1080 	pg->pg_next = NULL;
1081 	pg->pg_prev = NULL;
1082 
1083 	phyint_grouplistsig++;
1084 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
1085 
1086 	addrlist_free(&pg->pg_addrs);
1087 	free(pg);
1088 }
1089 
1090 /*
1091  * Refresh the state of `pg' based on its current members.
1092  */
1093 void
1094 phyint_group_refresh_state(struct phyint_group *pg)
1095 {
1096 	enum pg_state state;
1097 	enum pg_state origstate = pg->pg_state;
1098 	struct phyint *pi, *usablepi;
1099 	uint_t nif = 0, nusable = 0;
1100 
1101 	/*
1102 	 * Anonymous groups never change state.
1103 	 */
1104 	if (pg == phyint_anongroup)
1105 		return;
1106 
1107 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1108 		nif++;
1109 		if (phyint_is_usable(pi)) {
1110 			nusable++;
1111 			usablepi = pi;
1112 		}
1113 	}
1114 
1115 	if (nusable == 0)
1116 		state = PG_FAILED;
1117 	else if (nif == nusable)
1118 		state = PG_OK;
1119 	else
1120 		state = PG_DEGRADED;
1121 
1122 	phyint_group_chstate(pg, state);
1123 
1124 	/*
1125 	 * If we're shutting down, skip logging messages since otherwise our
1126 	 * shutdown housecleaning will make us report that groups are unusable.
1127 	 */
1128 	if (cleanup_started)
1129 		return;
1130 
1131 	/*
1132 	 * NOTE: We use pg_failmsg_printed rather than origstate since
1133 	 * otherwise at startup we'll log a "now usable" message when the
1134 	 * first usable phyint is added to an empty group.
1135 	 */
1136 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
1137 		assert(origstate == PG_FAILED);
1138 		logerr("At least 1 IP interface (%s) in group %s is now "
1139 		    "usable\n", usablepi->pi_name, pg->pg_name);
1140 		pg->pg_failmsg_printed = _B_FALSE;
1141 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
1142 		logerr("All IP interfaces in group %s are now unusable\n",
1143 		    pg->pg_name);
1144 		pg->pg_failmsg_printed = _B_TRUE;
1145 	}
1146 }
1147 
1148 /*
1149  * Extract information from the kernel about the desired phyint.
1150  * Look only for properties of the phyint and not properties of logints.
1151  * Take appropriate action on the changes.
1152  * Return codes:
1153  *	PI_OK
1154  *		The phyint exists in the kernel and matches our knowledge
1155  *		of the phyint.
1156  *	PI_DELETED
1157  *		The phyint has vanished in the kernel.
1158  *	PI_IFINDEX_CHANGED
1159  *		The phyint's interface index has changed.
1160  *		Ask the caller to delete and recreate the phyint.
1161  *	PI_IOCTL_ERROR
1162  *		Some ioctl error. Don't change anything.
1163  *	PI_GROUP_CHANGED
1164  *		The phyint has changed group.
1165  */
1166 int
1167 phyint_inst_update_from_k(struct phyint_instance *pii)
1168 {
1169 	struct lifreq lifr;
1170 	int	ifsock;
1171 	struct phyint *pi;
1172 
1173 	pi = pii->pii_phyint;
1174 
1175 	if (debug & D_PHYINT) {
1176 		logdebug("phyint_inst_update_from_k(%s %s)\n",
1177 		    AF_STR(pii->pii_af), pi->pi_name);
1178 	}
1179 
1180 	/*
1181 	 * Get the ifindex from the kernel, for comparison with the
1182 	 * value in our tables.
1183 	 */
1184 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
1185 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1186 
1187 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1188 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
1189 		if (errno == ENXIO) {
1190 			return (PI_DELETED);
1191 		} else {
1192 			logperror_pii(pii, "phyint_inst_update_from_k:"
1193 			    " ioctl (get lifindex)");
1194 			return (PI_IOCTL_ERROR);
1195 		}
1196 	}
1197 
1198 	if (lifr.lifr_index != pi->pi_ifindex) {
1199 		/*
1200 		 * The index has changed. Most likely the interface has
1201 		 * been unplumbed and replumbed. Ask the caller to take
1202 		 * appropriate action.
1203 		 */
1204 		if (debug & D_PHYINT) {
1205 			logdebug("phyint_inst_update_from_k:"
1206 			    " old index %d new index %d\n",
1207 			    pi->pi_ifindex, lifr.lifr_index);
1208 		}
1209 		return (PI_IFINDEX_CHANGED);
1210 	}
1211 
1212 	/*
1213 	 * Get the group name from the kernel, for comparison with
1214 	 * the value in our tables.
1215 	 */
1216 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
1217 		if (errno == ENXIO) {
1218 			return (PI_DELETED);
1219 		} else {
1220 			logperror_pii(pii, "phyint_inst_update_from_k:"
1221 			    " ioctl (get groupname)");
1222 			return (PI_IOCTL_ERROR);
1223 		}
1224 	}
1225 
1226 	/*
1227 	 * If the phyint has changed group i.e. if the phyint group name
1228 	 * returned by the kernel is different, ask the caller to delete
1229 	 * and recreate the phyint in the right group
1230 	 */
1231 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
1232 		/* Groupname has changed */
1233 		if (debug & D_PHYINT) {
1234 			logdebug("phyint_inst_update_from_k:"
1235 			    " groupname change\n");
1236 		}
1237 		return (PI_GROUP_CHANGED);
1238 	}
1239 
1240 	/*
1241 	 * Get the current phyint flags from the kernel, and determine what
1242 	 * flags have changed by comparing against our tables.	Note that the
1243 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
1244 	 * that IFF_INACTIVE is really still set on the interface.
1245 	 */
1246 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
1247 		if (errno == ENXIO) {
1248 			return (PI_DELETED);
1249 		} else {
1250 			logperror_pii(pii, "phyint_inst_update_from_k: "
1251 			    " ioctl (get flags)");
1252 			return (PI_IOCTL_ERROR);
1253 		}
1254 	}
1255 
1256 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
1257 	if (pi->pi_v4 != NULL)
1258 		pi->pi_v4->pii_flags = pi->pi_flags;
1259 	if (pi->pi_v6 != NULL)
1260 		pi->pi_v6->pii_flags = pi->pi_flags;
1261 
1262 	/*
1263 	 * Make sure the IFF_FAILED flag is set if and only if we think
1264 	 * the interface should be failed.
1265 	 */
1266 	if (pi->pi_flags & IFF_FAILED) {
1267 		if (pi->pi_state == PI_RUNNING)
1268 			(void) change_pif_flags(pi, 0, IFF_FAILED);
1269 	} else {
1270 		if (pi->pi_state == PI_FAILED)
1271 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1272 	}
1273 
1274 	/* No change in phyint status */
1275 	return (PI_OK);
1276 }
1277 
1278 /*
1279  * Delete the phyint. Remove it from the list of all phyints, and the
1280  * list of phyint group members.
1281  */
1282 static void
1283 phyint_delete(struct phyint *pi)
1284 {
1285 	struct phyint *pi2;
1286 	struct phyint_group *pg = pi->pi_group;
1287 
1288 	if (debug & D_PHYINT)
1289 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1290 
1291 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1292 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1293 
1294 	/*
1295 	 * The phyint must belong to a group.
1296 	 */
1297 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1298 
1299 	/* The phyint must be in the list of all phyints */
1300 	assert(phyints == pi || pi->pi_prev != NULL);
1301 
1302 	/* Remove the phyint from the phyint group list */
1303 	pg->pg_sig++;
1304 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1305 
1306 	if (pi->pi_pgprev == NULL) {
1307 		/* Phyint is the 1st in the phyint group list */
1308 		pg->pg_phyint = pi->pi_pgnext;
1309 	} else {
1310 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1311 	}
1312 	if (pi->pi_pgnext != NULL)
1313 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1314 	pi->pi_pgnext = NULL;
1315 	pi->pi_pgprev = NULL;
1316 
1317 	/* Refresh the group state now that this phyint has been removed */
1318 	phyint_group_refresh_state(pg);
1319 
1320 	/* Remove the phyint from the global list of phyints */
1321 	if (pi->pi_prev == NULL) {
1322 		/* Phyint is the 1st in the list */
1323 		phyints = pi->pi_next;
1324 	} else {
1325 		pi->pi_prev->pi_next = pi->pi_next;
1326 	}
1327 	if (pi->pi_next != NULL)
1328 		pi->pi_next->pi_prev = pi->pi_prev;
1329 	pi->pi_next = NULL;
1330 	pi->pi_prev = NULL;
1331 
1332 	/*
1333 	 * See if another phyint in the group had been offlined because
1334 	 * it was a dup of `pi' -- and if so, online it.
1335 	 */
1336 	if (!pi->pi_hwaddrdup &&
1337 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1338 		assert(pi2->pi_hwaddrdup);
1339 		(void) phyint_undo_offline(pi2);
1340 	}
1341 	phyint_link_close(pi);
1342 	free(pi);
1343 }
1344 
1345 /*
1346  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
1347  * group.  Returns an IPMP error code.
1348  */
1349 int
1350 phyint_offline(struct phyint *pi, uint_t minred)
1351 {
1352 	unsigned int nusable = 0;
1353 	struct phyint *pi2;
1354 	struct phyint_group *pg = pi->pi_group;
1355 
1356 	/*
1357 	 * Verify that enough usable interfaces in the group would remain.
1358 	 * As a special case, if the group has failed, allow any non-offline
1359 	 * phyints to be offlined.
1360 	 */
1361 	if (pg != phyint_anongroup) {
1362 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1363 			if (pi2 == pi)
1364 				continue;
1365 			if (phyint_is_usable(pi2) ||
1366 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
1367 				nusable++;
1368 		}
1369 	}
1370 	if (nusable < minred)
1371 		return (IPMP_EMINRED);
1372 
1373 	if (!change_pif_flags(pi, IFF_OFFLINE, 0))
1374 		return (IPMP_FAILURE);
1375 
1376 	/*
1377 	 * The interface is now offline, so stop probing it.  Note that
1378 	 * if_mpadm(1M) will down the test addresses, after receiving a
1379 	 * success reply from us. The routing socket message will then make us
1380 	 * close the socket used for sending probes. But it is more logical
1381 	 * that an offlined interface must not be probed, even if it has test
1382 	 * addresses.
1383 	 *
1384 	 * NOTE: stop_probing() also sets PI_OFFLINE.
1385 	 */
1386 	stop_probing(pi);
1387 
1388 	/*
1389 	 * If we're offlining the phyint because it has a duplicate hardware
1390 	 * address, print a warning -- and leave the link open so that we can
1391 	 * be notified of hardware address changes that make it usable again.
1392 	 * Otherwise, close the link so that we won't prevent a detach.
1393 	 */
1394 	if (pi->pi_hwaddrdup) {
1395 		logerr("IP interface %s has a hardware address which is not "
1396 		    "unique in group %s; offlining\n", pi->pi_name,
1397 		    pg->pg_name);
1398 	} else {
1399 		phyint_link_close(pi);
1400 	}
1401 
1402 	/*
1403 	 * If this phyint was preventing another phyint with a duplicate
1404 	 * hardware address from being online, bring that one online now.
1405 	 */
1406 	if (!pi->pi_hwaddrdup &&
1407 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1408 		assert(pi2->pi_hwaddrdup);
1409 		(void) phyint_undo_offline(pi2);
1410 	}
1411 
1412 	/*
1413 	 * If this interface was active, try to activate another INACTIVE
1414 	 * interface in the group.
1415 	 */
1416 	if (!(pi->pi_flags & IFF_INACTIVE))
1417 		phyint_activate_another(pi);
1418 
1419 	return (IPMP_SUCCESS);
1420 }
1421 
1422 /*
1423  * Undo a previous offline of `pi'.  Returns an IPMP error code.
1424  */
1425 int
1426 phyint_undo_offline(struct phyint *pi)
1427 {
1428 	if (pi->pi_state != PI_OFFLINE) {
1429 		errno = EINVAL;
1430 		return (IPMP_FAILURE);
1431 	}
1432 
1433 	/*
1434 	 * If necessary, reinitialize our link information and verify that its
1435 	 * hardware address is still unique across the group.
1436 	 */
1437 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
1438 		errno = EIO;
1439 		return (IPMP_FAILURE);
1440 	}
1441 
1442 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
1443 		pi->pi_hwaddrdup = _B_TRUE;
1444 		return (IPMP_EHWADDRDUP);
1445 	}
1446 
1447 	if (pi->pi_hwaddrdup) {
1448 		logerr("IP interface %s now has a unique hardware address in "
1449 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
1450 		pi->pi_hwaddrdup = _B_FALSE;
1451 	}
1452 
1453 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
1454 		return (IPMP_FAILURE);
1455 
1456 	/*
1457 	 * While the interface was offline, it may have failed (e.g. the link
1458 	 * may have gone down).  phyint_inst_check_for_failure() will have
1459 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
1460 	 * whether the phyint should transition to running.  Note that after
1461 	 * we transition to running, we will start sending probes again (if
1462 	 * test addresses are configured), which may also reveal that the
1463 	 * interface is in fact failed.
1464 	 */
1465 	if (pi->pi_flags & IFF_FAILED) {
1466 		phyint_chstate(pi, PI_FAILED);
1467 	} else {
1468 		/* calls phyint_chstate() */
1469 		phyint_transition_to_running(pi);
1470 	}
1471 
1472 	/*
1473 	 * Give the requestor time to configure test addresses before
1474 	 * complaining that they're missing.
1475 	 */
1476 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
1477 
1478 	return (IPMP_SUCCESS);
1479 }
1480 
1481 /*
1482  * Delete (unlink and free), the phyint instance.
1483  */
1484 void
1485 phyint_inst_delete(struct phyint_instance *pii)
1486 {
1487 	struct phyint *pi = pii->pii_phyint;
1488 
1489 	assert(pi != NULL);
1490 
1491 	if (debug & D_PHYINT) {
1492 		logdebug("phyint_inst_delete(%s %s)\n",
1493 		    AF_STR(pii->pii_af), pi->pi_name);
1494 	}
1495 
1496 	/*
1497 	 * If the phyint instance has associated probe targets
1498 	 * delete all the targets
1499 	 */
1500 	while (pii->pii_targets != NULL)
1501 		target_delete(pii->pii_targets);
1502 
1503 	/*
1504 	 * Delete all the logints associated with this phyint
1505 	 * instance.
1506 	 */
1507 	while (pii->pii_logint != NULL)
1508 		logint_delete(pii->pii_logint);
1509 
1510 	/*
1511 	 * Close the socket used to send probes to targets from this phyint.
1512 	 */
1513 	if (pii->pii_probe_sock != -1)
1514 		close_probe_socket(pii, _B_TRUE);
1515 
1516 	/*
1517 	 * Phyint instance must be in the list of all phyint instances.
1518 	 * Remove phyint instance from the global list of phyint instances.
1519 	 */
1520 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1521 	if (pii->pii_prev == NULL) {
1522 		/* Phyint is the 1st in the list */
1523 		phyint_instances = pii->pii_next;
1524 	} else {
1525 		pii->pii_prev->pii_next = pii->pii_next;
1526 	}
1527 	if (pii->pii_next != NULL)
1528 		pii->pii_next->pii_prev = pii->pii_prev;
1529 	pii->pii_next = NULL;
1530 	pii->pii_prev = NULL;
1531 
1532 	/*
1533 	 * Reset the phyint instance pointer in the phyint.
1534 	 * If this is the last phyint instance (being deleted) on this
1535 	 * phyint, then delete the phyint.
1536 	 */
1537 	if (pii->pii_af == AF_INET)
1538 		pi->pi_v4 = NULL;
1539 	else
1540 		pi->pi_v6 = NULL;
1541 
1542 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1543 		phyint_delete(pi);
1544 
1545 	free(pii);
1546 }
1547 
1548 static void
1549 phyint_inst_print(struct phyint_instance *pii)
1550 {
1551 	struct logint *li;
1552 	struct target *tg;
1553 	char abuf[INET6_ADDRSTRLEN];
1554 	int most_recent;
1555 	int i;
1556 
1557 	if (pii->pii_phyint == NULL) {
1558 		logdebug("pii->pi_phyint NULL can't print\n");
1559 		return;
1560 	}
1561 
1562 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1563 	    "sock %x in_use %d\n",
1564 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1565 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1566 	    pii->pii_in_use);
1567 
1568 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1569 		logint_print(li);
1570 
1571 	logdebug("\n");
1572 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1573 		target_print(tg);
1574 
1575 	if (pii->pii_targets == NULL)
1576 		logdebug("pi_targets NULL\n");
1577 
1578 	if (pii->pii_target_next != NULL) {
1579 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1580 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1581 		    abuf, sizeof (abuf)));
1582 	} else {
1583 		logdebug("pi_target_next NULL\n");
1584 	}
1585 
1586 	if (pii->pii_rtt_target_next != NULL) {
1587 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1588 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1589 		    abuf, sizeof (abuf)));
1590 	} else {
1591 		logdebug("pi_rtt_target_next NULL\n");
1592 	}
1593 
1594 	if (pii->pii_targets != NULL) {
1595 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1596 
1597 		i = most_recent;
1598 		do {
1599 			if (pii->pii_probes[i].pr_target != NULL) {
1600 				logdebug("#%d target %s ", i,
1601 				    pr_addr(pii->pii_af,
1602 				    pii->pii_probes[i].pr_target->tg_address,
1603 				    abuf, sizeof (abuf)));
1604 			} else {
1605 				logdebug("#%d target NULL ", i);
1606 			}
1607 			logdebug("time_start %lld status %d "
1608 			    "time_ackproc %lld time_lost %u",
1609 			    pii->pii_probes[i].pr_hrtime_start,
1610 			    pii->pii_probes[i].pr_status,
1611 			    pii->pii_probes[i].pr_hrtime_ackproc,
1612 			    pii->pii_probes[i].pr_time_lost);
1613 			i = PROBE_INDEX_PREV(i);
1614 		} while (i != most_recent);
1615 	}
1616 }
1617 
1618 /*
1619  * Lookup a logint based on the logical interface name, on the given
1620  * phyint instance.
1621  */
1622 static struct logint *
1623 logint_lookup(struct phyint_instance *pii, char *name)
1624 {
1625 	struct logint *li;
1626 
1627 	if (debug & D_LOGINT) {
1628 		logdebug("logint_lookup(%s, %s)\n",
1629 		    AF_STR(pii->pii_af), name);
1630 	}
1631 
1632 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1633 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1634 			break;
1635 	}
1636 	return (li);
1637 }
1638 
1639 /*
1640  * Insert a logint at the head of the list of logints of the given
1641  * phyint instance
1642  */
1643 static void
1644 logint_insert(struct phyint_instance *pii, struct logint *li)
1645 {
1646 	li->li_next = pii->pii_logint;
1647 	li->li_prev = NULL;
1648 	if (pii->pii_logint != NULL)
1649 		pii->pii_logint->li_prev = li;
1650 	pii->pii_logint = li;
1651 	li->li_phyint_inst = pii;
1652 }
1653 
1654 /*
1655  * Create a new named logint, on the specified phyint instance.
1656  */
1657 static struct logint *
1658 logint_create(struct phyint_instance *pii, char *name)
1659 {
1660 	struct logint *li;
1661 
1662 	if (debug & D_LOGINT) {
1663 		logdebug("logint_create(%s %s %s)\n",
1664 		    AF_STR(pii->pii_af), pii->pii_name, name);
1665 	}
1666 
1667 	li = calloc(1, sizeof (struct logint));
1668 	if (li == NULL) {
1669 		logperror("logint_create: calloc");
1670 		return (NULL);
1671 	}
1672 
1673 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1674 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1675 	logint_insert(pii, li);
1676 	return (li);
1677 }
1678 
1679 /*
1680  * Initialize the logint based on the data returned by the kernel.
1681  */
1682 void
1683 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1684 {
1685 	int	ifsock;
1686 	uint64_t flags;
1687 	uint64_t saved_flags;
1688 	struct	logint	*li;
1689 	struct lifreq	lifr;
1690 	struct in6_addr	test_subnet;
1691 	struct in6_addr	testaddr;
1692 	int	test_subnet_len;
1693 	struct sockaddr_in6	*sin6;
1694 	struct sockaddr_in	*sin;
1695 	char abuf[INET6_ADDRSTRLEN];
1696 	boolean_t  ptp = _B_FALSE;
1697 	struct in6_addr tgaddr;
1698 
1699 	if (debug & D_LOGINT) {
1700 		logdebug("logint_init_from_k(%s %s)\n",
1701 		    AF_STR(pii->pii_af), li_name);
1702 	}
1703 
1704 	/* Get the socket for doing ioctls */
1705 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1706 
1707 	/*
1708 	 * Get the flags from the kernel. Also serves as a check whether
1709 	 * the logical still exists. If it doesn't exist, no need to proceed
1710 	 * any further. li_in_use will make the caller clean up the logint
1711 	 */
1712 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1713 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1714 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1715 		/* Interface may have vanished */
1716 		if (errno != ENXIO) {
1717 			logperror_pii(pii, "logint_init_from_k: "
1718 			    "ioctl (get flags)");
1719 		}
1720 		return;
1721 	}
1722 
1723 	flags = lifr.lifr_flags;
1724 
1725 	/*
1726 	 * Verified the logint exists. Now lookup the logint in our tables.
1727 	 * If it does not exist, create a new logint.
1728 	 */
1729 	li = logint_lookup(pii, li_name);
1730 	if (li == NULL) {
1731 		li = logint_create(pii, li_name);
1732 		if (li == NULL) {
1733 			/*
1734 			 * Pretend the interface does not exist
1735 			 * in the kernel
1736 			 */
1737 			return;
1738 		}
1739 	}
1740 
1741 	/*
1742 	 * Update li->li_flags with the new flags, after saving the old
1743 	 * value. This is used later to check what flags has changed and
1744 	 * take any action
1745 	 */
1746 	saved_flags = li->li_flags;
1747 	li->li_flags = flags;
1748 
1749 	/*
1750 	 * Get the address, prefix, prefixlength and update the logint.
1751 	 * Check if anything has changed. If the logint used for the
1752 	 * test address has changed, take suitable action.
1753 	 */
1754 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1755 		/* Interface may have vanished */
1756 		if (errno != ENXIO) {
1757 			logperror_li(li, "logint_init_from_k: (get addr)");
1758 		}
1759 		goto error;
1760 	}
1761 
1762 	if (pii->pii_af == AF_INET) {
1763 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1764 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1765 	} else {
1766 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1767 		testaddr = sin6->sin6_addr;
1768 	}
1769 
1770 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1771 		/* Interface may have vanished */
1772 		if (errno != ENXIO)
1773 			logperror_li(li, "logint_init_from_k: (get subnet)");
1774 		goto error;
1775 	}
1776 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
1777 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1778 		test_subnet = sin6->sin6_addr;
1779 		test_subnet_len = lifr.lifr_addrlen;
1780 	} else {
1781 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1782 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1783 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
1784 	}
1785 
1786 	/*
1787 	 * If this is the logint corresponding to the test address used for
1788 	 * sending probes, then if anything significant has changed we need to
1789 	 * determine the test address again.  We ignore changes to the
1790 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1791 	 * course.
1792 	 */
1793 	if (pii->pii_probe_logint == li) {
1794 		if (((li->li_flags ^ saved_flags) &
1795 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1796 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1797 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1798 		    &li->li_subnet)) ||
1799 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1800 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1801 			/*
1802 			 * Something significant that affects the testaddress
1803 			 * has changed. Redo the testaddress selection later on
1804 			 * in select_test_ifs(). For now do the cleanup and
1805 			 * set pii_probe_logint to NULL.
1806 			 */
1807 			if (pii->pii_probe_sock != -1)
1808 				close_probe_socket(pii, _B_TRUE);
1809 			pii->pii_probe_logint = NULL;
1810 		}
1811 	}
1812 
1813 
1814 	/* Update the logint with the values obtained from the kernel.	*/
1815 	li->li_addr = testaddr;
1816 	li->li_in_use = 1;
1817 	if (ptp) {
1818 		li->li_dstaddr = tgaddr;
1819 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1820 		    IP_ABITS : IPV6_ABITS;
1821 	} else {
1822 		li->li_subnet = test_subnet;
1823 		li->li_subnet_len = test_subnet_len;
1824 	}
1825 
1826 	if (debug & D_LOGINT)
1827 		logint_print(li);
1828 
1829 	return;
1830 
1831 error:
1832 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1833 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1834 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1835 	logint_delete(li);
1836 }
1837 
1838 /*
1839  * Delete (unlink and free) a logint.
1840  */
1841 void
1842 logint_delete(struct logint *li)
1843 {
1844 	struct phyint_instance *pii;
1845 
1846 	pii = li->li_phyint_inst;
1847 	assert(pii != NULL);
1848 
1849 	if (debug & D_LOGINT) {
1850 		int af;
1851 		char abuf[INET6_ADDRSTRLEN];
1852 
1853 		af = pii->pii_af;
1854 		logdebug("logint_delete(%s %s %s/%u)\n",
1855 		    AF_STR(af), li->li_name,
1856 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1857 		    li->li_subnet_len);
1858 	}
1859 
1860 	/* logint must be in the list of logints */
1861 	assert(pii->pii_logint == li || li->li_prev != NULL);
1862 
1863 	/* Remove the logint from the list of logints  */
1864 	if (li->li_prev == NULL) {
1865 		/* logint is the 1st in the list */
1866 		pii->pii_logint = li->li_next;
1867 	} else {
1868 		li->li_prev->li_next = li->li_next;
1869 	}
1870 	if (li->li_next != NULL)
1871 		li->li_next->li_prev = li->li_prev;
1872 	li->li_next = NULL;
1873 	li->li_prev = NULL;
1874 
1875 	/*
1876 	 * If this logint is also being used for probing, then close the
1877 	 * associated socket, if it exists.
1878 	 */
1879 	if (pii->pii_probe_logint == li) {
1880 		if (pii->pii_probe_sock != -1)
1881 			close_probe_socket(pii, _B_TRUE);
1882 		pii->pii_probe_logint = NULL;
1883 	}
1884 
1885 	free(li);
1886 }
1887 
1888 static void
1889 logint_print(struct logint *li)
1890 {
1891 	char abuf[INET6_ADDRSTRLEN];
1892 	int af = li->li_phyint_inst->pii_af;
1893 
1894 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1895 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1896 
1897 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
1898 }
1899 
/*
 * Format `addr' as a printable string in `abuf' (of size `len') and return
 * `abuf'.  If `af' is AF_INET, `addr' is taken to be a v4-mapped address
 * and is printed in IPv4 notation; otherwise it is printed as an IPv6
 * address.
 *
 * If the address cannot be formatted (for instance because `abuf' is too
 * small), `abuf' is set to the empty string rather than left unterminated,
 * so the result is always safe to hand to a "%s" conversion.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	struct in_addr	addr_v4;
	const char	*str;

	if (af == AF_INET) {
		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
		str = inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
	} else {
		str = inet_ntop(AF_INET6, (void *)&addr, abuf, len);
	}

	/* inet_ntop() gives no guarantee about `abuf' when it fails. */
	if (str == NULL && abuf != NULL && len > 0)
		abuf[0] = '\0';

	return (abuf);
}
1913 
1914 /*
1915  * Fill in the sockaddr_storage pointed to by `ssp' with the IP address
1916  * represented by the [`af',`addr'] pair.  Needed because in.mpathd internally
1917  * stores all addresses as in6_addrs, but we don't want to expose that.
1918  */
1919 void
1920 addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
1921 {
1922 	struct sockaddr_in *sinp = (struct sockaddr_in *)ssp;
1923 	struct sockaddr_in6 *sin6p = (struct sockaddr_in6 *)ssp;
1924 
1925 	assert(af == AF_INET || af == AF_INET6);
1926 
1927 	switch (af) {
1928 	case AF_INET:
1929 		(void) memset(sinp, 0, sizeof (*sinp));
1930 		sinp->sin_family = AF_INET;
1931 		IN6_V4MAPPED_TO_INADDR(addr, &sinp->sin_addr);
1932 		break;
1933 	case AF_INET6:
1934 		(void) memset(sin6p, 0, sizeof (*sin6p));
1935 		sin6p->sin6_family = AF_INET6;
1936 		sin6p->sin6_addr = *addr;
1937 		break;
1938 	}
1939 }
1940 
1941 /* Lookup target on its address */
1942 struct target *
1943 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1944 {
1945 	struct target *tg;
1946 
1947 	if (debug & D_TARGET) {
1948 		char abuf[INET6_ADDRSTRLEN];
1949 
1950 		logdebug("target_lookup(%s %s): addr %s\n",
1951 		    AF_STR(pii->pii_af), pii->pii_name,
1952 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1953 	}
1954 
1955 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1956 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1957 			break;
1958 	}
1959 	return (tg);
1960 }
1961 
1962 /*
1963  * Find and return the next active target, for the next probe.
1964  * If no active targets are available, return NULL.
1965  */
1966 struct target *
1967 target_next(struct target *tg)
1968 {
1969 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1970 	struct	target	*marker = tg;
1971 	hrtime_t now;
1972 
1973 	now = gethrtime();
1974 
1975 	/*
1976 	 * Target must be in the list of targets for this phyint
1977 	 * instance.
1978 	 */
1979 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1980 	assert(pii->pii_targets != NULL);
1981 
1982 	/* Return the next active target */
1983 	do {
1984 		/*
1985 		 * Go to the next target. If we hit the end,
1986 		 * reset the ptr to the head
1987 		 */
1988 		tg = tg->tg_next;
1989 		if (tg == NULL)
1990 			tg = pii->pii_targets;
1991 
1992 		assert(TG_STATUS_VALID(tg->tg_status));
1993 
1994 		switch (tg->tg_status) {
1995 		case TG_ACTIVE:
1996 			return (tg);
1997 
1998 		case TG_UNUSED:
1999 			assert(pii->pii_targets_are_routers);
2000 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
2001 				/*
2002 				 * Bubble up the unused target to active
2003 				 */
2004 				tg->tg_status = TG_ACTIVE;
2005 				pii->pii_ntargets++;
2006 				return (tg);
2007 			}
2008 			break;
2009 
2010 		case TG_SLOW:
2011 			assert(pii->pii_targets_are_routers);
2012 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2013 				/*
2014 				 * Bubble up the slow target to unused
2015 				 */
2016 				tg->tg_status = TG_UNUSED;
2017 			}
2018 			break;
2019 
2020 		case TG_DEAD:
2021 			assert(pii->pii_targets_are_routers);
2022 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2023 				/*
2024 				 * Bubble up the dead target to slow
2025 				 */
2026 				tg->tg_status = TG_SLOW;
2027 				tg->tg_latime = now;
2028 			}
2029 			break;
2030 		}
2031 
2032 	} while (tg != marker);
2033 
2034 	return (NULL);
2035 }
2036 
2037 /*
2038  * Select the best available target, that is not already TG_ACTIVE,
2039  * for the caller. The caller will determine whether it wants to
2040  * make the returned target TG_ACTIVE.
2041  * The selection order is as follows.
2042  * 1. pick a TG_UNSED target, if it exists.
2043  * 2. else pick a TG_SLOW target that has recovered, if it exists
2044  * 3. else pick any TG_SLOW target, if it exists
2045  * 4. else pick a TG_DEAD target that has recovered, if it exists
2046  * 5. else pick any TG_DEAD target, if it exists
2047  * 6. else return null
2048  */
2049 static struct target *
2050 target_select_best(struct phyint_instance *pii)
2051 {
2052 	struct target *tg;
2053 	struct target *slow = NULL;
2054 	struct target *dead = NULL;
2055 	struct target *slow_recovered = NULL;
2056 	struct target *dead_recovered = NULL;
2057 	hrtime_t now;
2058 
2059 	now = gethrtime();
2060 
2061 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2062 		assert(TG_STATUS_VALID(tg->tg_status));
2063 
2064 		switch (tg->tg_status) {
2065 		case TG_UNUSED:
2066 			return (tg);
2067 
2068 		case TG_SLOW:
2069 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2070 				slow_recovered = tg;
2071 				/*
2072 				 * Promote the slow_recovered to unused
2073 				 */
2074 				tg->tg_status = TG_UNUSED;
2075 			} else {
2076 				slow = tg;
2077 			}
2078 			break;
2079 
2080 		case TG_DEAD:
2081 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2082 				dead_recovered = tg;
2083 				/*
2084 				 * Promote the dead_recovered to slow
2085 				 */
2086 				tg->tg_status = TG_SLOW;
2087 				tg->tg_latime = now;
2088 			} else {
2089 				dead = tg;
2090 			}
2091 			break;
2092 
2093 		default:
2094 			break;
2095 		}
2096 	}
2097 
2098 	if (slow_recovered != NULL)
2099 		return (slow_recovered);
2100 	else if (slow != NULL)
2101 		return (slow);
2102 	else if (dead_recovered != NULL)
2103 		return (dead_recovered);
2104 	else
2105 		return (dead);
2106 }
2107 
2108 /*
2109  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
2110  * that are active, pick the next best below.
2111  */
2112 static void
2113 target_activate_all(struct phyint_instance *pii)
2114 {
2115 	struct target *tg;
2116 
2117 	assert(pii->pii_ntargets == 0);
2118 	assert(pii->pii_target_next == NULL);
2119 	assert(pii->pii_rtt_target_next == NULL);
2120 	assert(pii->pii_targets_are_routers);
2121 
2122 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
2123 		tg = target_select_best(pii);
2124 		if (tg == NULL) {
2125 			/* We are out of targets */
2126 			return;
2127 		}
2128 
2129 		assert(TG_STATUS_VALID(tg->tg_status));
2130 		assert(tg->tg_status != TG_ACTIVE);
2131 		tg->tg_status = TG_ACTIVE;
2132 		pii->pii_ntargets++;
2133 		if (pii->pii_target_next == NULL) {
2134 			pii->pii_target_next = tg;
2135 			pii->pii_rtt_target_next = tg;
2136 		}
2137 	}
2138 }
2139 
2140 static struct target *
2141 target_first(struct phyint_instance *pii)
2142 {
2143 	struct target *tg;
2144 
2145 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2146 		assert(TG_STATUS_VALID(tg->tg_status));
2147 		if (tg->tg_status == TG_ACTIVE)
2148 			break;
2149 	}
2150 
2151 	return (tg);
2152 }
2153 
2154 /*
2155  * Create a default target entry.
2156  */
2157 void
2158 target_create(struct phyint_instance *pii, struct in6_addr addr,
2159     boolean_t is_router)
2160 {
2161 	struct target *tg;
2162 	struct phyint *pi;
2163 	struct logint *li;
2164 
2165 	if (debug & D_TARGET) {
2166 		char abuf[INET6_ADDRSTRLEN];
2167 
2168 		logdebug("target_create(%s %s, %s)\n",
2169 		    AF_STR(pii->pii_af), pii->pii_name,
2170 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2171 	}
2172 
2173 	/*
2174 	 * If the test address is not yet initialized, do not add
2175 	 * any target, since we cannot determine whether the target
2176 	 * belongs to the same subnet as the test address.
2177 	 */
2178 	li = pii->pii_probe_logint;
2179 	if (li == NULL)
2180 		return;
2181 
2182 	/*
2183 	 * If there are multiple subnets associated with an interface, then
2184 	 * add the target to this phyint instance only if it belongs to the
2185 	 * same subnet as the test address.  This assures us that we will
2186 	 * be able to reach this target through our routing table.
2187 	 */
2188 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
2189 		return;
2190 
2191 	if (pii->pii_targets != NULL) {
2192 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
2193 		if (is_router) {
2194 			if (!pii->pii_targets_are_routers) {
2195 				/*
2196 				 * Prefer router over hosts. Using hosts is a
2197 				 * fallback mechanism, hence delete all host
2198 				 * targets.
2199 				 */
2200 				while (pii->pii_targets != NULL)
2201 					target_delete(pii->pii_targets);
2202 			}
2203 		} else {
2204 			/*
2205 			 * Routers take precedence over hosts. If this
2206 			 * is a router list and we are trying to add a
2207 			 * host, just return. If this is a host list
2208 			 * and if we have sufficient targets, just return
2209 			 */
2210 			if (pii->pii_targets_are_routers ||
2211 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
2212 				return;
2213 		}
2214 	}
2215 
2216 	tg = calloc(1, sizeof (struct target));
2217 	if (tg == NULL) {
2218 		logperror("target_create: calloc");
2219 		return;
2220 	}
2221 
2222 	tg->tg_phyint_inst = pii;
2223 	tg->tg_address = addr;
2224 	tg->tg_in_use = 1;
2225 	tg->tg_rtt_sa = -1;
2226 	tg->tg_num_deferred = 0;
2227 
2228 	/*
2229 	 * If this is the first target, set 'pii_targets_are_routers'
2230 	 * The list of targets is either a list of hosts or list or
2231 	 * routers, but not a mix.
2232 	 */
2233 	if (pii->pii_targets == NULL) {
2234 		assert(pii->pii_ntargets == 0);
2235 		assert(pii->pii_target_next == NULL);
2236 		assert(pii->pii_rtt_target_next == NULL);
2237 		pii->pii_targets_are_routers = is_router ? 1 : 0;
2238 	}
2239 
2240 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
2241 		assert(pii->pii_targets_are_routers);
2242 		assert(pii->pii_target_next != NULL);
2243 		assert(pii->pii_rtt_target_next != NULL);
2244 		tg->tg_status = TG_UNUSED;
2245 	} else {
2246 		if (pii->pii_ntargets == 0) {
2247 			assert(pii->pii_target_next == NULL);
2248 			pii->pii_target_next = tg;
2249 			pii->pii_rtt_target_next = tg;
2250 		}
2251 		pii->pii_ntargets++;
2252 		tg->tg_status = TG_ACTIVE;
2253 	}
2254 
2255 	target_insert(pii, tg);
2256 
2257 	/*
2258 	 * Change state to PI_RUNNING if this phyint instance is capable of
2259 	 * sending and receiving probes -- that is, if we know of at least 1
2260 	 * target, and this phyint instance is probe-capable.  For more
2261 	 * details, see the phyint state diagram in mpd_probe.c.
2262 	 */
2263 	pi = pii->pii_phyint;
2264 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
2265 		if (pi->pi_flags & IFF_FAILED)
2266 			phyint_chstate(pi, PI_FAILED);
2267 		else
2268 			phyint_chstate(pi, PI_RUNNING);
2269 	}
2270 }
2271 
2272 /*
2273  * Add the target address named by `addr' to phyint instance `pii' if it does
2274  * not already exist.  If the target is a router, `is_router' should be set to
2275  * B_TRUE.
2276  */
2277 void
2278 target_add(struct phyint_instance *pii, struct in6_addr addr,
2279     boolean_t is_router)
2280 {
2281 	struct target *tg;
2282 
2283 	if (pii == NULL)
2284 		return;
2285 
2286 	tg = target_lookup(pii, addr);
2287 
2288 	/*
2289 	 * If the target does not exist, create it; target_create() will set
2290 	 * tg_in_use to true.  Even if it exists already, if it's a router
2291 	 * target and we'd previously learned of it through multicast, then we
2292 	 * need to recreate it as a router target.  Otherwise, just set
2293 	 * tg_in_use to to true so that init_router_targets() won't delete it.
2294 	 */
2295 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
2296 		target_create(pii, addr, is_router);
2297 	else if (is_router)
2298 		tg->tg_in_use = 1;
2299 }
2300 
2301 /*
2302  * Insert target at head of linked list of targets for the associated
2303  * phyint instance
2304  */
2305 static void
2306 target_insert(struct phyint_instance *pii, struct target *tg)
2307 {
2308 	tg->tg_next = pii->pii_targets;
2309 	tg->tg_prev = NULL;
2310 	if (tg->tg_next != NULL)
2311 		tg->tg_next->tg_prev = tg;
2312 	pii->pii_targets = tg;
2313 }
2314 
2315 /*
2316  * Delete a target (unlink and free).
2317  */
2318 void
2319 target_delete(struct target *tg)
2320 {
2321 	int af;
2322 	struct phyint_instance	*pii;
2323 	struct phyint_instance	*pii_other;
2324 
2325 	pii = tg->tg_phyint_inst;
2326 	af = pii->pii_af;
2327 
2328 	if (debug & D_TARGET) {
2329 		char abuf[INET6_ADDRSTRLEN];
2330 
2331 		logdebug("target_delete(%s %s, %s)\n",
2332 		    AF_STR(af), pii->pii_name,
2333 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
2334 	}
2335 
2336 	/*
2337 	 * Target must be in the list of targets for this phyint
2338 	 * instance.
2339 	 */
2340 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
2341 
2342 	/*
2343 	 * Reset all references to 'tg' in the probe information
2344 	 * for this phyint.
2345 	 */
2346 	reset_pii_probes(pii, tg);
2347 
2348 	/*
2349 	 * Remove this target from the list of targets of this
2350 	 * phyint instance.
2351 	 */
2352 	if (tg->tg_prev == NULL) {
2353 		pii->pii_targets = tg->tg_next;
2354 	} else {
2355 		tg->tg_prev->tg_next = tg->tg_next;
2356 	}
2357 
2358 	if (tg->tg_next != NULL)
2359 		tg->tg_next->tg_prev = tg->tg_prev;
2360 
2361 	tg->tg_next = NULL;
2362 	tg->tg_prev = NULL;
2363 
2364 	if (tg->tg_status == TG_ACTIVE)
2365 		pii->pii_ntargets--;
2366 
2367 	/*
2368 	 * Adjust the next target to probe, if it points to
2369 	 * to the currently deleted target.
2370 	 */
2371 	if (pii->pii_target_next == tg)
2372 		pii->pii_target_next = target_first(pii);
2373 
2374 	if (pii->pii_rtt_target_next == tg)
2375 		pii->pii_rtt_target_next = target_first(pii);
2376 
2377 	free(tg);
2378 
2379 	/*
2380 	 * The number of active targets pii_ntargets == 0 iff
2381 	 * the next active target pii->pii_target_next == NULL
2382 	 */
2383 	if (pii->pii_ntargets != 0) {
2384 		assert(pii->pii_target_next != NULL);
2385 		assert(pii->pii_rtt_target_next != NULL);
2386 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2387 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2388 		return;
2389 	}
2390 
2391 	/* At this point, we don't have any active targets. */
2392 	assert(pii->pii_target_next == NULL);
2393 	assert(pii->pii_rtt_target_next == NULL);
2394 
2395 	if (pii->pii_targets_are_routers) {
2396 		/*
2397 		 * Activate any TG_SLOW or TG_DEAD router targets,
2398 		 * since we don't have any other targets
2399 		 */
2400 		target_activate_all(pii);
2401 
2402 		if (pii->pii_ntargets != 0) {
2403 			assert(pii->pii_target_next != NULL);
2404 			assert(pii->pii_rtt_target_next != NULL);
2405 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2406 			assert(pii->pii_rtt_target_next->tg_status ==
2407 			    TG_ACTIVE);
2408 			return;
2409 		}
2410 	}
2411 
2412 	/*
2413 	 * If we still don't have any active targets, the list must
2414 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2415 	 * targets. Zero out the probe stats since it will not be
2416 	 * relevant any longer.
2417 	 */
2418 	assert(pii->pii_targets == NULL);
2419 	pii->pii_targets_are_routers = _B_FALSE;
2420 	clear_pii_probe_stats(pii);
2421 	pii_other = phyint_inst_other(pii);
2422 
2423 	/*
2424 	 * If there are no targets on both instances and the interface would
2425 	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
2426 	 * since we cannot probe this phyint any more.  For more details,
2427 	 * please see phyint state diagram in mpd_probe.c.
2428 	 */
2429 	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
2430 	    pii->pii_phyint->pi_state != PI_OFFLINE)
2431 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2432 }
2433 
2434 /*
2435  * Flush the target list of every phyint in the group, if the list
2436  * is a host target list. This is called if group failure is suspected.
2437  * If all targets have failed, multicast will subsequently discover new
2438  * targets. Else it is a group failure.
2439  * Note: This function is a no-op if the list is a router target list.
2440  */
2441 static void
2442 target_flush_hosts(struct phyint_group *pg)
2443 {
2444 	struct phyint *pi;
2445 	struct phyint_instance *pii;
2446 
2447 	if (debug & D_TARGET)
2448 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2449 
2450 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2451 		pii = pi->pi_v4;
2452 		if (pii != NULL && !pii->pii_targets_are_routers) {
2453 			/*
2454 			 * Delete all the targets. When the list becomes
2455 			 * empty, target_delete() will set pii->pii_targets
2456 			 * to NULL.
2457 			 */
2458 			while (pii->pii_targets != NULL)
2459 				target_delete(pii->pii_targets);
2460 		}
2461 		pii = pi->pi_v6;
2462 		if (pii != NULL && !pii->pii_targets_are_routers) {
2463 			/*
2464 			 * Delete all the targets. When the list becomes
2465 			 * empty, target_delete() will set pii->pii_targets
2466 			 * to NULL.
2467 			 */
2468 			while (pii->pii_targets != NULL)
2469 				target_delete(pii->pii_targets);
2470 		}
2471 	}
2472 }
2473 
2474 /*
2475  * Reset all references to 'target' in the probe info, as this target is
2476  * being deleted. The pr_target field is guaranteed to be non-null if
2477  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2478  * pr_target will not be accessed unconditionally.
2479  */
2480 static void
2481 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2482 {
2483 	int i;
2484 
2485 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2486 		if (pii->pii_probes[i].pr_target == tg) {
2487 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
2488 				probe_chstate(&pii->pii_probes[i], pii,
2489 				    PR_LOST);
2490 			}
2491 			pii->pii_probes[i].pr_target = NULL;
2492 		}
2493 	}
2494 
2495 }
2496 
2497 /*
2498  * Clear the probe statistics array.
2499  */
2500 void
2501 clear_pii_probe_stats(struct phyint_instance *pii)
2502 {
2503 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2504 	/* Reset the next probe index in the probe stats array */
2505 	pii->pii_probe_next = 0;
2506 }
2507 
2508 static void
2509 target_print(struct target *tg)
2510 {
2511 	char	abuf[INET6_ADDRSTRLEN];
2512 	char	buf[128];
2513 	char	buf2[128];
2514 	int	af;
2515 	int	i;
2516 
2517 	af = tg->tg_phyint_inst->pii_af;
2518 
2519 	logdebug("Target on %s %s addr %s\n"
2520 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
2521 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2522 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2523 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2524 	    tg->tg_crtt, tg->tg_in_use);
2525 
2526 	buf[0] = '\0';
2527 	for (i = 0; i < tg->tg_num_deferred; i++) {
2528 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2529 		    tg->tg_deferred[i]);
2530 		(void) strlcat(buf, buf2, sizeof (buf));
2531 	}
2532 	logdebug("deferred rtts:%s\n", buf);
2533 }
2534 
2535 void
2536 phyint_inst_print_all(void)
2537 {
2538 	struct phyint_instance *pii;
2539 
2540 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2541 		phyint_inst_print(pii);
2542 	}
2543 }
2544 
2545 /*
2546  * Compare two prefixes that have the same prefix length.
2547  * Fails if the prefix length is unreasonable.
2548  */
2549 boolean_t
2550 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
2551 {
2552 	uchar_t mask;
2553 	int j;
2554 
2555 	if (prefix_len > IPV6_ABITS)
2556 		return (_B_FALSE);
2557 
2558 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2559 		if (p1.s6_addr[j] != p2.s6_addr[j])
2560 			return (_B_FALSE);
2561 
2562 	/* Make the N leftmost bits one */
2563 	mask = 0xff << (8 - prefix_len);
2564 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2565 		return (_B_FALSE);
2566 
2567 	return (_B_TRUE);
2568 }
2569 
2570 /*
2571  * Get the number of UP logints on phyint `pi'.
2572  */
2573 static int
2574 logint_upcount(struct phyint *pi)
2575 {
2576 	struct	logint	*li;
2577 	int count = 0;
2578 
2579 	if (pi->pi_v4 != NULL) {
2580 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
2581 			if (li->li_flags & IFF_UP)
2582 				count++;
2583 		}
2584 	}
2585 
2586 	if (pi->pi_v6 != NULL) {
2587 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
2588 			if (li->li_flags & IFF_UP)
2589 				count++;
2590 		}
2591 	}
2592 
2593 	return (count);
2594 }
2595 
2596 /*
2597  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2598  */
2599 struct phyint_instance *
2600 phyint_inst_other(struct phyint_instance *pii)
2601 {
2602 	if (pii->pii_af == AF_INET)
2603 		return (pii->pii_phyint->pi_v6);
2604 	else
2605 		return (pii->pii_phyint->pi_v4);
2606 }
2607 
2608 /*
2609  * Check whether a phyint is functioning.
2610  */
2611 static boolean_t
2612 phyint_is_functioning(struct phyint *pi)
2613 {
2614 	if (pi->pi_state == PI_RUNNING)
2615 		return (_B_TRUE);
2616 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
2617 }
2618 
2619 /*
2620  * Check whether a phyint is usable.
2621  */
2622 static boolean_t
2623 phyint_is_usable(struct phyint *pi)
2624 {
2625 	if (logint_upcount(pi) == 0)
2626 		return (_B_FALSE);
2627 	return (phyint_is_functioning(pi));
2628 }
2629 
2630 /*
2631  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2632  * Before sending the event, it prepends the current version of the IPMP
2633  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2634  * `nvl' is freed).
2635  */
2636 static int
2637 post_event(const char *subclass, nvlist_t *nvl)
2638 {
2639 	static evchan_t *evchp = NULL;
2640 
2641 	/*
2642 	 * Initialize the event channel if we haven't already done so.
2643 	 */
2644 	if (evchp == NULL) {
2645 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
2646 		if (errno != 0) {
2647 			logerr("cannot create event channel `%s': %s\n",
2648 			    IPMP_EVENT_CHAN, strerror(errno));
2649 			goto failed;
2650 		}
2651 	}
2652 
2653 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2654 	    IPMP_EVENT_CUR_VERSION);
2655 	if (errno != 0) {
2656 		logerr("cannot create `%s' event: %s", subclass,
2657 		    strerror(errno));
2658 		goto failed;
2659 	}
2660 
2661 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
2662 	    "in.mpathd", nvl, EVCH_NOSLEEP);
2663 	if (errno != 0) {
2664 		logerr("cannot send `%s' event: %s\n", subclass,
2665 		    strerror(errno));
2666 		goto failed;
2667 	}
2668 
2669 	nvlist_free(nvl);
2670 	return (0);
2671 failed:
2672 	nvlist_free(nvl);
2673 	return (-1);
2674 }
2675 
2676 /*
2677  * Return the external IPMP state associated with phyint `pi'.
2678  */
2679 static ipmp_if_state_t
2680 ifstate(struct phyint *pi)
2681 {
2682 	switch (pi->pi_state) {
2683 	case PI_NOTARGETS:
2684 		if (pi->pi_flags & IFF_FAILED)
2685 			return (IPMP_IF_FAILED);
2686 		return (IPMP_IF_UNKNOWN);
2687 
2688 	case PI_OFFLINE:
2689 		return (IPMP_IF_OFFLINE);
2690 
2691 	case PI_FAILED:
2692 		return (IPMP_IF_FAILED);
2693 
2694 	case PI_RUNNING:
2695 		return (IPMP_IF_OK);
2696 	}
2697 
2698 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2699 	abort();
2700 	/* NOTREACHED */
2701 }
2702 
2703 /*
2704  * Return the external IPMP interface type associated with phyint `pi'.
2705  */
2706 static ipmp_if_type_t
2707 iftype(struct phyint *pi)
2708 {
2709 	if (pi->pi_flags & IFF_STANDBY)
2710 		return (IPMP_IF_STANDBY);
2711 	else
2712 		return (IPMP_IF_NORMAL);
2713 }
2714 
2715 /*
2716  * Return the external IPMP link state associated with phyint `pi'.
2717  */
2718 static ipmp_if_linkstate_t
2719 iflinkstate(struct phyint *pi)
2720 {
2721 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
2722 		return (IPMP_LINK_UNKNOWN);
2723 
2724 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
2725 }
2726 
2727 /*
2728  * Return the external IPMP probe state associated with phyint `pi'.
2729  */
2730 static ipmp_if_probestate_t
2731 ifprobestate(struct phyint *pi)
2732 {
2733 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
2734 		return (IPMP_PROBE_DISABLED);
2735 
2736 	if (pi->pi_state == PI_FAILED)
2737 		return (IPMP_PROBE_FAILED);
2738 
2739 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
2740 		return (IPMP_PROBE_UNKNOWN);
2741 
2742 	return (IPMP_PROBE_OK);
2743 }
2744 
2745 /*
2746  * Return the external IPMP target mode associated with phyint instance `pii'.
2747  */
2748 static ipmp_if_targmode_t
2749 iftargmode(struct phyint_instance *pii)
2750 {
2751 	if (!PROBE_ENABLED(pii))
2752 		return (IPMP_TARG_DISABLED);
2753 	else if (pii->pii_targets_are_routers)
2754 		return (IPMP_TARG_ROUTES);
2755 	else
2756 		return (IPMP_TARG_MULTICAST);
2757 }
2758 
2759 /*
2760  * Return the external IPMP flags associated with phyint `pi'.
2761  */
2762 static ipmp_if_flags_t
2763 ifflags(struct phyint *pi)
2764 {
2765 	ipmp_if_flags_t flags = 0;
2766 
2767 	if (logint_upcount(pi) == 0)
2768 		flags |= IPMP_IFFLAG_DOWN;
2769 	if (pi->pi_flags & IFF_INACTIVE)
2770 		flags |= IPMP_IFFLAG_INACTIVE;
2771 	if (pi->pi_hwaddrdup)
2772 		flags |= IPMP_IFFLAG_HWADDRDUP;
2773 	if (phyint_is_functioning(pi) && flags == 0)
2774 		flags |= IPMP_IFFLAG_ACTIVE;
2775 
2776 	return (flags);
2777 }
2778 
2779 /*
2780  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
2781  * no test address, 0.0.0.0 is stored.
2782  */
2783 static struct sockaddr_storage *
2784 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
2785 {
2786 	if (PROBE_ENABLED(pii))
2787 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
2788 	else
2789 		addr2storage(AF_INET6, &in6addr_any, ssp);
2790 
2791 	return (ssp);
2792 }
2793 
2794 /*
2795  * Return the external IPMP group state associated with phyint group `pg'.
2796  */
2797 static ipmp_group_state_t
2798 groupstate(struct phyint_group *pg)
2799 {
2800 	switch (pg->pg_state) {
2801 	case PG_FAILED:
2802 		return (IPMP_GROUP_FAILED);
2803 	case PG_DEGRADED:
2804 		return (IPMP_GROUP_DEGRADED);
2805 	case PG_OK:
2806 		return (IPMP_GROUP_OK);
2807 	}
2808 
2809 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
2810 	abort();
2811 	/* NOTREACHED */
2812 }
2813 
2814 /*
2815  * Return the external IPMP probe state associated with probe `ps'.
2816  */
2817 static ipmp_probe_state_t
2818 probestate(struct probe_stats *ps)
2819 {
2820 	switch (ps->pr_status) {
2821 	case PR_UNUSED:
2822 	case PR_LOST:
2823 		return (IPMP_PROBE_LOST);
2824 	case PR_UNACKED:
2825 		return (IPMP_PROBE_SENT);
2826 	case PR_ACKED:
2827 		return (IPMP_PROBE_ACKED);
2828 	}
2829 
2830 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
2831 	abort();
2832 	/* NOTREACHED */
2833 }
2834 
2835 /*
2836  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
2837  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
2838  */
2839 int
2840 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
2841 {
2842 	nvlist_t *nvl;
2843 	hrtime_t proc_time = 0, recv_time = 0;
2844 	struct sockaddr_storage ss;
2845 	struct target *tg = pr->pr_target;
2846 
2847 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2848 	if (errno != 0) {
2849 		logperror("cannot create `interface change' event");
2850 		return (-1);
2851 	}
2852 
2853 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
2854 	if (errno != 0)
2855 		goto failed;
2856 
2857 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
2858 	if (errno != 0)
2859 		goto failed;
2860 
2861 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
2862 	if (errno != 0)
2863 		goto failed;
2864 
2865 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
2866 	    pr->pr_hrtime_start);
2867 	if (errno != 0)
2868 		goto failed;
2869 
2870 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
2871 	    pr->pr_hrtime_sent);
2872 	if (errno != 0)
2873 		goto failed;
2874 
2875 	if (pr->pr_status == PR_ACKED) {
2876 		recv_time = pr->pr_hrtime_ackrecv;
2877 		proc_time = pr->pr_hrtime_ackproc;
2878 	}
2879 
2880 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
2881 	if (errno != 0)
2882 		goto failed;
2883 
2884 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
2885 	if (errno != 0)
2886 		goto failed;
2887 
2888 	if (tg != NULL)
2889 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
2890 	else
2891 		addr2storage(pii->pii_af, &in6addr_any, &ss);
2892 
2893 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
2894 	    sizeof (ss));
2895 	if (errno != 0)
2896 		goto failed;
2897 
2898 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG,
2899 	    tg->tg_rtt_sa / 8);
2900 	if (errno != 0)
2901 		goto failed;
2902 
2903 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV,
2904 	    tg->tg_rtt_sd / 4);
2905 	if (errno != 0)
2906 		goto failed;
2907 
2908 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
2909 failed:
2910 	logperror("cannot create `probe state' event");
2911 	nvlist_free(nvl);
2912 	return (-1);
2913 }
2914 
2915 /*
2916  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2917  * Returns 0 on success, -1 on failure.
2918  */
2919 static int
2920 phyint_group_state_event(struct phyint_group *pg)
2921 {
2922 	nvlist_t	*nvl;
2923 
2924 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2925 	if (errno != 0) {
2926 		logperror("cannot create `group state change' event");
2927 		return (-1);
2928 	}
2929 
2930 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2931 	if (errno != 0)
2932 		goto failed;
2933 
2934 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2935 	if (errno != 0)
2936 		goto failed;
2937 
2938 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2939 	if (errno != 0)
2940 		goto failed;
2941 
2942 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2943 failed:
2944 	logperror("cannot create `group state change' event");
2945 	nvlist_free(nvl);
2946 	return (-1);
2947 }
2948 
2949 /*
2950  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2951  * `pg'.  Returns 0 on success, -1 on failure.
2952  */
2953 static int
2954 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2955 {
2956 	nvlist_t *nvl;
2957 
2958 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2959 	if (errno != 0) {
2960 		logperror("cannot create `group change' event");
2961 		return (-1);
2962 	}
2963 
2964 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2965 	if (errno != 0)
2966 		goto failed;
2967 
2968 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2969 	if (errno != 0)
2970 		goto failed;
2971 
2972 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2973 	    phyint_grouplistsig);
2974 	if (errno != 0)
2975 		goto failed;
2976 
2977 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2978 	if (errno != 0)
2979 		goto failed;
2980 
2981 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2982 failed:
2983 	logperror("cannot create `group change' event");
2984 	nvlist_free(nvl);
2985 	return (-1);
2986 }
2987 
2988 /*
2989  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2990  * group `pg'.	Returns 0 on success, -1 on failure.
2991  */
2992 static int
2993 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
2994     ipmp_if_op_t op)
2995 {
2996 	nvlist_t *nvl;
2997 
2998 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2999 	if (errno != 0) {
3000 		logperror("cannot create `group member change' event");
3001 		return (-1);
3002 	}
3003 
3004 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3005 	if (errno != 0)
3006 		goto failed;
3007 
3008 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3009 	if (errno != 0)
3010 		goto failed;
3011 
3012 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
3013 	if (errno != 0)
3014 		goto failed;
3015 
3016 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3017 	if (errno != 0)
3018 		goto failed;
3019 
3020 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3021 	if (errno != 0)
3022 		goto failed;
3023 
3024 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3025 	if (errno != 0)
3026 		goto failed;
3027 
3028 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
3029 failed:
3030 	logperror("cannot create `group member change' event");
3031 	nvlist_free(nvl);
3032 	return (-1);
3033 
3034 }
3035 
3036 /*
3037  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
3038  * Returns 0 on success, -1 on failure.
3039  */
3040 static int
3041 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
3042 {
3043 	nvlist_t *nvl;
3044 
3045 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3046 	if (errno != 0) {
3047 		logperror("cannot create `interface change' event");
3048 		return (-1);
3049 	}
3050 
3051 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3052 	if (errno != 0)
3053 		goto failed;
3054 
3055 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3056 	if (errno != 0)
3057 		goto failed;
3058 
3059 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3060 	if (errno != 0)
3061 		goto failed;
3062 
3063 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3064 	if (errno != 0)
3065 		goto failed;
3066 
3067 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3068 	if (errno != 0)
3069 		goto failed;
3070 
3071 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
3072 failed:
3073 	logperror("cannot create `interface change' event");
3074 	nvlist_free(nvl);
3075 	return (-1);
3076 
3077 }
3078 
/*
 * Generate a fresh signature.  A signature is conceptually split into a
 * random 16-bit "generation number" in the top bits and a 48-bit counter in
 * the low bits; the generation number protects against stale updates to
 * entities (e.g., IPMP groups) that have been deleted and since recreated.
 *
 * The value returned here has the generation number drawn from lrand48()
 * and the counter portion set to 1; nothing in this function advances the
 * counter.  The random number generator is seeded from gethrtime() on the
 * first call.
 */
static uint64_t
gensig(void)
{
	static int seed_done = 0;
	uint64_t generation;

	if (!seed_done) {
		srand48((long)gethrtime());
		seed_done = 1;
	}

	/* Only the low 16 bits of lrand48() survive the 48-bit shift. */
	generation = (uint64_t)lrand48() << 48;
	return (generation | 1);
}
3098 
3099 /*
3100  * Store the information associated with group `grname' into a dynamically
3101  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
3102  */
3103 unsigned int
3104 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
3105 {
3106 	struct phyint		*pi;
3107 	struct phyint_group	*pg;
3108 	char			(*ifs)[LIFNAMSIZ];
3109 	unsigned int		i, j;
3110 	unsigned int		nif = 0, naddr = 0;
3111 	lifgroupinfo_t		lifgr;
3112 	addrlist_t		*addrp;
3113 	struct sockaddr_storage	*addrs;
3114 	int			fdt = 0;
3115 
3116 	pg = phyint_group_lookup(grname);
3117 	if (pg == NULL)
3118 		return (IPMP_EUNKGROUP);
3119 
3120 	/*
3121 	 * Tally up the number of interfaces, allocate an array to hold them,
3122 	 * and insert their names into the array.  While we're at it, if any
3123 	 * interface is actually enabled to send probes, save the group fdt.
3124 	 */
3125 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
3126 		nif++;
3127 
3128 	ifs = alloca(nif * sizeof (*ifs));
3129 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
3130 		assert(i < nif);
3131 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
3132 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
3133 			fdt = pg->pg_fdt;
3134 	}
3135 	assert(i == nif);
3136 
3137 	/*
3138 	 * If this is the anonymous group, there's no other information to
3139 	 * collect (since there's no IPMP interface).
3140 	 */
3141 	if (pg == phyint_anongroup) {
3142 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3143 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
3144 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3145 	}
3146 
3147 	/*
3148 	 * Grab some additional information about the group from the kernel.
3149 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
3150 	 * we can use ifsock_v4 even for a V6-only group.)
3151 	 */
3152 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
3153 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
3154 		if (errno == ENOENT)
3155 			return (IPMP_EUNKGROUP);
3156 
3157 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
3158 		return (IPMP_FAILURE);
3159 	}
3160 
3161 	/*
3162 	 * Tally up the number of data addresses, allocate an array to hold
3163 	 * them, and insert their values into the array.
3164 	 */
3165 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
3166 		naddr++;
3167 
3168 	addrs = alloca(naddr * sizeof (*addrs));
3169 	i = 0;
3170 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3171 		/*
3172 		 * It's possible to have duplicate addresses (if some are
3173 		 * down).  Weed the dups out to avoid confusing consumers.
3174 		 * (If groups start having tons of addresses, we'll need a
3175 		 * better algorithm here.)
3176 		 */
3177 		for (j = 0; j < i; j++) {
3178 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
3179 				break;
3180 		}
3181 		if (j == i) {
3182 			assert(i < naddr);
3183 			addrs[i++] = addrp->al_addr;
3184 		}
3185 	}
3186 	naddr = i;
3187 
3188 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3189 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
3190 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
3191 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3192 }
3193 
3194 /*
3195  * Store the target information associated with phyint instance `pii' into a
3196  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
3197  * IPMP error code.
3198  */
3199 unsigned int
3200 gettarginfo(struct phyint_instance *pii, const char *name,
3201     ipmp_targinfo_t **targinfopp)
3202 {
3203 	uint_t ntarg = 0;
3204 	struct target *tg;
3205 	struct sockaddr_storage	ss;
3206 	struct sockaddr_storage *targs = NULL;
3207 
3208 	if (PROBE_CAPABLE(pii)) {
3209 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
3210 		tg = pii->pii_target_next;
3211 		do {
3212 			if (tg->tg_status == TG_ACTIVE) {
3213 				assert(ntarg < pii->pii_ntargets);
3214 				addr2storage(pii->pii_af, &tg->tg_address,
3215 				    &targs[ntarg++]);
3216 			}
3217 			if ((tg = tg->tg_next) == NULL)
3218 				tg = pii->pii_targets;
3219 		} while (tg != pii->pii_target_next);
3220 
3221 		assert(ntarg == pii->pii_ntargets);
3222 	}
3223 
3224 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
3225 	    iftargmode(pii), ntarg, targs);
3226 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3227 }
3228 
3229 /*
3230  * Store the information associated with interface `ifname' into a dynamically
3231  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
3232  */
3233 unsigned int
3234 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
3235 {
3236 	int		retval;
3237 	struct phyint	*pi;
3238 	ipmp_targinfo_t	*targinfo4;
3239 	ipmp_targinfo_t	*targinfo6;
3240 
3241 	pi = phyint_lookup(ifname);
3242 	if (pi == NULL)
3243 		return (IPMP_EUNKIF);
3244 
3245 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
3246 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
3247 		goto out;
3248 
3249 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
3250 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
3251 	    ifflags(pi), targinfo4, targinfo6);
3252 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3253 out:
3254 	if (targinfo4 != NULL)
3255 		ipmp_freetarginfo(targinfo4);
3256 	if (targinfo6 != NULL)
3257 		ipmp_freetarginfo(targinfo6);
3258 	return (retval);
3259 }
3260 
3261 /*
3262  * Store the current list of IPMP groups into a dynamically allocated
3263  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
3264  */
3265 unsigned int
3266 getgrouplist(ipmp_grouplist_t **grlistpp)
3267 {
3268 	struct phyint_group	*pg;
3269 	char			(*groups)[LIFGRNAMSIZ];
3270 	unsigned int		i, ngroup;
3271 
3272 	/*
3273 	 * Tally up the number of groups, allocate an array to hold them, and
3274 	 * insert their names into the array.
3275 	 */
3276 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
3277 		ngroup++;
3278 
3279 	groups = alloca(ngroup * sizeof (*groups));
3280 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
3281 		assert(i < ngroup);
3282 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
3283 	}
3284 	assert(i == ngroup);
3285 
3286 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
3287 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3288 }
3289 
3290 /*
3291  * Store the address information for `ssp' (in group `grname') into a
3292  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
3293  * error code.  (We'd call this function getaddrinfo(), but it would conflict
3294  * with getaddrinfo(3SOCKET)).
3295  */
3296 unsigned int
3297 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
3298     ipmp_addrinfo_t **adinfopp)
3299 {
3300 	int ifsock;
3301 	addrlist_t *addrp, *addrmatchp = NULL;
3302 	ipmp_addr_state_t state;
3303 	const char *binding = "";
3304 	struct lifreq lifr;
3305 	struct phyint_group *pg;
3306 
3307 	if ((pg = phyint_group_lookup(grname)) == NULL)
3308 		return (IPMP_EUNKADDR);
3309 
3310 	/*
3311 	 * Walk through the data addresses, and find a match.  Note that since
3312 	 * some of the addresses may be down, more than one may match.  We
3313 	 * prefer an up address (if one exists).
3314 	 */
3315 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3316 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
3317 			addrmatchp = addrp;
3318 			if (addrmatchp->al_flags & IFF_UP)
3319 				break;
3320 		}
3321 	}
3322 
3323 	if (addrmatchp == NULL)
3324 		return (IPMP_EUNKADDR);
3325 
3326 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
3327 	if (state == IPMP_ADDR_UP) {
3328 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
3329 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
3330 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
3331 			binding = lifr.lifr_binding;
3332 	}
3333 
3334 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
3335 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3336 }
3337 
3338 /*
3339  * Store a snapshot of the IPMP subsystem into a dynamically allocated
3340  * structure pointed to by `*snapp'.  Returns an IPMP error code.
3341  */
3342 unsigned int
3343 getsnap(ipmp_snap_t **snapp)
3344 {
3345 	ipmp_grouplist_t	*grlistp;
3346 	ipmp_groupinfo_t	*grinfop;
3347 	ipmp_addrinfo_t		*adinfop;
3348 	ipmp_addrlist_t		*adlistp;
3349 	ipmp_ifinfo_t		*ifinfop;
3350 	ipmp_snap_t		*snap;
3351 	struct phyint		*pi;
3352 	unsigned int		i, j;
3353 	int			retval;
3354 
3355 	snap = ipmp_snap_create();
3356 	if (snap == NULL)
3357 		return (IPMP_ENOMEM);
3358 
3359 	/*
3360 	 * Add group list.
3361 	 */
3362 	retval = getgrouplist(&snap->sn_grlistp);
3363 	if (retval != IPMP_SUCCESS)
3364 		goto failed;
3365 
3366 	/*
3367 	 * Add information for each group in the list, along with all of its
3368 	 * data addresses.
3369 	 */
3370 	grlistp = snap->sn_grlistp;
3371 	for (i = 0; i < grlistp->gl_ngroup; i++) {
3372 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
3373 		if (retval != IPMP_SUCCESS)
3374 			goto failed;
3375 
3376 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
3377 		if (retval != IPMP_SUCCESS) {
3378 			ipmp_freegroupinfo(grinfop);
3379 			goto failed;
3380 		}
3381 
3382 		adlistp = grinfop->gr_adlistp;
3383 		for (j = 0; j < adlistp->al_naddr; j++) {
3384 			retval = getgraddrinfo(grinfop->gr_name,
3385 			    &adlistp->al_addrs[j], &adinfop);
3386 			if (retval != IPMP_SUCCESS)
3387 				goto failed;
3388 
3389 			retval = ipmp_snap_addaddrinfo(snap, adinfop);
3390 			if (retval != IPMP_SUCCESS) {
3391 				ipmp_freeaddrinfo(adinfop);
3392 				goto failed;
3393 			}
3394 		}
3395 	}
3396 
3397 	/*
3398 	 * Add information for each configured phyint.
3399 	 */
3400 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
3401 		retval = getifinfo(pi->pi_name, &ifinfop);
3402 		if (retval != IPMP_SUCCESS)
3403 			goto failed;
3404 
3405 		retval = ipmp_snap_addifinfo(snap, ifinfop);
3406 		if (retval != IPMP_SUCCESS) {
3407 			ipmp_freeifinfo(ifinfop);
3408 			goto failed;
3409 		}
3410 	}
3411 
3412 	*snapp = snap;
3413 	return (IPMP_SUCCESS);
3414 failed:
3415 	ipmp_snap_free(snap);
3416 	return (retval);
3417 }
3418