xref: /illumos-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision 47842382d52f28aa3173aa6b511781c322ccb6a2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
/*
 * Global list of phyints, phyint instances, phyint groups and the anonymous
 * group; the latter is initialized in phyint_init(), and only when
 * track_all_phyints is set.  Each list is doubly linked and new entries are
 * pushed at the head.
 */
struct phyint *phyints = NULL;
struct phyint_instance	*phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;
struct phyint_group *phyint_anongroup;

/*
 * Grouplist signature; initialized in phyint_init() and bumped by
 * phyint_group_insert() each time a group is added to the list.
 */
static uint64_t phyint_grouplistsig;
42 
/*
 * Forward declarations for the file-local (static) helpers.
 */
static void phyint_inst_insert(struct phyint_instance *pii);
static void phyint_inst_print(struct phyint_instance *pii);

static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
static void phyint_delete(struct phyint *pi);
static boolean_t phyint_is_usable(struct phyint *pi);

static void logint_print(struct logint *li);
static void logint_insert(struct phyint_instance *pii, struct logint *li);
static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);

static void target_print(struct target *tg);
static void target_insert(struct phyint_instance *pii, struct target *tg);
static struct target *target_first(struct phyint_instance *pii);
static struct target *target_select_best(struct phyint_instance *pii);
static void target_flush_hosts(struct phyint_group *pg);

static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);

static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);

static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
static int phyint_group_state_event(struct phyint_group *pg);
static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op);

static int logint_upcount(struct phyint *pi);
static uint64_t gensig(void);
73 
74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
75 int
76 phyint_init(void)
77 {
78 	phyint_grouplistsig = gensig();
79 	if (track_all_phyints) {
80 		phyint_anongroup = phyint_group_create("");
81 		if (phyint_anongroup == NULL)
82 			return (-1);
83 		phyint_group_insert(phyint_anongroup);
84 	}
85 	return (0);
86 }
87 
88 /* Return the phyint with the given name */
89 struct phyint *
90 phyint_lookup(const char *name)
91 {
92 	struct phyint *pi;
93 
94 	if (debug & D_PHYINT)
95 		logdebug("phyint_lookup(%s)\n", name);
96 
97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
99 			break;
100 	}
101 	return (pi);
102 }
103 
104 /*
105  * Lookup a phyint in the group that has the same hardware address as `pi', or
106  * NULL if there's none.  If `online_only' is set, then only online phyints
107  * are considered when matching.  Otherwise, phyints that had been offlined
108  * due to a duplicate hardware address will also be considered.
109  */
110 static struct phyint *
111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
112 {
113 	struct phyint *pi2;
114 
115 	if (pi->pi_group == phyint_anongroup)
116 		return (NULL);
117 
118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
119 		if (pi2 == pi)
120 			continue;
121 
122 		/*
123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
124 		 * that are administratively offline (rather than offline
125 		 * because they're dups); when they're brought back online,
126 		 * they'll be flagged as dups if need be.
127 		 */
128 		if (pi2->pi_state == PI_OFFLINE &&
129 		    (online_only || !pi2->pi_hwaddrdup))
130 			continue;
131 
132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
134 			return (pi2);
135 	}
136 	return (NULL);
137 }
138 
139 /*
140  * Respond to DLPI notifications.  Currently, this only processes physical
141  * address changes for the phyint passed via `arg' by onlining or offlining
142  * phyints in the group.
143  */
144 /* ARGSUSED */
145 static void
146 phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
147 {
148 	struct phyint *pi = arg;
149 	struct phyint *oduppi = NULL, *duppi = NULL;
150 
151 	assert((dnip->dni_note & pi->pi_notes) != 0);
152 
153 	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
154 		return;
155 
156 	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
157 
158 	/*
159 	 * If our hardware address hasn't changed, there's nothing to do.
160 	 */
161 	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
162 	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
163 		return;
164 
165 	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
166 	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
167 	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
168 	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
169 
170 	if (oduppi != NULL || pi->pi_hwaddrdup) {
171 		/*
172 		 * Our old hardware address was a duplicate.  If we'd been
173 		 * offlined because of it, and our new hardware address is not
174 		 * a duplicate, then bring us online.  Otherwise, `oduppi'
175 		 * must've been the one brought offline; bring it online.
176 		 */
177 		if (pi->pi_hwaddrdup) {
178 			if (duppi == NULL)
179 				(void) phyint_undo_offline(pi);
180 		} else {
181 			assert(oduppi->pi_hwaddrdup);
182 			(void) phyint_undo_offline(oduppi);
183 		}
184 	}
185 
186 	if (duppi != NULL && !pi->pi_hwaddrdup) {
187 		/*
188 		 * Our new hardware address was a duplicate and we're not
189 		 * yet flagged as a duplicate; bring us offline.
190 		 */
191 		pi->pi_hwaddrdup = _B_TRUE;
192 		(void) phyint_offline(pi, 0);
193 	}
194 }
195 
196 /*
197  * Initialize information about the underlying link for `pi', and set us
198  * up to be notified about future changes.  Returns _B_TRUE on success.
199  */
200 boolean_t
201 phyint_link_init(struct phyint *pi)
202 {
203 	int retval;
204 	uint_t notes;
205 	const char *errmsg;
206 	dlpi_notifyid_t id;
207 
208 	pi->pi_notes = 0;
209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
210 	if (retval != DLPI_SUCCESS) {
211 		pi->pi_dh = NULL;
212 		errmsg = "cannot open";
213 		goto failed;
214 	}
215 
216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
218 	    &pi->pi_hwaddrlen);
219 	if (retval != DLPI_SUCCESS) {
220 		errmsg = "cannot get hardware address";
221 		goto failed;
222 	}
223 
224 	retval = dlpi_bind(pi->pi_dh, DLPI_ANY_SAP, NULL);
225 	if (retval != DLPI_SUCCESS) {
226 		errmsg = "cannot bind to DLPI_ANY_SAP";
227 		goto failed;
228 	}
229 
230 	/*
231 	 * Check if the link supports DLPI link state notifications.  For
232 	 * historical reasons, the actual changes are tracked through routing
233 	 * sockets, so we immediately disable the notification upon success.
234 	 */
235 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
236 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
237 	if (retval == DLPI_SUCCESS) {
238 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
239 		pi->pi_notes |= notes;
240 	}
241 
242 	/*
243 	 * Enable notification of hardware address changes to keep pi_hwaddr
244 	 * up-to-date and track if we need to offline/undo-offline phyints.
245 	 */
246 	notes = DL_NOTE_PHYS_ADDR;
247 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
248 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
249 		pi->pi_notes |= notes;
250 
251 	return (_B_TRUE);
252 failed:
253 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
254 	if (pi->pi_dh != NULL) {
255 		dlpi_close(pi->pi_dh);
256 		pi->pi_dh = NULL;
257 	}
258 	return (_B_FALSE);
259 }
260 
261 /*
262  * Close use of link on `pi'.
263  */
264 void
265 phyint_link_close(struct phyint *pi)
266 {
267 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
268 		(void) poll_remove(dlpi_fd(pi->pi_dh));
269 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
270 	}
271 
272 	/*
273 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
274 	 * properly report the link state even when offline (which is possible
275 	 * since we use IFF_RUNNING to track link state).
276 	 */
277 	dlpi_close(pi->pi_dh);
278 	pi->pi_dh = NULL;
279 }
280 
281 /* Return the phyint instance with the given name and the given family */
282 struct phyint_instance *
283 phyint_inst_lookup(int af, char *name)
284 {
285 	struct phyint *pi;
286 
287 	if (debug & D_PHYINT)
288 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
289 
290 	assert(af == AF_INET || af == AF_INET6);
291 
292 	pi = phyint_lookup(name);
293 	if (pi == NULL)
294 		return (NULL);
295 
296 	return (PHYINT_INSTANCE(pi, af));
297 }
298 
299 struct phyint_group *
300 phyint_group_lookup(const char *pg_name)
301 {
302 	struct phyint_group *pg;
303 
304 	if (debug & D_PHYINT)
305 		logdebug("phyint_group_lookup(%s)\n", pg_name);
306 
307 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
308 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
309 			break;
310 	}
311 	return (pg);
312 }
313 
314 /*
315  * Insert the phyint in the linked list of all phyints. If the phyint belongs
316  * to some group, insert it in the phyint group list.
317  */
318 static void
319 phyint_insert(struct phyint *pi, struct phyint_group *pg)
320 {
321 	if (debug & D_PHYINT)
322 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
323 
324 	/* Insert the phyint at the head of the 'all phyints' list */
325 	pi->pi_next = phyints;
326 	pi->pi_prev = NULL;
327 	if (phyints != NULL)
328 		phyints->pi_prev = pi;
329 	phyints = pi;
330 
331 	/*
332 	 * Insert the phyint at the head of the 'phyint_group members' list
333 	 * of the phyint group to which it belongs.
334 	 */
335 	pi->pi_pgnext = NULL;
336 	pi->pi_pgprev = NULL;
337 	pi->pi_group = pg;
338 
339 	pi->pi_pgnext = pg->pg_phyint;
340 	if (pi->pi_pgnext != NULL)
341 		pi->pi_pgnext->pi_pgprev = pi;
342 	pg->pg_phyint = pi;
343 
344 	/* Refresh the group state now that this phyint has been added */
345 	phyint_group_refresh_state(pg);
346 
347 	pg->pg_sig++;
348 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
349 }
350 
351 /* Insert the phyint instance in the linked list of all phyint instances. */
352 static void
353 phyint_inst_insert(struct phyint_instance *pii)
354 {
355 	if (debug & D_PHYINT) {
356 		logdebug("phyint_inst_insert(%s %s)\n",
357 		    AF_STR(pii->pii_af), pii->pii_name);
358 	}
359 
360 	/*
361 	 * Insert the phyint at the head of the 'all phyint instances' list.
362 	 */
363 	pii->pii_next = phyint_instances;
364 	pii->pii_prev = NULL;
365 	if (phyint_instances != NULL)
366 		phyint_instances->pii_prev = pii;
367 	phyint_instances = pii;
368 }
369 
370 /*
371  * Create a new phyint with the given parameters. Also insert it into
372  * the list of all phyints and the list of phyint group members by calling
373  * phyint_insert().
374  */
375 static struct phyint *
376 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
377     uint64_t flags)
378 {
379 	struct phyint *pi;
380 
381 	pi = calloc(1, sizeof (struct phyint));
382 	if (pi == NULL) {
383 		logperror("phyint_create: calloc");
384 		return (NULL);
385 	}
386 
387 	/*
388 	 * Record the phyint values.
389 	 */
390 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
391 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
392 	pi->pi_ifindex = ifindex;
393 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
394 
395 	/*
396 	 * If the interface is offline, we set the state to PI_OFFLINE.
397 	 * Otherwise, we optimistically start in the PI_RUNNING state.  Later
398 	 * (in process_link_state_changes()), we will adjust this to match the
399 	 * current state of the link.  Further, if test addresses are
400 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
401 	 * to either PI_RUNNING or PI_FAILED depending on the probe results.
402 	 */
403 	pi->pi_state = (flags & IFF_OFFLINE) ? PI_OFFLINE : PI_RUNNING;
404 	pi->pi_flags = PHYINT_FLAGS(flags);
405 
406 	/*
407 	 * Initialize the link state.  The link state is initialized to
408 	 * up, so that if the link is down when IPMP starts monitoring
409 	 * the interface, it will appear as though there has been a
410 	 * transition from the link up to link down.  This avoids
411 	 * having to treat this situation as a special case.
412 	 */
413 	INIT_LINK_STATE(pi);
414 
415 	if (!phyint_link_init(pi)) {
416 		free(pi);
417 		return (NULL);
418 	}
419 
420 	/*
421 	 * Insert the phyint in the list of all phyints, and the
422 	 * list of phyint group members
423 	 */
424 	phyint_insert(pi, pg);
425 
426 	return (pi);
427 }
428 
429 /*
430  * Create a new phyint instance belonging to the phyint 'pi' and address
431  * family 'af'. Also insert it into the list of all phyint instances by
432  * calling phyint_inst_insert().
433  */
434 static struct phyint_instance *
435 phyint_inst_create(struct phyint *pi, int af)
436 {
437 	struct phyint_instance *pii;
438 
439 	pii = calloc(1, sizeof (struct phyint_instance));
440 	if (pii == NULL) {
441 		logperror("phyint_inst_create: calloc");
442 		return (NULL);
443 	}
444 
445 	/*
446 	 * Attach the phyint instance to the phyint.
447 	 * Set the back pointers as well
448 	 */
449 	pii->pii_phyint = pi;
450 	if (af == AF_INET)
451 		pi->pi_v4 = pii;
452 	else
453 		pi->pi_v6 = pii;
454 
455 	pii->pii_in_use = 1;
456 	pii->pii_probe_sock = -1;
457 	pii->pii_snxt = 1;
458 	pii->pii_af = af;
459 	pii->pii_fd_hrtime = gethrtime() +
460 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
461 	pii->pii_flags = pi->pi_flags;
462 
463 	/* Insert the phyint instance in the list of all phyint instances. */
464 	phyint_inst_insert(pii);
465 	return (pii);
466 }
467 
468 /*
469  * Change the state of phyint `pi' to state `state'.
470  */
471 void
472 phyint_chstate(struct phyint *pi, enum pi_state state)
473 {
474 	/*
475 	 * To simplify things, some callers always set a given state
476 	 * regardless of the previous state of the phyint (e.g., setting
477 	 * PI_RUNNING when it's already set).  We shouldn't bother
478 	 * generating an event or consuming a signature for these, since
479 	 * the actual state of the interface is unchanged.
480 	 */
481 	if (pi->pi_state == state)
482 		return;
483 
484 	pi->pi_state = state;
485 	phyint_changed(pi);
486 }
487 
488 /*
489  * Note that `pi' has changed state.
490  */
491 void
492 phyint_changed(struct phyint *pi)
493 {
494 	pi->pi_group->pg_sig++;
495 	(void) phyint_state_event(pi->pi_group, pi);
496 }
497 
498 /*
499  * Insert the phyint group in the linked list of all phyint groups
500  * at the head of the list
501  */
502 void
503 phyint_group_insert(struct phyint_group *pg)
504 {
505 	pg->pg_next = phyint_groups;
506 	pg->pg_prev = NULL;
507 	if (phyint_groups != NULL)
508 		phyint_groups->pg_prev = pg;
509 	phyint_groups = pg;
510 
511 	phyint_grouplistsig++;
512 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
513 }
514 
515 /*
516  * Create a new phyint group called 'name'.
517  */
518 struct phyint_group *
519 phyint_group_create(const char *name)
520 {
521 	struct	phyint_group *pg;
522 
523 	if (debug & D_PHYINT)
524 		logdebug("phyint_group_create(%s)\n", name);
525 
526 	pg = calloc(1, sizeof (struct phyint_group));
527 	if (pg == NULL) {
528 		logperror("phyint_group_create: calloc");
529 		return (NULL);
530 	}
531 
532 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
533 	pg->pg_sig = gensig();
534 	pg->pg_fdt = user_failure_detection_time;
535 	pg->pg_probeint = user_probe_interval;
536 	pg->pg_in_use = _B_TRUE;
537 
538 	/*
539 	 * Normal groups always start in the PG_FAILED state since they
540 	 * have no active interfaces.  In contrast, anonymous groups are
541 	 * heterogeneous and thus always PG_OK.
542 	 */
543 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
544 
545 	return (pg);
546 }
547 
548 /*
549  * Change the state of the phyint group `pg' to state `state'.
550  */
551 void
552 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
553 {
554 	assert(pg != phyint_anongroup);
555 
556 	/*
557 	 * To simplify things, some callers always set a given state
558 	 * regardless of the previous state of the group (e.g., setting
559 	 * PG_DEGRADED when it's already set).  We shouldn't bother
560 	 * generating an event or consuming a signature for these, since
561 	 * the actual state of the group is unchanged.
562 	 */
563 	if (pg->pg_state == state)
564 		return;
565 
566 	pg->pg_state = state;
567 
568 	switch (state) {
569 	case PG_FAILED:
570 		/*
571 		 * We can never know with certainty that a group has
572 		 * failed.  It is possible that all known targets have
573 		 * failed simultaneously, and new targets have come up
574 		 * instead. If the targets are routers then router
575 		 * discovery will kick in, and we will see the new routers
576 		 * thru routing socket messages. But if the targets are
577 		 * hosts, we have to discover it by multicast.	So flush
578 		 * all the host targets. The next probe will send out a
579 		 * multicast echo request. If this is a group failure, we
580 		 * will still not see any response, otherwise the group
581 		 * will be repaired after we get NUM_PROBE_REPAIRS
582 		 * consecutive unicast replies on any phyint.
583 		 */
584 		target_flush_hosts(pg);
585 		break;
586 
587 	case PG_OK:
588 	case PG_DEGRADED:
589 		break;
590 
591 	default:
592 		logerr("phyint_group_chstate: invalid group state %d; "
593 		    "aborting\n", state);
594 		abort();
595 	}
596 
597 	pg->pg_sig++;
598 	(void) phyint_group_state_event(pg);
599 }
600 
601 /*
602  * Create a new phyint instance and initialize it from the values supplied by
603  * the kernel. Always check for ENXIO before logging any error, because the
604  * interface could have vanished after completion of SIOCGLIFCONF.
605  * Return values:
606  *	pointer to the phyint instance on success
607  *	NULL on failure Eg. if the phyint instance is not found in the kernel
608  */
609 struct phyint_instance *
610 phyint_inst_init_from_k(int af, char *pi_name)
611 {
612 	char	pg_name[LIFNAMSIZ + 1];
613 	int	ifsock;
614 	uint_t	ifindex;
615 	uint64_t	flags;
616 	struct lifreq	lifr;
617 	struct phyint	*pi;
618 	struct phyint_instance	*pii;
619 	boolean_t	pi_created;
620 	struct phyint_group	*pg;
621 
622 retry:
623 	pii = NULL;
624 	pi = NULL;
625 	pg = NULL;
626 	pi_created = _B_FALSE;
627 
628 	if (debug & D_PHYINT) {
629 		logdebug("phyint_inst_init_from_k(%s %s)\n",
630 		    AF_STR(af), pi_name);
631 	}
632 
633 	assert(af == AF_INET || af == AF_INET6);
634 
635 	/* Get the socket for doing ioctls */
636 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
637 
638 	/*
639 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
640 	 * meta-interfaces, point-to-point interfaces, and interfaces
641 	 * that can't support multicast.
642 	 */
643 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
644 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
645 		if (errno != ENXIO) {
646 			logperror("phyint_inst_init_from_k:"
647 			    " ioctl (get flags)");
648 		}
649 		return (NULL);
650 	}
651 	flags = lifr.lifr_flags;
652 	if (!(flags & IFF_MULTICAST) ||
653 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
654 		return (NULL);
655 
656 	/*
657 	 * Get the ifindex for recording later in our tables, in case we need
658 	 * to create a new phyint.
659 	 */
660 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
661 		if (errno != ENXIO) {
662 			logperror("phyint_inst_init_from_k: "
663 			    " ioctl (get lifindex)");
664 		}
665 		return (NULL);
666 	}
667 	ifindex = lifr.lifr_index;
668 
669 	/*
670 	 * Get the phyint group name of this phyint, from the kernel.
671 	 */
672 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
673 		if (errno != ENXIO) {
674 			logperror("phyint_inst_init_from_k: "
675 			    "ioctl (get group name)");
676 		}
677 		return (NULL);
678 	}
679 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
680 
681 	/*
682 	 * If the phyint is not part of any group, pg_name is the
683 	 * null string. If 'track_all_phyints' is false, there is no
684 	 * need to create a phyint.
685 	 */
686 	if (pg_name[0] == '\0' && !track_all_phyints) {
687 		/*
688 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
689 		 * set, reset them. These flags shouldn't be set if in.mpathd
690 		 * isn't tracking the interface.
691 		 */
692 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
693 			lifr.lifr_flags = flags &
694 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
695 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
696 				if (errno != ENXIO) {
697 					logperror("phyint_inst_init_from_k:"
698 					    " ioctl (set flags)");
699 				}
700 			}
701 		}
702 		return (NULL);
703 	}
704 
705 	/*
706 	 * We need to create a new phyint instance.  We may also need to
707 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
708 	 * an underlying interface before it found its IPMP meta-interface.
709 	 * Note that we keep any created groups even if phyint_inst_from_k()
710 	 * fails since a group's existence is not dependent on the ability of
711 	 * in.mpathd to the track the group's interfaces.
712 	 */
713 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
714 		if ((pg = phyint_group_create(pg_name)) == NULL) {
715 			logerr("phyint_inst_init_from_k: cannot create group "
716 			    "%s\n", pg_name);
717 			return (NULL);
718 		}
719 		phyint_group_insert(pg);
720 	}
721 
722 	/*
723 	 * Lookup the phyint. If the phyint does not exist create it.
724 	 */
725 	pi = phyint_lookup(pi_name);
726 	if (pi == NULL) {
727 		pi = phyint_create(pi_name, pg, ifindex, flags);
728 		if (pi == NULL) {
729 			logerr("phyint_inst_init_from_k:"
730 			    " unable to create phyint %s\n", pi_name);
731 			return (NULL);
732 		}
733 		pi_created = _B_TRUE;
734 	} else {
735 		/* The phyint exists already. */
736 		assert(pi_created == _B_FALSE);
737 		/*
738 		 * Normally we should see consistent values for the IPv4 and
739 		 * IPv6 instances, for phyint properties. If we don't, it
740 		 * means things have changed underneath us, and we should
741 		 * resync our tables with the kernel. Check whether the
742 		 * interface index has changed. If so, it is most likely
743 		 * the interface has been unplumbed and replumbed,
744 		 * while we are yet to update our tables. Do it now.
745 		 */
746 		if (pi->pi_ifindex != ifindex) {
747 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
748 			goto retry;
749 		}
750 		assert(PHYINT_INSTANCE(pi, af) == NULL);
751 
752 		/*
753 		 * If the group name seen by the IPv4 and IPv6 instances
754 		 * are different, it is most likely the groupname has
755 		 * changed, while we are yet to update our tables. Do it now.
756 		 */
757 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
758 			phyint_inst_delete(PHYINT_INSTANCE(pi,
759 			    AF_OTHER(af)));
760 			goto retry;
761 		}
762 	}
763 
764 	/*
765 	 * Create a new phyint instance, corresponding to the 'af'
766 	 * passed in.
767 	 */
768 	pii = phyint_inst_create(pi, af);
769 	if (pii == NULL) {
770 		logerr("phyint_inst_init_from_k: unable to create"
771 		    "phyint inst %s\n", pi->pi_name);
772 		if (pi_created)
773 			phyint_delete(pi);
774 
775 		return (NULL);
776 	}
777 
778 	if (pi_created) {
779 		/*
780 		 * If this phyint does not have a unique hardware address in its
781 		 * group, offline it.  (The change_pif_flags() implementation
782 		 * requires that we defer this until after the phyint_instance
783 		 * is created.)
784 		 */
785 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
786 			pi->pi_hwaddrdup = _B_TRUE;
787 			(void) phyint_offline(pi, 0);
788 		}
789 	}
790 
791 	return (pii);
792 }
793 
794 /*
795  * Bind pii_probe_sock to the address associated with pii_probe_logint.
796  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
797  * targets. Do the common part in this function, and complete the
798  * initializations by calling the protocol specific functions
799  * phyint_inst_v{4,6}_sockinit() respectively.
800  *
801  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
802  */
803 boolean_t
804 phyint_inst_sockinit(struct phyint_instance *pii)
805 {
806 	boolean_t success;
807 	struct phyint_group *pg;
808 
809 	if (debug & D_PHYINT) {
810 		logdebug("phyint_inst_sockinit(%s %s)\n",
811 		    AF_STR(pii->pii_af), pii->pii_name);
812 	}
813 
814 	assert(pii->pii_probe_logint != NULL);
815 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
816 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
817 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
818 
819 	/*
820 	 * If the socket is already bound, close pii_probe_sock
821 	 */
822 	if (pii->pii_probe_sock != -1)
823 		close_probe_socket(pii, _B_TRUE);
824 
825 	/*
826 	 * If the phyint is not part of a named group and track_all_phyints is
827 	 * false, simply return.
828 	 */
829 	pg = pii->pii_phyint->pi_group;
830 	if (pg == phyint_anongroup && !track_all_phyints) {
831 		if (debug & D_PHYINT)
832 			logdebug("phyint_inst_sockinit: no group\n");
833 		return (_B_FALSE);
834 	}
835 
836 	/*
837 	 * Initialize the socket by calling the protocol specific function.
838 	 * If it succeeds, add the socket to the poll list.
839 	 */
840 	if (pii->pii_af == AF_INET6)
841 		success = phyint_inst_v6_sockinit(pii);
842 	else
843 		success = phyint_inst_v4_sockinit(pii);
844 
845 	if (success && (poll_add(pii->pii_probe_sock) == 0))
846 		return (_B_TRUE);
847 
848 	/* Something failed, cleanup and return false */
849 	if (pii->pii_probe_sock != -1)
850 		close_probe_socket(pii, _B_FALSE);
851 
852 	return (_B_FALSE);
853 }
854 
855 /*
856  * IPv6 specific part in initializing the pii_probe_sock. This socket is
857  * used to send/receive ICMPv6 probe packets.
858  */
859 static boolean_t
860 phyint_inst_v6_sockinit(struct phyint_instance *pii)
861 {
862 	icmp6_filter_t filter;
863 	int hopcount = 1;
864 	int off = 0;
865 	int on = 1;
866 	struct	sockaddr_in6	testaddr;
867 
868 	/*
869 	 * Open a raw socket with ICMPv6 protocol.
870 	 *
871 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
872 	 * the specified phyint only.  Bind to the test address to ensure that
873 	 * the responses are sent to the specified phyint.
874 	 *
875 	 * Set the hopcount to 1 so that probe packets are not routed.
876 	 * Disable multicast loopback. Set the receive filter to
877 	 * receive only ICMPv6 echo replies.
878 	 */
879 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
880 	if (pii->pii_probe_sock < 0) {
881 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
882 		return (_B_FALSE);
883 	}
884 
885 	bzero(&testaddr, sizeof (testaddr));
886 	testaddr.sin6_family = AF_INET6;
887 	testaddr.sin6_port = 0;
888 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
889 
890 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
891 	    sizeof (testaddr)) < 0) {
892 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
893 		return (_B_FALSE);
894 	}
895 
896 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
897 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
898 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
899 		    " IPV6_MULTICAST_IF");
900 		return (_B_FALSE);
901 	}
902 
903 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
904 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
905 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
906 		    " IPV6_BOUND_IF");
907 		return (_B_FALSE);
908 	}
909 
910 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
911 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
912 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
913 		    " IPV6_UNICAST_HOPS");
914 		return (_B_FALSE);
915 	}
916 
917 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
918 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
919 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
920 		    " IPV6_MULTICAST_HOPS");
921 		return (_B_FALSE);
922 	}
923 
924 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
925 	    (char *)&off, sizeof (off)) < 0) {
926 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
927 		    " IPV6_MULTICAST_LOOP");
928 		return (_B_FALSE);
929 	}
930 
931 	/*
932 	 * Filter out so that we only receive ICMP echo replies
933 	 */
934 	ICMP6_FILTER_SETBLOCKALL(&filter);
935 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
936 
937 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
938 	    (char *)&filter, sizeof (filter)) < 0) {
939 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
940 		    " ICMP6_FILTER");
941 		return (_B_FALSE);
942 	}
943 
944 	/* Enable receipt of hoplimit */
945 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
946 	    &on, sizeof (on)) < 0) {
947 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
948 		    " IPV6_RECVHOPLIMIT");
949 		return (_B_FALSE);
950 	}
951 
952 	/* Enable receipt of timestamp */
953 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
954 	    &on, sizeof (on)) < 0) {
955 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
956 		    " SO_TIMESTAMP");
957 		return (_B_FALSE);
958 	}
959 
960 	return (_B_TRUE);
961 }
962 
963 /*
964  * IPv4 specific part in initializing the pii_probe_sock. This socket is
965  * used to send/receive ICMPv4 probe packets.
966  */
967 static boolean_t
968 phyint_inst_v4_sockinit(struct phyint_instance *pii)
969 {
970 	struct sockaddr_in  testaddr;
971 	char	char_off = 0;
972 	int	ttl = 1;
973 	char	char_ttl = 1;
974 	int	on = 1;
975 
976 	/*
977 	 * Open a raw socket with ICMPv4 protocol.
978 	 *
979 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
980 	 * the specified phyint only.  Bind to the test address to ensure that
981 	 * the responses are sent to the specified phyint.
982 	 *
983 	 * Set the ttl to 1 so that probe packets are not routed.
984 	 * Disable multicast loopback.  Enable receipt of timestamp.
985 	 */
986 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
987 	if (pii->pii_probe_sock < 0) {
988 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
989 		return (_B_FALSE);
990 	}
991 
992 	bzero(&testaddr, sizeof (testaddr));
993 	testaddr.sin_family = AF_INET;
994 	testaddr.sin_port = 0;
995 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
996 	    &testaddr.sin_addr);
997 
998 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
999 	    sizeof (testaddr)) < 0) {
1000 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
1001 		return (_B_FALSE);
1002 	}
1003 
1004 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
1005 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
1006 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1007 		    " IP_BOUND_IF");
1008 		return (_B_FALSE);
1009 	}
1010 
1011 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
1012 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
1013 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1014 		    " IP_MULTICAST_IF");
1015 		return (_B_FALSE);
1016 	}
1017 
1018 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
1019 	    (char *)&ttl, sizeof (ttl)) < 0) {
1020 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1021 		    " IP_TTL");
1022 		return (_B_FALSE);
1023 	}
1024 
1025 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
1026 	    (char *)&char_off, sizeof (char_off)) == -1) {
1027 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1028 		    " IP_MULTICAST_LOOP");
1029 		return (_B_FALSE);
1030 	}
1031 
1032 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
1033 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
1034 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1035 		    " IP_MULTICAST_TTL");
1036 		return (_B_FALSE);
1037 	}
1038 
1039 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
1040 	    sizeof (on)) < 0) {
1041 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1042 		    " SO_TIMESTAMP");
1043 		return (_B_FALSE);
1044 	}
1045 
1046 	return (_B_TRUE);
1047 }
1048 
1049 /*
1050  * Remove the phyint group from the list of 'all phyint groups'
1051  * and free it.
1052  */
1053 void
1054 phyint_group_delete(struct phyint_group *pg)
1055 {
1056 	/*
1057 	 * The anonymous group always exists, even when empty.
1058 	 */
1059 	if (pg == phyint_anongroup)
1060 		return;
1061 
1062 	if (debug & D_PHYINT)
1063 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
1064 
1065 	/*
1066 	 * The phyint group must be empty, and must not have any phyints.
1067 	 * The phyint group must be in the list of all phyint groups
1068 	 */
1069 	assert(pg->pg_phyint == NULL);
1070 	assert(phyint_groups == pg || pg->pg_prev != NULL);
1071 
1072 	if (pg->pg_prev != NULL)
1073 		pg->pg_prev->pg_next = pg->pg_next;
1074 	else
1075 		phyint_groups = pg->pg_next;
1076 
1077 	if (pg->pg_next != NULL)
1078 		pg->pg_next->pg_prev = pg->pg_prev;
1079 
1080 	pg->pg_next = NULL;
1081 	pg->pg_prev = NULL;
1082 
1083 	phyint_grouplistsig++;
1084 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
1085 
1086 	addrlist_free(&pg->pg_addrs);
1087 	free(pg);
1088 }
1089 
1090 /*
1091  * Refresh the state of `pg' based on its current members.
1092  */
1093 void
1094 phyint_group_refresh_state(struct phyint_group *pg)
1095 {
1096 	enum pg_state state;
1097 	enum pg_state origstate = pg->pg_state;
1098 	struct phyint *pi, *usablepi;
1099 	uint_t nif = 0, nusable = 0;
1100 
1101 	/*
1102 	 * Anonymous groups never change state.
1103 	 */
1104 	if (pg == phyint_anongroup)
1105 		return;
1106 
1107 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1108 		nif++;
1109 		if (phyint_is_usable(pi)) {
1110 			nusable++;
1111 			usablepi = pi;
1112 		}
1113 	}
1114 
1115 	if (nusable == 0)
1116 		state = PG_FAILED;
1117 	else if (nif == nusable)
1118 		state = PG_OK;
1119 	else
1120 		state = PG_DEGRADED;
1121 
1122 	phyint_group_chstate(pg, state);
1123 
1124 	/*
1125 	 * If we're shutting down, skip logging messages since otherwise our
1126 	 * shutdown housecleaning will make us report that groups are unusable.
1127 	 */
1128 	if (cleanup_started)
1129 		return;
1130 
1131 	/*
1132 	 * NOTE: We use pg_failmsg_printed rather than origstate since
1133 	 * otherwise at startup we'll log a "now usable" message when the
1134 	 * first usable phyint is added to an empty group.
1135 	 */
1136 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
1137 		assert(origstate == PG_FAILED);
1138 		logerr("At least 1 IP interface (%s) in group %s is now "
1139 		    "usable\n", usablepi->pi_name, pg->pg_name);
1140 		pg->pg_failmsg_printed = _B_FALSE;
1141 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
1142 		logerr("All IP interfaces in group %s are now unusable\n",
1143 		    pg->pg_name);
1144 		pg->pg_failmsg_printed = _B_TRUE;
1145 	}
1146 }
1147 
1148 /*
1149  * Extract information from the kernel about the desired phyint.
1150  * Look only for properties of the phyint and not properties of logints.
1151  * Take appropriate action on the changes.
1152  * Return codes:
1153  *	PI_OK
1154  *		The phyint exists in the kernel and matches our knowledge
1155  *		of the phyint.
1156  *	PI_DELETED
1157  *		The phyint has vanished in the kernel.
1158  *	PI_IFINDEX_CHANGED
1159  *		The phyint's interface index has changed.
1160  *		Ask the caller to delete and recreate the phyint.
1161  *	PI_IOCTL_ERROR
1162  *		Some ioctl error. Don't change anything.
1163  *	PI_GROUP_CHANGED
1164  *		The phyint has changed group.
1165  */
1166 int
1167 phyint_inst_update_from_k(struct phyint_instance *pii)
1168 {
1169 	struct lifreq lifr;
1170 	int	ifsock;
1171 	struct phyint *pi;
1172 
1173 	pi = pii->pii_phyint;
1174 
1175 	if (debug & D_PHYINT) {
1176 		logdebug("phyint_inst_update_from_k(%s %s)\n",
1177 		    AF_STR(pii->pii_af), pi->pi_name);
1178 	}
1179 
1180 	/*
1181 	 * Get the ifindex from the kernel, for comparison with the
1182 	 * value in our tables.
1183 	 */
1184 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
1185 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1186 
1187 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1188 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
1189 		if (errno == ENXIO) {
1190 			return (PI_DELETED);
1191 		} else {
1192 			logperror_pii(pii, "phyint_inst_update_from_k:"
1193 			    " ioctl (get lifindex)");
1194 			return (PI_IOCTL_ERROR);
1195 		}
1196 	}
1197 
1198 	if (lifr.lifr_index != pi->pi_ifindex) {
1199 		/*
1200 		 * The index has changed. Most likely the interface has
1201 		 * been unplumbed and replumbed. Ask the caller to take
1202 		 * appropriate action.
1203 		 */
1204 		if (debug & D_PHYINT) {
1205 			logdebug("phyint_inst_update_from_k:"
1206 			    " old index %d new index %d\n",
1207 			    pi->pi_ifindex, lifr.lifr_index);
1208 		}
1209 		return (PI_IFINDEX_CHANGED);
1210 	}
1211 
1212 	/*
1213 	 * Get the group name from the kernel, for comparison with
1214 	 * the value in our tables.
1215 	 */
1216 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
1217 		if (errno == ENXIO) {
1218 			return (PI_DELETED);
1219 		} else {
1220 			logperror_pii(pii, "phyint_inst_update_from_k:"
1221 			    " ioctl (get groupname)");
1222 			return (PI_IOCTL_ERROR);
1223 		}
1224 	}
1225 
1226 	/*
1227 	 * If the phyint has changed group i.e. if the phyint group name
1228 	 * returned by the kernel is different, ask the caller to delete
1229 	 * and recreate the phyint in the right group
1230 	 */
1231 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
1232 		/* Groupname has changed */
1233 		if (debug & D_PHYINT) {
1234 			logdebug("phyint_inst_update_from_k:"
1235 			    " groupname change\n");
1236 		}
1237 		return (PI_GROUP_CHANGED);
1238 	}
1239 
1240 	/*
1241 	 * Get the current phyint flags from the kernel, and determine what
1242 	 * flags have changed by comparing against our tables.	Note that the
1243 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
1244 	 * that IFF_INACTIVE is really still set on the interface.
1245 	 */
1246 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
1247 		if (errno == ENXIO) {
1248 			return (PI_DELETED);
1249 		} else {
1250 			logperror_pii(pii, "phyint_inst_update_from_k: "
1251 			    " ioctl (get flags)");
1252 			return (PI_IOCTL_ERROR);
1253 		}
1254 	}
1255 
1256 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
1257 	if (pi->pi_v4 != NULL)
1258 		pi->pi_v4->pii_flags = pi->pi_flags;
1259 	if (pi->pi_v6 != NULL)
1260 		pi->pi_v6->pii_flags = pi->pi_flags;
1261 
1262 	/*
1263 	 * Make sure the IFF_FAILED flag is set if and only if we think
1264 	 * the interface should be failed.
1265 	 */
1266 	if (pi->pi_flags & IFF_FAILED) {
1267 		if (pi->pi_state == PI_RUNNING)
1268 			(void) change_pif_flags(pi, 0, IFF_FAILED);
1269 	} else {
1270 		if (pi->pi_state == PI_FAILED)
1271 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1272 	}
1273 
1274 	/* No change in phyint status */
1275 	return (PI_OK);
1276 }
1277 
1278 /*
1279  * Delete the phyint. Remove it from the list of all phyints, and the
1280  * list of phyint group members.
1281  */
1282 static void
1283 phyint_delete(struct phyint *pi)
1284 {
1285 	struct phyint *pi2;
1286 	struct phyint_group *pg = pi->pi_group;
1287 
1288 	if (debug & D_PHYINT)
1289 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1290 
1291 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1292 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1293 
1294 	/*
1295 	 * The phyint must belong to a group.
1296 	 */
1297 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1298 
1299 	/* The phyint must be in the list of all phyints */
1300 	assert(phyints == pi || pi->pi_prev != NULL);
1301 
1302 	/* Remove the phyint from the phyint group list */
1303 	pg->pg_sig++;
1304 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1305 
1306 	if (pi->pi_pgprev == NULL) {
1307 		/* Phyint is the 1st in the phyint group list */
1308 		pg->pg_phyint = pi->pi_pgnext;
1309 	} else {
1310 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1311 	}
1312 	if (pi->pi_pgnext != NULL)
1313 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1314 	pi->pi_pgnext = NULL;
1315 	pi->pi_pgprev = NULL;
1316 
1317 	/* Refresh the group state now that this phyint has been removed */
1318 	phyint_group_refresh_state(pg);
1319 
1320 	/* Remove the phyint from the global list of phyints */
1321 	if (pi->pi_prev == NULL) {
1322 		/* Phyint is the 1st in the list */
1323 		phyints = pi->pi_next;
1324 	} else {
1325 		pi->pi_prev->pi_next = pi->pi_next;
1326 	}
1327 	if (pi->pi_next != NULL)
1328 		pi->pi_next->pi_prev = pi->pi_prev;
1329 	pi->pi_next = NULL;
1330 	pi->pi_prev = NULL;
1331 
1332 	/*
1333 	 * See if another phyint in the group had been offlined because
1334 	 * it was a dup of `pi' -- and if so, online it.
1335 	 */
1336 	if (!pi->pi_hwaddrdup &&
1337 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1338 		assert(pi2->pi_hwaddrdup);
1339 		(void) phyint_undo_offline(pi2);
1340 	}
1341 	phyint_link_close(pi);
1342 	free(pi);
1343 }
1344 
1345 /*
1346  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
1347  * group.  Returns an IPMP error code.
1348  */
1349 int
1350 phyint_offline(struct phyint *pi, uint_t minred)
1351 {
1352 	boolean_t was_active;
1353 	unsigned int nusable = 0;
1354 	struct phyint *pi2;
1355 	struct phyint_group *pg = pi->pi_group;
1356 
1357 	/*
1358 	 * Verify that enough usable interfaces in the group would remain.
1359 	 * As a special case, if the group has failed, allow any non-offline
1360 	 * phyints to be offlined.
1361 	 */
1362 	if (pg != phyint_anongroup) {
1363 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1364 			if (pi2 == pi)
1365 				continue;
1366 			if (phyint_is_usable(pi2) ||
1367 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
1368 				nusable++;
1369 		}
1370 	}
1371 	if (nusable < minred)
1372 		return (IPMP_EMINRED);
1373 
1374 	was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1375 
1376 	if (!change_pif_flags(pi, IFF_OFFLINE, IFF_INACTIVE))
1377 		return (IPMP_FAILURE);
1378 
1379 	/*
1380 	 * The interface is now offline, so stop probing it.  Note that
1381 	 * if_mpadm(1M) will down the test addresses, after receiving a
1382 	 * success reply from us. The routing socket message will then make us
1383 	 * close the socket used for sending probes. But it is more logical
1384 	 * that an offlined interface must not be probed, even if it has test
1385 	 * addresses.
1386 	 *
1387 	 * NOTE: stop_probing() also sets PI_OFFLINE.
1388 	 */
1389 	stop_probing(pi);
1390 
1391 	/*
1392 	 * If we're offlining the phyint because it has a duplicate hardware
1393 	 * address, print a warning -- and leave the link open so that we can
1394 	 * be notified of hardware address changes that make it usable again.
1395 	 * Otherwise, close the link so that we won't prevent a detach.
1396 	 */
1397 	if (pi->pi_hwaddrdup) {
1398 		logerr("IP interface %s has a hardware address which is not "
1399 		    "unique in group %s; offlining\n", pi->pi_name,
1400 		    pg->pg_name);
1401 	} else {
1402 		phyint_link_close(pi);
1403 	}
1404 
1405 	/*
1406 	 * If this phyint was preventing another phyint with a duplicate
1407 	 * hardware address from being online, bring that one online now.
1408 	 */
1409 	if (!pi->pi_hwaddrdup &&
1410 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1411 		assert(pi2->pi_hwaddrdup);
1412 		(void) phyint_undo_offline(pi2);
1413 	}
1414 
1415 	/*
1416 	 * If this interface was active, try to activate another INACTIVE
1417 	 * interface in the group.
1418 	 */
1419 	if (was_active)
1420 		phyint_activate_another(pi);
1421 
1422 	return (IPMP_SUCCESS);
1423 }
1424 
1425 /*
1426  * Undo a previous offline of `pi'.  Returns an IPMP error code.
1427  */
1428 int
1429 phyint_undo_offline(struct phyint *pi)
1430 {
1431 	if (pi->pi_state != PI_OFFLINE) {
1432 		errno = EINVAL;
1433 		return (IPMP_FAILURE);
1434 	}
1435 
1436 	/*
1437 	 * If necessary, reinitialize our link information and verify that its
1438 	 * hardware address is still unique across the group.
1439 	 */
1440 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
1441 		errno = EIO;
1442 		return (IPMP_FAILURE);
1443 	}
1444 
1445 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
1446 		pi->pi_hwaddrdup = _B_TRUE;
1447 		return (IPMP_EHWADDRDUP);
1448 	}
1449 
1450 	if (pi->pi_hwaddrdup) {
1451 		logerr("IP interface %s now has a unique hardware address in "
1452 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
1453 		pi->pi_hwaddrdup = _B_FALSE;
1454 	}
1455 
1456 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
1457 		return (IPMP_FAILURE);
1458 
1459 	/*
1460 	 * While the interface was offline, it may have failed (e.g. the link
1461 	 * may have gone down).  phyint_inst_check_for_failure() will have
1462 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
1463 	 * whether the phyint should transition to running.  Note that after
1464 	 * we transition to running, we will start sending probes again (if
1465 	 * test addresses are configured), which may also reveal that the
1466 	 * interface is in fact failed.
1467 	 */
1468 	if (pi->pi_flags & IFF_FAILED) {
1469 		phyint_chstate(pi, PI_FAILED);
1470 	} else {
1471 		/* calls phyint_chstate() */
1472 		phyint_transition_to_running(pi);
1473 	}
1474 
1475 	/*
1476 	 * Give the requestor time to configure test addresses before
1477 	 * complaining that they're missing.
1478 	 */
1479 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
1480 
1481 	return (IPMP_SUCCESS);
1482 }
1483 
1484 /*
1485  * Delete (unlink and free), the phyint instance.
1486  */
1487 void
1488 phyint_inst_delete(struct phyint_instance *pii)
1489 {
1490 	struct phyint *pi = pii->pii_phyint;
1491 
1492 	assert(pi != NULL);
1493 
1494 	if (debug & D_PHYINT) {
1495 		logdebug("phyint_inst_delete(%s %s)\n",
1496 		    AF_STR(pii->pii_af), pi->pi_name);
1497 	}
1498 
1499 	/*
1500 	 * If the phyint instance has associated probe targets
1501 	 * delete all the targets
1502 	 */
1503 	while (pii->pii_targets != NULL)
1504 		target_delete(pii->pii_targets);
1505 
1506 	/*
1507 	 * Delete all the logints associated with this phyint
1508 	 * instance.
1509 	 */
1510 	while (pii->pii_logint != NULL)
1511 		logint_delete(pii->pii_logint);
1512 
1513 	/*
1514 	 * Close the socket used to send probes to targets from this phyint.
1515 	 */
1516 	if (pii->pii_probe_sock != -1)
1517 		close_probe_socket(pii, _B_TRUE);
1518 
1519 	/*
1520 	 * Phyint instance must be in the list of all phyint instances.
1521 	 * Remove phyint instance from the global list of phyint instances.
1522 	 */
1523 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1524 	if (pii->pii_prev == NULL) {
1525 		/* Phyint is the 1st in the list */
1526 		phyint_instances = pii->pii_next;
1527 	} else {
1528 		pii->pii_prev->pii_next = pii->pii_next;
1529 	}
1530 	if (pii->pii_next != NULL)
1531 		pii->pii_next->pii_prev = pii->pii_prev;
1532 	pii->pii_next = NULL;
1533 	pii->pii_prev = NULL;
1534 
1535 	/*
1536 	 * Reset the phyint instance pointer in the phyint.
1537 	 * If this is the last phyint instance (being deleted) on this
1538 	 * phyint, then delete the phyint.
1539 	 */
1540 	if (pii->pii_af == AF_INET)
1541 		pi->pi_v4 = NULL;
1542 	else
1543 		pi->pi_v6 = NULL;
1544 
1545 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1546 		phyint_delete(pi);
1547 
1548 	free(pii);
1549 }
1550 
1551 static void
1552 phyint_inst_print(struct phyint_instance *pii)
1553 {
1554 	struct logint *li;
1555 	struct target *tg;
1556 	char abuf[INET6_ADDRSTRLEN];
1557 	int most_recent;
1558 	int i;
1559 
1560 	if (pii->pii_phyint == NULL) {
1561 		logdebug("pii->pi_phyint NULL can't print\n");
1562 		return;
1563 	}
1564 
1565 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1566 	    "sock %x in_use %d\n",
1567 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1568 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1569 	    pii->pii_in_use);
1570 
1571 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1572 		logint_print(li);
1573 
1574 	logdebug("\n");
1575 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1576 		target_print(tg);
1577 
1578 	if (pii->pii_targets == NULL)
1579 		logdebug("pi_targets NULL\n");
1580 
1581 	if (pii->pii_target_next != NULL) {
1582 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1583 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1584 		    abuf, sizeof (abuf)));
1585 	} else {
1586 		logdebug("pi_target_next NULL\n");
1587 	}
1588 
1589 	if (pii->pii_rtt_target_next != NULL) {
1590 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1591 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1592 		    abuf, sizeof (abuf)));
1593 	} else {
1594 		logdebug("pi_rtt_target_next NULL\n");
1595 	}
1596 
1597 	if (pii->pii_targets != NULL) {
1598 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1599 
1600 		i = most_recent;
1601 		do {
1602 			if (pii->pii_probes[i].pr_target != NULL) {
1603 				logdebug("#%d target %s ", i,
1604 				    pr_addr(pii->pii_af,
1605 				    pii->pii_probes[i].pr_target->tg_address,
1606 				    abuf, sizeof (abuf)));
1607 			} else {
1608 				logdebug("#%d target NULL ", i);
1609 			}
1610 			logdebug("time_start %lld status %d "
1611 			    "time_ackproc %lld time_lost %u",
1612 			    pii->pii_probes[i].pr_hrtime_start,
1613 			    pii->pii_probes[i].pr_status,
1614 			    pii->pii_probes[i].pr_hrtime_ackproc,
1615 			    pii->pii_probes[i].pr_time_lost);
1616 			i = PROBE_INDEX_PREV(i);
1617 		} while (i != most_recent);
1618 	}
1619 }
1620 
1621 /*
1622  * Lookup a logint based on the logical interface name, on the given
1623  * phyint instance.
1624  */
1625 static struct logint *
1626 logint_lookup(struct phyint_instance *pii, char *name)
1627 {
1628 	struct logint *li;
1629 
1630 	if (debug & D_LOGINT) {
1631 		logdebug("logint_lookup(%s, %s)\n",
1632 		    AF_STR(pii->pii_af), name);
1633 	}
1634 
1635 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1636 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1637 			break;
1638 	}
1639 	return (li);
1640 }
1641 
1642 /*
1643  * Insert a logint at the head of the list of logints of the given
1644  * phyint instance
1645  */
1646 static void
1647 logint_insert(struct phyint_instance *pii, struct logint *li)
1648 {
1649 	li->li_next = pii->pii_logint;
1650 	li->li_prev = NULL;
1651 	if (pii->pii_logint != NULL)
1652 		pii->pii_logint->li_prev = li;
1653 	pii->pii_logint = li;
1654 	li->li_phyint_inst = pii;
1655 }
1656 
1657 /*
1658  * Create a new named logint, on the specified phyint instance.
1659  */
1660 static struct logint *
1661 logint_create(struct phyint_instance *pii, char *name)
1662 {
1663 	struct logint *li;
1664 
1665 	if (debug & D_LOGINT) {
1666 		logdebug("logint_create(%s %s %s)\n",
1667 		    AF_STR(pii->pii_af), pii->pii_name, name);
1668 	}
1669 
1670 	li = calloc(1, sizeof (struct logint));
1671 	if (li == NULL) {
1672 		logperror("logint_create: calloc");
1673 		return (NULL);
1674 	}
1675 
1676 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1677 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1678 	logint_insert(pii, li);
1679 	return (li);
1680 }
1681 
1682 /*
1683  * Initialize the logint based on the data returned by the kernel.
1684  */
1685 void
1686 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1687 {
1688 	int	ifsock;
1689 	uint64_t flags;
1690 	uint64_t saved_flags;
1691 	struct	logint	*li;
1692 	struct lifreq	lifr;
1693 	struct in6_addr	test_subnet;
1694 	struct in6_addr	testaddr;
1695 	int	test_subnet_len;
1696 	struct sockaddr_in6	*sin6;
1697 	struct sockaddr_in	*sin;
1698 	char abuf[INET6_ADDRSTRLEN];
1699 	boolean_t  ptp = _B_FALSE;
1700 	struct in6_addr tgaddr;
1701 
1702 	if (debug & D_LOGINT) {
1703 		logdebug("logint_init_from_k(%s %s)\n",
1704 		    AF_STR(pii->pii_af), li_name);
1705 	}
1706 
1707 	/* Get the socket for doing ioctls */
1708 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1709 
1710 	/*
1711 	 * Get the flags from the kernel. Also serves as a check whether
1712 	 * the logical still exists. If it doesn't exist, no need to proceed
1713 	 * any further. li_in_use will make the caller clean up the logint
1714 	 */
1715 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1716 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1717 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1718 		/* Interface may have vanished */
1719 		if (errno != ENXIO) {
1720 			logperror_pii(pii, "logint_init_from_k: "
1721 			    "ioctl (get flags)");
1722 		}
1723 		return;
1724 	}
1725 
1726 	flags = lifr.lifr_flags;
1727 
1728 	/*
1729 	 * Verified the logint exists. Now lookup the logint in our tables.
1730 	 * If it does not exist, create a new logint.
1731 	 */
1732 	li = logint_lookup(pii, li_name);
1733 	if (li == NULL) {
1734 		li = logint_create(pii, li_name);
1735 		if (li == NULL) {
1736 			/*
1737 			 * Pretend the interface does not exist
1738 			 * in the kernel
1739 			 */
1740 			return;
1741 		}
1742 	}
1743 
1744 	/*
1745 	 * Update li->li_flags with the new flags, after saving the old
1746 	 * value. This is used later to check what flags has changed and
1747 	 * take any action
1748 	 */
1749 	saved_flags = li->li_flags;
1750 	li->li_flags = flags;
1751 
1752 	/*
1753 	 * Get the address, prefix, prefixlength and update the logint.
1754 	 * Check if anything has changed. If the logint used for the
1755 	 * test address has changed, take suitable action.
1756 	 */
1757 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1758 		/* Interface may have vanished */
1759 		if (errno != ENXIO) {
1760 			logperror_li(li, "logint_init_from_k: (get addr)");
1761 		}
1762 		goto error;
1763 	}
1764 
1765 	if (pii->pii_af == AF_INET) {
1766 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1767 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1768 	} else {
1769 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1770 		testaddr = sin6->sin6_addr;
1771 	}
1772 
1773 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1774 		/* Interface may have vanished */
1775 		if (errno != ENXIO)
1776 			logperror_li(li, "logint_init_from_k: (get subnet)");
1777 		goto error;
1778 	}
1779 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
1780 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1781 		test_subnet = sin6->sin6_addr;
1782 		test_subnet_len = lifr.lifr_addrlen;
1783 	} else {
1784 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1785 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1786 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
1787 	}
1788 
1789 	/*
1790 	 * If this is the logint corresponding to the test address used for
1791 	 * sending probes, then if anything significant has changed we need to
1792 	 * determine the test address again.  We ignore changes to the
1793 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1794 	 * course.
1795 	 */
1796 	if (pii->pii_probe_logint == li) {
1797 		if (((li->li_flags ^ saved_flags) &
1798 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1799 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1800 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1801 		    &li->li_subnet)) ||
1802 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1803 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1804 			/*
1805 			 * Something significant that affects the testaddress
1806 			 * has changed. Redo the testaddress selection later on
1807 			 * in select_test_ifs(). For now do the cleanup and
1808 			 * set pii_probe_logint to NULL.
1809 			 */
1810 			if (pii->pii_probe_sock != -1)
1811 				close_probe_socket(pii, _B_TRUE);
1812 			pii->pii_probe_logint = NULL;
1813 		}
1814 	}
1815 
1816 
1817 	/* Update the logint with the values obtained from the kernel.	*/
1818 	li->li_addr = testaddr;
1819 	li->li_in_use = 1;
1820 	if (ptp) {
1821 		li->li_dstaddr = tgaddr;
1822 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1823 		    IP_ABITS : IPV6_ABITS;
1824 	} else {
1825 		li->li_subnet = test_subnet;
1826 		li->li_subnet_len = test_subnet_len;
1827 	}
1828 
1829 	if (debug & D_LOGINT)
1830 		logint_print(li);
1831 
1832 	return;
1833 
1834 error:
1835 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1836 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1837 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1838 	logint_delete(li);
1839 }
1840 
1841 /*
1842  * Delete (unlink and free) a logint.
1843  */
1844 void
1845 logint_delete(struct logint *li)
1846 {
1847 	struct phyint_instance *pii;
1848 
1849 	pii = li->li_phyint_inst;
1850 	assert(pii != NULL);
1851 
1852 	if (debug & D_LOGINT) {
1853 		int af;
1854 		char abuf[INET6_ADDRSTRLEN];
1855 
1856 		af = pii->pii_af;
1857 		logdebug("logint_delete(%s %s %s/%u)\n",
1858 		    AF_STR(af), li->li_name,
1859 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1860 		    li->li_subnet_len);
1861 	}
1862 
1863 	/* logint must be in the list of logints */
1864 	assert(pii->pii_logint == li || li->li_prev != NULL);
1865 
1866 	/* Remove the logint from the list of logints  */
1867 	if (li->li_prev == NULL) {
1868 		/* logint is the 1st in the list */
1869 		pii->pii_logint = li->li_next;
1870 	} else {
1871 		li->li_prev->li_next = li->li_next;
1872 	}
1873 	if (li->li_next != NULL)
1874 		li->li_next->li_prev = li->li_prev;
1875 	li->li_next = NULL;
1876 	li->li_prev = NULL;
1877 
1878 	/*
1879 	 * If this logint is also being used for probing, then close the
1880 	 * associated socket, if it exists.
1881 	 */
1882 	if (pii->pii_probe_logint == li) {
1883 		if (pii->pii_probe_sock != -1)
1884 			close_probe_socket(pii, _B_TRUE);
1885 		pii->pii_probe_logint = NULL;
1886 	}
1887 
1888 	free(li);
1889 }
1890 
1891 static void
1892 logint_print(struct logint *li)
1893 {
1894 	char abuf[INET6_ADDRSTRLEN];
1895 	int af = li->li_phyint_inst->pii_af;
1896 
1897 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1898 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1899 
1900 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
1901 }
1902 
/*
 * Format `addr' as a printable string into the caller's buffer `abuf' of
 * `len' bytes and return `abuf'.  When `af' is AF_INET, `addr' is taken to
 * be a v4-mapped address and its embedded IPv4 address is printed; for any
 * other `af' it is printed as an IPv6 address.
 *
 * If the conversion fails (for example because `abuf' is too small), `abuf'
 * is set to the empty string so that callers, which pass the result straight
 * to a "%s" conversion, never read a buffer with unspecified contents.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	struct in_addr	addr_v4;
	const char	*res;

	if (af == AF_INET) {
		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
		res = inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
	} else {
		res = inet_ntop(AF_INET6, (void *)&addr, abuf, len);
	}

	/* inet_ntop() returns NULL on failure and leaves abuf unspecified. */
	if (res == NULL && len > 0)
		abuf[0] = '\0';

	return (abuf);
}
1916 
/*
 * Convert the IP address described by the [`af',`addr'] pair into socket
 * address form, storing it in the sockaddr_storage that `ssp' points to.
 * in.mpathd keeps every address internally as an in6_addr (IPv4 addresses
 * being v4-mapped); this routine lets us avoid exposing that
 * representation.  `af' must be AF_INET or AF_INET6.
 */
void
addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
{
	assert(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ssp;

		/* Only the sockaddr_in-sized prefix of *ssp is cleared. */
		(void) memset(v4, 0, sizeof (*v4));
		v4->sin_family = AF_INET;
		IN6_V4MAPPED_TO_INADDR(addr, &v4->sin_addr);
	} else if (af == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ssp;

		/* Only the sockaddr_in6-sized prefix of *ssp is cleared. */
		(void) memset(v6, 0, sizeof (*v6));
		v6->sin6_family = AF_INET6;
		v6->sin6_addr = *addr;
	}
}
1943 
1944 /* Lookup target on its address */
1945 struct target *
1946 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1947 {
1948 	struct target *tg;
1949 
1950 	if (debug & D_TARGET) {
1951 		char abuf[INET6_ADDRSTRLEN];
1952 
1953 		logdebug("target_lookup(%s %s): addr %s\n",
1954 		    AF_STR(pii->pii_af), pii->pii_name,
1955 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1956 	}
1957 
1958 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1959 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1960 			break;
1961 	}
1962 	return (tg);
1963 }
1964 
1965 /*
1966  * Find and return the next active target, for the next probe.
1967  * If no active targets are available, return NULL.
1968  */
1969 struct target *
1970 target_next(struct target *tg)
1971 {
1972 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1973 	struct	target	*marker = tg;
1974 	hrtime_t now;
1975 
1976 	now = gethrtime();
1977 
1978 	/*
1979 	 * Target must be in the list of targets for this phyint
1980 	 * instance.
1981 	 */
1982 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1983 	assert(pii->pii_targets != NULL);
1984 
1985 	/* Return the next active target */
1986 	do {
1987 		/*
1988 		 * Go to the next target. If we hit the end,
1989 		 * reset the ptr to the head
1990 		 */
1991 		tg = tg->tg_next;
1992 		if (tg == NULL)
1993 			tg = pii->pii_targets;
1994 
1995 		assert(TG_STATUS_VALID(tg->tg_status));
1996 
1997 		switch (tg->tg_status) {
1998 		case TG_ACTIVE:
1999 			return (tg);
2000 
2001 		case TG_UNUSED:
2002 			assert(pii->pii_targets_are_routers);
2003 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
2004 				/*
2005 				 * Bubble up the unused target to active
2006 				 */
2007 				tg->tg_status = TG_ACTIVE;
2008 				pii->pii_ntargets++;
2009 				return (tg);
2010 			}
2011 			break;
2012 
2013 		case TG_SLOW:
2014 			assert(pii->pii_targets_are_routers);
2015 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2016 				/*
2017 				 * Bubble up the slow target to unused
2018 				 */
2019 				tg->tg_status = TG_UNUSED;
2020 			}
2021 			break;
2022 
2023 		case TG_DEAD:
2024 			assert(pii->pii_targets_are_routers);
2025 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2026 				/*
2027 				 * Bubble up the dead target to slow
2028 				 */
2029 				tg->tg_status = TG_SLOW;
2030 				tg->tg_latime = now;
2031 			}
2032 			break;
2033 		}
2034 
2035 	} while (tg != marker);
2036 
2037 	return (NULL);
2038 }
2039 
2040 /*
2041  * Select the best available target, that is not already TG_ACTIVE,
2042  * for the caller. The caller will determine whether it wants to
2043  * make the returned target TG_ACTIVE.
2044  * The selection order is as follows.
2045  * 1. pick a TG_UNSED target, if it exists.
2046  * 2. else pick a TG_SLOW target that has recovered, if it exists
2047  * 3. else pick any TG_SLOW target, if it exists
2048  * 4. else pick a TG_DEAD target that has recovered, if it exists
2049  * 5. else pick any TG_DEAD target, if it exists
2050  * 6. else return null
2051  */
2052 static struct target *
2053 target_select_best(struct phyint_instance *pii)
2054 {
2055 	struct target *tg;
2056 	struct target *slow = NULL;
2057 	struct target *dead = NULL;
2058 	struct target *slow_recovered = NULL;
2059 	struct target *dead_recovered = NULL;
2060 	hrtime_t now;
2061 
2062 	now = gethrtime();
2063 
2064 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2065 		assert(TG_STATUS_VALID(tg->tg_status));
2066 
2067 		switch (tg->tg_status) {
2068 		case TG_UNUSED:
2069 			return (tg);
2070 
2071 		case TG_SLOW:
2072 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2073 				slow_recovered = tg;
2074 				/*
2075 				 * Promote the slow_recovered to unused
2076 				 */
2077 				tg->tg_status = TG_UNUSED;
2078 			} else {
2079 				slow = tg;
2080 			}
2081 			break;
2082 
2083 		case TG_DEAD:
2084 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2085 				dead_recovered = tg;
2086 				/*
2087 				 * Promote the dead_recovered to slow
2088 				 */
2089 				tg->tg_status = TG_SLOW;
2090 				tg->tg_latime = now;
2091 			} else {
2092 				dead = tg;
2093 			}
2094 			break;
2095 
2096 		default:
2097 			break;
2098 		}
2099 	}
2100 
2101 	if (slow_recovered != NULL)
2102 		return (slow_recovered);
2103 	else if (slow != NULL)
2104 		return (slow);
2105 	else if (dead_recovered != NULL)
2106 		return (dead_recovered);
2107 	else
2108 		return (dead);
2109 }
2110 
2111 /*
2112  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
2113  * that are active, pick the next best below.
2114  */
2115 static void
2116 target_activate_all(struct phyint_instance *pii)
2117 {
2118 	struct target *tg;
2119 
2120 	assert(pii->pii_ntargets == 0);
2121 	assert(pii->pii_target_next == NULL);
2122 	assert(pii->pii_rtt_target_next == NULL);
2123 	assert(pii->pii_targets_are_routers);
2124 
2125 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
2126 		tg = target_select_best(pii);
2127 		if (tg == NULL) {
2128 			/* We are out of targets */
2129 			return;
2130 		}
2131 
2132 		assert(TG_STATUS_VALID(tg->tg_status));
2133 		assert(tg->tg_status != TG_ACTIVE);
2134 		tg->tg_status = TG_ACTIVE;
2135 		pii->pii_ntargets++;
2136 		if (pii->pii_target_next == NULL) {
2137 			pii->pii_target_next = tg;
2138 			pii->pii_rtt_target_next = tg;
2139 		}
2140 	}
2141 }
2142 
2143 static struct target *
2144 target_first(struct phyint_instance *pii)
2145 {
2146 	struct target *tg;
2147 
2148 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2149 		assert(TG_STATUS_VALID(tg->tg_status));
2150 		if (tg->tg_status == TG_ACTIVE)
2151 			break;
2152 	}
2153 
2154 	return (tg);
2155 }
2156 
2157 /*
2158  * Create a default target entry.
2159  */
2160 void
2161 target_create(struct phyint_instance *pii, struct in6_addr addr,
2162     boolean_t is_router)
2163 {
2164 	struct target *tg;
2165 	struct phyint *pi;
2166 	struct logint *li;
2167 
2168 	if (debug & D_TARGET) {
2169 		char abuf[INET6_ADDRSTRLEN];
2170 
2171 		logdebug("target_create(%s %s, %s)\n",
2172 		    AF_STR(pii->pii_af), pii->pii_name,
2173 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2174 	}
2175 
2176 	/*
2177 	 * If the test address is not yet initialized, do not add
2178 	 * any target, since we cannot determine whether the target
2179 	 * belongs to the same subnet as the test address.
2180 	 */
2181 	li = pii->pii_probe_logint;
2182 	if (li == NULL)
2183 		return;
2184 
2185 	/*
2186 	 * If there are multiple subnets associated with an interface, then
2187 	 * add the target to this phyint instance only if it belongs to the
2188 	 * same subnet as the test address.  This assures us that we will
2189 	 * be able to reach this target through our routing table.
2190 	 */
2191 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
2192 		return;
2193 
2194 	if (pii->pii_targets != NULL) {
2195 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
2196 		if (is_router) {
2197 			if (!pii->pii_targets_are_routers) {
2198 				/*
2199 				 * Prefer router over hosts. Using hosts is a
2200 				 * fallback mechanism, hence delete all host
2201 				 * targets.
2202 				 */
2203 				while (pii->pii_targets != NULL)
2204 					target_delete(pii->pii_targets);
2205 			}
2206 		} else {
2207 			/*
2208 			 * Routers take precedence over hosts. If this
2209 			 * is a router list and we are trying to add a
2210 			 * host, just return. If this is a host list
2211 			 * and if we have sufficient targets, just return
2212 			 */
2213 			if (pii->pii_targets_are_routers ||
2214 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
2215 				return;
2216 		}
2217 	}
2218 
2219 	tg = calloc(1, sizeof (struct target));
2220 	if (tg == NULL) {
2221 		logperror("target_create: calloc");
2222 		return;
2223 	}
2224 
2225 	tg->tg_phyint_inst = pii;
2226 	tg->tg_address = addr;
2227 	tg->tg_in_use = 1;
2228 	tg->tg_rtt_sa = -1;
2229 	tg->tg_num_deferred = 0;
2230 
2231 	/*
2232 	 * If this is the first target, set 'pii_targets_are_routers'
2233 	 * The list of targets is either a list of hosts or list or
2234 	 * routers, but not a mix.
2235 	 */
2236 	if (pii->pii_targets == NULL) {
2237 		assert(pii->pii_ntargets == 0);
2238 		assert(pii->pii_target_next == NULL);
2239 		assert(pii->pii_rtt_target_next == NULL);
2240 		pii->pii_targets_are_routers = is_router ? 1 : 0;
2241 	}
2242 
2243 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
2244 		assert(pii->pii_targets_are_routers);
2245 		assert(pii->pii_target_next != NULL);
2246 		assert(pii->pii_rtt_target_next != NULL);
2247 		tg->tg_status = TG_UNUSED;
2248 	} else {
2249 		if (pii->pii_ntargets == 0) {
2250 			assert(pii->pii_target_next == NULL);
2251 			pii->pii_target_next = tg;
2252 			pii->pii_rtt_target_next = tg;
2253 		}
2254 		pii->pii_ntargets++;
2255 		tg->tg_status = TG_ACTIVE;
2256 	}
2257 
2258 	target_insert(pii, tg);
2259 
2260 	/*
2261 	 * Change state to PI_RUNNING if this phyint instance is capable of
2262 	 * sending and receiving probes -- that is, if we know of at least 1
2263 	 * target, and this phyint instance is probe-capable.  For more
2264 	 * details, see the phyint state diagram in mpd_probe.c.
2265 	 */
2266 	pi = pii->pii_phyint;
2267 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
2268 		if (pi->pi_flags & IFF_FAILED)
2269 			phyint_chstate(pi, PI_FAILED);
2270 		else
2271 			phyint_chstate(pi, PI_RUNNING);
2272 	}
2273 }
2274 
2275 /*
2276  * Add the target address named by `addr' to phyint instance `pii' if it does
2277  * not already exist.  If the target is a router, `is_router' should be set to
2278  * B_TRUE.
2279  */
2280 void
2281 target_add(struct phyint_instance *pii, struct in6_addr addr,
2282     boolean_t is_router)
2283 {
2284 	struct target *tg;
2285 
2286 	if (pii == NULL)
2287 		return;
2288 
2289 	tg = target_lookup(pii, addr);
2290 
2291 	/*
2292 	 * If the target does not exist, create it; target_create() will set
2293 	 * tg_in_use to true.  Even if it exists already, if it's a router
2294 	 * target and we'd previously learned of it through multicast, then we
2295 	 * need to recreate it as a router target.  Otherwise, just set
2296 	 * tg_in_use to to true so that init_router_targets() won't delete it.
2297 	 */
2298 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
2299 		target_create(pii, addr, is_router);
2300 	else if (is_router)
2301 		tg->tg_in_use = 1;
2302 }
2303 
2304 /*
2305  * Insert target at head of linked list of targets for the associated
2306  * phyint instance
2307  */
2308 static void
2309 target_insert(struct phyint_instance *pii, struct target *tg)
2310 {
2311 	tg->tg_next = pii->pii_targets;
2312 	tg->tg_prev = NULL;
2313 	if (tg->tg_next != NULL)
2314 		tg->tg_next->tg_prev = tg;
2315 	pii->pii_targets = tg;
2316 }
2317 
/*
 * Delete a target (unlink and free).
 *
 * Besides removing `tg' from its phyint instance's target list, this:
 *   - clears every reference to `tg' in the instance's probe records;
 *   - decrements pii_ntargets if `tg' was TG_ACTIVE;
 *   - repoints pii_target_next / pii_rtt_target_next at the first active
 *     target if they pointed at `tg';
 *   - for router lists left with no active target, tries to activate
 *     other targets via target_activate_all();
 *   - if the list ends up empty, clears the probe statistics and may move
 *     the phyint to PI_NOTARGETS.
 * `tg' must not be referenced by the caller after this returns.
 */
void
target_delete(struct target *tg)
{
	int af;
	struct phyint_instance	*pii;
	struct phyint_instance	*pii_other;

	pii = tg->tg_phyint_inst;
	af = pii->pii_af;

	if (debug & D_TARGET) {
		char abuf[INET6_ADDRSTRLEN];

		logdebug("target_delete(%s %s, %s)\n",
		    AF_STR(af), pii->pii_name,
		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
	}

	/*
	 * Target must be in the list of targets for this phyint
	 * instance.
	 */
	assert(pii->pii_targets == tg || tg->tg_prev != NULL);

	/*
	 * Reset all references to 'tg' in the probe information
	 * for this phyint.
	 */
	reset_pii_probes(pii, tg);

	/*
	 * Remove this target from the list of targets of this
	 * phyint instance.
	 */
	if (tg->tg_prev == NULL) {
		pii->pii_targets = tg->tg_next;
	} else {
		tg->tg_prev->tg_next = tg->tg_next;
	}

	if (tg->tg_next != NULL)
		tg->tg_next->tg_prev = tg->tg_prev;

	tg->tg_next = NULL;
	tg->tg_prev = NULL;

	/* Only active targets are counted in pii_ntargets. */
	if (tg->tg_status == TG_ACTIVE)
		pii->pii_ntargets--;

	/*
	 * Adjust the next target to probe, if it points to
	 * the currently deleted target.  `tg' is already unlinked, so
	 * target_first() cannot return it.
	 */
	if (pii->pii_target_next == tg)
		pii->pii_target_next = target_first(pii);

	if (pii->pii_rtt_target_next == tg)
		pii->pii_rtt_target_next = target_first(pii);

	free(tg);

	/*
	 * The number of active targets pii_ntargets == 0 iff
	 * the next active target pii->pii_target_next == NULL
	 */
	if (pii->pii_ntargets != 0) {
		assert(pii->pii_target_next != NULL);
		assert(pii->pii_rtt_target_next != NULL);
		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
		return;
	}

	/* At this point, we don't have any active targets. */
	assert(pii->pii_target_next == NULL);
	assert(pii->pii_rtt_target_next == NULL);

	if (pii->pii_targets_are_routers) {
		/*
		 * Activate any TG_SLOW or TG_DEAD router targets,
		 * since we don't have any other targets
		 */
		target_activate_all(pii);

		if (pii->pii_ntargets != 0) {
			assert(pii->pii_target_next != NULL);
			assert(pii->pii_rtt_target_next != NULL);
			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
			assert(pii->pii_rtt_target_next->tg_status ==
			    TG_ACTIVE);
			return;
		}
	}

	/*
	 * If we still don't have any active targets, the list must
	 * be really empty. There aren't even TG_SLOW or TG_DEAD
	 * targets. Zero out the probe stats since they will not be
	 * relevant any longer.
	 */
	assert(pii->pii_targets == NULL);
	pii->pii_targets_are_routers = _B_FALSE;
	clear_pii_probe_stats(pii);
	pii_other = phyint_inst_other(pii);

	/*
	 * If there are no targets on both instances and the interface would
	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
	 * since we cannot probe this phyint any more.  For more details,
	 * please see phyint state diagram in mpd_probe.c.
	 */
	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
	    pii->pii_phyint->pi_state != PI_OFFLINE)
		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
}
2436 
2437 /*
2438  * Flush the target list of every phyint in the group, if the list
2439  * is a host target list. This is called if group failure is suspected.
2440  * If all targets have failed, multicast will subsequently discover new
2441  * targets. Else it is a group failure.
2442  * Note: This function is a no-op if the list is a router target list.
2443  */
2444 static void
2445 target_flush_hosts(struct phyint_group *pg)
2446 {
2447 	struct phyint *pi;
2448 	struct phyint_instance *pii;
2449 
2450 	if (debug & D_TARGET)
2451 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2452 
2453 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2454 		pii = pi->pi_v4;
2455 		if (pii != NULL && !pii->pii_targets_are_routers) {
2456 			/*
2457 			 * Delete all the targets. When the list becomes
2458 			 * empty, target_delete() will set pii->pii_targets
2459 			 * to NULL.
2460 			 */
2461 			while (pii->pii_targets != NULL)
2462 				target_delete(pii->pii_targets);
2463 		}
2464 		pii = pi->pi_v6;
2465 		if (pii != NULL && !pii->pii_targets_are_routers) {
2466 			/*
2467 			 * Delete all the targets. When the list becomes
2468 			 * empty, target_delete() will set pii->pii_targets
2469 			 * to NULL.
2470 			 */
2471 			while (pii->pii_targets != NULL)
2472 				target_delete(pii->pii_targets);
2473 		}
2474 	}
2475 }
2476 
2477 /*
2478  * Reset all references to 'target' in the probe info, as this target is
2479  * being deleted. The pr_target field is guaranteed to be non-null if
2480  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2481  * pr_target will not be accessed unconditionally.
2482  */
2483 static void
2484 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2485 {
2486 	int i;
2487 
2488 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2489 		if (pii->pii_probes[i].pr_target == tg) {
2490 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
2491 				probe_chstate(&pii->pii_probes[i], pii,
2492 				    PR_LOST);
2493 			}
2494 			pii->pii_probes[i].pr_target = NULL;
2495 		}
2496 	}
2497 
2498 }
2499 
2500 /*
2501  * Clear the probe statistics array.
2502  */
2503 void
2504 clear_pii_probe_stats(struct phyint_instance *pii)
2505 {
2506 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2507 	/* Reset the next probe index in the probe stats array */
2508 	pii->pii_probe_next = 0;
2509 }
2510 
2511 static void
2512 target_print(struct target *tg)
2513 {
2514 	char	abuf[INET6_ADDRSTRLEN];
2515 	char	buf[128];
2516 	char	buf2[128];
2517 	int	af;
2518 	int	i;
2519 
2520 	af = tg->tg_phyint_inst->pii_af;
2521 
2522 	logdebug("Target on %s %s addr %s\n"
2523 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
2524 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2525 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2526 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2527 	    tg->tg_crtt, tg->tg_in_use);
2528 
2529 	buf[0] = '\0';
2530 	for (i = 0; i < tg->tg_num_deferred; i++) {
2531 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2532 		    tg->tg_deferred[i]);
2533 		(void) strlcat(buf, buf2, sizeof (buf));
2534 	}
2535 	logdebug("deferred rtts:%s\n", buf);
2536 }
2537 
2538 void
2539 phyint_inst_print_all(void)
2540 {
2541 	struct phyint_instance *pii;
2542 
2543 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2544 		phyint_inst_print(pii);
2545 	}
2546 }
2547 
2548 /*
2549  * Compare two prefixes that have the same prefix length.
2550  * Fails if the prefix length is unreasonable.
2551  */
2552 boolean_t
2553 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
2554 {
2555 	uchar_t mask;
2556 	int j;
2557 
2558 	if (prefix_len > IPV6_ABITS)
2559 		return (_B_FALSE);
2560 
2561 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2562 		if (p1.s6_addr[j] != p2.s6_addr[j])
2563 			return (_B_FALSE);
2564 
2565 	/* Make the N leftmost bits one */
2566 	mask = 0xff << (8 - prefix_len);
2567 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2568 		return (_B_FALSE);
2569 
2570 	return (_B_TRUE);
2571 }
2572 
2573 /*
2574  * Get the number of UP logints on phyint `pi'.
2575  */
2576 static int
2577 logint_upcount(struct phyint *pi)
2578 {
2579 	struct	logint	*li;
2580 	int count = 0;
2581 
2582 	if (pi->pi_v4 != NULL) {
2583 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
2584 			if (li->li_flags & IFF_UP)
2585 				count++;
2586 		}
2587 	}
2588 
2589 	if (pi->pi_v6 != NULL) {
2590 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
2591 			if (li->li_flags & IFF_UP)
2592 				count++;
2593 		}
2594 	}
2595 
2596 	return (count);
2597 }
2598 
2599 /*
2600  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2601  */
2602 struct phyint_instance *
2603 phyint_inst_other(struct phyint_instance *pii)
2604 {
2605 	if (pii->pii_af == AF_INET)
2606 		return (pii->pii_phyint->pi_v6);
2607 	else
2608 		return (pii->pii_phyint->pi_v4);
2609 }
2610 
2611 /*
2612  * Check whether a phyint is functioning.
2613  */
2614 static boolean_t
2615 phyint_is_functioning(struct phyint *pi)
2616 {
2617 	if (pi->pi_state == PI_RUNNING)
2618 		return (_B_TRUE);
2619 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
2620 }
2621 
2622 /*
2623  * Check whether a phyint is usable.
2624  */
2625 static boolean_t
2626 phyint_is_usable(struct phyint *pi)
2627 {
2628 	if (logint_upcount(pi) == 0)
2629 		return (_B_FALSE);
2630 	return (phyint_is_functioning(pi));
2631 }
2632 
2633 /*
2634  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2635  * Before sending the event, it prepends the current version of the IPMP
2636  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2637  * `nvl' is freed).
2638  */
2639 static int
2640 post_event(const char *subclass, nvlist_t *nvl)
2641 {
2642 	static evchan_t *evchp = NULL;
2643 
2644 	/*
2645 	 * Initialize the event channel if we haven't already done so.
2646 	 */
2647 	if (evchp == NULL) {
2648 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
2649 		if (errno != 0) {
2650 			logerr("cannot create event channel `%s': %s\n",
2651 			    IPMP_EVENT_CHAN, strerror(errno));
2652 			goto failed;
2653 		}
2654 	}
2655 
2656 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2657 	    IPMP_EVENT_CUR_VERSION);
2658 	if (errno != 0) {
2659 		logerr("cannot create `%s' event: %s", subclass,
2660 		    strerror(errno));
2661 		goto failed;
2662 	}
2663 
2664 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
2665 	    "in.mpathd", nvl, EVCH_NOSLEEP);
2666 	if (errno != 0) {
2667 		logerr("cannot send `%s' event: %s\n", subclass,
2668 		    strerror(errno));
2669 		goto failed;
2670 	}
2671 
2672 	nvlist_free(nvl);
2673 	return (0);
2674 failed:
2675 	nvlist_free(nvl);
2676 	return (-1);
2677 }
2678 
2679 /*
2680  * Return the external IPMP state associated with phyint `pi'.
2681  */
2682 static ipmp_if_state_t
2683 ifstate(struct phyint *pi)
2684 {
2685 	switch (pi->pi_state) {
2686 	case PI_NOTARGETS:
2687 		if (pi->pi_flags & IFF_FAILED)
2688 			return (IPMP_IF_FAILED);
2689 		return (IPMP_IF_UNKNOWN);
2690 
2691 	case PI_OFFLINE:
2692 		return (IPMP_IF_OFFLINE);
2693 
2694 	case PI_FAILED:
2695 		return (IPMP_IF_FAILED);
2696 
2697 	case PI_RUNNING:
2698 		return (IPMP_IF_OK);
2699 	}
2700 
2701 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2702 	abort();
2703 	/* NOTREACHED */
2704 }
2705 
2706 /*
2707  * Return the external IPMP interface type associated with phyint `pi'.
2708  */
2709 static ipmp_if_type_t
2710 iftype(struct phyint *pi)
2711 {
2712 	if (pi->pi_flags & IFF_STANDBY)
2713 		return (IPMP_IF_STANDBY);
2714 	else
2715 		return (IPMP_IF_NORMAL);
2716 }
2717 
2718 /*
2719  * Return the external IPMP link state associated with phyint `pi'.
2720  */
2721 static ipmp_if_linkstate_t
2722 iflinkstate(struct phyint *pi)
2723 {
2724 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
2725 		return (IPMP_LINK_UNKNOWN);
2726 
2727 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
2728 }
2729 
2730 /*
2731  * Return the external IPMP probe state associated with phyint `pi'.
2732  */
2733 static ipmp_if_probestate_t
2734 ifprobestate(struct phyint *pi)
2735 {
2736 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
2737 		return (IPMP_PROBE_DISABLED);
2738 
2739 	if (pi->pi_state == PI_FAILED)
2740 		return (IPMP_PROBE_FAILED);
2741 
2742 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
2743 		return (IPMP_PROBE_UNKNOWN);
2744 
2745 	return (IPMP_PROBE_OK);
2746 }
2747 
2748 /*
2749  * Return the external IPMP target mode associated with phyint instance `pii'.
2750  */
2751 static ipmp_if_targmode_t
2752 iftargmode(struct phyint_instance *pii)
2753 {
2754 	if (!PROBE_ENABLED(pii))
2755 		return (IPMP_TARG_DISABLED);
2756 	else if (pii->pii_targets_are_routers)
2757 		return (IPMP_TARG_ROUTES);
2758 	else
2759 		return (IPMP_TARG_MULTICAST);
2760 }
2761 
2762 /*
2763  * Return the external IPMP flags associated with phyint `pi'.
2764  */
2765 static ipmp_if_flags_t
2766 ifflags(struct phyint *pi)
2767 {
2768 	ipmp_if_flags_t flags = 0;
2769 
2770 	if (logint_upcount(pi) == 0)
2771 		flags |= IPMP_IFFLAG_DOWN;
2772 	if (pi->pi_flags & IFF_INACTIVE)
2773 		flags |= IPMP_IFFLAG_INACTIVE;
2774 	if (pi->pi_hwaddrdup)
2775 		flags |= IPMP_IFFLAG_HWADDRDUP;
2776 	if (phyint_is_functioning(pi) && flags == 0)
2777 		flags |= IPMP_IFFLAG_ACTIVE;
2778 
2779 	return (flags);
2780 }
2781 
2782 /*
2783  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
2784  * no test address, 0.0.0.0 is stored.
2785  */
2786 static struct sockaddr_storage *
2787 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
2788 {
2789 	if (PROBE_ENABLED(pii))
2790 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
2791 	else
2792 		addr2storage(AF_INET6, &in6addr_any, ssp);
2793 
2794 	return (ssp);
2795 }
2796 
2797 /*
2798  * Return the external IPMP group state associated with phyint group `pg'.
2799  */
2800 static ipmp_group_state_t
2801 groupstate(struct phyint_group *pg)
2802 {
2803 	switch (pg->pg_state) {
2804 	case PG_FAILED:
2805 		return (IPMP_GROUP_FAILED);
2806 	case PG_DEGRADED:
2807 		return (IPMP_GROUP_DEGRADED);
2808 	case PG_OK:
2809 		return (IPMP_GROUP_OK);
2810 	}
2811 
2812 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
2813 	abort();
2814 	/* NOTREACHED */
2815 }
2816 
2817 /*
2818  * Return the external IPMP probe state associated with probe `ps'.
2819  */
2820 static ipmp_probe_state_t
2821 probestate(struct probe_stats *ps)
2822 {
2823 	switch (ps->pr_status) {
2824 	case PR_UNUSED:
2825 	case PR_LOST:
2826 		return (IPMP_PROBE_LOST);
2827 	case PR_UNACKED:
2828 		return (IPMP_PROBE_SENT);
2829 	case PR_ACKED:
2830 		return (IPMP_PROBE_ACKED);
2831 	}
2832 
2833 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
2834 	abort();
2835 	/* NOTREACHED */
2836 }
2837 
2838 /*
2839  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
2840  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
2841  */
2842 int
2843 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
2844 {
2845 	nvlist_t *nvl;
2846 	hrtime_t proc_time = 0, recv_time = 0;
2847 	struct sockaddr_storage ss;
2848 	struct target *tg = pr->pr_target;
2849 
2850 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2851 	if (errno != 0) {
2852 		logperror("cannot create `interface change' event");
2853 		return (-1);
2854 	}
2855 
2856 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
2857 	if (errno != 0)
2858 		goto failed;
2859 
2860 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
2861 	if (errno != 0)
2862 		goto failed;
2863 
2864 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
2865 	if (errno != 0)
2866 		goto failed;
2867 
2868 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
2869 	    pr->pr_hrtime_start);
2870 	if (errno != 0)
2871 		goto failed;
2872 
2873 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
2874 	    pr->pr_hrtime_sent);
2875 	if (errno != 0)
2876 		goto failed;
2877 
2878 	if (pr->pr_status == PR_ACKED) {
2879 		recv_time = pr->pr_hrtime_ackrecv;
2880 		proc_time = pr->pr_hrtime_ackproc;
2881 	}
2882 
2883 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
2884 	if (errno != 0)
2885 		goto failed;
2886 
2887 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
2888 	if (errno != 0)
2889 		goto failed;
2890 
2891 	if (tg != NULL)
2892 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
2893 	else
2894 		addr2storage(pii->pii_af, &in6addr_any, &ss);
2895 
2896 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
2897 	    sizeof (ss));
2898 	if (errno != 0)
2899 		goto failed;
2900 
2901 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG,
2902 	    tg->tg_rtt_sa / 8);
2903 	if (errno != 0)
2904 		goto failed;
2905 
2906 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV,
2907 	    tg->tg_rtt_sd / 4);
2908 	if (errno != 0)
2909 		goto failed;
2910 
2911 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
2912 failed:
2913 	logperror("cannot create `probe state' event");
2914 	nvlist_free(nvl);
2915 	return (-1);
2916 }
2917 
2918 /*
2919  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2920  * Returns 0 on success, -1 on failure.
2921  */
2922 static int
2923 phyint_group_state_event(struct phyint_group *pg)
2924 {
2925 	nvlist_t	*nvl;
2926 
2927 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2928 	if (errno != 0) {
2929 		logperror("cannot create `group state change' event");
2930 		return (-1);
2931 	}
2932 
2933 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2934 	if (errno != 0)
2935 		goto failed;
2936 
2937 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2938 	if (errno != 0)
2939 		goto failed;
2940 
2941 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2942 	if (errno != 0)
2943 		goto failed;
2944 
2945 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2946 failed:
2947 	logperror("cannot create `group state change' event");
2948 	nvlist_free(nvl);
2949 	return (-1);
2950 }
2951 
2952 /*
2953  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2954  * `pg'.  Returns 0 on success, -1 on failure.
2955  */
2956 static int
2957 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2958 {
2959 	nvlist_t *nvl;
2960 
2961 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2962 	if (errno != 0) {
2963 		logperror("cannot create `group change' event");
2964 		return (-1);
2965 	}
2966 
2967 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2968 	if (errno != 0)
2969 		goto failed;
2970 
2971 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2972 	if (errno != 0)
2973 		goto failed;
2974 
2975 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2976 	    phyint_grouplistsig);
2977 	if (errno != 0)
2978 		goto failed;
2979 
2980 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2981 	if (errno != 0)
2982 		goto failed;
2983 
2984 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2985 failed:
2986 	logperror("cannot create `group change' event");
2987 	nvlist_free(nvl);
2988 	return (-1);
2989 }
2990 
2991 /*
2992  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2993  * group `pg'.	Returns 0 on success, -1 on failure.
2994  */
2995 static int
2996 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
2997     ipmp_if_op_t op)
2998 {
2999 	nvlist_t *nvl;
3000 
3001 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3002 	if (errno != 0) {
3003 		logperror("cannot create `group member change' event");
3004 		return (-1);
3005 	}
3006 
3007 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3008 	if (errno != 0)
3009 		goto failed;
3010 
3011 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3012 	if (errno != 0)
3013 		goto failed;
3014 
3015 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
3016 	if (errno != 0)
3017 		goto failed;
3018 
3019 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3020 	if (errno != 0)
3021 		goto failed;
3022 
3023 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3024 	if (errno != 0)
3025 		goto failed;
3026 
3027 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3028 	if (errno != 0)
3029 		goto failed;
3030 
3031 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
3032 failed:
3033 	logperror("cannot create `group member change' event");
3034 	nvlist_free(nvl);
3035 	return (-1);
3036 
3037 }
3038 
3039 /*
3040  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
3041  * Returns 0 on success, -1 on failure.
3042  */
3043 static int
3044 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
3045 {
3046 	nvlist_t *nvl;
3047 
3048 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3049 	if (errno != 0) {
3050 		logperror("cannot create `interface change' event");
3051 		return (-1);
3052 	}
3053 
3054 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3055 	if (errno != 0)
3056 		goto failed;
3057 
3058 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3059 	if (errno != 0)
3060 		goto failed;
3061 
3062 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3063 	if (errno != 0)
3064 		goto failed;
3065 
3066 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3067 	if (errno != 0)
3068 		goto failed;
3069 
3070 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3071 	if (errno != 0)
3072 		goto failed;
3073 
3074 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
3075 failed:
3076 	logperror("cannot create `interface change' event");
3077 	nvlist_free(nvl);
3078 	return (-1);
3079 
3080 }
3081 
/*
 * Generate a fresh signature.  A signature is conceptually split into a
 * random 16-bit "generation number" (the top 16 bits) and a 48-bit
 * monotonically increasing integer (the low 48 bits), which this function
 * starts at 1.  The generation number protects against stale updates to
 * entities (e.g., IPMP groups) that have been deleted and since recreated.
 * The random number generator is seeded from gethrtime() on first use.
 */
static uint64_t
gensig(void)
{
	static int seeded = 0;
	uint64_t generation;

	if (!seeded) {
		srand48((long)gethrtime());
		seeded = 1;
	}

	/* Only the low 16 bits of the random value survive the shift */
	generation = (uint64_t)lrand48() << 48;
	return (generation | 1);
}
3101 
3102 /*
3103  * Store the information associated with group `grname' into a dynamically
3104  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
3105  */
3106 unsigned int
3107 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
3108 {
3109 	struct phyint		*pi;
3110 	struct phyint_group	*pg;
3111 	char			(*ifs)[LIFNAMSIZ];
3112 	unsigned int		i, j;
3113 	unsigned int		nif = 0, naddr = 0;
3114 	lifgroupinfo_t		lifgr;
3115 	addrlist_t		*addrp;
3116 	struct sockaddr_storage	*addrs;
3117 	int			fdt = 0;
3118 
3119 	pg = phyint_group_lookup(grname);
3120 	if (pg == NULL)
3121 		return (IPMP_EUNKGROUP);
3122 
3123 	/*
3124 	 * Tally up the number of interfaces, allocate an array to hold them,
3125 	 * and insert their names into the array.  While we're at it, if any
3126 	 * interface is actually enabled to send probes, save the group fdt.
3127 	 */
3128 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
3129 		nif++;
3130 
3131 	ifs = alloca(nif * sizeof (*ifs));
3132 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
3133 		assert(i < nif);
3134 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
3135 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
3136 			fdt = pg->pg_fdt;
3137 	}
3138 	assert(i == nif);
3139 
3140 	/*
3141 	 * If this is the anonymous group, there's no other information to
3142 	 * collect (since there's no IPMP interface).
3143 	 */
3144 	if (pg == phyint_anongroup) {
3145 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3146 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
3147 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3148 	}
3149 
3150 	/*
3151 	 * Grab some additional information about the group from the kernel.
3152 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
3153 	 * we can use ifsock_v4 even for a V6-only group.)
3154 	 */
3155 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
3156 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
3157 		if (errno == ENOENT)
3158 			return (IPMP_EUNKGROUP);
3159 
3160 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
3161 		return (IPMP_FAILURE);
3162 	}
3163 
3164 	/*
3165 	 * Tally up the number of data addresses, allocate an array to hold
3166 	 * them, and insert their values into the array.
3167 	 */
3168 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
3169 		naddr++;
3170 
3171 	addrs = alloca(naddr * sizeof (*addrs));
3172 	i = 0;
3173 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3174 		/*
3175 		 * It's possible to have duplicate addresses (if some are
3176 		 * down).  Weed the dups out to avoid confusing consumers.
3177 		 * (If groups start having tons of addresses, we'll need a
3178 		 * better algorithm here.)
3179 		 */
3180 		for (j = 0; j < i; j++) {
3181 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
3182 				break;
3183 		}
3184 		if (j == i) {
3185 			assert(i < naddr);
3186 			addrs[i++] = addrp->al_addr;
3187 		}
3188 	}
3189 	naddr = i;
3190 
3191 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3192 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
3193 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
3194 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3195 }
3196 
3197 /*
3198  * Store the target information associated with phyint instance `pii' into a
3199  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
3200  * IPMP error code.
3201  */
3202 unsigned int
3203 gettarginfo(struct phyint_instance *pii, const char *name,
3204     ipmp_targinfo_t **targinfopp)
3205 {
3206 	uint_t ntarg = 0;
3207 	struct target *tg;
3208 	struct sockaddr_storage	ss;
3209 	struct sockaddr_storage *targs = NULL;
3210 
3211 	if (PROBE_CAPABLE(pii)) {
3212 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
3213 		tg = pii->pii_target_next;
3214 		do {
3215 			if (tg->tg_status == TG_ACTIVE) {
3216 				assert(ntarg < pii->pii_ntargets);
3217 				addr2storage(pii->pii_af, &tg->tg_address,
3218 				    &targs[ntarg++]);
3219 			}
3220 			if ((tg = tg->tg_next) == NULL)
3221 				tg = pii->pii_targets;
3222 		} while (tg != pii->pii_target_next);
3223 
3224 		assert(ntarg == pii->pii_ntargets);
3225 	}
3226 
3227 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
3228 	    iftargmode(pii), ntarg, targs);
3229 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3230 }
3231 
3232 /*
3233  * Store the information associated with interface `ifname' into a dynamically
3234  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
3235  */
3236 unsigned int
3237 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
3238 {
3239 	int		retval;
3240 	struct phyint	*pi;
3241 	ipmp_targinfo_t	*targinfo4;
3242 	ipmp_targinfo_t	*targinfo6;
3243 
3244 	pi = phyint_lookup(ifname);
3245 	if (pi == NULL)
3246 		return (IPMP_EUNKIF);
3247 
3248 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
3249 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
3250 		goto out;
3251 
3252 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
3253 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
3254 	    ifflags(pi), targinfo4, targinfo6);
3255 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3256 out:
3257 	if (targinfo4 != NULL)
3258 		ipmp_freetarginfo(targinfo4);
3259 	if (targinfo6 != NULL)
3260 		ipmp_freetarginfo(targinfo6);
3261 	return (retval);
3262 }
3263 
3264 /*
3265  * Store the current list of IPMP groups into a dynamically allocated
3266  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
3267  */
3268 unsigned int
3269 getgrouplist(ipmp_grouplist_t **grlistpp)
3270 {
3271 	struct phyint_group	*pg;
3272 	char			(*groups)[LIFGRNAMSIZ];
3273 	unsigned int		i, ngroup;
3274 
3275 	/*
3276 	 * Tally up the number of groups, allocate an array to hold them, and
3277 	 * insert their names into the array.
3278 	 */
3279 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
3280 		ngroup++;
3281 
3282 	groups = alloca(ngroup * sizeof (*groups));
3283 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
3284 		assert(i < ngroup);
3285 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
3286 	}
3287 	assert(i == ngroup);
3288 
3289 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
3290 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3291 }
3292 
3293 /*
3294  * Store the address information for `ssp' (in group `grname') into a
3295  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
3296  * error code.  (We'd call this function getaddrinfo(), but it would conflict
3297  * with getaddrinfo(3SOCKET)).
3298  */
3299 unsigned int
3300 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
3301     ipmp_addrinfo_t **adinfopp)
3302 {
3303 	int ifsock;
3304 	addrlist_t *addrp, *addrmatchp = NULL;
3305 	ipmp_addr_state_t state;
3306 	const char *binding = "";
3307 	struct lifreq lifr;
3308 	struct phyint_group *pg;
3309 
3310 	if ((pg = phyint_group_lookup(grname)) == NULL)
3311 		return (IPMP_EUNKADDR);
3312 
3313 	/*
3314 	 * Walk through the data addresses, and find a match.  Note that since
3315 	 * some of the addresses may be down, more than one may match.  We
3316 	 * prefer an up address (if one exists).
3317 	 */
3318 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3319 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
3320 			addrmatchp = addrp;
3321 			if (addrmatchp->al_flags & IFF_UP)
3322 				break;
3323 		}
3324 	}
3325 
3326 	if (addrmatchp == NULL)
3327 		return (IPMP_EUNKADDR);
3328 
3329 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
3330 	if (state == IPMP_ADDR_UP) {
3331 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
3332 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
3333 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
3334 			binding = lifr.lifr_binding;
3335 	}
3336 
3337 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
3338 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3339 }
3340 
3341 /*
3342  * Store a snapshot of the IPMP subsystem into a dynamically allocated
3343  * structure pointed to by `*snapp'.  Returns an IPMP error code.
3344  */
3345 unsigned int
3346 getsnap(ipmp_snap_t **snapp)
3347 {
3348 	ipmp_grouplist_t	*grlistp;
3349 	ipmp_groupinfo_t	*grinfop;
3350 	ipmp_addrinfo_t		*adinfop;
3351 	ipmp_addrlist_t		*adlistp;
3352 	ipmp_ifinfo_t		*ifinfop;
3353 	ipmp_snap_t		*snap;
3354 	struct phyint		*pi;
3355 	unsigned int		i, j;
3356 	int			retval;
3357 
3358 	snap = ipmp_snap_create();
3359 	if (snap == NULL)
3360 		return (IPMP_ENOMEM);
3361 
3362 	/*
3363 	 * Add group list.
3364 	 */
3365 	retval = getgrouplist(&snap->sn_grlistp);
3366 	if (retval != IPMP_SUCCESS)
3367 		goto failed;
3368 
3369 	/*
3370 	 * Add information for each group in the list, along with all of its
3371 	 * data addresses.
3372 	 */
3373 	grlistp = snap->sn_grlistp;
3374 	for (i = 0; i < grlistp->gl_ngroup; i++) {
3375 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
3376 		if (retval != IPMP_SUCCESS)
3377 			goto failed;
3378 
3379 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
3380 		if (retval != IPMP_SUCCESS) {
3381 			ipmp_freegroupinfo(grinfop);
3382 			goto failed;
3383 		}
3384 
3385 		adlistp = grinfop->gr_adlistp;
3386 		for (j = 0; j < adlistp->al_naddr; j++) {
3387 			retval = getgraddrinfo(grinfop->gr_name,
3388 			    &adlistp->al_addrs[j], &adinfop);
3389 			if (retval != IPMP_SUCCESS)
3390 				goto failed;
3391 
3392 			retval = ipmp_snap_addaddrinfo(snap, adinfop);
3393 			if (retval != IPMP_SUCCESS) {
3394 				ipmp_freeaddrinfo(adinfop);
3395 				goto failed;
3396 			}
3397 		}
3398 	}
3399 
3400 	/*
3401 	 * Add information for each configured phyint.
3402 	 */
3403 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
3404 		retval = getifinfo(pi->pi_name, &ifinfop);
3405 		if (retval != IPMP_SUCCESS)
3406 			goto failed;
3407 
3408 		retval = ipmp_snap_addifinfo(snap, ifinfop);
3409 		if (retval != IPMP_SUCCESS) {
3410 			ipmp_freeifinfo(ifinfop);
3411 			goto failed;
3412 		}
3413 	}
3414 
3415 	*snapp = snap;
3416 	return (IPMP_SUCCESS);
3417 failed:
3418 	ipmp_snap_free(snap);
3419 	return (retval);
3420 }
3421