xref: /titanic_41/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision ef4d27fba69298571e509867dd27ea8bca349ec9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
/*
 * Heads of the global lists of phyints, phyint instances and phyint groups.
 * phyint_anongroup refers to the anonymous group; phyint_init() creates it
 * when track_all_phyints is set.
 */
struct phyint		*phyints = NULL;
struct phyint_instance	*phyint_instances = NULL;
struct phyint_group	*phyint_groups = NULL;
struct phyint_group	*phyint_anongroup = NULL;

/*
 * Signature of the group list.  It is seeded in phyint_init() and bumped
 * by phyint_group_insert() whenever a group is added to the list.
 */
static uint64_t		phyint_grouplistsig = 0;
42 
43 static void phyint_inst_insert(struct phyint_instance *pii);
44 static void phyint_inst_print(struct phyint_instance *pii);
45 
46 static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
47 static void phyint_delete(struct phyint *pi);
48 static boolean_t phyint_is_usable(struct phyint *pi);
49 
50 static void logint_print(struct logint *li);
51 static void logint_insert(struct phyint_instance *pii, struct logint *li);
52 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);
53 
54 static void target_print(struct target *tg);
55 static void target_insert(struct phyint_instance *pii, struct target *tg);
56 static struct target *target_first(struct phyint_instance *pii);
57 static struct target *target_select_best(struct phyint_instance *pii);
58 static void target_flush_hosts(struct phyint_group *pg);
59 
60 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
61 
62 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
63 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
64 
65 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
66 static int phyint_group_state_event(struct phyint_group *pg);
67 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
68 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
69     ipmp_if_op_t op);
70 
71 static int logint_upcount(struct phyint *pi);
72 static uint64_t gensig(void);
73 
74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
75 int
76 phyint_init(void)
77 {
78 	phyint_grouplistsig = gensig();
79 	if (track_all_phyints) {
80 		phyint_anongroup = phyint_group_create("");
81 		if (phyint_anongroup == NULL)
82 			return (-1);
83 		phyint_group_insert(phyint_anongroup);
84 	}
85 	return (0);
86 }
87 
88 /* Return the phyint with the given name */
89 struct phyint *
90 phyint_lookup(const char *name)
91 {
92 	struct phyint *pi;
93 
94 	if (debug & D_PHYINT)
95 		logdebug("phyint_lookup(%s)\n", name);
96 
97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
99 			break;
100 	}
101 	return (pi);
102 }
103 
104 /*
105  * Lookup a phyint in the group that has the same hardware address as `pi', or
106  * NULL if there's none.  If `online_only' is set, then only online phyints
107  * are considered when matching.  Otherwise, phyints that had been offlined
108  * due to a duplicate hardware address will also be considered.
109  */
110 static struct phyint *
111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
112 {
113 	struct phyint *pi2;
114 
115 	if (pi->pi_group == phyint_anongroup)
116 		return (NULL);
117 
118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
119 		if (pi2 == pi)
120 			continue;
121 
122 		/*
123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
124 		 * that are administratively offline (rather than offline
125 		 * because they're dups); when they're brought back online,
126 		 * they'll be flagged as dups if need be.
127 		 */
128 		if (pi2->pi_state == PI_OFFLINE &&
129 		    (online_only || !pi2->pi_hwaddrdup))
130 			continue;
131 
132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
134 			return (pi2);
135 	}
136 	return (NULL);
137 }
138 
139 /*
140  * Respond to DLPI notifications.  Currently, this only processes physical
141  * address changes for the phyint passed via `arg' by onlining or offlining
142  * phyints in the group.
143  */
144 /* ARGSUSED */
145 static void
146 phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
147 {
148 	struct phyint *pi = arg;
149 	struct phyint *oduppi = NULL, *duppi = NULL;
150 
151 	assert((dnip->dni_note & pi->pi_notes) != 0);
152 
153 	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
154 		return;
155 
156 	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
157 
158 	/*
159 	 * If our hardware address hasn't changed, there's nothing to do.
160 	 */
161 	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
162 	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
163 		return;
164 
165 	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
166 	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
167 	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
168 	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
169 
170 	if (oduppi != NULL || pi->pi_hwaddrdup) {
171 		/*
172 		 * Our old hardware address was a duplicate.  If we'd been
173 		 * offlined because of it, and our new hardware address is not
174 		 * a duplicate, then bring us online.  Otherwise, `oduppi'
175 		 * must've been the one brought offline; bring it online.
176 		 */
177 		if (pi->pi_hwaddrdup) {
178 			if (duppi == NULL)
179 				(void) phyint_undo_offline(pi);
180 		} else {
181 			assert(oduppi->pi_hwaddrdup);
182 			(void) phyint_undo_offline(oduppi);
183 		}
184 	}
185 
186 	if (duppi != NULL && !pi->pi_hwaddrdup) {
187 		/*
188 		 * Our new hardware address was a duplicate and we're not
189 		 * yet flagged as a duplicate; bring us offline.
190 		 */
191 		pi->pi_hwaddrdup = _B_TRUE;
192 		(void) phyint_offline(pi, 0);
193 	}
194 }
195 
196 /*
197  * Initialize information about the underlying link for `pi', and set us
198  * up to be notified about future changes.  Returns _B_TRUE on success.
199  */
200 boolean_t
201 phyint_link_init(struct phyint *pi)
202 {
203 	int retval;
204 	uint_t notes;
205 	const char *errmsg;
206 	dlpi_notifyid_t id;
207 
208 	pi->pi_notes = 0;
209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
210 	if (retval != DLPI_SUCCESS) {
211 		pi->pi_dh = NULL;
212 		errmsg = "cannot open";
213 		goto failed;
214 	}
215 
216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
218 	    &pi->pi_hwaddrlen);
219 	if (retval != DLPI_SUCCESS) {
220 		errmsg = "cannot get hardware address";
221 		goto failed;
222 	}
223 
224 	/*
225 	 * Check if the link supports DLPI link state notifications.  For
226 	 * historical reasons, the actual changes are tracked through routing
227 	 * sockets, so we immediately disable the notification upon success.
228 	 */
229 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
230 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
231 	if (retval == DLPI_SUCCESS) {
232 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
233 		pi->pi_notes |= notes;
234 	}
235 
236 	/*
237 	 * Enable notification of hardware address changes to keep pi_hwaddr
238 	 * up-to-date and track if we need to offline/undo-offline phyints.
239 	 */
240 	notes = DL_NOTE_PHYS_ADDR;
241 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
242 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
243 		pi->pi_notes |= notes;
244 
245 	return (_B_TRUE);
246 failed:
247 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
248 	if (pi->pi_dh != NULL) {
249 		dlpi_close(pi->pi_dh);
250 		pi->pi_dh = NULL;
251 	}
252 	return (_B_FALSE);
253 }
254 
255 /*
256  * Close use of link on `pi'.
257  */
258 void
259 phyint_link_close(struct phyint *pi)
260 {
261 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
262 		(void) poll_remove(dlpi_fd(pi->pi_dh));
263 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
264 	}
265 
266 	/*
267 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
268 	 * properly report the link state even when offline (which is possible
269 	 * since we use IFF_RUNNING to track link state).
270 	 */
271 	dlpi_close(pi->pi_dh);
272 	pi->pi_dh = NULL;
273 }
274 
275 /* Return the phyint instance with the given name and the given family */
276 struct phyint_instance *
277 phyint_inst_lookup(int af, char *name)
278 {
279 	struct phyint *pi;
280 
281 	if (debug & D_PHYINT)
282 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
283 
284 	assert(af == AF_INET || af == AF_INET6);
285 
286 	pi = phyint_lookup(name);
287 	if (pi == NULL)
288 		return (NULL);
289 
290 	return (PHYINT_INSTANCE(pi, af));
291 }
292 
293 struct phyint_group *
294 phyint_group_lookup(const char *pg_name)
295 {
296 	struct phyint_group *pg;
297 
298 	if (debug & D_PHYINT)
299 		logdebug("phyint_group_lookup(%s)\n", pg_name);
300 
301 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
302 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
303 			break;
304 	}
305 	return (pg);
306 }
307 
308 /*
309  * Insert the phyint in the linked list of all phyints. If the phyint belongs
310  * to some group, insert it in the phyint group list.
311  */
312 static void
313 phyint_insert(struct phyint *pi, struct phyint_group *pg)
314 {
315 	if (debug & D_PHYINT)
316 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
317 
318 	/* Insert the phyint at the head of the 'all phyints' list */
319 	pi->pi_next = phyints;
320 	pi->pi_prev = NULL;
321 	if (phyints != NULL)
322 		phyints->pi_prev = pi;
323 	phyints = pi;
324 
325 	/*
326 	 * Insert the phyint at the head of the 'phyint_group members' list
327 	 * of the phyint group to which it belongs.
328 	 */
329 	pi->pi_pgnext = NULL;
330 	pi->pi_pgprev = NULL;
331 	pi->pi_group = pg;
332 
333 	pi->pi_pgnext = pg->pg_phyint;
334 	if (pi->pi_pgnext != NULL)
335 		pi->pi_pgnext->pi_pgprev = pi;
336 	pg->pg_phyint = pi;
337 
338 	/* Refresh the group state now that this phyint has been added */
339 	phyint_group_refresh_state(pg);
340 
341 	pg->pg_sig++;
342 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
343 }
344 
345 /* Insert the phyint instance in the linked list of all phyint instances. */
346 static void
347 phyint_inst_insert(struct phyint_instance *pii)
348 {
349 	if (debug & D_PHYINT) {
350 		logdebug("phyint_inst_insert(%s %s)\n",
351 		    AF_STR(pii->pii_af), pii->pii_name);
352 	}
353 
354 	/*
355 	 * Insert the phyint at the head of the 'all phyint instances' list.
356 	 */
357 	pii->pii_next = phyint_instances;
358 	pii->pii_prev = NULL;
359 	if (phyint_instances != NULL)
360 		phyint_instances->pii_prev = pii;
361 	phyint_instances = pii;
362 }
363 
364 /*
365  * Create a new phyint with the given parameters. Also insert it into
366  * the list of all phyints and the list of phyint group members by calling
367  * phyint_insert().
368  */
369 static struct phyint *
370 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
371     uint64_t flags)
372 {
373 	struct phyint *pi;
374 
375 	pi = calloc(1, sizeof (struct phyint));
376 	if (pi == NULL) {
377 		logperror("phyint_create: calloc");
378 		return (NULL);
379 	}
380 
381 	/*
382 	 * Record the phyint values.
383 	 */
384 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
385 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
386 	pi->pi_ifindex = ifindex;
387 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
388 
389 	pi->pi_state = PI_INIT;
390 	pi->pi_flags = PHYINT_FLAGS(flags);
391 
392 	/*
393 	 * Initialize the link state.  The link state is initialized to
394 	 * up, so that if the link is down when IPMP starts monitoring
395 	 * the interface, it will appear as though there has been a
396 	 * transition from the link up to link down.  This avoids
397 	 * having to treat this situation as a special case.
398 	 */
399 	INIT_LINK_STATE(pi);
400 
401 	if (!phyint_link_init(pi)) {
402 		free(pi);
403 		return (NULL);
404 	}
405 
406 	/*
407 	 * Insert the phyint in the list of all phyints, and the
408 	 * list of phyint group members
409 	 */
410 	phyint_insert(pi, pg);
411 
412 	/*
413 	 * If the interface is offline, we set the state to PI_OFFLINE.
414 	 * Otherwise, optimistically consider this interface running.  Later
415 	 * (in process_link_state_changes()), we will adjust this to match the
416 	 * current state of the link.  Further, if test addresses are
417 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
418 	 * to either PI_RUNNING or PI_FAILED depending on the probe results.
419 	 */
420 	if (flags & IFF_OFFLINE)
421 		phyint_chstate(pi, PI_OFFLINE);
422 	else
423 		phyint_transition_to_running(pi); /* calls phyint_chstate() */
424 
425 	return (pi);
426 }
427 
428 /*
429  * Create a new phyint instance belonging to the phyint 'pi' and address
430  * family 'af'. Also insert it into the list of all phyint instances by
431  * calling phyint_inst_insert().
432  */
433 static struct phyint_instance *
434 phyint_inst_create(struct phyint *pi, int af)
435 {
436 	struct phyint_instance *pii;
437 
438 	pii = calloc(1, sizeof (struct phyint_instance));
439 	if (pii == NULL) {
440 		logperror("phyint_inst_create: calloc");
441 		return (NULL);
442 	}
443 
444 	/*
445 	 * Attach the phyint instance to the phyint.
446 	 * Set the back pointers as well
447 	 */
448 	pii->pii_phyint = pi;
449 	if (af == AF_INET)
450 		pi->pi_v4 = pii;
451 	else
452 		pi->pi_v6 = pii;
453 
454 	pii->pii_in_use = 1;
455 	pii->pii_probe_sock = -1;
456 	pii->pii_snxt = 1;
457 	pii->pii_af = af;
458 	pii->pii_fd_hrtime = gethrtime() +
459 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
460 	pii->pii_flags = pi->pi_flags;
461 
462 	/* Insert the phyint instance in the list of all phyint instances. */
463 	phyint_inst_insert(pii);
464 	return (pii);
465 }
466 
467 /*
468  * Change the state of phyint `pi' to state `state'.
469  */
470 void
471 phyint_chstate(struct phyint *pi, enum pi_state state)
472 {
473 	/*
474 	 * To simplify things, some callers always set a given state
475 	 * regardless of the previous state of the phyint (e.g., setting
476 	 * PI_RUNNING when it's already set).  We shouldn't bother
477 	 * generating an event or consuming a signature for these, since
478 	 * the actual state of the interface is unchanged.
479 	 */
480 	if (pi->pi_state == state)
481 		return;
482 
483 	pi->pi_state = state;
484 	phyint_changed(pi);
485 }
486 
487 /*
488  * Note that `pi' has changed state.
489  */
490 void
491 phyint_changed(struct phyint *pi)
492 {
493 	pi->pi_group->pg_sig++;
494 	(void) phyint_state_event(pi->pi_group, pi);
495 }
496 
497 /*
498  * Insert the phyint group in the linked list of all phyint groups
499  * at the head of the list
500  */
501 void
502 phyint_group_insert(struct phyint_group *pg)
503 {
504 	pg->pg_next = phyint_groups;
505 	pg->pg_prev = NULL;
506 	if (phyint_groups != NULL)
507 		phyint_groups->pg_prev = pg;
508 	phyint_groups = pg;
509 
510 	phyint_grouplistsig++;
511 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
512 }
513 
514 /*
515  * Create a new phyint group called 'name'.
516  */
517 struct phyint_group *
518 phyint_group_create(const char *name)
519 {
520 	struct	phyint_group *pg;
521 
522 	if (debug & D_PHYINT)
523 		logdebug("phyint_group_create(%s)\n", name);
524 
525 	pg = calloc(1, sizeof (struct phyint_group));
526 	if (pg == NULL) {
527 		logperror("phyint_group_create: calloc");
528 		return (NULL);
529 	}
530 
531 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
532 	pg->pg_sig = gensig();
533 	pg->pg_fdt = user_failure_detection_time;
534 	pg->pg_probeint = user_probe_interval;
535 	pg->pg_in_use = _B_TRUE;
536 
537 	/*
538 	 * Normal groups always start in the PG_FAILED state since they
539 	 * have no active interfaces.  In contrast, anonymous groups are
540 	 * heterogeneous and thus always PG_OK.
541 	 */
542 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
543 
544 	return (pg);
545 }
546 
547 /*
548  * Change the state of the phyint group `pg' to state `state'.
549  */
550 void
551 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
552 {
553 	assert(pg != phyint_anongroup);
554 
555 	/*
556 	 * To simplify things, some callers always set a given state
557 	 * regardless of the previous state of the group (e.g., setting
558 	 * PG_DEGRADED when it's already set).  We shouldn't bother
559 	 * generating an event or consuming a signature for these, since
560 	 * the actual state of the group is unchanged.
561 	 */
562 	if (pg->pg_state == state)
563 		return;
564 
565 	pg->pg_state = state;
566 
567 	switch (state) {
568 	case PG_FAILED:
569 		/*
570 		 * We can never know with certainty that a group has
571 		 * failed.  It is possible that all known targets have
572 		 * failed simultaneously, and new targets have come up
573 		 * instead. If the targets are routers then router
574 		 * discovery will kick in, and we will see the new routers
575 		 * thru routing socket messages. But if the targets are
576 		 * hosts, we have to discover it by multicast.	So flush
577 		 * all the host targets. The next probe will send out a
578 		 * multicast echo request. If this is a group failure, we
579 		 * will still not see any response, otherwise the group
580 		 * will be repaired after we get NUM_PROBE_REPAIRS
581 		 * consecutive unicast replies on any phyint.
582 		 */
583 		target_flush_hosts(pg);
584 		break;
585 
586 	case PG_OK:
587 	case PG_DEGRADED:
588 		break;
589 
590 	default:
591 		logerr("phyint_group_chstate: invalid group state %d; "
592 		    "aborting\n", state);
593 		abort();
594 	}
595 
596 	pg->pg_sig++;
597 	(void) phyint_group_state_event(pg);
598 }
599 
600 /*
601  * Create a new phyint instance and initialize it from the values supplied by
602  * the kernel. Always check for ENXIO before logging any error, because the
603  * interface could have vanished after completion of SIOCGLIFCONF.
604  * Return values:
605  *	pointer to the phyint instance on success
606  *	NULL on failure Eg. if the phyint instance is not found in the kernel
607  */
608 struct phyint_instance *
609 phyint_inst_init_from_k(int af, char *pi_name)
610 {
611 	char	pg_name[LIFNAMSIZ + 1];
612 	int	ifsock;
613 	uint_t	ifindex;
614 	uint64_t	flags;
615 	struct lifreq	lifr;
616 	struct phyint	*pi;
617 	struct phyint_instance	*pii;
618 	boolean_t	pi_created;
619 	struct phyint_group	*pg;
620 
621 retry:
622 	pii = NULL;
623 	pi = NULL;
624 	pg = NULL;
625 	pi_created = _B_FALSE;
626 
627 	if (debug & D_PHYINT) {
628 		logdebug("phyint_inst_init_from_k(%s %s)\n",
629 		    AF_STR(af), pi_name);
630 	}
631 
632 	assert(af == AF_INET || af == AF_INET6);
633 
634 	/* Get the socket for doing ioctls */
635 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
636 
637 	/*
638 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
639 	 * meta-interfaces, point-to-point interfaces, and interfaces
640 	 * that can't support multicast.
641 	 */
642 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
643 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
644 		if (errno != ENXIO) {
645 			logperror("phyint_inst_init_from_k:"
646 			    " ioctl (get flags)");
647 		}
648 		return (NULL);
649 	}
650 	flags = lifr.lifr_flags;
651 	if (!(flags & IFF_MULTICAST) ||
652 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
653 		return (NULL);
654 
655 	/*
656 	 * Get the ifindex for recording later in our tables, in case we need
657 	 * to create a new phyint.
658 	 */
659 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
660 		if (errno != ENXIO) {
661 			logperror("phyint_inst_init_from_k: "
662 			    " ioctl (get lifindex)");
663 		}
664 		return (NULL);
665 	}
666 	ifindex = lifr.lifr_index;
667 
668 	/*
669 	 * Get the phyint group name of this phyint, from the kernel.
670 	 */
671 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
672 		if (errno != ENXIO) {
673 			logperror("phyint_inst_init_from_k: "
674 			    "ioctl (get group name)");
675 		}
676 		return (NULL);
677 	}
678 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
679 
680 	/*
681 	 * If the phyint is not part of any group, pg_name is the
682 	 * null string. If 'track_all_phyints' is false, there is no
683 	 * need to create a phyint.
684 	 */
685 	if (pg_name[0] == '\0' && !track_all_phyints) {
686 		/*
687 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
688 		 * set, reset them. These flags shouldn't be set if in.mpathd
689 		 * isn't tracking the interface.
690 		 */
691 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
692 			lifr.lifr_flags = flags &
693 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
694 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
695 				if (errno != ENXIO) {
696 					logperror("phyint_inst_init_from_k:"
697 					    " ioctl (set flags)");
698 				}
699 			}
700 		}
701 		return (NULL);
702 	}
703 
704 	/*
705 	 * We need to create a new phyint instance.  We may also need to
706 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
707 	 * an underlying interface before it found its IPMP meta-interface.
708 	 * Note that we keep any created groups even if phyint_inst_from_k()
709 	 * fails since a group's existence is not dependent on the ability of
710 	 * in.mpathd to the track the group's interfaces.
711 	 */
712 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
713 		if ((pg = phyint_group_create(pg_name)) == NULL) {
714 			logerr("phyint_inst_init_from_k: cannot create group "
715 			    "%s\n", pg_name);
716 			return (NULL);
717 		}
718 		phyint_group_insert(pg);
719 	}
720 
721 	/*
722 	 * Lookup the phyint. If the phyint does not exist create it.
723 	 */
724 	pi = phyint_lookup(pi_name);
725 	if (pi == NULL) {
726 		pi = phyint_create(pi_name, pg, ifindex, flags);
727 		if (pi == NULL) {
728 			logerr("phyint_inst_init_from_k:"
729 			    " unable to create phyint %s\n", pi_name);
730 			return (NULL);
731 		}
732 		pi_created = _B_TRUE;
733 	} else {
734 		/* The phyint exists already. */
735 		assert(pi_created == _B_FALSE);
736 		/*
737 		 * Normally we should see consistent values for the IPv4 and
738 		 * IPv6 instances, for phyint properties. If we don't, it
739 		 * means things have changed underneath us, and we should
740 		 * resync our tables with the kernel. Check whether the
741 		 * interface index has changed. If so, it is most likely
742 		 * the interface has been unplumbed and replumbed,
743 		 * while we are yet to update our tables. Do it now.
744 		 */
745 		if (pi->pi_ifindex != ifindex) {
746 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
747 			goto retry;
748 		}
749 		assert(PHYINT_INSTANCE(pi, af) == NULL);
750 
751 		/*
752 		 * If the group name seen by the IPv4 and IPv6 instances
753 		 * are different, it is most likely the groupname has
754 		 * changed, while we are yet to update our tables. Do it now.
755 		 */
756 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
757 			phyint_inst_delete(PHYINT_INSTANCE(pi,
758 			    AF_OTHER(af)));
759 			goto retry;
760 		}
761 	}
762 
763 	/*
764 	 * Create a new phyint instance, corresponding to the 'af'
765 	 * passed in.
766 	 */
767 	pii = phyint_inst_create(pi, af);
768 	if (pii == NULL) {
769 		logerr("phyint_inst_init_from_k: unable to create"
770 		    "phyint inst %s\n", pi->pi_name);
771 		if (pi_created)
772 			phyint_delete(pi);
773 
774 		return (NULL);
775 	}
776 
777 	if (pi_created) {
778 		/*
779 		 * If this phyint does not have a unique hardware address in its
780 		 * group, offline it.  (The change_pif_flags() implementation
781 		 * requires that we defer this until after the phyint_instance
782 		 * is created.)
783 		 */
784 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
785 			pi->pi_hwaddrdup = _B_TRUE;
786 			(void) phyint_offline(pi, 0);
787 		}
788 	}
789 
790 	return (pii);
791 }
792 
793 /*
794  * Bind pii_probe_sock to the address associated with pii_probe_logint.
795  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
796  * targets. Do the common part in this function, and complete the
797  * initializations by calling the protocol specific functions
798  * phyint_inst_v{4,6}_sockinit() respectively.
799  *
800  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
801  */
802 boolean_t
803 phyint_inst_sockinit(struct phyint_instance *pii)
804 {
805 	boolean_t success;
806 	struct phyint_group *pg;
807 
808 	if (debug & D_PHYINT) {
809 		logdebug("phyint_inst_sockinit(%s %s)\n",
810 		    AF_STR(pii->pii_af), pii->pii_name);
811 	}
812 
813 	assert(pii->pii_probe_logint != NULL);
814 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
815 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
816 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
817 
818 	/*
819 	 * If the socket is already bound, close pii_probe_sock
820 	 */
821 	if (pii->pii_probe_sock != -1)
822 		close_probe_socket(pii, _B_TRUE);
823 
824 	/*
825 	 * If the phyint is not part of a named group and track_all_phyints is
826 	 * false, simply return.
827 	 */
828 	pg = pii->pii_phyint->pi_group;
829 	if (pg == phyint_anongroup && !track_all_phyints) {
830 		if (debug & D_PHYINT)
831 			logdebug("phyint_inst_sockinit: no group\n");
832 		return (_B_FALSE);
833 	}
834 
835 	/*
836 	 * Initialize the socket by calling the protocol specific function.
837 	 * If it succeeds, add the socket to the poll list.
838 	 */
839 	if (pii->pii_af == AF_INET6)
840 		success = phyint_inst_v6_sockinit(pii);
841 	else
842 		success = phyint_inst_v4_sockinit(pii);
843 
844 	if (success && (poll_add(pii->pii_probe_sock) == 0))
845 		return (_B_TRUE);
846 
847 	/* Something failed, cleanup and return false */
848 	if (pii->pii_probe_sock != -1)
849 		close_probe_socket(pii, _B_FALSE);
850 
851 	return (_B_FALSE);
852 }
853 
854 /*
855  * IPv6 specific part in initializing the pii_probe_sock. This socket is
856  * used to send/receive ICMPv6 probe packets.
857  */
858 static boolean_t
859 phyint_inst_v6_sockinit(struct phyint_instance *pii)
860 {
861 	icmp6_filter_t filter;
862 	int hopcount = 1;
863 	int off = 0;
864 	int on = 1;
865 	struct	sockaddr_in6	testaddr;
866 
867 	/*
868 	 * Open a raw socket with ICMPv6 protocol.
869 	 *
870 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
871 	 * the specified phyint only.  Bind to the test address to ensure that
872 	 * the responses are sent to the specified phyint.
873 	 *
874 	 * Set the hopcount to 1 so that probe packets are not routed.
875 	 * Disable multicast loopback. Set the receive filter to
876 	 * receive only ICMPv6 echo replies.
877 	 */
878 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
879 	if (pii->pii_probe_sock < 0) {
880 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
881 		return (_B_FALSE);
882 	}
883 
884 	bzero(&testaddr, sizeof (testaddr));
885 	testaddr.sin6_family = AF_INET6;
886 	testaddr.sin6_port = 0;
887 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
888 
889 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
890 	    sizeof (testaddr)) < 0) {
891 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
892 		return (_B_FALSE);
893 	}
894 
895 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
896 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
897 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
898 		    " IPV6_MULTICAST_IF");
899 		return (_B_FALSE);
900 	}
901 
902 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
903 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
904 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
905 		    " IPV6_BOUND_IF");
906 		return (_B_FALSE);
907 	}
908 
909 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
910 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
911 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
912 		    " IPV6_UNICAST_HOPS");
913 		return (_B_FALSE);
914 	}
915 
916 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
917 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
918 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
919 		    " IPV6_MULTICAST_HOPS");
920 		return (_B_FALSE);
921 	}
922 
923 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
924 	    (char *)&off, sizeof (off)) < 0) {
925 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
926 		    " IPV6_MULTICAST_LOOP");
927 		return (_B_FALSE);
928 	}
929 
930 	/*
931 	 * Filter out so that we only receive ICMP echo replies
932 	 */
933 	ICMP6_FILTER_SETBLOCKALL(&filter);
934 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
935 
936 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
937 	    (char *)&filter, sizeof (filter)) < 0) {
938 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
939 		    " ICMP6_FILTER");
940 		return (_B_FALSE);
941 	}
942 
943 	/* Enable receipt of hoplimit */
944 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
945 	    &on, sizeof (on)) < 0) {
946 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
947 		    " IPV6_RECVHOPLIMIT");
948 		return (_B_FALSE);
949 	}
950 
951 	/* Enable receipt of timestamp */
952 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
953 	    &on, sizeof (on)) < 0) {
954 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
955 		    " SO_TIMESTAMP");
956 		return (_B_FALSE);
957 	}
958 
959 	return (_B_TRUE);
960 }
961 
962 /*
963  * IPv4 specific part in initializing the pii_probe_sock. This socket is
964  * used to send/receive ICMPv4 probe packets.
965  */
966 static boolean_t
967 phyint_inst_v4_sockinit(struct phyint_instance *pii)
968 {
969 	struct sockaddr_in  testaddr;
970 	char	char_off = 0;
971 	int	ttl = 1;
972 	char	char_ttl = 1;
973 	int	on = 1;
974 
975 	/*
976 	 * Open a raw socket with ICMPv4 protocol.
977 	 *
978 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
979 	 * the specified phyint only.  Bind to the test address to ensure that
980 	 * the responses are sent to the specified phyint.
981 	 *
982 	 * Set the ttl to 1 so that probe packets are not routed.
983 	 * Disable multicast loopback.  Enable receipt of timestamp.
984 	 */
985 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
986 	if (pii->pii_probe_sock < 0) {
987 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
988 		return (_B_FALSE);
989 	}
990 
991 	bzero(&testaddr, sizeof (testaddr));
992 	testaddr.sin_family = AF_INET;
993 	testaddr.sin_port = 0;
994 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
995 	    &testaddr.sin_addr);
996 
997 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
998 	    sizeof (testaddr)) < 0) {
999 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
1000 		return (_B_FALSE);
1001 	}
1002 
1003 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
1004 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
1005 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1006 		    " IP_BOUND_IF");
1007 		return (_B_FALSE);
1008 	}
1009 
1010 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
1011 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
1012 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1013 		    " IP_MULTICAST_IF");
1014 		return (_B_FALSE);
1015 	}
1016 
1017 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
1018 	    (char *)&ttl, sizeof (ttl)) < 0) {
1019 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1020 		    " IP_TTL");
1021 		return (_B_FALSE);
1022 	}
1023 
1024 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
1025 	    (char *)&char_off, sizeof (char_off)) == -1) {
1026 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1027 		    " IP_MULTICAST_LOOP");
1028 		return (_B_FALSE);
1029 	}
1030 
1031 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
1032 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
1033 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1034 		    " IP_MULTICAST_TTL");
1035 		return (_B_FALSE);
1036 	}
1037 
1038 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
1039 	    sizeof (on)) < 0) {
1040 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1041 		    " SO_TIMESTAMP");
1042 		return (_B_FALSE);
1043 	}
1044 
1045 	return (_B_TRUE);
1046 }
1047 
1048 /*
1049  * Remove the phyint group from the list of 'all phyint groups'
1050  * and free it.
1051  */
1052 void
1053 phyint_group_delete(struct phyint_group *pg)
1054 {
1055 	/*
1056 	 * The anonymous group always exists, even when empty.
1057 	 */
1058 	if (pg == phyint_anongroup)
1059 		return;
1060 
1061 	if (debug & D_PHYINT)
1062 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
1063 
1064 	/*
1065 	 * The phyint group must be empty, and must not have any phyints.
1066 	 * The phyint group must be in the list of all phyint groups
1067 	 */
1068 	assert(pg->pg_phyint == NULL);
1069 	assert(phyint_groups == pg || pg->pg_prev != NULL);
1070 
1071 	if (pg->pg_prev != NULL)
1072 		pg->pg_prev->pg_next = pg->pg_next;
1073 	else
1074 		phyint_groups = pg->pg_next;
1075 
1076 	if (pg->pg_next != NULL)
1077 		pg->pg_next->pg_prev = pg->pg_prev;
1078 
1079 	pg->pg_next = NULL;
1080 	pg->pg_prev = NULL;
1081 
1082 	phyint_grouplistsig++;
1083 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
1084 
1085 	addrlist_free(&pg->pg_addrs);
1086 	free(pg);
1087 }
1088 
1089 /*
1090  * Refresh the state of `pg' based on its current members.
1091  */
1092 void
1093 phyint_group_refresh_state(struct phyint_group *pg)
1094 {
1095 	enum pg_state state;
1096 	enum pg_state origstate = pg->pg_state;
1097 	struct phyint *pi, *usablepi;
1098 	uint_t nif = 0, nusable = 0;
1099 
1100 	/*
1101 	 * Anonymous groups never change state.
1102 	 */
1103 	if (pg == phyint_anongroup)
1104 		return;
1105 
1106 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1107 		nif++;
1108 		if (phyint_is_usable(pi)) {
1109 			nusable++;
1110 			usablepi = pi;
1111 		}
1112 	}
1113 
1114 	if (nusable == 0)
1115 		state = PG_FAILED;
1116 	else if (nif == nusable)
1117 		state = PG_OK;
1118 	else
1119 		state = PG_DEGRADED;
1120 
1121 	phyint_group_chstate(pg, state);
1122 
1123 	/*
1124 	 * If we're shutting down, skip logging messages since otherwise our
1125 	 * shutdown housecleaning will make us report that groups are unusable.
1126 	 */
1127 	if (cleanup_started)
1128 		return;
1129 
1130 	/*
1131 	 * NOTE: We use pg_failmsg_printed rather than origstate since
1132 	 * otherwise at startup we'll log a "now usable" message when the
1133 	 * first usable phyint is added to an empty group.
1134 	 */
1135 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
1136 		assert(origstate == PG_FAILED);
1137 		logerr("At least 1 IP interface (%s) in group %s is now "
1138 		    "usable\n", usablepi->pi_name, pg->pg_name);
1139 		pg->pg_failmsg_printed = _B_FALSE;
1140 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
1141 		logerr("All IP interfaces in group %s are now unusable\n",
1142 		    pg->pg_name);
1143 		pg->pg_failmsg_printed = _B_TRUE;
1144 	}
1145 }
1146 
1147 /*
1148  * Extract information from the kernel about the desired phyint.
1149  * Look only for properties of the phyint and not properties of logints.
1150  * Take appropriate action on the changes.
1151  * Return codes:
1152  *	PI_OK
1153  *		The phyint exists in the kernel and matches our knowledge
1154  *		of the phyint.
1155  *	PI_DELETED
1156  *		The phyint has vanished in the kernel.
1157  *	PI_IFINDEX_CHANGED
1158  *		The phyint's interface index has changed.
1159  *		Ask the caller to delete and recreate the phyint.
1160  *	PI_IOCTL_ERROR
1161  *		Some ioctl error. Don't change anything.
1162  *	PI_GROUP_CHANGED
1163  *		The phyint has changed group.
1164  */
1165 int
1166 phyint_inst_update_from_k(struct phyint_instance *pii)
1167 {
1168 	struct lifreq lifr;
1169 	int	ifsock;
1170 	struct phyint *pi;
1171 
1172 	pi = pii->pii_phyint;
1173 
1174 	if (debug & D_PHYINT) {
1175 		logdebug("phyint_inst_update_from_k(%s %s)\n",
1176 		    AF_STR(pii->pii_af), pi->pi_name);
1177 	}
1178 
1179 	/*
1180 	 * Get the ifindex from the kernel, for comparison with the
1181 	 * value in our tables.
1182 	 */
1183 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
1184 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1185 
1186 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1187 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
1188 		if (errno == ENXIO) {
1189 			return (PI_DELETED);
1190 		} else {
1191 			logperror_pii(pii, "phyint_inst_update_from_k:"
1192 			    " ioctl (get lifindex)");
1193 			return (PI_IOCTL_ERROR);
1194 		}
1195 	}
1196 
1197 	if (lifr.lifr_index != pi->pi_ifindex) {
1198 		/*
1199 		 * The index has changed. Most likely the interface has
1200 		 * been unplumbed and replumbed. Ask the caller to take
1201 		 * appropriate action.
1202 		 */
1203 		if (debug & D_PHYINT) {
1204 			logdebug("phyint_inst_update_from_k:"
1205 			    " old index %d new index %d\n",
1206 			    pi->pi_ifindex, lifr.lifr_index);
1207 		}
1208 		return (PI_IFINDEX_CHANGED);
1209 	}
1210 
1211 	/*
1212 	 * Get the group name from the kernel, for comparison with
1213 	 * the value in our tables.
1214 	 */
1215 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
1216 		if (errno == ENXIO) {
1217 			return (PI_DELETED);
1218 		} else {
1219 			logperror_pii(pii, "phyint_inst_update_from_k:"
1220 			    " ioctl (get groupname)");
1221 			return (PI_IOCTL_ERROR);
1222 		}
1223 	}
1224 
1225 	/*
1226 	 * If the phyint has changed group i.e. if the phyint group name
1227 	 * returned by the kernel is different, ask the caller to delete
1228 	 * and recreate the phyint in the right group
1229 	 */
1230 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
1231 		/* Groupname has changed */
1232 		if (debug & D_PHYINT) {
1233 			logdebug("phyint_inst_update_from_k:"
1234 			    " groupname change\n");
1235 		}
1236 		return (PI_GROUP_CHANGED);
1237 	}
1238 
1239 	/*
1240 	 * Get the current phyint flags from the kernel, and determine what
1241 	 * flags have changed by comparing against our tables.	Note that the
1242 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
1243 	 * that IFF_INACTIVE is really still set on the interface.
1244 	 */
1245 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
1246 		if (errno == ENXIO) {
1247 			return (PI_DELETED);
1248 		} else {
1249 			logperror_pii(pii, "phyint_inst_update_from_k: "
1250 			    " ioctl (get flags)");
1251 			return (PI_IOCTL_ERROR);
1252 		}
1253 	}
1254 
1255 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
1256 	if (pi->pi_v4 != NULL)
1257 		pi->pi_v4->pii_flags = pi->pi_flags;
1258 	if (pi->pi_v6 != NULL)
1259 		pi->pi_v6->pii_flags = pi->pi_flags;
1260 
1261 	/*
1262 	 * Make sure the IFF_FAILED flag is set if and only if we think
1263 	 * the interface should be failed.
1264 	 */
1265 	if (pi->pi_flags & IFF_FAILED) {
1266 		if (pi->pi_state == PI_RUNNING)
1267 			(void) change_pif_flags(pi, 0, IFF_FAILED);
1268 	} else {
1269 		if (pi->pi_state == PI_FAILED)
1270 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1271 	}
1272 
1273 	/* No change in phyint status */
1274 	return (PI_OK);
1275 }
1276 
1277 /*
1278  * Delete the phyint. Remove it from the list of all phyints, and the
1279  * list of phyint group members.
1280  */
1281 static void
1282 phyint_delete(struct phyint *pi)
1283 {
1284 	struct phyint *pi2;
1285 	struct phyint_group *pg = pi->pi_group;
1286 
1287 	if (debug & D_PHYINT)
1288 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1289 
1290 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1291 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1292 
1293 	/*
1294 	 * The phyint must belong to a group.
1295 	 */
1296 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1297 
1298 	/* The phyint must be in the list of all phyints */
1299 	assert(phyints == pi || pi->pi_prev != NULL);
1300 
1301 	/* Remove the phyint from the phyint group list */
1302 	pg->pg_sig++;
1303 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1304 
1305 	if (pi->pi_pgprev == NULL) {
1306 		/* Phyint is the 1st in the phyint group list */
1307 		pg->pg_phyint = pi->pi_pgnext;
1308 	} else {
1309 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1310 	}
1311 	if (pi->pi_pgnext != NULL)
1312 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1313 	pi->pi_pgnext = NULL;
1314 	pi->pi_pgprev = NULL;
1315 
1316 	/* Refresh the group state now that this phyint has been removed */
1317 	phyint_group_refresh_state(pg);
1318 
1319 	/* Remove the phyint from the global list of phyints */
1320 	if (pi->pi_prev == NULL) {
1321 		/* Phyint is the 1st in the list */
1322 		phyints = pi->pi_next;
1323 	} else {
1324 		pi->pi_prev->pi_next = pi->pi_next;
1325 	}
1326 	if (pi->pi_next != NULL)
1327 		pi->pi_next->pi_prev = pi->pi_prev;
1328 	pi->pi_next = NULL;
1329 	pi->pi_prev = NULL;
1330 
1331 	/*
1332 	 * See if another phyint in the group had been offlined because
1333 	 * it was a dup of `pi' -- and if so, online it.
1334 	 */
1335 	if (!pi->pi_hwaddrdup &&
1336 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1337 		assert(pi2->pi_hwaddrdup);
1338 		(void) phyint_undo_offline(pi2);
1339 	}
1340 	phyint_link_close(pi);
1341 	free(pi);
1342 }
1343 
1344 /*
1345  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
1346  * group.  Returns an IPMP error code.
1347  */
1348 int
1349 phyint_offline(struct phyint *pi, uint_t minred)
1350 {
1351 	boolean_t was_active;
1352 	unsigned int nusable = 0;
1353 	struct phyint *pi2;
1354 	struct phyint_group *pg = pi->pi_group;
1355 
1356 	/*
1357 	 * Verify that enough usable interfaces in the group would remain.
1358 	 * As a special case, if the group has failed, allow any non-offline
1359 	 * phyints to be offlined.
1360 	 */
1361 	if (pg != phyint_anongroup) {
1362 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1363 			if (pi2 == pi)
1364 				continue;
1365 			if (phyint_is_usable(pi2) ||
1366 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
1367 				nusable++;
1368 		}
1369 	}
1370 	if (nusable < minred)
1371 		return (IPMP_EMINRED);
1372 
1373 	was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1374 
1375 	if (!change_pif_flags(pi, IFF_OFFLINE, IFF_INACTIVE))
1376 		return (IPMP_FAILURE);
1377 
1378 	/*
1379 	 * The interface is now offline, so stop probing it.  Note that
1380 	 * if_mpadm(1M) will down the test addresses, after receiving a
1381 	 * success reply from us. The routing socket message will then make us
1382 	 * close the socket used for sending probes. But it is more logical
1383 	 * that an offlined interface must not be probed, even if it has test
1384 	 * addresses.
1385 	 *
1386 	 * NOTE: stop_probing() also sets PI_OFFLINE.
1387 	 */
1388 	stop_probing(pi);
1389 
1390 	/*
1391 	 * If we're offlining the phyint because it has a duplicate hardware
1392 	 * address, print a warning -- and leave the link open so that we can
1393 	 * be notified of hardware address changes that make it usable again.
1394 	 * Otherwise, close the link so that we won't prevent a detach.
1395 	 */
1396 	if (pi->pi_hwaddrdup) {
1397 		logerr("IP interface %s has a hardware address which is not "
1398 		    "unique in group %s; offlining\n", pi->pi_name,
1399 		    pg->pg_name);
1400 	} else {
1401 		phyint_link_close(pi);
1402 	}
1403 
1404 	/*
1405 	 * If this phyint was preventing another phyint with a duplicate
1406 	 * hardware address from being online, bring that one online now.
1407 	 */
1408 	if (!pi->pi_hwaddrdup &&
1409 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1410 		assert(pi2->pi_hwaddrdup);
1411 		(void) phyint_undo_offline(pi2);
1412 	}
1413 
1414 	/*
1415 	 * If this interface was active, try to activate another INACTIVE
1416 	 * interface in the group.
1417 	 */
1418 	if (was_active)
1419 		phyint_activate_another(pi);
1420 
1421 	return (IPMP_SUCCESS);
1422 }
1423 
1424 /*
1425  * Undo a previous offline of `pi'.  Returns an IPMP error code.
1426  */
1427 int
1428 phyint_undo_offline(struct phyint *pi)
1429 {
1430 	if (pi->pi_state != PI_OFFLINE) {
1431 		errno = EINVAL;
1432 		return (IPMP_FAILURE);
1433 	}
1434 
1435 	/*
1436 	 * If necessary, reinitialize our link information and verify that its
1437 	 * hardware address is still unique across the group.
1438 	 */
1439 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
1440 		errno = EIO;
1441 		return (IPMP_FAILURE);
1442 	}
1443 
1444 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
1445 		pi->pi_hwaddrdup = _B_TRUE;
1446 		return (IPMP_EHWADDRDUP);
1447 	}
1448 
1449 	if (pi->pi_hwaddrdup) {
1450 		logerr("IP interface %s now has a unique hardware address in "
1451 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
1452 		pi->pi_hwaddrdup = _B_FALSE;
1453 	}
1454 
1455 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
1456 		return (IPMP_FAILURE);
1457 
1458 	/*
1459 	 * While the interface was offline, it may have failed (e.g. the link
1460 	 * may have gone down).  phyint_inst_check_for_failure() will have
1461 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
1462 	 * whether the phyint should transition to running.  Note that after
1463 	 * we transition to running, we will start sending probes again (if
1464 	 * test addresses are configured), which may also reveal that the
1465 	 * interface is in fact failed.
1466 	 */
1467 	if (pi->pi_flags & IFF_FAILED) {
1468 		phyint_chstate(pi, PI_FAILED);
1469 	} else {
1470 		/* calls phyint_chstate() */
1471 		phyint_transition_to_running(pi);
1472 	}
1473 
1474 	/*
1475 	 * Give the requestor time to configure test addresses before
1476 	 * complaining that they're missing.
1477 	 */
1478 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
1479 
1480 	return (IPMP_SUCCESS);
1481 }
1482 
1483 /*
1484  * Delete (unlink and free), the phyint instance.
1485  */
1486 void
1487 phyint_inst_delete(struct phyint_instance *pii)
1488 {
1489 	struct phyint *pi = pii->pii_phyint;
1490 
1491 	assert(pi != NULL);
1492 
1493 	if (debug & D_PHYINT) {
1494 		logdebug("phyint_inst_delete(%s %s)\n",
1495 		    AF_STR(pii->pii_af), pi->pi_name);
1496 	}
1497 
1498 	/*
1499 	 * If the phyint instance has associated probe targets
1500 	 * delete all the targets
1501 	 */
1502 	while (pii->pii_targets != NULL)
1503 		target_delete(pii->pii_targets);
1504 
1505 	/*
1506 	 * Delete all the logints associated with this phyint
1507 	 * instance.
1508 	 */
1509 	while (pii->pii_logint != NULL)
1510 		logint_delete(pii->pii_logint);
1511 
1512 	/*
1513 	 * Close the socket used to send probes to targets from this phyint.
1514 	 */
1515 	if (pii->pii_probe_sock != -1)
1516 		close_probe_socket(pii, _B_TRUE);
1517 
1518 	/*
1519 	 * Phyint instance must be in the list of all phyint instances.
1520 	 * Remove phyint instance from the global list of phyint instances.
1521 	 */
1522 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1523 	if (pii->pii_prev == NULL) {
1524 		/* Phyint is the 1st in the list */
1525 		phyint_instances = pii->pii_next;
1526 	} else {
1527 		pii->pii_prev->pii_next = pii->pii_next;
1528 	}
1529 	if (pii->pii_next != NULL)
1530 		pii->pii_next->pii_prev = pii->pii_prev;
1531 	pii->pii_next = NULL;
1532 	pii->pii_prev = NULL;
1533 
1534 	/*
1535 	 * Reset the phyint instance pointer in the phyint.
1536 	 * If this is the last phyint instance (being deleted) on this
1537 	 * phyint, then delete the phyint.
1538 	 */
1539 	if (pii->pii_af == AF_INET)
1540 		pi->pi_v4 = NULL;
1541 	else
1542 		pi->pi_v6 = NULL;
1543 
1544 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1545 		phyint_delete(pi);
1546 
1547 	free(pii);
1548 }
1549 
1550 static void
1551 phyint_inst_print(struct phyint_instance *pii)
1552 {
1553 	struct logint *li;
1554 	struct target *tg;
1555 	char abuf[INET6_ADDRSTRLEN];
1556 	int most_recent;
1557 	int i;
1558 
1559 	if (pii->pii_phyint == NULL) {
1560 		logdebug("pii->pi_phyint NULL can't print\n");
1561 		return;
1562 	}
1563 
1564 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1565 	    "sock %x in_use %d\n",
1566 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1567 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1568 	    pii->pii_in_use);
1569 
1570 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1571 		logint_print(li);
1572 
1573 	logdebug("\n");
1574 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1575 		target_print(tg);
1576 
1577 	if (pii->pii_targets == NULL)
1578 		logdebug("pi_targets NULL\n");
1579 
1580 	if (pii->pii_target_next != NULL) {
1581 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1582 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1583 		    abuf, sizeof (abuf)));
1584 	} else {
1585 		logdebug("pi_target_next NULL\n");
1586 	}
1587 
1588 	if (pii->pii_rtt_target_next != NULL) {
1589 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1590 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1591 		    abuf, sizeof (abuf)));
1592 	} else {
1593 		logdebug("pi_rtt_target_next NULL\n");
1594 	}
1595 
1596 	if (pii->pii_targets != NULL) {
1597 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1598 
1599 		i = most_recent;
1600 		do {
1601 			if (pii->pii_probes[i].pr_target != NULL) {
1602 				logdebug("#%d target %s ", i,
1603 				    pr_addr(pii->pii_af,
1604 				    pii->pii_probes[i].pr_target->tg_address,
1605 				    abuf, sizeof (abuf)));
1606 			} else {
1607 				logdebug("#%d target NULL ", i);
1608 			}
1609 			logdebug("time_start %lld status %d "
1610 			    "time_ackproc %lld time_lost %u",
1611 			    pii->pii_probes[i].pr_hrtime_start,
1612 			    pii->pii_probes[i].pr_status,
1613 			    pii->pii_probes[i].pr_hrtime_ackproc,
1614 			    pii->pii_probes[i].pr_time_lost);
1615 			i = PROBE_INDEX_PREV(i);
1616 		} while (i != most_recent);
1617 	}
1618 }
1619 
1620 /*
1621  * Lookup a logint based on the logical interface name, on the given
1622  * phyint instance.
1623  */
1624 static struct logint *
1625 logint_lookup(struct phyint_instance *pii, char *name)
1626 {
1627 	struct logint *li;
1628 
1629 	if (debug & D_LOGINT) {
1630 		logdebug("logint_lookup(%s, %s)\n",
1631 		    AF_STR(pii->pii_af), name);
1632 	}
1633 
1634 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1635 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1636 			break;
1637 	}
1638 	return (li);
1639 }
1640 
1641 /*
1642  * Insert a logint at the head of the list of logints of the given
1643  * phyint instance
1644  */
1645 static void
1646 logint_insert(struct phyint_instance *pii, struct logint *li)
1647 {
1648 	li->li_next = pii->pii_logint;
1649 	li->li_prev = NULL;
1650 	if (pii->pii_logint != NULL)
1651 		pii->pii_logint->li_prev = li;
1652 	pii->pii_logint = li;
1653 	li->li_phyint_inst = pii;
1654 }
1655 
1656 /*
1657  * Create a new named logint, on the specified phyint instance.
1658  */
1659 static struct logint *
1660 logint_create(struct phyint_instance *pii, char *name)
1661 {
1662 	struct logint *li;
1663 
1664 	if (debug & D_LOGINT) {
1665 		logdebug("logint_create(%s %s %s)\n",
1666 		    AF_STR(pii->pii_af), pii->pii_name, name);
1667 	}
1668 
1669 	li = calloc(1, sizeof (struct logint));
1670 	if (li == NULL) {
1671 		logperror("logint_create: calloc");
1672 		return (NULL);
1673 	}
1674 
1675 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1676 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1677 	logint_insert(pii, li);
1678 	return (li);
1679 }
1680 
1681 /*
1682  * Initialize the logint based on the data returned by the kernel.
1683  */
1684 void
1685 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1686 {
1687 	int	ifsock;
1688 	uint64_t flags;
1689 	uint64_t saved_flags;
1690 	struct	logint	*li;
1691 	struct lifreq	lifr;
1692 	struct in6_addr	test_subnet;
1693 	struct in6_addr	testaddr;
1694 	int	test_subnet_len;
1695 	struct sockaddr_in6	*sin6;
1696 	struct sockaddr_in	*sin;
1697 	char abuf[INET6_ADDRSTRLEN];
1698 	boolean_t  ptp = _B_FALSE;
1699 	struct in6_addr tgaddr;
1700 
1701 	if (debug & D_LOGINT) {
1702 		logdebug("logint_init_from_k(%s %s)\n",
1703 		    AF_STR(pii->pii_af), li_name);
1704 	}
1705 
1706 	/* Get the socket for doing ioctls */
1707 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1708 
1709 	/*
1710 	 * Get the flags from the kernel. Also serves as a check whether
1711 	 * the logical still exists. If it doesn't exist, no need to proceed
1712 	 * any further. li_in_use will make the caller clean up the logint
1713 	 */
1714 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1715 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1716 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1717 		/* Interface may have vanished */
1718 		if (errno != ENXIO) {
1719 			logperror_pii(pii, "logint_init_from_k: "
1720 			    "ioctl (get flags)");
1721 		}
1722 		return;
1723 	}
1724 
1725 	flags = lifr.lifr_flags;
1726 
1727 	/*
1728 	 * Verified the logint exists. Now lookup the logint in our tables.
1729 	 * If it does not exist, create a new logint.
1730 	 */
1731 	li = logint_lookup(pii, li_name);
1732 	if (li == NULL) {
1733 		li = logint_create(pii, li_name);
1734 		if (li == NULL) {
1735 			/*
1736 			 * Pretend the interface does not exist
1737 			 * in the kernel
1738 			 */
1739 			return;
1740 		}
1741 	}
1742 
1743 	/*
1744 	 * Update li->li_flags with the new flags, after saving the old
1745 	 * value. This is used later to check what flags has changed and
1746 	 * take any action
1747 	 */
1748 	saved_flags = li->li_flags;
1749 	li->li_flags = flags;
1750 
1751 	/*
1752 	 * Get the address, prefix, prefixlength and update the logint.
1753 	 * Check if anything has changed. If the logint used for the
1754 	 * test address has changed, take suitable action.
1755 	 */
1756 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1757 		/* Interface may have vanished */
1758 		if (errno != ENXIO) {
1759 			logperror_li(li, "logint_init_from_k: (get addr)");
1760 		}
1761 		goto error;
1762 	}
1763 
1764 	if (pii->pii_af == AF_INET) {
1765 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1766 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1767 	} else {
1768 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1769 		testaddr = sin6->sin6_addr;
1770 	}
1771 
1772 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1773 		/* Interface may have vanished */
1774 		if (errno != ENXIO)
1775 			logperror_li(li, "logint_init_from_k: (get subnet)");
1776 		goto error;
1777 	}
1778 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
1779 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1780 		test_subnet = sin6->sin6_addr;
1781 		test_subnet_len = lifr.lifr_addrlen;
1782 	} else {
1783 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1784 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1785 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
1786 	}
1787 
1788 	/*
1789 	 * If this is the logint corresponding to the test address used for
1790 	 * sending probes, then if anything significant has changed we need to
1791 	 * determine the test address again.  We ignore changes to the
1792 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1793 	 * course.
1794 	 */
1795 	if (pii->pii_probe_logint == li) {
1796 		if (((li->li_flags ^ saved_flags) &
1797 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1798 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1799 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1800 		    &li->li_subnet)) ||
1801 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1802 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1803 			/*
1804 			 * Something significant that affects the testaddress
1805 			 * has changed. Redo the testaddress selection later on
1806 			 * in select_test_ifs(). For now do the cleanup and
1807 			 * set pii_probe_logint to NULL.
1808 			 */
1809 			if (pii->pii_probe_sock != -1)
1810 				close_probe_socket(pii, _B_TRUE);
1811 			pii->pii_probe_logint = NULL;
1812 		}
1813 	}
1814 
1815 
1816 	/* Update the logint with the values obtained from the kernel.	*/
1817 	li->li_addr = testaddr;
1818 	li->li_in_use = 1;
1819 	if (ptp) {
1820 		li->li_dstaddr = tgaddr;
1821 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1822 		    IP_ABITS : IPV6_ABITS;
1823 	} else {
1824 		li->li_subnet = test_subnet;
1825 		li->li_subnet_len = test_subnet_len;
1826 	}
1827 
1828 	if (debug & D_LOGINT)
1829 		logint_print(li);
1830 
1831 	return;
1832 
1833 error:
1834 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1835 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1836 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1837 	logint_delete(li);
1838 }
1839 
1840 /*
1841  * Delete (unlink and free) a logint.
1842  */
1843 void
1844 logint_delete(struct logint *li)
1845 {
1846 	struct phyint_instance *pii;
1847 
1848 	pii = li->li_phyint_inst;
1849 	assert(pii != NULL);
1850 
1851 	if (debug & D_LOGINT) {
1852 		int af;
1853 		char abuf[INET6_ADDRSTRLEN];
1854 
1855 		af = pii->pii_af;
1856 		logdebug("logint_delete(%s %s %s/%u)\n",
1857 		    AF_STR(af), li->li_name,
1858 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1859 		    li->li_subnet_len);
1860 	}
1861 
1862 	/* logint must be in the list of logints */
1863 	assert(pii->pii_logint == li || li->li_prev != NULL);
1864 
1865 	/* Remove the logint from the list of logints  */
1866 	if (li->li_prev == NULL) {
1867 		/* logint is the 1st in the list */
1868 		pii->pii_logint = li->li_next;
1869 	} else {
1870 		li->li_prev->li_next = li->li_next;
1871 	}
1872 	if (li->li_next != NULL)
1873 		li->li_next->li_prev = li->li_prev;
1874 	li->li_next = NULL;
1875 	li->li_prev = NULL;
1876 
1877 	/*
1878 	 * If this logint is also being used for probing, then close the
1879 	 * associated socket, if it exists.
1880 	 */
1881 	if (pii->pii_probe_logint == li) {
1882 		if (pii->pii_probe_sock != -1)
1883 			close_probe_socket(pii, _B_TRUE);
1884 		pii->pii_probe_logint = NULL;
1885 	}
1886 
1887 	free(li);
1888 }
1889 
1890 static void
1891 logint_print(struct logint *li)
1892 {
1893 	char abuf[INET6_ADDRSTRLEN];
1894 	int af = li->li_phyint_inst->pii_af;
1895 
1896 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1897 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1898 
1899 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
1900 }
1901 
/*
 * Format `addr' as a printable string in `abuf', a buffer of `len' bytes,
 * and return `abuf'.
 *
 * For AF_INET, `addr' is taken to be an IPv4-mapped address and the IPv4
 * address in its last four bytes is printed in dotted-decimal form; for any
 * other `af' the whole address is printed as IPv6.
 *
 * If the address cannot be formatted (for instance because `abuf' is too
 * small), `abuf' is set to the empty string, so that callers which pass the
 * result straight to a "%s" never print stale or uninitialized contents.
 * Nothing is written if `abuf' is NULL or `len' is not positive.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	const char *str;

	/* A negative length must not reach inet_ntop() as a huge size. */
	if (abuf == NULL || len <= 0)
		return (abuf);

	if (af == AF_INET) {
		struct in_addr	addr_v4;

		/* The IPv4 address is the low-order 32 bits. */
		(void) memcpy(&addr_v4, &addr.s6_addr[12], sizeof (addr_v4));
		str = inet_ntop(AF_INET, &addr_v4, abuf, len);
	} else {
		str = inet_ntop(AF_INET6, &addr, abuf, len);
	}

	/* On failure the buffer contents are not something we can trust. */
	if (str == NULL)
		abuf[0] = '\0';

	return (abuf);
}
1915 
/*
 * Store the IP address given by the [`af',`addr'] pair in the
 * sockaddr_storage that `ssp' points to.  in.mpathd keeps every address
 * internally as an in6_addr; this is how one is turned back into a sockaddr
 * of the right family so that the internal form is not exposed.  `af' must
 * be AF_INET or AF_INET6.
 */
void
addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
{
	assert(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ssp;

		/* Only the sockaddr_in part of the storage is cleared. */
		(void) memset(v4, 0, sizeof (*v4));
		v4->sin_family = AF_INET;
		IN6_V4MAPPED_TO_INADDR(addr, &v4->sin_addr);
	} else if (af == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ssp;

		/* Only the sockaddr_in6 part of the storage is cleared. */
		(void) memset(v6, 0, sizeof (*v6));
		v6->sin6_family = AF_INET6;
		v6->sin6_addr = *addr;
	}
}
1942 
1943 /* Lookup target on its address */
1944 struct target *
1945 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1946 {
1947 	struct target *tg;
1948 
1949 	if (debug & D_TARGET) {
1950 		char abuf[INET6_ADDRSTRLEN];
1951 
1952 		logdebug("target_lookup(%s %s): addr %s\n",
1953 		    AF_STR(pii->pii_af), pii->pii_name,
1954 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1955 	}
1956 
1957 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1958 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1959 			break;
1960 	}
1961 	return (tg);
1962 }
1963 
1964 /*
1965  * Find and return the next active target, for the next probe.
1966  * If no active targets are available, return NULL.
1967  */
1968 struct target *
1969 target_next(struct target *tg)
1970 {
1971 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1972 	struct	target	*marker = tg;
1973 	hrtime_t now;
1974 
1975 	now = gethrtime();
1976 
1977 	/*
1978 	 * Target must be in the list of targets for this phyint
1979 	 * instance.
1980 	 */
1981 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1982 	assert(pii->pii_targets != NULL);
1983 
1984 	/* Return the next active target */
1985 	do {
1986 		/*
1987 		 * Go to the next target. If we hit the end,
1988 		 * reset the ptr to the head
1989 		 */
1990 		tg = tg->tg_next;
1991 		if (tg == NULL)
1992 			tg = pii->pii_targets;
1993 
1994 		assert(TG_STATUS_VALID(tg->tg_status));
1995 
1996 		switch (tg->tg_status) {
1997 		case TG_ACTIVE:
1998 			return (tg);
1999 
2000 		case TG_UNUSED:
2001 			assert(pii->pii_targets_are_routers);
2002 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
2003 				/*
2004 				 * Bubble up the unused target to active
2005 				 */
2006 				tg->tg_status = TG_ACTIVE;
2007 				pii->pii_ntargets++;
2008 				return (tg);
2009 			}
2010 			break;
2011 
2012 		case TG_SLOW:
2013 			assert(pii->pii_targets_are_routers);
2014 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2015 				/*
2016 				 * Bubble up the slow target to unused
2017 				 */
2018 				tg->tg_status = TG_UNUSED;
2019 			}
2020 			break;
2021 
2022 		case TG_DEAD:
2023 			assert(pii->pii_targets_are_routers);
2024 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2025 				/*
2026 				 * Bubble up the dead target to slow
2027 				 */
2028 				tg->tg_status = TG_SLOW;
2029 				tg->tg_latime = now;
2030 			}
2031 			break;
2032 		}
2033 
2034 	} while (tg != marker);
2035 
2036 	return (NULL);
2037 }
2038 
2039 /*
2040  * Select the best available target, that is not already TG_ACTIVE,
2041  * for the caller. The caller will determine whether it wants to
2042  * make the returned target TG_ACTIVE.
2043  * The selection order is as follows.
2044  * 1. pick a TG_UNSED target, if it exists.
2045  * 2. else pick a TG_SLOW target that has recovered, if it exists
2046  * 3. else pick any TG_SLOW target, if it exists
2047  * 4. else pick a TG_DEAD target that has recovered, if it exists
2048  * 5. else pick any TG_DEAD target, if it exists
2049  * 6. else return null
2050  */
2051 static struct target *
2052 target_select_best(struct phyint_instance *pii)
2053 {
2054 	struct target *tg;
2055 	struct target *slow = NULL;
2056 	struct target *dead = NULL;
2057 	struct target *slow_recovered = NULL;
2058 	struct target *dead_recovered = NULL;
2059 	hrtime_t now;
2060 
2061 	now = gethrtime();
2062 
2063 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2064 		assert(TG_STATUS_VALID(tg->tg_status));
2065 
2066 		switch (tg->tg_status) {
2067 		case TG_UNUSED:
2068 			return (tg);
2069 
2070 		case TG_SLOW:
2071 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2072 				slow_recovered = tg;
2073 				/*
2074 				 * Promote the slow_recovered to unused
2075 				 */
2076 				tg->tg_status = TG_UNUSED;
2077 			} else {
2078 				slow = tg;
2079 			}
2080 			break;
2081 
2082 		case TG_DEAD:
2083 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2084 				dead_recovered = tg;
2085 				/*
2086 				 * Promote the dead_recovered to slow
2087 				 */
2088 				tg->tg_status = TG_SLOW;
2089 				tg->tg_latime = now;
2090 			} else {
2091 				dead = tg;
2092 			}
2093 			break;
2094 
2095 		default:
2096 			break;
2097 		}
2098 	}
2099 
2100 	if (slow_recovered != NULL)
2101 		return (slow_recovered);
2102 	else if (slow != NULL)
2103 		return (slow);
2104 	else if (dead_recovered != NULL)
2105 		return (dead_recovered);
2106 	else
2107 		return (dead);
2108 }
2109 
2110 /*
2111  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
2112  * that are active, pick the next best below.
2113  */
2114 static void
2115 target_activate_all(struct phyint_instance *pii)
2116 {
2117 	struct target *tg;
2118 
2119 	assert(pii->pii_ntargets == 0);
2120 	assert(pii->pii_target_next == NULL);
2121 	assert(pii->pii_rtt_target_next == NULL);
2122 	assert(pii->pii_targets_are_routers);
2123 
2124 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
2125 		tg = target_select_best(pii);
2126 		if (tg == NULL) {
2127 			/* We are out of targets */
2128 			return;
2129 		}
2130 
2131 		assert(TG_STATUS_VALID(tg->tg_status));
2132 		assert(tg->tg_status != TG_ACTIVE);
2133 		tg->tg_status = TG_ACTIVE;
2134 		pii->pii_ntargets++;
2135 		if (pii->pii_target_next == NULL) {
2136 			pii->pii_target_next = tg;
2137 			pii->pii_rtt_target_next = tg;
2138 		}
2139 	}
2140 }
2141 
2142 static struct target *
2143 target_first(struct phyint_instance *pii)
2144 {
2145 	struct target *tg;
2146 
2147 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2148 		assert(TG_STATUS_VALID(tg->tg_status));
2149 		if (tg->tg_status == TG_ACTIVE)
2150 			break;
2151 	}
2152 
2153 	return (tg);
2154 }
2155 
2156 /*
2157  * Create a default target entry.
2158  */
2159 void
2160 target_create(struct phyint_instance *pii, struct in6_addr addr,
2161     boolean_t is_router)
2162 {
2163 	struct target *tg;
2164 	struct phyint *pi;
2165 	struct logint *li;
2166 
2167 	if (debug & D_TARGET) {
2168 		char abuf[INET6_ADDRSTRLEN];
2169 
2170 		logdebug("target_create(%s %s, %s)\n",
2171 		    AF_STR(pii->pii_af), pii->pii_name,
2172 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2173 	}
2174 
2175 	/*
2176 	 * If the test address is not yet initialized, do not add
2177 	 * any target, since we cannot determine whether the target
2178 	 * belongs to the same subnet as the test address.
2179 	 */
2180 	li = pii->pii_probe_logint;
2181 	if (li == NULL)
2182 		return;
2183 
2184 	/*
2185 	 * If there are multiple subnets associated with an interface, then
2186 	 * add the target to this phyint instance only if it belongs to the
2187 	 * same subnet as the test address.  This assures us that we will
2188 	 * be able to reach this target through our routing table.
2189 	 */
2190 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
2191 		return;
2192 
2193 	if (pii->pii_targets != NULL) {
2194 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
2195 		if (is_router) {
2196 			if (!pii->pii_targets_are_routers) {
2197 				/*
2198 				 * Prefer router over hosts. Using hosts is a
2199 				 * fallback mechanism, hence delete all host
2200 				 * targets.
2201 				 */
2202 				while (pii->pii_targets != NULL)
2203 					target_delete(pii->pii_targets);
2204 			}
2205 		} else {
2206 			/*
2207 			 * Routers take precedence over hosts. If this
2208 			 * is a router list and we are trying to add a
2209 			 * host, just return. If this is a host list
2210 			 * and if we have sufficient targets, just return
2211 			 */
2212 			if (pii->pii_targets_are_routers ||
2213 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
2214 				return;
2215 		}
2216 	}
2217 
2218 	tg = calloc(1, sizeof (struct target));
2219 	if (tg == NULL) {
2220 		logperror("target_create: calloc");
2221 		return;
2222 	}
2223 
2224 	tg->tg_phyint_inst = pii;
2225 	tg->tg_address = addr;
2226 	tg->tg_in_use = 1;
2227 	tg->tg_rtt_sa = -1;
2228 	tg->tg_num_deferred = 0;
2229 
2230 	/*
2231 	 * If this is the first target, set 'pii_targets_are_routers'
2232 	 * The list of targets is either a list of hosts or list or
2233 	 * routers, but not a mix.
2234 	 */
2235 	if (pii->pii_targets == NULL) {
2236 		assert(pii->pii_ntargets == 0);
2237 		assert(pii->pii_target_next == NULL);
2238 		assert(pii->pii_rtt_target_next == NULL);
2239 		pii->pii_targets_are_routers = is_router ? 1 : 0;
2240 	}
2241 
2242 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
2243 		assert(pii->pii_targets_are_routers);
2244 		assert(pii->pii_target_next != NULL);
2245 		assert(pii->pii_rtt_target_next != NULL);
2246 		tg->tg_status = TG_UNUSED;
2247 	} else {
2248 		if (pii->pii_ntargets == 0) {
2249 			assert(pii->pii_target_next == NULL);
2250 			pii->pii_target_next = tg;
2251 			pii->pii_rtt_target_next = tg;
2252 		}
2253 		pii->pii_ntargets++;
2254 		tg->tg_status = TG_ACTIVE;
2255 	}
2256 
2257 	target_insert(pii, tg);
2258 
2259 	/*
2260 	 * Change state to PI_RUNNING if this phyint instance is capable of
2261 	 * sending and receiving probes -- that is, if we know of at least 1
2262 	 * target, and this phyint instance is probe-capable.  For more
2263 	 * details, see the phyint state diagram in mpd_probe.c.
2264 	 */
2265 	pi = pii->pii_phyint;
2266 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
2267 		if (pi->pi_flags & IFF_FAILED)
2268 			phyint_chstate(pi, PI_FAILED);
2269 		else
2270 			phyint_chstate(pi, PI_RUNNING);
2271 	}
2272 }
2273 
2274 /*
2275  * Add the target address named by `addr' to phyint instance `pii' if it does
2276  * not already exist.  If the target is a router, `is_router' should be set to
2277  * B_TRUE.
2278  */
2279 void
2280 target_add(struct phyint_instance *pii, struct in6_addr addr,
2281     boolean_t is_router)
2282 {
2283 	struct target *tg;
2284 
2285 	if (pii == NULL)
2286 		return;
2287 
2288 	tg = target_lookup(pii, addr);
2289 
2290 	/*
2291 	 * If the target does not exist, create it; target_create() will set
2292 	 * tg_in_use to true.  Even if it exists already, if it's a router
2293 	 * target and we'd previously learned of it through multicast, then we
2294 	 * need to recreate it as a router target.  Otherwise, just set
2295 	 * tg_in_use to to true so that init_router_targets() won't delete it.
2296 	 */
2297 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
2298 		target_create(pii, addr, is_router);
2299 	else if (is_router)
2300 		tg->tg_in_use = 1;
2301 }
2302 
2303 /*
2304  * Insert target at head of linked list of targets for the associated
2305  * phyint instance
2306  */
2307 static void
2308 target_insert(struct phyint_instance *pii, struct target *tg)
2309 {
2310 	tg->tg_next = pii->pii_targets;
2311 	tg->tg_prev = NULL;
2312 	if (tg->tg_next != NULL)
2313 		tg->tg_next->tg_prev = tg;
2314 	pii->pii_targets = tg;
2315 }
2316 
2317 /*
2318  * Delete a target (unlink and free).
2319  */
2320 void
2321 target_delete(struct target *tg)
2322 {
2323 	int af;
2324 	struct phyint_instance	*pii;
2325 	struct phyint_instance	*pii_other;
2326 
2327 	pii = tg->tg_phyint_inst;
2328 	af = pii->pii_af;
2329 
2330 	if (debug & D_TARGET) {
2331 		char abuf[INET6_ADDRSTRLEN];
2332 
2333 		logdebug("target_delete(%s %s, %s)\n",
2334 		    AF_STR(af), pii->pii_name,
2335 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
2336 	}
2337 
2338 	/*
2339 	 * Target must be in the list of targets for this phyint
2340 	 * instance.
2341 	 */
2342 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
2343 
2344 	/*
2345 	 * Reset all references to 'tg' in the probe information
2346 	 * for this phyint.
2347 	 */
2348 	reset_pii_probes(pii, tg);
2349 
2350 	/*
2351 	 * Remove this target from the list of targets of this
2352 	 * phyint instance.
2353 	 */
2354 	if (tg->tg_prev == NULL) {
2355 		pii->pii_targets = tg->tg_next;
2356 	} else {
2357 		tg->tg_prev->tg_next = tg->tg_next;
2358 	}
2359 
2360 	if (tg->tg_next != NULL)
2361 		tg->tg_next->tg_prev = tg->tg_prev;
2362 
2363 	tg->tg_next = NULL;
2364 	tg->tg_prev = NULL;
2365 
2366 	if (tg->tg_status == TG_ACTIVE)
2367 		pii->pii_ntargets--;
2368 
2369 	/*
2370 	 * Adjust the next target to probe, if it points to
2371 	 * to the currently deleted target.
2372 	 */
2373 	if (pii->pii_target_next == tg)
2374 		pii->pii_target_next = target_first(pii);
2375 
2376 	if (pii->pii_rtt_target_next == tg)
2377 		pii->pii_rtt_target_next = target_first(pii);
2378 
2379 	free(tg);
2380 
2381 	/*
2382 	 * The number of active targets pii_ntargets == 0 iff
2383 	 * the next active target pii->pii_target_next == NULL
2384 	 */
2385 	if (pii->pii_ntargets != 0) {
2386 		assert(pii->pii_target_next != NULL);
2387 		assert(pii->pii_rtt_target_next != NULL);
2388 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2389 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2390 		return;
2391 	}
2392 
2393 	/* At this point, we don't have any active targets. */
2394 	assert(pii->pii_target_next == NULL);
2395 	assert(pii->pii_rtt_target_next == NULL);
2396 
2397 	if (pii->pii_targets_are_routers) {
2398 		/*
2399 		 * Activate any TG_SLOW or TG_DEAD router targets,
2400 		 * since we don't have any other targets
2401 		 */
2402 		target_activate_all(pii);
2403 
2404 		if (pii->pii_ntargets != 0) {
2405 			assert(pii->pii_target_next != NULL);
2406 			assert(pii->pii_rtt_target_next != NULL);
2407 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2408 			assert(pii->pii_rtt_target_next->tg_status ==
2409 			    TG_ACTIVE);
2410 			return;
2411 		}
2412 	}
2413 
2414 	/*
2415 	 * If we still don't have any active targets, the list must
2416 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2417 	 * targets. Zero out the probe stats since it will not be
2418 	 * relevant any longer.
2419 	 */
2420 	assert(pii->pii_targets == NULL);
2421 	pii->pii_targets_are_routers = _B_FALSE;
2422 	clear_pii_probe_stats(pii);
2423 	pii_other = phyint_inst_other(pii);
2424 
2425 	/*
2426 	 * If there are no targets on both instances and the interface would
2427 	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
2428 	 * since we cannot probe this phyint any more.  For more details,
2429 	 * please see phyint state diagram in mpd_probe.c.
2430 	 */
2431 	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
2432 	    pii->pii_phyint->pi_state != PI_OFFLINE)
2433 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2434 }
2435 
2436 /*
2437  * Flush the target list of every phyint in the group, if the list
2438  * is a host target list. This is called if group failure is suspected.
2439  * If all targets have failed, multicast will subsequently discover new
2440  * targets. Else it is a group failure.
2441  * Note: This function is a no-op if the list is a router target list.
2442  */
2443 static void
2444 target_flush_hosts(struct phyint_group *pg)
2445 {
2446 	struct phyint *pi;
2447 	struct phyint_instance *pii;
2448 
2449 	if (debug & D_TARGET)
2450 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2451 
2452 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2453 		pii = pi->pi_v4;
2454 		if (pii != NULL && !pii->pii_targets_are_routers) {
2455 			/*
2456 			 * Delete all the targets. When the list becomes
2457 			 * empty, target_delete() will set pii->pii_targets
2458 			 * to NULL.
2459 			 */
2460 			while (pii->pii_targets != NULL)
2461 				target_delete(pii->pii_targets);
2462 		}
2463 		pii = pi->pi_v6;
2464 		if (pii != NULL && !pii->pii_targets_are_routers) {
2465 			/*
2466 			 * Delete all the targets. When the list becomes
2467 			 * empty, target_delete() will set pii->pii_targets
2468 			 * to NULL.
2469 			 */
2470 			while (pii->pii_targets != NULL)
2471 				target_delete(pii->pii_targets);
2472 		}
2473 	}
2474 }
2475 
2476 /*
2477  * Reset all references to 'target' in the probe info, as this target is
2478  * being deleted. The pr_target field is guaranteed to be non-null if
2479  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2480  * pr_target will not be accessed unconditionally.
2481  */
2482 static void
2483 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2484 {
2485 	int i;
2486 
2487 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2488 		if (pii->pii_probes[i].pr_target == tg) {
2489 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
2490 				probe_chstate(&pii->pii_probes[i], pii,
2491 				    PR_LOST);
2492 			}
2493 			pii->pii_probes[i].pr_target = NULL;
2494 		}
2495 	}
2496 
2497 }
2498 
2499 /*
2500  * Clear the probe statistics array.
2501  */
2502 void
2503 clear_pii_probe_stats(struct phyint_instance *pii)
2504 {
2505 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2506 	/* Reset the next probe index in the probe stats array */
2507 	pii->pii_probe_next = 0;
2508 }
2509 
2510 static void
2511 target_print(struct target *tg)
2512 {
2513 	char	abuf[INET6_ADDRSTRLEN];
2514 	char	buf[128];
2515 	char	buf2[128];
2516 	int	af;
2517 	int	i;
2518 
2519 	af = tg->tg_phyint_inst->pii_af;
2520 
2521 	logdebug("Target on %s %s addr %s\n"
2522 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
2523 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2524 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2525 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2526 	    tg->tg_crtt, tg->tg_in_use);
2527 
2528 	buf[0] = '\0';
2529 	for (i = 0; i < tg->tg_num_deferred; i++) {
2530 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2531 		    tg->tg_deferred[i]);
2532 		(void) strlcat(buf, buf2, sizeof (buf));
2533 	}
2534 	logdebug("deferred rtts:%s\n", buf);
2535 }
2536 
2537 void
2538 phyint_inst_print_all(void)
2539 {
2540 	struct phyint_instance *pii;
2541 
2542 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2543 		phyint_inst_print(pii);
2544 	}
2545 }
2546 
2547 /*
2548  * Compare two prefixes that have the same prefix length.
2549  * Fails if the prefix length is unreasonable.
2550  */
2551 boolean_t
2552 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
2553 {
2554 	uchar_t mask;
2555 	int j;
2556 
2557 	if (prefix_len > IPV6_ABITS)
2558 		return (_B_FALSE);
2559 
2560 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2561 		if (p1.s6_addr[j] != p2.s6_addr[j])
2562 			return (_B_FALSE);
2563 
2564 	/* Make the N leftmost bits one */
2565 	mask = 0xff << (8 - prefix_len);
2566 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2567 		return (_B_FALSE);
2568 
2569 	return (_B_TRUE);
2570 }
2571 
2572 /*
2573  * Get the number of UP logints on phyint `pi'.
2574  */
2575 static int
2576 logint_upcount(struct phyint *pi)
2577 {
2578 	struct	logint	*li;
2579 	int count = 0;
2580 
2581 	if (pi->pi_v4 != NULL) {
2582 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
2583 			if (li->li_flags & IFF_UP)
2584 				count++;
2585 		}
2586 	}
2587 
2588 	if (pi->pi_v6 != NULL) {
2589 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
2590 			if (li->li_flags & IFF_UP)
2591 				count++;
2592 		}
2593 	}
2594 
2595 	return (count);
2596 }
2597 
2598 /*
2599  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2600  */
2601 struct phyint_instance *
2602 phyint_inst_other(struct phyint_instance *pii)
2603 {
2604 	if (pii->pii_af == AF_INET)
2605 		return (pii->pii_phyint->pi_v6);
2606 	else
2607 		return (pii->pii_phyint->pi_v4);
2608 }
2609 
2610 /*
2611  * Check whether a phyint is functioning.
2612  */
2613 static boolean_t
2614 phyint_is_functioning(struct phyint *pi)
2615 {
2616 	if (pi->pi_state == PI_RUNNING)
2617 		return (_B_TRUE);
2618 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
2619 }
2620 
2621 /*
2622  * Check whether a phyint is usable.
2623  */
2624 static boolean_t
2625 phyint_is_usable(struct phyint *pi)
2626 {
2627 	if (logint_upcount(pi) == 0)
2628 		return (_B_FALSE);
2629 	return (phyint_is_functioning(pi));
2630 }
2631 
2632 /*
2633  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2634  * Before sending the event, it prepends the current version of the IPMP
2635  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2636  * `nvl' is freed).
2637  */
2638 static int
2639 post_event(const char *subclass, nvlist_t *nvl)
2640 {
2641 	static evchan_t *evchp = NULL;
2642 
2643 	/*
2644 	 * Initialize the event channel if we haven't already done so.
2645 	 */
2646 	if (evchp == NULL) {
2647 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
2648 		if (errno != 0) {
2649 			logerr("cannot create event channel `%s': %s\n",
2650 			    IPMP_EVENT_CHAN, strerror(errno));
2651 			goto failed;
2652 		}
2653 	}
2654 
2655 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2656 	    IPMP_EVENT_CUR_VERSION);
2657 	if (errno != 0) {
2658 		logerr("cannot create `%s' event: %s", subclass,
2659 		    strerror(errno));
2660 		goto failed;
2661 	}
2662 
2663 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
2664 	    "in.mpathd", nvl, EVCH_NOSLEEP);
2665 	if (errno != 0) {
2666 		logerr("cannot send `%s' event: %s\n", subclass,
2667 		    strerror(errno));
2668 		goto failed;
2669 	}
2670 
2671 	nvlist_free(nvl);
2672 	return (0);
2673 failed:
2674 	nvlist_free(nvl);
2675 	return (-1);
2676 }
2677 
2678 /*
2679  * Return the external IPMP state associated with phyint `pi'.
2680  */
2681 static ipmp_if_state_t
2682 ifstate(struct phyint *pi)
2683 {
2684 	switch (pi->pi_state) {
2685 	case PI_INIT:
2686 		return (IPMP_IF_UNKNOWN);
2687 
2688 	case PI_NOTARGETS:
2689 		if (pi->pi_flags & IFF_FAILED)
2690 			return (IPMP_IF_FAILED);
2691 		return (IPMP_IF_UNKNOWN);
2692 
2693 	case PI_OFFLINE:
2694 		return (IPMP_IF_OFFLINE);
2695 
2696 	case PI_FAILED:
2697 		return (IPMP_IF_FAILED);
2698 
2699 	case PI_RUNNING:
2700 		return (IPMP_IF_OK);
2701 	}
2702 
2703 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2704 	abort();
2705 	/* NOTREACHED */
2706 }
2707 
2708 /*
2709  * Return the external IPMP interface type associated with phyint `pi'.
2710  */
2711 static ipmp_if_type_t
2712 iftype(struct phyint *pi)
2713 {
2714 	if (pi->pi_flags & IFF_STANDBY)
2715 		return (IPMP_IF_STANDBY);
2716 	else
2717 		return (IPMP_IF_NORMAL);
2718 }
2719 
2720 /*
2721  * Return the external IPMP link state associated with phyint `pi'.
2722  */
2723 static ipmp_if_linkstate_t
2724 iflinkstate(struct phyint *pi)
2725 {
2726 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
2727 		return (IPMP_LINK_UNKNOWN);
2728 
2729 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
2730 }
2731 
2732 /*
2733  * Return the external IPMP probe state associated with phyint `pi'.
2734  */
2735 static ipmp_if_probestate_t
2736 ifprobestate(struct phyint *pi)
2737 {
2738 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
2739 		return (IPMP_PROBE_DISABLED);
2740 
2741 	if (pi->pi_state == PI_FAILED)
2742 		return (IPMP_PROBE_FAILED);
2743 
2744 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
2745 		return (IPMP_PROBE_UNKNOWN);
2746 
2747 	return (IPMP_PROBE_OK);
2748 }
2749 
2750 /*
2751  * Return the external IPMP target mode associated with phyint instance `pii'.
2752  */
2753 static ipmp_if_targmode_t
2754 iftargmode(struct phyint_instance *pii)
2755 {
2756 	if (!PROBE_ENABLED(pii))
2757 		return (IPMP_TARG_DISABLED);
2758 	else if (pii->pii_targets_are_routers)
2759 		return (IPMP_TARG_ROUTES);
2760 	else
2761 		return (IPMP_TARG_MULTICAST);
2762 }
2763 
2764 /*
2765  * Return the external IPMP flags associated with phyint `pi'.
2766  */
2767 static ipmp_if_flags_t
2768 ifflags(struct phyint *pi)
2769 {
2770 	ipmp_if_flags_t flags = 0;
2771 
2772 	if (logint_upcount(pi) == 0)
2773 		flags |= IPMP_IFFLAG_DOWN;
2774 	if (pi->pi_flags & IFF_INACTIVE)
2775 		flags |= IPMP_IFFLAG_INACTIVE;
2776 	if (pi->pi_hwaddrdup)
2777 		flags |= IPMP_IFFLAG_HWADDRDUP;
2778 	if (phyint_is_functioning(pi) && flags == 0)
2779 		flags |= IPMP_IFFLAG_ACTIVE;
2780 
2781 	return (flags);
2782 }
2783 
2784 /*
2785  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
2786  * no test address, 0.0.0.0 is stored.
2787  */
2788 static struct sockaddr_storage *
2789 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
2790 {
2791 	if (PROBE_ENABLED(pii))
2792 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
2793 	else
2794 		addr2storage(AF_INET6, &in6addr_any, ssp);
2795 
2796 	return (ssp);
2797 }
2798 
2799 /*
2800  * Return the external IPMP group state associated with phyint group `pg'.
2801  */
2802 static ipmp_group_state_t
2803 groupstate(struct phyint_group *pg)
2804 {
2805 	switch (pg->pg_state) {
2806 	case PG_FAILED:
2807 		return (IPMP_GROUP_FAILED);
2808 	case PG_DEGRADED:
2809 		return (IPMP_GROUP_DEGRADED);
2810 	case PG_OK:
2811 		return (IPMP_GROUP_OK);
2812 	}
2813 
2814 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
2815 	abort();
2816 	/* NOTREACHED */
2817 }
2818 
2819 /*
2820  * Return the external IPMP probe state associated with probe `ps'.
2821  */
2822 static ipmp_probe_state_t
2823 probestate(struct probe_stats *ps)
2824 {
2825 	switch (ps->pr_status) {
2826 	case PR_UNUSED:
2827 	case PR_LOST:
2828 		return (IPMP_PROBE_LOST);
2829 	case PR_UNACKED:
2830 		return (IPMP_PROBE_SENT);
2831 	case PR_ACKED:
2832 		return (IPMP_PROBE_ACKED);
2833 	}
2834 
2835 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
2836 	abort();
2837 	/* NOTREACHED */
2838 }
2839 
2840 /*
2841  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
2842  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
2843  */
2844 int
2845 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
2846 {
2847 	nvlist_t *nvl;
2848 	hrtime_t proc_time = 0, recv_time = 0;
2849 	struct sockaddr_storage ss;
2850 	struct target *tg = pr->pr_target;
2851 	int64_t rttavg, rttdev;
2852 
2853 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2854 	if (errno != 0) {
2855 		logperror("cannot create `interface change' event");
2856 		return (-1);
2857 	}
2858 
2859 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
2860 	if (errno != 0)
2861 		goto failed;
2862 
2863 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
2864 	if (errno != 0)
2865 		goto failed;
2866 
2867 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
2868 	if (errno != 0)
2869 		goto failed;
2870 
2871 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
2872 	    pr->pr_hrtime_start);
2873 	if (errno != 0)
2874 		goto failed;
2875 
2876 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
2877 	    pr->pr_hrtime_sent);
2878 	if (errno != 0)
2879 		goto failed;
2880 
2881 	if (pr->pr_status == PR_ACKED) {
2882 		recv_time = pr->pr_hrtime_ackrecv;
2883 		proc_time = pr->pr_hrtime_ackproc;
2884 	}
2885 
2886 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
2887 	if (errno != 0)
2888 		goto failed;
2889 
2890 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
2891 	if (errno != 0)
2892 		goto failed;
2893 
2894 	if (tg != NULL)
2895 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
2896 	else
2897 		addr2storage(pii->pii_af, &in6addr_any, &ss);
2898 
2899 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
2900 	    sizeof (ss));
2901 	if (errno != 0)
2902 		goto failed;
2903 
2904 	rttavg = (tg != NULL) ? (tg->tg_rtt_sa / 8) : 0;
2905 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, rttavg);
2906 	if (errno != 0)
2907 		goto failed;
2908 
2909 	rttdev = (tg != NULL) ? (tg->tg_rtt_sd / 4) : 0;
2910 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, rttdev);
2911 	if (errno != 0)
2912 		goto failed;
2913 
2914 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
2915 failed:
2916 	logperror("cannot create `probe state' event");
2917 	nvlist_free(nvl);
2918 	return (-1);
2919 }
2920 
2921 /*
2922  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2923  * Returns 0 on success, -1 on failure.
2924  */
2925 static int
2926 phyint_group_state_event(struct phyint_group *pg)
2927 {
2928 	nvlist_t	*nvl;
2929 
2930 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2931 	if (errno != 0) {
2932 		logperror("cannot create `group state change' event");
2933 		return (-1);
2934 	}
2935 
2936 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2937 	if (errno != 0)
2938 		goto failed;
2939 
2940 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2941 	if (errno != 0)
2942 		goto failed;
2943 
2944 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2945 	if (errno != 0)
2946 		goto failed;
2947 
2948 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2949 failed:
2950 	logperror("cannot create `group state change' event");
2951 	nvlist_free(nvl);
2952 	return (-1);
2953 }
2954 
2955 /*
2956  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2957  * `pg'.  Returns 0 on success, -1 on failure.
2958  */
2959 static int
2960 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2961 {
2962 	nvlist_t *nvl;
2963 
2964 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2965 	if (errno != 0) {
2966 		logperror("cannot create `group change' event");
2967 		return (-1);
2968 	}
2969 
2970 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2971 	if (errno != 0)
2972 		goto failed;
2973 
2974 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2975 	if (errno != 0)
2976 		goto failed;
2977 
2978 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2979 	    phyint_grouplistsig);
2980 	if (errno != 0)
2981 		goto failed;
2982 
2983 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2984 	if (errno != 0)
2985 		goto failed;
2986 
2987 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2988 failed:
2989 	logperror("cannot create `group change' event");
2990 	nvlist_free(nvl);
2991 	return (-1);
2992 }
2993 
2994 /*
2995  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2996  * group `pg'.	Returns 0 on success, -1 on failure.
2997  */
2998 static int
2999 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
3000     ipmp_if_op_t op)
3001 {
3002 	nvlist_t *nvl;
3003 
3004 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3005 	if (errno != 0) {
3006 		logperror("cannot create `group member change' event");
3007 		return (-1);
3008 	}
3009 
3010 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3011 	if (errno != 0)
3012 		goto failed;
3013 
3014 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3015 	if (errno != 0)
3016 		goto failed;
3017 
3018 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
3019 	if (errno != 0)
3020 		goto failed;
3021 
3022 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3023 	if (errno != 0)
3024 		goto failed;
3025 
3026 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3027 	if (errno != 0)
3028 		goto failed;
3029 
3030 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3031 	if (errno != 0)
3032 		goto failed;
3033 
3034 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
3035 failed:
3036 	logperror("cannot create `group member change' event");
3037 	nvlist_free(nvl);
3038 	return (-1);
3039 
3040 }
3041 
3042 /*
3043  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
3044  * Returns 0 on success, -1 on failure.
3045  */
3046 static int
3047 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
3048 {
3049 	nvlist_t *nvl;
3050 
3051 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3052 	if (errno != 0) {
3053 		logperror("cannot create `interface change' event");
3054 		return (-1);
3055 	}
3056 
3057 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3058 	if (errno != 0)
3059 		goto failed;
3060 
3061 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3062 	if (errno != 0)
3063 		goto failed;
3064 
3065 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3066 	if (errno != 0)
3067 		goto failed;
3068 
3069 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3070 	if (errno != 0)
3071 		goto failed;
3072 
3073 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3074 	if (errno != 0)
3075 		goto failed;
3076 
3077 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
3078 failed:
3079 	logperror("cannot create `interface change' event");
3080 	nvlist_free(nvl);
3081 	return (-1);
3082 
3083 }
3084 
/*
 * Generate a fresh signature.  A signature is conceptually split in two:
 * a random 16-bit "generation number" in the top bits, and a 48-bit
 * monotonically increasing integer in the low bits, which is initialized
 * to 1 here.  The generation number guards against stale updates to
 * entities (e.g., IPMP groups) that were deleted and have since been
 * recreated.  The random number generator is seeded from gethrtime() on
 * the first call.
 */
static uint64_t
gensig(void)
{
	static int seeded = 0;
	uint64_t generation;

	if (!seeded) {
		srand48((long)gethrtime());
		seeded = 1;
	}

	/* Only the low 16 bits of the random value survive the shift */
	generation = (uint64_t)lrand48();
	return ((generation << 48) | 1);
}
3104 
3105 /*
3106  * Store the information associated with group `grname' into a dynamically
3107  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
3108  */
3109 unsigned int
3110 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
3111 {
3112 	struct phyint		*pi;
3113 	struct phyint_group	*pg;
3114 	char			(*ifs)[LIFNAMSIZ];
3115 	unsigned int		i, j;
3116 	unsigned int		nif = 0, naddr = 0;
3117 	lifgroupinfo_t		lifgr;
3118 	addrlist_t		*addrp;
3119 	struct sockaddr_storage	*addrs;
3120 	int			fdt = 0;
3121 
3122 	pg = phyint_group_lookup(grname);
3123 	if (pg == NULL)
3124 		return (IPMP_EUNKGROUP);
3125 
3126 	/*
3127 	 * Tally up the number of interfaces, allocate an array to hold them,
3128 	 * and insert their names into the array.  While we're at it, if any
3129 	 * interface is actually enabled to send probes, save the group fdt.
3130 	 */
3131 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
3132 		nif++;
3133 
3134 	ifs = alloca(nif * sizeof (*ifs));
3135 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
3136 		assert(i < nif);
3137 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
3138 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
3139 			fdt = pg->pg_fdt;
3140 	}
3141 	assert(i == nif);
3142 
3143 	/*
3144 	 * If this is the anonymous group, there's no other information to
3145 	 * collect (since there's no IPMP interface).
3146 	 */
3147 	if (pg == phyint_anongroup) {
3148 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3149 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
3150 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3151 	}
3152 
3153 	/*
3154 	 * Grab some additional information about the group from the kernel.
3155 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
3156 	 * we can use ifsock_v4 even for a V6-only group.)
3157 	 */
3158 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
3159 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
3160 		if (errno == ENOENT)
3161 			return (IPMP_EUNKGROUP);
3162 
3163 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
3164 		return (IPMP_FAILURE);
3165 	}
3166 
3167 	/*
3168 	 * Tally up the number of data addresses, allocate an array to hold
3169 	 * them, and insert their values into the array.
3170 	 */
3171 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
3172 		naddr++;
3173 
3174 	addrs = alloca(naddr * sizeof (*addrs));
3175 	i = 0;
3176 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3177 		/*
3178 		 * It's possible to have duplicate addresses (if some are
3179 		 * down).  Weed the dups out to avoid confusing consumers.
3180 		 * (If groups start having tons of addresses, we'll need a
3181 		 * better algorithm here.)
3182 		 */
3183 		for (j = 0; j < i; j++) {
3184 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
3185 				break;
3186 		}
3187 		if (j == i) {
3188 			assert(i < naddr);
3189 			addrs[i++] = addrp->al_addr;
3190 		}
3191 	}
3192 	naddr = i;
3193 
3194 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3195 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
3196 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
3197 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3198 }
3199 
3200 /*
3201  * Store the target information associated with phyint instance `pii' into a
3202  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
3203  * IPMP error code.
3204  */
3205 unsigned int
3206 gettarginfo(struct phyint_instance *pii, const char *name,
3207     ipmp_targinfo_t **targinfopp)
3208 {
3209 	uint_t ntarg = 0;
3210 	struct target *tg;
3211 	struct sockaddr_storage	ss;
3212 	struct sockaddr_storage *targs = NULL;
3213 
3214 	if (PROBE_CAPABLE(pii)) {
3215 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
3216 		tg = pii->pii_target_next;
3217 		do {
3218 			if (tg->tg_status == TG_ACTIVE) {
3219 				assert(ntarg < pii->pii_ntargets);
3220 				addr2storage(pii->pii_af, &tg->tg_address,
3221 				    &targs[ntarg++]);
3222 			}
3223 			if ((tg = tg->tg_next) == NULL)
3224 				tg = pii->pii_targets;
3225 		} while (tg != pii->pii_target_next);
3226 
3227 		assert(ntarg == pii->pii_ntargets);
3228 	}
3229 
3230 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
3231 	    iftargmode(pii), ntarg, targs);
3232 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3233 }
3234 
3235 /*
3236  * Store the information associated with interface `ifname' into a dynamically
3237  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
3238  */
3239 unsigned int
3240 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
3241 {
3242 	int		retval;
3243 	struct phyint	*pi;
3244 	ipmp_targinfo_t	*targinfo4;
3245 	ipmp_targinfo_t	*targinfo6;
3246 
3247 	pi = phyint_lookup(ifname);
3248 	if (pi == NULL)
3249 		return (IPMP_EUNKIF);
3250 
3251 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
3252 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
3253 		goto out;
3254 
3255 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
3256 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
3257 	    ifflags(pi), targinfo4, targinfo6);
3258 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3259 out:
3260 	if (targinfo4 != NULL)
3261 		ipmp_freetarginfo(targinfo4);
3262 	if (targinfo6 != NULL)
3263 		ipmp_freetarginfo(targinfo6);
3264 	return (retval);
3265 }
3266 
3267 /*
3268  * Store the current list of IPMP groups into a dynamically allocated
3269  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
3270  */
3271 unsigned int
3272 getgrouplist(ipmp_grouplist_t **grlistpp)
3273 {
3274 	struct phyint_group	*pg;
3275 	char			(*groups)[LIFGRNAMSIZ];
3276 	unsigned int		i, ngroup;
3277 
3278 	/*
3279 	 * Tally up the number of groups, allocate an array to hold them, and
3280 	 * insert their names into the array.
3281 	 */
3282 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
3283 		ngroup++;
3284 
3285 	groups = alloca(ngroup * sizeof (*groups));
3286 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
3287 		assert(i < ngroup);
3288 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
3289 	}
3290 	assert(i == ngroup);
3291 
3292 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
3293 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3294 }
3295 
3296 /*
3297  * Store the address information for `ssp' (in group `grname') into a
3298  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
3299  * error code.  (We'd call this function getaddrinfo(), but it would conflict
3300  * with getaddrinfo(3SOCKET)).
3301  */
3302 unsigned int
3303 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
3304     ipmp_addrinfo_t **adinfopp)
3305 {
3306 	int ifsock;
3307 	addrlist_t *addrp, *addrmatchp = NULL;
3308 	ipmp_addr_state_t state;
3309 	const char *binding = "";
3310 	struct lifreq lifr;
3311 	struct phyint_group *pg;
3312 
3313 	if ((pg = phyint_group_lookup(grname)) == NULL)
3314 		return (IPMP_EUNKADDR);
3315 
3316 	/*
3317 	 * Walk through the data addresses, and find a match.  Note that since
3318 	 * some of the addresses may be down, more than one may match.  We
3319 	 * prefer an up address (if one exists).
3320 	 */
3321 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3322 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
3323 			addrmatchp = addrp;
3324 			if (addrmatchp->al_flags & IFF_UP)
3325 				break;
3326 		}
3327 	}
3328 
3329 	if (addrmatchp == NULL)
3330 		return (IPMP_EUNKADDR);
3331 
3332 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
3333 	if (state == IPMP_ADDR_UP) {
3334 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
3335 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
3336 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
3337 			binding = lifr.lifr_binding;
3338 	}
3339 
3340 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
3341 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3342 }
3343 
3344 /*
3345  * Store a snapshot of the IPMP subsystem into a dynamically allocated
3346  * structure pointed to by `*snapp'.  Returns an IPMP error code.
3347  */
3348 unsigned int
3349 getsnap(ipmp_snap_t **snapp)
3350 {
3351 	ipmp_grouplist_t	*grlistp;
3352 	ipmp_groupinfo_t	*grinfop;
3353 	ipmp_addrinfo_t		*adinfop;
3354 	ipmp_addrlist_t		*adlistp;
3355 	ipmp_ifinfo_t		*ifinfop;
3356 	ipmp_snap_t		*snap;
3357 	struct phyint		*pi;
3358 	unsigned int		i, j;
3359 	int			retval;
3360 
3361 	snap = ipmp_snap_create();
3362 	if (snap == NULL)
3363 		return (IPMP_ENOMEM);
3364 
3365 	/*
3366 	 * Add group list.
3367 	 */
3368 	retval = getgrouplist(&snap->sn_grlistp);
3369 	if (retval != IPMP_SUCCESS)
3370 		goto failed;
3371 
3372 	/*
3373 	 * Add information for each group in the list, along with all of its
3374 	 * data addresses.
3375 	 */
3376 	grlistp = snap->sn_grlistp;
3377 	for (i = 0; i < grlistp->gl_ngroup; i++) {
3378 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
3379 		if (retval != IPMP_SUCCESS)
3380 			goto failed;
3381 
3382 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
3383 		if (retval != IPMP_SUCCESS) {
3384 			ipmp_freegroupinfo(grinfop);
3385 			goto failed;
3386 		}
3387 
3388 		adlistp = grinfop->gr_adlistp;
3389 		for (j = 0; j < adlistp->al_naddr; j++) {
3390 			retval = getgraddrinfo(grinfop->gr_name,
3391 			    &adlistp->al_addrs[j], &adinfop);
3392 			if (retval != IPMP_SUCCESS)
3393 				goto failed;
3394 
3395 			retval = ipmp_snap_addaddrinfo(snap, adinfop);
3396 			if (retval != IPMP_SUCCESS) {
3397 				ipmp_freeaddrinfo(adinfop);
3398 				goto failed;
3399 			}
3400 		}
3401 	}
3402 
3403 	/*
3404 	 * Add information for each configured phyint.
3405 	 */
3406 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
3407 		retval = getifinfo(pi->pi_name, &ifinfop);
3408 		if (retval != IPMP_SUCCESS)
3409 			goto failed;
3410 
3411 		retval = ipmp_snap_addifinfo(snap, ifinfop);
3412 		if (retval != IPMP_SUCCESS) {
3413 			ipmp_freeifinfo(ifinfop);
3414 			goto failed;
3415 		}
3416 	}
3417 
3418 	*snapp = snap;
3419 	return (IPMP_SUCCESS);
3420 failed:
3421 	ipmp_snap_free(snap);
3422 	return (retval);
3423 }
3424