xref: /titanic_51/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision 604635facc40339ec5edaeba7cfbf31b615cfbfe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include "mpd_defs.h"
27 #include "mpd_tables.h"
28 
/*
 * Heads of the global phyint, phyint instance and phyint group lists, plus
 * the anonymous group.  The anonymous group is created by phyint_init() when
 * track_all_phyints is set.
 */
struct phyint *phyints = NULL;
struct phyint_instance *phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;
struct phyint_group *phyint_anongroup = NULL;
37 
38 /*
39  * Grouplist signature; initialized in phyint_init().
40  */
41 static uint64_t phyint_grouplistsig;
42 
43 static void phyint_inst_insert(struct phyint_instance *pii);
44 static void phyint_inst_print(struct phyint_instance *pii);
45 
46 static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
47 static void phyint_delete(struct phyint *pi);
48 static boolean_t phyint_is_usable(struct phyint *pi);
49 
50 static void logint_print(struct logint *li);
51 static void logint_insert(struct phyint_instance *pii, struct logint *li);
52 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);
53 
54 static void target_print(struct target *tg);
55 static void target_insert(struct phyint_instance *pii, struct target *tg);
56 static struct target *target_first(struct phyint_instance *pii);
57 static struct target *target_select_best(struct phyint_instance *pii);
58 static void target_flush_hosts(struct phyint_group *pg);
59 
60 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
61 
62 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
63 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
64 
65 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
66 static int phyint_group_state_event(struct phyint_group *pg);
67 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
68 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
69     ipmp_if_op_t op);
70 
71 static int logint_upcount(struct phyint *pi);
72 static uint64_t gensig(void);
73 
74 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
75 int
76 phyint_init(void)
77 {
78 	phyint_grouplistsig = gensig();
79 	if (track_all_phyints) {
80 		phyint_anongroup = phyint_group_create("");
81 		if (phyint_anongroup == NULL)
82 			return (-1);
83 		phyint_group_insert(phyint_anongroup);
84 	}
85 	return (0);
86 }
87 
88 /* Return the phyint with the given name */
89 struct phyint *
90 phyint_lookup(const char *name)
91 {
92 	struct phyint *pi;
93 
94 	if (debug & D_PHYINT)
95 		logdebug("phyint_lookup(%s)\n", name);
96 
97 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
98 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
99 			break;
100 	}
101 	return (pi);
102 }
103 
104 /*
105  * Lookup a phyint in the group that has the same hardware address as `pi', or
106  * NULL if there's none.  If `online_only' is set, then only online phyints
107  * are considered when matching.  Otherwise, phyints that had been offlined
108  * due to a duplicate hardware address will also be considered.
109  */
110 static struct phyint *
111 phyint_lookup_hwaddr(struct phyint *pi, boolean_t online_only)
112 {
113 	struct phyint *pi2;
114 
115 	if (pi->pi_group == phyint_anongroup)
116 		return (NULL);
117 
118 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
119 		if (pi2 == pi)
120 			continue;
121 
122 		/*
123 		 * NOTE: even when online_only is B_FALSE, we ignore phyints
124 		 * that are administratively offline (rather than offline
125 		 * because they're dups); when they're brought back online,
126 		 * they'll be flagged as dups if need be.
127 		 */
128 		if (pi2->pi_state == PI_OFFLINE &&
129 		    (online_only || !pi2->pi_hwaddrdup))
130 			continue;
131 
132 		if (pi2->pi_hwaddrlen == pi->pi_hwaddrlen &&
133 		    bcmp(pi2->pi_hwaddr, pi->pi_hwaddr, pi->pi_hwaddrlen) == 0)
134 			return (pi2);
135 	}
136 	return (NULL);
137 }
138 
139 /*
140  * Respond to DLPI notifications.  Currently, this only processes physical
141  * address changes for the phyint passed via `arg' by onlining or offlining
142  * phyints in the group.
143  */
144 /* ARGSUSED */
145 static void
146 phyint_link_notify(dlpi_handle_t dh, dlpi_notifyinfo_t *dnip, void *arg)
147 {
148 	struct phyint *pi = arg;
149 	struct phyint *oduppi = NULL, *duppi = NULL;
150 
151 	assert((dnip->dni_note & pi->pi_notes) != 0);
152 
153 	if (dnip->dni_note != DL_NOTE_PHYS_ADDR)
154 		return;
155 
156 	assert(dnip->dni_physaddrlen <= DLPI_PHYSADDR_MAX);
157 
158 	/*
159 	 * If our hardware address hasn't changed, there's nothing to do.
160 	 */
161 	if (pi->pi_hwaddrlen == dnip->dni_physaddrlen &&
162 	    bcmp(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen) == 0)
163 		return;
164 
165 	oduppi = phyint_lookup_hwaddr(pi, _B_FALSE);
166 	pi->pi_hwaddrlen = dnip->dni_physaddrlen;
167 	(void) memcpy(pi->pi_hwaddr, dnip->dni_physaddr, pi->pi_hwaddrlen);
168 	duppi = phyint_lookup_hwaddr(pi, _B_FALSE);
169 
170 	if (oduppi != NULL || pi->pi_hwaddrdup) {
171 		/*
172 		 * Our old hardware address was a duplicate.  If we'd been
173 		 * offlined because of it, and our new hardware address is not
174 		 * a duplicate, then bring us online.  Otherwise, `oduppi'
175 		 * must've been the one brought offline; bring it online.
176 		 */
177 		if (pi->pi_hwaddrdup) {
178 			if (duppi == NULL)
179 				(void) phyint_undo_offline(pi);
180 		} else {
181 			assert(oduppi->pi_hwaddrdup);
182 			(void) phyint_undo_offline(oduppi);
183 		}
184 	}
185 
186 	if (duppi != NULL && !pi->pi_hwaddrdup) {
187 		/*
188 		 * Our new hardware address was a duplicate and we're not
189 		 * yet flagged as a duplicate; bring us offline.
190 		 */
191 		pi->pi_hwaddrdup = _B_TRUE;
192 		(void) phyint_offline(pi, 0);
193 	}
194 }
195 
196 /*
197  * Initialize information about the underlying link for `pi', and set us
198  * up to be notified about future changes.  Returns _B_TRUE on success.
199  */
200 boolean_t
201 phyint_link_init(struct phyint *pi)
202 {
203 	int retval;
204 	uint_t notes;
205 	const char *errmsg;
206 	dlpi_notifyid_t id;
207 
208 	pi->pi_notes = 0;
209 	retval = dlpi_open(pi->pi_name, &pi->pi_dh, 0);
210 	if (retval != DLPI_SUCCESS) {
211 		pi->pi_dh = NULL;
212 		errmsg = "cannot open";
213 		goto failed;
214 	}
215 
216 	pi->pi_hwaddrlen = DLPI_PHYSADDR_MAX;
217 	retval = dlpi_get_physaddr(pi->pi_dh, DL_CURR_PHYS_ADDR, pi->pi_hwaddr,
218 	    &pi->pi_hwaddrlen);
219 	if (retval != DLPI_SUCCESS) {
220 		errmsg = "cannot get hardware address";
221 		goto failed;
222 	}
223 
224 	/*
225 	 * Check if the link supports DLPI link state notifications.  For
226 	 * historical reasons, the actual changes are tracked through routing
227 	 * sockets, so we immediately disable the notification upon success.
228 	 */
229 	notes = DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN;
230 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
231 	if (retval == DLPI_SUCCESS) {
232 		(void) dlpi_disabnotify(pi->pi_dh, id, NULL);
233 		pi->pi_notes |= notes;
234 	}
235 
236 	/*
237 	 * Enable notification of hardware address changes to keep pi_hwaddr
238 	 * up-to-date and track if we need to offline/undo-offline phyints.
239 	 */
240 	notes = DL_NOTE_PHYS_ADDR;
241 	retval = dlpi_enabnotify(pi->pi_dh, notes, phyint_link_notify, pi, &id);
242 	if (retval == DLPI_SUCCESS && poll_add(dlpi_fd(pi->pi_dh)) == 0)
243 		pi->pi_notes |= notes;
244 
245 	return (_B_TRUE);
246 failed:
247 	logerr("%s: %s: %s\n", pi->pi_name, errmsg, dlpi_strerror(retval));
248 	if (pi->pi_dh != NULL) {
249 		dlpi_close(pi->pi_dh);
250 		pi->pi_dh = NULL;
251 	}
252 	return (_B_FALSE);
253 }
254 
255 /*
256  * Close use of link on `pi'.
257  */
258 void
259 phyint_link_close(struct phyint *pi)
260 {
261 	if (pi->pi_notes & DL_NOTE_PHYS_ADDR) {
262 		(void) poll_remove(dlpi_fd(pi->pi_dh));
263 		pi->pi_notes &= ~DL_NOTE_PHYS_ADDR;
264 	}
265 
266 	/*
267 	 * NOTE: we don't clear pi_notes here so that iflinkstate() can still
268 	 * properly report the link state even when offline (which is possible
269 	 * since we use IFF_RUNNING to track link state).
270 	 */
271 	dlpi_close(pi->pi_dh);
272 	pi->pi_dh = NULL;
273 }
274 
275 /* Return the phyint instance with the given name and the given family */
276 struct phyint_instance *
277 phyint_inst_lookup(int af, char *name)
278 {
279 	struct phyint *pi;
280 
281 	if (debug & D_PHYINT)
282 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
283 
284 	assert(af == AF_INET || af == AF_INET6);
285 
286 	pi = phyint_lookup(name);
287 	if (pi == NULL)
288 		return (NULL);
289 
290 	return (PHYINT_INSTANCE(pi, af));
291 }
292 
293 struct phyint_group *
294 phyint_group_lookup(const char *pg_name)
295 {
296 	struct phyint_group *pg;
297 
298 	if (debug & D_PHYINT)
299 		logdebug("phyint_group_lookup(%s)\n", pg_name);
300 
301 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
302 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
303 			break;
304 	}
305 	return (pg);
306 }
307 
308 /*
309  * Insert the phyint in the linked list of all phyints. If the phyint belongs
310  * to some group, insert it in the phyint group list.
311  */
312 static void
313 phyint_insert(struct phyint *pi, struct phyint_group *pg)
314 {
315 	if (debug & D_PHYINT)
316 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
317 
318 	/* Insert the phyint at the head of the 'all phyints' list */
319 	pi->pi_next = phyints;
320 	pi->pi_prev = NULL;
321 	if (phyints != NULL)
322 		phyints->pi_prev = pi;
323 	phyints = pi;
324 
325 	/*
326 	 * Insert the phyint at the head of the 'phyint_group members' list
327 	 * of the phyint group to which it belongs.
328 	 */
329 	pi->pi_pgnext = NULL;
330 	pi->pi_pgprev = NULL;
331 	pi->pi_group = pg;
332 
333 	pi->pi_pgnext = pg->pg_phyint;
334 	if (pi->pi_pgnext != NULL)
335 		pi->pi_pgnext->pi_pgprev = pi;
336 	pg->pg_phyint = pi;
337 
338 	/* Refresh the group state now that this phyint has been added */
339 	phyint_group_refresh_state(pg);
340 
341 	pg->pg_sig++;
342 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
343 }
344 
345 /* Insert the phyint instance in the linked list of all phyint instances. */
346 static void
347 phyint_inst_insert(struct phyint_instance *pii)
348 {
349 	if (debug & D_PHYINT) {
350 		logdebug("phyint_inst_insert(%s %s)\n",
351 		    AF_STR(pii->pii_af), pii->pii_name);
352 	}
353 
354 	/*
355 	 * Insert the phyint at the head of the 'all phyint instances' list.
356 	 */
357 	pii->pii_next = phyint_instances;
358 	pii->pii_prev = NULL;
359 	if (phyint_instances != NULL)
360 		phyint_instances->pii_prev = pii;
361 	phyint_instances = pii;
362 }
363 
364 /*
365  * Create a new phyint with the given parameters. Also insert it into
366  * the list of all phyints and the list of phyint group members by calling
367  * phyint_insert().
368  */
369 static struct phyint *
370 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
371     uint64_t flags)
372 {
373 	struct phyint *pi;
374 
375 	pi = calloc(1, sizeof (struct phyint));
376 	if (pi == NULL) {
377 		logperror("phyint_create: calloc");
378 		return (NULL);
379 	}
380 
381 	/*
382 	 * Record the phyint values.
383 	 */
384 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
385 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
386 	pi->pi_ifindex = ifindex;
387 	pi->pi_icmpid = htons(((getpid() & 0xFF) << 8) | (ifindex & 0xFF));
388 
389 	pi->pi_state = PI_INIT;
390 	pi->pi_flags = PHYINT_FLAGS(flags);
391 
392 	/*
393 	 * Initialize the link state.  The link state is initialized to
394 	 * up, so that if the link is down when IPMP starts monitoring
395 	 * the interface, it will appear as though there has been a
396 	 * transition from the link up to link down.  This avoids
397 	 * having to treat this situation as a special case.
398 	 */
399 	INIT_LINK_STATE(pi);
400 
401 	if (!phyint_link_init(pi)) {
402 		free(pi);
403 		return (NULL);
404 	}
405 
406 	/*
407 	 * Insert the phyint in the list of all phyints, and the
408 	 * list of phyint group members
409 	 */
410 	phyint_insert(pi, pg);
411 
412 	/*
413 	 * If the interface is offline, we set the state to PI_OFFLINE.
414 	 * Otherwise, optimistically consider this interface running.  Later
415 	 * (in process_link_state_changes()), we will adjust this to match the
416 	 * current state of the link.  Further, if test addresses are
417 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
418 	 * to either PI_RUNNING or PI_FAILED depending on the probe results.
419 	 */
420 	if (flags & IFF_OFFLINE)
421 		phyint_chstate(pi, PI_OFFLINE);
422 	else
423 		phyint_transition_to_running(pi); /* calls phyint_chstate() */
424 
425 	return (pi);
426 }
427 
428 /*
429  * Create a new phyint instance belonging to the phyint 'pi' and address
430  * family 'af'. Also insert it into the list of all phyint instances by
431  * calling phyint_inst_insert().
432  */
433 static struct phyint_instance *
434 phyint_inst_create(struct phyint *pi, int af)
435 {
436 	struct phyint_instance *pii;
437 
438 	pii = calloc(1, sizeof (struct phyint_instance));
439 	if (pii == NULL) {
440 		logperror("phyint_inst_create: calloc");
441 		return (NULL);
442 	}
443 
444 	/*
445 	 * Attach the phyint instance to the phyint.
446 	 * Set the back pointers as well
447 	 */
448 	pii->pii_phyint = pi;
449 	if (af == AF_INET)
450 		pi->pi_v4 = pii;
451 	else
452 		pi->pi_v6 = pii;
453 
454 	pii->pii_in_use = 1;
455 	pii->pii_probe_sock = -1;
456 	pii->pii_snxt = 1;
457 	pii->pii_af = af;
458 	pii->pii_fd_hrtime = gethrtime() +
459 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
460 	pii->pii_flags = pi->pi_flags;
461 
462 	/* Insert the phyint instance in the list of all phyint instances. */
463 	phyint_inst_insert(pii);
464 	return (pii);
465 }
466 
467 /*
468  * Change the state of phyint `pi' to state `state'.
469  */
470 void
471 phyint_chstate(struct phyint *pi, enum pi_state state)
472 {
473 	/*
474 	 * To simplify things, some callers always set a given state
475 	 * regardless of the previous state of the phyint (e.g., setting
476 	 * PI_RUNNING when it's already set).  We shouldn't bother
477 	 * generating an event or consuming a signature for these, since
478 	 * the actual state of the interface is unchanged.
479 	 */
480 	if (pi->pi_state == state)
481 		return;
482 
483 	pi->pi_state = state;
484 	phyint_changed(pi);
485 }
486 
487 /*
488  * Note that `pi' has changed state.
489  */
490 void
491 phyint_changed(struct phyint *pi)
492 {
493 	pi->pi_group->pg_sig++;
494 	(void) phyint_state_event(pi->pi_group, pi);
495 }
496 
497 /*
498  * Insert the phyint group in the linked list of all phyint groups
499  * at the head of the list
500  */
501 void
502 phyint_group_insert(struct phyint_group *pg)
503 {
504 	pg->pg_next = phyint_groups;
505 	pg->pg_prev = NULL;
506 	if (phyint_groups != NULL)
507 		phyint_groups->pg_prev = pg;
508 	phyint_groups = pg;
509 
510 	phyint_grouplistsig++;
511 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
512 }
513 
514 /*
515  * Create a new phyint group called 'name'.
516  */
517 struct phyint_group *
518 phyint_group_create(const char *name)
519 {
520 	struct	phyint_group *pg;
521 
522 	if (debug & D_PHYINT)
523 		logdebug("phyint_group_create(%s)\n", name);
524 
525 	pg = calloc(1, sizeof (struct phyint_group));
526 	if (pg == NULL) {
527 		logperror("phyint_group_create: calloc");
528 		return (NULL);
529 	}
530 
531 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
532 	pg->pg_sig = gensig();
533 	pg->pg_fdt = user_failure_detection_time;
534 	pg->pg_probeint = user_probe_interval;
535 	pg->pg_in_use = _B_TRUE;
536 
537 	/*
538 	 * Normal groups always start in the PG_FAILED state since they
539 	 * have no active interfaces.  In contrast, anonymous groups are
540 	 * heterogeneous and thus always PG_OK.
541 	 */
542 	pg->pg_state = (name[0] == '\0' ? PG_OK : PG_FAILED);
543 
544 	return (pg);
545 }
546 
547 /*
548  * Change the state of the phyint group `pg' to state `state'.
549  */
550 void
551 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
552 {
553 	assert(pg != phyint_anongroup);
554 
555 	/*
556 	 * To simplify things, some callers always set a given state
557 	 * regardless of the previous state of the group (e.g., setting
558 	 * PG_DEGRADED when it's already set).  We shouldn't bother
559 	 * generating an event or consuming a signature for these, since
560 	 * the actual state of the group is unchanged.
561 	 */
562 	if (pg->pg_state == state)
563 		return;
564 
565 	pg->pg_state = state;
566 
567 	switch (state) {
568 	case PG_FAILED:
569 		/*
570 		 * We can never know with certainty that a group has
571 		 * failed.  It is possible that all known targets have
572 		 * failed simultaneously, and new targets have come up
573 		 * instead. If the targets are routers then router
574 		 * discovery will kick in, and we will see the new routers
575 		 * thru routing socket messages. But if the targets are
576 		 * hosts, we have to discover it by multicast.	So flush
577 		 * all the host targets. The next probe will send out a
578 		 * multicast echo request. If this is a group failure, we
579 		 * will still not see any response, otherwise the group
580 		 * will be repaired after we get NUM_PROBE_REPAIRS
581 		 * consecutive unicast replies on any phyint.
582 		 */
583 		target_flush_hosts(pg);
584 		break;
585 
586 	case PG_OK:
587 	case PG_DEGRADED:
588 		break;
589 
590 	default:
591 		logerr("phyint_group_chstate: invalid group state %d; "
592 		    "aborting\n", state);
593 		abort();
594 	}
595 
596 	pg->pg_sig++;
597 	(void) phyint_group_state_event(pg);
598 }
599 
600 /*
601  * Create a new phyint instance and initialize it from the values supplied by
602  * the kernel. Always check for ENXIO before logging any error, because the
603  * interface could have vanished after completion of SIOCGLIFCONF.
604  * Return values:
605  *	pointer to the phyint instance on success
606  *	NULL on failure Eg. if the phyint instance is not found in the kernel
607  */
608 struct phyint_instance *
609 phyint_inst_init_from_k(int af, char *pi_name)
610 {
611 	char	pg_name[LIFNAMSIZ + 1];
612 	int	ifsock;
613 	uint_t	ifindex;
614 	uint64_t	flags;
615 	struct lifreq	lifr;
616 	struct phyint	*pi;
617 	struct phyint_instance	*pii;
618 	boolean_t	pi_created;
619 	struct phyint_group	*pg;
620 
621 retry:
622 	pii = NULL;
623 	pi = NULL;
624 	pg = NULL;
625 	pi_created = _B_FALSE;
626 
627 	if (debug & D_PHYINT) {
628 		logdebug("phyint_inst_init_from_k(%s %s)\n",
629 		    AF_STR(af), pi_name);
630 	}
631 
632 	assert(af == AF_INET || af == AF_INET6);
633 
634 	/* Get the socket for doing ioctls */
635 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
636 
637 	/*
638 	 * Get the interface flags.  Ignore virtual interfaces, IPMP
639 	 * meta-interfaces, point-to-point interfaces, and interfaces
640 	 * that can't support multicast.
641 	 */
642 	(void) strlcpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
643 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
644 		if (errno != ENXIO) {
645 			logperror("phyint_inst_init_from_k:"
646 			    " ioctl (get flags)");
647 		}
648 		return (NULL);
649 	}
650 	flags = lifr.lifr_flags;
651 	if (!(flags & IFF_MULTICAST) ||
652 	    (flags & (IFF_VIRTUAL|IFF_IPMP|IFF_POINTOPOINT)))
653 		return (NULL);
654 
655 	/*
656 	 * Get the ifindex for recording later in our tables, in case we need
657 	 * to create a new phyint.
658 	 */
659 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
660 		if (errno != ENXIO) {
661 			logperror("phyint_inst_init_from_k: "
662 			    " ioctl (get lifindex)");
663 		}
664 		return (NULL);
665 	}
666 	ifindex = lifr.lifr_index;
667 
668 	/*
669 	 * Get the phyint group name of this phyint, from the kernel.
670 	 */
671 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
672 		if (errno != ENXIO) {
673 			logperror("phyint_inst_init_from_k: "
674 			    "ioctl (get group name)");
675 		}
676 		return (NULL);
677 	}
678 	(void) strlcpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
679 
680 	/*
681 	 * If the phyint is not part of any group, pg_name is the
682 	 * null string. If 'track_all_phyints' is false, there is no
683 	 * need to create a phyint.
684 	 */
685 	if (pg_name[0] == '\0' && !track_all_phyints) {
686 		/*
687 		 * If the IFF_FAILED, IFF_INACTIVE, or IFF_OFFLINE flags are
688 		 * set, reset them. These flags shouldn't be set if in.mpathd
689 		 * isn't tracking the interface.
690 		 */
691 		if ((flags & (IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE))) {
692 			lifr.lifr_flags = flags &
693 			    ~(IFF_FAILED | IFF_INACTIVE | IFF_OFFLINE);
694 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
695 				if (errno != ENXIO) {
696 					logperror("phyint_inst_init_from_k:"
697 					    " ioctl (set flags)");
698 				}
699 			}
700 		}
701 		return (NULL);
702 	}
703 
704 	/*
705 	 * We need to create a new phyint instance.  We may also need to
706 	 * create the group if e.g. the SIOCGLIFCONF loop in initifs() found
707 	 * an underlying interface before it found its IPMP meta-interface.
708 	 * Note that we keep any created groups even if phyint_inst_from_k()
709 	 * fails since a group's existence is not dependent on the ability of
710 	 * in.mpathd to the track the group's interfaces.
711 	 */
712 	if ((pg = phyint_group_lookup(pg_name)) == NULL) {
713 		if ((pg = phyint_group_create(pg_name)) == NULL) {
714 			logerr("phyint_inst_init_from_k: cannot create group "
715 			    "%s\n", pg_name);
716 			return (NULL);
717 		}
718 		phyint_group_insert(pg);
719 	}
720 
721 	/*
722 	 * Lookup the phyint. If the phyint does not exist create it.
723 	 */
724 	pi = phyint_lookup(pi_name);
725 	if (pi == NULL) {
726 		pi = phyint_create(pi_name, pg, ifindex, flags);
727 		if (pi == NULL) {
728 			logerr("phyint_inst_init_from_k:"
729 			    " unable to create phyint %s\n", pi_name);
730 			return (NULL);
731 		}
732 		pi_created = _B_TRUE;
733 	} else {
734 		/* The phyint exists already. */
735 		assert(pi_created == _B_FALSE);
736 		/*
737 		 * Normally we should see consistent values for the IPv4 and
738 		 * IPv6 instances, for phyint properties. If we don't, it
739 		 * means things have changed underneath us, and we should
740 		 * resync our tables with the kernel. Check whether the
741 		 * interface index has changed. If so, it is most likely
742 		 * the interface has been unplumbed and replumbed,
743 		 * while we are yet to update our tables. Do it now.
744 		 */
745 		if (pi->pi_ifindex != ifindex) {
746 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
747 			goto retry;
748 		}
749 		assert(PHYINT_INSTANCE(pi, af) == NULL);
750 
751 		/*
752 		 * If the group name seen by the IPv4 and IPv6 instances
753 		 * are different, it is most likely the groupname has
754 		 * changed, while we are yet to update our tables. Do it now.
755 		 */
756 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
757 			phyint_inst_delete(PHYINT_INSTANCE(pi,
758 			    AF_OTHER(af)));
759 			goto retry;
760 		}
761 	}
762 
763 	/*
764 	 * Create a new phyint instance, corresponding to the 'af'
765 	 * passed in.
766 	 */
767 	pii = phyint_inst_create(pi, af);
768 	if (pii == NULL) {
769 		logerr("phyint_inst_init_from_k: unable to create"
770 		    "phyint inst %s\n", pi->pi_name);
771 		if (pi_created)
772 			phyint_delete(pi);
773 
774 		return (NULL);
775 	}
776 
777 	if (pi_created) {
778 		/*
779 		 * If this phyint does not have a unique hardware address in its
780 		 * group, offline it.  (The change_pif_flags() implementation
781 		 * requires that we defer this until after the phyint_instance
782 		 * is created.)
783 		 */
784 		if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
785 			pi->pi_hwaddrdup = _B_TRUE;
786 			(void) phyint_offline(pi, 0);
787 		}
788 	}
789 
790 	return (pii);
791 }
792 
793 /*
794  * Bind pii_probe_sock to the address associated with pii_probe_logint.
795  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
796  * targets. Do the common part in this function, and complete the
797  * initializations by calling the protocol specific functions
798  * phyint_inst_v{4,6}_sockinit() respectively.
799  *
800  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
801  */
802 boolean_t
803 phyint_inst_sockinit(struct phyint_instance *pii)
804 {
805 	boolean_t success;
806 	struct phyint_group *pg;
807 
808 	if (debug & D_PHYINT) {
809 		logdebug("phyint_inst_sockinit(%s %s)\n",
810 		    AF_STR(pii->pii_af), pii->pii_name);
811 	}
812 
813 	assert(pii->pii_probe_logint != NULL);
814 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
815 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
816 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
817 
818 	/*
819 	 * If the socket is already bound, close pii_probe_sock
820 	 */
821 	if (pii->pii_probe_sock != -1)
822 		close_probe_socket(pii, _B_TRUE);
823 
824 	/*
825 	 * If the phyint is not part of a named group and track_all_phyints is
826 	 * false, simply return.
827 	 */
828 	pg = pii->pii_phyint->pi_group;
829 	if (pg == phyint_anongroup && !track_all_phyints) {
830 		if (debug & D_PHYINT)
831 			logdebug("phyint_inst_sockinit: no group\n");
832 		return (_B_FALSE);
833 	}
834 
835 	/*
836 	 * Initialize the socket by calling the protocol specific function.
837 	 * If it succeeds, add the socket to the poll list.
838 	 */
839 	if (pii->pii_af == AF_INET6)
840 		success = phyint_inst_v6_sockinit(pii);
841 	else
842 		success = phyint_inst_v4_sockinit(pii);
843 
844 	if (success && (poll_add(pii->pii_probe_sock) == 0))
845 		return (_B_TRUE);
846 
847 	/* Something failed, cleanup and return false */
848 	if (pii->pii_probe_sock != -1)
849 		close_probe_socket(pii, _B_FALSE);
850 
851 	return (_B_FALSE);
852 }
853 
854 /*
855  * IPv6 specific part in initializing the pii_probe_sock. This socket is
856  * used to send/receive ICMPv6 probe packets.
857  */
858 static boolean_t
859 phyint_inst_v6_sockinit(struct phyint_instance *pii)
860 {
861 	icmp6_filter_t filter;
862 	int hopcount = 1;
863 	int off = 0;
864 	int on = 1;
865 	struct	sockaddr_in6	testaddr;
866 	int flags;
867 
868 	/*
869 	 * Open a raw socket with ICMPv6 protocol.
870 	 *
871 	 * Use IPV6_BOUND_IF to make sure that probes are sent and received on
872 	 * the specified phyint only.  Bind to the test address to ensure that
873 	 * the responses are sent to the specified phyint.
874 	 *
875 	 * Set the hopcount to 1 so that probe packets are not routed.
876 	 * Disable multicast loopback. Set the receive filter to
877 	 * receive only ICMPv6 echo replies.
878 	 */
879 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
880 	if (pii->pii_probe_sock < 0) {
881 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
882 		return (_B_FALSE);
883 	}
884 
885 	/*
886 	 * Probes must not block in case of lower layer issues.
887 	 */
888 	if ((flags = fcntl(pii->pii_probe_sock, F_GETFL, 0)) == -1) {
889 		logperror_pii(pii, "phyint_inst_v6_sockinit: fcntl"
890 		    " F_GETFL");
891 		return (_B_FALSE);
892 	}
893 	if (fcntl(pii->pii_probe_sock, F_SETFL,
894 	    flags | O_NONBLOCK) == -1) {
895 		logperror_pii(pii, "phyint_inst_v6_sockinit: fcntl"
896 		    " F_SETFL O_NONBLOCK");
897 		return (_B_FALSE);
898 	}
899 
900 	bzero(&testaddr, sizeof (testaddr));
901 	testaddr.sin6_family = AF_INET6;
902 	testaddr.sin6_port = 0;
903 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
904 
905 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
906 	    sizeof (testaddr)) < 0) {
907 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
908 		return (_B_FALSE);
909 	}
910 
911 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_IF,
912 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
913 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
914 		    " IPV6_MULTICAST_IF");
915 		return (_B_FALSE);
916 	}
917 
918 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_BOUND_IF,
919 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
920 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
921 		    " IPV6_BOUND_IF");
922 		return (_B_FALSE);
923 	}
924 
925 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
926 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
927 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
928 		    " IPV6_UNICAST_HOPS");
929 		return (_B_FALSE);
930 	}
931 
932 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
933 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
934 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
935 		    " IPV6_MULTICAST_HOPS");
936 		return (_B_FALSE);
937 	}
938 
939 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
940 	    (char *)&off, sizeof (off)) < 0) {
941 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
942 		    " IPV6_MULTICAST_LOOP");
943 		return (_B_FALSE);
944 	}
945 
946 	/*
947 	 * Filter out so that we only receive ICMP echo replies
948 	 */
949 	ICMP6_FILTER_SETBLOCKALL(&filter);
950 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
951 
952 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
953 	    (char *)&filter, sizeof (filter)) < 0) {
954 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
955 		    " ICMP6_FILTER");
956 		return (_B_FALSE);
957 	}
958 
959 	/* Enable receipt of hoplimit */
960 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
961 	    &on, sizeof (on)) < 0) {
962 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
963 		    " IPV6_RECVHOPLIMIT");
964 		return (_B_FALSE);
965 	}
966 
967 	/* Enable receipt of timestamp */
968 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP,
969 	    &on, sizeof (on)) < 0) {
970 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
971 		    " SO_TIMESTAMP");
972 		return (_B_FALSE);
973 	}
974 
975 	return (_B_TRUE);
976 }
977 
978 /*
979  * IPv4 specific part in initializing the pii_probe_sock. This socket is
980  * used to send/receive ICMPv4 probe packets.
981  */
982 static boolean_t
983 phyint_inst_v4_sockinit(struct phyint_instance *pii)
984 {
985 	struct sockaddr_in  testaddr;
986 	char	char_off = 0;
987 	int	ttl = 1;
988 	char	char_ttl = 1;
989 	int	on = 1;
990 	int	flags;
991 
992 	/*
993 	 * Open a raw socket with ICMPv4 protocol.
994 	 *
995 	 * Use IP_BOUND_IF to make sure that probes are sent and received on
996 	 * the specified phyint only.  Bind to the test address to ensure that
997 	 * the responses are sent to the specified phyint.
998 	 *
999 	 * Set the ttl to 1 so that probe packets are not routed.
1000 	 * Disable multicast loopback.  Enable receipt of timestamp.
1001 	 */
1002 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
1003 	if (pii->pii_probe_sock < 0) {
1004 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
1005 		return (_B_FALSE);
1006 	}
1007 
1008 	/*
1009 	 * Probes must not block in case of lower layer issues.
1010 	 */
1011 	if ((flags = fcntl(pii->pii_probe_sock, F_GETFL, 0)) == -1) {
1012 		logperror_pii(pii, "phyint_inst_v4_sockinit: fcntl"
1013 		    " F_GETFL");
1014 		return (_B_FALSE);
1015 	}
1016 	if (fcntl(pii->pii_probe_sock, F_SETFL,
1017 	    flags | O_NONBLOCK) == -1) {
1018 		logperror_pii(pii, "phyint_inst_v4_sockinit: fcntl"
1019 		    " F_SETFL O_NONBLOCK");
1020 		return (_B_FALSE);
1021 	}
1022 
1023 	bzero(&testaddr, sizeof (testaddr));
1024 	testaddr.sin_family = AF_INET;
1025 	testaddr.sin_port = 0;
1026 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
1027 	    &testaddr.sin_addr);
1028 
1029 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
1030 	    sizeof (testaddr)) < 0) {
1031 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
1032 		return (_B_FALSE);
1033 	}
1034 
1035 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_BOUND_IF,
1036 	    &pii->pii_ifindex, sizeof (uint_t)) < 0) {
1037 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1038 		    " IP_BOUND_IF");
1039 		return (_B_FALSE);
1040 	}
1041 
1042 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_IF,
1043 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
1044 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1045 		    " IP_MULTICAST_IF");
1046 		return (_B_FALSE);
1047 	}
1048 
1049 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
1050 	    (char *)&ttl, sizeof (ttl)) < 0) {
1051 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1052 		    " IP_TTL");
1053 		return (_B_FALSE);
1054 	}
1055 
1056 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
1057 	    (char *)&char_off, sizeof (char_off)) == -1) {
1058 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1059 		    " IP_MULTICAST_LOOP");
1060 		return (_B_FALSE);
1061 	}
1062 
1063 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
1064 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
1065 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1066 		    " IP_MULTICAST_TTL");
1067 		return (_B_FALSE);
1068 	}
1069 
1070 	if (setsockopt(pii->pii_probe_sock, SOL_SOCKET, SO_TIMESTAMP, &on,
1071 	    sizeof (on)) < 0) {
1072 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
1073 		    " SO_TIMESTAMP");
1074 		return (_B_FALSE);
1075 	}
1076 
1077 	return (_B_TRUE);
1078 }
1079 
1080 /*
1081  * Remove the phyint group from the list of 'all phyint groups'
1082  * and free it.
1083  */
1084 void
1085 phyint_group_delete(struct phyint_group *pg)
1086 {
1087 	/*
1088 	 * The anonymous group always exists, even when empty.
1089 	 */
1090 	if (pg == phyint_anongroup)
1091 		return;
1092 
1093 	if (debug & D_PHYINT)
1094 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
1095 
1096 	/*
1097 	 * The phyint group must be empty, and must not have any phyints.
1098 	 * The phyint group must be in the list of all phyint groups
1099 	 */
1100 	assert(pg->pg_phyint == NULL);
1101 	assert(phyint_groups == pg || pg->pg_prev != NULL);
1102 
1103 	if (pg->pg_prev != NULL)
1104 		pg->pg_prev->pg_next = pg->pg_next;
1105 	else
1106 		phyint_groups = pg->pg_next;
1107 
1108 	if (pg->pg_next != NULL)
1109 		pg->pg_next->pg_prev = pg->pg_prev;
1110 
1111 	pg->pg_next = NULL;
1112 	pg->pg_prev = NULL;
1113 
1114 	phyint_grouplistsig++;
1115 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
1116 
1117 	addrlist_free(&pg->pg_addrs);
1118 	free(pg);
1119 }
1120 
1121 /*
1122  * Refresh the state of `pg' based on its current members.
1123  */
1124 void
1125 phyint_group_refresh_state(struct phyint_group *pg)
1126 {
1127 	enum pg_state state;
1128 	enum pg_state origstate = pg->pg_state;
1129 	struct phyint *pi, *usablepi;
1130 	uint_t nif = 0, nusable = 0;
1131 
1132 	/*
1133 	 * Anonymous groups never change state.
1134 	 */
1135 	if (pg == phyint_anongroup)
1136 		return;
1137 
1138 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
1139 		nif++;
1140 		if (phyint_is_usable(pi)) {
1141 			nusable++;
1142 			usablepi = pi;
1143 		}
1144 	}
1145 
1146 	if (nusable == 0)
1147 		state = PG_FAILED;
1148 	else if (nif == nusable)
1149 		state = PG_OK;
1150 	else
1151 		state = PG_DEGRADED;
1152 
1153 	phyint_group_chstate(pg, state);
1154 
1155 	/*
1156 	 * If we're shutting down, skip logging messages since otherwise our
1157 	 * shutdown housecleaning will make us report that groups are unusable.
1158 	 */
1159 	if (cleanup_started)
1160 		return;
1161 
1162 	/*
1163 	 * NOTE: We use pg_failmsg_printed rather than origstate since
1164 	 * otherwise at startup we'll log a "now usable" message when the
1165 	 * first usable phyint is added to an empty group.
1166 	 */
1167 	if (state != PG_FAILED && pg->pg_failmsg_printed) {
1168 		assert(origstate == PG_FAILED);
1169 		logerr("At least 1 IP interface (%s) in group %s is now "
1170 		    "usable\n", usablepi->pi_name, pg->pg_name);
1171 		pg->pg_failmsg_printed = _B_FALSE;
1172 	} else if (origstate != PG_FAILED && state == PG_FAILED) {
1173 		logerr("All IP interfaces in group %s are now unusable\n",
1174 		    pg->pg_name);
1175 		pg->pg_failmsg_printed = _B_TRUE;
1176 	}
1177 }
1178 
1179 /*
1180  * Extract information from the kernel about the desired phyint.
1181  * Look only for properties of the phyint and not properties of logints.
1182  * Take appropriate action on the changes.
1183  * Return codes:
1184  *	PI_OK
1185  *		The phyint exists in the kernel and matches our knowledge
1186  *		of the phyint.
1187  *	PI_DELETED
1188  *		The phyint has vanished in the kernel.
1189  *	PI_IFINDEX_CHANGED
1190  *		The phyint's interface index has changed.
1191  *		Ask the caller to delete and recreate the phyint.
1192  *	PI_IOCTL_ERROR
1193  *		Some ioctl error. Don't change anything.
1194  *	PI_GROUP_CHANGED
1195  *		The phyint has changed group.
1196  */
1197 int
1198 phyint_inst_update_from_k(struct phyint_instance *pii)
1199 {
1200 	struct lifreq lifr;
1201 	int	ifsock;
1202 	struct phyint *pi;
1203 
1204 	pi = pii->pii_phyint;
1205 
1206 	if (debug & D_PHYINT) {
1207 		logdebug("phyint_inst_update_from_k(%s %s)\n",
1208 		    AF_STR(pii->pii_af), pi->pi_name);
1209 	}
1210 
1211 	/*
1212 	 * Get the ifindex from the kernel, for comparison with the
1213 	 * value in our tables.
1214 	 */
1215 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
1216 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1217 
1218 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1219 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
1220 		if (errno == ENXIO) {
1221 			return (PI_DELETED);
1222 		} else {
1223 			logperror_pii(pii, "phyint_inst_update_from_k:"
1224 			    " ioctl (get lifindex)");
1225 			return (PI_IOCTL_ERROR);
1226 		}
1227 	}
1228 
1229 	if (lifr.lifr_index != pi->pi_ifindex) {
1230 		/*
1231 		 * The index has changed. Most likely the interface has
1232 		 * been unplumbed and replumbed. Ask the caller to take
1233 		 * appropriate action.
1234 		 */
1235 		if (debug & D_PHYINT) {
1236 			logdebug("phyint_inst_update_from_k:"
1237 			    " old index %d new index %d\n",
1238 			    pi->pi_ifindex, lifr.lifr_index);
1239 		}
1240 		return (PI_IFINDEX_CHANGED);
1241 	}
1242 
1243 	/*
1244 	 * Get the group name from the kernel, for comparison with
1245 	 * the value in our tables.
1246 	 */
1247 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
1248 		if (errno == ENXIO) {
1249 			return (PI_DELETED);
1250 		} else {
1251 			logperror_pii(pii, "phyint_inst_update_from_k:"
1252 			    " ioctl (get groupname)");
1253 			return (PI_IOCTL_ERROR);
1254 		}
1255 	}
1256 
1257 	/*
1258 	 * If the phyint has changed group i.e. if the phyint group name
1259 	 * returned by the kernel is different, ask the caller to delete
1260 	 * and recreate the phyint in the right group
1261 	 */
1262 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
1263 		/* Groupname has changed */
1264 		if (debug & D_PHYINT) {
1265 			logdebug("phyint_inst_update_from_k:"
1266 			    " groupname change\n");
1267 		}
1268 		return (PI_GROUP_CHANGED);
1269 	}
1270 
1271 	/*
1272 	 * Get the current phyint flags from the kernel, and determine what
1273 	 * flags have changed by comparing against our tables.	Note that the
1274 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
1275 	 * that IFF_INACTIVE is really still set on the interface.
1276 	 */
1277 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
1278 		if (errno == ENXIO) {
1279 			return (PI_DELETED);
1280 		} else {
1281 			logperror_pii(pii, "phyint_inst_update_from_k: "
1282 			    " ioctl (get flags)");
1283 			return (PI_IOCTL_ERROR);
1284 		}
1285 	}
1286 
1287 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
1288 	if (pi->pi_v4 != NULL)
1289 		pi->pi_v4->pii_flags = pi->pi_flags;
1290 	if (pi->pi_v6 != NULL)
1291 		pi->pi_v6->pii_flags = pi->pi_flags;
1292 
1293 	/*
1294 	 * Make sure the IFF_FAILED flag is set if and only if we think
1295 	 * the interface should be failed.
1296 	 */
1297 	if (pi->pi_flags & IFF_FAILED) {
1298 		if (pi->pi_state == PI_RUNNING)
1299 			(void) change_pif_flags(pi, 0, IFF_FAILED);
1300 	} else {
1301 		if (pi->pi_state == PI_FAILED)
1302 			(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
1303 	}
1304 
1305 	/* No change in phyint status */
1306 	return (PI_OK);
1307 }
1308 
1309 /*
1310  * Delete the phyint. Remove it from the list of all phyints, and the
1311  * list of phyint group members.
1312  */
1313 static void
1314 phyint_delete(struct phyint *pi)
1315 {
1316 	struct phyint *pi2;
1317 	struct phyint_group *pg = pi->pi_group;
1318 
1319 	if (debug & D_PHYINT)
1320 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1321 
1322 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1323 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1324 
1325 	/*
1326 	 * The phyint must belong to a group.
1327 	 */
1328 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1329 
1330 	/* The phyint must be in the list of all phyints */
1331 	assert(phyints == pi || pi->pi_prev != NULL);
1332 
1333 	/* Remove the phyint from the phyint group list */
1334 	pg->pg_sig++;
1335 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1336 
1337 	if (pi->pi_pgprev == NULL) {
1338 		/* Phyint is the 1st in the phyint group list */
1339 		pg->pg_phyint = pi->pi_pgnext;
1340 	} else {
1341 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1342 	}
1343 	if (pi->pi_pgnext != NULL)
1344 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1345 	pi->pi_pgnext = NULL;
1346 	pi->pi_pgprev = NULL;
1347 
1348 	/* Refresh the group state now that this phyint has been removed */
1349 	phyint_group_refresh_state(pg);
1350 
1351 	/* Remove the phyint from the global list of phyints */
1352 	if (pi->pi_prev == NULL) {
1353 		/* Phyint is the 1st in the list */
1354 		phyints = pi->pi_next;
1355 	} else {
1356 		pi->pi_prev->pi_next = pi->pi_next;
1357 	}
1358 	if (pi->pi_next != NULL)
1359 		pi->pi_next->pi_prev = pi->pi_prev;
1360 	pi->pi_next = NULL;
1361 	pi->pi_prev = NULL;
1362 
1363 	/*
1364 	 * See if another phyint in the group had been offlined because
1365 	 * it was a dup of `pi' -- and if so, online it.
1366 	 */
1367 	if (!pi->pi_hwaddrdup &&
1368 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1369 		assert(pi2->pi_hwaddrdup);
1370 		(void) phyint_undo_offline(pi2);
1371 	}
1372 	phyint_link_close(pi);
1373 	free(pi);
1374 }
1375 
1376 /*
1377  * Offline phyint `pi' if at least `minred' usable interfaces remain in the
1378  * group.  Returns an IPMP error code.
1379  */
1380 int
1381 phyint_offline(struct phyint *pi, uint_t minred)
1382 {
1383 	boolean_t was_active;
1384 	unsigned int nusable = 0;
1385 	struct phyint *pi2;
1386 	struct phyint_group *pg = pi->pi_group;
1387 
1388 	/*
1389 	 * Verify that enough usable interfaces in the group would remain.
1390 	 * As a special case, if the group has failed, allow any non-offline
1391 	 * phyints to be offlined.
1392 	 */
1393 	if (pg != phyint_anongroup) {
1394 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1395 			if (pi2 == pi)
1396 				continue;
1397 			if (phyint_is_usable(pi2) ||
1398 			    (GROUP_FAILED(pg) && pi2->pi_state != PI_OFFLINE))
1399 				nusable++;
1400 		}
1401 	}
1402 	if (nusable < minred)
1403 		return (IPMP_EMINRED);
1404 
1405 	was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
1406 
1407 	if (!change_pif_flags(pi, IFF_OFFLINE, IFF_INACTIVE))
1408 		return (IPMP_FAILURE);
1409 
1410 	/*
1411 	 * The interface is now offline, so stop probing it.  Note that
1412 	 * if_mpadm(1M) will down the test addresses, after receiving a
1413 	 * success reply from us. The routing socket message will then make us
1414 	 * close the socket used for sending probes. But it is more logical
1415 	 * that an offlined interface must not be probed, even if it has test
1416 	 * addresses.
1417 	 *
1418 	 * NOTE: stop_probing() also sets PI_OFFLINE.
1419 	 */
1420 	stop_probing(pi);
1421 
1422 	/*
1423 	 * If we're offlining the phyint because it has a duplicate hardware
1424 	 * address, print a warning -- and leave the link open so that we can
1425 	 * be notified of hardware address changes that make it usable again.
1426 	 * Otherwise, close the link so that we won't prevent a detach.
1427 	 */
1428 	if (pi->pi_hwaddrdup) {
1429 		logerr("IP interface %s has a hardware address which is not "
1430 		    "unique in group %s; offlining\n", pi->pi_name,
1431 		    pg->pg_name);
1432 	} else {
1433 		phyint_link_close(pi);
1434 	}
1435 
1436 	/*
1437 	 * If this phyint was preventing another phyint with a duplicate
1438 	 * hardware address from being online, bring that one online now.
1439 	 */
1440 	if (!pi->pi_hwaddrdup &&
1441 	    (pi2 = phyint_lookup_hwaddr(pi, _B_FALSE)) != NULL) {
1442 		assert(pi2->pi_hwaddrdup);
1443 		(void) phyint_undo_offline(pi2);
1444 	}
1445 
1446 	/*
1447 	 * If this interface was active, try to activate another INACTIVE
1448 	 * interface in the group.
1449 	 */
1450 	if (was_active)
1451 		phyint_activate_another(pi);
1452 
1453 	return (IPMP_SUCCESS);
1454 }
1455 
1456 /*
1457  * Undo a previous offline of `pi'.  Returns an IPMP error code.
1458  */
1459 int
1460 phyint_undo_offline(struct phyint *pi)
1461 {
1462 	if (pi->pi_state != PI_OFFLINE) {
1463 		errno = EINVAL;
1464 		return (IPMP_FAILURE);
1465 	}
1466 
1467 	/*
1468 	 * If necessary, reinitialize our link information and verify that its
1469 	 * hardware address is still unique across the group.
1470 	 */
1471 	if (pi->pi_dh == NULL && !phyint_link_init(pi)) {
1472 		errno = EIO;
1473 		return (IPMP_FAILURE);
1474 	}
1475 
1476 	if (phyint_lookup_hwaddr(pi, _B_TRUE) != NULL) {
1477 		pi->pi_hwaddrdup = _B_TRUE;
1478 		return (IPMP_EHWADDRDUP);
1479 	}
1480 
1481 	if (pi->pi_hwaddrdup) {
1482 		logerr("IP interface %s now has a unique hardware address in "
1483 		    "group %s; onlining\n", pi->pi_name, pi->pi_group->pg_name);
1484 		pi->pi_hwaddrdup = _B_FALSE;
1485 	}
1486 
1487 	if (!change_pif_flags(pi, 0, IFF_OFFLINE))
1488 		return (IPMP_FAILURE);
1489 
1490 	/*
1491 	 * While the interface was offline, it may have failed (e.g. the link
1492 	 * may have gone down).  phyint_inst_check_for_failure() will have
1493 	 * already set pi_flags with IFF_FAILED, so we can use that to decide
1494 	 * whether the phyint should transition to running.  Note that after
1495 	 * we transition to running, we will start sending probes again (if
1496 	 * test addresses are configured), which may also reveal that the
1497 	 * interface is in fact failed.
1498 	 */
1499 	if (pi->pi_flags & IFF_FAILED) {
1500 		phyint_chstate(pi, PI_FAILED);
1501 	} else {
1502 		/* calls phyint_chstate() */
1503 		phyint_transition_to_running(pi);
1504 	}
1505 
1506 	/*
1507 	 * Give the requestor time to configure test addresses before
1508 	 * complaining that they're missing.
1509 	 */
1510 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
1511 
1512 	return (IPMP_SUCCESS);
1513 }
1514 
1515 /*
1516  * Delete (unlink and free), the phyint instance.
1517  */
1518 void
1519 phyint_inst_delete(struct phyint_instance *pii)
1520 {
1521 	struct phyint *pi = pii->pii_phyint;
1522 
1523 	assert(pi != NULL);
1524 
1525 	if (debug & D_PHYINT) {
1526 		logdebug("phyint_inst_delete(%s %s)\n",
1527 		    AF_STR(pii->pii_af), pi->pi_name);
1528 	}
1529 
1530 	/*
1531 	 * If the phyint instance has associated probe targets
1532 	 * delete all the targets
1533 	 */
1534 	while (pii->pii_targets != NULL)
1535 		target_delete(pii->pii_targets);
1536 
1537 	/*
1538 	 * Delete all the logints associated with this phyint
1539 	 * instance.
1540 	 */
1541 	while (pii->pii_logint != NULL)
1542 		logint_delete(pii->pii_logint);
1543 
1544 	/*
1545 	 * Close the socket used to send probes to targets from this phyint.
1546 	 */
1547 	if (pii->pii_probe_sock != -1)
1548 		close_probe_socket(pii, _B_TRUE);
1549 
1550 	/*
1551 	 * Phyint instance must be in the list of all phyint instances.
1552 	 * Remove phyint instance from the global list of phyint instances.
1553 	 */
1554 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1555 	if (pii->pii_prev == NULL) {
1556 		/* Phyint is the 1st in the list */
1557 		phyint_instances = pii->pii_next;
1558 	} else {
1559 		pii->pii_prev->pii_next = pii->pii_next;
1560 	}
1561 	if (pii->pii_next != NULL)
1562 		pii->pii_next->pii_prev = pii->pii_prev;
1563 	pii->pii_next = NULL;
1564 	pii->pii_prev = NULL;
1565 
1566 	/*
1567 	 * Reset the phyint instance pointer in the phyint.
1568 	 * If this is the last phyint instance (being deleted) on this
1569 	 * phyint, then delete the phyint.
1570 	 */
1571 	if (pii->pii_af == AF_INET)
1572 		pi->pi_v4 = NULL;
1573 	else
1574 		pi->pi_v6 = NULL;
1575 
1576 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1577 		phyint_delete(pi);
1578 
1579 	free(pii);
1580 }
1581 
1582 static void
1583 phyint_inst_print(struct phyint_instance *pii)
1584 {
1585 	struct logint *li;
1586 	struct target *tg;
1587 	char abuf[INET6_ADDRSTRLEN];
1588 	int most_recent;
1589 	int i;
1590 
1591 	if (pii->pii_phyint == NULL) {
1592 		logdebug("pii->pi_phyint NULL can't print\n");
1593 		return;
1594 	}
1595 
1596 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1597 	    "sock %x in_use %d\n",
1598 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1599 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1600 	    pii->pii_in_use);
1601 
1602 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1603 		logint_print(li);
1604 
1605 	logdebug("\n");
1606 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1607 		target_print(tg);
1608 
1609 	if (pii->pii_targets == NULL)
1610 		logdebug("pi_targets NULL\n");
1611 
1612 	if (pii->pii_target_next != NULL) {
1613 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1614 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1615 		    abuf, sizeof (abuf)));
1616 	} else {
1617 		logdebug("pi_target_next NULL\n");
1618 	}
1619 
1620 	if (pii->pii_rtt_target_next != NULL) {
1621 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1622 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1623 		    abuf, sizeof (abuf)));
1624 	} else {
1625 		logdebug("pi_rtt_target_next NULL\n");
1626 	}
1627 
1628 	if (pii->pii_targets != NULL) {
1629 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1630 
1631 		i = most_recent;
1632 		do {
1633 			if (pii->pii_probes[i].pr_target != NULL) {
1634 				logdebug("#%d target %s ", i,
1635 				    pr_addr(pii->pii_af,
1636 				    pii->pii_probes[i].pr_target->tg_address,
1637 				    abuf, sizeof (abuf)));
1638 			} else {
1639 				logdebug("#%d target NULL ", i);
1640 			}
1641 			logdebug("time_start %lld status %d "
1642 			    "time_ackproc %lld time_lost %u",
1643 			    pii->pii_probes[i].pr_hrtime_start,
1644 			    pii->pii_probes[i].pr_status,
1645 			    pii->pii_probes[i].pr_hrtime_ackproc,
1646 			    pii->pii_probes[i].pr_time_lost);
1647 			i = PROBE_INDEX_PREV(i);
1648 		} while (i != most_recent);
1649 	}
1650 }
1651 
1652 /*
1653  * Lookup a logint based on the logical interface name, on the given
1654  * phyint instance.
1655  */
1656 static struct logint *
1657 logint_lookup(struct phyint_instance *pii, char *name)
1658 {
1659 	struct logint *li;
1660 
1661 	if (debug & D_LOGINT) {
1662 		logdebug("logint_lookup(%s, %s)\n",
1663 		    AF_STR(pii->pii_af), name);
1664 	}
1665 
1666 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1667 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1668 			break;
1669 	}
1670 	return (li);
1671 }
1672 
1673 /*
1674  * Insert a logint at the head of the list of logints of the given
1675  * phyint instance
1676  */
1677 static void
1678 logint_insert(struct phyint_instance *pii, struct logint *li)
1679 {
1680 	li->li_next = pii->pii_logint;
1681 	li->li_prev = NULL;
1682 	if (pii->pii_logint != NULL)
1683 		pii->pii_logint->li_prev = li;
1684 	pii->pii_logint = li;
1685 	li->li_phyint_inst = pii;
1686 }
1687 
1688 /*
1689  * Create a new named logint, on the specified phyint instance.
1690  */
1691 static struct logint *
1692 logint_create(struct phyint_instance *pii, char *name)
1693 {
1694 	struct logint *li;
1695 
1696 	if (debug & D_LOGINT) {
1697 		logdebug("logint_create(%s %s %s)\n",
1698 		    AF_STR(pii->pii_af), pii->pii_name, name);
1699 	}
1700 
1701 	li = calloc(1, sizeof (struct logint));
1702 	if (li == NULL) {
1703 		logperror("logint_create: calloc");
1704 		return (NULL);
1705 	}
1706 
1707 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1708 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1709 	logint_insert(pii, li);
1710 	return (li);
1711 }
1712 
1713 /*
1714  * Initialize the logint based on the data returned by the kernel.
1715  */
1716 void
1717 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1718 {
1719 	int	ifsock;
1720 	uint64_t flags;
1721 	uint64_t saved_flags;
1722 	struct	logint	*li;
1723 	struct lifreq	lifr;
1724 	struct in6_addr	test_subnet;
1725 	struct in6_addr	testaddr;
1726 	int	test_subnet_len;
1727 	struct sockaddr_in6	*sin6;
1728 	struct sockaddr_in	*sin;
1729 	char abuf[INET6_ADDRSTRLEN];
1730 	boolean_t  ptp = _B_FALSE;
1731 	struct in6_addr tgaddr;
1732 
1733 	if (debug & D_LOGINT) {
1734 		logdebug("logint_init_from_k(%s %s)\n",
1735 		    AF_STR(pii->pii_af), li_name);
1736 	}
1737 
1738 	/* Get the socket for doing ioctls */
1739 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1740 
1741 	/*
1742 	 * Get the flags from the kernel. Also serves as a check whether
1743 	 * the logical still exists. If it doesn't exist, no need to proceed
1744 	 * any further. li_in_use will make the caller clean up the logint
1745 	 */
1746 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1747 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1748 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1749 		/* Interface may have vanished */
1750 		if (errno != ENXIO) {
1751 			logperror_pii(pii, "logint_init_from_k: "
1752 			    "ioctl (get flags)");
1753 		}
1754 		return;
1755 	}
1756 
1757 	flags = lifr.lifr_flags;
1758 
1759 	/*
1760 	 * Verified the logint exists. Now lookup the logint in our tables.
1761 	 * If it does not exist, create a new logint.
1762 	 */
1763 	li = logint_lookup(pii, li_name);
1764 	if (li == NULL) {
1765 		li = logint_create(pii, li_name);
1766 		if (li == NULL) {
1767 			/*
1768 			 * Pretend the interface does not exist
1769 			 * in the kernel
1770 			 */
1771 			return;
1772 		}
1773 	}
1774 
1775 	/*
1776 	 * Update li->li_flags with the new flags, after saving the old
1777 	 * value. This is used later to check what flags has changed and
1778 	 * take any action
1779 	 */
1780 	saved_flags = li->li_flags;
1781 	li->li_flags = flags;
1782 
1783 	/*
1784 	 * Get the address, prefix, prefixlength and update the logint.
1785 	 * Check if anything has changed. If the logint used for the
1786 	 * test address has changed, take suitable action.
1787 	 */
1788 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1789 		/* Interface may have vanished */
1790 		if (errno != ENXIO) {
1791 			logperror_li(li, "logint_init_from_k: (get addr)");
1792 		}
1793 		goto error;
1794 	}
1795 
1796 	if (pii->pii_af == AF_INET) {
1797 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1798 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1799 	} else {
1800 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1801 		testaddr = sin6->sin6_addr;
1802 	}
1803 
1804 	if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1805 		/* Interface may have vanished */
1806 		if (errno != ENXIO)
1807 			logperror_li(li, "logint_init_from_k: (get subnet)");
1808 		goto error;
1809 	}
1810 	if (lifr.lifr_subnet.ss_family == AF_INET6) {
1811 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1812 		test_subnet = sin6->sin6_addr;
1813 		test_subnet_len = lifr.lifr_addrlen;
1814 	} else {
1815 		sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1816 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1817 		test_subnet_len = lifr.lifr_addrlen + (IPV6_ABITS - IP_ABITS);
1818 	}
1819 
1820 	/*
1821 	 * If this is the logint corresponding to the test address used for
1822 	 * sending probes, then if anything significant has changed we need to
1823 	 * determine the test address again.  We ignore changes to the
1824 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1825 	 * course.
1826 	 */
1827 	if (pii->pii_probe_logint == li) {
1828 		if (((li->li_flags ^ saved_flags) &
1829 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1830 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1831 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1832 		    &li->li_subnet)) ||
1833 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1834 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1835 			/*
1836 			 * Something significant that affects the testaddress
1837 			 * has changed. Redo the testaddress selection later on
1838 			 * in select_test_ifs(). For now do the cleanup and
1839 			 * set pii_probe_logint to NULL.
1840 			 */
1841 			if (pii->pii_probe_sock != -1)
1842 				close_probe_socket(pii, _B_TRUE);
1843 			pii->pii_probe_logint = NULL;
1844 		}
1845 	}
1846 
1847 
1848 	/* Update the logint with the values obtained from the kernel.	*/
1849 	li->li_addr = testaddr;
1850 	li->li_in_use = 1;
1851 	if (ptp) {
1852 		li->li_dstaddr = tgaddr;
1853 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1854 		    IP_ABITS : IPV6_ABITS;
1855 	} else {
1856 		li->li_subnet = test_subnet;
1857 		li->li_subnet_len = test_subnet_len;
1858 	}
1859 
1860 	if (debug & D_LOGINT)
1861 		logint_print(li);
1862 
1863 	return;
1864 
1865 error:
1866 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1867 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1868 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1869 	logint_delete(li);
1870 }
1871 
1872 /*
1873  * Delete (unlink and free) a logint.
1874  */
1875 void
1876 logint_delete(struct logint *li)
1877 {
1878 	struct phyint_instance *pii;
1879 
1880 	pii = li->li_phyint_inst;
1881 	assert(pii != NULL);
1882 
1883 	if (debug & D_LOGINT) {
1884 		int af;
1885 		char abuf[INET6_ADDRSTRLEN];
1886 
1887 		af = pii->pii_af;
1888 		logdebug("logint_delete(%s %s %s/%u)\n",
1889 		    AF_STR(af), li->li_name,
1890 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1891 		    li->li_subnet_len);
1892 	}
1893 
1894 	/* logint must be in the list of logints */
1895 	assert(pii->pii_logint == li || li->li_prev != NULL);
1896 
1897 	/* Remove the logint from the list of logints  */
1898 	if (li->li_prev == NULL) {
1899 		/* logint is the 1st in the list */
1900 		pii->pii_logint = li->li_next;
1901 	} else {
1902 		li->li_prev->li_next = li->li_next;
1903 	}
1904 	if (li->li_next != NULL)
1905 		li->li_next->li_prev = li->li_prev;
1906 	li->li_next = NULL;
1907 	li->li_prev = NULL;
1908 
1909 	/*
1910 	 * If this logint is also being used for probing, then close the
1911 	 * associated socket, if it exists.
1912 	 */
1913 	if (pii->pii_probe_logint == li) {
1914 		if (pii->pii_probe_sock != -1)
1915 			close_probe_socket(pii, _B_TRUE);
1916 		pii->pii_probe_logint = NULL;
1917 	}
1918 
1919 	free(li);
1920 }
1921 
1922 static void
1923 logint_print(struct logint *li)
1924 {
1925 	char abuf[INET6_ADDRSTRLEN];
1926 	int af = li->li_phyint_inst->pii_af;
1927 
1928 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1929 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1930 
1931 	logdebug("\tFlags: %llx in_use %d\n", li->li_flags, li->li_in_use);
1932 }
1933 
1934 char *
1935 pr_addr(int af, struct in6_addr addr, char *abuf, int len)
1936 {
1937 	struct in_addr	addr_v4;
1938 
1939 	if (af == AF_INET) {
1940 		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
1941 		(void) inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
1942 	} else {
1943 		(void) inet_ntop(AF_INET6, (void *)&addr, abuf, len);
1944 	}
1945 	return (abuf);
1946 }
1947 
/*
 * Convert the IP address given by the [`af',`addr'] pair into socket
 * address form in the sockaddr_storage that `ssp' points to.  in.mpathd
 * keeps every address internally as an in6_addr (IPv4 ones v4-mapped), and
 * this is how that representation is kept from leaking out.
 *
 * `af' must be AF_INET or AF_INET6.  Only the leading sockaddr_in or
 * sockaddr_in6 part of `*ssp' is cleared and filled in.
 */
void
addr2storage(int af, const struct in6_addr *addr, struct sockaddr_storage *ssp)
{
	assert(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		struct sockaddr_in *v4 = (struct sockaddr_in *)ssp;

		(void) memset(v4, 0, sizeof (*v4));
		v4->sin_family = AF_INET;
		IN6_V4MAPPED_TO_INADDR(addr, &v4->sin_addr);
	} else if (af == AF_INET6) {
		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ssp;

		(void) memset(v6, 0, sizeof (*v6));
		v6->sin6_family = AF_INET6;
		v6->sin6_addr = *addr;
	}
}
1974 
1975 /* Lookup target on its address */
1976 struct target *
1977 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1978 {
1979 	struct target *tg;
1980 
1981 	if (debug & D_TARGET) {
1982 		char abuf[INET6_ADDRSTRLEN];
1983 
1984 		logdebug("target_lookup(%s %s): addr %s\n",
1985 		    AF_STR(pii->pii_af), pii->pii_name,
1986 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1987 	}
1988 
1989 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1990 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1991 			break;
1992 	}
1993 	return (tg);
1994 }
1995 
1996 /*
1997  * Find and return the next active target, for the next probe.
1998  * If no active targets are available, return NULL.
1999  */
2000 struct target *
2001 target_next(struct target *tg)
2002 {
2003 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
2004 	struct	target	*marker = tg;
2005 	hrtime_t now;
2006 
2007 	now = gethrtime();
2008 
2009 	/*
2010 	 * Target must be in the list of targets for this phyint
2011 	 * instance.
2012 	 */
2013 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
2014 	assert(pii->pii_targets != NULL);
2015 
2016 	/* Return the next active target */
2017 	do {
2018 		/*
2019 		 * Go to the next target. If we hit the end,
2020 		 * reset the ptr to the head
2021 		 */
2022 		tg = tg->tg_next;
2023 		if (tg == NULL)
2024 			tg = pii->pii_targets;
2025 
2026 		assert(TG_STATUS_VALID(tg->tg_status));
2027 
2028 		switch (tg->tg_status) {
2029 		case TG_ACTIVE:
2030 			return (tg);
2031 
2032 		case TG_UNUSED:
2033 			assert(pii->pii_targets_are_routers);
2034 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
2035 				/*
2036 				 * Bubble up the unused target to active
2037 				 */
2038 				tg->tg_status = TG_ACTIVE;
2039 				pii->pii_ntargets++;
2040 				return (tg);
2041 			}
2042 			break;
2043 
2044 		case TG_SLOW:
2045 			assert(pii->pii_targets_are_routers);
2046 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2047 				/*
2048 				 * Bubble up the slow target to unused
2049 				 */
2050 				tg->tg_status = TG_UNUSED;
2051 			}
2052 			break;
2053 
2054 		case TG_DEAD:
2055 			assert(pii->pii_targets_are_routers);
2056 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2057 				/*
2058 				 * Bubble up the dead target to slow
2059 				 */
2060 				tg->tg_status = TG_SLOW;
2061 				tg->tg_latime = now;
2062 			}
2063 			break;
2064 		}
2065 
2066 	} while (tg != marker);
2067 
2068 	return (NULL);
2069 }
2070 
2071 /*
2072  * Select the best available target, that is not already TG_ACTIVE,
2073  * for the caller. The caller will determine whether it wants to
2074  * make the returned target TG_ACTIVE.
2075  * The selection order is as follows.
2076  * 1. pick a TG_UNSED target, if it exists.
2077  * 2. else pick a TG_SLOW target that has recovered, if it exists
2078  * 3. else pick any TG_SLOW target, if it exists
2079  * 4. else pick a TG_DEAD target that has recovered, if it exists
2080  * 5. else pick any TG_DEAD target, if it exists
2081  * 6. else return null
2082  */
2083 static struct target *
2084 target_select_best(struct phyint_instance *pii)
2085 {
2086 	struct target *tg;
2087 	struct target *slow = NULL;
2088 	struct target *dead = NULL;
2089 	struct target *slow_recovered = NULL;
2090 	struct target *dead_recovered = NULL;
2091 	hrtime_t now;
2092 
2093 	now = gethrtime();
2094 
2095 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2096 		assert(TG_STATUS_VALID(tg->tg_status));
2097 
2098 		switch (tg->tg_status) {
2099 		case TG_UNUSED:
2100 			return (tg);
2101 
2102 		case TG_SLOW:
2103 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2104 				slow_recovered = tg;
2105 				/*
2106 				 * Promote the slow_recovered to unused
2107 				 */
2108 				tg->tg_status = TG_UNUSED;
2109 			} else {
2110 				slow = tg;
2111 			}
2112 			break;
2113 
2114 		case TG_DEAD:
2115 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
2116 				dead_recovered = tg;
2117 				/*
2118 				 * Promote the dead_recovered to slow
2119 				 */
2120 				tg->tg_status = TG_SLOW;
2121 				tg->tg_latime = now;
2122 			} else {
2123 				dead = tg;
2124 			}
2125 			break;
2126 
2127 		default:
2128 			break;
2129 		}
2130 	}
2131 
2132 	if (slow_recovered != NULL)
2133 		return (slow_recovered);
2134 	else if (slow != NULL)
2135 		return (slow);
2136 	else if (dead_recovered != NULL)
2137 		return (dead_recovered);
2138 	else
2139 		return (dead);
2140 }
2141 
2142 /*
2143  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
2144  * that are active, pick the next best below.
2145  */
2146 static void
2147 target_activate_all(struct phyint_instance *pii)
2148 {
2149 	struct target *tg;
2150 
2151 	assert(pii->pii_ntargets == 0);
2152 	assert(pii->pii_target_next == NULL);
2153 	assert(pii->pii_rtt_target_next == NULL);
2154 	assert(pii->pii_targets_are_routers);
2155 
2156 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
2157 		tg = target_select_best(pii);
2158 		if (tg == NULL) {
2159 			/* We are out of targets */
2160 			return;
2161 		}
2162 
2163 		assert(TG_STATUS_VALID(tg->tg_status));
2164 		assert(tg->tg_status != TG_ACTIVE);
2165 		tg->tg_status = TG_ACTIVE;
2166 		pii->pii_ntargets++;
2167 		if (pii->pii_target_next == NULL) {
2168 			pii->pii_target_next = tg;
2169 			pii->pii_rtt_target_next = tg;
2170 		}
2171 	}
2172 }
2173 
2174 static struct target *
2175 target_first(struct phyint_instance *pii)
2176 {
2177 	struct target *tg;
2178 
2179 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
2180 		assert(TG_STATUS_VALID(tg->tg_status));
2181 		if (tg->tg_status == TG_ACTIVE)
2182 			break;
2183 	}
2184 
2185 	return (tg);
2186 }
2187 
2188 /*
2189  * Create a default target entry.
2190  */
2191 void
2192 target_create(struct phyint_instance *pii, struct in6_addr addr,
2193     boolean_t is_router)
2194 {
2195 	struct target *tg;
2196 	struct phyint *pi;
2197 	struct logint *li;
2198 
2199 	if (debug & D_TARGET) {
2200 		char abuf[INET6_ADDRSTRLEN];
2201 
2202 		logdebug("target_create(%s %s, %s)\n",
2203 		    AF_STR(pii->pii_af), pii->pii_name,
2204 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
2205 	}
2206 
2207 	/*
2208 	 * If the test address is not yet initialized, do not add
2209 	 * any target, since we cannot determine whether the target
2210 	 * belongs to the same subnet as the test address.
2211 	 */
2212 	li = pii->pii_probe_logint;
2213 	if (li == NULL)
2214 		return;
2215 
2216 	/*
2217 	 * If there are multiple subnets associated with an interface, then
2218 	 * add the target to this phyint instance only if it belongs to the
2219 	 * same subnet as the test address.  This assures us that we will
2220 	 * be able to reach this target through our routing table.
2221 	 */
2222 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
2223 		return;
2224 
2225 	if (pii->pii_targets != NULL) {
2226 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
2227 		if (is_router) {
2228 			if (!pii->pii_targets_are_routers) {
2229 				/*
2230 				 * Prefer router over hosts. Using hosts is a
2231 				 * fallback mechanism, hence delete all host
2232 				 * targets.
2233 				 */
2234 				while (pii->pii_targets != NULL)
2235 					target_delete(pii->pii_targets);
2236 			}
2237 		} else {
2238 			/*
2239 			 * Routers take precedence over hosts. If this
2240 			 * is a router list and we are trying to add a
2241 			 * host, just return. If this is a host list
2242 			 * and if we have sufficient targets, just return
2243 			 */
2244 			if (pii->pii_targets_are_routers ||
2245 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
2246 				return;
2247 		}
2248 	}
2249 
2250 	tg = calloc(1, sizeof (struct target));
2251 	if (tg == NULL) {
2252 		logperror("target_create: calloc");
2253 		return;
2254 	}
2255 
2256 	tg->tg_phyint_inst = pii;
2257 	tg->tg_address = addr;
2258 	tg->tg_in_use = 1;
2259 	tg->tg_rtt_sa = -1;
2260 	tg->tg_num_deferred = 0;
2261 
2262 	/*
2263 	 * If this is the first target, set 'pii_targets_are_routers'
2264 	 * The list of targets is either a list of hosts or list or
2265 	 * routers, but not a mix.
2266 	 */
2267 	if (pii->pii_targets == NULL) {
2268 		assert(pii->pii_ntargets == 0);
2269 		assert(pii->pii_target_next == NULL);
2270 		assert(pii->pii_rtt_target_next == NULL);
2271 		pii->pii_targets_are_routers = is_router ? 1 : 0;
2272 	}
2273 
2274 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
2275 		assert(pii->pii_targets_are_routers);
2276 		assert(pii->pii_target_next != NULL);
2277 		assert(pii->pii_rtt_target_next != NULL);
2278 		tg->tg_status = TG_UNUSED;
2279 	} else {
2280 		if (pii->pii_ntargets == 0) {
2281 			assert(pii->pii_target_next == NULL);
2282 			pii->pii_target_next = tg;
2283 			pii->pii_rtt_target_next = tg;
2284 		}
2285 		pii->pii_ntargets++;
2286 		tg->tg_status = TG_ACTIVE;
2287 	}
2288 
2289 	target_insert(pii, tg);
2290 
2291 	/*
2292 	 * Change state to PI_RUNNING if this phyint instance is capable of
2293 	 * sending and receiving probes -- that is, if we know of at least 1
2294 	 * target, and this phyint instance is probe-capable.  For more
2295 	 * details, see the phyint state diagram in mpd_probe.c.
2296 	 */
2297 	pi = pii->pii_phyint;
2298 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
2299 		if (pi->pi_flags & IFF_FAILED)
2300 			phyint_chstate(pi, PI_FAILED);
2301 		else
2302 			phyint_chstate(pi, PI_RUNNING);
2303 	}
2304 }
2305 
2306 /*
2307  * Add the target address named by `addr' to phyint instance `pii' if it does
2308  * not already exist.  If the target is a router, `is_router' should be set to
2309  * B_TRUE.
2310  */
2311 void
2312 target_add(struct phyint_instance *pii, struct in6_addr addr,
2313     boolean_t is_router)
2314 {
2315 	struct target *tg;
2316 
2317 	if (pii == NULL)
2318 		return;
2319 
2320 	tg = target_lookup(pii, addr);
2321 
2322 	/*
2323 	 * If the target does not exist, create it; target_create() will set
2324 	 * tg_in_use to true.  Even if it exists already, if it's a router
2325 	 * target and we'd previously learned of it through multicast, then we
2326 	 * need to recreate it as a router target.  Otherwise, just set
2327 	 * tg_in_use to to true so that init_router_targets() won't delete it.
2328 	 */
2329 	if (tg == NULL || (is_router && !pii->pii_targets_are_routers))
2330 		target_create(pii, addr, is_router);
2331 	else if (is_router)
2332 		tg->tg_in_use = 1;
2333 }
2334 
2335 /*
2336  * Insert target at head of linked list of targets for the associated
2337  * phyint instance
2338  */
2339 static void
2340 target_insert(struct phyint_instance *pii, struct target *tg)
2341 {
2342 	tg->tg_next = pii->pii_targets;
2343 	tg->tg_prev = NULL;
2344 	if (tg->tg_next != NULL)
2345 		tg->tg_next->tg_prev = tg;
2346 	pii->pii_targets = tg;
2347 }
2348 
2349 /*
2350  * Delete a target (unlink and free).
2351  */
2352 void
2353 target_delete(struct target *tg)
2354 {
2355 	int af;
2356 	struct phyint_instance	*pii;
2357 	struct phyint_instance	*pii_other;
2358 
2359 	pii = tg->tg_phyint_inst;
2360 	af = pii->pii_af;
2361 
2362 	if (debug & D_TARGET) {
2363 		char abuf[INET6_ADDRSTRLEN];
2364 
2365 		logdebug("target_delete(%s %s, %s)\n",
2366 		    AF_STR(af), pii->pii_name,
2367 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
2368 	}
2369 
2370 	/*
2371 	 * Target must be in the list of targets for this phyint
2372 	 * instance.
2373 	 */
2374 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
2375 
2376 	/*
2377 	 * Reset all references to 'tg' in the probe information
2378 	 * for this phyint.
2379 	 */
2380 	reset_pii_probes(pii, tg);
2381 
2382 	/*
2383 	 * Remove this target from the list of targets of this
2384 	 * phyint instance.
2385 	 */
2386 	if (tg->tg_prev == NULL) {
2387 		pii->pii_targets = tg->tg_next;
2388 	} else {
2389 		tg->tg_prev->tg_next = tg->tg_next;
2390 	}
2391 
2392 	if (tg->tg_next != NULL)
2393 		tg->tg_next->tg_prev = tg->tg_prev;
2394 
2395 	tg->tg_next = NULL;
2396 	tg->tg_prev = NULL;
2397 
2398 	if (tg->tg_status == TG_ACTIVE)
2399 		pii->pii_ntargets--;
2400 
2401 	/*
2402 	 * Adjust the next target to probe, if it points to
2403 	 * to the currently deleted target.
2404 	 */
2405 	if (pii->pii_target_next == tg)
2406 		pii->pii_target_next = target_first(pii);
2407 
2408 	if (pii->pii_rtt_target_next == tg)
2409 		pii->pii_rtt_target_next = target_first(pii);
2410 
2411 	free(tg);
2412 
2413 	/*
2414 	 * The number of active targets pii_ntargets == 0 iff
2415 	 * the next active target pii->pii_target_next == NULL
2416 	 */
2417 	if (pii->pii_ntargets != 0) {
2418 		assert(pii->pii_target_next != NULL);
2419 		assert(pii->pii_rtt_target_next != NULL);
2420 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2421 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2422 		return;
2423 	}
2424 
2425 	/* At this point, we don't have any active targets. */
2426 	assert(pii->pii_target_next == NULL);
2427 	assert(pii->pii_rtt_target_next == NULL);
2428 
2429 	if (pii->pii_targets_are_routers) {
2430 		/*
2431 		 * Activate any TG_SLOW or TG_DEAD router targets,
2432 		 * since we don't have any other targets
2433 		 */
2434 		target_activate_all(pii);
2435 
2436 		if (pii->pii_ntargets != 0) {
2437 			assert(pii->pii_target_next != NULL);
2438 			assert(pii->pii_rtt_target_next != NULL);
2439 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2440 			assert(pii->pii_rtt_target_next->tg_status ==
2441 			    TG_ACTIVE);
2442 			return;
2443 		}
2444 	}
2445 
2446 	/*
2447 	 * If we still don't have any active targets, the list must
2448 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2449 	 * targets. Zero out the probe stats since it will not be
2450 	 * relevant any longer.
2451 	 */
2452 	assert(pii->pii_targets == NULL);
2453 	pii->pii_targets_are_routers = _B_FALSE;
2454 	clear_pii_probe_stats(pii);
2455 	pii_other = phyint_inst_other(pii);
2456 
2457 	/*
2458 	 * If there are no targets on both instances and the interface would
2459 	 * otherwise be considered PI_RUNNING, go back to PI_NOTARGETS state,
2460 	 * since we cannot probe this phyint any more.  For more details,
2461 	 * please see phyint state diagram in mpd_probe.c.
2462 	 */
2463 	if (!PROBE_CAPABLE(pii_other) && LINK_UP(pii->pii_phyint) &&
2464 	    pii->pii_phyint->pi_state != PI_OFFLINE)
2465 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2466 }
2467 
2468 /*
2469  * Flush the target list of every phyint in the group, if the list
2470  * is a host target list. This is called if group failure is suspected.
2471  * If all targets have failed, multicast will subsequently discover new
2472  * targets. Else it is a group failure.
2473  * Note: This function is a no-op if the list is a router target list.
2474  */
2475 static void
2476 target_flush_hosts(struct phyint_group *pg)
2477 {
2478 	struct phyint *pi;
2479 	struct phyint_instance *pii;
2480 
2481 	if (debug & D_TARGET)
2482 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2483 
2484 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2485 		pii = pi->pi_v4;
2486 		if (pii != NULL && !pii->pii_targets_are_routers) {
2487 			/*
2488 			 * Delete all the targets. When the list becomes
2489 			 * empty, target_delete() will set pii->pii_targets
2490 			 * to NULL.
2491 			 */
2492 			while (pii->pii_targets != NULL)
2493 				target_delete(pii->pii_targets);
2494 		}
2495 		pii = pi->pi_v6;
2496 		if (pii != NULL && !pii->pii_targets_are_routers) {
2497 			/*
2498 			 * Delete all the targets. When the list becomes
2499 			 * empty, target_delete() will set pii->pii_targets
2500 			 * to NULL.
2501 			 */
2502 			while (pii->pii_targets != NULL)
2503 				target_delete(pii->pii_targets);
2504 		}
2505 	}
2506 }
2507 
2508 /*
2509  * Reset all references to 'target' in the probe info, as this target is
2510  * being deleted. The pr_target field is guaranteed to be non-null if
2511  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2512  * pr_target will not be accessed unconditionally.
2513  */
2514 static void
2515 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2516 {
2517 	int i;
2518 
2519 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2520 		if (pii->pii_probes[i].pr_target == tg) {
2521 			if (pii->pii_probes[i].pr_status == PR_UNACKED) {
2522 				probe_chstate(&pii->pii_probes[i], pii,
2523 				    PR_LOST);
2524 			}
2525 			pii->pii_probes[i].pr_target = NULL;
2526 		}
2527 	}
2528 
2529 }
2530 
2531 /*
2532  * Clear the probe statistics array.
2533  */
2534 void
2535 clear_pii_probe_stats(struct phyint_instance *pii)
2536 {
2537 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2538 	/* Reset the next probe index in the probe stats array */
2539 	pii->pii_probe_next = 0;
2540 }
2541 
2542 static void
2543 target_print(struct target *tg)
2544 {
2545 	char	abuf[INET6_ADDRSTRLEN];
2546 	char	buf[128];
2547 	char	buf2[128];
2548 	int	af;
2549 	int	i;
2550 
2551 	af = tg->tg_phyint_inst->pii_af;
2552 
2553 	logdebug("Target on %s %s addr %s\n"
2554 	    "status %d rtt_sa %lld rtt_sd %lld crtt %d tg_in_use %d\n",
2555 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2556 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2557 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2558 	    tg->tg_crtt, tg->tg_in_use);
2559 
2560 	buf[0] = '\0';
2561 	for (i = 0; i < tg->tg_num_deferred; i++) {
2562 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2563 		    tg->tg_deferred[i]);
2564 		(void) strlcat(buf, buf2, sizeof (buf));
2565 	}
2566 	logdebug("deferred rtts:%s\n", buf);
2567 }
2568 
2569 void
2570 phyint_inst_print_all(void)
2571 {
2572 	struct phyint_instance *pii;
2573 
2574 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2575 		phyint_inst_print(pii);
2576 	}
2577 }
2578 
2579 /*
2580  * Compare two prefixes that have the same prefix length.
2581  * Fails if the prefix length is unreasonable.
2582  */
2583 boolean_t
2584 prefix_equal(struct in6_addr p1, struct in6_addr p2, uint_t prefix_len)
2585 {
2586 	uchar_t mask;
2587 	int j;
2588 
2589 	if (prefix_len > IPV6_ABITS)
2590 		return (_B_FALSE);
2591 
2592 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2593 		if (p1.s6_addr[j] != p2.s6_addr[j])
2594 			return (_B_FALSE);
2595 
2596 	/* Make the N leftmost bits one */
2597 	mask = 0xff << (8 - prefix_len);
2598 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2599 		return (_B_FALSE);
2600 
2601 	return (_B_TRUE);
2602 }
2603 
2604 /*
2605  * Get the number of UP logints on phyint `pi'.
2606  */
2607 static int
2608 logint_upcount(struct phyint *pi)
2609 {
2610 	struct	logint	*li;
2611 	int count = 0;
2612 
2613 	if (pi->pi_v4 != NULL) {
2614 		for (li = pi->pi_v4->pii_logint; li != NULL; li = li->li_next) {
2615 			if (li->li_flags & IFF_UP)
2616 				count++;
2617 		}
2618 	}
2619 
2620 	if (pi->pi_v6 != NULL) {
2621 		for (li = pi->pi_v6->pii_logint; li != NULL; li = li->li_next) {
2622 			if (li->li_flags & IFF_UP)
2623 				count++;
2624 		}
2625 	}
2626 
2627 	return (count);
2628 }
2629 
2630 /*
2631  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2632  */
2633 struct phyint_instance *
2634 phyint_inst_other(struct phyint_instance *pii)
2635 {
2636 	if (pii->pii_af == AF_INET)
2637 		return (pii->pii_phyint->pi_v6);
2638 	else
2639 		return (pii->pii_phyint->pi_v4);
2640 }
2641 
2642 /*
2643  * Check whether a phyint is functioning.
2644  */
2645 static boolean_t
2646 phyint_is_functioning(struct phyint *pi)
2647 {
2648 	if (pi->pi_state == PI_RUNNING)
2649 		return (_B_TRUE);
2650 	return (pi->pi_state == PI_NOTARGETS && !(pi->pi_flags & IFF_FAILED));
2651 }
2652 
2653 /*
2654  * Check whether a phyint is usable.
2655  */
2656 static boolean_t
2657 phyint_is_usable(struct phyint *pi)
2658 {
2659 	if (logint_upcount(pi) == 0)
2660 		return (_B_FALSE);
2661 	return (phyint_is_functioning(pi));
2662 }
2663 
2664 /*
2665  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2666  * Before sending the event, it prepends the current version of the IPMP
2667  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2668  * `nvl' is freed).
2669  */
2670 static int
2671 post_event(const char *subclass, nvlist_t *nvl)
2672 {
2673 	static evchan_t *evchp = NULL;
2674 
2675 	/*
2676 	 * Initialize the event channel if we haven't already done so.
2677 	 */
2678 	if (evchp == NULL) {
2679 		errno = sysevent_evc_bind(IPMP_EVENT_CHAN, &evchp, EVCH_CREAT);
2680 		if (errno != 0) {
2681 			logerr("cannot create event channel `%s': %s\n",
2682 			    IPMP_EVENT_CHAN, strerror(errno));
2683 			goto failed;
2684 		}
2685 	}
2686 
2687 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2688 	    IPMP_EVENT_CUR_VERSION);
2689 	if (errno != 0) {
2690 		logerr("cannot create `%s' event: %s", subclass,
2691 		    strerror(errno));
2692 		goto failed;
2693 	}
2694 
2695 	errno = sysevent_evc_publish(evchp, EC_IPMP, subclass, "com.sun",
2696 	    "in.mpathd", nvl, EVCH_NOSLEEP);
2697 	if (errno != 0) {
2698 		logerr("cannot send `%s' event: %s\n", subclass,
2699 		    strerror(errno));
2700 		goto failed;
2701 	}
2702 
2703 	nvlist_free(nvl);
2704 	return (0);
2705 failed:
2706 	nvlist_free(nvl);
2707 	return (-1);
2708 }
2709 
2710 /*
2711  * Return the external IPMP state associated with phyint `pi'.
2712  */
2713 static ipmp_if_state_t
2714 ifstate(struct phyint *pi)
2715 {
2716 	switch (pi->pi_state) {
2717 	case PI_INIT:
2718 		return (IPMP_IF_UNKNOWN);
2719 
2720 	case PI_NOTARGETS:
2721 		if (pi->pi_flags & IFF_FAILED)
2722 			return (IPMP_IF_FAILED);
2723 		return (IPMP_IF_UNKNOWN);
2724 
2725 	case PI_OFFLINE:
2726 		return (IPMP_IF_OFFLINE);
2727 
2728 	case PI_FAILED:
2729 		return (IPMP_IF_FAILED);
2730 
2731 	case PI_RUNNING:
2732 		return (IPMP_IF_OK);
2733 	}
2734 
2735 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2736 	abort();
2737 	/* NOTREACHED */
2738 }
2739 
2740 /*
2741  * Return the external IPMP interface type associated with phyint `pi'.
2742  */
2743 static ipmp_if_type_t
2744 iftype(struct phyint *pi)
2745 {
2746 	if (pi->pi_flags & IFF_STANDBY)
2747 		return (IPMP_IF_STANDBY);
2748 	else
2749 		return (IPMP_IF_NORMAL);
2750 }
2751 
2752 /*
2753  * Return the external IPMP link state associated with phyint `pi'.
2754  */
2755 static ipmp_if_linkstate_t
2756 iflinkstate(struct phyint *pi)
2757 {
2758 	if (!(pi->pi_notes & (DL_NOTE_LINK_UP|DL_NOTE_LINK_DOWN)))
2759 		return (IPMP_LINK_UNKNOWN);
2760 
2761 	return (LINK_DOWN(pi) ? IPMP_LINK_DOWN : IPMP_LINK_UP);
2762 }
2763 
2764 /*
2765  * Return the external IPMP probe state associated with phyint `pi'.
2766  */
2767 static ipmp_if_probestate_t
2768 ifprobestate(struct phyint *pi)
2769 {
2770 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6))
2771 		return (IPMP_PROBE_DISABLED);
2772 
2773 	if (pi->pi_state == PI_FAILED)
2774 		return (IPMP_PROBE_FAILED);
2775 
2776 	if (!PROBE_CAPABLE(pi->pi_v4) && !PROBE_CAPABLE(pi->pi_v6))
2777 		return (IPMP_PROBE_UNKNOWN);
2778 
2779 	return (IPMP_PROBE_OK);
2780 }
2781 
2782 /*
2783  * Return the external IPMP target mode associated with phyint instance `pii'.
2784  */
2785 static ipmp_if_targmode_t
2786 iftargmode(struct phyint_instance *pii)
2787 {
2788 	if (!PROBE_ENABLED(pii))
2789 		return (IPMP_TARG_DISABLED);
2790 	else if (pii->pii_targets_are_routers)
2791 		return (IPMP_TARG_ROUTES);
2792 	else
2793 		return (IPMP_TARG_MULTICAST);
2794 }
2795 
2796 /*
2797  * Return the external IPMP flags associated with phyint `pi'.
2798  */
2799 static ipmp_if_flags_t
2800 ifflags(struct phyint *pi)
2801 {
2802 	ipmp_if_flags_t flags = 0;
2803 
2804 	if (logint_upcount(pi) == 0)
2805 		flags |= IPMP_IFFLAG_DOWN;
2806 	if (pi->pi_flags & IFF_INACTIVE)
2807 		flags |= IPMP_IFFLAG_INACTIVE;
2808 	if (pi->pi_hwaddrdup)
2809 		flags |= IPMP_IFFLAG_HWADDRDUP;
2810 	if (phyint_is_functioning(pi) && flags == 0)
2811 		flags |= IPMP_IFFLAG_ACTIVE;
2812 
2813 	return (flags);
2814 }
2815 
2816 /*
2817  * Store the test address used on phyint instance `pii' in `ssp'.  If there's
2818  * no test address, 0.0.0.0 is stored.
2819  */
2820 static struct sockaddr_storage *
2821 iftestaddr(struct phyint_instance *pii, struct sockaddr_storage *ssp)
2822 {
2823 	if (PROBE_ENABLED(pii))
2824 		addr2storage(pii->pii_af, &pii->pii_probe_logint->li_addr, ssp);
2825 	else
2826 		addr2storage(AF_INET6, &in6addr_any, ssp);
2827 
2828 	return (ssp);
2829 }
2830 
2831 /*
2832  * Return the external IPMP group state associated with phyint group `pg'.
2833  */
2834 static ipmp_group_state_t
2835 groupstate(struct phyint_group *pg)
2836 {
2837 	switch (pg->pg_state) {
2838 	case PG_FAILED:
2839 		return (IPMP_GROUP_FAILED);
2840 	case PG_DEGRADED:
2841 		return (IPMP_GROUP_DEGRADED);
2842 	case PG_OK:
2843 		return (IPMP_GROUP_OK);
2844 	}
2845 
2846 	logerr("groupstate: unknown state %d; aborting\n", pg->pg_state);
2847 	abort();
2848 	/* NOTREACHED */
2849 }
2850 
2851 /*
2852  * Return the external IPMP probe state associated with probe `ps'.
2853  */
2854 static ipmp_probe_state_t
2855 probestate(struct probe_stats *ps)
2856 {
2857 	switch (ps->pr_status) {
2858 	case PR_UNUSED:
2859 	case PR_LOST:
2860 		return (IPMP_PROBE_LOST);
2861 	case PR_UNACKED:
2862 		return (IPMP_PROBE_SENT);
2863 	case PR_ACKED:
2864 		return (IPMP_PROBE_ACKED);
2865 	}
2866 
2867 	logerr("probestate: unknown state %d; aborting\n", ps->pr_status);
2868 	abort();
2869 	/* NOTREACHED */
2870 }
2871 
2872 /*
2873  * Generate an ESC_IPMP_PROBE_STATE sysevent for the probe described by `pr'
2874  * on phyint instance `pii'.  Returns 0 on success, -1 on failure.
2875  */
2876 int
2877 probe_state_event(struct probe_stats *pr, struct phyint_instance *pii)
2878 {
2879 	nvlist_t *nvl;
2880 	hrtime_t proc_time = 0, recv_time = 0;
2881 	struct sockaddr_storage ss;
2882 	struct target *tg = pr->pr_target;
2883 	int64_t rttavg, rttdev;
2884 
2885 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2886 	if (errno != 0) {
2887 		logperror("cannot create `interface change' event");
2888 		return (-1);
2889 	}
2890 
2891 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_ID, pr->pr_id);
2892 	if (errno != 0)
2893 		goto failed;
2894 
2895 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pii->pii_phyint->pi_name);
2896 	if (errno != 0)
2897 		goto failed;
2898 
2899 	errno = nvlist_add_uint32(nvl, IPMP_PROBE_STATE, probestate(pr));
2900 	if (errno != 0)
2901 		goto failed;
2902 
2903 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_START_TIME,
2904 	    pr->pr_hrtime_start);
2905 	if (errno != 0)
2906 		goto failed;
2907 
2908 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_SENT_TIME,
2909 	    pr->pr_hrtime_sent);
2910 	if (errno != 0)
2911 		goto failed;
2912 
2913 	if (pr->pr_status == PR_ACKED) {
2914 		recv_time = pr->pr_hrtime_ackrecv;
2915 		proc_time = pr->pr_hrtime_ackproc;
2916 	}
2917 
2918 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKRECV_TIME, recv_time);
2919 	if (errno != 0)
2920 		goto failed;
2921 
2922 	errno = nvlist_add_hrtime(nvl, IPMP_PROBE_ACKPROC_TIME, proc_time);
2923 	if (errno != 0)
2924 		goto failed;
2925 
2926 	if (tg != NULL)
2927 		addr2storage(pii->pii_af, &tg->tg_address, &ss);
2928 	else
2929 		addr2storage(pii->pii_af, &in6addr_any, &ss);
2930 
2931 	errno = nvlist_add_byte_array(nvl, IPMP_PROBE_TARGET, (uchar_t *)&ss,
2932 	    sizeof (ss));
2933 	if (errno != 0)
2934 		goto failed;
2935 
2936 	rttavg = (tg != NULL) ? (tg->tg_rtt_sa / 8) : 0;
2937 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTAVG, rttavg);
2938 	if (errno != 0)
2939 		goto failed;
2940 
2941 	rttdev = (tg != NULL) ? (tg->tg_rtt_sd / 4) : 0;
2942 	errno = nvlist_add_int64(nvl, IPMP_PROBE_TARGET_RTTDEV, rttdev);
2943 	if (errno != 0)
2944 		goto failed;
2945 
2946 	return (post_event(ESC_IPMP_PROBE_STATE, nvl));
2947 failed:
2948 	logperror("cannot create `probe state' event");
2949 	nvlist_free(nvl);
2950 	return (-1);
2951 }
2952 
2953 /*
2954  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2955  * Returns 0 on success, -1 on failure.
2956  */
2957 static int
2958 phyint_group_state_event(struct phyint_group *pg)
2959 {
2960 	nvlist_t	*nvl;
2961 
2962 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2963 	if (errno != 0) {
2964 		logperror("cannot create `group state change' event");
2965 		return (-1);
2966 	}
2967 
2968 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2969 	if (errno != 0)
2970 		goto failed;
2971 
2972 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2973 	if (errno != 0)
2974 		goto failed;
2975 
2976 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2977 	if (errno != 0)
2978 		goto failed;
2979 
2980 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2981 failed:
2982 	logperror("cannot create `group state change' event");
2983 	nvlist_free(nvl);
2984 	return (-1);
2985 }
2986 
2987 /*
2988  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2989  * `pg'.  Returns 0 on success, -1 on failure.
2990  */
2991 static int
2992 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2993 {
2994 	nvlist_t *nvl;
2995 
2996 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2997 	if (errno != 0) {
2998 		logperror("cannot create `group change' event");
2999 		return (-1);
3000 	}
3001 
3002 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3003 	if (errno != 0)
3004 		goto failed;
3005 
3006 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3007 	if (errno != 0)
3008 		goto failed;
3009 
3010 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
3011 	    phyint_grouplistsig);
3012 	if (errno != 0)
3013 		goto failed;
3014 
3015 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
3016 	if (errno != 0)
3017 		goto failed;
3018 
3019 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
3020 failed:
3021 	logperror("cannot create `group change' event");
3022 	nvlist_free(nvl);
3023 	return (-1);
3024 }
3025 
3026 /*
3027  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
3028  * group `pg'.	Returns 0 on success, -1 on failure.
3029  */
3030 static int
3031 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
3032     ipmp_if_op_t op)
3033 {
3034 	nvlist_t *nvl;
3035 
3036 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3037 	if (errno != 0) {
3038 		logperror("cannot create `group member change' event");
3039 		return (-1);
3040 	}
3041 
3042 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3043 	if (errno != 0)
3044 		goto failed;
3045 
3046 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3047 	if (errno != 0)
3048 		goto failed;
3049 
3050 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
3051 	if (errno != 0)
3052 		goto failed;
3053 
3054 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3055 	if (errno != 0)
3056 		goto failed;
3057 
3058 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3059 	if (errno != 0)
3060 		goto failed;
3061 
3062 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3063 	if (errno != 0)
3064 		goto failed;
3065 
3066 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
3067 failed:
3068 	logperror("cannot create `group member change' event");
3069 	nvlist_free(nvl);
3070 	return (-1);
3071 
3072 }
3073 
3074 /*
3075  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
3076  * Returns 0 on success, -1 on failure.
3077  */
3078 static int
3079 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
3080 {
3081 	nvlist_t *nvl;
3082 
3083 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
3084 	if (errno != 0) {
3085 		logperror("cannot create `interface change' event");
3086 		return (-1);
3087 	}
3088 
3089 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
3090 	if (errno != 0)
3091 		goto failed;
3092 
3093 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
3094 	if (errno != 0)
3095 		goto failed;
3096 
3097 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
3098 	if (errno != 0)
3099 		goto failed;
3100 
3101 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
3102 	if (errno != 0)
3103 		goto failed;
3104 
3105 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
3106 	if (errno != 0)
3107 		goto failed;
3108 
3109 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
3110 failed:
3111 	logperror("cannot create `interface change' event");
3112 	nvlist_free(nvl);
3113 	return (-1);
3114 
3115 }
3116 
/*
 * Generate a fresh signature.  A signature is conceptually split into a
 * random 16-bit "generation number" (the upper 16 bits) and a 48-bit
 * monotonically increasing integer (the lower 48 bits), which this function
 * starts at 1.  The generation number protects against stale updates to
 * entities (e.g., IPMP groups) that have been deleted and since recreated.
 *
 * The random number generator is seeded from gethrtime() on first use.
 */
static uint64_t
gensig(void)
{
	static int seeded = 0;
	uint64_t generation;

	if (!seeded) {
		srand48((long)gethrtime());
		seeded = 1;
	}

	/* Only the low 16 bits of the random value survive the shift */
	generation = (uint64_t)lrand48();
	return ((generation << 48) | 1);
}
3136 
3137 /*
3138  * Store the information associated with group `grname' into a dynamically
3139  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
3140  */
3141 unsigned int
3142 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
3143 {
3144 	struct phyint		*pi;
3145 	struct phyint_group	*pg;
3146 	char			(*ifs)[LIFNAMSIZ];
3147 	unsigned int		i, j;
3148 	unsigned int		nif = 0, naddr = 0;
3149 	lifgroupinfo_t		lifgr;
3150 	addrlist_t		*addrp;
3151 	struct sockaddr_storage	*addrs;
3152 	int			fdt = 0;
3153 
3154 	pg = phyint_group_lookup(grname);
3155 	if (pg == NULL)
3156 		return (IPMP_EUNKGROUP);
3157 
3158 	/*
3159 	 * Tally up the number of interfaces, allocate an array to hold them,
3160 	 * and insert their names into the array.  While we're at it, if any
3161 	 * interface is actually enabled to send probes, save the group fdt.
3162 	 */
3163 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
3164 		nif++;
3165 
3166 	ifs = alloca(nif * sizeof (*ifs));
3167 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
3168 		assert(i < nif);
3169 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
3170 		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6))
3171 			fdt = pg->pg_fdt;
3172 	}
3173 	assert(i == nif);
3174 
3175 	/*
3176 	 * If this is the anonymous group, there's no other information to
3177 	 * collect (since there's no IPMP interface).
3178 	 */
3179 	if (pg == phyint_anongroup) {
3180 		*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3181 		    groupstate(pg), nif, ifs, "", "", "", "", 0, NULL);
3182 		return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3183 	}
3184 
3185 	/*
3186 	 * Grab some additional information about the group from the kernel.
3187 	 * (NOTE: since SIOCGLIFGROUPINFO does not look up by interface name,
3188 	 * we can use ifsock_v4 even for a V6-only group.)
3189 	 */
3190 	(void) strlcpy(lifgr.gi_grname, grname, LIFGRNAMSIZ);
3191 	if (ioctl(ifsock_v4, SIOCGLIFGROUPINFO, &lifgr) == -1) {
3192 		if (errno == ENOENT)
3193 			return (IPMP_EUNKGROUP);
3194 
3195 		logperror("getgroupinfo: SIOCGLIFGROUPINFO");
3196 		return (IPMP_FAILURE);
3197 	}
3198 
3199 	/*
3200 	 * Tally up the number of data addresses, allocate an array to hold
3201 	 * them, and insert their values into the array.
3202 	 */
3203 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next)
3204 		naddr++;
3205 
3206 	addrs = alloca(naddr * sizeof (*addrs));
3207 	i = 0;
3208 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3209 		/*
3210 		 * It's possible to have duplicate addresses (if some are
3211 		 * down).  Weed the dups out to avoid confusing consumers.
3212 		 * (If groups start having tons of addresses, we'll need a
3213 		 * better algorithm here.)
3214 		 */
3215 		for (j = 0; j < i; j++) {
3216 			if (sockaddrcmp(&addrs[j], &addrp->al_addr))
3217 				break;
3218 		}
3219 		if (j == i) {
3220 			assert(i < naddr);
3221 			addrs[i++] = addrp->al_addr;
3222 		}
3223 	}
3224 	naddr = i;
3225 
3226 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, fdt,
3227 	    groupstate(pg), nif, ifs, lifgr.gi_grifname, lifgr.gi_m4ifname,
3228 	    lifgr.gi_m6ifname, lifgr.gi_bcifname, naddr, addrs);
3229 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3230 }
3231 
3232 /*
3233  * Store the target information associated with phyint instance `pii' into a
3234  * dynamically allocated structure pointed to by `*targinfopp'.  Returns an
3235  * IPMP error code.
3236  */
3237 unsigned int
3238 gettarginfo(struct phyint_instance *pii, const char *name,
3239     ipmp_targinfo_t **targinfopp)
3240 {
3241 	uint_t ntarg = 0;
3242 	struct target *tg;
3243 	struct sockaddr_storage	ss;
3244 	struct sockaddr_storage *targs = NULL;
3245 
3246 	if (PROBE_CAPABLE(pii)) {
3247 		targs = alloca(pii->pii_ntargets * sizeof (*targs));
3248 		tg = pii->pii_target_next;
3249 		do {
3250 			if (tg->tg_status == TG_ACTIVE) {
3251 				assert(ntarg < pii->pii_ntargets);
3252 				addr2storage(pii->pii_af, &tg->tg_address,
3253 				    &targs[ntarg++]);
3254 			}
3255 			if ((tg = tg->tg_next) == NULL)
3256 				tg = pii->pii_targets;
3257 		} while (tg != pii->pii_target_next);
3258 
3259 		assert(ntarg == pii->pii_ntargets);
3260 	}
3261 
3262 	*targinfopp = ipmp_targinfo_create(name, iftestaddr(pii, &ss),
3263 	    iftargmode(pii), ntarg, targs);
3264 	return (*targinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3265 }
3266 
3267 /*
3268  * Store the information associated with interface `ifname' into a dynamically
3269  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
3270  */
3271 unsigned int
3272 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
3273 {
3274 	int		retval;
3275 	struct phyint	*pi;
3276 	ipmp_targinfo_t	*targinfo4;
3277 	ipmp_targinfo_t	*targinfo6;
3278 
3279 	pi = phyint_lookup(ifname);
3280 	if (pi == NULL)
3281 		return (IPMP_EUNKIF);
3282 
3283 	if ((retval = gettarginfo(pi->pi_v4, pi->pi_name, &targinfo4)) != 0 ||
3284 	    (retval = gettarginfo(pi->pi_v6, pi->pi_name, &targinfo6)) != 0)
3285 		goto out;
3286 
3287 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
3288 	    ifstate(pi), iftype(pi), iflinkstate(pi), ifprobestate(pi),
3289 	    ifflags(pi), targinfo4, targinfo6);
3290 	retval = (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3291 out:
3292 	if (targinfo4 != NULL)
3293 		ipmp_freetarginfo(targinfo4);
3294 	if (targinfo6 != NULL)
3295 		ipmp_freetarginfo(targinfo6);
3296 	return (retval);
3297 }
3298 
3299 /*
3300  * Store the current list of IPMP groups into a dynamically allocated
3301  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
3302  */
3303 unsigned int
3304 getgrouplist(ipmp_grouplist_t **grlistpp)
3305 {
3306 	struct phyint_group	*pg;
3307 	char			(*groups)[LIFGRNAMSIZ];
3308 	unsigned int		i, ngroup;
3309 
3310 	/*
3311 	 * Tally up the number of groups, allocate an array to hold them, and
3312 	 * insert their names into the array.
3313 	 */
3314 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
3315 		ngroup++;
3316 
3317 	groups = alloca(ngroup * sizeof (*groups));
3318 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
3319 		assert(i < ngroup);
3320 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
3321 	}
3322 	assert(i == ngroup);
3323 
3324 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
3325 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3326 }
3327 
3328 /*
3329  * Store the address information for `ssp' (in group `grname') into a
3330  * dynamically allocated structure pointed to by `*adinfopp'.  Returns an IPMP
3331  * error code.  (We'd call this function getaddrinfo(), but it would conflict
3332  * with getaddrinfo(3SOCKET)).
3333  */
3334 unsigned int
3335 getgraddrinfo(const char *grname, struct sockaddr_storage *ssp,
3336     ipmp_addrinfo_t **adinfopp)
3337 {
3338 	int ifsock;
3339 	addrlist_t *addrp, *addrmatchp = NULL;
3340 	ipmp_addr_state_t state;
3341 	const char *binding = "";
3342 	struct lifreq lifr;
3343 	struct phyint_group *pg;
3344 
3345 	if ((pg = phyint_group_lookup(grname)) == NULL)
3346 		return (IPMP_EUNKADDR);
3347 
3348 	/*
3349 	 * Walk through the data addresses, and find a match.  Note that since
3350 	 * some of the addresses may be down, more than one may match.  We
3351 	 * prefer an up address (if one exists).
3352 	 */
3353 	for (addrp = pg->pg_addrs; addrp != NULL; addrp = addrp->al_next) {
3354 		if (sockaddrcmp(ssp, &addrp->al_addr)) {
3355 			addrmatchp = addrp;
3356 			if (addrmatchp->al_flags & IFF_UP)
3357 				break;
3358 		}
3359 	}
3360 
3361 	if (addrmatchp == NULL)
3362 		return (IPMP_EUNKADDR);
3363 
3364 	state = (addrmatchp->al_flags & IFF_UP) ? IPMP_ADDR_UP : IPMP_ADDR_DOWN;
3365 	if (state == IPMP_ADDR_UP) {
3366 		ifsock = (ssp->ss_family == AF_INET) ? ifsock_v4 : ifsock_v6;
3367 		(void) strlcpy(lifr.lifr_name, addrmatchp->al_name, LIFNAMSIZ);
3368 		if (ioctl(ifsock, SIOCGLIFBINDING, &lifr) >= 0)
3369 			binding = lifr.lifr_binding;
3370 	}
3371 
3372 	*adinfopp = ipmp_addrinfo_create(ssp, state, pg->pg_name, binding);
3373 	return (*adinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
3374 }
3375 
3376 /*
3377  * Store a snapshot of the IPMP subsystem into a dynamically allocated
3378  * structure pointed to by `*snapp'.  Returns an IPMP error code.
3379  */
3380 unsigned int
3381 getsnap(ipmp_snap_t **snapp)
3382 {
3383 	ipmp_grouplist_t	*grlistp;
3384 	ipmp_groupinfo_t	*grinfop;
3385 	ipmp_addrinfo_t		*adinfop;
3386 	ipmp_addrlist_t		*adlistp;
3387 	ipmp_ifinfo_t		*ifinfop;
3388 	ipmp_snap_t		*snap;
3389 	struct phyint		*pi;
3390 	unsigned int		i, j;
3391 	int			retval;
3392 
3393 	snap = ipmp_snap_create();
3394 	if (snap == NULL)
3395 		return (IPMP_ENOMEM);
3396 
3397 	/*
3398 	 * Add group list.
3399 	 */
3400 	retval = getgrouplist(&snap->sn_grlistp);
3401 	if (retval != IPMP_SUCCESS)
3402 		goto failed;
3403 
3404 	/*
3405 	 * Add information for each group in the list, along with all of its
3406 	 * data addresses.
3407 	 */
3408 	grlistp = snap->sn_grlistp;
3409 	for (i = 0; i < grlistp->gl_ngroup; i++) {
3410 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
3411 		if (retval != IPMP_SUCCESS)
3412 			goto failed;
3413 
3414 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
3415 		if (retval != IPMP_SUCCESS) {
3416 			ipmp_freegroupinfo(grinfop);
3417 			goto failed;
3418 		}
3419 
3420 		adlistp = grinfop->gr_adlistp;
3421 		for (j = 0; j < adlistp->al_naddr; j++) {
3422 			retval = getgraddrinfo(grinfop->gr_name,
3423 			    &adlistp->al_addrs[j], &adinfop);
3424 			if (retval != IPMP_SUCCESS)
3425 				goto failed;
3426 
3427 			retval = ipmp_snap_addaddrinfo(snap, adinfop);
3428 			if (retval != IPMP_SUCCESS) {
3429 				ipmp_freeaddrinfo(adinfop);
3430 				goto failed;
3431 			}
3432 		}
3433 	}
3434 
3435 	/*
3436 	 * Add information for each configured phyint.
3437 	 */
3438 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
3439 		retval = getifinfo(pi->pi_name, &ifinfop);
3440 		if (retval != IPMP_SUCCESS)
3441 			goto failed;
3442 
3443 		retval = ipmp_snap_addifinfo(snap, ifinfop);
3444 		if (retval != IPMP_SUCCESS) {
3445 			ipmp_freeifinfo(ifinfop);
3446 			goto failed;
3447 		}
3448 	}
3449 
3450 	*snapp = snap;
3451 	return (IPMP_SUCCESS);
3452 failed:
3453 	ipmp_snap_free(snap);
3454 	return (retval);
3455 }
3456