xref: /titanic_51/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "mpd_defs.h"
30 #include "mpd_tables.h"
31 
/*
 * Global list of phyints, phyint instances, phyint groups and the anonymous
 * group; the latter is initialized in phyint_init().
 *
 * Each list is doubly linked and new entries are pushed at the head:
 *	phyints			linked through pi_next / pi_prev
 *	phyint_instances	linked through pii_next / pii_prev
 *	phyint_groups		linked through pg_next / pg_prev
 *
 * phyint_init() only creates the anonymous group when track_all_phyints
 * is set.
 */
struct phyint *phyints = NULL;
struct phyint_instance	*phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;
struct phyint_group *phyint_anongroup;

/*
 * Grouplist signature; initialized in phyint_init() from gensig() and
 * incremented whenever a group is inserted into or deleted from the list of
 * all phyint groups.
 */
static uint64_t phyint_grouplistsig;
45 
/*
 * Forward declarations of the file-local helpers, grouped by the kind of
 * object they operate on.
 */

/* Phyint instances */
static void phyint_inst_insert(struct phyint_instance *pii);
static void phyint_inst_print(struct phyint_instance *pii);

/* Phyints */
static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
static void phyint_delete(struct phyint *pi);

/* Phyint groups */
static void phyint_group_insert(struct phyint_group *pg);
static void phyint_group_delete(struct phyint_group *pg);
static struct phyint_group *phyint_group_lookup(const char *pg_name);
static struct phyint_group *phyint_group_create(const char *pg_name);

/* Logical interfaces (logints) */
static void logint_print(struct logint *li);
static void logint_insert(struct phyint_instance *pii, struct logint *li);
static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);

/* Probe targets */
static void target_print(struct target *tg);
static void target_insert(struct phyint_instance *pii, struct target *tg);
static struct target *target_first(struct phyint_instance *pii);
static struct target *target_select_best(struct phyint_instance *pii);
static void target_flush_hosts(struct phyint_group *pg);

static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);

/* Protocol specific halves of phyint_inst_sockinit() */
static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);

/* IPv6 mask/prefix helpers */
static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask);
static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2,
    int prefix_len);

/* Event helpers; this file calls them right after bumping a signature */
static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
static int phyint_group_state_event(struct phyint_group *pg);
static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op);

/* Produces the initial value of the group and grouplist signatures */
static uint64_t gensig(void);
83 
84 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
85 int
86 phyint_init(void)
87 {
88 	phyint_grouplistsig = gensig();
89 	if (track_all_phyints) {
90 		phyint_anongroup = phyint_group_create("");
91 		if (phyint_anongroup == NULL)
92 			return (-1);
93 		phyint_group_insert(phyint_anongroup);
94 	}
95 	return (0);
96 }
97 
98 /* Return the phyint with the given name */
99 struct phyint *
100 phyint_lookup(const char *name)
101 {
102 	struct phyint *pi;
103 
104 	if (debug & D_PHYINT)
105 		logdebug("phyint_lookup(%s)\n", name);
106 
107 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
108 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
109 			break;
110 	}
111 	return (pi);
112 }
113 
114 /* Return the phyint instance with the given name and the given family */
115 struct phyint_instance *
116 phyint_inst_lookup(int af, char *name)
117 {
118 	struct phyint *pi;
119 
120 	if (debug & D_PHYINT)
121 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
122 
123 	assert(af == AF_INET || af == AF_INET6);
124 
125 	pi = phyint_lookup(name);
126 	if (pi == NULL)
127 		return (NULL);
128 
129 	return (PHYINT_INSTANCE(pi, af));
130 }
131 
132 static struct phyint_group *
133 phyint_group_lookup(const char *pg_name)
134 {
135 	struct phyint_group *pg;
136 
137 	if (debug & D_PHYINT)
138 		logdebug("phyint_group_lookup(%s)\n", pg_name);
139 
140 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
141 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
142 			break;
143 	}
144 	return (pg);
145 }
146 
147 /*
148  * Insert the phyint in the linked list of all phyints. If the phyint belongs
149  * to some group, insert it in the phyint group list.
150  */
151 static void
152 phyint_insert(struct phyint *pi, struct phyint_group *pg)
153 {
154 	if (debug & D_PHYINT)
155 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
156 
157 	/* Insert the phyint at the head of the 'all phyints' list */
158 	pi->pi_next = phyints;
159 	pi->pi_prev = NULL;
160 	if (phyints != NULL)
161 		phyints->pi_prev = pi;
162 	phyints = pi;
163 
164 	/*
165 	 * Insert the phyint at the head of the 'phyint_group members' list
166 	 * of the phyint group to which it belongs.
167 	 */
168 	pi->pi_pgnext = NULL;
169 	pi->pi_pgprev = NULL;
170 	pi->pi_group = pg;
171 
172 	pi->pi_pgnext = pg->pg_phyint;
173 	if (pi->pi_pgnext != NULL)
174 		pi->pi_pgnext->pi_pgprev = pi;
175 	pg->pg_phyint = pi;
176 
177 	pg->pg_sig++;
178 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
179 }
180 
181 /* Insert the phyint instance in the linked list of all phyint instances. */
182 static void
183 phyint_inst_insert(struct phyint_instance *pii)
184 {
185 	if (debug & D_PHYINT) {
186 		logdebug("phyint_inst_insert(%s %s)\n",
187 		    AF_STR(pii->pii_af), pii->pii_name);
188 	}
189 
190 	/*
191 	 * Insert the phyint at the head of the 'all phyint instances' list.
192 	 */
193 	pii->pii_next = phyint_instances;
194 	pii->pii_prev = NULL;
195 	if (phyint_instances != NULL)
196 		phyint_instances->pii_prev = pii;
197 	phyint_instances = pii;
198 }
199 
200 /*
201  * Create a new phyint with the given parameters. Also insert it into
202  * the list of all phyints and the list of phyint group members by calling
203  * phyint_insert().
204  */
205 static struct phyint *
206 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
207     uint64_t flags)
208 {
209 	struct phyint *pi;
210 
211 	pi = calloc(1, sizeof (struct phyint));
212 	if (pi == NULL) {
213 		logperror("phyint_create: calloc");
214 		return (NULL);
215 	}
216 
217 	/*
218 	 * Record the phyint values. Also insert the phyint into the
219 	 * phyint group by calling phyint_insert().
220 	 */
221 	(void) strncpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
222 	pi->pi_name[sizeof (pi->pi_name) - 1] = '\0';
223 	pi->pi_ifindex = ifindex;
224 	pi->pi_icmpid =
225 	    htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF));
226 	/*
227 	 * We optimistically start in the PI_RUNNING state.  Later (in
228 	 * process_link_state_changes()), we will readjust this to match the
229 	 * current state of the link.  Further, if test addresses are
230 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
231 	 * either PI_RUNNING or PI_FAILED, depending on the result of the test
232 	 * probes.
233 	 */
234 	pi->pi_state = PI_RUNNING;
235 	pi->pi_flags = PHYINT_FLAGS(flags);
236 	/*
237 	 * Initialise the link state.  The link state is initialised to
238 	 * up, so that if the link is down when IPMP starts monitoring
239 	 * the interface, it will appear as though there has been a
240 	 * transition from the link up to link down.  This avoids
241 	 * having to treat this situation as a special case.
242 	 */
243 	INIT_LINK_STATE(pi);
244 
245 	/*
246 	 * Insert the phyint in the list of all phyints, and the
247 	 * list of phyint group members
248 	 */
249 	phyint_insert(pi, pg);
250 
251 	/*
252 	 * If we are joining a failed group, mark the interface as
253 	 * failed.
254 	 */
255 	if (GROUP_FAILED(pg))
256 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
257 
258 	return (pi);
259 }
260 
261 /*
262  * Create a new phyint instance belonging to the phyint 'pi' and address
263  * family 'af'. Also insert it into the list of all phyint instances by
264  * calling phyint_inst_insert().
265  */
266 static struct phyint_instance *
267 phyint_inst_create(struct phyint *pi, int af)
268 {
269 	struct phyint_instance *pii;
270 
271 	pii = calloc(1, sizeof (struct phyint_instance));
272 	if (pii == NULL) {
273 		logperror("phyint_inst_create: calloc");
274 		return (NULL);
275 	}
276 
277 	/*
278 	 * Attach the phyint instance to the phyint.
279 	 * Set the back pointers as well
280 	 */
281 	pii->pii_phyint = pi;
282 	if (af == AF_INET)
283 		pi->pi_v4 = pii;
284 	else
285 		pi->pi_v6 = pii;
286 
287 	pii->pii_in_use = 1;
288 	pii->pii_probe_sock = -1;
289 	pii->pii_snxt = 1;
290 	pii->pii_af = af;
291 	pii->pii_fd_hrtime = gethrtime() +
292 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
293 	pii->pii_flags = pi->pi_flags;
294 
295 	/* Insert the phyint instance in the list of all phyint instances. */
296 	phyint_inst_insert(pii);
297 	return (pii);
298 }
299 
300 /*
301  * Change the state of phyint `pi' to state `state'.
302  */
303 void
304 phyint_chstate(struct phyint *pi, enum pi_state state)
305 {
306 	/*
307 	 * To simplify things, some callers always set a given state
308 	 * regardless of the previous state of the phyint (e.g., setting
309 	 * PI_RUNNING when it's already set).  We shouldn't bother
310 	 * generating an event or consuming a signature for these, since
311 	 * the actual state of the interface is unchanged.
312 	 */
313 	if (pi->pi_state == state)
314 		return;
315 
316 	pi->pi_state = state;
317 	pi->pi_group->pg_sig++;
318 	(void) phyint_state_event(pi->pi_group, pi);
319 }
320 
321 /*
322  * Note that the type of phyint `pi' has changed.
323  */
324 void
325 phyint_newtype(struct phyint *pi)
326 {
327 	pi->pi_group->pg_sig++;
328 	(void) phyint_state_event(pi->pi_group, pi);
329 }
330 
331 /*
332  * Insert the phyint group in the linked list of all phyint groups
333  * at the head of the list
334  */
335 static void
336 phyint_group_insert(struct phyint_group *pg)
337 {
338 	pg->pg_next = phyint_groups;
339 	pg->pg_prev = NULL;
340 	if (phyint_groups != NULL)
341 		phyint_groups->pg_prev = pg;
342 	phyint_groups = pg;
343 
344 	phyint_grouplistsig++;
345 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
346 }
347 
348 /*
349  * Create a new phyint group called 'name'.
350  */
351 static struct phyint_group *
352 phyint_group_create(const char *name)
353 {
354 	struct	phyint_group *pg;
355 
356 	if (debug & D_PHYINT)
357 		logdebug("phyint_group_create(%s)\n", name);
358 
359 	pg = calloc(1, sizeof (struct phyint_group));
360 	if (pg == NULL) {
361 		logperror("phyint_group_create: calloc");
362 		return (NULL);
363 	}
364 
365 	(void) strncpy(pg->pg_name, name, sizeof (pg->pg_name));
366 	pg->pg_name[sizeof (pg->pg_name) - 1] = '\0';
367 	pg->pg_sig = gensig();
368 
369 	pg->pg_fdt = user_failure_detection_time;
370 	pg->pg_probeint = user_probe_interval;
371 
372 	return (pg);
373 }
374 
375 /*
376  * Change the state of the phyint group `pg' to state `state'.
377  */
378 void
379 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
380 {
381 	assert(pg != phyint_anongroup);
382 
383 	switch (state) {
384 	case PG_FAILED:
385 		pg->pg_groupfailed = 1;
386 
387 		/*
388 		 * We can never know with certainty that a group has
389 		 * failed.  It is possible that all known targets have
390 		 * failed simultaneously, and new targets have come up
391 		 * instead. If the targets are routers then router
392 		 * discovery will kick in, and we will see the new routers
393 		 * thru routing socket messages. But if the targets are
394 		 * hosts, we have to discover it by multicast.	So flush
395 		 * all the host targets. The next probe will send out a
396 		 * multicast echo request. If this is a group failure, we
397 		 * will still not see any response, otherwise we will
398 		 * clear the pg_groupfailed flag after we get
399 		 * NUM_PROBE_REPAIRS consecutive unicast replies on any
400 		 * phyint.
401 		 */
402 		target_flush_hosts(pg);
403 		break;
404 
405 	case PG_RUNNING:
406 		pg->pg_groupfailed = 0;
407 		break;
408 
409 	default:
410 		logerr("phyint_group_chstate: invalid group state %d; "
411 		    "aborting\n", state);
412 		abort();
413 	}
414 
415 	pg->pg_sig++;
416 	(void) phyint_group_state_event(pg);
417 }
418 
419 /*
420  * Create a new phyint instance and initialize it from the values supplied by
421  * the kernel. Always check for ENXIO before logging any error, because the
422  * interface could have vanished after completion of SIOCGLIFCONF.
423  * Return values:
424  *	pointer to the phyint instance on success
425  *	NULL on failure Eg. if the phyint instance is not found in the kernel
426  */
427 struct phyint_instance *
428 phyint_inst_init_from_k(int af, char *pi_name)
429 {
430 	char	pg_name[LIFNAMSIZ + 1];
431 	int	ifsock;
432 	uint_t	ifindex;
433 	uint64_t	flags;
434 	struct lifreq	lifr;
435 	struct phyint	*pi;
436 	struct phyint_instance	*pii;
437 	boolean_t	pg_created;
438 	boolean_t	pi_created;
439 	struct phyint_group	*pg;
440 
441 retry:
442 	pii = NULL;
443 	pi = NULL;
444 	pg = NULL;
445 	pi_created = _B_FALSE;
446 	pg_created = _B_FALSE;
447 
448 	if (debug & D_PHYINT) {
449 		logdebug("phyint_inst_init_from_k(%s %s)\n",
450 		    AF_STR(af), pi_name);
451 	}
452 
453 	assert(af == AF_INET || af == AF_INET6);
454 
455 	/* Get the socket for doing ioctls */
456 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
457 
458 	/*
459 	 * Get the interface flags. Ignore loopback and multipoint
460 	 * interfaces.
461 	 */
462 	(void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
463 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
464 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
465 		if (errno != ENXIO) {
466 			logperror("phyint_inst_init_from_k:"
467 			    " ioctl (get flags)");
468 		}
469 		return (NULL);
470 	}
471 	flags = lifr.lifr_flags;
472 	if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK))
473 		return (NULL);
474 
475 	/*
476 	 * Get the ifindex for recording later in our tables, in case we need
477 	 * to create a new phyint.
478 	 */
479 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
480 		if (errno != ENXIO) {
481 			logperror("phyint_inst_init_from_k: "
482 			    " ioctl (get lifindex)");
483 		}
484 		return (NULL);
485 	}
486 	ifindex = lifr.lifr_index;
487 
488 	/*
489 	 * Get the phyint group name of this phyint, from the kernel.
490 	 */
491 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
492 		if (errno != ENXIO) {
493 			logperror("phyint_inst_init_from_k: "
494 			    "ioctl (get group name)");
495 		}
496 		return (NULL);
497 	}
498 	(void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
499 	pg_name[sizeof (pg_name) - 1] = '\0';
500 
501 	/*
502 	 * If the phyint is not part of any group, pg_name is the
503 	 * null string. If 'track_all_phyints' is false, there is no
504 	 * need to create a phyint.
505 	 */
506 	if (pg_name[0] == '\0' && !track_all_phyints) {
507 		/*
508 		 * If the IFF_FAILED or IFF_OFFLINE flags are set, reset
509 		 * them. These flags shouldn't be set if IPMP isn't
510 		 * tracking the interface.
511 		 */
512 		if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) {
513 			lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE);
514 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
515 				if (errno != ENXIO) {
516 					logperror("phyint_inst_init_from_k:"
517 					    " ioctl (set flags)");
518 				}
519 			}
520 		}
521 		return (NULL);
522 	}
523 
524 	/*
525 	 * We need to create a new phyint instance. A phyint instance
526 	 * belongs to a phyint, and the phyint belongs to a phyint group.
527 	 * So we first lookup the 'parents' and if they don't exist then
528 	 * we create them.
529 	 */
530 	pg = phyint_group_lookup(pg_name);
531 	if (pg == NULL) {
532 		pg = phyint_group_create(pg_name);
533 		if (pg == NULL) {
534 			logerr("phyint_inst_init_from_k:"
535 			    " unable to create group %s\n", pg_name);
536 			return (NULL);
537 		}
538 		phyint_group_insert(pg);
539 		pg_created = _B_TRUE;
540 	}
541 
542 	/*
543 	 * Lookup the phyint. If the phyint does not exist create it.
544 	 */
545 	pi = phyint_lookup(pi_name);
546 	if (pi == NULL) {
547 		pi = phyint_create(pi_name, pg, ifindex, flags);
548 		if (pi == NULL) {
549 			logerr("phyint_inst_init_from_k:"
550 			    " unable to create phyint %s\n", pi_name);
551 			if (pg_created)
552 				phyint_group_delete(pg);
553 			return (NULL);
554 		}
555 		pi_created = _B_TRUE;
556 	} else {
557 		/* The phyint exists already. */
558 		assert(pi_created == _B_FALSE);
559 		/*
560 		 * Normally we should see consistent values for the IPv4 and
561 		 * IPv6 instances, for phyint properties. If we don't, it
562 		 * means things have changed underneath us, and we should
563 		 * resync our tables with the kernel. Check whether the
564 		 * interface index has changed. If so, it is most likely
565 		 * the interface has been unplumbed and replumbed,
566 		 * while we are yet to update our tables. Do it now.
567 		 */
568 		if (pi->pi_ifindex != ifindex) {
569 			if (pg_created)
570 				phyint_group_delete(pg);
571 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
572 			goto retry;
573 		}
574 		assert(PHYINT_INSTANCE(pi, af) == NULL);
575 
576 		/*
577 		 * If the group name seen by the IPv4 and IPv6 instances
578 		 * are different, it is most likely the groupname has
579 		 * changed, while we are yet to update our tables. Do it now.
580 		 */
581 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
582 			if (pg_created)
583 				phyint_group_delete(pg);
584 			restore_phyint(pi);
585 			phyint_inst_delete(PHYINT_INSTANCE(pi,
586 			    AF_OTHER(af)));
587 			goto retry;
588 		}
589 	}
590 
591 	/*
592 	 * Create a new phyint instance, corresponding to the 'af'
593 	 * passed in.
594 	 */
595 	pii = phyint_inst_create(pi, af);
596 	if (pii == NULL) {
597 		logerr("phyint_inst_init_from_k: unable to create"
598 		    "phyint inst %s\n", pi->pi_name);
599 		if (pi_created) {
600 			/*
601 			 * Deleting the phyint will delete the phyint group
602 			 * if this is the last phyint in the group.
603 			 */
604 			phyint_delete(pi);
605 		}
606 		return (NULL);
607 	}
608 
609 	return (pii);
610 }
611 
612 /*
613  * Bind the pii_probe_sock to the chosen IFF_NOFAILOVER address in
614  * pii_probe_logint. This socket will be used for sending and receiving
615  * ICMP/ICMPv6 probes to targets. Do the common part in this function, and
616  * complete the initializations by calling the protocol specific functions
617  * phyint_inst_v{4,6}_sockinit() respectively.
618  *
619  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
620  */
621 boolean_t
622 phyint_inst_sockinit(struct phyint_instance *pii)
623 {
624 	boolean_t success;
625 	struct phyint_group *pg;
626 
627 	if (debug & D_PHYINT) {
628 		logdebug("phyint_inst_sockinit(%s %s)\n",
629 		    AF_STR(pii->pii_af), pii->pii_name);
630 	}
631 
632 	assert(pii->pii_probe_logint != NULL);
633 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
634 	assert(SINGLETON_GROUP(pii->pii_phyint) ||
635 	    (pii->pii_probe_logint->li_flags & IFF_NOFAILOVER));
636 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
637 
638 	/*
639 	 * If the socket is already bound, close pii_probe_sock
640 	 */
641 	if (pii->pii_probe_sock != -1)
642 		close_probe_socket(pii, _B_TRUE);
643 
644 	/*
645 	 * If the phyint is not part of a named group and track_all_phyints is
646 	 * false, simply return.
647 	 */
648 	pg = pii->pii_phyint->pi_group;
649 	if (pg == phyint_anongroup && !track_all_phyints) {
650 		if (debug & D_PHYINT)
651 			logdebug("phyint_inst_sockinit: no group\n");
652 		return (_B_FALSE);
653 	}
654 
655 	/*
656 	 * Initialize the socket by calling the protocol specific function.
657 	 * If it succeeds, add the socket to the poll list.
658 	 */
659 	if (pii->pii_af == AF_INET6)
660 		success = phyint_inst_v6_sockinit(pii);
661 	else
662 		success = phyint_inst_v4_sockinit(pii);
663 
664 	if (success && (poll_add(pii->pii_probe_sock) == 0))
665 		return (_B_TRUE);
666 
667 	/* Something failed, cleanup and return false */
668 	if (pii->pii_probe_sock != -1)
669 		close_probe_socket(pii, _B_FALSE);
670 
671 	return (_B_FALSE);
672 }
673 
674 /*
675  * IPv6 specific part in initializing the pii_probe_sock. This socket is
676  * used to send/receive ICMPv6 probe packets.
677  */
678 static boolean_t
679 phyint_inst_v6_sockinit(struct phyint_instance *pii)
680 {
681 	icmp6_filter_t filter;
682 	int hopcount = 1;
683 	int int_op;
684 	struct	sockaddr_in6	testaddr;
685 
686 	/*
687 	 * Open a raw socket with ICMPv6 protocol.
688 	 *
689 	 * Use IPV6_DONTFAILOVER_IF to make sure that probes go out
690 	 * on the specified phyint only, and are not subject to load
691 	 * balancing. Bind to the src address chosen will ensure that
692 	 * the responses are received only on the specified phyint.
693 	 *
694 	 * Set the hopcount to 1 so that probe packets are not routed.
695 	 * Disable multicast loopback. Set the receive filter to
696 	 * receive only ICMPv6 echo replies.
697 	 */
698 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
699 	if (pii->pii_probe_sock < 0) {
700 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
701 		return (_B_FALSE);
702 }
703 
704 	bzero(&testaddr, sizeof (testaddr));
705 	testaddr.sin6_family = AF_INET6;
706 	testaddr.sin6_port = 0;
707 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
708 
709 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
710 	    sizeof (testaddr)) < 0) {
711 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
712 		return (_B_FALSE);
713 	}
714 
715 	/*
716 	 * IPV6_DONTFAILOVER_IF option takes precedence over setting
717 	 * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again.
718 	 */
719 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF,
720 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
721 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
722 		    " IPV6_DONTFAILOVER_IF");
723 		return (_B_FALSE);
724 	}
725 
726 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
727 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
728 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
729 		    " IPV6_UNICAST_HOPS");
730 		return (_B_FALSE);
731 	}
732 
733 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
734 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
735 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
736 		    " IPV6_MULTICAST_HOPS");
737 		return (_B_FALSE);
738 	}
739 
740 	int_op = 0;	/* used to turn off option */
741 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
742 	    (char *)&int_op, sizeof (int_op)) < 0) {
743 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
744 		    " IPV6_MULTICAST_LOOP");
745 		return (_B_FALSE);
746 	}
747 
748 	/*
749 	 * Filter out so that we only receive ICMP echo replies
750 	 */
751 	ICMP6_FILTER_SETBLOCKALL(&filter);
752 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
753 
754 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
755 	    (char *)&filter, sizeof (filter)) < 0) {
756 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
757 		    " ICMP6_FILTER");
758 		return (_B_FALSE);
759 	}
760 
761 	/* Enable receipt of ancillary data */
762 	int_op = 1;
763 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
764 	    (char *)&int_op, sizeof (int_op)) < 0) {
765 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
766 		    " IPV6_RECVHOPLIMIT");
767 		return (_B_FALSE);
768 	}
769 
770 	return (_B_TRUE);
771 }
772 
773 /*
774  * IPv4 specific part in initializing the pii_probe_sock. This socket is
775  * used to send/receive ICMPv4 probe packets.
776  */
777 static boolean_t
778 phyint_inst_v4_sockinit(struct phyint_instance *pii)
779 {
780 	struct sockaddr_in  testaddr;
781 	char	char_op;
782 	int	ttl = 1;
783 	char	char_ttl = 1;
784 
785 	/*
786 	 * Open a raw socket with ICMPv4 protocol.
787 	 *
788 	 * Use IP_DONTFAILOVER_IF to make sure that probes go out
789 	 * on the specified phyint only, and are not subject to load
790 	 * balancing. Bind to the src address chosen will ensure that
791 	 * the responses are received only on the specified phyint.
792 	 *
793 	 * Set the ttl to 1 so that probe packets are not routed.
794 	 * Disable multicast loopback.
795 	 */
796 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
797 	if (pii->pii_probe_sock < 0) {
798 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
799 		return (_B_FALSE);
800 	}
801 
802 	bzero(&testaddr, sizeof (testaddr));
803 	testaddr.sin_family = AF_INET;
804 	testaddr.sin_port = 0;
805 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
806 	    &testaddr.sin_addr);
807 
808 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
809 	    sizeof (testaddr)) < 0) {
810 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
811 		return (_B_FALSE);
812 	}
813 
814 	/*
815 	 * IP_DONTFAILOVER_IF option takes precedence over setting
816 	 * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again.
817 	 */
818 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF,
819 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
820 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
821 		    " IP_DONTFAILOVER");
822 		return (_B_FALSE);
823 	}
824 
825 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
826 	    (char *)&ttl, sizeof (ttl)) < 0) {
827 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
828 		    " IP_TTL");
829 		return (_B_FALSE);
830 	}
831 
832 	char_op = 0;	/* used to turn off option */
833 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
834 	    (char *)&char_op, sizeof (char_op)) == -1) {
835 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
836 		    " IP_MULTICAST_LOOP");
837 		return (_B_FALSE);
838 	}
839 
840 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
841 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
842 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
843 		    " IP_MULTICAST_TTL");
844 		return (_B_FALSE);
845 	}
846 
847 	return (_B_TRUE);
848 }
849 
850 /*
851  * Remove the phyint group from the list of 'all phyint groups'
852  * and free it.
853  */
854 static void
855 phyint_group_delete(struct phyint_group *pg)
856 {
857 	/*
858 	 * The anonymous group always exists, even when empty.
859 	 */
860 	if (pg == phyint_anongroup)
861 		return;
862 
863 	if (debug & D_PHYINT)
864 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
865 
866 	/*
867 	 * The phyint group must be empty, and must not have any phyints.
868 	 * The phyint group must be in the list of all phyint groups
869 	 */
870 	assert(pg->pg_phyint == NULL);
871 	assert(phyint_groups == pg || pg->pg_prev != NULL);
872 
873 	if (pg->pg_prev != NULL)
874 		pg->pg_prev->pg_next = pg->pg_next;
875 	else
876 		phyint_groups = pg->pg_next;
877 
878 	if (pg->pg_next != NULL)
879 		pg->pg_next->pg_prev = pg->pg_prev;
880 
881 	pg->pg_next = NULL;
882 	pg->pg_prev = NULL;
883 
884 	phyint_grouplistsig++;
885 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
886 
887 	free(pg);
888 }
889 
890 /*
891  * Extract information from the kernel about the desired phyint.
892  * Look only for properties of the phyint and not properties of logints.
893  * Take appropriate action on the changes.
894  * Return codes:
895  *	PI_OK
896  *		The phyint exists in the kernel and matches our knowledge
897  *		of the phyint.
898  *	PI_DELETED
899  *		The phyint has vanished in the kernel.
900  *	PI_IFINDEX_CHANGED
901  *		The phyint's interface index has changed.
902  *		Ask the caller to delete and recreate the phyint.
903  *	PI_IOCTL_ERROR
904  *		Some ioctl error. Don't change anything.
905  *	PI_GROUP_CHANGED
906  *		The phyint has changed group.
907  */
908 int
909 phyint_inst_update_from_k(struct phyint_instance *pii)
910 {
911 	struct lifreq lifr;
912 	int	ifsock;
913 	struct phyint *pi;
914 
915 	pi = pii->pii_phyint;
916 
917 	if (debug & D_PHYINT) {
918 		logdebug("phyint_inst_update_from_k(%s %s)\n",
919 		    AF_STR(pii->pii_af), pi->pi_name);
920 	}
921 
922 	/*
923 	 * Get the ifindex from the kernel, for comparison with the
924 	 * value in our tables.
925 	 */
926 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
927 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
928 
929 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
930 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
931 		if (errno == ENXIO) {
932 			return (PI_DELETED);
933 		} else {
934 			logperror_pii(pii, "phyint_inst_update_from_k:"
935 			    " ioctl (get lifindex)");
936 			return (PI_IOCTL_ERROR);
937 		}
938 	}
939 
940 	if (lifr.lifr_index != pi->pi_ifindex) {
941 		/*
942 		 * The index has changed. Most likely the interface has
943 		 * been unplumbed and replumbed. Ask the caller to take
944 		 * appropriate action.
945 		 */
946 		if (debug & D_PHYINT) {
947 			logdebug("phyint_inst_update_from_k:"
948 			    " old index %d new index %d\n",
949 			    pi->pi_ifindex, lifr.lifr_index);
950 		}
951 		return (PI_IFINDEX_CHANGED);
952 	}
953 
954 	/*
955 	 * Get the group name from the kernel, for comparison with
956 	 * the value in our tables.
957 	 */
958 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
959 		if (errno == ENXIO) {
960 			return (PI_DELETED);
961 		} else {
962 			logperror_pii(pii, "phyint_inst_update_from_k:"
963 			    " ioctl (get groupname)");
964 			return (PI_IOCTL_ERROR);
965 		}
966 	}
967 
968 	/*
969 	 * If the phyint has changed group i.e. if the phyint group name
970 	 * returned by the kernel is different, ask the caller to delete
971 	 * and recreate the phyint in the right group
972 	 */
973 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
974 		/* Groupname has changed */
975 		if (debug & D_PHYINT) {
976 			logdebug("phyint_inst_update_from_k:"
977 			    " groupname change\n");
978 		}
979 		return (PI_GROUP_CHANGED);
980 	}
981 
982 	/*
983 	 * Get the current phyint flags from the kernel, and determine what
984 	 * flags have changed by comparing against our tables.	Note that the
985 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
986 	 * that IFF_INACTIVE is really still set on the interface.
987 	 */
988 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
989 		if (errno == ENXIO) {
990 			return (PI_DELETED);
991 		} else {
992 			logperror_pii(pii, "phyint_inst_update_from_k: "
993 			    " ioctl (get flags)");
994 			return (PI_IOCTL_ERROR);
995 		}
996 	}
997 
998 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
999 	if (pi->pi_v4 != NULL)
1000 		pi->pi_v4->pii_flags = pi->pi_flags;
1001 	if (pi->pi_v6 != NULL)
1002 		pi->pi_v6->pii_flags = pi->pi_flags;
1003 
1004 	if (pi->pi_flags & IFF_FAILED) {
1005 		/*
1006 		 * If we are in the running and full state, we have
1007 		 * completed failbacks successfully and we would have
1008 		 * expected IFF_FAILED to have been clear. That it is
1009 		 * set means there was a race condition. Some other
1010 		 * process turned on the IFF_FAILED flag. Since the
1011 		 * flag setting is not atomic, i.e. a get ioctl followed
1012 		 * by a set ioctl, and since there is no way to set an
1013 		 * individual flag bit, this could have occurred.
1014 		 */
1015 		if (pi->pi_state == PI_RUNNING && pi->pi_full)
1016 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1017 	} else {
1018 		/*
1019 		 * If we are in the failed state, there was a race.
1020 		 * we have completed failover successfully because our
1021 		 * state is failed and empty. Some other process turned
1022 		 * off the IFF_FAILED flag. Same comment as above
1023 		 */
1024 		if (pi->pi_state == PI_FAILED && pi->pi_empty)
1025 			(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1026 	}
1027 
1028 	/* No change in phyint status */
1029 	return (PI_OK);
1030 }
1031 
1032 /*
1033  * Delete the phyint. Remove it from the list of all phyints, and the
1034  * list of phyint group members. If the group becomes empty, delete the
1035  * group also.
1036  */
1037 static void
1038 phyint_delete(struct phyint *pi)
1039 {
1040 	struct phyint_group *pg = pi->pi_group;
1041 
1042 	if (debug & D_PHYINT)
1043 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1044 
1045 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1046 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1047 
1048 	/*
1049 	 * The phyint must belong to a group.
1050 	 */
1051 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1052 
1053 	/* The phyint must be in the list of all phyints */
1054 	assert(phyints == pi || pi->pi_prev != NULL);
1055 
1056 	/* Remove the phyint from the phyint group list */
1057 	pg->pg_sig++;
1058 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1059 
1060 	if (pi->pi_pgprev == NULL) {
1061 		/* Phyint is the 1st in the phyint group list */
1062 		pg->pg_phyint = pi->pi_pgnext;
1063 	} else {
1064 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1065 	}
1066 	if (pi->pi_pgnext != NULL)
1067 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1068 	pi->pi_pgnext = NULL;
1069 	pi->pi_pgprev = NULL;
1070 
1071 	/* Remove the phyint from the global list of phyints */
1072 	if (pi->pi_prev == NULL) {
1073 		/* Phyint is the 1st in the list */
1074 		phyints = pi->pi_next;
1075 	} else {
1076 		pi->pi_prev->pi_next = pi->pi_next;
1077 	}
1078 	if (pi->pi_next != NULL)
1079 		pi->pi_next->pi_prev = pi->pi_prev;
1080 	pi->pi_next = NULL;
1081 	pi->pi_prev = NULL;
1082 
1083 	free(pi);
1084 
1085 	/* Delete the phyint_group if the last phyint has been deleted */
1086 	if (pg->pg_phyint == NULL)
1087 		phyint_group_delete(pg);
1088 }
1089 
1090 /*
1091  * Delete (unlink and free), the phyint instance.
1092  */
1093 void
1094 phyint_inst_delete(struct phyint_instance *pii)
1095 {
1096 	struct phyint *pi = pii->pii_phyint;
1097 
1098 	assert(pi != NULL);
1099 
1100 	if (debug & D_PHYINT) {
1101 		logdebug("phyint_inst_delete(%s %s)\n",
1102 		    AF_STR(pii->pii_af), pi->pi_name);
1103 	}
1104 
1105 	/*
1106 	 * If the phyint instance has associated probe targets
1107 	 * delete all the targets
1108 	 */
1109 	while (pii->pii_targets != NULL)
1110 		target_delete(pii->pii_targets);
1111 
1112 	/*
1113 	 * Delete all the logints associated with this phyint
1114 	 * instance.
1115 	 */
1116 	while (pii->pii_logint != NULL)
1117 		logint_delete(pii->pii_logint);
1118 
1119 	/*
1120 	 * Close the IFF_NOFAILOVER socket used to send probes to targets
1121 	 * from this phyint.
1122 	 */
1123 	if (pii->pii_probe_sock != -1)
1124 		close_probe_socket(pii, _B_TRUE);
1125 
1126 	/*
1127 	 * Phyint instance must be in the list of all phyint instances.
1128 	 * Remove phyint instance from the global list of phyint instances.
1129 	 */
1130 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1131 	if (pii->pii_prev == NULL) {
1132 		/* Phyint is the 1st in the list */
1133 		phyint_instances = pii->pii_next;
1134 	} else {
1135 		pii->pii_prev->pii_next = pii->pii_next;
1136 	}
1137 	if (pii->pii_next != NULL)
1138 		pii->pii_next->pii_prev = pii->pii_prev;
1139 	pii->pii_next = NULL;
1140 	pii->pii_prev = NULL;
1141 
1142 	/*
1143 	 * Reset the phyint instance pointer in the phyint.
1144 	 * If this is the last phyint instance (being deleted) on this
1145 	 * phyint, then delete the phyint.
1146 	 */
1147 	if (pii->pii_af == AF_INET)
1148 		pi->pi_v4 = NULL;
1149 	else
1150 		pi->pi_v6 = NULL;
1151 
1152 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1153 		phyint_delete(pi);
1154 
1155 	free(pii);
1156 }
1157 
1158 static void
1159 phyint_inst_print(struct phyint_instance *pii)
1160 {
1161 	struct logint *li;
1162 	struct target *tg;
1163 	char abuf[INET6_ADDRSTRLEN];
1164 	int most_recent;
1165 	int i;
1166 
1167 	if (pii->pii_phyint == NULL) {
1168 		logdebug("pii->pi_phyint NULL can't print\n");
1169 		return;
1170 	}
1171 
1172 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1173 	    "sock %x in_use %d empty %x full %x\n",
1174 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1175 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1176 	    pii->pii_in_use, pii->pii_phyint->pi_empty,
1177 	    pii->pii_phyint->pi_full);
1178 
1179 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1180 		logint_print(li);
1181 
1182 	logdebug("\n");
1183 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1184 		target_print(tg);
1185 
1186 	if (pii->pii_targets == NULL)
1187 		logdebug("pi_targets NULL\n");
1188 
1189 	if (pii->pii_target_next != NULL) {
1190 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1191 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1192 			abuf, sizeof (abuf)));
1193 	} else {
1194 		logdebug("pi_target_next NULL\n");
1195 	}
1196 
1197 	if (pii->pii_rtt_target_next != NULL) {
1198 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1199 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1200 			abuf, sizeof (abuf)));
1201 	} else {
1202 		logdebug("pi_rtt_target_next NULL\n");
1203 	}
1204 
1205 	if (pii->pii_targets != NULL) {
1206 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1207 
1208 		i = most_recent;
1209 		do {
1210 			if (pii->pii_probes[i].pr_target != NULL) {
1211 				logdebug("#%d target %s ", i,
1212 				    pr_addr(pii->pii_af,
1213 				    pii->pii_probes[i].pr_target->tg_address,
1214 				    abuf, sizeof (abuf)));
1215 			} else {
1216 				logdebug("#%d target NULL ", i);
1217 			}
1218 			logdebug("time_sent %u status %d time_ack/lost %u\n",
1219 			    pii->pii_probes[i].pr_time_sent,
1220 			    pii->pii_probes[i].pr_status,
1221 			    pii->pii_probes[i].pr_time_lost);
1222 			i = PROBE_INDEX_PREV(i);
1223 		} while (i != most_recent);
1224 	}
1225 }
1226 
1227 /*
1228  * Lookup a logint based on the logical interface name, on the given
1229  * phyint instance.
1230  */
1231 static struct logint *
1232 logint_lookup(struct phyint_instance *pii, char *name)
1233 {
1234 	struct logint *li;
1235 
1236 	if (debug & D_LOGINT) {
1237 		logdebug("logint_lookup(%s, %s)\n",
1238 		    AF_STR(pii->pii_af), name);
1239 	}
1240 
1241 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1242 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1243 			break;
1244 	}
1245 	return (li);
1246 }
1247 
1248 /*
1249  * Insert a logint at the head of the list of logints of the given
1250  * phyint instance
1251  */
1252 static void
1253 logint_insert(struct phyint_instance *pii, struct logint *li)
1254 {
1255 	li->li_next = pii->pii_logint;
1256 	li->li_prev = NULL;
1257 	if (pii->pii_logint != NULL)
1258 		pii->pii_logint->li_prev = li;
1259 	pii->pii_logint = li;
1260 	li->li_phyint_inst = pii;
1261 }
1262 
1263 /*
1264  * Create a new named logint, on the specified phyint instance.
1265  */
1266 static struct logint *
1267 logint_create(struct phyint_instance *pii, char *name)
1268 {
1269 	struct logint *li;
1270 
1271 	if (debug & D_LOGINT) {
1272 		logdebug("logint_create(%s %s %s)\n",
1273 		    AF_STR(pii->pii_af), pii->pii_name, name);
1274 	}
1275 
1276 	li = calloc(1, sizeof (struct logint));
1277 	if (li == NULL) {
1278 		logperror("logint_create: calloc");
1279 		return (NULL);
1280 	}
1281 
1282 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1283 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1284 	logint_insert(pii, li);
1285 	return (li);
1286 }
1287 
1288 /*
1289  * Initialize the logint based on the data returned by the kernel.
1290  */
1291 void
1292 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1293 {
1294 	int	ifsock;
1295 	uint64_t flags;
1296 	uint64_t saved_flags;
1297 	struct	logint	*li;
1298 	struct lifreq	lifr;
1299 	struct in6_addr	test_subnet;
1300 	struct in6_addr	test_subnet_mask;
1301 	struct in6_addr	testaddr;
1302 	int	test_subnet_len;
1303 	struct sockaddr_in6	*sin6;
1304 	struct sockaddr_in	*sin;
1305 	char abuf[INET6_ADDRSTRLEN];
1306 	boolean_t  ptp = _B_FALSE;
1307 	struct in6_addr tgaddr;
1308 
1309 	if (debug & D_LOGINT) {
1310 		logdebug("logint_init_from_k(%s %s)\n",
1311 		    AF_STR(pii->pii_af), li_name);
1312 	}
1313 
1314 	/* Get the socket for doing ioctls */
1315 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1316 
1317 	/*
1318 	 * Get the flags from the kernel. Also serves as a check whether
1319 	 * the logical still exists. If it doesn't exist, no need to proceed
1320 	 * any further. li_in_use will make the caller clean up the logint
1321 	 */
1322 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1323 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1324 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1325 		/* Interface may have vanished */
1326 		if (errno != ENXIO) {
1327 			logperror_pii(pii, "logint_init_from_k: "
1328 			    "ioctl (get flags)");
1329 		}
1330 		return;
1331 	}
1332 
1333 	flags = lifr.lifr_flags;
1334 
1335 	/*
1336 	 * Verified the logint exists. Now lookup the logint in our tables.
1337 	 * If it does not exist, create a new logint.
1338 	 */
1339 	li = logint_lookup(pii, li_name);
1340 	if (li == NULL) {
1341 		li = logint_create(pii, li_name);
1342 		if (li == NULL) {
1343 			/*
1344 			 * Pretend the interface does not exist
1345 			 * in the kernel
1346 			 */
1347 			return;
1348 		}
1349 	}
1350 
1351 	/*
1352 	 * Update li->li_flags with the new flags, after saving the old
1353 	 * value. This is used later to check what flags has changed and
1354 	 * take any action
1355 	 */
1356 	saved_flags = li->li_flags;
1357 	li->li_flags = flags;
1358 
1359 	/*
1360 	 * Get the address, prefix, prefixlength and update the logint.
1361 	 * Check if anything has changed. If the logint used for the
1362 	 * test address has changed, take suitable action.
1363 	 */
1364 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1365 		/* Interface may have vanished */
1366 		if (errno != ENXIO) {
1367 			logperror_li(li, "logint_init_from_k: (get addr)");
1368 		}
1369 		goto error;
1370 	}
1371 
1372 	if (pii->pii_af == AF_INET) {
1373 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1374 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1375 	} else {
1376 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1377 		testaddr = sin6->sin6_addr;
1378 	}
1379 
1380 	if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
1381 		ptp = _B_TRUE;
1382 		if (ioctl(ifsock, SIOCGLIFDSTADDR, (char *)&lifr) < 0) {
1383 			if (errno != ENXIO) {
1384 				logperror_li(li, "logint_init_from_k:"
1385 				    " (get dstaddr)");
1386 			}
1387 			goto error;
1388 		}
1389 		if (pii->pii_af == AF_INET) {
1390 			sin = (struct sockaddr_in *)&lifr.lifr_addr;
1391 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr);
1392 		} else {
1393 			sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1394 			tgaddr = sin6->sin6_addr;
1395 		}
1396 	} else {
1397 		if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1398 			/* Interface may have vanished */
1399 			if (errno != ENXIO) {
1400 				logperror_li(li, "logint_init_from_k:"
1401 				    " (get subnet)");
1402 			}
1403 			goto error;
1404 		}
1405 		if (lifr.lifr_subnet.ss_family == AF_INET6) {
1406 			sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1407 			test_subnet = sin6->sin6_addr;
1408 			test_subnet_len = lifr.lifr_addrlen;
1409 		} else {
1410 			sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1411 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1412 			test_subnet_len = lifr.lifr_addrlen +
1413 			    (IPV6_ABITS - IP_ABITS);
1414 		}
1415 		(void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask);
1416 	}
1417 
1418 	/*
1419 	 * Also record the OINDEX for completeness. This information is
1420 	 * not used.
1421 	 */
1422 	if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) {
1423 		if (errno != ENXIO)  {
1424 			logperror_li(li, "logint_init_from_k:"
1425 			    " (get lifoindex)");
1426 		}
1427 		goto error;
1428 	}
1429 
1430 	/*
1431 	 * If this is the logint corresponding to the test address used for
1432 	 * sending probes, then if anything significant has changed we need to
1433 	 * determine the test address again.  We ignore changes to the
1434 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1435 	 * course.
1436 	 */
1437 	if (pii->pii_probe_logint == li) {
1438 		if (((li->li_flags ^ saved_flags) &
1439 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1440 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1441 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1442 			&li->li_subnet)) ||
1443 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1444 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1445 			/*
1446 			 * Something significant that affects the testaddress
1447 			 * has changed. Redo the testaddress selection later on
1448 			 * in select_test_ifs(). For now do the cleanup and
1449 			 * set pii_probe_logint to NULL.
1450 			 */
1451 			if (pii->pii_probe_sock != -1)
1452 				close_probe_socket(pii, _B_TRUE);
1453 			pii->pii_probe_logint = NULL;
1454 		}
1455 	}
1456 
1457 
1458 	/* Update the logint with the values obtained from the kernel.	*/
1459 	li->li_addr = testaddr;
1460 	li->li_in_use = 1;
1461 	li->li_oifindex = lifr.lifr_index;
1462 	if (ptp) {
1463 		li->li_dstaddr = tgaddr;
1464 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1465 		    IP_ABITS : IPV6_ABITS;
1466 	} else {
1467 		li->li_subnet = test_subnet;
1468 		li->li_subnet_len = test_subnet_len;
1469 	}
1470 
1471 	if (debug & D_LOGINT)
1472 		logint_print(li);
1473 
1474 	return;
1475 
1476 error:
1477 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1478 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1479 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1480 	logint_delete(li);
1481 }
1482 
1483 /*
1484  * Delete (unlink and free) a logint.
1485  */
1486 void
1487 logint_delete(struct logint *li)
1488 {
1489 	struct phyint_instance *pii;
1490 
1491 	pii = li->li_phyint_inst;
1492 	assert(pii != NULL);
1493 
1494 	if (debug & D_LOGINT) {
1495 		int af;
1496 		char abuf[INET6_ADDRSTRLEN];
1497 
1498 		af = pii->pii_af;
1499 		logdebug("logint_delete(%s %s %s/%u)\n",
1500 		    AF_STR(af), li->li_name,
1501 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1502 		    li->li_subnet_len);
1503 	}
1504 
1505 	/* logint must be in the list of logints */
1506 	assert(pii->pii_logint == li || li->li_prev != NULL);
1507 
1508 	/* Remove the logint from the list of logints  */
1509 	if (li->li_prev == NULL) {
1510 		/* logint is the 1st in the list */
1511 		pii->pii_logint = li->li_next;
1512 	} else {
1513 		li->li_prev->li_next = li->li_next;
1514 	}
1515 	if (li->li_next != NULL)
1516 		li->li_next->li_prev = li->li_prev;
1517 	li->li_next = NULL;
1518 	li->li_prev = NULL;
1519 
1520 	/*
1521 	 * If this logint corresponds to the IFF_NOFAILOVER testaddress of
1522 	 * this phyint, then close the associated socket, if it exists
1523 	 */
1524 	if (pii->pii_probe_logint == li) {
1525 		if (pii->pii_probe_sock != -1)
1526 			close_probe_socket(pii, _B_TRUE);
1527 		pii->pii_probe_logint = NULL;
1528 	}
1529 
1530 	free(li);
1531 }
1532 
1533 static void
1534 logint_print(struct logint *li)
1535 {
1536 	char abuf[INET6_ADDRSTRLEN];
1537 	int af;
1538 
1539 	af = li->li_phyint_inst->pii_af;
1540 
1541 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1542 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1543 
1544 	logdebug("\tFlags: %llx in_use %d oifindex %d\n",
1545 	    li->li_flags, li->li_in_use, li->li_oifindex);
1546 }
1547 
/*
 * Format `addr' as a printable string in `abuf' (of size `len') and return
 * `abuf'.
 *
 * If `af' is AF_INET, `addr' is taken to be an IPv4-mapped IPv6 address and
 * only its low-order 32 bits are printed, in dotted-decimal form; for any
 * other `af' the full IPv6 address is printed.
 *
 * If the conversion fails (for instance because `abuf' is too small), `abuf'
 * is set to the empty string so the result can always be handed to a "%s"
 * format safely.  Nothing is written when `len' is not positive.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	struct in_addr	addr_v4;
	const char	*res;

	if (af == AF_INET) {
		/* The IPv4 address is the last 4 bytes of the mapped form. */
		(void) memcpy(&addr_v4, &addr.s6_addr[12], sizeof (addr_v4));
		res = inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
	} else {
		res = inet_ntop(AF_INET6, (void *)&addr, abuf, len);
	}

	/* On failure the buffer contents are unspecified; make it "". */
	if (res == NULL && len > 0)
		abuf[0] = '\0';

	return (abuf);
}
1561 
1562 /* Lookup target on its address */
1563 struct target *
1564 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1565 {
1566 	struct target *tg;
1567 
1568 	if (debug & D_TARGET) {
1569 		char abuf[INET6_ADDRSTRLEN];
1570 
1571 		logdebug("target_lookup(%s %s): addr %s\n",
1572 		    AF_STR(pii->pii_af), pii->pii_name,
1573 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1574 	}
1575 
1576 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1577 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1578 			break;
1579 	}
1580 	return (tg);
1581 }
1582 
1583 /*
1584  * Find and return the next active target, for the next probe.
1585  * If no active targets are available, return NULL.
1586  */
1587 struct target *
1588 target_next(struct target *tg)
1589 {
1590 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1591 	struct	target	*marker = tg;
1592 	hrtime_t now;
1593 
1594 	now = gethrtime();
1595 
1596 	/*
1597 	 * Target must be in the list of targets for this phyint
1598 	 * instance.
1599 	 */
1600 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1601 	assert(pii->pii_targets != NULL);
1602 
1603 	/* Return the next active target */
1604 	do {
1605 		/*
1606 		 * Go to the next target. If we hit the end,
1607 		 * reset the ptr to the head
1608 		 */
1609 		tg = tg->tg_next;
1610 		if (tg == NULL)
1611 			tg = pii->pii_targets;
1612 
1613 		assert(TG_STATUS_VALID(tg->tg_status));
1614 
1615 		switch (tg->tg_status) {
1616 		case TG_ACTIVE:
1617 			return (tg);
1618 
1619 		case TG_UNUSED:
1620 			assert(pii->pii_targets_are_routers);
1621 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
1622 				/*
1623 				 * Bubble up the unused target to active
1624 				 */
1625 				tg->tg_status = TG_ACTIVE;
1626 				pii->pii_ntargets++;
1627 				return (tg);
1628 			}
1629 			break;
1630 
1631 		case TG_SLOW:
1632 			assert(pii->pii_targets_are_routers);
1633 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1634 				/*
1635 				 * Bubble up the slow target to unused
1636 				 */
1637 				tg->tg_status = TG_UNUSED;
1638 			}
1639 			break;
1640 
1641 		case TG_DEAD:
1642 			assert(pii->pii_targets_are_routers);
1643 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1644 				/*
1645 				 * Bubble up the dead target to slow
1646 				 */
1647 				tg->tg_status = TG_SLOW;
1648 				tg->tg_latime = now;
1649 			}
1650 			break;
1651 		}
1652 
1653 	} while (tg != marker);
1654 
1655 	return (NULL);
1656 }
1657 
1658 /*
1659  * Select the best available target, that is not already TG_ACTIVE,
1660  * for the caller. The caller will determine whether it wants to
1661  * make the returned target TG_ACTIVE.
1662  * The selection order is as follows.
1663  * 1. pick a TG_UNSED target, if it exists.
1664  * 2. else pick a TG_SLOW target that has recovered, if it exists
1665  * 3. else pick any TG_SLOW target, if it exists
1666  * 4. else pick a TG_DEAD target that has recovered, if it exists
1667  * 5. else pick any TG_DEAD target, if it exists
1668  * 6. else return null
1669  */
1670 static struct target *
1671 target_select_best(struct phyint_instance *pii)
1672 {
1673 	struct target *tg;
1674 	struct target *slow = NULL;
1675 	struct target *dead = NULL;
1676 	struct target *slow_recovered = NULL;
1677 	struct target *dead_recovered = NULL;
1678 	hrtime_t now;
1679 
1680 	now = gethrtime();
1681 
1682 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1683 		assert(TG_STATUS_VALID(tg->tg_status));
1684 
1685 		switch (tg->tg_status) {
1686 		case TG_UNUSED:
1687 			return (tg);
1688 
1689 		case TG_SLOW:
1690 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1691 				slow_recovered = tg;
1692 				/*
1693 				 * Promote the slow_recoverd to unused
1694 				 */
1695 				tg->tg_status = TG_UNUSED;
1696 			} else {
1697 				slow = tg;
1698 			}
1699 			break;
1700 
1701 		case TG_DEAD:
1702 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1703 				dead_recovered = tg;
1704 				/*
1705 				 * Promote the dead_recoverd to slow
1706 				 */
1707 				tg->tg_status = TG_SLOW;
1708 				tg->tg_latime = now;
1709 			} else {
1710 				dead = tg;
1711 			}
1712 			break;
1713 
1714 		default:
1715 			break;
1716 		}
1717 	}
1718 
1719 	if (slow_recovered != NULL)
1720 		return (slow_recovered);
1721 	else if (slow != NULL)
1722 		return (slow);
1723 	else if (dead_recovered != NULL)
1724 		return (dead_recovered);
1725 	else
1726 		return (dead);
1727 }
1728 
1729 /*
1730  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
1731  * that are active, pick the next best below.
1732  */
1733 static void
1734 target_activate_all(struct phyint_instance *pii)
1735 {
1736 	struct target *tg;
1737 
1738 	assert(pii->pii_ntargets == 0);
1739 	assert(pii->pii_target_next == NULL);
1740 	assert(pii->pii_rtt_target_next == NULL);
1741 	assert(pii->pii_targets_are_routers);
1742 
1743 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
1744 		tg = target_select_best(pii);
1745 		if (tg == NULL) {
1746 			/* We are out of targets */
1747 			return;
1748 		}
1749 
1750 		assert(TG_STATUS_VALID(tg->tg_status));
1751 		assert(tg->tg_status != TG_ACTIVE);
1752 		tg->tg_status = TG_ACTIVE;
1753 		pii->pii_ntargets++;
1754 		if (pii->pii_target_next == NULL) {
1755 			pii->pii_target_next = tg;
1756 			pii->pii_rtt_target_next = tg;
1757 		}
1758 	}
1759 }
1760 
1761 static struct target *
1762 target_first(struct phyint_instance *pii)
1763 {
1764 	struct target *tg;
1765 
1766 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1767 		assert(TG_STATUS_VALID(tg->tg_status));
1768 		if (tg->tg_status == TG_ACTIVE)
1769 			break;
1770 	}
1771 
1772 	return (tg);
1773 }
1774 
1775 /*
1776  * Create a default target entry.
1777  */
1778 void
1779 target_create(struct phyint_instance *pii, struct in6_addr addr,
1780     boolean_t is_router)
1781 {
1782 	struct target *tg;
1783 	struct phyint *pi;
1784 	struct logint *li;
1785 
1786 	if (debug & D_TARGET) {
1787 		char abuf[INET6_ADDRSTRLEN];
1788 
1789 		logdebug("target_create(%s %s, %s)\n",
1790 		    AF_STR(pii->pii_af), pii->pii_name,
1791 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1792 	}
1793 
1794 	/*
1795 	 * If the test address is not yet initialized, do not add
1796 	 * any target, since we cannot determine whether the target
1797 	 * belongs to the same subnet as the test address.
1798 	 */
1799 	li = pii->pii_probe_logint;
1800 	if (li == NULL)
1801 		return;
1802 
1803 	/*
1804 	 * If there are multiple subnets associated with an interface, then
1805 	 * add the target to this phyint instance, only if it belongs to the
1806 	 * same subnet as the test address. The reason is that interface
1807 	 * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER
1808 	 * addresses, will disappear after failover, and the targets will not
1809 	 * be reachable from this interface.
1810 	 */
1811 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
1812 		return;
1813 
1814 	if (pii->pii_targets != NULL) {
1815 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1816 		if (is_router) {
1817 			if (!pii->pii_targets_are_routers) {
1818 				/*
1819 				 * Prefer router over hosts. Using hosts is a
1820 				 * fallback mechanism, hence delete all host
1821 				 * targets.
1822 				 */
1823 				while (pii->pii_targets != NULL)
1824 					target_delete(pii->pii_targets);
1825 			}
1826 		} else {
1827 			/*
1828 			 * Routers take precedence over hosts. If this
1829 			 * is a router list and we are trying to add a
1830 			 * host, just return. If this is a host list
1831 			 * and if we have sufficient targets, just return
1832 			 */
1833 			if (pii->pii_targets_are_routers ||
1834 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
1835 				return;
1836 		}
1837 	}
1838 
1839 	tg = calloc(1, sizeof (struct target));
1840 	if (tg == NULL) {
1841 		logperror("target_create: calloc");
1842 		return;
1843 	}
1844 
1845 	tg->tg_phyint_inst = pii;
1846 	tg->tg_address = addr;
1847 	tg->tg_in_use = 1;
1848 	tg->tg_rtt_sa = -1;
1849 	tg->tg_num_deferred = 0;
1850 
1851 	/*
1852 	 * If this is the first target, set 'pii_targets_are_routers'
1853 	 * The list of targets is either a list of hosts or list or
1854 	 * routers, but not a mix.
1855 	 */
1856 	if (pii->pii_targets == NULL) {
1857 		assert(pii->pii_ntargets == 0);
1858 		assert(pii->pii_target_next == NULL);
1859 		assert(pii->pii_rtt_target_next == NULL);
1860 		pii->pii_targets_are_routers = is_router ? 1 : 0;
1861 	}
1862 
1863 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
1864 		assert(pii->pii_targets_are_routers);
1865 		assert(pii->pii_target_next != NULL);
1866 		assert(pii->pii_rtt_target_next != NULL);
1867 		tg->tg_status = TG_UNUSED;
1868 	} else {
1869 		if (pii->pii_ntargets == 0) {
1870 			assert(pii->pii_target_next == NULL);
1871 			pii->pii_target_next = tg;
1872 			pii->pii_rtt_target_next = tg;
1873 		}
1874 		pii->pii_ntargets++;
1875 		tg->tg_status = TG_ACTIVE;
1876 	}
1877 
1878 	target_insert(pii, tg);
1879 
1880 	/*
1881 	 * Change to running state, if this phyint instance is capable of
1882 	 * sending and receiving probes. i.e if we know of at least 1 target,
1883 	 * and this phyint instance socket is bound to the IFF_NOFAILOVER
1884 	 * address. More details in phyint state diagram in probe.c.
1885 	 */
1886 	pi = pii->pii_phyint;
1887 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
1888 		if (pi->pi_flags & IFF_FAILED)
1889 			phyint_chstate(pi, PI_FAILED);
1890 		else
1891 			phyint_chstate(pi, PI_RUNNING);
1892 	}
1893 }
1894 
1895 /*
1896  * Add the target address named by `addr' to phyint instance `pii' if it does
1897  * not already exist.  If the target is a router, `is_router' should be set to
1898  * B_TRUE.
1899  */
1900 void
1901 target_add(struct phyint_instance *pii, struct in6_addr addr,
1902     boolean_t is_router)
1903 {
1904 	struct target *tg;
1905 
1906 	if (pii == NULL)
1907 		return;
1908 
1909 	tg = target_lookup(pii, addr);
1910 
1911 	/*
1912 	 * If the target does not exist, create it; target_create() will set
1913 	 * tg_in_use to true.  If it exists already, and it is a router
1914 	 * target, set tg_in_use to to true, so that init_router_targets()
1915 	 * won't delete it
1916 	 */
1917 	if (tg == NULL)
1918 		target_create(pii, addr, is_router);
1919 	else if (is_router)
1920 		tg->tg_in_use = 1;
1921 }
1922 
1923 /*
1924  * Insert target at head of linked list of targets for the associated
1925  * phyint instance
1926  */
1927 static void
1928 target_insert(struct phyint_instance *pii, struct target *tg)
1929 {
1930 	tg->tg_next = pii->pii_targets;
1931 	tg->tg_prev = NULL;
1932 	if (tg->tg_next != NULL)
1933 		tg->tg_next->tg_prev = tg;
1934 	pii->pii_targets = tg;
1935 }
1936 
1937 /*
1938  * Delete a target (unlink and free).
1939  */
1940 void
1941 target_delete(struct target *tg)
1942 {
1943 	int af;
1944 	struct phyint_instance	*pii;
1945 	struct phyint_instance	*pii_other;
1946 
1947 	pii = tg->tg_phyint_inst;
1948 	af = pii->pii_af;
1949 
1950 	if (debug & D_TARGET) {
1951 		char abuf[INET6_ADDRSTRLEN];
1952 
1953 		logdebug("target_delete(%s %s, %s)\n",
1954 		    AF_STR(af), pii->pii_name,
1955 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
1956 	}
1957 
1958 	/*
1959 	 * Target must be in the list of targets for this phyint
1960 	 * instance.
1961 	 */
1962 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1963 
1964 	/*
1965 	 * Reset all references to 'tg' in the probe information
1966 	 * for this phyint.
1967 	 */
1968 	reset_pii_probes(pii, tg);
1969 
1970 	/*
1971 	 * Remove this target from the list of targets of this
1972 	 * phyint instance.
1973 	 */
1974 	if (tg->tg_prev == NULL) {
1975 		pii->pii_targets = tg->tg_next;
1976 	} else {
1977 		tg->tg_prev->tg_next = tg->tg_next;
1978 	}
1979 
1980 	if (tg->tg_next != NULL)
1981 		tg->tg_next->tg_prev = tg->tg_prev;
1982 
1983 	tg->tg_next = NULL;
1984 	tg->tg_prev = NULL;
1985 
1986 	if (tg->tg_status == TG_ACTIVE)
1987 		pii->pii_ntargets--;
1988 
1989 	/*
1990 	 * Adjust the next target to probe, if it points to
1991 	 * to the currently deleted target.
1992 	 */
1993 	if (pii->pii_target_next == tg)
1994 		pii->pii_target_next = target_first(pii);
1995 
1996 	if (pii->pii_rtt_target_next == tg)
1997 		pii->pii_rtt_target_next = target_first(pii);
1998 
1999 	free(tg);
2000 
2001 	/*
2002 	 * The number of active targets pii_ntargets == 0 iff
2003 	 * the next active target pii->pii_target_next == NULL
2004 	 */
2005 	if (pii->pii_ntargets != 0) {
2006 		assert(pii->pii_target_next != NULL);
2007 		assert(pii->pii_rtt_target_next != NULL);
2008 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2009 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2010 		return;
2011 	}
2012 
2013 	/* At this point, we don't have any active targets. */
2014 	assert(pii->pii_target_next == NULL);
2015 	assert(pii->pii_rtt_target_next == NULL);
2016 
2017 	if (pii->pii_targets_are_routers) {
2018 		/*
2019 		 * Activate any TG_SLOW or TG_DEAD router targets,
2020 		 * since we don't have any other targets
2021 		 */
2022 		target_activate_all(pii);
2023 
2024 		if (pii->pii_ntargets != 0) {
2025 			assert(pii->pii_target_next != NULL);
2026 			assert(pii->pii_rtt_target_next != NULL);
2027 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2028 			assert(pii->pii_rtt_target_next->tg_status ==
2029 			    TG_ACTIVE);
2030 			return;
2031 		}
2032 	}
2033 
2034 	/*
2035 	 * If we still don't have any active targets, the list must
2036 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2037 	 * targets. Zero out the probe stats since it will not be
2038 	 * relevant any longer.
2039 	 */
2040 	assert(pii->pii_targets == NULL);
2041 	clear_pii_probe_stats(pii);
2042 	pii_other = phyint_inst_other(pii);
2043 
2044 	/*
2045 	 * If there are no targets on both instances,
2046 	 * go back to PI_NOTARGETS state, since we cannot
2047 	 * probe this phyint any more. For more details,
2048 	 * please see phyint state diagram in mpd_probe.c.
2049 	 */
2050 	if (!PROBE_CAPABLE(pii_other))
2051 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2052 }
2053 
2054 /*
2055  * Flush the target list of every phyint in the group, if the list
2056  * is a host target list. This is called if group failure is suspected.
2057  * If all targets have failed, multicast will subsequently discover new
2058  * targets. Else it is a group failure.
2059  * Note: This function is a no-op if the list is a router target list.
2060  */
2061 static void
2062 target_flush_hosts(struct phyint_group *pg)
2063 {
2064 	struct phyint *pi;
2065 	struct phyint_instance *pii;
2066 
2067 	if (debug & D_TARGET)
2068 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2069 
2070 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2071 		pii = pi->pi_v4;
2072 		if (pii != NULL && !pii->pii_targets_are_routers) {
2073 			/*
2074 			 * Delete all the targets. When the list becomes
2075 			 * empty, target_delete() will set pii->pii_targets
2076 			 * to NULL.
2077 			 */
2078 			while (pii->pii_targets != NULL)
2079 				target_delete(pii->pii_targets);
2080 		}
2081 		pii = pi->pi_v6;
2082 		if (pii != NULL && !pii->pii_targets_are_routers) {
2083 			/*
2084 			 * Delete all the targets. When the list becomes
2085 			 * empty, target_delete() will set pii->pii_targets
2086 			 * to NULL.
2087 			 */
2088 			while (pii->pii_targets != NULL)
2089 				target_delete(pii->pii_targets);
2090 		}
2091 	}
2092 }
2093 
2094 /*
2095  * Reset all references to 'target' in the probe info, as this target is
2096  * being deleted. The pr_target field is guaranteed to be non-null if
2097  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2098  * pr_target will not be accessed unconditionally.
2099  */
2100 static void
2101 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2102 {
2103 	int i;
2104 
2105 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2106 		if (pii->pii_probes[i].pr_target == tg) {
2107 			pii->pii_probes[i].pr_target = NULL;
2108 			if (pii->pii_probes[i].pr_status == PR_UNACKED)
2109 				pii->pii_probes[i].pr_status = PR_LOST;
2110 		}
2111 	}
2112 
2113 }
2114 
2115 /*
2116  * Clear the probe statistics array.
2117  */
2118 void
2119 clear_pii_probe_stats(struct phyint_instance *pii)
2120 {
2121 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2122 	/* Reset the next probe index in the probe stats array */
2123 	pii->pii_probe_next = 0;
2124 }
2125 
2126 static void
2127 target_print(struct target *tg)
2128 {
2129 	char	abuf[INET6_ADDRSTRLEN];
2130 	char	buf[128];
2131 	char	buf2[128];
2132 	int	af;
2133 	int	i;
2134 
2135 	af = tg->tg_phyint_inst->pii_af;
2136 
2137 	logdebug("Target on %s %s addr %s\n"
2138 	    "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n",
2139 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2140 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2141 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2142 	    tg->tg_crtt, tg->tg_in_use);
2143 
2144 	buf[0] = '\0';
2145 	for (i = 0; i < tg->tg_num_deferred; i++) {
2146 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2147 		    tg->tg_deferred[i]);
2148 		(void) strlcat(buf, buf2, sizeof (buf));
2149 	}
2150 	logdebug("deferred rtts:%s\n", buf);
2151 }
2152 
2153 void
2154 phyint_inst_print_all(void)
2155 {
2156 	struct phyint_instance *pii;
2157 
2158 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2159 		phyint_inst_print(pii);
2160 	}
2161 }
2162 
/*
 * Convert the prefix length `masklen' into the corresponding IPv6 netmask,
 * stored in `*bitmask': the `masklen' leftmost bits are one and all the
 * remaining bits are zero.  `masklen' must not exceed IPV6_ABITS.
 */
static void
ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask)
{
	uint_t	bitsleft = masklen;
	int	octet = 0;

	assert(masklen <= IPV6_ABITS);
	bzero((char *)bitmask, sizeof (*bitmask));

	/* Fill whole octets while more than one octet's worth remains. */
	while (bitsleft > 8) {
		bitmask->s6_addr[octet] = 0xff;
		octet++;
		bitsleft -= 8;
	}

	/*
	 * Between 0 and 8 bits are left for this octet.  With 0 bits the
	 * shifted value is 0xff00, which the octet-wide store reduces to 0.
	 */
	bitmask->s6_addr[octet] = 0xff << (8 - bitsleft);
}
2181 
/*
 * Report whether the first `prefix_len' bits of `p1' and `p2' are the same.
 * Returns _B_TRUE if they are, _B_FALSE if they differ or if `prefix_len'
 * is negative or larger than IPV6_ABITS.
 */
static boolean_t
prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len)
{
	uchar_t	tailmask;
	int	bitsleft = prefix_len;
	int	octet = 0;

	if (bitsleft < 0 || bitsleft > IPV6_ABITS)
		return (_B_FALSE);

	/* Whole octets are compared directly. */
	while (bitsleft > 8) {
		if (p1.s6_addr[octet] != p2.s6_addr[octet])
			return (_B_FALSE);
		octet++;
		bitsleft -= 8;
	}

	/*
	 * The last octet contributes its `bitsleft' (0 to 8) leftmost bits;
	 * the prefixes match if the two octets do not differ in any of them.
	 */
	tailmask = 0xff << (8 - bitsleft);
	if (((p1.s6_addr[octet] ^ p2.s6_addr[octet]) & tailmask) != 0)
		return (_B_FALSE);

	return (_B_TRUE);
}
2206 
2207 /*
2208  * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both
2209  * IPv4 and IPv6 put together. The phyint with the least such number
2210  * will be used as the failover destination, if no standby interface is
2211  * available
2212  */
2213 int
2214 logint_upcount(struct phyint *pi)
2215 {
2216 	struct	logint	*li;
2217 	struct	phyint_instance *pii;
2218 	int count = 0;
2219 
2220 	pii = pi->pi_v4;
2221 	if (pii != NULL) {
2222 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
2223 			if ((li->li_flags &
2224 			    (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
2225 				count++;
2226 			}
2227 		}
2228 	}
2229 
2230 	pii = pi->pi_v6;
2231 	if (pii != NULL) {
2232 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
2233 			if ((li->li_flags &
2234 			    (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
2235 				count++;
2236 			}
2237 		}
2238 	}
2239 
2240 	return (count);
2241 }
2242 
2243 /*
2244  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2245  */
2246 struct phyint_instance *
2247 phyint_inst_other(struct phyint_instance *pii)
2248 {
2249 	if (pii->pii_af == AF_INET)
2250 		return (pii->pii_phyint->pi_v6);
2251 	else
2252 		return (pii->pii_phyint->pi_v4);
2253 }
2254 
2255 /*
2256  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2257  * Before sending the event, it prepends the current version of the IPMP
2258  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2259  * `nvl' is freed).
2260  */
2261 static int
2262 post_event(const char *subclass, nvlist_t *nvl)
2263 {
2264 	sysevent_id_t eid;
2265 
2266 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2267 	    IPMP_EVENT_CUR_VERSION);
2268 	if (errno != 0) {
2269 		logerr("cannot create `%s' event: %s", subclass,
2270 		    strerror(errno));
2271 		goto failed;
2272 	}
2273 
2274 	if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR,
2275 	    "in.mpathd", nvl, &eid) == -1) {
2276 		logerr("cannot send `%s' event: %s\n", subclass,
2277 		    strerror(errno));
2278 		goto failed;
2279 	}
2280 
2281 	nvlist_free(nvl);
2282 	return (0);
2283 failed:
2284 	nvlist_free(nvl);
2285 	return (-1);
2286 }
2287 
2288 /*
2289  * Return the external IPMP state associated with phyint `pi'.
2290  */
2291 static ipmp_if_state_t
2292 ifstate(struct phyint *pi)
2293 {
2294 	switch (pi->pi_state) {
2295 	case PI_NOTARGETS:
2296 		return (IPMP_IF_UNKNOWN);
2297 
2298 	case PI_OFFLINE:
2299 		return (IPMP_IF_OFFLINE);
2300 
2301 	case PI_FAILED:
2302 		return (IPMP_IF_FAILED);
2303 
2304 	case PI_RUNNING:
2305 		return (IPMP_IF_OK);
2306 	}
2307 
2308 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2309 	abort();
2310 	/* NOTREACHED */
2311 }
2312 
2313 /*
2314  * Return the external IPMP interface type associated with phyint `pi'.
2315  */
2316 static ipmp_if_type_t
2317 iftype(struct phyint *pi)
2318 {
2319 	if (pi->pi_flags & IFF_STANDBY)
2320 		return (IPMP_IF_STANDBY);
2321 	else
2322 		return (IPMP_IF_NORMAL);
2323 }
2324 
2325 /*
2326  * Return the external IPMP group state associated with phyint group `pg'.
2327  */
2328 static ipmp_group_state_t
2329 groupstate(struct phyint_group *pg)
2330 {
2331 	return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK);
2332 }
2333 
2334 /*
2335  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2336  * Returns 0 on success, -1 on failure.
2337  */
2338 static int
2339 phyint_group_state_event(struct phyint_group *pg)
2340 {
2341 	nvlist_t	*nvl;
2342 
2343 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2344 	if (errno != 0) {
2345 		logperror("cannot create `group state change' event");
2346 		return (-1);
2347 	}
2348 
2349 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2350 	if (errno != 0)
2351 		goto failed;
2352 
2353 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2354 	if (errno != 0)
2355 		goto failed;
2356 
2357 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2358 	if (errno != 0)
2359 		goto failed;
2360 
2361 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2362 failed:
2363 	logperror("cannot create `group state change' event");
2364 	nvlist_free(nvl);
2365 	return (-1);
2366 }
2367 
2368 /*
2369  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2370  * `pg'.  Returns 0 on success, -1 on failure.
2371  */
2372 static int
2373 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2374 {
2375 	nvlist_t *nvl;
2376 
2377 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2378 	if (errno != 0) {
2379 		logperror("cannot create `group change' event");
2380 		return (-1);
2381 	}
2382 
2383 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2384 	if (errno != 0)
2385 		goto failed;
2386 
2387 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2388 	if (errno != 0)
2389 		goto failed;
2390 
2391 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2392 	    phyint_grouplistsig);
2393 	if (errno != 0)
2394 		goto failed;
2395 
2396 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2397 	if (errno != 0)
2398 		goto failed;
2399 
2400 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2401 failed:
2402 	logperror("cannot create `group change' event");
2403 	nvlist_free(nvl);
2404 	return (-1);
2405 }
2406 
2407 /*
2408  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2409  * group `pg'.	Returns 0 on success, -1 on failure.
2410  */
2411 static int
2412 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
2413     ipmp_if_op_t op)
2414 {
2415 	nvlist_t *nvl;
2416 
2417 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2418 	if (errno != 0) {
2419 		logperror("cannot create `group member change' event");
2420 		return (-1);
2421 	}
2422 
2423 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2424 	if (errno != 0)
2425 		goto failed;
2426 
2427 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2428 	if (errno != 0)
2429 		goto failed;
2430 
2431 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
2432 	if (errno != 0)
2433 		goto failed;
2434 
2435 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
2436 	if (errno != 0)
2437 		goto failed;
2438 
2439 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
2440 	if (errno != 0)
2441 		goto failed;
2442 
2443 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
2444 	if (errno != 0)
2445 		goto failed;
2446 
2447 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
2448 failed:
2449 	logperror("cannot create `group member change' event");
2450 	nvlist_free(nvl);
2451 	return (-1);
2452 
2453 }
2454 
2455 /*
2456  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
2457  * Returns 0 on success, -1 on failure.
2458  */
2459 static int
2460 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
2461 {
2462 	nvlist_t *nvl;
2463 
2464 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2465 	if (errno != 0) {
2466 		logperror("cannot create `interface change' event");
2467 		return (-1);
2468 	}
2469 
2470 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2471 	if (errno != 0)
2472 		goto failed;
2473 
2474 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2475 	if (errno != 0)
2476 		goto failed;
2477 
2478 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
2479 	if (errno != 0)
2480 		goto failed;
2481 
2482 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
2483 	if (errno != 0)
2484 		goto failed;
2485 
2486 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
2487 	if (errno != 0)
2488 		goto failed;
2489 
2490 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
2491 failed:
2492 	logperror("cannot create `interface change' event");
2493 	nvlist_free(nvl);
2494 	return (-1);
2495 
2496 }
2497 
/*
 * Generate a fresh signature.  A signature is conceptually split into a
 * random 16-bit "generation number" (the upper 16 bits) and a 48-bit
 * monotonically increasing integer (the lower 48 bits).  The generation
 * number protects against stale updates to entities (e.g., IPMP groups)
 * that have been deleted and since recreated.  The value returned here has
 * its 48-bit part set to 1; this function does not advance it.
 */
static uint64_t
gensig(void)
{
	static int seeded = 0;
	uint64_t generation;

	/* Seed the generator once, on the first call. */
	if (!seeded) {
		srand48((long)gethrtime());
		seeded = 1;
	}

	/* Only the low 16 bits of the random value survive the shift. */
	generation = (uint64_t)lrand48();
	return ((generation << 48) | 1);
}
2517 
2518 /*
2519  * Store the information associated with group `grname' into a dynamically
2520  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
2521  */
2522 unsigned int
2523 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
2524 {
2525 	struct phyint_group	*pg;
2526 	struct phyint		*pi;
2527 	char			(*ifs)[LIFNAMSIZ];
2528 	unsigned int		nif, i;
2529 
2530 	pg = phyint_group_lookup(grname);
2531 	if (pg == NULL)
2532 		return (IPMP_EUNKGROUP);
2533 
2534 	/*
2535 	 * Tally up the number of interfaces, allocate an array to hold them,
2536 	 * and insert their names into the array.
2537 	 */
2538 	for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
2539 		nif++;
2540 
2541 	ifs = alloca(nif * sizeof (*ifs));
2542 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
2543 		assert(i < nif);
2544 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
2545 	}
2546 	assert(i == nif);
2547 
2548 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig,
2549 	    groupstate(pg), nif, ifs);
2550 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2551 }
2552 
2553 /*
2554  * Store the information associated with interface `ifname' into a dynamically
2555  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
2556  */
2557 unsigned int
2558 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
2559 {
2560 	struct phyint	*pi;
2561 
2562 	pi = phyint_lookup(ifname);
2563 	if (pi == NULL)
2564 		return (IPMP_EUNKIF);
2565 
2566 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
2567 	    ifstate(pi), iftype(pi));
2568 	return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2569 }
2570 
2571 /*
2572  * Store the current list of IPMP groups into a dynamically allocated
2573  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
2574  */
2575 unsigned int
2576 getgrouplist(ipmp_grouplist_t **grlistpp)
2577 {
2578 	struct phyint_group	*pg;
2579 	char			(*groups)[LIFGRNAMSIZ];
2580 	unsigned int		i, ngroup;
2581 
2582 	/*
2583 	 * Tally up the number of groups, allocate an array to hold them, and
2584 	 * insert their names into the array.
2585 	 */
2586 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
2587 		ngroup++;
2588 
2589 	groups = alloca(ngroup * sizeof (*groups));
2590 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
2591 		assert(i < ngroup);
2592 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
2593 	}
2594 	assert(i == ngroup);
2595 
2596 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
2597 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2598 }
2599 
2600 /*
2601  * Store a snapshot of the IPMP subsystem into a dynamically allocated
2602  * structure pointed to by `*snapp'.  Returns an IPMP error code.
2603  */
2604 unsigned int
2605 getsnap(ipmp_snap_t **snapp)
2606 {
2607 	ipmp_grouplist_t	*grlistp;
2608 	ipmp_groupinfo_t	*grinfop;
2609 	ipmp_ifinfo_t		*ifinfop;
2610 	ipmp_snap_t		*snap;
2611 	struct phyint		*pi;
2612 	unsigned int		i;
2613 	int			retval;
2614 
2615 	snap = ipmp_snap_create();
2616 	if (snap == NULL)
2617 		return (IPMP_ENOMEM);
2618 
2619 	/*
2620 	 * Add group list.
2621 	 */
2622 	retval = getgrouplist(&snap->sn_grlistp);
2623 	if (retval != IPMP_SUCCESS) {
2624 		ipmp_snap_free(snap);
2625 		return (retval);
2626 	}
2627 
2628 	/*
2629 	 * Add information for each group in the list.
2630 	 */
2631 	grlistp = snap->sn_grlistp;
2632 	for (i = 0; i < grlistp->gl_ngroup; i++) {
2633 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
2634 		if (retval != IPMP_SUCCESS) {
2635 			ipmp_snap_free(snap);
2636 			return (retval);
2637 		}
2638 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
2639 		if (retval != IPMP_SUCCESS) {
2640 			ipmp_freegroupinfo(grinfop);
2641 			ipmp_snap_free(snap);
2642 			return (retval);
2643 		}
2644 	}
2645 
2646 	/*
2647 	 * Add information for each configured phyint.
2648 	 */
2649 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2650 		retval = getifinfo(pi->pi_name, &ifinfop);
2651 		if (retval != IPMP_SUCCESS) {
2652 			ipmp_snap_free(snap);
2653 			return (retval);
2654 		}
2655 		retval = ipmp_snap_addifinfo(snap, ifinfop);
2656 		if (retval != IPMP_SUCCESS) {
2657 			ipmp_freeifinfo(ifinfop);
2658 			ipmp_snap_free(snap);
2659 			return (retval);
2660 		}
2661 	}
2662 
2663 	*snapp = snap;
2664 	return (IPMP_SUCCESS);
2665 }
2666