xref: /titanic_44/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision f88012516de80b8841b4dd45ea1b3b2e15dc47d0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
/*
 * Global list of phyints, phyint instances, phyint groups and the anonymous
 * group; the latter is initialized in phyint_init().
 *
 * Each list pointer refers to the head of its list; the insert routines in
 * this file (phyint_insert(), phyint_inst_insert(), phyint_group_insert())
 * always add new entries at the head.  The anonymous group is only created
 * by phyint_init() when `track_all_phyints' is set.
 */
struct phyint *phyints = NULL;
struct phyint_instance	*phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;
struct phyint_group *phyint_anongroup;

/*
 * Grouplist signature; initialized in phyint_init().
 * It is seeded from gensig() and incremented each time a group is inserted
 * into, or deleted from, the list of all phyint groups.
 */
static uint64_t phyint_grouplistsig;
44 
/*
 * Forward declarations for the file-local helpers, grouped by the kind of
 * object they operate on.
 */

/* Phyint instances */
static void phyint_inst_insert(struct phyint_instance *pii);
static void phyint_inst_print(struct phyint_instance *pii);

/* Phyints */
static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
static void phyint_delete(struct phyint *pi);

/* Phyint groups */
static void phyint_group_insert(struct phyint_group *pg);
static void phyint_group_delete(struct phyint_group *pg);
static struct phyint_group *phyint_group_lookup(const char *pg_name);
static struct phyint_group *phyint_group_create(const char *pg_name);

/* Logical interfaces */
static void logint_print(struct logint *li);
static void logint_insert(struct phyint_instance *pii, struct logint *li);
static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);

/* Probe targets */
static void target_print(struct target *tg);
static void target_insert(struct phyint_instance *pii, struct target *tg);
static struct target *target_first(struct phyint_instance *pii);
static struct target *target_select_best(struct phyint_instance *pii);
static void target_flush_hosts(struct phyint_group *pg);

static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);

/* Address-family specific halves of phyint_inst_sockinit() */
static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);

/* Prefix/mask helpers */
static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask);
static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2,
    int prefix_len);

/* Event generation */
static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
static int phyint_group_state_event(struct phyint_group *pg);
static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op);

/* Signature generation */
static uint64_t gensig(void);
82 
83 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
84 int
85 phyint_init(void)
86 {
87 	phyint_grouplistsig = gensig();
88 	if (track_all_phyints) {
89 		phyint_anongroup = phyint_group_create("");
90 		if (phyint_anongroup == NULL)
91 			return (-1);
92 		phyint_group_insert(phyint_anongroup);
93 	}
94 	return (0);
95 }
96 
97 /* Return the phyint with the given name */
98 struct phyint *
99 phyint_lookup(const char *name)
100 {
101 	struct phyint *pi;
102 
103 	if (debug & D_PHYINT)
104 		logdebug("phyint_lookup(%s)\n", name);
105 
106 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
107 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
108 			break;
109 	}
110 	return (pi);
111 }
112 
113 /* Return the phyint instance with the given name and the given family */
114 struct phyint_instance *
115 phyint_inst_lookup(int af, char *name)
116 {
117 	struct phyint *pi;
118 
119 	if (debug & D_PHYINT)
120 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
121 
122 	assert(af == AF_INET || af == AF_INET6);
123 
124 	pi = phyint_lookup(name);
125 	if (pi == NULL)
126 		return (NULL);
127 
128 	return (PHYINT_INSTANCE(pi, af));
129 }
130 
131 static struct phyint_group *
132 phyint_group_lookup(const char *pg_name)
133 {
134 	struct phyint_group *pg;
135 
136 	if (debug & D_PHYINT)
137 		logdebug("phyint_group_lookup(%s)\n", pg_name);
138 
139 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
140 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
141 			break;
142 	}
143 	return (pg);
144 }
145 
146 /*
147  * Insert the phyint in the linked list of all phyints. If the phyint belongs
148  * to some group, insert it in the phyint group list.
149  */
150 static void
151 phyint_insert(struct phyint *pi, struct phyint_group *pg)
152 {
153 	if (debug & D_PHYINT)
154 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
155 
156 	/* Insert the phyint at the head of the 'all phyints' list */
157 	pi->pi_next = phyints;
158 	pi->pi_prev = NULL;
159 	if (phyints != NULL)
160 		phyints->pi_prev = pi;
161 	phyints = pi;
162 
163 	/*
164 	 * Insert the phyint at the head of the 'phyint_group members' list
165 	 * of the phyint group to which it belongs.
166 	 */
167 	pi->pi_pgnext = NULL;
168 	pi->pi_pgprev = NULL;
169 	pi->pi_group = pg;
170 
171 	pi->pi_pgnext = pg->pg_phyint;
172 	if (pi->pi_pgnext != NULL)
173 		pi->pi_pgnext->pi_pgprev = pi;
174 	pg->pg_phyint = pi;
175 
176 	pg->pg_sig++;
177 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
178 }
179 
180 /* Insert the phyint instance in the linked list of all phyint instances. */
181 static void
182 phyint_inst_insert(struct phyint_instance *pii)
183 {
184 	if (debug & D_PHYINT) {
185 		logdebug("phyint_inst_insert(%s %s)\n",
186 		    AF_STR(pii->pii_af), pii->pii_name);
187 	}
188 
189 	/*
190 	 * Insert the phyint at the head of the 'all phyint instances' list.
191 	 */
192 	pii->pii_next = phyint_instances;
193 	pii->pii_prev = NULL;
194 	if (phyint_instances != NULL)
195 		phyint_instances->pii_prev = pii;
196 	phyint_instances = pii;
197 }
198 
199 /*
200  * Create a new phyint with the given parameters. Also insert it into
201  * the list of all phyints and the list of phyint group members by calling
202  * phyint_insert().
203  */
204 static struct phyint *
205 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
206     uint64_t flags)
207 {
208 	struct phyint *pi;
209 
210 	pi = calloc(1, sizeof (struct phyint));
211 	if (pi == NULL) {
212 		logperror("phyint_create: calloc");
213 		return (NULL);
214 	}
215 
216 	/*
217 	 * Record the phyint values. Also insert the phyint into the
218 	 * phyint group by calling phyint_insert().
219 	 */
220 	(void) strncpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
221 	pi->pi_name[sizeof (pi->pi_name) - 1] = '\0';
222 	pi->pi_ifindex = ifindex;
223 	pi->pi_icmpid =
224 	    htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF));
225 	/*
226 	 * We optimistically start in the PI_RUNNING state.  Later (in
227 	 * process_link_state_changes()), we will readjust this to match the
228 	 * current state of the link.  Further, if test addresses are
229 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
230 	 * either PI_RUNNING or PI_FAILED, depending on the result of the test
231 	 * probes.
232 	 */
233 	pi->pi_state = PI_RUNNING;
234 	pi->pi_flags = PHYINT_FLAGS(flags);
235 	/*
236 	 * Initialise the link state.  The link state is initialised to
237 	 * up, so that if the link is down when IPMP starts monitoring
238 	 * the interface, it will appear as though there has been a
239 	 * transition from the link up to link down.  This avoids
240 	 * having to treat this situation as a special case.
241 	 */
242 	INIT_LINK_STATE(pi);
243 
244 	/*
245 	 * Insert the phyint in the list of all phyints, and the
246 	 * list of phyint group members
247 	 */
248 	phyint_insert(pi, pg);
249 
250 	/*
251 	 * If we are joining a failed group, mark the interface as
252 	 * failed.
253 	 */
254 	if (GROUP_FAILED(pg))
255 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
256 
257 	return (pi);
258 }
259 
260 /*
261  * Create a new phyint instance belonging to the phyint 'pi' and address
262  * family 'af'. Also insert it into the list of all phyint instances by
263  * calling phyint_inst_insert().
264  */
265 static struct phyint_instance *
266 phyint_inst_create(struct phyint *pi, int af)
267 {
268 	struct phyint_instance *pii;
269 
270 	pii = calloc(1, sizeof (struct phyint_instance));
271 	if (pii == NULL) {
272 		logperror("phyint_inst_create: calloc");
273 		return (NULL);
274 	}
275 
276 	/*
277 	 * Attach the phyint instance to the phyint.
278 	 * Set the back pointers as well
279 	 */
280 	pii->pii_phyint = pi;
281 	if (af == AF_INET)
282 		pi->pi_v4 = pii;
283 	else
284 		pi->pi_v6 = pii;
285 
286 	pii->pii_in_use = 1;
287 	pii->pii_probe_sock = -1;
288 	pii->pii_snxt = 1;
289 	pii->pii_af = af;
290 	pii->pii_fd_hrtime = gethrtime() +
291 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
292 	pii->pii_flags = pi->pi_flags;
293 
294 	/* Insert the phyint instance in the list of all phyint instances. */
295 	phyint_inst_insert(pii);
296 	return (pii);
297 }
298 
299 /*
300  * Change the state of phyint `pi' to state `state'.
301  */
302 void
303 phyint_chstate(struct phyint *pi, enum pi_state state)
304 {
305 	/*
306 	 * To simplify things, some callers always set a given state
307 	 * regardless of the previous state of the phyint (e.g., setting
308 	 * PI_RUNNING when it's already set).  We shouldn't bother
309 	 * generating an event or consuming a signature for these, since
310 	 * the actual state of the interface is unchanged.
311 	 */
312 	if (pi->pi_state == state)
313 		return;
314 
315 	pi->pi_state = state;
316 	pi->pi_group->pg_sig++;
317 	(void) phyint_state_event(pi->pi_group, pi);
318 }
319 
320 /*
321  * Note that the type of phyint `pi' has changed.
322  */
323 void
324 phyint_newtype(struct phyint *pi)
325 {
326 	pi->pi_group->pg_sig++;
327 	(void) phyint_state_event(pi->pi_group, pi);
328 }
329 
330 /*
331  * Insert the phyint group in the linked list of all phyint groups
332  * at the head of the list
333  */
334 static void
335 phyint_group_insert(struct phyint_group *pg)
336 {
337 	pg->pg_next = phyint_groups;
338 	pg->pg_prev = NULL;
339 	if (phyint_groups != NULL)
340 		phyint_groups->pg_prev = pg;
341 	phyint_groups = pg;
342 
343 	phyint_grouplistsig++;
344 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
345 }
346 
347 /*
348  * Create a new phyint group called 'name'.
349  */
350 static struct phyint_group *
351 phyint_group_create(const char *name)
352 {
353 	struct	phyint_group *pg;
354 
355 	if (debug & D_PHYINT)
356 		logdebug("phyint_group_create(%s)\n", name);
357 
358 	pg = calloc(1, sizeof (struct phyint_group));
359 	if (pg == NULL) {
360 		logperror("phyint_group_create: calloc");
361 		return (NULL);
362 	}
363 
364 	(void) strncpy(pg->pg_name, name, sizeof (pg->pg_name));
365 	pg->pg_name[sizeof (pg->pg_name) - 1] = '\0';
366 	pg->pg_sig = gensig();
367 
368 	pg->pg_fdt = user_failure_detection_time;
369 	pg->pg_probeint = user_probe_interval;
370 
371 	return (pg);
372 }
373 
374 /*
375  * Change the state of the phyint group `pg' to state `state'.
376  */
377 void
378 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
379 {
380 	assert(pg != phyint_anongroup);
381 
382 	switch (state) {
383 	case PG_FAILED:
384 		pg->pg_groupfailed = 1;
385 
386 		/*
387 		 * We can never know with certainty that a group has
388 		 * failed.  It is possible that all known targets have
389 		 * failed simultaneously, and new targets have come up
390 		 * instead. If the targets are routers then router
391 		 * discovery will kick in, and we will see the new routers
392 		 * thru routing socket messages. But if the targets are
393 		 * hosts, we have to discover it by multicast.	So flush
394 		 * all the host targets. The next probe will send out a
395 		 * multicast echo request. If this is a group failure, we
396 		 * will still not see any response, otherwise we will
397 		 * clear the pg_groupfailed flag after we get
398 		 * NUM_PROBE_REPAIRS consecutive unicast replies on any
399 		 * phyint.
400 		 */
401 		target_flush_hosts(pg);
402 		break;
403 
404 	case PG_RUNNING:
405 		pg->pg_groupfailed = 0;
406 		break;
407 
408 	default:
409 		logerr("phyint_group_chstate: invalid group state %d; "
410 		    "aborting\n", state);
411 		abort();
412 	}
413 
414 	pg->pg_sig++;
415 	(void) phyint_group_state_event(pg);
416 }
417 
418 /*
419  * Create a new phyint instance and initialize it from the values supplied by
420  * the kernel. Always check for ENXIO before logging any error, because the
421  * interface could have vanished after completion of SIOCGLIFCONF.
422  * Return values:
423  *	pointer to the phyint instance on success
424  *	NULL on failure Eg. if the phyint instance is not found in the kernel
425  */
426 struct phyint_instance *
427 phyint_inst_init_from_k(int af, char *pi_name)
428 {
429 	char	pg_name[LIFNAMSIZ + 1];
430 	int	ifsock;
431 	uint_t	ifindex;
432 	uint64_t	flags;
433 	struct lifreq	lifr;
434 	struct phyint	*pi;
435 	struct phyint_instance	*pii;
436 	boolean_t	pg_created;
437 	boolean_t	pi_created;
438 	struct phyint_group	*pg;
439 
440 retry:
441 	pii = NULL;
442 	pi = NULL;
443 	pg = NULL;
444 	pi_created = _B_FALSE;
445 	pg_created = _B_FALSE;
446 
447 	if (debug & D_PHYINT) {
448 		logdebug("phyint_inst_init_from_k(%s %s)\n",
449 		    AF_STR(af), pi_name);
450 	}
451 
452 	assert(af == AF_INET || af == AF_INET6);
453 
454 	/* Get the socket for doing ioctls */
455 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
456 
457 	/*
458 	 * Get the interface flags. Ignore loopback and multipoint
459 	 * interfaces.
460 	 */
461 	(void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
462 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
463 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
464 		if (errno != ENXIO) {
465 			logperror("phyint_inst_init_from_k:"
466 			    " ioctl (get flags)");
467 		}
468 		return (NULL);
469 	}
470 	flags = lifr.lifr_flags;
471 	if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK))
472 		return (NULL);
473 
474 	/*
475 	 * Get the ifindex for recording later in our tables, in case we need
476 	 * to create a new phyint.
477 	 */
478 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
479 		if (errno != ENXIO) {
480 			logperror("phyint_inst_init_from_k: "
481 			    " ioctl (get lifindex)");
482 		}
483 		return (NULL);
484 	}
485 	ifindex = lifr.lifr_index;
486 
487 	/*
488 	 * Get the phyint group name of this phyint, from the kernel.
489 	 */
490 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
491 		if (errno != ENXIO) {
492 			logperror("phyint_inst_init_from_k: "
493 			    "ioctl (get group name)");
494 		}
495 		return (NULL);
496 	}
497 	(void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
498 	pg_name[sizeof (pg_name) - 1] = '\0';
499 
500 	/*
501 	 * If the phyint is not part of any group, pg_name is the
502 	 * null string. If 'track_all_phyints' is false, there is no
503 	 * need to create a phyint.
504 	 */
505 	if (pg_name[0] == '\0' && !track_all_phyints) {
506 		/*
507 		 * If the IFF_FAILED or IFF_OFFLINE flags are set, reset
508 		 * them. These flags shouldn't be set if IPMP isn't
509 		 * tracking the interface.
510 		 */
511 		if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) {
512 			lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE);
513 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
514 				if (errno != ENXIO) {
515 					logperror("phyint_inst_init_from_k:"
516 					    " ioctl (set flags)");
517 				}
518 			}
519 		}
520 		return (NULL);
521 	}
522 
523 	/*
524 	 * We need to create a new phyint instance. A phyint instance
525 	 * belongs to a phyint, and the phyint belongs to a phyint group.
526 	 * So we first lookup the 'parents' and if they don't exist then
527 	 * we create them.
528 	 */
529 	pg = phyint_group_lookup(pg_name);
530 	if (pg == NULL) {
531 		pg = phyint_group_create(pg_name);
532 		if (pg == NULL) {
533 			logerr("phyint_inst_init_from_k:"
534 			    " unable to create group %s\n", pg_name);
535 			return (NULL);
536 		}
537 		phyint_group_insert(pg);
538 		pg_created = _B_TRUE;
539 	}
540 
541 	/*
542 	 * Lookup the phyint. If the phyint does not exist create it.
543 	 */
544 	pi = phyint_lookup(pi_name);
545 	if (pi == NULL) {
546 		pi = phyint_create(pi_name, pg, ifindex, flags);
547 		if (pi == NULL) {
548 			logerr("phyint_inst_init_from_k:"
549 			    " unable to create phyint %s\n", pi_name);
550 			if (pg_created)
551 				phyint_group_delete(pg);
552 			return (NULL);
553 		}
554 		pi_created = _B_TRUE;
555 	} else {
556 		/* The phyint exists already. */
557 		assert(pi_created == _B_FALSE);
558 		/*
559 		 * Normally we should see consistent values for the IPv4 and
560 		 * IPv6 instances, for phyint properties. If we don't, it
561 		 * means things have changed underneath us, and we should
562 		 * resync our tables with the kernel. Check whether the
563 		 * interface index has changed. If so, it is most likely
564 		 * the interface has been unplumbed and replumbed,
565 		 * while we are yet to update our tables. Do it now.
566 		 */
567 		if (pi->pi_ifindex != ifindex) {
568 			if (pg_created)
569 				phyint_group_delete(pg);
570 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
571 			goto retry;
572 		}
573 		assert(PHYINT_INSTANCE(pi, af) == NULL);
574 
575 		/*
576 		 * If the group name seen by the IPv4 and IPv6 instances
577 		 * are different, it is most likely the groupname has
578 		 * changed, while we are yet to update our tables. Do it now.
579 		 */
580 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
581 			if (pg_created)
582 				phyint_group_delete(pg);
583 			restore_phyint(pi);
584 			phyint_inst_delete(PHYINT_INSTANCE(pi,
585 			    AF_OTHER(af)));
586 			goto retry;
587 		}
588 	}
589 
590 	/*
591 	 * Create a new phyint instance, corresponding to the 'af'
592 	 * passed in.
593 	 */
594 	pii = phyint_inst_create(pi, af);
595 	if (pii == NULL) {
596 		logerr("phyint_inst_init_from_k: unable to create"
597 		    "phyint inst %s\n", pi->pi_name);
598 		if (pi_created) {
599 			/*
600 			 * Deleting the phyint will delete the phyint group
601 			 * if this is the last phyint in the group.
602 			 */
603 			phyint_delete(pi);
604 		}
605 		return (NULL);
606 	}
607 
608 	return (pii);
609 }
610 
611 /*
612  * Bind pii_probe_sock to the address associated with pii_probe_logint.
613  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
614  * targets. Do the common part in this function, and complete the
615  * initializations by calling the protocol specific functions
616  * phyint_inst_v{4,6}_sockinit() respectively.
617  *
618  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
619  */
620 boolean_t
621 phyint_inst_sockinit(struct phyint_instance *pii)
622 {
623 	boolean_t success;
624 	struct phyint_group *pg;
625 
626 	if (debug & D_PHYINT) {
627 		logdebug("phyint_inst_sockinit(%s %s)\n",
628 		    AF_STR(pii->pii_af), pii->pii_name);
629 	}
630 
631 	assert(pii->pii_probe_logint != NULL);
632 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
633 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
634 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
635 
636 	/*
637 	 * If the socket is already bound, close pii_probe_sock
638 	 */
639 	if (pii->pii_probe_sock != -1)
640 		close_probe_socket(pii, _B_TRUE);
641 
642 	/*
643 	 * If the phyint is not part of a named group and track_all_phyints is
644 	 * false, simply return.
645 	 */
646 	pg = pii->pii_phyint->pi_group;
647 	if (pg == phyint_anongroup && !track_all_phyints) {
648 		if (debug & D_PHYINT)
649 			logdebug("phyint_inst_sockinit: no group\n");
650 		return (_B_FALSE);
651 	}
652 
653 	/*
654 	 * Initialize the socket by calling the protocol specific function.
655 	 * If it succeeds, add the socket to the poll list.
656 	 */
657 	if (pii->pii_af == AF_INET6)
658 		success = phyint_inst_v6_sockinit(pii);
659 	else
660 		success = phyint_inst_v4_sockinit(pii);
661 
662 	if (success && (poll_add(pii->pii_probe_sock) == 0))
663 		return (_B_TRUE);
664 
665 	/* Something failed, cleanup and return false */
666 	if (pii->pii_probe_sock != -1)
667 		close_probe_socket(pii, _B_FALSE);
668 
669 	return (_B_FALSE);
670 }
671 
672 /*
673  * IPv6 specific part in initializing the pii_probe_sock. This socket is
674  * used to send/receive ICMPv6 probe packets.
675  */
676 static boolean_t
677 phyint_inst_v6_sockinit(struct phyint_instance *pii)
678 {
679 	icmp6_filter_t filter;
680 	int hopcount = 1;
681 	int int_op;
682 	struct	sockaddr_in6	testaddr;
683 
684 	/*
685 	 * Open a raw socket with ICMPv6 protocol.
686 	 *
687 	 * Use IPV6_DONTFAILOVER_IF to make sure that probes go out
688 	 * on the specified phyint only, and are not subject to load
689 	 * balancing. Bind to the src address chosen will ensure that
690 	 * the responses are received only on the specified phyint.
691 	 *
692 	 * Set the hopcount to 1 so that probe packets are not routed.
693 	 * Disable multicast loopback. Set the receive filter to
694 	 * receive only ICMPv6 echo replies.
695 	 */
696 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
697 	if (pii->pii_probe_sock < 0) {
698 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
699 		return (_B_FALSE);
700 }
701 
702 	bzero(&testaddr, sizeof (testaddr));
703 	testaddr.sin6_family = AF_INET6;
704 	testaddr.sin6_port = 0;
705 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
706 
707 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
708 	    sizeof (testaddr)) < 0) {
709 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
710 		return (_B_FALSE);
711 	}
712 
713 	/*
714 	 * IPV6_DONTFAILOVER_IF option takes precedence over setting
715 	 * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again.
716 	 */
717 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF,
718 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
719 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
720 		    " IPV6_DONTFAILOVER_IF");
721 		return (_B_FALSE);
722 	}
723 
724 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
725 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
726 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
727 		    " IPV6_UNICAST_HOPS");
728 		return (_B_FALSE);
729 	}
730 
731 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
732 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
733 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
734 		    " IPV6_MULTICAST_HOPS");
735 		return (_B_FALSE);
736 	}
737 
738 	int_op = 0;	/* used to turn off option */
739 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
740 	    (char *)&int_op, sizeof (int_op)) < 0) {
741 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
742 		    " IPV6_MULTICAST_LOOP");
743 		return (_B_FALSE);
744 	}
745 
746 	/*
747 	 * Filter out so that we only receive ICMP echo replies
748 	 */
749 	ICMP6_FILTER_SETBLOCKALL(&filter);
750 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
751 
752 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
753 	    (char *)&filter, sizeof (filter)) < 0) {
754 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
755 		    " ICMP6_FILTER");
756 		return (_B_FALSE);
757 	}
758 
759 	/* Enable receipt of ancillary data */
760 	int_op = 1;
761 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
762 	    (char *)&int_op, sizeof (int_op)) < 0) {
763 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
764 		    " IPV6_RECVHOPLIMIT");
765 		return (_B_FALSE);
766 	}
767 
768 	return (_B_TRUE);
769 }
770 
771 /*
772  * IPv4 specific part in initializing the pii_probe_sock. This socket is
773  * used to send/receive ICMPv4 probe packets.
774  */
775 static boolean_t
776 phyint_inst_v4_sockinit(struct phyint_instance *pii)
777 {
778 	struct sockaddr_in  testaddr;
779 	char	char_op;
780 	int	ttl = 1;
781 	char	char_ttl = 1;
782 
783 	/*
784 	 * Open a raw socket with ICMPv4 protocol.
785 	 *
786 	 * Use IP_DONTFAILOVER_IF to make sure that probes go out
787 	 * on the specified phyint only, and are not subject to load
788 	 * balancing. Bind to the src address chosen will ensure that
789 	 * the responses are received only on the specified phyint.
790 	 *
791 	 * Set the ttl to 1 so that probe packets are not routed.
792 	 * Disable multicast loopback.
793 	 */
794 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
795 	if (pii->pii_probe_sock < 0) {
796 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
797 		return (_B_FALSE);
798 	}
799 
800 	bzero(&testaddr, sizeof (testaddr));
801 	testaddr.sin_family = AF_INET;
802 	testaddr.sin_port = 0;
803 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
804 	    &testaddr.sin_addr);
805 
806 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
807 	    sizeof (testaddr)) < 0) {
808 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
809 		return (_B_FALSE);
810 	}
811 
812 	/*
813 	 * IP_DONTFAILOVER_IF option takes precedence over setting
814 	 * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again.
815 	 */
816 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF,
817 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
818 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
819 		    " IP_DONTFAILOVER");
820 		return (_B_FALSE);
821 	}
822 
823 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
824 	    (char *)&ttl, sizeof (ttl)) < 0) {
825 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
826 		    " IP_TTL");
827 		return (_B_FALSE);
828 	}
829 
830 	char_op = 0;	/* used to turn off option */
831 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
832 	    (char *)&char_op, sizeof (char_op)) == -1) {
833 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
834 		    " IP_MULTICAST_LOOP");
835 		return (_B_FALSE);
836 	}
837 
838 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
839 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
840 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
841 		    " IP_MULTICAST_TTL");
842 		return (_B_FALSE);
843 	}
844 
845 	return (_B_TRUE);
846 }
847 
848 /*
849  * Remove the phyint group from the list of 'all phyint groups'
850  * and free it.
851  */
852 static void
853 phyint_group_delete(struct phyint_group *pg)
854 {
855 	/*
856 	 * The anonymous group always exists, even when empty.
857 	 */
858 	if (pg == phyint_anongroup)
859 		return;
860 
861 	if (debug & D_PHYINT)
862 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
863 
864 	/*
865 	 * The phyint group must be empty, and must not have any phyints.
866 	 * The phyint group must be in the list of all phyint groups
867 	 */
868 	assert(pg->pg_phyint == NULL);
869 	assert(phyint_groups == pg || pg->pg_prev != NULL);
870 
871 	if (pg->pg_prev != NULL)
872 		pg->pg_prev->pg_next = pg->pg_next;
873 	else
874 		phyint_groups = pg->pg_next;
875 
876 	if (pg->pg_next != NULL)
877 		pg->pg_next->pg_prev = pg->pg_prev;
878 
879 	pg->pg_next = NULL;
880 	pg->pg_prev = NULL;
881 
882 	phyint_grouplistsig++;
883 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
884 
885 	free(pg);
886 }
887 
888 /*
889  * Extract information from the kernel about the desired phyint.
890  * Look only for properties of the phyint and not properties of logints.
891  * Take appropriate action on the changes.
892  * Return codes:
893  *	PI_OK
894  *		The phyint exists in the kernel and matches our knowledge
895  *		of the phyint.
896  *	PI_DELETED
897  *		The phyint has vanished in the kernel.
898  *	PI_IFINDEX_CHANGED
899  *		The phyint's interface index has changed.
900  *		Ask the caller to delete and recreate the phyint.
901  *	PI_IOCTL_ERROR
902  *		Some ioctl error. Don't change anything.
903  *	PI_GROUP_CHANGED
904  *		The phyint has changed group.
905  */
906 int
907 phyint_inst_update_from_k(struct phyint_instance *pii)
908 {
909 	struct lifreq lifr;
910 	int	ifsock;
911 	struct phyint *pi;
912 
913 	pi = pii->pii_phyint;
914 
915 	if (debug & D_PHYINT) {
916 		logdebug("phyint_inst_update_from_k(%s %s)\n",
917 		    AF_STR(pii->pii_af), pi->pi_name);
918 	}
919 
920 	/*
921 	 * Get the ifindex from the kernel, for comparison with the
922 	 * value in our tables.
923 	 */
924 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
925 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
926 
927 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
928 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
929 		if (errno == ENXIO) {
930 			return (PI_DELETED);
931 		} else {
932 			logperror_pii(pii, "phyint_inst_update_from_k:"
933 			    " ioctl (get lifindex)");
934 			return (PI_IOCTL_ERROR);
935 		}
936 	}
937 
938 	if (lifr.lifr_index != pi->pi_ifindex) {
939 		/*
940 		 * The index has changed. Most likely the interface has
941 		 * been unplumbed and replumbed. Ask the caller to take
942 		 * appropriate action.
943 		 */
944 		if (debug & D_PHYINT) {
945 			logdebug("phyint_inst_update_from_k:"
946 			    " old index %d new index %d\n",
947 			    pi->pi_ifindex, lifr.lifr_index);
948 		}
949 		return (PI_IFINDEX_CHANGED);
950 	}
951 
952 	/*
953 	 * Get the group name from the kernel, for comparison with
954 	 * the value in our tables.
955 	 */
956 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
957 		if (errno == ENXIO) {
958 			return (PI_DELETED);
959 		} else {
960 			logperror_pii(pii, "phyint_inst_update_from_k:"
961 			    " ioctl (get groupname)");
962 			return (PI_IOCTL_ERROR);
963 		}
964 	}
965 
966 	/*
967 	 * If the phyint has changed group i.e. if the phyint group name
968 	 * returned by the kernel is different, ask the caller to delete
969 	 * and recreate the phyint in the right group
970 	 */
971 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
972 		/* Groupname has changed */
973 		if (debug & D_PHYINT) {
974 			logdebug("phyint_inst_update_from_k:"
975 			    " groupname change\n");
976 		}
977 		return (PI_GROUP_CHANGED);
978 	}
979 
980 	/*
981 	 * Get the current phyint flags from the kernel, and determine what
982 	 * flags have changed by comparing against our tables.	Note that the
983 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
984 	 * that IFF_INACTIVE is really still set on the interface.
985 	 */
986 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
987 		if (errno == ENXIO) {
988 			return (PI_DELETED);
989 		} else {
990 			logperror_pii(pii, "phyint_inst_update_from_k: "
991 			    " ioctl (get flags)");
992 			return (PI_IOCTL_ERROR);
993 		}
994 	}
995 
996 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
997 	if (pi->pi_v4 != NULL)
998 		pi->pi_v4->pii_flags = pi->pi_flags;
999 	if (pi->pi_v6 != NULL)
1000 		pi->pi_v6->pii_flags = pi->pi_flags;
1001 
1002 	if (pi->pi_flags & IFF_FAILED) {
1003 		/*
1004 		 * If we are in the running and full state, we have
1005 		 * completed failbacks successfully and we would have
1006 		 * expected IFF_FAILED to have been clear. That it is
1007 		 * set means there was a race condition. Some other
1008 		 * process turned on the IFF_FAILED flag. Since the
1009 		 * flag setting is not atomic, i.e. a get ioctl followed
1010 		 * by a set ioctl, and since there is no way to set an
1011 		 * individual flag bit, this could have occurred.
1012 		 */
1013 		if (pi->pi_state == PI_RUNNING && pi->pi_full)
1014 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1015 	} else {
1016 		/*
1017 		 * If we are in the failed state, there was a race.
1018 		 * we have completed failover successfully because our
1019 		 * state is failed and empty. Some other process turned
1020 		 * off the IFF_FAILED flag. Same comment as above
1021 		 */
1022 		if (pi->pi_state == PI_FAILED && pi->pi_empty)
1023 			(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1024 	}
1025 
1026 	/* No change in phyint status */
1027 	return (PI_OK);
1028 }
1029 
1030 /*
1031  * Delete the phyint. Remove it from the list of all phyints, and the
1032  * list of phyint group members. If the group becomes empty, delete the
1033  * group also.
1034  */
1035 static void
1036 phyint_delete(struct phyint *pi)
1037 {
1038 	struct phyint_group *pg = pi->pi_group;
1039 
1040 	if (debug & D_PHYINT)
1041 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1042 
1043 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1044 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1045 
1046 	/*
1047 	 * The phyint must belong to a group.
1048 	 */
1049 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1050 
1051 	/* The phyint must be in the list of all phyints */
1052 	assert(phyints == pi || pi->pi_prev != NULL);
1053 
1054 	/* Remove the phyint from the phyint group list */
1055 	pg->pg_sig++;
1056 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1057 
1058 	if (pi->pi_pgprev == NULL) {
1059 		/* Phyint is the 1st in the phyint group list */
1060 		pg->pg_phyint = pi->pi_pgnext;
1061 	} else {
1062 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1063 	}
1064 	if (pi->pi_pgnext != NULL)
1065 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1066 	pi->pi_pgnext = NULL;
1067 	pi->pi_pgprev = NULL;
1068 
1069 	/* Remove the phyint from the global list of phyints */
1070 	if (pi->pi_prev == NULL) {
1071 		/* Phyint is the 1st in the list */
1072 		phyints = pi->pi_next;
1073 	} else {
1074 		pi->pi_prev->pi_next = pi->pi_next;
1075 	}
1076 	if (pi->pi_next != NULL)
1077 		pi->pi_next->pi_prev = pi->pi_prev;
1078 	pi->pi_next = NULL;
1079 	pi->pi_prev = NULL;
1080 
1081 	free(pi);
1082 
1083 	/* Delete the phyint_group if the last phyint has been deleted */
1084 	if (pg->pg_phyint == NULL)
1085 		phyint_group_delete(pg);
1086 }
1087 
1088 /*
1089  * Delete (unlink and free), the phyint instance.
1090  */
1091 void
1092 phyint_inst_delete(struct phyint_instance *pii)
1093 {
1094 	struct phyint *pi = pii->pii_phyint;
1095 
1096 	assert(pi != NULL);
1097 
1098 	if (debug & D_PHYINT) {
1099 		logdebug("phyint_inst_delete(%s %s)\n",
1100 		    AF_STR(pii->pii_af), pi->pi_name);
1101 	}
1102 
1103 	/*
1104 	 * If the phyint instance has associated probe targets
1105 	 * delete all the targets
1106 	 */
1107 	while (pii->pii_targets != NULL)
1108 		target_delete(pii->pii_targets);
1109 
1110 	/*
1111 	 * Delete all the logints associated with this phyint
1112 	 * instance.
1113 	 */
1114 	while (pii->pii_logint != NULL)
1115 		logint_delete(pii->pii_logint);
1116 
1117 	/*
1118 	 * Close the socket used to send probes to targets from this phyint.
1119 	 */
1120 	if (pii->pii_probe_sock != -1)
1121 		close_probe_socket(pii, _B_TRUE);
1122 
1123 	/*
1124 	 * Phyint instance must be in the list of all phyint instances.
1125 	 * Remove phyint instance from the global list of phyint instances.
1126 	 */
1127 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1128 	if (pii->pii_prev == NULL) {
1129 		/* Phyint is the 1st in the list */
1130 		phyint_instances = pii->pii_next;
1131 	} else {
1132 		pii->pii_prev->pii_next = pii->pii_next;
1133 	}
1134 	if (pii->pii_next != NULL)
1135 		pii->pii_next->pii_prev = pii->pii_prev;
1136 	pii->pii_next = NULL;
1137 	pii->pii_prev = NULL;
1138 
1139 	/*
1140 	 * Reset the phyint instance pointer in the phyint.
1141 	 * If this is the last phyint instance (being deleted) on this
1142 	 * phyint, then delete the phyint.
1143 	 */
1144 	if (pii->pii_af == AF_INET)
1145 		pi->pi_v4 = NULL;
1146 	else
1147 		pi->pi_v6 = NULL;
1148 
1149 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1150 		phyint_delete(pi);
1151 
1152 	free(pii);
1153 }
1154 
1155 static void
1156 phyint_inst_print(struct phyint_instance *pii)
1157 {
1158 	struct logint *li;
1159 	struct target *tg;
1160 	char abuf[INET6_ADDRSTRLEN];
1161 	int most_recent;
1162 	int i;
1163 
1164 	if (pii->pii_phyint == NULL) {
1165 		logdebug("pii->pi_phyint NULL can't print\n");
1166 		return;
1167 	}
1168 
1169 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1170 	    "sock %x in_use %d empty %x full %x\n",
1171 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1172 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1173 	    pii->pii_in_use, pii->pii_phyint->pi_empty,
1174 	    pii->pii_phyint->pi_full);
1175 
1176 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1177 		logint_print(li);
1178 
1179 	logdebug("\n");
1180 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1181 		target_print(tg);
1182 
1183 	if (pii->pii_targets == NULL)
1184 		logdebug("pi_targets NULL\n");
1185 
1186 	if (pii->pii_target_next != NULL) {
1187 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1188 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1189 			abuf, sizeof (abuf)));
1190 	} else {
1191 		logdebug("pi_target_next NULL\n");
1192 	}
1193 
1194 	if (pii->pii_rtt_target_next != NULL) {
1195 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1196 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1197 			abuf, sizeof (abuf)));
1198 	} else {
1199 		logdebug("pi_rtt_target_next NULL\n");
1200 	}
1201 
1202 	if (pii->pii_targets != NULL) {
1203 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1204 
1205 		i = most_recent;
1206 		do {
1207 			if (pii->pii_probes[i].pr_target != NULL) {
1208 				logdebug("#%d target %s ", i,
1209 				    pr_addr(pii->pii_af,
1210 				    pii->pii_probes[i].pr_target->tg_address,
1211 				    abuf, sizeof (abuf)));
1212 			} else {
1213 				logdebug("#%d target NULL ", i);
1214 			}
1215 			logdebug("time_sent %u status %d time_ack/lost %u\n",
1216 			    pii->pii_probes[i].pr_time_sent,
1217 			    pii->pii_probes[i].pr_status,
1218 			    pii->pii_probes[i].pr_time_lost);
1219 			i = PROBE_INDEX_PREV(i);
1220 		} while (i != most_recent);
1221 	}
1222 }
1223 
1224 /*
1225  * Lookup a logint based on the logical interface name, on the given
1226  * phyint instance.
1227  */
1228 static struct logint *
1229 logint_lookup(struct phyint_instance *pii, char *name)
1230 {
1231 	struct logint *li;
1232 
1233 	if (debug & D_LOGINT) {
1234 		logdebug("logint_lookup(%s, %s)\n",
1235 		    AF_STR(pii->pii_af), name);
1236 	}
1237 
1238 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1239 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1240 			break;
1241 	}
1242 	return (li);
1243 }
1244 
1245 /*
1246  * Insert a logint at the head of the list of logints of the given
1247  * phyint instance
1248  */
1249 static void
1250 logint_insert(struct phyint_instance *pii, struct logint *li)
1251 {
1252 	li->li_next = pii->pii_logint;
1253 	li->li_prev = NULL;
1254 	if (pii->pii_logint != NULL)
1255 		pii->pii_logint->li_prev = li;
1256 	pii->pii_logint = li;
1257 	li->li_phyint_inst = pii;
1258 }
1259 
1260 /*
1261  * Create a new named logint, on the specified phyint instance.
1262  */
1263 static struct logint *
1264 logint_create(struct phyint_instance *pii, char *name)
1265 {
1266 	struct logint *li;
1267 
1268 	if (debug & D_LOGINT) {
1269 		logdebug("logint_create(%s %s %s)\n",
1270 		    AF_STR(pii->pii_af), pii->pii_name, name);
1271 	}
1272 
1273 	li = calloc(1, sizeof (struct logint));
1274 	if (li == NULL) {
1275 		logperror("logint_create: calloc");
1276 		return (NULL);
1277 	}
1278 
1279 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1280 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1281 	logint_insert(pii, li);
1282 	return (li);
1283 }
1284 
1285 /*
1286  * Initialize the logint based on the data returned by the kernel.
1287  */
1288 void
1289 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1290 {
1291 	int	ifsock;
1292 	uint64_t flags;
1293 	uint64_t saved_flags;
1294 	struct	logint	*li;
1295 	struct lifreq	lifr;
1296 	struct in6_addr	test_subnet;
1297 	struct in6_addr	test_subnet_mask;
1298 	struct in6_addr	testaddr;
1299 	int	test_subnet_len;
1300 	struct sockaddr_in6	*sin6;
1301 	struct sockaddr_in	*sin;
1302 	char abuf[INET6_ADDRSTRLEN];
1303 	boolean_t  ptp = _B_FALSE;
1304 	struct in6_addr tgaddr;
1305 
1306 	if (debug & D_LOGINT) {
1307 		logdebug("logint_init_from_k(%s %s)\n",
1308 		    AF_STR(pii->pii_af), li_name);
1309 	}
1310 
1311 	/* Get the socket for doing ioctls */
1312 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1313 
1314 	/*
1315 	 * Get the flags from the kernel. Also serves as a check whether
1316 	 * the logical still exists. If it doesn't exist, no need to proceed
1317 	 * any further. li_in_use will make the caller clean up the logint
1318 	 */
1319 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1320 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1321 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1322 		/* Interface may have vanished */
1323 		if (errno != ENXIO) {
1324 			logperror_pii(pii, "logint_init_from_k: "
1325 			    "ioctl (get flags)");
1326 		}
1327 		return;
1328 	}
1329 
1330 	flags = lifr.lifr_flags;
1331 
1332 	/*
1333 	 * Verified the logint exists. Now lookup the logint in our tables.
1334 	 * If it does not exist, create a new logint.
1335 	 */
1336 	li = logint_lookup(pii, li_name);
1337 	if (li == NULL) {
1338 		li = logint_create(pii, li_name);
1339 		if (li == NULL) {
1340 			/*
1341 			 * Pretend the interface does not exist
1342 			 * in the kernel
1343 			 */
1344 			return;
1345 		}
1346 	}
1347 
1348 	/*
1349 	 * Update li->li_flags with the new flags, after saving the old
1350 	 * value. This is used later to check what flags has changed and
1351 	 * take any action
1352 	 */
1353 	saved_flags = li->li_flags;
1354 	li->li_flags = flags;
1355 
1356 	/*
1357 	 * Get the address, prefix, prefixlength and update the logint.
1358 	 * Check if anything has changed. If the logint used for the
1359 	 * test address has changed, take suitable action.
1360 	 */
1361 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1362 		/* Interface may have vanished */
1363 		if (errno != ENXIO) {
1364 			logperror_li(li, "logint_init_from_k: (get addr)");
1365 		}
1366 		goto error;
1367 	}
1368 
1369 	if (pii->pii_af == AF_INET) {
1370 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1371 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1372 	} else {
1373 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1374 		testaddr = sin6->sin6_addr;
1375 	}
1376 
1377 	if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
1378 		ptp = _B_TRUE;
1379 		if (ioctl(ifsock, SIOCGLIFDSTADDR, (char *)&lifr) < 0) {
1380 			if (errno != ENXIO) {
1381 				logperror_li(li, "logint_init_from_k:"
1382 				    " (get dstaddr)");
1383 			}
1384 			goto error;
1385 		}
1386 		if (pii->pii_af == AF_INET) {
1387 			sin = (struct sockaddr_in *)&lifr.lifr_addr;
1388 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr);
1389 		} else {
1390 			sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1391 			tgaddr = sin6->sin6_addr;
1392 		}
1393 	} else {
1394 		if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1395 			/* Interface may have vanished */
1396 			if (errno != ENXIO) {
1397 				logperror_li(li, "logint_init_from_k:"
1398 				    " (get subnet)");
1399 			}
1400 			goto error;
1401 		}
1402 		if (lifr.lifr_subnet.ss_family == AF_INET6) {
1403 			sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1404 			test_subnet = sin6->sin6_addr;
1405 			test_subnet_len = lifr.lifr_addrlen;
1406 		} else {
1407 			sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1408 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1409 			test_subnet_len = lifr.lifr_addrlen +
1410 			    (IPV6_ABITS - IP_ABITS);
1411 		}
1412 		(void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask);
1413 	}
1414 
1415 	/*
1416 	 * Also record the OINDEX for completeness. This information is
1417 	 * not used.
1418 	 */
1419 	if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) {
1420 		if (errno != ENXIO)  {
1421 			logperror_li(li, "logint_init_from_k:"
1422 			    " (get lifoindex)");
1423 		}
1424 		goto error;
1425 	}
1426 
1427 	/*
1428 	 * If this is the logint corresponding to the test address used for
1429 	 * sending probes, then if anything significant has changed we need to
1430 	 * determine the test address again.  We ignore changes to the
1431 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1432 	 * course.
1433 	 */
1434 	if (pii->pii_probe_logint == li) {
1435 		if (((li->li_flags ^ saved_flags) &
1436 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1437 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1438 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1439 			&li->li_subnet)) ||
1440 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1441 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1442 			/*
1443 			 * Something significant that affects the testaddress
1444 			 * has changed. Redo the testaddress selection later on
1445 			 * in select_test_ifs(). For now do the cleanup and
1446 			 * set pii_probe_logint to NULL.
1447 			 */
1448 			if (pii->pii_probe_sock != -1)
1449 				close_probe_socket(pii, _B_TRUE);
1450 			pii->pii_probe_logint = NULL;
1451 		}
1452 	}
1453 
1454 
1455 	/* Update the logint with the values obtained from the kernel.	*/
1456 	li->li_addr = testaddr;
1457 	li->li_in_use = 1;
1458 	li->li_oifindex = lifr.lifr_index;
1459 	if (ptp) {
1460 		li->li_dstaddr = tgaddr;
1461 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1462 		    IP_ABITS : IPV6_ABITS;
1463 	} else {
1464 		li->li_subnet = test_subnet;
1465 		li->li_subnet_len = test_subnet_len;
1466 	}
1467 
1468 	if (debug & D_LOGINT)
1469 		logint_print(li);
1470 
1471 	return;
1472 
1473 error:
1474 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1475 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1476 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1477 	logint_delete(li);
1478 }
1479 
1480 /*
1481  * Delete (unlink and free) a logint.
1482  */
1483 void
1484 logint_delete(struct logint *li)
1485 {
1486 	struct phyint_instance *pii;
1487 
1488 	pii = li->li_phyint_inst;
1489 	assert(pii != NULL);
1490 
1491 	if (debug & D_LOGINT) {
1492 		int af;
1493 		char abuf[INET6_ADDRSTRLEN];
1494 
1495 		af = pii->pii_af;
1496 		logdebug("logint_delete(%s %s %s/%u)\n",
1497 		    AF_STR(af), li->li_name,
1498 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1499 		    li->li_subnet_len);
1500 	}
1501 
1502 	/* logint must be in the list of logints */
1503 	assert(pii->pii_logint == li || li->li_prev != NULL);
1504 
1505 	/* Remove the logint from the list of logints  */
1506 	if (li->li_prev == NULL) {
1507 		/* logint is the 1st in the list */
1508 		pii->pii_logint = li->li_next;
1509 	} else {
1510 		li->li_prev->li_next = li->li_next;
1511 	}
1512 	if (li->li_next != NULL)
1513 		li->li_next->li_prev = li->li_prev;
1514 	li->li_next = NULL;
1515 	li->li_prev = NULL;
1516 
1517 	/*
1518 	 * If this logint is also being used for probing, then close the
1519 	 * associated socket, if it exists.
1520 	 */
1521 	if (pii->pii_probe_logint == li) {
1522 		if (pii->pii_probe_sock != -1)
1523 			close_probe_socket(pii, _B_TRUE);
1524 		pii->pii_probe_logint = NULL;
1525 	}
1526 
1527 	free(li);
1528 }
1529 
1530 static void
1531 logint_print(struct logint *li)
1532 {
1533 	char abuf[INET6_ADDRSTRLEN];
1534 	int af;
1535 
1536 	af = li->li_phyint_inst->pii_af;
1537 
1538 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1539 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1540 
1541 	logdebug("\tFlags: %llx in_use %d oifindex %d\n",
1542 	    li->li_flags, li->li_in_use, li->li_oifindex);
1543 }
1544 
/*
 * Format `addr' as text into the caller-supplied buffer `abuf' of `len'
 * bytes and return `abuf'.
 *
 * If `af' is AF_INET, `addr' is taken to be an IPv4-mapped address and
 * the embedded IPv4 address is printed; otherwise `addr' is printed as an
 * IPv6 address.  If the conversion fails (for instance because the buffer
 * is too small), `abuf' is set to the empty string so that the result is
 * always safe to hand to a "%s" format.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	struct in_addr	addr_v4;
	const char	*res;

	if (af == AF_INET) {
		IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4);
		res = inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
	} else {
		res = inet_ntop(AF_INET6, (void *)&addr, abuf, len);
	}

	/* Don't return whatever happened to be in the buffer on failure. */
	if (res == NULL && len > 0)
		abuf[0] = '\0';

	return (abuf);
}
1558 
1559 /* Lookup target on its address */
1560 struct target *
1561 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1562 {
1563 	struct target *tg;
1564 
1565 	if (debug & D_TARGET) {
1566 		char abuf[INET6_ADDRSTRLEN];
1567 
1568 		logdebug("target_lookup(%s %s): addr %s\n",
1569 		    AF_STR(pii->pii_af), pii->pii_name,
1570 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1571 	}
1572 
1573 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1574 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1575 			break;
1576 	}
1577 	return (tg);
1578 }
1579 
1580 /*
1581  * Find and return the next active target, for the next probe.
1582  * If no active targets are available, return NULL.
1583  */
1584 struct target *
1585 target_next(struct target *tg)
1586 {
1587 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1588 	struct	target	*marker = tg;
1589 	hrtime_t now;
1590 
1591 	now = gethrtime();
1592 
1593 	/*
1594 	 * Target must be in the list of targets for this phyint
1595 	 * instance.
1596 	 */
1597 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1598 	assert(pii->pii_targets != NULL);
1599 
1600 	/* Return the next active target */
1601 	do {
1602 		/*
1603 		 * Go to the next target. If we hit the end,
1604 		 * reset the ptr to the head
1605 		 */
1606 		tg = tg->tg_next;
1607 		if (tg == NULL)
1608 			tg = pii->pii_targets;
1609 
1610 		assert(TG_STATUS_VALID(tg->tg_status));
1611 
1612 		switch (tg->tg_status) {
1613 		case TG_ACTIVE:
1614 			return (tg);
1615 
1616 		case TG_UNUSED:
1617 			assert(pii->pii_targets_are_routers);
1618 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
1619 				/*
1620 				 * Bubble up the unused target to active
1621 				 */
1622 				tg->tg_status = TG_ACTIVE;
1623 				pii->pii_ntargets++;
1624 				return (tg);
1625 			}
1626 			break;
1627 
1628 		case TG_SLOW:
1629 			assert(pii->pii_targets_are_routers);
1630 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1631 				/*
1632 				 * Bubble up the slow target to unused
1633 				 */
1634 				tg->tg_status = TG_UNUSED;
1635 			}
1636 			break;
1637 
1638 		case TG_DEAD:
1639 			assert(pii->pii_targets_are_routers);
1640 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1641 				/*
1642 				 * Bubble up the dead target to slow
1643 				 */
1644 				tg->tg_status = TG_SLOW;
1645 				tg->tg_latime = now;
1646 			}
1647 			break;
1648 		}
1649 
1650 	} while (tg != marker);
1651 
1652 	return (NULL);
1653 }
1654 
1655 /*
1656  * Select the best available target, that is not already TG_ACTIVE,
1657  * for the caller. The caller will determine whether it wants to
1658  * make the returned target TG_ACTIVE.
1659  * The selection order is as follows.
1660  * 1. pick a TG_UNSED target, if it exists.
1661  * 2. else pick a TG_SLOW target that has recovered, if it exists
1662  * 3. else pick any TG_SLOW target, if it exists
1663  * 4. else pick a TG_DEAD target that has recovered, if it exists
1664  * 5. else pick any TG_DEAD target, if it exists
1665  * 6. else return null
1666  */
1667 static struct target *
1668 target_select_best(struct phyint_instance *pii)
1669 {
1670 	struct target *tg;
1671 	struct target *slow = NULL;
1672 	struct target *dead = NULL;
1673 	struct target *slow_recovered = NULL;
1674 	struct target *dead_recovered = NULL;
1675 	hrtime_t now;
1676 
1677 	now = gethrtime();
1678 
1679 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1680 		assert(TG_STATUS_VALID(tg->tg_status));
1681 
1682 		switch (tg->tg_status) {
1683 		case TG_UNUSED:
1684 			return (tg);
1685 
1686 		case TG_SLOW:
1687 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1688 				slow_recovered = tg;
1689 				/*
1690 				 * Promote the slow_recoverd to unused
1691 				 */
1692 				tg->tg_status = TG_UNUSED;
1693 			} else {
1694 				slow = tg;
1695 			}
1696 			break;
1697 
1698 		case TG_DEAD:
1699 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1700 				dead_recovered = tg;
1701 				/*
1702 				 * Promote the dead_recoverd to slow
1703 				 */
1704 				tg->tg_status = TG_SLOW;
1705 				tg->tg_latime = now;
1706 			} else {
1707 				dead = tg;
1708 			}
1709 			break;
1710 
1711 		default:
1712 			break;
1713 		}
1714 	}
1715 
1716 	if (slow_recovered != NULL)
1717 		return (slow_recovered);
1718 	else if (slow != NULL)
1719 		return (slow);
1720 	else if (dead_recovered != NULL)
1721 		return (dead_recovered);
1722 	else
1723 		return (dead);
1724 }
1725 
1726 /*
1727  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
1728  * that are active, pick the next best below.
1729  */
1730 static void
1731 target_activate_all(struct phyint_instance *pii)
1732 {
1733 	struct target *tg;
1734 
1735 	assert(pii->pii_ntargets == 0);
1736 	assert(pii->pii_target_next == NULL);
1737 	assert(pii->pii_rtt_target_next == NULL);
1738 	assert(pii->pii_targets_are_routers);
1739 
1740 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
1741 		tg = target_select_best(pii);
1742 		if (tg == NULL) {
1743 			/* We are out of targets */
1744 			return;
1745 		}
1746 
1747 		assert(TG_STATUS_VALID(tg->tg_status));
1748 		assert(tg->tg_status != TG_ACTIVE);
1749 		tg->tg_status = TG_ACTIVE;
1750 		pii->pii_ntargets++;
1751 		if (pii->pii_target_next == NULL) {
1752 			pii->pii_target_next = tg;
1753 			pii->pii_rtt_target_next = tg;
1754 		}
1755 	}
1756 }
1757 
1758 static struct target *
1759 target_first(struct phyint_instance *pii)
1760 {
1761 	struct target *tg;
1762 
1763 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1764 		assert(TG_STATUS_VALID(tg->tg_status));
1765 		if (tg->tg_status == TG_ACTIVE)
1766 			break;
1767 	}
1768 
1769 	return (tg);
1770 }
1771 
1772 /*
1773  * Create a default target entry.
1774  */
1775 void
1776 target_create(struct phyint_instance *pii, struct in6_addr addr,
1777     boolean_t is_router)
1778 {
1779 	struct target *tg;
1780 	struct phyint *pi;
1781 	struct logint *li;
1782 
1783 	if (debug & D_TARGET) {
1784 		char abuf[INET6_ADDRSTRLEN];
1785 
1786 		logdebug("target_create(%s %s, %s)\n",
1787 		    AF_STR(pii->pii_af), pii->pii_name,
1788 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1789 	}
1790 
1791 	/*
1792 	 * If the test address is not yet initialized, do not add
1793 	 * any target, since we cannot determine whether the target
1794 	 * belongs to the same subnet as the test address.
1795 	 */
1796 	li = pii->pii_probe_logint;
1797 	if (li == NULL)
1798 		return;
1799 
1800 	/*
1801 	 * If there are multiple subnets associated with an interface, then
1802 	 * add the target to this phyint instance, only if it belongs to the
1803 	 * same subnet as the test address. The reason is that interface
1804 	 * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER
1805 	 * addresses, will disappear after failover, and the targets will not
1806 	 * be reachable from this interface.
1807 	 */
1808 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
1809 		return;
1810 
1811 	if (pii->pii_targets != NULL) {
1812 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1813 		if (is_router) {
1814 			if (!pii->pii_targets_are_routers) {
1815 				/*
1816 				 * Prefer router over hosts. Using hosts is a
1817 				 * fallback mechanism, hence delete all host
1818 				 * targets.
1819 				 */
1820 				while (pii->pii_targets != NULL)
1821 					target_delete(pii->pii_targets);
1822 			}
1823 		} else {
1824 			/*
1825 			 * Routers take precedence over hosts. If this
1826 			 * is a router list and we are trying to add a
1827 			 * host, just return. If this is a host list
1828 			 * and if we have sufficient targets, just return
1829 			 */
1830 			if (pii->pii_targets_are_routers ||
1831 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
1832 				return;
1833 		}
1834 	}
1835 
1836 	tg = calloc(1, sizeof (struct target));
1837 	if (tg == NULL) {
1838 		logperror("target_create: calloc");
1839 		return;
1840 	}
1841 
1842 	tg->tg_phyint_inst = pii;
1843 	tg->tg_address = addr;
1844 	tg->tg_in_use = 1;
1845 	tg->tg_rtt_sa = -1;
1846 	tg->tg_num_deferred = 0;
1847 
1848 	/*
1849 	 * If this is the first target, set 'pii_targets_are_routers'
1850 	 * The list of targets is either a list of hosts or list or
1851 	 * routers, but not a mix.
1852 	 */
1853 	if (pii->pii_targets == NULL) {
1854 		assert(pii->pii_ntargets == 0);
1855 		assert(pii->pii_target_next == NULL);
1856 		assert(pii->pii_rtt_target_next == NULL);
1857 		pii->pii_targets_are_routers = is_router ? 1 : 0;
1858 	}
1859 
1860 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
1861 		assert(pii->pii_targets_are_routers);
1862 		assert(pii->pii_target_next != NULL);
1863 		assert(pii->pii_rtt_target_next != NULL);
1864 		tg->tg_status = TG_UNUSED;
1865 	} else {
1866 		if (pii->pii_ntargets == 0) {
1867 			assert(pii->pii_target_next == NULL);
1868 			pii->pii_target_next = tg;
1869 			pii->pii_rtt_target_next = tg;
1870 		}
1871 		pii->pii_ntargets++;
1872 		tg->tg_status = TG_ACTIVE;
1873 	}
1874 
1875 	target_insert(pii, tg);
1876 
1877 	/*
1878 	 * Change state to PI_RUNNING if this phyint instance is capable of
1879 	 * sending and receiving probes -- that is, if we know of at least 1
1880 	 * target, and this phyint instance is probe-capable.  For more
1881 	 * details, see the phyint state diagram in mpd_probe.c.
1882 	 */
1883 	pi = pii->pii_phyint;
1884 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
1885 		if (pi->pi_flags & IFF_FAILED)
1886 			phyint_chstate(pi, PI_FAILED);
1887 		else
1888 			phyint_chstate(pi, PI_RUNNING);
1889 	}
1890 }
1891 
1892 /*
1893  * Add the target address named by `addr' to phyint instance `pii' if it does
1894  * not already exist.  If the target is a router, `is_router' should be set to
1895  * B_TRUE.
1896  */
1897 void
1898 target_add(struct phyint_instance *pii, struct in6_addr addr,
1899     boolean_t is_router)
1900 {
1901 	struct target *tg;
1902 
1903 	if (pii == NULL)
1904 		return;
1905 
1906 	tg = target_lookup(pii, addr);
1907 
1908 	/*
1909 	 * If the target does not exist, create it; target_create() will set
1910 	 * tg_in_use to true.  If it exists already, and it is a router
1911 	 * target, set tg_in_use to to true, so that init_router_targets()
1912 	 * won't delete it
1913 	 */
1914 	if (tg == NULL)
1915 		target_create(pii, addr, is_router);
1916 	else if (is_router)
1917 		tg->tg_in_use = 1;
1918 }
1919 
1920 /*
1921  * Insert target at head of linked list of targets for the associated
1922  * phyint instance
1923  */
1924 static void
1925 target_insert(struct phyint_instance *pii, struct target *tg)
1926 {
1927 	tg->tg_next = pii->pii_targets;
1928 	tg->tg_prev = NULL;
1929 	if (tg->tg_next != NULL)
1930 		tg->tg_next->tg_prev = tg;
1931 	pii->pii_targets = tg;
1932 }
1933 
1934 /*
1935  * Delete a target (unlink and free).
1936  */
1937 void
1938 target_delete(struct target *tg)
1939 {
1940 	int af;
1941 	struct phyint_instance	*pii;
1942 	struct phyint_instance	*pii_other;
1943 
1944 	pii = tg->tg_phyint_inst;
1945 	af = pii->pii_af;
1946 
1947 	if (debug & D_TARGET) {
1948 		char abuf[INET6_ADDRSTRLEN];
1949 
1950 		logdebug("target_delete(%s %s, %s)\n",
1951 		    AF_STR(af), pii->pii_name,
1952 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
1953 	}
1954 
1955 	/*
1956 	 * Target must be in the list of targets for this phyint
1957 	 * instance.
1958 	 */
1959 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1960 
1961 	/*
1962 	 * Reset all references to 'tg' in the probe information
1963 	 * for this phyint.
1964 	 */
1965 	reset_pii_probes(pii, tg);
1966 
1967 	/*
1968 	 * Remove this target from the list of targets of this
1969 	 * phyint instance.
1970 	 */
1971 	if (tg->tg_prev == NULL) {
1972 		pii->pii_targets = tg->tg_next;
1973 	} else {
1974 		tg->tg_prev->tg_next = tg->tg_next;
1975 	}
1976 
1977 	if (tg->tg_next != NULL)
1978 		tg->tg_next->tg_prev = tg->tg_prev;
1979 
1980 	tg->tg_next = NULL;
1981 	tg->tg_prev = NULL;
1982 
1983 	if (tg->tg_status == TG_ACTIVE)
1984 		pii->pii_ntargets--;
1985 
1986 	/*
1987 	 * Adjust the next target to probe, if it points to
1988 	 * to the currently deleted target.
1989 	 */
1990 	if (pii->pii_target_next == tg)
1991 		pii->pii_target_next = target_first(pii);
1992 
1993 	if (pii->pii_rtt_target_next == tg)
1994 		pii->pii_rtt_target_next = target_first(pii);
1995 
1996 	free(tg);
1997 
1998 	/*
1999 	 * The number of active targets pii_ntargets == 0 iff
2000 	 * the next active target pii->pii_target_next == NULL
2001 	 */
2002 	if (pii->pii_ntargets != 0) {
2003 		assert(pii->pii_target_next != NULL);
2004 		assert(pii->pii_rtt_target_next != NULL);
2005 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2006 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2007 		return;
2008 	}
2009 
2010 	/* At this point, we don't have any active targets. */
2011 	assert(pii->pii_target_next == NULL);
2012 	assert(pii->pii_rtt_target_next == NULL);
2013 
2014 	if (pii->pii_targets_are_routers) {
2015 		/*
2016 		 * Activate any TG_SLOW or TG_DEAD router targets,
2017 		 * since we don't have any other targets
2018 		 */
2019 		target_activate_all(pii);
2020 
2021 		if (pii->pii_ntargets != 0) {
2022 			assert(pii->pii_target_next != NULL);
2023 			assert(pii->pii_rtt_target_next != NULL);
2024 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2025 			assert(pii->pii_rtt_target_next->tg_status ==
2026 			    TG_ACTIVE);
2027 			return;
2028 		}
2029 	}
2030 
2031 	/*
2032 	 * If we still don't have any active targets, the list must
2033 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2034 	 * targets. Zero out the probe stats since it will not be
2035 	 * relevant any longer.
2036 	 */
2037 	assert(pii->pii_targets == NULL);
2038 	clear_pii_probe_stats(pii);
2039 	pii_other = phyint_inst_other(pii);
2040 
2041 	/*
2042 	 * If there are no targets on both instances,
2043 	 * go back to PI_NOTARGETS state, since we cannot
2044 	 * probe this phyint any more. For more details,
2045 	 * please see phyint state diagram in mpd_probe.c.
2046 	 */
2047 	if (!PROBE_CAPABLE(pii_other))
2048 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2049 }
2050 
2051 /*
2052  * Flush the target list of every phyint in the group, if the list
2053  * is a host target list. This is called if group failure is suspected.
2054  * If all targets have failed, multicast will subsequently discover new
2055  * targets. Else it is a group failure.
2056  * Note: This function is a no-op if the list is a router target list.
2057  */
2058 static void
2059 target_flush_hosts(struct phyint_group *pg)
2060 {
2061 	struct phyint *pi;
2062 	struct phyint_instance *pii;
2063 
2064 	if (debug & D_TARGET)
2065 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2066 
2067 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2068 		pii = pi->pi_v4;
2069 		if (pii != NULL && !pii->pii_targets_are_routers) {
2070 			/*
2071 			 * Delete all the targets. When the list becomes
2072 			 * empty, target_delete() will set pii->pii_targets
2073 			 * to NULL.
2074 			 */
2075 			while (pii->pii_targets != NULL)
2076 				target_delete(pii->pii_targets);
2077 		}
2078 		pii = pi->pi_v6;
2079 		if (pii != NULL && !pii->pii_targets_are_routers) {
2080 			/*
2081 			 * Delete all the targets. When the list becomes
2082 			 * empty, target_delete() will set pii->pii_targets
2083 			 * to NULL.
2084 			 */
2085 			while (pii->pii_targets != NULL)
2086 				target_delete(pii->pii_targets);
2087 		}
2088 	}
2089 }
2090 
2091 /*
2092  * Reset all references to 'target' in the probe info, as this target is
2093  * being deleted. The pr_target field is guaranteed to be non-null if
2094  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2095  * pr_target will not be accessed unconditionally.
2096  */
2097 static void
2098 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2099 {
2100 	int i;
2101 
2102 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2103 		if (pii->pii_probes[i].pr_target == tg) {
2104 			pii->pii_probes[i].pr_target = NULL;
2105 			if (pii->pii_probes[i].pr_status == PR_UNACKED)
2106 				pii->pii_probes[i].pr_status = PR_LOST;
2107 		}
2108 	}
2109 
2110 }
2111 
2112 /*
2113  * Clear the probe statistics array.
2114  */
2115 void
2116 clear_pii_probe_stats(struct phyint_instance *pii)
2117 {
2118 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2119 	/* Reset the next probe index in the probe stats array */
2120 	pii->pii_probe_next = 0;
2121 }
2122 
2123 static void
2124 target_print(struct target *tg)
2125 {
2126 	char	abuf[INET6_ADDRSTRLEN];
2127 	char	buf[128];
2128 	char	buf2[128];
2129 	int	af;
2130 	int	i;
2131 
2132 	af = tg->tg_phyint_inst->pii_af;
2133 
2134 	logdebug("Target on %s %s addr %s\n"
2135 	    "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n",
2136 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2137 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2138 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2139 	    tg->tg_crtt, tg->tg_in_use);
2140 
2141 	buf[0] = '\0';
2142 	for (i = 0; i < tg->tg_num_deferred; i++) {
2143 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2144 		    tg->tg_deferred[i]);
2145 		(void) strlcat(buf, buf2, sizeof (buf));
2146 	}
2147 	logdebug("deferred rtts:%s\n", buf);
2148 }
2149 
2150 void
2151 phyint_inst_print_all(void)
2152 {
2153 	struct phyint_instance *pii;
2154 
2155 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2156 		phyint_inst_print(pii);
2157 	}
2158 }
2159 
2160 /*
2161  * Convert length for a mask to the mask.
2162  */
2163 static void
2164 ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask)
2165 {
2166 	int	j;
2167 
2168 	assert(masklen <= IPV6_ABITS);
2169 	bzero((char *)bitmask, sizeof (*bitmask));
2170 
2171 	/* Make the 'masklen' leftmost bits one */
2172 	for (j = 0; masklen > 8; masklen -= 8, j++)
2173 		bitmask->s6_addr[j] = 0xff;
2174 
2175 	bitmask->s6_addr[j] = 0xff << (8 - masklen);
2176 
2177 }
2178 
2179 /*
2180  * Compare two prefixes that have the same prefix length.
2181  * Fails if the prefix length is unreasonable.
2182  */
2183 static boolean_t
2184 prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len)
2185 {
2186 	uchar_t mask;
2187 	int j;
2188 
2189 	if (prefix_len < 0 || prefix_len > IPV6_ABITS)
2190 		return (_B_FALSE);
2191 
2192 	for (j = 0; prefix_len > 8; prefix_len -= 8, j++)
2193 		if (p1.s6_addr[j] != p2.s6_addr[j])
2194 			return (_B_FALSE);
2195 
2196 	/* Make the N leftmost bits one */
2197 	mask = 0xff << (8 - prefix_len);
2198 	if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask))
2199 		return (_B_FALSE);
2200 
2201 	return (_B_TRUE);
2202 }
2203 
2204 /*
2205  * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both
2206  * IPv4 and IPv6 put together. The phyint with the least such number
2207  * will be used as the failover destination, if no standby interface is
2208  * available
2209  */
2210 int
2211 logint_upcount(struct phyint *pi)
2212 {
2213 	struct	logint	*li;
2214 	struct	phyint_instance *pii;
2215 	int count = 0;
2216 
2217 	pii = pi->pi_v4;
2218 	if (pii != NULL) {
2219 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
2220 			if ((li->li_flags &
2221 			    (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
2222 				count++;
2223 			}
2224 		}
2225 	}
2226 
2227 	pii = pi->pi_v6;
2228 	if (pii != NULL) {
2229 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
2230 			if ((li->li_flags &
2231 			    (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
2232 				count++;
2233 			}
2234 		}
2235 	}
2236 
2237 	return (count);
2238 }
2239 
2240 /*
2241  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2242  */
2243 struct phyint_instance *
2244 phyint_inst_other(struct phyint_instance *pii)
2245 {
2246 	if (pii->pii_af == AF_INET)
2247 		return (pii->pii_phyint->pi_v6);
2248 	else
2249 		return (pii->pii_phyint->pi_v4);
2250 }
2251 
2252 /*
2253  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2254  * Before sending the event, it prepends the current version of the IPMP
2255  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2256  * `nvl' is freed).
2257  */
2258 static int
2259 post_event(const char *subclass, nvlist_t *nvl)
2260 {
2261 	sysevent_id_t eid;
2262 
2263 	/*
2264 	 * Since sysevents don't work yet in non-global zones, there cannot
2265 	 * possibly be any consumers yet, so don't bother trying to generate
2266 	 * them.  (Otherwise, we'll spew warnings.)
2267 	 */
2268 	if (getzoneid() != GLOBAL_ZONEID) {
2269 		nvlist_free(nvl);
2270 		return (0);
2271 	}
2272 
2273 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2274 	    IPMP_EVENT_CUR_VERSION);
2275 	if (errno != 0) {
2276 		logerr("cannot create `%s' event: %s", subclass,
2277 		    strerror(errno));
2278 		goto failed;
2279 	}
2280 
2281 	if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR,
2282 	    "in.mpathd", nvl, &eid) == -1) {
2283 		logerr("cannot send `%s' event: %s\n", subclass,
2284 		    strerror(errno));
2285 		goto failed;
2286 	}
2287 
2288 	nvlist_free(nvl);
2289 	return (0);
2290 failed:
2291 	nvlist_free(nvl);
2292 	return (-1);
2293 }
2294 
2295 /*
2296  * Return the external IPMP state associated with phyint `pi'.
2297  */
2298 static ipmp_if_state_t
2299 ifstate(struct phyint *pi)
2300 {
2301 	switch (pi->pi_state) {
2302 	case PI_NOTARGETS:
2303 		return (IPMP_IF_UNKNOWN);
2304 
2305 	case PI_OFFLINE:
2306 		return (IPMP_IF_OFFLINE);
2307 
2308 	case PI_FAILED:
2309 		return (IPMP_IF_FAILED);
2310 
2311 	case PI_RUNNING:
2312 		return (IPMP_IF_OK);
2313 	}
2314 
2315 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2316 	abort();
2317 	/* NOTREACHED */
2318 }
2319 
2320 /*
2321  * Return the external IPMP interface type associated with phyint `pi'.
2322  */
2323 static ipmp_if_type_t
2324 iftype(struct phyint *pi)
2325 {
2326 	if (pi->pi_flags & IFF_STANDBY)
2327 		return (IPMP_IF_STANDBY);
2328 	else
2329 		return (IPMP_IF_NORMAL);
2330 }
2331 
2332 /*
2333  * Return the external IPMP group state associated with phyint group `pg'.
2334  */
2335 static ipmp_group_state_t
2336 groupstate(struct phyint_group *pg)
2337 {
2338 	return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK);
2339 }
2340 
2341 /*
2342  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2343  * Returns 0 on success, -1 on failure.
2344  */
2345 static int
2346 phyint_group_state_event(struct phyint_group *pg)
2347 {
2348 	nvlist_t	*nvl;
2349 
2350 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2351 	if (errno != 0) {
2352 		logperror("cannot create `group state change' event");
2353 		return (-1);
2354 	}
2355 
2356 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2357 	if (errno != 0)
2358 		goto failed;
2359 
2360 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2361 	if (errno != 0)
2362 		goto failed;
2363 
2364 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2365 	if (errno != 0)
2366 		goto failed;
2367 
2368 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2369 failed:
2370 	logperror("cannot create `group state change' event");
2371 	nvlist_free(nvl);
2372 	return (-1);
2373 }
2374 
2375 /*
2376  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2377  * `pg'.  Returns 0 on success, -1 on failure.
2378  */
2379 static int
2380 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2381 {
2382 	nvlist_t *nvl;
2383 
2384 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2385 	if (errno != 0) {
2386 		logperror("cannot create `group change' event");
2387 		return (-1);
2388 	}
2389 
2390 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2391 	if (errno != 0)
2392 		goto failed;
2393 
2394 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2395 	if (errno != 0)
2396 		goto failed;
2397 
2398 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2399 	    phyint_grouplistsig);
2400 	if (errno != 0)
2401 		goto failed;
2402 
2403 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2404 	if (errno != 0)
2405 		goto failed;
2406 
2407 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2408 failed:
2409 	logperror("cannot create `group change' event");
2410 	nvlist_free(nvl);
2411 	return (-1);
2412 }
2413 
2414 /*
2415  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2416  * group `pg'.	Returns 0 on success, -1 on failure.
2417  */
2418 static int
2419 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
2420     ipmp_if_op_t op)
2421 {
2422 	nvlist_t *nvl;
2423 
2424 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2425 	if (errno != 0) {
2426 		logperror("cannot create `group member change' event");
2427 		return (-1);
2428 	}
2429 
2430 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2431 	if (errno != 0)
2432 		goto failed;
2433 
2434 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2435 	if (errno != 0)
2436 		goto failed;
2437 
2438 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
2439 	if (errno != 0)
2440 		goto failed;
2441 
2442 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
2443 	if (errno != 0)
2444 		goto failed;
2445 
2446 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
2447 	if (errno != 0)
2448 		goto failed;
2449 
2450 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
2451 	if (errno != 0)
2452 		goto failed;
2453 
2454 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
2455 failed:
2456 	logperror("cannot create `group member change' event");
2457 	nvlist_free(nvl);
2458 	return (-1);
2459 
2460 }
2461 
2462 /*
2463  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
2464  * Returns 0 on success, -1 on failure.
2465  */
2466 static int
2467 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
2468 {
2469 	nvlist_t *nvl;
2470 
2471 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2472 	if (errno != 0) {
2473 		logperror("cannot create `interface change' event");
2474 		return (-1);
2475 	}
2476 
2477 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2478 	if (errno != 0)
2479 		goto failed;
2480 
2481 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2482 	if (errno != 0)
2483 		goto failed;
2484 
2485 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
2486 	if (errno != 0)
2487 		goto failed;
2488 
2489 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
2490 	if (errno != 0)
2491 		goto failed;
2492 
2493 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
2494 	if (errno != 0)
2495 		goto failed;
2496 
2497 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
2498 failed:
2499 	logperror("cannot create `interface change' event");
2500 	nvlist_free(nvl);
2501 	return (-1);
2502 
2503 }
2504 
2505 /*
2506  * Generate a signature for use.  The signature is conceptually divided
2507  * into two pieces: a random 16-bit "generation number" and a 48-bit
2508  * monotonically increasing integer.  The generation number protects
2509  * against stale updates to entities (e.g., IPMP groups) that have been
2510  * deleted and since recreated.
2511  */
2512 static uint64_t
2513 gensig(void)
2514 {
2515 	static int seeded = 0;
2516 
2517 	if (seeded == 0) {
2518 		srand48((long)gethrtime());
2519 		seeded++;
2520 	}
2521 
2522 	return ((uint64_t)lrand48() << 48 | 1);
2523 }
2524 
2525 /*
2526  * Store the information associated with group `grname' into a dynamically
2527  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
2528  */
2529 unsigned int
2530 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
2531 {
2532 	struct phyint_group	*pg;
2533 	struct phyint		*pi;
2534 	char			(*ifs)[LIFNAMSIZ];
2535 	unsigned int		nif, i;
2536 
2537 	pg = phyint_group_lookup(grname);
2538 	if (pg == NULL)
2539 		return (IPMP_EUNKGROUP);
2540 
2541 	/*
2542 	 * Tally up the number of interfaces, allocate an array to hold them,
2543 	 * and insert their names into the array.
2544 	 */
2545 	for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
2546 		nif++;
2547 
2548 	ifs = alloca(nif * sizeof (*ifs));
2549 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
2550 		assert(i < nif);
2551 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
2552 	}
2553 	assert(i == nif);
2554 
2555 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig,
2556 	    groupstate(pg), nif, ifs);
2557 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2558 }
2559 
2560 /*
2561  * Store the information associated with interface `ifname' into a dynamically
2562  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
2563  */
2564 unsigned int
2565 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
2566 {
2567 	struct phyint	*pi;
2568 
2569 	pi = phyint_lookup(ifname);
2570 	if (pi == NULL)
2571 		return (IPMP_EUNKIF);
2572 
2573 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
2574 	    ifstate(pi), iftype(pi));
2575 	return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2576 }
2577 
2578 /*
2579  * Store the current list of IPMP groups into a dynamically allocated
2580  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
2581  */
2582 unsigned int
2583 getgrouplist(ipmp_grouplist_t **grlistpp)
2584 {
2585 	struct phyint_group	*pg;
2586 	char			(*groups)[LIFGRNAMSIZ];
2587 	unsigned int		i, ngroup;
2588 
2589 	/*
2590 	 * Tally up the number of groups, allocate an array to hold them, and
2591 	 * insert their names into the array.
2592 	 */
2593 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
2594 		ngroup++;
2595 
2596 	groups = alloca(ngroup * sizeof (*groups));
2597 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
2598 		assert(i < ngroup);
2599 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
2600 	}
2601 	assert(i == ngroup);
2602 
2603 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
2604 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2605 }
2606 
2607 /*
2608  * Store a snapshot of the IPMP subsystem into a dynamically allocated
2609  * structure pointed to by `*snapp'.  Returns an IPMP error code.
2610  */
2611 unsigned int
2612 getsnap(ipmp_snap_t **snapp)
2613 {
2614 	ipmp_grouplist_t	*grlistp;
2615 	ipmp_groupinfo_t	*grinfop;
2616 	ipmp_ifinfo_t		*ifinfop;
2617 	ipmp_snap_t		*snap;
2618 	struct phyint		*pi;
2619 	unsigned int		i;
2620 	int			retval;
2621 
2622 	snap = ipmp_snap_create();
2623 	if (snap == NULL)
2624 		return (IPMP_ENOMEM);
2625 
2626 	/*
2627 	 * Add group list.
2628 	 */
2629 	retval = getgrouplist(&snap->sn_grlistp);
2630 	if (retval != IPMP_SUCCESS) {
2631 		ipmp_snap_free(snap);
2632 		return (retval);
2633 	}
2634 
2635 	/*
2636 	 * Add information for each group in the list.
2637 	 */
2638 	grlistp = snap->sn_grlistp;
2639 	for (i = 0; i < grlistp->gl_ngroup; i++) {
2640 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
2641 		if (retval != IPMP_SUCCESS) {
2642 			ipmp_snap_free(snap);
2643 			return (retval);
2644 		}
2645 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
2646 		if (retval != IPMP_SUCCESS) {
2647 			ipmp_freegroupinfo(grinfop);
2648 			ipmp_snap_free(snap);
2649 			return (retval);
2650 		}
2651 	}
2652 
2653 	/*
2654 	 * Add information for each configured phyint.
2655 	 */
2656 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2657 		retval = getifinfo(pi->pi_name, &ifinfop);
2658 		if (retval != IPMP_SUCCESS) {
2659 			ipmp_snap_free(snap);
2660 			return (retval);
2661 		}
2662 		retval = ipmp_snap_addifinfo(snap, ifinfop);
2663 		if (retval != IPMP_SUCCESS) {
2664 			ipmp_freeifinfo(ifinfop);
2665 			ipmp_snap_free(snap);
2666 			return (retval);
2667 		}
2668 	}
2669 
2670 	*snapp = snap;
2671 	return (IPMP_SUCCESS);
2672 }
2673