xref: /titanic_52/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.c (revision 275c9da86e89f8abf71135cf63d9fc23671b2e60)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include "mpd_defs.h"
29 #include "mpd_tables.h"
30 
/*
 * Heads of the daemon's global tables: all phyints, all phyint instances
 * and all phyint groups.  phyint_anongroup refers to the anonymous group
 * (the group whose name is the empty string); it is set up, when needed,
 * by phyint_init().
 */
struct phyint_group	*phyint_anongroup = NULL;
struct phyint_group	*phyint_groups = NULL;
struct phyint_instance	*phyint_instances = NULL;
struct phyint		*phyints = NULL;

/*
 * Signature of the list of phyint groups.  Seeded in phyint_init() and
 * bumped each time a group is added to or removed from the list.
 */
static uint64_t	phyint_grouplistsig;
44 
45 static void phyint_inst_insert(struct phyint_instance *pii);
46 static void phyint_inst_print(struct phyint_instance *pii);
47 
48 static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
49 static void phyint_delete(struct phyint *pi);
50 
51 static void phyint_group_insert(struct phyint_group *pg);
52 static void phyint_group_delete(struct phyint_group *pg);
53 static struct phyint_group *phyint_group_lookup(const char *pg_name);
54 static struct phyint_group *phyint_group_create(const char *pg_name);
55 
56 static void logint_print(struct logint *li);
57 static void logint_insert(struct phyint_instance *pii, struct logint *li);
58 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);
59 
60 static void target_print(struct target *tg);
61 static void target_insert(struct phyint_instance *pii, struct target *tg);
62 static struct target *target_first(struct phyint_instance *pii);
63 static struct target *target_select_best(struct phyint_instance *pii);
64 static void target_flush_hosts(struct phyint_group *pg);
65 
66 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);
67 
68 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
69 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);
70 
71 static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask);
72 static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2,
73     int prefix_len);
74 
75 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
76 static int phyint_group_state_event(struct phyint_group *pg);
77 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
78 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
79     ipmp_if_op_t op);
80 
81 static uint64_t gensig(void);
82 
83 /* Initialize any per-file global state.  Returns 0 on success, -1 on failure */
84 int
85 phyint_init(void)
86 {
87 	phyint_grouplistsig = gensig();
88 	if (track_all_phyints) {
89 		phyint_anongroup = phyint_group_create("");
90 		if (phyint_anongroup == NULL)
91 			return (-1);
92 		phyint_group_insert(phyint_anongroup);
93 	}
94 	return (0);
95 }
96 
97 /* Return the phyint with the given name */
98 struct phyint *
99 phyint_lookup(const char *name)
100 {
101 	struct phyint *pi;
102 
103 	if (debug & D_PHYINT)
104 		logdebug("phyint_lookup(%s)\n", name);
105 
106 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
107 		if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0)
108 			break;
109 	}
110 	return (pi);
111 }
112 
113 /* Return the phyint instance with the given name and the given family */
114 struct phyint_instance *
115 phyint_inst_lookup(int af, char *name)
116 {
117 	struct phyint *pi;
118 
119 	if (debug & D_PHYINT)
120 		logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name);
121 
122 	assert(af == AF_INET || af == AF_INET6);
123 
124 	pi = phyint_lookup(name);
125 	if (pi == NULL)
126 		return (NULL);
127 
128 	return (PHYINT_INSTANCE(pi, af));
129 }
130 
131 static struct phyint_group *
132 phyint_group_lookup(const char *pg_name)
133 {
134 	struct phyint_group *pg;
135 
136 	if (debug & D_PHYINT)
137 		logdebug("phyint_group_lookup(%s)\n", pg_name);
138 
139 	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
140 		if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0)
141 			break;
142 	}
143 	return (pg);
144 }
145 
146 /*
147  * Insert the phyint in the linked list of all phyints. If the phyint belongs
148  * to some group, insert it in the phyint group list.
149  */
150 static void
151 phyint_insert(struct phyint *pi, struct phyint_group *pg)
152 {
153 	if (debug & D_PHYINT)
154 		logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name);
155 
156 	/* Insert the phyint at the head of the 'all phyints' list */
157 	pi->pi_next = phyints;
158 	pi->pi_prev = NULL;
159 	if (phyints != NULL)
160 		phyints->pi_prev = pi;
161 	phyints = pi;
162 
163 	/*
164 	 * Insert the phyint at the head of the 'phyint_group members' list
165 	 * of the phyint group to which it belongs.
166 	 */
167 	pi->pi_pgnext = NULL;
168 	pi->pi_pgprev = NULL;
169 	pi->pi_group = pg;
170 
171 	pi->pi_pgnext = pg->pg_phyint;
172 	if (pi->pi_pgnext != NULL)
173 		pi->pi_pgnext->pi_pgprev = pi;
174 	pg->pg_phyint = pi;
175 
176 	pg->pg_sig++;
177 	(void) phyint_group_member_event(pg, pi, IPMP_IF_ADD);
178 }
179 
180 /* Insert the phyint instance in the linked list of all phyint instances. */
181 static void
182 phyint_inst_insert(struct phyint_instance *pii)
183 {
184 	if (debug & D_PHYINT) {
185 		logdebug("phyint_inst_insert(%s %s)\n",
186 		    AF_STR(pii->pii_af), pii->pii_name);
187 	}
188 
189 	/*
190 	 * Insert the phyint at the head of the 'all phyint instances' list.
191 	 */
192 	pii->pii_next = phyint_instances;
193 	pii->pii_prev = NULL;
194 	if (phyint_instances != NULL)
195 		phyint_instances->pii_prev = pii;
196 	phyint_instances = pii;
197 }
198 
199 /*
200  * Create a new phyint with the given parameters. Also insert it into
201  * the list of all phyints and the list of phyint group members by calling
202  * phyint_insert().
203  */
204 static struct phyint *
205 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex,
206     uint64_t flags)
207 {
208 	struct phyint *pi;
209 
210 	pi = calloc(1, sizeof (struct phyint));
211 	if (pi == NULL) {
212 		logperror("phyint_create: calloc");
213 		return (NULL);
214 	}
215 
216 	/*
217 	 * Record the phyint values. Also insert the phyint into the
218 	 * phyint group by calling phyint_insert().
219 	 */
220 	(void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name));
221 	pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME;
222 	pi->pi_ifindex = ifindex;
223 	pi->pi_icmpid =
224 	    htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF));
225 	/*
226 	 * We optimistically start in the PI_RUNNING state.  Later (in
227 	 * process_link_state_changes()), we will readjust this to match the
228 	 * current state of the link.  Further, if test addresses are
229 	 * subsequently assigned, we will transition to PI_NOTARGETS and then
230 	 * either PI_RUNNING or PI_FAILED, depending on the result of the test
231 	 * probes.
232 	 */
233 	pi->pi_state = PI_RUNNING;
234 	pi->pi_flags = PHYINT_FLAGS(flags);
235 	/*
236 	 * Initialise the link state.  The link state is initialised to
237 	 * up, so that if the link is down when IPMP starts monitoring
238 	 * the interface, it will appear as though there has been a
239 	 * transition from the link up to link down.  This avoids
240 	 * having to treat this situation as a special case.
241 	 */
242 	INIT_LINK_STATE(pi);
243 
244 	/*
245 	 * Insert the phyint in the list of all phyints, and the
246 	 * list of phyint group members
247 	 */
248 	phyint_insert(pi, pg);
249 
250 	/*
251 	 * If we are joining a failed group, mark the interface as
252 	 * failed.
253 	 */
254 	if (GROUP_FAILED(pg))
255 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
256 
257 	return (pi);
258 }
259 
260 /*
261  * Create a new phyint instance belonging to the phyint 'pi' and address
262  * family 'af'. Also insert it into the list of all phyint instances by
263  * calling phyint_inst_insert().
264  */
265 static struct phyint_instance *
266 phyint_inst_create(struct phyint *pi, int af)
267 {
268 	struct phyint_instance *pii;
269 
270 	pii = calloc(1, sizeof (struct phyint_instance));
271 	if (pii == NULL) {
272 		logperror("phyint_inst_create: calloc");
273 		return (NULL);
274 	}
275 
276 	/*
277 	 * Attach the phyint instance to the phyint.
278 	 * Set the back pointers as well
279 	 */
280 	pii->pii_phyint = pi;
281 	if (af == AF_INET)
282 		pi->pi_v4 = pii;
283 	else
284 		pi->pi_v6 = pii;
285 
286 	pii->pii_in_use = 1;
287 	pii->pii_probe_sock = -1;
288 	pii->pii_snxt = 1;
289 	pii->pii_af = af;
290 	pii->pii_fd_hrtime = gethrtime() +
291 	    (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC);
292 	pii->pii_flags = pi->pi_flags;
293 
294 	/* Insert the phyint instance in the list of all phyint instances. */
295 	phyint_inst_insert(pii);
296 	return (pii);
297 }
298 
299 /*
300  * Change the state of phyint `pi' to state `state'.
301  */
302 void
303 phyint_chstate(struct phyint *pi, enum pi_state state)
304 {
305 	/*
306 	 * To simplify things, some callers always set a given state
307 	 * regardless of the previous state of the phyint (e.g., setting
308 	 * PI_RUNNING when it's already set).  We shouldn't bother
309 	 * generating an event or consuming a signature for these, since
310 	 * the actual state of the interface is unchanged.
311 	 */
312 	if (pi->pi_state == state)
313 		return;
314 
315 	pi->pi_state = state;
316 	pi->pi_group->pg_sig++;
317 	(void) phyint_state_event(pi->pi_group, pi);
318 }
319 
320 /*
321  * Note that the type of phyint `pi' has changed.
322  */
323 void
324 phyint_newtype(struct phyint *pi)
325 {
326 	pi->pi_group->pg_sig++;
327 	(void) phyint_state_event(pi->pi_group, pi);
328 }
329 
330 /*
331  * Insert the phyint group in the linked list of all phyint groups
332  * at the head of the list
333  */
334 static void
335 phyint_group_insert(struct phyint_group *pg)
336 {
337 	pg->pg_next = phyint_groups;
338 	pg->pg_prev = NULL;
339 	if (phyint_groups != NULL)
340 		phyint_groups->pg_prev = pg;
341 	phyint_groups = pg;
342 
343 	phyint_grouplistsig++;
344 	(void) phyint_group_change_event(pg, IPMP_GROUP_ADD);
345 }
346 
347 /*
348  * Create a new phyint group called 'name'.
349  */
350 static struct phyint_group *
351 phyint_group_create(const char *name)
352 {
353 	struct	phyint_group *pg;
354 
355 	if (debug & D_PHYINT)
356 		logdebug("phyint_group_create(%s)\n", name);
357 
358 	pg = calloc(1, sizeof (struct phyint_group));
359 	if (pg == NULL) {
360 		logperror("phyint_group_create: calloc");
361 		return (NULL);
362 	}
363 
364 	(void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name));
365 	pg->pg_sig = gensig();
366 
367 	pg->pg_fdt = user_failure_detection_time;
368 	pg->pg_probeint = user_probe_interval;
369 
370 	return (pg);
371 }
372 
373 /*
374  * Change the state of the phyint group `pg' to state `state'.
375  */
376 void
377 phyint_group_chstate(struct phyint_group *pg, enum pg_state state)
378 {
379 	assert(pg != phyint_anongroup);
380 
381 	switch (state) {
382 	case PG_FAILED:
383 		pg->pg_groupfailed = 1;
384 
385 		/*
386 		 * We can never know with certainty that a group has
387 		 * failed.  It is possible that all known targets have
388 		 * failed simultaneously, and new targets have come up
389 		 * instead. If the targets are routers then router
390 		 * discovery will kick in, and we will see the new routers
391 		 * thru routing socket messages. But if the targets are
392 		 * hosts, we have to discover it by multicast.	So flush
393 		 * all the host targets. The next probe will send out a
394 		 * multicast echo request. If this is a group failure, we
395 		 * will still not see any response, otherwise we will
396 		 * clear the pg_groupfailed flag after we get
397 		 * NUM_PROBE_REPAIRS consecutive unicast replies on any
398 		 * phyint.
399 		 */
400 		target_flush_hosts(pg);
401 		break;
402 
403 	case PG_RUNNING:
404 		pg->pg_groupfailed = 0;
405 		break;
406 
407 	default:
408 		logerr("phyint_group_chstate: invalid group state %d; "
409 		    "aborting\n", state);
410 		abort();
411 	}
412 
413 	pg->pg_sig++;
414 	(void) phyint_group_state_event(pg);
415 }
416 
417 /*
418  * Create a new phyint instance and initialize it from the values supplied by
419  * the kernel. Always check for ENXIO before logging any error, because the
420  * interface could have vanished after completion of SIOCGLIFCONF.
421  * Return values:
422  *	pointer to the phyint instance on success
423  *	NULL on failure Eg. if the phyint instance is not found in the kernel
424  */
425 struct phyint_instance *
426 phyint_inst_init_from_k(int af, char *pi_name)
427 {
428 	char	pg_name[LIFNAMSIZ + 1];
429 	int	ifsock;
430 	uint_t	ifindex;
431 	uint64_t	flags;
432 	struct lifreq	lifr;
433 	struct phyint	*pi;
434 	struct phyint_instance	*pii;
435 	boolean_t	pg_created;
436 	boolean_t	pi_created;
437 	struct phyint_group	*pg;
438 
439 retry:
440 	pii = NULL;
441 	pi = NULL;
442 	pg = NULL;
443 	pi_created = _B_FALSE;
444 	pg_created = _B_FALSE;
445 
446 	if (debug & D_PHYINT) {
447 		logdebug("phyint_inst_init_from_k(%s %s)\n",
448 		    AF_STR(af), pi_name);
449 	}
450 
451 	assert(af == AF_INET || af == AF_INET6);
452 
453 	/* Get the socket for doing ioctls */
454 	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
455 
456 	/*
457 	 * Get the interface flags. Ignore loopback and multipoint
458 	 * interfaces.
459 	 */
460 	(void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name));
461 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
462 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
463 		if (errno != ENXIO) {
464 			logperror("phyint_inst_init_from_k:"
465 			    " ioctl (get flags)");
466 		}
467 		return (NULL);
468 	}
469 	flags = lifr.lifr_flags;
470 	if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK))
471 		return (NULL);
472 
473 	/*
474 	 * Get the ifindex for recording later in our tables, in case we need
475 	 * to create a new phyint.
476 	 */
477 	if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) {
478 		if (errno != ENXIO) {
479 			logperror("phyint_inst_init_from_k: "
480 			    " ioctl (get lifindex)");
481 		}
482 		return (NULL);
483 	}
484 	ifindex = lifr.lifr_index;
485 
486 	/*
487 	 * Get the phyint group name of this phyint, from the kernel.
488 	 */
489 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) {
490 		if (errno != ENXIO) {
491 			logperror("phyint_inst_init_from_k: "
492 			    "ioctl (get group name)");
493 		}
494 		return (NULL);
495 	}
496 	(void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name));
497 	pg_name[sizeof (pg_name) - 1] = '\0';
498 
499 	/*
500 	 * If the phyint is not part of any group, pg_name is the
501 	 * null string. If 'track_all_phyints' is false, there is no
502 	 * need to create a phyint.
503 	 */
504 	if (pg_name[0] == '\0' && !track_all_phyints) {
505 		/*
506 		 * If the IFF_FAILED or IFF_OFFLINE flags are set, reset
507 		 * them. These flags shouldn't be set if IPMP isn't
508 		 * tracking the interface.
509 		 */
510 		if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) {
511 			lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE);
512 			if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
513 				if (errno != ENXIO) {
514 					logperror("phyint_inst_init_from_k:"
515 					    " ioctl (set flags)");
516 				}
517 			}
518 		}
519 		return (NULL);
520 	}
521 
522 	/*
523 	 * We need to create a new phyint instance. A phyint instance
524 	 * belongs to a phyint, and the phyint belongs to a phyint group.
525 	 * So we first lookup the 'parents' and if they don't exist then
526 	 * we create them.
527 	 */
528 	pg = phyint_group_lookup(pg_name);
529 	if (pg == NULL) {
530 		pg = phyint_group_create(pg_name);
531 		if (pg == NULL) {
532 			logerr("phyint_inst_init_from_k:"
533 			    " unable to create group %s\n", pg_name);
534 			return (NULL);
535 		}
536 		phyint_group_insert(pg);
537 		pg_created = _B_TRUE;
538 	}
539 
540 	/*
541 	 * Lookup the phyint. If the phyint does not exist create it.
542 	 */
543 	pi = phyint_lookup(pi_name);
544 	if (pi == NULL) {
545 		pi = phyint_create(pi_name, pg, ifindex, flags);
546 		if (pi == NULL) {
547 			logerr("phyint_inst_init_from_k:"
548 			    " unable to create phyint %s\n", pi_name);
549 			if (pg_created)
550 				phyint_group_delete(pg);
551 			return (NULL);
552 		}
553 		pi_created = _B_TRUE;
554 	} else {
555 		/* The phyint exists already. */
556 		assert(pi_created == _B_FALSE);
557 		/*
558 		 * Normally we should see consistent values for the IPv4 and
559 		 * IPv6 instances, for phyint properties. If we don't, it
560 		 * means things have changed underneath us, and we should
561 		 * resync our tables with the kernel. Check whether the
562 		 * interface index has changed. If so, it is most likely
563 		 * the interface has been unplumbed and replumbed,
564 		 * while we are yet to update our tables. Do it now.
565 		 */
566 		if (pi->pi_ifindex != ifindex) {
567 			if (pg_created)
568 				phyint_group_delete(pg);
569 			phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af)));
570 			goto retry;
571 		}
572 		assert(PHYINT_INSTANCE(pi, af) == NULL);
573 
574 		/*
575 		 * If the group name seen by the IPv4 and IPv6 instances
576 		 * are different, it is most likely the groupname has
577 		 * changed, while we are yet to update our tables. Do it now.
578 		 */
579 		if (strcmp(pi->pi_group->pg_name, pg_name) != 0) {
580 			if (pg_created)
581 				phyint_group_delete(pg);
582 			restore_phyint(pi);
583 			phyint_inst_delete(PHYINT_INSTANCE(pi,
584 			    AF_OTHER(af)));
585 			goto retry;
586 		}
587 	}
588 
589 	/*
590 	 * Create a new phyint instance, corresponding to the 'af'
591 	 * passed in.
592 	 */
593 	pii = phyint_inst_create(pi, af);
594 	if (pii == NULL) {
595 		logerr("phyint_inst_init_from_k: unable to create"
596 		    "phyint inst %s\n", pi->pi_name);
597 		if (pi_created) {
598 			/*
599 			 * Deleting the phyint will delete the phyint group
600 			 * if this is the last phyint in the group.
601 			 */
602 			phyint_delete(pi);
603 		}
604 		return (NULL);
605 	}
606 
607 	return (pii);
608 }
609 
610 /*
611  * Bind pii_probe_sock to the address associated with pii_probe_logint.
612  * This socket will be used for sending and receiving ICMP/ICMPv6 probes to
613  * targets. Do the common part in this function, and complete the
614  * initializations by calling the protocol specific functions
615  * phyint_inst_v{4,6}_sockinit() respectively.
616  *
617  * Return values: _B_TRUE/_B_FALSE for success or failure respectively.
618  */
619 boolean_t
620 phyint_inst_sockinit(struct phyint_instance *pii)
621 {
622 	boolean_t success;
623 	struct phyint_group *pg;
624 
625 	if (debug & D_PHYINT) {
626 		logdebug("phyint_inst_sockinit(%s %s)\n",
627 		    AF_STR(pii->pii_af), pii->pii_name);
628 	}
629 
630 	assert(pii->pii_probe_logint != NULL);
631 	assert(pii->pii_probe_logint->li_flags & IFF_UP);
632 	assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER);
633 	assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6);
634 
635 	/*
636 	 * If the socket is already bound, close pii_probe_sock
637 	 */
638 	if (pii->pii_probe_sock != -1)
639 		close_probe_socket(pii, _B_TRUE);
640 
641 	/*
642 	 * If the phyint is not part of a named group and track_all_phyints is
643 	 * false, simply return.
644 	 */
645 	pg = pii->pii_phyint->pi_group;
646 	if (pg == phyint_anongroup && !track_all_phyints) {
647 		if (debug & D_PHYINT)
648 			logdebug("phyint_inst_sockinit: no group\n");
649 		return (_B_FALSE);
650 	}
651 
652 	/*
653 	 * Initialize the socket by calling the protocol specific function.
654 	 * If it succeeds, add the socket to the poll list.
655 	 */
656 	if (pii->pii_af == AF_INET6)
657 		success = phyint_inst_v6_sockinit(pii);
658 	else
659 		success = phyint_inst_v4_sockinit(pii);
660 
661 	if (success && (poll_add(pii->pii_probe_sock) == 0))
662 		return (_B_TRUE);
663 
664 	/* Something failed, cleanup and return false */
665 	if (pii->pii_probe_sock != -1)
666 		close_probe_socket(pii, _B_FALSE);
667 
668 	return (_B_FALSE);
669 }
670 
671 /*
672  * IPv6 specific part in initializing the pii_probe_sock. This socket is
673  * used to send/receive ICMPv6 probe packets.
674  */
675 static boolean_t
676 phyint_inst_v6_sockinit(struct phyint_instance *pii)
677 {
678 	icmp6_filter_t filter;
679 	int hopcount = 1;
680 	int int_op;
681 	struct	sockaddr_in6	testaddr;
682 
683 	/*
684 	 * Open a raw socket with ICMPv6 protocol.
685 	 *
686 	 * Use IPV6_DONTFAILOVER_IF to make sure that probes go out
687 	 * on the specified phyint only, and are not subject to load
688 	 * balancing. Bind to the src address chosen will ensure that
689 	 * the responses are received only on the specified phyint.
690 	 *
691 	 * Set the hopcount to 1 so that probe packets are not routed.
692 	 * Disable multicast loopback. Set the receive filter to
693 	 * receive only ICMPv6 echo replies.
694 	 */
695 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6);
696 	if (pii->pii_probe_sock < 0) {
697 		logperror_pii(pii, "phyint_inst_v6_sockinit: socket");
698 		return (_B_FALSE);
699 }
700 
701 	bzero(&testaddr, sizeof (testaddr));
702 	testaddr.sin6_family = AF_INET6;
703 	testaddr.sin6_port = 0;
704 	testaddr.sin6_addr = pii->pii_probe_logint->li_addr;
705 
706 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
707 	    sizeof (testaddr)) < 0) {
708 		logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind");
709 		return (_B_FALSE);
710 	}
711 
712 	/*
713 	 * IPV6_DONTFAILOVER_IF option takes precedence over setting
714 	 * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again.
715 	 */
716 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF,
717 	    (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) {
718 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
719 		    " IPV6_DONTFAILOVER_IF");
720 		return (_B_FALSE);
721 	}
722 
723 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
724 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
725 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
726 		    " IPV6_UNICAST_HOPS");
727 		return (_B_FALSE);
728 	}
729 
730 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
731 	    (char *)&hopcount, sizeof (hopcount)) < 0) {
732 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
733 		    " IPV6_MULTICAST_HOPS");
734 		return (_B_FALSE);
735 	}
736 
737 	int_op = 0;	/* used to turn off option */
738 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
739 	    (char *)&int_op, sizeof (int_op)) < 0) {
740 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
741 		    " IPV6_MULTICAST_LOOP");
742 		return (_B_FALSE);
743 	}
744 
745 	/*
746 	 * Filter out so that we only receive ICMP echo replies
747 	 */
748 	ICMP6_FILTER_SETBLOCKALL(&filter);
749 	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter);
750 
751 	if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER,
752 	    (char *)&filter, sizeof (filter)) < 0) {
753 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
754 		    " ICMP6_FILTER");
755 		return (_B_FALSE);
756 	}
757 
758 	/* Enable receipt of ancillary data */
759 	int_op = 1;
760 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
761 	    (char *)&int_op, sizeof (int_op)) < 0) {
762 		logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt"
763 		    " IPV6_RECVHOPLIMIT");
764 		return (_B_FALSE);
765 	}
766 
767 	return (_B_TRUE);
768 }
769 
770 /*
771  * IPv4 specific part in initializing the pii_probe_sock. This socket is
772  * used to send/receive ICMPv4 probe packets.
773  */
774 static boolean_t
775 phyint_inst_v4_sockinit(struct phyint_instance *pii)
776 {
777 	struct sockaddr_in  testaddr;
778 	char	char_op;
779 	int	ttl = 1;
780 	char	char_ttl = 1;
781 
782 	/*
783 	 * Open a raw socket with ICMPv4 protocol.
784 	 *
785 	 * Use IP_DONTFAILOVER_IF to make sure that probes go out
786 	 * on the specified phyint only, and are not subject to load
787 	 * balancing. Bind to the src address chosen will ensure that
788 	 * the responses are received only on the specified phyint.
789 	 *
790 	 * Set the ttl to 1 so that probe packets are not routed.
791 	 * Disable multicast loopback.
792 	 */
793 	pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP);
794 	if (pii->pii_probe_sock < 0) {
795 		logperror_pii(pii, "phyint_inst_v4_sockinit: socket");
796 		return (_B_FALSE);
797 	}
798 
799 	bzero(&testaddr, sizeof (testaddr));
800 	testaddr.sin_family = AF_INET;
801 	testaddr.sin_port = 0;
802 	IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr,
803 	    &testaddr.sin_addr);
804 
805 	if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr,
806 	    sizeof (testaddr)) < 0) {
807 		logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind");
808 		return (_B_FALSE);
809 	}
810 
811 	/*
812 	 * IP_DONTFAILOVER_IF option takes precedence over setting
813 	 * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again.
814 	 */
815 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF,
816 	    (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) {
817 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
818 		    " IP_DONTFAILOVER");
819 		return (_B_FALSE);
820 	}
821 
822 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL,
823 	    (char *)&ttl, sizeof (ttl)) < 0) {
824 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
825 		    " IP_TTL");
826 		return (_B_FALSE);
827 	}
828 
829 	char_op = 0;	/* used to turn off option */
830 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP,
831 	    (char *)&char_op, sizeof (char_op)) == -1) {
832 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
833 		    " IP_MULTICAST_LOOP");
834 		return (_B_FALSE);
835 	}
836 
837 	if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL,
838 	    (char *)&char_ttl, sizeof (char_ttl)) == -1) {
839 		logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt"
840 		    " IP_MULTICAST_TTL");
841 		return (_B_FALSE);
842 	}
843 
844 	return (_B_TRUE);
845 }
846 
847 /*
848  * Remove the phyint group from the list of 'all phyint groups'
849  * and free it.
850  */
851 static void
852 phyint_group_delete(struct phyint_group *pg)
853 {
854 	/*
855 	 * The anonymous group always exists, even when empty.
856 	 */
857 	if (pg == phyint_anongroup)
858 		return;
859 
860 	if (debug & D_PHYINT)
861 		logdebug("phyint_group_delete('%s')\n", pg->pg_name);
862 
863 	/*
864 	 * The phyint group must be empty, and must not have any phyints.
865 	 * The phyint group must be in the list of all phyint groups
866 	 */
867 	assert(pg->pg_phyint == NULL);
868 	assert(phyint_groups == pg || pg->pg_prev != NULL);
869 
870 	if (pg->pg_prev != NULL)
871 		pg->pg_prev->pg_next = pg->pg_next;
872 	else
873 		phyint_groups = pg->pg_next;
874 
875 	if (pg->pg_next != NULL)
876 		pg->pg_next->pg_prev = pg->pg_prev;
877 
878 	pg->pg_next = NULL;
879 	pg->pg_prev = NULL;
880 
881 	phyint_grouplistsig++;
882 	(void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE);
883 
884 	free(pg);
885 }
886 
887 /*
888  * Extract information from the kernel about the desired phyint.
889  * Look only for properties of the phyint and not properties of logints.
890  * Take appropriate action on the changes.
891  * Return codes:
892  *	PI_OK
893  *		The phyint exists in the kernel and matches our knowledge
894  *		of the phyint.
895  *	PI_DELETED
896  *		The phyint has vanished in the kernel.
897  *	PI_IFINDEX_CHANGED
898  *		The phyint's interface index has changed.
899  *		Ask the caller to delete and recreate the phyint.
900  *	PI_IOCTL_ERROR
901  *		Some ioctl error. Don't change anything.
902  *	PI_GROUP_CHANGED
903  *		The phyint has changed group.
904  */
905 int
906 phyint_inst_update_from_k(struct phyint_instance *pii)
907 {
908 	struct lifreq lifr;
909 	int	ifsock;
910 	struct phyint *pi;
911 
912 	pi = pii->pii_phyint;
913 
914 	if (debug & D_PHYINT) {
915 		logdebug("phyint_inst_update_from_k(%s %s)\n",
916 		    AF_STR(pii->pii_af), pi->pi_name);
917 	}
918 
919 	/*
920 	 * Get the ifindex from the kernel, for comparison with the
921 	 * value in our tables.
922 	 */
923 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
924 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
925 
926 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
927 	if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) {
928 		if (errno == ENXIO) {
929 			return (PI_DELETED);
930 		} else {
931 			logperror_pii(pii, "phyint_inst_update_from_k:"
932 			    " ioctl (get lifindex)");
933 			return (PI_IOCTL_ERROR);
934 		}
935 	}
936 
937 	if (lifr.lifr_index != pi->pi_ifindex) {
938 		/*
939 		 * The index has changed. Most likely the interface has
940 		 * been unplumbed and replumbed. Ask the caller to take
941 		 * appropriate action.
942 		 */
943 		if (debug & D_PHYINT) {
944 			logdebug("phyint_inst_update_from_k:"
945 			    " old index %d new index %d\n",
946 			    pi->pi_ifindex, lifr.lifr_index);
947 		}
948 		return (PI_IFINDEX_CHANGED);
949 	}
950 
951 	/*
952 	 * Get the group name from the kernel, for comparison with
953 	 * the value in our tables.
954 	 */
955 	if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) {
956 		if (errno == ENXIO) {
957 			return (PI_DELETED);
958 		} else {
959 			logperror_pii(pii, "phyint_inst_update_from_k:"
960 			    " ioctl (get groupname)");
961 			return (PI_IOCTL_ERROR);
962 		}
963 	}
964 
965 	/*
966 	 * If the phyint has changed group i.e. if the phyint group name
967 	 * returned by the kernel is different, ask the caller to delete
968 	 * and recreate the phyint in the right group
969 	 */
970 	if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) {
971 		/* Groupname has changed */
972 		if (debug & D_PHYINT) {
973 			logdebug("phyint_inst_update_from_k:"
974 			    " groupname change\n");
975 		}
976 		return (PI_GROUP_CHANGED);
977 	}
978 
979 	/*
980 	 * Get the current phyint flags from the kernel, and determine what
981 	 * flags have changed by comparing against our tables.	Note that the
982 	 * IFF_INACTIVE processing in initifs() relies on this call to ensure
983 	 * that IFF_INACTIVE is really still set on the interface.
984 	 */
985 	if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) {
986 		if (errno == ENXIO) {
987 			return (PI_DELETED);
988 		} else {
989 			logperror_pii(pii, "phyint_inst_update_from_k: "
990 			    " ioctl (get flags)");
991 			return (PI_IOCTL_ERROR);
992 		}
993 	}
994 
995 	pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags);
996 	if (pi->pi_v4 != NULL)
997 		pi->pi_v4->pii_flags = pi->pi_flags;
998 	if (pi->pi_v6 != NULL)
999 		pi->pi_v6->pii_flags = pi->pi_flags;
1000 
1001 	if (pi->pi_flags & IFF_FAILED) {
1002 		/*
1003 		 * If we are in the running and full state, we have
1004 		 * completed failbacks successfully and we would have
1005 		 * expected IFF_FAILED to have been clear. That it is
1006 		 * set means there was a race condition. Some other
1007 		 * process turned on the IFF_FAILED flag. Since the
1008 		 * flag setting is not atomic, i.e. a get ioctl followed
1009 		 * by a set ioctl, and since there is no way to set an
1010 		 * individual flag bit, this could have occurred.
1011 		 */
1012 		if (pi->pi_state == PI_RUNNING && pi->pi_full)
1013 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
1014 	} else {
1015 		/*
1016 		 * If we are in the failed state, there was a race.
1017 		 * we have completed failover successfully because our
1018 		 * state is failed and empty. Some other process turned
1019 		 * off the IFF_FAILED flag. Same comment as above
1020 		 */
1021 		if (pi->pi_state == PI_FAILED && pi->pi_empty)
1022 			(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
1023 	}
1024 
1025 	/* No change in phyint status */
1026 	return (PI_OK);
1027 }
1028 
1029 /*
1030  * Delete the phyint. Remove it from the list of all phyints, and the
1031  * list of phyint group members. If the group becomes empty, delete the
1032  * group also.
1033  */
1034 static void
1035 phyint_delete(struct phyint *pi)
1036 {
1037 	struct phyint_group *pg = pi->pi_group;
1038 
1039 	if (debug & D_PHYINT)
1040 		logdebug("phyint_delete(%s)\n", pi->pi_name);
1041 
1042 	/* Both IPv4 and IPv6 phyint instances must have been deleted. */
1043 	assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL);
1044 
1045 	/*
1046 	 * The phyint must belong to a group.
1047 	 */
1048 	assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL);
1049 
1050 	/* The phyint must be in the list of all phyints */
1051 	assert(phyints == pi || pi->pi_prev != NULL);
1052 
1053 	/* Remove the phyint from the phyint group list */
1054 	pg->pg_sig++;
1055 	(void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE);
1056 
1057 	if (pi->pi_pgprev == NULL) {
1058 		/* Phyint is the 1st in the phyint group list */
1059 		pg->pg_phyint = pi->pi_pgnext;
1060 	} else {
1061 		pi->pi_pgprev->pi_pgnext = pi->pi_pgnext;
1062 	}
1063 	if (pi->pi_pgnext != NULL)
1064 		pi->pi_pgnext->pi_pgprev = pi->pi_pgprev;
1065 	pi->pi_pgnext = NULL;
1066 	pi->pi_pgprev = NULL;
1067 
1068 	/* Remove the phyint from the global list of phyints */
1069 	if (pi->pi_prev == NULL) {
1070 		/* Phyint is the 1st in the list */
1071 		phyints = pi->pi_next;
1072 	} else {
1073 		pi->pi_prev->pi_next = pi->pi_next;
1074 	}
1075 	if (pi->pi_next != NULL)
1076 		pi->pi_next->pi_prev = pi->pi_prev;
1077 	pi->pi_next = NULL;
1078 	pi->pi_prev = NULL;
1079 
1080 	free(pi);
1081 
1082 	/* Delete the phyint_group if the last phyint has been deleted */
1083 	if (pg->pg_phyint == NULL)
1084 		phyint_group_delete(pg);
1085 }
1086 
1087 /*
1088  * Delete (unlink and free), the phyint instance.
1089  */
1090 void
1091 phyint_inst_delete(struct phyint_instance *pii)
1092 {
1093 	struct phyint *pi = pii->pii_phyint;
1094 
1095 	assert(pi != NULL);
1096 
1097 	if (debug & D_PHYINT) {
1098 		logdebug("phyint_inst_delete(%s %s)\n",
1099 		    AF_STR(pii->pii_af), pi->pi_name);
1100 	}
1101 
1102 	/*
1103 	 * If the phyint instance has associated probe targets
1104 	 * delete all the targets
1105 	 */
1106 	while (pii->pii_targets != NULL)
1107 		target_delete(pii->pii_targets);
1108 
1109 	/*
1110 	 * Delete all the logints associated with this phyint
1111 	 * instance.
1112 	 */
1113 	while (pii->pii_logint != NULL)
1114 		logint_delete(pii->pii_logint);
1115 
1116 	/*
1117 	 * Close the socket used to send probes to targets from this phyint.
1118 	 */
1119 	if (pii->pii_probe_sock != -1)
1120 		close_probe_socket(pii, _B_TRUE);
1121 
1122 	/*
1123 	 * Phyint instance must be in the list of all phyint instances.
1124 	 * Remove phyint instance from the global list of phyint instances.
1125 	 */
1126 	assert(phyint_instances == pii || pii->pii_prev != NULL);
1127 	if (pii->pii_prev == NULL) {
1128 		/* Phyint is the 1st in the list */
1129 		phyint_instances = pii->pii_next;
1130 	} else {
1131 		pii->pii_prev->pii_next = pii->pii_next;
1132 	}
1133 	if (pii->pii_next != NULL)
1134 		pii->pii_next->pii_prev = pii->pii_prev;
1135 	pii->pii_next = NULL;
1136 	pii->pii_prev = NULL;
1137 
1138 	/*
1139 	 * Reset the phyint instance pointer in the phyint.
1140 	 * If this is the last phyint instance (being deleted) on this
1141 	 * phyint, then delete the phyint.
1142 	 */
1143 	if (pii->pii_af == AF_INET)
1144 		pi->pi_v4 = NULL;
1145 	else
1146 		pi->pi_v6 = NULL;
1147 
1148 	if (pi->pi_v4 == NULL && pi->pi_v6 == NULL)
1149 		phyint_delete(pi);
1150 
1151 	free(pii);
1152 }
1153 
1154 static void
1155 phyint_inst_print(struct phyint_instance *pii)
1156 {
1157 	struct logint *li;
1158 	struct target *tg;
1159 	char abuf[INET6_ADDRSTRLEN];
1160 	int most_recent;
1161 	int i;
1162 
1163 	if (pii->pii_phyint == NULL) {
1164 		logdebug("pii->pi_phyint NULL can't print\n");
1165 		return;
1166 	}
1167 
1168 	logdebug("\nPhyint instance: %s %s index %u state %x flags %llx	 "
1169 	    "sock %x in_use %d empty %x full %x\n",
1170 	    AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex,
1171 	    pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock,
1172 	    pii->pii_in_use, pii->pii_phyint->pi_empty,
1173 	    pii->pii_phyint->pi_full);
1174 
1175 	for (li = pii->pii_logint; li != NULL; li = li->li_next)
1176 		logint_print(li);
1177 
1178 	logdebug("\n");
1179 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next)
1180 		target_print(tg);
1181 
1182 	if (pii->pii_targets == NULL)
1183 		logdebug("pi_targets NULL\n");
1184 
1185 	if (pii->pii_target_next != NULL) {
1186 		logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af),
1187 		    pr_addr(pii->pii_af, pii->pii_target_next->tg_address,
1188 		    abuf, sizeof (abuf)));
1189 	} else {
1190 		logdebug("pi_target_next NULL\n");
1191 	}
1192 
1193 	if (pii->pii_rtt_target_next != NULL) {
1194 		logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af),
1195 		    pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address,
1196 		    abuf, sizeof (abuf)));
1197 	} else {
1198 		logdebug("pi_rtt_target_next NULL\n");
1199 	}
1200 
1201 	if (pii->pii_targets != NULL) {
1202 		most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
1203 
1204 		i = most_recent;
1205 		do {
1206 			if (pii->pii_probes[i].pr_target != NULL) {
1207 				logdebug("#%d target %s ", i,
1208 				    pr_addr(pii->pii_af,
1209 				    pii->pii_probes[i].pr_target->tg_address,
1210 				    abuf, sizeof (abuf)));
1211 			} else {
1212 				logdebug("#%d target NULL ", i);
1213 			}
1214 			logdebug("time_sent %u status %d time_ack/lost %u\n",
1215 			    pii->pii_probes[i].pr_time_sent,
1216 			    pii->pii_probes[i].pr_status,
1217 			    pii->pii_probes[i].pr_time_lost);
1218 			i = PROBE_INDEX_PREV(i);
1219 		} while (i != most_recent);
1220 	}
1221 }
1222 
1223 /*
1224  * Lookup a logint based on the logical interface name, on the given
1225  * phyint instance.
1226  */
1227 static struct logint *
1228 logint_lookup(struct phyint_instance *pii, char *name)
1229 {
1230 	struct logint *li;
1231 
1232 	if (debug & D_LOGINT) {
1233 		logdebug("logint_lookup(%s, %s)\n",
1234 		    AF_STR(pii->pii_af), name);
1235 	}
1236 
1237 	for (li = pii->pii_logint; li != NULL; li = li->li_next) {
1238 		if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0)
1239 			break;
1240 	}
1241 	return (li);
1242 }
1243 
1244 /*
1245  * Insert a logint at the head of the list of logints of the given
1246  * phyint instance
1247  */
1248 static void
1249 logint_insert(struct phyint_instance *pii, struct logint *li)
1250 {
1251 	li->li_next = pii->pii_logint;
1252 	li->li_prev = NULL;
1253 	if (pii->pii_logint != NULL)
1254 		pii->pii_logint->li_prev = li;
1255 	pii->pii_logint = li;
1256 	li->li_phyint_inst = pii;
1257 }
1258 
1259 /*
1260  * Create a new named logint, on the specified phyint instance.
1261  */
1262 static struct logint *
1263 logint_create(struct phyint_instance *pii, char *name)
1264 {
1265 	struct logint *li;
1266 
1267 	if (debug & D_LOGINT) {
1268 		logdebug("logint_create(%s %s %s)\n",
1269 		    AF_STR(pii->pii_af), pii->pii_name, name);
1270 	}
1271 
1272 	li = calloc(1, sizeof (struct logint));
1273 	if (li == NULL) {
1274 		logperror("logint_create: calloc");
1275 		return (NULL);
1276 	}
1277 
1278 	(void) strncpy(li->li_name, name, sizeof (li->li_name));
1279 	li->li_name[sizeof (li->li_name) - 1] = '\0';
1280 	logint_insert(pii, li);
1281 	return (li);
1282 }
1283 
1284 /*
1285  * Initialize the logint based on the data returned by the kernel.
1286  */
1287 void
1288 logint_init_from_k(struct phyint_instance *pii, char *li_name)
1289 {
1290 	int	ifsock;
1291 	uint64_t flags;
1292 	uint64_t saved_flags;
1293 	struct	logint	*li;
1294 	struct lifreq	lifr;
1295 	struct in6_addr	test_subnet;
1296 	struct in6_addr	test_subnet_mask;
1297 	struct in6_addr	testaddr;
1298 	int	test_subnet_len;
1299 	struct sockaddr_in6	*sin6;
1300 	struct sockaddr_in	*sin;
1301 	char abuf[INET6_ADDRSTRLEN];
1302 	boolean_t  ptp = _B_FALSE;
1303 	struct in6_addr tgaddr;
1304 
1305 	if (debug & D_LOGINT) {
1306 		logdebug("logint_init_from_k(%s %s)\n",
1307 		    AF_STR(pii->pii_af), li_name);
1308 	}
1309 
1310 	/* Get the socket for doing ioctls */
1311 	ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6;
1312 
1313 	/*
1314 	 * Get the flags from the kernel. Also serves as a check whether
1315 	 * the logical still exists. If it doesn't exist, no need to proceed
1316 	 * any further. li_in_use will make the caller clean up the logint
1317 	 */
1318 	(void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name));
1319 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
1320 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
1321 		/* Interface may have vanished */
1322 		if (errno != ENXIO) {
1323 			logperror_pii(pii, "logint_init_from_k: "
1324 			    "ioctl (get flags)");
1325 		}
1326 		return;
1327 	}
1328 
1329 	flags = lifr.lifr_flags;
1330 
1331 	/*
1332 	 * Verified the logint exists. Now lookup the logint in our tables.
1333 	 * If it does not exist, create a new logint.
1334 	 */
1335 	li = logint_lookup(pii, li_name);
1336 	if (li == NULL) {
1337 		li = logint_create(pii, li_name);
1338 		if (li == NULL) {
1339 			/*
1340 			 * Pretend the interface does not exist
1341 			 * in the kernel
1342 			 */
1343 			return;
1344 		}
1345 	}
1346 
1347 	/*
1348 	 * Update li->li_flags with the new flags, after saving the old
1349 	 * value. This is used later to check what flags has changed and
1350 	 * take any action
1351 	 */
1352 	saved_flags = li->li_flags;
1353 	li->li_flags = flags;
1354 
1355 	/*
1356 	 * Get the address, prefix, prefixlength and update the logint.
1357 	 * Check if anything has changed. If the logint used for the
1358 	 * test address has changed, take suitable action.
1359 	 */
1360 	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
1361 		/* Interface may have vanished */
1362 		if (errno != ENXIO) {
1363 			logperror_li(li, "logint_init_from_k: (get addr)");
1364 		}
1365 		goto error;
1366 	}
1367 
1368 	if (pii->pii_af == AF_INET) {
1369 		sin = (struct sockaddr_in *)&lifr.lifr_addr;
1370 		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr);
1371 	} else {
1372 		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1373 		testaddr = sin6->sin6_addr;
1374 	}
1375 
1376 	if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
1377 		ptp = _B_TRUE;
1378 		if (ioctl(ifsock, SIOCGLIFDSTADDR, (char *)&lifr) < 0) {
1379 			if (errno != ENXIO) {
1380 				logperror_li(li, "logint_init_from_k:"
1381 				    " (get dstaddr)");
1382 			}
1383 			goto error;
1384 		}
1385 		if (pii->pii_af == AF_INET) {
1386 			sin = (struct sockaddr_in *)&lifr.lifr_addr;
1387 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr);
1388 		} else {
1389 			sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
1390 			tgaddr = sin6->sin6_addr;
1391 		}
1392 	} else {
1393 		if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) {
1394 			/* Interface may have vanished */
1395 			if (errno != ENXIO) {
1396 				logperror_li(li, "logint_init_from_k:"
1397 				    " (get subnet)");
1398 			}
1399 			goto error;
1400 		}
1401 		if (lifr.lifr_subnet.ss_family == AF_INET6) {
1402 			sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet;
1403 			test_subnet = sin6->sin6_addr;
1404 			test_subnet_len = lifr.lifr_addrlen;
1405 		} else {
1406 			sin = (struct sockaddr_in *)&lifr.lifr_subnet;
1407 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet);
1408 			test_subnet_len = lifr.lifr_addrlen +
1409 			    (IPV6_ABITS - IP_ABITS);
1410 		}
1411 		(void) ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask);
1412 	}
1413 
1414 	/*
1415 	 * Also record the OINDEX for completeness. This information is
1416 	 * not used.
1417 	 */
1418 	if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) {
1419 		if (errno != ENXIO)  {
1420 			logperror_li(li, "logint_init_from_k:"
1421 			    " (get lifoindex)");
1422 		}
1423 		goto error;
1424 	}
1425 
1426 	/*
1427 	 * If this is the logint corresponding to the test address used for
1428 	 * sending probes, then if anything significant has changed we need to
1429 	 * determine the test address again.  We ignore changes to the
1430 	 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of
1431 	 * course.
1432 	 */
1433 	if (pii->pii_probe_logint == li) {
1434 		if (((li->li_flags ^ saved_flags) &
1435 		    ~(IFF_FAILED | IFF_RUNNING)) != 0 ||
1436 		    !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) ||
1437 		    (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet,
1438 		    &li->li_subnet)) ||
1439 		    (!ptp && test_subnet_len != li->li_subnet_len) ||
1440 		    (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) {
1441 			/*
1442 			 * Something significant that affects the testaddress
1443 			 * has changed. Redo the testaddress selection later on
1444 			 * in select_test_ifs(). For now do the cleanup and
1445 			 * set pii_probe_logint to NULL.
1446 			 */
1447 			if (pii->pii_probe_sock != -1)
1448 				close_probe_socket(pii, _B_TRUE);
1449 			pii->pii_probe_logint = NULL;
1450 		}
1451 	}
1452 
1453 
1454 	/* Update the logint with the values obtained from the kernel.	*/
1455 	li->li_addr = testaddr;
1456 	li->li_in_use = 1;
1457 	li->li_oifindex = lifr.lifr_index;
1458 	if (ptp) {
1459 		li->li_dstaddr = tgaddr;
1460 		li->li_subnet_len = (pii->pii_af == AF_INET) ?
1461 		    IP_ABITS : IPV6_ABITS;
1462 	} else {
1463 		li->li_subnet = test_subnet;
1464 		li->li_subnet_len = test_subnet_len;
1465 	}
1466 
1467 	if (debug & D_LOGINT)
1468 		logint_print(li);
1469 
1470 	return;
1471 
1472 error:
1473 	logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n",
1474 	    AF_STR(pii->pii_af), pii->pii_name, li->li_name,
1475 	    pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf)));
1476 	logint_delete(li);
1477 }
1478 
1479 /*
1480  * Delete (unlink and free) a logint.
1481  */
1482 void
1483 logint_delete(struct logint *li)
1484 {
1485 	struct phyint_instance *pii;
1486 
1487 	pii = li->li_phyint_inst;
1488 	assert(pii != NULL);
1489 
1490 	if (debug & D_LOGINT) {
1491 		int af;
1492 		char abuf[INET6_ADDRSTRLEN];
1493 
1494 		af = pii->pii_af;
1495 		logdebug("logint_delete(%s %s %s/%u)\n",
1496 		    AF_STR(af), li->li_name,
1497 		    pr_addr(af, li->li_addr, abuf, sizeof (abuf)),
1498 		    li->li_subnet_len);
1499 	}
1500 
1501 	/* logint must be in the list of logints */
1502 	assert(pii->pii_logint == li || li->li_prev != NULL);
1503 
1504 	/* Remove the logint from the list of logints  */
1505 	if (li->li_prev == NULL) {
1506 		/* logint is the 1st in the list */
1507 		pii->pii_logint = li->li_next;
1508 	} else {
1509 		li->li_prev->li_next = li->li_next;
1510 	}
1511 	if (li->li_next != NULL)
1512 		li->li_next->li_prev = li->li_prev;
1513 	li->li_next = NULL;
1514 	li->li_prev = NULL;
1515 
1516 	/*
1517 	 * If this logint is also being used for probing, then close the
1518 	 * associated socket, if it exists.
1519 	 */
1520 	if (pii->pii_probe_logint == li) {
1521 		if (pii->pii_probe_sock != -1)
1522 			close_probe_socket(pii, _B_TRUE);
1523 		pii->pii_probe_logint = NULL;
1524 	}
1525 
1526 	free(li);
1527 }
1528 
1529 static void
1530 logint_print(struct logint *li)
1531 {
1532 	char abuf[INET6_ADDRSTRLEN];
1533 	int af;
1534 
1535 	af = li->li_phyint_inst->pii_af;
1536 
1537 	logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name,
1538 	    pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len);
1539 
1540 	logdebug("\tFlags: %llx in_use %d oifindex %d\n",
1541 	    li->li_flags, li->li_in_use, li->li_oifindex);
1542 }
1543 
/*
 * Format `addr' as a presentation string in `abuf' (which is `len' bytes
 * long) and return `abuf'.  If `af' is AF_INET, `addr' is taken to be an
 * IPv4-mapped IPv6 address and only the embedded IPv4 address is printed;
 * for any other `af' the full IPv6 address is printed.
 *
 * If the address cannot be formatted (for instance because `abuf' is too
 * small), `abuf' is set to the empty string, so the result can always be
 * handed to a "%s" conversion.
 */
char *
pr_addr(int af, struct in6_addr addr, char *abuf, int len)
{
	struct in_addr	addr_v4;
	const char	*str;

	if (af == AF_INET) {
		/* The IPv4 address is the last 4 bytes of the mapped form. */
		(void) memcpy(&addr_v4, &addr.s6_addr[12], sizeof (addr_v4));
		str = inet_ntop(AF_INET, (void *)&addr_v4, abuf, len);
	} else {
		str = inet_ntop(AF_INET6, (void *)&addr, abuf, len);
	}

	/* inet_ntop() leaves the buffer unspecified when it fails. */
	if (str == NULL && len > 0)
		abuf[0] = '\0';

	return (abuf);
}
1557 
1558 /* Lookup target on its address */
1559 struct target *
1560 target_lookup(struct phyint_instance *pii, struct in6_addr addr)
1561 {
1562 	struct target *tg;
1563 
1564 	if (debug & D_TARGET) {
1565 		char abuf[INET6_ADDRSTRLEN];
1566 
1567 		logdebug("target_lookup(%s %s): addr %s\n",
1568 		    AF_STR(pii->pii_af), pii->pii_name,
1569 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1570 	}
1571 
1572 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1573 		if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr))
1574 			break;
1575 	}
1576 	return (tg);
1577 }
1578 
1579 /*
1580  * Find and return the next active target, for the next probe.
1581  * If no active targets are available, return NULL.
1582  */
1583 struct target *
1584 target_next(struct target *tg)
1585 {
1586 	struct	phyint_instance	*pii = tg->tg_phyint_inst;
1587 	struct	target	*marker = tg;
1588 	hrtime_t now;
1589 
1590 	now = gethrtime();
1591 
1592 	/*
1593 	 * Target must be in the list of targets for this phyint
1594 	 * instance.
1595 	 */
1596 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1597 	assert(pii->pii_targets != NULL);
1598 
1599 	/* Return the next active target */
1600 	do {
1601 		/*
1602 		 * Go to the next target. If we hit the end,
1603 		 * reset the ptr to the head
1604 		 */
1605 		tg = tg->tg_next;
1606 		if (tg == NULL)
1607 			tg = pii->pii_targets;
1608 
1609 		assert(TG_STATUS_VALID(tg->tg_status));
1610 
1611 		switch (tg->tg_status) {
1612 		case TG_ACTIVE:
1613 			return (tg);
1614 
1615 		case TG_UNUSED:
1616 			assert(pii->pii_targets_are_routers);
1617 			if (pii->pii_ntargets < MAX_PROBE_TARGETS) {
1618 				/*
1619 				 * Bubble up the unused target to active
1620 				 */
1621 				tg->tg_status = TG_ACTIVE;
1622 				pii->pii_ntargets++;
1623 				return (tg);
1624 			}
1625 			break;
1626 
1627 		case TG_SLOW:
1628 			assert(pii->pii_targets_are_routers);
1629 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1630 				/*
1631 				 * Bubble up the slow target to unused
1632 				 */
1633 				tg->tg_status = TG_UNUSED;
1634 			}
1635 			break;
1636 
1637 		case TG_DEAD:
1638 			assert(pii->pii_targets_are_routers);
1639 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1640 				/*
1641 				 * Bubble up the dead target to slow
1642 				 */
1643 				tg->tg_status = TG_SLOW;
1644 				tg->tg_latime = now;
1645 			}
1646 			break;
1647 		}
1648 
1649 	} while (tg != marker);
1650 
1651 	return (NULL);
1652 }
1653 
1654 /*
1655  * Select the best available target, that is not already TG_ACTIVE,
1656  * for the caller. The caller will determine whether it wants to
1657  * make the returned target TG_ACTIVE.
1658  * The selection order is as follows.
1659  * 1. pick a TG_UNSED target, if it exists.
1660  * 2. else pick a TG_SLOW target that has recovered, if it exists
1661  * 3. else pick any TG_SLOW target, if it exists
1662  * 4. else pick a TG_DEAD target that has recovered, if it exists
1663  * 5. else pick any TG_DEAD target, if it exists
1664  * 6. else return null
1665  */
1666 static struct target *
1667 target_select_best(struct phyint_instance *pii)
1668 {
1669 	struct target *tg;
1670 	struct target *slow = NULL;
1671 	struct target *dead = NULL;
1672 	struct target *slow_recovered = NULL;
1673 	struct target *dead_recovered = NULL;
1674 	hrtime_t now;
1675 
1676 	now = gethrtime();
1677 
1678 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1679 		assert(TG_STATUS_VALID(tg->tg_status));
1680 
1681 		switch (tg->tg_status) {
1682 		case TG_UNUSED:
1683 			return (tg);
1684 
1685 		case TG_SLOW:
1686 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1687 				slow_recovered = tg;
1688 				/*
1689 				 * Promote the slow_recoverd to unused
1690 				 */
1691 				tg->tg_status = TG_UNUSED;
1692 			} else {
1693 				slow = tg;
1694 			}
1695 			break;
1696 
1697 		case TG_DEAD:
1698 			if (tg->tg_latime + MIN_RECOVERY_TIME < now) {
1699 				dead_recovered = tg;
1700 				/*
1701 				 * Promote the dead_recoverd to slow
1702 				 */
1703 				tg->tg_status = TG_SLOW;
1704 				tg->tg_latime = now;
1705 			} else {
1706 				dead = tg;
1707 			}
1708 			break;
1709 
1710 		default:
1711 			break;
1712 		}
1713 	}
1714 
1715 	if (slow_recovered != NULL)
1716 		return (slow_recovered);
1717 	else if (slow != NULL)
1718 		return (slow);
1719 	else if (dead_recovered != NULL)
1720 		return (dead_recovered);
1721 	else
1722 		return (dead);
1723 }
1724 
1725 /*
1726  * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
1727  * that are active, pick the next best below.
1728  */
1729 static void
1730 target_activate_all(struct phyint_instance *pii)
1731 {
1732 	struct target *tg;
1733 
1734 	assert(pii->pii_ntargets == 0);
1735 	assert(pii->pii_target_next == NULL);
1736 	assert(pii->pii_rtt_target_next == NULL);
1737 	assert(pii->pii_targets_are_routers);
1738 
1739 	while (pii->pii_ntargets < MIN_PROBE_TARGETS) {
1740 		tg = target_select_best(pii);
1741 		if (tg == NULL) {
1742 			/* We are out of targets */
1743 			return;
1744 		}
1745 
1746 		assert(TG_STATUS_VALID(tg->tg_status));
1747 		assert(tg->tg_status != TG_ACTIVE);
1748 		tg->tg_status = TG_ACTIVE;
1749 		pii->pii_ntargets++;
1750 		if (pii->pii_target_next == NULL) {
1751 			pii->pii_target_next = tg;
1752 			pii->pii_rtt_target_next = tg;
1753 		}
1754 	}
1755 }
1756 
1757 static struct target *
1758 target_first(struct phyint_instance *pii)
1759 {
1760 	struct target *tg;
1761 
1762 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
1763 		assert(TG_STATUS_VALID(tg->tg_status));
1764 		if (tg->tg_status == TG_ACTIVE)
1765 			break;
1766 	}
1767 
1768 	return (tg);
1769 }
1770 
1771 /*
1772  * Create a default target entry.
1773  */
1774 void
1775 target_create(struct phyint_instance *pii, struct in6_addr addr,
1776     boolean_t is_router)
1777 {
1778 	struct target *tg;
1779 	struct phyint *pi;
1780 	struct logint *li;
1781 
1782 	if (debug & D_TARGET) {
1783 		char abuf[INET6_ADDRSTRLEN];
1784 
1785 		logdebug("target_create(%s %s, %s)\n",
1786 		    AF_STR(pii->pii_af), pii->pii_name,
1787 		    pr_addr(pii->pii_af, addr, abuf, sizeof (abuf)));
1788 	}
1789 
1790 	/*
1791 	 * If the test address is not yet initialized, do not add
1792 	 * any target, since we cannot determine whether the target
1793 	 * belongs to the same subnet as the test address.
1794 	 */
1795 	li = pii->pii_probe_logint;
1796 	if (li == NULL)
1797 		return;
1798 
1799 	/*
1800 	 * If there are multiple subnets associated with an interface, then
1801 	 * add the target to this phyint instance, only if it belongs to the
1802 	 * same subnet as the test address. The reason is that interface
1803 	 * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER
1804 	 * addresses, will disappear after failover, and the targets will not
1805 	 * be reachable from this interface.
1806 	 */
1807 	if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len))
1808 		return;
1809 
1810 	if (pii->pii_targets != NULL) {
1811 		assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
1812 		if (is_router) {
1813 			if (!pii->pii_targets_are_routers) {
1814 				/*
1815 				 * Prefer router over hosts. Using hosts is a
1816 				 * fallback mechanism, hence delete all host
1817 				 * targets.
1818 				 */
1819 				while (pii->pii_targets != NULL)
1820 					target_delete(pii->pii_targets);
1821 			}
1822 		} else {
1823 			/*
1824 			 * Routers take precedence over hosts. If this
1825 			 * is a router list and we are trying to add a
1826 			 * host, just return. If this is a host list
1827 			 * and if we have sufficient targets, just return
1828 			 */
1829 			if (pii->pii_targets_are_routers ||
1830 			    pii->pii_ntargets == MAX_PROBE_TARGETS)
1831 				return;
1832 		}
1833 	}
1834 
1835 	tg = calloc(1, sizeof (struct target));
1836 	if (tg == NULL) {
1837 		logperror("target_create: calloc");
1838 		return;
1839 	}
1840 
1841 	tg->tg_phyint_inst = pii;
1842 	tg->tg_address = addr;
1843 	tg->tg_in_use = 1;
1844 	tg->tg_rtt_sa = -1;
1845 	tg->tg_num_deferred = 0;
1846 
1847 	/*
1848 	 * If this is the first target, set 'pii_targets_are_routers'
1849 	 * The list of targets is either a list of hosts or list or
1850 	 * routers, but not a mix.
1851 	 */
1852 	if (pii->pii_targets == NULL) {
1853 		assert(pii->pii_ntargets == 0);
1854 		assert(pii->pii_target_next == NULL);
1855 		assert(pii->pii_rtt_target_next == NULL);
1856 		pii->pii_targets_are_routers = is_router ? 1 : 0;
1857 	}
1858 
1859 	if (pii->pii_ntargets == MAX_PROBE_TARGETS) {
1860 		assert(pii->pii_targets_are_routers);
1861 		assert(pii->pii_target_next != NULL);
1862 		assert(pii->pii_rtt_target_next != NULL);
1863 		tg->tg_status = TG_UNUSED;
1864 	} else {
1865 		if (pii->pii_ntargets == 0) {
1866 			assert(pii->pii_target_next == NULL);
1867 			pii->pii_target_next = tg;
1868 			pii->pii_rtt_target_next = tg;
1869 		}
1870 		pii->pii_ntargets++;
1871 		tg->tg_status = TG_ACTIVE;
1872 	}
1873 
1874 	target_insert(pii, tg);
1875 
1876 	/*
1877 	 * Change state to PI_RUNNING if this phyint instance is capable of
1878 	 * sending and receiving probes -- that is, if we know of at least 1
1879 	 * target, and this phyint instance is probe-capable.  For more
1880 	 * details, see the phyint state diagram in mpd_probe.c.
1881 	 */
1882 	pi = pii->pii_phyint;
1883 	if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) {
1884 		if (pi->pi_flags & IFF_FAILED)
1885 			phyint_chstate(pi, PI_FAILED);
1886 		else
1887 			phyint_chstate(pi, PI_RUNNING);
1888 	}
1889 }
1890 
1891 /*
1892  * Add the target address named by `addr' to phyint instance `pii' if it does
1893  * not already exist.  If the target is a router, `is_router' should be set to
1894  * B_TRUE.
1895  */
1896 void
1897 target_add(struct phyint_instance *pii, struct in6_addr addr,
1898     boolean_t is_router)
1899 {
1900 	struct target *tg;
1901 
1902 	if (pii == NULL)
1903 		return;
1904 
1905 	tg = target_lookup(pii, addr);
1906 
1907 	/*
1908 	 * If the target does not exist, create it; target_create() will set
1909 	 * tg_in_use to true.  If it exists already, and it is a router
1910 	 * target, set tg_in_use to to true, so that init_router_targets()
1911 	 * won't delete it
1912 	 */
1913 	if (tg == NULL)
1914 		target_create(pii, addr, is_router);
1915 	else if (is_router)
1916 		tg->tg_in_use = 1;
1917 }
1918 
1919 /*
1920  * Insert target at head of linked list of targets for the associated
1921  * phyint instance
1922  */
1923 static void
1924 target_insert(struct phyint_instance *pii, struct target *tg)
1925 {
1926 	tg->tg_next = pii->pii_targets;
1927 	tg->tg_prev = NULL;
1928 	if (tg->tg_next != NULL)
1929 		tg->tg_next->tg_prev = tg;
1930 	pii->pii_targets = tg;
1931 }
1932 
1933 /*
1934  * Delete a target (unlink and free).
1935  */
1936 void
1937 target_delete(struct target *tg)
1938 {
1939 	int af;
1940 	struct phyint_instance	*pii;
1941 	struct phyint_instance	*pii_other;
1942 
1943 	pii = tg->tg_phyint_inst;
1944 	af = pii->pii_af;
1945 
1946 	if (debug & D_TARGET) {
1947 		char abuf[INET6_ADDRSTRLEN];
1948 
1949 		logdebug("target_delete(%s %s, %s)\n",
1950 		    AF_STR(af), pii->pii_name,
1951 		    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)));
1952 	}
1953 
1954 	/*
1955 	 * Target must be in the list of targets for this phyint
1956 	 * instance.
1957 	 */
1958 	assert(pii->pii_targets == tg || tg->tg_prev != NULL);
1959 
1960 	/*
1961 	 * Reset all references to 'tg' in the probe information
1962 	 * for this phyint.
1963 	 */
1964 	reset_pii_probes(pii, tg);
1965 
1966 	/*
1967 	 * Remove this target from the list of targets of this
1968 	 * phyint instance.
1969 	 */
1970 	if (tg->tg_prev == NULL) {
1971 		pii->pii_targets = tg->tg_next;
1972 	} else {
1973 		tg->tg_prev->tg_next = tg->tg_next;
1974 	}
1975 
1976 	if (tg->tg_next != NULL)
1977 		tg->tg_next->tg_prev = tg->tg_prev;
1978 
1979 	tg->tg_next = NULL;
1980 	tg->tg_prev = NULL;
1981 
1982 	if (tg->tg_status == TG_ACTIVE)
1983 		pii->pii_ntargets--;
1984 
1985 	/*
1986 	 * Adjust the next target to probe, if it points to
1987 	 * to the currently deleted target.
1988 	 */
1989 	if (pii->pii_target_next == tg)
1990 		pii->pii_target_next = target_first(pii);
1991 
1992 	if (pii->pii_rtt_target_next == tg)
1993 		pii->pii_rtt_target_next = target_first(pii);
1994 
1995 	free(tg);
1996 
1997 	/*
1998 	 * The number of active targets pii_ntargets == 0 iff
1999 	 * the next active target pii->pii_target_next == NULL
2000 	 */
2001 	if (pii->pii_ntargets != 0) {
2002 		assert(pii->pii_target_next != NULL);
2003 		assert(pii->pii_rtt_target_next != NULL);
2004 		assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2005 		assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE);
2006 		return;
2007 	}
2008 
2009 	/* At this point, we don't have any active targets. */
2010 	assert(pii->pii_target_next == NULL);
2011 	assert(pii->pii_rtt_target_next == NULL);
2012 
2013 	if (pii->pii_targets_are_routers) {
2014 		/*
2015 		 * Activate any TG_SLOW or TG_DEAD router targets,
2016 		 * since we don't have any other targets
2017 		 */
2018 		target_activate_all(pii);
2019 
2020 		if (pii->pii_ntargets != 0) {
2021 			assert(pii->pii_target_next != NULL);
2022 			assert(pii->pii_rtt_target_next != NULL);
2023 			assert(pii->pii_target_next->tg_status == TG_ACTIVE);
2024 			assert(pii->pii_rtt_target_next->tg_status ==
2025 			    TG_ACTIVE);
2026 			return;
2027 		}
2028 	}
2029 
2030 	/*
2031 	 * If we still don't have any active targets, the list must
2032 	 * must be really empty. There aren't even TG_SLOW or TG_DEAD
2033 	 * targets. Zero out the probe stats since it will not be
2034 	 * relevant any longer.
2035 	 */
2036 	assert(pii->pii_targets == NULL);
2037 	clear_pii_probe_stats(pii);
2038 	pii_other = phyint_inst_other(pii);
2039 
2040 	/*
2041 	 * If there are no targets on both instances and the interface is
2042 	 * online, go back to PI_NOTARGETS state, since we cannot probe this
2043 	 * phyint any more.  For more details, please see phyint state
2044 	 * diagram in mpd_probe.c.
2045 	 */
2046 	if (!PROBE_CAPABLE(pii_other) &&
2047 	    pii->pii_phyint->pi_state != PI_OFFLINE)
2048 		phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
2049 }
2050 
2051 /*
2052  * Flush the target list of every phyint in the group, if the list
2053  * is a host target list. This is called if group failure is suspected.
2054  * If all targets have failed, multicast will subsequently discover new
2055  * targets. Else it is a group failure.
2056  * Note: This function is a no-op if the list is a router target list.
2057  */
2058 static void
2059 target_flush_hosts(struct phyint_group *pg)
2060 {
2061 	struct phyint *pi;
2062 	struct phyint_instance *pii;
2063 
2064 	if (debug & D_TARGET)
2065 		logdebug("target_flush_hosts(%s)\n", pg->pg_name);
2066 
2067 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
2068 		pii = pi->pi_v4;
2069 		if (pii != NULL && !pii->pii_targets_are_routers) {
2070 			/*
2071 			 * Delete all the targets. When the list becomes
2072 			 * empty, target_delete() will set pii->pii_targets
2073 			 * to NULL.
2074 			 */
2075 			while (pii->pii_targets != NULL)
2076 				target_delete(pii->pii_targets);
2077 		}
2078 		pii = pi->pi_v6;
2079 		if (pii != NULL && !pii->pii_targets_are_routers) {
2080 			/*
2081 			 * Delete all the targets. When the list becomes
2082 			 * empty, target_delete() will set pii->pii_targets
2083 			 * to NULL.
2084 			 */
2085 			while (pii->pii_targets != NULL)
2086 				target_delete(pii->pii_targets);
2087 		}
2088 	}
2089 }
2090 
2091 /*
2092  * Reset all references to 'target' in the probe info, as this target is
2093  * being deleted. The pr_target field is guaranteed to be non-null if
2094  * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that
2095  * pr_target will not be accessed unconditionally.
2096  */
2097 static void
2098 reset_pii_probes(struct phyint_instance *pii, struct target *tg)
2099 {
2100 	int i;
2101 
2102 	for (i = 0; i < PROBE_STATS_COUNT; i++) {
2103 		if (pii->pii_probes[i].pr_target == tg) {
2104 			pii->pii_probes[i].pr_target = NULL;
2105 			if (pii->pii_probes[i].pr_status == PR_UNACKED)
2106 				pii->pii_probes[i].pr_status = PR_LOST;
2107 		}
2108 	}
2109 
2110 }
2111 
2112 /*
2113  * Clear the probe statistics array.
2114  */
2115 void
2116 clear_pii_probe_stats(struct phyint_instance *pii)
2117 {
2118 	bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT);
2119 	/* Reset the next probe index in the probe stats array */
2120 	pii->pii_probe_next = 0;
2121 }
2122 
2123 static void
2124 target_print(struct target *tg)
2125 {
2126 	char	abuf[INET6_ADDRSTRLEN];
2127 	char	buf[128];
2128 	char	buf2[128];
2129 	int	af;
2130 	int	i;
2131 
2132 	af = tg->tg_phyint_inst->pii_af;
2133 
2134 	logdebug("Target on %s %s addr %s\n"
2135 	    "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n",
2136 	    AF_STR(af), tg->tg_phyint_inst->pii_name,
2137 	    pr_addr(af, tg->tg_address, abuf, sizeof (abuf)),
2138 	    tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd,
2139 	    tg->tg_crtt, tg->tg_in_use);
2140 
2141 	buf[0] = '\0';
2142 	for (i = 0; i < tg->tg_num_deferred; i++) {
2143 		(void) snprintf(buf2, sizeof (buf2), " %dms",
2144 		    tg->tg_deferred[i]);
2145 		(void) strlcat(buf, buf2, sizeof (buf));
2146 	}
2147 	logdebug("deferred rtts:%s\n", buf);
2148 }
2149 
2150 void
2151 phyint_inst_print_all(void)
2152 {
2153 	struct phyint_instance *pii;
2154 
2155 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
2156 		phyint_inst_print(pii);
2157 	}
2158 }
2159 
/*
 * Convert a prefix length into the corresponding IPv6 netmask: on return
 * the `masklen' leftmost bits of `*bitmask' are one and all others zero.
 * `masklen' must not exceed IPV6_ABITS.
 */
static void
ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask)
{
	uint_t	bitsleft = masklen;
	int	idx = 0;

	assert(masklen <= IPV6_ABITS);
	bzero((char *)bitmask, sizeof (*bitmask));

	/* Fill whole bytes while more than one byte's worth remains */
	while (bitsleft > 8) {
		bitmask->s6_addr[idx++] = 0xff;
		bitsleft -= 8;
	}

	/*
	 * Between 0 and 8 bits remain for the final byte.  With 0 bits the
	 * shifted value is truncated to zero when stored in the byte.
	 */
	bitmask->s6_addr[idx] = 0xff << (8 - bitsleft);
}
2178 
/*
 * Report whether the first `prefix_len' bits of `p1' and `p2' are the
 * same.  Returns _B_FALSE when they differ, and also when `prefix_len' is
 * negative or larger than IPV6_ABITS.
 */
static boolean_t
prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len)
{
	uchar_t	lastmask;
	int	bitsleft = prefix_len;
	int	idx = 0;

	if (bitsleft < 0 || bitsleft > IPV6_ABITS)
		return (_B_FALSE);

	/* Bytes that lie entirely inside the prefix must match exactly */
	while (bitsleft > 8) {
		if (p1.s6_addr[idx] != p2.s6_addr[idx])
			return (_B_FALSE);
		idx++;
		bitsleft -= 8;
	}

	/*
	 * Only the `bitsleft' leftmost bits of the final byte count; any
	 * difference that survives the mask means the prefixes differ.
	 */
	lastmask = 0xff << (8 - bitsleft);
	if (((p1.s6_addr[idx] ^ p2.s6_addr[idx]) & lastmask) != 0)
		return (_B_FALSE);

	return (_B_TRUE);
}
2203 
2204 /*
2205  * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both
2206  * IPv4 and IPv6 put together. The phyint with the least such number
2207  * will be used as the failover destination, if no standby interface is
2208  * available
2209  */
2210 int
2211 logint_upcount(struct phyint *pi)
2212 {
2213 	struct	logint	*li;
2214 	struct	phyint_instance *pii;
2215 	int count = 0;
2216 
2217 	pii = pi->pi_v4;
2218 	if (pii != NULL) {
2219 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
2220 			if ((li->li_flags &
2221 			    (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
2222 				count++;
2223 			}
2224 		}
2225 	}
2226 
2227 	pii = pi->pi_v6;
2228 	if (pii != NULL) {
2229 		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
2230 			if ((li->li_flags &
2231 			    (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) {
2232 				count++;
2233 			}
2234 		}
2235 	}
2236 
2237 	return (count);
2238 }
2239 
2240 /*
2241  * Get the phyint instance with the other (IPv4 / IPv6) protocol
2242  */
2243 struct phyint_instance *
2244 phyint_inst_other(struct phyint_instance *pii)
2245 {
2246 	if (pii->pii_af == AF_INET)
2247 		return (pii->pii_phyint->pi_v6);
2248 	else
2249 		return (pii->pii_phyint->pi_v4);
2250 }
2251 
2252 /*
2253  * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'.
2254  * Before sending the event, it prepends the current version of the IPMP
2255  * sysevent API.  Returns 0 on success, -1 on failure (in either case,
2256  * `nvl' is freed).
2257  */
2258 static int
2259 post_event(const char *subclass, nvlist_t *nvl)
2260 {
2261 	sysevent_id_t eid;
2262 
2263 	/*
2264 	 * Since sysevents don't work yet in non-global zones, there cannot
2265 	 * possibly be any consumers yet, so don't bother trying to generate
2266 	 * them.  (Otherwise, we'll spew warnings.)
2267 	 */
2268 	if (getzoneid() != GLOBAL_ZONEID) {
2269 		nvlist_free(nvl);
2270 		return (0);
2271 	}
2272 
2273 	errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION,
2274 	    IPMP_EVENT_CUR_VERSION);
2275 	if (errno != 0) {
2276 		logerr("cannot create `%s' event: %s", subclass,
2277 		    strerror(errno));
2278 		goto failed;
2279 	}
2280 
2281 	if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR,
2282 	    "in.mpathd", nvl, &eid) == -1) {
2283 		logerr("cannot send `%s' event: %s\n", subclass,
2284 		    strerror(errno));
2285 		goto failed;
2286 	}
2287 
2288 	nvlist_free(nvl);
2289 	return (0);
2290 failed:
2291 	nvlist_free(nvl);
2292 	return (-1);
2293 }
2294 
2295 /*
2296  * Return the external IPMP state associated with phyint `pi'.
2297  */
2298 static ipmp_if_state_t
2299 ifstate(struct phyint *pi)
2300 {
2301 	switch (pi->pi_state) {
2302 	case PI_NOTARGETS:
2303 		return (IPMP_IF_UNKNOWN);
2304 
2305 	case PI_OFFLINE:
2306 		return (IPMP_IF_OFFLINE);
2307 
2308 	case PI_FAILED:
2309 		return (IPMP_IF_FAILED);
2310 
2311 	case PI_RUNNING:
2312 		return (IPMP_IF_OK);
2313 	}
2314 
2315 	logerr("ifstate: unknown state %d; aborting\n", pi->pi_state);
2316 	abort();
2317 	/* NOTREACHED */
2318 }
2319 
2320 /*
2321  * Return the external IPMP interface type associated with phyint `pi'.
2322  */
2323 static ipmp_if_type_t
2324 iftype(struct phyint *pi)
2325 {
2326 	if (pi->pi_flags & IFF_STANDBY)
2327 		return (IPMP_IF_STANDBY);
2328 	else
2329 		return (IPMP_IF_NORMAL);
2330 }
2331 
2332 /*
2333  * Return the external IPMP group state associated with phyint group `pg'.
2334  */
2335 static ipmp_group_state_t
2336 groupstate(struct phyint_group *pg)
2337 {
2338 	return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK);
2339 }
2340 
2341 /*
2342  * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'.
2343  * Returns 0 on success, -1 on failure.
2344  */
2345 static int
2346 phyint_group_state_event(struct phyint_group *pg)
2347 {
2348 	nvlist_t	*nvl;
2349 
2350 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2351 	if (errno != 0) {
2352 		logperror("cannot create `group state change' event");
2353 		return (-1);
2354 	}
2355 
2356 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2357 	if (errno != 0)
2358 		goto failed;
2359 
2360 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2361 	if (errno != 0)
2362 		goto failed;
2363 
2364 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg));
2365 	if (errno != 0)
2366 		goto failed;
2367 
2368 	return (post_event(ESC_IPMP_GROUP_STATE, nvl));
2369 failed:
2370 	logperror("cannot create `group state change' event");
2371 	nvlist_free(nvl);
2372 	return (-1);
2373 }
2374 
2375 /*
2376  * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group
2377  * `pg'.  Returns 0 on success, -1 on failure.
2378  */
2379 static int
2380 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op)
2381 {
2382 	nvlist_t *nvl;
2383 
2384 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2385 	if (errno != 0) {
2386 		logperror("cannot create `group change' event");
2387 		return (-1);
2388 	}
2389 
2390 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2391 	if (errno != 0)
2392 		goto failed;
2393 
2394 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2395 	if (errno != 0)
2396 		goto failed;
2397 
2398 	errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE,
2399 	    phyint_grouplistsig);
2400 	if (errno != 0)
2401 		goto failed;
2402 
2403 	errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op);
2404 	if (errno != 0)
2405 		goto failed;
2406 
2407 	return (post_event(ESC_IPMP_GROUP_CHANGE, nvl));
2408 failed:
2409 	logperror("cannot create `group change' event");
2410 	nvlist_free(nvl);
2411 	return (-1);
2412 }
2413 
2414 /*
2415  * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in
2416  * group `pg'.	Returns 0 on success, -1 on failure.
2417  */
2418 static int
2419 phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
2420     ipmp_if_op_t op)
2421 {
2422 	nvlist_t *nvl;
2423 
2424 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2425 	if (errno != 0) {
2426 		logperror("cannot create `group member change' event");
2427 		return (-1);
2428 	}
2429 
2430 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2431 	if (errno != 0)
2432 		goto failed;
2433 
2434 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2435 	if (errno != 0)
2436 		goto failed;
2437 
2438 	errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op);
2439 	if (errno != 0)
2440 		goto failed;
2441 
2442 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
2443 	if (errno != 0)
2444 		goto failed;
2445 
2446 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
2447 	if (errno != 0)
2448 		goto failed;
2449 
2450 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
2451 	if (errno != 0)
2452 		goto failed;
2453 
2454 	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
2455 failed:
2456 	logperror("cannot create `group member change' event");
2457 	nvlist_free(nvl);
2458 	return (-1);
2459 
2460 }
2461 
2462 /*
2463  * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
2464  * Returns 0 on success, -1 on failure.
2465  */
2466 static int
2467 phyint_state_event(struct phyint_group *pg, struct phyint *pi)
2468 {
2469 	nvlist_t *nvl;
2470 
2471 	errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
2472 	if (errno != 0) {
2473 		logperror("cannot create `interface change' event");
2474 		return (-1);
2475 	}
2476 
2477 	errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name);
2478 	if (errno != 0)
2479 		goto failed;
2480 
2481 	errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig);
2482 	if (errno != 0)
2483 		goto failed;
2484 
2485 	errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name);
2486 	if (errno != 0)
2487 		goto failed;
2488 
2489 	errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi));
2490 	if (errno != 0)
2491 		goto failed;
2492 
2493 	errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi));
2494 	if (errno != 0)
2495 		goto failed;
2496 
2497 	return (post_event(ESC_IPMP_IF_CHANGE, nvl));
2498 failed:
2499 	logperror("cannot create `interface change' event");
2500 	nvlist_free(nvl);
2501 	return (-1);
2502 
2503 }
2504 
/*
 * Generate a fresh signature.  A signature is conceptually divided into
 * two pieces: a random 16-bit "generation number" in the top bits and a
 * 48-bit monotonically increasing integer below it.  The generation
 * number protects against stale updates to entities (e.g., IPMP groups)
 * that have been deleted and since recreated.  The value returned here
 * has the integer part set to 1.
 */
static uint64_t
gensig(void)
{
	static int need_seed = 1;
	uint64_t generation;

	/* Seed the generator from the clock on the first call only */
	if (need_seed) {
		srand48((long)gethrtime());
		need_seed = 0;
	}

	/* The shift leaves only the low 16 bits of the random value */
	generation = (uint64_t)lrand48();
	return ((generation << 48) | 1);
}
2524 
2525 /*
2526  * Store the information associated with group `grname' into a dynamically
2527  * allocated structure pointed to by `*grinfopp'.  Returns an IPMP error code.
2528  */
2529 unsigned int
2530 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp)
2531 {
2532 	struct phyint_group	*pg;
2533 	struct phyint		*pi;
2534 	char			(*ifs)[LIFNAMSIZ];
2535 	unsigned int		nif, i;
2536 
2537 	pg = phyint_group_lookup(grname);
2538 	if (pg == NULL)
2539 		return (IPMP_EUNKGROUP);
2540 
2541 	/*
2542 	 * Tally up the number of interfaces, allocate an array to hold them,
2543 	 * and insert their names into the array.
2544 	 */
2545 	for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext)
2546 		nif++;
2547 
2548 	ifs = alloca(nif * sizeof (*ifs));
2549 	for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) {
2550 		assert(i < nif);
2551 		(void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ);
2552 	}
2553 	assert(i == nif);
2554 
2555 	*grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig,
2556 	    groupstate(pg), nif, ifs);
2557 	return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2558 }
2559 
2560 /*
2561  * Store the information associated with interface `ifname' into a dynamically
2562  * allocated structure pointed to by `*ifinfopp'.  Returns an IPMP error code.
2563  */
2564 unsigned int
2565 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp)
2566 {
2567 	struct phyint	*pi;
2568 
2569 	pi = phyint_lookup(ifname);
2570 	if (pi == NULL)
2571 		return (IPMP_EUNKIF);
2572 
2573 	*ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name,
2574 	    ifstate(pi), iftype(pi));
2575 	return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2576 }
2577 
2578 /*
2579  * Store the current list of IPMP groups into a dynamically allocated
2580  * structure pointed to by `*grlistpp'.	 Returns an IPMP error code.
2581  */
2582 unsigned int
2583 getgrouplist(ipmp_grouplist_t **grlistpp)
2584 {
2585 	struct phyint_group	*pg;
2586 	char			(*groups)[LIFGRNAMSIZ];
2587 	unsigned int		i, ngroup;
2588 
2589 	/*
2590 	 * Tally up the number of groups, allocate an array to hold them, and
2591 	 * insert their names into the array.
2592 	 */
2593 	for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next)
2594 		ngroup++;
2595 
2596 	groups = alloca(ngroup * sizeof (*groups));
2597 	for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) {
2598 		assert(i < ngroup);
2599 		(void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ);
2600 	}
2601 	assert(i == ngroup);
2602 
2603 	*grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups);
2604 	return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS);
2605 }
2606 
2607 /*
2608  * Store a snapshot of the IPMP subsystem into a dynamically allocated
2609  * structure pointed to by `*snapp'.  Returns an IPMP error code.
2610  */
2611 unsigned int
2612 getsnap(ipmp_snap_t **snapp)
2613 {
2614 	ipmp_grouplist_t	*grlistp;
2615 	ipmp_groupinfo_t	*grinfop;
2616 	ipmp_ifinfo_t		*ifinfop;
2617 	ipmp_snap_t		*snap;
2618 	struct phyint		*pi;
2619 	unsigned int		i;
2620 	int			retval;
2621 
2622 	snap = ipmp_snap_create();
2623 	if (snap == NULL)
2624 		return (IPMP_ENOMEM);
2625 
2626 	/*
2627 	 * Add group list.
2628 	 */
2629 	retval = getgrouplist(&snap->sn_grlistp);
2630 	if (retval != IPMP_SUCCESS) {
2631 		ipmp_snap_free(snap);
2632 		return (retval);
2633 	}
2634 
2635 	/*
2636 	 * Add information for each group in the list.
2637 	 */
2638 	grlistp = snap->sn_grlistp;
2639 	for (i = 0; i < grlistp->gl_ngroup; i++) {
2640 		retval = getgroupinfo(grlistp->gl_groups[i], &grinfop);
2641 		if (retval != IPMP_SUCCESS) {
2642 			ipmp_snap_free(snap);
2643 			return (retval);
2644 		}
2645 		retval = ipmp_snap_addgroupinfo(snap, grinfop);
2646 		if (retval != IPMP_SUCCESS) {
2647 			ipmp_freegroupinfo(grinfop);
2648 			ipmp_snap_free(snap);
2649 			return (retval);
2650 		}
2651 	}
2652 
2653 	/*
2654 	 * Add information for each configured phyint.
2655 	 */
2656 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
2657 		retval = getifinfo(pi->pi_name, &ifinfop);
2658 		if (retval != IPMP_SUCCESS) {
2659 			ipmp_snap_free(snap);
2660 			return (retval);
2661 		}
2662 		retval = ipmp_snap_addifinfo(snap, ifinfop);
2663 		if (retval != IPMP_SUCCESS) {
2664 			ipmp_freeifinfo(ifinfop);
2665 			ipmp_snap_free(snap);
2666 			return (retval);
2667 		}
2668 	}
2669 
2670 	*snapp = snap;
2671 	return (IPMP_SUCCESS);
2672 }
2673