xref: /titanic_50/usr/src/cmd/cmd-inet/usr.lib/ilbd/ilbd_hc.c (revision 1767006bb066ef500b90b432fba79d63d0d09b36)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2012 Milan Jurik. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <sys/list.h>
31 #include <sys/stropts.h>
32 #include <sys/siginfo.h>
33 #include <sys/wait.h>
34 #include <arpa/inet.h>
35 #include <netinet/in.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <strings.h>
39 #include <stddef.h>
40 #include <unistd.h>
41 #include <libilb.h>
42 #include <port.h>
43 #include <time.h>
44 #include <signal.h>
45 #include <assert.h>
46 #include <errno.h>
47 #include <spawn.h>
48 #include <fcntl.h>
49 #include <limits.h>
50 #include "libilb_impl.h"
51 #include "ilbd.h"
52 
/* Global list of HC objects (ilbd_hc_t), linked through ihc_link. */
list_t ilbd_hc_list;

/* Timer queue for all hc related timers. */
static iu_tq_t *ilbd_hc_timer_q;

/*
 * Indicate whether the timer needs to be updated.  It is set to B_TRUE
 * whenever a hc timer is scheduled, cancelled or expired in this file,
 * and cleared again by ilbd_hc_timer_update().
 */
static boolean_t hc_timer_restarted;

/* Forward declarations of routines used before they are defined. */
static void ilbd_hc_probe_timer(iu_tq_t *, void *);
static ilb_status_t ilbd_hc_restart_timer(ilbd_hc_t *, ilbd_hc_srv_t *);
static boolean_t ilbd_run_probe(ilbd_hc_srv_t *);

/*
 * Return the larger of a and b.  Note that the selected argument is
 * evaluated twice, so arguments should not have side effects.
 */
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/*
 * Number of arguments passed to a probe.  argv[0] is the path name of
 * the probe.
 */
#define	HC_PROBE_ARGC	8

/*
 * Max number of characters to be read from the output of a probe.  It
 * is long enough to read in a 64 bit integer.
 */
#define	HC_MAX_PROBE_OUTPUT	24
79 
80 void
81 i_ilbd_setup_hc_list(void)
82 {
83 	list_create(&ilbd_hc_list, sizeof (ilbd_hc_t),
84 	    offsetof(ilbd_hc_t, ihc_link));
85 }
86 
87 /*
88  * Given a hc object name, return a pointer to hc object if found.
89  */
90 ilbd_hc_t *
91 ilbd_get_hc(const char *name)
92 {
93 	ilbd_hc_t *hc;
94 
95 	for (hc = list_head(&ilbd_hc_list); hc != NULL;
96 	    hc = list_next(&ilbd_hc_list, hc)) {
97 		if (strcasecmp(hc->ihc_name, name) == 0)
98 			return (hc);
99 	}
100 	return (NULL);
101 }
102 
103 /*
104  * Generates an audit record for create-healthcheck,
105  * delete-healtcheck subcommands.
106  */
107 static void
108 ilbd_audit_hc_event(const char *audit_hcname,
109     const ilb_hc_info_t *audit_hcinfo, ilbd_cmd_t cmd,
110     ilb_status_t rc, ucred_t *ucredp)
111 {
112 	adt_session_data_t	*ah;
113 	adt_event_data_t	*event;
114 	au_event_t	flag;
115 	int	audit_error;
116 
117 	if ((ucredp == NULL) && (cmd == ILBD_CREATE_HC))  {
118 		/*
119 		 * we came here from the path where ilbd incorporates
120 		 * the configuration that is listed in SCF:
121 		 * i_ilbd_read_config->ilbd_walk_hc_pgs->
122 		 *   ->ilbd_scf_instance_walk_pg->ilbd_create_hc
123 		 * We skip auditing in that case
124 		 */
125 		logdebug("ilbd_audit_hc_event: skipping auditing");
126 		return;
127 	}
128 
129 	if (adt_start_session(&ah, NULL, 0) != 0) {
130 		logerr("ilbd_audit_hc_event: adt_start_session failed");
131 		exit(EXIT_FAILURE);
132 	}
133 	if (adt_set_from_ucred(ah, ucredp, ADT_NEW) != 0) {
134 		(void) adt_end_session(ah);
135 		logerr("ilbd_audit_rule_event: adt_set_from_ucred failed");
136 		exit(EXIT_FAILURE);
137 	}
138 	if (cmd == ILBD_CREATE_HC)
139 		flag = ADT_ilb_create_healthcheck;
140 	else if (cmd == ILBD_DESTROY_HC)
141 		flag = ADT_ilb_delete_healthcheck;
142 
143 	if ((event = adt_alloc_event(ah, flag)) == NULL) {
144 		logerr("ilbd_audit_hc_event: adt_alloc_event failed");
145 		exit(EXIT_FAILURE);
146 	}
147 	(void) memset((char *)event, 0, sizeof (adt_event_data_t));
148 
149 	switch (cmd) {
150 	case ILBD_CREATE_HC:
151 		event->adt_ilb_create_healthcheck.auth_used =
152 		    NET_ILB_CONFIG_AUTH;
153 		event->adt_ilb_create_healthcheck.hc_test =
154 		    (char *)audit_hcinfo->hci_test;
155 		event->adt_ilb_create_healthcheck.hc_name =
156 		    (char *)audit_hcinfo->hci_name;
157 
158 		/*
159 		 * If the value 0 is stored, the default values are
160 		 * set in the kernel. User land does not know about them
161 		 * So if the user does not specify them, audit record
162 		 * will show them as 0
163 		 */
164 		event->adt_ilb_create_healthcheck.hc_timeout =
165 		    audit_hcinfo->hci_timeout;
166 		event->adt_ilb_create_healthcheck.hc_count =
167 		    audit_hcinfo->hci_count;
168 		event->adt_ilb_create_healthcheck.hc_interval =
169 		    audit_hcinfo->hci_interval;
170 		break;
171 	case ILBD_DESTROY_HC:
172 		event->adt_ilb_delete_healthcheck.auth_used =
173 		    NET_ILB_CONFIG_AUTH;
174 		event->adt_ilb_delete_healthcheck.hc_name =
175 		    (char *)audit_hcname;
176 		break;
177 	}
178 
179 	/* Fill in success/failure */
180 	if (rc == ILB_STATUS_OK) {
181 		if (adt_put_event(event, ADT_SUCCESS, ADT_SUCCESS) != 0) {
182 			logerr("ilbd_audit_hc_event: adt_put_event failed");
183 			exit(EXIT_FAILURE);
184 		}
185 	} else {
186 		audit_error = ilberror2auditerror(rc);
187 		if (adt_put_event(event, ADT_FAILURE, audit_error) != 0) {
188 			logerr("ilbd_audit_hc_event: adt_put_event failed");
189 			exit(EXIT_FAILURE);
190 		}
191 	}
192 	adt_free_event(event);
193 	(void) adt_end_session(ah);
194 }
195 
196 /*
197  * Given the ilb_hc_info_t passed in (from the libilb), create a hc object
198  * in ilbd.  The parameter ev_port is not used, refer to comments of
199  * ilbd_create_sg() in ilbd_sg.c
200  */
201 /* ARGSUSED */
202 ilb_status_t
203 ilbd_create_hc(const ilb_hc_info_t *hc_info, int ev_port,
204     const struct passwd *ps, ucred_t *ucredp)
205 {
206 	ilbd_hc_t *hc;
207 	ilb_status_t ret = ILB_STATUS_OK;
208 
209 	/*
210 	 * ps == NULL is from the daemon when it starts and load configuration
211 	 * ps != NULL is from client.
212 	 */
213 	if (ps != NULL) {
214 		ret = ilbd_check_client_config_auth(ps);
215 		if (ret != ILB_STATUS_OK) {
216 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
217 			    ret, ucredp);
218 			return (ret);
219 		}
220 	}
221 
222 	if (hc_info->hci_name[0] == '\0') {
223 		logdebug("ilbd_create_hc: missing healthcheck info");
224 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
225 		    ILB_STATUS_ENOHCINFO, ucredp);
226 		return (ILB_STATUS_ENOHCINFO);
227 	}
228 
229 	hc = ilbd_get_hc(hc_info->hci_name);
230 	if (hc != NULL) {
231 		logdebug("ilbd_create_hc: healthcheck name %s already"
232 		    " exists", hc_info->hci_name);
233 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
234 		    ILB_STATUS_EEXIST, ucredp);
235 		return (ILB_STATUS_EEXIST);
236 	}
237 
238 	/*
239 	 * Sanity check on user supplied probe.  The given path name
240 	 * must be a full path name (starts with '/') and is
241 	 * executable.
242 	 */
243 	if (strcasecmp(hc_info->hci_test, ILB_HC_STR_TCP) != 0 &&
244 	    strcasecmp(hc_info->hci_test, ILB_HC_STR_UDP) != 0 &&
245 	    strcasecmp(hc_info->hci_test, ILB_HC_STR_PING) != 0 &&
246 	    (hc_info->hci_test[0] != '/' ||
247 	    access(hc_info->hci_test, X_OK) == -1)) {
248 		if (errno == ENOENT) {
249 			logdebug("ilbd_create_hc: user script %s doesn't "
250 			    "exist", hc_info->hci_test);
251 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
252 			    ILB_STATUS_ENOENT, ucredp);
253 			return (ILB_STATUS_ENOENT);
254 		} else {
255 			logdebug("ilbd_create_hc: user script %s is "
256 			    "invalid", hc_info->hci_test);
257 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
258 			    ILB_STATUS_EINVAL, ucredp);
259 			return (ILB_STATUS_EINVAL);
260 		}
261 	}
262 
263 	/* Create and add the hc object */
264 	hc = calloc(1, sizeof (ilbd_hc_t));
265 	if (hc == NULL) {
266 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
267 		    ILB_STATUS_ENOMEM, ucredp);
268 		return (ILB_STATUS_ENOMEM);
269 	}
270 	(void) memcpy(&hc->ihc_info, hc_info, sizeof (ilb_hc_info_t));
271 	if (strcasecmp(hc->ihc_test, ILB_HC_STR_TCP) == 0)
272 		hc->ihc_test_type = ILBD_HC_TCP;
273 	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_UDP) == 0)
274 		hc->ihc_test_type = ILBD_HC_UDP;
275 	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_PING) == 0)
276 		hc->ihc_test_type = ILBD_HC_PING;
277 	else
278 		hc->ihc_test_type = ILBD_HC_USER;
279 	list_create(&hc->ihc_rules, sizeof (ilbd_hc_rule_t),
280 	    offsetof(ilbd_hc_rule_t, hcr_link));
281 
282 	/* Update SCF */
283 	if (ps != NULL) {
284 		if ((ret = ilbd_create_pg(ILBD_SCF_HC, (void *)hc)) !=
285 		    ILB_STATUS_OK) {
286 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
287 			    ret, ucredp);
288 			list_destroy(&hc->ihc_rules);
289 			free(hc);
290 			return (ret);
291 		}
292 	}
293 
294 	/* Everything is fine, now add it to the global list. */
295 	list_insert_tail(&ilbd_hc_list, hc);
296 	ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC, ret, ucredp);
297 	return (ret);
298 }
299 
300 /*
301  * Given a name of a hc object, destroy it.
302  */
303 ilb_status_t
304 ilbd_destroy_hc(const char *hc_name, const struct passwd *ps,
305     ucred_t *ucredp)
306 {
307 	ilb_status_t ret;
308 	ilbd_hc_t *hc;
309 
310 	/*
311 	 * No need to check ps == NULL, daemon won't call any destroy func
312 	 * at start up.
313 	 */
314 	ret = ilbd_check_client_config_auth(ps);
315 	if (ret != ILB_STATUS_OK) {
316 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
317 		    ret, ucredp);
318 		return (ret);
319 	}
320 
321 	hc = ilbd_get_hc(hc_name);
322 	if (hc == NULL) {
323 		logdebug("ilbd_destroy_hc: healthcheck %s does not exist",
324 		    hc_name);
325 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
326 		    ILB_STATUS_ENOENT, ucredp);
327 		return (ILB_STATUS_ENOENT);
328 	}
329 
330 	/* If hc is in use, cannot delete it */
331 	if (hc->ihc_rule_cnt > 0) {
332 		logdebug("ilbd_destroy_hc: healthcheck %s is associated"
333 		    " with a rule - cannot remove", hc_name);
334 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
335 		    ILB_STATUS_INUSE, ucredp);
336 		return (ILB_STATUS_INUSE);
337 	}
338 
339 	if ((ret = ilbd_destroy_pg(ILBD_SCF_HC, hc_name)) !=
340 	    ILB_STATUS_OK) {
341 		logdebug("ilbd_destroy_hc: cannot destroy healthcheck %s "
342 		    "property group", hc_name);
343 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
344 		    ret, ucredp);
345 		return (ret);
346 	}
347 
348 	list_remove(&ilbd_hc_list, hc);
349 	list_destroy(&hc->ihc_rules);
350 	free(hc);
351 	ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC, ret, ucredp);
352 	return (ret);
353 }
354 
355 /*
356  * Given a hc object name, return its information.  Used by libilb to
357  * get hc info.
358  */
359 ilb_status_t
360 ilbd_get_hc_info(const char *hc_name, uint32_t *rbuf, size_t *rbufsz)
361 {
362 	ilbd_hc_t	*hc;
363 	ilb_hc_info_t	*hc_info;
364 	ilb_comm_t	*ic = (ilb_comm_t *)rbuf;
365 
366 	hc = ilbd_get_hc(hc_name);
367 	if (hc == NULL) {
368 		logdebug("%s: healthcheck %s does not exist", __func__,
369 		    hc_name);
370 		return (ILB_STATUS_ENOENT);
371 	}
372 	ilbd_reply_ok(rbuf, rbufsz);
373 	hc_info = (ilb_hc_info_t *)&ic->ic_data;
374 
375 	(void) strlcpy(hc_info->hci_name, hc->ihc_name, sizeof (hc->ihc_name));
376 	(void) strlcpy(hc_info->hci_test, hc->ihc_test, sizeof (hc->ihc_test));
377 	hc_info->hci_timeout = hc->ihc_timeout;
378 	hc_info->hci_count = hc->ihc_count;
379 	hc_info->hci_interval = hc->ihc_interval;
380 	hc_info->hci_def_ping = hc->ihc_def_ping;
381 
382 	*rbufsz += sizeof (ilb_hc_info_t);
383 
384 	return (ILB_STATUS_OK);
385 }
386 
387 static void
388 ilbd_hc_copy_srvs(uint32_t *rbuf, size_t *rbufsz, ilbd_hc_rule_t *hc_rule,
389     const char *rulename)
390 {
391 	ilbd_hc_srv_t		*tmp_srv;
392 	ilb_hc_srv_t		*dst_srv;
393 	ilb_hc_rule_srv_t	*srvs;
394 	size_t			tmp_rbufsz;
395 	int			i;
396 
397 	tmp_rbufsz = *rbufsz;
398 	/* Set up the reply buffer.  rbufsz will be set to the new size. */
399 	ilbd_reply_ok(rbuf, rbufsz);
400 
401 	/* Calculate how much space is left for holding server info. */
402 	*rbufsz += sizeof (ilb_hc_rule_srv_t);
403 	tmp_rbufsz -= *rbufsz;
404 
405 	srvs = (ilb_hc_rule_srv_t *)&((ilb_comm_t *)rbuf)->ic_data;
406 
407 	tmp_srv = list_head(&hc_rule->hcr_servers);
408 	for (i = 0; tmp_srv != NULL && tmp_rbufsz >= sizeof (*dst_srv); i++) {
409 		dst_srv = &srvs->rs_srvs[i];
410 
411 		(void) strlcpy(dst_srv->hcs_rule_name, rulename, ILB_NAMESZ);
412 		(void) strlcpy(dst_srv->hcs_ID, tmp_srv->shc_sg_srv->sgs_srvID,
413 		    ILB_NAMESZ);
414 		(void) strlcpy(dst_srv->hcs_hc_name,
415 		    tmp_srv->shc_hc->ihc_name, ILB_NAMESZ);
416 		dst_srv->hcs_IP = tmp_srv->shc_sg_srv->sgs_addr;
417 		dst_srv->hcs_fail_cnt = tmp_srv->shc_fail_cnt;
418 		dst_srv->hcs_status = tmp_srv->shc_status;
419 		dst_srv->hcs_rtt = tmp_srv->shc_rtt;
420 		dst_srv->hcs_lasttime = tmp_srv->shc_lasttime;
421 		dst_srv->hcs_nexttime = tmp_srv->shc_nexttime;
422 
423 		tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv);
424 		tmp_rbufsz -= sizeof (*dst_srv);
425 	}
426 	srvs->rs_num_srvs = i;
427 	*rbufsz += i * sizeof (*dst_srv);
428 }
429 
430 /*
431  * Given a rule name, return the hc status of its servers.
432  */
433 ilb_status_t
434 ilbd_get_hc_srvs(const char *rulename, uint32_t *rbuf, size_t *rbufsz)
435 {
436 	ilbd_hc_t	*hc;
437 	ilbd_hc_rule_t	*hc_rule;
438 
439 	for (hc = list_head(&ilbd_hc_list); hc != NULL;
440 	    hc = list_next(&ilbd_hc_list, hc)) {
441 		for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
442 		    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
443 			if (strcasecmp(hc_rule->hcr_rule->irl_name,
444 			    rulename) != 0) {
445 				continue;
446 			}
447 			ilbd_hc_copy_srvs(rbuf, rbufsz, hc_rule, rulename);
448 			return (ILB_STATUS_OK);
449 		}
450 	}
451 	return (ILB_STATUS_RULE_NO_HC);
452 }
453 
454 /*
455  * Initialize the hc timer and associate the notification of timeout to
456  * the given event port.
457  */
458 void
459 ilbd_hc_timer_init(int ev_port, ilbd_timer_event_obj_t *ev_obj)
460 {
461 	struct sigevent sigev;
462 	port_notify_t notify;
463 
464 	if ((ilbd_hc_timer_q = iu_tq_create()) == NULL) {
465 		logerr("%s: cannot create hc timer queue", __func__);
466 		exit(EXIT_FAILURE);
467 	}
468 	hc_timer_restarted = B_FALSE;
469 
470 	ev_obj->ev = ILBD_EVENT_TIMER;
471 	ev_obj->timerid = -1;
472 
473 	notify.portnfy_port = ev_port;
474 	notify.portnfy_user = ev_obj;
475 	sigev.sigev_notify = SIGEV_PORT;
476 	sigev.sigev_value.sival_ptr = &notify;
477 	if (timer_create(CLOCK_REALTIME, &sigev, &ev_obj->timerid) == -1) {
478 		logerr("%s: cannot create timer", __func__);
479 		exit(EXIT_FAILURE);
480 	}
481 }
482 
483 /*
484  * HC timeout handler.
485  */
486 void
487 ilbd_hc_timeout(void)
488 {
489 	(void) iu_expire_timers(ilbd_hc_timer_q);
490 	hc_timer_restarted = B_TRUE;
491 }
492 
493 /*
494  * Set up the timer to fire at the earliest timeout.
495  */
496 void
497 ilbd_hc_timer_update(ilbd_timer_event_obj_t *ev_obj)
498 {
499 	itimerspec_t itimeout;
500 	int timeout;
501 
502 	/*
503 	 * There is no change on the timer list, so no need to set up the
504 	 * timer again.
505 	 */
506 	if (!hc_timer_restarted)
507 		return;
508 
509 restart:
510 	if ((timeout = iu_earliest_timer(ilbd_hc_timer_q)) == INFTIM) {
511 		hc_timer_restarted = B_FALSE;
512 		return;
513 	} else if (timeout == 0) {
514 		/*
515 		 * Handle the timeout immediately.  After that (clearing all
516 		 * the expired timers), check to  see if there are still
517 		 * timers running.  If yes, start them.
518 		 */
519 		(void) iu_expire_timers(ilbd_hc_timer_q);
520 		goto restart;
521 	}
522 
523 	itimeout.it_value.tv_sec = timeout / MILLISEC + 1;
524 	itimeout.it_value.tv_nsec = 0;
525 	itimeout.it_interval.tv_sec = 0;
526 	itimeout.it_interval.tv_nsec = 0;
527 
528 	/*
529 	 * Failure to set a timeout is "OK" since hopefully there will be
530 	 * other events and timer_settime() will be called again.  So
531 	 * we will only miss some timeouts.  But in the worst case, no event
532 	 * will happen and ilbd will get stuck...
533 	 */
534 	if (timer_settime(ev_obj->timerid, 0, &itimeout, NULL) == -1)
535 		logerr("%s: cannot set timer", __func__);
536 	hc_timer_restarted = B_FALSE;
537 }
538 
539 /*
540  * Kill the probe process of a server.
541  */
542 static void
543 ilbd_hc_kill_probe(ilbd_hc_srv_t *srv)
544 {
545 	/*
546 	 * First dissociate the fd from the event port.  It should not
547 	 * fail.
548 	 */
549 	if (port_dissociate(srv->shc_ev_port, PORT_SOURCE_FD,
550 	    srv->shc_child_fd) != 0) {
551 		logdebug("%s: port_dissociate: %s", __func__, strerror(errno));
552 	}
553 	(void) close(srv->shc_child_fd);
554 	free(srv->shc_ev);
555 	srv->shc_ev = NULL;
556 
557 	/* Then kill the probe process. */
558 	if (kill(srv->shc_child_pid, SIGKILL) != 0) {
559 		logerr("%s: rule %s server %s: %s", __func__,
560 		    srv->shc_hc_rule->hcr_rule->irl_name,
561 		    srv->shc_sg_srv->sgs_srvID, strerror(errno));
562 	}
563 	/* Should not fail... */
564 	if (waitpid(srv->shc_child_pid, NULL, 0) != srv->shc_child_pid) {
565 		logdebug("%s: waitpid: rule %s server %s", __func__,
566 		    srv->shc_hc_rule->hcr_rule->irl_name,
567 		    srv->shc_sg_srv->sgs_srvID);
568 	}
569 	srv->shc_child_pid = 0;
570 }
571 
572 /*
573  * Disable the server, either because the server is dead or because a timer
574  * cannot be started for this server.  Note that this only affects the
575  * transient configuration, meaning only in memory.  The persistent
576  * configuration is not affected.
577  */
578 static void
579 ilbd_mark_server_disabled(ilbd_hc_srv_t *srv)
580 {
581 	srv->shc_status = ILB_HCS_DISABLED;
582 
583 	/* Disable the server in kernel. */
584 	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
585 	    srv->shc_hc_rule->hcr_rule->irl_name,
586 	    stat_declare_srv_dead) != ILB_STATUS_OK) {
587 		logerr("%s: cannot disable server in kernel: rule %s "
588 		    "server %s", __func__,
589 		    srv->shc_hc_rule->hcr_rule->irl_name,
590 		    srv->shc_sg_srv->sgs_srvID);
591 	}
592 }
593 
594 /*
595  * A probe fails, set the state of the server.
596  */
597 static void
598 ilbd_set_fail_state(ilbd_hc_srv_t *srv)
599 {
600 	if (++srv->shc_fail_cnt < srv->shc_hc->ihc_count) {
601 		/* Probe again */
602 		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
603 		return;
604 	}
605 
606 	logdebug("%s: rule %s server %s fails %u", __func__,
607 	    srv->shc_hc_rule->hcr_rule->irl_name, srv->shc_sg_srv->sgs_srvID,
608 	    srv->shc_fail_cnt);
609 
610 	/*
611 	 * If this is a ping test, mark the server as
612 	 * unreachable instead of dead.
613 	 */
614 	if (srv->shc_hc->ihc_test_type == ILBD_HC_PING ||
615 	    srv->shc_state == ilbd_hc_def_pinging) {
616 		srv->shc_status = ILB_HCS_UNREACH;
617 	} else {
618 		srv->shc_status = ILB_HCS_DEAD;
619 	}
620 
621 	/* Disable the server in kernel. */
622 	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
623 	    srv->shc_hc_rule->hcr_rule->irl_name, stat_declare_srv_dead) !=
624 	    ILB_STATUS_OK) {
625 		logerr("%s: cannot disable server in kernel: rule %s "
626 		    "server %s", __func__,
627 		    srv->shc_hc_rule->hcr_rule->irl_name,
628 		    srv->shc_sg_srv->sgs_srvID);
629 	}
630 
631 	/* Still keep probing in case the server is alive again. */
632 	if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
633 		/* Only thing to do is to disable the server... */
634 		logerr("%s: cannot restart timer: rule %s server %s", __func__,
635 		    srv->shc_hc_rule->hcr_rule->irl_name,
636 		    srv->shc_sg_srv->sgs_srvID);
637 		srv->shc_status = ILB_HCS_DISABLED;
638 	}
639 }
640 
641 /*
642  * A probe process has not returned for the ihc_timeout period, we should
643  * kill it.  This function is the handler of this.
644  */
645 /* ARGSUSED */
646 static void
647 ilbd_hc_kill_timer(iu_tq_t *tq, void *arg)
648 {
649 	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
650 
651 	ilbd_hc_kill_probe(srv);
652 	ilbd_set_fail_state(srv);
653 }
654 
655 /*
656  * Probe timeout handler.  Send out the appropriate probe.
657  */
658 /* ARGSUSED */
659 static void
660 ilbd_hc_probe_timer(iu_tq_t *tq, void *arg)
661 {
662 	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
663 
664 	/*
665 	 * If starting the probe fails, just pretend that the timeout has
666 	 * extended.
667 	 */
668 	if (!ilbd_run_probe(srv)) {
669 		/*
670 		 * If we cannot restart the timer, the only thing we can do
671 		 * is to disable this server.  Hopefully the sys admin will
672 		 * notice this and enable this server again later.
673 		 */
674 		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
675 			logerr("%s: cannot restart timer: rule %s server %s, "
676 			    "disabling it", __func__,
677 			    srv->shc_hc_rule->hcr_rule->irl_name,
678 			    srv->shc_sg_srv->sgs_srvID);
679 			ilbd_mark_server_disabled(srv);
680 		}
681 		return;
682 	}
683 
684 	/*
685 	 * Similar to above, if kill timer cannot be started, disable the
686 	 * server.
687 	 */
688 	if ((srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q,
689 	    srv->shc_hc->ihc_timeout, ilbd_hc_kill_timer, srv)) == -1) {
690 		logerr("%s: cannot start kill timer: rule %s server %s, "
691 		    "disabling it", __func__,
692 		    srv->shc_hc_rule->hcr_rule->irl_name,
693 		    srv->shc_sg_srv->sgs_srvID);
694 		ilbd_mark_server_disabled(srv);
695 	}
696 	hc_timer_restarted = B_TRUE;
697 }
698 
699 /* Restart the periodic timer for a given server. */
700 static ilb_status_t
701 ilbd_hc_restart_timer(ilbd_hc_t *hc, ilbd_hc_srv_t *srv)
702 {
703 	int timeout;
704 
705 	/* Don't allow the timeout interval to be less than 1s */
706 	timeout = MAX((hc->ihc_interval >> 1) + (gethrtime() %
707 	    (hc->ihc_interval + 1)), 1);
708 
709 	/*
710 	 * If the probe is actually a ping probe, there is no need to
711 	 * do default pinging.  Just skip the step.
712 	 */
713 	if (hc->ihc_def_ping && hc->ihc_test_type != ILBD_HC_PING)
714 		srv->shc_state = ilbd_hc_def_pinging;
715 	else
716 		srv->shc_state = ilbd_hc_probing;
717 	srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q, timeout,
718 	    ilbd_hc_probe_timer, srv);
719 
720 	if (srv->shc_tid == -1)
721 		return (ILB_STATUS_TIMER);
722 	srv->shc_lasttime = time(NULL);
723 	srv->shc_nexttime = time(NULL) + timeout;
724 
725 	hc_timer_restarted = B_TRUE;
726 	return (ILB_STATUS_OK);
727 }
728 
729 /* Helper routine to associate a server with its hc object. */
730 static ilb_status_t
731 ilbd_hc_srv_add(ilbd_hc_t *hc, ilbd_hc_rule_t *hc_rule,
732     const ilb_sg_srv_t *srv, int ev_port)
733 {
734 	ilbd_hc_srv_t *new_srv;
735 	ilb_status_t ret;
736 
737 	if ((new_srv = calloc(1, sizeof (ilbd_hc_srv_t))) == NULL)
738 		return (ILB_STATUS_ENOMEM);
739 	new_srv->shc_hc = hc;
740 	new_srv->shc_hc_rule = hc_rule;
741 	new_srv->shc_sg_srv = srv;
742 	new_srv->shc_ev_port = ev_port;
743 	new_srv->shc_tid = -1;
744 	new_srv->shc_nexttime = time(NULL);
745 	new_srv->shc_lasttime = new_srv->shc_nexttime;
746 
747 	if ((hc_rule->hcr_rule->irl_flags & ILB_FLAGS_RULE_ENABLED) &&
748 	    ILB_IS_SRV_ENABLED(srv->sgs_flags)) {
749 		new_srv->shc_status = ILB_HCS_UNINIT;
750 		ret = ilbd_hc_restart_timer(hc, new_srv);
751 		if (ret != ILB_STATUS_OK) {
752 			free(new_srv);
753 			return (ret);
754 		}
755 	} else {
756 		new_srv->shc_status = ILB_HCS_DISABLED;
757 	}
758 
759 	list_insert_tail(&hc_rule->hcr_servers, new_srv);
760 	return (ILB_STATUS_OK);
761 }
762 
/*
 * Handy macro to cancel a server's timer.
 *
 * If the server has a timer (shc_tid != -1), the timer is cancelled and
 * shc_tid is reset to -1.  hc_timer_restarted is set in any case.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, also in an unbraced if/else.  The locals have
 * macro specific names so that they cannot capture an identifier used
 * in the srv argument.
 */
#define	HC_CANCEL_TIMER(srv)						\
do {									\
	void *hc_cancel_arg;						\
	int hc_cancel_ret;						\
	if ((srv)->shc_tid != -1) {					\
		hc_cancel_ret = iu_cancel_timer(ilbd_hc_timer_q,	\
		    (srv)->shc_tid, &hc_cancel_arg);			\
		(srv)->shc_tid = -1;					\
		assert(hc_cancel_ret == 1);				\
		assert(hc_cancel_arg == (srv));				\
	}								\
	hc_timer_restarted = B_TRUE;					\
} while (0)
776 
777 /* Helper routine to dissociate a server from its hc object. */
778 static ilb_status_t
779 ilbd_hc_srv_rem(ilbd_hc_rule_t *hc_rule, const ilb_sg_srv_t *srv)
780 {
781 	ilbd_hc_srv_t *tmp_srv;
782 
783 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
784 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
785 		if (tmp_srv->shc_sg_srv == srv) {
786 			list_remove(&hc_rule->hcr_servers, tmp_srv);
787 			HC_CANCEL_TIMER(tmp_srv);
788 			if (tmp_srv->shc_child_pid != 0)
789 				ilbd_hc_kill_probe(tmp_srv);
790 			free(tmp_srv);
791 			return (ILB_STATUS_OK);
792 		}
793 	}
794 	return (ILB_STATUS_ENOENT);
795 }
796 
797 /* Helper routine to dissociate all servers of a rule from its hc object. */
798 static void
799 ilbd_hc_srv_rem_all(ilbd_hc_rule_t *hc_rule)
800 {
801 	ilbd_hc_srv_t *srv;
802 
803 	while ((srv = list_remove_head(&hc_rule->hcr_servers)) != NULL) {
804 		HC_CANCEL_TIMER(srv);
805 		if (srv->shc_child_pid != 0)
806 			ilbd_hc_kill_probe(srv);
807 		free(srv);
808 	}
809 }
810 
811 /* Associate a rule with its hc object. */
812 ilb_status_t
813 ilbd_hc_associate_rule(const ilbd_rule_t *rule, int ev_port)
814 {
815 	ilbd_hc_t	*hc;
816 	ilbd_hc_rule_t	*hc_rule;
817 	ilb_status_t	ret;
818 	ilbd_sg_t	*sg;
819 	ilbd_srv_t	*ilbd_srv;
820 
821 	/* The rule is assumed to be initialized appropriately. */
822 	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
823 		logdebug("ilbd_hc_associate_rule: healthcheck %s does not "
824 		    "exist", rule->irl_hcname);
825 		return (ILB_STATUS_ENOHCINFO);
826 	}
827 	if ((hc->ihc_test_type == ILBD_HC_TCP &&
828 	    rule->irl_proto != IPPROTO_TCP) ||
829 	    (hc->ihc_test_type == ILBD_HC_UDP &&
830 	    rule->irl_proto != IPPROTO_UDP)) {
831 		return (ILB_STATUS_RULE_HC_MISMATCH);
832 	}
833 	if ((hc_rule = calloc(1, sizeof (ilbd_hc_rule_t))) == NULL) {
834 		logdebug("ilbd_hc_associate_rule: out of memory");
835 		return (ILB_STATUS_ENOMEM);
836 	}
837 
838 	hc_rule->hcr_rule = rule;
839 	list_create(&hc_rule->hcr_servers, sizeof (ilbd_hc_srv_t),
840 	    offsetof(ilbd_hc_srv_t, shc_srv_link));
841 
842 	/* Add all the servers. */
843 	sg = rule->irl_sg;
844 	for (ilbd_srv = list_head(&sg->isg_srvlist); ilbd_srv != NULL;
845 	    ilbd_srv = list_next(&sg->isg_srvlist, ilbd_srv)) {
846 		if ((ret = ilbd_hc_srv_add(hc, hc_rule, &ilbd_srv->isv_srv,
847 		    ev_port)) != ILB_STATUS_OK) {
848 			/* Remove all previously added servers */
849 			ilbd_hc_srv_rem_all(hc_rule);
850 			list_destroy(&hc_rule->hcr_servers);
851 			free(hc_rule);
852 			return (ret);
853 		}
854 	}
855 	list_insert_tail(&hc->ihc_rules, hc_rule);
856 	hc->ihc_rule_cnt++;
857 
858 	return (ILB_STATUS_OK);
859 }
860 
861 /* Dissociate a rule from its hc object. */
862 ilb_status_t
863 ilbd_hc_dissociate_rule(const ilbd_rule_t *rule)
864 {
865 	ilbd_hc_t	*hc;
866 	ilbd_hc_rule_t	*hc_rule;
867 
868 	/* The rule is assumed to be initialized appropriately. */
869 	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
870 		logdebug("ilbd_hc_dissociate_rule: healthcheck %s does not "
871 		    "exist", rule->irl_hcname);
872 		return (ILB_STATUS_ENOENT);
873 	}
874 	for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
875 	    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
876 		if (hc_rule->hcr_rule == rule)
877 			break;
878 	}
879 	if (hc_rule == NULL) {
880 		logdebug("ilbd_hc_dissociate_rule: rule %s is not associated "
881 		    "with healtcheck %s", rule->irl_hcname, hc->ihc_name);
882 		return (ILB_STATUS_ENOENT);
883 	}
884 	ilbd_hc_srv_rem_all(hc_rule);
885 	list_remove(&hc->ihc_rules, hc_rule);
886 	hc->ihc_rule_cnt--;
887 	list_destroy(&hc_rule->hcr_servers);
888 	free(hc_rule);
889 	return (ILB_STATUS_OK);
890 }
891 
892 /*
893  * Given a hc object name and a rule, check to see if the rule is associated
894  * with the hc object.  If it is, the hc object is returned in **hc and the
895  * ilbd_hc_rule_t is returned in **hc_rule.
896  */
897 static boolean_t
898 ilbd_hc_check_rule(const char *hc_name, const ilbd_rule_t *rule,
899     ilbd_hc_t **hc, ilbd_hc_rule_t **hc_rule)
900 {
901 	ilbd_hc_t	*tmp_hc;
902 	ilbd_hc_rule_t	*tmp_hc_rule;
903 
904 	if ((tmp_hc = ilbd_get_hc(hc_name)) == NULL)
905 		return (B_FALSE);
906 	for (tmp_hc_rule = list_head(&tmp_hc->ihc_rules); tmp_hc_rule != NULL;
907 	    tmp_hc_rule = list_next(&tmp_hc->ihc_rules, tmp_hc_rule)) {
908 		if (tmp_hc_rule->hcr_rule == rule) {
909 			*hc = tmp_hc;
910 			*hc_rule = tmp_hc_rule;
911 			return (B_TRUE);
912 		}
913 	}
914 	return (B_FALSE);
915 }
916 
917 /* Associate a server with its hc object. */
918 ilb_status_t
919 ilbd_hc_add_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
920     int ev_port)
921 {
922 	ilbd_hc_t	*hc;
923 	ilbd_hc_rule_t	*hc_rule;
924 
925 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
926 		return (ILB_STATUS_ENOENT);
927 	return (ilbd_hc_srv_add(hc, hc_rule, srv, ev_port));
928 }
929 
930 /* Dissociate a server from its hc object. */
931 ilb_status_t
932 ilbd_hc_del_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
933 {
934 	ilbd_hc_t	*hc;
935 	ilbd_hc_rule_t	*hc_rule;
936 
937 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
938 		return (ILB_STATUS_ENOENT);
939 	return (ilbd_hc_srv_rem(hc_rule, srv));
940 }
941 
942 /* Helper routine to enable/disable a server's hc probe. */
943 static ilb_status_t
944 ilbd_hc_toggle_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
945     boolean_t enable)
946 {
947 	ilbd_hc_t	*hc;
948 	ilbd_hc_rule_t	*hc_rule;
949 	ilbd_hc_srv_t	*tmp_srv;
950 	ilb_status_t	ret;
951 
952 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
953 		return (ILB_STATUS_ENOENT);
954 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
955 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
956 		if (tmp_srv->shc_sg_srv != srv) {
957 			continue;
958 		}
959 		if (enable) {
960 			if (tmp_srv->shc_status == ILB_HCS_DISABLED) {
961 				ret = ilbd_hc_restart_timer(hc, tmp_srv);
962 				if (ret != ILB_STATUS_OK) {
963 					logerr("%s: cannot start timers for "
964 					    "rule %s server %s", __func__,
965 					    rule->irl_name,
966 					    tmp_srv->shc_sg_srv->sgs_srvID);
967 					return (ret);
968 				}
969 				/* Start from fresh... */
970 				tmp_srv->shc_status = ILB_HCS_UNINIT;
971 				tmp_srv->shc_rtt = 0;
972 				tmp_srv->shc_fail_cnt = 0;
973 			}
974 		} else {
975 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
976 				tmp_srv->shc_status = ILB_HCS_DISABLED;
977 				HC_CANCEL_TIMER(tmp_srv);
978 				if (tmp_srv->shc_child_pid != 0)
979 					ilbd_hc_kill_probe(tmp_srv);
980 			}
981 		}
982 		return (ILB_STATUS_OK);
983 	}
984 	return (ILB_STATUS_ENOENT);
985 }
986 
987 ilb_status_t
988 ilbd_hc_enable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
989 {
990 	return (ilbd_hc_toggle_server(rule, srv, B_TRUE));
991 }
992 
993 ilb_status_t
994 ilbd_hc_disable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
995 {
996 	return (ilbd_hc_toggle_server(rule, srv, B_FALSE));
997 }
998 
999 /*
1000  * Helper routine to enable/disable a rule's hc probe (including all its
1001  * servers).
1002  */
1003 static ilb_status_t
1004 ilbd_hc_toggle_rule(const ilbd_rule_t *rule, boolean_t enable)
1005 {
1006 	ilbd_hc_t	*hc;
1007 	ilbd_hc_rule_t	*hc_rule;
1008 	ilbd_hc_srv_t	*tmp_srv;
1009 	int		ret;
1010 
1011 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
1012 		return (ILB_STATUS_ENOENT);
1013 
1014 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
1015 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
1016 		if (enable) {
1017 			/*
1018 			 * If the server is disabled in the rule, do not
1019 			 * restart its timer.
1020 			 */
1021 			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1022 			    ILB_IS_SRV_ENABLED(
1023 			    tmp_srv->shc_sg_srv->sgs_flags)) {
1024 				ret = ilbd_hc_restart_timer(hc, tmp_srv);
1025 				if (ret != ILB_STATUS_OK) {
1026 					logerr("%s: cannot start timers for "
1027 					    "rule %s server %s", __func__,
1028 					    rule->irl_name,
1029 					    tmp_srv->shc_sg_srv->sgs_srvID);
1030 					goto rollback;
1031 				} else {
1032 					/* Start from fresh... */
1033 					tmp_srv->shc_status = ILB_HCS_UNINIT;
1034 					tmp_srv->shc_rtt = 0;
1035 					tmp_srv->shc_fail_cnt = 0;
1036 				}
1037 			}
1038 		} else {
1039 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1040 				HC_CANCEL_TIMER(tmp_srv);
1041 				tmp_srv->shc_status = ILB_HCS_DISABLED;
1042 				if (tmp_srv->shc_child_pid != 0)
1043 					ilbd_hc_kill_probe(tmp_srv);
1044 			}
1045 		}
1046 	}
1047 	return (ILB_STATUS_OK);
1048 rollback:
1049 	enable = !enable;
1050 	for (tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv);
1051 	    tmp_srv != NULL;
1052 	    tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv)) {
1053 		if (enable) {
1054 			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1055 			    ILB_IS_SRV_ENABLED(
1056 			    tmp_srv->shc_sg_srv->sgs_flags)) {
1057 				(void) ilbd_hc_restart_timer(hc, tmp_srv);
1058 				tmp_srv->shc_status = ILB_HCS_UNINIT;
1059 				tmp_srv->shc_rtt = 0;
1060 				tmp_srv->shc_fail_cnt = 0;
1061 			}
1062 		} else {
1063 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1064 				HC_CANCEL_TIMER(tmp_srv);
1065 				tmp_srv->shc_status = ILB_HCS_DISABLED;
1066 				if (tmp_srv->shc_child_pid != 0)
1067 					ilbd_hc_kill_probe(tmp_srv);
1068 			}
1069 		}
1070 	}
1071 	return (ret);
1072 }
1073 
/*
 * Enable hc probing for the given rule.  This simply calls
 * ilbd_hc_toggle_rule() with enable set to B_TRUE and returns its status.
 */
ilb_status_t
ilbd_hc_enable_rule(const ilbd_rule_t *rule)
{
	return (ilbd_hc_toggle_rule(rule, B_TRUE));
}
1079 
/*
 * Disable hc probing for the given rule.  This simply calls
 * ilbd_hc_toggle_rule() with enable set to B_FALSE and returns its status.
 */
ilb_status_t
ilbd_hc_disable_rule(const ilbd_rule_t *rule)
{
	return (ilbd_hc_toggle_rule(rule, B_FALSE));
}
1085 
1086 static const char *
1087 topo_2_str(ilb_topo_t topo)
1088 {
1089 	switch (topo) {
1090 	case ILB_TOPO_DSR:
1091 		return ("DSR");
1092 	case ILB_TOPO_NAT:
1093 		return ("NAT");
1094 	case ILB_TOPO_HALF_NAT:
1095 		return ("HALF_NAT");
1096 	default:
1097 		/* Should not happen. */
1098 		logerr("%s: unknown topology", __func__);
1099 		break;
1100 	}
1101 	return ("");
1102 }
1103 
1104 /*
1105  * Create the argument list to be passed to a hc probe command.
1106  * The passed in argv is assumed to have HC_PROBE_ARGC elements.
1107  */
1108 static boolean_t
1109 create_argv(ilbd_hc_srv_t *srv, char *argv[])
1110 {
1111 	char buf[INET6_ADDRSTRLEN];
1112 	ilbd_rule_t const *rule;
1113 	ilb_sg_srv_t const *sg_srv;
1114 	struct in_addr v4_addr;
1115 	in_port_t port;
1116 	int i;
1117 
1118 	rule = srv->shc_hc_rule->hcr_rule;
1119 	sg_srv = srv->shc_sg_srv;
1120 
1121 	if (srv->shc_state == ilbd_hc_def_pinging) {
1122 		if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL)
1123 			return (B_FALSE);
1124 	} else {
1125 		switch (srv->shc_hc->ihc_test_type) {
1126 		case ILBD_HC_USER:
1127 			if ((argv[0] = strdup(srv->shc_hc->ihc_test)) == NULL)
1128 				return (B_FALSE);
1129 			break;
1130 		case ILBD_HC_TCP:
1131 		case ILBD_HC_UDP:
1132 			if ((argv[0] = strdup(ILB_PROBE_PROTO)) ==
1133 			    NULL) {
1134 				return (B_FALSE);
1135 			}
1136 			break;
1137 		case ILBD_HC_PING:
1138 			if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL) {
1139 				return (B_FALSE);
1140 			}
1141 			break;
1142 		}
1143 	}
1144 
1145 	/*
1146 	 * argv[1] is the VIP.
1147 	 *
1148 	 * Right now, the VIP and the backend server addresses should be
1149 	 * in the same IP address family.  Here we don't do that in case
1150 	 * this assumption is changed in future.
1151 	 */
1152 	if (IN6_IS_ADDR_V4MAPPED(&rule->irl_vip)) {
1153 		IN6_V4MAPPED_TO_INADDR(&rule->irl_vip, &v4_addr);
1154 		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1155 			goto cleanup;
1156 	} else {
1157 		if (inet_ntop(AF_INET6, &rule->irl_vip, buf,
1158 		    sizeof (buf)) == NULL) {
1159 			goto cleanup;
1160 		}
1161 	}
1162 	if ((argv[1] = strdup(buf)) == NULL)
1163 		goto cleanup;
1164 
1165 	/*
1166 	 * argv[2] is the backend server address.
1167 	 */
1168 	if (IN6_IS_ADDR_V4MAPPED(&sg_srv->sgs_addr)) {
1169 		IN6_V4MAPPED_TO_INADDR(&sg_srv->sgs_addr, &v4_addr);
1170 		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1171 			goto cleanup;
1172 	} else {
1173 		if (inet_ntop(AF_INET6, &sg_srv->sgs_addr, buf,
1174 		    sizeof (buf)) == NULL) {
1175 			goto cleanup;
1176 		}
1177 	}
1178 	if ((argv[2] = strdup(buf)) == NULL)
1179 		goto cleanup;
1180 
1181 	/*
1182 	 * argv[3] is the transport protocol used in the rule.
1183 	 */
1184 	switch (rule->irl_proto) {
1185 	case IPPROTO_TCP:
1186 		argv[3] = strdup("TCP");
1187 		break;
1188 	case IPPROTO_UDP:
1189 		argv[3] = strdup("UDP");
1190 		break;
1191 	default:
1192 		logerr("%s: unknown protocol", __func__);
1193 		goto cleanup;
1194 	}
1195 	if (argv[3] == NULL)
1196 		goto cleanup;
1197 
1198 	/*
1199 	 * argv[4] is the load balance mode, DSR, NAT, HALF-NAT.
1200 	 */
1201 	if ((argv[4] = strdup(topo_2_str(rule->irl_topo))) == NULL)
1202 		goto cleanup;
1203 
1204 	/*
1205 	 * argv[5] is the port range.  Right now, there should only be 1 port.
1206 	 */
1207 	switch (rule->irl_hcpflag) {
1208 	case ILB_HCI_PROBE_FIX:
1209 		port = ntohs(rule->irl_hcport);
1210 		break;
1211 	case ILB_HCI_PROBE_ANY: {
1212 		in_port_t min, max;
1213 
1214 		if (ntohs(sg_srv->sgs_minport) == 0) {
1215 			min = ntohs(rule->irl_minport);
1216 			max = ntohs(rule->irl_maxport);
1217 		} else {
1218 			min = ntohs(sg_srv->sgs_minport);
1219 			max = ntohs(sg_srv->sgs_maxport);
1220 		}
1221 		if (max > min)
1222 			port = min + gethrtime() % (max - min + 1);
1223 		else
1224 			port = min;
1225 		break;
1226 	}
1227 	default:
1228 		logerr("%s: unknown HC flag", __func__);
1229 		goto cleanup;
1230 	}
1231 	(void) sprintf(buf, "%d", port);
1232 	if ((argv[5] = strdup(buf)) == NULL)
1233 		goto cleanup;
1234 
1235 	/*
1236 	 * argv[6] is the probe timeout.
1237 	 */
1238 	(void) sprintf(buf, "%d", srv->shc_hc->ihc_timeout);
1239 	if ((argv[6] = strdup(buf)) == NULL)
1240 		goto cleanup;
1241 
1242 	argv[7] = NULL;
1243 	return (B_TRUE);
1244 
1245 cleanup:
1246 	for (i = 0; i < HC_PROBE_ARGC; i++) {
1247 		if (argv[i] != NULL)
1248 			free(argv[i]);
1249 	}
1250 	return (B_FALSE);
1251 }
1252 
/*
 * Free every string of a NULL terminated argument vector.  Each slot is
 * reset to NULL after it is freed, so running this again over the same
 * vector (as a clean up path may do) finds nothing to free instead of
 * freeing the same strings twice.
 */
static void
destroy_argv(char *argv[])
{
	int i;

	for (i = 0; argv[i] != NULL; i++) {
		free(argv[i]);
		argv[i] = NULL;
	}
}
1261 
1262 /* Spawn a process to run the hc probe on the given server. */
1263 static boolean_t
1264 ilbd_run_probe(ilbd_hc_srv_t *srv)
1265 {
1266 	posix_spawn_file_actions_t	fd_actions;
1267 	posix_spawnattr_t		attr;
1268 	sigset_t			child_sigset;
1269 	int				fds[2];
1270 	int				fdflags;
1271 	pid_t				pid;
1272 	char				*child_argv[HC_PROBE_ARGC];
1273 	ilbd_hc_probe_event_t		*probe_ev;
1274 	char				*probe_name;
1275 
1276 	bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
1277 	if ((probe_ev = calloc(1, sizeof (*probe_ev))) == NULL) {
1278 		logdebug("ilbd_run_probe: calloc");
1279 		return (B_FALSE);
1280 	}
1281 
1282 	/* Set up a pipe to get output from probe command. */
1283 	if (pipe(fds) < 0) {
1284 		logdebug("ilbd_run_probe: cannot create pipe");
1285 		free(probe_ev);
1286 		return (B_FALSE);
1287 	}
1288 	/* Set our side of the pipe to be non-blocking */
1289 	if ((fdflags = fcntl(fds[0], F_GETFL, 0)) == -1) {
1290 		logdebug("ilbd_run_probe: fcntl(F_GETFL)");
1291 		goto cleanup;
1292 	}
1293 	if (fcntl(fds[0], F_SETFL, fdflags | O_NONBLOCK) == -1) {
1294 		logdebug("ilbd_run_probe: fcntl(F_SETFL)");
1295 		goto cleanup;
1296 	}
1297 
1298 	if (posix_spawn_file_actions_init(&fd_actions) != 0) {
1299 		logdebug("ilbd_run_probe: posix_spawn_file_actions_init");
1300 		goto cleanup;
1301 	}
1302 	if (posix_spawnattr_init(&attr) != 0) {
1303 		logdebug("ilbd_run_probe: posix_spawnattr_init");
1304 		goto cleanup;
1305 	}
1306 	if (posix_spawn_file_actions_addclose(&fd_actions, fds[0]) != 0) {
1307 		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1308 		goto cleanup;
1309 	}
1310 	if (posix_spawn_file_actions_adddup2(&fd_actions, fds[1],
1311 	    STDOUT_FILENO) != 0) {
1312 		logdebug("ilbd_run_probe: posix_spawn_file_actions_dup2");
1313 		goto cleanup;
1314 	}
1315 	if (posix_spawn_file_actions_addclose(&fd_actions, fds[1]) != 0) {
1316 		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1317 		goto cleanup;
1318 	}
1319 
1320 	/* Reset all signal handling of the child to default. */
1321 	(void) sigfillset(&child_sigset);
1322 	if (posix_spawnattr_setsigdefault(&attr, &child_sigset) != 0) {
1323 		logdebug("ilbd_run_probe: posix_spawnattr_setsigdefault");
1324 		goto cleanup;
1325 	}
1326 	/* Don't want SIGCHLD. */
1327 	if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_NOSIGCHLD_NP|
1328 	    POSIX_SPAWN_SETSIGDEF) != 0) {
1329 		logdebug("ilbd_run_probe: posix_spawnattr_setflags");
1330 		goto cleanup;
1331 	}
1332 
1333 	if (!create_argv(srv, child_argv)) {
1334 		logdebug("ilbd_run_probe: create_argv");
1335 		goto cleanup;
1336 	}
1337 
1338 	/*
1339 	 * If we are doing default pinging or not using a user supplied
1340 	 * probe, we should execute our standard supplied probe.  The
1341 	 * supplied probe command handles all types of probes.  And the
1342 	 * type used depends on argv[0], as filled in by create_argv().
1343 	 */
1344 	if (srv->shc_state == ilbd_hc_def_pinging ||
1345 	    srv->shc_hc->ihc_test_type != ILBD_HC_USER) {
1346 		probe_name = ILB_PROBE_PROTO;
1347 	} else {
1348 		probe_name = srv->shc_hc->ihc_test;
1349 	}
1350 	if (posix_spawn(&pid, probe_name, &fd_actions, &attr, child_argv,
1351 	    NULL) != 0) {
1352 		logerr("%s: posix_spawn: %s for server %s: %s", __func__,
1353 		    srv->shc_hc->ihc_test, srv->shc_sg_srv->sgs_srvID,
1354 		    strerror(errno));
1355 		goto cleanup;
1356 	}
1357 
1358 	(void) close(fds[1]);
1359 	destroy_argv(child_argv);
1360 	srv->shc_child_pid = pid;
1361 	srv->shc_child_fd = fds[0];
1362 	srv->shc_ev = probe_ev;
1363 
1364 	probe_ev->ihp_ev = ILBD_EVENT_PROBE;
1365 	probe_ev->ihp_srv = srv;
1366 	probe_ev->ihp_pid = pid;
1367 	if (port_associate(srv->shc_ev_port, PORT_SOURCE_FD, fds[0],
1368 	    POLLRDNORM, probe_ev) != 0) {
1369 		/*
1370 		 * Need to kill the child.  It will free the srv->shc_ev,
1371 		 * which is probe_ev.  So set probe_ev to NULL.
1372 		 */
1373 		ilbd_hc_kill_probe(srv);
1374 		probe_ev = NULL;
1375 		goto cleanup;
1376 	}
1377 
1378 	return (B_TRUE);
1379 
1380 cleanup:
1381 	(void) close(fds[0]);
1382 	(void) close(fds[1]);
1383 	destroy_argv(child_argv);
1384 	if (probe_ev != NULL)
1385 		free(probe_ev);
1386 	return (B_FALSE);
1387 }
1388 
1389 /*
1390  * Called by ild_hc_probe_return() to re-associate the fd to a child to
1391  * the event port.
1392  */
1393 static void
1394 reassociate_port(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1395 {
1396 	if (port_associate(ev_port, PORT_SOURCE_FD, fd,
1397 	    POLLRDNORM, ev) != 0) {
1398 		/*
1399 		 * If we cannot reassociate with the port, the only
1400 		 * thing we can do now is to kill the child and
1401 		 * do a blocking wait here...
1402 		 */
1403 		logdebug("%s: port_associate: %s", __func__, strerror(errno));
1404 		if (kill(ev->ihp_pid, SIGKILL) != 0)
1405 			logerr("%s: kill: %s", __func__, strerror(errno));
1406 		if (waitpid(ev->ihp_pid, NULL, 0) != ev->ihp_pid)
1407 			logdebug("%s: waitpid: %s", __func__, strerror(errno));
1408 		free(ev);
1409 	}
1410 }
1411 
1412 /*
1413  * To handle a child probe process hanging up.
1414  */
1415 static void
1416 ilbd_hc_child_hup(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1417 {
1418 	ilbd_hc_srv_t *srv;
1419 	pid_t ret_pid;
1420 	int ret;
1421 
1422 	srv = ev->ihp_srv;
1423 
1424 	if (!ev->ihp_done) {
1425 		/* ilbd does not care about this process anymore ... */
1426 		ev->ihp_done = B_TRUE;
1427 		srv->shc_ev = NULL;
1428 		srv->shc_child_pid = 0;
1429 		HC_CANCEL_TIMER(srv);
1430 		ilbd_set_fail_state(srv);
1431 	}
1432 	ret_pid = waitpid(ev->ihp_pid, &ret, WNOHANG);
1433 	switch (ret_pid) {
1434 	case -1:
1435 		logperror("ilbd_hc_child_hup: waitpid");
1436 		/* FALLTHROUGH */
1437 	case 0:
1438 		/* The child has not completed the exit. Wait again. */
1439 		reassociate_port(ev_port, fd, ev);
1440 		break;
1441 	default:
1442 		/* Right now, we just ignore the exit status. */
1443 		if (WIFEXITED(ret))
1444 			ret = WEXITSTATUS(ret);
1445 		(void) close(fd);
1446 		free(ev);
1447 	}
1448 }
1449 
1450 /*
1451  * To read the output of a child probe process.
1452  */
1453 static void
1454 ilbd_hc_child_data(int fd, ilbd_hc_probe_event_t *ev)
1455 {
1456 	ilbd_hc_srv_t *srv;
1457 	char buf[HC_MAX_PROBE_OUTPUT];
1458 	int ret;
1459 	int64_t rtt;
1460 
1461 	srv = ev->ihp_srv;
1462 
1463 	bzero(buf, HC_MAX_PROBE_OUTPUT);
1464 	ret = read(fd, buf, HC_MAX_PROBE_OUTPUT - 1);
1465 	/* Should not happen since event port should have caught this. */
1466 	assert(ret > 0);
1467 
1468 	/*
1469 	 * We expect the probe command to print out the RTT only.  But
1470 	 * the command may misbehave and print out more than what we intend to
1471 	 * read in.  So need to do this check below to "flush" out all the
1472 	 * output from the command.
1473 	 */
1474 	if (!ev->ihp_done) {
1475 		ev->ihp_done = B_TRUE;
1476 		/* We don't need to know about this event anymore. */
1477 		srv->shc_ev = NULL;
1478 		srv->shc_child_pid = 0;
1479 		HC_CANCEL_TIMER(srv);
1480 	} else {
1481 		return;
1482 	}
1483 
1484 	rtt = strtoll(buf, NULL, 10);
1485 
1486 	/*
1487 	 * -1 means the server is dead or the probe somehow fails.  Treat
1488 	 * them both as server is dead.
1489 	 */
1490 	if (rtt == -1) {
1491 		ilbd_set_fail_state(srv);
1492 		return;
1493 	} else if (rtt > 0) {
1494 		/* If the returned RTT value is not valid, just ignore it. */
1495 		if (rtt > 0 && rtt <= UINT_MAX) {
1496 			/* Set rtt to be the simple smoothed average. */
1497 			if (srv->shc_rtt == 0) {
1498 				srv->shc_rtt = rtt;
1499 			} else {
1500 				srv->shc_rtt = 3 * ((srv)->shc_rtt >> 2) +
1501 				    (rtt >> 2);
1502 			}
1503 		}
1504 
1505 	}
1506 
1507 	switch (srv->shc_state) {
1508 	case ilbd_hc_def_pinging:
1509 		srv->shc_state = ilbd_hc_probing;
1510 
1511 		/* Ping is OK, now start the probe. */
1512 		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
1513 		break;
1514 	case ilbd_hc_probing:
1515 		srv->shc_fail_cnt = 0;
1516 
1517 		/* Server is dead before, re-enable it. */
1518 		if (srv->shc_status == ILB_HCS_UNREACH ||
1519 		    srv->shc_status == ILB_HCS_DEAD) {
1520 			/*
1521 			 * If enabling the server in kernel fails now,
1522 			 * hopefully when the timer fires again later, the
1523 			 * enabling can be done.
1524 			 */
1525 			if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
1526 			    srv->shc_hc_rule->hcr_rule->irl_name,
1527 			    stat_declare_srv_alive) != ILB_STATUS_OK) {
1528 				logerr("%s: cannot enable server in kernel: "
1529 				    " rule %s server %s", __func__,
1530 				    srv->shc_hc_rule->hcr_rule->irl_name,
1531 				    srv->shc_sg_srv->sgs_srvID);
1532 			} else {
1533 				srv->shc_status = ILB_HCS_ALIVE;
1534 			}
1535 		} else {
1536 			srv->shc_status = ILB_HCS_ALIVE;
1537 		}
1538 		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
1539 			logerr("%s: cannot restart timer: rule %s server %s",
1540 			    __func__, srv->shc_hc_rule->hcr_rule->irl_name,
1541 			    srv->shc_sg_srv->sgs_srvID);
1542 			ilbd_mark_server_disabled(srv);
1543 		}
1544 		break;
1545 	default:
1546 		logdebug("%s: unknown state", __func__);
1547 		break;
1548 	}
1549 }
1550 
1551 /*
1552  * Handle the return event of a child probe fd.
1553  */
1554 void
1555 ilbd_hc_probe_return(int ev_port, int fd, int port_events,
1556     ilbd_hc_probe_event_t *ev)
1557 {
1558 	/*
1559 	 * Note that there can be more than one events delivered to us at
1560 	 * the same time.  So we need to check them individually.
1561 	 */
1562 	if (port_events & POLLRDNORM)
1563 		ilbd_hc_child_data(fd, ev);
1564 
1565 	if (port_events & (POLLHUP|POLLERR)) {
1566 		ilbd_hc_child_hup(ev_port, fd, ev);
1567 		return;
1568 	}
1569 
1570 	/*
1571 	 * Re-associate the fd with the port so that when the child
1572 	 * exits, we can reap the status.
1573 	 */
1574 	reassociate_port(ev_port, fd, ev);
1575 }
1576