xref: /titanic_51/usr/src/cmd/cmd-inet/usr.lib/ilbd/ilbd_hc.c (revision d583b39bfb4e2571d3e41097c5c357ffe353ad45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2012 Milan Jurik. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <sys/list.h>
31 #include <sys/stropts.h>
32 #include <sys/siginfo.h>
33 #include <sys/wait.h>
34 #include <arpa/inet.h>
35 #include <netinet/in.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <strings.h>
39 #include <stddef.h>
40 #include <unistd.h>
41 #include <libilb.h>
42 #include <port.h>
43 #include <time.h>
44 #include <signal.h>
45 #include <assert.h>
46 #include <errno.h>
47 #include <spawn.h>
48 #include <fcntl.h>
49 #include <limits.h>
50 #include "libilb_impl.h"
51 #include "ilbd.h"
52 
53 /* Global list of HC objects */
54 list_t ilbd_hc_list;
55 
56 /* Timer queue for all hc related timers. */
57 static iu_tq_t *ilbd_hc_timer_q;
58 
59 /* Indicate whether the timer needs to be updated */
60 static boolean_t hc_timer_restarted;
61 
62 static void ilbd_hc_probe_timer(iu_tq_t *, void *);
63 static ilb_status_t ilbd_hc_restart_timer(ilbd_hc_t *, ilbd_hc_srv_t *);
64 static boolean_t ilbd_run_probe(ilbd_hc_srv_t *);
65 
66 #define	MAX(a, b)	((a) > (b) ? (a) : (b))
67 
68 /*
69  * Number of arguments passed to a probe.  argc[0] is the path name of
70  * the probe.
71  */
72 #define	HC_PROBE_ARGC	8
73 
74 /*
75  * Max number of characters to be read from the output of a probe.  It
76  * is long enough to read in a 64 bit integer.
77  */
78 #define	HC_MAX_PROBE_OUTPUT	24
79 
80 void
81 i_ilbd_setup_hc_list(void)
82 {
83 	list_create(&ilbd_hc_list, sizeof (ilbd_hc_t),
84 	    offsetof(ilbd_hc_t, ihc_link));
85 }
86 
87 /*
88  * Given a hc object name, return a pointer to hc object if found.
89  */
90 ilbd_hc_t *
91 ilbd_get_hc(const char *name)
92 {
93 	ilbd_hc_t *hc;
94 
95 	for (hc = list_head(&ilbd_hc_list); hc != NULL;
96 	    hc = list_next(&ilbd_hc_list, hc)) {
97 		if (strcasecmp(hc->ihc_name, name) == 0)
98 			return (hc);
99 	}
100 	return (NULL);
101 }
102 
103 /*
104  * Generates an audit record for create-healthcheck,
105  * delete-healtcheck subcommands.
106  */
107 static void
108 ilbd_audit_hc_event(const char *audit_hcname,
109     const ilb_hc_info_t *audit_hcinfo, ilbd_cmd_t cmd,
110     ilb_status_t rc, ucred_t *ucredp)
111 {
112 	adt_session_data_t	*ah;
113 	adt_event_data_t	*event;
114 	au_event_t	flag;
115 	int	audit_error;
116 
117 	if ((ucredp == NULL) && (cmd == ILBD_CREATE_HC))  {
118 		/*
119 		 * we came here from the path where ilbd incorporates
120 		 * the configuration that is listed in SCF:
121 		 * i_ilbd_read_config->ilbd_walk_hc_pgs->
122 		 *   ->ilbd_scf_instance_walk_pg->ilbd_create_hc
123 		 * We skip auditing in that case
124 		 */
125 		logdebug("ilbd_audit_hc_event: skipping auditing");
126 		return;
127 	}
128 
129 	if (adt_start_session(&ah, NULL, 0) != 0) {
130 		logerr("ilbd_audit_hc_event: adt_start_session failed");
131 		exit(EXIT_FAILURE);
132 	}
133 	if (adt_set_from_ucred(ah, ucredp, ADT_NEW) != 0) {
134 		(void) adt_end_session(ah);
135 		logerr("ilbd_audit_rule_event: adt_set_from_ucred failed");
136 		exit(EXIT_FAILURE);
137 	}
138 	if (cmd == ILBD_CREATE_HC)
139 		flag = ADT_ilb_create_healthcheck;
140 	else if (cmd == ILBD_DESTROY_HC)
141 		flag = ADT_ilb_delete_healthcheck;
142 
143 	if ((event = adt_alloc_event(ah, flag)) == NULL) {
144 		logerr("ilbd_audit_hc_event: adt_alloc_event failed");
145 		exit(EXIT_FAILURE);
146 	}
147 	(void) memset((char *)event, 0, sizeof (adt_event_data_t));
148 
149 	switch (cmd) {
150 	case ILBD_CREATE_HC:
151 		event->adt_ilb_create_healthcheck.auth_used =
152 		    NET_ILB_CONFIG_AUTH;
153 		event->adt_ilb_create_healthcheck.hc_test =
154 		    (char *)audit_hcinfo->hci_test;
155 		event->adt_ilb_create_healthcheck.hc_name =
156 		    (char *)audit_hcinfo->hci_name;
157 
158 		/*
159 		 * If the value 0 is stored, the default values are
160 		 * set in the kernel. User land does not know about them
161 		 * So if the user does not specify them, audit record
162 		 * will show them as 0
163 		 */
164 		event->adt_ilb_create_healthcheck.hc_timeout =
165 		    audit_hcinfo->hci_timeout;
166 		event->adt_ilb_create_healthcheck.hc_count =
167 		    audit_hcinfo->hci_count;
168 		event->adt_ilb_create_healthcheck.hc_interval =
169 		    audit_hcinfo->hci_interval;
170 		break;
171 	case ILBD_DESTROY_HC:
172 		event->adt_ilb_delete_healthcheck.auth_used =
173 		    NET_ILB_CONFIG_AUTH;
174 		event->adt_ilb_delete_healthcheck.hc_name =
175 		    (char *)audit_hcname;
176 		break;
177 	}
178 
179 	/* Fill in success/failure */
180 	if (rc == ILB_STATUS_OK) {
181 		if (adt_put_event(event, ADT_SUCCESS, ADT_SUCCESS) != 0) {
182 			logerr("ilbd_audit_hc_event: adt_put_event failed");
183 			exit(EXIT_FAILURE);
184 		}
185 	} else {
186 		audit_error = ilberror2auditerror(rc);
187 		if (adt_put_event(event, ADT_FAILURE, audit_error) != 0) {
188 			logerr("ilbd_audit_hc_event: adt_put_event failed");
189 			exit(EXIT_FAILURE);
190 		}
191 	}
192 	adt_free_event(event);
193 	(void) adt_end_session(ah);
194 }
195 
196 /*
197  * Given the ilb_hc_info_t passed in (from the libilb), create a hc object
198  * in ilbd.  The parameter ev_port is not used, refer to comments of
199  * ilbd_create_sg() in ilbd_sg.c
200  */
201 /* ARGSUSED */
202 ilb_status_t
203 ilbd_create_hc(const ilb_hc_info_t *hc_info, int ev_port,
204     const struct passwd *ps, ucred_t *ucredp)
205 {
206 	ilbd_hc_t *hc;
207 	ilb_status_t ret = ILB_STATUS_OK;
208 
209 	/*
210 	 * ps == NULL is from the daemon when it starts and load configuration
211 	 * ps != NULL is from client.
212 	 */
213 	if (ps != NULL) {
214 		ret = ilbd_check_client_config_auth(ps);
215 		if (ret != ILB_STATUS_OK) {
216 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
217 			    ret, ucredp);
218 			return (ret);
219 		}
220 	}
221 
222 	if (hc_info->hci_name[0] == '\0') {
223 		logdebug("ilbd_create_hc: missing healthcheck info");
224 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
225 		    ILB_STATUS_ENOHCINFO, ucredp);
226 		return (ILB_STATUS_ENOHCINFO);
227 	}
228 
229 	hc = ilbd_get_hc(hc_info->hci_name);
230 	if (hc != NULL) {
231 		logdebug("ilbd_create_hc: healthcheck name %s already"
232 		    " exists", hc_info->hci_name);
233 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
234 		    ILB_STATUS_EEXIST, ucredp);
235 		return (ILB_STATUS_EEXIST);
236 	}
237 
238 	/*
239 	 * Sanity check on user supplied probe.  The given path name
240 	 * must be a full path name (starts with '/') and is
241 	 * executable.
242 	 */
243 	if (strcasecmp(hc_info->hci_test, ILB_HC_STR_TCP) != 0 &&
244 	    strcasecmp(hc_info->hci_test, ILB_HC_STR_UDP) != 0 &&
245 	    strcasecmp(hc_info->hci_test, ILB_HC_STR_PING) != 0 &&
246 	    (hc_info->hci_test[0] != '/' ||
247 	    access(hc_info->hci_test, X_OK) == -1)) {
248 		if (errno == ENOENT) {
249 			logdebug("ilbd_create_hc: user script %s doesn't "
250 			    "exist", hc_info->hci_test);
251 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
252 			    ILB_STATUS_ENOENT, ucredp);
253 			return (ILB_STATUS_ENOENT);
254 		} else {
255 			logdebug("ilbd_create_hc: user script %s is "
256 			    "invalid", hc_info->hci_test);
257 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
258 			    ILB_STATUS_EINVAL, ucredp);
259 			return (ILB_STATUS_EINVAL);
260 		}
261 	}
262 
263 	/* Create and add the hc object */
264 	hc = calloc(1, sizeof (ilbd_hc_t));
265 	if (hc == NULL) {
266 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
267 		    ILB_STATUS_ENOMEM, ucredp);
268 		return (ILB_STATUS_ENOMEM);
269 	}
270 	(void) memcpy(&hc->ihc_info, hc_info, sizeof (ilb_hc_info_t));
271 	if (strcasecmp(hc->ihc_test, ILB_HC_STR_TCP) == 0)
272 		hc->ihc_test_type = ILBD_HC_TCP;
273 	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_UDP) == 0)
274 		hc->ihc_test_type = ILBD_HC_UDP;
275 	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_PING) == 0)
276 		hc->ihc_test_type = ILBD_HC_PING;
277 	else
278 		hc->ihc_test_type = ILBD_HC_USER;
279 	list_create(&hc->ihc_rules, sizeof (ilbd_hc_rule_t),
280 	    offsetof(ilbd_hc_rule_t, hcr_link));
281 
282 	/* Update SCF */
283 	if (ps != NULL) {
284 		if ((ret = ilbd_create_pg(ILBD_SCF_HC, (void *)hc)) !=
285 		    ILB_STATUS_OK) {
286 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
287 			    ret, ucredp);
288 			free(hc);
289 			return (ret);
290 		}
291 	}
292 
293 	/* Everything is fine, now add it to the global list. */
294 	list_insert_tail(&ilbd_hc_list, hc);
295 	ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC, ret, ucredp);
296 	return (ret);
297 }
298 
299 /*
300  * Given a name of a hc object, destroy it.
301  */
302 ilb_status_t
303 ilbd_destroy_hc(const char *hc_name, const struct passwd *ps,
304     ucred_t *ucredp)
305 {
306 	ilb_status_t ret;
307 	ilbd_hc_t *hc;
308 
309 	/*
310 	 * No need to check ps == NULL, daemon won't call any destroy func
311 	 * at start up.
312 	 */
313 	ret = ilbd_check_client_config_auth(ps);
314 	if (ret != ILB_STATUS_OK) {
315 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
316 		    ret, ucredp);
317 		return (ret);
318 	}
319 
320 	hc = ilbd_get_hc(hc_name);
321 	if (hc == NULL) {
322 		logdebug("ilbd_destroy_hc: healthcheck %s does not exist",
323 		    hc_name);
324 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
325 		    ILB_STATUS_ENOENT, ucredp);
326 		return (ILB_STATUS_ENOENT);
327 	}
328 
329 	/* If hc is in use, cannot delete it */
330 	if (hc->ihc_rule_cnt > 0) {
331 		logdebug("ilbd_destroy_hc: healthcheck %s is associated"
332 		    " with a rule - cannot remove", hc_name);
333 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
334 		    ILB_STATUS_INUSE, ucredp);
335 		return (ILB_STATUS_INUSE);
336 	}
337 
338 	if ((ret = ilbd_destroy_pg(ILBD_SCF_HC, hc_name)) !=
339 	    ILB_STATUS_OK) {
340 		logdebug("ilbd_destroy_hc: cannot destroy healthcheck %s "
341 		    "property group", hc_name);
342 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
343 		    ret, ucredp);
344 		return (ret);
345 	}
346 
347 	list_remove(&ilbd_hc_list, hc);
348 	free(hc);
349 	ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC, ret, ucredp);
350 	return (ret);
351 }
352 
353 /*
354  * Given a hc object name, return its information.  Used by libilb to
355  * get hc info.
356  */
357 ilb_status_t
358 ilbd_get_hc_info(const char *hc_name, uint32_t *rbuf, size_t *rbufsz)
359 {
360 	ilbd_hc_t	*hc;
361 	ilb_hc_info_t	*hc_info;
362 	ilb_comm_t	*ic = (ilb_comm_t *)rbuf;
363 
364 	hc = ilbd_get_hc(hc_name);
365 	if (hc == NULL) {
366 		logdebug("%s: healthcheck %s does not exist", __func__,
367 		    hc_name);
368 		return (ILB_STATUS_ENOENT);
369 	}
370 	ilbd_reply_ok(rbuf, rbufsz);
371 	hc_info = (ilb_hc_info_t *)&ic->ic_data;
372 
373 	(void) strlcpy(hc_info->hci_name, hc->ihc_name, sizeof (hc->ihc_name));
374 	(void) strlcpy(hc_info->hci_test, hc->ihc_test, sizeof (hc->ihc_test));
375 	hc_info->hci_timeout = hc->ihc_timeout;
376 	hc_info->hci_count = hc->ihc_count;
377 	hc_info->hci_interval = hc->ihc_interval;
378 	hc_info->hci_def_ping = hc->ihc_def_ping;
379 
380 	*rbufsz += sizeof (ilb_hc_info_t);
381 
382 	return (ILB_STATUS_OK);
383 }
384 
385 static void
386 ilbd_hc_copy_srvs(uint32_t *rbuf, size_t *rbufsz, ilbd_hc_rule_t *hc_rule,
387     const char *rulename)
388 {
389 	ilbd_hc_srv_t		*tmp_srv;
390 	ilb_hc_srv_t		*dst_srv;
391 	ilb_hc_rule_srv_t	*srvs;
392 	size_t			tmp_rbufsz;
393 	int			i;
394 
395 	tmp_rbufsz = *rbufsz;
396 	/* Set up the reply buffer.  rbufsz will be set to the new size. */
397 	ilbd_reply_ok(rbuf, rbufsz);
398 
399 	/* Calculate how much space is left for holding server info. */
400 	*rbufsz += sizeof (ilb_hc_rule_srv_t);
401 	tmp_rbufsz -= *rbufsz;
402 
403 	srvs = (ilb_hc_rule_srv_t *)&((ilb_comm_t *)rbuf)->ic_data;
404 
405 	tmp_srv = list_head(&hc_rule->hcr_servers);
406 	for (i = 0; tmp_srv != NULL && tmp_rbufsz >= sizeof (*dst_srv); i++) {
407 		dst_srv = &srvs->rs_srvs[i];
408 
409 		(void) strlcpy(dst_srv->hcs_rule_name, rulename, ILB_NAMESZ);
410 		(void) strlcpy(dst_srv->hcs_ID, tmp_srv->shc_sg_srv->sgs_srvID,
411 		    ILB_NAMESZ);
412 		(void) strlcpy(dst_srv->hcs_hc_name,
413 		    tmp_srv->shc_hc->ihc_name, ILB_NAMESZ);
414 		dst_srv->hcs_IP = tmp_srv->shc_sg_srv->sgs_addr;
415 		dst_srv->hcs_fail_cnt = tmp_srv->shc_fail_cnt;
416 		dst_srv->hcs_status = tmp_srv->shc_status;
417 		dst_srv->hcs_rtt = tmp_srv->shc_rtt;
418 		dst_srv->hcs_lasttime = tmp_srv->shc_lasttime;
419 		dst_srv->hcs_nexttime = tmp_srv->shc_nexttime;
420 
421 		tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv);
422 		tmp_rbufsz -= sizeof (*dst_srv);
423 	}
424 	srvs->rs_num_srvs = i;
425 	*rbufsz += i * sizeof (*dst_srv);
426 }
427 
428 /*
429  * Given a rule name, return the hc status of its servers.
430  */
431 ilb_status_t
432 ilbd_get_hc_srvs(const char *rulename, uint32_t *rbuf, size_t *rbufsz)
433 {
434 	ilbd_hc_t	*hc;
435 	ilbd_hc_rule_t	*hc_rule;
436 
437 	for (hc = list_head(&ilbd_hc_list); hc != NULL;
438 	    hc = list_next(&ilbd_hc_list, hc)) {
439 		for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
440 		    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
441 			if (strcasecmp(hc_rule->hcr_rule->irl_name,
442 			    rulename) != 0) {
443 				continue;
444 			}
445 			ilbd_hc_copy_srvs(rbuf, rbufsz, hc_rule, rulename);
446 			return (ILB_STATUS_OK);
447 		}
448 	}
449 	return (ILB_STATUS_RULE_NO_HC);
450 }
451 
452 /*
453  * Initialize the hc timer and associate the notification of timeout to
454  * the given event port.
455  */
456 void
457 ilbd_hc_timer_init(int ev_port, ilbd_timer_event_obj_t *ev_obj)
458 {
459 	struct sigevent sigev;
460 	port_notify_t notify;
461 
462 	if ((ilbd_hc_timer_q = iu_tq_create()) == NULL) {
463 		logerr("%s: cannot create hc timer queue", __func__);
464 		exit(EXIT_FAILURE);
465 	}
466 	hc_timer_restarted = B_FALSE;
467 
468 	ev_obj->ev = ILBD_EVENT_TIMER;
469 	ev_obj->timerid = -1;
470 
471 	notify.portnfy_port = ev_port;
472 	notify.portnfy_user = ev_obj;
473 	sigev.sigev_notify = SIGEV_PORT;
474 	sigev.sigev_value.sival_ptr = &notify;
475 	if (timer_create(CLOCK_REALTIME, &sigev, &ev_obj->timerid) == -1) {
476 		logerr("%s: cannot create timer", __func__);
477 		exit(EXIT_FAILURE);
478 	}
479 }
480 
481 /*
482  * HC timeout handler.
483  */
484 void
485 ilbd_hc_timeout(void)
486 {
487 	(void) iu_expire_timers(ilbd_hc_timer_q);
488 	hc_timer_restarted = B_TRUE;
489 }
490 
491 /*
492  * Set up the timer to fire at the earliest timeout.
493  */
494 void
495 ilbd_hc_timer_update(ilbd_timer_event_obj_t *ev_obj)
496 {
497 	itimerspec_t itimeout;
498 	int timeout;
499 
500 	/*
501 	 * There is no change on the timer list, so no need to set up the
502 	 * timer again.
503 	 */
504 	if (!hc_timer_restarted)
505 		return;
506 
507 restart:
508 	if ((timeout = iu_earliest_timer(ilbd_hc_timer_q)) == INFTIM) {
509 		hc_timer_restarted = B_FALSE;
510 		return;
511 	} else if (timeout == 0) {
512 		/*
513 		 * Handle the timeout immediately.  After that (clearing all
514 		 * the expired timers), check to  see if there are still
515 		 * timers running.  If yes, start them.
516 		 */
517 		(void) iu_expire_timers(ilbd_hc_timer_q);
518 		goto restart;
519 	}
520 
521 	itimeout.it_value.tv_sec = timeout / MILLISEC + 1;
522 	itimeout.it_value.tv_nsec = 0;
523 	itimeout.it_interval.tv_sec = 0;
524 	itimeout.it_interval.tv_nsec = 0;
525 
526 	/*
527 	 * Failure to set a timeout is "OK" since hopefully there will be
528 	 * other events and timer_settime() will be called again.  So
529 	 * we will only miss some timeouts.  But in the worst case, no event
530 	 * will happen and ilbd will get stuck...
531 	 */
532 	if (timer_settime(ev_obj->timerid, 0, &itimeout, NULL) == -1)
533 		logerr("%s: cannot set timer", __func__);
534 	hc_timer_restarted = B_FALSE;
535 }
536 
537 /*
538  * Kill the probe process of a server.
539  */
540 static void
541 ilbd_hc_kill_probe(ilbd_hc_srv_t *srv)
542 {
543 	/*
544 	 * First dissociate the fd from the event port.  It should not
545 	 * fail.
546 	 */
547 	if (port_dissociate(srv->shc_ev_port, PORT_SOURCE_FD,
548 	    srv->shc_child_fd) != 0) {
549 		logdebug("%s: port_dissociate: %s", __func__, strerror(errno));
550 	}
551 	(void) close(srv->shc_child_fd);
552 	free(srv->shc_ev);
553 	srv->shc_ev = NULL;
554 
555 	/* Then kill the probe process. */
556 	if (kill(srv->shc_child_pid, SIGKILL) != 0) {
557 		logerr("%s: rule %s server %s: %s", __func__,
558 		    srv->shc_hc_rule->hcr_rule->irl_name,
559 		    srv->shc_sg_srv->sgs_srvID, strerror(errno));
560 	}
561 	/* Should not fail... */
562 	if (waitpid(srv->shc_child_pid, NULL, 0) != srv->shc_child_pid) {
563 		logdebug("%s: waitpid: rule %s server %s", __func__,
564 		    srv->shc_hc_rule->hcr_rule->irl_name,
565 		    srv->shc_sg_srv->sgs_srvID);
566 	}
567 	srv->shc_child_pid = 0;
568 }
569 
570 /*
571  * Disable the server, either because the server is dead or because a timer
572  * cannot be started for this server.  Note that this only affects the
573  * transient configuration, meaning only in memory.  The persistent
574  * configuration is not affected.
575  */
576 static void
577 ilbd_mark_server_disabled(ilbd_hc_srv_t *srv)
578 {
579 	srv->shc_status = ILB_HCS_DISABLED;
580 
581 	/* Disable the server in kernel. */
582 	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
583 	    srv->shc_hc_rule->hcr_rule->irl_name,
584 	    stat_declare_srv_dead) != ILB_STATUS_OK) {
585 		logerr("%s: cannot disable server in kernel: rule %s "
586 		    "server %s", __func__,
587 		    srv->shc_hc_rule->hcr_rule->irl_name,
588 		    srv->shc_sg_srv->sgs_srvID);
589 	}
590 }
591 
592 /*
593  * A probe fails, set the state of the server.
594  */
595 static void
596 ilbd_set_fail_state(ilbd_hc_srv_t *srv)
597 {
598 	if (++srv->shc_fail_cnt < srv->shc_hc->ihc_count) {
599 		/* Probe again */
600 		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
601 		return;
602 	}
603 
604 	logdebug("%s: rule %s server %s fails %u", __func__,
605 	    srv->shc_hc_rule->hcr_rule->irl_name, srv->shc_sg_srv->sgs_srvID,
606 	    srv->shc_fail_cnt);
607 
608 	/*
609 	 * If this is a ping test, mark the server as
610 	 * unreachable instead of dead.
611 	 */
612 	if (srv->shc_hc->ihc_test_type == ILBD_HC_PING ||
613 	    srv->shc_state == ilbd_hc_def_pinging) {
614 		srv->shc_status = ILB_HCS_UNREACH;
615 	} else {
616 		srv->shc_status = ILB_HCS_DEAD;
617 	}
618 
619 	/* Disable the server in kernel. */
620 	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
621 	    srv->shc_hc_rule->hcr_rule->irl_name, stat_declare_srv_dead) !=
622 	    ILB_STATUS_OK) {
623 		logerr("%s: cannot disable server in kernel: rule %s "
624 		    "server %s", __func__,
625 		    srv->shc_hc_rule->hcr_rule->irl_name,
626 		    srv->shc_sg_srv->sgs_srvID);
627 	}
628 
629 	/* Still keep probing in case the server is alive again. */
630 	if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
631 		/* Only thing to do is to disable the server... */
632 		logerr("%s: cannot restart timer: rule %s server %s", __func__,
633 		    srv->shc_hc_rule->hcr_rule->irl_name,
634 		    srv->shc_sg_srv->sgs_srvID);
635 		srv->shc_status = ILB_HCS_DISABLED;
636 	}
637 }
638 
639 /*
640  * A probe process has not returned for the ihc_timeout period, we should
641  * kill it.  This function is the handler of this.
642  */
643 /* ARGSUSED */
644 static void
645 ilbd_hc_kill_timer(iu_tq_t *tq, void *arg)
646 {
647 	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
648 
649 	ilbd_hc_kill_probe(srv);
650 	ilbd_set_fail_state(srv);
651 }
652 
653 /*
654  * Probe timeout handler.  Send out the appropriate probe.
655  */
656 /* ARGSUSED */
657 static void
658 ilbd_hc_probe_timer(iu_tq_t *tq, void *arg)
659 {
660 	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
661 
662 	/*
663 	 * If starting the probe fails, just pretend that the timeout has
664 	 * extended.
665 	 */
666 	if (!ilbd_run_probe(srv)) {
667 		/*
668 		 * If we cannot restart the timer, the only thing we can do
669 		 * is to disable this server.  Hopefully the sys admin will
670 		 * notice this and enable this server again later.
671 		 */
672 		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
673 			logerr("%s: cannot restart timer: rule %s server %s, "
674 			    "disabling it", __func__,
675 			    srv->shc_hc_rule->hcr_rule->irl_name,
676 			    srv->shc_sg_srv->sgs_srvID);
677 			ilbd_mark_server_disabled(srv);
678 		}
679 		return;
680 	}
681 
682 	/*
683 	 * Similar to above, if kill timer cannot be started, disable the
684 	 * server.
685 	 */
686 	if ((srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q,
687 	    srv->shc_hc->ihc_timeout, ilbd_hc_kill_timer, srv)) == -1) {
688 		logerr("%s: cannot start kill timer: rule %s server %s, "
689 		    "disabling it", __func__,
690 		    srv->shc_hc_rule->hcr_rule->irl_name,
691 		    srv->shc_sg_srv->sgs_srvID);
692 		ilbd_mark_server_disabled(srv);
693 	}
694 	hc_timer_restarted = B_TRUE;
695 }
696 
697 /* Restart the periodic timer for a given server. */
698 static ilb_status_t
699 ilbd_hc_restart_timer(ilbd_hc_t *hc, ilbd_hc_srv_t *srv)
700 {
701 	int timeout;
702 
703 	/* Don't allow the timeout interval to be less than 1s */
704 	timeout = MAX((hc->ihc_interval >> 1) + (gethrtime() %
705 	    (hc->ihc_interval + 1)), 1);
706 
707 	/*
708 	 * If the probe is actually a ping probe, there is no need to
709 	 * do default pinging.  Just skip the step.
710 	 */
711 	if (hc->ihc_def_ping && hc->ihc_test_type != ILBD_HC_PING)
712 		srv->shc_state = ilbd_hc_def_pinging;
713 	else
714 		srv->shc_state = ilbd_hc_probing;
715 	srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q, timeout,
716 	    ilbd_hc_probe_timer, srv);
717 
718 	if (srv->shc_tid == -1)
719 		return (ILB_STATUS_TIMER);
720 	srv->shc_lasttime = time(NULL);
721 	srv->shc_nexttime = time(NULL) + timeout;
722 
723 	hc_timer_restarted = B_TRUE;
724 	return (ILB_STATUS_OK);
725 }
726 
727 /* Helper routine to associate a server with its hc object. */
728 static ilb_status_t
729 ilbd_hc_srv_add(ilbd_hc_t *hc, ilbd_hc_rule_t *hc_rule,
730     const ilb_sg_srv_t *srv, int ev_port)
731 {
732 	ilbd_hc_srv_t *new_srv;
733 	ilb_status_t ret;
734 
735 	if ((new_srv = calloc(1, sizeof (ilbd_hc_srv_t))) == NULL)
736 		return (ILB_STATUS_ENOMEM);
737 	new_srv->shc_hc = hc;
738 	new_srv->shc_hc_rule = hc_rule;
739 	new_srv->shc_sg_srv = srv;
740 	new_srv->shc_ev_port = ev_port;
741 	new_srv->shc_tid = -1;
742 	new_srv->shc_nexttime = time(NULL);
743 	new_srv->shc_lasttime = new_srv->shc_nexttime;
744 
745 	if ((hc_rule->hcr_rule->irl_flags & ILB_FLAGS_RULE_ENABLED) &&
746 	    ILB_IS_SRV_ENABLED(srv->sgs_flags)) {
747 		new_srv->shc_status = ILB_HCS_UNINIT;
748 		ret = ilbd_hc_restart_timer(hc, new_srv);
749 		if (ret != ILB_STATUS_OK) {
750 			free(new_srv);
751 			return (ret);
752 		}
753 	} else {
754 		new_srv->shc_status = ILB_HCS_DISABLED;
755 	}
756 
757 	list_insert_tail(&hc_rule->hcr_servers, new_srv);
758 	return (ILB_STATUS_OK);
759 }
760 
/*
 * Handy macro to cancel a server's timer.
 *
 * If the server has a timer pending (shc_tid is not -1), the timer is
 * cancelled and shc_tid is reset to -1.  In any case the timer queue is
 * flagged as changed.  The body is wrapped in do { } while (0) so that
 * the macro acts as a single statement, also in an if/else without
 * braces.  The names of its locals are chosen so that they cannot hide
 * or capture a variable which is named in the argument.
 */
#define	HC_CANCEL_TIMER(srv)						\
do {									\
	void *hc_cancel_arg;						\
	int hc_cancel_ret;						\
	if ((srv)->shc_tid != -1) {					\
		hc_cancel_ret = iu_cancel_timer(ilbd_hc_timer_q,	\
		    (srv)->shc_tid, &hc_cancel_arg);			\
		(srv)->shc_tid = -1;					\
		assert(hc_cancel_ret == 1);				\
		assert(hc_cancel_arg == (srv));				\
	}								\
	hc_timer_restarted = B_TRUE;					\
} while (0)
774 
775 /* Helper routine to dissociate a server from its hc object. */
776 static ilb_status_t
777 ilbd_hc_srv_rem(ilbd_hc_rule_t *hc_rule, const ilb_sg_srv_t *srv)
778 {
779 	ilbd_hc_srv_t *tmp_srv;
780 
781 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
782 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
783 		if (tmp_srv->shc_sg_srv == srv) {
784 			list_remove(&hc_rule->hcr_servers, tmp_srv);
785 			HC_CANCEL_TIMER(tmp_srv);
786 			if (tmp_srv->shc_child_pid != 0)
787 				ilbd_hc_kill_probe(tmp_srv);
788 			free(tmp_srv);
789 			return (ILB_STATUS_OK);
790 		}
791 	}
792 	return (ILB_STATUS_ENOENT);
793 }
794 
795 /* Helper routine to dissociate all servers of a rule from its hc object. */
796 static void
797 ilbd_hc_srv_rem_all(ilbd_hc_rule_t *hc_rule)
798 {
799 	ilbd_hc_srv_t *srv;
800 
801 	while ((srv = list_remove_head(&hc_rule->hcr_servers)) != NULL) {
802 		HC_CANCEL_TIMER(srv);
803 		if (srv->shc_child_pid != 0)
804 			ilbd_hc_kill_probe(srv);
805 		free(srv);
806 	}
807 }
808 
809 /* Associate a rule with its hc object. */
810 ilb_status_t
811 ilbd_hc_associate_rule(const ilbd_rule_t *rule, int ev_port)
812 {
813 	ilbd_hc_t	*hc;
814 	ilbd_hc_rule_t	*hc_rule;
815 	ilb_status_t	ret;
816 	ilbd_sg_t	*sg;
817 	ilbd_srv_t	*ilbd_srv;
818 
819 	/* The rule is assumed to be initialized appropriately. */
820 	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
821 		logdebug("ilbd_hc_associate_rule: healthcheck %s does not "
822 		    "exist", rule->irl_hcname);
823 		return (ILB_STATUS_ENOHCINFO);
824 	}
825 	if ((hc->ihc_test_type == ILBD_HC_TCP &&
826 	    rule->irl_proto != IPPROTO_TCP) ||
827 	    (hc->ihc_test_type == ILBD_HC_UDP &&
828 	    rule->irl_proto != IPPROTO_UDP)) {
829 		return (ILB_STATUS_RULE_HC_MISMATCH);
830 	}
831 	if ((hc_rule = calloc(1, sizeof (ilbd_hc_rule_t))) == NULL) {
832 		logdebug("ilbd_hc_associate_rule: out of memory");
833 		return (ILB_STATUS_ENOMEM);
834 	}
835 
836 	hc_rule->hcr_rule = rule;
837 	list_create(&hc_rule->hcr_servers, sizeof (ilbd_hc_srv_t),
838 	    offsetof(ilbd_hc_srv_t, shc_srv_link));
839 
840 	/* Add all the servers. */
841 	sg = rule->irl_sg;
842 	for (ilbd_srv = list_head(&sg->isg_srvlist); ilbd_srv != NULL;
843 	    ilbd_srv = list_next(&sg->isg_srvlist, ilbd_srv)) {
844 		if ((ret = ilbd_hc_srv_add(hc, hc_rule, &ilbd_srv->isv_srv,
845 		    ev_port)) != ILB_STATUS_OK) {
846 			/* Remove all previously added servers */
847 			ilbd_hc_srv_rem_all(hc_rule);
848 			free(hc_rule);
849 			return (ret);
850 		}
851 	}
852 	list_insert_tail(&hc->ihc_rules, hc_rule);
853 	hc->ihc_rule_cnt++;
854 
855 	return (ILB_STATUS_OK);
856 }
857 
858 /* Dissociate a rule from its hc object. */
859 ilb_status_t
860 ilbd_hc_dissociate_rule(const ilbd_rule_t *rule)
861 {
862 	ilbd_hc_t	*hc;
863 	ilbd_hc_rule_t	*hc_rule;
864 
865 	/* The rule is assumed to be initialized appropriately. */
866 	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
867 		logdebug("ilbd_hc_dissociate_rule: healthcheck %s does not "
868 		    "exist", rule->irl_hcname);
869 		return (ILB_STATUS_ENOENT);
870 	}
871 	for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
872 	    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
873 		if (hc_rule->hcr_rule == rule)
874 			break;
875 	}
876 	if (hc_rule == NULL) {
877 		logdebug("ilbd_hc_dissociate_rule: rule %s is not associated "
878 		    "with healtcheck %s", rule->irl_hcname, hc->ihc_name);
879 		return (ILB_STATUS_ENOENT);
880 	}
881 	ilbd_hc_srv_rem_all(hc_rule);
882 	list_remove(&hc->ihc_rules, hc_rule);
883 	hc->ihc_rule_cnt--;
884 	return (ILB_STATUS_OK);
885 }
886 
887 /*
888  * Given a hc object name and a rule, check to see if the rule is associated
889  * with the hc object.  If it is, the hc object is returned in **hc and the
890  * ilbd_hc_rule_t is returned in **hc_rule.
891  */
892 static boolean_t
893 ilbd_hc_check_rule(const char *hc_name, const ilbd_rule_t *rule,
894     ilbd_hc_t **hc, ilbd_hc_rule_t **hc_rule)
895 {
896 	ilbd_hc_t	*tmp_hc;
897 	ilbd_hc_rule_t	*tmp_hc_rule;
898 
899 	if ((tmp_hc = ilbd_get_hc(hc_name)) == NULL)
900 		return (B_FALSE);
901 	for (tmp_hc_rule = list_head(&tmp_hc->ihc_rules); tmp_hc_rule != NULL;
902 	    tmp_hc_rule = list_next(&tmp_hc->ihc_rules, tmp_hc_rule)) {
903 		if (tmp_hc_rule->hcr_rule == rule) {
904 			*hc = tmp_hc;
905 			*hc_rule = tmp_hc_rule;
906 			return (B_TRUE);
907 		}
908 	}
909 	return (B_FALSE);
910 }
911 
912 /* Associate a server with its hc object. */
913 ilb_status_t
914 ilbd_hc_add_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
915     int ev_port)
916 {
917 	ilbd_hc_t	*hc;
918 	ilbd_hc_rule_t	*hc_rule;
919 
920 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
921 		return (ILB_STATUS_ENOENT);
922 	return (ilbd_hc_srv_add(hc, hc_rule, srv, ev_port));
923 }
924 
925 /* Dissociate a server from its hc object. */
926 ilb_status_t
927 ilbd_hc_del_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
928 {
929 	ilbd_hc_t	*hc;
930 	ilbd_hc_rule_t	*hc_rule;
931 
932 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
933 		return (ILB_STATUS_ENOENT);
934 	return (ilbd_hc_srv_rem(hc_rule, srv));
935 }
936 
937 /* Helper routine to enable/disable a server's hc probe. */
938 static ilb_status_t
939 ilbd_hc_toggle_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
940     boolean_t enable)
941 {
942 	ilbd_hc_t	*hc;
943 	ilbd_hc_rule_t	*hc_rule;
944 	ilbd_hc_srv_t	*tmp_srv;
945 	ilb_status_t	ret;
946 
947 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
948 		return (ILB_STATUS_ENOENT);
949 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
950 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
951 		if (tmp_srv->shc_sg_srv != srv) {
952 			continue;
953 		}
954 		if (enable) {
955 			if (tmp_srv->shc_status == ILB_HCS_DISABLED) {
956 				ret = ilbd_hc_restart_timer(hc, tmp_srv);
957 				if (ret != ILB_STATUS_OK) {
958 					logerr("%s: cannot start timers for "
959 					    "rule %s server %s", __func__,
960 					    rule->irl_name,
961 					    tmp_srv->shc_sg_srv->sgs_srvID);
962 					return (ret);
963 				}
964 				/* Start from fresh... */
965 				tmp_srv->shc_status = ILB_HCS_UNINIT;
966 				tmp_srv->shc_rtt = 0;
967 				tmp_srv->shc_fail_cnt = 0;
968 			}
969 		} else {
970 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
971 				tmp_srv->shc_status = ILB_HCS_DISABLED;
972 				HC_CANCEL_TIMER(tmp_srv);
973 				if (tmp_srv->shc_child_pid != 0)
974 					ilbd_hc_kill_probe(tmp_srv);
975 			}
976 		}
977 		return (ILB_STATUS_OK);
978 	}
979 	return (ILB_STATUS_ENOENT);
980 }
981 
982 ilb_status_t
983 ilbd_hc_enable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
984 {
985 	return (ilbd_hc_toggle_server(rule, srv, B_TRUE));
986 }
987 
988 ilb_status_t
989 ilbd_hc_disable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
990 {
991 	return (ilbd_hc_toggle_server(rule, srv, B_FALSE));
992 }
993 
994 /*
995  * Helper routine to enable/disable a rule's hc probe (including all its
996  * servers).
997  */
998 static ilb_status_t
999 ilbd_hc_toggle_rule(const ilbd_rule_t *rule, boolean_t enable)
1000 {
1001 	ilbd_hc_t	*hc;
1002 	ilbd_hc_rule_t	*hc_rule;
1003 	ilbd_hc_srv_t	*tmp_srv;
1004 	int		ret;
1005 
1006 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
1007 		return (ILB_STATUS_ENOENT);
1008 
1009 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
1010 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
1011 		if (enable) {
1012 			/*
1013 			 * If the server is disabled in the rule, do not
1014 			 * restart its timer.
1015 			 */
1016 			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1017 			    ILB_IS_SRV_ENABLED(
1018 			    tmp_srv->shc_sg_srv->sgs_flags)) {
1019 				ret = ilbd_hc_restart_timer(hc, tmp_srv);
1020 				if (ret != ILB_STATUS_OK) {
1021 					logerr("%s: cannot start timers for "
1022 					    "rule %s server %s", __func__,
1023 					    rule->irl_name,
1024 					    tmp_srv->shc_sg_srv->sgs_srvID);
1025 					goto rollback;
1026 				} else {
1027 					/* Start from fresh... */
1028 					tmp_srv->shc_status = ILB_HCS_UNINIT;
1029 					tmp_srv->shc_rtt = 0;
1030 					tmp_srv->shc_fail_cnt = 0;
1031 				}
1032 			}
1033 		} else {
1034 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1035 				HC_CANCEL_TIMER(tmp_srv);
1036 				tmp_srv->shc_status = ILB_HCS_DISABLED;
1037 				if (tmp_srv->shc_child_pid != 0)
1038 					ilbd_hc_kill_probe(tmp_srv);
1039 			}
1040 		}
1041 	}
1042 	return (ILB_STATUS_OK);
1043 rollback:
1044 	enable = !enable;
1045 	for (tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv);
1046 	    tmp_srv != NULL;
1047 	    tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv)) {
1048 		if (enable) {
1049 			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1050 			    ILB_IS_SRV_ENABLED(
1051 			    tmp_srv->shc_sg_srv->sgs_flags)) {
1052 				(void) ilbd_hc_restart_timer(hc, tmp_srv);
1053 				tmp_srv->shc_status = ILB_HCS_UNINIT;
1054 				tmp_srv->shc_rtt = 0;
1055 				tmp_srv->shc_fail_cnt = 0;
1056 			}
1057 		} else {
1058 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1059 				HC_CANCEL_TIMER(tmp_srv);
1060 				tmp_srv->shc_status = ILB_HCS_DISABLED;
1061 				if (tmp_srv->shc_child_pid != 0)
1062 					ilbd_hc_kill_probe(tmp_srv);
1063 			}
1064 		}
1065 	}
1066 	return (ret);
1067 }
1068 
1069 ilb_status_t
1070 ilbd_hc_enable_rule(const ilbd_rule_t *rule)
1071 {
1072 	return (ilbd_hc_toggle_rule(rule, B_TRUE));
1073 }
1074 
1075 ilb_status_t
1076 ilbd_hc_disable_rule(const ilbd_rule_t *rule)
1077 {
1078 	return (ilbd_hc_toggle_rule(rule, B_FALSE));
1079 }
1080 
1081 static const char *
1082 topo_2_str(ilb_topo_t topo)
1083 {
1084 	switch (topo) {
1085 	case ILB_TOPO_DSR:
1086 		return ("DSR");
1087 	case ILB_TOPO_NAT:
1088 		return ("NAT");
1089 	case ILB_TOPO_HALF_NAT:
1090 		return ("HALF_NAT");
1091 	default:
1092 		/* Should not happen. */
1093 		logerr("%s: unknown topology", __func__);
1094 		break;
1095 	}
1096 	return ("");
1097 }
1098 
1099 /*
1100  * Create the argument list to be passed to a hc probe command.
1101  * The passed in argv is assumed to have HC_PROBE_ARGC elements.
1102  */
1103 static boolean_t
1104 create_argv(ilbd_hc_srv_t *srv, char *argv[])
1105 {
1106 	char buf[INET6_ADDRSTRLEN];
1107 	ilbd_rule_t const *rule;
1108 	ilb_sg_srv_t const *sg_srv;
1109 	struct in_addr v4_addr;
1110 	in_port_t port;
1111 	int i;
1112 
1113 	rule = srv->shc_hc_rule->hcr_rule;
1114 	sg_srv = srv->shc_sg_srv;
1115 
1116 	if (srv->shc_state == ilbd_hc_def_pinging) {
1117 		if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL)
1118 			return (B_FALSE);
1119 	} else {
1120 		switch (srv->shc_hc->ihc_test_type) {
1121 		case ILBD_HC_USER:
1122 			if ((argv[0] = strdup(srv->shc_hc->ihc_test)) == NULL)
1123 				return (B_FALSE);
1124 			break;
1125 		case ILBD_HC_TCP:
1126 		case ILBD_HC_UDP:
1127 			if ((argv[0] = strdup(ILB_PROBE_PROTO)) ==
1128 			    NULL) {
1129 				return (B_FALSE);
1130 			}
1131 			break;
1132 		case ILBD_HC_PING:
1133 			if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL) {
1134 				return (B_FALSE);
1135 			}
1136 			break;
1137 		}
1138 	}
1139 
1140 	/*
1141 	 * argv[1] is the VIP.
1142 	 *
1143 	 * Right now, the VIP and the backend server addresses should be
1144 	 * in the same IP address family.  Here we don't do that in case
1145 	 * this assumption is changed in future.
1146 	 */
1147 	if (IN6_IS_ADDR_V4MAPPED(&rule->irl_vip)) {
1148 		IN6_V4MAPPED_TO_INADDR(&rule->irl_vip, &v4_addr);
1149 		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1150 			goto cleanup;
1151 	} else {
1152 		if (inet_ntop(AF_INET6, &rule->irl_vip, buf,
1153 		    sizeof (buf)) == NULL) {
1154 			goto cleanup;
1155 		}
1156 	}
1157 	if ((argv[1] = strdup(buf)) == NULL)
1158 		goto cleanup;
1159 
1160 	/*
1161 	 * argv[2] is the backend server address.
1162 	 */
1163 	if (IN6_IS_ADDR_V4MAPPED(&sg_srv->sgs_addr)) {
1164 		IN6_V4MAPPED_TO_INADDR(&sg_srv->sgs_addr, &v4_addr);
1165 		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1166 			goto cleanup;
1167 	} else {
1168 		if (inet_ntop(AF_INET6, &sg_srv->sgs_addr, buf,
1169 		    sizeof (buf)) == NULL) {
1170 			goto cleanup;
1171 		}
1172 	}
1173 	if ((argv[2] = strdup(buf)) == NULL)
1174 		goto cleanup;
1175 
1176 	/*
1177 	 * argv[3] is the transport protocol used in the rule.
1178 	 */
1179 	switch (rule->irl_proto) {
1180 	case IPPROTO_TCP:
1181 		argv[3] = strdup("TCP");
1182 		break;
1183 	case IPPROTO_UDP:
1184 		argv[3] = strdup("UDP");
1185 		break;
1186 	default:
1187 		logerr("%s: unknown protocol", __func__);
1188 		goto cleanup;
1189 	}
1190 	if (argv[3] == NULL)
1191 		goto cleanup;
1192 
1193 	/*
1194 	 * argv[4] is the load balance mode, DSR, NAT, HALF-NAT.
1195 	 */
1196 	if ((argv[4] = strdup(topo_2_str(rule->irl_topo))) == NULL)
1197 		goto cleanup;
1198 
1199 	/*
1200 	 * argv[5] is the port range.  Right now, there should only be 1 port.
1201 	 */
1202 	switch (rule->irl_hcpflag) {
1203 	case ILB_HCI_PROBE_FIX:
1204 		port = ntohs(rule->irl_hcport);
1205 		break;
1206 	case ILB_HCI_PROBE_ANY: {
1207 		in_port_t min, max;
1208 
1209 		if (ntohs(sg_srv->sgs_minport) == 0) {
1210 			min = ntohs(rule->irl_minport);
1211 			max = ntohs(rule->irl_maxport);
1212 		} else {
1213 			min = ntohs(sg_srv->sgs_minport);
1214 			max = ntohs(sg_srv->sgs_maxport);
1215 		}
1216 		if (max > min)
1217 			port = min + gethrtime() % (max - min + 1);
1218 		else
1219 			port = min;
1220 		break;
1221 	}
1222 	default:
1223 		logerr("%s: unknown HC flag", __func__);
1224 		goto cleanup;
1225 	}
1226 	(void) sprintf(buf, "%d", port);
1227 	if ((argv[5] = strdup(buf)) == NULL)
1228 		goto cleanup;
1229 
1230 	/*
1231 	 * argv[6] is the probe timeout.
1232 	 */
1233 	(void) sprintf(buf, "%d", srv->shc_hc->ihc_timeout);
1234 	if ((argv[6] = strdup(buf)) == NULL)
1235 		goto cleanup;
1236 
1237 	argv[7] = NULL;
1238 	return (B_TRUE);
1239 
1240 cleanup:
1241 	for (i = 0; i < HC_PROBE_ARGC; i++) {
1242 		if (argv[i] != NULL)
1243 			free(argv[i]);
1244 	}
1245 	return (B_FALSE);
1246 }
1247 
/*
 * Free the strings of a NULL terminated argument vector, stopping at the
 * first NULL slot.  Every freed slot is reset to NULL, so running this
 * routine a second time on the same vector (as the error path of
 * ilbd_run_probe() may do) is harmless instead of a double free.
 */
static void
destroy_argv(char *argv[])
{
	int i;

	for (i = 0; argv[i] != NULL; i++) {
		free(argv[i]);
		argv[i] = NULL;
	}
}
1256 
1257 /* Spawn a process to run the hc probe on the given server. */
1258 static boolean_t
1259 ilbd_run_probe(ilbd_hc_srv_t *srv)
1260 {
1261 	posix_spawn_file_actions_t	fd_actions;
1262 	posix_spawnattr_t		attr;
1263 	sigset_t			child_sigset;
1264 	int				fds[2];
1265 	int				fdflags;
1266 	pid_t				pid;
1267 	char				*child_argv[HC_PROBE_ARGC];
1268 	ilbd_hc_probe_event_t		*probe_ev;
1269 	char				*probe_name;
1270 
1271 	bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
1272 	if ((probe_ev = calloc(1, sizeof (*probe_ev))) == NULL) {
1273 		logdebug("ilbd_run_probe: calloc");
1274 		return (B_FALSE);
1275 	}
1276 
1277 	/* Set up a pipe to get output from probe command. */
1278 	if (pipe(fds) < 0) {
1279 		logdebug("ilbd_run_probe: cannot create pipe");
1280 		free(probe_ev);
1281 		return (B_FALSE);
1282 	}
1283 	/* Set our side of the pipe to be non-blocking */
1284 	if ((fdflags = fcntl(fds[0], F_GETFL, 0)) == -1) {
1285 		logdebug("ilbd_run_probe: fcntl(F_GETFL)");
1286 		goto cleanup;
1287 	}
1288 	if (fcntl(fds[0], F_SETFL, fdflags | O_NONBLOCK) == -1) {
1289 		logdebug("ilbd_run_probe: fcntl(F_SETFL)");
1290 		goto cleanup;
1291 	}
1292 
1293 	if (posix_spawn_file_actions_init(&fd_actions) != 0) {
1294 		logdebug("ilbd_run_probe: posix_spawn_file_actions_init");
1295 		goto cleanup;
1296 	}
1297 	if (posix_spawnattr_init(&attr) != 0) {
1298 		logdebug("ilbd_run_probe: posix_spawnattr_init");
1299 		goto cleanup;
1300 	}
1301 	if (posix_spawn_file_actions_addclose(&fd_actions, fds[0]) != 0) {
1302 		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1303 		goto cleanup;
1304 	}
1305 	if (posix_spawn_file_actions_adddup2(&fd_actions, fds[1],
1306 	    STDOUT_FILENO) != 0) {
1307 		logdebug("ilbd_run_probe: posix_spawn_file_actions_dup2");
1308 		goto cleanup;
1309 	}
1310 	if (posix_spawn_file_actions_addclose(&fd_actions, fds[1]) != 0) {
1311 		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1312 		goto cleanup;
1313 	}
1314 
1315 	/* Reset all signal handling of the child to default. */
1316 	(void) sigfillset(&child_sigset);
1317 	if (posix_spawnattr_setsigdefault(&attr, &child_sigset) != 0) {
1318 		logdebug("ilbd_run_probe: posix_spawnattr_setsigdefault");
1319 		goto cleanup;
1320 	}
1321 	/* Don't want SIGCHLD. */
1322 	if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_NOSIGCHLD_NP|
1323 	    POSIX_SPAWN_SETSIGDEF) != 0) {
1324 		logdebug("ilbd_run_probe: posix_spawnattr_setflags");
1325 		goto cleanup;
1326 	}
1327 
1328 	if (!create_argv(srv, child_argv)) {
1329 		logdebug("ilbd_run_probe: create_argv");
1330 		goto cleanup;
1331 	}
1332 
1333 	/*
1334 	 * If we are doing default pinging or not using a user supplied
1335 	 * probe, we should execute our standard supplied probe.  The
1336 	 * supplied probe command handles all types of probes.  And the
1337 	 * type used depends on argv[0], as filled in by create_argv().
1338 	 */
1339 	if (srv->shc_state == ilbd_hc_def_pinging ||
1340 	    srv->shc_hc->ihc_test_type != ILBD_HC_USER) {
1341 		probe_name = ILB_PROBE_PROTO;
1342 	} else {
1343 		probe_name = srv->shc_hc->ihc_test;
1344 	}
1345 	if (posix_spawn(&pid, probe_name, &fd_actions, &attr, child_argv,
1346 	    NULL) != 0) {
1347 		logerr("%s: posix_spawn: %s for server %s: %s", __func__,
1348 		    srv->shc_hc->ihc_test, srv->shc_sg_srv->sgs_srvID,
1349 		    strerror(errno));
1350 		goto cleanup;
1351 	}
1352 
1353 	(void) close(fds[1]);
1354 	destroy_argv(child_argv);
1355 	srv->shc_child_pid = pid;
1356 	srv->shc_child_fd = fds[0];
1357 	srv->shc_ev = probe_ev;
1358 
1359 	probe_ev->ihp_ev = ILBD_EVENT_PROBE;
1360 	probe_ev->ihp_srv = srv;
1361 	probe_ev->ihp_pid = pid;
1362 	if (port_associate(srv->shc_ev_port, PORT_SOURCE_FD, fds[0],
1363 	    POLLRDNORM, probe_ev) != 0) {
1364 		/*
1365 		 * Need to kill the child.  It will free the srv->shc_ev,
1366 		 * which is probe_ev.  So set probe_ev to NULL.
1367 		 */
1368 		ilbd_hc_kill_probe(srv);
1369 		probe_ev = NULL;
1370 		goto cleanup;
1371 	}
1372 
1373 	return (B_TRUE);
1374 
1375 cleanup:
1376 	(void) close(fds[0]);
1377 	(void) close(fds[1]);
1378 	destroy_argv(child_argv);
1379 	if (probe_ev != NULL)
1380 		free(probe_ev);
1381 	return (B_FALSE);
1382 }
1383 
1384 /*
1385  * Called by ild_hc_probe_return() to re-associate the fd to a child to
1386  * the event port.
1387  */
1388 static void
1389 reassociate_port(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1390 {
1391 	if (port_associate(ev_port, PORT_SOURCE_FD, fd,
1392 	    POLLRDNORM, ev) != 0) {
1393 		/*
1394 		 * If we cannot reassociate with the port, the only
1395 		 * thing we can do now is to kill the child and
1396 		 * do a blocking wait here...
1397 		 */
1398 		logdebug("%s: port_associate: %s", __func__, strerror(errno));
1399 		if (kill(ev->ihp_pid, SIGKILL) != 0)
1400 			logerr("%s: kill: %s", __func__, strerror(errno));
1401 		if (waitpid(ev->ihp_pid, NULL, 0) != ev->ihp_pid)
1402 			logdebug("%s: waitpid: %s", __func__, strerror(errno));
1403 		free(ev);
1404 	}
1405 }
1406 
1407 /*
1408  * To handle a child probe process hanging up.
1409  */
1410 static void
1411 ilbd_hc_child_hup(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1412 {
1413 	ilbd_hc_srv_t *srv;
1414 	pid_t ret_pid;
1415 	int ret;
1416 
1417 	srv = ev->ihp_srv;
1418 
1419 	if (!ev->ihp_done) {
1420 		/* ilbd does not care about this process anymore ... */
1421 		ev->ihp_done = B_TRUE;
1422 		srv->shc_ev = NULL;
1423 		srv->shc_child_pid = 0;
1424 		HC_CANCEL_TIMER(srv);
1425 		ilbd_set_fail_state(srv);
1426 	}
1427 	ret_pid = waitpid(ev->ihp_pid, &ret, WNOHANG);
1428 	switch (ret_pid) {
1429 	case -1:
1430 		logperror("ilbd_hc_child_hup: waitpid");
1431 		/* FALLTHROUGH */
1432 	case 0:
1433 		/* The child has not completed the exit. Wait again. */
1434 		reassociate_port(ev_port, fd, ev);
1435 		break;
1436 	default:
1437 		/* Right now, we just ignore the exit status. */
1438 		if (WIFEXITED(ret))
1439 			ret = WEXITSTATUS(ret);
1440 		(void) close(fd);
1441 		free(ev);
1442 	}
1443 }
1444 
1445 /*
1446  * To read the output of a child probe process.
1447  */
1448 static void
1449 ilbd_hc_child_data(int fd, ilbd_hc_probe_event_t *ev)
1450 {
1451 	ilbd_hc_srv_t *srv;
1452 	char buf[HC_MAX_PROBE_OUTPUT];
1453 	int ret;
1454 	int64_t rtt;
1455 
1456 	srv = ev->ihp_srv;
1457 
1458 	bzero(buf, HC_MAX_PROBE_OUTPUT);
1459 	ret = read(fd, buf, HC_MAX_PROBE_OUTPUT - 1);
1460 	/* Should not happen since event port should have caught this. */
1461 	assert(ret > 0);
1462 
1463 	/*
1464 	 * We expect the probe command to print out the RTT only.  But
1465 	 * the command may misbehave and print out more than what we intend to
1466 	 * read in.  So need to do this check below to "flush" out all the
1467 	 * output from the command.
1468 	 */
1469 	if (!ev->ihp_done) {
1470 		ev->ihp_done = B_TRUE;
1471 		/* We don't need to know about this event anymore. */
1472 		srv->shc_ev = NULL;
1473 		srv->shc_child_pid = 0;
1474 		HC_CANCEL_TIMER(srv);
1475 	} else {
1476 		return;
1477 	}
1478 
1479 	rtt = strtoll(buf, NULL, 10);
1480 
1481 	/*
1482 	 * -1 means the server is dead or the probe somehow fails.  Treat
1483 	 * them both as server is dead.
1484 	 */
1485 	if (rtt == -1) {
1486 		ilbd_set_fail_state(srv);
1487 		return;
1488 	} else if (rtt > 0) {
1489 		/* If the returned RTT value is not valid, just ignore it. */
1490 		if (rtt > 0 && rtt <= UINT_MAX) {
1491 			/* Set rtt to be the simple smoothed average. */
1492 			if (srv->shc_rtt == 0) {
1493 				srv->shc_rtt = rtt;
1494 			} else {
1495 				srv->shc_rtt = 3 * ((srv)->shc_rtt >> 2) +
1496 				    (rtt >> 2);
1497 			}
1498 		}
1499 
1500 	}
1501 
1502 	switch (srv->shc_state) {
1503 	case ilbd_hc_def_pinging:
1504 		srv->shc_state = ilbd_hc_probing;
1505 
1506 		/* Ping is OK, now start the probe. */
1507 		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
1508 		break;
1509 	case ilbd_hc_probing:
1510 		srv->shc_fail_cnt = 0;
1511 
1512 		/* Server is dead before, re-enable it. */
1513 		if (srv->shc_status == ILB_HCS_UNREACH ||
1514 		    srv->shc_status == ILB_HCS_DEAD) {
1515 			/*
1516 			 * If enabling the server in kernel fails now,
1517 			 * hopefully when the timer fires again later, the
1518 			 * enabling can be done.
1519 			 */
1520 			if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
1521 			    srv->shc_hc_rule->hcr_rule->irl_name,
1522 			    stat_declare_srv_alive) != ILB_STATUS_OK) {
1523 				logerr("%s: cannot enable server in kernel: "
1524 				    " rule %s server %s", __func__,
1525 				    srv->shc_hc_rule->hcr_rule->irl_name,
1526 				    srv->shc_sg_srv->sgs_srvID);
1527 			} else {
1528 				srv->shc_status = ILB_HCS_ALIVE;
1529 			}
1530 		} else {
1531 			srv->shc_status = ILB_HCS_ALIVE;
1532 		}
1533 		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
1534 			logerr("%s: cannot restart timer: rule %s server %s",
1535 			    __func__, srv->shc_hc_rule->hcr_rule->irl_name,
1536 			    srv->shc_sg_srv->sgs_srvID);
1537 			ilbd_mark_server_disabled(srv);
1538 		}
1539 		break;
1540 	default:
1541 		logdebug("%s: unknown state", __func__);
1542 		break;
1543 	}
1544 }
1545 
1546 /*
1547  * Handle the return event of a child probe fd.
1548  */
1549 void
1550 ilbd_hc_probe_return(int ev_port, int fd, int port_events,
1551     ilbd_hc_probe_event_t *ev)
1552 {
1553 	/*
1554 	 * Note that there can be more than one events delivered to us at
1555 	 * the same time.  So we need to check them individually.
1556 	 */
1557 	if (port_events & POLLRDNORM)
1558 		ilbd_hc_child_data(fd, ev);
1559 
1560 	if (port_events & (POLLHUP|POLLERR)) {
1561 		ilbd_hc_child_hup(ev_port, fd, ev);
1562 		return;
1563 	}
1564 
1565 	/*
1566 	 * Re-associate the fd with the port so that when the child
1567 	 * exits, we can reap the status.
1568 	 */
1569 	reassociate_port(ev_port, fd, ev);
1570 }
1571