xref: /titanic_44/usr/src/cmd/cmd-inet/usr.lib/ilbd/ilbd_hc.c (revision 4a16f9a6c1cc74aeed5ff36b4723c3e43bc67666)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/socket.h>
29 #include <sys/list.h>
30 #include <sys/stropts.h>
31 #include <sys/siginfo.h>
32 #include <sys/wait.h>
33 #include <arpa/inet.h>
34 #include <netinet/in.h>
35 #include <stdlib.h>
36 #include <stdio.h>
37 #include <strings.h>
38 #include <stddef.h>
39 #include <unistd.h>
40 #include <libilb.h>
41 #include <port.h>
42 #include <time.h>
43 #include <signal.h>
44 #include <assert.h>
45 #include <errno.h>
46 #include <spawn.h>
47 #include <fcntl.h>
48 #include <limits.h>
49 #include "libilb_impl.h"
50 #include "ilbd.h"
51 
/*
 * Global list of HC objects.  Entries are ilbd_hc_t structures linked
 * through their ihc_link field (see i_ilbd_setup_hc_list()).
 */
list_t ilbd_hc_list;

/* Timer queue for all hc related timers. */
static iu_tq_t *ilbd_hc_timer_q;

/*
 * Indicate whether the timer needs to be updated.  It is set whenever a
 * timer is scheduled, cancelled or expired in this file and is cleared by
 * ilbd_hc_timer_update() once the timer has been re-armed.
 */
static boolean_t hc_timer_restarted;

static void ilbd_hc_probe_timer(iu_tq_t *, void *);
static ilb_status_t ilbd_hc_restart_timer(ilbd_hc_t *, ilbd_hc_srv_t *);
static boolean_t ilbd_run_probe(ilbd_hc_srv_t *);

/*
 * Return the larger of two values.  Note that the arguments are expanded
 * more than once, so they should not have side effects.
 */
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/*
 * Number of arguments passed to a probe.  argv[0] is the path name of
 * the probe.
 */
#define	HC_PROBE_ARGC	8

/*
 * Max number of characters to be read from the output of a probe.  It
 * is long enough to read in a 64 bit integer.
 */
#define	HC_MAX_PROBE_OUTPUT	24
78 
79 void
80 i_ilbd_setup_hc_list(void)
81 {
82 	list_create(&ilbd_hc_list, sizeof (ilbd_hc_t),
83 	    offsetof(ilbd_hc_t, ihc_link));
84 }
85 
86 /*
87  * Given a hc object name, return a pointer to hc object if found.
88  */
89 ilbd_hc_t *
90 ilbd_get_hc(const char *name)
91 {
92 	ilbd_hc_t *hc;
93 
94 	for (hc = list_head(&ilbd_hc_list); hc != NULL;
95 	    hc = list_next(&ilbd_hc_list, hc)) {
96 		if (strcasecmp(hc->ihc_name, name) == 0)
97 			return (hc);
98 	}
99 	return (NULL);
100 }
101 
102 /*
103  * Generates an audit record for create-healthcheck,
104  * delete-healtcheck subcommands.
105  */
106 static void
107 ilbd_audit_hc_event(const char *audit_hcname,
108     const ilb_hc_info_t *audit_hcinfo, ilbd_cmd_t cmd,
109     ilb_status_t rc, ucred_t *ucredp)
110 {
111 	adt_session_data_t	*ah;
112 	adt_event_data_t	*event;
113 	au_event_t	flag;
114 	int	audit_error;
115 
116 	if ((ucredp == NULL) && (cmd == ILBD_CREATE_HC))  {
117 		/*
118 		 * we came here from the path where ilbd incorporates
119 		 * the configuration that is listed in SCF:
120 		 * i_ilbd_read_config->ilbd_walk_hc_pgs->
121 		 *   ->ilbd_scf_instance_walk_pg->ilbd_create_hc
122 		 * We skip auditing in that case
123 		 */
124 		logdebug("ilbd_audit_hc_event: skipping auditing");
125 		return;
126 	}
127 
128 	if (adt_start_session(&ah, NULL, 0) != 0) {
129 		logerr("ilbd_audit_hc_event: adt_start_session failed");
130 		exit(EXIT_FAILURE);
131 	}
132 	if (adt_set_from_ucred(ah, ucredp, ADT_NEW) != 0) {
133 		(void) adt_end_session(ah);
134 		logerr("ilbd_audit_rule_event: adt_set_from_ucred failed");
135 		exit(EXIT_FAILURE);
136 	}
137 	if (cmd == ILBD_CREATE_HC)
138 		flag = ADT_ilb_create_healthcheck;
139 	else if (cmd == ILBD_DESTROY_HC)
140 		flag = ADT_ilb_delete_healthcheck;
141 
142 	if ((event = adt_alloc_event(ah, flag)) == NULL) {
143 		logerr("ilbd_audit_hc_event: adt_alloc_event failed");
144 		exit(EXIT_FAILURE);
145 	}
146 	(void) memset((char *)event, 0, sizeof (adt_event_data_t));
147 
148 	switch (cmd) {
149 	case ILBD_CREATE_HC:
150 		event->adt_ilb_create_healthcheck.auth_used =
151 		    NET_ILB_CONFIG_AUTH;
152 		event->adt_ilb_create_healthcheck.hc_test =
153 		    (char *)audit_hcinfo->hci_test;
154 		event->adt_ilb_create_healthcheck.hc_name =
155 		    (char *)audit_hcinfo->hci_name;
156 
157 		/*
158 		 * If the value 0 is stored, the default values are
159 		 * set in the kernel. User land does not know about them
160 		 * So if the user does not specify them, audit record
161 		 * will show them as 0
162 		 */
163 		event->adt_ilb_create_healthcheck.hc_timeout =
164 		    audit_hcinfo->hci_timeout;
165 		event->adt_ilb_create_healthcheck.hc_count =
166 		    audit_hcinfo->hci_count;
167 		event->adt_ilb_create_healthcheck.hc_interval =
168 		    audit_hcinfo->hci_interval;
169 		break;
170 	case ILBD_DESTROY_HC:
171 		event->adt_ilb_delete_healthcheck.auth_used =
172 		    NET_ILB_CONFIG_AUTH;
173 		event->adt_ilb_delete_healthcheck.hc_name =
174 		    (char *)audit_hcname;
175 		break;
176 	}
177 
178 	/* Fill in success/failure */
179 	if (rc == ILB_STATUS_OK) {
180 		if (adt_put_event(event, ADT_SUCCESS, ADT_SUCCESS) != 0) {
181 			logerr("ilbd_audit_hc_event: adt_put_event failed");
182 			exit(EXIT_FAILURE);
183 		}
184 	} else {
185 		audit_error = ilberror2auditerror(rc);
186 		if (adt_put_event(event, ADT_FAILURE, audit_error) != 0) {
187 			logerr("ilbd_audit_hc_event: adt_put_event failed");
188 			exit(EXIT_FAILURE);
189 		}
190 	}
191 	adt_free_event(event);
192 	(void) adt_end_session(ah);
193 }
194 
195 /*
196  * Given the ilb_hc_info_t passed in (from the libilb), create a hc object
197  * in ilbd.  The parameter ev_port is not used, refer to comments of
198  * ilbd_create_sg() in ilbd_sg.c
199  */
200 /* ARGSUSED */
201 ilb_status_t
202 ilbd_create_hc(const ilb_hc_info_t *hc_info, int ev_port,
203     const struct passwd *ps, ucred_t *ucredp)
204 {
205 	ilbd_hc_t *hc;
206 	ilb_status_t ret = ILB_STATUS_OK;
207 
208 	/*
209 	 * ps == NULL is from the daemon when it starts and load configuration
210 	 * ps != NULL is from client.
211 	 */
212 	if (ps != NULL) {
213 		ret = ilbd_check_client_config_auth(ps);
214 		if (ret != ILB_STATUS_OK) {
215 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
216 			    ret, ucredp);
217 			return (ret);
218 		}
219 	}
220 
221 	if (hc_info->hci_name[0] == '\0') {
222 		logdebug("ilbd_create_hc: missing healthcheck info");
223 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
224 		    ILB_STATUS_ENOHCINFO, ucredp);
225 		return (ILB_STATUS_ENOHCINFO);
226 	}
227 
228 	hc = ilbd_get_hc(hc_info->hci_name);
229 	if (hc != NULL) {
230 		logdebug("ilbd_create_hc: healthcheck name %s already"
231 		    " exists", hc_info->hci_name);
232 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
233 		    ILB_STATUS_EEXIST, ucredp);
234 		return (ILB_STATUS_EEXIST);
235 	}
236 
237 	/*
238 	 * Sanity check on user supplied probe.  The given path name
239 	 * must be a full path name (starts with '/') and is
240 	 * executable.
241 	 */
242 	if (strcasecmp(hc_info->hci_test, ILB_HC_STR_TCP) != 0 &&
243 	    strcasecmp(hc_info->hci_test, ILB_HC_STR_UDP) != 0 &&
244 	    strcasecmp(hc_info->hci_test, ILB_HC_STR_PING) != 0 &&
245 	    (hc_info->hci_test[0] != '/' ||
246 	    access(hc_info->hci_test, X_OK) == -1)) {
247 		if (errno == ENOENT) {
248 			logdebug("ilbd_create_hc: user script %s doesn't "
249 			    "exist", hc_info->hci_test);
250 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
251 			    ILB_STATUS_ENOENT, ucredp);
252 			return (ILB_STATUS_ENOENT);
253 		} else {
254 			logdebug("ilbd_create_hc: user script %s is "
255 			    "invalid", hc_info->hci_test);
256 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
257 			    ILB_STATUS_EINVAL, ucredp);
258 			return (ILB_STATUS_EINVAL);
259 		}
260 	}
261 
262 	/* Create and add the hc object */
263 	hc = calloc(1, sizeof (ilbd_hc_t));
264 	if (hc == NULL) {
265 		ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
266 		    ILB_STATUS_ENOMEM, ucredp);
267 		return (ILB_STATUS_ENOMEM);
268 	}
269 	(void) memcpy(&hc->ihc_info, hc_info, sizeof (ilb_hc_info_t));
270 	if (strcasecmp(hc->ihc_test, ILB_HC_STR_TCP) == 0)
271 		hc->ihc_test_type = ILBD_HC_TCP;
272 	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_UDP) == 0)
273 		hc->ihc_test_type = ILBD_HC_UDP;
274 	else if (strcasecmp(hc->ihc_test, ILB_HC_STR_PING) == 0)
275 		hc->ihc_test_type = ILBD_HC_PING;
276 	else
277 		hc->ihc_test_type = ILBD_HC_USER;
278 	list_create(&hc->ihc_rules, sizeof (ilbd_hc_rule_t),
279 	    offsetof(ilbd_hc_rule_t, hcr_link));
280 
281 	/* Update SCF */
282 	if (ps != NULL) {
283 		if ((ret = ilbd_create_pg(ILBD_SCF_HC, (void *)hc)) !=
284 		    ILB_STATUS_OK) {
285 			ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
286 			    ret, ucredp);
287 			free(hc);
288 			return (ret);
289 		}
290 	}
291 
292 	/* Everything is fine, now add it to the global list. */
293 	list_insert_tail(&ilbd_hc_list, hc);
294 	ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC, ret, ucredp);
295 	return (ret);
296 }
297 
298 /*
299  * Given a name of a hc object, destroy it.
300  */
301 ilb_status_t
302 ilbd_destroy_hc(const char *hc_name, const struct passwd *ps,
303     ucred_t *ucredp)
304 {
305 	ilb_status_t ret;
306 	ilbd_hc_t *hc;
307 
308 	/*
309 	 * No need to check ps == NULL, daemon won't call any destroy func
310 	 * at start up.
311 	 */
312 	ret = ilbd_check_client_config_auth(ps);
313 	if (ret != ILB_STATUS_OK) {
314 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
315 		    ret, ucredp);
316 		return (ret);
317 	}
318 
319 	hc = ilbd_get_hc(hc_name);
320 	if (hc == NULL) {
321 		logdebug("ilbd_destroy_hc: healthcheck %s does not exist",
322 		    hc_name);
323 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
324 		    ILB_STATUS_ENOENT, ucredp);
325 		return (ILB_STATUS_ENOENT);
326 	}
327 
328 	/* If hc is in use, cannot delete it */
329 	if (hc->ihc_rule_cnt > 0) {
330 		logdebug("ilbd_destroy_hc: healthcheck %s is associated"
331 		    " with a rule - cannot remove", hc_name);
332 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
333 		    ILB_STATUS_INUSE, ucredp);
334 		return (ILB_STATUS_INUSE);
335 	}
336 
337 	if ((ret = ilbd_destroy_pg(ILBD_SCF_HC, hc_name)) !=
338 	    ILB_STATUS_OK) {
339 		logdebug("ilbd_destroy_hc: cannot destroy healthcheck %s "
340 		    "property group", hc_name);
341 		ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
342 		    ret, ucredp);
343 		return (ret);
344 	}
345 
346 	list_remove(&ilbd_hc_list, hc);
347 	free(hc);
348 	ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC, ret, ucredp);
349 	return (ret);
350 }
351 
352 /*
353  * Given a hc object name, return its information.  Used by libilb to
354  * get hc info.
355  */
356 ilb_status_t
357 ilbd_get_hc_info(const char *hc_name, uint32_t *rbuf, size_t *rbufsz)
358 {
359 	ilbd_hc_t	*hc;
360 	ilb_hc_info_t	*hc_info;
361 	ilb_comm_t	*ic = (ilb_comm_t *)rbuf;
362 
363 	hc = ilbd_get_hc(hc_name);
364 	if (hc == NULL) {
365 		logdebug("%s: healthcheck %s does not exist", __func__,
366 		    hc_name);
367 		return (ILB_STATUS_ENOENT);
368 	}
369 	ilbd_reply_ok(rbuf, rbufsz);
370 	hc_info = (ilb_hc_info_t *)&ic->ic_data;
371 
372 	(void) strlcpy(hc_info->hci_name, hc->ihc_name, sizeof (hc->ihc_name));
373 	(void) strlcpy(hc_info->hci_test, hc->ihc_test, sizeof (hc->ihc_test));
374 	hc_info->hci_timeout = hc->ihc_timeout;
375 	hc_info->hci_count = hc->ihc_count;
376 	hc_info->hci_interval = hc->ihc_interval;
377 	hc_info->hci_def_ping = hc->ihc_def_ping;
378 
379 	*rbufsz += sizeof (ilb_hc_info_t);
380 
381 	return (ILB_STATUS_OK);
382 }
383 
384 static void
385 ilbd_hc_copy_srvs(uint32_t *rbuf, size_t *rbufsz, ilbd_hc_rule_t *hc_rule,
386     const char *rulename)
387 {
388 	ilbd_hc_srv_t		*tmp_srv;
389 	ilb_hc_srv_t		*dst_srv;
390 	ilb_hc_rule_srv_t	*srvs;
391 	size_t			tmp_rbufsz;
392 	int			i;
393 
394 	tmp_rbufsz = *rbufsz;
395 	/* Set up the reply buffer.  rbufsz will be set to the new size. */
396 	ilbd_reply_ok(rbuf, rbufsz);
397 
398 	/* Calculate how much space is left for holding server info. */
399 	*rbufsz += sizeof (ilb_hc_rule_srv_t);
400 	tmp_rbufsz -= *rbufsz;
401 
402 	srvs = (ilb_hc_rule_srv_t *)&((ilb_comm_t *)rbuf)->ic_data;
403 
404 	tmp_srv = list_head(&hc_rule->hcr_servers);
405 	for (i = 0; tmp_srv != NULL && tmp_rbufsz >= sizeof (*dst_srv); i++) {
406 		dst_srv = &srvs->rs_srvs[i];
407 
408 		(void) strlcpy(dst_srv->hcs_rule_name, rulename, ILB_NAMESZ);
409 		(void) strlcpy(dst_srv->hcs_ID, tmp_srv->shc_sg_srv->sgs_srvID,
410 		    ILB_NAMESZ);
411 		(void) strlcpy(dst_srv->hcs_hc_name,
412 		    tmp_srv->shc_hc->ihc_name, ILB_NAMESZ);
413 		dst_srv->hcs_IP = tmp_srv->shc_sg_srv->sgs_addr;
414 		dst_srv->hcs_fail_cnt = tmp_srv->shc_fail_cnt;
415 		dst_srv->hcs_status = tmp_srv->shc_status;
416 		dst_srv->hcs_rtt = tmp_srv->shc_rtt;
417 		dst_srv->hcs_lasttime = tmp_srv->shc_lasttime;
418 		dst_srv->hcs_nexttime = tmp_srv->shc_nexttime;
419 
420 		tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv);
421 		tmp_rbufsz -= sizeof (*dst_srv);
422 	}
423 	srvs->rs_num_srvs = i;
424 	*rbufsz += i * sizeof (*dst_srv);
425 }
426 
427 /*
428  * Given a rule name, return the hc status of its servers.
429  */
430 ilb_status_t
431 ilbd_get_hc_srvs(const char *rulename, uint32_t *rbuf, size_t *rbufsz)
432 {
433 	ilbd_hc_t	*hc;
434 	ilbd_hc_rule_t	*hc_rule;
435 
436 	for (hc = list_head(&ilbd_hc_list); hc != NULL;
437 	    hc = list_next(&ilbd_hc_list, hc)) {
438 		for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
439 		    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
440 			if (strcasecmp(hc_rule->hcr_rule->irl_name,
441 			    rulename) != 0) {
442 				continue;
443 			}
444 			ilbd_hc_copy_srvs(rbuf, rbufsz, hc_rule, rulename);
445 			return (ILB_STATUS_OK);
446 		}
447 	}
448 	return (ILB_STATUS_RULE_NO_HC);
449 }
450 
451 /*
452  * Initialize the hc timer and associate the notification of timeout to
453  * the given event port.
454  */
455 void
456 ilbd_hc_timer_init(int ev_port, ilbd_timer_event_obj_t *ev_obj)
457 {
458 	struct sigevent sigev;
459 	port_notify_t notify;
460 
461 	if ((ilbd_hc_timer_q = iu_tq_create()) == NULL) {
462 		logerr("%s: cannot create hc timer queue", __func__);
463 		exit(EXIT_FAILURE);
464 	}
465 	hc_timer_restarted = B_FALSE;
466 
467 	ev_obj->ev = ILBD_EVENT_TIMER;
468 	ev_obj->timerid = -1;
469 
470 	notify.portnfy_port = ev_port;
471 	notify.portnfy_user = ev_obj;
472 	sigev.sigev_notify = SIGEV_PORT;
473 	sigev.sigev_value.sival_ptr = &notify;
474 	if (timer_create(CLOCK_REALTIME, &sigev, &ev_obj->timerid) == -1) {
475 		logerr("%s: cannot create timer", __func__);
476 		exit(EXIT_FAILURE);
477 	}
478 }
479 
480 /*
481  * HC timeout handler.
482  */
483 void
484 ilbd_hc_timeout(void)
485 {
486 	(void) iu_expire_timers(ilbd_hc_timer_q);
487 	hc_timer_restarted = B_TRUE;
488 }
489 
490 /*
491  * Set up the timer to fire at the earliest timeout.
492  */
493 void
494 ilbd_hc_timer_update(ilbd_timer_event_obj_t *ev_obj)
495 {
496 	itimerspec_t itimeout;
497 	int timeout;
498 
499 	/*
500 	 * There is no change on the timer list, so no need to set up the
501 	 * timer again.
502 	 */
503 	if (!hc_timer_restarted)
504 		return;
505 
506 restart:
507 	if ((timeout = iu_earliest_timer(ilbd_hc_timer_q)) == INFTIM) {
508 		hc_timer_restarted = B_FALSE;
509 		return;
510 	} else if (timeout == 0) {
511 		/*
512 		 * Handle the timeout immediately.  After that (clearing all
513 		 * the expired timers), check to  see if there are still
514 		 * timers running.  If yes, start them.
515 		 */
516 		(void) iu_expire_timers(ilbd_hc_timer_q);
517 		goto restart;
518 	}
519 
520 	itimeout.it_value.tv_sec = timeout / MILLISEC + 1;
521 	itimeout.it_value.tv_nsec = 0;
522 	itimeout.it_interval.tv_sec = 0;
523 	itimeout.it_interval.tv_nsec = 0;
524 
525 	/*
526 	 * Failure to set a timeout is "OK" since hopefully there will be
527 	 * other events and timer_settime() will be called again.  So
528 	 * we will only miss some timeouts.  But in the worst case, no event
529 	 * will happen and ilbd will get stuck...
530 	 */
531 	if (timer_settime(ev_obj->timerid, 0, &itimeout, NULL) == -1)
532 		logerr("%s: cannot set timer", __func__);
533 	hc_timer_restarted = B_FALSE;
534 }
535 
536 /*
537  * Kill the probe process of a server.
538  */
539 static void
540 ilbd_hc_kill_probe(ilbd_hc_srv_t *srv)
541 {
542 	/*
543 	 * First dissociate the fd from the event port.  It should not
544 	 * fail.
545 	 */
546 	if (port_dissociate(srv->shc_ev_port, PORT_SOURCE_FD,
547 	    srv->shc_child_fd) != 0) {
548 		logdebug("%s: port_dissociate: %s", __func__, strerror(errno));
549 	}
550 	(void) close(srv->shc_child_fd);
551 	free(srv->shc_ev);
552 	srv->shc_ev = NULL;
553 
554 	/* Then kill the probe process. */
555 	if (kill(srv->shc_child_pid, SIGKILL) != 0) {
556 		logerr("%s: rule %s server %s: %s", __func__,
557 		    srv->shc_hc_rule->hcr_rule->irl_name,
558 		    srv->shc_sg_srv->sgs_srvID, strerror(errno));
559 	}
560 	/* Should not fail... */
561 	if (waitpid(srv->shc_child_pid, NULL, 0) != srv->shc_child_pid) {
562 		logdebug("%s: waitpid: rule %s server %s", __func__,
563 		    srv->shc_hc_rule->hcr_rule->irl_name,
564 		    srv->shc_sg_srv->sgs_srvID);
565 	}
566 	srv->shc_child_pid = 0;
567 }
568 
569 /*
570  * Disable the server, either because the server is dead or because a timer
571  * cannot be started for this server.  Note that this only affects the
572  * transient configuration, meaning only in memory.  The persistent
573  * configuration is not affected.
574  */
575 static void
576 ilbd_mark_server_disabled(ilbd_hc_srv_t *srv)
577 {
578 	srv->shc_status = ILB_HCS_DISABLED;
579 
580 	/* Disable the server in kernel. */
581 	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
582 	    srv->shc_hc_rule->hcr_rule->irl_name,
583 	    stat_declare_srv_dead) != ILB_STATUS_OK) {
584 		logerr("%s: cannot disable server in kernel: rule %s "
585 		    "server %s", __func__,
586 		    srv->shc_hc_rule->hcr_rule->irl_name,
587 		    srv->shc_sg_srv->sgs_srvID);
588 	}
589 }
590 
591 /*
592  * A probe fails, set the state of the server.
593  */
594 static void
595 ilbd_set_fail_state(ilbd_hc_srv_t *srv)
596 {
597 	if (++srv->shc_fail_cnt < srv->shc_hc->ihc_count) {
598 		/* Probe again */
599 		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
600 		return;
601 	}
602 
603 	logdebug("%s: rule %s server %s fails %u", __func__,
604 	    srv->shc_hc_rule->hcr_rule->irl_name, srv->shc_sg_srv->sgs_srvID,
605 	    srv->shc_fail_cnt);
606 
607 	/*
608 	 * If this is a ping test, mark the server as
609 	 * unreachable instead of dead.
610 	 */
611 	if (srv->shc_hc->ihc_test_type == ILBD_HC_PING ||
612 	    srv->shc_state == ilbd_hc_def_pinging) {
613 		srv->shc_status = ILB_HCS_UNREACH;
614 	} else {
615 		srv->shc_status = ILB_HCS_DEAD;
616 	}
617 
618 	/* Disable the server in kernel. */
619 	if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
620 	    srv->shc_hc_rule->hcr_rule->irl_name, stat_declare_srv_dead) !=
621 	    ILB_STATUS_OK) {
622 		logerr("%s: cannot disable server in kernel: rule %s "
623 		    "server %s", __func__,
624 		    srv->shc_hc_rule->hcr_rule->irl_name,
625 		    srv->shc_sg_srv->sgs_srvID);
626 	}
627 
628 	/* Still keep probing in case the server is alive again. */
629 	if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
630 		/* Only thing to do is to disable the server... */
631 		logerr("%s: cannot restart timer: rule %s server %s", __func__,
632 		    srv->shc_hc_rule->hcr_rule->irl_name,
633 		    srv->shc_sg_srv->sgs_srvID);
634 		srv->shc_status = ILB_HCS_DISABLED;
635 	}
636 }
637 
638 /*
639  * A probe process has not returned for the ihc_timeout period, we should
640  * kill it.  This function is the handler of this.
641  */
642 /* ARGSUSED */
643 static void
644 ilbd_hc_kill_timer(iu_tq_t *tq, void *arg)
645 {
646 	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
647 
648 	ilbd_hc_kill_probe(srv);
649 	ilbd_set_fail_state(srv);
650 }
651 
652 /*
653  * Probe timeout handler.  Send out the appropriate probe.
654  */
655 /* ARGSUSED */
656 static void
657 ilbd_hc_probe_timer(iu_tq_t *tq, void *arg)
658 {
659 	ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
660 
661 	/*
662 	 * If starting the probe fails, just pretend that the timeout has
663 	 * extended.
664 	 */
665 	if (!ilbd_run_probe(srv)) {
666 		/*
667 		 * If we cannot restart the timer, the only thing we can do
668 		 * is to disable this server.  Hopefully the sys admin will
669 		 * notice this and enable this server again later.
670 		 */
671 		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
672 			logerr("%s: cannot restart timer: rule %s server %s, "
673 			    "disabling it", __func__,
674 			    srv->shc_hc_rule->hcr_rule->irl_name,
675 			    srv->shc_sg_srv->sgs_srvID);
676 			ilbd_mark_server_disabled(srv);
677 		}
678 		return;
679 	}
680 
681 	/*
682 	 * Similar to above, if kill timer cannot be started, disable the
683 	 * server.
684 	 */
685 	if ((srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q,
686 	    srv->shc_hc->ihc_timeout, ilbd_hc_kill_timer, srv)) == -1) {
687 		logerr("%s: cannot start kill timer: rule %s server %s, "
688 		    "disabling it", __func__,
689 		    srv->shc_hc_rule->hcr_rule->irl_name,
690 		    srv->shc_sg_srv->sgs_srvID);
691 		ilbd_mark_server_disabled(srv);
692 	}
693 	hc_timer_restarted = B_TRUE;
694 }
695 
696 /* Restart the periodic timer for a given server. */
697 static ilb_status_t
698 ilbd_hc_restart_timer(ilbd_hc_t *hc, ilbd_hc_srv_t *srv)
699 {
700 	int timeout;
701 
702 	/* Don't allow the timeout interval to be less than 1s */
703 	timeout = MAX((hc->ihc_interval >> 1) + (gethrtime() %
704 	    (hc->ihc_interval + 1)), 1);
705 
706 	/*
707 	 * If the probe is actually a ping probe, there is no need to
708 	 * do default pinging.  Just skip the step.
709 	 */
710 	if (hc->ihc_def_ping && hc->ihc_test_type != ILBD_HC_PING)
711 		srv->shc_state = ilbd_hc_def_pinging;
712 	else
713 		srv->shc_state = ilbd_hc_probing;
714 	srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q, timeout,
715 	    ilbd_hc_probe_timer, srv);
716 
717 	if (srv->shc_tid == -1)
718 		return (ILB_STATUS_TIMER);
719 	srv->shc_lasttime = time(NULL);
720 	srv->shc_nexttime = time(NULL) + timeout;
721 
722 	hc_timer_restarted = B_TRUE;
723 	return (ILB_STATUS_OK);
724 }
725 
726 /* Helper routine to associate a server with its hc object. */
727 static ilb_status_t
728 ilbd_hc_srv_add(ilbd_hc_t *hc, ilbd_hc_rule_t *hc_rule,
729     const ilb_sg_srv_t *srv, int ev_port)
730 {
731 	ilbd_hc_srv_t *new_srv;
732 	ilb_status_t ret;
733 
734 	if ((new_srv = calloc(1, sizeof (ilbd_hc_srv_t))) == NULL)
735 		return (ILB_STATUS_ENOMEM);
736 	new_srv->shc_hc = hc;
737 	new_srv->shc_hc_rule = hc_rule;
738 	new_srv->shc_sg_srv = srv;
739 	new_srv->shc_ev_port = ev_port;
740 	new_srv->shc_tid = -1;
741 	new_srv->shc_nexttime = time(NULL);
742 	new_srv->shc_lasttime = new_srv->shc_nexttime;
743 
744 	if ((hc_rule->hcr_rule->irl_flags & ILB_FLAGS_RULE_ENABLED) &&
745 	    ILB_IS_SRV_ENABLED(srv->sgs_flags)) {
746 		new_srv->shc_status = ILB_HCS_UNINIT;
747 		ret = ilbd_hc_restart_timer(hc, new_srv);
748 		if (ret != ILB_STATUS_OK) {
749 			free(new_srv);
750 			return (ret);
751 		}
752 	} else {
753 		new_srv->shc_status = ILB_HCS_DISABLED;
754 	}
755 
756 	list_insert_tail(&hc_rule->hcr_servers, new_srv);
757 	return (ILB_STATUS_OK);
758 }
759 
/*
 * Handy macro to cancel a server's timer.
 *
 * If the server has a pending timer (shc_tid != -1), it is cancelled in
 * the hc timer queue and shc_tid is reset to -1.  hc_timer_restarted is
 * always set so that the timer gets updated.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement (e.g. in an unbraced if/else), and its locals carry a
 * distinctive name so that they cannot capture names used in the macro
 * argument.
 */
#define	HC_CANCEL_TIMER(srv)						\
do {									\
	void *hc_ct_arg_;						\
	int hc_ct_ret_;							\
	if ((srv)->shc_tid != -1) {					\
		hc_ct_ret_ = iu_cancel_timer(ilbd_hc_timer_q,		\
		    (srv)->shc_tid, &hc_ct_arg_);			\
		(srv)->shc_tid = -1;					\
		assert(hc_ct_ret_ == 1);				\
		assert(hc_ct_arg_ == (srv));				\
	}								\
	hc_timer_restarted = B_TRUE;					\
} while (0)
773 
774 /* Helper routine to dissociate a server from its hc object. */
775 static ilb_status_t
776 ilbd_hc_srv_rem(ilbd_hc_rule_t *hc_rule, const ilb_sg_srv_t *srv)
777 {
778 	ilbd_hc_srv_t *tmp_srv;
779 
780 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
781 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
782 		if (tmp_srv->shc_sg_srv == srv) {
783 			list_remove(&hc_rule->hcr_servers, tmp_srv);
784 			HC_CANCEL_TIMER(tmp_srv);
785 			if (tmp_srv->shc_child_pid != 0)
786 				ilbd_hc_kill_probe(tmp_srv);
787 			free(tmp_srv);
788 			return (ILB_STATUS_OK);
789 		}
790 	}
791 	return (ILB_STATUS_ENOENT);
792 }
793 
794 /* Helper routine to dissociate all servers of a rule from its hc object. */
795 static void
796 ilbd_hc_srv_rem_all(ilbd_hc_rule_t *hc_rule)
797 {
798 	ilbd_hc_srv_t *srv;
799 
800 	while ((srv = list_remove_head(&hc_rule->hcr_servers)) != NULL) {
801 		HC_CANCEL_TIMER(srv);
802 		if (srv->shc_child_pid != 0)
803 			ilbd_hc_kill_probe(srv);
804 		free(srv);
805 	}
806 }
807 
808 /* Associate a rule with its hc object. */
809 ilb_status_t
810 ilbd_hc_associate_rule(const ilbd_rule_t *rule, int ev_port)
811 {
812 	ilbd_hc_t	*hc;
813 	ilbd_hc_rule_t	*hc_rule;
814 	ilb_status_t	ret;
815 	ilbd_sg_t	*sg;
816 	ilbd_srv_t	*ilbd_srv;
817 
818 	/* The rule is assumed to be initialized appropriately. */
819 	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
820 		logdebug("ilbd_hc_associate_rule: healthcheck %s does not "
821 		    "exist", rule->irl_hcname);
822 		return (ILB_STATUS_ENOHCINFO);
823 	}
824 	if ((hc->ihc_test_type == ILBD_HC_TCP &&
825 	    rule->irl_proto != IPPROTO_TCP) ||
826 	    (hc->ihc_test_type == ILBD_HC_UDP &&
827 	    rule->irl_proto != IPPROTO_UDP)) {
828 		return (ILB_STATUS_RULE_HC_MISMATCH);
829 	}
830 	if ((hc_rule = calloc(1, sizeof (ilbd_hc_rule_t))) == NULL) {
831 		logdebug("ilbd_hc_associate_rule: out of memory");
832 		return (ILB_STATUS_ENOMEM);
833 	}
834 
835 	hc_rule->hcr_rule = rule;
836 	list_create(&hc_rule->hcr_servers, sizeof (ilbd_hc_srv_t),
837 	    offsetof(ilbd_hc_srv_t, shc_srv_link));
838 
839 	/* Add all the servers. */
840 	sg = rule->irl_sg;
841 	for (ilbd_srv = list_head(&sg->isg_srvlist); ilbd_srv != NULL;
842 	    ilbd_srv = list_next(&sg->isg_srvlist, ilbd_srv)) {
843 		if ((ret = ilbd_hc_srv_add(hc, hc_rule, &ilbd_srv->isv_srv,
844 		    ev_port)) != ILB_STATUS_OK) {
845 			/* Remove all previously added servers */
846 			ilbd_hc_srv_rem_all(hc_rule);
847 			free(hc_rule);
848 			return (ret);
849 		}
850 	}
851 	list_insert_tail(&hc->ihc_rules, hc_rule);
852 	hc->ihc_rule_cnt++;
853 
854 	return (ILB_STATUS_OK);
855 }
856 
857 /* Dissociate a rule from its hc object. */
858 ilb_status_t
859 ilbd_hc_dissociate_rule(const ilbd_rule_t *rule)
860 {
861 	ilbd_hc_t	*hc;
862 	ilbd_hc_rule_t	*hc_rule;
863 
864 	/* The rule is assumed to be initialized appropriately. */
865 	if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
866 		logdebug("ilbd_hc_dissociate_rule: healthcheck %s does not "
867 		    "exist", rule->irl_hcname);
868 		return (ILB_STATUS_ENOENT);
869 	}
870 	for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
871 	    hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
872 		if (hc_rule->hcr_rule == rule)
873 			break;
874 	}
875 	if (hc_rule == NULL) {
876 		logdebug("ilbd_hc_dissociate_rule: rule %s is not associated "
877 		    "with healtcheck %s", rule->irl_hcname, hc->ihc_name);
878 		return (ILB_STATUS_ENOENT);
879 	}
880 	ilbd_hc_srv_rem_all(hc_rule);
881 	list_remove(&hc->ihc_rules, hc_rule);
882 	hc->ihc_rule_cnt--;
883 	return (ILB_STATUS_OK);
884 }
885 
886 /*
887  * Given a hc object name and a rule, check to see if the rule is associated
888  * with the hc object.  If it is, the hc object is returned in **hc and the
889  * ilbd_hc_rule_t is returned in **hc_rule.
890  */
891 static boolean_t
892 ilbd_hc_check_rule(const char *hc_name, const ilbd_rule_t *rule,
893     ilbd_hc_t **hc, ilbd_hc_rule_t **hc_rule)
894 {
895 	ilbd_hc_t	*tmp_hc;
896 	ilbd_hc_rule_t	*tmp_hc_rule;
897 
898 	if ((tmp_hc = ilbd_get_hc(hc_name)) == NULL)
899 		return (B_FALSE);
900 	for (tmp_hc_rule = list_head(&tmp_hc->ihc_rules); tmp_hc_rule != NULL;
901 	    tmp_hc_rule = list_next(&tmp_hc->ihc_rules, tmp_hc_rule)) {
902 		if (tmp_hc_rule->hcr_rule == rule) {
903 			*hc = tmp_hc;
904 			*hc_rule = tmp_hc_rule;
905 			return (B_TRUE);
906 		}
907 	}
908 	return (B_FALSE);
909 }
910 
911 /* Associate a server with its hc object. */
912 ilb_status_t
913 ilbd_hc_add_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
914     int ev_port)
915 {
916 	ilbd_hc_t	*hc;
917 	ilbd_hc_rule_t	*hc_rule;
918 
919 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
920 		return (ILB_STATUS_ENOENT);
921 	return (ilbd_hc_srv_add(hc, hc_rule, srv, ev_port));
922 }
923 
924 /* Dissociate a server from its hc object. */
925 ilb_status_t
926 ilbd_hc_del_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
927 {
928 	ilbd_hc_t	*hc;
929 	ilbd_hc_rule_t	*hc_rule;
930 
931 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
932 		return (ILB_STATUS_ENOENT);
933 	return (ilbd_hc_srv_rem(hc_rule, srv));
934 }
935 
936 /* Helper routine to enable/disable a server's hc probe. */
937 static ilb_status_t
938 ilbd_hc_toggle_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
939     boolean_t enable)
940 {
941 	ilbd_hc_t	*hc;
942 	ilbd_hc_rule_t	*hc_rule;
943 	ilbd_hc_srv_t	*tmp_srv;
944 	ilb_status_t	ret;
945 
946 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
947 		return (ILB_STATUS_ENOENT);
948 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
949 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
950 		if (tmp_srv->shc_sg_srv != srv) {
951 			continue;
952 		}
953 		if (enable) {
954 			if (tmp_srv->shc_status == ILB_HCS_DISABLED) {
955 				ret = ilbd_hc_restart_timer(hc, tmp_srv);
956 				if (ret != ILB_STATUS_OK) {
957 					logerr("%s: cannot start timers for "
958 					    "rule %s server %s", __func__,
959 					    rule->irl_name,
960 					    tmp_srv->shc_sg_srv->sgs_srvID);
961 					return (ret);
962 				}
963 				/* Start from fresh... */
964 				tmp_srv->shc_status = ILB_HCS_UNINIT;
965 				tmp_srv->shc_rtt = 0;
966 				tmp_srv->shc_fail_cnt = 0;
967 			}
968 		} else {
969 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
970 				tmp_srv->shc_status = ILB_HCS_DISABLED;
971 				HC_CANCEL_TIMER(tmp_srv);
972 				if (tmp_srv->shc_child_pid != 0)
973 					ilbd_hc_kill_probe(tmp_srv);
974 			}
975 		}
976 		return (ILB_STATUS_OK);
977 	}
978 	return (ILB_STATUS_ENOENT);
979 }
980 
981 ilb_status_t
982 ilbd_hc_enable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
983 {
984 	return (ilbd_hc_toggle_server(rule, srv, B_TRUE));
985 }
986 
987 ilb_status_t
988 ilbd_hc_disable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
989 {
990 	return (ilbd_hc_toggle_server(rule, srv, B_FALSE));
991 }
992 
993 /*
994  * Helper routine to enable/disable a rule's hc probe (including all its
995  * servers).
996  */
997 static ilb_status_t
998 ilbd_hc_toggle_rule(const ilbd_rule_t *rule, boolean_t enable)
999 {
1000 	ilbd_hc_t	*hc;
1001 	ilbd_hc_rule_t	*hc_rule;
1002 	ilbd_hc_srv_t	*tmp_srv;
1003 	int		ret;
1004 
1005 	if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
1006 		return (ILB_STATUS_ENOENT);
1007 
1008 	for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
1009 	    tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
1010 		if (enable) {
1011 			/*
1012 			 * If the server is disabled in the rule, do not
1013 			 * restart its timer.
1014 			 */
1015 			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1016 			    ILB_IS_SRV_ENABLED(
1017 			    tmp_srv->shc_sg_srv->sgs_flags)) {
1018 				ret = ilbd_hc_restart_timer(hc, tmp_srv);
1019 				if (ret != ILB_STATUS_OK) {
1020 					logerr("%s: cannot start timers for "
1021 					    "rule %s server %s", __func__,
1022 					    rule->irl_name,
1023 					    tmp_srv->shc_sg_srv->sgs_srvID);
1024 					goto rollback;
1025 				} else {
1026 					/* Start from fresh... */
1027 					tmp_srv->shc_status = ILB_HCS_UNINIT;
1028 					tmp_srv->shc_rtt = 0;
1029 					tmp_srv->shc_fail_cnt = 0;
1030 				}
1031 			}
1032 		} else {
1033 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1034 				HC_CANCEL_TIMER(tmp_srv);
1035 				tmp_srv->shc_status = ILB_HCS_DISABLED;
1036 				if (tmp_srv->shc_child_pid != 0)
1037 					ilbd_hc_kill_probe(tmp_srv);
1038 			}
1039 		}
1040 	}
1041 	return (ILB_STATUS_OK);
1042 rollback:
1043 	enable = !enable;
1044 	for (tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv);
1045 	    tmp_srv != NULL;
1046 	    tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv)) {
1047 		if (enable) {
1048 			if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1049 			    ILB_IS_SRV_ENABLED(
1050 			    tmp_srv->shc_sg_srv->sgs_flags)) {
1051 				(void) ilbd_hc_restart_timer(hc, tmp_srv);
1052 				tmp_srv->shc_status = ILB_HCS_UNINIT;
1053 				tmp_srv->shc_rtt = 0;
1054 				tmp_srv->shc_fail_cnt = 0;
1055 			}
1056 		} else {
1057 			if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1058 				HC_CANCEL_TIMER(tmp_srv);
1059 				tmp_srv->shc_status = ILB_HCS_DISABLED;
1060 				if (tmp_srv->shc_child_pid != 0)
1061 					ilbd_hc_kill_probe(tmp_srv);
1062 			}
1063 		}
1064 	}
1065 	return (ret);
1066 }
1067 
1068 ilb_status_t
1069 ilbd_hc_enable_rule(const ilbd_rule_t *rule)
1070 {
1071 	return (ilbd_hc_toggle_rule(rule, B_TRUE));
1072 }
1073 
1074 ilb_status_t
1075 ilbd_hc_disable_rule(const ilbd_rule_t *rule)
1076 {
1077 	return (ilbd_hc_toggle_rule(rule, B_FALSE));
1078 }
1079 
1080 static const char *
1081 topo_2_str(ilb_topo_t topo)
1082 {
1083 	switch (topo) {
1084 	case ILB_TOPO_DSR:
1085 		return ("DSR");
1086 		break;
1087 	case ILB_TOPO_NAT:
1088 		return ("NAT");
1089 		break;
1090 	case ILB_TOPO_HALF_NAT:
1091 		return ("HALF_NAT");
1092 		break;
1093 	default:
1094 		/* Should not happen. */
1095 		logerr("%s: unknown topology", __func__);
1096 		break;
1097 	}
1098 	return ("");
1099 }
1100 
1101 /*
1102  * Create the argument list to be passed to a hc probe command.
1103  * The passed in argv is assumed to have HC_PROBE_ARGC elements.
1104  */
1105 static boolean_t
1106 create_argv(ilbd_hc_srv_t *srv, char *argv[])
1107 {
1108 	char buf[INET6_ADDRSTRLEN];
1109 	ilbd_rule_t const *rule;
1110 	ilb_sg_srv_t const *sg_srv;
1111 	struct in_addr v4_addr;
1112 	in_port_t port;
1113 	int i;
1114 
1115 	rule = srv->shc_hc_rule->hcr_rule;
1116 	sg_srv = srv->shc_sg_srv;
1117 
1118 	if (srv->shc_state == ilbd_hc_def_pinging) {
1119 		if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL)
1120 			return (B_FALSE);
1121 	} else {
1122 		switch (srv->shc_hc->ihc_test_type) {
1123 		case ILBD_HC_USER:
1124 			if ((argv[0] = strdup(srv->shc_hc->ihc_test)) == NULL)
1125 				return (B_FALSE);
1126 			break;
1127 		case ILBD_HC_TCP:
1128 		case ILBD_HC_UDP:
1129 			if ((argv[0] = strdup(ILB_PROBE_PROTO)) ==
1130 			    NULL) {
1131 				return (B_FALSE);
1132 			}
1133 			break;
1134 		case ILBD_HC_PING:
1135 			if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL) {
1136 				return (B_FALSE);
1137 			}
1138 			break;
1139 		}
1140 	}
1141 
1142 	/*
1143 	 * argv[1] is the VIP.
1144 	 *
1145 	 * Right now, the VIP and the backend server addresses should be
1146 	 * in the same IP address family.  Here we don't do that in case
1147 	 * this assumption is changed in future.
1148 	 */
1149 	if (IN6_IS_ADDR_V4MAPPED(&rule->irl_vip)) {
1150 		IN6_V4MAPPED_TO_INADDR(&rule->irl_vip, &v4_addr);
1151 		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1152 			goto cleanup;
1153 	} else {
1154 		if (inet_ntop(AF_INET6, &rule->irl_vip, buf,
1155 		    sizeof (buf)) == NULL) {
1156 			goto cleanup;
1157 		}
1158 	}
1159 	if ((argv[1] = strdup(buf)) == NULL)
1160 		goto cleanup;
1161 
1162 	/*
1163 	 * argv[2] is the backend server address.
1164 	 */
1165 	if (IN6_IS_ADDR_V4MAPPED(&sg_srv->sgs_addr)) {
1166 		IN6_V4MAPPED_TO_INADDR(&sg_srv->sgs_addr, &v4_addr);
1167 		if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1168 			goto cleanup;
1169 	} else {
1170 		if (inet_ntop(AF_INET6, &sg_srv->sgs_addr, buf,
1171 		    sizeof (buf)) == NULL) {
1172 			goto cleanup;
1173 		}
1174 	}
1175 	if ((argv[2] = strdup(buf)) == NULL)
1176 		goto cleanup;
1177 
1178 	/*
1179 	 * argv[3] is the transport protocol used in the rule.
1180 	 */
1181 	switch (rule->irl_proto) {
1182 	case IPPROTO_TCP:
1183 		argv[3] = strdup("TCP");
1184 		break;
1185 	case IPPROTO_UDP:
1186 		argv[3] = strdup("UDP");
1187 		break;
1188 	default:
1189 		logerr("%s: unknown protocol", __func__);
1190 		goto cleanup;
1191 		break;
1192 	}
1193 	if (argv[3] == NULL)
1194 		goto cleanup;
1195 
1196 	/*
1197 	 * argv[4] is the load balance mode, DSR, NAT, HALF-NAT.
1198 	 */
1199 	if ((argv[4] = strdup(topo_2_str(rule->irl_topo))) == NULL)
1200 		goto cleanup;
1201 
1202 	/*
1203 	 * argv[5] is the port range.  Right now, there should only be 1 port.
1204 	 */
1205 	switch (rule->irl_hcpflag) {
1206 	case ILB_HCI_PROBE_FIX:
1207 		port = ntohs(rule->irl_hcport);
1208 		break;
1209 	case ILB_HCI_PROBE_ANY: {
1210 		in_port_t min, max;
1211 
1212 		if (ntohs(sg_srv->sgs_minport) == 0) {
1213 			min = ntohs(rule->irl_minport);
1214 			max = ntohs(rule->irl_maxport);
1215 		} else {
1216 			min = ntohs(sg_srv->sgs_minport);
1217 			max = ntohs(sg_srv->sgs_maxport);
1218 		}
1219 		if (max > min)
1220 			port = min + gethrtime() % (max - min + 1);
1221 		else
1222 			port = min;
1223 		break;
1224 	}
1225 	default:
1226 		logerr("%s: unknown HC flag", __func__);
1227 		goto cleanup;
1228 		break;
1229 	}
1230 	(void) sprintf(buf, "%d", port);
1231 	if ((argv[5] = strdup(buf)) == NULL)
1232 		goto cleanup;
1233 
1234 	/*
1235 	 * argv[6] is the probe timeout.
1236 	 */
1237 	(void) sprintf(buf, "%d", srv->shc_hc->ihc_timeout);
1238 	if ((argv[6] = strdup(buf)) == NULL)
1239 		goto cleanup;
1240 
1241 	argv[7] = NULL;
1242 	return (B_TRUE);
1243 
1244 cleanup:
1245 	for (i = 0; i < HC_PROBE_ARGC; i++) {
1246 		if (argv[i] != NULL)
1247 			free(argv[i]);
1248 	}
1249 	return (B_FALSE);
1250 }
1251 
/*
 * Free the strings of a NULL terminated argument list.  The array itself
 * is not freed.
 *
 * Each slot is reset to NULL once its string is released, so calling this
 * again on the same array (e.g. from an error path that runs after the
 * list has already been destroyed) is harmless instead of a double free.
 */
static void
destroy_argv(char *argv[])
{
	int i;

	for (i = 0; argv[i] != NULL; i++) {
		free(argv[i]);
		argv[i] = NULL;
	}
}
1260 
1261 /* Spawn a process to run the hc probe on the given server. */
1262 static boolean_t
1263 ilbd_run_probe(ilbd_hc_srv_t *srv)
1264 {
1265 	posix_spawn_file_actions_t	fd_actions;
1266 	posix_spawnattr_t		attr;
1267 	sigset_t			child_sigset;
1268 	int				fds[2];
1269 	int				fdflags;
1270 	pid_t				pid;
1271 	char				*child_argv[HC_PROBE_ARGC];
1272 	ilbd_hc_probe_event_t		*probe_ev;
1273 	char				*probe_name;
1274 
1275 	bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
1276 	if ((probe_ev = calloc(1, sizeof (*probe_ev))) == NULL) {
1277 		logdebug("ilbd_run_probe: calloc");
1278 		return (B_FALSE);
1279 	}
1280 
1281 	/* Set up a pipe to get output from probe command. */
1282 	if (pipe(fds) < 0) {
1283 		logdebug("ilbd_run_probe: cannot create pipe");
1284 		free(probe_ev);
1285 		return (B_FALSE);
1286 	}
1287 	/* Set our side of the pipe to be non-blocking */
1288 	if ((fdflags = fcntl(fds[0], F_GETFL, 0)) == -1) {
1289 		logdebug("ilbd_run_probe: fcntl(F_GETFL)");
1290 		goto cleanup;
1291 	}
1292 	if (fcntl(fds[0], F_SETFL, fdflags | O_NONBLOCK) == -1) {
1293 		logdebug("ilbd_run_probe: fcntl(F_SETFL)");
1294 		goto cleanup;
1295 	}
1296 
1297 	if (posix_spawn_file_actions_init(&fd_actions) != 0) {
1298 		logdebug("ilbd_run_probe: posix_spawn_file_actions_init");
1299 		goto cleanup;
1300 	}
1301 	if (posix_spawnattr_init(&attr) != 0) {
1302 		logdebug("ilbd_run_probe: posix_spawnattr_init");
1303 		goto cleanup;
1304 	}
1305 	if (posix_spawn_file_actions_addclose(&fd_actions, fds[0]) != 0) {
1306 		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1307 		goto cleanup;
1308 	}
1309 	if (posix_spawn_file_actions_adddup2(&fd_actions, fds[1],
1310 	    STDOUT_FILENO) != 0) {
1311 		logdebug("ilbd_run_probe: posix_spawn_file_actions_dup2");
1312 		goto cleanup;
1313 	}
1314 	if (posix_spawn_file_actions_addclose(&fd_actions, fds[1]) != 0) {
1315 		logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1316 		goto cleanup;
1317 	}
1318 
1319 	/* Reset all signal handling of the child to default. */
1320 	(void) sigfillset(&child_sigset);
1321 	if (posix_spawnattr_setsigdefault(&attr, &child_sigset) != 0) {
1322 		logdebug("ilbd_run_probe: posix_spawnattr_setsigdefault");
1323 		goto cleanup;
1324 	}
1325 	/* Don't want SIGCHLD. */
1326 	if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_NOSIGCHLD_NP|
1327 	    POSIX_SPAWN_SETSIGDEF) != 0) {
1328 		logdebug("ilbd_run_probe: posix_spawnattr_setflags");
1329 		goto cleanup;
1330 	}
1331 
1332 	if (!create_argv(srv, child_argv)) {
1333 		logdebug("ilbd_run_probe: create_argv");
1334 		goto cleanup;
1335 	}
1336 
1337 	/*
1338 	 * If we are doing default pinging or not using a user supplied
1339 	 * probe, we should execute our standard supplied probe.  The
1340 	 * supplied probe command handles all types of probes.  And the
1341 	 * type used depends on argv[0], as filled in by create_argv().
1342 	 */
1343 	if (srv->shc_state == ilbd_hc_def_pinging ||
1344 	    srv->shc_hc->ihc_test_type != ILBD_HC_USER) {
1345 		probe_name = ILB_PROBE_PROTO;
1346 	} else {
1347 		probe_name = srv->shc_hc->ihc_test;
1348 	}
1349 	if (posix_spawn(&pid, probe_name, &fd_actions, &attr, child_argv,
1350 	    NULL) != 0) {
1351 		logerr("%s: posix_spawn: %s for server %s: %s", __func__,
1352 		    srv->shc_hc->ihc_test, srv->shc_sg_srv->sgs_srvID,
1353 		    strerror(errno));
1354 		goto cleanup;
1355 	}
1356 
1357 	(void) close(fds[1]);
1358 	destroy_argv(child_argv);
1359 	srv->shc_child_pid = pid;
1360 	srv->shc_child_fd = fds[0];
1361 	srv->shc_ev = probe_ev;
1362 
1363 	probe_ev->ihp_ev = ILBD_EVENT_PROBE;
1364 	probe_ev->ihp_srv = srv;
1365 	probe_ev->ihp_pid = pid;
1366 	if (port_associate(srv->shc_ev_port, PORT_SOURCE_FD, fds[0],
1367 	    POLLRDNORM, probe_ev) != 0) {
1368 		/*
1369 		 * Need to kill the child.  It will free the srv->shc_ev,
1370 		 * which is probe_ev.  So set probe_ev to NULL.
1371 		 */
1372 		ilbd_hc_kill_probe(srv);
1373 		probe_ev = NULL;
1374 		goto cleanup;
1375 	}
1376 
1377 	return (B_TRUE);
1378 
1379 cleanup:
1380 	(void) close(fds[0]);
1381 	(void) close(fds[1]);
1382 	destroy_argv(child_argv);
1383 	if (probe_ev != NULL)
1384 		free(probe_ev);
1385 	return (B_FALSE);
1386 }
1387 
1388 /*
1389  * Called by ild_hc_probe_return() to re-associate the fd to a child to
1390  * the event port.
1391  */
1392 static void
1393 reassociate_port(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1394 {
1395 	if (port_associate(ev_port, PORT_SOURCE_FD, fd,
1396 	    POLLRDNORM, ev) != 0) {
1397 		/*
1398 		 * If we cannot reassociate with the port, the only
1399 		 * thing we can do now is to kill the child and
1400 		 * do a blocking wait here...
1401 		 */
1402 		logdebug("%s: port_associate: %s", __func__, strerror(errno));
1403 		if (kill(ev->ihp_pid, SIGKILL) != 0)
1404 			logerr("%s: kill: %s", __func__, strerror(errno));
1405 		if (waitpid(ev->ihp_pid, NULL, 0) != ev->ihp_pid)
1406 			logdebug("%s: waitpid: %s", __func__, strerror(errno));
1407 		free(ev);
1408 	}
1409 }
1410 
1411 /*
1412  * To handle a child probe process hanging up.
1413  */
1414 static void
1415 ilbd_hc_child_hup(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1416 {
1417 	ilbd_hc_srv_t *srv;
1418 	pid_t ret_pid;
1419 	int ret;
1420 
1421 	srv = ev->ihp_srv;
1422 
1423 	if (!ev->ihp_done) {
1424 		/* ilbd does not care about this process anymore ... */
1425 		ev->ihp_done = B_TRUE;
1426 		srv->shc_ev = NULL;
1427 		srv->shc_child_pid = 0;
1428 		HC_CANCEL_TIMER(srv);
1429 		ilbd_set_fail_state(srv);
1430 	}
1431 	ret_pid = waitpid(ev->ihp_pid, &ret, WNOHANG);
1432 	switch (ret_pid) {
1433 	case -1:
1434 		logperror("ilbd_hc_child_hup: waitpid");
1435 		/* FALLTHROUGH */
1436 	case 0:
1437 		/* The child has not completed the exit. Wait again. */
1438 		reassociate_port(ev_port, fd, ev);
1439 		break;
1440 	default:
1441 		/* Right now, we just ignore the exit status. */
1442 		if (WIFEXITED(ret))
1443 			ret = WEXITSTATUS(ret);
1444 		(void) close(fd);
1445 		free(ev);
1446 	}
1447 }
1448 
1449 /*
1450  * To read the output of a child probe process.
1451  */
1452 static void
1453 ilbd_hc_child_data(int fd, ilbd_hc_probe_event_t *ev)
1454 {
1455 	ilbd_hc_srv_t *srv;
1456 	char buf[HC_MAX_PROBE_OUTPUT];
1457 	int ret;
1458 	int64_t rtt;
1459 
1460 	srv = ev->ihp_srv;
1461 
1462 	bzero(buf, HC_MAX_PROBE_OUTPUT);
1463 	ret = read(fd, buf, HC_MAX_PROBE_OUTPUT - 1);
1464 	/* Should not happen since event port should have caught this. */
1465 	assert(ret > 0);
1466 
1467 	/*
1468 	 * We expect the probe command to print out the RTT only.  But
1469 	 * the command may misbehave and print out more than what we intend to
1470 	 * read in.  So need to do this check below to "flush" out all the
1471 	 * output from the command.
1472 	 */
1473 	if (!ev->ihp_done) {
1474 		ev->ihp_done = B_TRUE;
1475 		/* We don't need to know about this event anymore. */
1476 		srv->shc_ev = NULL;
1477 		srv->shc_child_pid = 0;
1478 		HC_CANCEL_TIMER(srv);
1479 	} else {
1480 		return;
1481 	}
1482 
1483 	rtt = strtoll(buf, NULL, 10);
1484 
1485 	/*
1486 	 * -1 means the server is dead or the probe somehow fails.  Treat
1487 	 * them both as server is dead.
1488 	 */
1489 	if (rtt == -1) {
1490 		ilbd_set_fail_state(srv);
1491 		return;
1492 	} else if (rtt > 0) {
1493 		/* If the returned RTT value is not valid, just ignore it. */
1494 		if (rtt > 0 && rtt <= UINT_MAX) {
1495 			/* Set rtt to be the simple smoothed average. */
1496 			if (srv->shc_rtt == 0) {
1497 				srv->shc_rtt = rtt;
1498 			} else {
1499 				srv->shc_rtt = 3 * ((srv)->shc_rtt >> 2) +
1500 				    (rtt >> 2);
1501 			}
1502 		}
1503 
1504 	}
1505 
1506 	switch (srv->shc_state) {
1507 	case ilbd_hc_def_pinging:
1508 		srv->shc_state = ilbd_hc_probing;
1509 
1510 		/* Ping is OK, now start the probe. */
1511 		ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
1512 		break;
1513 	case ilbd_hc_probing:
1514 		srv->shc_fail_cnt = 0;
1515 
1516 		/* Server is dead before, re-enable it. */
1517 		if (srv->shc_status == ILB_HCS_UNREACH ||
1518 		    srv->shc_status == ILB_HCS_DEAD) {
1519 			/*
1520 			 * If enabling the server in kernel fails now,
1521 			 * hopefully when the timer fires again later, the
1522 			 * enabling can be done.
1523 			 */
1524 			if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
1525 			    srv->shc_hc_rule->hcr_rule->irl_name,
1526 			    stat_declare_srv_alive) != ILB_STATUS_OK) {
1527 				logerr("%s: cannot enable server in kernel: "
1528 				    " rule %s server %s", __func__,
1529 				    srv->shc_hc_rule->hcr_rule->irl_name,
1530 				    srv->shc_sg_srv->sgs_srvID);
1531 			} else {
1532 				srv->shc_status = ILB_HCS_ALIVE;
1533 			}
1534 		} else {
1535 			srv->shc_status = ILB_HCS_ALIVE;
1536 		}
1537 		if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
1538 			logerr("%s: cannot restart timer: rule %s server %s",
1539 			    __func__, srv->shc_hc_rule->hcr_rule->irl_name,
1540 			    srv->shc_sg_srv->sgs_srvID);
1541 			ilbd_mark_server_disabled(srv);
1542 		}
1543 		break;
1544 	default:
1545 		logdebug("%s: unknown state", __func__);
1546 		break;
1547 	}
1548 }
1549 
1550 /*
1551  * Handle the return event of a child probe fd.
1552  */
1553 void
1554 ilbd_hc_probe_return(int ev_port, int fd, int port_events,
1555     ilbd_hc_probe_event_t *ev)
1556 {
1557 	/*
1558 	 * Note that there can be more than one events delivered to us at
1559 	 * the same time.  So we need to check them individually.
1560 	 */
1561 	if (port_events & POLLRDNORM)
1562 		ilbd_hc_child_data(fd, ev);
1563 
1564 	if (port_events & (POLLHUP|POLLERR)) {
1565 		ilbd_hc_child_hup(ev_port, fd, ev);
1566 		return;
1567 	}
1568 
1569 	/*
1570 	 * Re-associate the fd with the port so that when the child
1571 	 * exits, we can reap the status.
1572 	 */
1573 	reassociate_port(ev_port, fd, ev);
1574 }
1575