xref: /freebsd/sbin/hastd/control.c (revision 2e3f49888ec8851bafb22011533217487764fdb0)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009-2010 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Pawel Jakub Dawidek under sponsorship from
8  * the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/wait.h>
34 
35 #include <errno.h>
36 #include <pthread.h>
37 #include <signal.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <unistd.h>
41 
42 #include "hast.h"
43 #include "hastd.h"
44 #include "hast_checksum.h"
45 #include "hast_compression.h"
46 #include "hast_proto.h"
47 #include "hooks.h"
48 #include "nv.h"
49 #include "pjdlog.h"
50 #include "proto.h"
51 #include "subr.h"
52 
53 #include "control.h"
54 
55 void
56 child_cleanup(struct hast_resource *res)
57 {
58 
59 	proto_close(res->hr_ctrl);
60 	res->hr_ctrl = NULL;
61 	if (res->hr_event != NULL) {
62 		proto_close(res->hr_event);
63 		res->hr_event = NULL;
64 	}
65 	if (res->hr_conn != NULL) {
66 		proto_close(res->hr_conn);
67 		res->hr_conn = NULL;
68 	}
69 	res->hr_workerpid = 0;
70 }
71 
72 static void
73 control_set_role_common(struct hastd_config *cfg, struct nv *nvout,
74     uint8_t role, struct hast_resource *res, const char *name, unsigned int no)
75 {
76 	int oldrole;
77 
78 	/* Name is always needed. */
79 	if (name != NULL)
80 		nv_add_string(nvout, name, "resource%u", no);
81 
82 	if (res == NULL) {
83 		PJDLOG_ASSERT(cfg != NULL);
84 		PJDLOG_ASSERT(name != NULL);
85 
86 		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
87 			if (strcmp(res->hr_name, name) == 0)
88 				break;
89 		}
90 		if (res == NULL) {
91 			nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
92 			return;
93 		}
94 	}
95 	PJDLOG_ASSERT(res != NULL);
96 
97 	/* Send previous role back. */
98 	nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
99 
100 	/* Nothing changed, return here. */
101 	if (role == res->hr_role)
102 		return;
103 
104 	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
105 	pjdlog_info("Role changed to %s.", role2str(role));
106 
107 	/* Change role to the new one. */
108 	oldrole = res->hr_role;
109 	res->hr_role = role;
110 	pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role));
111 
112 	/*
113 	 * If previous role was primary or secondary we have to kill process
114 	 * doing that work.
115 	 */
116 	if (res->hr_workerpid != 0) {
117 		if (kill(res->hr_workerpid, SIGTERM) == -1) {
118 			pjdlog_errno(LOG_WARNING,
119 			    "Unable to kill worker process %u",
120 			    (unsigned int)res->hr_workerpid);
121 		} else if (waitpid(res->hr_workerpid, NULL, 0) !=
122 		    res->hr_workerpid) {
123 			pjdlog_errno(LOG_WARNING,
124 			    "Error while waiting for worker process %u",
125 			    (unsigned int)res->hr_workerpid);
126 		} else {
127 			pjdlog_debug(1, "Worker process %u stopped.",
128 			    (unsigned int)res->hr_workerpid);
129 		}
130 		child_cleanup(res);
131 	}
132 
133 	/* Start worker process if we are changing to primary. */
134 	if (role == HAST_ROLE_PRIMARY)
135 		hastd_primary(res);
136 	pjdlog_prefix_set("%s", "");
137 	hook_exec(res->hr_exec, "role", res->hr_name, role2str(oldrole),
138 	    role2str(res->hr_role), NULL);
139 }
140 
/*
 * Change the role of an already known resource without producing any
 * reply: no configuration, no output nv and no resource name are passed
 * to the common implementation, and the index is fixed at 0.
 */
void
control_set_role(struct hast_resource *res, uint8_t role)
{

	control_set_role_common(NULL /* cfg */, NULL /* nvout */, role, res,
	    NULL /* name */, 0);
}
147 
148 static void
149 control_status_worker(struct hast_resource *res, struct nv *nvout,
150     unsigned int no)
151 {
152 	struct nv *cnvin, *cnvout;
153 	const char *str;
154 	int error;
155 
156 	cnvin = NULL;
157 
158 	/*
159 	 * Prepare and send command to worker process.
160 	 */
161 	cnvout = nv_alloc();
162 	nv_add_uint8(cnvout, CONTROL_STATUS, "cmd");
163 	error = nv_error(cnvout);
164 	if (error != 0) {
165 		pjdlog_common(LOG_ERR, 0, error,
166 		    "Unable to prepare control header");
167 		goto end;
168 	}
169 	if (hast_proto_send(res, res->hr_ctrl, cnvout, NULL, 0) == -1) {
170 		error = errno;
171 		pjdlog_errno(LOG_ERR, "Unable to send control header");
172 		goto end;
173 	}
174 
175 	/*
176 	 * Receive response.
177 	 */
178 	if (hast_proto_recv_hdr(res->hr_ctrl, &cnvin) == -1) {
179 		error = errno;
180 		pjdlog_errno(LOG_ERR, "Unable to receive control header");
181 		goto end;
182 	}
183 
184 	error = nv_get_int16(cnvin, "error");
185 	if (error != 0)
186 		goto end;
187 
188 	if ((str = nv_get_string(cnvin, "status")) == NULL) {
189 		error = ENOENT;
190 		pjdlog_errno(LOG_ERR, "Field 'status' is missing.");
191 		goto end;
192 	}
193 	nv_add_string(nvout, str, "status%u", no);
194 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "dirty"), "dirty%u", no);
195 	nv_add_uint32(nvout, nv_get_uint32(cnvin, "extentsize"),
196 	    "extentsize%u", no);
197 	nv_add_uint32(nvout, nv_get_uint32(cnvin, "keepdirty"),
198 	    "keepdirty%u", no);
199 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read"),
200 	    "stat_read%u", no);
201 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write"),
202 	    "stat_write%u", no);
203 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete"),
204 	    "stat_delete%u", no);
205 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush"),
206 	    "stat_flush%u", no);
207 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_activemap_update"),
208 	    "stat_activemap_update%u", no);
209 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_read_error"),
210 	    "stat_read_error%u", no);
211 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_write_error"),
212 	    "stat_write_error%u", no);
213 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_delete_error"),
214 	    "stat_delete_error%u", no);
215 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "stat_flush_error"),
216 	    "stat_flush_error%u", no);
217 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "idle_queue_size"),
218 	    "idle_queue_size%u", no);
219 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "local_queue_size"),
220 	    "local_queue_size%u", no);
221 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "send_queue_size"),
222 	    "send_queue_size%u", no);
223 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "recv_queue_size"),
224 	    "recv_queue_size%u", no);
225 	nv_add_uint64(nvout, nv_get_uint64(cnvin, "done_queue_size"),
226 	    "done_queue_size%u", no);
227 end:
228 	if (cnvin != NULL)
229 		nv_free(cnvin);
230 	if (cnvout != NULL)
231 		nv_free(cnvout);
232 	if (error != 0)
233 		nv_add_int16(nvout, error, "error");
234 }
235 
236 static void
237 control_status(struct hastd_config *cfg, struct nv *nvout,
238     struct hast_resource *res, const char *name, unsigned int no)
239 {
240 
241 	PJDLOG_ASSERT(cfg != NULL);
242 	PJDLOG_ASSERT(nvout != NULL);
243 	PJDLOG_ASSERT(name != NULL);
244 
245 	/* Name is always needed. */
246 	nv_add_string(nvout, name, "resource%u", no);
247 
248 	if (res == NULL) {
249 		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
250 			if (strcmp(res->hr_name, name) == 0)
251 				break;
252 		}
253 		if (res == NULL) {
254 			nv_add_int16(nvout, EHAST_NOENTRY, "error%u", no);
255 			return;
256 		}
257 	}
258 	PJDLOG_ASSERT(res != NULL);
259 	nv_add_string(nvout, res->hr_provname, "provname%u", no);
260 	nv_add_string(nvout, res->hr_localpath, "localpath%u", no);
261 	nv_add_string(nvout, res->hr_remoteaddr, "remoteaddr%u", no);
262 	if (res->hr_sourceaddr[0] != '\0')
263 		nv_add_string(nvout, res->hr_sourceaddr, "sourceaddr%u", no);
264 	switch (res->hr_replication) {
265 	case HAST_REPLICATION_FULLSYNC:
266 		nv_add_string(nvout, "fullsync", "replication%u", no);
267 		break;
268 	case HAST_REPLICATION_MEMSYNC:
269 		nv_add_string(nvout, "memsync", "replication%u", no);
270 		break;
271 	case HAST_REPLICATION_ASYNC:
272 		nv_add_string(nvout, "async", "replication%u", no);
273 		break;
274 	default:
275 		nv_add_string(nvout, "unknown", "replication%u", no);
276 		break;
277 	}
278 	nv_add_string(nvout, checksum_name(res->hr_checksum),
279 	    "checksum%u", no);
280 	nv_add_string(nvout, compression_name(res->hr_compression),
281 	    "compression%u", no);
282 	nv_add_string(nvout, role2str(res->hr_role), "role%u", no);
283 	nv_add_int32(nvout, res->hr_workerpid, "workerpid%u", no);
284 
285 	switch (res->hr_role) {
286 	case HAST_ROLE_PRIMARY:
287 		PJDLOG_ASSERT(res->hr_workerpid != 0);
288 		/* FALLTHROUGH */
289 	case HAST_ROLE_SECONDARY:
290 		if (res->hr_workerpid != 0)
291 			break;
292 		/* FALLTHROUGH */
293 	default:
294 		return;
295 	}
296 
297 	/*
298 	 * If we are here, it means that we have a worker process, which we
299 	 * want to ask some questions.
300 	 */
301 	control_status_worker(res, nvout, no);
302 }
303 
304 void
305 control_handle(struct hastd_config *cfg)
306 {
307 	struct proto_conn *conn;
308 	struct nv *nvin, *nvout;
309 	unsigned int ii;
310 	const char *str;
311 	uint8_t cmd, role;
312 	int error;
313 
314 	if (proto_accept(cfg->hc_controlconn, &conn) == -1) {
315 		pjdlog_errno(LOG_ERR, "Unable to accept control connection");
316 		return;
317 	}
318 
319 	cfg->hc_controlin = conn;
320 	nvin = nvout = NULL;
321 	role = HAST_ROLE_UNDEF;
322 
323 	if (hast_proto_recv_hdr(conn, &nvin) == -1) {
324 		pjdlog_errno(LOG_ERR, "Unable to receive control header");
325 		nvin = NULL;
326 		goto close;
327 	}
328 
329 	/* Obtain command code. 0 means that nv_get_uint8() failed. */
330 	cmd = nv_get_uint8(nvin, "cmd");
331 	if (cmd == 0) {
332 		pjdlog_error("Control header is missing 'cmd' field.");
333 		goto close;
334 	}
335 
336 	/* Allocate outgoing nv structure. */
337 	nvout = nv_alloc();
338 	if (nvout == NULL) {
339 		pjdlog_error("Unable to allocate header for control response.");
340 		goto close;
341 	}
342 
343 	error = 0;
344 
345 	str = nv_get_string(nvin, "resource0");
346 	if (str == NULL) {
347 		pjdlog_error("Control header is missing 'resource0' field.");
348 		error = EHAST_INVALID;
349 		goto fail;
350 	}
351 	if (cmd == HASTCTL_CMD_SETROLE) {
352 		role = nv_get_uint8(nvin, "role");
353 		switch (role) {
354 		case HAST_ROLE_INIT:
355 		case HAST_ROLE_PRIMARY:
356 		case HAST_ROLE_SECONDARY:
357 			break;
358 		default:
359 			pjdlog_error("Invalid role received (%hhu).", role);
360 			error = EHAST_INVALID;
361 			goto fail;
362 		}
363 	}
364 	if (strcmp(str, "all") == 0) {
365 		struct hast_resource *res;
366 
367 		/* All configured resources. */
368 
369 		ii = 0;
370 		TAILQ_FOREACH(res, &cfg->hc_resources, hr_next) {
371 			switch (cmd) {
372 			case HASTCTL_CMD_SETROLE:
373 				control_set_role_common(cfg, nvout, role, res,
374 				    res->hr_name, ii++);
375 				break;
376 			case HASTCTL_CMD_STATUS:
377 				control_status(cfg, nvout, res, res->hr_name,
378 				    ii++);
379 				break;
380 			default:
381 				pjdlog_error("Invalid command received (%hhu).",
382 				    cmd);
383 				error = EHAST_UNIMPLEMENTED;
384 				goto fail;
385 			}
386 		}
387 	} else {
388 		/* Only selected resources. */
389 
390 		for (ii = 0; ; ii++) {
391 			str = nv_get_string(nvin, "resource%u", ii);
392 			if (str == NULL)
393 				break;
394 			switch (cmd) {
395 			case HASTCTL_CMD_SETROLE:
396 				control_set_role_common(cfg, nvout, role, NULL,
397 				    str, ii);
398 				break;
399 			case HASTCTL_CMD_STATUS:
400 				control_status(cfg, nvout, NULL, str, ii);
401 				break;
402 			default:
403 				pjdlog_error("Invalid command received (%hhu).",
404 				    cmd);
405 				error = EHAST_UNIMPLEMENTED;
406 				goto fail;
407 			}
408 		}
409 	}
410 	if (nv_error(nvout) != 0)
411 		goto close;
412 fail:
413 	if (error != 0)
414 		nv_add_int16(nvout, error, "error");
415 
416 	if (hast_proto_send(NULL, conn, nvout, NULL, 0) == -1)
417 		pjdlog_errno(LOG_ERR, "Unable to send control response");
418 close:
419 	if (nvin != NULL)
420 		nv_free(nvin);
421 	if (nvout != NULL)
422 		nv_free(nvout);
423 	proto_close(conn);
424 	cfg->hc_controlin = NULL;
425 }
426 
427 /*
428  * Thread handles control requests from the parent.
429  */
430 void *
431 ctrl_thread(void *arg)
432 {
433 	struct hast_resource *res = arg;
434 	struct nv *nvin, *nvout;
435 	uint8_t cmd;
436 
437 	for (;;) {
438 		if (hast_proto_recv_hdr(res->hr_ctrl, &nvin) == -1) {
439 			if (sigexit_received)
440 				pthread_exit(NULL);
441 			pjdlog_errno(LOG_ERR,
442 			    "Unable to receive control message");
443 			kill(getpid(), SIGTERM);
444 			pthread_exit(NULL);
445 		}
446 		cmd = nv_get_uint8(nvin, "cmd");
447 		if (cmd == 0) {
448 			pjdlog_error("Control message is missing 'cmd' field.");
449 			nv_free(nvin);
450 			continue;
451 		}
452 		nvout = nv_alloc();
453 		switch (cmd) {
454 		case CONTROL_STATUS:
455 			if (res->hr_remotein != NULL &&
456 			    res->hr_remoteout != NULL) {
457 				nv_add_string(nvout, "complete", "status");
458 			} else {
459 				nv_add_string(nvout, "degraded", "status");
460 			}
461 			nv_add_uint32(nvout, (uint32_t)res->hr_extentsize,
462 			    "extentsize");
463 			if (res->hr_role == HAST_ROLE_PRIMARY) {
464 				nv_add_uint32(nvout,
465 				    (uint32_t)res->hr_keepdirty, "keepdirty");
466 				nv_add_uint64(nvout,
467 				    (uint64_t)(activemap_ndirty(res->hr_amp) *
468 				    res->hr_extentsize), "dirty");
469 			} else {
470 				nv_add_uint32(nvout, (uint32_t)0, "keepdirty");
471 				nv_add_uint64(nvout, (uint64_t)0, "dirty");
472 			}
473 			nv_add_uint64(nvout, res->hr_stat_read, "stat_read");
474 			nv_add_uint64(nvout, res->hr_stat_write, "stat_write");
475 			nv_add_uint64(nvout, res->hr_stat_delete,
476 			    "stat_delete");
477 			nv_add_uint64(nvout, res->hr_stat_flush, "stat_flush");
478 			nv_add_uint64(nvout, res->hr_stat_activemap_update,
479 			    "stat_activemap_update");
480 			nv_add_uint64(nvout, res->hr_stat_read_error,
481 			    "stat_read_error");
482 			nv_add_uint64(nvout, res->hr_stat_write_error +
483 			    res->hr_stat_activemap_write_error,
484 			    "stat_write_error");
485 			nv_add_uint64(nvout, res->hr_stat_delete_error,
486 			    "stat_delete_error");
487 			nv_add_uint64(nvout, res->hr_stat_flush_error +
488 			    res->hr_stat_activemap_flush_error,
489 			    "stat_flush_error");
490 			res->output_status_aux(nvout);
491 			nv_add_int16(nvout, 0, "error");
492 			break;
493 		case CONTROL_RELOAD:
494 			/*
495 			 * When parent receives SIGHUP and discovers that
496 			 * something related to us has changes, it sends reload
497 			 * message to us.
498 			 */
499 			PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY);
500 			primary_config_reload(res, nvin);
501 			nv_add_int16(nvout, 0, "error");
502 			break;
503 		default:
504 			nv_add_int16(nvout, EINVAL, "error");
505 			break;
506 		}
507 		nv_free(nvin);
508 		if (nv_error(nvout) != 0) {
509 			pjdlog_error("Unable to create answer on control message.");
510 			nv_free(nvout);
511 			continue;
512 		}
513 		if (hast_proto_send(NULL, res->hr_ctrl, nvout, NULL, 0) == -1) {
514 			pjdlog_errno(LOG_ERR,
515 			    "Unable to send reply to control message");
516 		}
517 		nv_free(nvout);
518 	}
519 	/* NOTREACHED */
520 	return (NULL);
521 }
522