132115b10SPawel Jakub Dawidek /*- 232115b10SPawel Jakub Dawidek * Copyright (c) 2009 The FreeBSD Foundation 31fee97b0SPawel Jakub Dawidek * Copyright (c) 2010-2011 Pawel Jakub Dawidek <pawel@dawidek.net> 432115b10SPawel Jakub Dawidek * All rights reserved. 532115b10SPawel Jakub Dawidek * 632115b10SPawel Jakub Dawidek * This software was developed by Pawel Jakub Dawidek under sponsorship from 732115b10SPawel Jakub Dawidek * the FreeBSD Foundation. 832115b10SPawel Jakub Dawidek * 932115b10SPawel Jakub Dawidek * Redistribution and use in source and binary forms, with or without 1032115b10SPawel Jakub Dawidek * modification, are permitted provided that the following conditions 1132115b10SPawel Jakub Dawidek * are met: 1232115b10SPawel Jakub Dawidek * 1. Redistributions of source code must retain the above copyright 1332115b10SPawel Jakub Dawidek * notice, this list of conditions and the following disclaimer. 1432115b10SPawel Jakub Dawidek * 2. Redistributions in binary form must reproduce the above copyright 1532115b10SPawel Jakub Dawidek * notice, this list of conditions and the following disclaimer in the 1632115b10SPawel Jakub Dawidek * documentation and/or other materials provided with the distribution. 1732115b10SPawel Jakub Dawidek * 1832115b10SPawel Jakub Dawidek * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 1932115b10SPawel Jakub Dawidek * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2032115b10SPawel Jakub Dawidek * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2132115b10SPawel Jakub Dawidek * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 2232115b10SPawel Jakub Dawidek * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2332115b10SPawel Jakub Dawidek * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2432115b10SPawel Jakub Dawidek * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2532115b10SPawel Jakub Dawidek * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2632115b10SPawel Jakub Dawidek * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2732115b10SPawel Jakub Dawidek * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2832115b10SPawel Jakub Dawidek * SUCH DAMAGE. 2932115b10SPawel Jakub Dawidek */ 3032115b10SPawel Jakub Dawidek 3132115b10SPawel Jakub Dawidek #include <sys/cdefs.h> 3232115b10SPawel Jakub Dawidek __FBSDID("$FreeBSD$"); 3332115b10SPawel Jakub Dawidek 3432115b10SPawel Jakub Dawidek #include <sys/types.h> 3532115b10SPawel Jakub Dawidek #include <sys/time.h> 3632115b10SPawel Jakub Dawidek #include <sys/bio.h> 3732115b10SPawel Jakub Dawidek #include <sys/disk.h> 3832115b10SPawel Jakub Dawidek #include <sys/stat.h> 3932115b10SPawel Jakub Dawidek 4032115b10SPawel Jakub Dawidek #include <geom/gate/g_gate.h> 4132115b10SPawel Jakub Dawidek 4232115b10SPawel Jakub Dawidek #include <err.h> 4332115b10SPawel Jakub Dawidek #include <errno.h> 4432115b10SPawel Jakub Dawidek #include <fcntl.h> 4532115b10SPawel Jakub Dawidek #include <libgeom.h> 4632115b10SPawel Jakub Dawidek #include <pthread.h> 476d0c801eSPawel Jakub Dawidek #include <signal.h> 4832115b10SPawel Jakub Dawidek #include <stdint.h> 4932115b10SPawel Jakub Dawidek #include <stdio.h> 5032115b10SPawel Jakub Dawidek #include <string.h> 5132115b10SPawel Jakub Dawidek #include <sysexits.h> 5232115b10SPawel Jakub Dawidek #include <unistd.h> 5332115b10SPawel Jakub Dawidek 5432115b10SPawel Jakub Dawidek #include <activemap.h> 5532115b10SPawel Jakub Dawidek #include <nv.h> 5632115b10SPawel Jakub Dawidek #include <rangelock.h> 5732115b10SPawel Jakub Dawidek 5832115b10SPawel Jakub Dawidek #include "control.h" 595bdff860SPawel Jakub Dawidek #include "event.h" 6032115b10SPawel Jakub Dawidek #include "hast.h" 6132115b10SPawel Jakub Dawidek #include "hast_proto.h" 6232115b10SPawel Jakub Dawidek #include "hastd.h" 630becad39SPawel Jakub Dawidek #include "hooks.h" 6432115b10SPawel Jakub Dawidek #include "metadata.h" 6532115b10SPawel Jakub Dawidek #include "proto.h" 6632115b10SPawel Jakub Dawidek #include "pjdlog.h" 67d6e636c9SPawel Jakub Dawidek #include "refcnt.h" 6832115b10SPawel Jakub Dawidek #include "subr.h" 6932115b10SPawel Jakub Dawidek #include "synch.h" 7032115b10SPawel Jakub Dawidek 710989854dSPawel Jakub Dawidek /* The is only one remote component for now. */ 720989854dSPawel Jakub Dawidek #define ISREMOTE(no) ((no) == 1) 730989854dSPawel Jakub Dawidek 7432115b10SPawel Jakub Dawidek struct hio { 7532115b10SPawel Jakub Dawidek /* 7632115b10SPawel Jakub Dawidek * Number of components we are still waiting for. 7732115b10SPawel Jakub Dawidek * When this field goes to 0, we can send the request back to the 7832115b10SPawel Jakub Dawidek * kernel. Each component has to decrease this counter by one 7932115b10SPawel Jakub Dawidek * even on failure. 8032115b10SPawel Jakub Dawidek */ 816e87c151SEd Schouten refcnt_t hio_countdown; 8232115b10SPawel Jakub Dawidek /* 8332115b10SPawel Jakub Dawidek * Each component has a place to store its own error. 8432115b10SPawel Jakub Dawidek * Once the request is handled by all components we can decide if the 8532115b10SPawel Jakub Dawidek * request overall is successful or not. 8632115b10SPawel Jakub Dawidek */ 8732115b10SPawel Jakub Dawidek int *hio_errors; 8832115b10SPawel Jakub Dawidek /* 890b626a28SPawel Jakub Dawidek * Structure used to communicate with GEOM Gate class. 9032115b10SPawel Jakub Dawidek */ 9132115b10SPawel Jakub Dawidek struct g_gate_ctl_io hio_ggio; 9207ebc362SPawel Jakub Dawidek /* 9307ebc362SPawel Jakub Dawidek * Request was already confirmed to GEOM Gate. 9407ebc362SPawel Jakub Dawidek */ 9507ebc362SPawel Jakub Dawidek bool hio_done; 9607ebc362SPawel Jakub Dawidek /* 975d69ed53SMikolaj Golub * Number of components we are still waiting before sending write 985d69ed53SMikolaj Golub * completion ack to GEOM Gate. Used for memsync. 995d69ed53SMikolaj Golub */ 1005d69ed53SMikolaj Golub refcnt_t hio_writecount; 1015d69ed53SMikolaj Golub /* 1025d69ed53SMikolaj Golub * Memsync request was acknowleged by remote. 1035d69ed53SMikolaj Golub */ 1045d69ed53SMikolaj Golub bool hio_memsyncacked; 1055d69ed53SMikolaj Golub /* 10607ebc362SPawel Jakub Dawidek * Remember replication from the time the request was initiated, 10707ebc362SPawel Jakub Dawidek * so we won't get confused when replication changes on reload. 10807ebc362SPawel Jakub Dawidek */ 10907ebc362SPawel Jakub Dawidek int hio_replication; 11032115b10SPawel Jakub Dawidek TAILQ_ENTRY(hio) *hio_next; 11132115b10SPawel Jakub Dawidek }; 11232115b10SPawel Jakub Dawidek #define hio_free_next hio_next[0] 11332115b10SPawel Jakub Dawidek #define hio_done_next hio_next[0] 11432115b10SPawel Jakub Dawidek 11532115b10SPawel Jakub Dawidek /* 11632115b10SPawel Jakub Dawidek * Free list holds unused structures. When free list is empty, we have to wait 11732115b10SPawel Jakub Dawidek * until some in-progress requests are freed. 11832115b10SPawel Jakub Dawidek */ 11932115b10SPawel Jakub Dawidek static TAILQ_HEAD(, hio) hio_free_list; 1206b66c350SMikolaj Golub static size_t hio_free_list_size; 12132115b10SPawel Jakub Dawidek static pthread_mutex_t hio_free_list_lock; 12232115b10SPawel Jakub Dawidek static pthread_cond_t hio_free_list_cond; 12332115b10SPawel Jakub Dawidek /* 12432115b10SPawel Jakub Dawidek * There is one send list for every component. One requests is placed on all 12532115b10SPawel Jakub Dawidek * send lists - each component gets the same request, but each component is 12632115b10SPawel Jakub Dawidek * responsible for managing his own send list. 12732115b10SPawel Jakub Dawidek */ 12832115b10SPawel Jakub Dawidek static TAILQ_HEAD(, hio) *hio_send_list; 1296b66c350SMikolaj Golub static size_t *hio_send_list_size; 13032115b10SPawel Jakub Dawidek static pthread_mutex_t *hio_send_list_lock; 13132115b10SPawel Jakub Dawidek static pthread_cond_t *hio_send_list_cond; 1326b66c350SMikolaj Golub #define hio_send_local_list_size hio_send_list_size[0] 1336b66c350SMikolaj Golub #define hio_send_remote_list_size hio_send_list_size[1] 13432115b10SPawel Jakub Dawidek /* 13532115b10SPawel Jakub Dawidek * There is one recv list for every component, although local components don't 13632115b10SPawel Jakub Dawidek * use recv lists as local requests are done synchronously. 13732115b10SPawel Jakub Dawidek */ 13832115b10SPawel Jakub Dawidek static TAILQ_HEAD(, hio) *hio_recv_list; 1396b66c350SMikolaj Golub static size_t *hio_recv_list_size; 14032115b10SPawel Jakub Dawidek static pthread_mutex_t *hio_recv_list_lock; 14132115b10SPawel Jakub Dawidek static pthread_cond_t *hio_recv_list_cond; 1426b66c350SMikolaj Golub #define hio_recv_remote_list_size hio_recv_list_size[1] 14332115b10SPawel Jakub Dawidek /* 14432115b10SPawel Jakub Dawidek * Request is placed on done list by the slowest component (the one that 14532115b10SPawel Jakub Dawidek * decreased hio_countdown from 1 to 0). 14632115b10SPawel Jakub Dawidek */ 14732115b10SPawel Jakub Dawidek static TAILQ_HEAD(, hio) hio_done_list; 1486b66c350SMikolaj Golub static size_t hio_done_list_size; 14932115b10SPawel Jakub Dawidek static pthread_mutex_t hio_done_list_lock; 15032115b10SPawel Jakub Dawidek static pthread_cond_t hio_done_list_cond; 15132115b10SPawel Jakub Dawidek /* 15232115b10SPawel Jakub Dawidek * Structure below are for interaction with sync thread. 15332115b10SPawel Jakub Dawidek */ 15432115b10SPawel Jakub Dawidek static bool sync_inprogress; 15532115b10SPawel Jakub Dawidek static pthread_mutex_t sync_lock; 15632115b10SPawel Jakub Dawidek static pthread_cond_t sync_cond; 15732115b10SPawel Jakub Dawidek /* 15832115b10SPawel Jakub Dawidek * The lock below allows to synchornize access to remote connections. 15932115b10SPawel Jakub Dawidek */ 16032115b10SPawel Jakub Dawidek static pthread_rwlock_t *hio_remote_lock; 16132115b10SPawel Jakub Dawidek 16232115b10SPawel Jakub Dawidek /* 16332115b10SPawel Jakub Dawidek * Lock to synchronize metadata updates. Also synchronize access to 16432115b10SPawel Jakub Dawidek * hr_primary_localcnt and hr_primary_remotecnt fields. 16532115b10SPawel Jakub Dawidek */ 16632115b10SPawel Jakub Dawidek static pthread_mutex_t metadata_lock; 16732115b10SPawel Jakub Dawidek 16832115b10SPawel Jakub Dawidek /* 16932115b10SPawel Jakub Dawidek * Maximum number of outstanding I/O requests. 17032115b10SPawel Jakub Dawidek */ 17132115b10SPawel Jakub Dawidek #define HAST_HIO_MAX 256 17232115b10SPawel Jakub Dawidek /* 17332115b10SPawel Jakub Dawidek * Number of components. At this point there are only two components: local 17432115b10SPawel Jakub Dawidek * and remote, but in the future it might be possible to use multiple local 17532115b10SPawel Jakub Dawidek * and remote components. 17632115b10SPawel Jakub Dawidek */ 17732115b10SPawel Jakub Dawidek #define HAST_NCOMPONENTS 2 17832115b10SPawel Jakub Dawidek 17932115b10SPawel Jakub Dawidek #define ISCONNECTED(res, no) \ 18032115b10SPawel Jakub Dawidek ((res)->hr_remotein != NULL && (res)->hr_remoteout != NULL) 18132115b10SPawel Jakub Dawidek 18232115b10SPawel Jakub Dawidek #define QUEUE_INSERT1(hio, name, ncomp) do { \ 18332115b10SPawel Jakub Dawidek mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ 1849c539971SMikolaj Golub if (TAILQ_EMPTY(&hio_##name##_list[(ncomp)])) \ 1859c539971SMikolaj Golub cv_broadcast(&hio_##name##_list_cond[(ncomp)]); \ 18632115b10SPawel Jakub Dawidek TAILQ_INSERT_TAIL(&hio_##name##_list[(ncomp)], (hio), \ 18732115b10SPawel Jakub Dawidek hio_next[(ncomp)]); \ 1886b66c350SMikolaj Golub hio_##name##_list_size[(ncomp)]++; \ 1899c539971SMikolaj Golub mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ 19032115b10SPawel Jakub Dawidek } while (0) 19132115b10SPawel Jakub Dawidek #define QUEUE_INSERT2(hio, name) do { \ 19232115b10SPawel Jakub Dawidek mtx_lock(&hio_##name##_list_lock); \ 1939c539971SMikolaj Golub if (TAILQ_EMPTY(&hio_##name##_list)) \ 1949c539971SMikolaj Golub cv_broadcast(&hio_##name##_list_cond); \ 19532115b10SPawel Jakub Dawidek TAILQ_INSERT_TAIL(&hio_##name##_list, (hio), hio_##name##_next);\ 1966b66c350SMikolaj Golub hio_##name##_list_size++; \ 19732115b10SPawel Jakub Dawidek mtx_unlock(&hio_##name##_list_lock); \ 19832115b10SPawel Jakub Dawidek } while (0) 199448efa94SPawel Jakub Dawidek #define QUEUE_TAKE1(hio, name, ncomp, timeout) do { \ 200448efa94SPawel Jakub Dawidek bool _last; \ 201448efa94SPawel Jakub Dawidek \ 20232115b10SPawel Jakub Dawidek mtx_lock(&hio_##name##_list_lock[(ncomp)]); \ 203448efa94SPawel Jakub Dawidek _last = false; \ 204448efa94SPawel Jakub Dawidek while (((hio) = TAILQ_FIRST(&hio_##name##_list[(ncomp)])) == NULL && !_last) { \ 205448efa94SPawel Jakub Dawidek cv_timedwait(&hio_##name##_list_cond[(ncomp)], \ 206448efa94SPawel Jakub Dawidek &hio_##name##_list_lock[(ncomp)], (timeout)); \ 207448efa94SPawel Jakub Dawidek if ((timeout) != 0) \ 208448efa94SPawel Jakub Dawidek _last = true; \ 20932115b10SPawel Jakub Dawidek } \ 210448efa94SPawel Jakub Dawidek if (hio != NULL) { \ 2116b66c350SMikolaj Golub PJDLOG_ASSERT(hio_##name##_list_size[(ncomp)] != 0); \ 2126b66c350SMikolaj Golub hio_##name##_list_size[(ncomp)]--; \ 21332115b10SPawel Jakub Dawidek TAILQ_REMOVE(&hio_##name##_list[(ncomp)], (hio), \ 21432115b10SPawel Jakub Dawidek hio_next[(ncomp)]); \ 215448efa94SPawel Jakub Dawidek } \ 21632115b10SPawel Jakub Dawidek mtx_unlock(&hio_##name##_list_lock[(ncomp)]); \ 21732115b10SPawel Jakub Dawidek } while (0) 21832115b10SPawel Jakub Dawidek #define QUEUE_TAKE2(hio, name) do { \ 21932115b10SPawel Jakub Dawidek mtx_lock(&hio_##name##_list_lock); \ 22032115b10SPawel Jakub Dawidek while (((hio) = TAILQ_FIRST(&hio_##name##_list)) == NULL) { \ 22132115b10SPawel Jakub Dawidek cv_wait(&hio_##name##_list_cond, \ 22232115b10SPawel Jakub Dawidek &hio_##name##_list_lock); \ 22332115b10SPawel Jakub Dawidek } \ 2246b66c350SMikolaj Golub PJDLOG_ASSERT(hio_##name##_list_size != 0); \ 2256b66c350SMikolaj Golub hio_##name##_list_size--; \ 22632115b10SPawel Jakub Dawidek TAILQ_REMOVE(&hio_##name##_list, (hio), hio_##name##_next); \ 22732115b10SPawel Jakub Dawidek mtx_unlock(&hio_##name##_list_lock); \ 22832115b10SPawel Jakub Dawidek } while (0) 22932115b10SPawel Jakub Dawidek 2308f04423fSMikolaj Golub #define ISFULLSYNC(hio) ((hio)->hio_replication == HAST_REPLICATION_FULLSYNC) 2318f04423fSMikolaj Golub #define ISMEMSYNC(hio) ((hio)->hio_replication == HAST_REPLICATION_MEMSYNC) 2328f04423fSMikolaj Golub #define ISASYNC(hio) ((hio)->hio_replication == HAST_REPLICATION_ASYNC) 2338f04423fSMikolaj Golub 234328e0f4bSPawel Jakub Dawidek #define SYNCREQ(hio) do { \ 235328e0f4bSPawel Jakub Dawidek (hio)->hio_ggio.gctl_unit = -1; \ 236328e0f4bSPawel Jakub Dawidek (hio)->hio_ggio.gctl_seq = 1; \ 237328e0f4bSPawel Jakub Dawidek } while (0) 23832115b10SPawel Jakub Dawidek #define ISSYNCREQ(hio) ((hio)->hio_ggio.gctl_unit == -1) 23932115b10SPawel Jakub Dawidek #define SYNCREQDONE(hio) do { (hio)->hio_ggio.gctl_unit = -2; } while (0) 24032115b10SPawel Jakub Dawidek #define ISSYNCREQDONE(hio) ((hio)->hio_ggio.gctl_unit == -2) 2418f04423fSMikolaj Golub 2428f04423fSMikolaj Golub #define ISMEMSYNCWRITE(hio) (ISMEMSYNC(hio) && \ 2438f04423fSMikolaj Golub (hio)->hio_ggio.gctl_cmd == BIO_WRITE && !ISSYNCREQ(hio)) 24432115b10SPawel Jakub Dawidek 24532115b10SPawel Jakub Dawidek static struct hast_resource *gres; 24632115b10SPawel Jakub Dawidek 24732115b10SPawel Jakub Dawidek static pthread_mutex_t range_lock; 24832115b10SPawel Jakub Dawidek static struct rangelocks *range_regular; 24932115b10SPawel Jakub Dawidek static bool range_regular_wait; 25032115b10SPawel Jakub Dawidek static pthread_cond_t range_regular_cond; 25132115b10SPawel Jakub Dawidek static struct rangelocks *range_sync; 25232115b10SPawel Jakub Dawidek static bool range_sync_wait; 25332115b10SPawel Jakub Dawidek static pthread_cond_t range_sync_cond; 254ac0401e3SPawel Jakub Dawidek static bool fullystarted; 25532115b10SPawel Jakub Dawidek 25632115b10SPawel Jakub Dawidek static void *ggate_recv_thread(void *arg); 25732115b10SPawel Jakub Dawidek static void *local_send_thread(void *arg); 25832115b10SPawel Jakub Dawidek static void *remote_send_thread(void *arg); 25932115b10SPawel Jakub Dawidek static void *remote_recv_thread(void *arg); 26032115b10SPawel Jakub Dawidek static void *ggate_send_thread(void *arg); 26132115b10SPawel Jakub Dawidek static void *sync_thread(void *arg); 26232115b10SPawel Jakub Dawidek static void *guard_thread(void *arg); 26332115b10SPawel Jakub Dawidek 2646d0c801eSPawel Jakub Dawidek static void 2656b66c350SMikolaj Golub output_status_aux(struct nv *nvout) 2666b66c350SMikolaj Golub { 2676b66c350SMikolaj Golub 2686b66c350SMikolaj Golub nv_add_uint64(nvout, (uint64_t)hio_free_list_size, 2696b66c350SMikolaj Golub "idle_queue_size"); 2706b66c350SMikolaj Golub nv_add_uint64(nvout, (uint64_t)hio_send_local_list_size, 2716b66c350SMikolaj Golub "local_queue_size"); 2726b66c350SMikolaj Golub nv_add_uint64(nvout, (uint64_t)hio_send_remote_list_size, 2736b66c350SMikolaj Golub "send_queue_size"); 2746b66c350SMikolaj Golub nv_add_uint64(nvout, (uint64_t)hio_recv_remote_list_size, 2756b66c350SMikolaj Golub "recv_queue_size"); 2766b66c350SMikolaj Golub nv_add_uint64(nvout, (uint64_t)hio_done_list_size, 2776b66c350SMikolaj Golub "done_queue_size"); 2786b66c350SMikolaj Golub } 2796b66c350SMikolaj Golub 2806b66c350SMikolaj Golub static void 28132115b10SPawel Jakub Dawidek cleanup(struct hast_resource *res) 28232115b10SPawel Jakub Dawidek { 28332115b10SPawel Jakub Dawidek int rerrno; 28432115b10SPawel Jakub Dawidek 28532115b10SPawel Jakub Dawidek /* Remember errno. */ 28632115b10SPawel Jakub Dawidek rerrno = errno; 28732115b10SPawel Jakub Dawidek 28832115b10SPawel Jakub Dawidek /* Destroy ggate provider if we created one. */ 28932115b10SPawel Jakub Dawidek if (res->hr_ggateunit >= 0) { 29032115b10SPawel Jakub Dawidek struct g_gate_ctl_destroy ggiod; 29132115b10SPawel Jakub Dawidek 2924e47b646SPawel Jakub Dawidek bzero(&ggiod, sizeof(ggiod)); 29332115b10SPawel Jakub Dawidek ggiod.gctl_version = G_GATE_VERSION; 29432115b10SPawel Jakub Dawidek ggiod.gctl_unit = res->hr_ggateunit; 29532115b10SPawel Jakub Dawidek ggiod.gctl_force = 1; 2962b1b224dSPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_DESTROY, &ggiod) == -1) { 297783ee753SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 298783ee753SPawel Jakub Dawidek "Unable to destroy hast/%s device", 29932115b10SPawel Jakub Dawidek res->hr_provname); 30032115b10SPawel Jakub Dawidek } 30132115b10SPawel Jakub Dawidek res->hr_ggateunit = -1; 30232115b10SPawel Jakub Dawidek } 30332115b10SPawel Jakub Dawidek 30432115b10SPawel Jakub Dawidek /* Restore errno. */ 30532115b10SPawel Jakub Dawidek errno = rerrno; 30632115b10SPawel Jakub Dawidek } 30732115b10SPawel Jakub Dawidek 308e43e02f1SPawel Jakub Dawidek static __dead2 void 30932115b10SPawel Jakub Dawidek primary_exit(int exitcode, const char *fmt, ...) 31032115b10SPawel Jakub Dawidek { 31132115b10SPawel Jakub Dawidek va_list ap; 31232115b10SPawel Jakub Dawidek 3132ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(exitcode != EX_OK); 31432115b10SPawel Jakub Dawidek va_start(ap, fmt); 31532115b10SPawel Jakub Dawidek pjdlogv_errno(LOG_ERR, fmt, ap); 31632115b10SPawel Jakub Dawidek va_end(ap); 31732115b10SPawel Jakub Dawidek cleanup(gres); 31832115b10SPawel Jakub Dawidek exit(exitcode); 31932115b10SPawel Jakub Dawidek } 32032115b10SPawel Jakub Dawidek 321e43e02f1SPawel Jakub Dawidek static __dead2 void 32232115b10SPawel Jakub Dawidek primary_exitx(int exitcode, const char *fmt, ...) 32332115b10SPawel Jakub Dawidek { 32432115b10SPawel Jakub Dawidek va_list ap; 32532115b10SPawel Jakub Dawidek 32632115b10SPawel Jakub Dawidek va_start(ap, fmt); 32732115b10SPawel Jakub Dawidek pjdlogv(exitcode == EX_OK ? LOG_INFO : LOG_ERR, fmt, ap); 32832115b10SPawel Jakub Dawidek va_end(ap); 32932115b10SPawel Jakub Dawidek cleanup(gres); 33032115b10SPawel Jakub Dawidek exit(exitcode); 33132115b10SPawel Jakub Dawidek } 33232115b10SPawel Jakub Dawidek 33332115b10SPawel Jakub Dawidek static int 334*974a1085SEd Schouten hast_activemap_flush(struct hast_resource *res) __unlocks(res->hr_amp_lock) 33532115b10SPawel Jakub Dawidek { 33632115b10SPawel Jakub Dawidek const unsigned char *buf; 33732115b10SPawel Jakub Dawidek size_t size; 338a818a4ffSMikolaj Golub int ret; 33932115b10SPawel Jakub Dawidek 340a818a4ffSMikolaj Golub mtx_lock(&res->hr_amp_diskmap_lock); 34132115b10SPawel Jakub Dawidek buf = activemap_bitmap(res->hr_amp, &size); 342a818a4ffSMikolaj Golub mtx_unlock(&res->hr_amp_lock); 3432ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(buf != NULL); 3442ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT((size % res->hr_local_sectorsize) == 0); 345a818a4ffSMikolaj Golub ret = 0; 34632115b10SPawel Jakub Dawidek if (pwrite(res->hr_localfd, buf, size, METADATA_SIZE) != 34732115b10SPawel Jakub Dawidek (ssize_t)size) { 348be1143efSPawel Jakub Dawidek pjdlog_errno(LOG_ERR, "Unable to flush activemap to disk"); 3492adbba66SMikolaj Golub res->hr_stat_activemap_write_error++; 350a818a4ffSMikolaj Golub ret = -1; 35132115b10SPawel Jakub Dawidek } 352a818a4ffSMikolaj Golub if (ret == 0 && res->hr_metaflush == 1 && 353a818a4ffSMikolaj Golub g_flush(res->hr_localfd) == -1) { 354518dd4c0SPawel Jakub Dawidek if (errno == EOPNOTSUPP) { 355518dd4c0SPawel Jakub Dawidek pjdlog_warning("The %s provider doesn't support flushing write cache. Disabling it.", 356518dd4c0SPawel Jakub Dawidek res->hr_localpath); 357518dd4c0SPawel Jakub Dawidek res->hr_metaflush = 0; 358518dd4c0SPawel Jakub Dawidek } else { 359518dd4c0SPawel Jakub Dawidek pjdlog_errno(LOG_ERR, 360518dd4c0SPawel Jakub Dawidek "Unable to flush disk cache on activemap update"); 3612adbba66SMikolaj Golub res->hr_stat_activemap_flush_error++; 362a818a4ffSMikolaj Golub ret = -1; 363518dd4c0SPawel Jakub Dawidek } 364518dd4c0SPawel Jakub Dawidek } 365a818a4ffSMikolaj Golub mtx_unlock(&res->hr_amp_diskmap_lock); 366a818a4ffSMikolaj Golub return (ret); 36732115b10SPawel Jakub Dawidek } 36832115b10SPawel Jakub Dawidek 369f377917cSPawel Jakub Dawidek static bool 370f377917cSPawel Jakub Dawidek real_remote(const struct hast_resource *res) 371f377917cSPawel Jakub Dawidek { 372f377917cSPawel Jakub Dawidek 373f377917cSPawel Jakub Dawidek return (strcmp(res->hr_remoteaddr, "none") != 0); 374f377917cSPawel Jakub Dawidek } 375f377917cSPawel Jakub Dawidek 37632115b10SPawel Jakub Dawidek static void 37732115b10SPawel Jakub Dawidek init_environment(struct hast_resource *res __unused) 37832115b10SPawel Jakub Dawidek { 37932115b10SPawel Jakub Dawidek struct hio *hio; 38032115b10SPawel Jakub Dawidek unsigned int ii, ncomps; 38132115b10SPawel Jakub Dawidek 38232115b10SPawel Jakub Dawidek /* 38332115b10SPawel Jakub Dawidek * In the future it might be per-resource value. 38432115b10SPawel Jakub Dawidek */ 38532115b10SPawel Jakub Dawidek ncomps = HAST_NCOMPONENTS; 38632115b10SPawel Jakub Dawidek 38732115b10SPawel Jakub Dawidek /* 38832115b10SPawel Jakub Dawidek * Allocate memory needed by lists. 38932115b10SPawel Jakub Dawidek */ 39032115b10SPawel Jakub Dawidek hio_send_list = malloc(sizeof(hio_send_list[0]) * ncomps); 39132115b10SPawel Jakub Dawidek if (hio_send_list == NULL) { 39232115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 39332115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for send lists.", 39432115b10SPawel Jakub Dawidek sizeof(hio_send_list[0]) * ncomps); 39532115b10SPawel Jakub Dawidek } 3966b66c350SMikolaj Golub hio_send_list_size = malloc(sizeof(hio_send_list_size[0]) * ncomps); 3976b66c350SMikolaj Golub if (hio_send_list_size == NULL) { 3986b66c350SMikolaj Golub primary_exitx(EX_TEMPFAIL, 3996b66c350SMikolaj Golub "Unable to allocate %zu bytes of memory for send list counters.", 4006b66c350SMikolaj Golub sizeof(hio_send_list_size[0]) * ncomps); 4016b66c350SMikolaj Golub } 40232115b10SPawel Jakub Dawidek hio_send_list_lock = malloc(sizeof(hio_send_list_lock[0]) * ncomps); 40332115b10SPawel Jakub Dawidek if (hio_send_list_lock == NULL) { 40432115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 40532115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for send list locks.", 40632115b10SPawel Jakub Dawidek sizeof(hio_send_list_lock[0]) * ncomps); 40732115b10SPawel Jakub Dawidek } 40832115b10SPawel Jakub Dawidek hio_send_list_cond = malloc(sizeof(hio_send_list_cond[0]) * ncomps); 40932115b10SPawel Jakub Dawidek if (hio_send_list_cond == NULL) { 41032115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 41132115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for send list condition variables.", 41232115b10SPawel Jakub Dawidek sizeof(hio_send_list_cond[0]) * ncomps); 41332115b10SPawel Jakub Dawidek } 41432115b10SPawel Jakub Dawidek hio_recv_list = malloc(sizeof(hio_recv_list[0]) * ncomps); 41532115b10SPawel Jakub Dawidek if (hio_recv_list == NULL) { 41632115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 41732115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for recv lists.", 41832115b10SPawel Jakub Dawidek sizeof(hio_recv_list[0]) * ncomps); 41932115b10SPawel Jakub Dawidek } 4206b66c350SMikolaj Golub hio_recv_list_size = malloc(sizeof(hio_recv_list_size[0]) * ncomps); 4216b66c350SMikolaj Golub if (hio_recv_list_size == NULL) { 4226b66c350SMikolaj Golub primary_exitx(EX_TEMPFAIL, 4236b66c350SMikolaj Golub "Unable to allocate %zu bytes of memory for recv list counters.", 4246b66c350SMikolaj Golub sizeof(hio_recv_list_size[0]) * ncomps); 4256b66c350SMikolaj Golub } 42632115b10SPawel Jakub Dawidek hio_recv_list_lock = malloc(sizeof(hio_recv_list_lock[0]) * ncomps); 42732115b10SPawel Jakub Dawidek if (hio_recv_list_lock == NULL) { 42832115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 42932115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for recv list locks.", 43032115b10SPawel Jakub Dawidek sizeof(hio_recv_list_lock[0]) * ncomps); 43132115b10SPawel Jakub Dawidek } 43232115b10SPawel Jakub Dawidek hio_recv_list_cond = malloc(sizeof(hio_recv_list_cond[0]) * ncomps); 43332115b10SPawel Jakub Dawidek if (hio_recv_list_cond == NULL) { 43432115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 43532115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for recv list condition variables.", 43632115b10SPawel Jakub Dawidek sizeof(hio_recv_list_cond[0]) * ncomps); 43732115b10SPawel Jakub Dawidek } 43832115b10SPawel Jakub Dawidek hio_remote_lock = malloc(sizeof(hio_remote_lock[0]) * ncomps); 43932115b10SPawel Jakub Dawidek if (hio_remote_lock == NULL) { 44032115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 44132115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for remote connections locks.", 44232115b10SPawel Jakub Dawidek sizeof(hio_remote_lock[0]) * ncomps); 44332115b10SPawel Jakub Dawidek } 44432115b10SPawel Jakub Dawidek 44532115b10SPawel Jakub Dawidek /* 4466b66c350SMikolaj Golub * Initialize lists, their counters, locks and condition variables. 44732115b10SPawel Jakub Dawidek */ 44832115b10SPawel Jakub Dawidek TAILQ_INIT(&hio_free_list); 44932115b10SPawel Jakub Dawidek mtx_init(&hio_free_list_lock); 45032115b10SPawel Jakub Dawidek cv_init(&hio_free_list_cond); 45132115b10SPawel Jakub Dawidek for (ii = 0; ii < HAST_NCOMPONENTS; ii++) { 45232115b10SPawel Jakub Dawidek TAILQ_INIT(&hio_send_list[ii]); 4536b66c350SMikolaj Golub hio_send_list_size[ii] = 0; 45432115b10SPawel Jakub Dawidek mtx_init(&hio_send_list_lock[ii]); 45532115b10SPawel Jakub Dawidek cv_init(&hio_send_list_cond[ii]); 45632115b10SPawel Jakub Dawidek TAILQ_INIT(&hio_recv_list[ii]); 4576b66c350SMikolaj Golub hio_recv_list_size[ii] = 0; 45832115b10SPawel Jakub Dawidek mtx_init(&hio_recv_list_lock[ii]); 45932115b10SPawel Jakub Dawidek cv_init(&hio_recv_list_cond[ii]); 46032115b10SPawel Jakub Dawidek rw_init(&hio_remote_lock[ii]); 46132115b10SPawel Jakub Dawidek } 46232115b10SPawel Jakub Dawidek TAILQ_INIT(&hio_done_list); 46332115b10SPawel Jakub Dawidek mtx_init(&hio_done_list_lock); 46432115b10SPawel Jakub Dawidek cv_init(&hio_done_list_cond); 46532115b10SPawel Jakub Dawidek mtx_init(&metadata_lock); 46632115b10SPawel Jakub Dawidek 46732115b10SPawel Jakub Dawidek /* 46832115b10SPawel Jakub Dawidek * Allocate requests pool and initialize requests. 46932115b10SPawel Jakub Dawidek */ 47032115b10SPawel Jakub Dawidek for (ii = 0; ii < HAST_HIO_MAX; ii++) { 47132115b10SPawel Jakub Dawidek hio = malloc(sizeof(*hio)); 47232115b10SPawel Jakub Dawidek if (hio == NULL) { 47332115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 47432115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for hio request.", 47532115b10SPawel Jakub Dawidek sizeof(*hio)); 47632115b10SPawel Jakub Dawidek } 4776e87c151SEd Schouten refcnt_init(&hio->hio_countdown, 0); 47832115b10SPawel Jakub Dawidek hio->hio_errors = malloc(sizeof(hio->hio_errors[0]) * ncomps); 47932115b10SPawel Jakub Dawidek if (hio->hio_errors == NULL) { 48032115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 48132115b10SPawel Jakub Dawidek "Unable allocate %zu bytes of memory for hio errors.", 48232115b10SPawel Jakub Dawidek sizeof(hio->hio_errors[0]) * ncomps); 48332115b10SPawel Jakub Dawidek } 48432115b10SPawel Jakub Dawidek hio->hio_next = malloc(sizeof(hio->hio_next[0]) * ncomps); 48532115b10SPawel Jakub Dawidek if (hio->hio_next == NULL) { 48632115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 48732115b10SPawel Jakub Dawidek "Unable allocate %zu bytes of memory for hio_next field.", 48832115b10SPawel Jakub Dawidek sizeof(hio->hio_next[0]) * ncomps); 48932115b10SPawel Jakub Dawidek } 49032115b10SPawel Jakub Dawidek hio->hio_ggio.gctl_version = G_GATE_VERSION; 49132115b10SPawel Jakub Dawidek hio->hio_ggio.gctl_data = malloc(MAXPHYS); 49232115b10SPawel Jakub Dawidek if (hio->hio_ggio.gctl_data == NULL) { 49332115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 49432115b10SPawel Jakub Dawidek "Unable to allocate %zu bytes of memory for gctl_data.", 49532115b10SPawel Jakub Dawidek MAXPHYS); 49632115b10SPawel Jakub Dawidek } 49732115b10SPawel Jakub Dawidek hio->hio_ggio.gctl_length = MAXPHYS; 49832115b10SPawel Jakub Dawidek hio->hio_ggio.gctl_error = 0; 49932115b10SPawel Jakub Dawidek TAILQ_INSERT_HEAD(&hio_free_list, hio, hio_free_next); 5006b66c350SMikolaj Golub hio_free_list_size++; 50132115b10SPawel Jakub Dawidek } 50232115b10SPawel Jakub Dawidek } 50332115b10SPawel Jakub Dawidek 504ce837469SPawel Jakub Dawidek static bool 505ce837469SPawel Jakub Dawidek init_resuid(struct hast_resource *res) 506ce837469SPawel Jakub Dawidek { 507ce837469SPawel Jakub Dawidek 508ce837469SPawel Jakub Dawidek mtx_lock(&metadata_lock); 509ce837469SPawel Jakub Dawidek if (res->hr_resuid != 0) { 510ce837469SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 511ce837469SPawel Jakub Dawidek return (false); 512ce837469SPawel Jakub Dawidek } else { 513ce837469SPawel Jakub Dawidek /* Initialize unique resource identifier. */ 514ce837469SPawel Jakub Dawidek arc4random_buf(&res->hr_resuid, sizeof(res->hr_resuid)); 515ce837469SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 5162b1b224dSPawel Jakub Dawidek if (metadata_write(res) == -1) 517ce837469SPawel Jakub Dawidek exit(EX_NOINPUT); 518ce837469SPawel Jakub Dawidek return (true); 519ce837469SPawel Jakub Dawidek } 520ce837469SPawel Jakub Dawidek } 521ce837469SPawel Jakub Dawidek 52232115b10SPawel Jakub Dawidek static void 52332115b10SPawel Jakub Dawidek init_local(struct hast_resource *res) 52432115b10SPawel Jakub Dawidek { 52532115b10SPawel Jakub Dawidek unsigned char *buf; 52632115b10SPawel Jakub Dawidek size_t mapsize; 52732115b10SPawel Jakub Dawidek 5282b1b224dSPawel Jakub Dawidek if (metadata_read(res, true) == -1) 52932115b10SPawel Jakub Dawidek exit(EX_NOINPUT); 53032115b10SPawel Jakub Dawidek mtx_init(&res->hr_amp_lock); 53132115b10SPawel Jakub Dawidek if (activemap_init(&res->hr_amp, res->hr_datasize, res->hr_extentsize, 5322b1b224dSPawel Jakub Dawidek res->hr_local_sectorsize, res->hr_keepdirty) == -1) { 53332115b10SPawel Jakub Dawidek primary_exit(EX_TEMPFAIL, "Unable to create activemap"); 53432115b10SPawel Jakub Dawidek } 53532115b10SPawel Jakub Dawidek mtx_init(&range_lock); 53632115b10SPawel Jakub Dawidek cv_init(&range_regular_cond); 5372b1b224dSPawel Jakub Dawidek if (rangelock_init(&range_regular) == -1) 53832115b10SPawel Jakub Dawidek primary_exit(EX_TEMPFAIL, "Unable to create regular range lock"); 53932115b10SPawel Jakub Dawidek cv_init(&range_sync_cond); 5402b1b224dSPawel Jakub Dawidek if (rangelock_init(&range_sync) == -1) 54132115b10SPawel Jakub Dawidek primary_exit(EX_TEMPFAIL, "Unable to create sync range lock"); 54232115b10SPawel Jakub Dawidek mapsize = activemap_ondisk_size(res->hr_amp); 54332115b10SPawel Jakub Dawidek buf = calloc(1, mapsize); 54432115b10SPawel Jakub Dawidek if (buf == NULL) { 54532115b10SPawel Jakub Dawidek primary_exitx(EX_TEMPFAIL, 54632115b10SPawel Jakub Dawidek "Unable to allocate buffer for activemap."); 54732115b10SPawel Jakub Dawidek } 54832115b10SPawel Jakub Dawidek if (pread(res->hr_localfd, buf, mapsize, METADATA_SIZE) != 54932115b10SPawel Jakub Dawidek (ssize_t)mapsize) { 55032115b10SPawel Jakub Dawidek primary_exit(EX_NOINPUT, "Unable to read activemap"); 55132115b10SPawel Jakub Dawidek } 55232115b10SPawel Jakub Dawidek activemap_copyin(res->hr_amp, buf, mapsize); 553b0dfbe5bSPawel Jakub Dawidek free(buf); 55432115b10SPawel Jakub Dawidek if (res->hr_resuid != 0) 55532115b10SPawel Jakub Dawidek return; 55632115b10SPawel Jakub Dawidek /* 557ce837469SPawel Jakub Dawidek * We're using provider for the first time. Initialize local and remote 558ce837469SPawel Jakub Dawidek * counters. We don't initialize resuid here, as we want to do it just 559ce837469SPawel Jakub Dawidek * in time. The reason for this is that we want to inform secondary 560ce837469SPawel Jakub Dawidek * that there were no writes yet, so there is no need to synchronize 561ce837469SPawel Jakub Dawidek * anything. 56232115b10SPawel Jakub Dawidek */ 5639446b453SPawel Jakub Dawidek res->hr_primary_localcnt = 0; 56432115b10SPawel Jakub Dawidek res->hr_primary_remotecnt = 0; 5652b1b224dSPawel Jakub Dawidek if (metadata_write(res) == -1) 56632115b10SPawel Jakub Dawidek exit(EX_NOINPUT); 56732115b10SPawel Jakub Dawidek } 56832115b10SPawel Jakub Dawidek 56932ecf620SPawel Jakub Dawidek static int 57032ecf620SPawel Jakub Dawidek primary_connect(struct hast_resource *res, struct proto_conn **connp) 57132ecf620SPawel Jakub Dawidek { 57232ecf620SPawel Jakub Dawidek struct proto_conn *conn; 57332ecf620SPawel Jakub Dawidek int16_t val; 57432ecf620SPawel Jakub Dawidek 57532ecf620SPawel Jakub Dawidek val = 1; 5762b1b224dSPawel Jakub Dawidek if (proto_send(res->hr_conn, &val, sizeof(val)) == -1) { 57732ecf620SPawel Jakub Dawidek primary_exit(EX_TEMPFAIL, 57832ecf620SPawel Jakub Dawidek "Unable to send connection request to parent"); 57932ecf620SPawel Jakub Dawidek } 5802b1b224dSPawel Jakub Dawidek if (proto_recv(res->hr_conn, &val, sizeof(val)) == -1) { 58132ecf620SPawel Jakub Dawidek primary_exit(EX_TEMPFAIL, 58232ecf620SPawel Jakub Dawidek "Unable to receive reply to connection request from parent"); 58332ecf620SPawel Jakub Dawidek } 58432ecf620SPawel Jakub Dawidek if (val != 0) { 58532ecf620SPawel Jakub Dawidek errno = val; 58632ecf620SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, "Unable to connect to %s", 58732ecf620SPawel Jakub Dawidek res->hr_remoteaddr); 58832ecf620SPawel Jakub Dawidek return (-1); 58932ecf620SPawel Jakub Dawidek } 5902b1b224dSPawel Jakub Dawidek if (proto_connection_recv(res->hr_conn, true, &conn) == -1) { 59132ecf620SPawel Jakub Dawidek primary_exit(EX_TEMPFAIL, 59232ecf620SPawel Jakub Dawidek "Unable to receive connection from parent"); 59332ecf620SPawel Jakub Dawidek } 5942b1b224dSPawel Jakub Dawidek if (proto_connect_wait(conn, res->hr_timeout) == -1) { 59532ecf620SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, "Unable to connect to %s", 59632ecf620SPawel Jakub Dawidek res->hr_remoteaddr); 59732ecf620SPawel Jakub Dawidek proto_close(conn); 59832ecf620SPawel Jakub Dawidek return (-1); 59932ecf620SPawel Jakub Dawidek } 60032ecf620SPawel Jakub Dawidek /* Error in setting timeout is not critical, but why should it fail? */ 6012b1b224dSPawel Jakub Dawidek if (proto_timeout(conn, res->hr_timeout) == -1) 60232ecf620SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, "Unable to set connection timeout"); 60332ecf620SPawel Jakub Dawidek 60432ecf620SPawel Jakub Dawidek *connp = conn; 60532ecf620SPawel Jakub Dawidek 60632ecf620SPawel Jakub Dawidek return (0); 60732ecf620SPawel Jakub Dawidek } 60832ecf620SPawel Jakub Dawidek 609c66ee1b3SPawel Jakub Dawidek /* 610c66ee1b3SPawel Jakub Dawidek * Function instructs GEOM_GATE to handle reads directly from within the kernel. 611c66ee1b3SPawel Jakub Dawidek */ 612c66ee1b3SPawel Jakub Dawidek static void 613c66ee1b3SPawel Jakub Dawidek enable_direct_reads(struct hast_resource *res) 614c66ee1b3SPawel Jakub Dawidek { 615c66ee1b3SPawel Jakub Dawidek struct g_gate_ctl_modify ggiomodify; 616c66ee1b3SPawel Jakub Dawidek 617c66ee1b3SPawel Jakub Dawidek bzero(&ggiomodify, sizeof(ggiomodify)); 618c66ee1b3SPawel Jakub Dawidek ggiomodify.gctl_version = G_GATE_VERSION; 619c66ee1b3SPawel Jakub Dawidek ggiomodify.gctl_unit = res->hr_ggateunit; 620c66ee1b3SPawel Jakub Dawidek ggiomodify.gctl_modify = GG_MODIFY_READPROV | GG_MODIFY_READOFFSET; 621c66ee1b3SPawel Jakub Dawidek strlcpy(ggiomodify.gctl_readprov, res->hr_localpath, 622c66ee1b3SPawel Jakub Dawidek sizeof(ggiomodify.gctl_readprov)); 623c66ee1b3SPawel Jakub Dawidek ggiomodify.gctl_readoffset = res->hr_localoff; 624c66ee1b3SPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_MODIFY, &ggiomodify) == 0) 625c66ee1b3SPawel Jakub Dawidek pjdlog_debug(1, "Direct reads enabled."); 626c66ee1b3SPawel Jakub Dawidek else 627c66ee1b3SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, "Failed to enable direct reads"); 628c66ee1b3SPawel Jakub Dawidek } 629c66ee1b3SPawel Jakub Dawidek 630ac0401e3SPawel Jakub Dawidek static int 6310d9014f3SPawel Jakub Dawidek init_remote(struct hast_resource *res, struct proto_conn **inp, 6320d9014f3SPawel Jakub Dawidek struct proto_conn **outp) 63332115b10SPawel Jakub Dawidek { 6340d9014f3SPawel Jakub Dawidek struct proto_conn *in, *out; 63532115b10SPawel Jakub Dawidek struct nv *nvout, *nvin; 63632115b10SPawel Jakub Dawidek const unsigned char *token; 63732115b10SPawel Jakub Dawidek unsigned char *map; 63832115b10SPawel Jakub Dawidek const char *errmsg; 63932115b10SPawel Jakub Dawidek int32_t extentsize; 64032115b10SPawel Jakub Dawidek int64_t datasize; 64132115b10SPawel Jakub Dawidek uint32_t mapsize; 642d6e636c9SPawel Jakub Dawidek uint8_t version; 64332115b10SPawel Jakub Dawidek size_t size; 644ac0401e3SPawel Jakub Dawidek int error; 64532115b10SPawel Jakub Dawidek 6462ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT((inp == NULL && outp == NULL) || (inp != NULL && outp != NULL)); 6472ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(real_remote(res)); 6480d9014f3SPawel Jakub Dawidek 6490d9014f3SPawel Jakub Dawidek in = out = NULL; 6502be8fd75SPawel Jakub Dawidek errmsg = NULL; 6510d9014f3SPawel Jakub Dawidek 65232ecf620SPawel Jakub Dawidek if (primary_connect(res, &out) == -1) 653ac0401e3SPawel Jakub Dawidek return (ECONNREFUSED); 654ac0401e3SPawel Jakub Dawidek 655ac0401e3SPawel Jakub Dawidek error = ECONNABORTED; 65632ecf620SPawel Jakub Dawidek 65732115b10SPawel Jakub Dawidek /* 65832115b10SPawel Jakub Dawidek * First handshake step. 65932115b10SPawel Jakub Dawidek * Setup outgoing connection with remote node. 66032115b10SPawel Jakub Dawidek */ 66132115b10SPawel Jakub Dawidek nvout = nv_alloc(); 66232115b10SPawel Jakub Dawidek nv_add_string(nvout, res->hr_name, "resource"); 663d6e636c9SPawel Jakub Dawidek nv_add_uint8(nvout, HAST_PROTO_VERSION, "version"); 66432115b10SPawel Jakub Dawidek if (nv_error(nvout) != 0) { 66532115b10SPawel Jakub Dawidek pjdlog_common(LOG_WARNING, 0, nv_error(nvout), 66632115b10SPawel Jakub Dawidek "Unable to allocate header for connection with %s", 66732115b10SPawel Jakub Dawidek res->hr_remoteaddr); 66832115b10SPawel Jakub Dawidek nv_free(nvout); 66932115b10SPawel Jakub Dawidek goto close; 67032115b10SPawel Jakub Dawidek } 6712b1b224dSPawel Jakub Dawidek if (hast_proto_send(res, out, nvout, NULL, 0) == -1) { 67232115b10SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 67332115b10SPawel Jakub Dawidek "Unable to send handshake header to %s", 67432115b10SPawel Jakub Dawidek res->hr_remoteaddr); 67532115b10SPawel Jakub Dawidek nv_free(nvout); 67632115b10SPawel Jakub Dawidek goto close; 67732115b10SPawel Jakub Dawidek } 67832115b10SPawel Jakub Dawidek nv_free(nvout); 6792b1b224dSPawel Jakub Dawidek if (hast_proto_recv_hdr(out, &nvin) == -1) { 68032115b10SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 68132115b10SPawel Jakub Dawidek "Unable to receive handshake header from %s", 68232115b10SPawel Jakub Dawidek res->hr_remoteaddr); 68332115b10SPawel Jakub Dawidek goto close; 68432115b10SPawel Jakub Dawidek } 68532115b10SPawel Jakub Dawidek errmsg = nv_get_string(nvin, "errmsg"); 68632115b10SPawel Jakub Dawidek if (errmsg != NULL) { 68732115b10SPawel Jakub Dawidek pjdlog_warning("%s", errmsg); 688ac0401e3SPawel Jakub Dawidek if (nv_exists(nvin, "wait")) 689ac0401e3SPawel Jakub Dawidek error = EBUSY; 69032115b10SPawel Jakub Dawidek nv_free(nvin); 69132115b10SPawel Jakub Dawidek goto close; 69232115b10SPawel Jakub Dawidek } 693d6e636c9SPawel Jakub Dawidek version = nv_get_uint8(nvin, "version"); 694d6e636c9SPawel Jakub Dawidek if (version == 0) { 695d6e636c9SPawel Jakub Dawidek /* 696d6e636c9SPawel Jakub Dawidek * If no version is sent, it means this is protocol version 1. 697d6e636c9SPawel Jakub Dawidek */ 698d6e636c9SPawel Jakub Dawidek version = 1; 699d6e636c9SPawel Jakub Dawidek } 700d6e636c9SPawel Jakub Dawidek if (version > HAST_PROTO_VERSION) { 701d6e636c9SPawel Jakub Dawidek pjdlog_warning("Invalid version received (%hhu).", version); 702d6e636c9SPawel Jakub Dawidek nv_free(nvin); 703d6e636c9SPawel Jakub Dawidek goto close; 704d6e636c9SPawel Jakub Dawidek } 705d6e636c9SPawel Jakub Dawidek res->hr_version = version; 706d6e636c9SPawel Jakub Dawidek pjdlog_debug(1, "Negotiated protocol version %d.", res->hr_version); 70732115b10SPawel Jakub Dawidek token = nv_get_uint8_array(nvin, &size, "token"); 70832115b10SPawel Jakub Dawidek if (token == NULL) { 70932115b10SPawel Jakub Dawidek pjdlog_warning("Handshake header from %s has no 'token' field.", 71032115b10SPawel Jakub Dawidek res->hr_remoteaddr); 71132115b10SPawel Jakub Dawidek nv_free(nvin); 71232115b10SPawel Jakub Dawidek goto close; 71332115b10SPawel Jakub Dawidek } 71432115b10SPawel Jakub Dawidek if (size != sizeof(res->hr_token)) { 71532115b10SPawel Jakub Dawidek pjdlog_warning("Handshake header from %s contains 'token' of wrong size (got %zu, expected %zu).", 71632115b10SPawel Jakub Dawidek res->hr_remoteaddr, size, sizeof(res->hr_token)); 71732115b10SPawel Jakub Dawidek nv_free(nvin); 71832115b10SPawel Jakub Dawidek goto close; 71932115b10SPawel Jakub Dawidek } 72032115b10SPawel Jakub Dawidek bcopy(token, res->hr_token, sizeof(res->hr_token)); 72132115b10SPawel Jakub Dawidek nv_free(nvin); 72232115b10SPawel Jakub Dawidek 72332115b10SPawel Jakub Dawidek /* 72432115b10SPawel Jakub Dawidek * Second handshake step. 72532115b10SPawel Jakub Dawidek * Setup incoming connection with remote node. 72632115b10SPawel Jakub Dawidek */ 72732ecf620SPawel Jakub Dawidek if (primary_connect(res, &in) == -1) 72832115b10SPawel Jakub Dawidek goto close; 72932ecf620SPawel Jakub Dawidek 73032115b10SPawel Jakub Dawidek nvout = nv_alloc(); 73132115b10SPawel Jakub Dawidek nv_add_string(nvout, res->hr_name, "resource"); 73232115b10SPawel Jakub Dawidek nv_add_uint8_array(nvout, res->hr_token, sizeof(res->hr_token), 73332115b10SPawel Jakub Dawidek "token"); 734ce837469SPawel Jakub Dawidek if (res->hr_resuid == 0) { 735ce837469SPawel Jakub Dawidek /* 736ce837469SPawel Jakub Dawidek * The resuid field was not yet initialized. 737ce837469SPawel Jakub Dawidek * Because we do synchronization inside init_resuid(), it is 738ce837469SPawel Jakub Dawidek * possible that someone already initialized it, the function 739ce837469SPawel Jakub Dawidek * will return false then, but if we successfully initialized 740ce837469SPawel Jakub Dawidek * it, we will get true. True means that there were no writes 741ce837469SPawel Jakub Dawidek * to this resource yet and we want to inform secondary that 742ce837469SPawel Jakub Dawidek * synchronization is not needed by sending "virgin" argument. 743ce837469SPawel Jakub Dawidek */ 744ce837469SPawel Jakub Dawidek if (init_resuid(res)) 745ce837469SPawel Jakub Dawidek nv_add_int8(nvout, 1, "virgin"); 746ce837469SPawel Jakub Dawidek } 74732115b10SPawel Jakub Dawidek nv_add_uint64(nvout, res->hr_resuid, "resuid"); 74832115b10SPawel Jakub Dawidek nv_add_uint64(nvout, res->hr_primary_localcnt, "localcnt"); 74932115b10SPawel Jakub Dawidek nv_add_uint64(nvout, res->hr_primary_remotecnt, "remotecnt"); 75032115b10SPawel Jakub Dawidek if (nv_error(nvout) != 0) { 75132115b10SPawel Jakub Dawidek pjdlog_common(LOG_WARNING, 0, nv_error(nvout), 75232115b10SPawel Jakub Dawidek "Unable to allocate header for connection with %s", 75332115b10SPawel Jakub Dawidek res->hr_remoteaddr); 75432115b10SPawel Jakub Dawidek nv_free(nvout); 75532115b10SPawel Jakub Dawidek goto close; 75632115b10SPawel Jakub Dawidek } 7572b1b224dSPawel Jakub Dawidek if (hast_proto_send(res, in, nvout, NULL, 0) == -1) { 75832115b10SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 75932115b10SPawel Jakub Dawidek "Unable to send handshake header to %s", 76032115b10SPawel Jakub Dawidek res->hr_remoteaddr); 76132115b10SPawel Jakub Dawidek nv_free(nvout); 76232115b10SPawel Jakub Dawidek goto close; 76332115b10SPawel Jakub Dawidek } 76432115b10SPawel Jakub Dawidek nv_free(nvout); 7652b1b224dSPawel Jakub Dawidek if (hast_proto_recv_hdr(out, &nvin) == -1) { 76632115b10SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 76732115b10SPawel Jakub Dawidek "Unable to receive handshake header from %s", 76832115b10SPawel Jakub Dawidek res->hr_remoteaddr); 76932115b10SPawel Jakub Dawidek goto close; 77032115b10SPawel Jakub Dawidek } 77132115b10SPawel Jakub Dawidek errmsg = nv_get_string(nvin, "errmsg"); 77232115b10SPawel Jakub Dawidek if (errmsg != NULL) { 77332115b10SPawel Jakub Dawidek pjdlog_warning("%s", errmsg); 77432115b10SPawel Jakub Dawidek nv_free(nvin); 77532115b10SPawel Jakub Dawidek goto close; 77632115b10SPawel Jakub Dawidek } 77732115b10SPawel Jakub Dawidek datasize = nv_get_int64(nvin, "datasize"); 77832115b10SPawel Jakub Dawidek if (datasize != res->hr_datasize) { 77932115b10SPawel Jakub Dawidek pjdlog_warning("Data size differs between nodes (local=%jd, remote=%jd).", 78032115b10SPawel Jakub Dawidek (intmax_t)res->hr_datasize, (intmax_t)datasize); 78132115b10SPawel Jakub Dawidek nv_free(nvin); 78232115b10SPawel Jakub Dawidek goto close; 78332115b10SPawel Jakub Dawidek } 78432115b10SPawel Jakub Dawidek extentsize = nv_get_int32(nvin, "extentsize"); 78532115b10SPawel Jakub Dawidek if (extentsize != res->hr_extentsize) { 78632115b10SPawel Jakub Dawidek pjdlog_warning("Extent size differs between nodes (local=%zd, remote=%zd).", 78732115b10SPawel Jakub Dawidek (ssize_t)res->hr_extentsize, (ssize_t)extentsize); 78832115b10SPawel Jakub Dawidek nv_free(nvin); 78932115b10SPawel Jakub Dawidek goto close; 79032115b10SPawel Jakub Dawidek } 79132115b10SPawel Jakub Dawidek res->hr_secondary_localcnt = nv_get_uint64(nvin, "localcnt"); 79232115b10SPawel Jakub Dawidek res->hr_secondary_remotecnt = nv_get_uint64(nvin, "remotecnt"); 79332115b10SPawel Jakub Dawidek res->hr_syncsrc = nv_get_uint8(nvin, "syncsrc"); 794c66ee1b3SPawel Jakub Dawidek if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) 795c66ee1b3SPawel Jakub Dawidek enable_direct_reads(res); 79606cbf549SPawel Jakub Dawidek if (nv_exists(nvin, "virgin")) { 79706cbf549SPawel Jakub Dawidek /* 79806cbf549SPawel Jakub Dawidek * Secondary was reinitialized, bump localcnt if it is 0 as 79906cbf549SPawel Jakub Dawidek * only we have the data. 80006cbf549SPawel Jakub Dawidek */ 80106cbf549SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_PRIMARY); 80206cbf549SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_secondary_localcnt == 0); 80306cbf549SPawel Jakub Dawidek 80406cbf549SPawel Jakub Dawidek if (res->hr_primary_localcnt == 0) { 80506cbf549SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_secondary_remotecnt == 0); 80606cbf549SPawel Jakub Dawidek 80706cbf549SPawel Jakub Dawidek mtx_lock(&metadata_lock); 80806cbf549SPawel Jakub Dawidek res->hr_primary_localcnt++; 80906cbf549SPawel Jakub Dawidek pjdlog_debug(1, "Increasing localcnt to %ju.", 81006cbf549SPawel Jakub Dawidek (uintmax_t)res->hr_primary_localcnt); 81106cbf549SPawel Jakub Dawidek (void)metadata_write(res); 81206cbf549SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 81306cbf549SPawel Jakub Dawidek } 81406cbf549SPawel Jakub Dawidek } 81532115b10SPawel Jakub Dawidek map = NULL; 81632115b10SPawel Jakub Dawidek mapsize = nv_get_uint32(nvin, "mapsize"); 81732115b10SPawel Jakub Dawidek if (mapsize > 0) { 81832115b10SPawel Jakub Dawidek map = malloc(mapsize); 81932115b10SPawel Jakub Dawidek if (map == NULL) { 82032115b10SPawel Jakub Dawidek pjdlog_error("Unable to allocate memory for remote activemap (mapsize=%ju).", 82132115b10SPawel Jakub Dawidek (uintmax_t)mapsize); 82232115b10SPawel Jakub Dawidek nv_free(nvin); 82332115b10SPawel Jakub Dawidek goto close; 82432115b10SPawel Jakub Dawidek } 82532115b10SPawel Jakub Dawidek /* 82632115b10SPawel Jakub Dawidek * Remote node have some dirty extents on its own, lets 82732115b10SPawel Jakub Dawidek * download its activemap. 82832115b10SPawel Jakub Dawidek */ 8290d9014f3SPawel Jakub Dawidek if (hast_proto_recv_data(res, out, nvin, map, 8302b1b224dSPawel Jakub Dawidek mapsize) == -1) { 83132115b10SPawel Jakub Dawidek pjdlog_errno(LOG_ERR, 83232115b10SPawel Jakub Dawidek "Unable to receive remote activemap"); 83332115b10SPawel Jakub Dawidek nv_free(nvin); 83432115b10SPawel Jakub Dawidek free(map); 83532115b10SPawel Jakub Dawidek goto close; 83632115b10SPawel Jakub Dawidek } 837d03a08e5SMikolaj Golub mtx_lock(&res->hr_amp_lock); 83832115b10SPawel Jakub Dawidek /* 83932115b10SPawel Jakub Dawidek * Merge local and remote bitmaps. 84032115b10SPawel Jakub Dawidek */ 84132115b10SPawel Jakub Dawidek activemap_merge(res->hr_amp, map, mapsize); 84232115b10SPawel Jakub Dawidek free(map); 84332115b10SPawel Jakub Dawidek /* 84432115b10SPawel Jakub Dawidek * Now that we merged bitmaps from both nodes, flush it to the 84532115b10SPawel Jakub Dawidek * disk before we start to synchronize. 84632115b10SPawel Jakub Dawidek */ 84732115b10SPawel Jakub Dawidek (void)hast_activemap_flush(res); 84832115b10SPawel Jakub Dawidek } 849584a9bc3SPawel Jakub Dawidek nv_free(nvin); 850ba2a8224SMikolaj Golub #ifdef notyet 85102dfe972SPawel Jakub Dawidek /* Setup directions. */ 85202dfe972SPawel Jakub Dawidek if (proto_send(out, NULL, 0) == -1) 85302dfe972SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); 85402dfe972SPawel Jakub Dawidek if (proto_recv(in, NULL, 0) == -1) 85502dfe972SPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, "Unable to set connection direction"); 856ba2a8224SMikolaj Golub #endif 85732115b10SPawel Jakub Dawidek pjdlog_info("Connected to %s.", res->hr_remoteaddr); 858d6e636c9SPawel Jakub Dawidek if (res->hr_original_replication == HAST_REPLICATION_MEMSYNC && 859d6e636c9SPawel Jakub Dawidek res->hr_version < 2) { 860d6e636c9SPawel Jakub Dawidek pjdlog_warning("The 'memsync' replication mode is not supported by the remote node, falling back to 'fullsync' mode."); 861d6e636c9SPawel Jakub Dawidek res->hr_replication = HAST_REPLICATION_FULLSYNC; 862d6e636c9SPawel Jakub Dawidek } else if (res->hr_replication != res->hr_original_replication) { 863d6e636c9SPawel Jakub Dawidek /* 864d6e636c9SPawel Jakub Dawidek * This is in case hastd disconnected and was upgraded. 865d6e636c9SPawel Jakub Dawidek */ 866d6e636c9SPawel Jakub Dawidek res->hr_replication = res->hr_original_replication; 867d6e636c9SPawel Jakub Dawidek } 8680d9014f3SPawel Jakub Dawidek if (inp != NULL && outp != NULL) { 8690d9014f3SPawel Jakub Dawidek *inp = in; 8700d9014f3SPawel Jakub Dawidek *outp = out; 8710d9014f3SPawel Jakub Dawidek } else { 8720d9014f3SPawel Jakub Dawidek res->hr_remotein = in; 8730d9014f3SPawel Jakub Dawidek res->hr_remoteout = out; 8740d9014f3SPawel Jakub Dawidek } 8755bdff860SPawel Jakub Dawidek event_send(res, EVENT_CONNECT); 876ac0401e3SPawel Jakub Dawidek return (0); 8770d9014f3SPawel Jakub Dawidek close: 8782be8fd75SPawel Jakub Dawidek if (errmsg != NULL && strcmp(errmsg, "Split-brain condition!") == 0) 8795bdff860SPawel Jakub Dawidek event_send(res, EVENT_SPLITBRAIN); 8800d9014f3SPawel Jakub Dawidek proto_close(out); 8810d9014f3SPawel Jakub Dawidek if (in != NULL) 8820d9014f3SPawel Jakub Dawidek proto_close(in); 883ac0401e3SPawel Jakub Dawidek return (error); 8840d9014f3SPawel Jakub Dawidek } 8850d9014f3SPawel Jakub Dawidek 8860d9014f3SPawel Jakub Dawidek static void 8870d9014f3SPawel Jakub Dawidek sync_start(void) 8880d9014f3SPawel Jakub Dawidek { 8890d9014f3SPawel Jakub Dawidek 89032115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 89132115b10SPawel Jakub Dawidek sync_inprogress = true; 89232115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 89332115b10SPawel Jakub Dawidek cv_signal(&sync_cond); 89432115b10SPawel Jakub Dawidek } 89532115b10SPawel Jakub Dawidek 89632115b10SPawel Jakub Dawidek static void 89755ce1e7cSPawel Jakub Dawidek sync_stop(void) 89855ce1e7cSPawel Jakub Dawidek { 89955ce1e7cSPawel Jakub Dawidek 90055ce1e7cSPawel Jakub Dawidek mtx_lock(&sync_lock); 90155ce1e7cSPawel Jakub Dawidek if (sync_inprogress) 90255ce1e7cSPawel Jakub Dawidek sync_inprogress = false; 90355ce1e7cSPawel Jakub Dawidek mtx_unlock(&sync_lock); 90455ce1e7cSPawel Jakub Dawidek } 90555ce1e7cSPawel Jakub Dawidek 90655ce1e7cSPawel Jakub Dawidek static void 90732115b10SPawel Jakub Dawidek init_ggate(struct hast_resource *res) 90832115b10SPawel Jakub Dawidek { 90932115b10SPawel Jakub Dawidek struct g_gate_ctl_create ggiocreate; 91032115b10SPawel Jakub Dawidek struct g_gate_ctl_cancel ggiocancel; 91132115b10SPawel Jakub Dawidek 91232115b10SPawel Jakub Dawidek /* 91332115b10SPawel Jakub Dawidek * We communicate with ggate via /dev/ggctl. Open it. 91432115b10SPawel Jakub Dawidek */ 91532115b10SPawel Jakub Dawidek res->hr_ggatefd = open("/dev/" G_GATE_CTL_NAME, O_RDWR); 9162b1b224dSPawel Jakub Dawidek if (res->hr_ggatefd == -1) 91732115b10SPawel Jakub Dawidek primary_exit(EX_OSFILE, "Unable to open /dev/" G_GATE_CTL_NAME); 91832115b10SPawel Jakub Dawidek /* 91932115b10SPawel Jakub Dawidek * Create provider before trying to connect, as connection failure 92032115b10SPawel Jakub Dawidek * is not critical, but may take some time. 92132115b10SPawel Jakub Dawidek */ 9224e47b646SPawel Jakub Dawidek bzero(&ggiocreate, sizeof(ggiocreate)); 92332115b10SPawel Jakub Dawidek ggiocreate.gctl_version = G_GATE_VERSION; 92432115b10SPawel Jakub Dawidek ggiocreate.gctl_mediasize = res->hr_datasize; 92532115b10SPawel Jakub Dawidek ggiocreate.gctl_sectorsize = res->hr_local_sectorsize; 92632115b10SPawel Jakub Dawidek ggiocreate.gctl_flags = 0; 9272a49afacSPawel Jakub Dawidek ggiocreate.gctl_maxcount = 0; 92832115b10SPawel Jakub Dawidek ggiocreate.gctl_timeout = 0; 92932115b10SPawel Jakub Dawidek ggiocreate.gctl_unit = G_GATE_NAME_GIVEN; 93032115b10SPawel Jakub Dawidek snprintf(ggiocreate.gctl_name, sizeof(ggiocreate.gctl_name), "hast/%s", 93132115b10SPawel Jakub Dawidek res->hr_provname); 93232115b10SPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_CREATE, &ggiocreate) == 0) { 93332115b10SPawel Jakub Dawidek pjdlog_info("Device hast/%s created.", res->hr_provname); 93432115b10SPawel Jakub Dawidek res->hr_ggateunit = ggiocreate.gctl_unit; 93532115b10SPawel Jakub Dawidek return; 93632115b10SPawel Jakub Dawidek } 93732115b10SPawel Jakub Dawidek if (errno != EEXIST) { 93832115b10SPawel Jakub Dawidek primary_exit(EX_OSERR, "Unable to create hast/%s device", 93932115b10SPawel Jakub Dawidek res->hr_provname); 94032115b10SPawel Jakub Dawidek } 94132115b10SPawel Jakub Dawidek pjdlog_debug(1, 94232115b10SPawel Jakub Dawidek "Device hast/%s already exists, we will try to take it over.", 94332115b10SPawel Jakub Dawidek res->hr_provname); 94432115b10SPawel Jakub Dawidek /* 94532115b10SPawel Jakub Dawidek * If we received EEXIST, we assume that the process who created the 94632115b10SPawel Jakub Dawidek * provider died and didn't clean up. In that case we will start from 94732115b10SPawel Jakub Dawidek * where he left of. 94832115b10SPawel Jakub Dawidek */ 9494e47b646SPawel Jakub Dawidek bzero(&ggiocancel, sizeof(ggiocancel)); 95032115b10SPawel Jakub Dawidek ggiocancel.gctl_version = G_GATE_VERSION; 95132115b10SPawel Jakub Dawidek ggiocancel.gctl_unit = G_GATE_NAME_GIVEN; 95232115b10SPawel Jakub Dawidek snprintf(ggiocancel.gctl_name, sizeof(ggiocancel.gctl_name), "hast/%s", 95332115b10SPawel Jakub Dawidek res->hr_provname); 95432115b10SPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_CANCEL, &ggiocancel) == 0) { 95532115b10SPawel Jakub Dawidek pjdlog_info("Device hast/%s recovered.", res->hr_provname); 95632115b10SPawel Jakub Dawidek res->hr_ggateunit = ggiocancel.gctl_unit; 95732115b10SPawel Jakub Dawidek return; 95832115b10SPawel Jakub Dawidek } 95932115b10SPawel Jakub Dawidek primary_exit(EX_OSERR, "Unable to take over hast/%s device", 96032115b10SPawel Jakub Dawidek res->hr_provname); 96132115b10SPawel Jakub Dawidek } 96232115b10SPawel Jakub Dawidek 96332115b10SPawel Jakub Dawidek void 96432115b10SPawel Jakub Dawidek hastd_primary(struct hast_resource *res) 96532115b10SPawel Jakub Dawidek { 96632115b10SPawel Jakub Dawidek pthread_t td; 96732115b10SPawel Jakub Dawidek pid_t pid; 968bc7a916aSMikolaj Golub int error, mode, debuglevel; 96932115b10SPawel Jakub Dawidek 97032115b10SPawel Jakub Dawidek /* 97132ecf620SPawel Jakub Dawidek * Create communication channel for sending control commands from 97232ecf620SPawel Jakub Dawidek * parent to child. 97332115b10SPawel Jakub Dawidek */ 9742b1b224dSPawel Jakub Dawidek if (proto_client(NULL, "socketpair://", &res->hr_ctrl) == -1) { 975d64c0992SPawel Jakub Dawidek /* TODO: There's no need for this to be fatal error. */ 97632115b10SPawel Jakub Dawidek KEEP_ERRNO((void)pidfile_remove(pfh)); 9776be3a25cSPawel Jakub Dawidek pjdlog_exit(EX_OSERR, 97832115b10SPawel Jakub Dawidek "Unable to create control sockets between parent and child"); 97932115b10SPawel Jakub Dawidek } 9805bdff860SPawel Jakub Dawidek /* 98132ecf620SPawel Jakub Dawidek * Create communication channel for sending events from child to parent. 9825bdff860SPawel Jakub Dawidek */ 9832b1b224dSPawel Jakub Dawidek if (proto_client(NULL, "socketpair://", &res->hr_event) == -1) { 984d64c0992SPawel Jakub Dawidek /* TODO: There's no need for this to be fatal error. */ 9855bdff860SPawel Jakub Dawidek KEEP_ERRNO((void)pidfile_remove(pfh)); 9865bdff860SPawel Jakub Dawidek pjdlog_exit(EX_OSERR, 9875bdff860SPawel Jakub Dawidek "Unable to create event sockets between child and parent"); 9885bdff860SPawel Jakub Dawidek } 98932ecf620SPawel Jakub Dawidek /* 99032ecf620SPawel Jakub Dawidek * Create communication channel for sending connection requests from 99132ecf620SPawel Jakub Dawidek * child to parent. 99232ecf620SPawel Jakub Dawidek */ 9932b1b224dSPawel Jakub Dawidek if (proto_client(NULL, "socketpair://", &res->hr_conn) == -1) { 99432ecf620SPawel Jakub Dawidek /* TODO: There's no need for this to be fatal error. */ 99532ecf620SPawel Jakub Dawidek KEEP_ERRNO((void)pidfile_remove(pfh)); 99632ecf620SPawel Jakub Dawidek pjdlog_exit(EX_OSERR, 99732ecf620SPawel Jakub Dawidek "Unable to create connection sockets between child and parent"); 99832ecf620SPawel Jakub Dawidek } 99932115b10SPawel Jakub Dawidek 100032115b10SPawel Jakub Dawidek pid = fork(); 1001dfb1aeceSPawel Jakub Dawidek if (pid == -1) { 1002d64c0992SPawel Jakub Dawidek /* TODO: There's no need for this to be fatal error. */ 100332115b10SPawel Jakub Dawidek KEEP_ERRNO((void)pidfile_remove(pfh)); 10046be3a25cSPawel Jakub Dawidek pjdlog_exit(EX_TEMPFAIL, "Unable to fork"); 100532115b10SPawel Jakub Dawidek } 100632115b10SPawel Jakub Dawidek 100732115b10SPawel Jakub Dawidek if (pid > 0) { 100832115b10SPawel Jakub Dawidek /* This is parent. */ 10095bdff860SPawel Jakub Dawidek /* Declare that we are receiver. */ 10105bdff860SPawel Jakub Dawidek proto_recv(res->hr_event, NULL, 0); 101132ecf620SPawel Jakub Dawidek proto_recv(res->hr_conn, NULL, 0); 1012da1783eaSPawel Jakub Dawidek /* Declare that we are sender. */ 1013da1783eaSPawel Jakub Dawidek proto_send(res->hr_ctrl, NULL, 0); 101432115b10SPawel Jakub Dawidek res->hr_workerpid = pid; 101532115b10SPawel Jakub Dawidek return; 101632115b10SPawel Jakub Dawidek } 1017ecc99c89SPawel Jakub Dawidek 10185b41e644SPawel Jakub Dawidek gres = res; 10196b66c350SMikolaj Golub res->output_status_aux = output_status_aux; 1020da1783eaSPawel Jakub Dawidek mode = pjdlog_mode_get(); 1021bc7a916aSMikolaj Golub debuglevel = pjdlog_debug_get(); 102232115b10SPawel Jakub Dawidek 10235bdff860SPawel Jakub Dawidek /* Declare that we are sender. */ 10245bdff860SPawel Jakub Dawidek proto_send(res->hr_event, NULL, 0); 102532ecf620SPawel Jakub Dawidek proto_send(res->hr_conn, NULL, 0); 1026da1783eaSPawel Jakub Dawidek /* Declare that we are receiver. */ 1027da1783eaSPawel Jakub Dawidek proto_recv(res->hr_ctrl, NULL, 0); 1028da1783eaSPawel Jakub Dawidek descriptors_cleanup(res); 1029da1783eaSPawel Jakub Dawidek 1030f463896eSPawel Jakub Dawidek descriptors_assert(res, mode); 1031f463896eSPawel Jakub Dawidek 1032da1783eaSPawel Jakub Dawidek pjdlog_init(mode); 1033bc7a916aSMikolaj Golub pjdlog_debug_set(debuglevel); 1034da1783eaSPawel Jakub Dawidek pjdlog_prefix_set("[%s] (%s) ", res->hr_name, role2str(res->hr_role)); 1035643080b7SPawel Jakub Dawidek setproctitle("%s (%s)", res->hr_name, role2str(res->hr_role)); 10365bdff860SPawel Jakub Dawidek 103732115b10SPawel Jakub Dawidek init_local(res); 103832115b10SPawel Jakub Dawidek init_ggate(res); 103932115b10SPawel Jakub Dawidek init_environment(res); 1040115f4e5cSPawel Jakub Dawidek 10410cddb12fSPawel Jakub Dawidek if (drop_privs(res) != 0) { 10426d7967deSPawel Jakub Dawidek cleanup(res); 10436d7967deSPawel Jakub Dawidek exit(EX_CONFIG); 10446d7967deSPawel Jakub Dawidek } 1045f4c96f94SPawel Jakub Dawidek pjdlog_info("Privileges successfully dropped."); 10466d7967deSPawel Jakub Dawidek 10478b70e6aeSPawel Jakub Dawidek /* 10484a88128bSPawel Jakub Dawidek * Create the guard thread first, so we can handle signals from the 10494b85a12fSUlrich Spörlein * very beginning. 10504a88128bSPawel Jakub Dawidek */ 10514a88128bSPawel Jakub Dawidek error = pthread_create(&td, NULL, guard_thread, res); 10522ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 10534a88128bSPawel Jakub Dawidek /* 10548b70e6aeSPawel Jakub Dawidek * Create the control thread before sending any event to the parent, 10558b70e6aeSPawel Jakub Dawidek * as we can deadlock when parent sends control request to worker, 10568b70e6aeSPawel Jakub Dawidek * but worker has no control thread started yet, so parent waits. 10578b70e6aeSPawel Jakub Dawidek * In the meantime worker sends an event to the parent, but parent 10588b70e6aeSPawel Jakub Dawidek * is unable to handle the event, because it waits for control 10598b70e6aeSPawel Jakub Dawidek * request response. 10608b70e6aeSPawel Jakub Dawidek */ 10618b70e6aeSPawel Jakub Dawidek error = pthread_create(&td, NULL, ctrl_thread, res); 10622ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 1063ac0401e3SPawel Jakub Dawidek if (real_remote(res)) { 1064ac0401e3SPawel Jakub Dawidek error = init_remote(res, NULL, NULL); 1065ac0401e3SPawel Jakub Dawidek if (error == 0) { 10668b70e6aeSPawel Jakub Dawidek sync_start(); 1067ac0401e3SPawel Jakub Dawidek } else if (error == EBUSY) { 1068ac0401e3SPawel Jakub Dawidek time_t start = time(NULL); 1069ac0401e3SPawel Jakub Dawidek 1070ac0401e3SPawel Jakub Dawidek pjdlog_warning("Waiting for remote node to become %s for %ds.", 1071ac0401e3SPawel Jakub Dawidek role2str(HAST_ROLE_SECONDARY), 1072ac0401e3SPawel Jakub Dawidek res->hr_timeout); 1073ac0401e3SPawel Jakub Dawidek for (;;) { 1074ac0401e3SPawel Jakub Dawidek sleep(1); 1075ac0401e3SPawel Jakub Dawidek error = init_remote(res, NULL, NULL); 1076ac0401e3SPawel Jakub Dawidek if (error != EBUSY) 1077ac0401e3SPawel Jakub Dawidek break; 1078ac0401e3SPawel Jakub Dawidek if (time(NULL) > start + res->hr_timeout) 1079ac0401e3SPawel Jakub Dawidek break; 1080ac0401e3SPawel Jakub Dawidek } 1081ac0401e3SPawel Jakub Dawidek if (error == EBUSY) { 1082ac0401e3SPawel Jakub Dawidek pjdlog_warning("Remote node is still %s, starting anyway.", 1083ac0401e3SPawel Jakub Dawidek role2str(HAST_ROLE_PRIMARY)); 1084ac0401e3SPawel Jakub Dawidek } 1085ac0401e3SPawel Jakub Dawidek } 1086ac0401e3SPawel Jakub Dawidek } 108732115b10SPawel Jakub Dawidek error = pthread_create(&td, NULL, ggate_recv_thread, res); 10882ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 108932115b10SPawel Jakub Dawidek error = pthread_create(&td, NULL, local_send_thread, res); 10902ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 109132115b10SPawel Jakub Dawidek error = pthread_create(&td, NULL, remote_send_thread, res); 10922ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 109332115b10SPawel Jakub Dawidek error = pthread_create(&td, NULL, remote_recv_thread, res); 10942ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 109532115b10SPawel Jakub Dawidek error = pthread_create(&td, NULL, ggate_send_thread, res); 10962ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(error == 0); 1097ac0401e3SPawel Jakub Dawidek fullystarted = true; 10984a88128bSPawel Jakub Dawidek (void)sync_thread(res); 109932115b10SPawel Jakub Dawidek } 110032115b10SPawel Jakub Dawidek 110132115b10SPawel Jakub Dawidek static void 1102d6e636c9SPawel Jakub Dawidek reqlog(int loglevel, int debuglevel, struct g_gate_ctl_io *ggio, 1103d6e636c9SPawel Jakub Dawidek const char *fmt, ...) 110432115b10SPawel Jakub Dawidek { 110532115b10SPawel Jakub Dawidek char msg[1024]; 110632115b10SPawel Jakub Dawidek va_list ap; 110732115b10SPawel Jakub Dawidek 110832115b10SPawel Jakub Dawidek va_start(ap, fmt); 11094c71d263SPawel Jakub Dawidek (void)vsnprintf(msg, sizeof(msg), fmt, ap); 111032115b10SPawel Jakub Dawidek va_end(ap); 111132115b10SPawel Jakub Dawidek switch (ggio->gctl_cmd) { 111232115b10SPawel Jakub Dawidek case BIO_READ: 11134c71d263SPawel Jakub Dawidek (void)snprlcat(msg, sizeof(msg), "READ(%ju, %ju).", 1114d6e636c9SPawel Jakub Dawidek (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length); 111532115b10SPawel Jakub Dawidek break; 111632115b10SPawel Jakub Dawidek case BIO_DELETE: 11174c71d263SPawel Jakub Dawidek (void)snprlcat(msg, sizeof(msg), "DELETE(%ju, %ju).", 1118d6e636c9SPawel Jakub Dawidek (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length); 111932115b10SPawel Jakub Dawidek break; 112032115b10SPawel Jakub Dawidek case BIO_FLUSH: 11214c71d263SPawel Jakub Dawidek (void)snprlcat(msg, sizeof(msg), "FLUSH."); 112232115b10SPawel Jakub Dawidek break; 112332115b10SPawel Jakub Dawidek case BIO_WRITE: 11244c71d263SPawel Jakub Dawidek (void)snprlcat(msg, sizeof(msg), "WRITE(%ju, %ju).", 1125d6e636c9SPawel Jakub Dawidek (uintmax_t)ggio->gctl_offset, (uintmax_t)ggio->gctl_length); 112632115b10SPawel Jakub Dawidek break; 112732115b10SPawel Jakub Dawidek default: 11284c71d263SPawel Jakub Dawidek (void)snprlcat(msg, sizeof(msg), "UNKNOWN(%u).", 11294c71d263SPawel Jakub Dawidek (unsigned int)ggio->gctl_cmd); 113032115b10SPawel Jakub Dawidek break; 113132115b10SPawel Jakub Dawidek } 113232115b10SPawel Jakub Dawidek pjdlog_common(loglevel, debuglevel, -1, "%s", msg); 113332115b10SPawel Jakub Dawidek } 113432115b10SPawel Jakub Dawidek 113532115b10SPawel Jakub Dawidek static void 113632115b10SPawel Jakub Dawidek remote_close(struct hast_resource *res, int ncomp) 113732115b10SPawel Jakub Dawidek { 113832115b10SPawel Jakub Dawidek 113932115b10SPawel Jakub Dawidek rw_wlock(&hio_remote_lock[ncomp]); 114032115b10SPawel Jakub Dawidek /* 11415a58d22aSPawel Jakub Dawidek * Check for a race between dropping rlock and acquiring wlock - 114232115b10SPawel Jakub Dawidek * another thread can close connection in-between. 114332115b10SPawel Jakub Dawidek */ 114432115b10SPawel Jakub Dawidek if (!ISCONNECTED(res, ncomp)) { 11452ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein == NULL); 11462ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout == NULL); 114732115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 114832115b10SPawel Jakub Dawidek return; 114932115b10SPawel Jakub Dawidek } 115032115b10SPawel Jakub Dawidek 11512ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein != NULL); 11522ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout != NULL); 115332115b10SPawel Jakub Dawidek 11548f8c798cSPawel Jakub Dawidek pjdlog_debug(2, "Closing incoming connection to %s.", 115532115b10SPawel Jakub Dawidek res->hr_remoteaddr); 115632115b10SPawel Jakub Dawidek proto_close(res->hr_remotein); 115732115b10SPawel Jakub Dawidek res->hr_remotein = NULL; 11588f8c798cSPawel Jakub Dawidek pjdlog_debug(2, "Closing outgoing connection to %s.", 115932115b10SPawel Jakub Dawidek res->hr_remoteaddr); 116032115b10SPawel Jakub Dawidek proto_close(res->hr_remoteout); 116132115b10SPawel Jakub Dawidek res->hr_remoteout = NULL; 116232115b10SPawel Jakub Dawidek 116332115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 116432115b10SPawel Jakub Dawidek 11658f8c798cSPawel Jakub Dawidek pjdlog_warning("Disconnected from %s.", res->hr_remoteaddr); 11668f8c798cSPawel Jakub Dawidek 116732115b10SPawel Jakub Dawidek /* 116832115b10SPawel Jakub Dawidek * Stop synchronization if in-progress. 116932115b10SPawel Jakub Dawidek */ 117055ce1e7cSPawel Jakub Dawidek sync_stop(); 11715b41e644SPawel Jakub Dawidek 11725bdff860SPawel Jakub Dawidek event_send(res, EVENT_DISCONNECT); 1173f7fe83f9SPawel Jakub Dawidek } 117432115b10SPawel Jakub Dawidek 117532115b10SPawel Jakub Dawidek /* 117607ebc362SPawel Jakub Dawidek * Acknowledge write completion to the kernel, but don't update activemap yet. 117707ebc362SPawel Jakub Dawidek */ 117807ebc362SPawel Jakub Dawidek static void 117907ebc362SPawel Jakub Dawidek write_complete(struct hast_resource *res, struct hio *hio) 118007ebc362SPawel Jakub Dawidek { 118107ebc362SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 118207ebc362SPawel Jakub Dawidek unsigned int ncomp; 118307ebc362SPawel Jakub Dawidek 118407ebc362SPawel Jakub Dawidek PJDLOG_ASSERT(!hio->hio_done); 118507ebc362SPawel Jakub Dawidek 118607ebc362SPawel Jakub Dawidek ggio = &hio->hio_ggio; 118707ebc362SPawel Jakub Dawidek PJDLOG_ASSERT(ggio->gctl_cmd == BIO_WRITE); 118807ebc362SPawel Jakub Dawidek 118907ebc362SPawel Jakub Dawidek /* 119007ebc362SPawel Jakub Dawidek * Bump local count if this is first write after 119107ebc362SPawel Jakub Dawidek * connection failure with remote node. 119207ebc362SPawel Jakub Dawidek */ 119307ebc362SPawel Jakub Dawidek ncomp = 1; 119407ebc362SPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 119507ebc362SPawel Jakub Dawidek if (!ISCONNECTED(res, ncomp)) { 119607ebc362SPawel Jakub Dawidek mtx_lock(&metadata_lock); 119707ebc362SPawel Jakub Dawidek if (res->hr_primary_localcnt == res->hr_secondary_remotecnt) { 119807ebc362SPawel Jakub Dawidek res->hr_primary_localcnt++; 119907ebc362SPawel Jakub Dawidek pjdlog_debug(1, "Increasing localcnt to %ju.", 120007ebc362SPawel Jakub Dawidek (uintmax_t)res->hr_primary_localcnt); 120107ebc362SPawel Jakub Dawidek (void)metadata_write(res); 120207ebc362SPawel Jakub Dawidek } 120307ebc362SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 120407ebc362SPawel Jakub Dawidek } 120507ebc362SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 12062b1b224dSPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1) 120707ebc362SPawel Jakub Dawidek primary_exit(EX_OSERR, "G_GATE_CMD_DONE failed"); 120807ebc362SPawel Jakub Dawidek hio->hio_done = true; 120907ebc362SPawel Jakub Dawidek } 121007ebc362SPawel Jakub Dawidek 121107ebc362SPawel Jakub Dawidek /* 121232115b10SPawel Jakub Dawidek * Thread receives ggate I/O requests from the kernel and passes them to 121332115b10SPawel Jakub Dawidek * appropriate threads: 121432115b10SPawel Jakub Dawidek * WRITE - always goes to both local_send and remote_send threads 121532115b10SPawel Jakub Dawidek * READ (when the block is up-to-date on local component) - 121632115b10SPawel Jakub Dawidek * only local_send thread 121732115b10SPawel Jakub Dawidek * READ (when the block isn't up-to-date on local component) - 121832115b10SPawel Jakub Dawidek * only remote_send thread 121932115b10SPawel Jakub Dawidek * DELETE - always goes to both local_send and remote_send threads 122032115b10SPawel Jakub Dawidek * FLUSH - always goes to both local_send and remote_send threads 122132115b10SPawel Jakub Dawidek */ 122232115b10SPawel Jakub Dawidek static void * 122332115b10SPawel Jakub Dawidek ggate_recv_thread(void *arg) 122432115b10SPawel Jakub Dawidek { 122532115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 122632115b10SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 122732115b10SPawel Jakub Dawidek struct hio *hio; 122832115b10SPawel Jakub Dawidek unsigned int ii, ncomp, ncomps; 122932115b10SPawel Jakub Dawidek int error; 123032115b10SPawel Jakub Dawidek 123132115b10SPawel Jakub Dawidek for (;;) { 123232115b10SPawel Jakub Dawidek pjdlog_debug(2, "ggate_recv: Taking free request."); 123332115b10SPawel Jakub Dawidek QUEUE_TAKE2(hio, free); 123432115b10SPawel Jakub Dawidek pjdlog_debug(2, "ggate_recv: (%p) Got free request.", hio); 123532115b10SPawel Jakub Dawidek ggio = &hio->hio_ggio; 123632115b10SPawel Jakub Dawidek ggio->gctl_unit = res->hr_ggateunit; 123732115b10SPawel Jakub Dawidek ggio->gctl_length = MAXPHYS; 123832115b10SPawel Jakub Dawidek ggio->gctl_error = 0; 123907ebc362SPawel Jakub Dawidek hio->hio_done = false; 124007ebc362SPawel Jakub Dawidek hio->hio_replication = res->hr_replication; 124132115b10SPawel Jakub Dawidek pjdlog_debug(2, 124232115b10SPawel Jakub Dawidek "ggate_recv: (%p) Waiting for request from the kernel.", 124332115b10SPawel Jakub Dawidek hio); 12442b1b224dSPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_START, ggio) == -1) { 124532115b10SPawel Jakub Dawidek if (sigexit_received) 124632115b10SPawel Jakub Dawidek pthread_exit(NULL); 124732115b10SPawel Jakub Dawidek primary_exit(EX_OSERR, "G_GATE_CMD_START failed"); 124832115b10SPawel Jakub Dawidek } 124932115b10SPawel Jakub Dawidek error = ggio->gctl_error; 125032115b10SPawel Jakub Dawidek switch (error) { 125132115b10SPawel Jakub Dawidek case 0: 125232115b10SPawel Jakub Dawidek break; 125332115b10SPawel Jakub Dawidek case ECANCELED: 125432115b10SPawel Jakub Dawidek /* Exit gracefully. */ 125532115b10SPawel Jakub Dawidek if (!sigexit_received) { 125632115b10SPawel Jakub Dawidek pjdlog_debug(2, 125732115b10SPawel Jakub Dawidek "ggate_recv: (%p) Received cancel from the kernel.", 125832115b10SPawel Jakub Dawidek hio); 125932115b10SPawel Jakub Dawidek pjdlog_info("Received cancel from the kernel, exiting."); 126032115b10SPawel Jakub Dawidek } 126132115b10SPawel Jakub Dawidek pthread_exit(NULL); 126232115b10SPawel Jakub Dawidek case ENOMEM: 126332115b10SPawel Jakub Dawidek /* 126432115b10SPawel Jakub Dawidek * Buffer too small? Impossible, we allocate MAXPHYS 126532115b10SPawel Jakub Dawidek * bytes - request can't be bigger than that. 126632115b10SPawel Jakub Dawidek */ 126732115b10SPawel Jakub Dawidek /* FALLTHROUGH */ 126832115b10SPawel Jakub Dawidek case ENXIO: 126932115b10SPawel Jakub Dawidek default: 127032115b10SPawel Jakub Dawidek primary_exitx(EX_OSERR, "G_GATE_CMD_START failed: %s.", 127132115b10SPawel Jakub Dawidek strerror(error)); 127232115b10SPawel Jakub Dawidek } 127307ebc362SPawel Jakub Dawidek 127407ebc362SPawel Jakub Dawidek ncomp = 0; 127507ebc362SPawel Jakub Dawidek ncomps = HAST_NCOMPONENTS; 127607ebc362SPawel Jakub Dawidek 127732115b10SPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) 127832115b10SPawel Jakub Dawidek hio->hio_errors[ii] = EINVAL; 127932115b10SPawel Jakub Dawidek reqlog(LOG_DEBUG, 2, ggio, 128032115b10SPawel Jakub Dawidek "ggate_recv: (%p) Request received from the kernel: ", 128132115b10SPawel Jakub Dawidek hio); 128207ebc362SPawel Jakub Dawidek 128332115b10SPawel Jakub Dawidek /* 128432115b10SPawel Jakub Dawidek * Inform all components about new write request. 128532115b10SPawel Jakub Dawidek * For read request prefer local component unless the given 128632115b10SPawel Jakub Dawidek * range is out-of-date, then use remote component. 128732115b10SPawel Jakub Dawidek */ 128832115b10SPawel Jakub Dawidek switch (ggio->gctl_cmd) { 128932115b10SPawel Jakub Dawidek case BIO_READ: 12903db86c39SPawel Jakub Dawidek res->hr_stat_read++; 129107ebc362SPawel Jakub Dawidek ncomps = 1; 129232115b10SPawel Jakub Dawidek mtx_lock(&metadata_lock); 129332115b10SPawel Jakub Dawidek if (res->hr_syncsrc == HAST_SYNCSRC_UNDEF || 129432115b10SPawel Jakub Dawidek res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { 129532115b10SPawel Jakub Dawidek /* 129632115b10SPawel Jakub Dawidek * This range is up-to-date on local component, 129732115b10SPawel Jakub Dawidek * so handle request locally. 129832115b10SPawel Jakub Dawidek */ 129932115b10SPawel Jakub Dawidek /* Local component is 0 for now. */ 130032115b10SPawel Jakub Dawidek ncomp = 0; 130132115b10SPawel Jakub Dawidek } else /* if (res->hr_syncsrc == 130232115b10SPawel Jakub Dawidek HAST_SYNCSRC_SECONDARY) */ { 13032ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_syncsrc == 130432115b10SPawel Jakub Dawidek HAST_SYNCSRC_SECONDARY); 130532115b10SPawel Jakub Dawidek /* 130632115b10SPawel Jakub Dawidek * This range is out-of-date on local component, 130732115b10SPawel Jakub Dawidek * so send request to the remote node. 130832115b10SPawel Jakub Dawidek */ 130932115b10SPawel Jakub Dawidek /* Remote component is 1 for now. */ 131032115b10SPawel Jakub Dawidek ncomp = 1; 131132115b10SPawel Jakub Dawidek } 131232115b10SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 131332115b10SPawel Jakub Dawidek break; 131432115b10SPawel Jakub Dawidek case BIO_WRITE: 13153db86c39SPawel Jakub Dawidek res->hr_stat_write++; 13168a34134aSPawel Jakub Dawidek if (res->hr_resuid == 0 && 13178a34134aSPawel Jakub Dawidek res->hr_primary_localcnt == 0) { 13188a34134aSPawel Jakub Dawidek /* This is first write. */ 13199446b453SPawel Jakub Dawidek res->hr_primary_localcnt = 1; 1320ce837469SPawel Jakub Dawidek } 132132115b10SPawel Jakub Dawidek for (;;) { 132232115b10SPawel Jakub Dawidek mtx_lock(&range_lock); 132332115b10SPawel Jakub Dawidek if (rangelock_islocked(range_sync, 132432115b10SPawel Jakub Dawidek ggio->gctl_offset, ggio->gctl_length)) { 132532115b10SPawel Jakub Dawidek pjdlog_debug(2, 132632115b10SPawel Jakub Dawidek "regular: Range offset=%jd length=%zu locked.", 132732115b10SPawel Jakub Dawidek (intmax_t)ggio->gctl_offset, 132832115b10SPawel Jakub Dawidek (size_t)ggio->gctl_length); 132932115b10SPawel Jakub Dawidek range_regular_wait = true; 133032115b10SPawel Jakub Dawidek cv_wait(&range_regular_cond, &range_lock); 133132115b10SPawel Jakub Dawidek range_regular_wait = false; 133232115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 133332115b10SPawel Jakub Dawidek continue; 133432115b10SPawel Jakub Dawidek } 133532115b10SPawel Jakub Dawidek if (rangelock_add(range_regular, 13362b1b224dSPawel Jakub Dawidek ggio->gctl_offset, ggio->gctl_length) == -1) { 133732115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 133832115b10SPawel Jakub Dawidek pjdlog_debug(2, 133932115b10SPawel Jakub Dawidek "regular: Range offset=%jd length=%zu is already locked, waiting.", 134032115b10SPawel Jakub Dawidek (intmax_t)ggio->gctl_offset, 134132115b10SPawel Jakub Dawidek (size_t)ggio->gctl_length); 134232115b10SPawel Jakub Dawidek sleep(1); 134332115b10SPawel Jakub Dawidek continue; 134432115b10SPawel Jakub Dawidek } 134532115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 134632115b10SPawel Jakub Dawidek break; 134732115b10SPawel Jakub Dawidek } 134832115b10SPawel Jakub Dawidek mtx_lock(&res->hr_amp_lock); 134932115b10SPawel Jakub Dawidek if (activemap_write_start(res->hr_amp, 135032115b10SPawel Jakub Dawidek ggio->gctl_offset, ggio->gctl_length)) { 13513db86c39SPawel Jakub Dawidek res->hr_stat_activemap_update++; 135232115b10SPawel Jakub Dawidek (void)hast_activemap_flush(res); 1353a818a4ffSMikolaj Golub } else { 135432115b10SPawel Jakub Dawidek mtx_unlock(&res->hr_amp_lock); 1355a818a4ffSMikolaj Golub } 13568f04423fSMikolaj Golub if (ISMEMSYNC(hio)) { 13575d69ed53SMikolaj Golub hio->hio_memsyncacked = false; 13585d69ed53SMikolaj Golub refcnt_init(&hio->hio_writecount, ncomps); 13595d69ed53SMikolaj Golub } 136007ebc362SPawel Jakub Dawidek break; 13613db86c39SPawel Jakub Dawidek case BIO_DELETE: 13623db86c39SPawel Jakub Dawidek res->hr_stat_delete++; 13633db86c39SPawel Jakub Dawidek break; 13643db86c39SPawel Jakub Dawidek case BIO_FLUSH: 13653db86c39SPawel Jakub Dawidek res->hr_stat_flush++; 13663db86c39SPawel Jakub Dawidek break; 13673db86c39SPawel Jakub Dawidek } 136832115b10SPawel Jakub Dawidek pjdlog_debug(2, 136907ebc362SPawel Jakub Dawidek "ggate_recv: (%p) Moving request to the send queues.", hio); 13706e87c151SEd Schouten refcnt_init(&hio->hio_countdown, ncomps); 1371d6e636c9SPawel Jakub Dawidek for (ii = ncomp; ii < ncomps; ii++) 137232115b10SPawel Jakub Dawidek QUEUE_INSERT1(hio, send, ii); 137332115b10SPawel Jakub Dawidek } 137432115b10SPawel Jakub Dawidek /* NOTREACHED */ 137532115b10SPawel Jakub Dawidek return (NULL); 137632115b10SPawel Jakub Dawidek } 137732115b10SPawel Jakub Dawidek 137832115b10SPawel Jakub Dawidek /* 137932115b10SPawel Jakub Dawidek * Thread reads from or writes to local component. 138032115b10SPawel Jakub Dawidek * If local read fails, it redirects it to remote_send thread. 138132115b10SPawel Jakub Dawidek */ 138232115b10SPawel Jakub Dawidek static void * 138332115b10SPawel Jakub Dawidek local_send_thread(void *arg) 138432115b10SPawel Jakub Dawidek { 138532115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 138632115b10SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 138732115b10SPawel Jakub Dawidek struct hio *hio; 138832115b10SPawel Jakub Dawidek unsigned int ncomp, rncomp; 138932115b10SPawel Jakub Dawidek ssize_t ret; 139032115b10SPawel Jakub Dawidek 139132115b10SPawel Jakub Dawidek /* Local component is 0 for now. */ 139232115b10SPawel Jakub Dawidek ncomp = 0; 139332115b10SPawel Jakub Dawidek /* Remote component is 1 for now. */ 139432115b10SPawel Jakub Dawidek rncomp = 1; 139532115b10SPawel Jakub Dawidek 139632115b10SPawel Jakub Dawidek for (;;) { 139732115b10SPawel Jakub Dawidek pjdlog_debug(2, "local_send: Taking request."); 1398448efa94SPawel Jakub Dawidek QUEUE_TAKE1(hio, send, ncomp, 0); 139932115b10SPawel Jakub Dawidek pjdlog_debug(2, "local_send: (%p) Got request.", hio); 140032115b10SPawel Jakub Dawidek ggio = &hio->hio_ggio; 140132115b10SPawel Jakub Dawidek switch (ggio->gctl_cmd) { 140232115b10SPawel Jakub Dawidek case BIO_READ: 140332115b10SPawel Jakub Dawidek ret = pread(res->hr_localfd, ggio->gctl_data, 140432115b10SPawel Jakub Dawidek ggio->gctl_length, 140532115b10SPawel Jakub Dawidek ggio->gctl_offset + res->hr_localoff); 140632115b10SPawel Jakub Dawidek if (ret == ggio->gctl_length) 140732115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = 0; 1408a01a750fSMikolaj Golub else if (!ISSYNCREQ(hio)) { 140932115b10SPawel Jakub Dawidek /* 141032115b10SPawel Jakub Dawidek * If READ failed, try to read from remote node. 141132115b10SPawel Jakub Dawidek */ 14122b1b224dSPawel Jakub Dawidek if (ret == -1) { 1413cd7b7ee5SPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1414cd7b7ee5SPawel Jakub Dawidek "Local request failed (%s), trying remote node. ", 1415cd7b7ee5SPawel Jakub Dawidek strerror(errno)); 1416cd7b7ee5SPawel Jakub Dawidek } else if (ret != ggio->gctl_length) { 1417cd7b7ee5SPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1418cd7b7ee5SPawel Jakub Dawidek "Local request failed (%zd != %jd), trying remote node. ", 1419fba1bf5aSPawel Jakub Dawidek ret, (intmax_t)ggio->gctl_length); 1420cd7b7ee5SPawel Jakub Dawidek } 142132115b10SPawel Jakub Dawidek QUEUE_INSERT1(hio, send, rncomp); 142232115b10SPawel Jakub Dawidek continue; 142332115b10SPawel Jakub Dawidek } 142432115b10SPawel Jakub Dawidek break; 142532115b10SPawel Jakub Dawidek case BIO_WRITE: 142632115b10SPawel Jakub Dawidek ret = pwrite(res->hr_localfd, ggio->gctl_data, 142732115b10SPawel Jakub Dawidek ggio->gctl_length, 142832115b10SPawel Jakub Dawidek ggio->gctl_offset + res->hr_localoff); 14292b1b224dSPawel Jakub Dawidek if (ret == -1) { 143032115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = errno; 1431cd7b7ee5SPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1432cd7b7ee5SPawel Jakub Dawidek "Local request failed (%s): ", 1433cd7b7ee5SPawel Jakub Dawidek strerror(errno)); 1434cd7b7ee5SPawel Jakub Dawidek } else if (ret != ggio->gctl_length) { 143532115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = EIO; 1436cd7b7ee5SPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1437cd7b7ee5SPawel Jakub Dawidek "Local request failed (%zd != %jd): ", 1438fba1bf5aSPawel Jakub Dawidek ret, (intmax_t)ggio->gctl_length); 1439cd7b7ee5SPawel Jakub Dawidek } else { 144032115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = 0; 14418f04423fSMikolaj Golub if (ISASYNC(hio)) { 144207ebc362SPawel Jakub Dawidek ggio->gctl_error = 0; 144307ebc362SPawel Jakub Dawidek write_complete(res, hio); 144407ebc362SPawel Jakub Dawidek } 1445cd7b7ee5SPawel Jakub Dawidek } 144632115b10SPawel Jakub Dawidek break; 144732115b10SPawel Jakub Dawidek case BIO_DELETE: 144832115b10SPawel Jakub Dawidek ret = g_delete(res->hr_localfd, 144932115b10SPawel Jakub Dawidek ggio->gctl_offset + res->hr_localoff, 145032115b10SPawel Jakub Dawidek ggio->gctl_length); 14512b1b224dSPawel Jakub Dawidek if (ret == -1) { 145232115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = errno; 1453cd7b7ee5SPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1454cd7b7ee5SPawel Jakub Dawidek "Local request failed (%s): ", 1455cd7b7ee5SPawel Jakub Dawidek strerror(errno)); 1456cd7b7ee5SPawel Jakub Dawidek } else { 145732115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = 0; 1458cd7b7ee5SPawel Jakub Dawidek } 145932115b10SPawel Jakub Dawidek break; 146032115b10SPawel Jakub Dawidek case BIO_FLUSH: 146112daf727SPawel Jakub Dawidek if (!res->hr_localflush) { 146212daf727SPawel Jakub Dawidek ret = -1; 146312daf727SPawel Jakub Dawidek errno = EOPNOTSUPP; 146412daf727SPawel Jakub Dawidek break; 146512daf727SPawel Jakub Dawidek } 146632115b10SPawel Jakub Dawidek ret = g_flush(res->hr_localfd); 14672b1b224dSPawel Jakub Dawidek if (ret == -1) { 146812daf727SPawel Jakub Dawidek if (errno == EOPNOTSUPP) 146912daf727SPawel Jakub Dawidek res->hr_localflush = false; 147032115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = errno; 1471cd7b7ee5SPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1472cd7b7ee5SPawel Jakub Dawidek "Local request failed (%s): ", 1473cd7b7ee5SPawel Jakub Dawidek strerror(errno)); 1474cd7b7ee5SPawel Jakub Dawidek } else { 147532115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = 0; 1476cd7b7ee5SPawel Jakub Dawidek } 147732115b10SPawel Jakub Dawidek break; 147832115b10SPawel Jakub Dawidek } 14795d69ed53SMikolaj Golub if (ISMEMSYNCWRITE(hio)) { 14805d69ed53SMikolaj Golub if (refcnt_release(&hio->hio_writecount) == 0) { 14815d69ed53SMikolaj Golub write_complete(res, hio); 14825d69ed53SMikolaj Golub } 14835d69ed53SMikolaj Golub } 1484d6e636c9SPawel Jakub Dawidek if (refcnt_release(&hio->hio_countdown) > 0) 148543b8675bSPawel Jakub Dawidek continue; 148632115b10SPawel Jakub Dawidek if (ISSYNCREQ(hio)) { 148732115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 148832115b10SPawel Jakub Dawidek SYNCREQDONE(hio); 148932115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 149032115b10SPawel Jakub Dawidek cv_signal(&sync_cond); 149132115b10SPawel Jakub Dawidek } else { 149232115b10SPawel Jakub Dawidek pjdlog_debug(2, 149332115b10SPawel Jakub Dawidek "local_send: (%p) Moving request to the done queue.", 149432115b10SPawel Jakub Dawidek hio); 149532115b10SPawel Jakub Dawidek QUEUE_INSERT2(hio, done); 149632115b10SPawel Jakub Dawidek } 149732115b10SPawel Jakub Dawidek } 149832115b10SPawel Jakub Dawidek /* NOTREACHED */ 149932115b10SPawel Jakub Dawidek return (NULL); 150032115b10SPawel Jakub Dawidek } 150132115b10SPawel Jakub Dawidek 1502448efa94SPawel Jakub Dawidek static void 1503448efa94SPawel Jakub Dawidek keepalive_send(struct hast_resource *res, unsigned int ncomp) 1504448efa94SPawel Jakub Dawidek { 1505448efa94SPawel Jakub Dawidek struct nv *nv; 1506448efa94SPawel Jakub Dawidek 150721e7bc5eSPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 150821e7bc5eSPawel Jakub Dawidek 150921e7bc5eSPawel Jakub Dawidek if (!ISCONNECTED(res, ncomp)) { 151021e7bc5eSPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 1511448efa94SPawel Jakub Dawidek return; 151221e7bc5eSPawel Jakub Dawidek } 1513448efa94SPawel Jakub Dawidek 15142ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein != NULL); 15152ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout != NULL); 1516448efa94SPawel Jakub Dawidek 1517448efa94SPawel Jakub Dawidek nv = nv_alloc(); 1518448efa94SPawel Jakub Dawidek nv_add_uint8(nv, HIO_KEEPALIVE, "cmd"); 1519448efa94SPawel Jakub Dawidek if (nv_error(nv) != 0) { 152021e7bc5eSPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 1521448efa94SPawel Jakub Dawidek nv_free(nv); 1522448efa94SPawel Jakub Dawidek pjdlog_debug(1, 1523448efa94SPawel Jakub Dawidek "keepalive_send: Unable to prepare header to send."); 1524448efa94SPawel Jakub Dawidek return; 1525448efa94SPawel Jakub Dawidek } 15262b1b224dSPawel Jakub Dawidek if (hast_proto_send(res, res->hr_remoteout, nv, NULL, 0) == -1) { 152721e7bc5eSPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 1528448efa94SPawel Jakub Dawidek pjdlog_common(LOG_DEBUG, 1, errno, 1529448efa94SPawel Jakub Dawidek "keepalive_send: Unable to send request"); 1530448efa94SPawel Jakub Dawidek nv_free(nv); 1531448efa94SPawel Jakub Dawidek remote_close(res, ncomp); 1532448efa94SPawel Jakub Dawidek return; 1533448efa94SPawel Jakub Dawidek } 153421e7bc5eSPawel Jakub Dawidek 153521e7bc5eSPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 1536448efa94SPawel Jakub Dawidek nv_free(nv); 1537448efa94SPawel Jakub Dawidek pjdlog_debug(2, "keepalive_send: Request sent."); 1538448efa94SPawel Jakub Dawidek } 1539448efa94SPawel Jakub Dawidek 154032115b10SPawel Jakub Dawidek /* 154132115b10SPawel Jakub Dawidek * Thread sends request to secondary node. 154232115b10SPawel Jakub Dawidek */ 154332115b10SPawel Jakub Dawidek static void * 154432115b10SPawel Jakub Dawidek remote_send_thread(void *arg) 154532115b10SPawel Jakub Dawidek { 154632115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 154732115b10SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 1548448efa94SPawel Jakub Dawidek time_t lastcheck, now; 154932115b10SPawel Jakub Dawidek struct hio *hio; 155032115b10SPawel Jakub Dawidek struct nv *nv; 155132115b10SPawel Jakub Dawidek unsigned int ncomp; 155232115b10SPawel Jakub Dawidek bool wakeup; 155332115b10SPawel Jakub Dawidek uint64_t offset, length; 155432115b10SPawel Jakub Dawidek uint8_t cmd; 155532115b10SPawel Jakub Dawidek void *data; 155632115b10SPawel Jakub Dawidek 155732115b10SPawel Jakub Dawidek /* Remote component is 1 for now. */ 155832115b10SPawel Jakub Dawidek ncomp = 1; 1559448efa94SPawel Jakub Dawidek lastcheck = time(NULL); 156032115b10SPawel Jakub Dawidek 156132115b10SPawel Jakub Dawidek for (;;) { 156232115b10SPawel Jakub Dawidek pjdlog_debug(2, "remote_send: Taking request."); 15638d7dcf14SMikolaj Golub QUEUE_TAKE1(hio, send, ncomp, HAST_KEEPALIVE); 1564448efa94SPawel Jakub Dawidek if (hio == NULL) { 1565448efa94SPawel Jakub Dawidek now = time(NULL); 15668d7dcf14SMikolaj Golub if (lastcheck + HAST_KEEPALIVE <= now) { 1567448efa94SPawel Jakub Dawidek keepalive_send(res, ncomp); 1568448efa94SPawel Jakub Dawidek lastcheck = now; 1569448efa94SPawel Jakub Dawidek } 1570448efa94SPawel Jakub Dawidek continue; 1571448efa94SPawel Jakub Dawidek } 157232115b10SPawel Jakub Dawidek pjdlog_debug(2, "remote_send: (%p) Got request.", hio); 157332115b10SPawel Jakub Dawidek ggio = &hio->hio_ggio; 157432115b10SPawel Jakub Dawidek switch (ggio->gctl_cmd) { 157532115b10SPawel Jakub Dawidek case BIO_READ: 157632115b10SPawel Jakub Dawidek cmd = HIO_READ; 157732115b10SPawel Jakub Dawidek data = NULL; 157832115b10SPawel Jakub Dawidek offset = ggio->gctl_offset; 157932115b10SPawel Jakub Dawidek length = ggio->gctl_length; 158032115b10SPawel Jakub Dawidek break; 158132115b10SPawel Jakub Dawidek case BIO_WRITE: 158232115b10SPawel Jakub Dawidek cmd = HIO_WRITE; 158332115b10SPawel Jakub Dawidek data = ggio->gctl_data; 158432115b10SPawel Jakub Dawidek offset = ggio->gctl_offset; 158532115b10SPawel Jakub Dawidek length = ggio->gctl_length; 158632115b10SPawel Jakub Dawidek break; 158732115b10SPawel Jakub Dawidek case BIO_DELETE: 158832115b10SPawel Jakub Dawidek cmd = HIO_DELETE; 158932115b10SPawel Jakub Dawidek data = NULL; 159032115b10SPawel Jakub Dawidek offset = ggio->gctl_offset; 159132115b10SPawel Jakub Dawidek length = ggio->gctl_length; 159232115b10SPawel Jakub Dawidek break; 159332115b10SPawel Jakub Dawidek case BIO_FLUSH: 159432115b10SPawel Jakub Dawidek cmd = HIO_FLUSH; 159532115b10SPawel Jakub Dawidek data = NULL; 159632115b10SPawel Jakub Dawidek offset = 0; 159732115b10SPawel Jakub Dawidek length = 0; 159832115b10SPawel Jakub Dawidek break; 159932115b10SPawel Jakub Dawidek default: 160009c2e843SPawel Jakub Dawidek PJDLOG_ABORT("invalid condition"); 160132115b10SPawel Jakub Dawidek } 160232115b10SPawel Jakub Dawidek nv = nv_alloc(); 160332115b10SPawel Jakub Dawidek nv_add_uint8(nv, cmd, "cmd"); 160432115b10SPawel Jakub Dawidek nv_add_uint64(nv, (uint64_t)ggio->gctl_seq, "seq"); 160532115b10SPawel Jakub Dawidek nv_add_uint64(nv, offset, "offset"); 160632115b10SPawel Jakub Dawidek nv_add_uint64(nv, length, "length"); 16075d69ed53SMikolaj Golub if (ISMEMSYNCWRITE(hio)) 1608d6e636c9SPawel Jakub Dawidek nv_add_uint8(nv, 1, "memsync"); 160932115b10SPawel Jakub Dawidek if (nv_error(nv) != 0) { 161032115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = nv_error(nv); 161132115b10SPawel Jakub Dawidek pjdlog_debug(2, 161232115b10SPawel Jakub Dawidek "remote_send: (%p) Unable to prepare header to send.", 161332115b10SPawel Jakub Dawidek hio); 161432115b10SPawel Jakub Dawidek reqlog(LOG_ERR, 0, ggio, 161532115b10SPawel Jakub Dawidek "Unable to prepare header to send (%s): ", 161632115b10SPawel Jakub Dawidek strerror(nv_error(nv))); 161732115b10SPawel Jakub Dawidek /* Move failed request immediately to the done queue. */ 161832115b10SPawel Jakub Dawidek goto done_queue; 161932115b10SPawel Jakub Dawidek } 162032115b10SPawel Jakub Dawidek /* 162132115b10SPawel Jakub Dawidek * Protect connection from disappearing. 162232115b10SPawel Jakub Dawidek */ 162332115b10SPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 162432115b10SPawel Jakub Dawidek if (!ISCONNECTED(res, ncomp)) { 162532115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 162632115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = ENOTCONN; 162732115b10SPawel Jakub Dawidek goto done_queue; 162832115b10SPawel Jakub Dawidek } 162932115b10SPawel Jakub Dawidek /* 163032115b10SPawel Jakub Dawidek * Move the request to recv queue before sending it, because 163132115b10SPawel Jakub Dawidek * in different order we can get reply before we move request 163232115b10SPawel Jakub Dawidek * to recv queue. 163332115b10SPawel Jakub Dawidek */ 16341212a85cSPawel Jakub Dawidek pjdlog_debug(2, 16351212a85cSPawel Jakub Dawidek "remote_send: (%p) Moving request to the recv queue.", 16361212a85cSPawel Jakub Dawidek hio); 163732115b10SPawel Jakub Dawidek mtx_lock(&hio_recv_list_lock[ncomp]); 163832115b10SPawel Jakub Dawidek wakeup = TAILQ_EMPTY(&hio_recv_list[ncomp]); 163932115b10SPawel Jakub Dawidek TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], hio, hio_next[ncomp]); 16406b66c350SMikolaj Golub hio_recv_list_size[ncomp]++; 164132115b10SPawel Jakub Dawidek mtx_unlock(&hio_recv_list_lock[ncomp]); 164232115b10SPawel Jakub Dawidek if (hast_proto_send(res, res->hr_remoteout, nv, data, 16432b1b224dSPawel Jakub Dawidek data != NULL ? length : 0) == -1) { 164432115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = errno; 164532115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 164632115b10SPawel Jakub Dawidek pjdlog_debug(2, 164732115b10SPawel Jakub Dawidek "remote_send: (%p) Unable to send request.", hio); 164832115b10SPawel Jakub Dawidek reqlog(LOG_ERR, 0, ggio, 164932115b10SPawel Jakub Dawidek "Unable to send request (%s): ", 165032115b10SPawel Jakub Dawidek strerror(hio->hio_errors[ncomp])); 1651ee087cdfSPawel Jakub Dawidek remote_close(res, ncomp); 1652d685f88bSMikolaj Golub } else { 165332115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 1654d685f88bSMikolaj Golub } 165532115b10SPawel Jakub Dawidek nv_free(nv); 165632115b10SPawel Jakub Dawidek if (wakeup) 165732115b10SPawel Jakub Dawidek cv_signal(&hio_recv_list_cond[ncomp]); 165832115b10SPawel Jakub Dawidek continue; 165932115b10SPawel Jakub Dawidek done_queue: 166032115b10SPawel Jakub Dawidek nv_free(nv); 166132115b10SPawel Jakub Dawidek if (ISSYNCREQ(hio)) { 1662d6e636c9SPawel Jakub Dawidek if (refcnt_release(&hio->hio_countdown) > 0) 166332115b10SPawel Jakub Dawidek continue; 166432115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 166532115b10SPawel Jakub Dawidek SYNCREQDONE(hio); 166632115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 166732115b10SPawel Jakub Dawidek cv_signal(&sync_cond); 166832115b10SPawel Jakub Dawidek continue; 166932115b10SPawel Jakub Dawidek } 167032115b10SPawel Jakub Dawidek if (ggio->gctl_cmd == BIO_WRITE) { 167132115b10SPawel Jakub Dawidek mtx_lock(&res->hr_amp_lock); 167232115b10SPawel Jakub Dawidek if (activemap_need_sync(res->hr_amp, ggio->gctl_offset, 167332115b10SPawel Jakub Dawidek ggio->gctl_length)) { 167432115b10SPawel Jakub Dawidek (void)hast_activemap_flush(res); 1675a818a4ffSMikolaj Golub } else { 167632115b10SPawel Jakub Dawidek mtx_unlock(&res->hr_amp_lock); 1677a818a4ffSMikolaj Golub } 16785d69ed53SMikolaj Golub if (ISMEMSYNCWRITE(hio)) { 16795d69ed53SMikolaj Golub if (refcnt_release(&hio->hio_writecount) == 0) { 16805d69ed53SMikolaj Golub if (hio->hio_errors[0] == 0) 16815d69ed53SMikolaj Golub write_complete(res, hio); 16825d69ed53SMikolaj Golub } 16835d69ed53SMikolaj Golub } 168432115b10SPawel Jakub Dawidek } 1685d6e636c9SPawel Jakub Dawidek if (refcnt_release(&hio->hio_countdown) > 0) 168632115b10SPawel Jakub Dawidek continue; 168732115b10SPawel Jakub Dawidek pjdlog_debug(2, 168832115b10SPawel Jakub Dawidek "remote_send: (%p) Moving request to the done queue.", 168932115b10SPawel Jakub Dawidek hio); 169032115b10SPawel Jakub Dawidek QUEUE_INSERT2(hio, done); 169132115b10SPawel Jakub Dawidek } 169232115b10SPawel Jakub Dawidek /* NOTREACHED */ 169332115b10SPawel Jakub Dawidek return (NULL); 169432115b10SPawel Jakub Dawidek } 169532115b10SPawel Jakub Dawidek 169632115b10SPawel Jakub Dawidek /* 169732115b10SPawel Jakub Dawidek * Thread receives answer from secondary node and passes it to ggate_send 169832115b10SPawel Jakub Dawidek * thread. 169932115b10SPawel Jakub Dawidek */ 170032115b10SPawel Jakub Dawidek static void * 170132115b10SPawel Jakub Dawidek remote_recv_thread(void *arg) 170232115b10SPawel Jakub Dawidek { 170332115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 170432115b10SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 170532115b10SPawel Jakub Dawidek struct hio *hio; 170632115b10SPawel Jakub Dawidek struct nv *nv; 170732115b10SPawel Jakub Dawidek unsigned int ncomp; 170832115b10SPawel Jakub Dawidek uint64_t seq; 1709d6e636c9SPawel Jakub Dawidek bool memsyncack; 171032115b10SPawel Jakub Dawidek int error; 171132115b10SPawel Jakub Dawidek 171232115b10SPawel Jakub Dawidek /* Remote component is 1 for now. */ 171332115b10SPawel Jakub Dawidek ncomp = 1; 171432115b10SPawel Jakub Dawidek 171532115b10SPawel Jakub Dawidek for (;;) { 171632115b10SPawel Jakub Dawidek /* Wait until there is anything to receive. */ 171732115b10SPawel Jakub Dawidek mtx_lock(&hio_recv_list_lock[ncomp]); 171832115b10SPawel Jakub Dawidek while (TAILQ_EMPTY(&hio_recv_list[ncomp])) { 171932115b10SPawel Jakub Dawidek pjdlog_debug(2, "remote_recv: No requests, waiting."); 172032115b10SPawel Jakub Dawidek cv_wait(&hio_recv_list_cond[ncomp], 172132115b10SPawel Jakub Dawidek &hio_recv_list_lock[ncomp]); 172232115b10SPawel Jakub Dawidek } 172332115b10SPawel Jakub Dawidek mtx_unlock(&hio_recv_list_lock[ncomp]); 17243f5bce18SPawel Jakub Dawidek 1725d6e636c9SPawel Jakub Dawidek memsyncack = false; 1726d6e636c9SPawel Jakub Dawidek 172732115b10SPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 172832115b10SPawel Jakub Dawidek if (!ISCONNECTED(res, ncomp)) { 172932115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 173032115b10SPawel Jakub Dawidek /* 173132115b10SPawel Jakub Dawidek * Connection is dead, so move all pending requests to 173232115b10SPawel Jakub Dawidek * the done queue (one-by-one). 173332115b10SPawel Jakub Dawidek */ 173432115b10SPawel Jakub Dawidek mtx_lock(&hio_recv_list_lock[ncomp]); 173532115b10SPawel Jakub Dawidek hio = TAILQ_FIRST(&hio_recv_list[ncomp]); 17362ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(hio != NULL); 173732115b10SPawel Jakub Dawidek TAILQ_REMOVE(&hio_recv_list[ncomp], hio, 173832115b10SPawel Jakub Dawidek hio_next[ncomp]); 17396b66c350SMikolaj Golub hio_recv_list_size[ncomp]--; 174032115b10SPawel Jakub Dawidek mtx_unlock(&hio_recv_list_lock[ncomp]); 17415d69ed53SMikolaj Golub hio->hio_errors[ncomp] = ENOTCONN; 174232115b10SPawel Jakub Dawidek goto done_queue; 174332115b10SPawel Jakub Dawidek } 17442b1b224dSPawel Jakub Dawidek if (hast_proto_recv_hdr(res->hr_remotein, &nv) == -1) { 174532115b10SPawel Jakub Dawidek pjdlog_errno(LOG_ERR, 174632115b10SPawel Jakub Dawidek "Unable to receive reply header"); 174732115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 174832115b10SPawel Jakub Dawidek remote_close(res, ncomp); 174932115b10SPawel Jakub Dawidek continue; 175032115b10SPawel Jakub Dawidek } 175132115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 175232115b10SPawel Jakub Dawidek seq = nv_get_uint64(nv, "seq"); 175332115b10SPawel Jakub Dawidek if (seq == 0) { 175432115b10SPawel Jakub Dawidek pjdlog_error("Header contains no 'seq' field."); 175532115b10SPawel Jakub Dawidek nv_free(nv); 175632115b10SPawel Jakub Dawidek continue; 175732115b10SPawel Jakub Dawidek } 1758d6e636c9SPawel Jakub Dawidek memsyncack = nv_exists(nv, "received"); 175932115b10SPawel Jakub Dawidek mtx_lock(&hio_recv_list_lock[ncomp]); 176032115b10SPawel Jakub Dawidek TAILQ_FOREACH(hio, &hio_recv_list[ncomp], hio_next[ncomp]) { 176132115b10SPawel Jakub Dawidek if (hio->hio_ggio.gctl_seq == seq) { 176232115b10SPawel Jakub Dawidek TAILQ_REMOVE(&hio_recv_list[ncomp], hio, 176332115b10SPawel Jakub Dawidek hio_next[ncomp]); 17646b66c350SMikolaj Golub hio_recv_list_size[ncomp]--; 176532115b10SPawel Jakub Dawidek break; 176632115b10SPawel Jakub Dawidek } 176732115b10SPawel Jakub Dawidek } 176832115b10SPawel Jakub Dawidek mtx_unlock(&hio_recv_list_lock[ncomp]); 176932115b10SPawel Jakub Dawidek if (hio == NULL) { 177032115b10SPawel Jakub Dawidek pjdlog_error("Found no request matching received 'seq' field (%ju).", 177132115b10SPawel Jakub Dawidek (uintmax_t)seq); 177232115b10SPawel Jakub Dawidek nv_free(nv); 177332115b10SPawel Jakub Dawidek continue; 177432115b10SPawel Jakub Dawidek } 17751212a85cSPawel Jakub Dawidek ggio = &hio->hio_ggio; 177632115b10SPawel Jakub Dawidek error = nv_get_int16(nv, "error"); 177732115b10SPawel Jakub Dawidek if (error != 0) { 177832115b10SPawel Jakub Dawidek /* Request failed on remote side. */ 177972089204SPawel Jakub Dawidek hio->hio_errors[ncomp] = error; 17801212a85cSPawel Jakub Dawidek reqlog(LOG_WARNING, 0, ggio, 1781cd7b7ee5SPawel Jakub Dawidek "Remote request failed (%s): ", strerror(error)); 178232115b10SPawel Jakub Dawidek nv_free(nv); 178332115b10SPawel Jakub Dawidek goto done_queue; 178432115b10SPawel Jakub Dawidek } 178532115b10SPawel Jakub Dawidek switch (ggio->gctl_cmd) { 178632115b10SPawel Jakub Dawidek case BIO_READ: 178732115b10SPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 178832115b10SPawel Jakub Dawidek if (!ISCONNECTED(res, ncomp)) { 178932115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 179032115b10SPawel Jakub Dawidek nv_free(nv); 179132115b10SPawel Jakub Dawidek goto done_queue; 179232115b10SPawel Jakub Dawidek } 179332115b10SPawel Jakub Dawidek if (hast_proto_recv_data(res, res->hr_remotein, nv, 17942b1b224dSPawel Jakub Dawidek ggio->gctl_data, ggio->gctl_length) == -1) { 179532115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = errno; 179632115b10SPawel Jakub Dawidek pjdlog_errno(LOG_ERR, 179732115b10SPawel Jakub Dawidek "Unable to receive reply data"); 179832115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 179932115b10SPawel Jakub Dawidek nv_free(nv); 180032115b10SPawel Jakub Dawidek remote_close(res, ncomp); 180132115b10SPawel Jakub Dawidek goto done_queue; 180232115b10SPawel Jakub Dawidek } 180332115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 180432115b10SPawel Jakub Dawidek break; 180532115b10SPawel Jakub Dawidek case BIO_WRITE: 180632115b10SPawel Jakub Dawidek case BIO_DELETE: 180732115b10SPawel Jakub Dawidek case BIO_FLUSH: 180832115b10SPawel Jakub Dawidek break; 180932115b10SPawel Jakub Dawidek default: 181009c2e843SPawel Jakub Dawidek PJDLOG_ABORT("invalid condition"); 181132115b10SPawel Jakub Dawidek } 181232115b10SPawel Jakub Dawidek hio->hio_errors[ncomp] = 0; 181332115b10SPawel Jakub Dawidek nv_free(nv); 181432115b10SPawel Jakub Dawidek done_queue: 18155d69ed53SMikolaj Golub if (ISMEMSYNCWRITE(hio)) { 18165d69ed53SMikolaj Golub if (!hio->hio_memsyncacked) { 18175d69ed53SMikolaj Golub PJDLOG_ASSERT(memsyncack || 18185d69ed53SMikolaj Golub hio->hio_errors[ncomp] != 0); 18195d69ed53SMikolaj Golub /* Remote ack arrived. */ 18205d69ed53SMikolaj Golub if (refcnt_release(&hio->hio_writecount) == 0) { 1821d6e636c9SPawel Jakub Dawidek if (hio->hio_errors[0] == 0) 1822d6e636c9SPawel Jakub Dawidek write_complete(res, hio); 18235d69ed53SMikolaj Golub } 18245d69ed53SMikolaj Golub hio->hio_memsyncacked = true; 18255d69ed53SMikolaj Golub if (hio->hio_errors[ncomp] == 0) { 1826d6e636c9SPawel Jakub Dawidek pjdlog_debug(2, 18275d69ed53SMikolaj Golub "remote_recv: (%p) Moving request " 18285d69ed53SMikolaj Golub "back to the recv queue.", hio); 1829d6e636c9SPawel Jakub Dawidek mtx_lock(&hio_recv_list_lock[ncomp]); 1830d6e636c9SPawel Jakub Dawidek TAILQ_INSERT_TAIL(&hio_recv_list[ncomp], 1831d6e636c9SPawel Jakub Dawidek hio, hio_next[ncomp]); 18326b66c350SMikolaj Golub hio_recv_list_size[ncomp]++; 1833d6e636c9SPawel Jakub Dawidek mtx_unlock(&hio_recv_list_lock[ncomp]); 18345d69ed53SMikolaj Golub continue; 18355d69ed53SMikolaj Golub } 1836d6e636c9SPawel Jakub Dawidek } else { 18375d69ed53SMikolaj Golub PJDLOG_ASSERT(!memsyncack); 18385d69ed53SMikolaj Golub /* Remote final reply arrived. */ 1839d6e636c9SPawel Jakub Dawidek } 18405d69ed53SMikolaj Golub } 18415d69ed53SMikolaj Golub if (refcnt_release(&hio->hio_countdown) > 0) 1842d6e636c9SPawel Jakub Dawidek continue; 184332115b10SPawel Jakub Dawidek if (ISSYNCREQ(hio)) { 184432115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 184532115b10SPawel Jakub Dawidek SYNCREQDONE(hio); 184632115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 184732115b10SPawel Jakub Dawidek cv_signal(&sync_cond); 184832115b10SPawel Jakub Dawidek } else { 184932115b10SPawel Jakub Dawidek pjdlog_debug(2, 185032115b10SPawel Jakub Dawidek "remote_recv: (%p) Moving request to the done queue.", 185132115b10SPawel Jakub Dawidek hio); 185232115b10SPawel Jakub Dawidek QUEUE_INSERT2(hio, done); 185332115b10SPawel Jakub Dawidek } 185432115b10SPawel Jakub Dawidek } 185532115b10SPawel Jakub Dawidek /* NOTREACHED */ 185632115b10SPawel Jakub Dawidek return (NULL); 185732115b10SPawel Jakub Dawidek } 185832115b10SPawel Jakub Dawidek 185932115b10SPawel Jakub Dawidek /* 186032115b10SPawel Jakub Dawidek * Thread sends answer to the kernel. 186132115b10SPawel Jakub Dawidek */ 186232115b10SPawel Jakub Dawidek static void * 186332115b10SPawel Jakub Dawidek ggate_send_thread(void *arg) 186432115b10SPawel Jakub Dawidek { 186532115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 186632115b10SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 186732115b10SPawel Jakub Dawidek struct hio *hio; 186807ebc362SPawel Jakub Dawidek unsigned int ii, ncomps; 186932115b10SPawel Jakub Dawidek 187032115b10SPawel Jakub Dawidek ncomps = HAST_NCOMPONENTS; 187132115b10SPawel Jakub Dawidek 187232115b10SPawel Jakub Dawidek for (;;) { 187332115b10SPawel Jakub Dawidek pjdlog_debug(2, "ggate_send: Taking request."); 187432115b10SPawel Jakub Dawidek QUEUE_TAKE2(hio, done); 187532115b10SPawel Jakub Dawidek pjdlog_debug(2, "ggate_send: (%p) Got request.", hio); 187632115b10SPawel Jakub Dawidek ggio = &hio->hio_ggio; 187732115b10SPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) { 187832115b10SPawel Jakub Dawidek if (hio->hio_errors[ii] == 0) { 187932115b10SPawel Jakub Dawidek /* 188032115b10SPawel Jakub Dawidek * One successful request is enough to declare 188132115b10SPawel Jakub Dawidek * success. 188232115b10SPawel Jakub Dawidek */ 188332115b10SPawel Jakub Dawidek ggio->gctl_error = 0; 188432115b10SPawel Jakub Dawidek break; 188532115b10SPawel Jakub Dawidek } 188632115b10SPawel Jakub Dawidek } 188732115b10SPawel Jakub Dawidek if (ii == ncomps) { 188832115b10SPawel Jakub Dawidek /* 188932115b10SPawel Jakub Dawidek * None of the requests were successful. 1890b068d5aaSMikolaj Golub * Use the error from local component except the 1891b068d5aaSMikolaj Golub * case when we did only remote request. 189232115b10SPawel Jakub Dawidek */ 1893b068d5aaSMikolaj Golub if (ggio->gctl_cmd == BIO_READ && 1894b068d5aaSMikolaj Golub res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) 1895b068d5aaSMikolaj Golub ggio->gctl_error = hio->hio_errors[1]; 1896b068d5aaSMikolaj Golub else 189732115b10SPawel Jakub Dawidek ggio->gctl_error = hio->hio_errors[0]; 189832115b10SPawel Jakub Dawidek } 189932115b10SPawel Jakub Dawidek if (ggio->gctl_error == 0 && ggio->gctl_cmd == BIO_WRITE) { 190032115b10SPawel Jakub Dawidek mtx_lock(&res->hr_amp_lock); 1901d9f039e0SMikolaj Golub if (activemap_write_complete(res->hr_amp, 1902d9f039e0SMikolaj Golub ggio->gctl_offset, ggio->gctl_length)) { 1903d9f039e0SMikolaj Golub res->hr_stat_activemap_update++; 1904d9f039e0SMikolaj Golub (void)hast_activemap_flush(res); 1905a818a4ffSMikolaj Golub } else { 190632115b10SPawel Jakub Dawidek mtx_unlock(&res->hr_amp_lock); 190732115b10SPawel Jakub Dawidek } 1908a818a4ffSMikolaj Golub } 190932115b10SPawel Jakub Dawidek if (ggio->gctl_cmd == BIO_WRITE) { 191032115b10SPawel Jakub Dawidek /* 191132115b10SPawel Jakub Dawidek * Unlock range we locked. 191232115b10SPawel Jakub Dawidek */ 191332115b10SPawel Jakub Dawidek mtx_lock(&range_lock); 191432115b10SPawel Jakub Dawidek rangelock_del(range_regular, ggio->gctl_offset, 191532115b10SPawel Jakub Dawidek ggio->gctl_length); 191632115b10SPawel Jakub Dawidek if (range_sync_wait) 191732115b10SPawel Jakub Dawidek cv_signal(&range_sync_cond); 191832115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 191907ebc362SPawel Jakub Dawidek if (!hio->hio_done) 192007ebc362SPawel Jakub Dawidek write_complete(res, hio); 192107ebc362SPawel Jakub Dawidek } else { 19222b1b224dSPawel Jakub Dawidek if (ioctl(res->hr_ggatefd, G_GATE_CMD_DONE, ggio) == -1) { 192307ebc362SPawel Jakub Dawidek primary_exit(EX_OSERR, 192407ebc362SPawel Jakub Dawidek "G_GATE_CMD_DONE failed"); 192532115b10SPawel Jakub Dawidek } 192632115b10SPawel Jakub Dawidek } 19272adbba66SMikolaj Golub if (hio->hio_errors[0]) { 19282adbba66SMikolaj Golub switch (ggio->gctl_cmd) { 19292adbba66SMikolaj Golub case BIO_READ: 19302adbba66SMikolaj Golub res->hr_stat_read_error++; 19312adbba66SMikolaj Golub break; 19322adbba66SMikolaj Golub case BIO_WRITE: 19332adbba66SMikolaj Golub res->hr_stat_write_error++; 19342adbba66SMikolaj Golub break; 19352adbba66SMikolaj Golub case BIO_DELETE: 19362adbba66SMikolaj Golub res->hr_stat_delete_error++; 19372adbba66SMikolaj Golub break; 19382adbba66SMikolaj Golub case BIO_FLUSH: 19392adbba66SMikolaj Golub res->hr_stat_flush_error++; 19402adbba66SMikolaj Golub break; 19412adbba66SMikolaj Golub } 19422adbba66SMikolaj Golub } 194332115b10SPawel Jakub Dawidek pjdlog_debug(2, 194432115b10SPawel Jakub Dawidek "ggate_send: (%p) Moving request to the free queue.", hio); 194532115b10SPawel Jakub Dawidek QUEUE_INSERT2(hio, free); 194632115b10SPawel Jakub Dawidek } 194732115b10SPawel Jakub Dawidek /* NOTREACHED */ 194832115b10SPawel Jakub Dawidek return (NULL); 194932115b10SPawel Jakub Dawidek } 195032115b10SPawel Jakub Dawidek 195132115b10SPawel Jakub Dawidek /* 195232115b10SPawel Jakub Dawidek * Thread synchronize local and remote components. 195332115b10SPawel Jakub Dawidek */ 195432115b10SPawel Jakub Dawidek static void * 195532115b10SPawel Jakub Dawidek sync_thread(void *arg __unused) 195632115b10SPawel Jakub Dawidek { 195732115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 195832115b10SPawel Jakub Dawidek struct hio *hio; 195932115b10SPawel Jakub Dawidek struct g_gate_ctl_io *ggio; 1960fa356f6cSPawel Jakub Dawidek struct timeval tstart, tend, tdiff; 196132115b10SPawel Jakub Dawidek unsigned int ii, ncomp, ncomps; 196232115b10SPawel Jakub Dawidek off_t offset, length, synced; 1963c66ee1b3SPawel Jakub Dawidek bool dorewind, directreads; 196432115b10SPawel Jakub Dawidek int syncext; 196532115b10SPawel Jakub Dawidek 196632115b10SPawel Jakub Dawidek ncomps = HAST_NCOMPONENTS; 196732115b10SPawel Jakub Dawidek dorewind = true; 1968b9cf0cf5SPawel Jakub Dawidek synced = 0; 1969b9cf0cf5SPawel Jakub Dawidek offset = -1; 1970c66ee1b3SPawel Jakub Dawidek directreads = false; 197132115b10SPawel Jakub Dawidek 197232115b10SPawel Jakub Dawidek for (;;) { 197332115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 1974b9cf0cf5SPawel Jakub Dawidek if (offset >= 0 && !sync_inprogress) { 1975fa356f6cSPawel Jakub Dawidek gettimeofday(&tend, NULL); 1976fa356f6cSPawel Jakub Dawidek timersub(&tend, &tstart, &tdiff); 1977fa356f6cSPawel Jakub Dawidek pjdlog_info("Synchronization interrupted after %#.0T. " 1978fa356f6cSPawel Jakub Dawidek "%NB synchronized so far.", &tdiff, 197953d9b386SPawel Jakub Dawidek (intmax_t)synced); 19805bdff860SPawel Jakub Dawidek event_send(res, EVENT_SYNCINTR); 198153d9b386SPawel Jakub Dawidek } 198232115b10SPawel Jakub Dawidek while (!sync_inprogress) { 198332115b10SPawel Jakub Dawidek dorewind = true; 198432115b10SPawel Jakub Dawidek synced = 0; 198532115b10SPawel Jakub Dawidek cv_wait(&sync_cond, &sync_lock); 198632115b10SPawel Jakub Dawidek } 198732115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 198832115b10SPawel Jakub Dawidek /* 198932115b10SPawel Jakub Dawidek * Obtain offset at which we should synchronize. 199032115b10SPawel Jakub Dawidek * Rewind synchronization if needed. 199132115b10SPawel Jakub Dawidek */ 199232115b10SPawel Jakub Dawidek mtx_lock(&res->hr_amp_lock); 199332115b10SPawel Jakub Dawidek if (dorewind) 199432115b10SPawel Jakub Dawidek activemap_sync_rewind(res->hr_amp); 199532115b10SPawel Jakub Dawidek offset = activemap_sync_offset(res->hr_amp, &length, &syncext); 199632115b10SPawel Jakub Dawidek if (syncext != -1) { 199732115b10SPawel Jakub Dawidek /* 199832115b10SPawel Jakub Dawidek * We synchronized entire syncext extent, we can mark 199932115b10SPawel Jakub Dawidek * it as clean now. 200032115b10SPawel Jakub Dawidek */ 200132115b10SPawel Jakub Dawidek if (activemap_extent_complete(res->hr_amp, syncext)) 200232115b10SPawel Jakub Dawidek (void)hast_activemap_flush(res); 2003a818a4ffSMikolaj Golub else 200432115b10SPawel Jakub Dawidek mtx_unlock(&res->hr_amp_lock); 2005a818a4ffSMikolaj Golub } else { 2006a818a4ffSMikolaj Golub mtx_unlock(&res->hr_amp_lock); 2007a818a4ffSMikolaj Golub } 200832115b10SPawel Jakub Dawidek if (dorewind) { 200932115b10SPawel Jakub Dawidek dorewind = false; 20102b1b224dSPawel Jakub Dawidek if (offset == -1) 201132115b10SPawel Jakub Dawidek pjdlog_info("Nodes are in sync."); 201232115b10SPawel Jakub Dawidek else { 2013fa356f6cSPawel Jakub Dawidek pjdlog_info("Synchronization started. %NB to go.", 2014fa356f6cSPawel Jakub Dawidek (intmax_t)(res->hr_extentsize * 201532115b10SPawel Jakub Dawidek activemap_ndirty(res->hr_amp))); 20165bdff860SPawel Jakub Dawidek event_send(res, EVENT_SYNCSTART); 2017fa356f6cSPawel Jakub Dawidek gettimeofday(&tstart, NULL); 201832115b10SPawel Jakub Dawidek } 201932115b10SPawel Jakub Dawidek } 20202b1b224dSPawel Jakub Dawidek if (offset == -1) { 202155ce1e7cSPawel Jakub Dawidek sync_stop(); 202232115b10SPawel Jakub Dawidek pjdlog_debug(1, "Nothing to synchronize."); 202332115b10SPawel Jakub Dawidek /* 202432115b10SPawel Jakub Dawidek * Synchronization complete, make both localcnt and 202532115b10SPawel Jakub Dawidek * remotecnt equal. 202632115b10SPawel Jakub Dawidek */ 202732115b10SPawel Jakub Dawidek ncomp = 1; 202832115b10SPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 202932115b10SPawel Jakub Dawidek if (ISCONNECTED(res, ncomp)) { 203032115b10SPawel Jakub Dawidek if (synced > 0) { 2031fa356f6cSPawel Jakub Dawidek int64_t bps; 2032fa356f6cSPawel Jakub Dawidek 2033fa356f6cSPawel Jakub Dawidek gettimeofday(&tend, NULL); 2034fa356f6cSPawel Jakub Dawidek timersub(&tend, &tstart, &tdiff); 2035fa356f6cSPawel Jakub Dawidek bps = (int64_t)((double)synced / 2036fa356f6cSPawel Jakub Dawidek ((double)tdiff.tv_sec + 2037fa356f6cSPawel Jakub Dawidek (double)tdiff.tv_usec / 1000000)); 203832115b10SPawel Jakub Dawidek pjdlog_info("Synchronization complete. " 2039fa356f6cSPawel Jakub Dawidek "%NB synchronized in %#.0lT (%NB/sec).", 2040fa356f6cSPawel Jakub Dawidek (intmax_t)synced, &tdiff, 2041fa356f6cSPawel Jakub Dawidek (intmax_t)bps); 20425bdff860SPawel Jakub Dawidek event_send(res, EVENT_SYNCDONE); 204332115b10SPawel Jakub Dawidek } 204432115b10SPawel Jakub Dawidek mtx_lock(&metadata_lock); 2045c66ee1b3SPawel Jakub Dawidek if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) 2046c66ee1b3SPawel Jakub Dawidek directreads = true; 204732115b10SPawel Jakub Dawidek res->hr_syncsrc = HAST_SYNCSRC_UNDEF; 204832115b10SPawel Jakub Dawidek res->hr_primary_localcnt = 204932115b10SPawel Jakub Dawidek res->hr_secondary_remotecnt; 20509237aa3fSMikolaj Golub res->hr_primary_remotecnt = 20519237aa3fSMikolaj Golub res->hr_secondary_localcnt; 205232115b10SPawel Jakub Dawidek pjdlog_debug(1, 205332115b10SPawel Jakub Dawidek "Setting localcnt to %ju and remotecnt to %ju.", 205432115b10SPawel Jakub Dawidek (uintmax_t)res->hr_primary_localcnt, 20559237aa3fSMikolaj Golub (uintmax_t)res->hr_primary_remotecnt); 205632115b10SPawel Jakub Dawidek (void)metadata_write(res); 205732115b10SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 205832115b10SPawel Jakub Dawidek } 205932115b10SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 2060c66ee1b3SPawel Jakub Dawidek if (directreads) { 2061c66ee1b3SPawel Jakub Dawidek directreads = false; 2062c66ee1b3SPawel Jakub Dawidek enable_direct_reads(res); 2063c66ee1b3SPawel Jakub Dawidek } 206432115b10SPawel Jakub Dawidek continue; 206532115b10SPawel Jakub Dawidek } 206632115b10SPawel Jakub Dawidek pjdlog_debug(2, "sync: Taking free request."); 206732115b10SPawel Jakub Dawidek QUEUE_TAKE2(hio, free); 206832115b10SPawel Jakub Dawidek pjdlog_debug(2, "sync: (%p) Got free request.", hio); 206932115b10SPawel Jakub Dawidek /* 207032115b10SPawel Jakub Dawidek * Lock the range we are going to synchronize. We don't want 207132115b10SPawel Jakub Dawidek * race where someone writes between our read and write. 207232115b10SPawel Jakub Dawidek */ 207332115b10SPawel Jakub Dawidek for (;;) { 207432115b10SPawel Jakub Dawidek mtx_lock(&range_lock); 207532115b10SPawel Jakub Dawidek if (rangelock_islocked(range_regular, offset, length)) { 207632115b10SPawel Jakub Dawidek pjdlog_debug(2, 207732115b10SPawel Jakub Dawidek "sync: Range offset=%jd length=%jd locked.", 207832115b10SPawel Jakub Dawidek (intmax_t)offset, (intmax_t)length); 207932115b10SPawel Jakub Dawidek range_sync_wait = true; 208032115b10SPawel Jakub Dawidek cv_wait(&range_sync_cond, &range_lock); 208132115b10SPawel Jakub Dawidek range_sync_wait = false; 208232115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 208332115b10SPawel Jakub Dawidek continue; 208432115b10SPawel Jakub Dawidek } 20852b1b224dSPawel Jakub Dawidek if (rangelock_add(range_sync, offset, length) == -1) { 208632115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 208732115b10SPawel Jakub Dawidek pjdlog_debug(2, 208832115b10SPawel Jakub Dawidek "sync: Range offset=%jd length=%jd is already locked, waiting.", 208932115b10SPawel Jakub Dawidek (intmax_t)offset, (intmax_t)length); 209032115b10SPawel Jakub Dawidek sleep(1); 209132115b10SPawel Jakub Dawidek continue; 209232115b10SPawel Jakub Dawidek } 209332115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 209432115b10SPawel Jakub Dawidek break; 209532115b10SPawel Jakub Dawidek } 209632115b10SPawel Jakub Dawidek /* 209732115b10SPawel Jakub Dawidek * First read the data from synchronization source. 209832115b10SPawel Jakub Dawidek */ 209932115b10SPawel Jakub Dawidek SYNCREQ(hio); 210032115b10SPawel Jakub Dawidek ggio = &hio->hio_ggio; 210132115b10SPawel Jakub Dawidek ggio->gctl_cmd = BIO_READ; 210232115b10SPawel Jakub Dawidek ggio->gctl_offset = offset; 210332115b10SPawel Jakub Dawidek ggio->gctl_length = length; 210432115b10SPawel Jakub Dawidek ggio->gctl_error = 0; 210507ebc362SPawel Jakub Dawidek hio->hio_done = false; 210607ebc362SPawel Jakub Dawidek hio->hio_replication = res->hr_replication; 210732115b10SPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) 210832115b10SPawel Jakub Dawidek hio->hio_errors[ii] = EINVAL; 210932115b10SPawel Jakub Dawidek reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ", 211032115b10SPawel Jakub Dawidek hio); 211132115b10SPawel Jakub Dawidek pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", 211232115b10SPawel Jakub Dawidek hio); 211332115b10SPawel Jakub Dawidek mtx_lock(&metadata_lock); 211432115b10SPawel Jakub Dawidek if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { 211532115b10SPawel Jakub Dawidek /* 211632115b10SPawel Jakub Dawidek * This range is up-to-date on local component, 211732115b10SPawel Jakub Dawidek * so handle request locally. 211832115b10SPawel Jakub Dawidek */ 211932115b10SPawel Jakub Dawidek /* Local component is 0 for now. */ 212032115b10SPawel Jakub Dawidek ncomp = 0; 212132115b10SPawel Jakub Dawidek } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ { 21222ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY); 212332115b10SPawel Jakub Dawidek /* 212432115b10SPawel Jakub Dawidek * This range is out-of-date on local component, 212532115b10SPawel Jakub Dawidek * so send request to the remote node. 212632115b10SPawel Jakub Dawidek */ 212732115b10SPawel Jakub Dawidek /* Remote component is 1 for now. */ 212832115b10SPawel Jakub Dawidek ncomp = 1; 212932115b10SPawel Jakub Dawidek } 213032115b10SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 21316e87c151SEd Schouten refcnt_init(&hio->hio_countdown, 1); 213232115b10SPawel Jakub Dawidek QUEUE_INSERT1(hio, send, ncomp); 213332115b10SPawel Jakub Dawidek 213432115b10SPawel Jakub Dawidek /* 213532115b10SPawel Jakub Dawidek * Let's wait for READ to finish. 213632115b10SPawel Jakub Dawidek */ 213732115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 213832115b10SPawel Jakub Dawidek while (!ISSYNCREQDONE(hio)) 213932115b10SPawel Jakub Dawidek cv_wait(&sync_cond, &sync_lock); 214032115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 214132115b10SPawel Jakub Dawidek 214232115b10SPawel Jakub Dawidek if (hio->hio_errors[ncomp] != 0) { 214332115b10SPawel Jakub Dawidek pjdlog_error("Unable to read synchronization data: %s.", 214432115b10SPawel Jakub Dawidek strerror(hio->hio_errors[ncomp])); 214532115b10SPawel Jakub Dawidek goto free_queue; 214632115b10SPawel Jakub Dawidek } 214732115b10SPawel Jakub Dawidek 214832115b10SPawel Jakub Dawidek /* 214932115b10SPawel Jakub Dawidek * We read the data from synchronization source, now write it 215032115b10SPawel Jakub Dawidek * to synchronization target. 215132115b10SPawel Jakub Dawidek */ 215232115b10SPawel Jakub Dawidek SYNCREQ(hio); 215332115b10SPawel Jakub Dawidek ggio->gctl_cmd = BIO_WRITE; 215432115b10SPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) 215532115b10SPawel Jakub Dawidek hio->hio_errors[ii] = EINVAL; 215632115b10SPawel Jakub Dawidek reqlog(LOG_DEBUG, 2, ggio, "sync: (%p) Sending sync request: ", 215732115b10SPawel Jakub Dawidek hio); 215832115b10SPawel Jakub Dawidek pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", 215932115b10SPawel Jakub Dawidek hio); 216032115b10SPawel Jakub Dawidek mtx_lock(&metadata_lock); 216132115b10SPawel Jakub Dawidek if (res->hr_syncsrc == HAST_SYNCSRC_PRIMARY) { 216232115b10SPawel Jakub Dawidek /* 216332115b10SPawel Jakub Dawidek * This range is up-to-date on local component, 216432115b10SPawel Jakub Dawidek * so we update remote component. 216532115b10SPawel Jakub Dawidek */ 216632115b10SPawel Jakub Dawidek /* Remote component is 1 for now. */ 216732115b10SPawel Jakub Dawidek ncomp = 1; 216832115b10SPawel Jakub Dawidek } else /* if (res->hr_syncsrc == HAST_SYNCSRC_SECONDARY) */ { 21692ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_syncsrc == HAST_SYNCSRC_SECONDARY); 217032115b10SPawel Jakub Dawidek /* 217132115b10SPawel Jakub Dawidek * This range is out-of-date on local component, 217232115b10SPawel Jakub Dawidek * so we update it. 217332115b10SPawel Jakub Dawidek */ 217432115b10SPawel Jakub Dawidek /* Local component is 0 for now. */ 217532115b10SPawel Jakub Dawidek ncomp = 0; 217632115b10SPawel Jakub Dawidek } 217732115b10SPawel Jakub Dawidek mtx_unlock(&metadata_lock); 217832115b10SPawel Jakub Dawidek 21793f5bce18SPawel Jakub Dawidek pjdlog_debug(2, "sync: (%p) Moving request to the send queue.", 218032115b10SPawel Jakub Dawidek hio); 21816e87c151SEd Schouten refcnt_init(&hio->hio_countdown, 1); 218232115b10SPawel Jakub Dawidek QUEUE_INSERT1(hio, send, ncomp); 218332115b10SPawel Jakub Dawidek 218432115b10SPawel Jakub Dawidek /* 218532115b10SPawel Jakub Dawidek * Let's wait for WRITE to finish. 218632115b10SPawel Jakub Dawidek */ 218732115b10SPawel Jakub Dawidek mtx_lock(&sync_lock); 218832115b10SPawel Jakub Dawidek while (!ISSYNCREQDONE(hio)) 218932115b10SPawel Jakub Dawidek cv_wait(&sync_cond, &sync_lock); 219032115b10SPawel Jakub Dawidek mtx_unlock(&sync_lock); 219132115b10SPawel Jakub Dawidek 219232115b10SPawel Jakub Dawidek if (hio->hio_errors[ncomp] != 0) { 219332115b10SPawel Jakub Dawidek pjdlog_error("Unable to write synchronization data: %s.", 219432115b10SPawel Jakub Dawidek strerror(hio->hio_errors[ncomp])); 219532115b10SPawel Jakub Dawidek goto free_queue; 219632115b10SPawel Jakub Dawidek } 2197e23d2d01SPawel Jakub Dawidek 2198e23d2d01SPawel Jakub Dawidek synced += length; 219932115b10SPawel Jakub Dawidek free_queue: 220032115b10SPawel Jakub Dawidek mtx_lock(&range_lock); 220132115b10SPawel Jakub Dawidek rangelock_del(range_sync, offset, length); 220232115b10SPawel Jakub Dawidek if (range_regular_wait) 220332115b10SPawel Jakub Dawidek cv_signal(&range_regular_cond); 220432115b10SPawel Jakub Dawidek mtx_unlock(&range_lock); 220532115b10SPawel Jakub Dawidek pjdlog_debug(2, "sync: (%p) Moving request to the free queue.", 220632115b10SPawel Jakub Dawidek hio); 220732115b10SPawel Jakub Dawidek QUEUE_INSERT2(hio, free); 220832115b10SPawel Jakub Dawidek } 220932115b10SPawel Jakub Dawidek /* NOTREACHED */ 221032115b10SPawel Jakub Dawidek return (NULL); 221132115b10SPawel Jakub Dawidek } 221232115b10SPawel Jakub Dawidek 2213115f4e5cSPawel Jakub Dawidek void 2214115f4e5cSPawel Jakub Dawidek primary_config_reload(struct hast_resource *res, struct nv *nv) 22150989854dSPawel Jakub Dawidek { 22160989854dSPawel Jakub Dawidek unsigned int ii, ncomps; 2217115f4e5cSPawel Jakub Dawidek int modified, vint; 2218115f4e5cSPawel Jakub Dawidek const char *vstr; 22190989854dSPawel Jakub Dawidek 22200989854dSPawel Jakub Dawidek pjdlog_info("Reloading configuration..."); 22210989854dSPawel Jakub Dawidek 22222ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_role == HAST_ROLE_PRIMARY); 22232ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(gres == res); 2224115f4e5cSPawel Jakub Dawidek nv_assert(nv, "remoteaddr"); 22250b626a28SPawel Jakub Dawidek nv_assert(nv, "sourceaddr"); 2226115f4e5cSPawel Jakub Dawidek nv_assert(nv, "replication"); 22271fee97b0SPawel Jakub Dawidek nv_assert(nv, "checksum"); 22288cd3d45aSPawel Jakub Dawidek nv_assert(nv, "compression"); 2229115f4e5cSPawel Jakub Dawidek nv_assert(nv, "timeout"); 2230115f4e5cSPawel Jakub Dawidek nv_assert(nv, "exec"); 2231518dd4c0SPawel Jakub Dawidek nv_assert(nv, "metaflush"); 2232115f4e5cSPawel Jakub Dawidek 22330989854dSPawel Jakub Dawidek ncomps = HAST_NCOMPONENTS; 22340989854dSPawel Jakub Dawidek 22351fee97b0SPawel Jakub Dawidek #define MODIFIED_REMOTEADDR 0x01 22360b626a28SPawel Jakub Dawidek #define MODIFIED_SOURCEADDR 0x02 22370b626a28SPawel Jakub Dawidek #define MODIFIED_REPLICATION 0x04 22380b626a28SPawel Jakub Dawidek #define MODIFIED_CHECKSUM 0x08 22390b626a28SPawel Jakub Dawidek #define MODIFIED_COMPRESSION 0x10 22400b626a28SPawel Jakub Dawidek #define MODIFIED_TIMEOUT 0x20 22410b626a28SPawel Jakub Dawidek #define MODIFIED_EXEC 0x40 2242518dd4c0SPawel Jakub Dawidek #define MODIFIED_METAFLUSH 0x80 22430989854dSPawel Jakub Dawidek modified = 0; 2244115f4e5cSPawel Jakub Dawidek 2245115f4e5cSPawel Jakub Dawidek vstr = nv_get_string(nv, "remoteaddr"); 2246115f4e5cSPawel Jakub Dawidek if (strcmp(gres->hr_remoteaddr, vstr) != 0) { 22470989854dSPawel Jakub Dawidek /* 22480989854dSPawel Jakub Dawidek * Don't copy res->hr_remoteaddr to gres just yet. 22490989854dSPawel Jakub Dawidek * We want remote_close() to log disconnect from the old 22500989854dSPawel Jakub Dawidek * addresses, not from the new ones. 22510989854dSPawel Jakub Dawidek */ 22520989854dSPawel Jakub Dawidek modified |= MODIFIED_REMOTEADDR; 22530989854dSPawel Jakub Dawidek } 22540b626a28SPawel Jakub Dawidek vstr = nv_get_string(nv, "sourceaddr"); 22550b626a28SPawel Jakub Dawidek if (strcmp(gres->hr_sourceaddr, vstr) != 0) { 22560b626a28SPawel Jakub Dawidek strlcpy(gres->hr_sourceaddr, vstr, sizeof(gres->hr_sourceaddr)); 22570b626a28SPawel Jakub Dawidek modified |= MODIFIED_SOURCEADDR; 22580b626a28SPawel Jakub Dawidek } 2259115f4e5cSPawel Jakub Dawidek vint = nv_get_int32(nv, "replication"); 2260115f4e5cSPawel Jakub Dawidek if (gres->hr_replication != vint) { 2261115f4e5cSPawel Jakub Dawidek gres->hr_replication = vint; 22620989854dSPawel Jakub Dawidek modified |= MODIFIED_REPLICATION; 22630989854dSPawel Jakub Dawidek } 22641fee97b0SPawel Jakub Dawidek vint = nv_get_int32(nv, "checksum"); 22651fee97b0SPawel Jakub Dawidek if (gres->hr_checksum != vint) { 22661fee97b0SPawel Jakub Dawidek gres->hr_checksum = vint; 22671fee97b0SPawel Jakub Dawidek modified |= MODIFIED_CHECKSUM; 22681fee97b0SPawel Jakub Dawidek } 22698cd3d45aSPawel Jakub Dawidek vint = nv_get_int32(nv, "compression"); 22708cd3d45aSPawel Jakub Dawidek if (gres->hr_compression != vint) { 22718cd3d45aSPawel Jakub Dawidek gres->hr_compression = vint; 22728cd3d45aSPawel Jakub Dawidek modified |= MODIFIED_COMPRESSION; 22738cd3d45aSPawel Jakub Dawidek } 2274115f4e5cSPawel Jakub Dawidek vint = nv_get_int32(nv, "timeout"); 2275115f4e5cSPawel Jakub Dawidek if (gres->hr_timeout != vint) { 2276115f4e5cSPawel Jakub Dawidek gres->hr_timeout = vint; 22770989854dSPawel Jakub Dawidek modified |= MODIFIED_TIMEOUT; 22780989854dSPawel Jakub Dawidek } 2279115f4e5cSPawel Jakub Dawidek vstr = nv_get_string(nv, "exec"); 2280115f4e5cSPawel Jakub Dawidek if (strcmp(gres->hr_exec, vstr) != 0) { 2281115f4e5cSPawel Jakub Dawidek strlcpy(gres->hr_exec, vstr, sizeof(gres->hr_exec)); 22820becad39SPawel Jakub Dawidek modified |= MODIFIED_EXEC; 22830becad39SPawel Jakub Dawidek } 2284518dd4c0SPawel Jakub Dawidek vint = nv_get_int32(nv, "metaflush"); 2285518dd4c0SPawel Jakub Dawidek if (gres->hr_metaflush != vint) { 2286518dd4c0SPawel Jakub Dawidek gres->hr_metaflush = vint; 2287518dd4c0SPawel Jakub Dawidek modified |= MODIFIED_METAFLUSH; 2288518dd4c0SPawel Jakub Dawidek } 2289115f4e5cSPawel Jakub Dawidek 22900989854dSPawel Jakub Dawidek /* 22911fee97b0SPawel Jakub Dawidek * Change timeout for connected sockets. 22921fee97b0SPawel Jakub Dawidek * Don't bother if we need to reconnect. 22930989854dSPawel Jakub Dawidek */ 22941fee97b0SPawel Jakub Dawidek if ((modified & MODIFIED_TIMEOUT) != 0 && 229507ebc362SPawel Jakub Dawidek (modified & (MODIFIED_REMOTEADDR | MODIFIED_SOURCEADDR)) == 0) { 22960989854dSPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) { 22970989854dSPawel Jakub Dawidek if (!ISREMOTE(ii)) 22980989854dSPawel Jakub Dawidek continue; 22990989854dSPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ii]); 23000989854dSPawel Jakub Dawidek if (!ISCONNECTED(gres, ii)) { 23010989854dSPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ii]); 23020989854dSPawel Jakub Dawidek continue; 23030989854dSPawel Jakub Dawidek } 23040989854dSPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ii]); 23050989854dSPawel Jakub Dawidek if (proto_timeout(gres->hr_remotein, 23062b1b224dSPawel Jakub Dawidek gres->hr_timeout) == -1) { 23070989854dSPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 23080989854dSPawel Jakub Dawidek "Unable to set connection timeout"); 23090989854dSPawel Jakub Dawidek } 23100989854dSPawel Jakub Dawidek if (proto_timeout(gres->hr_remoteout, 23112b1b224dSPawel Jakub Dawidek gres->hr_timeout) == -1) { 23120989854dSPawel Jakub Dawidek pjdlog_errno(LOG_WARNING, 23130989854dSPawel Jakub Dawidek "Unable to set connection timeout"); 23140989854dSPawel Jakub Dawidek } 23150989854dSPawel Jakub Dawidek } 23161fee97b0SPawel Jakub Dawidek } 231707ebc362SPawel Jakub Dawidek if ((modified & (MODIFIED_REMOTEADDR | MODIFIED_SOURCEADDR)) != 0) { 23180989854dSPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) { 23190989854dSPawel Jakub Dawidek if (!ISREMOTE(ii)) 23200989854dSPawel Jakub Dawidek continue; 23210989854dSPawel Jakub Dawidek remote_close(gres, ii); 23220989854dSPawel Jakub Dawidek } 23230989854dSPawel Jakub Dawidek if (modified & MODIFIED_REMOTEADDR) { 2324115f4e5cSPawel Jakub Dawidek vstr = nv_get_string(nv, "remoteaddr"); 2325115f4e5cSPawel Jakub Dawidek strlcpy(gres->hr_remoteaddr, vstr, 23260989854dSPawel Jakub Dawidek sizeof(gres->hr_remoteaddr)); 23270989854dSPawel Jakub Dawidek } 23280989854dSPawel Jakub Dawidek } 23290989854dSPawel Jakub Dawidek #undef MODIFIED_REMOTEADDR 23300b626a28SPawel Jakub Dawidek #undef MODIFIED_SOURCEADDR 23310989854dSPawel Jakub Dawidek #undef MODIFIED_REPLICATION 23321fee97b0SPawel Jakub Dawidek #undef MODIFIED_CHECKSUM 23338cd3d45aSPawel Jakub Dawidek #undef MODIFIED_COMPRESSION 23340989854dSPawel Jakub Dawidek #undef MODIFIED_TIMEOUT 23350becad39SPawel Jakub Dawidek #undef MODIFIED_EXEC 2336518dd4c0SPawel Jakub Dawidek #undef MODIFIED_METAFLUSH 23370989854dSPawel Jakub Dawidek 23380989854dSPawel Jakub Dawidek pjdlog_info("Configuration reloaded successfully."); 23390989854dSPawel Jakub Dawidek } 23400989854dSPawel Jakub Dawidek 2341f7fe83f9SPawel Jakub Dawidek static void 2342ff6bb1f8SPawel Jakub Dawidek guard_one(struct hast_resource *res, unsigned int ncomp) 2343ff6bb1f8SPawel Jakub Dawidek { 2344ff6bb1f8SPawel Jakub Dawidek struct proto_conn *in, *out; 2345ff6bb1f8SPawel Jakub Dawidek 2346ff6bb1f8SPawel Jakub Dawidek if (!ISREMOTE(ncomp)) 2347ff6bb1f8SPawel Jakub Dawidek return; 2348ff6bb1f8SPawel Jakub Dawidek 2349ff6bb1f8SPawel Jakub Dawidek rw_rlock(&hio_remote_lock[ncomp]); 2350ff6bb1f8SPawel Jakub Dawidek 2351ff6bb1f8SPawel Jakub Dawidek if (!real_remote(res)) { 2352ff6bb1f8SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 2353ff6bb1f8SPawel Jakub Dawidek return; 2354ff6bb1f8SPawel Jakub Dawidek } 2355ff6bb1f8SPawel Jakub Dawidek 2356ff6bb1f8SPawel Jakub Dawidek if (ISCONNECTED(res, ncomp)) { 23572ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein != NULL); 23582ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout != NULL); 2359ff6bb1f8SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 2360ff6bb1f8SPawel Jakub Dawidek pjdlog_debug(2, "remote_guard: Connection to %s is ok.", 2361ff6bb1f8SPawel Jakub Dawidek res->hr_remoteaddr); 2362ff6bb1f8SPawel Jakub Dawidek return; 2363ff6bb1f8SPawel Jakub Dawidek } 2364ff6bb1f8SPawel Jakub Dawidek 23652ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein == NULL); 23662ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout == NULL); 2367ff6bb1f8SPawel Jakub Dawidek /* 2368ff6bb1f8SPawel Jakub Dawidek * Upgrade the lock. It doesn't have to be atomic as no other thread 2369ff6bb1f8SPawel Jakub Dawidek * can change connection status from disconnected to connected. 2370ff6bb1f8SPawel Jakub Dawidek */ 2371ff6bb1f8SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 2372ff6bb1f8SPawel Jakub Dawidek pjdlog_debug(2, "remote_guard: Reconnecting to %s.", 2373ff6bb1f8SPawel Jakub Dawidek res->hr_remoteaddr); 2374ff6bb1f8SPawel Jakub Dawidek in = out = NULL; 2375ac0401e3SPawel Jakub Dawidek if (init_remote(res, &in, &out) == 0) { 2376ff6bb1f8SPawel Jakub Dawidek rw_wlock(&hio_remote_lock[ncomp]); 23772ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein == NULL); 23782ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout == NULL); 23792ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(in != NULL && out != NULL); 2380ff6bb1f8SPawel Jakub Dawidek res->hr_remotein = in; 2381ff6bb1f8SPawel Jakub Dawidek res->hr_remoteout = out; 2382ff6bb1f8SPawel Jakub Dawidek rw_unlock(&hio_remote_lock[ncomp]); 2383ff6bb1f8SPawel Jakub Dawidek pjdlog_info("Successfully reconnected to %s.", 2384ff6bb1f8SPawel Jakub Dawidek res->hr_remoteaddr); 2385ff6bb1f8SPawel Jakub Dawidek sync_start(); 2386ff6bb1f8SPawel Jakub Dawidek } else { 2387ff6bb1f8SPawel Jakub Dawidek /* Both connections should be NULL. */ 23882ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remotein == NULL); 23892ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(res->hr_remoteout == NULL); 23902ec483c5SPawel Jakub Dawidek PJDLOG_ASSERT(in == NULL && out == NULL); 2391ff6bb1f8SPawel Jakub Dawidek pjdlog_debug(2, "remote_guard: Reconnect to %s failed.", 2392ff6bb1f8SPawel Jakub Dawidek res->hr_remoteaddr); 2393ff6bb1f8SPawel Jakub Dawidek } 2394ff6bb1f8SPawel Jakub Dawidek } 2395ff6bb1f8SPawel Jakub Dawidek 239632115b10SPawel Jakub Dawidek /* 239732115b10SPawel Jakub Dawidek * Thread guards remote connections and reconnects when needed, handles 239832115b10SPawel Jakub Dawidek * signals, etc. 239932115b10SPawel Jakub Dawidek */ 240032115b10SPawel Jakub Dawidek static void * 240132115b10SPawel Jakub Dawidek guard_thread(void *arg) 240232115b10SPawel Jakub Dawidek { 240332115b10SPawel Jakub Dawidek struct hast_resource *res = arg; 240432115b10SPawel Jakub Dawidek unsigned int ii, ncomps; 24056d0c801eSPawel Jakub Dawidek struct timespec timeout; 2406ff6bb1f8SPawel Jakub Dawidek time_t lastcheck, now; 24076d0c801eSPawel Jakub Dawidek sigset_t mask; 24086d0c801eSPawel Jakub Dawidek int signo; 240932115b10SPawel Jakub Dawidek 241032115b10SPawel Jakub Dawidek ncomps = HAST_NCOMPONENTS; 2411ff6bb1f8SPawel Jakub Dawidek lastcheck = time(NULL); 241232115b10SPawel Jakub Dawidek 24136d0c801eSPawel Jakub Dawidek PJDLOG_VERIFY(sigemptyset(&mask) == 0); 24146d0c801eSPawel Jakub Dawidek PJDLOG_VERIFY(sigaddset(&mask, SIGINT) == 0); 24156d0c801eSPawel Jakub Dawidek PJDLOG_VERIFY(sigaddset(&mask, SIGTERM) == 0); 24166d0c801eSPawel Jakub Dawidek 24178d7dcf14SMikolaj Golub timeout.tv_sec = HAST_KEEPALIVE; 24186d0c801eSPawel Jakub Dawidek timeout.tv_nsec = 0; 24196d0c801eSPawel Jakub Dawidek signo = -1; 24206d0c801eSPawel Jakub Dawidek 242132115b10SPawel Jakub Dawidek for (;;) { 24226d0c801eSPawel Jakub Dawidek switch (signo) { 24236d0c801eSPawel Jakub Dawidek case SIGINT: 24246d0c801eSPawel Jakub Dawidek case SIGTERM: 24256d0c801eSPawel Jakub Dawidek sigexit_received = true; 242632115b10SPawel Jakub Dawidek primary_exitx(EX_OK, 242732115b10SPawel Jakub Dawidek "Termination signal received, exiting."); 24286d0c801eSPawel Jakub Dawidek break; 24296d0c801eSPawel Jakub Dawidek default: 2430ff6bb1f8SPawel Jakub Dawidek break; 2431f7fe83f9SPawel Jakub Dawidek } 24326d0c801eSPawel Jakub Dawidek 2433ac0401e3SPawel Jakub Dawidek /* 2434ac0401e3SPawel Jakub Dawidek * Don't check connections until we fully started, 2435ac0401e3SPawel Jakub Dawidek * as we may still be looping, waiting for remote node 2436ac0401e3SPawel Jakub Dawidek * to switch from primary to secondary. 2437ac0401e3SPawel Jakub Dawidek */ 2438ac0401e3SPawel Jakub Dawidek if (fullystarted) { 24396d0c801eSPawel Jakub Dawidek pjdlog_debug(2, "remote_guard: Checking connections."); 2440ff6bb1f8SPawel Jakub Dawidek now = time(NULL); 24418d7dcf14SMikolaj Golub if (lastcheck + HAST_KEEPALIVE <= now) { 24426d0c801eSPawel Jakub Dawidek for (ii = 0; ii < ncomps; ii++) 2443ff6bb1f8SPawel Jakub Dawidek guard_one(res, ii); 2444ff6bb1f8SPawel Jakub Dawidek lastcheck = now; 244532115b10SPawel Jakub Dawidek } 2446ac0401e3SPawel Jakub Dawidek } 24476d0c801eSPawel Jakub Dawidek signo = sigtimedwait(&mask, NULL, &timeout); 244832115b10SPawel Jakub Dawidek } 244932115b10SPawel Jakub Dawidek /* NOTREACHED */ 245032115b10SPawel Jakub Dawidek return (NULL); 245132115b10SPawel Jakub Dawidek } 2452