/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef DS_DDICT
#include "../contract.h"
#endif

#include
#include	/* dtrace is S10 or later */

#include "rdc.h"
#include "rdc_io.h"
#include "rdc_bitmap.h"

/*
 * Remote Dual Copy
 *
 * This file contains the nsctl io provider functionality for RDC.
 *
 * RDC is implemented as a simple filter module that pushes itself between
 * user (SIMCKD, STE, etc.) and SDBC.
 */

static int _rdc_open_count;
int rdc_eio_nobmp = 0;

nsc_io_t *_rdc_io_hc;
static nsc_io_t *_rdc_io_hr;
static nsc_def_t _rdc_fd_def[], _rdc_io_def[], _rdc_ior_def[];

void _rdc_deinit_dev();
int rdc_diskq_enqueue(rdc_k_info_t *, rdc_aio_t *);
extern void rdc_unintercept_diskq(rdc_group_t *);
rdc_aio_t *rdc_aio_tbuf_get(void *, void *, int, int, int, int, int);

static nsc_buf_t *_rdc_alloc_handle(void (*)(), void (*)(),
    void (*)(), rdc_fd_t *);
static int _rdc_free_handle(rdc_buf_t *, rdc_fd_t *);

#ifdef DEBUG
int rdc_overlap_cnt;
int rdc_overlap_hnd_cnt;
#endif

static rdc_info_dev_t *rdc_devices;

extern int _rdc_rsrv_diskq(rdc_group_t *group);
extern void _rdc_rlse_diskq(rdc_group_t *group);

/*
 * _rdc_init_dev
 *	Initialise the io provider.
 */
int
_rdc_init_dev()
{
	_rdc_io_hc = nsc_register_io("rdc-high-cache",
	    NSC_RDCH_ID|NSC_REFCNT|NSC_FILTER, _rdc_io_def);
	if (_rdc_io_hc == NULL)
		cmn_err(CE_WARN, "!rdc: nsc_register_io (high, cache) failed.");

	_rdc_io_hr = nsc_register_io("rdc-high-raw",
	    NSC_RDCHR_ID|NSC_REFCNT|NSC_FILTER, _rdc_ior_def);
	if (_rdc_io_hr == NULL)
		cmn_err(CE_WARN, "!rdc: nsc_register_io (high, raw) failed.");

	if (!_rdc_io_hc || !_rdc_io_hr) {
		_rdc_deinit_dev();
		return (ENOMEM);
	}

	return (0);
}

/*
 * _rdc_deinit_dev
 *	De-initialise the io provider.
 */
void
_rdc_deinit_dev()
{
	int rc;

	if (_rdc_io_hc) {
		if ((rc = nsc_unregister_io(_rdc_io_hc, 0)) != 0)
			cmn_err(CE_WARN,
			    "!rdc: nsc_unregister_io (high, cache) failed: %d",
			    rc);
	}

	if (_rdc_io_hr) {
		if ((rc = nsc_unregister_io(_rdc_io_hr, 0)) != 0)
			cmn_err(CE_WARN,
			    "!rdc: nsc_unregister_io (high, raw) failed: %d",
			    rc);
	}
}

/*
 * rdc_idev_open
 * - Open the nsctl file descriptors for the data devices.
 *
 * Must be called with rdc_conf_lock held.
 * id_sets is protected by rdc_conf_lock.
*/ static rdc_info_dev_t * rdc_idev_open(rdc_k_info_t *krdc, char *pathname, int *rc) { rdc_info_dev_t *dp; ASSERT(MUTEX_HELD(&rdc_conf_lock)); for (dp = rdc_devices; dp; dp = dp->id_next) { if (dp->id_cache_dev.bi_fd && strcmp(pathname, nsc_pathname(dp->id_cache_dev.bi_fd)) == 0) break; } if (!dp) { dp = kmem_zalloc(sizeof (*dp), KM_SLEEP); if (!dp) return (NULL); dp->id_cache_dev.bi_krdc = krdc; dp->id_cache_dev.bi_fd = nsc_open(pathname, NSC_RDCHR_ID|NSC_RDWR|NSC_DEVICE, _rdc_fd_def, (blind_t)&dp->id_cache_dev, rc); if (!dp->id_cache_dev.bi_fd) { kmem_free(dp, sizeof (*dp)); return (NULL); } dp->id_raw_dev.bi_krdc = krdc; dp->id_raw_dev.bi_fd = nsc_open(pathname, NSC_RDCHR_ID|NSC_RDWR|NSC_DEVICE, _rdc_fd_def, (blind_t)&dp->id_raw_dev, rc); if (!dp->id_raw_dev.bi_fd) { (void) nsc_close(dp->id_cache_dev.bi_fd); kmem_free(dp, sizeof (*dp)); return (NULL); } mutex_init(&dp->id_rlock, NULL, MUTEX_DRIVER, NULL); cv_init(&dp->id_rcv, NULL, CV_DRIVER, NULL); dp->id_next = rdc_devices; rdc_devices = dp; } dp->id_sets++; return (dp); } /* * rdc_idev_close * - Close the nsctl file descriptors for the data devices. * * Must be called with rdc_conf_lock and dp->id_rlock held. * Will release dp->id_rlock before returning. * * id_sets is protected by rdc_conf_lock. */ static void rdc_idev_close(rdc_k_info_t *krdc, rdc_info_dev_t *dp) { rdc_info_dev_t **dpp; #ifdef DEBUG int count = 0; #endif ASSERT(MUTEX_HELD(&rdc_conf_lock)); ASSERT(MUTEX_HELD(&dp->id_rlock)); dp->id_sets--; if (dp->id_sets > 0) { mutex_exit(&dp->id_rlock); return; } /* external references must have gone */ ASSERT((krdc->c_ref + krdc->r_ref + krdc->b_ref) == 0); /* unlink from chain */ for (dpp = &rdc_devices; *dpp; dpp = &((*dpp)->id_next)) { if (*dpp == dp) { /* unlink */ *dpp = dp->id_next; break; } } /* * Wait for all reserves to go away - the rpc server is * running asynchronously with this close, and so we * have to wait for it to spot that the krdc is !IS_ENABLED() * and throw away the nsc_buf_t's that it has allocated * and release the device. */ while (IS_CRSRV(krdc) || IS_RRSRV(krdc)) { #ifdef DEBUG if (!(++count % 16)) { cmn_err(CE_NOTE, "!_rdc_idev_close(%s): waiting for nsc_release", rdc_u_info[krdc->index].primary.file); } if (count > (16*20)) { /* waited for 20 seconds - too long - panic */ cmn_err(CE_PANIC, "!_rdc_idev_close(%s, %p): lost nsc_release", rdc_u_info[krdc->index].primary.file, (void *)krdc); } #endif mutex_exit(&dp->id_rlock); delay(HZ>>4); mutex_enter(&dp->id_rlock); } if (dp->id_cache_dev.bi_fd) { (void) nsc_close(dp->id_cache_dev.bi_fd); dp->id_cache_dev.bi_fd = NULL; } if (dp->id_raw_dev.bi_fd) { (void) nsc_close(dp->id_raw_dev.bi_fd); dp->id_raw_dev.bi_fd = NULL; } mutex_exit(&dp->id_rlock); mutex_destroy(&dp->id_rlock); cv_destroy(&dp->id_rcv); kmem_free(dp, sizeof (*dp)); } /* * This function provokes an nsc_reserve() for the device which * if successful will populate krdc->maxfbas and urdc->volume_size * via the _rdc_attach_fd() callback. */ void rdc_get_details(rdc_k_info_t *krdc) { int rc; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; nsc_size_t vol_size, maxfbas; if (_rdc_rsrv_devs(krdc, RDC_RAW, RDC_INTERNAL) == 0) { /* * if the vol is already reserved, * volume_size won't be populated on enable because * it is a *fake* reserve and does not make it to * _rdc_attach_fd(). So do it here. 
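 * (When the device is already reserved, _rdc_rsrv_d() only bumps the
 * in-use count and never reaches the underlying nsc_reserve(), so the
 * "Attach" callback that normally records these values is not invoked.)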
*/ rc = nsc_partsize(RDC_U_FD(krdc), &vol_size); if (rc != 0) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_get_details: partsize failed (%d)", rc); #endif /* DEBUG */ urdc->volume_size = vol_size = 0; } urdc->volume_size = vol_size; rc = nsc_maxfbas(RDC_U_FD(krdc), 0, &maxfbas); if (rc != 0) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_get_details: maxfbas failed (%d)", rc); #endif /* DEBUG */ maxfbas = 0; } krdc->maxfbas = min(RDC_MAX_MAXFBAS, maxfbas); _rdc_rlse_devs(krdc, RDC_RAW); } } /* * Should only be used by the config code. */ int rdc_dev_open(rdc_set_t *rdc_set, int options) { rdc_k_info_t *krdc; int index; int rc; char *pathname; ASSERT(MUTEX_HELD(&rdc_conf_lock)); if (options & RDC_OPT_PRIMARY) pathname = rdc_set->primary.file; else pathname = rdc_set->secondary.file; for (index = 0; index < rdc_max_sets; index++) { krdc = &rdc_k_info[index]; if (!IS_CONFIGURED(krdc)) break; } if (index == rdc_max_sets) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_dev_open: out of cd\'s"); #endif index = -EINVAL; goto out; } if (krdc->devices && (krdc->c_fd || krdc->r_fd)) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_dev_open: %s already open", pathname); #endif index = -EINVAL; goto out; } _rdc_open_count++; krdc->devices = rdc_idev_open(krdc, pathname, &rc); if (!krdc->devices) { index = -rc; goto open_fail; } /* * Grab the device size and maxfbas now. */ rdc_get_details(krdc); out: return (index); open_fail: _rdc_open_count--; return (index); } void rdc_dev_close(rdc_k_info_t *krdc) { rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; mutex_enter(&rdc_conf_lock); if (krdc->devices) mutex_enter(&krdc->devices->id_rlock); #ifdef DEBUG if (!krdc->devices || !krdc->c_fd || !krdc->r_fd) { cmn_err(CE_WARN, "!rdc_dev_close(%p): c_fd %p r_fd %p", (void *)krdc, (void *) (krdc->devices ? krdc->c_fd : 0), (void *) (krdc->devices ? krdc->r_fd : 0)); } #endif if (krdc->devices) { /* rdc_idev_close will release id_rlock */ rdc_idev_close(krdc, krdc->devices); krdc->devices = NULL; } urdc->primary.file[0] = '\0'; if (_rdc_open_count <= 0) { cmn_err(CE_WARN, "!rdc: _rdc_open_count corrupt: %d", _rdc_open_count); } _rdc_open_count--; mutex_exit(&rdc_conf_lock); } /* * rdc_intercept * * Register for IO on this device with nsctl. * * For a 1-to-many primary we register for each krdc and let nsctl sort * out which it wants to be using. This means that we cannot tell which * krdc will receive the incoming io from nsctl, though we do know that * at any one time only one krdc will be 'attached' and so get io from * nsctl. * * So the krdc->many_next pointer is maintained as a circular list. The * result of these multiple nsc_register_paths is that we will see a * few more attach and detach io provider calls during enable/resume * and disable/suspend of the 1-to-many whilst nsctl settles down to * using a single krdc. * * The major advantage of this scheme is that nsctl sorts out all the * rdc_fd_t's so that they can only point to krdc's that are currently * active. 
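 *
 * For illustration only -- a simplified, hedged sketch, not code the
 * driver runs: the many_next pointers form a circular list, so the other
 * sets of a 1-to-many configuration are visited with a loop of the form
 *
 *	rdc_k_info_t *start = krdc;
 *	do {
 *		krdc = krdc->many_next;
 *		if (IS_ENABLED(&rdc_u_info[krdc->index]))
 *			... act on this set ...
 *	} while (krdc != start);
 *
 * The real walkers later in this file (_rdc_remote_read, _rdc_multi_write,
 * _rdc_write, _rdc_zero) follow this shape, but also bracket the traversal
 * with rdc_many_enter()/rdc_many_exit().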
*/ int rdc_intercept(rdc_k_info_t *krdc) { rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; char *pathname; char *bitmap; if (rdc_get_vflags(urdc) & RDC_PRIMARY) { pathname = urdc->primary.file; bitmap = urdc->primary.bitmap; } else { pathname = urdc->secondary.file; bitmap = urdc->secondary.bitmap; } if (!krdc->b_tok) krdc->b_tok = nsc_register_path(bitmap, NSC_CACHE | NSC_DEVICE, _rdc_io_hc); if (!krdc->c_tok) krdc->c_tok = nsc_register_path(pathname, NSC_CACHE, _rdc_io_hc); if (!krdc->r_tok) krdc->r_tok = nsc_register_path(pathname, NSC_DEVICE, _rdc_io_hr); if (!krdc->c_tok || !krdc->r_tok) { (void) rdc_unintercept(krdc); return (ENXIO); } return (0); } static void wait_unregistering(rdc_k_info_t *krdc) { while (krdc->group->unregistering > 0) (void) cv_wait_sig(&krdc->group->unregistercv, &rdc_conf_lock); } static void set_unregistering(rdc_k_info_t *krdc) { wait_unregistering(krdc); krdc->group->unregistering++; } static void wakeup_unregistering(rdc_k_info_t *krdc) { if (krdc->group->unregistering <= 0) return; krdc->group->unregistering--; cv_broadcast(&krdc->group->unregistercv); } /* * rdc_unintercept * * Unregister for IO on this device. * * See comments above rdc_intercept. */ int rdc_unintercept(rdc_k_info_t *krdc) { int err = 0; int rc; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; mutex_enter(&rdc_conf_lock); set_unregistering(krdc); krdc->type_flag |= RDC_UNREGISTER; mutex_exit(&rdc_conf_lock); if (krdc->r_tok) { rc = nsc_unregister_path(krdc->r_tok, 0); if (rc) { cmn_err(CE_WARN, "!rdc: unregister rawfd %d", rc); err = rc; } krdc->r_tok = NULL; } if (krdc->c_tok) { rc = nsc_unregister_path(krdc->c_tok, 0); if (rc) { cmn_err(CE_WARN, "!rdc: unregister cachefd %d", rc); if (!err) err = rc; } krdc->c_tok = NULL; } if (krdc->b_tok) { rc = nsc_unregister_path(krdc->b_tok, 0); if (rc) { cmn_err(CE_WARN, "!rdc: unregister bitmap %d", rc); err = rc; } krdc->b_tok = NULL; } rdc_group_enter(krdc); /* Wait for all necessary _rdc_close() calls to complete */ while ((krdc->c_ref + krdc->r_ref + krdc->b_ref) != 0) { krdc->closing++; cv_wait(&krdc->closingcv, &krdc->group->lock); krdc->closing--; } rdc_clr_flags(urdc, RDC_ENABLED); rdc_group_exit(krdc); /* * Check there are no outstanding writes in progress. * This can happen when a set is being disabled which * is one of the 'one_to_many' chain, that did not * intercept the original write call. */ for (;;) { rdc_group_enter(krdc); if (krdc->aux_state & RDC_AUXWRITE) { rdc_group_exit(krdc); /* * This doesn't happen very often, * just delay a bit and re-look. */ delay(50); } else { rdc_group_exit(krdc); break; } } mutex_enter(&rdc_conf_lock); krdc->type_flag &= ~RDC_UNREGISTER; wakeup_unregistering(krdc); mutex_exit(&rdc_conf_lock); return (err); } /* * _rdc_rlse_d * Internal version of _rdc_rlse_devs(), only concerned with the * data device, not the bitmap. 
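 *
 * The release is accounted against whichever of the cache or raw
 * _rdc_info_dev_t currently holds the reserve: a raw release satisfied by
 * the cache-side reserve (or vice versa) decrements the "other" counters
 * (bi_orsrv/bi_ofailed), mirroring the bookkeeping done in _rdc_rsrv_d().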
*/ static void _rdc_rlse_d(rdc_k_info_t *krdc, int devs) { _rdc_info_dev_t *cip; _rdc_info_dev_t *rip; int raw = (devs & RDC_RAW); if (!krdc) { cmn_err(CE_WARN, "!rdc: _rdc_rlse_devs null krdc"); return; } ASSERT((devs & (~RDC_BMP)) != 0); cip = &krdc->devices->id_cache_dev; rip = &krdc->devices->id_raw_dev; if (IS_RSRV(cip)) { /* decrement count */ if (raw) { if (cip->bi_ofailed > 0) { cip->bi_ofailed--; } else if (cip->bi_orsrv > 0) { cip->bi_orsrv--; } } else { if (cip->bi_failed > 0) { cip->bi_failed--; } else if (cip->bi_rsrv > 0) { cip->bi_rsrv--; } } /* * reset nsc_fd ownership back link, it is only set if * we have really done an underlying reserve, not for * failed (faked) reserves. */ if (cip->bi_rsrv > 0 || cip->bi_orsrv > 0) { nsc_set_owner(cip->bi_fd, krdc->iodev); } else { nsc_set_owner(cip->bi_fd, NULL); } /* release nsc_fd */ if (!IS_RSRV(cip)) { nsc_release(cip->bi_fd); } } else if (IS_RSRV(rip)) { /* decrement count */ if (raw) { if (rip->bi_failed > 0) { rip->bi_failed--; } else if (rip->bi_rsrv > 0) { rip->bi_rsrv--; } } else { if (rip->bi_ofailed > 0) { rip->bi_ofailed--; } else if (rip->bi_orsrv > 0) { rip->bi_orsrv--; } } /* * reset nsc_fd ownership back link, it is only set if * we have really done an underlying reserve, not for * failed (faked) reserves. */ if (rip->bi_rsrv > 0 || rip->bi_orsrv > 0) { nsc_set_owner(rip->bi_fd, krdc->iodev); } else { nsc_set_owner(rip->bi_fd, NULL); } /* release nsc_fd and any waiters */ if (!IS_RSRV(rip)) { rip->bi_flag = 0; nsc_release(rip->bi_fd); cv_broadcast(&krdc->devices->id_rcv); } } else { cmn_err(CE_WARN, "!rdc: _rdc_rlse_devs no reserve? krdc %p", (void *) krdc); } } /* * _rdc_rlse_devs * Release named underlying devices and take care of setting the * back link on the nsc_fd to the correct parent iodev. * * NOTE: the 'devs' argument must be the same as that passed to * the preceding _rdc_rsrv_devs call. */ void _rdc_rlse_devs(rdc_k_info_t *krdc, int devs) { DTRACE_PROBE(_rdc_rlse_devs_start); mutex_enter(&krdc->devices->id_rlock); ASSERT(!(devs & RDC_CACHE)); if ((devs & (~RDC_BMP)) != 0) { _rdc_rlse_d(krdc, devs); } if ((devs & RDC_BMP) != 0) { if (krdc->bmaprsrv > 0 && --krdc->bmaprsrv == 0) { nsc_release(krdc->bitmapfd); } } mutex_exit(&krdc->devices->id_rlock); } /* * _rdc_rsrv_d * Reserve device flagged, unless its companion is already reserved, * in that case increase the reserve on the companion. Take care * of setting the nsc_fd ownership back link to the correct parent * iodev pointer. */ static int _rdc_rsrv_d(int raw, _rdc_info_dev_t *rid, _rdc_info_dev_t *cid, int flag, rdc_k_info_t *krdc) { _rdc_info_dev_t *p = NULL; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; int other = 0; int rc; #ifdef DEBUG if ((rid->bi_rsrv < 0) || (cid->bi_rsrv < 0) || (rid->bi_orsrv < 0) || (cid->bi_orsrv < 0) || (rid->bi_failed < 0) || (cid->bi_failed < 0) || (rid->bi_ofailed < 0) || (cid->bi_ofailed < 0)) { cmn_err(CE_WARN, "!_rdc_rsrv_d: negative counts (rsrv %d %d orsrv %d %d)", rid->bi_rsrv, cid->bi_rsrv, rid->bi_orsrv, cid->bi_orsrv); cmn_err(CE_WARN, "!_rdc_rsrv_d: negative counts (fail %d %d ofail %d %d)", rid->bi_failed, cid->bi_failed, rid->bi_ofailed, cid->bi_ofailed); cmn_err(CE_PANIC, "_rdc_rsrv_d: negative counts (krdc %p)", (void *) krdc); } #endif /* * If user wants to do a cache reserve and it's already * raw reserved internally, we need to do a real nsc_reserve, so wait * until the release has been done. 
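 * The wait is on id_rcv, which _rdc_rlse_d() broadcasts once the raw
 * reserve has drained and the underlying nsc_release() has been issued.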
*/ if (IS_RSRV(rid) && (flag == RDC_EXTERNAL) && (raw == 0) && (rid->bi_flag != RDC_EXTERNAL)) { krdc->devices->id_release++; while (IS_RSRV(rid)) cv_wait(&krdc->devices->id_rcv, &krdc->devices->id_rlock); krdc->devices->id_release--; } /* select underlying device to use */ if (IS_RSRV(rid)) { p = rid; if (!raw) { other = 1; } } else if (IS_RSRV(cid)) { p = cid; if (raw) { other = 1; } } /* just increment count and return if already reserved */ if (p && !RFAILED(p)) { if (other) { p->bi_orsrv++; } else { p->bi_rsrv++; } /* set nsc_fd ownership back link */ nsc_set_owner(p->bi_fd, krdc->iodev); return (0); } /* attempt reserve */ if (!p) { p = raw ? rid : cid; } if (!p->bi_fd) { /* rpc server raced with rdc_dev_close() */ return (EIO); } if ((rc = nsc_reserve(p->bi_fd, 0)) == 0) { /* * convert failed counts into reserved counts, and add * in this reserve. */ p->bi_orsrv = p->bi_ofailed; p->bi_rsrv = p->bi_failed; if (other) { p->bi_orsrv++; } else { p->bi_rsrv++; } p->bi_ofailed = 0; p->bi_failed = 0; /* set nsc_fd ownership back link */ nsc_set_owner(p->bi_fd, krdc->iodev); } else if (rc != EINTR) { /* * If this is the master, and the secondary is not * failed, then just fake this external reserve so that * we can do remote io to the secondary and continue to * provide service to the client. * * Subsequent calls to _rdc_rsrv_d() will re-try the * nsc_reserve() until it succeeds. */ if ((rdc_get_vflags(urdc) & RDC_PRIMARY) && !(rdc_get_vflags(urdc) & RDC_LOGGING) && !((rdc_get_vflags(urdc) & RDC_SLAVE) && (rdc_get_vflags(urdc) & RDC_SYNCING))) { if (!(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) { rdc_many_enter(krdc); /* Primary, so reverse sync needed */ rdc_set_mflags(urdc, RDC_RSYNC_NEEDED); rdc_set_flags_log(urdc, RDC_VOL_FAILED, "nsc_reserve failed"); rdc_many_exit(krdc); rc = -1; #ifdef DEBUG cmn_err(CE_NOTE, "!nsc_reserve failed " "with rc == %d\n", rc); #endif } else { rc = 0; } if (other) { p->bi_ofailed++; } else { p->bi_failed++; } if (krdc->maxfbas == 0) { /* * fake a maxfbas value for remote i/o, * this will get reset when the next * successful reserve happens as part * of the rdc_attach_fd() callback. */ krdc->maxfbas = 128; } } } if (rc == 0 && raw) { p->bi_flag = flag; } return (rc); } /* * _rdc_rsrv_devs * Reserve named underlying devices. 
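 *
 * A hedged usage sketch (the rdc_get_details() pattern above); the same
 * 'devs' mask must be passed to the matching release:
 *
 *	if (_rdc_rsrv_devs(krdc, RDC_RAW, RDC_INTERNAL) == 0) {
 *		... io against RDC_U_FD(krdc) ...
 *		_rdc_rlse_devs(krdc, RDC_RAW);
 *	}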
* */ int _rdc_rsrv_devs(rdc_k_info_t *krdc, int devs, int flag) { rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; int write = 0; int rc = 0; int got = 0; if (!krdc) { return (EINVAL); } ASSERT(!(devs & RDC_CACHE)); mutex_enter(&krdc->devices->id_rlock); if ((devs & (~RDC_BMP)) != 0) { if ((rc = _rdc_rsrv_d((devs & RDC_CACHE) == 0, &krdc->devices->id_raw_dev, &krdc->devices->id_cache_dev, flag, krdc)) != 0) { if (rc == -1) { /* * we need to call rdc_write_state() * after we drop the mutex */ write = 1; rc = 0; } else { cmn_err(CE_WARN, "!rdc: nsc_reserve(%s) failed %d\n", nsc_pathname(krdc->c_fd), rc); } } else { got |= (devs & (~RDC_BMP)); } } if (rc == 0 && (devs & RDC_BMP) != 0) { if (krdc->bitmapfd == NULL) rc = EIO; else if ((krdc->bmaprsrv == 0) && (rc = nsc_reserve(krdc->bitmapfd, 0)) != 0) { cmn_err(CE_WARN, "!rdc: nsc_reserve(%s) failed %d\n", nsc_pathname(krdc->bitmapfd), rc); } else { krdc->bmaprsrv++; got |= RDC_BMP; } if (!RDC_SUCCESS(rc)) { /* Undo any previous reserve */ if (got != 0) _rdc_rlse_d(krdc, got); } } mutex_exit(&krdc->devices->id_rlock); if (write) { rdc_write_state(urdc); } return (rc); } /* * Read from the remote end, ensuring that if this is a many group in * slave mode that we only remote read from the secondary with the * valid data. */ int _rdc_remote_read(rdc_k_info_t *krdc, nsc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag) { rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; rdc_k_info_t *this = krdc; /* krdc that was requested */ int rc; if (flag & NSC_RDAHEAD) { /* * no point in doing readahead remotely, * just say we did it ok - the client is about to * throw this buffer away as soon as we return. */ return (NSC_DONE); } /* * If this is a many group with a reverse sync in progress and * this is not the slave krdc/urdc, then search for the slave * so that we can do the remote io from the correct secondary. */ if ((rdc_get_mflags(urdc) & RDC_SLAVE) && !(rdc_get_vflags(urdc) & RDC_SLAVE)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; if (rdc_get_vflags(urdc) & RDC_SLAVE) break; } rdc_many_exit(krdc); this = krdc; } read1: if (rdc_get_vflags(urdc) & RDC_LOGGING) { /* cannot do remote io without the remote node! */ rc = ENETDOWN; goto read2; } /* wait for the remote end to have the latest data */ if (IS_ASYNC(urdc)) { while (krdc->group->ra_queue.blocks != 0) { if (!krdc->group->rdc_writer) (void) rdc_writer(krdc->index); (void) rdc_drain_queue(krdc->index); } } if (krdc->io_kstats) { mutex_enter(krdc->io_kstats->ks_lock); kstat_runq_enter(KSTAT_IO_PTR(krdc->io_kstats)); mutex_exit(krdc->io_kstats->ks_lock); } rc = rdc_net_read(krdc->index, krdc->remote_index, h, pos, len); if (krdc->io_kstats) { mutex_enter(krdc->io_kstats->ks_lock); kstat_runq_exit(KSTAT_IO_PTR(krdc->io_kstats)); mutex_exit(krdc->io_kstats->ks_lock); } /* If read error keep trying every secondary until no more */ read2: if (!RDC_SUCCESS(rc) && IS_MANY(krdc) && !(rdc_get_mflags(urdc) & RDC_SLAVE)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; rdc_many_exit(krdc); goto read1; } rdc_many_exit(krdc); } return (rc); } /* * _rdc_alloc_buf * Allocate a buffer of data * * Calling/Exit State: * Returns NSC_DONE or NSC_HIT for success, NSC_PENDING for async * I/O, > 0 is an error code. 
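 *
 *	For illustration only -- a hedged sketch of how an nsctl client
 *	above this filter drives the entry point (names are local to the
 *	sketch; NSC_PENDING completion handling is omitted for brevity):
 *
 *		nsc_buf_t *bufp = NULL;
 *		int rc;
 *
 *		rc = nsc_alloc_buf(fd, pos, len, NSC_WRBUF, &bufp);
 *		if (RDC_SUCCESS(rc)) {
 *			(void) nsc_write(bufp, pos, len, 0);
 *			(void) nsc_free_buf(bufp);
 *		}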
* * Description: */ int rdcbufs = 0; static int _rdc_alloc_buf(rdc_fd_t *rfd, nsc_off_t pos, nsc_size_t len, int flag, rdc_buf_t **ptr) { rdc_k_info_t *krdc = rfd->rdc_info; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; nsc_vec_t *vec = NULL; rdc_buf_t *h; size_t size; int ioflag; int rc = 0; if (RDC_IS_BMP(rfd) || RDC_IS_QUE(rfd)) return (EIO); if (len == 0) return (EINVAL); if (flag & NSC_WRBUF) { if (!(rdc_get_vflags(urdc) & RDC_PRIMARY) && !(rdc_get_vflags(urdc) & RDC_LOGGING)) { /* * Forbid writes to secondary unless logging. */ return (EIO); } } if (!(rdc_get_vflags(urdc) & RDC_PRIMARY) && (rdc_get_vflags(urdc) & RDC_SYNC_NEEDED)) { /* * Forbid any io to secondary if it needs a sync. */ return (EIO); } if ((rdc_get_vflags(urdc) & RDC_PRIMARY) && (rdc_get_vflags(urdc) & RDC_RSYNC_NEEDED) && !(rdc_get_vflags(urdc) & RDC_VOL_FAILED) && !(rdc_get_vflags(urdc) & RDC_SLAVE)) { /* * Forbid any io to primary if it needs a reverse sync * and is not actively syncing. */ return (EIO); } /* Bounds checking */ ASSERT(urdc->volume_size != 0); if (pos + len > urdc->volume_size) { #ifdef DEBUG cmn_err(CE_NOTE, "!rdc: Attempt to access beyond end of rdc volume"); #endif return (EIO); } h = *ptr; if (h == NULL) { /* should never happen (nsctl does this for us) */ #ifdef DEBUG cmn_err(CE_WARN, "!_rdc_alloc_buf entered without buffer!"); #endif h = (rdc_buf_t *)_rdc_alloc_handle(NULL, NULL, NULL, rfd); if (h == NULL) return (ENOMEM); h->rdc_bufh.sb_flag &= ~NSC_HALLOCATED; *ptr = h; } if (flag & NSC_NOBLOCK) { cmn_err(CE_WARN, "!_rdc_alloc_buf: removing unsupported NSC_NOBLOCK flag"); flag &= ~(NSC_NOBLOCK); } h->rdc_bufh.sb_error = 0; h->rdc_bufh.sb_flag |= flag; h->rdc_bufh.sb_pos = pos; h->rdc_bufh.sb_len = len; ioflag = flag; bzero(&h->rdc_sync, sizeof (h->rdc_sync)); mutex_init(&h->rdc_sync.lock, NULL, MUTEX_DRIVER, NULL); cv_init(&h->rdc_sync.cv, NULL, CV_DRIVER, NULL); if (flag & NSC_WRBUF) _rdc_async_throttle(krdc, len); /* throttle incoming io */ /* * Use remote io when: * - local volume is failed * - reserve status is failed */ if ((rdc_get_vflags(urdc) & RDC_VOL_FAILED) || IS_RFAILED(krdc)) { rc = EIO; } else { rc = nsc_alloc_buf(RDC_U_FD(krdc), pos, len, ioflag, &h->rdc_bufp); if (!RDC_SUCCESS(rc)) { rdc_many_enter(krdc); if (rdc_get_vflags(urdc) & RDC_PRIMARY) { /* Primary, so reverse sync needed */ rdc_set_mflags(urdc, RDC_RSYNC_NEEDED); } else { /* Secondary, so forward sync needed */ rdc_set_flags(urdc, RDC_SYNC_NEEDED); } rdc_set_flags_log(urdc, RDC_VOL_FAILED, "nsc_alloc_buf failed"); rdc_many_exit(krdc); rdc_write_state(urdc); } } if (RDC_SUCCESS(rc)) { h->rdc_bufh.sb_vec = h->rdc_bufp->sb_vec; h->rdc_flags |= RDC_ALLOC; /* * If in slave and reading data, remote read on top of * the buffer to ensure that we have the latest data. */ if ((flag & NSC_READ) && (rdc_get_vflags(urdc) & RDC_PRIMARY) && (rdc_get_mflags(urdc) & RDC_SLAVE)) { rc = _rdc_remote_read(krdc, &h->rdc_bufh, pos, len, flag); /* * Set NSC_MIXED so that the * cache will throw away this buffer when we free * it since we have combined data from multiple * sources into a single buffer. 
*/ h->rdc_bufp->sb_flag |= NSC_MIXED; } } /* * If nsc_alloc_buf above fails, or local volume is failed or * bitmap is failed or reserve, then we fill the buf from remote */ if ((!RDC_SUCCESS(rc)) && (rdc_get_vflags(urdc) & RDC_PRIMARY) && !(rdc_get_vflags(urdc) & RDC_LOGGING)) { if (flag & NSC_NODATA) { ASSERT(!(flag & NSC_READ)); h->rdc_flags |= RDC_REMOTE_BUF; h->rdc_bufh.sb_vec = NULL; } else { size = sizeof (nsc_vec_t) * 2; h->rdc_vsize = size + FBA_SIZE(len); vec = kmem_zalloc(h->rdc_vsize, KM_SLEEP); if (!vec) { rc = ENOMEM; goto error; } /* single flat buffer */ vec[0].sv_addr = (uchar_t *)vec + size; vec[0].sv_len = FBA_SIZE(len); vec[0].sv_vme = 0; /* null terminator */ vec[1].sv_addr = NULL; vec[1].sv_len = 0; vec[1].sv_vme = 0; h->rdc_bufh.sb_vec = vec; h->rdc_flags |= RDC_REMOTE_BUF; h->rdc_flags |= RDC_VEC_ALLOC; } if (flag & NSC_READ) { rc = _rdc_remote_read(krdc, &h->rdc_bufh, pos, len, flag); } else { rc = NSC_DONE; } } error: if (!RDC_SUCCESS(rc)) { h->rdc_bufh.sb_error = rc; } return (rc); } /* * _rdc_free_buf */ static int _rdc_free_buf(rdc_buf_t *h) { int rc = 0; if (h->rdc_flags & RDC_ALLOC) { if (h->rdc_bufp) { rc = nsc_free_buf(h->rdc_bufp); } h->rdc_flags &= ~(RDC_ALLOC); if (!RDC_SUCCESS(rc)) { #ifdef DEBUG cmn_err(CE_WARN, "!_rdc_free_buf(%p): nsc_free_buf(%p) returned %d", (void *) h, (void *) h->rdc_bufp, rc); #endif return (rc); } } if (h->rdc_flags & (RDC_REMOTE_BUF|RDC_VEC_ALLOC)) { if (h->rdc_flags & RDC_VEC_ALLOC) { kmem_free(h->rdc_bufh.sb_vec, h->rdc_vsize); } h->rdc_flags &= ~(RDC_REMOTE_BUF|RDC_VEC_ALLOC); } if (h->rdc_anon) { /* anon buffers still pending */ DTRACE_PROBE1(rdc_free_buf_err, aio_buf_t, h->rdc_anon); } if ((h->rdc_bufh.sb_flag & NSC_HALLOCATED) == 0) { rc = _rdc_free_handle(h, h->rdc_fd); if (!RDC_SUCCESS(rc)) { #ifdef DEBUG cmn_err(CE_WARN, "!_rdc_free_buf(%p): _rdc_free_handle returned %d", (void *) h, rc); #endif return (rc); } } else { h->rdc_bufh.sb_flag = NSC_HALLOCATED; h->rdc_bufh.sb_vec = NULL; h->rdc_bufh.sb_error = 0; h->rdc_bufh.sb_pos = 0; h->rdc_bufh.sb_len = 0; h->rdc_anon = NULL; h->rdc_vsize = 0; cv_destroy(&h->rdc_sync.cv); mutex_destroy(&h->rdc_sync.lock); } return (0); } /* * _rdc_open * Open a device * * Calling/Exit State: * Returns a token to identify the device. * * Description: * Performs the housekeeping operations associated with an upper layer * of the nsctl stack opening a device. */ /* ARGSUSED */ static int _rdc_open(char *path, int flag, blind_t *cdp, nsc_iodev_t *iodev) { rdc_k_info_t *krdc; #ifdef DEBUG rdc_u_info_t *urdc; #endif rdc_fd_t *rfd; int raw = ((flag & NSC_CACHE) == 0); int index; int bmp = 0; int queue = 0; rfd = kmem_zalloc(sizeof (*rfd), KM_SLEEP); if (!rfd) return (ENOMEM); /* * Take config lock to prevent a race with the * (de)configuration code. 
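 *
 * The path is then classified in order: data device of an enabled set,
 * then bitmap, then disk queue.  The classification selects which of the
 * b_ref/r_ref/c_ref open counts is bumped (disk queue opens are not
 * counted); rdc_unintercept() waits for those counts to drain before a
 * set is torn down.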
*/ mutex_enter(&rdc_conf_lock); index = rdc_lookup_enabled(path, 0); if (index < 0) { index = rdc_lookup_bitmap(path); if (index >= 0) bmp = 1; } if (index < 0) { index = rdc_lookup_diskq(path); if (index >= 0) queue = 1; } if (index < 0) { /* not found in config */ mutex_exit(&rdc_conf_lock); kmem_free(rfd, sizeof (*rfd)); return (ENXIO); } #ifdef DEBUG urdc = &rdc_u_info[index]; #endif krdc = &rdc_k_info[index]; mutex_exit(&rdc_conf_lock); rdc_group_enter(krdc); ASSERT(IS_ENABLED(urdc)); if (bmp) { krdc->b_ref++; } else if (raw) { krdc->r_ref++; } else if (!queue) { krdc->c_ref++; } rfd->rdc_info = krdc; if (bmp) rfd->rdc_type = RDC_BMP; else if (queue) rfd->rdc_type = RDC_QUE; else rfd->rdc_oflags = flag; rdc_group_exit(krdc); *cdp = (blind_t)rfd; return (0); } static int _rdc_openc(char *path, int flag, blind_t *cdp, nsc_iodev_t *iodev) { return (_rdc_open(path, NSC_CACHE|flag, cdp, iodev)); } static int _rdc_openr(char *path, int flag, blind_t *cdp, nsc_iodev_t *iodev) { return (_rdc_open(path, NSC_DEVICE|flag, cdp, iodev)); } /* * _rdc_close * Close a device * * Calling/Exit State: * Always succeeds - returns 0 * * Description: * Performs the housekeeping operations associated with an upper layer * of the sd stack closing a shadowed device. */ static int _rdc_close(rfd) rdc_fd_t *rfd; { rdc_k_info_t *krdc = rfd->rdc_info; int bmp = RDC_IS_BMP(rfd); int raw = RDC_IS_RAW(rfd); int queue = RDC_IS_QUE(rfd); /* * we don't keep ref counts for the queue, so skip this stuff. * we may not even have a valid krdc at this point */ if (queue) goto queue; rdc_group_enter(krdc); if (bmp) { krdc->b_ref--; } else if (raw && !queue) { krdc->r_ref--; } else if (!queue) { krdc->c_ref--; } if (krdc->closing) { cv_broadcast(&krdc->closingcv); } rdc_group_exit(krdc); queue: kmem_free(rfd, sizeof (*rfd)); return (0); } /* * _rdc_alloc_handle * Allocate a handle * */ static nsc_buf_t * _rdc_alloc_handle(void (*d_cb)(), void (*r_cb)(), void (*w_cb)(), rdc_fd_t *rfd) { rdc_buf_t *h; h = kmem_zalloc(sizeof (*h), KM_SLEEP); if (!h) return (NULL); h->rdc_bufp = nsc_alloc_handle(RDC_FD(rfd), d_cb, r_cb, w_cb); if (!h->rdc_bufp) { if (!IS_RFAILED(rfd->rdc_info)) { /* * This is a real failure from the io provider below. */ kmem_free(h, sizeof (*h)); return (NULL); } else { /* EMPTY */ /* * This is just a failed primary device where * we can do remote io to the secondary. */ } } h->rdc_bufh.sb_flag = NSC_HALLOCATED; h->rdc_fd = rfd; mutex_init(&h->aio_lock, NULL, MUTEX_DRIVER, NULL); return (&h->rdc_bufh); } /* * _rdc_free_handle * Free a handle * */ /* ARGSUSED */ static int _rdc_free_handle(rdc_buf_t *h, rdc_fd_t *rfd) { int rc; mutex_destroy(&h->aio_lock); if (h->rdc_bufp) { rc = nsc_free_handle(h->rdc_bufp); if (!RDC_SUCCESS(rc)) return (rc); } kmem_free(h, sizeof (rdc_buf_t)); return (0); } /* * _rdc_attach * Attach * * Calling/Exit State: * Returns 0 for success, errno on failure. * * Description: */ static int _rdc_attach(rdc_fd_t *rfd, nsc_iodev_t *iodev) { rdc_k_info_t *krdc; int raw = RDC_IS_RAW(rfd); int rc; if ((RDC_IS_BMP(rfd)) || RDC_IS_QUE(rfd)) return (EINVAL); krdc = rfd->rdc_info; if (krdc == NULL) return (EINVAL); mutex_enter(&krdc->devices->id_rlock); krdc->iodev = iodev; mutex_exit(&krdc->devices->id_rlock); rc = _rdc_rsrv_devs(krdc, (raw ? 
RDC_RAW : RDC_CACHE), RDC_EXTERNAL); return (rc); } /* * _rdc_detach * Detach * * Calling/Exit State: * Returns 0 for success, always succeeds * * Description: */ static int _rdc_detach(rdc_fd_t *rfd, nsc_iodev_t *iodev) { rdc_k_info_t *krdc = rfd->rdc_info; int raw = RDC_IS_RAW(rfd); /* * Flush the async queue if necessary. */ if (IS_ASYNC(&rdc_u_info[krdc->index]) && !RDC_IS_DISKQ(krdc->group)) { int tries = 1; while (krdc->group->ra_queue.blocks != 0 && tries--) { if (!krdc->group->rdc_writer) (void) rdc_writer(krdc->index); (void) rdc_drain_queue(krdc->index); } /* force disgard of possibly blocked flusher threads */ if (rdc_drain_queue(krdc->index) != 0) { #ifdef DEBUG net_queue *qp = &krdc->group->ra_queue; #endif do { mutex_enter(&krdc->group->ra_queue.net_qlock); krdc->group->asyncdis = 1; cv_broadcast(&krdc->group->asyncqcv); mutex_exit(&krdc->group->ra_queue.net_qlock); cmn_err(CE_WARN, "!RDC: async I/O pending and not drained " "for %s during detach", rdc_u_info[krdc->index].primary.file); #ifdef DEBUG cmn_err(CE_WARN, "!nitems: %" NSC_SZFMT " nblocks: %" NSC_SZFMT " head: 0x%p tail: 0x%p", qp->nitems, qp->blocks, (void *)qp->net_qhead, (void *)qp->net_qtail); #endif } while (krdc->group->rdc_thrnum > 0); } } mutex_enter(&krdc->devices->id_rlock); if (krdc->iodev != iodev) cmn_err(CE_WARN, "!_rdc_detach: iodev mismatch %p : %p", (void *) krdc->iodev, (void *) iodev); krdc->iodev = NULL; mutex_exit(&krdc->devices->id_rlock); _rdc_rlse_devs(krdc, (raw ? RDC_RAW : RDC_CACHE)); return (0); } /* * _rdc_get_pinned * * only affects local node. */ static int _rdc_get_pinned(rdc_fd_t *rfd) { return (nsc_get_pinned(RDC_FD(rfd))); } /* * _rdc_discard_pinned * * only affects local node. */ static int _rdc_discard_pinned(rdc_fd_t *rfd, nsc_off_t pos, nsc_size_t len) { return (nsc_discard_pinned(RDC_FD(rfd), pos, len)); } /* * _rdc_partsize * * only affects the local node. */ static int _rdc_partsize(rdc_fd_t *rfd, nsc_size_t *ptr) { rdc_u_info_t *urdc; urdc = &rdc_u_info[rfd->rdc_info->index]; /* Always return saved size */ ASSERT(urdc->volume_size != 0); *ptr = urdc->volume_size; return (0); } /* * _rdc_maxfbas * * only affects local node */ /* ARGSUSED */ static int _rdc_maxfbas(rdc_fd_t *rfd, int flag, nsc_size_t *ptr) { rdc_k_info_t *krdc = rfd->rdc_info; int raw = RDC_IS_RAW(rfd); int rtype = raw ? RDC_RAW : RDC_CACHE; int rc = 0; if (krdc == NULL) return (EINVAL); if (flag == NSC_RDAHEAD || flag == NSC_CACHEBLK) { rc = _rdc_rsrv_devs(krdc, rtype, RDC_INTERNAL); if (rc == 0) { rc = nsc_maxfbas(RDC_U_FD(krdc), flag, ptr); _rdc_rlse_devs(krdc, rtype); } } else { /* Always return saved size */ ASSERT(krdc->maxfbas != 0); *ptr = krdc->maxfbas - 1; } return (rc); } /* ARGSUSED */ static int _rdc_control(rdc_fd_t *rfd, int cmd, void *ptr, int len) { return (nsc_control(RDC_FD(rfd), cmd, ptr, len)); } /* * _rdc_attach_fd * * called by nsctl as part of nsc_reserve() processing when one of * SNDR's underlying file descriptors becomes available and metadata * should be re-acquired. 
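 *
 * The blind_t argument is the _rdc_info_dev_t that was handed to
 * nsc_open() in rdc_idev_open(); the callback is wired up through the
 * "Attach" entry of _rdc_fd_def[] at the end of this file.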
*/ static int _rdc_attach_fd(blind_t arg) { _rdc_info_dev_t *dip = (_rdc_info_dev_t *)arg; rdc_k_info_t *krdc; rdc_u_info_t *urdc; nsc_size_t maxfbas, partsize; int rc; krdc = dip->bi_krdc; urdc = &rdc_u_info[krdc->index]; if ((rc = nsc_partsize(dip->bi_fd, &partsize)) != 0) { cmn_err(CE_WARN, "!SNDR: cannot get volume size of %s, error %d", nsc_pathname(dip->bi_fd), rc); } else if (urdc->volume_size == 0 && partsize > 0) { /* set volume size for the first time */ urdc->volume_size = partsize; } else if (urdc->volume_size != partsize) { /* * SNDR cannot yet cope with a volume being resized, * so fail it. */ if (!(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) { rdc_many_enter(krdc); if (rdc_get_vflags(urdc) & RDC_PRIMARY) rdc_set_mflags(urdc, RDC_RSYNC_NEEDED); else rdc_set_mflags(urdc, RDC_SYNC_NEEDED); rdc_set_flags_log(urdc, RDC_VOL_FAILED, "volume resized"); rdc_many_exit(krdc); rdc_write_state(urdc); } cmn_err(CE_WARN, "!SNDR: %s changed size from %" NSC_SZFMT " to %" NSC_SZFMT, nsc_pathname(dip->bi_fd), urdc->volume_size, partsize); } if ((rc = nsc_maxfbas(dip->bi_fd, 0, &maxfbas)) != 0) { cmn_err(CE_WARN, "!SNDR: cannot get max transfer size for %s, error %d", nsc_pathname(dip->bi_fd), rc); } else if (maxfbas > 0) { krdc->maxfbas = min(RDC_MAX_MAXFBAS, maxfbas); } return (0); } /* * _rdc_pinned * * only affects local node */ static void _rdc_pinned(_rdc_info_dev_t *dip, nsc_off_t pos, nsc_size_t len) { nsc_pinned_data(dip->bi_krdc->iodev, pos, len); } /* * _rdc_unpinned * * only affects local node. */ static void _rdc_unpinned(_rdc_info_dev_t *dip, nsc_off_t pos, nsc_size_t len) { nsc_unpinned_data(dip->bi_krdc->iodev, pos, len); } /* * _rdc_read * * read the specified data into the buffer - go remote if local down, * or the remote end has more recent data because an reverse sync is * in progress. */ static int _rdc_read(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag) { rdc_k_info_t *krdc = h->rdc_fd->rdc_info; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; int remote = (RDC_REMOTE(h) || (rdc_get_mflags(urdc) & RDC_SLAVE)); int rc1, rc2; rc1 = rc2 = 0; if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) { cmn_err(CE_WARN, "!_rdc_read: bounds check: io(handle) pos %" NSC_XSZFMT "(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")", pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len); h->rdc_bufh.sb_error = EINVAL; return (h->rdc_bufh.sb_error); } if (flag & NSC_NOBLOCK) { cmn_err(CE_WARN, "!_rdc_read: removing unsupported NSC_NOBLOCK flag"); flag &= ~(NSC_NOBLOCK); } if (!remote) { rc1 = nsc_read(h->rdc_bufp, pos, len, flag); } if (remote || !RDC_SUCCESS(rc1)) { rc2 = _rdc_remote_read(krdc, &h->rdc_bufh, pos, len, flag); } if (remote && !RDC_SUCCESS(rc2)) h->rdc_bufh.sb_error = rc2; else if (!RDC_SUCCESS(rc1) && !RDC_SUCCESS(rc2)) h->rdc_bufh.sb_error = rc1; return (h->rdc_bufh.sb_error); } static int _rdc_remote_write(rdc_k_info_t *krdc, rdc_buf_t *h, nsc_buf_t *nsc_h, nsc_off_t pos, nsc_size_t len, int flag, uint_t bitmask) { rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; int rc = 0; nsc_size_t plen, syncblockpos; aio_buf_t *anon = NULL; if (!(rdc_get_vflags(urdc) & RDC_PRIMARY)) return (EINVAL); if ((rdc_get_vflags(urdc) & RDC_LOGGING) && (!IS_STATE(urdc, RDC_QUEUING))) { goto done; } /* * this check for RDC_SYNCING may seem redundant, but there is a window * in rdc_sync, where an async set has not yet been transformed into a * sync set. 
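 * Treating a still-async set that is already marked RDC_SYNCING as
 * synchronous here means the write goes straight over the network
 * instead of being queued behind the sync.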
*/ if ((!IS_ASYNC(urdc) || IS_STATE(urdc, RDC_SYNCING)) || RDC_REMOTE(h) || krdc->group->synccount > 0 || (rdc_get_vflags(urdc) & RDC_SLAVE) || (rdc_get_vflags(urdc) & RDC_VOL_FAILED) || (rdc_get_vflags(urdc) & RDC_BMP_FAILED)) { /* sync mode, or remote io mode, or local device is dead */ rc = rdc_net_write(krdc->index, krdc->remote_index, nsc_h, pos, len, RDC_NOSEQ, RDC_NOQUE, NULL); if ((rc == 0) && !(rdc_get_vflags(urdc) & RDC_BMP_FAILED) && !(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) { if (IS_STATE(urdc, RDC_SYNCING) && !IS_STATE(urdc, RDC_FULL) || !IS_STATE(urdc, RDC_SLAVE)) { mutex_enter(&krdc->syncbitmutex); syncblockpos = LOG_TO_FBA_NUM(krdc->syncbitpos); DTRACE_PROBE4(rdc_remote_write, nsc_off_t, krdc->syncbitpos, nsc_off_t, syncblockpos, nsc_off_t, pos, nsc_size_t, len); /* * If the current I/O's position plus length is * greater then the sync block position, only * clear those blocks upto sync block position */ if (pos < syncblockpos) { if ((pos + len) > syncblockpos) plen = syncblockpos - pos; else plen = len; RDC_CLR_BITMAP(krdc, pos, plen, bitmask, RDC_BIT_BUMP); } mutex_exit(&krdc->syncbitmutex); } else { RDC_CLR_BITMAP(krdc, pos, len, bitmask, RDC_BIT_BUMP); } } else if (rc != 0) { rdc_group_enter(krdc); rdc_set_flags_log(urdc, RDC_LOGGING, "net write failed"); rdc_write_state(urdc); if (rdc_get_vflags(urdc) & RDC_SYNCING) krdc->disk_status = 1; rdc_group_exit(krdc); } } else if (!IS_STATE(urdc, RDC_SYNCING)) { DTRACE_PROBE1(async_enque_start, rdc_buf_t *, h); ASSERT(krdc->group->synccount == 0); /* async mode */ if ((h == NULL) || ((h->rdc_flags & RDC_ASYNC_VEC) == 0)) { rc = _rdc_enqueue_write(krdc, pos, len, flag, NULL); } else { anon = rdc_aio_buf_get(h, krdc->index); if (anon == NULL) { #ifdef DEBUG cmn_err(CE_WARN, "!enqueue write failed for handle %p", (void *) h); #endif return (EINVAL); } rc = _rdc_enqueue_write(krdc, pos, len, flag, anon->rdc_abufp); /* * get rid of the aio_buf_t now, as this * may not be the set that this rdc_buf * was allocated on, we are done with it anyways * enqueuing code frees the nsc_abuf */ rdc_aio_buf_del(h, krdc); } } else { ASSERT(IS_STATE(urdc, RDC_SYNCING)); ASSERT(0); } done: if ((anon == NULL) && h && (h->rdc_flags & RDC_ASYNC_VEC)) { /* * Toss the anonymous buffer if we have one allocated. */ anon = rdc_aio_buf_get(h, krdc->index); if (anon) { (void) nsc_free_buf(anon->rdc_abufp); rdc_aio_buf_del(h, krdc); } } return (rc); } /* * _rdc_multi_write * * Send to multihop remote. Obeys 1 to many if present and we are crazy * enough to support it. 
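 *
 * Bitmap bits are set for every enabled set in the 1-to-many chain before
 * any data is sent, and the first error encountered is carried in retval
 * while the remaining sets are still attempted.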
* */ int _rdc_multi_write(nsc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag, rdc_k_info_t *krdc) { rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; rdc_k_info_t *this = krdc; /* krdc that was requested */ int rc, retval; uint_t bitmask; retval = rc = 0; if (!RDC_HANDLE_LIMITS(h, pos, len)) { cmn_err(CE_WARN, "!_rdc_multi_write: bounds check: io(handle) pos %" NSC_XSZFMT "(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")", pos, h->sb_pos, len, h->sb_len); return (EINVAL); } /* if this is a 1 to many, set all the bits for all the sets */ do { if (RDC_SET_BITMAP(krdc, pos, len, &bitmask) < 0) { (void) nsc_uncommit(h, pos, len, flag); /* set the error, but try other sets */ retval = EIO; } if (IS_MANY(krdc) && IS_STATE(urdc, RDC_PRIMARY)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; break; } rdc_many_exit(krdc); } } while (krdc != this); urdc = &rdc_u_info[krdc->index]; if (flag & NSC_NOBLOCK) { cmn_err(CE_WARN, "!_rdc_multi_write: removing unsupported NSC_NOBLOCK flag"); flag &= ~(NSC_NOBLOCK); } multiwrite1: if ((rdc_get_vflags(urdc) & RDC_PRIMARY) && (!IS_STATE(urdc, RDC_LOGGING) || (IS_STATE(urdc, RDC_LOGGING) && IS_STATE(urdc, RDC_QUEUING)))) { rc = _rdc_remote_write(krdc, NULL, h, pos, len, flag, bitmask); } if (!RDC_SUCCESS(rc) && retval == 0) { retval = rc; } multiwrite2: if (IS_MANY(krdc) && (rdc_get_vflags(urdc) && RDC_PRIMARY)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; rc = 0; rdc_many_exit(krdc); goto multiwrite1; } rdc_many_exit(krdc); } return (retval); } void _rdc_diskq_enqueue_thr(rdc_aio_t *p) { rdc_thrsync_t *sync = (rdc_thrsync_t *)p->next; rdc_k_info_t *krdc = &rdc_k_info[p->index]; int rc2; rc2 = rdc_diskq_enqueue(krdc, p); /* * overload flag with error return if any */ if (!RDC_SUCCESS(rc2)) { p->flag = rc2; } else { p->flag = 0; } mutex_enter(&sync->lock); sync->complete++; cv_broadcast(&sync->cv); mutex_exit(&sync->lock); } /* * _rdc_sync_write_thr * syncronous write thread which writes to network while * local write is occuring */ void _rdc_sync_write_thr(rdc_aio_t *p) { rdc_thrsync_t *sync = (rdc_thrsync_t *)p->next; rdc_buf_t *h = (rdc_buf_t *)p->handle; rdc_k_info_t *krdc = &rdc_k_info[p->index]; #ifdef DEBUG rdc_u_info_t *urdc; #endif int rc2; int bitmask; rdc_group_enter(krdc); krdc->aux_state |= RDC_AUXWRITE; #ifdef DEBUG urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) { cmn_err(CE_WARN, "!rdc_sync_write_thr: set not enabled %s:%s", urdc->secondary.file, urdc->secondary.bitmap); } #endif rdc_group_exit(krdc); bitmask = p->iostatus; /* overload */ rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh, p->pos, p->len, p->flag, bitmask); /* * overload flag with error return if any */ if (!RDC_SUCCESS(rc2)) { p->flag = rc2; } else { p->flag = 0; } rdc_group_enter(krdc); krdc->aux_state &= ~RDC_AUXWRITE; rdc_group_exit(krdc); mutex_enter(&sync->lock); sync->complete++; cv_broadcast(&sync->cv); mutex_exit(&sync->lock); } /* * _rdc_write * * Commit changes to the buffer locally and send remote. * * If this write is whilst the local primary volume is being synced, * then we write the remote end first to ensure that the new data * cannot be overwritten by a concurrent sync operation. 
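 *
 * In outline (a descriptive sketch of the code below, not a contract):
 *	1. set bitmap bits for every enabled set in the 1-to-many chain;
 *	2. for async sets with a disk queue, copy the data into an
 *	   anonymous buffer and hand it to a _rdc_diskq_enqueue_thr thread;
 *	3. for sync sets, start the network write in an _rdc_sync_write_thr
 *	   thread and overlap it with the local nsc_write();
 *	4. wait for all spawned threads and fold their status into
 *	   sb_error;
 *	5. for a reverse sync (rsync), do the local write only after the
 *	   remote write has completed.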
*/ static int _rdc_write(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag) { rdc_k_info_t *krdc = h->rdc_fd->rdc_info; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; rdc_k_info_t *this; rdc_k_info_t *multi = NULL; int remote = RDC_REMOTE(h); int rc1, rc2; uint_t bitmask; int first; int rsync; int nthr; int winddown; int thrrc = 0; rdc_aio_t *bp[SNDR_MAXTHREADS]; aio_buf_t *anon; nsthread_t *tp; rdc_thrsync_t *sync = &h->rdc_sync; /* If this is the multi-hop secondary, move along to the primary */ if (IS_MULTI(krdc) && !IS_PRIMARY(urdc)) { multi = krdc; krdc = krdc->multi_next; urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) { krdc = h->rdc_fd->rdc_info; urdc = &rdc_u_info[krdc->index]; multi = NULL; } } this = krdc; rsync = (IS_PRIMARY(urdc)) && (IS_SLAVE(urdc)); /* * If this is a many group with a reverse sync in progress and * this is not the slave krdc/urdc, then search for the slave * so that we can do the remote io to the correct secondary * before the local io. */ if (rsync && !(IS_SLAVE(urdc))) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; if (rdc_get_vflags(urdc) & RDC_SLAVE) break; } rdc_many_exit(krdc); this = krdc; } urdc = &rdc_u_info[krdc->index]; rc1 = rc2 = 0; first = 1; nthr = 0; if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) { cmn_err(CE_WARN, "!_rdc_write: bounds check: io(handle) pos %" NSC_XSZFMT "(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")", pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len); h->rdc_bufh.sb_error = EINVAL; return (h->rdc_bufh.sb_error); } DTRACE_PROBE(rdc_write_bitmap_start); /* if this is a 1 to many, set all the bits for all the sets */ do { if (RDC_SET_BITMAP(krdc, pos, len, &bitmask) < 0) { if (rdc_eio_nobmp) { (void) nsc_uncommit (h->rdc_bufp, pos, len, flag); /* set the error, but try the other sets */ h->rdc_bufh.sb_error = EIO; } } if (IS_MANY(krdc) && IS_STATE(urdc, RDC_PRIMARY)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; break; } rdc_many_exit(krdc); } } while (krdc != this); urdc = &rdc_u_info[krdc->index]; DTRACE_PROBE(rdc_write_bitmap_end); write1: /* just in case we switch mode during write */ if (IS_ASYNC(urdc) && (!IS_STATE(urdc, RDC_SYNCING)) && (!IS_STATE(urdc, RDC_LOGGING) || IS_STATE(urdc, RDC_QUEUING))) { h->rdc_flags |= RDC_ASYNC_BUF; } if (BUF_IS_ASYNC(h)) { /* * We are async mode */ aio_buf_t *p; DTRACE_PROBE(rdc_write_async_start); if ((krdc->type_flag & RDC_DISABLEPEND) || ((IS_STATE(urdc, RDC_LOGGING) && !IS_STATE(urdc, RDC_QUEUING)))) { goto localwrite; } if (IS_STATE(urdc, RDC_VOL_FAILED)) { /* * overload remote as we don't want to do local * IO later. 
forge ahead with async */ remote++; } if ((IS_STATE(urdc, RDC_SYNCING)) || (IS_STATE(urdc, RDC_LOGGING) && !IS_STATE(urdc, RDC_QUEUING))) { goto localwrite; } p = rdc_aio_buf_add(krdc->index, h); if (p == NULL) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_alloc_buf aio_buf allocation failed"); #endif goto localwrite; } mutex_enter(&h->aio_lock); DTRACE_PROBE(rdc_write_async__allocabuf_start); rc1 = nsc_alloc_abuf(pos, len, 0, &p->rdc_abufp); DTRACE_PROBE(rdc_write_async__allocabuf_end); if (!RDC_SUCCESS(rc1)) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_alloc_buf NSC_ANON allocation failed rc %d", rc1); #endif mutex_exit(&h->aio_lock); goto localwrite; } h->rdc_flags |= RDC_ASYNC_VEC; mutex_exit(&h->aio_lock); /* * Copy buffer into anonymous buffer */ DTRACE_PROBE(rdc_write_async_nsccopy_start); rc1 = nsc_copy(&h->rdc_bufh, p->rdc_abufp, pos, pos, len); DTRACE_PROBE(rdc_write_async_nsccopy_end); if (!RDC_SUCCESS(rc1)) { #ifdef DEBUG cmn_err(CE_WARN, "!_rdc_write: nsc_copy failed rc=%d state %x", rc1, rdc_get_vflags(urdc)); #endif rc1 = nsc_free_buf(p->rdc_abufp); rdc_aio_buf_del(h, krdc); rdc_group_enter(krdc); rdc_group_log(krdc, RDC_FLUSH|RDC_OTHERREMOTE, "nsc_copy failure"); rdc_group_exit(krdc); } DTRACE_PROBE(rdc_write_async_end); /* * using a diskq, launch a thread to queue it * and free the aio->h and aio * if the thread fails, do it the old way (see localwrite) */ if (RDC_IS_DISKQ(krdc->group)) { if (nthr >= SNDR_MAXTHREADS) { #ifdef DEBUG cmn_err(CE_NOTE, "!nthr overrun in _rdc_write"); #endif thrrc = ENOEXEC; goto localwrite; } anon = rdc_aio_buf_get(h, krdc->index); if (anon == NULL) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_aio_buf_get failed for " "%p", (void *)h); #endif thrrc = ENOEXEC; goto localwrite; } /* get a populated rdc_aio_t */ bp[nthr] = rdc_aio_tbuf_get(sync, anon->rdc_abufp, pos, len, flag, krdc->index, bitmask); if (bp[nthr] == NULL) { #ifdef DEBUG cmn_err(CE_NOTE, "!_rdcwrite: " "kmem_alloc failed bp aio (1)"); #endif thrrc = ENOEXEC; goto localwrite; } /* start the queue io */ tp = nst_create(_rdc_ioset, _rdc_diskq_enqueue_thr, (void *)bp[nthr], NST_SLEEP); if (tp == NULL) { #ifdef DEBUG cmn_err(CE_NOTE, "!_rdcwrite: nst_create failure"); #endif thrrc = ENOEXEC; } else { mutex_enter(&(sync->lock)); sync->threads++; mutex_exit(&(sync->lock)); nthr++; } /* * the handle that is to be enqueued is now in * the rdc_aio_t, and will be freed there. * dump the aio_t now. If this is 1 to many * we may not do this in _rdc_free_buf() * if this was not the index that the rdc_buf_t * was allocated on. */ rdc_aio_buf_del(h, krdc); } } /* end of async */ /* * We try to overlap local and network IO for the sync case * (we already do it for async) * If one to many, we need to track the resulting nst_thread * so we don't trash the nsc_buf on a free * Start network IO first then do local (sync only) */ if (IS_PRIMARY(urdc) && !IS_STATE(urdc, RDC_LOGGING) && !BUF_IS_ASYNC(h)) { /* * if forward syncing, we must do local IO first * then remote io. 
Don't spawn thread */ if (!rsync && (IS_STATE(urdc, RDC_SYNCING))) { thrrc = ENOEXEC; goto localwrite; } if (IS_MULTI(krdc)) { rdc_k_info_t *ktmp; rdc_u_info_t *utmp; ktmp = krdc->multi_next; utmp = &rdc_u_info[ktmp->index]; if (IS_ENABLED(utmp)) multi = ktmp; } if (nthr >= SNDR_MAXTHREADS) { #ifdef DEBUG cmn_err(CE_NOTE, "!nthr overrun in _rdc_write"); #endif thrrc = ENOEXEC; goto localwrite; } bp[nthr] = rdc_aio_tbuf_get(sync, h, pos, len, flag, krdc->index, bitmask); if (bp[nthr] == NULL) { thrrc = ENOEXEC; goto localwrite; } tp = nst_create(_rdc_ioset, _rdc_sync_write_thr, (void *)bp[nthr], NST_SLEEP); if (tp == NULL) { #ifdef DEBUG cmn_err(CE_NOTE, "!_rdcwrite: nst_create failure"); #endif thrrc = ENOEXEC; } else { mutex_enter(&(sync->lock)); sync->threads++; mutex_exit(&(sync->lock)); nthr++; } } localwrite: if (!remote && !rsync && first) { DTRACE_PROBE(rdc_write_nscwrite_start); rc1 = nsc_write(h->rdc_bufp, pos, len, flag); DTRACE_PROBE(rdc_write_nscwrite_end); if (!RDC_SUCCESS(rc1)) { rdc_many_enter(krdc); if (IS_PRIMARY(urdc)) /* Primary, so reverse sync needed */ rdc_set_mflags(urdc, RDC_RSYNC_NEEDED); else /* Secondary, so sync needed */ rdc_set_flags(urdc, RDC_SYNC_NEEDED); rdc_set_flags_log(urdc, RDC_VOL_FAILED, "local write failed"); rdc_many_exit(krdc); rdc_write_state(urdc); } } /* * This is where we either enqueue async IO for the flusher * or do sync IO in the case of an error in thread creation * or we are doing a forward sync * NOTE: if we are async, and using a diskq, we have * already enqueued this write. * _rdc_remote_write will end up enqueuueing to memory, * or in case of a thread creation error above, try again * enqueue the diskq write if thrrc == ENOEXEC */ if ((IS_PRIMARY(urdc)) && (thrrc == ENOEXEC) || (BUF_IS_ASYNC(h) && !RDC_IS_DISKQ(krdc->group))) { thrrc = 0; if (IS_MULTI(krdc)) { rdc_k_info_t *ktmp; rdc_u_info_t *utmp; ktmp = krdc->multi_next; utmp = &rdc_u_info[ktmp->index]; if (IS_ENABLED(utmp)) multi = ktmp; } DTRACE_PROBE(rdc_write_remote_start); rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh, pos, len, flag, bitmask); DTRACE_PROBE(rdc_rdcwrite_remote_end); } if (!RDC_SUCCESS(rc1)) { if ((IS_PRIMARY(urdc)) && !RDC_SUCCESS(rc2)) { h->rdc_bufh.sb_error = rc1; } } else if ((remote || rsync) && !RDC_SUCCESS(rc2)) { h->rdc_bufh.sb_error = rc2; } write2: /* * If one to many, jump back into the loop to continue IO */ if (IS_MANY(krdc) && (IS_PRIMARY(urdc))) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; rc2 = first = 0; h->rdc_flags &= ~RDC_ASYNC_BUF; rdc_many_exit(krdc); goto write1; } rdc_many_exit(krdc); } urdc = &rdc_u_info[krdc->index]; /* * collect all of our threads if any */ if (nthr) { mutex_enter(&(sync->lock)); /* wait for the threads */ while (sync->complete != sync->threads) { cv_wait(&(sync->cv), &(sync->lock)); } mutex_exit(&(sync->lock)); /* collect status */ winddown = 0; while (winddown < nthr) { /* * Get any error return from thread */ if ((remote || rsync) && bp[winddown]->flag) { h->rdc_bufh.sb_error = bp[winddown]->flag; } if (bp[winddown]) kmem_free(bp[winddown], sizeof (rdc_aio_t)); winddown++; } } if (rsync && !(IS_STATE(urdc, RDC_VOL_FAILED))) { rc1 = nsc_write(h->rdc_bufp, pos, len, flag); if (!RDC_SUCCESS(rc1)) { /* rsync, so reverse sync needed already set */ rdc_many_enter(krdc); rdc_set_flags_log(urdc, RDC_VOL_FAILED, "rsync local write failed"); rdc_many_exit(krdc); rdc_write_state(urdc); /* * only report the error 
if a remote error * occurred as well. */ if (h->rdc_bufh.sb_error) h->rdc_bufh.sb_error = rc1; } } if (multi) { /* Multi-hop secondary, just set bits in the bitmap */ (void) RDC_SET_BITMAP(multi, pos, len, &bitmask); } return (h->rdc_bufh.sb_error); } static void _rdc_bzero(nsc_buf_t *h, nsc_off_t pos, nsc_size_t len) { nsc_vec_t *v; uchar_t *a; size_t sz; int l; if (!RDC_HANDLE_LIMITS(h, pos, len)) { cmn_err(CE_WARN, "!_rdc_bzero: bounds check: io(handle) pos %" NSC_XSZFMT "(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")", pos, h->sb_pos, len, h->sb_len); return; } if (!len) return; /* find starting point */ v = h->sb_vec; pos -= h->sb_pos; for (; pos >= FBA_NUM(v->sv_len); v++) pos -= FBA_NUM(v->sv_len); a = v->sv_addr + FBA_SIZE(pos); l = v->sv_len - FBA_SIZE(pos); /* zero */ len = FBA_SIZE(len); /* convert to bytes */ while (len) { if (!a) /* end of vec */ break; sz = (size_t)min((nsc_size_t)l, len); bzero(a, sz); len -= sz; l -= sz; a += sz; if (!l) { v++; a = v->sv_addr; l = v->sv_len; } } } /* * _rdc_zero * * Zero and commit the specified area of the buffer. * * If this write is whilst the local primary volume is being synced, * then we write the remote end first to ensure that the new data * cannot be overwritten by a concurrent sync operation. */ static int _rdc_zero(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag) { rdc_k_info_t *krdc = h->rdc_fd->rdc_info; rdc_u_info_t *urdc = &rdc_u_info[krdc->index]; rdc_k_info_t *this; rdc_k_info_t *multi = NULL; int remote = RDC_REMOTE(h); int rc1, rc2; uint_t bitmask; int first; int rsync; /* If this is the multi-hop secondary, move along to the primary */ if (IS_MULTI(krdc) && !(rdc_get_vflags(urdc) & RDC_PRIMARY)) { multi = krdc; krdc = krdc->multi_next; urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) { krdc = h->rdc_fd->rdc_info; urdc = &rdc_u_info[krdc->index]; multi = NULL; } } this = krdc; rsync = ((rdc_get_vflags(urdc) & RDC_PRIMARY) && (rdc_get_mflags(urdc) & RDC_SLAVE)); /* * If this is a many group with a reverse sync in progress and * this is not the slave krdc/urdc, then search for the slave * so that we can do the remote io to the correct secondary * before the local io. 
*/ if (rsync && !(rdc_get_vflags(urdc) & RDC_SLAVE)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; if (rdc_get_vflags(urdc) & RDC_SLAVE) break; } rdc_many_exit(krdc); this = krdc; } rc1 = rc2 = 0; first = 1; if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) { cmn_err(CE_WARN, "!_rdc_zero: bounds check: io(handle) pos %" NSC_XSZFMT "(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")", pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len); h->rdc_bufh.sb_error = EINVAL; return (h->rdc_bufh.sb_error); } zero1: if (RDC_SET_BITMAP(krdc, pos, len, &bitmask) < 0) { (void) nsc_uncommit(h->rdc_bufp, pos, len, flag); h->rdc_bufh.sb_error = EIO; goto zero2; } if (IS_ASYNC(urdc)) { /* * We are async mode */ aio_buf_t *p; if ((krdc->type_flag & RDC_DISABLEPEND) || (rdc_get_vflags(urdc) & RDC_LOGGING)) { mutex_exit(&krdc->group->ra_queue.net_qlock); goto localzero; } if ((rdc_get_vflags(urdc) & RDC_VOL_FAILED) || (rdc_get_vflags(urdc) & RDC_BMP_FAILED)) { mutex_exit(&krdc->group->ra_queue.net_qlock); goto zero2; } if (rdc_get_vflags(urdc) & RDC_LOGGING) { mutex_exit(&krdc->group->ra_queue.net_qlock); goto localzero; } p = rdc_aio_buf_add(krdc->index, h); if (p == NULL) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_alloc_buf aio_buf allocation failed"); #endif goto localzero; } mutex_enter(&h->aio_lock); rc1 = nsc_alloc_abuf(pos, len, 0, &p->rdc_abufp); if (!RDC_SUCCESS(rc1)) { #ifdef DEBUG cmn_err(CE_WARN, "!rdc_alloc_buf NSC_ANON allocation failed rc %d", rc1); #endif mutex_exit(&h->aio_lock); goto localzero; } h->rdc_flags |= RDC_ASYNC_VEC; mutex_exit(&h->aio_lock); /* * Copy buffer into anonymous buffer */ rc1 = nsc_zero(p->rdc_abufp, pos, len, flag); if (!RDC_SUCCESS(rc1)) { #ifdef DEBUG cmn_err(CE_WARN, "!_rdc_zero: nsc_zero failed rc=%d state %x", rc1, rdc_get_vflags(urdc)); #endif rc1 = nsc_free_buf(p->rdc_abufp); rdc_aio_buf_del(h, krdc); rdc_group_enter(krdc); rdc_group_log(krdc, RDC_FLUSH | RDC_OTHERREMOTE, "nsc_zero failed"); rdc_group_exit(krdc); } } /* end of async */ localzero: if (flag & NSC_NOBLOCK) { cmn_err(CE_WARN, "!_rdc_zero: removing unsupported NSC_NOBLOCK flag"); flag &= ~(NSC_NOBLOCK); } if (!remote && !rsync && first) { rc1 = nsc_zero(h->rdc_bufp, pos, len, flag); if (!RDC_SUCCESS(rc1)) { ASSERT(rdc_get_vflags(urdc) & RDC_PRIMARY); rdc_many_enter(krdc); /* Primary, so reverse sync needed */ rdc_set_mflags(urdc, RDC_RSYNC_NEEDED); rdc_set_flags_log(urdc, RDC_VOL_FAILED, "nsc_zero failed"); rdc_many_exit(krdc); rdc_write_state(urdc); } } /* * send new data to remote end - nsc_zero has zero'd * the data in the buffer, or _rdc_bzero will be used below. 
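 * _rdc_bzero is needed when the local nsc_zero was skipped (remote io
 * mode or reverse sync) or failed, since the handle's memory would
 * otherwise still hold stale data when it is shipped to the remote node.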
*/ if (rdc_get_vflags(urdc) & RDC_PRIMARY) { if (first && (remote || rsync || !RDC_SUCCESS(rc1))) { /* bzero so that we can send new data to remote node */ _rdc_bzero(&h->rdc_bufh, pos, len); } if (IS_MULTI(krdc)) { rdc_k_info_t *ktmp; rdc_u_info_t *utmp; ktmp = krdc->multi_next; utmp = &rdc_u_info[ktmp->index]; if (IS_ENABLED(utmp)) multi = ktmp; } rc2 = _rdc_remote_write(krdc, h, &h->rdc_bufh, pos, len, flag, bitmask); } if (!RDC_SUCCESS(rc1)) { if ((rdc_get_vflags(urdc) & RDC_PRIMARY) && !RDC_SUCCESS(rc2)) { h->rdc_bufh.sb_error = rc1; } } else if ((remote || rsync) && !RDC_SUCCESS(rc2)) { h->rdc_bufh.sb_error = rc2; } zero2: if (IS_MANY(krdc) && (rdc_get_vflags(urdc) && RDC_PRIMARY)) { rdc_many_enter(krdc); for (krdc = krdc->many_next; krdc != this; krdc = krdc->many_next) { urdc = &rdc_u_info[krdc->index]; if (!IS_ENABLED(urdc)) continue; rc2 = first = 0; rdc_many_exit(krdc); goto zero1; } rdc_many_exit(krdc); } if (rsync && !(rdc_get_vflags(urdc) & RDC_VOL_FAILED)) { rc1 = nsc_write(h->rdc_bufp, pos, len, flag); if (!RDC_SUCCESS(rc1)) { /* rsync, so reverse sync needed already set */ rdc_many_enter(krdc); rdc_set_flags_log(urdc, RDC_VOL_FAILED, "nsc_write failed"); rdc_many_exit(krdc); rdc_write_state(urdc); /* * only report the error if a remote error * occurred as well. */ if (h->rdc_bufh.sb_error) h->rdc_bufh.sb_error = rc1; } } if (multi) { /* Multi-hop secondary, just set bits in the bitmap */ (void) RDC_SET_BITMAP(multi, pos, len, &bitmask); } return (h->rdc_bufh.sb_error); } /* * _rdc_uncommit * - refresh specified data region in the buffer to prevent the cache * serving the scribbled on data back to another client. * * Only needs to happen on the local node. If in remote io mode, then * just return 0 - we do not cache the data on the local node and the * changed data will not have made it to the cache on the other node, * so it has no need to uncommit. */ static int _rdc_uncommit(rdc_buf_t *h, nsc_off_t pos, nsc_size_t len, int flag) { int remote = RDC_REMOTE(h); int rc = 0; if (!RDC_HANDLE_LIMITS(&h->rdc_bufh, pos, len)) { cmn_err(CE_WARN, "!_rdc_uncommit: bounds check: io(handle) pos %" NSC_XSZFMT "(%" NSC_XSZFMT ") len %" NSC_XSZFMT "(%" NSC_XSZFMT ")", pos, h->rdc_bufh.sb_pos, len, h->rdc_bufh.sb_len); h->rdc_bufh.sb_error = EINVAL; return (h->rdc_bufh.sb_error); } if (flag & NSC_NOBLOCK) { cmn_err(CE_WARN, "!_rdc_uncommit: removing unsupported NSC_NOBLOCK flag"); flag &= ~(NSC_NOBLOCK); } if (!remote) { rc = nsc_uncommit(h->rdc_bufp, pos, len, flag); } if (!RDC_SUCCESS(rc)) h->rdc_bufh.sb_error = rc; return (rc); } /* * _rdc_trksize * * only needs to happen on local node. 
 */
static int
_rdc_trksize(rdc_fd_t *rfd, nsc_size_t trksize)
{
	return (nsc_set_trksize(RDC_FD(rfd), trksize));
}

static nsc_def_t _rdc_fd_def[] = {
	"Attach",	(uintptr_t)_rdc_attach_fd,	0,
	"Pinned",	(uintptr_t)_rdc_pinned,		0,
	"Unpinned",	(uintptr_t)_rdc_unpinned,	0,
	0,		0,				0
};

static nsc_def_t _rdc_io_def[] = {
	"Open",		(uintptr_t)_rdc_openc,		0,
	"Close",	(uintptr_t)_rdc_close,		0,
	"Attach",	(uintptr_t)_rdc_attach,		0,
	"Detach",	(uintptr_t)_rdc_detach,		0,
	"AllocHandle",	(uintptr_t)_rdc_alloc_handle,	0,
	"FreeHandle",	(uintptr_t)_rdc_free_handle,	0,
	"AllocBuf",	(uintptr_t)_rdc_alloc_buf,	0,
	"FreeBuf",	(uintptr_t)_rdc_free_buf,	0,
	"GetPinned",	(uintptr_t)_rdc_get_pinned,	0,
	"Discard",	(uintptr_t)_rdc_discard_pinned,	0,
	"PartSize",	(uintptr_t)_rdc_partsize,	0,
	"MaxFbas",	(uintptr_t)_rdc_maxfbas,	0,
	"Control",	(uintptr_t)_rdc_control,	0,
	"Read",		(uintptr_t)_rdc_read,		0,
	"Write",	(uintptr_t)_rdc_write,		0,
	"Zero",		(uintptr_t)_rdc_zero,		0,
	"Uncommit",	(uintptr_t)_rdc_uncommit,	0,
	"TrackSize",	(uintptr_t)_rdc_trksize,	0,
	"Provide",	0,				0,
	0,		0,				0
};

static nsc_def_t _rdc_ior_def[] = {
	"Open",		(uintptr_t)_rdc_openr,		0,
	"Close",	(uintptr_t)_rdc_close,		0,
	"Attach",	(uintptr_t)_rdc_attach,		0,
	"Detach",	(uintptr_t)_rdc_detach,		0,
	"AllocHandle",	(uintptr_t)_rdc_alloc_handle,	0,
	"FreeHandle",	(uintptr_t)_rdc_free_handle,	0,
	"AllocBuf",	(uintptr_t)_rdc_alloc_buf,	0,
	"FreeBuf",	(uintptr_t)_rdc_free_buf,	0,
	"GetPinned",	(uintptr_t)_rdc_get_pinned,	0,
	"Discard",	(uintptr_t)_rdc_discard_pinned,	0,
	"PartSize",	(uintptr_t)_rdc_partsize,	0,
	"MaxFbas",	(uintptr_t)_rdc_maxfbas,	0,
	"Control",	(uintptr_t)_rdc_control,	0,
	"Read",		(uintptr_t)_rdc_read,		0,
	"Write",	(uintptr_t)_rdc_write,		0,
	"Zero",		(uintptr_t)_rdc_zero,		0,
	"Uncommit",	(uintptr_t)_rdc_uncommit,	0,
	"TrackSize",	(uintptr_t)_rdc_trksize,	0,
	"Provide",	0,				0,
	0,		0,				0
};
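/*
 * The three tables above wire this io provider into nsctl: _rdc_fd_def is
 * passed to nsc_open() for each data device (rdc_idev_open), while
 * _rdc_io_def and _rdc_ior_def are handed to nsc_register_io() by
 * _rdc_init_dev() for the cached and raw access paths respectively.
 */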