189b17223SAlexander Motin /*- 289b17223SAlexander Motin * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 389b17223SAlexander Motin * All rights reserved. 489b17223SAlexander Motin * 589b17223SAlexander Motin * Redistribution and use in source and binary forms, with or without 689b17223SAlexander Motin * modification, are permitted provided that the following conditions 789b17223SAlexander Motin * are met: 889b17223SAlexander Motin * 1. Redistributions of source code must retain the above copyright 989b17223SAlexander Motin * notice, this list of conditions and the following disclaimer. 1089b17223SAlexander Motin * 2. Redistributions in binary form must reproduce the above copyright 1189b17223SAlexander Motin * notice, this list of conditions and the following disclaimer in the 1289b17223SAlexander Motin * documentation and/or other materials provided with the distribution. 1389b17223SAlexander Motin * 1489b17223SAlexander Motin * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 1589b17223SAlexander Motin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1689b17223SAlexander Motin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1789b17223SAlexander Motin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 1889b17223SAlexander Motin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1989b17223SAlexander Motin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2089b17223SAlexander Motin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2189b17223SAlexander Motin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2289b17223SAlexander Motin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2389b17223SAlexander Motin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2489b17223SAlexander Motin * SUCH DAMAGE. 2589b17223SAlexander Motin */ 2689b17223SAlexander Motin 2789b17223SAlexander Motin #include <sys/cdefs.h> 2889b17223SAlexander Motin __FBSDID("$FreeBSD$"); 2989b17223SAlexander Motin 3089b17223SAlexander Motin #include <sys/param.h> 3189b17223SAlexander Motin #include <sys/bio.h> 3289b17223SAlexander Motin #include <sys/endian.h> 3389b17223SAlexander Motin #include <sys/kernel.h> 3489b17223SAlexander Motin #include <sys/kobj.h> 3589b17223SAlexander Motin #include <sys/limits.h> 3689b17223SAlexander Motin #include <sys/lock.h> 3789b17223SAlexander Motin #include <sys/malloc.h> 3889b17223SAlexander Motin #include <sys/mutex.h> 3989b17223SAlexander Motin #include <sys/sysctl.h> 4089b17223SAlexander Motin #include <sys/systm.h> 4189b17223SAlexander Motin #include <geom/geom.h> 4289b17223SAlexander Motin #include "geom/raid/g_raid.h" 4389b17223SAlexander Motin #include "g_raid_tr_if.h" 4489b17223SAlexander Motin 45c89d2fbeSAlexander Motin SYSCTL_DECL(_kern_geom_raid_raid1); 4689b17223SAlexander Motin 4789b17223SAlexander Motin #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 4889b17223SAlexander Motin static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; 4989b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size", 5089b17223SAlexander Motin &g_raid1_rebuild_slab); 5189b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW, 5289b17223SAlexander Motin &g_raid1_rebuild_slab, 0, 5389b17223SAlexander Motin "Amount of the disk to rebuild each read/write cycle of the rebuild."); 5489b17223SAlexander Motin 5589b17223SAlexander Motin #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 5689b17223SAlexander Motin static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; 5789b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io", 5889b17223SAlexander Motin &g_raid1_rebuild_fair_io); 5989b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW, 6089b17223SAlexander Motin &g_raid1_rebuild_fair_io, 0, 6189b17223SAlexander Motin "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 6289b17223SAlexander Motin 6389b17223SAlexander Motin #define RAID1_REBUILD_CLUSTER_IDLE 100 6489b17223SAlexander Motin static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; 6589b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle", 6689b17223SAlexander Motin &g_raid1_rebuild_cluster_idle); 6789b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW, 6889b17223SAlexander Motin &g_raid1_rebuild_cluster_idle, 0, 6989b17223SAlexander Motin "Number of slabs to do each time we trigger a rebuild cycle"); 7089b17223SAlexander Motin 7189b17223SAlexander Motin #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 7289b17223SAlexander Motin static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; 7389b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update", 7489b17223SAlexander Motin &g_raid1_rebuild_meta_update); 7589b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW, 7689b17223SAlexander Motin &g_raid1_rebuild_meta_update, 0, 7789b17223SAlexander Motin "When to update the meta data."); 7889b17223SAlexander Motin 7989b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); 8089b17223SAlexander Motin 8189b17223SAlexander Motin #define TR_RAID1_NONE 0 8289b17223SAlexander Motin #define TR_RAID1_REBUILD 1 8389b17223SAlexander Motin #define TR_RAID1_RESYNC 2 8489b17223SAlexander Motin 8589b17223SAlexander Motin #define TR_RAID1_F_DOING_SOME 0x1 8689b17223SAlexander Motin #define TR_RAID1_F_LOCKED 0x2 8789b17223SAlexander Motin #define TR_RAID1_F_ABORT 0x4 8889b17223SAlexander Motin 8989b17223SAlexander Motin struct g_raid_tr_raid1_object { 9089b17223SAlexander Motin struct g_raid_tr_object trso_base; 9189b17223SAlexander Motin int trso_starting; 9289b17223SAlexander Motin int trso_stopping; 9389b17223SAlexander Motin int trso_type; 9489b17223SAlexander Motin int trso_recover_slabs; /* slabs before rest */ 9589b17223SAlexander Motin int trso_fair_io; 9689b17223SAlexander Motin int trso_meta_update; 9789b17223SAlexander Motin int trso_flags; 9889b17223SAlexander Motin struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 9989b17223SAlexander Motin void *trso_buffer; /* Buffer space */ 10089b17223SAlexander Motin struct bio trso_bio; 10189b17223SAlexander Motin }; 10289b17223SAlexander Motin 10389b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1; 10489b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1; 10589b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1; 10689b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1; 10789b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; 10889b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; 10989b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; 11089b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1; 11189b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1; 11289b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1; 11389b17223SAlexander Motin 11489b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1_methods[] = { 11589b17223SAlexander Motin KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), 11689b17223SAlexander Motin KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), 11789b17223SAlexander Motin KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), 11889b17223SAlexander Motin KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), 11989b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), 12089b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), 12189b17223SAlexander Motin KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), 12289b17223SAlexander Motin KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), 12389b17223SAlexander Motin KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), 12489b17223SAlexander Motin KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), 12589b17223SAlexander Motin { 0, 0 } 12689b17223SAlexander Motin }; 12789b17223SAlexander Motin 12889b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1_class = { 12989b17223SAlexander Motin "RAID1", 13089b17223SAlexander Motin g_raid_tr_raid1_methods, 13189b17223SAlexander Motin sizeof(struct g_raid_tr_raid1_object), 132c89d2fbeSAlexander Motin .trc_enable = 1, 13389b17223SAlexander Motin .trc_priority = 100 13489b17223SAlexander Motin }; 13589b17223SAlexander Motin 13689b17223SAlexander Motin static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); 13789b17223SAlexander Motin static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 13889b17223SAlexander Motin struct g_raid_subdisk *sd); 13989b17223SAlexander Motin 14089b17223SAlexander Motin static int 14189b17223SAlexander Motin g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 14289b17223SAlexander Motin { 14389b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 14489b17223SAlexander Motin 14589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 14689b17223SAlexander Motin if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || 147dbb2e755SAlexander Motin (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && 148dbb2e755SAlexander Motin tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) 14989b17223SAlexander Motin return (G_RAID_TR_TASTE_FAIL); 15089b17223SAlexander Motin trs->trso_starting = 1; 15189b17223SAlexander Motin return (G_RAID_TR_TASTE_SUCCEED); 15289b17223SAlexander Motin } 15389b17223SAlexander Motin 15489b17223SAlexander Motin static int 15589b17223SAlexander Motin g_raid_tr_update_state_raid1(struct g_raid_volume *vol, 15689b17223SAlexander Motin struct g_raid_subdisk *sd) 15789b17223SAlexander Motin { 15889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 15989b17223SAlexander Motin struct g_raid_softc *sc; 16089b17223SAlexander Motin struct g_raid_subdisk *tsd, *bestsd; 16189b17223SAlexander Motin u_int s; 16289b17223SAlexander Motin int i, na, ns; 16389b17223SAlexander Motin 16489b17223SAlexander Motin sc = vol->v_softc; 16589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)vol->v_tr; 16689b17223SAlexander Motin if (trs->trso_stopping && 16789b17223SAlexander Motin (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) 16889b17223SAlexander Motin s = G_RAID_VOLUME_S_STOPPED; 16989b17223SAlexander Motin else if (trs->trso_starting) 17089b17223SAlexander Motin s = G_RAID_VOLUME_S_STARTING; 17189b17223SAlexander Motin else { 17289b17223SAlexander Motin /* Make sure we have at least one ACTIVE disk. */ 17389b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 17489b17223SAlexander Motin if (na == 0) { 17589b17223SAlexander Motin /* 17689b17223SAlexander Motin * Critical situation! We have no any active disk! 17789b17223SAlexander Motin * Choose the best disk we have to make it active. 17889b17223SAlexander Motin */ 17989b17223SAlexander Motin bestsd = &vol->v_subdisks[0]; 18089b17223SAlexander Motin for (i = 1; i < vol->v_disks_count; i++) { 18189b17223SAlexander Motin tsd = &vol->v_subdisks[i]; 18289b17223SAlexander Motin if (tsd->sd_state > bestsd->sd_state) 18389b17223SAlexander Motin bestsd = tsd; 18489b17223SAlexander Motin else if (tsd->sd_state == bestsd->sd_state && 18589b17223SAlexander Motin (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || 18689b17223SAlexander Motin tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 18789b17223SAlexander Motin tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 18889b17223SAlexander Motin bestsd = tsd; 18989b17223SAlexander Motin } 19089b17223SAlexander Motin if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { 19189b17223SAlexander Motin /* We found reasonable candidate. */ 19289b17223SAlexander Motin G_RAID_DEBUG1(1, sc, 19389b17223SAlexander Motin "Promote subdisk %s:%d from %s to ACTIVE.", 19489b17223SAlexander Motin vol->v_name, bestsd->sd_pos, 19589b17223SAlexander Motin g_raid_subdisk_state2str(bestsd->sd_state)); 19689b17223SAlexander Motin g_raid_change_subdisk_state(bestsd, 19789b17223SAlexander Motin G_RAID_SUBDISK_S_ACTIVE); 19889b17223SAlexander Motin g_raid_write_metadata(sc, 19989b17223SAlexander Motin vol, bestsd, bestsd->sd_disk); 20089b17223SAlexander Motin } 20189b17223SAlexander Motin } 20289b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 20389b17223SAlexander Motin ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 20489b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 20589b17223SAlexander Motin if (na == vol->v_disks_count) 20689b17223SAlexander Motin s = G_RAID_VOLUME_S_OPTIMAL; 20789b17223SAlexander Motin else if (na + ns == vol->v_disks_count) 20889b17223SAlexander Motin s = G_RAID_VOLUME_S_SUBOPTIMAL; 20989b17223SAlexander Motin else if (na > 0) 21089b17223SAlexander Motin s = G_RAID_VOLUME_S_DEGRADED; 21189b17223SAlexander Motin else 21289b17223SAlexander Motin s = G_RAID_VOLUME_S_BROKEN; 21389b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); 21489b17223SAlexander Motin } 21589b17223SAlexander Motin if (s != vol->v_state) { 21689b17223SAlexander Motin g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 21789b17223SAlexander Motin G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 21889b17223SAlexander Motin G_RAID_EVENT_VOLUME); 21989b17223SAlexander Motin g_raid_change_volume_state(vol, s); 22089b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping) 22189b17223SAlexander Motin g_raid_write_metadata(sc, vol, NULL, NULL); 22289b17223SAlexander Motin } 22389b17223SAlexander Motin return (0); 22489b17223SAlexander Motin } 22589b17223SAlexander Motin 22689b17223SAlexander Motin static void 22789b17223SAlexander Motin g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 22889b17223SAlexander Motin struct g_raid_disk *disk) 22989b17223SAlexander Motin { 23089b17223SAlexander Motin /* 23189b17223SAlexander Motin * We don't fail the last disk in the pack, since it still has decent 23289b17223SAlexander Motin * data on it and that's better than failing the disk if it is the root 23389b17223SAlexander Motin * file system. 23489b17223SAlexander Motin * 23589b17223SAlexander Motin * XXX should this be controlled via a tunable? It makes sense for 23689b17223SAlexander Motin * the volume that has / on it. I can't think of a case where we'd 23789b17223SAlexander Motin * want the volume to go away on this kind of event. 23889b17223SAlexander Motin */ 23989b17223SAlexander Motin if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && 24089b17223SAlexander Motin g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) 24189b17223SAlexander Motin return; 24289b17223SAlexander Motin g_raid_fail_disk(sc, sd, disk); 24389b17223SAlexander Motin } 24489b17223SAlexander Motin 24589b17223SAlexander Motin static void 24689b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) 24789b17223SAlexander Motin { 24889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 24989b17223SAlexander Motin struct g_raid_subdisk *sd, *good_sd; 25089b17223SAlexander Motin struct bio *bp; 25189b17223SAlexander Motin 25289b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 25389b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) 25489b17223SAlexander Motin return; 25589b17223SAlexander Motin sd = trs->trso_failed_sd; 25689b17223SAlexander Motin good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); 25789b17223SAlexander Motin if (good_sd == NULL) { 25889b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 25989b17223SAlexander Motin return; 26089b17223SAlexander Motin } 26189b17223SAlexander Motin bp = &trs->trso_bio; 26289b17223SAlexander Motin memset(bp, 0, sizeof(*bp)); 26389b17223SAlexander Motin bp->bio_offset = sd->sd_rebuild_pos; 26489b17223SAlexander Motin bp->bio_length = MIN(g_raid1_rebuild_slab, 26589b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 26689b17223SAlexander Motin bp->bio_data = trs->trso_buffer; 26789b17223SAlexander Motin bp->bio_cmd = BIO_READ; 26889b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 26989b17223SAlexander Motin bp->bio_caller1 = good_sd; 27089b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_DOING_SOME; 27189b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_LOCKED; 27289b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ 27389b17223SAlexander Motin bp->bio_offset, bp->bio_length, NULL, bp); 27489b17223SAlexander Motin } 27589b17223SAlexander Motin 27689b17223SAlexander Motin static void 27789b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) 27889b17223SAlexander Motin { 27989b17223SAlexander Motin struct g_raid_volume *vol; 28089b17223SAlexander Motin struct g_raid_subdisk *sd; 28189b17223SAlexander Motin 28289b17223SAlexander Motin vol = trs->trso_base.tro_volume; 28389b17223SAlexander Motin sd = trs->trso_failed_sd; 28489b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 28589b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 28689b17223SAlexander Motin trs->trso_buffer = NULL; 28789b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 28889b17223SAlexander Motin trs->trso_type = TR_RAID1_NONE; 28989b17223SAlexander Motin trs->trso_recover_slabs = 0; 29089b17223SAlexander Motin trs->trso_failed_sd = NULL; 29189b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 29289b17223SAlexander Motin } 29389b17223SAlexander Motin 29489b17223SAlexander Motin static void 29589b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) 29689b17223SAlexander Motin { 29789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 29889b17223SAlexander Motin struct g_raid_subdisk *sd; 29989b17223SAlexander Motin 30089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 30189b17223SAlexander Motin sd = trs->trso_failed_sd; 30289b17223SAlexander Motin G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 30389b17223SAlexander Motin "Subdisk %s:%d-%s rebuild completed.", 30489b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 30589b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 30689b17223SAlexander Motin g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 30789b17223SAlexander Motin sd->sd_rebuild_pos = 0; 30889b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 30989b17223SAlexander Motin } 31089b17223SAlexander Motin 31189b17223SAlexander Motin static void 31289b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) 31389b17223SAlexander Motin { 31489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 31589b17223SAlexander Motin struct g_raid_subdisk *sd; 31689b17223SAlexander Motin struct g_raid_volume *vol; 31789b17223SAlexander Motin off_t len; 31889b17223SAlexander Motin 31989b17223SAlexander Motin vol = tr->tro_volume; 32089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 32189b17223SAlexander Motin sd = trs->trso_failed_sd; 32289b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { 32389b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 32489b17223SAlexander Motin "Subdisk %s:%d-%s rebuild is aborting.", 32589b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 32689b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 32789b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_ABORT; 32889b17223SAlexander Motin } else { 32989b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 33089b17223SAlexander Motin "Subdisk %s:%d-%s rebuild aborted.", 33189b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 33289b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 33389b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_ABORT; 33489b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_LOCKED) { 33589b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 33689b17223SAlexander Motin len = MIN(g_raid1_rebuild_slab, 33789b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 33889b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume, 33989b17223SAlexander Motin sd->sd_rebuild_pos, len); 34089b17223SAlexander Motin } 34189b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 34289b17223SAlexander Motin } 34389b17223SAlexander Motin } 34489b17223SAlexander Motin 34589b17223SAlexander Motin static void 34689b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) 34789b17223SAlexander Motin { 34889b17223SAlexander Motin struct g_raid_volume *vol; 34989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 35089b17223SAlexander Motin struct g_raid_subdisk *sd, *fsd; 35189b17223SAlexander Motin 35289b17223SAlexander Motin vol = tr->tro_volume; 35389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 35489b17223SAlexander Motin if (trs->trso_failed_sd) { 35589b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 35689b17223SAlexander Motin "Already rebuild in start rebuild. pos %jd\n", 35789b17223SAlexander Motin (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 35889b17223SAlexander Motin return; 35989b17223SAlexander Motin } 36089b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); 36189b17223SAlexander Motin if (sd == NULL) { 36289b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 36389b17223SAlexander Motin "No active disk to rebuild. night night."); 36489b17223SAlexander Motin return; 36589b17223SAlexander Motin } 36689b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 36789b17223SAlexander Motin if (fsd == NULL) 36889b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 36989b17223SAlexander Motin if (fsd == NULL) { 37089b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 37189b17223SAlexander Motin if (fsd != NULL) { 37289b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 37389b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 37489b17223SAlexander Motin G_RAID_SUBDISK_S_RESYNC); 37589b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); 37689b17223SAlexander Motin } else { 37789b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 37889b17223SAlexander Motin G_RAID_SUBDISK_S_UNINITIALIZED); 37989b17223SAlexander Motin if (fsd == NULL) 38089b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 38189b17223SAlexander Motin G_RAID_SUBDISK_S_NEW); 38289b17223SAlexander Motin if (fsd != NULL) { 38389b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 38489b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 38589b17223SAlexander Motin G_RAID_SUBDISK_S_REBUILD); 38689b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 38789b17223SAlexander Motin vol, fsd, NULL); 38889b17223SAlexander Motin } 38989b17223SAlexander Motin } 39089b17223SAlexander Motin } 39189b17223SAlexander Motin if (fsd == NULL) { 39289b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 39389b17223SAlexander Motin "No failed disk to rebuild. night night."); 39489b17223SAlexander Motin return; 39589b17223SAlexander Motin } 39689b17223SAlexander Motin trs->trso_failed_sd = fsd; 39789b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 39889b17223SAlexander Motin "Subdisk %s:%d-%s rebuild start at %jd.", 39989b17223SAlexander Motin fsd->sd_volume->v_name, fsd->sd_pos, 40089b17223SAlexander Motin fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", 40189b17223SAlexander Motin trs->trso_failed_sd->sd_rebuild_pos); 40289b17223SAlexander Motin trs->trso_type = TR_RAID1_REBUILD; 40389b17223SAlexander Motin trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); 40489b17223SAlexander Motin trs->trso_meta_update = g_raid1_rebuild_meta_update; 40589b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 40689b17223SAlexander Motin } 40789b17223SAlexander Motin 40889b17223SAlexander Motin 40989b17223SAlexander Motin static void 41089b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 41189b17223SAlexander Motin struct g_raid_subdisk *sd) 41289b17223SAlexander Motin { 41389b17223SAlexander Motin struct g_raid_volume *vol; 41489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 41589b17223SAlexander Motin int na, nr; 41689b17223SAlexander Motin 41789b17223SAlexander Motin /* 41889b17223SAlexander Motin * If we're stopping, don't do anything. If we don't have at least one 41989b17223SAlexander Motin * good disk and one bad disk, we don't do anything. And if there's a 42089b17223SAlexander Motin * 'good disk' stored in the trs, then we're in progress and we punt. 42189b17223SAlexander Motin * If we make it past all these checks, we need to rebuild. 42289b17223SAlexander Motin */ 42389b17223SAlexander Motin vol = tr->tro_volume; 42489b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 42589b17223SAlexander Motin if (trs->trso_stopping) 42689b17223SAlexander Motin return; 42789b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 42889b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 42989b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 43089b17223SAlexander Motin switch(trs->trso_type) { 43189b17223SAlexander Motin case TR_RAID1_NONE: 43289b17223SAlexander Motin if (na == 0) 43389b17223SAlexander Motin return; 43489b17223SAlexander Motin if (nr == 0) { 43589b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 43689b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 43789b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 43889b17223SAlexander Motin if (nr == 0) 43989b17223SAlexander Motin return; 44089b17223SAlexander Motin } 44189b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(tr); 44289b17223SAlexander Motin break; 44389b17223SAlexander Motin case TR_RAID1_REBUILD: 44489b17223SAlexander Motin if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) 44589b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 44689b17223SAlexander Motin break; 44789b17223SAlexander Motin case TR_RAID1_RESYNC: 44889b17223SAlexander Motin break; 44989b17223SAlexander Motin } 45089b17223SAlexander Motin } 45189b17223SAlexander Motin 45289b17223SAlexander Motin static int 45389b17223SAlexander Motin g_raid_tr_event_raid1(struct g_raid_tr_object *tr, 45489b17223SAlexander Motin struct g_raid_subdisk *sd, u_int event) 45589b17223SAlexander Motin { 45689b17223SAlexander Motin 45789b17223SAlexander Motin g_raid_tr_update_state_raid1(tr->tro_volume, sd); 45889b17223SAlexander Motin return (0); 45989b17223SAlexander Motin } 46089b17223SAlexander Motin 46189b17223SAlexander Motin static int 46289b17223SAlexander Motin g_raid_tr_start_raid1(struct g_raid_tr_object *tr) 46389b17223SAlexander Motin { 46489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 46589b17223SAlexander Motin struct g_raid_volume *vol; 46689b17223SAlexander Motin 46789b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 46889b17223SAlexander Motin vol = tr->tro_volume; 46989b17223SAlexander Motin trs->trso_starting = 0; 47089b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 47189b17223SAlexander Motin return (0); 47289b17223SAlexander Motin } 47389b17223SAlexander Motin 47489b17223SAlexander Motin static int 47589b17223SAlexander Motin g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) 47689b17223SAlexander Motin { 47789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 47889b17223SAlexander Motin struct g_raid_volume *vol; 47989b17223SAlexander Motin 48089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 48189b17223SAlexander Motin vol = tr->tro_volume; 48289b17223SAlexander Motin trs->trso_starting = 0; 48389b17223SAlexander Motin trs->trso_stopping = 1; 48489b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 48589b17223SAlexander Motin return (0); 48689b17223SAlexander Motin } 48789b17223SAlexander Motin 48889b17223SAlexander Motin /* 48989b17223SAlexander Motin * Select the disk to read from. Take into account: subdisk state, running 49089b17223SAlexander Motin * error recovery, average disk load, head position and possible cache hits. 49189b17223SAlexander Motin */ 49289b17223SAlexander Motin #define ABS(x) (((x) >= 0) ? (x) : (-(x))) 49389b17223SAlexander Motin static struct g_raid_subdisk * 49489b17223SAlexander Motin g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, 49589b17223SAlexander Motin u_int mask) 49689b17223SAlexander Motin { 49789b17223SAlexander Motin struct g_raid_subdisk *sd, *best; 49889b17223SAlexander Motin int i, prio, bestprio; 49989b17223SAlexander Motin 50089b17223SAlexander Motin best = NULL; 50189b17223SAlexander Motin bestprio = INT_MAX; 50289b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 50389b17223SAlexander Motin sd = &vol->v_subdisks[i]; 50489b17223SAlexander Motin if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && 50589b17223SAlexander Motin ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && 50689b17223SAlexander Motin sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || 50789b17223SAlexander Motin bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) 50889b17223SAlexander Motin continue; 50989b17223SAlexander Motin if ((mask & (1 << i)) != 0) 51089b17223SAlexander Motin continue; 51189b17223SAlexander Motin prio = G_RAID_SUBDISK_LOAD(sd); 51289b17223SAlexander Motin prio += min(sd->sd_recovery, 255) << 22; 51389b17223SAlexander Motin prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; 51489b17223SAlexander Motin /* If disk head is precisely in position - highly prefer it. */ 51589b17223SAlexander Motin if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) 51689b17223SAlexander Motin prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 51789b17223SAlexander Motin else 51889b17223SAlexander Motin /* If disk head is close to position - prefer it. */ 51989b17223SAlexander Motin if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < 52089b17223SAlexander Motin G_RAID_SUBDISK_TRACK_SIZE) 52189b17223SAlexander Motin prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 52289b17223SAlexander Motin if (prio < bestprio) { 52389b17223SAlexander Motin best = sd; 52489b17223SAlexander Motin bestprio = prio; 52589b17223SAlexander Motin } 52689b17223SAlexander Motin } 52789b17223SAlexander Motin return (best); 52889b17223SAlexander Motin } 52989b17223SAlexander Motin 53089b17223SAlexander Motin static void 53189b17223SAlexander Motin g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) 53289b17223SAlexander Motin { 53389b17223SAlexander Motin struct g_raid_subdisk *sd; 53489b17223SAlexander Motin struct bio *cbp; 53589b17223SAlexander Motin 53689b17223SAlexander Motin sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); 53789b17223SAlexander Motin KASSERT(sd != NULL, ("No active disks in volume %s.", 53889b17223SAlexander Motin tr->tro_volume->v_name)); 53989b17223SAlexander Motin 54089b17223SAlexander Motin cbp = g_clone_bio(bp); 54189b17223SAlexander Motin if (cbp == NULL) { 54289b17223SAlexander Motin g_raid_iodone(bp, ENOMEM); 54389b17223SAlexander Motin return; 54489b17223SAlexander Motin } 54589b17223SAlexander Motin 54689b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 54789b17223SAlexander Motin } 54889b17223SAlexander Motin 54989b17223SAlexander Motin static void 55089b17223SAlexander Motin g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) 55189b17223SAlexander Motin { 55289b17223SAlexander Motin struct g_raid_volume *vol; 55389b17223SAlexander Motin struct g_raid_subdisk *sd; 55489b17223SAlexander Motin struct bio_queue_head queue; 55589b17223SAlexander Motin struct bio *cbp; 55689b17223SAlexander Motin int i; 55789b17223SAlexander Motin 55889b17223SAlexander Motin vol = tr->tro_volume; 55989b17223SAlexander Motin 56089b17223SAlexander Motin /* 56189b17223SAlexander Motin * Allocate all bios before sending any request, so we can return 56289b17223SAlexander Motin * ENOMEM in nice and clean way. 56389b17223SAlexander Motin */ 56489b17223SAlexander Motin bioq_init(&queue); 56589b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 56689b17223SAlexander Motin sd = &vol->v_subdisks[i]; 56789b17223SAlexander Motin switch (sd->sd_state) { 56889b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 56989b17223SAlexander Motin break; 57089b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 57189b17223SAlexander Motin /* 57289b17223SAlexander Motin * When rebuilding, only part of this subdisk is 57389b17223SAlexander Motin * writable, the rest will be written as part of the 57489b17223SAlexander Motin * that process. 57589b17223SAlexander Motin */ 57689b17223SAlexander Motin if (bp->bio_offset >= sd->sd_rebuild_pos) 57789b17223SAlexander Motin continue; 57889b17223SAlexander Motin break; 57989b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 58089b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 58189b17223SAlexander Motin /* 58289b17223SAlexander Motin * Resyncing still writes on the theory that the 58389b17223SAlexander Motin * resync'd disk is very close and writing it will 58489b17223SAlexander Motin * keep it that way better if we keep up while 58589b17223SAlexander Motin * resyncing. 58689b17223SAlexander Motin */ 58789b17223SAlexander Motin break; 58889b17223SAlexander Motin default: 58989b17223SAlexander Motin continue; 59089b17223SAlexander Motin } 59189b17223SAlexander Motin cbp = g_clone_bio(bp); 59289b17223SAlexander Motin if (cbp == NULL) 59389b17223SAlexander Motin goto failure; 59489b17223SAlexander Motin cbp->bio_caller1 = sd; 59589b17223SAlexander Motin bioq_insert_tail(&queue, cbp); 59689b17223SAlexander Motin } 59789b17223SAlexander Motin for (cbp = bioq_first(&queue); cbp != NULL; 59889b17223SAlexander Motin cbp = bioq_first(&queue)) { 59989b17223SAlexander Motin bioq_remove(&queue, cbp); 60089b17223SAlexander Motin sd = cbp->bio_caller1; 60189b17223SAlexander Motin cbp->bio_caller1 = NULL; 60289b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 60389b17223SAlexander Motin } 60489b17223SAlexander Motin return; 60589b17223SAlexander Motin failure: 60689b17223SAlexander Motin for (cbp = bioq_first(&queue); cbp != NULL; 60789b17223SAlexander Motin cbp = bioq_first(&queue)) { 60889b17223SAlexander Motin bioq_remove(&queue, cbp); 60989b17223SAlexander Motin g_destroy_bio(cbp); 61089b17223SAlexander Motin } 61189b17223SAlexander Motin if (bp->bio_error == 0) 61289b17223SAlexander Motin bp->bio_error = ENOMEM; 61389b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error); 61489b17223SAlexander Motin } 61589b17223SAlexander Motin 61689b17223SAlexander Motin static void 61789b17223SAlexander Motin g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) 61889b17223SAlexander Motin { 61989b17223SAlexander Motin struct g_raid_volume *vol; 62089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 62189b17223SAlexander Motin 62289b17223SAlexander Motin vol = tr->tro_volume; 62389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 62489b17223SAlexander Motin if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 62589b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 62689b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 62789b17223SAlexander Motin g_raid_iodone(bp, EIO); 62889b17223SAlexander Motin return; 62989b17223SAlexander Motin } 63089b17223SAlexander Motin /* 63189b17223SAlexander Motin * If we're rebuilding, squeeze in rebuild activity every so often, 63289b17223SAlexander Motin * even when the disk is busy. Be sure to only count real I/O 63389b17223SAlexander Motin * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 63489b17223SAlexander Motin * by this module. 63589b17223SAlexander Motin */ 63689b17223SAlexander Motin if (trs->trso_failed_sd != NULL && 63789b17223SAlexander Motin !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 63889b17223SAlexander Motin /* Make this new or running now round short. */ 63989b17223SAlexander Motin trs->trso_recover_slabs = 0; 64089b17223SAlexander Motin if (--trs->trso_fair_io <= 0) { 64189b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 64289b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 64389b17223SAlexander Motin } 64489b17223SAlexander Motin } 64589b17223SAlexander Motin switch (bp->bio_cmd) { 64689b17223SAlexander Motin case BIO_READ: 64789b17223SAlexander Motin g_raid_tr_iostart_raid1_read(tr, bp); 64889b17223SAlexander Motin break; 64989b17223SAlexander Motin case BIO_WRITE: 65089b17223SAlexander Motin case BIO_DELETE: 651*609a7474SAlexander Motin g_raid_tr_iostart_raid1_write(tr, bp); 65289b17223SAlexander Motin break; 65389b17223SAlexander Motin case BIO_FLUSH: 65489b17223SAlexander Motin g_raid_tr_flush_common(tr, bp); 65589b17223SAlexander Motin break; 65689b17223SAlexander Motin default: 65789b17223SAlexander Motin KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 65889b17223SAlexander Motin bp->bio_cmd, vol->v_name)); 65989b17223SAlexander Motin break; 66089b17223SAlexander Motin } 66189b17223SAlexander Motin } 66289b17223SAlexander Motin 66389b17223SAlexander Motin static void 66489b17223SAlexander Motin g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, 66589b17223SAlexander Motin struct g_raid_subdisk *sd, struct bio *bp) 66689b17223SAlexander Motin { 66789b17223SAlexander Motin struct bio *cbp; 66889b17223SAlexander Motin struct g_raid_subdisk *nsd; 66989b17223SAlexander Motin struct g_raid_volume *vol; 67089b17223SAlexander Motin struct bio *pbp; 67189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 67289b17223SAlexander Motin uintptr_t *mask; 67389b17223SAlexander Motin int error, do_write; 67489b17223SAlexander Motin 67589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 67689b17223SAlexander Motin vol = tr->tro_volume; 67789b17223SAlexander Motin if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 67889b17223SAlexander Motin /* 67989b17223SAlexander Motin * This operation is part of a rebuild or resync operation. 68089b17223SAlexander Motin * See what work just got done, then schedule the next bit of 68189b17223SAlexander Motin * work, if any. Rebuild/resync is done a little bit at a 68289b17223SAlexander Motin * time. Either when a timeout happens, or after we get a 68389b17223SAlexander Motin * bunch of I/Os to the disk (to make sure an active system 68489b17223SAlexander Motin * will complete in a sane amount of time). 68589b17223SAlexander Motin * 68689b17223SAlexander Motin * We are setup to do differing amounts of work for each of 68789b17223SAlexander Motin * these cases. so long as the slabs is smallish (less than 68889b17223SAlexander Motin * 50 or so, I'd guess, but that's just a WAG), we shouldn't 68989b17223SAlexander Motin * have any bio starvation issues. For active disks, we do 69089b17223SAlexander Motin * 5MB of data, for inactive ones, we do 50MB. 69189b17223SAlexander Motin */ 69289b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) { 69389b17223SAlexander Motin if (bp->bio_cmd == BIO_READ) { 69489b17223SAlexander Motin 69589b17223SAlexander Motin /* Immediately abort rebuild, if requested. */ 69689b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_ABORT) { 69789b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 69889b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 69989b17223SAlexander Motin return; 70089b17223SAlexander Motin } 70189b17223SAlexander Motin 70289b17223SAlexander Motin /* On read error, skip and cross fingers. */ 70389b17223SAlexander Motin if (bp->bio_error != 0) { 70489b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 70589b17223SAlexander Motin "Read error during rebuild (%d), " 70689b17223SAlexander Motin "possible data loss!", 70789b17223SAlexander Motin bp->bio_error); 70889b17223SAlexander Motin goto rebuild_round_done; 70989b17223SAlexander Motin } 71089b17223SAlexander Motin 71189b17223SAlexander Motin /* 71289b17223SAlexander Motin * The read operation finished, queue the 71389b17223SAlexander Motin * write and get out. 71489b17223SAlexander Motin */ 71589b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "rebuild read done. %d", 71689b17223SAlexander Motin bp->bio_error); 71789b17223SAlexander Motin bp->bio_cmd = BIO_WRITE; 71889b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 71989b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); 72089b17223SAlexander Motin g_raid_subdisk_iostart(trs->trso_failed_sd, bp); 72189b17223SAlexander Motin } else { 72289b17223SAlexander Motin /* 72389b17223SAlexander Motin * The write operation just finished. Do 72489b17223SAlexander Motin * another. We keep cloning the master bio 72589b17223SAlexander Motin * since it has the right buffers allocated to 72689b17223SAlexander Motin * it. 72789b17223SAlexander Motin */ 72889b17223SAlexander Motin G_RAID_LOGREQ(4, bp, 72989b17223SAlexander Motin "rebuild write done. Error %d", 73089b17223SAlexander Motin bp->bio_error); 73189b17223SAlexander Motin nsd = trs->trso_failed_sd; 73289b17223SAlexander Motin if (bp->bio_error != 0 || 73389b17223SAlexander Motin trs->trso_flags & TR_RAID1_F_ABORT) { 73489b17223SAlexander Motin if ((trs->trso_flags & 73589b17223SAlexander Motin TR_RAID1_F_ABORT) == 0) { 73689b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, 73789b17223SAlexander Motin nsd, nsd->sd_disk); 73889b17223SAlexander Motin } 73989b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 74089b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 74189b17223SAlexander Motin return; 74289b17223SAlexander Motin } 74389b17223SAlexander Motin rebuild_round_done: 74489b17223SAlexander Motin nsd = trs->trso_failed_sd; 74589b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 74689b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, 74789b17223SAlexander Motin bp->bio_offset, bp->bio_length); 74889b17223SAlexander Motin nsd->sd_rebuild_pos += bp->bio_length; 74989b17223SAlexander Motin if (nsd->sd_rebuild_pos >= nsd->sd_size) { 75089b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(tr); 75189b17223SAlexander Motin return; 75289b17223SAlexander Motin } 75389b17223SAlexander Motin 75489b17223SAlexander Motin /* Abort rebuild if we are stopping */ 75589b17223SAlexander Motin if (trs->trso_stopping) { 75689b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 75789b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 75889b17223SAlexander Motin return; 75989b17223SAlexander Motin } 76089b17223SAlexander Motin 76189b17223SAlexander Motin if (--trs->trso_meta_update <= 0) { 76289b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 76389b17223SAlexander Motin vol, nsd, nsd->sd_disk); 76489b17223SAlexander Motin trs->trso_meta_update = 76589b17223SAlexander Motin g_raid1_rebuild_meta_update; 76689b17223SAlexander Motin } 76789b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 76889b17223SAlexander Motin if (--trs->trso_recover_slabs <= 0) 76989b17223SAlexander Motin return; 77089b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 77189b17223SAlexander Motin } 77289b17223SAlexander Motin } else if (trs->trso_type == TR_RAID1_RESYNC) { 77389b17223SAlexander Motin /* 77489b17223SAlexander Motin * read good sd, read bad sd in parallel. when both 77589b17223SAlexander Motin * done, compare the buffers. write good to the bad 77689b17223SAlexander Motin * if different. do the next bit of work. 77789b17223SAlexander Motin */ 77889b17223SAlexander Motin panic("Somehow, we think we're doing a resync"); 77989b17223SAlexander Motin } 78089b17223SAlexander Motin return; 78189b17223SAlexander Motin } 78289b17223SAlexander Motin pbp = bp->bio_parent; 78389b17223SAlexander Motin pbp->bio_inbed++; 78489b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { 78589b17223SAlexander Motin /* 78689b17223SAlexander Motin * Read failed on first drive. Retry the read error on 78789b17223SAlexander Motin * another disk drive, if available, before erroring out the 78889b17223SAlexander Motin * read. 78989b17223SAlexander Motin */ 79089b17223SAlexander Motin sd->sd_disk->d_read_errs++; 79189b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 79289b17223SAlexander Motin "Read error (%d), %d read errors total", 79389b17223SAlexander Motin bp->bio_error, sd->sd_disk->d_read_errs); 79489b17223SAlexander Motin 79589b17223SAlexander Motin /* 79689b17223SAlexander Motin * If there are too many read errors, we move to degraded. 79789b17223SAlexander Motin * XXX Do we want to FAIL the drive (eg, make the user redo 79889b17223SAlexander Motin * everything to get it back in sync), or just degrade the 79989b17223SAlexander Motin * drive, which kicks off a resync? 80089b17223SAlexander Motin */ 80189b17223SAlexander Motin do_write = 1; 80289b17223SAlexander Motin if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { 80389b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 80489b17223SAlexander Motin if (pbp->bio_children == 1) 80589b17223SAlexander Motin do_write = 0; 80689b17223SAlexander Motin } 80789b17223SAlexander Motin 80889b17223SAlexander Motin /* 80989b17223SAlexander Motin * Find the other disk, and try to do the I/O to it. 81089b17223SAlexander Motin */ 81189b17223SAlexander Motin mask = (uintptr_t *)(&pbp->bio_driver2); 81289b17223SAlexander Motin if (pbp->bio_children == 1) { 81389b17223SAlexander Motin /* Save original subdisk. */ 81489b17223SAlexander Motin pbp->bio_driver1 = do_write ? sd : NULL; 81589b17223SAlexander Motin *mask = 0; 81689b17223SAlexander Motin } 81789b17223SAlexander Motin *mask |= 1 << sd->sd_pos; 81889b17223SAlexander Motin nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); 81989b17223SAlexander Motin if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { 82089b17223SAlexander Motin g_destroy_bio(bp); 82189b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, "Retrying read from %d", 82289b17223SAlexander Motin nsd->sd_pos); 82389b17223SAlexander Motin if (pbp->bio_children == 2 && do_write) { 82489b17223SAlexander Motin sd->sd_recovery++; 82589b17223SAlexander Motin cbp->bio_caller1 = nsd; 82689b17223SAlexander Motin pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; 82789b17223SAlexander Motin /* Lock callback starts I/O */ 82889b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, 82989b17223SAlexander Motin cbp->bio_offset, cbp->bio_length, pbp, cbp); 83089b17223SAlexander Motin } else { 83189b17223SAlexander Motin g_raid_subdisk_iostart(nsd, cbp); 83289b17223SAlexander Motin } 83389b17223SAlexander Motin return; 83489b17223SAlexander Motin } 83589b17223SAlexander Motin /* 83689b17223SAlexander Motin * We can't retry. Return the original error by falling 83789b17223SAlexander Motin * through. This will happen when there's only one good disk. 83889b17223SAlexander Motin * We don't need to fail the raid, since its actual state is 83989b17223SAlexander Motin * based on the state of the subdisks. 84089b17223SAlexander Motin */ 84189b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); 84289b17223SAlexander Motin } 84389b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && 84489b17223SAlexander Motin bp->bio_error == 0 && 84589b17223SAlexander Motin pbp->bio_children > 1 && 84689b17223SAlexander Motin pbp->bio_driver1 != NULL) { 84789b17223SAlexander Motin /* 84889b17223SAlexander Motin * If it was a read, and bio_children is >1, then we just 84989b17223SAlexander Motin * recovered the data from the second drive. We should try to 85089b17223SAlexander Motin * write that data to the first drive if sector remapping is 85189b17223SAlexander Motin * enabled. A write should put the data in a new place on the 85289b17223SAlexander Motin * disk, remapping the bad sector. Do we need to do that by 85389b17223SAlexander Motin * queueing a request to the main worker thread? It doesn't 85489b17223SAlexander Motin * affect the return code of this current read, and can be 85589b17223SAlexander Motin * done at our liesure. However, to make the code simpler, it 85689b17223SAlexander Motin * is done syncrhonously. 85789b17223SAlexander Motin */ 85889b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); 85989b17223SAlexander Motin cbp = g_clone_bio(pbp); 86089b17223SAlexander Motin if (cbp != NULL) { 86189b17223SAlexander Motin g_destroy_bio(bp); 86289b17223SAlexander Motin cbp->bio_cmd = BIO_WRITE; 86389b17223SAlexander Motin cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; 86489b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, 86589b17223SAlexander Motin "Attempting bad sector remap on failing drive."); 86689b17223SAlexander Motin g_raid_subdisk_iostart(pbp->bio_driver1, cbp); 86789b17223SAlexander Motin return; 86889b17223SAlexander Motin } 86989b17223SAlexander Motin } 87089b17223SAlexander Motin if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { 87189b17223SAlexander Motin /* 87289b17223SAlexander Motin * We're done with a recovery, mark the range as unlocked. 87389b17223SAlexander Motin * For any write errors, we agressively fail the disk since 87489b17223SAlexander Motin * there was both a READ and a WRITE error at this location. 87589b17223SAlexander Motin * Both types of errors generally indicates the drive is on 87689b17223SAlexander Motin * the verge of total failure anyway. Better to stop trusting 87789b17223SAlexander Motin * it now. However, we need to reset error to 0 in that case 87889b17223SAlexander Motin * because we're not failing the original I/O which succeeded. 87989b17223SAlexander Motin */ 88089b17223SAlexander Motin if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 88189b17223SAlexander Motin G_RAID_LOGREQ(0, bp, "Remap write failed: " 88289b17223SAlexander Motin "failing subdisk."); 88389b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 88489b17223SAlexander Motin bp->bio_error = 0; 88589b17223SAlexander Motin } 88689b17223SAlexander Motin if (pbp->bio_driver1 != NULL) { 88789b17223SAlexander Motin ((struct g_raid_subdisk *)pbp->bio_driver1) 88889b17223SAlexander Motin ->sd_recovery--; 88989b17223SAlexander Motin } 89089b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 89189b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, bp->bio_offset, 89289b17223SAlexander Motin bp->bio_length); 89389b17223SAlexander Motin } 894*609a7474SAlexander Motin if (pbp->bio_cmd == BIO_WRITE) { 895ef844ef7SAlexander Motin if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 896ef844ef7SAlexander Motin pbp->bio_error = bp->bio_error; 897ef844ef7SAlexander Motin if (bp->bio_error != 0) { 898ef844ef7SAlexander Motin G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 899ef844ef7SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 900ef844ef7SAlexander Motin } 901ef844ef7SAlexander Motin error = pbp->bio_error; 902ef844ef7SAlexander Motin } else 90389b17223SAlexander Motin error = bp->bio_error; 90489b17223SAlexander Motin g_destroy_bio(bp); 90589b17223SAlexander Motin if (pbp->bio_children == pbp->bio_inbed) { 90689b17223SAlexander Motin pbp->bio_completed = pbp->bio_length; 90789b17223SAlexander Motin g_raid_iodone(pbp, error); 90889b17223SAlexander Motin } 90989b17223SAlexander Motin } 91089b17223SAlexander Motin 91189b17223SAlexander Motin static int 91289b17223SAlexander Motin g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, 91389b17223SAlexander Motin void *virtual, vm_offset_t physical, off_t offset, size_t length) 91489b17223SAlexander Motin { 91589b17223SAlexander Motin struct g_raid_volume *vol; 91689b17223SAlexander Motin struct g_raid_subdisk *sd; 91789b17223SAlexander Motin int error, i, ok; 91889b17223SAlexander Motin 91989b17223SAlexander Motin vol = tr->tro_volume; 92089b17223SAlexander Motin error = 0; 92189b17223SAlexander Motin ok = 0; 92289b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 92389b17223SAlexander Motin sd = &vol->v_subdisks[i]; 92489b17223SAlexander Motin switch (sd->sd_state) { 92589b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 92689b17223SAlexander Motin break; 92789b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 92889b17223SAlexander Motin /* 92989b17223SAlexander Motin * When rebuilding, only part of this subdisk is 93089b17223SAlexander Motin * writable, the rest will be written as part of the 93189b17223SAlexander Motin * that process. 93289b17223SAlexander Motin */ 93389b17223SAlexander Motin if (offset >= sd->sd_rebuild_pos) 93489b17223SAlexander Motin continue; 93589b17223SAlexander Motin break; 93689b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 93789b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 93889b17223SAlexander Motin /* 93989b17223SAlexander Motin * Resyncing still writes on the theory that the 94089b17223SAlexander Motin * resync'd disk is very close and writing it will 94189b17223SAlexander Motin * keep it that way better if we keep up while 94289b17223SAlexander Motin * resyncing. 94389b17223SAlexander Motin */ 94489b17223SAlexander Motin break; 94589b17223SAlexander Motin default: 94689b17223SAlexander Motin continue; 94789b17223SAlexander Motin } 94889b17223SAlexander Motin error = g_raid_subdisk_kerneldump(sd, 94989b17223SAlexander Motin virtual, physical, offset, length); 95089b17223SAlexander Motin if (error == 0) 95189b17223SAlexander Motin ok++; 95289b17223SAlexander Motin } 95389b17223SAlexander Motin return (ok > 0 ? 0 : error); 95489b17223SAlexander Motin } 95589b17223SAlexander Motin 95689b17223SAlexander Motin static int 95789b17223SAlexander Motin g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) 95889b17223SAlexander Motin { 95989b17223SAlexander Motin struct bio *bp; 96089b17223SAlexander Motin struct g_raid_subdisk *sd; 96189b17223SAlexander Motin 96289b17223SAlexander Motin bp = (struct bio *)argp; 96389b17223SAlexander Motin sd = (struct g_raid_subdisk *)bp->bio_caller1; 96489b17223SAlexander Motin g_raid_subdisk_iostart(sd, bp); 96589b17223SAlexander Motin 96689b17223SAlexander Motin return (0); 96789b17223SAlexander Motin } 96889b17223SAlexander Motin 96989b17223SAlexander Motin static int 97089b17223SAlexander Motin g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) 97189b17223SAlexander Motin { 97289b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 97389b17223SAlexander Motin 97489b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 97589b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 97689b17223SAlexander Motin trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; 97789b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) 97889b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 97989b17223SAlexander Motin return (0); 98089b17223SAlexander Motin } 98189b17223SAlexander Motin 98289b17223SAlexander Motin static int 98389b17223SAlexander Motin g_raid_tr_free_raid1(struct g_raid_tr_object *tr) 98489b17223SAlexander Motin { 98589b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 98689b17223SAlexander Motin 98789b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 98889b17223SAlexander Motin 98989b17223SAlexander Motin if (trs->trso_buffer != NULL) { 99089b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 99189b17223SAlexander Motin trs->trso_buffer = NULL; 99289b17223SAlexander Motin } 99389b17223SAlexander Motin return (0); 99489b17223SAlexander Motin } 99589b17223SAlexander Motin 996c89d2fbeSAlexander Motin G_RAID_TR_DECLARE(raid1, "RAID1"); 997