189b17223SAlexander Motin /*- 289b17223SAlexander Motin * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 389b17223SAlexander Motin * All rights reserved. 489b17223SAlexander Motin * 589b17223SAlexander Motin * Redistribution and use in source and binary forms, with or without 689b17223SAlexander Motin * modification, are permitted provided that the following conditions 789b17223SAlexander Motin * are met: 889b17223SAlexander Motin * 1. Redistributions of source code must retain the above copyright 989b17223SAlexander Motin * notice, this list of conditions and the following disclaimer. 1089b17223SAlexander Motin * 2. Redistributions in binary form must reproduce the above copyright 1189b17223SAlexander Motin * notice, this list of conditions and the following disclaimer in the 1289b17223SAlexander Motin * documentation and/or other materials provided with the distribution. 1389b17223SAlexander Motin * 1489b17223SAlexander Motin * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 1589b17223SAlexander Motin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1689b17223SAlexander Motin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1789b17223SAlexander Motin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 1889b17223SAlexander Motin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1989b17223SAlexander Motin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2089b17223SAlexander Motin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2189b17223SAlexander Motin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2289b17223SAlexander Motin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2389b17223SAlexander Motin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2489b17223SAlexander Motin * SUCH DAMAGE. 2589b17223SAlexander Motin */ 2689b17223SAlexander Motin 2789b17223SAlexander Motin #include <sys/cdefs.h> 2889b17223SAlexander Motin __FBSDID("$FreeBSD$"); 2989b17223SAlexander Motin 3089b17223SAlexander Motin #include <sys/param.h> 3189b17223SAlexander Motin #include <sys/bio.h> 3289b17223SAlexander Motin #include <sys/endian.h> 3389b17223SAlexander Motin #include <sys/kernel.h> 3489b17223SAlexander Motin #include <sys/kobj.h> 3589b17223SAlexander Motin #include <sys/limits.h> 3689b17223SAlexander Motin #include <sys/lock.h> 3789b17223SAlexander Motin #include <sys/malloc.h> 3889b17223SAlexander Motin #include <sys/mutex.h> 3989b17223SAlexander Motin #include <sys/sysctl.h> 4089b17223SAlexander Motin #include <sys/systm.h> 4189b17223SAlexander Motin #include <geom/geom.h> 4289b17223SAlexander Motin #include "geom/raid/g_raid.h" 4389b17223SAlexander Motin #include "g_raid_tr_if.h" 4489b17223SAlexander Motin 4589b17223SAlexander Motin SYSCTL_DECL(_kern_geom_raid); 466472ac3dSEd Schouten static SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1, CTLFLAG_RW, 0, 4789b17223SAlexander Motin "RAID1 parameters"); 4889b17223SAlexander Motin 4989b17223SAlexander Motin #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 5089b17223SAlexander Motin static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; 5189b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size", 5289b17223SAlexander Motin &g_raid1_rebuild_slab); 5389b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW, 5489b17223SAlexander Motin &g_raid1_rebuild_slab, 0, 5589b17223SAlexander Motin "Amount of the disk to rebuild each read/write cycle of the rebuild."); 5689b17223SAlexander Motin 5789b17223SAlexander Motin #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 5889b17223SAlexander Motin static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; 5989b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io", 6089b17223SAlexander Motin &g_raid1_rebuild_fair_io); 6189b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW, 6289b17223SAlexander Motin &g_raid1_rebuild_fair_io, 0, 6389b17223SAlexander Motin "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 6489b17223SAlexander Motin 6589b17223SAlexander Motin #define RAID1_REBUILD_CLUSTER_IDLE 100 6689b17223SAlexander Motin static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; 6789b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle", 6889b17223SAlexander Motin &g_raid1_rebuild_cluster_idle); 6989b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW, 7089b17223SAlexander Motin &g_raid1_rebuild_cluster_idle, 0, 7189b17223SAlexander Motin "Number of slabs to do each time we trigger a rebuild cycle"); 7289b17223SAlexander Motin 7389b17223SAlexander Motin #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 7489b17223SAlexander Motin static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; 7589b17223SAlexander Motin TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update", 7689b17223SAlexander Motin &g_raid1_rebuild_meta_update); 7789b17223SAlexander Motin SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW, 7889b17223SAlexander Motin &g_raid1_rebuild_meta_update, 0, 7989b17223SAlexander Motin "When to update the meta data."); 8089b17223SAlexander Motin 8189b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); 8289b17223SAlexander Motin 8389b17223SAlexander Motin #define TR_RAID1_NONE 0 8489b17223SAlexander Motin #define TR_RAID1_REBUILD 1 8589b17223SAlexander Motin #define TR_RAID1_RESYNC 2 8689b17223SAlexander Motin 8789b17223SAlexander Motin #define TR_RAID1_F_DOING_SOME 0x1 8889b17223SAlexander Motin #define TR_RAID1_F_LOCKED 0x2 8989b17223SAlexander Motin #define TR_RAID1_F_ABORT 0x4 9089b17223SAlexander Motin 9189b17223SAlexander Motin struct g_raid_tr_raid1_object { 9289b17223SAlexander Motin struct g_raid_tr_object trso_base; 9389b17223SAlexander Motin int trso_starting; 9489b17223SAlexander Motin int trso_stopping; 9589b17223SAlexander Motin int trso_type; 9689b17223SAlexander Motin int trso_recover_slabs; /* slabs before rest */ 9789b17223SAlexander Motin int trso_fair_io; 9889b17223SAlexander Motin int trso_meta_update; 9989b17223SAlexander Motin int trso_flags; 10089b17223SAlexander Motin struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 10189b17223SAlexander Motin void *trso_buffer; /* Buffer space */ 10289b17223SAlexander Motin struct bio trso_bio; 10389b17223SAlexander Motin }; 10489b17223SAlexander Motin 10589b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1; 10689b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1; 10789b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1; 10889b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1; 10989b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; 11089b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; 11189b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; 11289b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1; 11389b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1; 11489b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1; 11589b17223SAlexander Motin 11689b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1_methods[] = { 11789b17223SAlexander Motin KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), 11889b17223SAlexander Motin KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), 11989b17223SAlexander Motin KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), 12089b17223SAlexander Motin KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), 12189b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), 12289b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), 12389b17223SAlexander Motin KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), 12489b17223SAlexander Motin KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), 12589b17223SAlexander Motin KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), 12689b17223SAlexander Motin KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), 12789b17223SAlexander Motin { 0, 0 } 12889b17223SAlexander Motin }; 12989b17223SAlexander Motin 13089b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1_class = { 13189b17223SAlexander Motin "RAID1", 13289b17223SAlexander Motin g_raid_tr_raid1_methods, 13389b17223SAlexander Motin sizeof(struct g_raid_tr_raid1_object), 13489b17223SAlexander Motin .trc_priority = 100 13589b17223SAlexander Motin }; 13689b17223SAlexander Motin 13789b17223SAlexander Motin static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); 13889b17223SAlexander Motin static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 13989b17223SAlexander Motin struct g_raid_subdisk *sd); 14089b17223SAlexander Motin 14189b17223SAlexander Motin static int 14289b17223SAlexander Motin g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 14389b17223SAlexander Motin { 14489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 14589b17223SAlexander Motin 14689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 14789b17223SAlexander Motin if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || 148dbb2e755SAlexander Motin (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && 149dbb2e755SAlexander Motin tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) 15089b17223SAlexander Motin return (G_RAID_TR_TASTE_FAIL); 15189b17223SAlexander Motin trs->trso_starting = 1; 15289b17223SAlexander Motin return (G_RAID_TR_TASTE_SUCCEED); 15389b17223SAlexander Motin } 15489b17223SAlexander Motin 15589b17223SAlexander Motin static int 15689b17223SAlexander Motin g_raid_tr_update_state_raid1(struct g_raid_volume *vol, 15789b17223SAlexander Motin struct g_raid_subdisk *sd) 15889b17223SAlexander Motin { 15989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 16089b17223SAlexander Motin struct g_raid_softc *sc; 16189b17223SAlexander Motin struct g_raid_subdisk *tsd, *bestsd; 16289b17223SAlexander Motin u_int s; 16389b17223SAlexander Motin int i, na, ns; 16489b17223SAlexander Motin 16589b17223SAlexander Motin sc = vol->v_softc; 16689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)vol->v_tr; 16789b17223SAlexander Motin if (trs->trso_stopping && 16889b17223SAlexander Motin (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) 16989b17223SAlexander Motin s = G_RAID_VOLUME_S_STOPPED; 17089b17223SAlexander Motin else if (trs->trso_starting) 17189b17223SAlexander Motin s = G_RAID_VOLUME_S_STARTING; 17289b17223SAlexander Motin else { 17389b17223SAlexander Motin /* Make sure we have at least one ACTIVE disk. */ 17489b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 17589b17223SAlexander Motin if (na == 0) { 17689b17223SAlexander Motin /* 17789b17223SAlexander Motin * Critical situation! We have no any active disk! 17889b17223SAlexander Motin * Choose the best disk we have to make it active. 17989b17223SAlexander Motin */ 18089b17223SAlexander Motin bestsd = &vol->v_subdisks[0]; 18189b17223SAlexander Motin for (i = 1; i < vol->v_disks_count; i++) { 18289b17223SAlexander Motin tsd = &vol->v_subdisks[i]; 18389b17223SAlexander Motin if (tsd->sd_state > bestsd->sd_state) 18489b17223SAlexander Motin bestsd = tsd; 18589b17223SAlexander Motin else if (tsd->sd_state == bestsd->sd_state && 18689b17223SAlexander Motin (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || 18789b17223SAlexander Motin tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 18889b17223SAlexander Motin tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 18989b17223SAlexander Motin bestsd = tsd; 19089b17223SAlexander Motin } 19189b17223SAlexander Motin if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { 19289b17223SAlexander Motin /* We found reasonable candidate. */ 19389b17223SAlexander Motin G_RAID_DEBUG1(1, sc, 19489b17223SAlexander Motin "Promote subdisk %s:%d from %s to ACTIVE.", 19589b17223SAlexander Motin vol->v_name, bestsd->sd_pos, 19689b17223SAlexander Motin g_raid_subdisk_state2str(bestsd->sd_state)); 19789b17223SAlexander Motin g_raid_change_subdisk_state(bestsd, 19889b17223SAlexander Motin G_RAID_SUBDISK_S_ACTIVE); 19989b17223SAlexander Motin g_raid_write_metadata(sc, 20089b17223SAlexander Motin vol, bestsd, bestsd->sd_disk); 20189b17223SAlexander Motin } 20289b17223SAlexander Motin } 20389b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 20489b17223SAlexander Motin ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 20589b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 20689b17223SAlexander Motin if (na == vol->v_disks_count) 20789b17223SAlexander Motin s = G_RAID_VOLUME_S_OPTIMAL; 20889b17223SAlexander Motin else if (na + ns == vol->v_disks_count) 20989b17223SAlexander Motin s = G_RAID_VOLUME_S_SUBOPTIMAL; 21089b17223SAlexander Motin else if (na > 0) 21189b17223SAlexander Motin s = G_RAID_VOLUME_S_DEGRADED; 21289b17223SAlexander Motin else 21389b17223SAlexander Motin s = G_RAID_VOLUME_S_BROKEN; 21489b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); 21589b17223SAlexander Motin } 21689b17223SAlexander Motin if (s != vol->v_state) { 21789b17223SAlexander Motin g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 21889b17223SAlexander Motin G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 21989b17223SAlexander Motin G_RAID_EVENT_VOLUME); 22089b17223SAlexander Motin g_raid_change_volume_state(vol, s); 22189b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping) 22289b17223SAlexander Motin g_raid_write_metadata(sc, vol, NULL, NULL); 22389b17223SAlexander Motin } 22489b17223SAlexander Motin return (0); 22589b17223SAlexander Motin } 22689b17223SAlexander Motin 22789b17223SAlexander Motin static void 22889b17223SAlexander Motin g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 22989b17223SAlexander Motin struct g_raid_disk *disk) 23089b17223SAlexander Motin { 23189b17223SAlexander Motin /* 23289b17223SAlexander Motin * We don't fail the last disk in the pack, since it still has decent 23389b17223SAlexander Motin * data on it and that's better than failing the disk if it is the root 23489b17223SAlexander Motin * file system. 23589b17223SAlexander Motin * 23689b17223SAlexander Motin * XXX should this be controlled via a tunable? It makes sense for 23789b17223SAlexander Motin * the volume that has / on it. I can't think of a case where we'd 23889b17223SAlexander Motin * want the volume to go away on this kind of event. 23989b17223SAlexander Motin */ 24089b17223SAlexander Motin if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && 24189b17223SAlexander Motin g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) 24289b17223SAlexander Motin return; 24389b17223SAlexander Motin g_raid_fail_disk(sc, sd, disk); 24489b17223SAlexander Motin } 24589b17223SAlexander Motin 24689b17223SAlexander Motin static void 24789b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) 24889b17223SAlexander Motin { 24989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 25089b17223SAlexander Motin struct g_raid_subdisk *sd, *good_sd; 25189b17223SAlexander Motin struct bio *bp; 25289b17223SAlexander Motin 25389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 25489b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) 25589b17223SAlexander Motin return; 25689b17223SAlexander Motin sd = trs->trso_failed_sd; 25789b17223SAlexander Motin good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); 25889b17223SAlexander Motin if (good_sd == NULL) { 25989b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 26089b17223SAlexander Motin return; 26189b17223SAlexander Motin } 26289b17223SAlexander Motin bp = &trs->trso_bio; 26389b17223SAlexander Motin memset(bp, 0, sizeof(*bp)); 26489b17223SAlexander Motin bp->bio_offset = sd->sd_rebuild_pos; 26589b17223SAlexander Motin bp->bio_length = MIN(g_raid1_rebuild_slab, 26689b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 26789b17223SAlexander Motin bp->bio_data = trs->trso_buffer; 26889b17223SAlexander Motin bp->bio_cmd = BIO_READ; 26989b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 27089b17223SAlexander Motin bp->bio_caller1 = good_sd; 27189b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_DOING_SOME; 27289b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_LOCKED; 27389b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ 27489b17223SAlexander Motin bp->bio_offset, bp->bio_length, NULL, bp); 27589b17223SAlexander Motin } 27689b17223SAlexander Motin 27789b17223SAlexander Motin static void 27889b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) 27989b17223SAlexander Motin { 28089b17223SAlexander Motin struct g_raid_volume *vol; 28189b17223SAlexander Motin struct g_raid_subdisk *sd; 28289b17223SAlexander Motin 28389b17223SAlexander Motin vol = trs->trso_base.tro_volume; 28489b17223SAlexander Motin sd = trs->trso_failed_sd; 28589b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 28689b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 28789b17223SAlexander Motin trs->trso_buffer = NULL; 28889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 28989b17223SAlexander Motin trs->trso_type = TR_RAID1_NONE; 29089b17223SAlexander Motin trs->trso_recover_slabs = 0; 29189b17223SAlexander Motin trs->trso_failed_sd = NULL; 29289b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 29389b17223SAlexander Motin } 29489b17223SAlexander Motin 29589b17223SAlexander Motin static void 29689b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) 29789b17223SAlexander Motin { 29889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 29989b17223SAlexander Motin struct g_raid_subdisk *sd; 30089b17223SAlexander Motin 30189b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 30289b17223SAlexander Motin sd = trs->trso_failed_sd; 30389b17223SAlexander Motin G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 30489b17223SAlexander Motin "Subdisk %s:%d-%s rebuild completed.", 30589b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 30689b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 30789b17223SAlexander Motin g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 30889b17223SAlexander Motin sd->sd_rebuild_pos = 0; 30989b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 31089b17223SAlexander Motin } 31189b17223SAlexander Motin 31289b17223SAlexander Motin static void 31389b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) 31489b17223SAlexander Motin { 31589b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 31689b17223SAlexander Motin struct g_raid_subdisk *sd; 31789b17223SAlexander Motin struct g_raid_volume *vol; 31889b17223SAlexander Motin off_t len; 31989b17223SAlexander Motin 32089b17223SAlexander Motin vol = tr->tro_volume; 32189b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 32289b17223SAlexander Motin sd = trs->trso_failed_sd; 32389b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { 32489b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 32589b17223SAlexander Motin "Subdisk %s:%d-%s rebuild is aborting.", 32689b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 32789b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 32889b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_ABORT; 32989b17223SAlexander Motin } else { 33089b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 33189b17223SAlexander Motin "Subdisk %s:%d-%s rebuild aborted.", 33289b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 33389b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 33489b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_ABORT; 33589b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_LOCKED) { 33689b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 33789b17223SAlexander Motin len = MIN(g_raid1_rebuild_slab, 33889b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 33989b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume, 34089b17223SAlexander Motin sd->sd_rebuild_pos, len); 34189b17223SAlexander Motin } 34289b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 34389b17223SAlexander Motin } 34489b17223SAlexander Motin } 34589b17223SAlexander Motin 34689b17223SAlexander Motin static void 34789b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) 34889b17223SAlexander Motin { 34989b17223SAlexander Motin struct g_raid_volume *vol; 35089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 35189b17223SAlexander Motin struct g_raid_subdisk *sd, *fsd; 35289b17223SAlexander Motin 35389b17223SAlexander Motin vol = tr->tro_volume; 35489b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 35589b17223SAlexander Motin if (trs->trso_failed_sd) { 35689b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 35789b17223SAlexander Motin "Already rebuild in start rebuild. pos %jd\n", 35889b17223SAlexander Motin (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 35989b17223SAlexander Motin return; 36089b17223SAlexander Motin } 36189b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); 36289b17223SAlexander Motin if (sd == NULL) { 36389b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 36489b17223SAlexander Motin "No active disk to rebuild. night night."); 36589b17223SAlexander Motin return; 36689b17223SAlexander Motin } 36789b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 36889b17223SAlexander Motin if (fsd == NULL) 36989b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 37089b17223SAlexander Motin if (fsd == NULL) { 37189b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 37289b17223SAlexander Motin if (fsd != NULL) { 37389b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 37489b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 37589b17223SAlexander Motin G_RAID_SUBDISK_S_RESYNC); 37689b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); 37789b17223SAlexander Motin } else { 37889b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 37989b17223SAlexander Motin G_RAID_SUBDISK_S_UNINITIALIZED); 38089b17223SAlexander Motin if (fsd == NULL) 38189b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 38289b17223SAlexander Motin G_RAID_SUBDISK_S_NEW); 38389b17223SAlexander Motin if (fsd != NULL) { 38489b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 38589b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 38689b17223SAlexander Motin G_RAID_SUBDISK_S_REBUILD); 38789b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 38889b17223SAlexander Motin vol, fsd, NULL); 38989b17223SAlexander Motin } 39089b17223SAlexander Motin } 39189b17223SAlexander Motin } 39289b17223SAlexander Motin if (fsd == NULL) { 39389b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 39489b17223SAlexander Motin "No failed disk to rebuild. night night."); 39589b17223SAlexander Motin return; 39689b17223SAlexander Motin } 39789b17223SAlexander Motin trs->trso_failed_sd = fsd; 39889b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 39989b17223SAlexander Motin "Subdisk %s:%d-%s rebuild start at %jd.", 40089b17223SAlexander Motin fsd->sd_volume->v_name, fsd->sd_pos, 40189b17223SAlexander Motin fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", 40289b17223SAlexander Motin trs->trso_failed_sd->sd_rebuild_pos); 40389b17223SAlexander Motin trs->trso_type = TR_RAID1_REBUILD; 40489b17223SAlexander Motin trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); 40589b17223SAlexander Motin trs->trso_meta_update = g_raid1_rebuild_meta_update; 40689b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 40789b17223SAlexander Motin } 40889b17223SAlexander Motin 40989b17223SAlexander Motin 41089b17223SAlexander Motin static void 41189b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 41289b17223SAlexander Motin struct g_raid_subdisk *sd) 41389b17223SAlexander Motin { 41489b17223SAlexander Motin struct g_raid_volume *vol; 41589b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 41689b17223SAlexander Motin int na, nr; 41789b17223SAlexander Motin 41889b17223SAlexander Motin /* 41989b17223SAlexander Motin * If we're stopping, don't do anything. If we don't have at least one 42089b17223SAlexander Motin * good disk and one bad disk, we don't do anything. And if there's a 42189b17223SAlexander Motin * 'good disk' stored in the trs, then we're in progress and we punt. 42289b17223SAlexander Motin * If we make it past all these checks, we need to rebuild. 42389b17223SAlexander Motin */ 42489b17223SAlexander Motin vol = tr->tro_volume; 42589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 42689b17223SAlexander Motin if (trs->trso_stopping) 42789b17223SAlexander Motin return; 42889b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 42989b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 43089b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 43189b17223SAlexander Motin switch(trs->trso_type) { 43289b17223SAlexander Motin case TR_RAID1_NONE: 43389b17223SAlexander Motin if (na == 0) 43489b17223SAlexander Motin return; 43589b17223SAlexander Motin if (nr == 0) { 43689b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 43789b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 43889b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 43989b17223SAlexander Motin if (nr == 0) 44089b17223SAlexander Motin return; 44189b17223SAlexander Motin } 44289b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(tr); 44389b17223SAlexander Motin break; 44489b17223SAlexander Motin case TR_RAID1_REBUILD: 44589b17223SAlexander Motin if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) 44689b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 44789b17223SAlexander Motin break; 44889b17223SAlexander Motin case TR_RAID1_RESYNC: 44989b17223SAlexander Motin break; 45089b17223SAlexander Motin } 45189b17223SAlexander Motin } 45289b17223SAlexander Motin 45389b17223SAlexander Motin static int 45489b17223SAlexander Motin g_raid_tr_event_raid1(struct g_raid_tr_object *tr, 45589b17223SAlexander Motin struct g_raid_subdisk *sd, u_int event) 45689b17223SAlexander Motin { 45789b17223SAlexander Motin 45889b17223SAlexander Motin g_raid_tr_update_state_raid1(tr->tro_volume, sd); 45989b17223SAlexander Motin return (0); 46089b17223SAlexander Motin } 46189b17223SAlexander Motin 46289b17223SAlexander Motin static int 46389b17223SAlexander Motin g_raid_tr_start_raid1(struct g_raid_tr_object *tr) 46489b17223SAlexander Motin { 46589b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 46689b17223SAlexander Motin struct g_raid_volume *vol; 46789b17223SAlexander Motin 46889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 46989b17223SAlexander Motin vol = tr->tro_volume; 47089b17223SAlexander Motin trs->trso_starting = 0; 47189b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 47289b17223SAlexander Motin return (0); 47389b17223SAlexander Motin } 47489b17223SAlexander Motin 47589b17223SAlexander Motin static int 47689b17223SAlexander Motin g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) 47789b17223SAlexander Motin { 47889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 47989b17223SAlexander Motin struct g_raid_volume *vol; 48089b17223SAlexander Motin 48189b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 48289b17223SAlexander Motin vol = tr->tro_volume; 48389b17223SAlexander Motin trs->trso_starting = 0; 48489b17223SAlexander Motin trs->trso_stopping = 1; 48589b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 48689b17223SAlexander Motin return (0); 48789b17223SAlexander Motin } 48889b17223SAlexander Motin 48989b17223SAlexander Motin /* 49089b17223SAlexander Motin * Select the disk to read from. Take into account: subdisk state, running 49189b17223SAlexander Motin * error recovery, average disk load, head position and possible cache hits. 49289b17223SAlexander Motin */ 49389b17223SAlexander Motin #define ABS(x) (((x) >= 0) ? (x) : (-(x))) 49489b17223SAlexander Motin static struct g_raid_subdisk * 49589b17223SAlexander Motin g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, 49689b17223SAlexander Motin u_int mask) 49789b17223SAlexander Motin { 49889b17223SAlexander Motin struct g_raid_subdisk *sd, *best; 49989b17223SAlexander Motin int i, prio, bestprio; 50089b17223SAlexander Motin 50189b17223SAlexander Motin best = NULL; 50289b17223SAlexander Motin bestprio = INT_MAX; 50389b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 50489b17223SAlexander Motin sd = &vol->v_subdisks[i]; 50589b17223SAlexander Motin if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && 50689b17223SAlexander Motin ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && 50789b17223SAlexander Motin sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || 50889b17223SAlexander Motin bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) 50989b17223SAlexander Motin continue; 51089b17223SAlexander Motin if ((mask & (1 << i)) != 0) 51189b17223SAlexander Motin continue; 51289b17223SAlexander Motin prio = G_RAID_SUBDISK_LOAD(sd); 51389b17223SAlexander Motin prio += min(sd->sd_recovery, 255) << 22; 51489b17223SAlexander Motin prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; 51589b17223SAlexander Motin /* If disk head is precisely in position - highly prefer it. */ 51689b17223SAlexander Motin if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) 51789b17223SAlexander Motin prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 51889b17223SAlexander Motin else 51989b17223SAlexander Motin /* If disk head is close to position - prefer it. */ 52089b17223SAlexander Motin if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < 52189b17223SAlexander Motin G_RAID_SUBDISK_TRACK_SIZE) 52289b17223SAlexander Motin prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 52389b17223SAlexander Motin if (prio < bestprio) { 52489b17223SAlexander Motin best = sd; 52589b17223SAlexander Motin bestprio = prio; 52689b17223SAlexander Motin } 52789b17223SAlexander Motin } 52889b17223SAlexander Motin return (best); 52989b17223SAlexander Motin } 53089b17223SAlexander Motin 53189b17223SAlexander Motin static void 53289b17223SAlexander Motin g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) 53389b17223SAlexander Motin { 53489b17223SAlexander Motin struct g_raid_subdisk *sd; 53589b17223SAlexander Motin struct bio *cbp; 53689b17223SAlexander Motin 53789b17223SAlexander Motin sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); 53889b17223SAlexander Motin KASSERT(sd != NULL, ("No active disks in volume %s.", 53989b17223SAlexander Motin tr->tro_volume->v_name)); 54089b17223SAlexander Motin 54189b17223SAlexander Motin cbp = g_clone_bio(bp); 54289b17223SAlexander Motin if (cbp == NULL) { 54389b17223SAlexander Motin g_raid_iodone(bp, ENOMEM); 54489b17223SAlexander Motin return; 54589b17223SAlexander Motin } 54689b17223SAlexander Motin 54789b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 54889b17223SAlexander Motin } 54989b17223SAlexander Motin 55089b17223SAlexander Motin static void 55189b17223SAlexander Motin g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) 55289b17223SAlexander Motin { 55389b17223SAlexander Motin struct g_raid_volume *vol; 55489b17223SAlexander Motin struct g_raid_subdisk *sd; 55589b17223SAlexander Motin struct bio_queue_head queue; 55689b17223SAlexander Motin struct bio *cbp; 55789b17223SAlexander Motin int i; 55889b17223SAlexander Motin 55989b17223SAlexander Motin vol = tr->tro_volume; 56089b17223SAlexander Motin 56189b17223SAlexander Motin /* 56289b17223SAlexander Motin * Allocate all bios before sending any request, so we can return 56389b17223SAlexander Motin * ENOMEM in nice and clean way. 56489b17223SAlexander Motin */ 56589b17223SAlexander Motin bioq_init(&queue); 56689b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 56789b17223SAlexander Motin sd = &vol->v_subdisks[i]; 56889b17223SAlexander Motin switch (sd->sd_state) { 56989b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 57089b17223SAlexander Motin break; 57189b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 57289b17223SAlexander Motin /* 57389b17223SAlexander Motin * When rebuilding, only part of this subdisk is 57489b17223SAlexander Motin * writable, the rest will be written as part of the 57589b17223SAlexander Motin * that process. 57689b17223SAlexander Motin */ 57789b17223SAlexander Motin if (bp->bio_offset >= sd->sd_rebuild_pos) 57889b17223SAlexander Motin continue; 57989b17223SAlexander Motin break; 58089b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 58189b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 58289b17223SAlexander Motin /* 58389b17223SAlexander Motin * Resyncing still writes on the theory that the 58489b17223SAlexander Motin * resync'd disk is very close and writing it will 58589b17223SAlexander Motin * keep it that way better if we keep up while 58689b17223SAlexander Motin * resyncing. 58789b17223SAlexander Motin */ 58889b17223SAlexander Motin break; 58989b17223SAlexander Motin default: 59089b17223SAlexander Motin continue; 59189b17223SAlexander Motin } 59289b17223SAlexander Motin cbp = g_clone_bio(bp); 59389b17223SAlexander Motin if (cbp == NULL) 59489b17223SAlexander Motin goto failure; 59589b17223SAlexander Motin cbp->bio_caller1 = sd; 59689b17223SAlexander Motin bioq_insert_tail(&queue, cbp); 59789b17223SAlexander Motin } 59889b17223SAlexander Motin for (cbp = bioq_first(&queue); cbp != NULL; 59989b17223SAlexander Motin cbp = bioq_first(&queue)) { 60089b17223SAlexander Motin bioq_remove(&queue, cbp); 60189b17223SAlexander Motin sd = cbp->bio_caller1; 60289b17223SAlexander Motin cbp->bio_caller1 = NULL; 60389b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 60489b17223SAlexander Motin } 60589b17223SAlexander Motin return; 60689b17223SAlexander Motin failure: 60789b17223SAlexander Motin for (cbp = bioq_first(&queue); cbp != NULL; 60889b17223SAlexander Motin cbp = bioq_first(&queue)) { 60989b17223SAlexander Motin bioq_remove(&queue, cbp); 61089b17223SAlexander Motin g_destroy_bio(cbp); 61189b17223SAlexander Motin } 61289b17223SAlexander Motin if (bp->bio_error == 0) 61389b17223SAlexander Motin bp->bio_error = ENOMEM; 61489b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error); 61589b17223SAlexander Motin } 61689b17223SAlexander Motin 61789b17223SAlexander Motin static void 61889b17223SAlexander Motin g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) 61989b17223SAlexander Motin { 62089b17223SAlexander Motin struct g_raid_volume *vol; 62189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 62289b17223SAlexander Motin 62389b17223SAlexander Motin vol = tr->tro_volume; 62489b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 62589b17223SAlexander Motin if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 62689b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 62789b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 62889b17223SAlexander Motin g_raid_iodone(bp, EIO); 62989b17223SAlexander Motin return; 63089b17223SAlexander Motin } 63189b17223SAlexander Motin /* 63289b17223SAlexander Motin * If we're rebuilding, squeeze in rebuild activity every so often, 63389b17223SAlexander Motin * even when the disk is busy. Be sure to only count real I/O 63489b17223SAlexander Motin * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 63589b17223SAlexander Motin * by this module. 63689b17223SAlexander Motin */ 63789b17223SAlexander Motin if (trs->trso_failed_sd != NULL && 63889b17223SAlexander Motin !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 63989b17223SAlexander Motin /* Make this new or running now round short. */ 64089b17223SAlexander Motin trs->trso_recover_slabs = 0; 64189b17223SAlexander Motin if (--trs->trso_fair_io <= 0) { 64289b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 64389b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 64489b17223SAlexander Motin } 64589b17223SAlexander Motin } 64689b17223SAlexander Motin switch (bp->bio_cmd) { 64789b17223SAlexander Motin case BIO_READ: 64889b17223SAlexander Motin g_raid_tr_iostart_raid1_read(tr, bp); 64989b17223SAlexander Motin break; 65089b17223SAlexander Motin case BIO_WRITE: 65189b17223SAlexander Motin g_raid_tr_iostart_raid1_write(tr, bp); 65289b17223SAlexander Motin break; 65389b17223SAlexander Motin case BIO_DELETE: 65489b17223SAlexander Motin g_raid_iodone(bp, EIO); 65589b17223SAlexander Motin break; 65689b17223SAlexander Motin case BIO_FLUSH: 65789b17223SAlexander Motin g_raid_tr_flush_common(tr, bp); 65889b17223SAlexander Motin break; 65989b17223SAlexander Motin default: 66089b17223SAlexander Motin KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 66189b17223SAlexander Motin bp->bio_cmd, vol->v_name)); 66289b17223SAlexander Motin break; 66389b17223SAlexander Motin } 66489b17223SAlexander Motin } 66589b17223SAlexander Motin 66689b17223SAlexander Motin static void 66789b17223SAlexander Motin g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, 66889b17223SAlexander Motin struct g_raid_subdisk *sd, struct bio *bp) 66989b17223SAlexander Motin { 67089b17223SAlexander Motin struct bio *cbp; 67189b17223SAlexander Motin struct g_raid_subdisk *nsd; 67289b17223SAlexander Motin struct g_raid_volume *vol; 67389b17223SAlexander Motin struct bio *pbp; 67489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 67589b17223SAlexander Motin uintptr_t *mask; 67689b17223SAlexander Motin int error, do_write; 67789b17223SAlexander Motin 67889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 67989b17223SAlexander Motin vol = tr->tro_volume; 68089b17223SAlexander Motin if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 68189b17223SAlexander Motin /* 68289b17223SAlexander Motin * This operation is part of a rebuild or resync operation. 68389b17223SAlexander Motin * See what work just got done, then schedule the next bit of 68489b17223SAlexander Motin * work, if any. Rebuild/resync is done a little bit at a 68589b17223SAlexander Motin * time. Either when a timeout happens, or after we get a 68689b17223SAlexander Motin * bunch of I/Os to the disk (to make sure an active system 68789b17223SAlexander Motin * will complete in a sane amount of time). 68889b17223SAlexander Motin * 68989b17223SAlexander Motin * We are setup to do differing amounts of work for each of 69089b17223SAlexander Motin * these cases. so long as the slabs is smallish (less than 69189b17223SAlexander Motin * 50 or so, I'd guess, but that's just a WAG), we shouldn't 69289b17223SAlexander Motin * have any bio starvation issues. For active disks, we do 69389b17223SAlexander Motin * 5MB of data, for inactive ones, we do 50MB. 69489b17223SAlexander Motin */ 69589b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) { 69689b17223SAlexander Motin if (bp->bio_cmd == BIO_READ) { 69789b17223SAlexander Motin 69889b17223SAlexander Motin /* Immediately abort rebuild, if requested. */ 69989b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_ABORT) { 70089b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 70189b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 70289b17223SAlexander Motin return; 70389b17223SAlexander Motin } 70489b17223SAlexander Motin 70589b17223SAlexander Motin /* On read error, skip and cross fingers. */ 70689b17223SAlexander Motin if (bp->bio_error != 0) { 70789b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 70889b17223SAlexander Motin "Read error during rebuild (%d), " 70989b17223SAlexander Motin "possible data loss!", 71089b17223SAlexander Motin bp->bio_error); 71189b17223SAlexander Motin goto rebuild_round_done; 71289b17223SAlexander Motin } 71389b17223SAlexander Motin 71489b17223SAlexander Motin /* 71589b17223SAlexander Motin * The read operation finished, queue the 71689b17223SAlexander Motin * write and get out. 71789b17223SAlexander Motin */ 71889b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "rebuild read done. %d", 71989b17223SAlexander Motin bp->bio_error); 72089b17223SAlexander Motin bp->bio_cmd = BIO_WRITE; 72189b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 72289b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); 72389b17223SAlexander Motin g_raid_subdisk_iostart(trs->trso_failed_sd, bp); 72489b17223SAlexander Motin } else { 72589b17223SAlexander Motin /* 72689b17223SAlexander Motin * The write operation just finished. Do 72789b17223SAlexander Motin * another. We keep cloning the master bio 72889b17223SAlexander Motin * since it has the right buffers allocated to 72989b17223SAlexander Motin * it. 73089b17223SAlexander Motin */ 73189b17223SAlexander Motin G_RAID_LOGREQ(4, bp, 73289b17223SAlexander Motin "rebuild write done. Error %d", 73389b17223SAlexander Motin bp->bio_error); 73489b17223SAlexander Motin nsd = trs->trso_failed_sd; 73589b17223SAlexander Motin if (bp->bio_error != 0 || 73689b17223SAlexander Motin trs->trso_flags & TR_RAID1_F_ABORT) { 73789b17223SAlexander Motin if ((trs->trso_flags & 73889b17223SAlexander Motin TR_RAID1_F_ABORT) == 0) { 73989b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, 74089b17223SAlexander Motin nsd, nsd->sd_disk); 74189b17223SAlexander Motin } 74289b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 74389b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 74489b17223SAlexander Motin return; 74589b17223SAlexander Motin } 74689b17223SAlexander Motin rebuild_round_done: 74789b17223SAlexander Motin nsd = trs->trso_failed_sd; 74889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 74989b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, 75089b17223SAlexander Motin bp->bio_offset, bp->bio_length); 75189b17223SAlexander Motin nsd->sd_rebuild_pos += bp->bio_length; 75289b17223SAlexander Motin if (nsd->sd_rebuild_pos >= nsd->sd_size) { 75389b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(tr); 75489b17223SAlexander Motin return; 75589b17223SAlexander Motin } 75689b17223SAlexander Motin 75789b17223SAlexander Motin /* Abort rebuild if we are stopping */ 75889b17223SAlexander Motin if (trs->trso_stopping) { 75989b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 76089b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 76189b17223SAlexander Motin return; 76289b17223SAlexander Motin } 76389b17223SAlexander Motin 76489b17223SAlexander Motin if (--trs->trso_meta_update <= 0) { 76589b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 76689b17223SAlexander Motin vol, nsd, nsd->sd_disk); 76789b17223SAlexander Motin trs->trso_meta_update = 76889b17223SAlexander Motin g_raid1_rebuild_meta_update; 76989b17223SAlexander Motin } 77089b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 77189b17223SAlexander Motin if (--trs->trso_recover_slabs <= 0) 77289b17223SAlexander Motin return; 77389b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 77489b17223SAlexander Motin } 77589b17223SAlexander Motin } else if (trs->trso_type == TR_RAID1_RESYNC) { 77689b17223SAlexander Motin /* 77789b17223SAlexander Motin * read good sd, read bad sd in parallel. when both 77889b17223SAlexander Motin * done, compare the buffers. write good to the bad 77989b17223SAlexander Motin * if different. do the next bit of work. 78089b17223SAlexander Motin */ 78189b17223SAlexander Motin panic("Somehow, we think we're doing a resync"); 78289b17223SAlexander Motin } 78389b17223SAlexander Motin return; 78489b17223SAlexander Motin } 78589b17223SAlexander Motin pbp = bp->bio_parent; 78689b17223SAlexander Motin pbp->bio_inbed++; 78789b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { 78889b17223SAlexander Motin /* 78989b17223SAlexander Motin * Read failed on first drive. Retry the read error on 79089b17223SAlexander Motin * another disk drive, if available, before erroring out the 79189b17223SAlexander Motin * read. 79289b17223SAlexander Motin */ 79389b17223SAlexander Motin sd->sd_disk->d_read_errs++; 79489b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 79589b17223SAlexander Motin "Read error (%d), %d read errors total", 79689b17223SAlexander Motin bp->bio_error, sd->sd_disk->d_read_errs); 79789b17223SAlexander Motin 79889b17223SAlexander Motin /* 79989b17223SAlexander Motin * If there are too many read errors, we move to degraded. 80089b17223SAlexander Motin * XXX Do we want to FAIL the drive (eg, make the user redo 80189b17223SAlexander Motin * everything to get it back in sync), or just degrade the 80289b17223SAlexander Motin * drive, which kicks off a resync? 80389b17223SAlexander Motin */ 80489b17223SAlexander Motin do_write = 1; 80589b17223SAlexander Motin if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { 80689b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 80789b17223SAlexander Motin if (pbp->bio_children == 1) 80889b17223SAlexander Motin do_write = 0; 80989b17223SAlexander Motin } 81089b17223SAlexander Motin 81189b17223SAlexander Motin /* 81289b17223SAlexander Motin * Find the other disk, and try to do the I/O to it. 81389b17223SAlexander Motin */ 81489b17223SAlexander Motin mask = (uintptr_t *)(&pbp->bio_driver2); 81589b17223SAlexander Motin if (pbp->bio_children == 1) { 81689b17223SAlexander Motin /* Save original subdisk. */ 81789b17223SAlexander Motin pbp->bio_driver1 = do_write ? sd : NULL; 81889b17223SAlexander Motin *mask = 0; 81989b17223SAlexander Motin } 82089b17223SAlexander Motin *mask |= 1 << sd->sd_pos; 82189b17223SAlexander Motin nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); 82289b17223SAlexander Motin if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { 82389b17223SAlexander Motin g_destroy_bio(bp); 82489b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, "Retrying read from %d", 82589b17223SAlexander Motin nsd->sd_pos); 82689b17223SAlexander Motin if (pbp->bio_children == 2 && do_write) { 82789b17223SAlexander Motin sd->sd_recovery++; 82889b17223SAlexander Motin cbp->bio_caller1 = nsd; 82989b17223SAlexander Motin pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; 83089b17223SAlexander Motin /* Lock callback starts I/O */ 83189b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, 83289b17223SAlexander Motin cbp->bio_offset, cbp->bio_length, pbp, cbp); 83389b17223SAlexander Motin } else { 83489b17223SAlexander Motin g_raid_subdisk_iostart(nsd, cbp); 83589b17223SAlexander Motin } 83689b17223SAlexander Motin return; 83789b17223SAlexander Motin } 83889b17223SAlexander Motin /* 83989b17223SAlexander Motin * We can't retry. Return the original error by falling 84089b17223SAlexander Motin * through. This will happen when there's only one good disk. 84189b17223SAlexander Motin * We don't need to fail the raid, since its actual state is 84289b17223SAlexander Motin * based on the state of the subdisks. 84389b17223SAlexander Motin */ 84489b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); 84589b17223SAlexander Motin } 84689b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && 84789b17223SAlexander Motin bp->bio_error == 0 && 84889b17223SAlexander Motin pbp->bio_children > 1 && 84989b17223SAlexander Motin pbp->bio_driver1 != NULL) { 85089b17223SAlexander Motin /* 85189b17223SAlexander Motin * If it was a read, and bio_children is >1, then we just 85289b17223SAlexander Motin * recovered the data from the second drive. We should try to 85389b17223SAlexander Motin * write that data to the first drive if sector remapping is 85489b17223SAlexander Motin * enabled. A write should put the data in a new place on the 85589b17223SAlexander Motin * disk, remapping the bad sector. Do we need to do that by 85689b17223SAlexander Motin * queueing a request to the main worker thread? It doesn't 85789b17223SAlexander Motin * affect the return code of this current read, and can be 85889b17223SAlexander Motin * done at our liesure. However, to make the code simpler, it 85989b17223SAlexander Motin * is done syncrhonously. 86089b17223SAlexander Motin */ 86189b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); 86289b17223SAlexander Motin cbp = g_clone_bio(pbp); 86389b17223SAlexander Motin if (cbp != NULL) { 86489b17223SAlexander Motin g_destroy_bio(bp); 86589b17223SAlexander Motin cbp->bio_cmd = BIO_WRITE; 86689b17223SAlexander Motin cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; 86789b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, 86889b17223SAlexander Motin "Attempting bad sector remap on failing drive."); 86989b17223SAlexander Motin g_raid_subdisk_iostart(pbp->bio_driver1, cbp); 87089b17223SAlexander Motin return; 87189b17223SAlexander Motin } 87289b17223SAlexander Motin } 87389b17223SAlexander Motin if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { 87489b17223SAlexander Motin /* 87589b17223SAlexander Motin * We're done with a recovery, mark the range as unlocked. 87689b17223SAlexander Motin * For any write errors, we agressively fail the disk since 87789b17223SAlexander Motin * there was both a READ and a WRITE error at this location. 87889b17223SAlexander Motin * Both types of errors generally indicates the drive is on 87989b17223SAlexander Motin * the verge of total failure anyway. Better to stop trusting 88089b17223SAlexander Motin * it now. However, we need to reset error to 0 in that case 88189b17223SAlexander Motin * because we're not failing the original I/O which succeeded. 88289b17223SAlexander Motin */ 88389b17223SAlexander Motin if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 88489b17223SAlexander Motin G_RAID_LOGREQ(0, bp, "Remap write failed: " 88589b17223SAlexander Motin "failing subdisk."); 88689b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 88789b17223SAlexander Motin bp->bio_error = 0; 88889b17223SAlexander Motin } 88989b17223SAlexander Motin if (pbp->bio_driver1 != NULL) { 89089b17223SAlexander Motin ((struct g_raid_subdisk *)pbp->bio_driver1) 89189b17223SAlexander Motin ->sd_recovery--; 89289b17223SAlexander Motin } 89389b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 89489b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, bp->bio_offset, 89589b17223SAlexander Motin bp->bio_length); 89689b17223SAlexander Motin } 897*ef844ef7SAlexander Motin if (pbp->bio_cmd != BIO_READ) { 898*ef844ef7SAlexander Motin if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 899*ef844ef7SAlexander Motin pbp->bio_error = bp->bio_error; 900*ef844ef7SAlexander Motin if (bp->bio_error != 0) { 901*ef844ef7SAlexander Motin G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 902*ef844ef7SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 903*ef844ef7SAlexander Motin } 904*ef844ef7SAlexander Motin error = pbp->bio_error; 905*ef844ef7SAlexander Motin } else 90689b17223SAlexander Motin error = bp->bio_error; 90789b17223SAlexander Motin g_destroy_bio(bp); 90889b17223SAlexander Motin if (pbp->bio_children == pbp->bio_inbed) { 90989b17223SAlexander Motin pbp->bio_completed = pbp->bio_length; 91089b17223SAlexander Motin g_raid_iodone(pbp, error); 91189b17223SAlexander Motin } 91289b17223SAlexander Motin } 91389b17223SAlexander Motin 91489b17223SAlexander Motin static int 91589b17223SAlexander Motin g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, 91689b17223SAlexander Motin void *virtual, vm_offset_t physical, off_t offset, size_t length) 91789b17223SAlexander Motin { 91889b17223SAlexander Motin struct g_raid_volume *vol; 91989b17223SAlexander Motin struct g_raid_subdisk *sd; 92089b17223SAlexander Motin int error, i, ok; 92189b17223SAlexander Motin 92289b17223SAlexander Motin vol = tr->tro_volume; 92389b17223SAlexander Motin error = 0; 92489b17223SAlexander Motin ok = 0; 92589b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 92689b17223SAlexander Motin sd = &vol->v_subdisks[i]; 92789b17223SAlexander Motin switch (sd->sd_state) { 92889b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 92989b17223SAlexander Motin break; 93089b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 93189b17223SAlexander Motin /* 93289b17223SAlexander Motin * When rebuilding, only part of this subdisk is 93389b17223SAlexander Motin * writable, the rest will be written as part of the 93489b17223SAlexander Motin * that process. 93589b17223SAlexander Motin */ 93689b17223SAlexander Motin if (offset >= sd->sd_rebuild_pos) 93789b17223SAlexander Motin continue; 93889b17223SAlexander Motin break; 93989b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 94089b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 94189b17223SAlexander Motin /* 94289b17223SAlexander Motin * Resyncing still writes on the theory that the 94389b17223SAlexander Motin * resync'd disk is very close and writing it will 94489b17223SAlexander Motin * keep it that way better if we keep up while 94589b17223SAlexander Motin * resyncing. 94689b17223SAlexander Motin */ 94789b17223SAlexander Motin break; 94889b17223SAlexander Motin default: 94989b17223SAlexander Motin continue; 95089b17223SAlexander Motin } 95189b17223SAlexander Motin error = g_raid_subdisk_kerneldump(sd, 95289b17223SAlexander Motin virtual, physical, offset, length); 95389b17223SAlexander Motin if (error == 0) 95489b17223SAlexander Motin ok++; 95589b17223SAlexander Motin } 95689b17223SAlexander Motin return (ok > 0 ? 0 : error); 95789b17223SAlexander Motin } 95889b17223SAlexander Motin 95989b17223SAlexander Motin static int 96089b17223SAlexander Motin g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) 96189b17223SAlexander Motin { 96289b17223SAlexander Motin struct bio *bp; 96389b17223SAlexander Motin struct g_raid_subdisk *sd; 96489b17223SAlexander Motin 96589b17223SAlexander Motin bp = (struct bio *)argp; 96689b17223SAlexander Motin sd = (struct g_raid_subdisk *)bp->bio_caller1; 96789b17223SAlexander Motin g_raid_subdisk_iostart(sd, bp); 96889b17223SAlexander Motin 96989b17223SAlexander Motin return (0); 97089b17223SAlexander Motin } 97189b17223SAlexander Motin 97289b17223SAlexander Motin static int 97389b17223SAlexander Motin g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) 97489b17223SAlexander Motin { 97589b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 97689b17223SAlexander Motin 97789b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 97889b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 97989b17223SAlexander Motin trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; 98089b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) 98189b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 98289b17223SAlexander Motin return (0); 98389b17223SAlexander Motin } 98489b17223SAlexander Motin 98589b17223SAlexander Motin static int 98689b17223SAlexander Motin g_raid_tr_free_raid1(struct g_raid_tr_object *tr) 98789b17223SAlexander Motin { 98889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 98989b17223SAlexander Motin 99089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 99189b17223SAlexander Motin 99289b17223SAlexander Motin if (trs->trso_buffer != NULL) { 99389b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 99489b17223SAlexander Motin trs->trso_buffer = NULL; 99589b17223SAlexander Motin } 99689b17223SAlexander Motin return (0); 99789b17223SAlexander Motin } 99889b17223SAlexander Motin 99989b17223SAlexander Motin G_RAID_TR_DECLARE(g_raid_tr_raid1); 1000