189b17223SAlexander Motin /*- 289b17223SAlexander Motin * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 389b17223SAlexander Motin * All rights reserved. 489b17223SAlexander Motin * 589b17223SAlexander Motin * Redistribution and use in source and binary forms, with or without 689b17223SAlexander Motin * modification, are permitted provided that the following conditions 789b17223SAlexander Motin * are met: 889b17223SAlexander Motin * 1. Redistributions of source code must retain the above copyright 989b17223SAlexander Motin * notice, this list of conditions and the following disclaimer. 1089b17223SAlexander Motin * 2. Redistributions in binary form must reproduce the above copyright 1189b17223SAlexander Motin * notice, this list of conditions and the following disclaimer in the 1289b17223SAlexander Motin * documentation and/or other materials provided with the distribution. 1389b17223SAlexander Motin * 1489b17223SAlexander Motin * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 1589b17223SAlexander Motin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1689b17223SAlexander Motin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1789b17223SAlexander Motin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 1889b17223SAlexander Motin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 1989b17223SAlexander Motin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2089b17223SAlexander Motin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2189b17223SAlexander Motin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2289b17223SAlexander Motin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2389b17223SAlexander Motin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2489b17223SAlexander Motin * SUCH DAMAGE. 2589b17223SAlexander Motin */ 2689b17223SAlexander Motin 2789b17223SAlexander Motin #include <sys/cdefs.h> 2889b17223SAlexander Motin __FBSDID("$FreeBSD$"); 2989b17223SAlexander Motin 3089b17223SAlexander Motin #include <sys/param.h> 3189b17223SAlexander Motin #include <sys/bio.h> 3289b17223SAlexander Motin #include <sys/endian.h> 3389b17223SAlexander Motin #include <sys/kernel.h> 3489b17223SAlexander Motin #include <sys/kobj.h> 3589b17223SAlexander Motin #include <sys/limits.h> 3689b17223SAlexander Motin #include <sys/lock.h> 3789b17223SAlexander Motin #include <sys/malloc.h> 3889b17223SAlexander Motin #include <sys/mutex.h> 3989b17223SAlexander Motin #include <sys/sysctl.h> 4089b17223SAlexander Motin #include <sys/systm.h> 4189b17223SAlexander Motin #include <geom/geom.h> 4289b17223SAlexander Motin #include "geom/raid/g_raid.h" 4389b17223SAlexander Motin #include "g_raid_tr_if.h" 4489b17223SAlexander Motin 45c89d2fbeSAlexander Motin SYSCTL_DECL(_kern_geom_raid_raid1); 4689b17223SAlexander Motin 4789b17223SAlexander Motin #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 4889b17223SAlexander Motin static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; 49*af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, 5089b17223SAlexander Motin &g_raid1_rebuild_slab, 0, 5189b17223SAlexander Motin "Amount of the disk to rebuild each read/write cycle of the rebuild."); 5289b17223SAlexander Motin 5389b17223SAlexander Motin #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 5489b17223SAlexander Motin static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; 55*af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, 5689b17223SAlexander Motin &g_raid1_rebuild_fair_io, 0, 5789b17223SAlexander Motin "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 5889b17223SAlexander Motin 5989b17223SAlexander Motin #define RAID1_REBUILD_CLUSTER_IDLE 100 6089b17223SAlexander Motin static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; 61*af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, 6289b17223SAlexander Motin &g_raid1_rebuild_cluster_idle, 0, 6389b17223SAlexander Motin "Number of slabs to do each time we trigger a rebuild cycle"); 6489b17223SAlexander Motin 6589b17223SAlexander Motin #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 6689b17223SAlexander Motin static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; 67*af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, 6889b17223SAlexander Motin &g_raid1_rebuild_meta_update, 0, 6989b17223SAlexander Motin "When to update the meta data."); 7089b17223SAlexander Motin 7189b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); 7289b17223SAlexander Motin 7389b17223SAlexander Motin #define TR_RAID1_NONE 0 7489b17223SAlexander Motin #define TR_RAID1_REBUILD 1 7589b17223SAlexander Motin #define TR_RAID1_RESYNC 2 7689b17223SAlexander Motin 7789b17223SAlexander Motin #define TR_RAID1_F_DOING_SOME 0x1 7889b17223SAlexander Motin #define TR_RAID1_F_LOCKED 0x2 7989b17223SAlexander Motin #define TR_RAID1_F_ABORT 0x4 8089b17223SAlexander Motin 8189b17223SAlexander Motin struct g_raid_tr_raid1_object { 8289b17223SAlexander Motin struct g_raid_tr_object trso_base; 8389b17223SAlexander Motin int trso_starting; 8489b17223SAlexander Motin int trso_stopping; 8589b17223SAlexander Motin int trso_type; 8689b17223SAlexander Motin int trso_recover_slabs; /* slabs before rest */ 8789b17223SAlexander Motin int trso_fair_io; 8889b17223SAlexander Motin int trso_meta_update; 8989b17223SAlexander Motin int trso_flags; 9089b17223SAlexander Motin struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 9189b17223SAlexander Motin void *trso_buffer; /* Buffer space */ 9289b17223SAlexander Motin struct bio trso_bio; 9389b17223SAlexander Motin }; 9489b17223SAlexander Motin 9589b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1; 9689b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1; 9789b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1; 9889b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1; 9989b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; 10089b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; 10189b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; 10289b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1; 10389b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1; 10489b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1; 10589b17223SAlexander Motin 10689b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1_methods[] = { 10789b17223SAlexander Motin KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), 10889b17223SAlexander Motin KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), 10989b17223SAlexander Motin KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), 11089b17223SAlexander Motin KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), 11189b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), 11289b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), 11389b17223SAlexander Motin KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), 11489b17223SAlexander Motin KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), 11589b17223SAlexander Motin KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), 11689b17223SAlexander Motin KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), 11789b17223SAlexander Motin { 0, 0 } 11889b17223SAlexander Motin }; 11989b17223SAlexander Motin 12089b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1_class = { 12189b17223SAlexander Motin "RAID1", 12289b17223SAlexander Motin g_raid_tr_raid1_methods, 12389b17223SAlexander Motin sizeof(struct g_raid_tr_raid1_object), 124c89d2fbeSAlexander Motin .trc_enable = 1, 125b43560abSAlexander Motin .trc_priority = 100, 126b43560abSAlexander Motin .trc_accept_unmapped = 1 12789b17223SAlexander Motin }; 12889b17223SAlexander Motin 12989b17223SAlexander Motin static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); 13089b17223SAlexander Motin static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 13189b17223SAlexander Motin struct g_raid_subdisk *sd); 13289b17223SAlexander Motin 13389b17223SAlexander Motin static int 13489b17223SAlexander Motin g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 13589b17223SAlexander Motin { 13689b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 13789b17223SAlexander Motin 13889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 13989b17223SAlexander Motin if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || 140dbb2e755SAlexander Motin (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && 141dbb2e755SAlexander Motin tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) 14289b17223SAlexander Motin return (G_RAID_TR_TASTE_FAIL); 14389b17223SAlexander Motin trs->trso_starting = 1; 14489b17223SAlexander Motin return (G_RAID_TR_TASTE_SUCCEED); 14589b17223SAlexander Motin } 14689b17223SAlexander Motin 14789b17223SAlexander Motin static int 14889b17223SAlexander Motin g_raid_tr_update_state_raid1(struct g_raid_volume *vol, 14989b17223SAlexander Motin struct g_raid_subdisk *sd) 15089b17223SAlexander Motin { 15189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 15289b17223SAlexander Motin struct g_raid_softc *sc; 15389b17223SAlexander Motin struct g_raid_subdisk *tsd, *bestsd; 15489b17223SAlexander Motin u_int s; 15589b17223SAlexander Motin int i, na, ns; 15689b17223SAlexander Motin 15789b17223SAlexander Motin sc = vol->v_softc; 15889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)vol->v_tr; 15989b17223SAlexander Motin if (trs->trso_stopping && 16089b17223SAlexander Motin (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) 16189b17223SAlexander Motin s = G_RAID_VOLUME_S_STOPPED; 16289b17223SAlexander Motin else if (trs->trso_starting) 16389b17223SAlexander Motin s = G_RAID_VOLUME_S_STARTING; 16489b17223SAlexander Motin else { 16589b17223SAlexander Motin /* Make sure we have at least one ACTIVE disk. */ 16689b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 16789b17223SAlexander Motin if (na == 0) { 16889b17223SAlexander Motin /* 16989b17223SAlexander Motin * Critical situation! We have no any active disk! 17089b17223SAlexander Motin * Choose the best disk we have to make it active. 17189b17223SAlexander Motin */ 17289b17223SAlexander Motin bestsd = &vol->v_subdisks[0]; 17389b17223SAlexander Motin for (i = 1; i < vol->v_disks_count; i++) { 17489b17223SAlexander Motin tsd = &vol->v_subdisks[i]; 17589b17223SAlexander Motin if (tsd->sd_state > bestsd->sd_state) 17689b17223SAlexander Motin bestsd = tsd; 17789b17223SAlexander Motin else if (tsd->sd_state == bestsd->sd_state && 17889b17223SAlexander Motin (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || 17989b17223SAlexander Motin tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 18089b17223SAlexander Motin tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 18189b17223SAlexander Motin bestsd = tsd; 18289b17223SAlexander Motin } 18389b17223SAlexander Motin if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { 18489b17223SAlexander Motin /* We found reasonable candidate. */ 18589b17223SAlexander Motin G_RAID_DEBUG1(1, sc, 18689b17223SAlexander Motin "Promote subdisk %s:%d from %s to ACTIVE.", 18789b17223SAlexander Motin vol->v_name, bestsd->sd_pos, 18889b17223SAlexander Motin g_raid_subdisk_state2str(bestsd->sd_state)); 18989b17223SAlexander Motin g_raid_change_subdisk_state(bestsd, 19089b17223SAlexander Motin G_RAID_SUBDISK_S_ACTIVE); 19189b17223SAlexander Motin g_raid_write_metadata(sc, 19289b17223SAlexander Motin vol, bestsd, bestsd->sd_disk); 19389b17223SAlexander Motin } 19489b17223SAlexander Motin } 19589b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 19689b17223SAlexander Motin ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 19789b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 19889b17223SAlexander Motin if (na == vol->v_disks_count) 19989b17223SAlexander Motin s = G_RAID_VOLUME_S_OPTIMAL; 20089b17223SAlexander Motin else if (na + ns == vol->v_disks_count) 20189b17223SAlexander Motin s = G_RAID_VOLUME_S_SUBOPTIMAL; 20289b17223SAlexander Motin else if (na > 0) 20389b17223SAlexander Motin s = G_RAID_VOLUME_S_DEGRADED; 20489b17223SAlexander Motin else 20589b17223SAlexander Motin s = G_RAID_VOLUME_S_BROKEN; 20689b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); 20789b17223SAlexander Motin } 20889b17223SAlexander Motin if (s != vol->v_state) { 20989b17223SAlexander Motin g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 21089b17223SAlexander Motin G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 21189b17223SAlexander Motin G_RAID_EVENT_VOLUME); 21289b17223SAlexander Motin g_raid_change_volume_state(vol, s); 21389b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping) 21489b17223SAlexander Motin g_raid_write_metadata(sc, vol, NULL, NULL); 21589b17223SAlexander Motin } 21689b17223SAlexander Motin return (0); 21789b17223SAlexander Motin } 21889b17223SAlexander Motin 21989b17223SAlexander Motin static void 22089b17223SAlexander Motin g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 22189b17223SAlexander Motin struct g_raid_disk *disk) 22289b17223SAlexander Motin { 22389b17223SAlexander Motin /* 22489b17223SAlexander Motin * We don't fail the last disk in the pack, since it still has decent 22589b17223SAlexander Motin * data on it and that's better than failing the disk if it is the root 22689b17223SAlexander Motin * file system. 22789b17223SAlexander Motin * 22889b17223SAlexander Motin * XXX should this be controlled via a tunable? It makes sense for 22989b17223SAlexander Motin * the volume that has / on it. I can't think of a case where we'd 23089b17223SAlexander Motin * want the volume to go away on this kind of event. 23189b17223SAlexander Motin */ 23289b17223SAlexander Motin if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && 23389b17223SAlexander Motin g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) 23489b17223SAlexander Motin return; 23589b17223SAlexander Motin g_raid_fail_disk(sc, sd, disk); 23689b17223SAlexander Motin } 23789b17223SAlexander Motin 23889b17223SAlexander Motin static void 23989b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) 24089b17223SAlexander Motin { 24189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 24289b17223SAlexander Motin struct g_raid_subdisk *sd, *good_sd; 24389b17223SAlexander Motin struct bio *bp; 24489b17223SAlexander Motin 24589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 24689b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) 24789b17223SAlexander Motin return; 24889b17223SAlexander Motin sd = trs->trso_failed_sd; 24989b17223SAlexander Motin good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); 25089b17223SAlexander Motin if (good_sd == NULL) { 25189b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 25289b17223SAlexander Motin return; 25389b17223SAlexander Motin } 25489b17223SAlexander Motin bp = &trs->trso_bio; 25589b17223SAlexander Motin memset(bp, 0, sizeof(*bp)); 25689b17223SAlexander Motin bp->bio_offset = sd->sd_rebuild_pos; 25789b17223SAlexander Motin bp->bio_length = MIN(g_raid1_rebuild_slab, 25889b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 25989b17223SAlexander Motin bp->bio_data = trs->trso_buffer; 26089b17223SAlexander Motin bp->bio_cmd = BIO_READ; 26189b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 26289b17223SAlexander Motin bp->bio_caller1 = good_sd; 26389b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_DOING_SOME; 26489b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_LOCKED; 26589b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ 26689b17223SAlexander Motin bp->bio_offset, bp->bio_length, NULL, bp); 26789b17223SAlexander Motin } 26889b17223SAlexander Motin 26989b17223SAlexander Motin static void 27089b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) 27189b17223SAlexander Motin { 27289b17223SAlexander Motin struct g_raid_volume *vol; 27389b17223SAlexander Motin struct g_raid_subdisk *sd; 27489b17223SAlexander Motin 27589b17223SAlexander Motin vol = trs->trso_base.tro_volume; 27689b17223SAlexander Motin sd = trs->trso_failed_sd; 27789b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 27889b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 27989b17223SAlexander Motin trs->trso_buffer = NULL; 28089b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 28189b17223SAlexander Motin trs->trso_type = TR_RAID1_NONE; 28289b17223SAlexander Motin trs->trso_recover_slabs = 0; 28389b17223SAlexander Motin trs->trso_failed_sd = NULL; 28489b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 28589b17223SAlexander Motin } 28689b17223SAlexander Motin 28789b17223SAlexander Motin static void 28889b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) 28989b17223SAlexander Motin { 29089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 29189b17223SAlexander Motin struct g_raid_subdisk *sd; 29289b17223SAlexander Motin 29389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 29489b17223SAlexander Motin sd = trs->trso_failed_sd; 29589b17223SAlexander Motin G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 29689b17223SAlexander Motin "Subdisk %s:%d-%s rebuild completed.", 29789b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 29889b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 29989b17223SAlexander Motin g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 30089b17223SAlexander Motin sd->sd_rebuild_pos = 0; 30189b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 30289b17223SAlexander Motin } 30389b17223SAlexander Motin 30489b17223SAlexander Motin static void 30589b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) 30689b17223SAlexander Motin { 30789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 30889b17223SAlexander Motin struct g_raid_subdisk *sd; 30989b17223SAlexander Motin struct g_raid_volume *vol; 31089b17223SAlexander Motin off_t len; 31189b17223SAlexander Motin 31289b17223SAlexander Motin vol = tr->tro_volume; 31389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 31489b17223SAlexander Motin sd = trs->trso_failed_sd; 31589b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { 31689b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 31789b17223SAlexander Motin "Subdisk %s:%d-%s rebuild is aborting.", 31889b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 31989b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 32089b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_ABORT; 32189b17223SAlexander Motin } else { 32289b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 32389b17223SAlexander Motin "Subdisk %s:%d-%s rebuild aborted.", 32489b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 32589b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 32689b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_ABORT; 32789b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_LOCKED) { 32889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 32989b17223SAlexander Motin len = MIN(g_raid1_rebuild_slab, 33089b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 33189b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume, 33289b17223SAlexander Motin sd->sd_rebuild_pos, len); 33389b17223SAlexander Motin } 33489b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 33589b17223SAlexander Motin } 33689b17223SAlexander Motin } 33789b17223SAlexander Motin 33889b17223SAlexander Motin static void 33989b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) 34089b17223SAlexander Motin { 34189b17223SAlexander Motin struct g_raid_volume *vol; 34289b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 34389b17223SAlexander Motin struct g_raid_subdisk *sd, *fsd; 34489b17223SAlexander Motin 34589b17223SAlexander Motin vol = tr->tro_volume; 34689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 34789b17223SAlexander Motin if (trs->trso_failed_sd) { 34889b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 34989b17223SAlexander Motin "Already rebuild in start rebuild. pos %jd\n", 35089b17223SAlexander Motin (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 35189b17223SAlexander Motin return; 35289b17223SAlexander Motin } 35389b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); 35489b17223SAlexander Motin if (sd == NULL) { 35589b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 35689b17223SAlexander Motin "No active disk to rebuild. night night."); 35789b17223SAlexander Motin return; 35889b17223SAlexander Motin } 35989b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 36089b17223SAlexander Motin if (fsd == NULL) 36189b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 36289b17223SAlexander Motin if (fsd == NULL) { 36389b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 36489b17223SAlexander Motin if (fsd != NULL) { 36589b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 36689b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 36789b17223SAlexander Motin G_RAID_SUBDISK_S_RESYNC); 36889b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); 36989b17223SAlexander Motin } else { 37089b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 37189b17223SAlexander Motin G_RAID_SUBDISK_S_UNINITIALIZED); 37289b17223SAlexander Motin if (fsd == NULL) 37389b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 37489b17223SAlexander Motin G_RAID_SUBDISK_S_NEW); 37589b17223SAlexander Motin if (fsd != NULL) { 37689b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 37789b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 37889b17223SAlexander Motin G_RAID_SUBDISK_S_REBUILD); 37989b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 38089b17223SAlexander Motin vol, fsd, NULL); 38189b17223SAlexander Motin } 38289b17223SAlexander Motin } 38389b17223SAlexander Motin } 38489b17223SAlexander Motin if (fsd == NULL) { 38589b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 38689b17223SAlexander Motin "No failed disk to rebuild. night night."); 38789b17223SAlexander Motin return; 38889b17223SAlexander Motin } 38989b17223SAlexander Motin trs->trso_failed_sd = fsd; 39089b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 39189b17223SAlexander Motin "Subdisk %s:%d-%s rebuild start at %jd.", 39289b17223SAlexander Motin fsd->sd_volume->v_name, fsd->sd_pos, 39389b17223SAlexander Motin fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", 39489b17223SAlexander Motin trs->trso_failed_sd->sd_rebuild_pos); 39589b17223SAlexander Motin trs->trso_type = TR_RAID1_REBUILD; 39689b17223SAlexander Motin trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); 39789b17223SAlexander Motin trs->trso_meta_update = g_raid1_rebuild_meta_update; 39889b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 39989b17223SAlexander Motin } 40089b17223SAlexander Motin 40189b17223SAlexander Motin 40289b17223SAlexander Motin static void 40389b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 40489b17223SAlexander Motin struct g_raid_subdisk *sd) 40589b17223SAlexander Motin { 40689b17223SAlexander Motin struct g_raid_volume *vol; 40789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 40889b17223SAlexander Motin int na, nr; 40989b17223SAlexander Motin 41089b17223SAlexander Motin /* 41189b17223SAlexander Motin * If we're stopping, don't do anything. If we don't have at least one 41289b17223SAlexander Motin * good disk and one bad disk, we don't do anything. And if there's a 41389b17223SAlexander Motin * 'good disk' stored in the trs, then we're in progress and we punt. 41489b17223SAlexander Motin * If we make it past all these checks, we need to rebuild. 41589b17223SAlexander Motin */ 41689b17223SAlexander Motin vol = tr->tro_volume; 41789b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 41889b17223SAlexander Motin if (trs->trso_stopping) 41989b17223SAlexander Motin return; 42089b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 42189b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 42289b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 42389b17223SAlexander Motin switch(trs->trso_type) { 42489b17223SAlexander Motin case TR_RAID1_NONE: 42589b17223SAlexander Motin if (na == 0) 42689b17223SAlexander Motin return; 42789b17223SAlexander Motin if (nr == 0) { 42889b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 42989b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 43089b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 43189b17223SAlexander Motin if (nr == 0) 43289b17223SAlexander Motin return; 43389b17223SAlexander Motin } 43489b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(tr); 43589b17223SAlexander Motin break; 43689b17223SAlexander Motin case TR_RAID1_REBUILD: 43789b17223SAlexander Motin if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) 43889b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 43989b17223SAlexander Motin break; 44089b17223SAlexander Motin case TR_RAID1_RESYNC: 44189b17223SAlexander Motin break; 44289b17223SAlexander Motin } 44389b17223SAlexander Motin } 44489b17223SAlexander Motin 44589b17223SAlexander Motin static int 44689b17223SAlexander Motin g_raid_tr_event_raid1(struct g_raid_tr_object *tr, 44789b17223SAlexander Motin struct g_raid_subdisk *sd, u_int event) 44889b17223SAlexander Motin { 44989b17223SAlexander Motin 45089b17223SAlexander Motin g_raid_tr_update_state_raid1(tr->tro_volume, sd); 45189b17223SAlexander Motin return (0); 45289b17223SAlexander Motin } 45389b17223SAlexander Motin 45489b17223SAlexander Motin static int 45589b17223SAlexander Motin g_raid_tr_start_raid1(struct g_raid_tr_object *tr) 45689b17223SAlexander Motin { 45789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 45889b17223SAlexander Motin struct g_raid_volume *vol; 45989b17223SAlexander Motin 46089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 46189b17223SAlexander Motin vol = tr->tro_volume; 46289b17223SAlexander Motin trs->trso_starting = 0; 46389b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 46489b17223SAlexander Motin return (0); 46589b17223SAlexander Motin } 46689b17223SAlexander Motin 46789b17223SAlexander Motin static int 46889b17223SAlexander Motin g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) 46989b17223SAlexander Motin { 47089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 47189b17223SAlexander Motin struct g_raid_volume *vol; 47289b17223SAlexander Motin 47389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 47489b17223SAlexander Motin vol = tr->tro_volume; 47589b17223SAlexander Motin trs->trso_starting = 0; 47689b17223SAlexander Motin trs->trso_stopping = 1; 47789b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 47889b17223SAlexander Motin return (0); 47989b17223SAlexander Motin } 48089b17223SAlexander Motin 48189b17223SAlexander Motin /* 48289b17223SAlexander Motin * Select the disk to read from. Take into account: subdisk state, running 48389b17223SAlexander Motin * error recovery, average disk load, head position and possible cache hits. 48489b17223SAlexander Motin */ 48589b17223SAlexander Motin #define ABS(x) (((x) >= 0) ? (x) : (-(x))) 48689b17223SAlexander Motin static struct g_raid_subdisk * 48789b17223SAlexander Motin g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, 48889b17223SAlexander Motin u_int mask) 48989b17223SAlexander Motin { 49089b17223SAlexander Motin struct g_raid_subdisk *sd, *best; 49189b17223SAlexander Motin int i, prio, bestprio; 49289b17223SAlexander Motin 49389b17223SAlexander Motin best = NULL; 49489b17223SAlexander Motin bestprio = INT_MAX; 49589b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 49689b17223SAlexander Motin sd = &vol->v_subdisks[i]; 49789b17223SAlexander Motin if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && 49889b17223SAlexander Motin ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && 49989b17223SAlexander Motin sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || 50089b17223SAlexander Motin bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) 50189b17223SAlexander Motin continue; 50289b17223SAlexander Motin if ((mask & (1 << i)) != 0) 50389b17223SAlexander Motin continue; 50489b17223SAlexander Motin prio = G_RAID_SUBDISK_LOAD(sd); 50589b17223SAlexander Motin prio += min(sd->sd_recovery, 255) << 22; 50689b17223SAlexander Motin prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; 50789b17223SAlexander Motin /* If disk head is precisely in position - highly prefer it. */ 50889b17223SAlexander Motin if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) 50989b17223SAlexander Motin prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 51089b17223SAlexander Motin else 51189b17223SAlexander Motin /* If disk head is close to position - prefer it. */ 51289b17223SAlexander Motin if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < 51389b17223SAlexander Motin G_RAID_SUBDISK_TRACK_SIZE) 51489b17223SAlexander Motin prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 51589b17223SAlexander Motin if (prio < bestprio) { 51689b17223SAlexander Motin best = sd; 51789b17223SAlexander Motin bestprio = prio; 51889b17223SAlexander Motin } 51989b17223SAlexander Motin } 52089b17223SAlexander Motin return (best); 52189b17223SAlexander Motin } 52289b17223SAlexander Motin 52389b17223SAlexander Motin static void 52489b17223SAlexander Motin g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) 52589b17223SAlexander Motin { 52689b17223SAlexander Motin struct g_raid_subdisk *sd; 52789b17223SAlexander Motin struct bio *cbp; 52889b17223SAlexander Motin 52989b17223SAlexander Motin sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); 53089b17223SAlexander Motin KASSERT(sd != NULL, ("No active disks in volume %s.", 53189b17223SAlexander Motin tr->tro_volume->v_name)); 53289b17223SAlexander Motin 53389b17223SAlexander Motin cbp = g_clone_bio(bp); 53489b17223SAlexander Motin if (cbp == NULL) { 53589b17223SAlexander Motin g_raid_iodone(bp, ENOMEM); 53689b17223SAlexander Motin return; 53789b17223SAlexander Motin } 53889b17223SAlexander Motin 53989b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 54089b17223SAlexander Motin } 54189b17223SAlexander Motin 54289b17223SAlexander Motin static void 54389b17223SAlexander Motin g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) 54489b17223SAlexander Motin { 54589b17223SAlexander Motin struct g_raid_volume *vol; 54689b17223SAlexander Motin struct g_raid_subdisk *sd; 54789b17223SAlexander Motin struct bio_queue_head queue; 54889b17223SAlexander Motin struct bio *cbp; 54989b17223SAlexander Motin int i; 55089b17223SAlexander Motin 55189b17223SAlexander Motin vol = tr->tro_volume; 55289b17223SAlexander Motin 55389b17223SAlexander Motin /* 55489b17223SAlexander Motin * Allocate all bios before sending any request, so we can return 55589b17223SAlexander Motin * ENOMEM in nice and clean way. 55689b17223SAlexander Motin */ 55789b17223SAlexander Motin bioq_init(&queue); 55889b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 55989b17223SAlexander Motin sd = &vol->v_subdisks[i]; 56089b17223SAlexander Motin switch (sd->sd_state) { 56189b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 56289b17223SAlexander Motin break; 56389b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 56489b17223SAlexander Motin /* 56589b17223SAlexander Motin * When rebuilding, only part of this subdisk is 56689b17223SAlexander Motin * writable, the rest will be written as part of the 56789b17223SAlexander Motin * that process. 56889b17223SAlexander Motin */ 56989b17223SAlexander Motin if (bp->bio_offset >= sd->sd_rebuild_pos) 57089b17223SAlexander Motin continue; 57189b17223SAlexander Motin break; 57289b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 57389b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 57489b17223SAlexander Motin /* 57589b17223SAlexander Motin * Resyncing still writes on the theory that the 57689b17223SAlexander Motin * resync'd disk is very close and writing it will 57789b17223SAlexander Motin * keep it that way better if we keep up while 57889b17223SAlexander Motin * resyncing. 57989b17223SAlexander Motin */ 58089b17223SAlexander Motin break; 58189b17223SAlexander Motin default: 58289b17223SAlexander Motin continue; 58389b17223SAlexander Motin } 58489b17223SAlexander Motin cbp = g_clone_bio(bp); 58589b17223SAlexander Motin if (cbp == NULL) 58689b17223SAlexander Motin goto failure; 58789b17223SAlexander Motin cbp->bio_caller1 = sd; 58889b17223SAlexander Motin bioq_insert_tail(&queue, cbp); 58989b17223SAlexander Motin } 590b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) { 59189b17223SAlexander Motin sd = cbp->bio_caller1; 59289b17223SAlexander Motin cbp->bio_caller1 = NULL; 59389b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 59489b17223SAlexander Motin } 59589b17223SAlexander Motin return; 59689b17223SAlexander Motin failure: 597b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) 59889b17223SAlexander Motin g_destroy_bio(cbp); 59989b17223SAlexander Motin if (bp->bio_error == 0) 60089b17223SAlexander Motin bp->bio_error = ENOMEM; 60189b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error); 60289b17223SAlexander Motin } 60389b17223SAlexander Motin 60489b17223SAlexander Motin static void 60589b17223SAlexander Motin g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) 60689b17223SAlexander Motin { 60789b17223SAlexander Motin struct g_raid_volume *vol; 60889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 60989b17223SAlexander Motin 61089b17223SAlexander Motin vol = tr->tro_volume; 61189b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 61289b17223SAlexander Motin if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 61389b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 61489b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 61589b17223SAlexander Motin g_raid_iodone(bp, EIO); 61689b17223SAlexander Motin return; 61789b17223SAlexander Motin } 61889b17223SAlexander Motin /* 61989b17223SAlexander Motin * If we're rebuilding, squeeze in rebuild activity every so often, 62089b17223SAlexander Motin * even when the disk is busy. Be sure to only count real I/O 62189b17223SAlexander Motin * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 62289b17223SAlexander Motin * by this module. 62389b17223SAlexander Motin */ 62489b17223SAlexander Motin if (trs->trso_failed_sd != NULL && 62589b17223SAlexander Motin !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 62689b17223SAlexander Motin /* Make this new or running now round short. */ 62789b17223SAlexander Motin trs->trso_recover_slabs = 0; 62889b17223SAlexander Motin if (--trs->trso_fair_io <= 0) { 62989b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 63089b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 63189b17223SAlexander Motin } 63289b17223SAlexander Motin } 63389b17223SAlexander Motin switch (bp->bio_cmd) { 63489b17223SAlexander Motin case BIO_READ: 63589b17223SAlexander Motin g_raid_tr_iostart_raid1_read(tr, bp); 63689b17223SAlexander Motin break; 63789b17223SAlexander Motin case BIO_WRITE: 63889b17223SAlexander Motin case BIO_DELETE: 639609a7474SAlexander Motin g_raid_tr_iostart_raid1_write(tr, bp); 64089b17223SAlexander Motin break; 64189b17223SAlexander Motin case BIO_FLUSH: 64289b17223SAlexander Motin g_raid_tr_flush_common(tr, bp); 64389b17223SAlexander Motin break; 64489b17223SAlexander Motin default: 64589b17223SAlexander Motin KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 64689b17223SAlexander Motin bp->bio_cmd, vol->v_name)); 64789b17223SAlexander Motin break; 64889b17223SAlexander Motin } 64989b17223SAlexander Motin } 65089b17223SAlexander Motin 65189b17223SAlexander Motin static void 65289b17223SAlexander Motin g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, 65389b17223SAlexander Motin struct g_raid_subdisk *sd, struct bio *bp) 65489b17223SAlexander Motin { 65589b17223SAlexander Motin struct bio *cbp; 65689b17223SAlexander Motin struct g_raid_subdisk *nsd; 65789b17223SAlexander Motin struct g_raid_volume *vol; 65889b17223SAlexander Motin struct bio *pbp; 65989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 66089b17223SAlexander Motin uintptr_t *mask; 66189b17223SAlexander Motin int error, do_write; 66289b17223SAlexander Motin 66389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 66489b17223SAlexander Motin vol = tr->tro_volume; 66589b17223SAlexander Motin if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 66689b17223SAlexander Motin /* 66789b17223SAlexander Motin * This operation is part of a rebuild or resync operation. 66889b17223SAlexander Motin * See what work just got done, then schedule the next bit of 66989b17223SAlexander Motin * work, if any. Rebuild/resync is done a little bit at a 67089b17223SAlexander Motin * time. Either when a timeout happens, or after we get a 67189b17223SAlexander Motin * bunch of I/Os to the disk (to make sure an active system 67289b17223SAlexander Motin * will complete in a sane amount of time). 67389b17223SAlexander Motin * 67489b17223SAlexander Motin * We are setup to do differing amounts of work for each of 67589b17223SAlexander Motin * these cases. so long as the slabs is smallish (less than 67689b17223SAlexander Motin * 50 or so, I'd guess, but that's just a WAG), we shouldn't 67789b17223SAlexander Motin * have any bio starvation issues. For active disks, we do 67889b17223SAlexander Motin * 5MB of data, for inactive ones, we do 50MB. 67989b17223SAlexander Motin */ 68089b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) { 68189b17223SAlexander Motin if (bp->bio_cmd == BIO_READ) { 68289b17223SAlexander Motin 68389b17223SAlexander Motin /* Immediately abort rebuild, if requested. */ 68489b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_ABORT) { 68589b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 68689b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 68789b17223SAlexander Motin return; 68889b17223SAlexander Motin } 68989b17223SAlexander Motin 69089b17223SAlexander Motin /* On read error, skip and cross fingers. */ 69189b17223SAlexander Motin if (bp->bio_error != 0) { 69289b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 69389b17223SAlexander Motin "Read error during rebuild (%d), " 69489b17223SAlexander Motin "possible data loss!", 69589b17223SAlexander Motin bp->bio_error); 69689b17223SAlexander Motin goto rebuild_round_done; 69789b17223SAlexander Motin } 69889b17223SAlexander Motin 69989b17223SAlexander Motin /* 70089b17223SAlexander Motin * The read operation finished, queue the 70189b17223SAlexander Motin * write and get out. 70289b17223SAlexander Motin */ 70389b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "rebuild read done. %d", 70489b17223SAlexander Motin bp->bio_error); 70589b17223SAlexander Motin bp->bio_cmd = BIO_WRITE; 70689b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 70789b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); 70889b17223SAlexander Motin g_raid_subdisk_iostart(trs->trso_failed_sd, bp); 70989b17223SAlexander Motin } else { 71089b17223SAlexander Motin /* 71189b17223SAlexander Motin * The write operation just finished. Do 71289b17223SAlexander Motin * another. We keep cloning the master bio 71389b17223SAlexander Motin * since it has the right buffers allocated to 71489b17223SAlexander Motin * it. 71589b17223SAlexander Motin */ 71689b17223SAlexander Motin G_RAID_LOGREQ(4, bp, 71789b17223SAlexander Motin "rebuild write done. Error %d", 71889b17223SAlexander Motin bp->bio_error); 71989b17223SAlexander Motin nsd = trs->trso_failed_sd; 72089b17223SAlexander Motin if (bp->bio_error != 0 || 72189b17223SAlexander Motin trs->trso_flags & TR_RAID1_F_ABORT) { 72289b17223SAlexander Motin if ((trs->trso_flags & 72389b17223SAlexander Motin TR_RAID1_F_ABORT) == 0) { 72489b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, 72589b17223SAlexander Motin nsd, nsd->sd_disk); 72689b17223SAlexander Motin } 72789b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 72889b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 72989b17223SAlexander Motin return; 73089b17223SAlexander Motin } 73189b17223SAlexander Motin rebuild_round_done: 73289b17223SAlexander Motin nsd = trs->trso_failed_sd; 73389b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 73489b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, 73589b17223SAlexander Motin bp->bio_offset, bp->bio_length); 73689b17223SAlexander Motin nsd->sd_rebuild_pos += bp->bio_length; 73789b17223SAlexander Motin if (nsd->sd_rebuild_pos >= nsd->sd_size) { 73889b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(tr); 73989b17223SAlexander Motin return; 74089b17223SAlexander Motin } 74189b17223SAlexander Motin 74289b17223SAlexander Motin /* Abort rebuild if we are stopping */ 74389b17223SAlexander Motin if (trs->trso_stopping) { 74489b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 74589b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 74689b17223SAlexander Motin return; 74789b17223SAlexander Motin } 74889b17223SAlexander Motin 74989b17223SAlexander Motin if (--trs->trso_meta_update <= 0) { 75089b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 75189b17223SAlexander Motin vol, nsd, nsd->sd_disk); 75289b17223SAlexander Motin trs->trso_meta_update = 75389b17223SAlexander Motin g_raid1_rebuild_meta_update; 75489b17223SAlexander Motin } 75589b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 75689b17223SAlexander Motin if (--trs->trso_recover_slabs <= 0) 75789b17223SAlexander Motin return; 75889b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 75989b17223SAlexander Motin } 76089b17223SAlexander Motin } else if (trs->trso_type == TR_RAID1_RESYNC) { 76189b17223SAlexander Motin /* 76289b17223SAlexander Motin * read good sd, read bad sd in parallel. when both 76389b17223SAlexander Motin * done, compare the buffers. write good to the bad 76489b17223SAlexander Motin * if different. do the next bit of work. 76589b17223SAlexander Motin */ 76689b17223SAlexander Motin panic("Somehow, we think we're doing a resync"); 76789b17223SAlexander Motin } 76889b17223SAlexander Motin return; 76989b17223SAlexander Motin } 77089b17223SAlexander Motin pbp = bp->bio_parent; 77189b17223SAlexander Motin pbp->bio_inbed++; 77289b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { 77389b17223SAlexander Motin /* 77489b17223SAlexander Motin * Read failed on first drive. Retry the read error on 77589b17223SAlexander Motin * another disk drive, if available, before erroring out the 77689b17223SAlexander Motin * read. 77789b17223SAlexander Motin */ 77889b17223SAlexander Motin sd->sd_disk->d_read_errs++; 77989b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 78089b17223SAlexander Motin "Read error (%d), %d read errors total", 78189b17223SAlexander Motin bp->bio_error, sd->sd_disk->d_read_errs); 78289b17223SAlexander Motin 78389b17223SAlexander Motin /* 78489b17223SAlexander Motin * If there are too many read errors, we move to degraded. 78589b17223SAlexander Motin * XXX Do we want to FAIL the drive (eg, make the user redo 78689b17223SAlexander Motin * everything to get it back in sync), or just degrade the 78789b17223SAlexander Motin * drive, which kicks off a resync? 78889b17223SAlexander Motin */ 78989b17223SAlexander Motin do_write = 1; 79089b17223SAlexander Motin if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { 79189b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 79289b17223SAlexander Motin if (pbp->bio_children == 1) 79389b17223SAlexander Motin do_write = 0; 79489b17223SAlexander Motin } 79589b17223SAlexander Motin 79689b17223SAlexander Motin /* 79789b17223SAlexander Motin * Find the other disk, and try to do the I/O to it. 79889b17223SAlexander Motin */ 79989b17223SAlexander Motin mask = (uintptr_t *)(&pbp->bio_driver2); 80089b17223SAlexander Motin if (pbp->bio_children == 1) { 80189b17223SAlexander Motin /* Save original subdisk. */ 80289b17223SAlexander Motin pbp->bio_driver1 = do_write ? sd : NULL; 80389b17223SAlexander Motin *mask = 0; 80489b17223SAlexander Motin } 80589b17223SAlexander Motin *mask |= 1 << sd->sd_pos; 80689b17223SAlexander Motin nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); 80789b17223SAlexander Motin if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { 80889b17223SAlexander Motin g_destroy_bio(bp); 80989b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, "Retrying read from %d", 81089b17223SAlexander Motin nsd->sd_pos); 81189b17223SAlexander Motin if (pbp->bio_children == 2 && do_write) { 81289b17223SAlexander Motin sd->sd_recovery++; 81389b17223SAlexander Motin cbp->bio_caller1 = nsd; 81489b17223SAlexander Motin pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; 81589b17223SAlexander Motin /* Lock callback starts I/O */ 81689b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, 81789b17223SAlexander Motin cbp->bio_offset, cbp->bio_length, pbp, cbp); 81889b17223SAlexander Motin } else { 81989b17223SAlexander Motin g_raid_subdisk_iostart(nsd, cbp); 82089b17223SAlexander Motin } 82189b17223SAlexander Motin return; 82289b17223SAlexander Motin } 82389b17223SAlexander Motin /* 82489b17223SAlexander Motin * We can't retry. Return the original error by falling 82589b17223SAlexander Motin * through. This will happen when there's only one good disk. 82689b17223SAlexander Motin * We don't need to fail the raid, since its actual state is 82789b17223SAlexander Motin * based on the state of the subdisks. 82889b17223SAlexander Motin */ 82989b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); 83089b17223SAlexander Motin } 83189b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && 83289b17223SAlexander Motin bp->bio_error == 0 && 83389b17223SAlexander Motin pbp->bio_children > 1 && 83489b17223SAlexander Motin pbp->bio_driver1 != NULL) { 83589b17223SAlexander Motin /* 83689b17223SAlexander Motin * If it was a read, and bio_children is >1, then we just 83789b17223SAlexander Motin * recovered the data from the second drive. We should try to 83889b17223SAlexander Motin * write that data to the first drive if sector remapping is 83989b17223SAlexander Motin * enabled. A write should put the data in a new place on the 84089b17223SAlexander Motin * disk, remapping the bad sector. Do we need to do that by 84189b17223SAlexander Motin * queueing a request to the main worker thread? It doesn't 84289b17223SAlexander Motin * affect the return code of this current read, and can be 84389b17223SAlexander Motin * done at our liesure. However, to make the code simpler, it 84489b17223SAlexander Motin * is done syncrhonously. 84589b17223SAlexander Motin */ 84689b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); 84789b17223SAlexander Motin cbp = g_clone_bio(pbp); 84889b17223SAlexander Motin if (cbp != NULL) { 84989b17223SAlexander Motin g_destroy_bio(bp); 85089b17223SAlexander Motin cbp->bio_cmd = BIO_WRITE; 85189b17223SAlexander Motin cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; 85289b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, 85389b17223SAlexander Motin "Attempting bad sector remap on failing drive."); 85489b17223SAlexander Motin g_raid_subdisk_iostart(pbp->bio_driver1, cbp); 85589b17223SAlexander Motin return; 85689b17223SAlexander Motin } 85789b17223SAlexander Motin } 85889b17223SAlexander Motin if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { 85989b17223SAlexander Motin /* 86089b17223SAlexander Motin * We're done with a recovery, mark the range as unlocked. 86189b17223SAlexander Motin * For any write errors, we agressively fail the disk since 86289b17223SAlexander Motin * there was both a READ and a WRITE error at this location. 86389b17223SAlexander Motin * Both types of errors generally indicates the drive is on 86489b17223SAlexander Motin * the verge of total failure anyway. Better to stop trusting 86589b17223SAlexander Motin * it now. However, we need to reset error to 0 in that case 86689b17223SAlexander Motin * because we're not failing the original I/O which succeeded. 86789b17223SAlexander Motin */ 86889b17223SAlexander Motin if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 86989b17223SAlexander Motin G_RAID_LOGREQ(0, bp, "Remap write failed: " 87089b17223SAlexander Motin "failing subdisk."); 87189b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 87289b17223SAlexander Motin bp->bio_error = 0; 87389b17223SAlexander Motin } 87489b17223SAlexander Motin if (pbp->bio_driver1 != NULL) { 87589b17223SAlexander Motin ((struct g_raid_subdisk *)pbp->bio_driver1) 87689b17223SAlexander Motin ->sd_recovery--; 87789b17223SAlexander Motin } 87889b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 87989b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, bp->bio_offset, 88089b17223SAlexander Motin bp->bio_length); 88189b17223SAlexander Motin } 882650e245eSAlexander Motin if (pbp->bio_cmd != BIO_READ) { 883ef844ef7SAlexander Motin if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 884ef844ef7SAlexander Motin pbp->bio_error = bp->bio_error; 885650e245eSAlexander Motin if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { 886ef844ef7SAlexander Motin G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 887ef844ef7SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 888ef844ef7SAlexander Motin } 889ef844ef7SAlexander Motin error = pbp->bio_error; 890ef844ef7SAlexander Motin } else 89189b17223SAlexander Motin error = bp->bio_error; 89289b17223SAlexander Motin g_destroy_bio(bp); 89389b17223SAlexander Motin if (pbp->bio_children == pbp->bio_inbed) { 89489b17223SAlexander Motin pbp->bio_completed = pbp->bio_length; 89589b17223SAlexander Motin g_raid_iodone(pbp, error); 89689b17223SAlexander Motin } 89789b17223SAlexander Motin } 89889b17223SAlexander Motin 89989b17223SAlexander Motin static int 90089b17223SAlexander Motin g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, 90189b17223SAlexander Motin void *virtual, vm_offset_t physical, off_t offset, size_t length) 90289b17223SAlexander Motin { 90389b17223SAlexander Motin struct g_raid_volume *vol; 90489b17223SAlexander Motin struct g_raid_subdisk *sd; 90589b17223SAlexander Motin int error, i, ok; 90689b17223SAlexander Motin 90789b17223SAlexander Motin vol = tr->tro_volume; 90889b17223SAlexander Motin error = 0; 90989b17223SAlexander Motin ok = 0; 91089b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 91189b17223SAlexander Motin sd = &vol->v_subdisks[i]; 91289b17223SAlexander Motin switch (sd->sd_state) { 91389b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 91489b17223SAlexander Motin break; 91589b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 91689b17223SAlexander Motin /* 91789b17223SAlexander Motin * When rebuilding, only part of this subdisk is 91889b17223SAlexander Motin * writable, the rest will be written as part of the 91989b17223SAlexander Motin * that process. 92089b17223SAlexander Motin */ 92189b17223SAlexander Motin if (offset >= sd->sd_rebuild_pos) 92289b17223SAlexander Motin continue; 92389b17223SAlexander Motin break; 92489b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 92589b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 92689b17223SAlexander Motin /* 92789b17223SAlexander Motin * Resyncing still writes on the theory that the 92889b17223SAlexander Motin * resync'd disk is very close and writing it will 92989b17223SAlexander Motin * keep it that way better if we keep up while 93089b17223SAlexander Motin * resyncing. 93189b17223SAlexander Motin */ 93289b17223SAlexander Motin break; 93389b17223SAlexander Motin default: 93489b17223SAlexander Motin continue; 93589b17223SAlexander Motin } 93689b17223SAlexander Motin error = g_raid_subdisk_kerneldump(sd, 93789b17223SAlexander Motin virtual, physical, offset, length); 93889b17223SAlexander Motin if (error == 0) 93989b17223SAlexander Motin ok++; 94089b17223SAlexander Motin } 94189b17223SAlexander Motin return (ok > 0 ? 0 : error); 94289b17223SAlexander Motin } 94389b17223SAlexander Motin 94489b17223SAlexander Motin static int 94589b17223SAlexander Motin g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) 94689b17223SAlexander Motin { 94789b17223SAlexander Motin struct bio *bp; 94889b17223SAlexander Motin struct g_raid_subdisk *sd; 94989b17223SAlexander Motin 95089b17223SAlexander Motin bp = (struct bio *)argp; 95189b17223SAlexander Motin sd = (struct g_raid_subdisk *)bp->bio_caller1; 95289b17223SAlexander Motin g_raid_subdisk_iostart(sd, bp); 95389b17223SAlexander Motin 95489b17223SAlexander Motin return (0); 95589b17223SAlexander Motin } 95689b17223SAlexander Motin 95789b17223SAlexander Motin static int 95889b17223SAlexander Motin g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) 95989b17223SAlexander Motin { 96089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 96189b17223SAlexander Motin 96289b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 96389b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 96489b17223SAlexander Motin trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; 96589b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) 96689b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 96789b17223SAlexander Motin return (0); 96889b17223SAlexander Motin } 96989b17223SAlexander Motin 97089b17223SAlexander Motin static int 97189b17223SAlexander Motin g_raid_tr_free_raid1(struct g_raid_tr_object *tr) 97289b17223SAlexander Motin { 97389b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 97489b17223SAlexander Motin 97589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 97689b17223SAlexander Motin 97789b17223SAlexander Motin if (trs->trso_buffer != NULL) { 97889b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 97989b17223SAlexander Motin trs->trso_buffer = NULL; 98089b17223SAlexander Motin } 98189b17223SAlexander Motin return (0); 98289b17223SAlexander Motin } 98389b17223SAlexander Motin 984c89d2fbeSAlexander Motin G_RAID_TR_DECLARE(raid1, "RAID1"); 985