189b17223SAlexander Motin /*- 23728855aSPedro F. Giffuni * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 33728855aSPedro F. Giffuni * 489b17223SAlexander Motin * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 589b17223SAlexander Motin * All rights reserved. 689b17223SAlexander Motin * 789b17223SAlexander Motin * Redistribution and use in source and binary forms, with or without 889b17223SAlexander Motin * modification, are permitted provided that the following conditions 989b17223SAlexander Motin * are met: 1089b17223SAlexander Motin * 1. Redistributions of source code must retain the above copyright 1189b17223SAlexander Motin * notice, this list of conditions and the following disclaimer. 1289b17223SAlexander Motin * 2. Redistributions in binary form must reproduce the above copyright 1389b17223SAlexander Motin * notice, this list of conditions and the following disclaimer in the 1489b17223SAlexander Motin * documentation and/or other materials provided with the distribution. 1589b17223SAlexander Motin * 1689b17223SAlexander Motin * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 1789b17223SAlexander Motin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1889b17223SAlexander Motin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1989b17223SAlexander Motin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 2089b17223SAlexander Motin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2189b17223SAlexander Motin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2289b17223SAlexander Motin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2389b17223SAlexander Motin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2489b17223SAlexander Motin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2589b17223SAlexander Motin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2689b17223SAlexander Motin * SUCH DAMAGE. 2789b17223SAlexander Motin */ 2889b17223SAlexander Motin 2989b17223SAlexander Motin #include <sys/cdefs.h> 3089b17223SAlexander Motin __FBSDID("$FreeBSD$"); 3189b17223SAlexander Motin 3289b17223SAlexander Motin #include <sys/param.h> 3389b17223SAlexander Motin #include <sys/bio.h> 3489b17223SAlexander Motin #include <sys/endian.h> 3589b17223SAlexander Motin #include <sys/kernel.h> 3689b17223SAlexander Motin #include <sys/kobj.h> 3789b17223SAlexander Motin #include <sys/limits.h> 3889b17223SAlexander Motin #include <sys/lock.h> 3989b17223SAlexander Motin #include <sys/malloc.h> 4089b17223SAlexander Motin #include <sys/mutex.h> 4189b17223SAlexander Motin #include <sys/sysctl.h> 4289b17223SAlexander Motin #include <sys/systm.h> 4389b17223SAlexander Motin #include <geom/geom.h> 44ac03832eSConrad Meyer #include <geom/geom_dbg.h> 4589b17223SAlexander Motin #include "geom/raid/g_raid.h" 4689b17223SAlexander Motin #include "g_raid_tr_if.h" 4789b17223SAlexander Motin 48c89d2fbeSAlexander Motin SYSCTL_DECL(_kern_geom_raid_raid1); 4989b17223SAlexander Motin 5089b17223SAlexander Motin #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */ 5189b17223SAlexander Motin static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB; 52af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN, 5389b17223SAlexander Motin &g_raid1_rebuild_slab, 0, 5489b17223SAlexander Motin "Amount of the disk to rebuild each read/write cycle of the rebuild."); 5589b17223SAlexander Motin 5689b17223SAlexander Motin #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */ 5789b17223SAlexander Motin static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO; 58af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN, 5989b17223SAlexander Motin &g_raid1_rebuild_fair_io, 0, 6089b17223SAlexander Motin "Fraction of the I/O bandwidth to use when disk busy for rebuild."); 6189b17223SAlexander Motin 6289b17223SAlexander Motin #define RAID1_REBUILD_CLUSTER_IDLE 100 6389b17223SAlexander Motin static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE; 64af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN, 6589b17223SAlexander Motin &g_raid1_rebuild_cluster_idle, 0, 6689b17223SAlexander Motin "Number of slabs to do each time we trigger a rebuild cycle"); 6789b17223SAlexander Motin 6889b17223SAlexander Motin #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */ 6989b17223SAlexander Motin static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE; 70af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN, 7189b17223SAlexander Motin &g_raid1_rebuild_meta_update, 0, 7289b17223SAlexander Motin "When to update the meta data."); 7389b17223SAlexander Motin 7489b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data"); 7589b17223SAlexander Motin 7689b17223SAlexander Motin #define TR_RAID1_NONE 0 7789b17223SAlexander Motin #define TR_RAID1_REBUILD 1 7889b17223SAlexander Motin #define TR_RAID1_RESYNC 2 7989b17223SAlexander Motin 8089b17223SAlexander Motin #define TR_RAID1_F_DOING_SOME 0x1 8189b17223SAlexander Motin #define TR_RAID1_F_LOCKED 0x2 8289b17223SAlexander Motin #define TR_RAID1_F_ABORT 0x4 8389b17223SAlexander Motin 8489b17223SAlexander Motin struct g_raid_tr_raid1_object { 8589b17223SAlexander Motin struct g_raid_tr_object trso_base; 8689b17223SAlexander Motin int trso_starting; 8789b17223SAlexander Motin int trso_stopping; 8889b17223SAlexander Motin int trso_type; 8989b17223SAlexander Motin int trso_recover_slabs; /* slabs before rest */ 9089b17223SAlexander Motin int trso_fair_io; 9189b17223SAlexander Motin int trso_meta_update; 9289b17223SAlexander Motin int trso_flags; 9389b17223SAlexander Motin struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 9489b17223SAlexander Motin void *trso_buffer; /* Buffer space */ 9589b17223SAlexander Motin struct bio trso_bio; 9689b17223SAlexander Motin }; 9789b17223SAlexander Motin 9889b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1; 9989b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1; 10089b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1; 10189b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1; 10289b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1; 10389b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1; 10489b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1; 10589b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1; 10689b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1; 10789b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1; 10889b17223SAlexander Motin 10989b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1_methods[] = { 11089b17223SAlexander Motin KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1), 11189b17223SAlexander Motin KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1), 11289b17223SAlexander Motin KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1), 11389b17223SAlexander Motin KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1), 11489b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1), 11589b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1), 11689b17223SAlexander Motin KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1), 11789b17223SAlexander Motin KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1), 11889b17223SAlexander Motin KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1), 11989b17223SAlexander Motin KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1), 12089b17223SAlexander Motin { 0, 0 } 12189b17223SAlexander Motin }; 12289b17223SAlexander Motin 12389b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1_class = { 12489b17223SAlexander Motin "RAID1", 12589b17223SAlexander Motin g_raid_tr_raid1_methods, 12689b17223SAlexander Motin sizeof(struct g_raid_tr_raid1_object), 127c89d2fbeSAlexander Motin .trc_enable = 1, 128b43560abSAlexander Motin .trc_priority = 100, 129b43560abSAlexander Motin .trc_accept_unmapped = 1 13089b17223SAlexander Motin }; 13189b17223SAlexander Motin 13289b17223SAlexander Motin static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr); 13389b17223SAlexander Motin static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 13489b17223SAlexander Motin struct g_raid_subdisk *sd); 13589b17223SAlexander Motin 13689b17223SAlexander Motin static int 13789b17223SAlexander Motin g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 13889b17223SAlexander Motin { 13989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 14089b17223SAlexander Motin 14189b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 14289b17223SAlexander Motin if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 || 143dbb2e755SAlexander Motin (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM && 144dbb2e755SAlexander Motin tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM)) 14589b17223SAlexander Motin return (G_RAID_TR_TASTE_FAIL); 14689b17223SAlexander Motin trs->trso_starting = 1; 14789b17223SAlexander Motin return (G_RAID_TR_TASTE_SUCCEED); 14889b17223SAlexander Motin } 14989b17223SAlexander Motin 15089b17223SAlexander Motin static int 15189b17223SAlexander Motin g_raid_tr_update_state_raid1(struct g_raid_volume *vol, 15289b17223SAlexander Motin struct g_raid_subdisk *sd) 15389b17223SAlexander Motin { 15489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 15589b17223SAlexander Motin struct g_raid_softc *sc; 15689b17223SAlexander Motin struct g_raid_subdisk *tsd, *bestsd; 15789b17223SAlexander Motin u_int s; 15889b17223SAlexander Motin int i, na, ns; 15989b17223SAlexander Motin 16089b17223SAlexander Motin sc = vol->v_softc; 16189b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)vol->v_tr; 16289b17223SAlexander Motin if (trs->trso_stopping && 16389b17223SAlexander Motin (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0) 16489b17223SAlexander Motin s = G_RAID_VOLUME_S_STOPPED; 16589b17223SAlexander Motin else if (trs->trso_starting) 16689b17223SAlexander Motin s = G_RAID_VOLUME_S_STARTING; 16789b17223SAlexander Motin else { 16889b17223SAlexander Motin /* Make sure we have at least one ACTIVE disk. */ 16989b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 17089b17223SAlexander Motin if (na == 0) { 17189b17223SAlexander Motin /* 17289b17223SAlexander Motin * Critical situation! We have no any active disk! 17389b17223SAlexander Motin * Choose the best disk we have to make it active. 17489b17223SAlexander Motin */ 17589b17223SAlexander Motin bestsd = &vol->v_subdisks[0]; 17689b17223SAlexander Motin for (i = 1; i < vol->v_disks_count; i++) { 17789b17223SAlexander Motin tsd = &vol->v_subdisks[i]; 17889b17223SAlexander Motin if (tsd->sd_state > bestsd->sd_state) 17989b17223SAlexander Motin bestsd = tsd; 18089b17223SAlexander Motin else if (tsd->sd_state == bestsd->sd_state && 18189b17223SAlexander Motin (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD || 18289b17223SAlexander Motin tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 18389b17223SAlexander Motin tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 18489b17223SAlexander Motin bestsd = tsd; 18589b17223SAlexander Motin } 18689b17223SAlexander Motin if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) { 18789b17223SAlexander Motin /* We found reasonable candidate. */ 18889b17223SAlexander Motin G_RAID_DEBUG1(1, sc, 18989b17223SAlexander Motin "Promote subdisk %s:%d from %s to ACTIVE.", 19089b17223SAlexander Motin vol->v_name, bestsd->sd_pos, 19189b17223SAlexander Motin g_raid_subdisk_state2str(bestsd->sd_state)); 19289b17223SAlexander Motin g_raid_change_subdisk_state(bestsd, 19389b17223SAlexander Motin G_RAID_SUBDISK_S_ACTIVE); 19489b17223SAlexander Motin g_raid_write_metadata(sc, 19589b17223SAlexander Motin vol, bestsd, bestsd->sd_disk); 19689b17223SAlexander Motin } 19789b17223SAlexander Motin } 19889b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 19989b17223SAlexander Motin ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 20089b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 20189b17223SAlexander Motin if (na == vol->v_disks_count) 20289b17223SAlexander Motin s = G_RAID_VOLUME_S_OPTIMAL; 20389b17223SAlexander Motin else if (na + ns == vol->v_disks_count) 20489b17223SAlexander Motin s = G_RAID_VOLUME_S_SUBOPTIMAL; 20589b17223SAlexander Motin else if (na > 0) 20689b17223SAlexander Motin s = G_RAID_VOLUME_S_DEGRADED; 20789b17223SAlexander Motin else 20889b17223SAlexander Motin s = G_RAID_VOLUME_S_BROKEN; 20989b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd); 21089b17223SAlexander Motin } 21189b17223SAlexander Motin if (s != vol->v_state) { 21289b17223SAlexander Motin g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 21389b17223SAlexander Motin G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 21489b17223SAlexander Motin G_RAID_EVENT_VOLUME); 21589b17223SAlexander Motin g_raid_change_volume_state(vol, s); 21689b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping) 21789b17223SAlexander Motin g_raid_write_metadata(sc, vol, NULL, NULL); 21889b17223SAlexander Motin } 21989b17223SAlexander Motin return (0); 22089b17223SAlexander Motin } 22189b17223SAlexander Motin 22289b17223SAlexander Motin static void 22389b17223SAlexander Motin g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 22489b17223SAlexander Motin struct g_raid_disk *disk) 22589b17223SAlexander Motin { 22689b17223SAlexander Motin /* 22789b17223SAlexander Motin * We don't fail the last disk in the pack, since it still has decent 22889b17223SAlexander Motin * data on it and that's better than failing the disk if it is the root 22989b17223SAlexander Motin * file system. 23089b17223SAlexander Motin * 23189b17223SAlexander Motin * XXX should this be controlled via a tunable? It makes sense for 23289b17223SAlexander Motin * the volume that has / on it. I can't think of a case where we'd 23389b17223SAlexander Motin * want the volume to go away on this kind of event. 23489b17223SAlexander Motin */ 23589b17223SAlexander Motin if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 && 23689b17223SAlexander Motin g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd) 23789b17223SAlexander Motin return; 23889b17223SAlexander Motin g_raid_fail_disk(sc, sd, disk); 23989b17223SAlexander Motin } 24089b17223SAlexander Motin 24189b17223SAlexander Motin static void 24289b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr) 24389b17223SAlexander Motin { 24489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 24589b17223SAlexander Motin struct g_raid_subdisk *sd, *good_sd; 24689b17223SAlexander Motin struct bio *bp; 24789b17223SAlexander Motin 24889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 24989b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) 25089b17223SAlexander Motin return; 25189b17223SAlexander Motin sd = trs->trso_failed_sd; 25289b17223SAlexander Motin good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE); 25389b17223SAlexander Motin if (good_sd == NULL) { 25489b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 25589b17223SAlexander Motin return; 25689b17223SAlexander Motin } 25789b17223SAlexander Motin bp = &trs->trso_bio; 25889b17223SAlexander Motin memset(bp, 0, sizeof(*bp)); 25989b17223SAlexander Motin bp->bio_offset = sd->sd_rebuild_pos; 26089b17223SAlexander Motin bp->bio_length = MIN(g_raid1_rebuild_slab, 26189b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 26289b17223SAlexander Motin bp->bio_data = trs->trso_buffer; 26389b17223SAlexander Motin bp->bio_cmd = BIO_READ; 26489b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 26589b17223SAlexander Motin bp->bio_caller1 = good_sd; 26689b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_DOING_SOME; 26789b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_LOCKED; 26889b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */ 26989b17223SAlexander Motin bp->bio_offset, bp->bio_length, NULL, bp); 27089b17223SAlexander Motin } 27189b17223SAlexander Motin 27289b17223SAlexander Motin static void 27389b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs) 27489b17223SAlexander Motin { 27589b17223SAlexander Motin struct g_raid_volume *vol; 27689b17223SAlexander Motin struct g_raid_subdisk *sd; 27789b17223SAlexander Motin 27889b17223SAlexander Motin vol = trs->trso_base.tro_volume; 27989b17223SAlexander Motin sd = trs->trso_failed_sd; 28089b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 28189b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 28289b17223SAlexander Motin trs->trso_buffer = NULL; 28389b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 28489b17223SAlexander Motin trs->trso_type = TR_RAID1_NONE; 28589b17223SAlexander Motin trs->trso_recover_slabs = 0; 28689b17223SAlexander Motin trs->trso_failed_sd = NULL; 28789b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 28889b17223SAlexander Motin } 28989b17223SAlexander Motin 29089b17223SAlexander Motin static void 29189b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr) 29289b17223SAlexander Motin { 29389b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 29489b17223SAlexander Motin struct g_raid_subdisk *sd; 29589b17223SAlexander Motin 29689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 29789b17223SAlexander Motin sd = trs->trso_failed_sd; 29889b17223SAlexander Motin G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 29989b17223SAlexander Motin "Subdisk %s:%d-%s rebuild completed.", 30089b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 30189b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 30289b17223SAlexander Motin g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 30389b17223SAlexander Motin sd->sd_rebuild_pos = 0; 30489b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 30589b17223SAlexander Motin } 30689b17223SAlexander Motin 30789b17223SAlexander Motin static void 30889b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr) 30989b17223SAlexander Motin { 31089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 31189b17223SAlexander Motin struct g_raid_subdisk *sd; 31289b17223SAlexander Motin struct g_raid_volume *vol; 31389b17223SAlexander Motin off_t len; 31489b17223SAlexander Motin 31589b17223SAlexander Motin vol = tr->tro_volume; 31689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 31789b17223SAlexander Motin sd = trs->trso_failed_sd; 31889b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) { 31989b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 32089b17223SAlexander Motin "Subdisk %s:%d-%s rebuild is aborting.", 32189b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 32289b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 32389b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_ABORT; 32489b17223SAlexander Motin } else { 32589b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 32689b17223SAlexander Motin "Subdisk %s:%d-%s rebuild aborted.", 32789b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos, 32889b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 32989b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_ABORT; 33089b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_LOCKED) { 33189b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 33289b17223SAlexander Motin len = MIN(g_raid1_rebuild_slab, 33389b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos); 33489b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume, 33589b17223SAlexander Motin sd->sd_rebuild_pos, len); 33689b17223SAlexander Motin } 33789b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs); 33889b17223SAlexander Motin } 33989b17223SAlexander Motin } 34089b17223SAlexander Motin 34189b17223SAlexander Motin static void 34289b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr) 34389b17223SAlexander Motin { 34489b17223SAlexander Motin struct g_raid_volume *vol; 34589b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 34689b17223SAlexander Motin struct g_raid_subdisk *sd, *fsd; 34789b17223SAlexander Motin 34889b17223SAlexander Motin vol = tr->tro_volume; 34989b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 35089b17223SAlexander Motin if (trs->trso_failed_sd) { 35189b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 35289b17223SAlexander Motin "Already rebuild in start rebuild. pos %jd\n", 35389b17223SAlexander Motin (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 35489b17223SAlexander Motin return; 35589b17223SAlexander Motin } 35689b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE); 35789b17223SAlexander Motin if (sd == NULL) { 35889b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 35989b17223SAlexander Motin "No active disk to rebuild. night night."); 36089b17223SAlexander Motin return; 36189b17223SAlexander Motin } 36289b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 36389b17223SAlexander Motin if (fsd == NULL) 36489b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 36589b17223SAlexander Motin if (fsd == NULL) { 36689b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 36789b17223SAlexander Motin if (fsd != NULL) { 36889b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 36989b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 37089b17223SAlexander Motin G_RAID_SUBDISK_S_RESYNC); 37189b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, fsd, NULL); 37289b17223SAlexander Motin } else { 37389b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 37489b17223SAlexander Motin G_RAID_SUBDISK_S_UNINITIALIZED); 37589b17223SAlexander Motin if (fsd == NULL) 37689b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, 37789b17223SAlexander Motin G_RAID_SUBDISK_S_NEW); 37889b17223SAlexander Motin if (fsd != NULL) { 37989b17223SAlexander Motin fsd->sd_rebuild_pos = 0; 38089b17223SAlexander Motin g_raid_change_subdisk_state(fsd, 38189b17223SAlexander Motin G_RAID_SUBDISK_S_REBUILD); 38289b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 38389b17223SAlexander Motin vol, fsd, NULL); 38489b17223SAlexander Motin } 38589b17223SAlexander Motin } 38689b17223SAlexander Motin } 38789b17223SAlexander Motin if (fsd == NULL) { 38889b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc, 38989b17223SAlexander Motin "No failed disk to rebuild. night night."); 39089b17223SAlexander Motin return; 39189b17223SAlexander Motin } 39289b17223SAlexander Motin trs->trso_failed_sd = fsd; 39389b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc, 39489b17223SAlexander Motin "Subdisk %s:%d-%s rebuild start at %jd.", 39589b17223SAlexander Motin fsd->sd_volume->v_name, fsd->sd_pos, 39689b17223SAlexander Motin fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]", 39789b17223SAlexander Motin trs->trso_failed_sd->sd_rebuild_pos); 39889b17223SAlexander Motin trs->trso_type = TR_RAID1_REBUILD; 39989b17223SAlexander Motin trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK); 40089b17223SAlexander Motin trs->trso_meta_update = g_raid1_rebuild_meta_update; 40189b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 40289b17223SAlexander Motin } 40389b17223SAlexander Motin 40489b17223SAlexander Motin 40589b17223SAlexander Motin static void 40689b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr, 40789b17223SAlexander Motin struct g_raid_subdisk *sd) 40889b17223SAlexander Motin { 40989b17223SAlexander Motin struct g_raid_volume *vol; 41089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 41189b17223SAlexander Motin int na, nr; 41289b17223SAlexander Motin 41389b17223SAlexander Motin /* 41489b17223SAlexander Motin * If we're stopping, don't do anything. If we don't have at least one 41589b17223SAlexander Motin * good disk and one bad disk, we don't do anything. And if there's a 41689b17223SAlexander Motin * 'good disk' stored in the trs, then we're in progress and we punt. 41789b17223SAlexander Motin * If we make it past all these checks, we need to rebuild. 41889b17223SAlexander Motin */ 41989b17223SAlexander Motin vol = tr->tro_volume; 42089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 42189b17223SAlexander Motin if (trs->trso_stopping) 42289b17223SAlexander Motin return; 42389b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 42489b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 42589b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 42689b17223SAlexander Motin switch(trs->trso_type) { 42789b17223SAlexander Motin case TR_RAID1_NONE: 42889b17223SAlexander Motin if (na == 0) 42989b17223SAlexander Motin return; 43089b17223SAlexander Motin if (nr == 0) { 43189b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 43289b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 43389b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 43489b17223SAlexander Motin if (nr == 0) 43589b17223SAlexander Motin return; 43689b17223SAlexander Motin } 43789b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(tr); 43889b17223SAlexander Motin break; 43989b17223SAlexander Motin case TR_RAID1_REBUILD: 44089b17223SAlexander Motin if (na == 0 || nr == 0 || trs->trso_failed_sd == sd) 44189b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 44289b17223SAlexander Motin break; 44389b17223SAlexander Motin case TR_RAID1_RESYNC: 44489b17223SAlexander Motin break; 44589b17223SAlexander Motin } 44689b17223SAlexander Motin } 44789b17223SAlexander Motin 44889b17223SAlexander Motin static int 44989b17223SAlexander Motin g_raid_tr_event_raid1(struct g_raid_tr_object *tr, 45089b17223SAlexander Motin struct g_raid_subdisk *sd, u_int event) 45189b17223SAlexander Motin { 45289b17223SAlexander Motin 45389b17223SAlexander Motin g_raid_tr_update_state_raid1(tr->tro_volume, sd); 45489b17223SAlexander Motin return (0); 45589b17223SAlexander Motin } 45689b17223SAlexander Motin 45789b17223SAlexander Motin static int 45889b17223SAlexander Motin g_raid_tr_start_raid1(struct g_raid_tr_object *tr) 45989b17223SAlexander Motin { 46089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 46189b17223SAlexander Motin struct g_raid_volume *vol; 46289b17223SAlexander Motin 46389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 46489b17223SAlexander Motin vol = tr->tro_volume; 46589b17223SAlexander Motin trs->trso_starting = 0; 46689b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 46789b17223SAlexander Motin return (0); 46889b17223SAlexander Motin } 46989b17223SAlexander Motin 47089b17223SAlexander Motin static int 47189b17223SAlexander Motin g_raid_tr_stop_raid1(struct g_raid_tr_object *tr) 47289b17223SAlexander Motin { 47389b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 47489b17223SAlexander Motin struct g_raid_volume *vol; 47589b17223SAlexander Motin 47689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 47789b17223SAlexander Motin vol = tr->tro_volume; 47889b17223SAlexander Motin trs->trso_starting = 0; 47989b17223SAlexander Motin trs->trso_stopping = 1; 48089b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL); 48189b17223SAlexander Motin return (0); 48289b17223SAlexander Motin } 48389b17223SAlexander Motin 48489b17223SAlexander Motin /* 48589b17223SAlexander Motin * Select the disk to read from. Take into account: subdisk state, running 48689b17223SAlexander Motin * error recovery, average disk load, head position and possible cache hits. 48789b17223SAlexander Motin */ 48889b17223SAlexander Motin #define ABS(x) (((x) >= 0) ? (x) : (-(x))) 48989b17223SAlexander Motin static struct g_raid_subdisk * 49089b17223SAlexander Motin g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp, 49189b17223SAlexander Motin u_int mask) 49289b17223SAlexander Motin { 49389b17223SAlexander Motin struct g_raid_subdisk *sd, *best; 49489b17223SAlexander Motin int i, prio, bestprio; 49589b17223SAlexander Motin 49689b17223SAlexander Motin best = NULL; 49789b17223SAlexander Motin bestprio = INT_MAX; 49889b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 49989b17223SAlexander Motin sd = &vol->v_subdisks[i]; 50089b17223SAlexander Motin if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && 50189b17223SAlexander Motin ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD && 50289b17223SAlexander Motin sd->sd_state != G_RAID_SUBDISK_S_RESYNC) || 50389b17223SAlexander Motin bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos)) 50489b17223SAlexander Motin continue; 50589b17223SAlexander Motin if ((mask & (1 << i)) != 0) 50689b17223SAlexander Motin continue; 50789b17223SAlexander Motin prio = G_RAID_SUBDISK_LOAD(sd); 50889b17223SAlexander Motin prio += min(sd->sd_recovery, 255) << 22; 50989b17223SAlexander Motin prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16; 51089b17223SAlexander Motin /* If disk head is precisely in position - highly prefer it. */ 51189b17223SAlexander Motin if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset) 51289b17223SAlexander Motin prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 51389b17223SAlexander Motin else 51489b17223SAlexander Motin /* If disk head is close to position - prefer it. */ 51589b17223SAlexander Motin if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) < 51689b17223SAlexander Motin G_RAID_SUBDISK_TRACK_SIZE) 51789b17223SAlexander Motin prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 51889b17223SAlexander Motin if (prio < bestprio) { 51989b17223SAlexander Motin best = sd; 52089b17223SAlexander Motin bestprio = prio; 52189b17223SAlexander Motin } 52289b17223SAlexander Motin } 52389b17223SAlexander Motin return (best); 52489b17223SAlexander Motin } 52589b17223SAlexander Motin 52689b17223SAlexander Motin static void 52789b17223SAlexander Motin g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp) 52889b17223SAlexander Motin { 52989b17223SAlexander Motin struct g_raid_subdisk *sd; 53089b17223SAlexander Motin struct bio *cbp; 53189b17223SAlexander Motin 53289b17223SAlexander Motin sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0); 53389b17223SAlexander Motin KASSERT(sd != NULL, ("No active disks in volume %s.", 53489b17223SAlexander Motin tr->tro_volume->v_name)); 53589b17223SAlexander Motin 53689b17223SAlexander Motin cbp = g_clone_bio(bp); 53789b17223SAlexander Motin if (cbp == NULL) { 53889b17223SAlexander Motin g_raid_iodone(bp, ENOMEM); 53989b17223SAlexander Motin return; 54089b17223SAlexander Motin } 54189b17223SAlexander Motin 54289b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 54389b17223SAlexander Motin } 54489b17223SAlexander Motin 54589b17223SAlexander Motin static void 54689b17223SAlexander Motin g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp) 54789b17223SAlexander Motin { 54889b17223SAlexander Motin struct g_raid_volume *vol; 54989b17223SAlexander Motin struct g_raid_subdisk *sd; 55089b17223SAlexander Motin struct bio_queue_head queue; 55189b17223SAlexander Motin struct bio *cbp; 55289b17223SAlexander Motin int i; 55389b17223SAlexander Motin 55489b17223SAlexander Motin vol = tr->tro_volume; 55589b17223SAlexander Motin 55689b17223SAlexander Motin /* 55789b17223SAlexander Motin * Allocate all bios before sending any request, so we can return 55889b17223SAlexander Motin * ENOMEM in nice and clean way. 55989b17223SAlexander Motin */ 56089b17223SAlexander Motin bioq_init(&queue); 56189b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 56289b17223SAlexander Motin sd = &vol->v_subdisks[i]; 56389b17223SAlexander Motin switch (sd->sd_state) { 56489b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 56589b17223SAlexander Motin break; 56689b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 56789b17223SAlexander Motin /* 56889b17223SAlexander Motin * When rebuilding, only part of this subdisk is 56989b17223SAlexander Motin * writable, the rest will be written as part of the 57089b17223SAlexander Motin * that process. 57189b17223SAlexander Motin */ 57289b17223SAlexander Motin if (bp->bio_offset >= sd->sd_rebuild_pos) 57389b17223SAlexander Motin continue; 57489b17223SAlexander Motin break; 57589b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 57689b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 57789b17223SAlexander Motin /* 57889b17223SAlexander Motin * Resyncing still writes on the theory that the 57989b17223SAlexander Motin * resync'd disk is very close and writing it will 58089b17223SAlexander Motin * keep it that way better if we keep up while 58189b17223SAlexander Motin * resyncing. 58289b17223SAlexander Motin */ 58389b17223SAlexander Motin break; 58489b17223SAlexander Motin default: 58589b17223SAlexander Motin continue; 58689b17223SAlexander Motin } 58789b17223SAlexander Motin cbp = g_clone_bio(bp); 58889b17223SAlexander Motin if (cbp == NULL) 58989b17223SAlexander Motin goto failure; 59089b17223SAlexander Motin cbp->bio_caller1 = sd; 59189b17223SAlexander Motin bioq_insert_tail(&queue, cbp); 59289b17223SAlexander Motin } 593b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) { 59489b17223SAlexander Motin sd = cbp->bio_caller1; 59589b17223SAlexander Motin cbp->bio_caller1 = NULL; 59689b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp); 59789b17223SAlexander Motin } 59889b17223SAlexander Motin return; 59989b17223SAlexander Motin failure: 600b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) 60189b17223SAlexander Motin g_destroy_bio(cbp); 60289b17223SAlexander Motin if (bp->bio_error == 0) 60389b17223SAlexander Motin bp->bio_error = ENOMEM; 60489b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error); 60589b17223SAlexander Motin } 60689b17223SAlexander Motin 60789b17223SAlexander Motin static void 60889b17223SAlexander Motin g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp) 60989b17223SAlexander Motin { 61089b17223SAlexander Motin struct g_raid_volume *vol; 61189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 61289b17223SAlexander Motin 61389b17223SAlexander Motin vol = tr->tro_volume; 61489b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 61589b17223SAlexander Motin if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 61689b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 61789b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 61889b17223SAlexander Motin g_raid_iodone(bp, EIO); 61989b17223SAlexander Motin return; 62089b17223SAlexander Motin } 62189b17223SAlexander Motin /* 62289b17223SAlexander Motin * If we're rebuilding, squeeze in rebuild activity every so often, 62389b17223SAlexander Motin * even when the disk is busy. Be sure to only count real I/O 62489b17223SAlexander Motin * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 62589b17223SAlexander Motin * by this module. 62689b17223SAlexander Motin */ 62789b17223SAlexander Motin if (trs->trso_failed_sd != NULL && 62889b17223SAlexander Motin !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 62989b17223SAlexander Motin /* Make this new or running now round short. */ 63089b17223SAlexander Motin trs->trso_recover_slabs = 0; 63189b17223SAlexander Motin if (--trs->trso_fair_io <= 0) { 63289b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 63389b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 63489b17223SAlexander Motin } 63589b17223SAlexander Motin } 63689b17223SAlexander Motin switch (bp->bio_cmd) { 63789b17223SAlexander Motin case BIO_READ: 63889b17223SAlexander Motin g_raid_tr_iostart_raid1_read(tr, bp); 63989b17223SAlexander Motin break; 64089b17223SAlexander Motin case BIO_WRITE: 64189b17223SAlexander Motin case BIO_DELETE: 642609a7474SAlexander Motin g_raid_tr_iostart_raid1_write(tr, bp); 64389b17223SAlexander Motin break; 644*8b522bdaSWarner Losh case BIO_SPEEDUP: 64589b17223SAlexander Motin case BIO_FLUSH: 64689b17223SAlexander Motin g_raid_tr_flush_common(tr, bp); 64789b17223SAlexander Motin break; 64889b17223SAlexander Motin default: 64989b17223SAlexander Motin KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 65089b17223SAlexander Motin bp->bio_cmd, vol->v_name)); 65189b17223SAlexander Motin break; 65289b17223SAlexander Motin } 65389b17223SAlexander Motin } 65489b17223SAlexander Motin 65589b17223SAlexander Motin static void 65689b17223SAlexander Motin g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr, 65789b17223SAlexander Motin struct g_raid_subdisk *sd, struct bio *bp) 65889b17223SAlexander Motin { 65989b17223SAlexander Motin struct bio *cbp; 66089b17223SAlexander Motin struct g_raid_subdisk *nsd; 66189b17223SAlexander Motin struct g_raid_volume *vol; 66289b17223SAlexander Motin struct bio *pbp; 66389b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 66489b17223SAlexander Motin uintptr_t *mask; 66589b17223SAlexander Motin int error, do_write; 66689b17223SAlexander Motin 66789b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 66889b17223SAlexander Motin vol = tr->tro_volume; 66989b17223SAlexander Motin if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 67089b17223SAlexander Motin /* 67189b17223SAlexander Motin * This operation is part of a rebuild or resync operation. 67289b17223SAlexander Motin * See what work just got done, then schedule the next bit of 67389b17223SAlexander Motin * work, if any. Rebuild/resync is done a little bit at a 67489b17223SAlexander Motin * time. Either when a timeout happens, or after we get a 67589b17223SAlexander Motin * bunch of I/Os to the disk (to make sure an active system 67689b17223SAlexander Motin * will complete in a sane amount of time). 67789b17223SAlexander Motin * 67889b17223SAlexander Motin * We are setup to do differing amounts of work for each of 67989b17223SAlexander Motin * these cases. so long as the slabs is smallish (less than 68089b17223SAlexander Motin * 50 or so, I'd guess, but that's just a WAG), we shouldn't 68189b17223SAlexander Motin * have any bio starvation issues. For active disks, we do 68289b17223SAlexander Motin * 5MB of data, for inactive ones, we do 50MB. 68389b17223SAlexander Motin */ 68489b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) { 68589b17223SAlexander Motin if (bp->bio_cmd == BIO_READ) { 68689b17223SAlexander Motin 68789b17223SAlexander Motin /* Immediately abort rebuild, if requested. */ 68889b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_ABORT) { 68989b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 69089b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 69189b17223SAlexander Motin return; 69289b17223SAlexander Motin } 69389b17223SAlexander Motin 69489b17223SAlexander Motin /* On read error, skip and cross fingers. */ 69589b17223SAlexander Motin if (bp->bio_error != 0) { 69689b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 69789b17223SAlexander Motin "Read error during rebuild (%d), " 69889b17223SAlexander Motin "possible data loss!", 69989b17223SAlexander Motin bp->bio_error); 70089b17223SAlexander Motin goto rebuild_round_done; 70189b17223SAlexander Motin } 70289b17223SAlexander Motin 70389b17223SAlexander Motin /* 70489b17223SAlexander Motin * The read operation finished, queue the 70589b17223SAlexander Motin * write and get out. 70689b17223SAlexander Motin */ 70789b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "rebuild read done. %d", 70889b17223SAlexander Motin bp->bio_error); 70989b17223SAlexander Motin bp->bio_cmd = BIO_WRITE; 71089b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 71189b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "Queueing rebuild write."); 71289b17223SAlexander Motin g_raid_subdisk_iostart(trs->trso_failed_sd, bp); 71389b17223SAlexander Motin } else { 71489b17223SAlexander Motin /* 71589b17223SAlexander Motin * The write operation just finished. Do 71689b17223SAlexander Motin * another. We keep cloning the master bio 71789b17223SAlexander Motin * since it has the right buffers allocated to 71889b17223SAlexander Motin * it. 71989b17223SAlexander Motin */ 72089b17223SAlexander Motin G_RAID_LOGREQ(4, bp, 72189b17223SAlexander Motin "rebuild write done. Error %d", 72289b17223SAlexander Motin bp->bio_error); 72389b17223SAlexander Motin nsd = trs->trso_failed_sd; 72489b17223SAlexander Motin if (bp->bio_error != 0 || 72589b17223SAlexander Motin trs->trso_flags & TR_RAID1_F_ABORT) { 72689b17223SAlexander Motin if ((trs->trso_flags & 72789b17223SAlexander Motin TR_RAID1_F_ABORT) == 0) { 72889b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, 72989b17223SAlexander Motin nsd, nsd->sd_disk); 73089b17223SAlexander Motin } 73189b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 73289b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 73389b17223SAlexander Motin return; 73489b17223SAlexander Motin } 73589b17223SAlexander Motin rebuild_round_done: 73689b17223SAlexander Motin nsd = trs->trso_failed_sd; 73789b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED; 73889b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, 73989b17223SAlexander Motin bp->bio_offset, bp->bio_length); 74089b17223SAlexander Motin nsd->sd_rebuild_pos += bp->bio_length; 74189b17223SAlexander Motin if (nsd->sd_rebuild_pos >= nsd->sd_size) { 74289b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(tr); 74389b17223SAlexander Motin return; 74489b17223SAlexander Motin } 74589b17223SAlexander Motin 74689b17223SAlexander Motin /* Abort rebuild if we are stopping */ 74789b17223SAlexander Motin if (trs->trso_stopping) { 74889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 74989b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr); 75089b17223SAlexander Motin return; 75189b17223SAlexander Motin } 75289b17223SAlexander Motin 75389b17223SAlexander Motin if (--trs->trso_meta_update <= 0) { 75489b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, 75589b17223SAlexander Motin vol, nsd, nsd->sd_disk); 75689b17223SAlexander Motin trs->trso_meta_update = 75789b17223SAlexander Motin g_raid1_rebuild_meta_update; 75889b17223SAlexander Motin } 75989b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME; 76089b17223SAlexander Motin if (--trs->trso_recover_slabs <= 0) 76189b17223SAlexander Motin return; 76289b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 76389b17223SAlexander Motin } 76489b17223SAlexander Motin } else if (trs->trso_type == TR_RAID1_RESYNC) { 76589b17223SAlexander Motin /* 76689b17223SAlexander Motin * read good sd, read bad sd in parallel. when both 76789b17223SAlexander Motin * done, compare the buffers. write good to the bad 76889b17223SAlexander Motin * if different. do the next bit of work. 76989b17223SAlexander Motin */ 77089b17223SAlexander Motin panic("Somehow, we think we're doing a resync"); 77189b17223SAlexander Motin } 77289b17223SAlexander Motin return; 77389b17223SAlexander Motin } 77489b17223SAlexander Motin pbp = bp->bio_parent; 77589b17223SAlexander Motin pbp->bio_inbed++; 77689b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { 77789b17223SAlexander Motin /* 77889b17223SAlexander Motin * Read failed on first drive. Retry the read error on 77989b17223SAlexander Motin * another disk drive, if available, before erroring out the 78089b17223SAlexander Motin * read. 78189b17223SAlexander Motin */ 78289b17223SAlexander Motin sd->sd_disk->d_read_errs++; 78389b17223SAlexander Motin G_RAID_LOGREQ(0, bp, 78489b17223SAlexander Motin "Read error (%d), %d read errors total", 78589b17223SAlexander Motin bp->bio_error, sd->sd_disk->d_read_errs); 78689b17223SAlexander Motin 78789b17223SAlexander Motin /* 78889b17223SAlexander Motin * If there are too many read errors, we move to degraded. 78989b17223SAlexander Motin * XXX Do we want to FAIL the drive (eg, make the user redo 79089b17223SAlexander Motin * everything to get it back in sync), or just degrade the 79189b17223SAlexander Motin * drive, which kicks off a resync? 79289b17223SAlexander Motin */ 79389b17223SAlexander Motin do_write = 1; 79489b17223SAlexander Motin if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) { 79589b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 79689b17223SAlexander Motin if (pbp->bio_children == 1) 79789b17223SAlexander Motin do_write = 0; 79889b17223SAlexander Motin } 79989b17223SAlexander Motin 80089b17223SAlexander Motin /* 80189b17223SAlexander Motin * Find the other disk, and try to do the I/O to it. 80289b17223SAlexander Motin */ 80389b17223SAlexander Motin mask = (uintptr_t *)(&pbp->bio_driver2); 80489b17223SAlexander Motin if (pbp->bio_children == 1) { 80589b17223SAlexander Motin /* Save original subdisk. */ 80689b17223SAlexander Motin pbp->bio_driver1 = do_write ? sd : NULL; 80789b17223SAlexander Motin *mask = 0; 80889b17223SAlexander Motin } 80989b17223SAlexander Motin *mask |= 1 << sd->sd_pos; 81089b17223SAlexander Motin nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask); 81189b17223SAlexander Motin if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) { 81289b17223SAlexander Motin g_destroy_bio(bp); 81389b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, "Retrying read from %d", 81489b17223SAlexander Motin nsd->sd_pos); 81589b17223SAlexander Motin if (pbp->bio_children == 2 && do_write) { 81689b17223SAlexander Motin sd->sd_recovery++; 81789b17223SAlexander Motin cbp->bio_caller1 = nsd; 81889b17223SAlexander Motin pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED; 81989b17223SAlexander Motin /* Lock callback starts I/O */ 82089b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, 82189b17223SAlexander Motin cbp->bio_offset, cbp->bio_length, pbp, cbp); 82289b17223SAlexander Motin } else { 82389b17223SAlexander Motin g_raid_subdisk_iostart(nsd, cbp); 82489b17223SAlexander Motin } 82589b17223SAlexander Motin return; 82689b17223SAlexander Motin } 82789b17223SAlexander Motin /* 82889b17223SAlexander Motin * We can't retry. Return the original error by falling 82989b17223SAlexander Motin * through. This will happen when there's only one good disk. 83089b17223SAlexander Motin * We don't need to fail the raid, since its actual state is 83189b17223SAlexander Motin * based on the state of the subdisks. 83289b17223SAlexander Motin */ 83389b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); 83489b17223SAlexander Motin } 83589b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && 83689b17223SAlexander Motin bp->bio_error == 0 && 83789b17223SAlexander Motin pbp->bio_children > 1 && 83889b17223SAlexander Motin pbp->bio_driver1 != NULL) { 83989b17223SAlexander Motin /* 84089b17223SAlexander Motin * If it was a read, and bio_children is >1, then we just 84189b17223SAlexander Motin * recovered the data from the second drive. We should try to 84289b17223SAlexander Motin * write that data to the first drive if sector remapping is 84389b17223SAlexander Motin * enabled. A write should put the data in a new place on the 84489b17223SAlexander Motin * disk, remapping the bad sector. Do we need to do that by 84589b17223SAlexander Motin * queueing a request to the main worker thread? It doesn't 84689b17223SAlexander Motin * affect the return code of this current read, and can be 847e8d57122SPedro F. Giffuni * done at our leisure. However, to make the code simpler, it 848e8d57122SPedro F. Giffuni * is done synchronously. 84989b17223SAlexander Motin */ 85089b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); 85189b17223SAlexander Motin cbp = g_clone_bio(pbp); 85289b17223SAlexander Motin if (cbp != NULL) { 85389b17223SAlexander Motin g_destroy_bio(bp); 85489b17223SAlexander Motin cbp->bio_cmd = BIO_WRITE; 85589b17223SAlexander Motin cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; 85689b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, 85789b17223SAlexander Motin "Attempting bad sector remap on failing drive."); 85889b17223SAlexander Motin g_raid_subdisk_iostart(pbp->bio_driver1, cbp); 85989b17223SAlexander Motin return; 86089b17223SAlexander Motin } 86189b17223SAlexander Motin } 86289b17223SAlexander Motin if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) { 86389b17223SAlexander Motin /* 86489b17223SAlexander Motin * We're done with a recovery, mark the range as unlocked. 865e8d57122SPedro F. Giffuni * For any write errors, we aggressively fail the disk since 86689b17223SAlexander Motin * there was both a READ and a WRITE error at this location. 86789b17223SAlexander Motin * Both types of errors generally indicates the drive is on 86889b17223SAlexander Motin * the verge of total failure anyway. Better to stop trusting 86989b17223SAlexander Motin * it now. However, we need to reset error to 0 in that case 87089b17223SAlexander Motin * because we're not failing the original I/O which succeeded. 87189b17223SAlexander Motin */ 87289b17223SAlexander Motin if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 87389b17223SAlexander Motin G_RAID_LOGREQ(0, bp, "Remap write failed: " 87489b17223SAlexander Motin "failing subdisk."); 87589b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 87689b17223SAlexander Motin bp->bio_error = 0; 87789b17223SAlexander Motin } 87889b17223SAlexander Motin if (pbp->bio_driver1 != NULL) { 87989b17223SAlexander Motin ((struct g_raid_subdisk *)pbp->bio_driver1) 88089b17223SAlexander Motin ->sd_recovery--; 88189b17223SAlexander Motin } 88289b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 88389b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, bp->bio_offset, 88489b17223SAlexander Motin bp->bio_length); 88589b17223SAlexander Motin } 886650e245eSAlexander Motin if (pbp->bio_cmd != BIO_READ) { 887ef844ef7SAlexander Motin if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 888ef844ef7SAlexander Motin pbp->bio_error = bp->bio_error; 889650e245eSAlexander Motin if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { 890ef844ef7SAlexander Motin G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 891ef844ef7SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk); 892ef844ef7SAlexander Motin } 893ef844ef7SAlexander Motin error = pbp->bio_error; 894ef844ef7SAlexander Motin } else 89589b17223SAlexander Motin error = bp->bio_error; 89689b17223SAlexander Motin g_destroy_bio(bp); 89789b17223SAlexander Motin if (pbp->bio_children == pbp->bio_inbed) { 89889b17223SAlexander Motin pbp->bio_completed = pbp->bio_length; 89989b17223SAlexander Motin g_raid_iodone(pbp, error); 90089b17223SAlexander Motin } 90189b17223SAlexander Motin } 90289b17223SAlexander Motin 90389b17223SAlexander Motin static int 90489b17223SAlexander Motin g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, 90589b17223SAlexander Motin void *virtual, vm_offset_t physical, off_t offset, size_t length) 90689b17223SAlexander Motin { 90789b17223SAlexander Motin struct g_raid_volume *vol; 90889b17223SAlexander Motin struct g_raid_subdisk *sd; 90989b17223SAlexander Motin int error, i, ok; 91089b17223SAlexander Motin 91189b17223SAlexander Motin vol = tr->tro_volume; 91289b17223SAlexander Motin error = 0; 91389b17223SAlexander Motin ok = 0; 91489b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) { 91589b17223SAlexander Motin sd = &vol->v_subdisks[i]; 91689b17223SAlexander Motin switch (sd->sd_state) { 91789b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE: 91889b17223SAlexander Motin break; 91989b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD: 92089b17223SAlexander Motin /* 92189b17223SAlexander Motin * When rebuilding, only part of this subdisk is 92289b17223SAlexander Motin * writable, the rest will be written as part of the 92389b17223SAlexander Motin * that process. 92489b17223SAlexander Motin */ 92589b17223SAlexander Motin if (offset >= sd->sd_rebuild_pos) 92689b17223SAlexander Motin continue; 92789b17223SAlexander Motin break; 92889b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE: 92989b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC: 93089b17223SAlexander Motin /* 93189b17223SAlexander Motin * Resyncing still writes on the theory that the 93289b17223SAlexander Motin * resync'd disk is very close and writing it will 93389b17223SAlexander Motin * keep it that way better if we keep up while 93489b17223SAlexander Motin * resyncing. 93589b17223SAlexander Motin */ 93689b17223SAlexander Motin break; 93789b17223SAlexander Motin default: 93889b17223SAlexander Motin continue; 93989b17223SAlexander Motin } 94089b17223SAlexander Motin error = g_raid_subdisk_kerneldump(sd, 94189b17223SAlexander Motin virtual, physical, offset, length); 94289b17223SAlexander Motin if (error == 0) 94389b17223SAlexander Motin ok++; 94489b17223SAlexander Motin } 94589b17223SAlexander Motin return (ok > 0 ? 0 : error); 94689b17223SAlexander Motin } 94789b17223SAlexander Motin 94889b17223SAlexander Motin static int 94989b17223SAlexander Motin g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp) 95089b17223SAlexander Motin { 95189b17223SAlexander Motin struct bio *bp; 95289b17223SAlexander Motin struct g_raid_subdisk *sd; 95389b17223SAlexander Motin 95489b17223SAlexander Motin bp = (struct bio *)argp; 95589b17223SAlexander Motin sd = (struct g_raid_subdisk *)bp->bio_caller1; 95689b17223SAlexander Motin g_raid_subdisk_iostart(sd, bp); 95789b17223SAlexander Motin 95889b17223SAlexander Motin return (0); 95989b17223SAlexander Motin } 96089b17223SAlexander Motin 96189b17223SAlexander Motin static int 96289b17223SAlexander Motin g_raid_tr_idle_raid1(struct g_raid_tr_object *tr) 96389b17223SAlexander Motin { 96489b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 96589b17223SAlexander Motin 96689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 96789b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io; 96889b17223SAlexander Motin trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle; 96989b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) 97089b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr); 97189b17223SAlexander Motin return (0); 97289b17223SAlexander Motin } 97389b17223SAlexander Motin 97489b17223SAlexander Motin static int 97589b17223SAlexander Motin g_raid_tr_free_raid1(struct g_raid_tr_object *tr) 97689b17223SAlexander Motin { 97789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs; 97889b17223SAlexander Motin 97989b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr; 98089b17223SAlexander Motin 98189b17223SAlexander Motin if (trs->trso_buffer != NULL) { 98289b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1); 98389b17223SAlexander Motin trs->trso_buffer = NULL; 98489b17223SAlexander Motin } 98589b17223SAlexander Motin return (0); 98689b17223SAlexander Motin } 98789b17223SAlexander Motin 988c89d2fbeSAlexander Motin G_RAID_TR_DECLARE(raid1, "RAID1"); 989