/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
2889b17223SAlexander Motin
2989b17223SAlexander Motin #include <sys/param.h>
3089b17223SAlexander Motin #include <sys/bio.h>
3189b17223SAlexander Motin #include <sys/endian.h>
3289b17223SAlexander Motin #include <sys/kernel.h>
3389b17223SAlexander Motin #include <sys/kobj.h>
3489b17223SAlexander Motin #include <sys/limits.h>
3589b17223SAlexander Motin #include <sys/lock.h>
3689b17223SAlexander Motin #include <sys/malloc.h>
3789b17223SAlexander Motin #include <sys/mutex.h>
3889b17223SAlexander Motin #include <sys/sysctl.h>
3989b17223SAlexander Motin #include <sys/systm.h>
4089b17223SAlexander Motin #include <geom/geom.h>
41ac03832eSConrad Meyer #include <geom/geom_dbg.h>
4289b17223SAlexander Motin #include "geom/raid/g_raid.h"
4389b17223SAlexander Motin #include "g_raid_tr_if.h"
4489b17223SAlexander Motin
45c89d2fbeSAlexander Motin SYSCTL_DECL(_kern_geom_raid_raid1);
4689b17223SAlexander Motin
4789b17223SAlexander Motin #define RAID1_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */
4889b17223SAlexander Motin static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
49af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
5089b17223SAlexander Motin &g_raid1_rebuild_slab, 0,
5189b17223SAlexander Motin "Amount of the disk to rebuild each read/write cycle of the rebuild.");
5289b17223SAlexander Motin
5389b17223SAlexander Motin #define RAID1_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
5489b17223SAlexander Motin static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
55af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
5689b17223SAlexander Motin &g_raid1_rebuild_fair_io, 0,
5789b17223SAlexander Motin "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
5889b17223SAlexander Motin
5989b17223SAlexander Motin #define RAID1_REBUILD_CLUSTER_IDLE 100
6089b17223SAlexander Motin static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
61af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
6289b17223SAlexander Motin &g_raid1_rebuild_cluster_idle, 0,
6389b17223SAlexander Motin "Number of slabs to do each time we trigger a rebuild cycle");
6489b17223SAlexander Motin
6589b17223SAlexander Motin #define RAID1_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
6689b17223SAlexander Motin static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
67af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
6889b17223SAlexander Motin &g_raid1_rebuild_meta_update, 0,
6989b17223SAlexander Motin "When to update the meta data.");
7089b17223SAlexander Motin
7189b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");
7289b17223SAlexander Motin
7389b17223SAlexander Motin #define TR_RAID1_NONE 0
7489b17223SAlexander Motin #define TR_RAID1_REBUILD 1
7589b17223SAlexander Motin #define TR_RAID1_RESYNC 2
7689b17223SAlexander Motin
7789b17223SAlexander Motin #define TR_RAID1_F_DOING_SOME 0x1
7889b17223SAlexander Motin #define TR_RAID1_F_LOCKED 0x2
7989b17223SAlexander Motin #define TR_RAID1_F_ABORT 0x4
8089b17223SAlexander Motin
8189b17223SAlexander Motin struct g_raid_tr_raid1_object {
8289b17223SAlexander Motin struct g_raid_tr_object trso_base;
8389b17223SAlexander Motin int trso_starting;
8489b17223SAlexander Motin int trso_stopping;
8589b17223SAlexander Motin int trso_type;
8689b17223SAlexander Motin int trso_recover_slabs; /* slabs before rest */
8789b17223SAlexander Motin int trso_fair_io;
8889b17223SAlexander Motin int trso_meta_update;
8989b17223SAlexander Motin int trso_flags;
9089b17223SAlexander Motin struct g_raid_subdisk *trso_failed_sd; /* like per volume */
9189b17223SAlexander Motin void *trso_buffer; /* Buffer space */
9289b17223SAlexander Motin struct bio trso_bio;
9389b17223SAlexander Motin };
9489b17223SAlexander Motin
9589b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1;
9689b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1;
9789b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1;
9889b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1;
9989b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
10089b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
10189b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
10289b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1;
10389b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1;
10489b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1;
10589b17223SAlexander Motin
10689b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1_methods[] = {
10789b17223SAlexander Motin KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1),
10889b17223SAlexander Motin KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1),
10989b17223SAlexander Motin KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1),
11089b17223SAlexander Motin KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1),
11189b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1),
11289b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1),
11389b17223SAlexander Motin KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
11489b17223SAlexander Motin KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1),
11589b17223SAlexander Motin KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1),
11689b17223SAlexander Motin KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1),
11789b17223SAlexander Motin { 0, 0 }
11889b17223SAlexander Motin };
11989b17223SAlexander Motin
12089b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1_class = {
12189b17223SAlexander Motin "RAID1",
12289b17223SAlexander Motin g_raid_tr_raid1_methods,
12389b17223SAlexander Motin sizeof(struct g_raid_tr_raid1_object),
124c89d2fbeSAlexander Motin .trc_enable = 1,
125b43560abSAlexander Motin .trc_priority = 100,
126b43560abSAlexander Motin .trc_accept_unmapped = 1
12789b17223SAlexander Motin };
12889b17223SAlexander Motin
/* Forward declarations: both helpers are called before they are defined. */
static void	g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
static void	g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
		    struct g_raid_subdisk *sd);
13289b17223SAlexander Motin
13389b17223SAlexander Motin static int
g_raid_tr_taste_raid1(struct g_raid_tr_object * tr,struct g_raid_volume * vol)13489b17223SAlexander Motin g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
13589b17223SAlexander Motin {
13689b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
13789b17223SAlexander Motin
13889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
13989b17223SAlexander Motin if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 ||
140dbb2e755SAlexander Motin (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM &&
141dbb2e755SAlexander Motin tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM))
14289b17223SAlexander Motin return (G_RAID_TR_TASTE_FAIL);
14389b17223SAlexander Motin trs->trso_starting = 1;
14489b17223SAlexander Motin return (G_RAID_TR_TASTE_SUCCEED);
14589b17223SAlexander Motin }
14689b17223SAlexander Motin
14789b17223SAlexander Motin static int
g_raid_tr_update_state_raid1(struct g_raid_volume * vol,struct g_raid_subdisk * sd)14889b17223SAlexander Motin g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
14989b17223SAlexander Motin struct g_raid_subdisk *sd)
15089b17223SAlexander Motin {
15189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
15289b17223SAlexander Motin struct g_raid_softc *sc;
15389b17223SAlexander Motin struct g_raid_subdisk *tsd, *bestsd;
15489b17223SAlexander Motin u_int s;
15589b17223SAlexander Motin int i, na, ns;
15689b17223SAlexander Motin
15789b17223SAlexander Motin sc = vol->v_softc;
15889b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
15989b17223SAlexander Motin if (trs->trso_stopping &&
16089b17223SAlexander Motin (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
16189b17223SAlexander Motin s = G_RAID_VOLUME_S_STOPPED;
16289b17223SAlexander Motin else if (trs->trso_starting)
16389b17223SAlexander Motin s = G_RAID_VOLUME_S_STARTING;
16489b17223SAlexander Motin else {
16589b17223SAlexander Motin /* Make sure we have at least one ACTIVE disk. */
16689b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
16789b17223SAlexander Motin if (na == 0) {
16889b17223SAlexander Motin /*
16989b17223SAlexander Motin * Critical situation! We have no any active disk!
17089b17223SAlexander Motin * Choose the best disk we have to make it active.
17189b17223SAlexander Motin */
17289b17223SAlexander Motin bestsd = &vol->v_subdisks[0];
17389b17223SAlexander Motin for (i = 1; i < vol->v_disks_count; i++) {
17489b17223SAlexander Motin tsd = &vol->v_subdisks[i];
17589b17223SAlexander Motin if (tsd->sd_state > bestsd->sd_state)
17689b17223SAlexander Motin bestsd = tsd;
17789b17223SAlexander Motin else if (tsd->sd_state == bestsd->sd_state &&
17889b17223SAlexander Motin (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
17989b17223SAlexander Motin tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
18089b17223SAlexander Motin tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
18189b17223SAlexander Motin bestsd = tsd;
18289b17223SAlexander Motin }
18389b17223SAlexander Motin if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
18489b17223SAlexander Motin /* We found reasonable candidate. */
18589b17223SAlexander Motin G_RAID_DEBUG1(1, sc,
18689b17223SAlexander Motin "Promote subdisk %s:%d from %s to ACTIVE.",
18789b17223SAlexander Motin vol->v_name, bestsd->sd_pos,
18889b17223SAlexander Motin g_raid_subdisk_state2str(bestsd->sd_state));
18989b17223SAlexander Motin g_raid_change_subdisk_state(bestsd,
19089b17223SAlexander Motin G_RAID_SUBDISK_S_ACTIVE);
19189b17223SAlexander Motin g_raid_write_metadata(sc,
19289b17223SAlexander Motin vol, bestsd, bestsd->sd_disk);
19389b17223SAlexander Motin }
19489b17223SAlexander Motin }
19589b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
19689b17223SAlexander Motin ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
19789b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
19889b17223SAlexander Motin if (na == vol->v_disks_count)
19989b17223SAlexander Motin s = G_RAID_VOLUME_S_OPTIMAL;
20089b17223SAlexander Motin else if (na + ns == vol->v_disks_count)
20189b17223SAlexander Motin s = G_RAID_VOLUME_S_SUBOPTIMAL;
20289b17223SAlexander Motin else if (na > 0)
20389b17223SAlexander Motin s = G_RAID_VOLUME_S_DEGRADED;
20489b17223SAlexander Motin else
20589b17223SAlexander Motin s = G_RAID_VOLUME_S_BROKEN;
20689b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
20789b17223SAlexander Motin }
20889b17223SAlexander Motin if (s != vol->v_state) {
20989b17223SAlexander Motin g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
21089b17223SAlexander Motin G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
21189b17223SAlexander Motin G_RAID_EVENT_VOLUME);
21289b17223SAlexander Motin g_raid_change_volume_state(vol, s);
21389b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping)
21489b17223SAlexander Motin g_raid_write_metadata(sc, vol, NULL, NULL);
21589b17223SAlexander Motin }
21689b17223SAlexander Motin return (0);
21789b17223SAlexander Motin }
21889b17223SAlexander Motin
21989b17223SAlexander Motin static void
g_raid_tr_raid1_fail_disk(struct g_raid_softc * sc,struct g_raid_subdisk * sd,struct g_raid_disk * disk)22089b17223SAlexander Motin g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
22189b17223SAlexander Motin struct g_raid_disk *disk)
22289b17223SAlexander Motin {
22389b17223SAlexander Motin /*
22489b17223SAlexander Motin * We don't fail the last disk in the pack, since it still has decent
22589b17223SAlexander Motin * data on it and that's better than failing the disk if it is the root
22689b17223SAlexander Motin * file system.
22789b17223SAlexander Motin *
22889b17223SAlexander Motin * XXX should this be controlled via a tunable? It makes sense for
22989b17223SAlexander Motin * the volume that has / on it. I can't think of a case where we'd
23089b17223SAlexander Motin * want the volume to go away on this kind of event.
23189b17223SAlexander Motin */
23289b17223SAlexander Motin if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
23389b17223SAlexander Motin g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
23489b17223SAlexander Motin return;
23589b17223SAlexander Motin g_raid_fail_disk(sc, sd, disk);
23689b17223SAlexander Motin }
23789b17223SAlexander Motin
23889b17223SAlexander Motin static void
g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object * tr)23989b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr)
24089b17223SAlexander Motin {
24189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
24289b17223SAlexander Motin struct g_raid_subdisk *sd, *good_sd;
24389b17223SAlexander Motin struct bio *bp;
24489b17223SAlexander Motin
24589b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
24689b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME)
24789b17223SAlexander Motin return;
24889b17223SAlexander Motin sd = trs->trso_failed_sd;
24989b17223SAlexander Motin good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE);
25089b17223SAlexander Motin if (good_sd == NULL) {
25189b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr);
25289b17223SAlexander Motin return;
25389b17223SAlexander Motin }
25489b17223SAlexander Motin bp = &trs->trso_bio;
25589b17223SAlexander Motin memset(bp, 0, sizeof(*bp));
25689b17223SAlexander Motin bp->bio_offset = sd->sd_rebuild_pos;
25789b17223SAlexander Motin bp->bio_length = MIN(g_raid1_rebuild_slab,
25889b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos);
25989b17223SAlexander Motin bp->bio_data = trs->trso_buffer;
26089b17223SAlexander Motin bp->bio_cmd = BIO_READ;
26189b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
26289b17223SAlexander Motin bp->bio_caller1 = good_sd;
26389b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_DOING_SOME;
26489b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_LOCKED;
26589b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, /* Lock callback starts I/O */
26689b17223SAlexander Motin bp->bio_offset, bp->bio_length, NULL, bp);
26789b17223SAlexander Motin }
26889b17223SAlexander Motin
26989b17223SAlexander Motin static void
g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object * trs)27089b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs)
27189b17223SAlexander Motin {
27289b17223SAlexander Motin struct g_raid_volume *vol;
27389b17223SAlexander Motin struct g_raid_subdisk *sd;
27489b17223SAlexander Motin
27589b17223SAlexander Motin vol = trs->trso_base.tro_volume;
27689b17223SAlexander Motin sd = trs->trso_failed_sd;
27789b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
27889b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1);
27989b17223SAlexander Motin trs->trso_buffer = NULL;
28089b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
28189b17223SAlexander Motin trs->trso_type = TR_RAID1_NONE;
28289b17223SAlexander Motin trs->trso_recover_slabs = 0;
28389b17223SAlexander Motin trs->trso_failed_sd = NULL;
28489b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL);
28589b17223SAlexander Motin }
28689b17223SAlexander Motin
28789b17223SAlexander Motin static void
g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object * tr)28889b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr)
28989b17223SAlexander Motin {
29089b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
29189b17223SAlexander Motin struct g_raid_subdisk *sd;
29289b17223SAlexander Motin
29389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
29489b17223SAlexander Motin sd = trs->trso_failed_sd;
29589b17223SAlexander Motin G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
29689b17223SAlexander Motin "Subdisk %s:%d-%s rebuild completed.",
29789b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
29889b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
29989b17223SAlexander Motin g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
30089b17223SAlexander Motin sd->sd_rebuild_pos = 0;
30189b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs);
30289b17223SAlexander Motin }
30389b17223SAlexander Motin
30489b17223SAlexander Motin static void
g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object * tr)30589b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr)
30689b17223SAlexander Motin {
30789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
30889b17223SAlexander Motin struct g_raid_subdisk *sd;
30989b17223SAlexander Motin struct g_raid_volume *vol;
31089b17223SAlexander Motin off_t len;
31189b17223SAlexander Motin
31289b17223SAlexander Motin vol = tr->tro_volume;
31389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
31489b17223SAlexander Motin sd = trs->trso_failed_sd;
31589b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_DOING_SOME) {
31689b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
31789b17223SAlexander Motin "Subdisk %s:%d-%s rebuild is aborting.",
31889b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
31989b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
32089b17223SAlexander Motin trs->trso_flags |= TR_RAID1_F_ABORT;
32189b17223SAlexander Motin } else {
32289b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc,
32389b17223SAlexander Motin "Subdisk %s:%d-%s rebuild aborted.",
32489b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
32589b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
32689b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_ABORT;
32789b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_LOCKED) {
32889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED;
32989b17223SAlexander Motin len = MIN(g_raid1_rebuild_slab,
33089b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos);
33189b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume,
33289b17223SAlexander Motin sd->sd_rebuild_pos, len);
33389b17223SAlexander Motin }
33489b17223SAlexander Motin g_raid_tr_raid1_rebuild_done(trs);
33589b17223SAlexander Motin }
33689b17223SAlexander Motin }
33789b17223SAlexander Motin
33889b17223SAlexander Motin static void
g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object * tr)33989b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr)
34089b17223SAlexander Motin {
34189b17223SAlexander Motin struct g_raid_volume *vol;
34289b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
34389b17223SAlexander Motin struct g_raid_subdisk *sd, *fsd;
34489b17223SAlexander Motin
34589b17223SAlexander Motin vol = tr->tro_volume;
34689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
34789b17223SAlexander Motin if (trs->trso_failed_sd) {
34889b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
34989b17223SAlexander Motin "Already rebuild in start rebuild. pos %jd\n",
35089b17223SAlexander Motin (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
35189b17223SAlexander Motin return;
35289b17223SAlexander Motin }
35389b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE);
35489b17223SAlexander Motin if (sd == NULL) {
35589b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
35689b17223SAlexander Motin "No active disk to rebuild. night night.");
35789b17223SAlexander Motin return;
35889b17223SAlexander Motin }
35989b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
36089b17223SAlexander Motin if (fsd == NULL)
36189b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
36289b17223SAlexander Motin if (fsd == NULL) {
36389b17223SAlexander Motin fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
36489b17223SAlexander Motin if (fsd != NULL) {
36589b17223SAlexander Motin fsd->sd_rebuild_pos = 0;
36689b17223SAlexander Motin g_raid_change_subdisk_state(fsd,
36789b17223SAlexander Motin G_RAID_SUBDISK_S_RESYNC);
36889b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, fsd, NULL);
36989b17223SAlexander Motin } else {
37089b17223SAlexander Motin fsd = g_raid_get_subdisk(vol,
37189b17223SAlexander Motin G_RAID_SUBDISK_S_UNINITIALIZED);
37289b17223SAlexander Motin if (fsd == NULL)
37389b17223SAlexander Motin fsd = g_raid_get_subdisk(vol,
37489b17223SAlexander Motin G_RAID_SUBDISK_S_NEW);
37589b17223SAlexander Motin if (fsd != NULL) {
37689b17223SAlexander Motin fsd->sd_rebuild_pos = 0;
37789b17223SAlexander Motin g_raid_change_subdisk_state(fsd,
37889b17223SAlexander Motin G_RAID_SUBDISK_S_REBUILD);
37989b17223SAlexander Motin g_raid_write_metadata(vol->v_softc,
38089b17223SAlexander Motin vol, fsd, NULL);
38189b17223SAlexander Motin }
38289b17223SAlexander Motin }
38389b17223SAlexander Motin }
38489b17223SAlexander Motin if (fsd == NULL) {
38589b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
38689b17223SAlexander Motin "No failed disk to rebuild. night night.");
38789b17223SAlexander Motin return;
38889b17223SAlexander Motin }
38989b17223SAlexander Motin trs->trso_failed_sd = fsd;
39089b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc,
39189b17223SAlexander Motin "Subdisk %s:%d-%s rebuild start at %jd.",
39289b17223SAlexander Motin fsd->sd_volume->v_name, fsd->sd_pos,
39389b17223SAlexander Motin fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]",
39489b17223SAlexander Motin trs->trso_failed_sd->sd_rebuild_pos);
39589b17223SAlexander Motin trs->trso_type = TR_RAID1_REBUILD;
39689b17223SAlexander Motin trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK);
39789b17223SAlexander Motin trs->trso_meta_update = g_raid1_rebuild_meta_update;
39889b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr);
39989b17223SAlexander Motin }
40089b17223SAlexander Motin
40189b17223SAlexander Motin static void
g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd)40289b17223SAlexander Motin g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
40389b17223SAlexander Motin struct g_raid_subdisk *sd)
40489b17223SAlexander Motin {
40589b17223SAlexander Motin struct g_raid_volume *vol;
40689b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
40789b17223SAlexander Motin int na, nr;
40889b17223SAlexander Motin
40989b17223SAlexander Motin /*
41089b17223SAlexander Motin * If we're stopping, don't do anything. If we don't have at least one
41189b17223SAlexander Motin * good disk and one bad disk, we don't do anything. And if there's a
41289b17223SAlexander Motin * 'good disk' stored in the trs, then we're in progress and we punt.
41389b17223SAlexander Motin * If we make it past all these checks, we need to rebuild.
41489b17223SAlexander Motin */
41589b17223SAlexander Motin vol = tr->tro_volume;
41689b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
41789b17223SAlexander Motin if (trs->trso_stopping)
41889b17223SAlexander Motin return;
41989b17223SAlexander Motin na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
42089b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
42189b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
42289b17223SAlexander Motin switch(trs->trso_type) {
42389b17223SAlexander Motin case TR_RAID1_NONE:
42489b17223SAlexander Motin if (na == 0)
42589b17223SAlexander Motin return;
42689b17223SAlexander Motin if (nr == 0) {
42789b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
42889b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
42989b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
43089b17223SAlexander Motin if (nr == 0)
43189b17223SAlexander Motin return;
43289b17223SAlexander Motin }
43389b17223SAlexander Motin g_raid_tr_raid1_rebuild_start(tr);
43489b17223SAlexander Motin break;
43589b17223SAlexander Motin case TR_RAID1_REBUILD:
43689b17223SAlexander Motin if (na == 0 || nr == 0 || trs->trso_failed_sd == sd)
43789b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr);
43889b17223SAlexander Motin break;
43989b17223SAlexander Motin case TR_RAID1_RESYNC:
44089b17223SAlexander Motin break;
44189b17223SAlexander Motin }
44289b17223SAlexander Motin }
44389b17223SAlexander Motin
44489b17223SAlexander Motin static int
g_raid_tr_event_raid1(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,u_int event)44589b17223SAlexander Motin g_raid_tr_event_raid1(struct g_raid_tr_object *tr,
44689b17223SAlexander Motin struct g_raid_subdisk *sd, u_int event)
44789b17223SAlexander Motin {
44889b17223SAlexander Motin
44989b17223SAlexander Motin g_raid_tr_update_state_raid1(tr->tro_volume, sd);
45089b17223SAlexander Motin return (0);
45189b17223SAlexander Motin }
45289b17223SAlexander Motin
45389b17223SAlexander Motin static int
g_raid_tr_start_raid1(struct g_raid_tr_object * tr)45489b17223SAlexander Motin g_raid_tr_start_raid1(struct g_raid_tr_object *tr)
45589b17223SAlexander Motin {
45689b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
45789b17223SAlexander Motin struct g_raid_volume *vol;
45889b17223SAlexander Motin
45989b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
46089b17223SAlexander Motin vol = tr->tro_volume;
46189b17223SAlexander Motin trs->trso_starting = 0;
46289b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL);
46389b17223SAlexander Motin return (0);
46489b17223SAlexander Motin }
46589b17223SAlexander Motin
46689b17223SAlexander Motin static int
g_raid_tr_stop_raid1(struct g_raid_tr_object * tr)46789b17223SAlexander Motin g_raid_tr_stop_raid1(struct g_raid_tr_object *tr)
46889b17223SAlexander Motin {
46989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
47089b17223SAlexander Motin struct g_raid_volume *vol;
47189b17223SAlexander Motin
47289b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
47389b17223SAlexander Motin vol = tr->tro_volume;
47489b17223SAlexander Motin trs->trso_starting = 0;
47589b17223SAlexander Motin trs->trso_stopping = 1;
47689b17223SAlexander Motin g_raid_tr_update_state_raid1(vol, NULL);
47789b17223SAlexander Motin return (0);
47889b17223SAlexander Motin }
47989b17223SAlexander Motin
48089b17223SAlexander Motin /*
48189b17223SAlexander Motin * Select the disk to read from. Take into account: subdisk state, running
48289b17223SAlexander Motin * error recovery, average disk load, head position and possible cache hits.
48389b17223SAlexander Motin */
48489b17223SAlexander Motin #define ABS(x) (((x) >= 0) ? (x) : (-(x)))
48589b17223SAlexander Motin static struct g_raid_subdisk *
g_raid_tr_raid1_select_read_disk(struct g_raid_volume * vol,struct bio * bp,u_int mask)48689b17223SAlexander Motin g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp,
48789b17223SAlexander Motin u_int mask)
48889b17223SAlexander Motin {
48989b17223SAlexander Motin struct g_raid_subdisk *sd, *best;
49089b17223SAlexander Motin int i, prio, bestprio;
49189b17223SAlexander Motin
49289b17223SAlexander Motin best = NULL;
49389b17223SAlexander Motin bestprio = INT_MAX;
49489b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) {
49589b17223SAlexander Motin sd = &vol->v_subdisks[i];
49689b17223SAlexander Motin if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
49789b17223SAlexander Motin ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD &&
49889b17223SAlexander Motin sd->sd_state != G_RAID_SUBDISK_S_RESYNC) ||
49989b17223SAlexander Motin bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos))
50089b17223SAlexander Motin continue;
50189b17223SAlexander Motin if ((mask & (1 << i)) != 0)
50289b17223SAlexander Motin continue;
50389b17223SAlexander Motin prio = G_RAID_SUBDISK_LOAD(sd);
50489b17223SAlexander Motin prio += min(sd->sd_recovery, 255) << 22;
50589b17223SAlexander Motin prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
50689b17223SAlexander Motin /* If disk head is precisely in position - highly prefer it. */
50789b17223SAlexander Motin if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset)
50889b17223SAlexander Motin prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
50989b17223SAlexander Motin else
51089b17223SAlexander Motin /* If disk head is close to position - prefer it. */
51189b17223SAlexander Motin if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
51289b17223SAlexander Motin G_RAID_SUBDISK_TRACK_SIZE)
51389b17223SAlexander Motin prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
51489b17223SAlexander Motin if (prio < bestprio) {
51589b17223SAlexander Motin best = sd;
51689b17223SAlexander Motin bestprio = prio;
51789b17223SAlexander Motin }
51889b17223SAlexander Motin }
51989b17223SAlexander Motin return (best);
52089b17223SAlexander Motin }
52189b17223SAlexander Motin
52289b17223SAlexander Motin static void
g_raid_tr_iostart_raid1_read(struct g_raid_tr_object * tr,struct bio * bp)52389b17223SAlexander Motin g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
52489b17223SAlexander Motin {
52589b17223SAlexander Motin struct g_raid_subdisk *sd;
52689b17223SAlexander Motin struct bio *cbp;
52789b17223SAlexander Motin
52889b17223SAlexander Motin sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
52989b17223SAlexander Motin KASSERT(sd != NULL, ("No active disks in volume %s.",
53089b17223SAlexander Motin tr->tro_volume->v_name));
53189b17223SAlexander Motin
53289b17223SAlexander Motin cbp = g_clone_bio(bp);
53389b17223SAlexander Motin if (cbp == NULL) {
53489b17223SAlexander Motin g_raid_iodone(bp, ENOMEM);
53589b17223SAlexander Motin return;
53689b17223SAlexander Motin }
53789b17223SAlexander Motin
53889b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp);
53989b17223SAlexander Motin }
54089b17223SAlexander Motin
54189b17223SAlexander Motin static void
g_raid_tr_iostart_raid1_write(struct g_raid_tr_object * tr,struct bio * bp)54289b17223SAlexander Motin g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
54389b17223SAlexander Motin {
54489b17223SAlexander Motin struct g_raid_volume *vol;
54589b17223SAlexander Motin struct g_raid_subdisk *sd;
54689b17223SAlexander Motin struct bio_queue_head queue;
54789b17223SAlexander Motin struct bio *cbp;
54889b17223SAlexander Motin int i;
54989b17223SAlexander Motin
55089b17223SAlexander Motin vol = tr->tro_volume;
55189b17223SAlexander Motin
55289b17223SAlexander Motin /*
55389b17223SAlexander Motin * Allocate all bios before sending any request, so we can return
55489b17223SAlexander Motin * ENOMEM in nice and clean way.
55589b17223SAlexander Motin */
55689b17223SAlexander Motin bioq_init(&queue);
55789b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) {
55889b17223SAlexander Motin sd = &vol->v_subdisks[i];
55989b17223SAlexander Motin switch (sd->sd_state) {
56089b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE:
56189b17223SAlexander Motin break;
56289b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD:
56389b17223SAlexander Motin /*
56489b17223SAlexander Motin * When rebuilding, only part of this subdisk is
56589b17223SAlexander Motin * writable, the rest will be written as part of the
56689b17223SAlexander Motin * that process.
56789b17223SAlexander Motin */
56889b17223SAlexander Motin if (bp->bio_offset >= sd->sd_rebuild_pos)
56989b17223SAlexander Motin continue;
57089b17223SAlexander Motin break;
57189b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE:
57289b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC:
57389b17223SAlexander Motin /*
57489b17223SAlexander Motin * Resyncing still writes on the theory that the
57589b17223SAlexander Motin * resync'd disk is very close and writing it will
57689b17223SAlexander Motin * keep it that way better if we keep up while
57789b17223SAlexander Motin * resyncing.
57889b17223SAlexander Motin */
57989b17223SAlexander Motin break;
58089b17223SAlexander Motin default:
58189b17223SAlexander Motin continue;
58289b17223SAlexander Motin }
58389b17223SAlexander Motin cbp = g_clone_bio(bp);
58489b17223SAlexander Motin if (cbp == NULL)
58589b17223SAlexander Motin goto failure;
58689b17223SAlexander Motin cbp->bio_caller1 = sd;
58789b17223SAlexander Motin bioq_insert_tail(&queue, cbp);
58889b17223SAlexander Motin }
589b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) {
59089b17223SAlexander Motin sd = cbp->bio_caller1;
59189b17223SAlexander Motin cbp->bio_caller1 = NULL;
59289b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp);
59389b17223SAlexander Motin }
59489b17223SAlexander Motin return;
59589b17223SAlexander Motin failure:
596b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL)
59789b17223SAlexander Motin g_destroy_bio(cbp);
59889b17223SAlexander Motin if (bp->bio_error == 0)
59989b17223SAlexander Motin bp->bio_error = ENOMEM;
60089b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error);
60189b17223SAlexander Motin }
60289b17223SAlexander Motin
60389b17223SAlexander Motin static void
g_raid_tr_iostart_raid1(struct g_raid_tr_object * tr,struct bio * bp)60489b17223SAlexander Motin g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
60589b17223SAlexander Motin {
60689b17223SAlexander Motin struct g_raid_volume *vol;
60789b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
60889b17223SAlexander Motin
60989b17223SAlexander Motin vol = tr->tro_volume;
61089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
61189b17223SAlexander Motin if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
61289b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
61389b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
61489b17223SAlexander Motin g_raid_iodone(bp, EIO);
61589b17223SAlexander Motin return;
61689b17223SAlexander Motin }
61789b17223SAlexander Motin /*
61889b17223SAlexander Motin * If we're rebuilding, squeeze in rebuild activity every so often,
61989b17223SAlexander Motin * even when the disk is busy. Be sure to only count real I/O
62089b17223SAlexander Motin * to the disk. All 'SPECIAL' I/O is traffic generated to the disk
62189b17223SAlexander Motin * by this module.
62289b17223SAlexander Motin */
62389b17223SAlexander Motin if (trs->trso_failed_sd != NULL &&
62489b17223SAlexander Motin !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
62589b17223SAlexander Motin /* Make this new or running now round short. */
62689b17223SAlexander Motin trs->trso_recover_slabs = 0;
62789b17223SAlexander Motin if (--trs->trso_fair_io <= 0) {
62889b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io;
62989b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr);
63089b17223SAlexander Motin }
63189b17223SAlexander Motin }
63289b17223SAlexander Motin switch (bp->bio_cmd) {
63389b17223SAlexander Motin case BIO_READ:
63489b17223SAlexander Motin g_raid_tr_iostart_raid1_read(tr, bp);
63589b17223SAlexander Motin break;
63689b17223SAlexander Motin case BIO_WRITE:
63789b17223SAlexander Motin case BIO_DELETE:
638609a7474SAlexander Motin g_raid_tr_iostart_raid1_write(tr, bp);
63989b17223SAlexander Motin break;
6408b522bdaSWarner Losh case BIO_SPEEDUP:
64189b17223SAlexander Motin case BIO_FLUSH:
64289b17223SAlexander Motin g_raid_tr_flush_common(tr, bp);
64389b17223SAlexander Motin break;
64489b17223SAlexander Motin default:
64589b17223SAlexander Motin KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
64689b17223SAlexander Motin bp->bio_cmd, vol->v_name));
64789b17223SAlexander Motin break;
64889b17223SAlexander Motin }
64989b17223SAlexander Motin }
65089b17223SAlexander Motin
65189b17223SAlexander Motin static void
g_raid_tr_iodone_raid1(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,struct bio * bp)65289b17223SAlexander Motin g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr,
65389b17223SAlexander Motin struct g_raid_subdisk *sd, struct bio *bp)
65489b17223SAlexander Motin {
65589b17223SAlexander Motin struct bio *cbp;
65689b17223SAlexander Motin struct g_raid_subdisk *nsd;
65789b17223SAlexander Motin struct g_raid_volume *vol;
65889b17223SAlexander Motin struct bio *pbp;
65989b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
66089b17223SAlexander Motin uintptr_t *mask;
66189b17223SAlexander Motin int error, do_write;
66289b17223SAlexander Motin
66389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
66489b17223SAlexander Motin vol = tr->tro_volume;
66589b17223SAlexander Motin if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
66689b17223SAlexander Motin /*
66789b17223SAlexander Motin * This operation is part of a rebuild or resync operation.
66889b17223SAlexander Motin * See what work just got done, then schedule the next bit of
66989b17223SAlexander Motin * work, if any. Rebuild/resync is done a little bit at a
67089b17223SAlexander Motin * time. Either when a timeout happens, or after we get a
67189b17223SAlexander Motin * bunch of I/Os to the disk (to make sure an active system
67289b17223SAlexander Motin * will complete in a sane amount of time).
67389b17223SAlexander Motin *
67489b17223SAlexander Motin * We are setup to do differing amounts of work for each of
67589b17223SAlexander Motin * these cases. so long as the slabs is smallish (less than
67689b17223SAlexander Motin * 50 or so, I'd guess, but that's just a WAG), we shouldn't
67789b17223SAlexander Motin * have any bio starvation issues. For active disks, we do
67889b17223SAlexander Motin * 5MB of data, for inactive ones, we do 50MB.
67989b17223SAlexander Motin */
68089b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD) {
68189b17223SAlexander Motin if (bp->bio_cmd == BIO_READ) {
68289b17223SAlexander Motin /* Immediately abort rebuild, if requested. */
68389b17223SAlexander Motin if (trs->trso_flags & TR_RAID1_F_ABORT) {
68489b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
68589b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr);
68689b17223SAlexander Motin return;
68789b17223SAlexander Motin }
68889b17223SAlexander Motin
68989b17223SAlexander Motin /* On read error, skip and cross fingers. */
69089b17223SAlexander Motin if (bp->bio_error != 0) {
69189b17223SAlexander Motin G_RAID_LOGREQ(0, bp,
69289b17223SAlexander Motin "Read error during rebuild (%d), "
69389b17223SAlexander Motin "possible data loss!",
69489b17223SAlexander Motin bp->bio_error);
69589b17223SAlexander Motin goto rebuild_round_done;
69689b17223SAlexander Motin }
69789b17223SAlexander Motin
69889b17223SAlexander Motin /*
69989b17223SAlexander Motin * The read operation finished, queue the
70089b17223SAlexander Motin * write and get out.
70189b17223SAlexander Motin */
70289b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "rebuild read done. %d",
70389b17223SAlexander Motin bp->bio_error);
70489b17223SAlexander Motin bp->bio_cmd = BIO_WRITE;
70589b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
70689b17223SAlexander Motin G_RAID_LOGREQ(4, bp, "Queueing rebuild write.");
70789b17223SAlexander Motin g_raid_subdisk_iostart(trs->trso_failed_sd, bp);
70889b17223SAlexander Motin } else {
70989b17223SAlexander Motin /*
71089b17223SAlexander Motin * The write operation just finished. Do
71189b17223SAlexander Motin * another. We keep cloning the master bio
71289b17223SAlexander Motin * since it has the right buffers allocated to
71389b17223SAlexander Motin * it.
71489b17223SAlexander Motin */
71589b17223SAlexander Motin G_RAID_LOGREQ(4, bp,
71689b17223SAlexander Motin "rebuild write done. Error %d",
71789b17223SAlexander Motin bp->bio_error);
71889b17223SAlexander Motin nsd = trs->trso_failed_sd;
71989b17223SAlexander Motin if (bp->bio_error != 0 ||
72089b17223SAlexander Motin trs->trso_flags & TR_RAID1_F_ABORT) {
72189b17223SAlexander Motin if ((trs->trso_flags &
72289b17223SAlexander Motin TR_RAID1_F_ABORT) == 0) {
72389b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc,
72489b17223SAlexander Motin nsd, nsd->sd_disk);
72589b17223SAlexander Motin }
72689b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
72789b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr);
72889b17223SAlexander Motin return;
72989b17223SAlexander Motin }
73089b17223SAlexander Motin rebuild_round_done:
73189b17223SAlexander Motin nsd = trs->trso_failed_sd;
73289b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_LOCKED;
73389b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume,
73489b17223SAlexander Motin bp->bio_offset, bp->bio_length);
73589b17223SAlexander Motin nsd->sd_rebuild_pos += bp->bio_length;
73689b17223SAlexander Motin if (nsd->sd_rebuild_pos >= nsd->sd_size) {
73789b17223SAlexander Motin g_raid_tr_raid1_rebuild_finish(tr);
73889b17223SAlexander Motin return;
73989b17223SAlexander Motin }
74089b17223SAlexander Motin
74189b17223SAlexander Motin /* Abort rebuild if we are stopping */
74289b17223SAlexander Motin if (trs->trso_stopping) {
74389b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
74489b17223SAlexander Motin g_raid_tr_raid1_rebuild_abort(tr);
74589b17223SAlexander Motin return;
74689b17223SAlexander Motin }
74789b17223SAlexander Motin
74889b17223SAlexander Motin if (--trs->trso_meta_update <= 0) {
74989b17223SAlexander Motin g_raid_write_metadata(vol->v_softc,
75089b17223SAlexander Motin vol, nsd, nsd->sd_disk);
75189b17223SAlexander Motin trs->trso_meta_update =
75289b17223SAlexander Motin g_raid1_rebuild_meta_update;
75389b17223SAlexander Motin }
75489b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
75589b17223SAlexander Motin if (--trs->trso_recover_slabs <= 0)
75689b17223SAlexander Motin return;
75789b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr);
75889b17223SAlexander Motin }
75989b17223SAlexander Motin } else if (trs->trso_type == TR_RAID1_RESYNC) {
76089b17223SAlexander Motin /*
76189b17223SAlexander Motin * read good sd, read bad sd in parallel. when both
76289b17223SAlexander Motin * done, compare the buffers. write good to the bad
76389b17223SAlexander Motin * if different. do the next bit of work.
76489b17223SAlexander Motin */
76589b17223SAlexander Motin panic("Somehow, we think we're doing a resync");
76689b17223SAlexander Motin }
76789b17223SAlexander Motin return;
76889b17223SAlexander Motin }
76989b17223SAlexander Motin pbp = bp->bio_parent;
77089b17223SAlexander Motin pbp->bio_inbed++;
77189b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
77289b17223SAlexander Motin /*
77389b17223SAlexander Motin * Read failed on first drive. Retry the read error on
77489b17223SAlexander Motin * another disk drive, if available, before erroring out the
77589b17223SAlexander Motin * read.
77689b17223SAlexander Motin */
77789b17223SAlexander Motin sd->sd_disk->d_read_errs++;
77889b17223SAlexander Motin G_RAID_LOGREQ(0, bp,
77989b17223SAlexander Motin "Read error (%d), %d read errors total",
78089b17223SAlexander Motin bp->bio_error, sd->sd_disk->d_read_errs);
78189b17223SAlexander Motin
78289b17223SAlexander Motin /*
78389b17223SAlexander Motin * If there are too many read errors, we move to degraded.
78489b17223SAlexander Motin * XXX Do we want to FAIL the drive (eg, make the user redo
78589b17223SAlexander Motin * everything to get it back in sync), or just degrade the
78689b17223SAlexander Motin * drive, which kicks off a resync?
78789b17223SAlexander Motin */
78889b17223SAlexander Motin do_write = 1;
78989b17223SAlexander Motin if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) {
79089b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
79189b17223SAlexander Motin if (pbp->bio_children == 1)
79289b17223SAlexander Motin do_write = 0;
79389b17223SAlexander Motin }
79489b17223SAlexander Motin
79589b17223SAlexander Motin /*
79689b17223SAlexander Motin * Find the other disk, and try to do the I/O to it.
79789b17223SAlexander Motin */
79889b17223SAlexander Motin mask = (uintptr_t *)(&pbp->bio_driver2);
79989b17223SAlexander Motin if (pbp->bio_children == 1) {
80089b17223SAlexander Motin /* Save original subdisk. */
80189b17223SAlexander Motin pbp->bio_driver1 = do_write ? sd : NULL;
80289b17223SAlexander Motin *mask = 0;
80389b17223SAlexander Motin }
80489b17223SAlexander Motin *mask |= 1 << sd->sd_pos;
80589b17223SAlexander Motin nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask);
80689b17223SAlexander Motin if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) {
80789b17223SAlexander Motin g_destroy_bio(bp);
80889b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
80989b17223SAlexander Motin nsd->sd_pos);
81089b17223SAlexander Motin if (pbp->bio_children == 2 && do_write) {
81189b17223SAlexander Motin sd->sd_recovery++;
81289b17223SAlexander Motin cbp->bio_caller1 = nsd;
81389b17223SAlexander Motin pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
81489b17223SAlexander Motin /* Lock callback starts I/O */
81589b17223SAlexander Motin g_raid_lock_range(sd->sd_volume,
81689b17223SAlexander Motin cbp->bio_offset, cbp->bio_length, pbp, cbp);
81789b17223SAlexander Motin } else {
81889b17223SAlexander Motin g_raid_subdisk_iostart(nsd, cbp);
81989b17223SAlexander Motin }
82089b17223SAlexander Motin return;
82189b17223SAlexander Motin }
82289b17223SAlexander Motin /*
82389b17223SAlexander Motin * We can't retry. Return the original error by falling
82489b17223SAlexander Motin * through. This will happen when there's only one good disk.
82589b17223SAlexander Motin * We don't need to fail the raid, since its actual state is
82689b17223SAlexander Motin * based on the state of the subdisks.
82789b17223SAlexander Motin */
82889b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
82989b17223SAlexander Motin }
83089b17223SAlexander Motin if (bp->bio_cmd == BIO_READ &&
83189b17223SAlexander Motin bp->bio_error == 0 &&
83289b17223SAlexander Motin pbp->bio_children > 1 &&
83389b17223SAlexander Motin pbp->bio_driver1 != NULL) {
83489b17223SAlexander Motin /*
83589b17223SAlexander Motin * If it was a read, and bio_children is >1, then we just
83689b17223SAlexander Motin * recovered the data from the second drive. We should try to
83789b17223SAlexander Motin * write that data to the first drive if sector remapping is
83889b17223SAlexander Motin * enabled. A write should put the data in a new place on the
83989b17223SAlexander Motin * disk, remapping the bad sector. Do we need to do that by
84089b17223SAlexander Motin * queueing a request to the main worker thread? It doesn't
84189b17223SAlexander Motin * affect the return code of this current read, and can be
842e8d57122SPedro F. Giffuni * done at our leisure. However, to make the code simpler, it
843e8d57122SPedro F. Giffuni * is done synchronously.
84489b17223SAlexander Motin */
84589b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
84689b17223SAlexander Motin cbp = g_clone_bio(pbp);
84789b17223SAlexander Motin if (cbp != NULL) {
84889b17223SAlexander Motin g_destroy_bio(bp);
84989b17223SAlexander Motin cbp->bio_cmd = BIO_WRITE;
85089b17223SAlexander Motin cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
85189b17223SAlexander Motin G_RAID_LOGREQ(2, cbp,
85289b17223SAlexander Motin "Attempting bad sector remap on failing drive.");
85389b17223SAlexander Motin g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
85489b17223SAlexander Motin return;
85589b17223SAlexander Motin }
85689b17223SAlexander Motin }
85789b17223SAlexander Motin if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
85889b17223SAlexander Motin /*
85989b17223SAlexander Motin * We're done with a recovery, mark the range as unlocked.
860e8d57122SPedro F. Giffuni * For any write errors, we aggressively fail the disk since
86189b17223SAlexander Motin * there was both a READ and a WRITE error at this location.
86289b17223SAlexander Motin * Both types of errors generally indicates the drive is on
86389b17223SAlexander Motin * the verge of total failure anyway. Better to stop trusting
86489b17223SAlexander Motin * it now. However, we need to reset error to 0 in that case
86589b17223SAlexander Motin * because we're not failing the original I/O which succeeded.
86689b17223SAlexander Motin */
86789b17223SAlexander Motin if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
86889b17223SAlexander Motin G_RAID_LOGREQ(0, bp, "Remap write failed: "
86989b17223SAlexander Motin "failing subdisk.");
87089b17223SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
87189b17223SAlexander Motin bp->bio_error = 0;
87289b17223SAlexander Motin }
87389b17223SAlexander Motin if (pbp->bio_driver1 != NULL) {
87489b17223SAlexander Motin ((struct g_raid_subdisk *)pbp->bio_driver1)
87589b17223SAlexander Motin ->sd_recovery--;
87689b17223SAlexander Motin }
87789b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
87889b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
87989b17223SAlexander Motin bp->bio_length);
88089b17223SAlexander Motin }
881650e245eSAlexander Motin if (pbp->bio_cmd != BIO_READ) {
882ef844ef7SAlexander Motin if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
883ef844ef7SAlexander Motin pbp->bio_error = bp->bio_error;
884650e245eSAlexander Motin if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
885ef844ef7SAlexander Motin G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
886ef844ef7SAlexander Motin g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
887ef844ef7SAlexander Motin }
888ef844ef7SAlexander Motin error = pbp->bio_error;
889ef844ef7SAlexander Motin } else
89089b17223SAlexander Motin error = bp->bio_error;
89189b17223SAlexander Motin g_destroy_bio(bp);
89289b17223SAlexander Motin if (pbp->bio_children == pbp->bio_inbed) {
89389b17223SAlexander Motin pbp->bio_completed = pbp->bio_length;
89489b17223SAlexander Motin g_raid_iodone(pbp, error);
89589b17223SAlexander Motin }
89689b17223SAlexander Motin }
89789b17223SAlexander Motin
89889b17223SAlexander Motin static int
g_raid_tr_kerneldump_raid1(struct g_raid_tr_object * tr,void * virtual,off_t offset,size_t length)899489ba222SMitchell Horne g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr, void *virtual,
900489ba222SMitchell Horne off_t offset, size_t length)
90189b17223SAlexander Motin {
90289b17223SAlexander Motin struct g_raid_volume *vol;
90389b17223SAlexander Motin struct g_raid_subdisk *sd;
90489b17223SAlexander Motin int error, i, ok;
90589b17223SAlexander Motin
90689b17223SAlexander Motin vol = tr->tro_volume;
90789b17223SAlexander Motin error = 0;
90889b17223SAlexander Motin ok = 0;
90989b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) {
91089b17223SAlexander Motin sd = &vol->v_subdisks[i];
91189b17223SAlexander Motin switch (sd->sd_state) {
91289b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE:
91389b17223SAlexander Motin break;
91489b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD:
91589b17223SAlexander Motin /*
91689b17223SAlexander Motin * When rebuilding, only part of this subdisk is
91789b17223SAlexander Motin * writable, the rest will be written as part of the
91889b17223SAlexander Motin * that process.
91989b17223SAlexander Motin */
92089b17223SAlexander Motin if (offset >= sd->sd_rebuild_pos)
92189b17223SAlexander Motin continue;
92289b17223SAlexander Motin break;
92389b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE:
92489b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC:
92589b17223SAlexander Motin /*
92689b17223SAlexander Motin * Resyncing still writes on the theory that the
92789b17223SAlexander Motin * resync'd disk is very close and writing it will
92889b17223SAlexander Motin * keep it that way better if we keep up while
92989b17223SAlexander Motin * resyncing.
93089b17223SAlexander Motin */
93189b17223SAlexander Motin break;
93289b17223SAlexander Motin default:
93389b17223SAlexander Motin continue;
93489b17223SAlexander Motin }
935489ba222SMitchell Horne error = g_raid_subdisk_kerneldump(sd, virtual, offset, length);
93689b17223SAlexander Motin if (error == 0)
93789b17223SAlexander Motin ok++;
93889b17223SAlexander Motin }
93989b17223SAlexander Motin return (ok > 0 ? 0 : error);
94089b17223SAlexander Motin }
94189b17223SAlexander Motin
94289b17223SAlexander Motin static int
g_raid_tr_locked_raid1(struct g_raid_tr_object * tr,void * argp)94389b17223SAlexander Motin g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
94489b17223SAlexander Motin {
94589b17223SAlexander Motin struct bio *bp;
94689b17223SAlexander Motin struct g_raid_subdisk *sd;
94789b17223SAlexander Motin
94889b17223SAlexander Motin bp = (struct bio *)argp;
94989b17223SAlexander Motin sd = (struct g_raid_subdisk *)bp->bio_caller1;
95089b17223SAlexander Motin g_raid_subdisk_iostart(sd, bp);
95189b17223SAlexander Motin
95289b17223SAlexander Motin return (0);
95389b17223SAlexander Motin }
95489b17223SAlexander Motin
95589b17223SAlexander Motin static int
g_raid_tr_idle_raid1(struct g_raid_tr_object * tr)95689b17223SAlexander Motin g_raid_tr_idle_raid1(struct g_raid_tr_object *tr)
95789b17223SAlexander Motin {
95889b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
95989b17223SAlexander Motin
96089b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
96189b17223SAlexander Motin trs->trso_fair_io = g_raid1_rebuild_fair_io;
96289b17223SAlexander Motin trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle;
96389b17223SAlexander Motin if (trs->trso_type == TR_RAID1_REBUILD)
96489b17223SAlexander Motin g_raid_tr_raid1_rebuild_some(tr);
96589b17223SAlexander Motin return (0);
96689b17223SAlexander Motin }
96789b17223SAlexander Motin
96889b17223SAlexander Motin static int
g_raid_tr_free_raid1(struct g_raid_tr_object * tr)96989b17223SAlexander Motin g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
97089b17223SAlexander Motin {
97189b17223SAlexander Motin struct g_raid_tr_raid1_object *trs;
97289b17223SAlexander Motin
97389b17223SAlexander Motin trs = (struct g_raid_tr_raid1_object *)tr;
97489b17223SAlexander Motin
97589b17223SAlexander Motin if (trs->trso_buffer != NULL) {
97689b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1);
97789b17223SAlexander Motin trs->trso_buffer = NULL;
97889b17223SAlexander Motin }
97989b17223SAlexander Motin return (0);
98089b17223SAlexander Motin }
98189b17223SAlexander Motin
982c89d2fbeSAlexander Motin G_RAID_TR_DECLARE(raid1, "RAID1");
983