/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

/* Number of physical copies kept of every virtual strip. */
#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

/*
 * Rebuild tunables.  Each knob is exported read/write under
 * kern.geom.raid.raid1e and is also settable as a loader tunable
 * (CTLFLAG_RWTUN).
 */
#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

/* Values of trso_type: which kind of recovery, if any, is in progress. */
#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

/* Bits of trso_flags. */
#define TR_RAID1E_F_DOING_SOME	0x1	/* A rebuild round is in flight. */
#define TR_RAID1E_F_LOCKED	0x2	/* The trso_lock_* range is locked. */
#define TR_RAID1E_F_ABORT	0x4	/* Abort requested for running round. */

/*
 * Per-volume state of the RAID1E transformation.  trso_base must stay the
 * first member: the code casts between g_raid_tr_object and this type.
 */
struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;	/* Set by taste, cleared by start/stop. */
	int			 trso_stopping;	/* Set by stop. */
	int			 trso_type;	/* TR_RAID1E_{NONE,REBUILD,RESYNC}. */
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;	/* Regular I/Os left before a forced rebuild round. */
	int			 trso_meta_update; /* Rounds left before metadata is written. */
	int			 trso_flags;	/* TR_RAID1E_F_* bits. */
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	/* Buffer space */
	off_t			 trso_lock_pos;	/* Locked range start. */
	off_t			 trso_lock_len;	/* Locked range length. */
	struct bio		 trso_bio;	/* The single bio reused for rebuild I/O. */
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

/* kobj dispatch table binding the g_raid_tr interface to this module. */
static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

/* Class descriptor: name, method table and per-object allocation size. */
static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};
.trc_accept_unmapped = 1 134 }; 135 136 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr); 137 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 138 struct g_raid_subdisk *sd); 139 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 140 int no, off_t off, off_t len, u_int mask); 141 142 static inline void 143 V2P(struct g_raid_volume *vol, off_t virt, 144 int *disk, off_t *offset, off_t *start) 145 { 146 off_t nstrip; 147 u_int strip_size; 148 149 strip_size = vol->v_strip_size; 150 /* Strip number. */ 151 nstrip = virt / strip_size; 152 /* Start position in strip. */ 153 *start = virt % strip_size; 154 /* Disk number. */ 155 *disk = (nstrip * N) % vol->v_disks_count; 156 /* Strip start position in disk. */ 157 *offset = ((nstrip * N) / vol->v_disks_count) * strip_size; 158 } 159 160 static inline void 161 P2V(struct g_raid_volume *vol, int disk, off_t offset, 162 off_t *virt, int *copy) 163 { 164 off_t nstrip, start; 165 u_int strip_size; 166 167 strip_size = vol->v_strip_size; 168 /* Start position in strip. */ 169 start = offset % strip_size; 170 /* Physical strip number. */ 171 nstrip = (offset / strip_size) * vol->v_disks_count + disk; 172 /* Number of physical strip (copy) inside virtual strip. */ 173 *copy = nstrip % N; 174 /* Offset in virtual space. 
*/ 175 *virt = (nstrip / N) * strip_size + start; 176 } 177 178 static int 179 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 180 { 181 struct g_raid_tr_raid1e_object *trs; 182 183 trs = (struct g_raid_tr_raid1e_object *)tr; 184 if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E || 185 tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA) 186 return (G_RAID_TR_TASTE_FAIL); 187 trs->trso_starting = 1; 188 return (G_RAID_TR_TASTE_SUCCEED); 189 } 190 191 static int 192 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol) 193 { 194 struct g_raid_softc *sc; 195 struct g_raid_subdisk *sd, *bestsd, *worstsd; 196 int i, j, state, sstate; 197 198 sc = vol->v_softc; 199 state = G_RAID_VOLUME_S_OPTIMAL; 200 for (i = 0; i < vol->v_disks_count / N; i++) { 201 bestsd = &vol->v_subdisks[i * N]; 202 for (j = 1; j < N; j++) { 203 sd = &vol->v_subdisks[i * N + j]; 204 if (sd->sd_state > bestsd->sd_state) 205 bestsd = sd; 206 else if (sd->sd_state == bestsd->sd_state && 207 (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 208 sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 209 sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 210 bestsd = sd; 211 } 212 if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED && 213 bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { 214 /* We found reasonable candidate. 
*/ 215 G_RAID_DEBUG1(1, sc, 216 "Promote subdisk %s:%d from %s to ACTIVE.", 217 vol->v_name, bestsd->sd_pos, 218 g_raid_subdisk_state2str(bestsd->sd_state)); 219 g_raid_change_subdisk_state(bestsd, 220 G_RAID_SUBDISK_S_ACTIVE); 221 g_raid_write_metadata(sc, 222 vol, bestsd, bestsd->sd_disk); 223 } 224 worstsd = &vol->v_subdisks[i * N]; 225 for (j = 1; j < N; j++) { 226 sd = &vol->v_subdisks[i * N + j]; 227 if (sd->sd_state < worstsd->sd_state) 228 worstsd = sd; 229 } 230 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 231 sstate = G_RAID_VOLUME_S_OPTIMAL; 232 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 233 sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 234 else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 235 sstate = G_RAID_VOLUME_S_DEGRADED; 236 else 237 sstate = G_RAID_VOLUME_S_BROKEN; 238 if (sstate < state) 239 state = sstate; 240 } 241 return (state); 242 } 243 244 static int 245 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol) 246 { 247 struct g_raid_softc *sc; 248 struct g_raid_subdisk *sd, *bestsd, *worstsd; 249 int i, j, state, sstate; 250 251 sc = vol->v_softc; 252 if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) == 253 vol->v_disks_count) 254 return (G_RAID_VOLUME_S_OPTIMAL); 255 for (i = 0; i < vol->v_disks_count; i++) { 256 sd = &vol->v_subdisks[i]; 257 if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) { 258 /* We found reasonable candidate. 
*/ 259 G_RAID_DEBUG1(1, sc, 260 "Promote subdisk %s:%d from %s to STALE.", 261 vol->v_name, sd->sd_pos, 262 g_raid_subdisk_state2str(sd->sd_state)); 263 g_raid_change_subdisk_state(sd, 264 G_RAID_SUBDISK_S_STALE); 265 g_raid_write_metadata(sc, vol, sd, sd->sd_disk); 266 } 267 } 268 state = G_RAID_VOLUME_S_OPTIMAL; 269 for (i = 0; i < vol->v_disks_count; i++) { 270 bestsd = &vol->v_subdisks[i]; 271 worstsd = &vol->v_subdisks[i]; 272 for (j = 1; j < N; j++) { 273 sd = &vol->v_subdisks[(i + j) % vol->v_disks_count]; 274 if (sd->sd_state > bestsd->sd_state) 275 bestsd = sd; 276 else if (sd->sd_state == bestsd->sd_state && 277 (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || 278 sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && 279 sd->sd_rebuild_pos > bestsd->sd_rebuild_pos) 280 bestsd = sd; 281 if (sd->sd_state < worstsd->sd_state) 282 worstsd = sd; 283 } 284 if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE) 285 sstate = G_RAID_VOLUME_S_OPTIMAL; 286 else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE) 287 sstate = G_RAID_VOLUME_S_SUBOPTIMAL; 288 else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE) 289 sstate = G_RAID_VOLUME_S_DEGRADED; 290 else 291 sstate = G_RAID_VOLUME_S_BROKEN; 292 if (sstate < state) 293 state = sstate; 294 } 295 return (state); 296 } 297 298 static int 299 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol, 300 struct g_raid_subdisk *sd) 301 { 302 struct g_raid_tr_raid1e_object *trs; 303 struct g_raid_softc *sc; 304 u_int s; 305 306 sc = vol->v_softc; 307 trs = (struct g_raid_tr_raid1e_object *)vol->v_tr; 308 if (trs->trso_stopping && 309 (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0) 310 s = G_RAID_VOLUME_S_STOPPED; 311 else if (trs->trso_starting) 312 s = G_RAID_VOLUME_S_STARTING; 313 else { 314 if ((vol->v_disks_count % N) == 0) 315 s = g_raid_tr_update_state_raid1e_even(vol); 316 else 317 s = g_raid_tr_update_state_raid1e_odd(vol); 318 } 319 if (s != vol->v_state) { 320 g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 
321 G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 322 G_RAID_EVENT_VOLUME); 323 g_raid_change_volume_state(vol, s); 324 if (!trs->trso_starting && !trs->trso_stopping) 325 g_raid_write_metadata(sc, vol, NULL, NULL); 326 } 327 if (!trs->trso_starting && !trs->trso_stopping) 328 g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd); 329 return (0); 330 } 331 332 static void 333 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd, 334 struct g_raid_disk *disk) 335 { 336 struct g_raid_volume *vol; 337 338 vol = sd->sd_volume; 339 /* 340 * We don't fail the last disk in the pack, since it still has decent 341 * data on it and that's better than failing the disk if it is the root 342 * file system. 343 * 344 * XXX should this be controlled via a tunable? It makes sense for 345 * the volume that has / on it. I can't think of a case where we'd 346 * want the volume to go away on this kind of event. 347 */ 348 if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) + 349 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) + 350 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 351 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) < 352 vol->v_disks_count) && 353 (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED)) 354 return; 355 g_raid_fail_disk(sc, sd, disk); 356 } 357 358 static void 359 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs) 360 { 361 struct g_raid_volume *vol; 362 struct g_raid_subdisk *sd; 363 364 vol = trs->trso_base.tro_volume; 365 sd = trs->trso_failed_sd; 366 g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk); 367 free(trs->trso_buffer, M_TR_RAID1E); 368 trs->trso_buffer = NULL; 369 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 370 trs->trso_type = TR_RAID1E_NONE; 371 trs->trso_recover_slabs = 0; 372 trs->trso_failed_sd = NULL; 373 g_raid_tr_update_state_raid1e(vol, NULL); 374 } 375 376 static void 377 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr) 378 { 379 struct g_raid_tr_raid1e_object *trs; 380 struct 
g_raid_subdisk *sd; 381 382 trs = (struct g_raid_tr_raid1e_object *)tr; 383 sd = trs->trso_failed_sd; 384 G_RAID_DEBUG1(0, tr->tro_volume->v_softc, 385 "Subdisk %s:%d-%s rebuild completed.", 386 sd->sd_volume->v_name, sd->sd_pos, 387 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 388 g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); 389 sd->sd_rebuild_pos = 0; 390 g_raid_tr_raid1e_rebuild_done(trs); 391 } 392 393 static void 394 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr) 395 { 396 struct g_raid_tr_raid1e_object *trs; 397 struct g_raid_subdisk *sd; 398 struct g_raid_volume *vol; 399 400 vol = tr->tro_volume; 401 trs = (struct g_raid_tr_raid1e_object *)tr; 402 sd = trs->trso_failed_sd; 403 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) { 404 G_RAID_DEBUG1(1, vol->v_softc, 405 "Subdisk %s:%d-%s rebuild is aborting.", 406 sd->sd_volume->v_name, sd->sd_pos, 407 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 408 trs->trso_flags |= TR_RAID1E_F_ABORT; 409 } else { 410 G_RAID_DEBUG1(0, vol->v_softc, 411 "Subdisk %s:%d-%s rebuild aborted.", 412 sd->sd_volume->v_name, sd->sd_pos, 413 sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]"); 414 trs->trso_flags &= ~TR_RAID1E_F_ABORT; 415 if (trs->trso_flags & TR_RAID1E_F_LOCKED) { 416 trs->trso_flags &= ~TR_RAID1E_F_LOCKED; 417 g_raid_unlock_range(tr->tro_volume, 418 trs->trso_lock_pos, trs->trso_lock_len); 419 } 420 g_raid_tr_raid1e_rebuild_done(trs); 421 } 422 } 423 424 static void 425 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr) 426 { 427 struct g_raid_tr_raid1e_object *trs; 428 struct g_raid_softc *sc; 429 struct g_raid_volume *vol; 430 struct g_raid_subdisk *sd; 431 struct bio *bp; 432 off_t len, virtual, vend, offset, start; 433 int disk, copy, best; 434 435 trs = (struct g_raid_tr_raid1e_object *)tr; 436 if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) 437 return; 438 vol = tr->tro_volume; 439 sc = vol->v_softc; 440 sd = trs->trso_failed_sd; 441 442 while (1) { 443 if (sd->sd_rebuild_pos >= sd->sd_size) { 444 g_raid_tr_raid1e_rebuild_finish(tr); 445 return; 446 } 447 /* Get virtual offset from physical rebuild position. */ 448 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©); 449 /* Get physical offset back to get first stripe position. */ 450 V2P(vol, virtual, &disk, &offset, &start); 451 /* Calculate contignous data length. */ 452 len = MIN(g_raid1e_rebuild_slab, 453 sd->sd_size - sd->sd_rebuild_pos); 454 if ((vol->v_disks_count % N) != 0) 455 len = MIN(len, vol->v_strip_size - start); 456 /* Find disk with most accurate data. */ 457 best = g_raid_tr_raid1e_select_read_disk(vol, disk, 458 offset + start, len, 0); 459 if (best < 0) { 460 /* There is no any valid disk. */ 461 g_raid_tr_raid1e_rebuild_abort(tr); 462 return; 463 } else if (best != copy) { 464 /* Some other disk has better data. */ 465 break; 466 } 467 /* We have the most accurate data. Skip the range. 
*/ 468 G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju", 469 sd->sd_rebuild_pos, sd->sd_rebuild_pos + len); 470 sd->sd_rebuild_pos += len; 471 } 472 473 bp = &trs->trso_bio; 474 memset(bp, 0, sizeof(*bp)); 475 bp->bio_offset = offset + start + 476 ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0); 477 bp->bio_length = len; 478 bp->bio_data = trs->trso_buffer; 479 bp->bio_cmd = BIO_READ; 480 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 481 bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count]; 482 G_RAID_LOGREQ(3, bp, "Queueing rebuild read"); 483 /* 484 * If we are crossing stripe boundary, correct affected virtual 485 * range we should lock. 486 */ 487 if (start + len > vol->v_strip_size) { 488 P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©); 489 len = vend - virtual; 490 } 491 trs->trso_flags |= TR_RAID1E_F_DOING_SOME; 492 trs->trso_flags |= TR_RAID1E_F_LOCKED; 493 trs->trso_lock_pos = virtual; 494 trs->trso_lock_len = len; 495 /* Lock callback starts I/O */ 496 g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp); 497 } 498 499 static void 500 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr) 501 { 502 struct g_raid_volume *vol; 503 struct g_raid_tr_raid1e_object *trs; 504 struct g_raid_subdisk *sd; 505 506 vol = tr->tro_volume; 507 trs = (struct g_raid_tr_raid1e_object *)tr; 508 if (trs->trso_failed_sd) { 509 G_RAID_DEBUG1(1, vol->v_softc, 510 "Already rebuild in start rebuild. 
pos %jd\n", 511 (intmax_t)trs->trso_failed_sd->sd_rebuild_pos); 512 return; 513 } 514 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC); 515 if (sd == NULL) 516 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD); 517 if (sd == NULL) { 518 sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE); 519 if (sd != NULL) { 520 sd->sd_rebuild_pos = 0; 521 g_raid_change_subdisk_state(sd, 522 G_RAID_SUBDISK_S_RESYNC); 523 g_raid_write_metadata(vol->v_softc, vol, sd, NULL); 524 } else { 525 sd = g_raid_get_subdisk(vol, 526 G_RAID_SUBDISK_S_UNINITIALIZED); 527 if (sd == NULL) 528 sd = g_raid_get_subdisk(vol, 529 G_RAID_SUBDISK_S_NEW); 530 if (sd != NULL) { 531 sd->sd_rebuild_pos = 0; 532 g_raid_change_subdisk_state(sd, 533 G_RAID_SUBDISK_S_REBUILD); 534 g_raid_write_metadata(vol->v_softc, 535 vol, sd, NULL); 536 } 537 } 538 } 539 if (sd == NULL) { 540 G_RAID_DEBUG1(1, vol->v_softc, 541 "No failed disk to rebuild. night night."); 542 return; 543 } 544 trs->trso_failed_sd = sd; 545 G_RAID_DEBUG1(0, vol->v_softc, 546 "Subdisk %s:%d-%s rebuild start at %jd.", 547 sd->sd_volume->v_name, sd->sd_pos, 548 sd->sd_disk ? 
g_raid_get_diskname(sd->sd_disk) : "[none]", 549 trs->trso_failed_sd->sd_rebuild_pos); 550 trs->trso_type = TR_RAID1E_REBUILD; 551 trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK); 552 trs->trso_meta_update = g_raid1e_rebuild_meta_update; 553 g_raid_tr_raid1e_rebuild_some(tr); 554 } 555 556 static void 557 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr, 558 struct g_raid_subdisk *sd) 559 { 560 struct g_raid_volume *vol; 561 struct g_raid_tr_raid1e_object *trs; 562 int nr; 563 564 vol = tr->tro_volume; 565 trs = (struct g_raid_tr_raid1e_object *)tr; 566 if (trs->trso_stopping) 567 return; 568 nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) + 569 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 570 switch(trs->trso_type) { 571 case TR_RAID1E_NONE: 572 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED) 573 return; 574 if (nr == 0) { 575 nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) + 576 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 577 g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 578 if (nr == 0) 579 return; 580 } 581 g_raid_tr_raid1e_rebuild_start(tr); 582 break; 583 case TR_RAID1E_REBUILD: 584 if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 || 585 trs->trso_failed_sd == sd) 586 g_raid_tr_raid1e_rebuild_abort(tr); 587 break; 588 case TR_RAID1E_RESYNC: 589 break; 590 } 591 } 592 593 static int 594 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr, 595 struct g_raid_subdisk *sd, u_int event) 596 { 597 598 g_raid_tr_update_state_raid1e(tr->tro_volume, sd); 599 return (0); 600 } 601 602 static int 603 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr) 604 { 605 struct g_raid_tr_raid1e_object *trs; 606 struct g_raid_volume *vol; 607 608 trs = (struct g_raid_tr_raid1e_object *)tr; 609 vol = tr->tro_volume; 610 trs->trso_starting = 0; 611 g_raid_tr_update_state_raid1e(vol, NULL); 612 return (0); 613 } 614 615 static int 616 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr) 617 { 618 struct g_raid_tr_raid1e_object 
*trs; 619 struct g_raid_volume *vol; 620 621 trs = (struct g_raid_tr_raid1e_object *)tr; 622 vol = tr->tro_volume; 623 trs->trso_starting = 0; 624 trs->trso_stopping = 1; 625 g_raid_tr_update_state_raid1e(vol, NULL); 626 return (0); 627 } 628 629 /* 630 * Select the disk to read from. Take into account: subdisk state, running 631 * error recovery, average disk load, head position and possible cache hits. 632 */ 633 #define ABS(x) (((x) >= 0) ? (x) : (-(x))) 634 static int 635 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol, 636 int no, off_t off, off_t len, u_int mask) 637 { 638 struct g_raid_subdisk *sd; 639 off_t offset; 640 int i, best, prio, bestprio; 641 642 best = -1; 643 bestprio = INT_MAX; 644 for (i = 0; i < N; i++) { 645 sd = &vol->v_subdisks[(no + i) % vol->v_disks_count]; 646 offset = off; 647 if (no + i >= vol->v_disks_count) 648 offset += vol->v_strip_size; 649 650 prio = G_RAID_SUBDISK_LOAD(sd); 651 if ((mask & (1 << sd->sd_pos)) != 0) 652 continue; 653 switch (sd->sd_state) { 654 case G_RAID_SUBDISK_S_ACTIVE: 655 break; 656 case G_RAID_SUBDISK_S_RESYNC: 657 if (offset + off < sd->sd_rebuild_pos) 658 break; 659 /* FALLTHROUGH */ 660 case G_RAID_SUBDISK_S_STALE: 661 prio += i << 24; 662 break; 663 case G_RAID_SUBDISK_S_REBUILD: 664 if (offset + off < sd->sd_rebuild_pos) 665 break; 666 /* FALLTHROUGH */ 667 default: 668 continue; 669 } 670 prio += min(sd->sd_recovery, 255) << 16; 671 /* If disk head is precisely in position - highly prefer it. */ 672 if (G_RAID_SUBDISK_POS(sd) == offset) 673 prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE; 674 else 675 /* If disk head is close to position - prefer it. 
*/ 676 if (ABS(G_RAID_SUBDISK_POS(sd) - offset) < 677 G_RAID_SUBDISK_TRACK_SIZE) 678 prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE; 679 if (prio < bestprio) { 680 bestprio = prio; 681 best = i; 682 } 683 } 684 return (best); 685 } 686 687 static void 688 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp) 689 { 690 struct g_raid_volume *vol; 691 struct g_raid_subdisk *sd; 692 struct bio_queue_head queue; 693 struct bio *cbp; 694 char *addr; 695 off_t offset, start, length, remain; 696 u_int no, strip_size; 697 int best; 698 699 vol = tr->tro_volume; 700 if ((bp->bio_flags & BIO_UNMAPPED) != 0) 701 addr = NULL; 702 else 703 addr = bp->bio_data; 704 strip_size = vol->v_strip_size; 705 V2P(vol, bp->bio_offset, &no, &offset, &start); 706 remain = bp->bio_length; 707 bioq_init(&queue); 708 while (remain > 0) { 709 length = MIN(strip_size - start, remain); 710 best = g_raid_tr_raid1e_select_read_disk(vol, 711 no, offset, length, 0); 712 KASSERT(best >= 0, ("No readable disk in volume %s!", 713 vol->v_name)); 714 no += best; 715 if (no >= vol->v_disks_count) { 716 no -= vol->v_disks_count; 717 offset += strip_size; 718 } 719 cbp = g_clone_bio(bp); 720 if (cbp == NULL) 721 goto failure; 722 cbp->bio_offset = offset + start; 723 cbp->bio_length = length; 724 if ((bp->bio_flags & BIO_UNMAPPED) != 0) { 725 cbp->bio_ma_offset += (uintptr_t)addr; 726 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; 727 cbp->bio_ma_offset %= PAGE_SIZE; 728 cbp->bio_ma_n = round_page(cbp->bio_ma_offset + 729 cbp->bio_length) / PAGE_SIZE; 730 } else 731 cbp->bio_data = addr; 732 cbp->bio_caller1 = &vol->v_subdisks[no]; 733 bioq_insert_tail(&queue, cbp); 734 no += N - best; 735 if (no >= vol->v_disks_count) { 736 no -= vol->v_disks_count; 737 offset += strip_size; 738 } 739 remain -= length; 740 addr += length; 741 start = 0; 742 } 743 while ((cbp = bioq_takefirst(&queue)) != NULL) { 744 sd = cbp->bio_caller1; 745 cbp->bio_caller1 = NULL; 746 g_raid_subdisk_iostart(sd, cbp); 747 } 748 
return; 749 failure: 750 while ((cbp = bioq_takefirst(&queue)) != NULL) 751 g_destroy_bio(cbp); 752 if (bp->bio_error == 0) 753 bp->bio_error = ENOMEM; 754 g_raid_iodone(bp, bp->bio_error); 755 } 756 757 static void 758 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp) 759 { 760 struct g_raid_volume *vol; 761 struct g_raid_subdisk *sd; 762 struct bio_queue_head queue; 763 struct bio *cbp; 764 char *addr; 765 off_t offset, start, length, remain; 766 u_int no, strip_size; 767 int i; 768 769 vol = tr->tro_volume; 770 if ((bp->bio_flags & BIO_UNMAPPED) != 0) 771 addr = NULL; 772 else 773 addr = bp->bio_data; 774 strip_size = vol->v_strip_size; 775 V2P(vol, bp->bio_offset, &no, &offset, &start); 776 remain = bp->bio_length; 777 bioq_init(&queue); 778 while (remain > 0) { 779 length = MIN(strip_size - start, remain); 780 for (i = 0; i < N; i++) { 781 sd = &vol->v_subdisks[no]; 782 switch (sd->sd_state) { 783 case G_RAID_SUBDISK_S_ACTIVE: 784 case G_RAID_SUBDISK_S_STALE: 785 case G_RAID_SUBDISK_S_RESYNC: 786 break; 787 case G_RAID_SUBDISK_S_REBUILD: 788 if (offset + start >= sd->sd_rebuild_pos) 789 goto nextdisk; 790 break; 791 default: 792 goto nextdisk; 793 } 794 cbp = g_clone_bio(bp); 795 if (cbp == NULL) 796 goto failure; 797 cbp->bio_offset = offset + start; 798 cbp->bio_length = length; 799 if ((bp->bio_flags & BIO_UNMAPPED) != 0 && 800 bp->bio_cmd != BIO_DELETE) { 801 cbp->bio_ma_offset += (uintptr_t)addr; 802 cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE; 803 cbp->bio_ma_offset %= PAGE_SIZE; 804 cbp->bio_ma_n = round_page(cbp->bio_ma_offset + 805 cbp->bio_length) / PAGE_SIZE; 806 } else 807 cbp->bio_data = addr; 808 cbp->bio_caller1 = sd; 809 bioq_insert_tail(&queue, cbp); 810 nextdisk: 811 if (++no >= vol->v_disks_count) { 812 no = 0; 813 offset += strip_size; 814 } 815 } 816 remain -= length; 817 if (bp->bio_cmd != BIO_DELETE) 818 addr += length; 819 start = 0; 820 } 821 while ((cbp = bioq_takefirst(&queue)) != NULL) { 822 sd = 
cbp->bio_caller1; 823 cbp->bio_caller1 = NULL; 824 g_raid_subdisk_iostart(sd, cbp); 825 } 826 return; 827 failure: 828 while ((cbp = bioq_takefirst(&queue)) != NULL) 829 g_destroy_bio(cbp); 830 if (bp->bio_error == 0) 831 bp->bio_error = ENOMEM; 832 g_raid_iodone(bp, bp->bio_error); 833 } 834 835 static void 836 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp) 837 { 838 struct g_raid_volume *vol; 839 struct g_raid_tr_raid1e_object *trs; 840 841 vol = tr->tro_volume; 842 trs = (struct g_raid_tr_raid1e_object *)tr; 843 if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL && 844 vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL && 845 vol->v_state != G_RAID_VOLUME_S_DEGRADED) { 846 g_raid_iodone(bp, EIO); 847 return; 848 } 849 /* 850 * If we're rebuilding, squeeze in rebuild activity every so often, 851 * even when the disk is busy. Be sure to only count real I/O 852 * to the disk. All 'SPECIAL' I/O is traffic generated to the disk 853 * by this module. 854 */ 855 if (trs->trso_failed_sd != NULL && 856 !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) { 857 /* Make this new or running now round short. 
*/ 858 trs->trso_recover_slabs = 0; 859 if (--trs->trso_fair_io <= 0) { 860 trs->trso_fair_io = g_raid1e_rebuild_fair_io; 861 g_raid_tr_raid1e_rebuild_some(tr); 862 } 863 } 864 switch (bp->bio_cmd) { 865 case BIO_READ: 866 g_raid_tr_iostart_raid1e_read(tr, bp); 867 break; 868 case BIO_WRITE: 869 case BIO_DELETE: 870 g_raid_tr_iostart_raid1e_write(tr, bp); 871 break; 872 case BIO_FLUSH: 873 g_raid_tr_flush_common(tr, bp); 874 break; 875 default: 876 KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 877 bp->bio_cmd, vol->v_name)); 878 break; 879 } 880 } 881 882 static void 883 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr, 884 struct g_raid_subdisk *sd, struct bio *bp) 885 { 886 struct bio *cbp; 887 struct g_raid_subdisk *nsd; 888 struct g_raid_volume *vol; 889 struct bio *pbp; 890 struct g_raid_tr_raid1e_object *trs; 891 off_t virtual, offset, start; 892 uintptr_t mask; 893 int error, do_write, copy, disk, best; 894 895 trs = (struct g_raid_tr_raid1e_object *)tr; 896 vol = tr->tro_volume; 897 if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) { 898 if (trs->trso_type == TR_RAID1E_REBUILD) { 899 nsd = trs->trso_failed_sd; 900 if (bp->bio_cmd == BIO_READ) { 901 902 /* Immediately abort rebuild, if requested. */ 903 if (trs->trso_flags & TR_RAID1E_F_ABORT) { 904 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 905 g_raid_tr_raid1e_rebuild_abort(tr); 906 return; 907 } 908 909 /* On read error, skip and cross fingers. */ 910 if (bp->bio_error != 0) { 911 G_RAID_LOGREQ(0, bp, 912 "Read error during rebuild (%d), " 913 "possible data loss!", 914 bp->bio_error); 915 goto rebuild_round_done; 916 } 917 918 /* 919 * The read operation finished, queue the 920 * write and get out. 
921 */ 922 G_RAID_LOGREQ(3, bp, "Rebuild read done: %d", 923 bp->bio_error); 924 bp->bio_cmd = BIO_WRITE; 925 bp->bio_cflags = G_RAID_BIO_FLAG_SYNC; 926 bp->bio_offset = nsd->sd_rebuild_pos; 927 G_RAID_LOGREQ(3, bp, "Queueing rebuild write."); 928 g_raid_subdisk_iostart(nsd, bp); 929 } else { 930 /* 931 * The write operation just finished. Do 932 * another. We keep cloning the master bio 933 * since it has the right buffers allocated to 934 * it. 935 */ 936 G_RAID_LOGREQ(3, bp, "Rebuild write done: %d", 937 bp->bio_error); 938 if (bp->bio_error != 0 || 939 trs->trso_flags & TR_RAID1E_F_ABORT) { 940 if ((trs->trso_flags & 941 TR_RAID1E_F_ABORT) == 0) { 942 g_raid_tr_raid1e_fail_disk(sd->sd_softc, 943 nsd, nsd->sd_disk); 944 } 945 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 946 g_raid_tr_raid1e_rebuild_abort(tr); 947 return; 948 } 949 rebuild_round_done: 950 trs->trso_flags &= ~TR_RAID1E_F_LOCKED; 951 g_raid_unlock_range(tr->tro_volume, 952 trs->trso_lock_pos, trs->trso_lock_len); 953 nsd->sd_rebuild_pos += bp->bio_length; 954 if (nsd->sd_rebuild_pos >= nsd->sd_size) { 955 g_raid_tr_raid1e_rebuild_finish(tr); 956 return; 957 } 958 959 /* Abort rebuild if we are stopping */ 960 if (trs->trso_stopping) { 961 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 962 g_raid_tr_raid1e_rebuild_abort(tr); 963 return; 964 } 965 966 if (--trs->trso_meta_update <= 0) { 967 g_raid_write_metadata(vol->v_softc, 968 vol, nsd, nsd->sd_disk); 969 trs->trso_meta_update = 970 g_raid1e_rebuild_meta_update; 971 /* Compensate short rebuild I/Os. */ 972 if ((vol->v_disks_count % N) != 0 && 973 vol->v_strip_size < 974 g_raid1e_rebuild_slab) { 975 trs->trso_meta_update *= 976 g_raid1e_rebuild_slab; 977 trs->trso_meta_update /= 978 vol->v_strip_size; 979 } 980 } 981 trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME; 982 if (--trs->trso_recover_slabs <= 0) 983 return; 984 /* Run next rebuild iteration. 
*/ 985 g_raid_tr_raid1e_rebuild_some(tr); 986 } 987 } else if (trs->trso_type == TR_RAID1E_RESYNC) { 988 /* 989 * read good sd, read bad sd in parallel. when both 990 * done, compare the buffers. write good to the bad 991 * if different. do the next bit of work. 992 */ 993 panic("Somehow, we think we're doing a resync"); 994 } 995 return; 996 } 997 pbp = bp->bio_parent; 998 pbp->bio_inbed++; 999 mask = (intptr_t)bp->bio_caller2; 1000 if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) { 1001 /* 1002 * Read failed on first drive. Retry the read error on 1003 * another disk drive, if available, before erroring out the 1004 * read. 1005 */ 1006 sd->sd_disk->d_read_errs++; 1007 G_RAID_LOGREQ(0, bp, 1008 "Read error (%d), %d read errors total", 1009 bp->bio_error, sd->sd_disk->d_read_errs); 1010 1011 /* 1012 * If there are too many read errors, we move to degraded. 1013 * XXX Do we want to FAIL the drive (eg, make the user redo 1014 * everything to get it back in sync), or just degrade the 1015 * drive, which kicks off a resync? 1016 */ 1017 do_write = 0; 1018 if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) 1019 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1020 else if (mask == 0) 1021 do_write = 1; 1022 1023 /* Restore what we were doing. */ 1024 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1025 V2P(vol, virtual, &disk, &offset, &start); 1026 1027 /* Find the other disk, and try to do the I/O to it. 
*/ 1028 mask |= 1 << copy; 1029 best = g_raid_tr_raid1e_select_read_disk(vol, 1030 disk, offset, start, mask); 1031 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { 1032 disk += best; 1033 if (disk >= vol->v_disks_count) { 1034 disk -= vol->v_disks_count; 1035 offset += vol->v_strip_size; 1036 } 1037 cbp->bio_offset = offset + start; 1038 cbp->bio_length = bp->bio_length; 1039 cbp->bio_data = bp->bio_data; 1040 cbp->bio_ma = bp->bio_ma; 1041 cbp->bio_ma_offset = bp->bio_ma_offset; 1042 cbp->bio_ma_n = bp->bio_ma_n; 1043 g_destroy_bio(bp); 1044 nsd = &vol->v_subdisks[disk]; 1045 G_RAID_LOGREQ(2, cbp, "Retrying read from %d", 1046 nsd->sd_pos); 1047 if (do_write) 1048 mask |= 1 << 31; 1049 if ((mask & (1U << 31)) != 0) 1050 sd->sd_recovery++; 1051 cbp->bio_caller2 = (void *)mask; 1052 if (do_write) { 1053 cbp->bio_caller1 = nsd; 1054 /* Lock callback starts I/O */ 1055 g_raid_lock_range(sd->sd_volume, 1056 virtual, cbp->bio_length, pbp, cbp); 1057 } else { 1058 g_raid_subdisk_iostart(nsd, cbp); 1059 } 1060 return; 1061 } 1062 /* 1063 * We can't retry. Return the original error by falling 1064 * through. This will happen when there's only one good disk. 1065 * We don't need to fail the raid, since its actual state is 1066 * based on the state of the subdisks. 1067 */ 1068 G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it"); 1069 } 1070 if (bp->bio_cmd == BIO_READ && 1071 bp->bio_error == 0 && 1072 (mask & (1U << 31)) != 0) { 1073 G_RAID_LOGREQ(3, bp, "Recovered data from other drive"); 1074 1075 /* Restore what we were doing. */ 1076 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1077 V2P(vol, virtual, &disk, &offset, &start); 1078 1079 /* Find best disk to write. 
*/ 1080 best = g_raid_tr_raid1e_select_read_disk(vol, 1081 disk, offset, start, ~mask); 1082 if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) { 1083 disk += best; 1084 if (disk >= vol->v_disks_count) { 1085 disk -= vol->v_disks_count; 1086 offset += vol->v_strip_size; 1087 } 1088 cbp->bio_offset = offset + start; 1089 cbp->bio_cmd = BIO_WRITE; 1090 cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP; 1091 cbp->bio_caller2 = (void *)mask; 1092 g_destroy_bio(bp); 1093 G_RAID_LOGREQ(2, cbp, 1094 "Attempting bad sector remap on failing drive."); 1095 g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp); 1096 return; 1097 } 1098 } 1099 if ((mask & (1U << 31)) != 0) { 1100 /* 1101 * We're done with a recovery, mark the range as unlocked. 1102 * For any write errors, we aggressively fail the disk since 1103 * there was both a READ and a WRITE error at this location. 1104 * Both types of errors generally indicates the drive is on 1105 * the verge of total failure anyway. Better to stop trusting 1106 * it now. However, we need to reset error to 0 in that case 1107 * because we're not failing the original I/O which succeeded. 1108 */ 1109 1110 /* Restore what we were doing. 
*/ 1111 P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©); 1112 V2P(vol, virtual, &disk, &offset, &start); 1113 1114 for (copy = 0; copy < N; copy++) { 1115 if ((mask & (1 << copy) ) != 0) 1116 vol->v_subdisks[(disk + copy) % 1117 vol->v_disks_count].sd_recovery--; 1118 } 1119 1120 if (bp->bio_cmd == BIO_WRITE && bp->bio_error) { 1121 G_RAID_LOGREQ(0, bp, "Remap write failed: " 1122 "failing subdisk."); 1123 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1124 bp->bio_error = 0; 1125 } 1126 G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error); 1127 g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length); 1128 } 1129 if (pbp->bio_cmd != BIO_READ) { 1130 if (pbp->bio_inbed == 1 || pbp->bio_error != 0) 1131 pbp->bio_error = bp->bio_error; 1132 if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) { 1133 G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk."); 1134 g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk); 1135 } 1136 error = pbp->bio_error; 1137 } else 1138 error = bp->bio_error; 1139 g_destroy_bio(bp); 1140 if (pbp->bio_children == pbp->bio_inbed) { 1141 pbp->bio_completed = pbp->bio_length; 1142 g_raid_iodone(pbp, error); 1143 } 1144 } 1145 1146 static int 1147 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, 1148 void *virtual, vm_offset_t physical, off_t boffset, size_t blength) 1149 { 1150 struct g_raid_volume *vol; 1151 struct g_raid_subdisk *sd; 1152 struct bio_queue_head queue; 1153 char *addr; 1154 off_t offset, start, length, remain; 1155 u_int no, strip_size; 1156 int i, error; 1157 1158 vol = tr->tro_volume; 1159 addr = virtual; 1160 strip_size = vol->v_strip_size; 1161 V2P(vol, boffset, &no, &offset, &start); 1162 remain = blength; 1163 bioq_init(&queue); 1164 while (remain > 0) { 1165 length = MIN(strip_size - start, remain); 1166 for (i = 0; i < N; i++) { 1167 sd = &vol->v_subdisks[no]; 1168 switch (sd->sd_state) { 1169 case G_RAID_SUBDISK_S_ACTIVE: 1170 case G_RAID_SUBDISK_S_STALE: 1171 case 
G_RAID_SUBDISK_S_RESYNC: 1172 break; 1173 case G_RAID_SUBDISK_S_REBUILD: 1174 if (offset + start >= sd->sd_rebuild_pos) 1175 goto nextdisk; 1176 break; 1177 default: 1178 goto nextdisk; 1179 } 1180 error = g_raid_subdisk_kerneldump(sd, 1181 addr, 0, offset + start, length); 1182 if (error != 0) 1183 return (error); 1184 nextdisk: 1185 if (++no >= vol->v_disks_count) { 1186 no = 0; 1187 offset += strip_size; 1188 } 1189 } 1190 remain -= length; 1191 addr += length; 1192 start = 0; 1193 } 1194 return (0); 1195 } 1196 1197 static int 1198 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp) 1199 { 1200 struct bio *bp; 1201 struct g_raid_subdisk *sd; 1202 1203 bp = (struct bio *)argp; 1204 sd = (struct g_raid_subdisk *)bp->bio_caller1; 1205 g_raid_subdisk_iostart(sd, bp); 1206 1207 return (0); 1208 } 1209 1210 static int 1211 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr) 1212 { 1213 struct g_raid_tr_raid1e_object *trs; 1214 struct g_raid_volume *vol; 1215 1216 vol = tr->tro_volume; 1217 trs = (struct g_raid_tr_raid1e_object *)tr; 1218 trs->trso_fair_io = g_raid1e_rebuild_fair_io; 1219 trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle; 1220 /* Compensate short rebuild I/Os. */ 1221 if ((vol->v_disks_count % N) != 0 && 1222 vol->v_strip_size < g_raid1e_rebuild_slab) { 1223 trs->trso_recover_slabs *= g_raid1e_rebuild_slab; 1224 trs->trso_recover_slabs /= vol->v_strip_size; 1225 } 1226 if (trs->trso_type == TR_RAID1E_REBUILD) 1227 g_raid_tr_raid1e_rebuild_some(tr); 1228 return (0); 1229 } 1230 1231 static int 1232 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr) 1233 { 1234 struct g_raid_tr_raid1e_object *trs; 1235 1236 trs = (struct g_raid_tr_raid1e_object *)tr; 1237 1238 if (trs->trso_buffer != NULL) { 1239 free(trs->trso_buffer, M_TR_RAID1E); 1240 trs->trso_buffer = NULL; 1241 } 1242 return (0); 1243 } 1244 1245 G_RAID_TR_DECLARE(raid1e, "RAID1E"); 1246