/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO	20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE	100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE	1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE		0
#define TR_RAID1E_REBUILD	1
#define TR_RAID1E_RESYNC	2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4
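
/*
 * Per-volume state of the RAID1E transformation module.  Rebuild/resync
 * bookkeeping (type, locked range, bounce buffer, pacing counters) lives
 * here, so only one recovery transaction is in flight at any time.
 */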
struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}
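
/*
 * Illustrative example of the mapping above (not part of the original code):
 * with the default N = 2, three disks and a 64 KB strip, virtual offset
 * 200000 lies in virtual strip 3 at in-strip offset 3392.  V2P() yields
 * disk 0, strip offset 131072, start 3392; the second copy of that strip
 * sits on disk 1 at the same strip offset.  P2V(disk 0, offset 134464)
 * returns virtual offset 200000 and copy 0 again.
 */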

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	    vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
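		/*
		 * With an odd number of disks the second copy of a strip
		 * lands on a different disk pair for every strip, so one
		 * rebuild transaction must not cross a strip boundary
		 * (hence the extra clamp below).
		 */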
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing stripe boundary, correct affected virtual
	 * range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild. night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
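	/*
	 * Allocate a slab-sized bounce buffer and start the first
	 * copy cycle.
	 */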
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from. Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + len < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + len < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
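			/*
			 * Unmapped I/O: point the clone at the right pages
			 * of the parent's bio_ma array.  BIO_DELETE carries
			 * no payload, so it keeps the plain data pointer.
			 */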
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy. Be sure to only count real I/O
	 * to the disk. All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Make this new or running now round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {
				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished. Do
				 * another. We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel. when both
			 * done, compare the buffers. write good to the bad
			 * if different. do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive. Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
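		/*
		 * Track the copies already tried in the caller2 mask; bit 31
		 * is set below when the recovered data still has to be
		 * written back to the failing copy.
		 */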
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1U << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry. Return the original error by falling
		 * through. This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1U << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1U << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway. Better to stop trusting
		 * it now. However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
    off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd, addr,
			    offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");