1 /*- 2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/kernel.h> 33 #include <sys/module.h> 34 #include <sys/limits.h> 35 #include <sys/lock.h> 36 #include <sys/mutex.h> 37 #include <sys/bio.h> 38 #include <sys/sbuf.h> 39 #include <sys/sysctl.h> 40 #include <sys/malloc.h> 41 #include <sys/eventhandler.h> 42 #include <vm/uma.h> 43 #include <geom/geom.h> 44 #include <sys/proc.h> 45 #include <sys/kthread.h> 46 #include <sys/sched.h> 47 #include <geom/raid/g_raid.h> 48 #include "g_raid_md_if.h" 49 #include "g_raid_tr_if.h" 50 51 static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); 52 53 SYSCTL_DECL(_kern_geom); 54 SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); 55 u_int g_raid_aggressive_spare = 0; 56 TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare); 57 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW, 58 &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); 59 u_int g_raid_debug = 0; 60 TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug); 61 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0, 62 "Debug level"); 63 int g_raid_read_err_thresh = 10; 64 TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh); 65 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW, 66 &g_raid_read_err_thresh, 0, 67 "Number of read errors equated to disk failure"); 68 u_int g_raid_start_timeout = 30; 69 TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout); 70 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW, 71 &g_raid_start_timeout, 0, 72 "Time to wait for all array components"); 73 static u_int g_raid_clean_time = 5; 74 TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time); 75 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW, 76 &g_raid_clean_time, 0, "Mark volume as clean when idling"); 77 static u_int g_raid_disconnect_on_failure = 1; 78 TUNABLE_INT("kern.geom.raid.disconnect_on_failure", 79 &g_raid_disconnect_on_failure); 80 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, 
disconnect_on_failure, CTLFLAG_RW, 81 &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); 82 static u_int g_raid_name_format = 0; 83 TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format); 84 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW, 85 &g_raid_name_format, 0, "Providers name format."); 86 static u_int g_raid_idle_threshold = 1000000; 87 TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold); 88 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW, 89 &g_raid_idle_threshold, 1000000, 90 "Time in microseconds to consider a volume idle."); 91 92 #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ 93 G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ 94 rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ 95 G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ 96 } while (0) 97 98 LIST_HEAD(, g_raid_md_class) g_raid_md_classes = 99 LIST_HEAD_INITIALIZER(g_raid_md_classes); 100 101 LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = 102 LIST_HEAD_INITIALIZER(g_raid_tr_classes); 103 104 LIST_HEAD(, g_raid_volume) g_raid_volumes = 105 LIST_HEAD_INITIALIZER(g_raid_volumes); 106 107 static eventhandler_tag g_raid_pre_sync = NULL; 108 static int g_raid_started = 0; 109 110 static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, 111 struct g_geom *gp); 112 static g_taste_t g_raid_taste; 113 static void g_raid_init(struct g_class *mp); 114 static void g_raid_fini(struct g_class *mp); 115 116 struct g_class g_raid_class = { 117 .name = G_RAID_CLASS_NAME, 118 .version = G_VERSION, 119 .ctlreq = g_raid_ctl, 120 .taste = g_raid_taste, 121 .destroy_geom = g_raid_destroy_geom, 122 .init = g_raid_init, 123 .fini = g_raid_fini 124 }; 125 126 static void g_raid_destroy_provider(struct g_raid_volume *vol); 127 static int g_raid_update_disk(struct g_raid_disk *disk, u_int event); 128 static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); 129 static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); 130 static int g_raid_update_node(struct g_raid_softc *sc, u_int event); 131 static void g_raid_dumpconf(struct sbuf *sb, const char *indent, 132 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 133 static void g_raid_start(struct bio *bp); 134 static void g_raid_start_request(struct bio *bp); 135 static void g_raid_disk_done(struct bio *bp); 136 static void g_raid_poll(struct g_raid_softc *sc); 137 138 static const char * 139 g_raid_node_event2str(int event) 140 { 141 142 switch (event) { 143 case G_RAID_NODE_E_WAKE: 144 return ("WAKE"); 145 case G_RAID_NODE_E_START: 146 return ("START"); 147 default: 148 return ("INVALID"); 149 } 150 } 151 152 const char * 153 g_raid_disk_state2str(int state) 154 { 155 156 switch (state) { 157 case G_RAID_DISK_S_NONE: 158 return ("NONE"); 159 case G_RAID_DISK_S_OFFLINE: 160 return ("OFFLINE"); 161 case G_RAID_DISK_S_FAILED: 162 return ("FAILED"); 163 case G_RAID_DISK_S_STALE_FAILED: 164 return ("STALE_FAILED"); 165 case G_RAID_DISK_S_SPARE: 166 return ("SPARE"); 167 case G_RAID_DISK_S_STALE: 168 return ("STALE"); 169 case G_RAID_DISK_S_ACTIVE: 170 return ("ACTIVE"); 171 default: 172 return ("INVALID"); 173 } 174 } 175 176 static const char * 177 g_raid_disk_event2str(int event) 178 { 179 180 switch (event) { 181 case G_RAID_DISK_E_DISCONNECTED: 182 return ("DISCONNECTED"); 183 default: 184 return ("INVALID"); 185 } 186 } 187 188 const char * 189 g_raid_subdisk_state2str(int state) 190 { 191 192 switch 
(state) { 193 case G_RAID_SUBDISK_S_NONE: 194 return ("NONE"); 195 case G_RAID_SUBDISK_S_FAILED: 196 return ("FAILED"); 197 case G_RAID_SUBDISK_S_NEW: 198 return ("NEW"); 199 case G_RAID_SUBDISK_S_REBUILD: 200 return ("REBUILD"); 201 case G_RAID_SUBDISK_S_UNINITIALIZED: 202 return ("UNINITIALIZED"); 203 case G_RAID_SUBDISK_S_STALE: 204 return ("STALE"); 205 case G_RAID_SUBDISK_S_RESYNC: 206 return ("RESYNC"); 207 case G_RAID_SUBDISK_S_ACTIVE: 208 return ("ACTIVE"); 209 default: 210 return ("INVALID"); 211 } 212 } 213 214 static const char * 215 g_raid_subdisk_event2str(int event) 216 { 217 218 switch (event) { 219 case G_RAID_SUBDISK_E_NEW: 220 return ("NEW"); 221 case G_RAID_SUBDISK_E_DISCONNECTED: 222 return ("DISCONNECTED"); 223 default: 224 return ("INVALID"); 225 } 226 } 227 228 const char * 229 g_raid_volume_state2str(int state) 230 { 231 232 switch (state) { 233 case G_RAID_VOLUME_S_STARTING: 234 return ("STARTING"); 235 case G_RAID_VOLUME_S_BROKEN: 236 return ("BROKEN"); 237 case G_RAID_VOLUME_S_DEGRADED: 238 return ("DEGRADED"); 239 case G_RAID_VOLUME_S_SUBOPTIMAL: 240 return ("SUBOPTIMAL"); 241 case G_RAID_VOLUME_S_OPTIMAL: 242 return ("OPTIMAL"); 243 case G_RAID_VOLUME_S_UNSUPPORTED: 244 return ("UNSUPPORTED"); 245 case G_RAID_VOLUME_S_STOPPED: 246 return ("STOPPED"); 247 default: 248 return ("INVALID"); 249 } 250 } 251 252 static const char * 253 g_raid_volume_event2str(int event) 254 { 255 256 switch (event) { 257 case G_RAID_VOLUME_E_UP: 258 return ("UP"); 259 case G_RAID_VOLUME_E_DOWN: 260 return ("DOWN"); 261 case G_RAID_VOLUME_E_START: 262 return ("START"); 263 case G_RAID_VOLUME_E_STARTMD: 264 return ("STARTMD"); 265 default: 266 return ("INVALID"); 267 } 268 } 269 270 const char * 271 g_raid_volume_level2str(int level, int qual) 272 { 273 274 switch (level) { 275 case G_RAID_VOLUME_RL_RAID0: 276 return ("RAID0"); 277 case G_RAID_VOLUME_RL_RAID1: 278 return ("RAID1"); 279 case G_RAID_VOLUME_RL_RAID3: 280 if (qual == G_RAID_VOLUME_RLQ_R3P0) 281 return ("RAID3-P0"); 282 if (qual == G_RAID_VOLUME_RLQ_R3PN) 283 return ("RAID3-PN"); 284 return ("RAID3"); 285 case G_RAID_VOLUME_RL_RAID4: 286 if (qual == G_RAID_VOLUME_RLQ_R4P0) 287 return ("RAID4-P0"); 288 if (qual == G_RAID_VOLUME_RLQ_R4PN) 289 return ("RAID4-PN"); 290 return ("RAID4"); 291 case G_RAID_VOLUME_RL_RAID5: 292 if (qual == G_RAID_VOLUME_RLQ_R5RA) 293 return ("RAID5-RA"); 294 if (qual == G_RAID_VOLUME_RLQ_R5RS) 295 return ("RAID5-RS"); 296 if (qual == G_RAID_VOLUME_RLQ_R5LA) 297 return ("RAID5-LA"); 298 if (qual == G_RAID_VOLUME_RLQ_R5LS) 299 return ("RAID5-LS"); 300 return ("RAID5"); 301 case G_RAID_VOLUME_RL_RAID6: 302 if (qual == G_RAID_VOLUME_RLQ_R6RA) 303 return ("RAID6-RA"); 304 if (qual == G_RAID_VOLUME_RLQ_R6RS) 305 return ("RAID6-RS"); 306 if (qual == G_RAID_VOLUME_RLQ_R6LA) 307 return ("RAID6-LA"); 308 if (qual == G_RAID_VOLUME_RLQ_R6LS) 309 return ("RAID6-LS"); 310 return ("RAID6"); 311 case G_RAID_VOLUME_RL_RAIDMDF: 312 if (qual == G_RAID_VOLUME_RLQ_RMDFRA) 313 return ("RAIDMDF-RA"); 314 if (qual == G_RAID_VOLUME_RLQ_RMDFRS) 315 return ("RAIDMDF-RS"); 316 if (qual == G_RAID_VOLUME_RLQ_RMDFLA) 317 return ("RAIDMDF-LA"); 318 if (qual == G_RAID_VOLUME_RLQ_RMDFLS) 319 return ("RAIDMDF-LS"); 320 return ("RAIDMDF"); 321 case G_RAID_VOLUME_RL_RAID1E: 322 if (qual == G_RAID_VOLUME_RLQ_R1EA) 323 return ("RAID1E-A"); 324 if (qual == G_RAID_VOLUME_RLQ_R1EO) 325 return ("RAID1E-O"); 326 return ("RAID1E"); 327 case G_RAID_VOLUME_RL_SINGLE: 328 return ("SINGLE"); 329 case G_RAID_VOLUME_RL_CONCAT: 330 return ("CONCAT"); 
	case G_RAID_VOLUME_RL_RAID5E:
		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
			return ("RAID5E-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
			return ("RAID5E-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
			return ("RAID5E-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
			return ("RAID5E-LS");
		return ("RAID5E");
	case G_RAID_VOLUME_RL_RAID5EE:
		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
			return ("RAID5EE-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
			return ("RAID5EE-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
			return ("RAID5EE-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
			return ("RAID5EE-LS");
		return ("RAID5EE");
	case G_RAID_VOLUME_RL_RAID5R:
		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
			return ("RAID5R-RA");
		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
			return ("RAID5R-RS");
		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
			return ("RAID5R-LA");
		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
			return ("RAID5R-LS");
		return ("RAID5R");
	default:
		return ("UNKNOWN");
	}
}

int
g_raid_volume_str2level(const char *str, int *level, int *qual)
{

	*level = G_RAID_VOLUME_RL_UNKNOWN;
	*qual = G_RAID_VOLUME_RLQ_NONE;
	if (strcasecmp(str, "RAID0") == 0)
		*level = G_RAID_VOLUME_RL_RAID0;
	else if (strcasecmp(str, "RAID1") == 0)
		*level = G_RAID_VOLUME_RL_RAID1;
	else if (strcasecmp(str, "RAID3-P0") == 0) {
		*level = G_RAID_VOLUME_RL_RAID3;
		*qual = G_RAID_VOLUME_RLQ_R3P0;
	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
	    strcasecmp(str, "RAID3") == 0) {
		*level = G_RAID_VOLUME_RL_RAID3;
		*qual = G_RAID_VOLUME_RLQ_R3PN;
	} else if (strcasecmp(str, "RAID4-P0") == 0) {
		*level = G_RAID_VOLUME_RL_RAID4;
		*qual = G_RAID_VOLUME_RLQ_R4P0;
	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
	    strcasecmp(str, "RAID4") == 0) {
		*level = G_RAID_VOLUME_RL_RAID4;
		*qual = G_RAID_VOLUME_RLQ_R4PN;
	} else if (strcasecmp(str, "RAID5-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5RA;
	} else if (strcasecmp(str, "RAID5-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5RS;
	} else if (strcasecmp(str, "RAID5") == 0 ||
	    strcasecmp(str, "RAID5-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5LA;
	} else if (strcasecmp(str, "RAID5-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID5;
		*qual = G_RAID_VOLUME_RLQ_R5LS;
	} else if (strcasecmp(str, "RAID6-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6RA;
	} else if (strcasecmp(str, "RAID6-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6RS;
	} else if (strcasecmp(str, "RAID6") == 0 ||
	    strcasecmp(str, "RAID6-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6LA;
	} else if (strcasecmp(str, "RAID6-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAID6;
		*qual = G_RAID_VOLUME_RLQ_R6LS;
	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
	    strcasecmp(str, "RAIDMDF-LA") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
		*level = G_RAID_VOLUME_RL_RAIDMDF;
		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
	}
else if (strcasecmp(str, "RAID10") == 0 || 430 strcasecmp(str, "RAID1E") == 0 || 431 strcasecmp(str, "RAID1E-A") == 0) { 432 *level = G_RAID_VOLUME_RL_RAID1E; 433 *qual = G_RAID_VOLUME_RLQ_R1EA; 434 } else if (strcasecmp(str, "RAID1E-O") == 0) { 435 *level = G_RAID_VOLUME_RL_RAID1E; 436 *qual = G_RAID_VOLUME_RLQ_R1EO; 437 } else if (strcasecmp(str, "SINGLE") == 0) 438 *level = G_RAID_VOLUME_RL_SINGLE; 439 else if (strcasecmp(str, "CONCAT") == 0) 440 *level = G_RAID_VOLUME_RL_CONCAT; 441 else if (strcasecmp(str, "RAID5E-RA") == 0) { 442 *level = G_RAID_VOLUME_RL_RAID5E; 443 *qual = G_RAID_VOLUME_RLQ_R5ERA; 444 } else if (strcasecmp(str, "RAID5E-RS") == 0) { 445 *level = G_RAID_VOLUME_RL_RAID5E; 446 *qual = G_RAID_VOLUME_RLQ_R5ERS; 447 } else if (strcasecmp(str, "RAID5E") == 0 || 448 strcasecmp(str, "RAID5E-LA") == 0) { 449 *level = G_RAID_VOLUME_RL_RAID5E; 450 *qual = G_RAID_VOLUME_RLQ_R5ELA; 451 } else if (strcasecmp(str, "RAID5E-LS") == 0) { 452 *level = G_RAID_VOLUME_RL_RAID5E; 453 *qual = G_RAID_VOLUME_RLQ_R5ELS; 454 } else if (strcasecmp(str, "RAID5EE-RA") == 0) { 455 *level = G_RAID_VOLUME_RL_RAID5EE; 456 *qual = G_RAID_VOLUME_RLQ_R5EERA; 457 } else if (strcasecmp(str, "RAID5EE-RS") == 0) { 458 *level = G_RAID_VOLUME_RL_RAID5EE; 459 *qual = G_RAID_VOLUME_RLQ_R5EERS; 460 } else if (strcasecmp(str, "RAID5EE") == 0 || 461 strcasecmp(str, "RAID5EE-LA") == 0) { 462 *level = G_RAID_VOLUME_RL_RAID5EE; 463 *qual = G_RAID_VOLUME_RLQ_R5EELA; 464 } else if (strcasecmp(str, "RAID5EE-LS") == 0) { 465 *level = G_RAID_VOLUME_RL_RAID5EE; 466 *qual = G_RAID_VOLUME_RLQ_R5EELS; 467 } else if (strcasecmp(str, "RAID5R-RA") == 0) { 468 *level = G_RAID_VOLUME_RL_RAID5R; 469 *qual = G_RAID_VOLUME_RLQ_R5RRA; 470 } else if (strcasecmp(str, "RAID5R-RS") == 0) { 471 *level = G_RAID_VOLUME_RL_RAID5R; 472 *qual = G_RAID_VOLUME_RLQ_R5RRS; 473 } else if (strcasecmp(str, "RAID5R") == 0 || 474 strcasecmp(str, "RAID5R-LA") == 0) { 475 *level = G_RAID_VOLUME_RL_RAID5R; 476 *qual = G_RAID_VOLUME_RLQ_R5RLA; 477 } else if (strcasecmp(str, "RAID5R-LS") == 0) { 478 *level = G_RAID_VOLUME_RL_RAID5R; 479 *qual = G_RAID_VOLUME_RLQ_R5RLS; 480 } else 481 return (-1); 482 return (0); 483 } 484 485 const char * 486 g_raid_get_diskname(struct g_raid_disk *disk) 487 { 488 489 if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) 490 return ("[unknown]"); 491 return (disk->d_consumer->provider->name); 492 } 493 494 void 495 g_raid_report_disk_state(struct g_raid_disk *disk) 496 { 497 struct g_raid_subdisk *sd; 498 int len, state; 499 uint32_t s; 500 501 if (disk->d_consumer == NULL) 502 return; 503 if (disk->d_state == G_RAID_DISK_S_FAILED || 504 disk->d_state == G_RAID_DISK_S_STALE_FAILED) { 505 s = G_STATE_FAILED; 506 } else { 507 state = G_RAID_SUBDISK_S_ACTIVE; 508 TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { 509 if (sd->sd_state < state) 510 state = sd->sd_state; 511 } 512 if (state == G_RAID_SUBDISK_S_FAILED) 513 s = G_STATE_FAILED; 514 else if (state == G_RAID_SUBDISK_S_NEW || 515 state == G_RAID_SUBDISK_S_REBUILD) 516 s = G_STATE_REBUILD; 517 else if (state == G_RAID_SUBDISK_S_STALE || 518 state == G_RAID_SUBDISK_S_RESYNC) 519 s = G_STATE_RESYNC; 520 else 521 s = G_STATE_ACTIVE; 522 } 523 len = sizeof(s); 524 g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); 525 G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", 526 g_raid_get_diskname(disk), s); 527 } 528 529 void 530 g_raid_change_disk_state(struct g_raid_disk *disk, int state) 531 { 532 533 G_RAID_DEBUG1(0, disk->d_softc, "Disk %s 
state changed from %s to %s.", 534 g_raid_get_diskname(disk), 535 g_raid_disk_state2str(disk->d_state), 536 g_raid_disk_state2str(state)); 537 disk->d_state = state; 538 g_raid_report_disk_state(disk); 539 } 540 541 void 542 g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) 543 { 544 545 G_RAID_DEBUG1(0, sd->sd_softc, 546 "Subdisk %s:%d-%s state changed from %s to %s.", 547 sd->sd_volume->v_name, sd->sd_pos, 548 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", 549 g_raid_subdisk_state2str(sd->sd_state), 550 g_raid_subdisk_state2str(state)); 551 sd->sd_state = state; 552 if (sd->sd_disk) 553 g_raid_report_disk_state(sd->sd_disk); 554 } 555 556 void 557 g_raid_change_volume_state(struct g_raid_volume *vol, int state) 558 { 559 560 G_RAID_DEBUG1(0, vol->v_softc, 561 "Volume %s state changed from %s to %s.", 562 vol->v_name, 563 g_raid_volume_state2str(vol->v_state), 564 g_raid_volume_state2str(state)); 565 vol->v_state = state; 566 } 567 568 /* 569 * --- Events handling functions --- 570 * Events in geom_raid are used to maintain subdisks and volumes status 571 * from one thread to simplify locking. 572 */ 573 static void 574 g_raid_event_free(struct g_raid_event *ep) 575 { 576 577 free(ep, M_RAID); 578 } 579 580 int 581 g_raid_event_send(void *arg, int event, int flags) 582 { 583 struct g_raid_softc *sc; 584 struct g_raid_event *ep; 585 int error; 586 587 if ((flags & G_RAID_EVENT_VOLUME) != 0) { 588 sc = ((struct g_raid_volume *)arg)->v_softc; 589 } else if ((flags & G_RAID_EVENT_DISK) != 0) { 590 sc = ((struct g_raid_disk *)arg)->d_softc; 591 } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { 592 sc = ((struct g_raid_subdisk *)arg)->sd_softc; 593 } else { 594 sc = arg; 595 } 596 ep = malloc(sizeof(*ep), M_RAID, 597 sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); 598 if (ep == NULL) 599 return (ENOMEM); 600 ep->e_tgt = arg; 601 ep->e_event = event; 602 ep->e_flags = flags; 603 ep->e_error = 0; 604 G_RAID_DEBUG1(4, sc, "Sending event %p. 
Waking up %p.", ep, sc); 605 mtx_lock(&sc->sc_queue_mtx); 606 TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); 607 mtx_unlock(&sc->sc_queue_mtx); 608 wakeup(sc); 609 610 if ((flags & G_RAID_EVENT_WAIT) == 0) 611 return (0); 612 613 sx_assert(&sc->sc_lock, SX_XLOCKED); 614 G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); 615 sx_xunlock(&sc->sc_lock); 616 while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { 617 mtx_lock(&sc->sc_queue_mtx); 618 MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", 619 hz * 5); 620 } 621 error = ep->e_error; 622 g_raid_event_free(ep); 623 sx_xlock(&sc->sc_lock); 624 return (error); 625 } 626 627 static void 628 g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) 629 { 630 struct g_raid_event *ep, *tmpep; 631 632 sx_assert(&sc->sc_lock, SX_XLOCKED); 633 634 mtx_lock(&sc->sc_queue_mtx); 635 TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { 636 if (ep->e_tgt != tgt) 637 continue; 638 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 639 if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) 640 g_raid_event_free(ep); 641 else { 642 ep->e_error = ECANCELED; 643 wakeup(ep); 644 } 645 } 646 mtx_unlock(&sc->sc_queue_mtx); 647 } 648 649 static int 650 g_raid_event_check(struct g_raid_softc *sc, void *tgt) 651 { 652 struct g_raid_event *ep; 653 int res = 0; 654 655 sx_assert(&sc->sc_lock, SX_XLOCKED); 656 657 mtx_lock(&sc->sc_queue_mtx); 658 TAILQ_FOREACH(ep, &sc->sc_events, e_next) { 659 if (ep->e_tgt != tgt) 660 continue; 661 res = 1; 662 break; 663 } 664 mtx_unlock(&sc->sc_queue_mtx); 665 return (res); 666 } 667 668 /* 669 * Return the number of disks in given state. 670 * If state is equal to -1, count all connected disks. 671 */ 672 u_int 673 g_raid_ndisks(struct g_raid_softc *sc, int state) 674 { 675 struct g_raid_disk *disk; 676 u_int n; 677 678 sx_assert(&sc->sc_lock, SX_LOCKED); 679 680 n = 0; 681 TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { 682 if (disk->d_state == state || state == -1) 683 n++; 684 } 685 return (n); 686 } 687 688 /* 689 * Return the number of subdisks in given state. 690 * If state is equal to -1, count all connected disks. 691 */ 692 u_int 693 g_raid_nsubdisks(struct g_raid_volume *vol, int state) 694 { 695 struct g_raid_subdisk *subdisk; 696 struct g_raid_softc *sc; 697 u_int i, n ; 698 699 sc = vol->v_softc; 700 sx_assert(&sc->sc_lock, SX_LOCKED); 701 702 n = 0; 703 for (i = 0; i < vol->v_disks_count; i++) { 704 subdisk = &vol->v_subdisks[i]; 705 if ((state == -1 && 706 subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || 707 subdisk->sd_state == state) 708 n++; 709 } 710 return (n); 711 } 712 713 /* 714 * Return the first subdisk in given state. 715 * If state is equal to -1, then the first connected disks. 
716 */ 717 struct g_raid_subdisk * 718 g_raid_get_subdisk(struct g_raid_volume *vol, int state) 719 { 720 struct g_raid_subdisk *sd; 721 struct g_raid_softc *sc; 722 u_int i; 723 724 sc = vol->v_softc; 725 sx_assert(&sc->sc_lock, SX_LOCKED); 726 727 for (i = 0; i < vol->v_disks_count; i++) { 728 sd = &vol->v_subdisks[i]; 729 if ((state == -1 && 730 sd->sd_state != G_RAID_SUBDISK_S_NONE) || 731 sd->sd_state == state) 732 return (sd); 733 } 734 return (NULL); 735 } 736 737 struct g_consumer * 738 g_raid_open_consumer(struct g_raid_softc *sc, const char *name) 739 { 740 struct g_consumer *cp; 741 struct g_provider *pp; 742 743 g_topology_assert(); 744 745 if (strncmp(name, "/dev/", 5) == 0) 746 name += 5; 747 pp = g_provider_by_name(name); 748 if (pp == NULL) 749 return (NULL); 750 cp = g_new_consumer(sc->sc_geom); 751 if (g_attach(cp, pp) != 0) { 752 g_destroy_consumer(cp); 753 return (NULL); 754 } 755 if (g_access(cp, 1, 1, 1) != 0) { 756 g_detach(cp); 757 g_destroy_consumer(cp); 758 return (NULL); 759 } 760 return (cp); 761 } 762 763 static u_int 764 g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) 765 { 766 struct bio *bp; 767 u_int nreqs = 0; 768 769 mtx_lock(&sc->sc_queue_mtx); 770 TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { 771 if (bp->bio_from == cp) 772 nreqs++; 773 } 774 mtx_unlock(&sc->sc_queue_mtx); 775 return (nreqs); 776 } 777 778 u_int 779 g_raid_nopens(struct g_raid_softc *sc) 780 { 781 struct g_raid_volume *vol; 782 u_int opens; 783 784 opens = 0; 785 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 786 if (vol->v_provider_open != 0) 787 opens++; 788 } 789 return (opens); 790 } 791 792 static int 793 g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) 794 { 795 796 if (cp->index > 0) { 797 G_RAID_DEBUG1(2, sc, 798 "I/O requests for %s exist, can't destroy it now.", 799 cp->provider->name); 800 return (1); 801 } 802 if (g_raid_nrequests(sc, cp) > 0) { 803 G_RAID_DEBUG1(2, sc, 804 "I/O requests for %s in queue, can't destroy it now.", 805 cp->provider->name); 806 return (1); 807 } 808 return (0); 809 } 810 811 static void 812 g_raid_destroy_consumer(void *arg, int flags __unused) 813 { 814 struct g_consumer *cp; 815 816 g_topology_assert(); 817 818 cp = arg; 819 G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); 820 g_detach(cp); 821 g_destroy_consumer(cp); 822 } 823 824 void 825 g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) 826 { 827 struct g_provider *pp; 828 int retaste_wait; 829 830 g_topology_assert_not(); 831 832 g_topology_lock(); 833 cp->private = NULL; 834 if (g_raid_consumer_is_busy(sc, cp)) 835 goto out; 836 pp = cp->provider; 837 retaste_wait = 0; 838 if (cp->acw == 1) { 839 if ((pp->geom->flags & G_GEOM_WITHER) == 0) 840 retaste_wait = 1; 841 } 842 if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) 843 g_access(cp, -cp->acr, -cp->acw, -cp->ace); 844 if (retaste_wait) { 845 /* 846 * After retaste event was send (inside g_access()), we can send 847 * event to detach and destroy consumer. 848 * A class, which has consumer to the given provider connected 849 * will not receive retaste event for the provider. 850 * This is the way how I ignore retaste events when I close 851 * consumers opened for write: I detach and destroy consumer 852 * after retaste event is sent. 
853 */ 854 g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); 855 goto out; 856 } 857 G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); 858 g_detach(cp); 859 g_destroy_consumer(cp); 860 out: 861 g_topology_unlock(); 862 } 863 864 static void 865 g_raid_orphan(struct g_consumer *cp) 866 { 867 struct g_raid_disk *disk; 868 869 g_topology_assert(); 870 871 disk = cp->private; 872 if (disk == NULL) 873 return; 874 g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, 875 G_RAID_EVENT_DISK); 876 } 877 878 static int 879 g_raid_clean(struct g_raid_volume *vol, int acw) 880 { 881 struct g_raid_softc *sc; 882 int timeout; 883 884 sc = vol->v_softc; 885 g_topology_assert_not(); 886 sx_assert(&sc->sc_lock, SX_XLOCKED); 887 888 // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) 889 // return (0); 890 if (!vol->v_dirty) 891 return (0); 892 if (vol->v_writes > 0) 893 return (0); 894 if (acw > 0 || (acw == -1 && 895 vol->v_provider != NULL && vol->v_provider->acw > 0)) { 896 timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); 897 if (timeout > 0) 898 return (timeout); 899 } 900 vol->v_dirty = 0; 901 G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", 902 vol->v_name); 903 g_raid_write_metadata(sc, vol, NULL, NULL); 904 return (0); 905 } 906 907 static void 908 g_raid_dirty(struct g_raid_volume *vol) 909 { 910 struct g_raid_softc *sc; 911 912 sc = vol->v_softc; 913 g_topology_assert_not(); 914 sx_assert(&sc->sc_lock, SX_XLOCKED); 915 916 // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) 917 // return; 918 vol->v_dirty = 1; 919 G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", 920 vol->v_name); 921 g_raid_write_metadata(sc, vol, NULL, NULL); 922 } 923 924 void 925 g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) 926 { 927 struct g_raid_softc *sc; 928 struct g_raid_volume *vol; 929 struct g_raid_subdisk *sd; 930 struct bio_queue_head queue; 931 struct bio *cbp; 932 int i; 933 934 vol = tr->tro_volume; 935 sc = vol->v_softc; 936 937 /* 938 * Allocate all bios before sending any request, so we can return 939 * ENOMEM in nice and clean way. 
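 * If cloning fails part way, every clone already queued is destroyed
 * and the parent bio is completed with ENOMEM (see the failure label).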
940 */ 941 bioq_init(&queue); 942 for (i = 0; i < vol->v_disks_count; i++) { 943 sd = &vol->v_subdisks[i]; 944 if (sd->sd_state == G_RAID_SUBDISK_S_NONE || 945 sd->sd_state == G_RAID_SUBDISK_S_FAILED) 946 continue; 947 cbp = g_clone_bio(bp); 948 if (cbp == NULL) 949 goto failure; 950 cbp->bio_caller1 = sd; 951 bioq_insert_tail(&queue, cbp); 952 } 953 for (cbp = bioq_first(&queue); cbp != NULL; 954 cbp = bioq_first(&queue)) { 955 bioq_remove(&queue, cbp); 956 sd = cbp->bio_caller1; 957 cbp->bio_caller1 = NULL; 958 g_raid_subdisk_iostart(sd, cbp); 959 } 960 return; 961 failure: 962 for (cbp = bioq_first(&queue); cbp != NULL; 963 cbp = bioq_first(&queue)) { 964 bioq_remove(&queue, cbp); 965 g_destroy_bio(cbp); 966 } 967 if (bp->bio_error == 0) 968 bp->bio_error = ENOMEM; 969 g_raid_iodone(bp, bp->bio_error); 970 } 971 972 static void 973 g_raid_tr_kerneldump_common_done(struct bio *bp) 974 { 975 976 bp->bio_flags |= BIO_DONE; 977 } 978 979 int 980 g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, 981 void *virtual, vm_offset_t physical, off_t offset, size_t length) 982 { 983 struct g_raid_softc *sc; 984 struct g_raid_volume *vol; 985 struct bio bp; 986 987 vol = tr->tro_volume; 988 sc = vol->v_softc; 989 990 bzero(&bp, sizeof(bp)); 991 bp.bio_cmd = BIO_WRITE; 992 bp.bio_done = g_raid_tr_kerneldump_common_done; 993 bp.bio_attribute = NULL; 994 bp.bio_offset = offset; 995 bp.bio_length = length; 996 bp.bio_data = virtual; 997 bp.bio_to = vol->v_provider; 998 999 g_raid_start(&bp); 1000 while (!(bp.bio_flags & BIO_DONE)) { 1001 G_RAID_DEBUG1(4, sc, "Poll..."); 1002 g_raid_poll(sc); 1003 DELAY(10); 1004 } 1005 1006 return (bp.bio_error != 0 ? EIO : 0); 1007 } 1008 1009 static int 1010 g_raid_dump(void *arg, 1011 void *virtual, vm_offset_t physical, off_t offset, size_t length) 1012 { 1013 struct g_raid_volume *vol; 1014 int error; 1015 1016 vol = (struct g_raid_volume *)arg; 1017 G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", 1018 (long long unsigned)offset, (long long unsigned)length); 1019 1020 error = G_RAID_TR_KERNELDUMP(vol->v_tr, 1021 virtual, physical, offset, length); 1022 return (error); 1023 } 1024 1025 static void 1026 g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) 1027 { 1028 struct g_kerneldump *gkd; 1029 struct g_provider *pp; 1030 struct g_raid_volume *vol; 1031 1032 gkd = (struct g_kerneldump*)bp->bio_data; 1033 pp = bp->bio_to; 1034 vol = pp->private; 1035 g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", 1036 pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); 1037 gkd->di.dumper = g_raid_dump; 1038 gkd->di.priv = vol; 1039 gkd->di.blocksize = vol->v_sectorsize; 1040 gkd->di.maxiosize = DFLTPHYS; 1041 gkd->di.mediaoffset = gkd->offset; 1042 if ((gkd->offset + gkd->length) > vol->v_mediasize) 1043 gkd->length = vol->v_mediasize - gkd->offset; 1044 gkd->di.mediasize = gkd->length; 1045 g_io_deliver(bp, 0); 1046 } 1047 1048 static void 1049 g_raid_start(struct bio *bp) 1050 { 1051 struct g_raid_softc *sc; 1052 1053 sc = bp->bio_to->geom->softc; 1054 /* 1055 * If sc == NULL or there are no valid disks, provider's error 1056 * should be set and g_raid_start() should not be called at all. 
1057 */ 1058 // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, 1059 // ("Provider's error should be set (error=%d)(mirror=%s).", 1060 // bp->bio_to->error, bp->bio_to->name)); 1061 G_RAID_LOGREQ(3, bp, "Request received."); 1062 1063 switch (bp->bio_cmd) { 1064 case BIO_READ: 1065 case BIO_WRITE: 1066 case BIO_DELETE: 1067 case BIO_FLUSH: 1068 break; 1069 case BIO_GETATTR: 1070 if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) 1071 g_raid_kerneldump(sc, bp); 1072 else 1073 g_io_deliver(bp, EOPNOTSUPP); 1074 return; 1075 default: 1076 g_io_deliver(bp, EOPNOTSUPP); 1077 return; 1078 } 1079 mtx_lock(&sc->sc_queue_mtx); 1080 bioq_disksort(&sc->sc_queue, bp); 1081 mtx_unlock(&sc->sc_queue_mtx); 1082 if (!dumping) { 1083 G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); 1084 wakeup(sc); 1085 } 1086 } 1087 1088 static int 1089 g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) 1090 { 1091 /* 1092 * 5 cases: 1093 * (1) bp entirely below NO 1094 * (2) bp entirely above NO 1095 * (3) bp start below, but end in range YES 1096 * (4) bp entirely within YES 1097 * (5) bp starts within, ends above YES 1098 * 1099 * lock range 10-19 (offset 10 length 10) 1100 * (1) 1-5: first if kicks it out 1101 * (2) 30-35: second if kicks it out 1102 * (3) 5-15: passes both ifs 1103 * (4) 12-14: passes both ifs 1104 * (5) 19-20: passes both 1105 */ 1106 off_t lend = lstart + len - 1; 1107 off_t bstart = bp->bio_offset; 1108 off_t bend = bp->bio_offset + bp->bio_length - 1; 1109 1110 if (bend < lstart) 1111 return (0); 1112 if (lend < bstart) 1113 return (0); 1114 return (1); 1115 } 1116 1117 static int 1118 g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) 1119 { 1120 struct g_raid_lock *lp; 1121 1122 sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); 1123 1124 LIST_FOREACH(lp, &vol->v_locks, l_next) { 1125 if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) 1126 return (1); 1127 } 1128 return (0); 1129 } 1130 1131 static void 1132 g_raid_start_request(struct bio *bp) 1133 { 1134 struct g_raid_softc *sc; 1135 struct g_raid_volume *vol; 1136 1137 sc = bp->bio_to->geom->softc; 1138 sx_assert(&sc->sc_lock, SX_LOCKED); 1139 vol = bp->bio_to->private; 1140 1141 /* 1142 * Check to see if this item is in a locked range. If so, 1143 * queue it to our locked queue and return. We'll requeue 1144 * it when the range is unlocked. Internal I/O for the 1145 * rebuild/rescan/recovery process is excluded from this 1146 * check so we can actually do the recovery. 1147 */ 1148 if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && 1149 g_raid_is_in_locked_range(vol, bp)) { 1150 G_RAID_LOGREQ(3, bp, "Defer request."); 1151 bioq_insert_tail(&vol->v_locked, bp); 1152 return; 1153 } 1154 1155 /* 1156 * If we're actually going to do the write/delete, then 1157 * update the idle stats for the volume. 1158 */ 1159 if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { 1160 if (!vol->v_dirty) 1161 g_raid_dirty(vol); 1162 vol->v_writes++; 1163 } 1164 1165 /* 1166 * Put request onto inflight queue, so we can check if new 1167 * synchronization requests don't collide with it. Then tell 1168 * the transformation layer to start the I/O. 
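 * g_raid_iodone() later removes the request from v_inflight again and
 * re-checks any deferred lock ranges against the remaining in-flight I/O.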
1169 */ 1170 bioq_insert_tail(&vol->v_inflight, bp); 1171 G_RAID_LOGREQ(4, bp, "Request started"); 1172 G_RAID_TR_IOSTART(vol->v_tr, bp); 1173 } 1174 1175 static void 1176 g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) 1177 { 1178 off_t off, len; 1179 struct bio *nbp; 1180 struct g_raid_lock *lp; 1181 1182 vol->v_pending_lock = 0; 1183 LIST_FOREACH(lp, &vol->v_locks, l_next) { 1184 if (lp->l_pending) { 1185 off = lp->l_offset; 1186 len = lp->l_length; 1187 lp->l_pending = 0; 1188 TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { 1189 if (g_raid_bio_overlaps(nbp, off, len)) 1190 lp->l_pending++; 1191 } 1192 if (lp->l_pending) { 1193 vol->v_pending_lock = 1; 1194 G_RAID_DEBUG1(4, vol->v_softc, 1195 "Deferred lock(%jd, %jd) has %d pending", 1196 (intmax_t)off, (intmax_t)(off + len), 1197 lp->l_pending); 1198 continue; 1199 } 1200 G_RAID_DEBUG1(4, vol->v_softc, 1201 "Deferred lock of %jd to %jd completed", 1202 (intmax_t)off, (intmax_t)(off + len)); 1203 G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); 1204 } 1205 } 1206 } 1207 1208 void 1209 g_raid_iodone(struct bio *bp, int error) 1210 { 1211 struct g_raid_softc *sc; 1212 struct g_raid_volume *vol; 1213 1214 sc = bp->bio_to->geom->softc; 1215 sx_assert(&sc->sc_lock, SX_LOCKED); 1216 vol = bp->bio_to->private; 1217 G_RAID_LOGREQ(3, bp, "Request done: %d.", error); 1218 1219 /* Update stats if we done write/delete. */ 1220 if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { 1221 vol->v_writes--; 1222 vol->v_last_write = time_uptime; 1223 } 1224 1225 bioq_remove(&vol->v_inflight, bp); 1226 if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) 1227 g_raid_finish_with_locked_ranges(vol, bp); 1228 getmicrouptime(&vol->v_last_done); 1229 g_io_deliver(bp, error); 1230 } 1231 1232 int 1233 g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, 1234 struct bio *ignore, void *argp) 1235 { 1236 struct g_raid_softc *sc; 1237 struct g_raid_lock *lp; 1238 struct bio *bp; 1239 1240 sc = vol->v_softc; 1241 lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); 1242 LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); 1243 lp->l_offset = off; 1244 lp->l_length = len; 1245 lp->l_callback_arg = argp; 1246 1247 lp->l_pending = 0; 1248 TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { 1249 if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) 1250 lp->l_pending++; 1251 } 1252 1253 /* 1254 * If there are any writes that are pending, we return EBUSY. All 1255 * callers will have to wait until all pending writes clear. 1256 */ 1257 if (lp->l_pending > 0) { 1258 vol->v_pending_lock = 1; 1259 G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", 1260 (intmax_t)off, (intmax_t)(off+len), lp->l_pending); 1261 return (EBUSY); 1262 } 1263 G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", 1264 (intmax_t)off, (intmax_t)(off+len)); 1265 G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); 1266 return (0); 1267 } 1268 1269 int 1270 g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) 1271 { 1272 struct g_raid_lock *lp; 1273 struct g_raid_softc *sc; 1274 struct bio *bp; 1275 1276 sc = vol->v_softc; 1277 LIST_FOREACH(lp, &vol->v_locks, l_next) { 1278 if (lp->l_offset == off && lp->l_length == len) { 1279 LIST_REMOVE(lp, l_next); 1280 /* XXX 1281 * Right now we just put them all back on the queue 1282 * and hope for the best. We hope this because any 1283 * locked ranges will go right back on this list 1284 * when the worker thread runs. 
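 * A request that still overlaps another locked range is simply
 * deferred again by g_raid_start_request() when it is re-processed.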
1285 * XXX 1286 */ 1287 G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", 1288 (intmax_t)lp->l_offset, 1289 (intmax_t)(lp->l_offset+lp->l_length)); 1290 mtx_lock(&sc->sc_queue_mtx); 1291 while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) 1292 bioq_disksort(&sc->sc_queue, bp); 1293 mtx_unlock(&sc->sc_queue_mtx); 1294 free(lp, M_RAID); 1295 return (0); 1296 } 1297 } 1298 return (EINVAL); 1299 } 1300 1301 void 1302 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) 1303 { 1304 struct g_consumer *cp; 1305 struct g_raid_disk *disk, *tdisk; 1306 1307 bp->bio_caller1 = sd; 1308 1309 /* 1310 * Make sure that the disk is present. Generally it is a task of 1311 * transformation layers to not send requests to absent disks, but 1312 * it is better to be safe and report situation then sorry. 1313 */ 1314 if (sd->sd_disk == NULL) { 1315 G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); 1316 nodisk: 1317 bp->bio_from = NULL; 1318 bp->bio_to = NULL; 1319 bp->bio_error = ENXIO; 1320 g_raid_disk_done(bp); 1321 return; 1322 } 1323 disk = sd->sd_disk; 1324 if (disk->d_state != G_RAID_DISK_S_ACTIVE && 1325 disk->d_state != G_RAID_DISK_S_FAILED) { 1326 G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " 1327 "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); 1328 goto nodisk; 1329 } 1330 1331 cp = disk->d_consumer; 1332 bp->bio_from = cp; 1333 bp->bio_to = cp->provider; 1334 cp->index++; 1335 1336 /* Update average disks load. */ 1337 TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { 1338 if (tdisk->d_consumer == NULL) 1339 tdisk->d_load = 0; 1340 else 1341 tdisk->d_load = (tdisk->d_consumer->index * 1342 G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; 1343 } 1344 1345 disk->d_last_offset = bp->bio_offset + bp->bio_length; 1346 if (dumping) { 1347 G_RAID_LOGREQ(3, bp, "Sending dumping request."); 1348 if (bp->bio_cmd == BIO_WRITE) { 1349 bp->bio_error = g_raid_subdisk_kerneldump(sd, 1350 bp->bio_data, 0, bp->bio_offset, bp->bio_length); 1351 } else 1352 bp->bio_error = EOPNOTSUPP; 1353 g_raid_disk_done(bp); 1354 } else { 1355 bp->bio_done = g_raid_disk_done; 1356 bp->bio_offset += sd->sd_offset; 1357 G_RAID_LOGREQ(3, bp, "Sending request."); 1358 g_io_request(bp, cp); 1359 } 1360 } 1361 1362 int 1363 g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, 1364 void *virtual, vm_offset_t physical, off_t offset, size_t length) 1365 { 1366 1367 if (sd->sd_disk == NULL) 1368 return (ENXIO); 1369 if (sd->sd_disk->d_kd.di.dumper == NULL) 1370 return (EOPNOTSUPP); 1371 return (dump_write(&sd->sd_disk->d_kd.di, 1372 virtual, physical, 1373 sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, 1374 length)); 1375 } 1376 1377 static void 1378 g_raid_disk_done(struct bio *bp) 1379 { 1380 struct g_raid_softc *sc; 1381 struct g_raid_subdisk *sd; 1382 1383 sd = bp->bio_caller1; 1384 sc = sd->sd_softc; 1385 mtx_lock(&sc->sc_queue_mtx); 1386 bioq_disksort(&sc->sc_queue, bp); 1387 mtx_unlock(&sc->sc_queue_mtx); 1388 if (!dumping) 1389 wakeup(sc); 1390 } 1391 1392 static void 1393 g_raid_disk_done_request(struct bio *bp) 1394 { 1395 struct g_raid_softc *sc; 1396 struct g_raid_disk *disk; 1397 struct g_raid_subdisk *sd; 1398 struct g_raid_volume *vol; 1399 1400 g_topology_assert_not(); 1401 1402 G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); 1403 sd = bp->bio_caller1; 1404 sc = sd->sd_softc; 1405 vol = sd->sd_volume; 1406 if (bp->bio_from != NULL) { 1407 bp->bio_from->index--; 1408 disk = bp->bio_from->private; 1409 if (disk == NULL) 1410 
g_raid_kill_consumer(sc, bp->bio_from); 1411 } 1412 bp->bio_offset -= sd->sd_offset; 1413 1414 G_RAID_TR_IODONE(vol->v_tr, sd, bp); 1415 } 1416 1417 static void 1418 g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) 1419 { 1420 1421 if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) 1422 ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); 1423 else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) 1424 ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); 1425 else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) 1426 ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); 1427 else 1428 ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); 1429 if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { 1430 KASSERT(ep->e_error == 0, 1431 ("Error cannot be handled.")); 1432 g_raid_event_free(ep); 1433 } else { 1434 ep->e_flags |= G_RAID_EVENT_DONE; 1435 G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); 1436 mtx_lock(&sc->sc_queue_mtx); 1437 wakeup(ep); 1438 mtx_unlock(&sc->sc_queue_mtx); 1439 } 1440 } 1441 1442 /* 1443 * Worker thread. 1444 */ 1445 static void 1446 g_raid_worker(void *arg) 1447 { 1448 struct g_raid_softc *sc; 1449 struct g_raid_event *ep; 1450 struct g_raid_volume *vol; 1451 struct bio *bp; 1452 struct timeval now, t; 1453 int timeout, rv; 1454 1455 sc = arg; 1456 thread_lock(curthread); 1457 sched_prio(curthread, PRIBIO); 1458 thread_unlock(curthread); 1459 1460 sx_xlock(&sc->sc_lock); 1461 for (;;) { 1462 mtx_lock(&sc->sc_queue_mtx); 1463 /* 1464 * First take a look at events. 1465 * This is important to handle events before any I/O requests. 1466 */ 1467 bp = NULL; 1468 vol = NULL; 1469 rv = 0; 1470 ep = TAILQ_FIRST(&sc->sc_events); 1471 if (ep != NULL) 1472 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 1473 else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) 1474 ; 1475 else { 1476 getmicrouptime(&now); 1477 t = now; 1478 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 1479 if (bioq_first(&vol->v_inflight) == NULL && 1480 vol->v_tr && 1481 timevalcmp(&vol->v_last_done, &t, < )) 1482 t = vol->v_last_done; 1483 } 1484 timevalsub(&t, &now); 1485 timeout = g_raid_idle_threshold + 1486 t.tv_sec * 1000000 + t.tv_usec; 1487 if (timeout > 0) { 1488 /* 1489 * Two steps to avoid overflows at HZ=1000 1490 * and idle timeouts > 2.1s. Some rounding 1491 * errors can occur, but they are < 1tick, 1492 * which is deemed to be close enough for 1493 * this purpose. 
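 * For example, at HZ=1000 one tick is 1000us, so the default idle
 * threshold of 1000000us rounds up to 1000 ticks.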
1494 */ 1495 int micpertic = 1000000 / hz; 1496 timeout = (timeout + micpertic - 1) / micpertic; 1497 sx_xunlock(&sc->sc_lock); 1498 MSLEEP(rv, sc, &sc->sc_queue_mtx, 1499 PRIBIO | PDROP, "-", timeout); 1500 sx_xlock(&sc->sc_lock); 1501 goto process; 1502 } else 1503 rv = EWOULDBLOCK; 1504 } 1505 mtx_unlock(&sc->sc_queue_mtx); 1506 process: 1507 if (ep != NULL) { 1508 g_raid_handle_event(sc, ep); 1509 } else if (bp != NULL) { 1510 if (bp->bio_to != NULL && 1511 bp->bio_to->geom == sc->sc_geom) 1512 g_raid_start_request(bp); 1513 else 1514 g_raid_disk_done_request(bp); 1515 } else if (rv == EWOULDBLOCK) { 1516 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 1517 if (vol->v_writes == 0 && vol->v_dirty) 1518 g_raid_clean(vol, -1); 1519 if (bioq_first(&vol->v_inflight) == NULL && 1520 vol->v_tr) { 1521 t.tv_sec = g_raid_idle_threshold / 1000000; 1522 t.tv_usec = g_raid_idle_threshold % 1000000; 1523 timevaladd(&t, &vol->v_last_done); 1524 getmicrouptime(&now); 1525 if (timevalcmp(&t, &now, <= )) { 1526 G_RAID_TR_IDLE(vol->v_tr); 1527 vol->v_last_done = now; 1528 } 1529 } 1530 } 1531 } 1532 if (sc->sc_stopping == G_RAID_DESTROY_HARD) 1533 g_raid_destroy_node(sc, 1); /* May not return. */ 1534 } 1535 } 1536 1537 static void 1538 g_raid_poll(struct g_raid_softc *sc) 1539 { 1540 struct g_raid_event *ep; 1541 struct bio *bp; 1542 1543 sx_xlock(&sc->sc_lock); 1544 mtx_lock(&sc->sc_queue_mtx); 1545 /* 1546 * First take a look at events. 1547 * This is important to handle events before any I/O requests. 1548 */ 1549 ep = TAILQ_FIRST(&sc->sc_events); 1550 if (ep != NULL) { 1551 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 1552 mtx_unlock(&sc->sc_queue_mtx); 1553 g_raid_handle_event(sc, ep); 1554 goto out; 1555 } 1556 bp = bioq_takefirst(&sc->sc_queue); 1557 if (bp != NULL) { 1558 mtx_unlock(&sc->sc_queue_mtx); 1559 if (bp->bio_from == NULL || 1560 bp->bio_from->geom != sc->sc_geom) 1561 g_raid_start_request(bp); 1562 else 1563 g_raid_disk_done_request(bp); 1564 } 1565 out: 1566 sx_xunlock(&sc->sc_lock); 1567 } 1568 1569 static void 1570 g_raid_launch_provider(struct g_raid_volume *vol) 1571 { 1572 struct g_raid_disk *disk; 1573 struct g_raid_softc *sc; 1574 struct g_provider *pp; 1575 char name[G_RAID_MAX_VOLUMENAME]; 1576 off_t off; 1577 1578 sc = vol->v_softc; 1579 sx_assert(&sc->sc_lock, SX_LOCKED); 1580 1581 g_topology_lock(); 1582 /* Try to name provider with volume name. */ 1583 snprintf(name, sizeof(name), "raid/%s", vol->v_name); 1584 if (g_raid_name_format == 0 || vol->v_name[0] == 0 || 1585 g_provider_by_name(name) != NULL) { 1586 /* Otherwise use sequential volume number. 
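 * ("raid/r%d", using the global volume ID allocated in
 * g_raid_create_volume()).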
*/ 1587 snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); 1588 } 1589 pp = g_new_providerf(sc->sc_geom, "%s", name); 1590 pp->private = vol; 1591 pp->mediasize = vol->v_mediasize; 1592 pp->sectorsize = vol->v_sectorsize; 1593 pp->stripesize = 0; 1594 pp->stripeoffset = 0; 1595 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || 1596 vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || 1597 vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || 1598 vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { 1599 if ((disk = vol->v_subdisks[0].sd_disk) != NULL && 1600 disk->d_consumer != NULL && 1601 disk->d_consumer->provider != NULL) { 1602 pp->stripesize = disk->d_consumer->provider->stripesize; 1603 off = disk->d_consumer->provider->stripeoffset; 1604 pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; 1605 if (off > 0) 1606 pp->stripeoffset %= off; 1607 } 1608 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { 1609 pp->stripesize *= (vol->v_disks_count - 1); 1610 pp->stripeoffset *= (vol->v_disks_count - 1); 1611 } 1612 } else 1613 pp->stripesize = vol->v_strip_size; 1614 vol->v_provider = pp; 1615 g_error_provider(pp, 0); 1616 g_topology_unlock(); 1617 G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", 1618 pp->name, vol->v_name); 1619 } 1620 1621 static void 1622 g_raid_destroy_provider(struct g_raid_volume *vol) 1623 { 1624 struct g_raid_softc *sc; 1625 struct g_provider *pp; 1626 struct bio *bp, *tmp; 1627 1628 g_topology_assert_not(); 1629 sc = vol->v_softc; 1630 pp = vol->v_provider; 1631 KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); 1632 1633 g_topology_lock(); 1634 g_error_provider(pp, ENXIO); 1635 mtx_lock(&sc->sc_queue_mtx); 1636 TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { 1637 if (bp->bio_to != pp) 1638 continue; 1639 bioq_remove(&sc->sc_queue, bp); 1640 g_io_deliver(bp, ENXIO); 1641 } 1642 mtx_unlock(&sc->sc_queue_mtx); 1643 G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", 1644 pp->name, vol->v_name); 1645 g_wither_provider(pp, ENXIO); 1646 g_topology_unlock(); 1647 vol->v_provider = NULL; 1648 } 1649 1650 /* 1651 * Update device state. 1652 */ 1653 static int 1654 g_raid_update_volume(struct g_raid_volume *vol, u_int event) 1655 { 1656 struct g_raid_softc *sc; 1657 1658 sc = vol->v_softc; 1659 sx_assert(&sc->sc_lock, SX_XLOCKED); 1660 1661 G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", 1662 g_raid_volume_event2str(event), 1663 vol->v_name); 1664 switch (event) { 1665 case G_RAID_VOLUME_E_DOWN: 1666 if (vol->v_provider != NULL) 1667 g_raid_destroy_provider(vol); 1668 break; 1669 case G_RAID_VOLUME_E_UP: 1670 if (vol->v_provider == NULL) 1671 g_raid_launch_provider(vol); 1672 break; 1673 case G_RAID_VOLUME_E_START: 1674 if (vol->v_tr) 1675 G_RAID_TR_START(vol->v_tr); 1676 return (0); 1677 default: 1678 if (sc->sc_md) 1679 G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); 1680 return (0); 1681 } 1682 1683 /* Manage root mount release. */ 1684 if (vol->v_starting) { 1685 vol->v_starting = 0; 1686 G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); 1687 root_mount_rel(vol->v_rootmount); 1688 vol->v_rootmount = NULL; 1689 } 1690 if (vol->v_stopping && vol->v_provider_open == 0) 1691 g_raid_destroy_volume(vol); 1692 return (0); 1693 } 1694 1695 /* 1696 * Update subdisk state. 
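 * Subdisk events are forwarded to the volume's transformation module.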
1697 */ 1698 static int 1699 g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) 1700 { 1701 struct g_raid_softc *sc; 1702 struct g_raid_volume *vol; 1703 1704 sc = sd->sd_softc; 1705 vol = sd->sd_volume; 1706 sx_assert(&sc->sc_lock, SX_XLOCKED); 1707 1708 G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", 1709 g_raid_subdisk_event2str(event), 1710 vol->v_name, sd->sd_pos, 1711 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 1712 if (vol->v_tr) 1713 G_RAID_TR_EVENT(vol->v_tr, sd, event); 1714 1715 return (0); 1716 } 1717 1718 /* 1719 * Update disk state. 1720 */ 1721 static int 1722 g_raid_update_disk(struct g_raid_disk *disk, u_int event) 1723 { 1724 struct g_raid_softc *sc; 1725 1726 sc = disk->d_softc; 1727 sx_assert(&sc->sc_lock, SX_XLOCKED); 1728 1729 G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", 1730 g_raid_disk_event2str(event), 1731 g_raid_get_diskname(disk)); 1732 1733 if (sc->sc_md) 1734 G_RAID_MD_EVENT(sc->sc_md, disk, event); 1735 return (0); 1736 } 1737 1738 /* 1739 * Node event. 1740 */ 1741 static int 1742 g_raid_update_node(struct g_raid_softc *sc, u_int event) 1743 { 1744 sx_assert(&sc->sc_lock, SX_XLOCKED); 1745 1746 G_RAID_DEBUG1(2, sc, "Event %s for the array.", 1747 g_raid_node_event2str(event)); 1748 1749 if (event == G_RAID_NODE_E_WAKE) 1750 return (0); 1751 if (sc->sc_md) 1752 G_RAID_MD_EVENT(sc->sc_md, NULL, event); 1753 return (0); 1754 } 1755 1756 static int 1757 g_raid_access(struct g_provider *pp, int acr, int acw, int ace) 1758 { 1759 struct g_raid_volume *vol; 1760 struct g_raid_softc *sc; 1761 int dcw, opens, error = 0; 1762 1763 g_topology_assert(); 1764 sc = pp->geom->softc; 1765 vol = pp->private; 1766 KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); 1767 KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); 1768 1769 G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, 1770 acr, acw, ace); 1771 dcw = pp->acw + acw; 1772 1773 g_topology_unlock(); 1774 sx_xlock(&sc->sc_lock); 1775 /* Deny new opens while dying. */ 1776 if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { 1777 error = ENXIO; 1778 goto out; 1779 } 1780 if (dcw == 0 && vol->v_dirty) 1781 g_raid_clean(vol, dcw); 1782 vol->v_provider_open += acr + acw + ace; 1783 /* Handle delayed node destruction. */ 1784 if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && 1785 vol->v_provider_open == 0) { 1786 /* Count open volumes. */ 1787 opens = g_raid_nopens(sc); 1788 if (opens == 0) { 1789 sc->sc_stopping = G_RAID_DESTROY_HARD; 1790 /* Wake up worker to make it selfdestruct. */ 1791 g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); 1792 } 1793 } 1794 /* Handle open volume destruction. 
*/ 1795 if (vol->v_stopping && vol->v_provider_open == 0) 1796 g_raid_destroy_volume(vol); 1797 out: 1798 sx_xunlock(&sc->sc_lock); 1799 g_topology_lock(); 1800 return (error); 1801 } 1802 1803 struct g_raid_softc * 1804 g_raid_create_node(struct g_class *mp, 1805 const char *name, struct g_raid_md_object *md) 1806 { 1807 struct g_raid_softc *sc; 1808 struct g_geom *gp; 1809 int error; 1810 1811 g_topology_assert(); 1812 G_RAID_DEBUG(1, "Creating array %s.", name); 1813 1814 gp = g_new_geomf(mp, "%s", name); 1815 sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); 1816 gp->start = g_raid_start; 1817 gp->orphan = g_raid_orphan; 1818 gp->access = g_raid_access; 1819 gp->dumpconf = g_raid_dumpconf; 1820 1821 sc->sc_md = md; 1822 sc->sc_geom = gp; 1823 sc->sc_flags = 0; 1824 TAILQ_INIT(&sc->sc_volumes); 1825 TAILQ_INIT(&sc->sc_disks); 1826 sx_init(&sc->sc_lock, "gmirror:lock"); 1827 mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); 1828 TAILQ_INIT(&sc->sc_events); 1829 bioq_init(&sc->sc_queue); 1830 gp->softc = sc; 1831 error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, 1832 "g_raid %s", name); 1833 if (error != 0) { 1834 G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name); 1835 mtx_destroy(&sc->sc_queue_mtx); 1836 sx_destroy(&sc->sc_lock); 1837 g_destroy_geom(sc->sc_geom); 1838 free(sc, M_RAID); 1839 return (NULL); 1840 } 1841 1842 G_RAID_DEBUG1(0, sc, "Array %s created.", name); 1843 return (sc); 1844 } 1845 1846 struct g_raid_volume * 1847 g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id) 1848 { 1849 struct g_raid_volume *vol, *vol1; 1850 int i; 1851 1852 G_RAID_DEBUG1(1, sc, "Creating volume %s.", name); 1853 vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); 1854 vol->v_softc = sc; 1855 strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); 1856 vol->v_state = G_RAID_VOLUME_S_STARTING; 1857 vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; 1858 vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN; 1859 bioq_init(&vol->v_inflight); 1860 bioq_init(&vol->v_locked); 1861 LIST_INIT(&vol->v_locks); 1862 for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { 1863 vol->v_subdisks[i].sd_softc = sc; 1864 vol->v_subdisks[i].sd_volume = vol; 1865 vol->v_subdisks[i].sd_pos = i; 1866 vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; 1867 } 1868 1869 /* Find free ID for this volume. */ 1870 g_topology_lock(); 1871 vol1 = vol; 1872 if (id >= 0) { 1873 LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { 1874 if (vol1->v_global_id == id) 1875 break; 1876 } 1877 } 1878 if (vol1 != NULL) { 1879 for (id = 0; ; id++) { 1880 LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { 1881 if (vol1->v_global_id == id) 1882 break; 1883 } 1884 if (vol1 == NULL) 1885 break; 1886 } 1887 } 1888 vol->v_global_id = id; 1889 LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); 1890 g_topology_unlock(); 1891 1892 /* Delay root mounting. 
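 * The hold is released in g_raid_update_volume() on the first UP/DOWN
 * event, or in g_raid_destroy_volume() if the volume never gets there.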
*/ 1893 vol->v_rootmount = root_mount_hold("GRAID"); 1894 G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); 1895 vol->v_starting = 1; 1896 TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); 1897 return (vol); 1898 } 1899 1900 struct g_raid_disk * 1901 g_raid_create_disk(struct g_raid_softc *sc) 1902 { 1903 struct g_raid_disk *disk; 1904 1905 G_RAID_DEBUG1(1, sc, "Creating disk."); 1906 disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); 1907 disk->d_softc = sc; 1908 disk->d_state = G_RAID_DISK_S_NONE; 1909 TAILQ_INIT(&disk->d_subdisks); 1910 TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); 1911 return (disk); 1912 } 1913 1914 int g_raid_start_volume(struct g_raid_volume *vol) 1915 { 1916 struct g_raid_tr_class *class; 1917 struct g_raid_tr_object *obj; 1918 int status; 1919 1920 G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); 1921 LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { 1922 G_RAID_DEBUG1(2, vol->v_softc, 1923 "Tasting volume %s for %s transformation.", 1924 vol->v_name, class->name); 1925 obj = (void *)kobj_create((kobj_class_t)class, M_RAID, 1926 M_WAITOK); 1927 obj->tro_class = class; 1928 obj->tro_volume = vol; 1929 status = G_RAID_TR_TASTE(obj, vol); 1930 if (status != G_RAID_TR_TASTE_FAIL) 1931 break; 1932 kobj_delete((kobj_t)obj, M_RAID); 1933 } 1934 if (class == NULL) { 1935 G_RAID_DEBUG1(0, vol->v_softc, 1936 "No transformation module found for %s.", 1937 vol->v_name); 1938 vol->v_tr = NULL; 1939 g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); 1940 g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, 1941 G_RAID_EVENT_VOLUME); 1942 return (-1); 1943 } 1944 G_RAID_DEBUG1(2, vol->v_softc, 1945 "Transformation module %s chosen for %s.", 1946 class->name, vol->v_name); 1947 vol->v_tr = obj; 1948 return (0); 1949 } 1950 1951 int 1952 g_raid_destroy_node(struct g_raid_softc *sc, int worker) 1953 { 1954 struct g_raid_volume *vol, *tmpv; 1955 struct g_raid_disk *disk, *tmpd; 1956 int error = 0; 1957 1958 sc->sc_stopping = G_RAID_DESTROY_HARD; 1959 TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { 1960 if (g_raid_destroy_volume(vol)) 1961 error = EBUSY; 1962 } 1963 if (error) 1964 return (error); 1965 TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { 1966 if (g_raid_destroy_disk(disk)) 1967 error = EBUSY; 1968 } 1969 if (error) 1970 return (error); 1971 if (sc->sc_md) { 1972 G_RAID_MD_FREE(sc->sc_md); 1973 kobj_delete((kobj_t)sc->sc_md, M_RAID); 1974 sc->sc_md = NULL; 1975 } 1976 if (sc->sc_geom != NULL) { 1977 G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); 1978 g_topology_lock(); 1979 sc->sc_geom->softc = NULL; 1980 g_wither_geom(sc->sc_geom, ENXIO); 1981 g_topology_unlock(); 1982 sc->sc_geom = NULL; 1983 } else 1984 G_RAID_DEBUG(1, "Array destroyed."); 1985 if (worker) { 1986 g_raid_event_cancel(sc, sc); 1987 mtx_destroy(&sc->sc_queue_mtx); 1988 sx_xunlock(&sc->sc_lock); 1989 sx_destroy(&sc->sc_lock); 1990 wakeup(&sc->sc_stopping); 1991 free(sc, M_RAID); 1992 curthread->td_pflags &= ~TDP_GEOM; 1993 G_RAID_DEBUG(1, "Thread exiting."); 1994 kproc_exit(0); 1995 } else { 1996 /* Wake up worker to make it selfdestruct. 
int
g_raid_start_volume(struct g_raid_volume *vol)
{
	struct g_raid_tr_class *class;
	struct g_raid_tr_object *obj;
	int status;

	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
		G_RAID_DEBUG1(2, vol->v_softc,
		    "Tasting volume %s for %s transformation.",
		    vol->v_name, class->name);
		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
		    M_WAITOK);
		obj->tro_class = class;
		obj->tro_volume = vol;
		status = G_RAID_TR_TASTE(obj, vol);
		if (status != G_RAID_TR_TASTE_FAIL)
			break;
		kobj_delete((kobj_t)obj, M_RAID);
	}
	if (class == NULL) {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "No transformation module found for %s.",
		    vol->v_name);
		vol->v_tr = NULL;
		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		return (-1);
	}
	G_RAID_DEBUG1(2, vol->v_softc,
	    "Transformation module %s chosen for %s.",
	    class->name, vol->v_name);
	vol->v_tr = obj;
	return (0);
}

int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
	struct g_raid_volume *vol, *tmpv;
	struct g_raid_disk *disk, *tmpd;
	int error = 0;

	sc->sc_stopping = G_RAID_DESTROY_HARD;
	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
		if (g_raid_destroy_volume(vol))
			error = EBUSY;
	}
	if (error)
		return (error);
	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
		if (g_raid_destroy_disk(disk))
			error = EBUSY;
	}
	if (error)
		return (error);
	if (sc->sc_md) {
		G_RAID_MD_FREE(sc->sc_md);
		kobj_delete((kobj_t)sc->sc_md, M_RAID);
		sc->sc_md = NULL;
	}
	if (sc->sc_geom != NULL) {
		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
		g_topology_lock();
		sc->sc_geom->softc = NULL;
		g_wither_geom(sc->sc_geom, ENXIO);
		g_topology_unlock();
		sc->sc_geom = NULL;
	} else
		G_RAID_DEBUG(1, "Array destroyed.");
	if (worker) {
		g_raid_event_cancel(sc, sc);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_xunlock(&sc->sc_lock);
		sx_destroy(&sc->sc_lock);
		wakeup(&sc->sc_stopping);
		free(sc, M_RAID);
		curthread->td_pflags &= ~TDP_GEOM;
		G_RAID_DEBUG(1, "Thread exiting.");
		kproc_exit(0);
	} else {
		/* Wake up worker to make it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}

int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	int i;

	sc = vol->v_softc;
	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
	vol->v_stopping = 1;
	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
		if (vol->v_tr) {
			G_RAID_TR_STOP(vol->v_tr);
			return (EBUSY);
		} else
			vol->v_state = G_RAID_VOLUME_S_STOPPED;
	}
	if (g_raid_event_check(sc, vol) != 0)
		return (EBUSY);
	if (vol->v_provider != NULL)
		return (EBUSY);
	if (vol->v_provider_open != 0)
		return (EBUSY);
	if (vol->v_tr) {
		G_RAID_TR_FREE(vol->v_tr);
		kobj_delete((kobj_t)vol->v_tr, M_RAID);
		vol->v_tr = NULL;
	}
	if (vol->v_rootmount)
		root_mount_rel(vol->v_rootmount);
	g_topology_lock();
	LIST_REMOVE(vol, v_global_next);
	g_topology_unlock();
	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
		disk = vol->v_subdisks[i].sd_disk;
		if (disk == NULL)
			continue;
		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
	}
	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
	if (sc->sc_md)
		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
	g_raid_event_cancel(sc, vol);
	free(vol, M_RAID);
	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
		/* Wake up worker to let it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}

int
g_raid_destroy_disk(struct g_raid_disk *disk)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *tmp;

	sc = disk->d_softc;
	G_RAID_DEBUG1(2, sc, "Destroying disk.");
	if (disk->d_consumer) {
		g_raid_kill_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
	}
	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
		    G_RAID_EVENT_SUBDISK);
		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
		sd->sd_disk = NULL;
	}
	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
	if (sc->sc_md)
		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
	g_raid_event_cancel(sc, disk);
	free(disk, M_RAID);
	return (0);
}

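/*
 * Destroy the whole node.  If volumes are open, SOFT returns EBUSY,
 * DELAYED defers destruction to the last close, and HARD proceeds
 * regardless; when destruction goes ahead, the caller sleeps until the
 * worker thread completes the teardown.
 */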
int
g_raid_destroy(struct g_raid_softc *sc, int how)
{
	int opens;

	g_topology_assert_not();
	if (sc == NULL)
		return (ENXIO);
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	/* Count open volumes. */
	opens = g_raid_nopens(sc);

	/* React to open volumes. */
	if (opens > 0) {
		switch (how) {
		case G_RAID_DESTROY_SOFT:
			G_RAID_DEBUG1(1, sc,
			    "%d volumes are still open.",
			    opens);
			return (EBUSY);
		case G_RAID_DESTROY_DELAYED:
			G_RAID_DEBUG1(1, sc,
			    "Array will be destroyed on last close.");
			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
			return (EBUSY);
		case G_RAID_DESTROY_HARD:
			G_RAID_DEBUG1(1, sc,
			    "%d volumes are still open.",
			    opens);
		}
	}

	/* Mark node for destruction. */
	sc->sc_stopping = G_RAID_DESTROY_HARD;
	/* Wake up worker to let it selfdestruct. */
	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	/* Sleep until node destroyed. */
	sx_sleep(&sc->sc_stopping, &sc->sc_lock,
	    PRIBIO | PDROP, "r:destroy", 0);
	return (0);
}

static void
g_raid_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

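/*
 * GEOM taste method: offer the provider to each registered metadata class
 * through a temporary consumer until one of them claims it.
 */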
static struct g_geom *
g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_consumer *cp;
	struct g_geom *gp, *geom;
	struct g_raid_md_class *class;
	struct g_raid_md_object *obj;
	int status;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);

	gp = g_new_geomf(mp, "raid:taste");
	/*
	 * This orphan function should never be called.
	 */
	gp->orphan = g_raid_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);

	geom = NULL;
	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
		    pp->name, class->name);
		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
		    M_WAITOK);
		obj->mdo_class = class;
		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
		if (status != G_RAID_MD_TASTE_NEW)
			kobj_delete((kobj_t)obj, M_RAID);
		if (status != G_RAID_MD_TASTE_FAIL)
			break;
	}

	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
	return (geom);
}

int
g_raid_create_node_format(const char *format, struct g_geom **gp)
{
	struct g_raid_md_class *class;
	struct g_raid_md_object *obj;
	int status;

	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
		if (strcasecmp(class->name, format) == 0)
			break;
	}
	if (class == NULL) {
		G_RAID_DEBUG(1, "No support for %s metadata.", format);
		return (G_RAID_MD_TASTE_FAIL);
	}
	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
	    M_WAITOK);
	obj->mdo_class = class;
	status = G_RAID_MD_CREATE(obj, &g_raid_class, gp);
	if (status != G_RAID_MD_TASTE_NEW)
		kobj_delete((kobj_t)obj, M_RAID);
	return (status);
}

static int
g_raid_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{
	struct g_raid_softc *sc;
	int error;

	g_topology_unlock();
	sc = gp->softc;
	sx_xlock(&sc->sc_lock);
	g_cancel_event(sc);
	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
	if (error != 0)
		sx_xunlock(&sc->sc_lock);
	g_topology_lock();
	return (error);
}

void
g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{

	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
		return;
	if (sc->sc_md)
		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
}

void
g_raid_fail_disk(struct g_raid_softc *sc,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
{

	if (disk == NULL)
		disk = sd->sd_disk;
	if (disk == NULL) {
		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
		return;
	}
	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
		return;
	}
	if (sc->sc_md)
		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
}

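/*
 * GEOM dumpconf method: report volume (provider), disk (consumer), or node
 * (geom) state as XML elements for the kern.geom.confxml configuration dump.
 */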
"%s<Metadata>%s</Metadata>\n", indent, 2351 sc->sc_md->mdo_class->name); 2352 } 2353 if (!TAILQ_EMPTY(&sc->sc_volumes)) { 2354 s = 0xff; 2355 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 2356 if (vol->v_state < s) 2357 s = vol->v_state; 2358 } 2359 sbuf_printf(sb, "%s<State>%s</State>\n", indent, 2360 g_raid_volume_state2str(s)); 2361 } 2362 sx_xunlock(&sc->sc_lock); 2363 g_topology_lock(); 2364 } 2365 } 2366 2367 static void 2368 g_raid_shutdown_pre_sync(void *arg, int howto) 2369 { 2370 struct g_class *mp; 2371 struct g_geom *gp, *gp2; 2372 struct g_raid_softc *sc; 2373 int error; 2374 2375 mp = arg; 2376 DROP_GIANT(); 2377 g_topology_lock(); 2378 LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { 2379 if ((sc = gp->softc) == NULL) 2380 continue; 2381 g_topology_unlock(); 2382 sx_xlock(&sc->sc_lock); 2383 g_cancel_event(sc); 2384 error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); 2385 if (error != 0) 2386 sx_xunlock(&sc->sc_lock); 2387 g_topology_lock(); 2388 } 2389 g_topology_unlock(); 2390 PICKUP_GIANT(); 2391 } 2392 2393 static void 2394 g_raid_init(struct g_class *mp) 2395 { 2396 2397 g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, 2398 g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); 2399 if (g_raid_pre_sync == NULL) 2400 G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); 2401 g_raid_started = 1; 2402 } 2403 2404 static void 2405 g_raid_fini(struct g_class *mp) 2406 { 2407 2408 if (g_raid_pre_sync != NULL) 2409 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync); 2410 g_raid_started = 0; 2411 } 2412 2413 int 2414 g_raid_md_modevent(module_t mod, int type, void *arg) 2415 { 2416 struct g_raid_md_class *class, *c, *nc; 2417 int error; 2418 2419 error = 0; 2420 class = arg; 2421 switch (type) { 2422 case MOD_LOAD: 2423 c = LIST_FIRST(&g_raid_md_classes); 2424 if (c == NULL || c->mdc_priority > class->mdc_priority) 2425 LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); 2426 else { 2427 while ((nc = LIST_NEXT(c, mdc_list)) != NULL && 2428 nc->mdc_priority < class->mdc_priority) 2429 c = nc; 2430 LIST_INSERT_AFTER(c, class, mdc_list); 2431 } 2432 if (g_raid_started) 2433 g_retaste(&g_raid_class); 2434 break; 2435 case MOD_UNLOAD: 2436 LIST_REMOVE(class, mdc_list); 2437 break; 2438 default: 2439 error = EOPNOTSUPP; 2440 break; 2441 } 2442 2443 return (error); 2444 } 2445 2446 int 2447 g_raid_tr_modevent(module_t mod, int type, void *arg) 2448 { 2449 struct g_raid_tr_class *class, *c, *nc; 2450 int error; 2451 2452 error = 0; 2453 class = arg; 2454 switch (type) { 2455 case MOD_LOAD: 2456 c = LIST_FIRST(&g_raid_tr_classes); 2457 if (c == NULL || c->trc_priority > class->trc_priority) 2458 LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); 2459 else { 2460 while ((nc = LIST_NEXT(c, trc_list)) != NULL && 2461 nc->trc_priority < class->trc_priority) 2462 c = nc; 2463 LIST_INSERT_AFTER(c, class, trc_list); 2464 } 2465 break; 2466 case MOD_UNLOAD: 2467 LIST_REMOVE(class, trc_list); 2468 break; 2469 default: 2470 error = EOPNOTSUPP; 2471 break; 2472 } 2473 2474 return (error); 2475 } 2476 2477 /* 2478 * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) 2479 * to reduce module priority, allowing submodules to register them first. 2480 */ 2481 static moduledata_t g_raid_mod = { 2482 "g_raid", 2483 g_modevent, 2484 &g_raid_class 2485 }; 2486 DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); 2487 MODULE_VERSION(geom_raid, 0); 2488