1 /*- 2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD$ 27 */ 28 29 #ifndef _G_RAID_H_ 30 #define _G_RAID_H_ 31 32 #include <sys/param.h> 33 #include <sys/kobj.h> 34 #include <sys/bio.h> 35 #include <sys/time.h> 36 37 #define G_RAID_CLASS_NAME "RAID" 38 39 #define G_RAID_MAGIC "GEOM::RAID" 40 41 #define G_RAID_VERSION 0 42 43 struct g_raid_md_object; 44 struct g_raid_tr_object; 45 46 #define G_RAID_DEVICE_FLAG_NOAUTOSYNC 0x0000000000000001ULL 47 #define G_RAID_DEVICE_FLAG_NOFAILSYNC 0x0000000000000002ULL 48 #define G_RAID_DEVICE_FLAG_MASK (G_RAID_DEVICE_FLAG_NOAUTOSYNC | \ 49 G_RAID_DEVICE_FLAG_NOFAILSYNC) 50 51 #ifdef _KERNEL 52 extern u_int g_raid_aggressive_spare; 53 extern u_int g_raid_debug; 54 extern int g_raid_read_err_thresh; 55 extern u_int g_raid_start_timeout; 56 extern struct g_class g_raid_class; 57 58 #define G_RAID_DEBUG(lvl, fmt, ...) do { \ 59 if (g_raid_debug >= (lvl)) { \ 60 if (g_raid_debug > 0) { \ 61 printf("GEOM_RAID[%u]: " fmt "\n", \ 62 lvl, ## __VA_ARGS__); \ 63 } else { \ 64 printf("GEOM_RAID: " fmt "\n", \ 65 ## __VA_ARGS__); \ 66 } \ 67 } \ 68 } while (0) 69 #define G_RAID_DEBUG1(lvl, sc, fmt, ...) do { \ 70 if (g_raid_debug >= (lvl)) { \ 71 if (g_raid_debug > 0) { \ 72 printf("GEOM_RAID[%u]: %s: " fmt "\n", \ 73 lvl, (sc)->sc_name, ## __VA_ARGS__); \ 74 } else { \ 75 printf("GEOM_RAID: %s: " fmt "\n", \ 76 (sc)->sc_name, ## __VA_ARGS__); \ 77 } \ 78 } \ 79 } while (0) 80 #define G_RAID_LOGREQ(lvl, bp, fmt, ...) do { \ 81 if (g_raid_debug >= (lvl)) { \ 82 if (g_raid_debug > 0) { \ 83 printf("GEOM_RAID[%u]: " fmt " ", \ 84 lvl, ## __VA_ARGS__); \ 85 } else \ 86 printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__); \ 87 g_print_bio(bp); \ 88 printf("\n"); \ 89 } \ 90 } while (0) 91 92 /* 93 * Flags we use to distinguish I/O initiated by the TR layer to maintain 94 * the volume's characteristics, fix subdisks, extra copies of data, etc. 95 * 96 * G_RAID_BIO_FLAG_SYNC I/O to update an extra copy of the data 97 * for RAID volumes that maintain extra data 98 * and need to rebuild that data. 99 * G_RAID_BIO_FLAG_REMAP I/O done to try to provoke a subdisk into 100 * doing some desirable action such as bad 101 * block remapping after we detect a bad part 102 * of the disk. 103 * G_RAID_BIO_FLAG_LOCKED I/O holds range lock that should re released. 104 * 105 * and the following meta item: 106 * G_RAID_BIO_FLAG_SPECIAL And of the I/O flags that need to make it 107 * through the range locking which would 108 * otherwise defer the I/O until after that 109 * range is unlocked. 110 */ 111 #define G_RAID_BIO_FLAG_SYNC 0x01 112 #define G_RAID_BIO_FLAG_REMAP 0x02 113 #define G_RAID_BIO_FLAG_SPECIAL \ 114 (G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP) 115 #define G_RAID_BIO_FLAG_LOCKED 0x80 116 117 struct g_raid_lock { 118 off_t l_offset; 119 off_t l_length; 120 void *l_callback_arg; 121 int l_pending; 122 LIST_ENTRY(g_raid_lock) l_next; 123 }; 124 125 #define G_RAID_EVENT_WAIT 0x01 126 #define G_RAID_EVENT_VOLUME 0x02 127 #define G_RAID_EVENT_SUBDISK 0x04 128 #define G_RAID_EVENT_DISK 0x08 129 #define G_RAID_EVENT_DONE 0x10 130 struct g_raid_event { 131 void *e_tgt; 132 int e_event; 133 int e_flags; 134 int e_error; 135 TAILQ_ENTRY(g_raid_event) e_next; 136 }; 137 #define G_RAID_DISK_S_NONE 0x00 /* State is unknown. */ 138 #define G_RAID_DISK_S_OFFLINE 0x01 /* Missing disk placeholder. */ 139 #define G_RAID_DISK_S_FAILED 0x02 /* Failed. */ 140 #define G_RAID_DISK_S_STALE_FAILED 0x03 /* Old failed. */ 141 #define G_RAID_DISK_S_SPARE 0x04 /* Hot-spare. */ 142 #define G_RAID_DISK_S_STALE 0x05 /* Old disk, unused now. */ 143 #define G_RAID_DISK_S_ACTIVE 0x06 /* Operational. */ 144 145 #define G_RAID_DISK_E_DISCONNECTED 0x01 146 147 struct g_raid_disk { 148 struct g_raid_softc *d_softc; /* Back-pointer to softc. */ 149 struct g_consumer *d_consumer; /* GEOM disk consumer. */ 150 void *d_md_data; /* Disk's metadata storage. */ 151 struct g_kerneldump d_kd; /* Kernel dumping method/args. */ 152 uint64_t d_flags; /* Additional flags. */ 153 u_int d_state; /* Disk state. */ 154 u_int d_load; /* Disk average load. */ 155 off_t d_last_offset; /* Last head offset. */ 156 int d_read_errs; /* Count of the read errors */ 157 TAILQ_HEAD(, g_raid_subdisk) d_subdisks; /* List of subdisks. */ 158 TAILQ_ENTRY(g_raid_disk) d_next; /* Next disk in the node. */ 159 }; 160 161 #define G_RAID_SUBDISK_S_NONE 0x00 /* Absent. */ 162 #define G_RAID_SUBDISK_S_FAILED 0x01 /* Failed. */ 163 #define G_RAID_SUBDISK_S_NEW 0x02 /* Blank. */ 164 #define G_RAID_SUBDISK_S_REBUILD 0x03 /* Blank + rebuild. */ 165 #define G_RAID_SUBDISK_S_UNINITIALIZED 0x04 /* Disk of the new volume. */ 166 #define G_RAID_SUBDISK_S_STALE 0x05 /* Dirty. */ 167 #define G_RAID_SUBDISK_S_RESYNC 0x06 /* Dirty + check/repair. */ 168 #define G_RAID_SUBDISK_S_ACTIVE 0x07 /* Usable. */ 169 170 #define G_RAID_SUBDISK_E_NEW 0x01 /* A new subdisk has arrived */ 171 #define G_RAID_SUBDISK_E_FAILED 0x02 /* A subdisk failed, but remains in volume */ 172 #define G_RAID_SUBDISK_E_DISCONNECTED 0x03 /* A subdisk removed from volume. */ 173 #define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80 /* translation private events */ 174 175 #define G_RAID_SUBDISK_POS(sd) \ 176 ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0) 177 #define G_RAID_SUBDISK_TRACK_SIZE (1 * 1024 * 1024) 178 #define G_RAID_SUBDISK_LOAD(sd) \ 179 ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0) 180 #define G_RAID_SUBDISK_LOAD_SCALE 256 181 182 struct g_raid_subdisk { 183 struct g_raid_softc *sd_softc; /* Back-pointer to softc. */ 184 struct g_raid_disk *sd_disk; /* Where this subdisk lives. */ 185 struct g_raid_volume *sd_volume; /* Volume, sd is a part of. */ 186 off_t sd_offset; /* Offset on the disk. */ 187 off_t sd_size; /* Size on the disk. */ 188 u_int sd_pos; /* Position in volume. */ 189 u_int sd_state; /* Subdisk state. */ 190 off_t sd_rebuild_pos; /* Rebuild position. */ 191 int sd_recovery; /* Count of recovery reqs. */ 192 TAILQ_ENTRY(g_raid_subdisk) sd_next; /* Next subdisk on disk. */ 193 }; 194 195 #define G_RAID_MAX_SUBDISKS 16 196 #define G_RAID_MAX_VOLUMENAME 32 197 198 #define G_RAID_VOLUME_S_STARTING 0x00 199 #define G_RAID_VOLUME_S_BROKEN 0x01 200 #define G_RAID_VOLUME_S_DEGRADED 0x02 201 #define G_RAID_VOLUME_S_SUBOPTIMAL 0x03 202 #define G_RAID_VOLUME_S_OPTIMAL 0x04 203 #define G_RAID_VOLUME_S_UNSUPPORTED 0x05 204 #define G_RAID_VOLUME_S_STOPPED 0x06 205 206 #define G_RAID_VOLUME_S_ALIVE(s) \ 207 ((s) == G_RAID_VOLUME_S_DEGRADED || \ 208 (s) == G_RAID_VOLUME_S_SUBOPTIMAL || \ 209 (s) == G_RAID_VOLUME_S_OPTIMAL) 210 211 #define G_RAID_VOLUME_E_DOWN 0x00 212 #define G_RAID_VOLUME_E_UP 0x01 213 #define G_RAID_VOLUME_E_START 0x10 214 #define G_RAID_VOLUME_E_STARTMD 0x11 215 216 #define G_RAID_VOLUME_RL_RAID0 0x00 217 #define G_RAID_VOLUME_RL_RAID1 0x01 218 #define G_RAID_VOLUME_RL_RAID3 0x03 219 #define G_RAID_VOLUME_RL_RAID4 0x04 220 #define G_RAID_VOLUME_RL_RAID5 0x05 221 #define G_RAID_VOLUME_RL_RAID6 0x06 222 #define G_RAID_VOLUME_RL_RAIDMDF 0x07 223 #define G_RAID_VOLUME_RL_RAID1E 0x11 224 #define G_RAID_VOLUME_RL_SINGLE 0x0f 225 #define G_RAID_VOLUME_RL_CONCAT 0x1f 226 #define G_RAID_VOLUME_RL_RAID5E 0x15 227 #define G_RAID_VOLUME_RL_RAID5EE 0x25 228 #define G_RAID_VOLUME_RL_RAID5R 0x35 229 #define G_RAID_VOLUME_RL_UNKNOWN 0xff 230 231 #define G_RAID_VOLUME_RLQ_NONE 0x00 232 #define G_RAID_VOLUME_RLQ_R1SM 0x00 233 #define G_RAID_VOLUME_RLQ_R1MM 0x01 234 #define G_RAID_VOLUME_RLQ_R3P0 0x00 235 #define G_RAID_VOLUME_RLQ_R3PN 0x01 236 #define G_RAID_VOLUME_RLQ_R4P0 0x00 237 #define G_RAID_VOLUME_RLQ_R4PN 0x01 238 #define G_RAID_VOLUME_RLQ_R5RA 0x00 239 #define G_RAID_VOLUME_RLQ_R5RS 0x01 240 #define G_RAID_VOLUME_RLQ_R5LA 0x02 241 #define G_RAID_VOLUME_RLQ_R5LS 0x03 242 #define G_RAID_VOLUME_RLQ_R6RA 0x00 243 #define G_RAID_VOLUME_RLQ_R6RS 0x01 244 #define G_RAID_VOLUME_RLQ_R6LA 0x02 245 #define G_RAID_VOLUME_RLQ_R6LS 0x03 246 #define G_RAID_VOLUME_RLQ_RMDFRA 0x00 247 #define G_RAID_VOLUME_RLQ_RMDFRS 0x01 248 #define G_RAID_VOLUME_RLQ_RMDFLA 0x02 249 #define G_RAID_VOLUME_RLQ_RMDFLS 0x03 250 #define G_RAID_VOLUME_RLQ_R1EA 0x00 251 #define G_RAID_VOLUME_RLQ_R1EO 0x01 252 #define G_RAID_VOLUME_RLQ_R5ERA 0x00 253 #define G_RAID_VOLUME_RLQ_R5ERS 0x01 254 #define G_RAID_VOLUME_RLQ_R5ELA 0x02 255 #define G_RAID_VOLUME_RLQ_R5ELS 0x03 256 #define G_RAID_VOLUME_RLQ_R5EERA 0x00 257 #define G_RAID_VOLUME_RLQ_R5EERS 0x01 258 #define G_RAID_VOLUME_RLQ_R5EELA 0x02 259 #define G_RAID_VOLUME_RLQ_R5EELS 0x03 260 #define G_RAID_VOLUME_RLQ_R5RRA 0x00 261 #define G_RAID_VOLUME_RLQ_R5RRS 0x01 262 #define G_RAID_VOLUME_RLQ_R5RLA 0x02 263 #define G_RAID_VOLUME_RLQ_R5RLS 0x03 264 #define G_RAID_VOLUME_RLQ_UNKNOWN 0xff 265 266 struct g_raid_volume; 267 268 struct g_raid_volume { 269 struct g_raid_softc *v_softc; /* Back-pointer to softc. */ 270 struct g_provider *v_provider; /* GEOM provider. */ 271 struct g_raid_subdisk v_subdisks[G_RAID_MAX_SUBDISKS]; 272 /* Subdisks of this volume. */ 273 void *v_md_data; /* Volume's metadata storage. */ 274 struct g_raid_tr_object *v_tr; /* Transformation object. */ 275 char v_name[G_RAID_MAX_VOLUMENAME]; 276 /* Volume name. */ 277 u_int v_state; /* Volume state. */ 278 u_int v_raid_level; /* Array RAID level. */ 279 u_int v_raid_level_qualifier; /* RAID level det. */ 280 u_int v_disks_count; /* Number of disks in array. */ 281 u_int v_strip_size; /* Array strip size. */ 282 u_int v_sectorsize; /* Volume sector size. */ 283 off_t v_mediasize; /* Volume media size. */ 284 struct bio_queue_head v_inflight; /* In-flight write requests. */ 285 struct bio_queue_head v_locked; /* Blocked I/O requests. */ 286 LIST_HEAD(, g_raid_lock) v_locks; /* List of locked regions. */ 287 int v_pending_lock; /* writes to locked region */ 288 int v_dirty; /* Volume is DIRTY. */ 289 struct timeval v_last_done; /* Time of the last I/O. */ 290 time_t v_last_write; /* Time of the last write. */ 291 u_int v_writes; /* Number of active writes. */ 292 struct root_hold_token *v_rootmount; /* Root mount delay token. */ 293 int v_starting; /* Volume is starting */ 294 int v_stopping; /* Volume is stopping */ 295 int v_provider_open; /* Number of opens. */ 296 int v_global_id; /* Global volume ID (rX). */ 297 TAILQ_ENTRY(g_raid_volume) v_next; /* List of volumes entry. */ 298 LIST_ENTRY(g_raid_volume) v_global_next; /* Global list entry. */ 299 }; 300 301 #define G_RAID_NODE_E_WAKE 0x00 302 #define G_RAID_NODE_E_START 0x01 303 304 struct g_raid_softc { 305 struct g_raid_md_object *sc_md; /* Metadata object. */ 306 struct g_geom *sc_geom; /* GEOM class instance. */ 307 uint64_t sc_flags; /* Additional flags. */ 308 TAILQ_HEAD(, g_raid_volume) sc_volumes; /* List of volumes. */ 309 TAILQ_HEAD(, g_raid_disk) sc_disks; /* List of disks. */ 310 struct sx sc_lock; /* Main node lock. */ 311 struct proc *sc_worker; /* Worker process. */ 312 struct mtx sc_queue_mtx; /* Worker queues lock. */ 313 TAILQ_HEAD(, g_raid_event) sc_events; /* Worker events queue. */ 314 struct bio_queue_head sc_queue; /* Worker I/O queue. */ 315 int sc_stopping; /* Node is stopping */ 316 }; 317 #define sc_name sc_geom->name 318 319 /* 320 * KOBJ parent class of metadata processing modules. 321 */ 322 struct g_raid_md_class { 323 KOBJ_CLASS_FIELDS; 324 int mdc_priority; 325 LIST_ENTRY(g_raid_md_class) mdc_list; 326 }; 327 328 /* 329 * KOBJ instance of metadata processing module. 330 */ 331 struct g_raid_md_object { 332 KOBJ_FIELDS; 333 struct g_raid_md_class *mdo_class; 334 struct g_raid_softc *mdo_softc; /* Back-pointer to softc. */ 335 }; 336 337 int g_raid_md_modevent(module_t, int, void *); 338 339 #define G_RAID_MD_DECLARE(name) \ 340 static moduledata_t name##_mod = { \ 341 #name, \ 342 g_raid_md_modevent, \ 343 &name##_class \ 344 }; \ 345 DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); \ 346 MODULE_DEPEND(name, geom_raid, 0, 0, 0) 347 348 /* 349 * KOBJ parent class of data transformation modules. 350 */ 351 struct g_raid_tr_class { 352 KOBJ_CLASS_FIELDS; 353 int trc_priority; 354 LIST_ENTRY(g_raid_tr_class) trc_list; 355 }; 356 357 /* 358 * KOBJ instance of data transformation module. 359 */ 360 struct g_raid_tr_object { 361 KOBJ_FIELDS; 362 struct g_raid_tr_class *tro_class; 363 struct g_raid_volume *tro_volume; /* Back-pointer to volume. */ 364 }; 365 366 int g_raid_tr_modevent(module_t, int, void *); 367 368 #define G_RAID_TR_DECLARE(name) \ 369 static moduledata_t name##_mod = { \ 370 #name, \ 371 g_raid_tr_modevent, \ 372 &name##_class \ 373 }; \ 374 DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); \ 375 MODULE_DEPEND(name, geom_raid, 0, 0, 0) 376 377 const char * g_raid_volume_level2str(int level, int qual); 378 int g_raid_volume_str2level(const char *str, int *level, int *qual); 379 const char * g_raid_volume_state2str(int state); 380 const char * g_raid_subdisk_state2str(int state); 381 const char * g_raid_disk_state2str(int state); 382 383 struct g_raid_softc * g_raid_create_node(struct g_class *mp, 384 const char *name, struct g_raid_md_object *md); 385 int g_raid_create_node_format(const char *format, struct g_geom **gp); 386 struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc, 387 const char *name, int id); 388 struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc); 389 const char * g_raid_get_diskname(struct g_raid_disk *disk); 390 391 int g_raid_start_volume(struct g_raid_volume *vol); 392 393 int g_raid_destroy_node(struct g_raid_softc *sc, int worker); 394 int g_raid_destroy_volume(struct g_raid_volume *vol); 395 int g_raid_destroy_disk(struct g_raid_disk *disk); 396 397 void g_raid_iodone(struct bio *bp, int error); 398 void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp); 399 int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, 400 void *virtual, vm_offset_t physical, off_t offset, size_t length); 401 402 struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc, 403 const char *name); 404 void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp); 405 406 void g_raid_report_disk_state(struct g_raid_disk *disk); 407 void g_raid_change_disk_state(struct g_raid_disk *disk, int state); 408 void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state); 409 void g_raid_change_volume_state(struct g_raid_volume *vol, int state); 410 411 void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, 412 struct g_raid_subdisk *sd, struct g_raid_disk *disk); 413 void g_raid_fail_disk(struct g_raid_softc *sc, 414 struct g_raid_subdisk *sd, struct g_raid_disk *disk); 415 416 void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp); 417 int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, 418 void *virtual, vm_offset_t physical, off_t offset, size_t length); 419 420 u_int g_raid_ndisks(struct g_raid_softc *sc, int state); 421 u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state); 422 u_int g_raid_nopens(struct g_raid_softc *sc); 423 struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol, 424 int state); 425 #define G_RAID_DESTROY_SOFT 0 426 #define G_RAID_DESTROY_DELAYED 1 427 #define G_RAID_DESTROY_HARD 2 428 int g_raid_destroy(struct g_raid_softc *sc, int how); 429 int g_raid_event_send(void *arg, int event, int flags); 430 int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, 431 struct bio *ignore, void *argp); 432 int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len); 433 434 g_ctl_req_t g_raid_ctl; 435 #endif /* _KERNEL */ 436 437 #endif /* !_G_RAID_H_ */ 438