xref: /freebsd/sys/geom/raid/g_raid.h (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #ifndef	_G_RAID_H_
30 #define	_G_RAID_H_
31 
32 #include <sys/param.h>
33 #include <sys/kobj.h>
34 #include <sys/bio.h>
35 #include <sys/time.h>
36 
/* Name string of the GEOM RAID class. */
#define	G_RAID_CLASS_NAME	"RAID"

/* Magic string associated with GEOM RAID. */
#define	G_RAID_MAGIC		"GEOM::RAID"

/* Version number of this interface. */
#define	G_RAID_VERSION		0
42 
/*
 * Forward declarations so that pointers to the metadata and transformation
 * objects can be used before the full structure definitions appear.
 */
struct g_raid_md_object;
struct g_raid_tr_object;
45 
/*
 * Device flag bits (unsigned long long constants).
 * G_RAID_DEVICE_FLAG_MASK is the OR of every flag defined here.
 */
#define	G_RAID_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
#define	G_RAID_DEVICE_FLAG_NOFAILSYNC	0x0000000000000002ULL
#define	G_RAID_DEVICE_FLAG_MASK		(G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
					 G_RAID_DEVICE_FLAG_NOFAILSYNC)
50 
51 #ifdef _KERNEL
52 extern u_int g_raid_aggressive_spare;
53 extern u_int g_raid_debug;
54 extern int g_raid_read_err_thresh;
55 extern u_int g_raid_start_timeout;
56 extern struct g_class g_raid_class;
57 
/*
 * Print a debug message when g_raid_debug >= lvl.
 *
 * lvl	message level, compared against g_raid_debug
 * fmt	printf-style format; must be a string literal, because it is
 *	concatenated with the prefix and the trailing newline
 * ...	arguments for fmt
 *
 * When g_raid_debug is non-zero the level is included in the prefix.
 * Nothing (including the variadic arguments) is evaluated when the message
 * is filtered out.
 */
#define	G_RAID_DEBUG(lvl, fmt, ...)	do {				\
	if (g_raid_debug >= (lvl)) {					\
		if (g_raid_debug > 0) {					\
			printf("GEOM_RAID[%u]: " fmt "\n",		\
			    (lvl), ## __VA_ARGS__);			\
		} else {						\
			printf("GEOM_RAID: " fmt "\n",			\
			    ## __VA_ARGS__);				\
		}							\
	}								\
} while (0)
/*
 * Like G_RAID_DEBUG, but additionally prefixes the message with
 * (sc)->sc_name.
 *
 * lvl	message level, compared against g_raid_debug
 * sc	pointer to an object providing sc_name (a C string)
 * fmt	printf-style format; must be a string literal
 * ...	arguments for fmt
 */
#define	G_RAID_DEBUG1(lvl, sc, fmt, ...)	do {			\
	if (g_raid_debug >= (lvl)) {					\
		if (g_raid_debug > 0) {					\
			printf("GEOM_RAID[%u]: %s: " fmt "\n",		\
			    (lvl), (sc)->sc_name, ## __VA_ARGS__);	\
		} else {						\
			printf("GEOM_RAID: %s: " fmt "\n",		\
			    (sc)->sc_name, ## __VA_ARGS__);		\
		}							\
	}								\
} while (0)
/*
 * Log a message about an I/O request when g_raid_debug >= lvl.
 * The formatted message is followed by a space, the output of
 * g_print_bio(bp) and a newline.
 *
 * lvl	message level, compared against g_raid_debug
 * bp	request handed to g_print_bio()
 * fmt	printf-style format; must be a string literal
 * ...	arguments for fmt
 */
#define	G_RAID_LOGREQ(lvl, bp, fmt, ...)	do {			\
	if (g_raid_debug >= (lvl)) {					\
		if (g_raid_debug > 0) {					\
			printf("GEOM_RAID[%u]: " fmt " ",		\
			    (lvl), ## __VA_ARGS__);			\
		} else {						\
			printf("GEOM_RAID: " fmt " ",			\
			    ## __VA_ARGS__);				\
		}							\
		g_print_bio(bp);					\
		printf("\n");						\
	}								\
} while (0)
91 
/*
 * Flags we use to distinguish I/O initiated by the TR layer to maintain
 * the volume's characteristics, fix subdisks, extra copies of data, etc.
 *
 * G_RAID_BIO_FLAG_SYNC		I/O to update an extra copy of the data
 *				for RAID volumes that maintain extra data
 *				and need to rebuild that data.
 * G_RAID_BIO_FLAG_REMAP	I/O done to try to provoke a subdisk into
 *				doing some desirable action such as bad
 *				block remapping after we detect a bad part
 *				of the disk.
 * G_RAID_BIO_FLAG_LOCKED	I/O holds a range lock that should be released.
 *
 * and the following meta item:
 * G_RAID_BIO_FLAG_SPECIAL	Any of the I/O flags that need to make it
 *				through the range locking which would
 *				otherwise defer the I/O until after that
 *				range is unlocked.
 */
#define	G_RAID_BIO_FLAG_SYNC		0x01
#define	G_RAID_BIO_FLAG_REMAP		0x02
#define	G_RAID_BIO_FLAG_SPECIAL \
		(G_RAID_BIO_FLAG_SYNC | G_RAID_BIO_FLAG_REMAP)
#define	G_RAID_BIO_FLAG_LOCKED		0x80
116 
/*
 * Descriptor of one range lock; linked into a list through l_next
 * (struct g_raid_volume keeps such a list in v_locks).
 */
struct g_raid_lock {
	off_t			 l_offset;	/* Start of the range. */
	off_t			 l_length;	/* Length of the range. */
	void			*l_callback_arg; /* Opaque argument kept with
						    the lock. */
	int			 l_pending;	/* NOTE(review): presumably the
						   count of requests still
						   pending on the range --
						   confirm. */
	LIST_ENTRY(g_raid_lock)	 l_next;	/* List linkage. */
};
124 
/*
 * Event flag bits (stored in e_flags).
 * NOTE(review): VOLUME/SUBDISK/DISK presumably identify the kind of object
 * e_tgt points to, WAIT/DONE the completion handshake -- confirm against
 * g_raid_event_send().
 */
#define	G_RAID_EVENT_WAIT	0x01
#define	G_RAID_EVENT_VOLUME	0x02
#define	G_RAID_EVENT_SUBDISK	0x04
#define	G_RAID_EVENT_DISK	0x08
#define	G_RAID_EVENT_DONE	0x10

/* One queued event; linked into a tail queue through e_next. */
struct g_raid_event {
	void			*e_tgt;		/* Target object. */
	int			 e_event;	/* Event code. */
	int			 e_flags;	/* G_RAID_EVENT_* bits. */
	int			 e_error;	/* Error value. */
	TAILQ_ENTRY(g_raid_event) e_next;	/* Queue linkage. */
};
/* Disk states (see d_state in struct g_raid_disk). */
#define G_RAID_DISK_S_NONE		0x00	/* State is unknown. */
#define G_RAID_DISK_S_OFFLINE		0x01	/* Missing disk placeholder. */
#define G_RAID_DISK_S_FAILED		0x02	/* Failed. */
#define G_RAID_DISK_S_STALE_FAILED	0x03	/* Old failed. */
#define G_RAID_DISK_S_SPARE		0x04	/* Hot-spare. */
#define G_RAID_DISK_S_STALE		0x05	/* Old disk, unused now. */
#define G_RAID_DISK_S_ACTIVE		0x06	/* Operational. */

/* Disk events. */
#define G_RAID_DISK_E_DISCONNECTED	0x01
146 
147 struct g_raid_disk {
148 	struct g_raid_softc	*d_softc;	/* Back-pointer to softc. */
149 	struct g_consumer	*d_consumer;	/* GEOM disk consumer. */
150 	void			*d_md_data;	/* Disk's metadata storage. */
151 	struct g_kerneldump	 d_kd;		/* Kernel dumping method/args. */
152 	uint64_t		 d_flags;	/* Additional flags. */
153 	u_int			 d_state;	/* Disk state. */
154 	u_int			 d_load;	/* Disk average load. */
155 	off_t			 d_last_offset;	/* Last head offset. */
156 	int			 d_read_errs;	/* Count of the read errors */
157 	TAILQ_HEAD(, g_raid_subdisk)	 d_subdisks; /* List of subdisks. */
158 	TAILQ_ENTRY(g_raid_disk)	 d_next;	/* Next disk in the node. */
159 };
160 
/* Subdisk states (see sd_state in struct g_raid_subdisk). */
#define G_RAID_SUBDISK_S_NONE		0x00	/* Absent. */
#define G_RAID_SUBDISK_S_FAILED		0x01	/* Failed. */
#define G_RAID_SUBDISK_S_NEW		0x02	/* Blank. */
#define G_RAID_SUBDISK_S_REBUILD	0x03	/* Blank + rebuild. */
#define G_RAID_SUBDISK_S_UNINITIALIZED	0x04	/* Disk of the new volume. */
#define G_RAID_SUBDISK_S_STALE		0x05	/* Dirty. */
#define G_RAID_SUBDISK_S_RESYNC		0x06	/* Dirty + check/repair. */
#define G_RAID_SUBDISK_S_ACTIVE		0x07	/* Usable. */

/* Subdisk events. */
#define G_RAID_SUBDISK_E_NEW		0x01	/* A new subdisk has arrived */
#define G_RAID_SUBDISK_E_FAILED		0x02	/* A subdisk failed, but remains in volume */
#define G_RAID_SUBDISK_E_DISCONNECTED	0x03	/* A subdisk removed from volume. */
#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80	/* translation private events */
174 
/*
 * G_RAID_SUBDISK_POS(sd): last head offset of the subdisk's disk, made
 * relative to the start of the subdisk; 0 when the subdisk has no disk.
 * G_RAID_SUBDISK_LOAD(sd): d_load of the subdisk's disk; 0 when the subdisk
 * has no disk.
 * Both macros evaluate sd more than once, so pass an expression without
 * side effects.
 */
#define G_RAID_SUBDISK_POS(sd)						\
    ((sd)->sd_disk ? ((sd)->sd_disk->d_last_offset - (sd)->sd_offset) : 0)
#define G_RAID_SUBDISK_TRACK_SIZE	(1 * 1024 * 1024)
#define G_RAID_SUBDISK_LOAD(sd)						\
    ((sd)->sd_disk ? ((sd)->sd_disk->d_load) : 0)
#define G_RAID_SUBDISK_LOAD_SCALE	256
181 
182 struct g_raid_subdisk {
183 	struct g_raid_softc	*sd_softc;	/* Back-pointer to softc. */
184 	struct g_raid_disk	*sd_disk;	/* Where this subdisk lives. */
185 	struct g_raid_volume	*sd_volume;	/* Volume, sd is a part of. */
186 	off_t			 sd_offset;	/* Offset on the disk. */
187 	off_t			 sd_size;	/* Size on the disk. */
188 	u_int			 sd_pos;	/* Position in volume. */
189 	u_int			 sd_state;	/* Subdisk state. */
190 	off_t			 sd_rebuild_pos; /* Rebuild position. */
191 	int			 sd_recovery;	/* Count of recovery reqs. */
192 	TAILQ_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
193 };
194 
/* Size of the v_subdisks[] array and of the v_name[] buffer of a volume. */
#define G_RAID_MAX_SUBDISKS	16
#define G_RAID_MAX_VOLUMENAME	32
197 
/* Volume states (see v_state in struct g_raid_volume). */
#define G_RAID_VOLUME_S_STARTING	0x00
#define G_RAID_VOLUME_S_BROKEN		0x01
#define G_RAID_VOLUME_S_DEGRADED	0x02
#define G_RAID_VOLUME_S_SUBOPTIMAL	0x03
#define G_RAID_VOLUME_S_OPTIMAL		0x04
#define G_RAID_VOLUME_S_UNSUPPORTED	0x05
#define G_RAID_VOLUME_S_STOPPED		0x06

/*
 * True when state s is DEGRADED, SUBOPTIMAL or OPTIMAL.
 * s is evaluated up to three times; pass an expression without side effects.
 */
#define G_RAID_VOLUME_S_ALIVE(s)			\
    ((s) == G_RAID_VOLUME_S_DEGRADED ||			\
     (s) == G_RAID_VOLUME_S_SUBOPTIMAL ||		\
     (s) == G_RAID_VOLUME_S_OPTIMAL)
210 
/* Volume events. */
#define G_RAID_VOLUME_E_DOWN		0x00
#define G_RAID_VOLUME_E_UP		0x01
#define G_RAID_VOLUME_E_START		0x10
#define G_RAID_VOLUME_E_STARTMD		0x11
215 
/* RAID level codes (see v_raid_level in struct g_raid_volume). */
#define G_RAID_VOLUME_RL_RAID0		0x00
#define G_RAID_VOLUME_RL_RAID1		0x01
#define G_RAID_VOLUME_RL_RAID3		0x03
#define G_RAID_VOLUME_RL_RAID4		0x04
#define G_RAID_VOLUME_RL_RAID5		0x05
#define G_RAID_VOLUME_RL_RAID6		0x06
#define G_RAID_VOLUME_RL_RAID1E		0x11
#define G_RAID_VOLUME_RL_SINGLE		0x0f
#define G_RAID_VOLUME_RL_CONCAT		0x1f
#define G_RAID_VOLUME_RL_RAID5E		0x15
#define G_RAID_VOLUME_RL_RAID5EE	0x25
#define G_RAID_VOLUME_RL_UNKNOWN	0xff

/* RAID level qualifier codes (see v_raid_level_qualifier). */
#define G_RAID_VOLUME_RLQ_NONE		0x00
#define G_RAID_VOLUME_RLQ_UNKNOWN	0xff
231 
232 struct g_raid_volume;
233 
234 struct g_raid_volume {
235 	struct g_raid_softc	*v_softc;	/* Back-pointer to softc. */
236 	struct g_provider	*v_provider;	/* GEOM provider. */
237 	struct g_raid_subdisk	 v_subdisks[G_RAID_MAX_SUBDISKS];
238 						/* Subdisks of this volume. */
239 	void			*v_md_data;	/* Volume's metadata storage. */
240 	struct g_raid_tr_object	*v_tr;		/* Transformation object. */
241 	char			 v_name[G_RAID_MAX_VOLUMENAME];
242 						/* Volume name. */
243 	u_int			 v_state;	/* Volume state. */
244 	u_int			 v_raid_level;	/* Array RAID level. */
245 	u_int			 v_raid_level_qualifier; /* RAID level det. */
246 	u_int			 v_disks_count;	/* Number of disks in array. */
247 	u_int			 v_strip_size;	/* Array strip size. */
248 	u_int			 v_sectorsize;	/* Volume sector size. */
249 	off_t			 v_mediasize;	/* Volume media size.  */
250 	struct bio_queue_head	 v_inflight;	/* In-flight write requests. */
251 	struct bio_queue_head	 v_locked;	/* Blocked I/O requests. */
252 	LIST_HEAD(, g_raid_lock) v_locks;	 /* List of locked regions. */
253 	int			 v_pending_lock; /* writes to locked region */
254 	int			 v_dirty;	/* Volume is DIRTY. */
255 	struct timeval		 v_last_done;	/* Time of the last I/O. */
256 	time_t			 v_last_write;	/* Time of the last write. */
257 	u_int			 v_writes;	/* Number of active writes. */
258 	struct root_hold_token	*v_rootmount;	/* Root mount delay token. */
259 	int			 v_starting;	/* Volume is starting */
260 	int			 v_stopping;	/* Volume is stopping */
261 	int			 v_provider_open; /* Number of opens. */
262 	int			 v_global_id;	/* Global volume ID (rX). */
263 	TAILQ_ENTRY(g_raid_volume)	 v_next; /* List of volumes entry. */
264 	LIST_ENTRY(g_raid_volume)	 v_global_next; /* Global list entry. */
265 };
266 
/* Node events. */
#define G_RAID_NODE_E_WAKE	0x00
#define G_RAID_NODE_E_START	0x01
269 
270 struct g_raid_softc {
271 	struct g_raid_md_object	*sc_md;		/* Metadata object. */
272 	struct g_geom		*sc_geom;	/* GEOM class instance. */
273 	uint64_t		 sc_flags;	/* Additional flags. */
274 	TAILQ_HEAD(, g_raid_volume)	 sc_volumes;	/* List of volumes. */
275 	TAILQ_HEAD(, g_raid_disk)	 sc_disks;	/* List of disks. */
276 	struct sx		 sc_lock;	/* Main node lock. */
277 	struct proc		*sc_worker;	/* Worker process. */
278 	struct mtx		 sc_queue_mtx;	/* Worker queues lock. */
279 	TAILQ_HEAD(, g_raid_event) sc_events;	/* Worker events queue. */
280 	struct bio_queue_head	 sc_queue;	/* Worker I/O queue. */
281 	int			 sc_stopping;	/* Node is stopping */
282 };
283 #define	sc_name	sc_geom->name
284 
285 /*
286  * KOBJ parent class of metadata processing modules.
287  */
288 struct g_raid_md_class {
289 	KOBJ_CLASS_FIELDS;
290 	int		 mdc_priority;
291 	LIST_ENTRY(g_raid_md_class) mdc_list;
292 };
293 
294 /*
295  * KOBJ instance of metadata processing module.
296  */
297 struct g_raid_md_object {
298 	KOBJ_FIELDS;
299 	struct g_raid_md_class	*mdo_class;
300 	struct g_raid_softc	*mdo_softc;	/* Back-pointer to softc. */
301 };
302 
303 int g_raid_md_modevent(module_t, int, void *);
304 
305 #define	G_RAID_MD_DECLARE(name)					\
306     static moduledata_t name##_mod = {				\
307 	#name,							\
308 	g_raid_md_modevent,					\
309 	&name##_class						\
310     };								\
311     DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);	\
312     MODULE_DEPEND(name, geom_raid, 0, 0, 0)
313 
314 /*
315  * KOBJ parent class of data transformation modules.
316  */
317 struct g_raid_tr_class {
318 	KOBJ_CLASS_FIELDS;
319 	int		 trc_priority;
320 	LIST_ENTRY(g_raid_tr_class) trc_list;
321 };
322 
323 /*
324  * KOBJ instance of data transformation module.
325  */
326 struct g_raid_tr_object {
327 	KOBJ_FIELDS;
328 	struct g_raid_tr_class	*tro_class;
329 	struct g_raid_volume 	*tro_volume;	/* Back-pointer to volume. */
330 };
331 
332 int g_raid_tr_modevent(module_t, int, void *);
333 
334 #define	G_RAID_TR_DECLARE(name)					\
335     static moduledata_t name##_mod = {				\
336 	#name,							\
337 	g_raid_tr_modevent,					\
338 	&name##_class						\
339     };								\
340     DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);	\
341     MODULE_DEPEND(name, geom_raid, 0, 0, 0)
342 
/*
 * Conversions between numeric codes and strings.
 * NOTE(review): judging by the names these translate RAID level/qualifier,
 * volume state, subdisk state and disk state codes -- confirm the exact
 * return conventions against the definitions.
 */
const char * g_raid_volume_level2str(int level, int qual);
int g_raid_volume_str2level(const char *str, int *level, int *qual);
const char * g_raid_volume_state2str(int state);
const char * g_raid_subdisk_state2str(int state);
const char * g_raid_disk_state2str(int state);
348 
/*
 * Object construction, start and destruction entry points (defined
 * elsewhere).
 * NOTE(review): ownership of the returned pointers and the meaning of the
 * int return values are not visible in this header -- confirm against the
 * definitions.
 */
struct g_raid_softc * g_raid_create_node(struct g_class *mp,
    const char *name, struct g_raid_md_object *md);
int g_raid_create_node_format(const char *format, struct g_geom **gp);
struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc,
    const char *name, int id);
struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc);
const char * g_raid_get_diskname(struct g_raid_disk *disk);

int g_raid_start_volume(struct g_raid_volume *vol);

int g_raid_destroy_node(struct g_raid_softc *sc, int worker);
int g_raid_destroy_volume(struct g_raid_volume *vol);
int g_raid_destroy_disk(struct g_raid_disk *disk);
362 
363 void g_raid_iodone(struct bio *bp, int error);
364 void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp);
365 int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
366     void *virtual, vm_offset_t physical, off_t offset, size_t length);
367 
368 struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc,
369     const char *name);
370 void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp);
371 
/*
 * State reporting and state transitions (defined elsewhere).  The `state`
 * arguments presumably take the G_RAID_DISK_S_*, G_RAID_SUBDISK_S_* and
 * G_RAID_VOLUME_S_* codes respectively -- confirm against the definitions.
 */
void g_raid_report_disk_state(struct g_raid_disk *disk);
void g_raid_change_disk_state(struct g_raid_disk *disk, int state);
void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state);
void g_raid_change_volume_state(struct g_raid_volume *vol, int state);

/* Metadata write-out and disk failure handling (defined elsewhere). */
void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk);
void g_raid_fail_disk(struct g_raid_softc *sc,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk);
381 
382 void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp);
383 int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
384     void *virtual, vm_offset_t physical, off_t offset, size_t length);
385 
386 u_int g_raid_ndisks(struct g_raid_softc *sc, int state);
387 u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state);
388 u_int g_raid_nopens(struct g_raid_softc *sc);
389 struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol,
390     int state);
/* Values for the `how` argument of g_raid_destroy(). */
#define	G_RAID_DESTROY_SOFT		0
#define	G_RAID_DESTROY_DELAYED		1
#define	G_RAID_DESTROY_HARD		2
int g_raid_destroy(struct g_raid_softc *sc, int how);
/*
 * Event posting and range locking (defined elsewhere).
 * NOTE(review): `flags` of g_raid_event_send() presumably takes the
 * G_RAID_EVENT_* bits, and `argp` of g_raid_lock_range() is presumably what
 * ends up in l_callback_arg of struct g_raid_lock -- confirm.
 */
int g_raid_event_send(void *arg, int event, int flags);
int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
    struct bio *ignore, void *argp);
int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);
399 
400 g_ctl_req_t g_raid_ctl;
401 #endif	/* _KERNEL */
402 
403 #endif	/* !_G_RAID_H_ */
404