xref: /linux/fs/btrfs/raid56.h (revision 7696286034ac72cf9b46499be1715ac62fd302c3)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Copyright (C) 2012 Fusion-io  All rights reserved.
4  * Copyright (C) 2012 Intel Corp. All rights reserved.
5  */
6 
7 #ifndef BTRFS_RAID56_H
8 #define BTRFS_RAID56_H
9 
10 #include <linux/types.h>
11 #include <linux/list.h>
12 #include <linux/spinlock.h>
13 #include <linux/bio.h>
14 #include <linux/refcount.h>
15 #include <linux/workqueue.h>
16 #include "volumes.h"
17 
18 struct page;
19 struct btrfs_fs_info;
20 
/* The high-level operation a btrfs_raid_bio performs. */
enum btrfs_rbio_ops {
	/* Write to the full stripe (full-stripe write or RMW). */
	BTRFS_RBIO_WRITE,
	/* Rebuild data for a read from higher up (handled differently from RMW rebuild). */
	BTRFS_RBIO_READ_REBUILD,
	/* Scrub of the parity of a full stripe. */
	BTRFS_RBIO_PARITY_SCRUB,
};
26 
27 /*
28  * Overview of btrfs_raid_bio.
29  *
30  * One btrfs_raid_bio represents a full stripe of RAID56, including both data
31  * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
32  *
33  * One btrfs_raid_bio can have one or more bios from higher layer, covering
34  * part or all of the data stripes.
35  *
36  * [PAGES FROM HIGHER LAYER BIOS]
37  * Higher layer bios are in the btrfs_raid_bio::bio_list.
38  *
39  * Pages from the bio_list are represented like the following:
40  *
41  * bio_list:	     |<- Bio 1 ->|             |<- Bio 2 ->|  ...
42  * bio_paddrs:	    [0]   [1]   [2]    [3]    [4]    [5]      ...
43  *
44  * If there is a bio covering a sector (one btrfs fs block), the corresponding
45  * pointer in btrfs_raid_bio::bio_paddrs[] will point to the physical address
46  * (with the offset inside the page) of the corresponding bio.
47  *
48  * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
49  * be INVALID_PADDR.
50  *
51  * The length of each entry in bio_paddrs[] is a step (aka, min(sectorsize, PAGE_SIZE)).
52  *
53  * [PAGES FOR INTERNAL USAGES]
54  * Pages not covered by any bio or belonging to P/Q stripes are stored in
55  * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
56  *
57  * stripe_pages:       |<- Page 0 ->|<- Page 1 ->|  ...
58  * stripe_paddrs:     [0]    [1]   [2]    [3]   [4] ...
59  *
60  * stripe_pages[] array stores all the pages covering the full stripe, including
61  * data and P/Q pages.
62  * stripe_pages[0] is the first page of the first data stripe.
63  * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
64  * data stripe.
65  *
66  * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
67  * (the bio covers all data stripes) there is no need to allocate pages for
68  * data stripes (can grab from bio_paddrs[]).
69  *
70  * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
71  * stripe_paddrs[i] will be INVALID_PADDR.
72  *
73  * The length of each entry in stripe_paddrs[] is a step.
74  *
75  * [LOCATING A SECTOR]
76  * To locate a sector for IO, we need the following info:
77  *
78  * - stripe_nr
79  *   Starts from 0 (representing the first data stripe), ends at
80  *   @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
81  *
82  * - sector_nr
83  *   Starts from 0 (representing the first sector of the stripe), ends
84  *   at BTRFS_STRIPE_LEN / sectorsize - 1.
85  *
86  * - step_nr
87  *   A step is min(sector_size, PAGE_SIZE).
88  *
89  *   Starts from 0 (representing the first step of the sector), ends
90  *   at @sector_nsteps - 1.
91  *
92  *   Most call sites do not need to bother with this parameter.
93  *   It is for bs > ps support and only for vertical stripe related works.
94  *   (e.g. RMW/recover)
95  *
96  * - from which array
97  *   Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the
98  *   bio_paddrs[] (aka, from the higher layer bios).
99  *
100  * For IO, a physical address is returned, so that we can extract the page and
101  * the offset inside the page for IO.
102  * A special value INVALID_PADDR represents when the physical address is invalid,
103  * normally meaning there is no page allocated for the specified sector.
104  */
/* In-memory state for one full-stripe RAID56 operation (see overview above). */
struct btrfs_raid_bio {
	/* IO context describing the device/stripe mapping of the full stripe. */
	struct btrfs_io_context *bioc;

	/*
	 * While we're doing RMW on a stripe we put it into a hash table so we
	 * can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* For scheduling work in the helper threads */
	struct work_struct work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into the stripe
	 * in hopes of avoiding the full RMW
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * Also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged.  The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO.
	 */
	struct list_head plug_list;

	/* Flags that tell us if it is safe to merge with this bio. */
	unsigned long flags;

	/*
	 * Set if we're doing a parity rebuild for a read from higher up, which
	 * is handled differently from a parity rebuild as part of RMW.
	 */
	enum btrfs_rbio_ops operation;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no p/q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/*
	 * How many steps there are for one sector.
	 *
	 * For bs > ps cases, it's sectorsize / PAGE_SIZE.
	 * For bs <= ps cases, it's always 1.
	 */
	u8 sector_nsteps;

	/* Stripe number that we're scrubbing  */
	u8 scrubp;

	/*
	 * Size of all the bios in the bio_list.  This helps us decide if the
	 * rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	/* Lifetime reference count of the rbio. */
	refcount_t refs;

	/* IOs still in flight; presumably io_wait below is woken as they complete. */
	atomic_t stripes_pending;

	wait_queue_head_t io_wait;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long dbitmap;

	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
	unsigned long finish_pbitmap;

	/*
	 * The following arrays are allocated together with the rbio.  We make
	 * the rbio big enough to hold them all and set up their locations when
	 * the rbio is allocated.
	 */

	/*
	 * Pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q).
	 */
	struct page **stripe_pages;

	/* Physical addresses of the sectors in the bio_list, for faster lookup */
	phys_addr_t *bio_paddrs;

	/* Physical addresses of the sectors backed by stripe_pages[]. */
	phys_addr_t *stripe_paddrs;

	/* Each set bit means the corresponding sector in stripe_paddrs[] is uptodate. */
	unsigned long *stripe_uptodate_bitmap;

	/* Allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/*
	 * The bitmap recording where IO errors happened.
	 * Each bit corresponds to one sector in either the bio_paddrs[] or
	 * stripe_paddrs[] array.
	 */
	unsigned long *error_bitmap;

	/*
	 * Checksum buffer if the rbio is for data.  The buffer should cover
	 * all data sectors (excluding P/Q sectors).
	 */
	u8 *csum_buf;

	/*
	 * Each bit represents if the corresponding sector has data csum found.
	 * Should only cover data sectors (excluding P/Q sectors).
	 */
	unsigned long *csum_bitmap;
};
234 
/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * No matter signed or not, (-1) is always the one indicating we can not grab
 * the proper stripe number.
 */
struct raid56_bio_trace_info {
	u64 devid;

	/* The offset inside the stripe. (<= BTRFS_STRIPE_LEN) */
	u32 offset;

	/*
	 * Stripe number.
	 * 0 is the first data stripe, and nr_data for P stripe,
	 * nr_data + 1 for Q stripe.
	 * Values >= real_stripes mean the proper stripe number could not be
	 * determined (see the (-1) note above).
	 */
	u8 stripe_nr;
};
256 
257 static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
258 {
259 	return map->num_stripes - btrfs_nr_parity_stripes(map->type);
260 }
261 
262 static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
263 {
264 	return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
265 }
266 
/*
 * Sentinel values denoting the P and Q stripes where a stripe number or
 * logical address would normally appear (they sit at the very top of the
 * u64 range, so they cannot collide with real values).
 */
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

/* True if @x is one of the P/Q parity sentinels above. */
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
			     ((x) == RAID6_Q_STRIPE))
272 
struct btrfs_device;

/* Entry points implemented in raid56.c. */

/* Rebuild the data covered by @bio after a read hit a failed/missing stripe. */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num);
/* Write path entry: submit @bio to a RAID56 full stripe (full write or RMW). */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

/*
 * Allocate an rbio for scrubbing the parity of one full stripe; @dbitmap
 * selects the sectors with data and @scrub_dev is the device being scrubbed.
 */
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors);
/* Kick off the scrub work for an rbio from raid56_parity_alloc_scrub_rbio(). */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

/* Pre-fill an rbio's data stripes from @data_folios starting at @data_logical. */
void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
				     struct folio **data_folios, u64 data_logical);

/* Set up / tear down the per-fs stripe hash table used for rbio merging. */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
290 
291 #endif
292