/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#ifndef BTRFS_RAID56_H
#define BTRFS_RAID56_H

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/bio.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "volumes.h"

struct page;
struct btrfs_fs_info;

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
};

/*
 * Overview of btrfs_raid_bio.
 *
 * One btrfs_raid_bio represents a full stripe of RAID56, including both data
 * and P/Q stripes. For now, each data and P/Q stripe has a fixed length (64K).
 *
 * One btrfs_raid_bio can have one or more bios from the higher layer, covering
 * part or all of the data stripes.
 *
 * [PAGES FROM HIGHER LAYER BIOS]
 * Higher layer bios are in the btrfs_raid_bio::bio_list.
 *
 * Pages from the bio_list are represented like the following:
 *
 * bio_list:	|<- Bio 1 ->| |<- Bio 2 ->| ...
 * bio_paddrs:	[0] [1] [2] [3] [4] [5] ...
 *
 * If there is a bio covering a sector (one btrfs fs block), the corresponding
 * entry in btrfs_raid_bio::bio_paddrs[] holds the physical address (including
 * the offset inside the page) of that sector's data in the bio.
 *
 * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i]
 * will be INVALID_PADDR.
 *
 * Each entry in bio_paddrs[] covers one step (aka min(sectorsize, PAGE_SIZE)).
 *
 * [PAGES FOR INTERNAL USE]
 * Pages not covered by any bio or belonging to P/Q stripes are stored in
 * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
 *
 * stripe_pages:	|<- Page 0 ->|<- Page 1 ->| ...
 * stripe_paddrs:	[0] [1] [2] [3] [4] ...
 *
 * The stripe_pages[] array stores all the pages covering the full stripe,
 * including data and P/Q pages.
 * stripe_pages[0] is the first page of the first data stripe.
 * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
 * data stripe.
 *
 * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe
 * write (the bio covers all data stripes) there is no need to allocate pages
 * for the data stripes, since the data can be grabbed from bio_paddrs[].
 *
 * If the corresponding page of stripe_paddrs[i] is not allocated, the value
 * of stripe_paddrs[i] will be INVALID_PADDR.
 *
 * Each entry in stripe_paddrs[] also covers one step.
 *
 * [LOCATING A SECTOR]
 * To locate a sector for IO, we need the following info (see the index
 * calculation sketch right after this comment):
 *
 * - stripe_nr
 *   Starts from 0 (representing the first data stripe), ends at
 *   @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
 *
 * - sector_nr
 *   Starts from 0 (representing the first sector of the stripe), ends
 *   at BTRFS_STRIPE_LEN / sectorsize - 1.
 *
 * - step_nr
 *   A step is min(sectorsize, PAGE_SIZE).
 *
 *   Starts from 0 (representing the first step of the sector), ends
 *   at @sector_nsteps - 1.
 *
 *   Most call sites do not need to care about this parameter.
 *   It exists for bs > ps (block size > page size) support and is only used
 *   for vertical stripe work (e.g. RMW/recover).
 *
 * - from which array
 *   Whether to grab from stripe_paddrs[] (aka internal pages) or from
 *   bio_paddrs[] (aka the higher layer bios).
 *
 * For IO, a physical address is returned, so that we can extract the page and
 * the offset inside the page for IO.
 * The special value INVALID_PADDR indicates that the physical address is
 * invalid, normally meaning there is no page allocated for the specified
 * sector.
 */
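
/*
 * A minimal illustrative sketch, not an in-tree helper: turn a
 * (stripe_nr, sector_nr, step_nr) tuple into an index into the
 * bio_paddrs[]/stripe_paddrs[] arrays described above.  It assumes both
 * arrays hold one entry per step, with the steps of a sector adjacent and
 * the sectors of the full stripe laid out consecutively.  The parameter
 * names mirror the struct members documented below.
 */
static inline unsigned int rbio_example_step_index(unsigned int stripe_nsectors,
						   unsigned int sector_nsteps,
						   unsigned int stripe_nr,
						   unsigned int sector_nr,
						   unsigned int step_nr)
{
	/* Sector index within the whole full stripe (data + P/Q). */
	const unsigned int sector_index = stripe_nr * stripe_nsectors + sector_nr;

	/* Each sector occupies sector_nsteps consecutive step entries. */
	return sector_index * sector_nsteps + step_nr;
}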

struct btrfs_raid_bio {
	struct btrfs_io_context *bioc;

	/*
	 * While we're doing RMW on a stripe we put it into a hash table so we
	 * can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* For scheduling work in the helper threads */
	struct work_struct work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into the stripe
	 * in hopes of avoiding the full RMW
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * Also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged. The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO.
	 */
	struct list_head plug_list;

	/* Flags that tell us if it is safe to merge with this bio. */
	unsigned long flags;

	/*
	 * Set if we're doing a parity rebuild for a read from higher up, which
	 * is handled differently from a parity rebuild as part of RMW.
	 */
	enum btrfs_rbio_ops operation;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no P/Q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/*
	 * How many steps there are for one sector.
	 *
	 * For bs > ps cases, it's sectorsize / PAGE_SIZE.
	 * For bs <= ps cases, it's always 1.
	 */
	u8 sector_nsteps;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * Size of all the bios in the bio_list. This helps us decide if the
	 * rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	refcount_t refs;

	atomic_t stripes_pending;

	wait_queue_head_t io_wait;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long dbitmap;

	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
	unsigned long finish_pbitmap;

	/*
	 * The following are arrays of pointers/addresses. We allocate the rbio
	 * big enough to hold them all and set up their locations when the rbio
	 * is allocated.
	 */

	/*
	 * Pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q).
	 */
	struct page **stripe_pages;

	/* Physical addresses of the sectors in the bio_list, for faster lookup. */
	phys_addr_t *bio_paddrs;

	/* Physical addresses of the sectors in stripe_pages[]. */
	phys_addr_t *stripe_paddrs;

	/*
	 * Each set bit means the corresponding sector in stripe_paddrs[] is
	 * uptodate.
	 */
	unsigned long *stripe_uptodate_bitmap;

	/* Allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/*
	 * The bitmap recording where IO errors happened.
	 * Each bit corresponds to one sector in either the bio_paddrs[] or
	 * the stripe_paddrs[] array.
	 */
	unsigned long *error_bitmap;

	/*
	 * Checksum buffer if the rbio is for data. The buffer should cover
	 * all data sectors (excluding P/Q sectors).
	 */
	u8 *csum_buf;

	/*
	 * Each bit represents whether the corresponding sector has a data csum
	 * found. Should only cover data sectors (excluding P/Q sectors).
	 */
	unsigned long *csum_bitmap;
};
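
/*
 * Another illustrative sketch, not part of the in-tree API: pick the
 * physical address backing one step of a sector, preferring the data from
 * the higher layer bios (bio_paddrs[]) and falling back to the internally
 * allocated pages (stripe_paddrs[]).  It reuses rbio_example_step_index()
 * above, assumes both arrays share that indexing over the full stripe, and
 * assumes the INVALID_PADDR sentinel referenced in the overview comment is
 * visible here.
 */
static inline phys_addr_t rbio_example_step_paddr(const struct btrfs_raid_bio *rbio,
						  unsigned int stripe_nr,
						  unsigned int sector_nr,
						  unsigned int step_nr)
{
	const unsigned int index = rbio_example_step_index(rbio->stripe_nsectors,
							   rbio->sector_nsteps,
							   stripe_nr, sector_nr,
							   step_nr);

	/* Data covered by a higher layer bio takes precedence. */
	if (rbio->bio_paddrs[index] != INVALID_PADDR)
		return rbio->bio_paddrs[index];

	/* Otherwise fall back to the internally allocated stripe pages. */
	return rbio->stripe_paddrs[index];
}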

/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * No matter whether the member is signed or not, (-1) is always the value
 * indicating we can not grab the proper stripe number.
 */
struct raid56_bio_trace_info {
	u64 devid;

	/* The offset inside the stripe. (<= BTRFS_STRIPE_LEN) */
	u32 offset;

	/*
	 * Stripe number.
	 * 0 is the first data stripe, and nr_data for P stripe,
	 * nr_data + 1 for Q stripe.
	 * >= real_stripes means the stripe number could not be determined.
	 */
	u8 stripe_nr;
};

static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
	return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}

static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
{
	return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
}

#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) ||		\
			     ((x) == RAID6_Q_STRIPE))

struct btrfs_device;

void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num);
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
				     struct folio **data_folios, u64 data_logical);

int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);

#endif