/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * metadata/data are stored on disk in 4k units (blocks), regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * reclaim runs every 1/4 of the disk size or every 10G of reclaimable space,
 * whichever is smaller. This prevents recovery from having to scan a very
 * long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE	4

/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode has identical behavior to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *	if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *	if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For a write-back journal, the stripe is sent to the caching phase on
 * write (r5c_try_caching_write).
 * r5c_make_stripe_write_out() kicks off the write-out phase by
 * clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */

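/*
 * In-memory state of the log device. The live region of the circular log
 * runs from last_checkpoint (the tail, where recovery starts scanning) to
 * log_start (the head, where the next meta block will be appended); both
 * positions wrap at device_size.
 */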
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;
	u64 next_cp_seq;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* number of sectors that need to be
					 * reclaimed. If it's 0, reclaim the
					 * space used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e.,
					 * reclaim doesn't wait for a specific
					 * io_unit to switch to the
					 * IO_UNIT_STRIPE_END state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;
};

/*
 * An IO range starts at a metadata block and ends at the next metadata
 * block. The io_unit's metadata block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with normal writes; since we
 * always flush the log disk before moving data to the raid disks, there is
 * no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio start writing to log,
				 * not accepting new bio */
	IO_UNIT_IO_END = 2,	/* io_unit bio finish writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
};

bool r5c_is_writeback(struct r5l_log *log)
{
	return (log != NULL &&
		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}

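/*
 * Helpers for the circular log arithmetic. All positions are sector offsets
 * inside the log that wrap at device_size. For example, with
 * device_size == 1024, r5l_ring_add(log, 1020, 8) yields 4, and
 * r5l_ring_distance(log, 1020, 4) yields 8.
 */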
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
static void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5l_log *log = conf->log;

	BUG_ON(!r5c_is_writeback(log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);
}

/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity are in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	} else
		BUG(); /* write-back logic in next patch */
}

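/*
 * Called once an io_unit is known to be stable on the log device: mark
 * each attached stripe for further handling and drop our reference to it.
 */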
static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		r5c_finish_cache_stripe(sh);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}

static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}

static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);
}

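/*
 * Seal and submit the current meta block: record the final meta size,
 * checksum the whole page, move the io_unit to IO_UNIT_IO_START and
 * submit its bio.
 */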
static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	submit_bio(io->current_bio);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

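/*
 * Start a new io_unit: allocate it from the mempool, initialize the meta
 * block header (magic, version, seq, position), attach the first bio and
 * reserve log space for the meta block itself.
 */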
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}

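/*
 * Append a payload descriptor to the current meta block. It records the
 * payload type (data or parity), where the following page(s) belong in the
 * array and their checksums; the pages themselves are added to the bio
 * separately by r5l_append_payload_page().
 */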
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		struct bio *prev = io->current_bio;

		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, prev);

		submit_bio(prev);
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}

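/*
 * Append one stripe to the current io_unit: data pages first, parity
 * page(s) last. Recovery relies on this ordering; it keeps reading
 * payloads until it sees a parity payload, which terminates the stripe.
 */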
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (parity_pages == 2) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else if (parity_pages == 1) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	} else /* Just writing data, not parity, in caching phase */
		BUG_ON(parity_pages != 0);

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	return 0;
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int reserve;
	int i;
	int ret = 0;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter the state machine again to finish the
	 * write, so don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (!r5l_has_free_space(log, reserve)) {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	} else {
		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * we flush the log disk cache first, then write stripe data to the
	 * raid disks. So if bio is finished, the log disk cache is flushed
	 * already.
	 * The recovery code guarantees we can recover the bio from the log
	 * disk, so we don't need to flush again.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	return r5l_ring_distance(log, log->last_checkpoint,
				 log->next_checkpoint);
}

static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	assert_spin_locked(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;
		log->next_cp_seq = io->seq;

		list_del(&io->log_sibling);
		mempool_free(io, log->io_pool);
		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}

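/*
 * The last stripe of this io_unit has reached the raid disks. Move the
 * io_unit to IO_UNIT_STRIPE_END, fold finished io_units into the next
 * checkpoint, and kick reclaim once enough space has become reclaimable.
 */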
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space)
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting to dispatch IO to raid.
 * io_unit(meta) consists of a log. There is one situation we want to avoid:
 * a broken meta in the middle of a log prevents recovery from finding the
 * meta at the head of the log. If an operation requires the meta at the head
 * to be persistent in the log, we must make sure the meta before it is
 * persistent in the log too. A case is:
 *
 * stripe data/parity is in the log, and we start writing the stripe to the
 * raid disks. The stripe data/parity must be persistent in the log before we
 * do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. In this
 * case, we only write stripes of an io_unit to the raid disks until the
 * io_unit is the first one whose data/parity is in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure the
	 * superblock is updated to the new log tail. Updating the superblock
	 * (either by calling md_update_sb() directly or depending on the md
	 * thread) must hold the reconfig mutex. On the other hand,
	 * raid5_quiesce is called with the reconfig_mutex held. The first
	 * step of raid5_quiesce() is waiting for all IO to finish, hence
	 * waiting for the reclaim thread, while the reclaim thread is calling
	 * this function and waiting for the reconfig mutex. So there is a
	 * deadlock. We work around this issue with a trylock.
	 * FIXME: we could miss discard if we can't take the reconfig mutex
	 */
	set_mask_bits(&mddev->flags, 0,
		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

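/*
 * Make the oldest io_units' log space reusable: wait until the region
 * between last_checkpoint and next_checkpoint covers the reclaim target
 * (or no io_unit is in flight), write the superblock (and optionally
 * discard the freed range), then advance the log tail.
 */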
static void r5l_do_reclaim(struct r5l_log *log)
{
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	u64 next_cp_seq;

	spin_lock_irq(&log->io_list_lock);
	/*
	 * move proper io_unit to reclaim list. We should not change the order.
	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
	 * shouldn't reuse space of an unreclaimable io_unit
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = log->next_checkpoint;
	next_cp_seq = log->next_cp_seq;
	spin_unlock_irq(&log->io_list_lock);

	BUG_ON(reclaimable < 0);
	if (reclaimable == 0)
		return;

	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * the super here, because the log area might be reused soon and we
	 * don't want to confuse recovery.
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	log->last_cp_seq = next_cp_seq;
	mutex_unlock(&log->io_mutex);

	r5l_run_no_space_stripes(log);
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}

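/*
 * state == 0: array resumes; start the reclaim thread unless it is
 *	already running (the hotadd case).
 * state == 1: array suspends; reclaim everything and stop the thread.
 * state == 2: nothing to do.
 */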
void r5l_quiesce(struct r5l_log *log, int state)
{
	struct mddev *mddev;
	if (!log || state == 2)
		return;
	if (state == 0) {
		/*
		 * This is a special case for hotadd. In suspend, the array
		 * has no journal. In resume, the journal is initialized as
		 * well as the reclaim thread.
		 */
		if (log->reclaim_thread)
			return;
		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
					log->rdev->mddev, "reclaim");
	} else if (state == 1) {
		/* make sure r5l_write_super_and_discard_space exits */
		mddev = log->rdev->mddev;
		wake_up(&mddev->sb_wait);
		r5l_wake_reclaim(log, -1L);
		md_unregister_thread(&log->reclaim_thread);
		r5l_do_reclaim(log);
	}
}

bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;
	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);

	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
			  false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

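/*
 * Replay a single stripe from the log: read each logged data/parity page
 * back from the journal, verify it against the checksum stored in the
 * payload, and (if everything matches) write it to the raid disks and any
 * replacement devices.
 */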
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		sector_t log_offset = r5l_ring_add(log, ctx->pos,
				ctx->meta_total_blocks);
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     REQ_OP_READ, 0, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
		}

		ctx->meta_total_blocks += le32_to_cpu(payload->size);
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

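	/* All checksums matched; write the recovered pages to the raid disks. */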
	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev) {
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rdev, rdev->mddev);
			rcu_read_lock();
		}
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev) {
			atomic_inc(&rrdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rrdev, rrdev->mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}

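/*
 * Write a valid, empty meta block at 'pos' with sequence number 'seq'.
 * This is used to terminate the log after recovery and to initialize a
 * fresh journal, so the next recovery scan stops here.
 */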
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * we did a recovery. Now ctx.pos points to an invalid meta block. The
	 * new log will start here, but we can't let the superblock point to
	 * the last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
	 * superblock points to meta 1, we write a new valid meta 2n. If a
	 * crash happens again, new recovery will start from meta 1. Since
	 * meta 2n is valid now, recovery will think meta 3 is valid, which
	 * is wrong. The solution is that we create a new meta in meta2 with
	 * its seq == meta 1's seq + 10 and let the superblock point to meta2.
	 * The same recovery will not think meta 3 is a valid meta, because
	 * its seq doesn't match.
	 */
	if (ctx.seq > log->last_cp_seq) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
		log->last_checkpoint = ctx.pos;
		log->next_checkpoint = ctx.pos;
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}

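/*
 * Publish a new log tail: record it in the rdev and request a superblock
 * update via the MD_CHANGE_DEVS flag.
 */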
static int r5l_recovery_log(struct r5l_log *log)
{
        struct r5l_recovery_ctx ctx;

        ctx.pos = log->last_checkpoint;
        ctx.seq = log->last_cp_seq;
        ctx.meta_page = alloc_page(GFP_KERNEL);
        if (!ctx.meta_page)
                return -ENOMEM;

        r5l_recovery_flush_log(log, &ctx);
        __free_page(ctx.meta_page);

        /*
         * We did a recovery. Now ctx.pos points to an invalid meta block.
         * New writes to the log will start here, but we can't let the
         * superblock point to the last valid meta block. The log might
         * look like:
         * | meta 1 | meta 2 | meta 3 |
         * where meta 1 is valid, meta 2 is invalid and meta 3 could still
         * be valid. If the superblock points to meta 1 and we later write
         * a new valid meta 2n at meta 2's position, then after another
         * crash recovery starts from meta 1 again, sees that meta 2n is
         * valid, and wrongly accepts meta 3 as well.
         * The solution is to write the new meta block at meta 2's position
         * with seq == meta 1's seq + 10 and point the superblock at it.
         * The next recovery will then reject meta 3, because its seq
         * doesn't match.
         */
        if (ctx.seq > log->last_cp_seq) {
                int ret;

                ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
                if (ret)
                        return ret;
                log->seq = ctx.seq + 11;
                log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
                r5l_write_super(log, ctx.pos);
                log->last_checkpoint = ctx.pos;
                log->next_checkpoint = ctx.pos;
        } else {
                log->log_start = ctx.pos;
                log->seq = ctx.seq;
        }
        return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
        struct mddev *mddev = log->rdev->mddev;

        log->rdev->journal_tail = cp;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

/*
 * Try to handle a write operation in the caching phase. This function
 * should only be called in write-back mode.
 *
 * If all outstanding writes can be handled in the caching phase, return 0.
 * If a write requires the write-out phase, call r5c_make_stripe_write_out()
 * and return -EAGAIN.
 */
int r5c_try_caching_write(struct r5conf *conf,
                          struct stripe_head *sh,
                          struct stripe_head_state *s,
                          int disks)
{
        struct r5l_log *log = conf->log;

        BUG_ON(!r5c_is_writeback(log));

        /* more write-back logic in next patches */
        r5c_make_stripe_write_out(sh);
        return -EAGAIN;
}
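/*
 * A minimal sketch (illustrative, not driver code) of the calling convention
 * above: a write-back caller in the stripe state machine would try the
 * caching phase first and fall back to the normal write-out path on -EAGAIN.
 * The exact call site is only wired up in later patches, so the shape below
 * is an assumption:
 *
 *	if (r5c_try_caching_write(conf, sh, s, disks) == -EAGAIN) {
 *		// stripe switched to write-out phase: compute parity and
 *		// write data/parity through the journal to the raid disks
 *	}
 */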
/*
 * Clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to the RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
                                 struct stripe_head *sh,
                                 struct stripe_head_state *s)
{
        if (!conf->log ||
            !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
                return;

        WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
        clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

        if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
                return;
        BUG(); /* write-back logic in following patches */
}
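/*
 * Illustration (an assumption, mirroring the guards above): in write-through
 * mode the R5_InJournal flag on the parity device acts as the single
 * "journalled" marker for the whole stripe, so the expected per-stripe
 * lifecycle is:
 *
 *	journal write path             sets   R5_InJournal on dev[pd_idx]
 *	... data/parity written to the raid disks ...
 *	r5c_finish_stripe_write_out()  clears R5_InJournal on dev[pd_idx]
 */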
static int r5l_load_log(struct r5l_log *log)
{
        struct md_rdev *rdev = log->rdev;
        struct page *page;
        struct r5l_meta_block *mb;
        sector_t cp = log->rdev->journal_tail;
        u32 stored_crc, expected_crc;
        bool create_super = false;
        int ret;

        /* Make sure it's valid */
        if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
                cp = 0;
        page = alloc_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
                ret = -EIO;
                goto ioerr;
        }
        mb = page_address(page);

        if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
            mb->version != R5LOG_VERSION) {
                create_super = true;
                goto create;
        }
        stored_crc = le32_to_cpu(mb->checksum);
        mb->checksum = 0;
        expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
        if (stored_crc != expected_crc) {
                create_super = true;
                goto create;
        }
        if (le64_to_cpu(mb->position) != cp) {
                create_super = true;
                goto create;
        }
create:
        if (create_super) {
                log->last_cp_seq = prandom_u32();
                cp = 0;
                r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
                /*
                 * Make sure the superblock points to the correct address.
                 * The log might have data very soon. If the superblock
                 * doesn't carry the correct log tail address, recovery
                 * can't find the log.
                 */
                r5l_write_super(log, cp);
        } else
                log->last_cp_seq = le64_to_cpu(mb->seq);

        log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
        log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
        if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
                log->max_free_space = RECLAIM_MAX_FREE_SPACE;
        log->last_checkpoint = cp;
        log->next_checkpoint = cp;

        __free_page(page);

        return r5l_recovery_log(log);
ioerr:
        __free_page(page);
        return ret;
}
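/*
 * Worked example for the max_free_space calculation above (device sizes are
 * illustrative). With 512-byte sectors, RECLAIM_MAX_FREE_SPACE
 * (10 * 1024 * 1024 * 2 sectors) is 10GiB, and the shift of 2 means a
 * quarter of the device:
 *
 *	16GiB journal: device_size >> 2 =  4GiB -> max_free_space =  4GiB
 *	64GiB journal: device_size >> 2 = 16GiB -> capped at 10GiB
 *
 * i.e. reclaim kicks in at a quarter of the log device, but never waits
 * for more than 10GiB of reclaimable space, which keeps the region a
 * recovery scan has to walk bounded.
 */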
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
        struct request_queue *q = bdev_get_queue(rdev->bdev);
        struct r5l_log *log;

        if (PAGE_SIZE != 4096)
                return -EINVAL;

        /*
         * PAGE_SIZE must be big enough to hold 1 r5l_meta_block plus one
         * r5l_payload_data_parity (with its checksum) per raid disk.
         *
         * The write journal and cache therefore do not work for very big
         * arrays (raid_disks > 203).
         */
        if (sizeof(struct r5l_meta_block) +
            ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
             conf->raid_disks) > PAGE_SIZE) {
                pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
                       mdname(conf->mddev), conf->raid_disks);
                return -EINVAL;
        }

        log = kzalloc(sizeof(*log), GFP_KERNEL);
        if (!log)
                return -ENOMEM;
        log->rdev = rdev;

        log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

        log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
                                       sizeof(rdev->mddev->uuid));

        mutex_init(&log->io_mutex);

        spin_lock_init(&log->io_list_lock);
        INIT_LIST_HEAD(&log->running_ios);
        INIT_LIST_HEAD(&log->io_end_ios);
        INIT_LIST_HEAD(&log->flushing_ios);
        INIT_LIST_HEAD(&log->finished_ios);
        bio_init(&log->flush_bio);

        log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
        if (!log->io_kc)
                goto io_kc;

        log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
        if (!log->io_pool)
                goto io_pool;

        log->bs = bioset_create(R5L_POOL_SIZE, 0);
        if (!log->bs)
                goto io_bs;

        log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
        if (!log->meta_pool)
                goto out_mempool;

        log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
                                                 log->rdev->mddev, "reclaim");
        if (!log->reclaim_thread)
                goto reclaim_thread;
        init_waitqueue_head(&log->iounit_wait);

        INIT_LIST_HEAD(&log->no_mem_stripes);

        INIT_LIST_HEAD(&log->no_space_stripes);
        spin_lock_init(&log->no_space_stripes_lock);

        log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;

        if (r5l_load_log(log))
                goto error;

        rcu_assign_pointer(conf->log, log);
        set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
        return 0;

error:
        md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
        mempool_destroy(log->meta_pool);
out_mempool:
        bioset_free(log->bs);
io_bs:
        mempool_destroy(log->io_pool);
io_pool:
        kmem_cache_destroy(log->io_kc);
io_kc:
        kfree(log);
        return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
        md_unregister_thread(&log->reclaim_thread);
        mempool_destroy(log->meta_pool);
        bioset_free(log->bs);
        mempool_destroy(log->io_pool);
        kmem_cache_destroy(log->io_kc);
        kfree(log);
}
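/*
 * A sketch (an assumption, not part of this file) of how the raid5 core is
 * expected to use the pair above: r5l_init_log() during array setup, once
 * the rdev carrying the journal has been identified, and r5l_exit_log() on
 * teardown. "journal_dev" is an illustrative name:
 *
 *	// in raid5 setup:
 *	if (journal_dev && r5l_init_log(conf, journal_dev))
 *		goto abort;
 *	...
 *	// in raid5 teardown:
 *	if (conf->log)
 *		r5l_exit_log(conf->log);
 */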