1f6bed0efSShaohua Li /* 2f6bed0efSShaohua Li * Copyright (C) 2015 Shaohua Li <shli@fb.com> 3b4c625c6SSong Liu * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 4f6bed0efSShaohua Li * 5f6bed0efSShaohua Li * This program is free software; you can redistribute it and/or modify it 6f6bed0efSShaohua Li * under the terms and conditions of the GNU General Public License, 7f6bed0efSShaohua Li * version 2, as published by the Free Software Foundation. 8f6bed0efSShaohua Li * 9f6bed0efSShaohua Li * This program is distributed in the hope it will be useful, but WITHOUT 10f6bed0efSShaohua Li * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11f6bed0efSShaohua Li * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12f6bed0efSShaohua Li * more details. 13f6bed0efSShaohua Li * 14f6bed0efSShaohua Li */ 15f6bed0efSShaohua Li #include <linux/kernel.h> 16f6bed0efSShaohua Li #include <linux/wait.h> 17f6bed0efSShaohua Li #include <linux/blkdev.h> 18f6bed0efSShaohua Li #include <linux/slab.h> 19f6bed0efSShaohua Li #include <linux/raid/md_p.h> 205cb2fbd6SShaohua Li #include <linux/crc32c.h> 21f6bed0efSShaohua Li #include <linux/random.h> 22f6bed0efSShaohua Li #include "md.h" 23f6bed0efSShaohua Li #include "raid5.h" 241e6d690bSSong Liu #include "bitmap.h" 25f6bed0efSShaohua Li 26f6bed0efSShaohua Li /* 27f6bed0efSShaohua Li * metadata/data stored in disk with 4k size unit (a block) regardless 28f6bed0efSShaohua Li * underneath hardware sector size. only works with PAGE_SIZE == 4096 29f6bed0efSShaohua Li */ 30f6bed0efSShaohua Li #define BLOCK_SECTORS (8) 31f6bed0efSShaohua Li 320576b1c6SShaohua Li /* 33a39f7afdSSong Liu * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 34a39f7afdSSong Liu * 35a39f7afdSSong Liu * In write through mode, the reclaim runs every log->max_free_space. 
36a39f7afdSSong Liu * This can prevent the recovery scans for too long 370576b1c6SShaohua Li */ 380576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 390576b1c6SShaohua Li #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 400576b1c6SShaohua Li 41a39f7afdSSong Liu /* wake up reclaim thread periodically */ 42a39f7afdSSong Liu #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 43a39f7afdSSong Liu /* start flush with these full stripes */ 44a39f7afdSSong Liu #define R5C_FULL_STRIPE_FLUSH_BATCH 256 45a39f7afdSSong Liu /* reclaim stripes in groups */ 46a39f7afdSSong Liu #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 47a39f7afdSSong Liu 48c38d29b3SChristoph Hellwig /* 49c38d29b3SChristoph Hellwig * We only need 2 bios per I/O unit to make progress, but ensure we 50c38d29b3SChristoph Hellwig * have a few more available to not get too tight. 51c38d29b3SChristoph Hellwig */ 52c38d29b3SChristoph Hellwig #define R5L_POOL_SIZE 4 53c38d29b3SChristoph Hellwig 542ded3703SSong Liu /* 552ded3703SSong Liu * r5c journal modes of the array: write-back or write-through. 562ded3703SSong Liu * write-through mode has identical behavior as existing log only 572ded3703SSong Liu * implementation. 
582ded3703SSong Liu */ 592ded3703SSong Liu enum r5c_journal_mode { 602ded3703SSong Liu R5C_JOURNAL_MODE_WRITE_THROUGH = 0, 612ded3703SSong Liu R5C_JOURNAL_MODE_WRITE_BACK = 1, 622ded3703SSong Liu }; 632ded3703SSong Liu 642c7da14bSSong Liu static char *r5c_journal_mode_str[] = {"write-through", 652c7da14bSSong Liu "write-back"}; 662ded3703SSong Liu /* 672ded3703SSong Liu * raid5 cache state machine 682ded3703SSong Liu * 692ded3703SSong Liu * With the RAID cache, each stripe works in two phases: 702ded3703SSong Liu * - caching phase 712ded3703SSong Liu * - writing-out phase 722ded3703SSong Liu * 732ded3703SSong Liu * These two phases are controlled by bit STRIPE_R5C_CACHING: 742ded3703SSong Liu * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 752ded3703SSong Liu * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 762ded3703SSong Liu * 772ded3703SSong Liu * When there is no journal, or the journal is in write-through mode, 782ded3703SSong Liu * the stripe is always in writing-out phase. 792ded3703SSong Liu * 802ded3703SSong Liu * For write-back journal, the stripe is sent to caching phase on write 812ded3703SSong Liu * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off 822ded3703SSong Liu * the write-out phase by clearing STRIPE_R5C_CACHING. 832ded3703SSong Liu * 842ded3703SSong Liu * Stripes in caching phase do not write the raid disks. Instead, all 852ded3703SSong Liu * writes are committed from the log device. 
Therefore, a stripe in 862ded3703SSong Liu * caching phase handles writes as: 872ded3703SSong Liu * - write to log device 882ded3703SSong Liu * - return IO 892ded3703SSong Liu * 902ded3703SSong Liu * Stripes in writing-out phase handle writes as: 912ded3703SSong Liu * - calculate parity 922ded3703SSong Liu * - write pending data and parity to journal 932ded3703SSong Liu * - write data and parity to raid disks 942ded3703SSong Liu * - return IO for pending writes 952ded3703SSong Liu */ 962ded3703SSong Liu 97f6bed0efSShaohua Li struct r5l_log { 98f6bed0efSShaohua Li struct md_rdev *rdev; 99f6bed0efSShaohua Li 100f6bed0efSShaohua Li u32 uuid_checksum; 101f6bed0efSShaohua Li 102f6bed0efSShaohua Li sector_t device_size; /* log device size, round to 103f6bed0efSShaohua Li * BLOCK_SECTORS */ 1040576b1c6SShaohua Li sector_t max_free_space; /* reclaim run if free space is at 1050576b1c6SShaohua Li * this size */ 106f6bed0efSShaohua Li 107f6bed0efSShaohua Li sector_t last_checkpoint; /* log tail. where recovery scan 108f6bed0efSShaohua Li * starts from */ 109f6bed0efSShaohua Li u64 last_cp_seq; /* log tail sequence */ 110f6bed0efSShaohua Li 111f6bed0efSShaohua Li sector_t log_start; /* log head. 
where new data appends */ 112f6bed0efSShaohua Li u64 seq; /* log head sequence */ 113f6bed0efSShaohua Li 11417036461SChristoph Hellwig sector_t next_checkpoint; 11517036461SChristoph Hellwig u64 next_cp_seq; 11617036461SChristoph Hellwig 117f6bed0efSShaohua Li struct mutex io_mutex; 118f6bed0efSShaohua Li struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 119f6bed0efSShaohua Li 120f6bed0efSShaohua Li spinlock_t io_list_lock; 121f6bed0efSShaohua Li struct list_head running_ios; /* io_units which are still running, 122f6bed0efSShaohua Li * and have not yet been completely 123f6bed0efSShaohua Li * written to the log */ 124f6bed0efSShaohua Li struct list_head io_end_ios; /* io_units which have been completely 125f6bed0efSShaohua Li * written to the log but not yet written 126f6bed0efSShaohua Li * to the RAID */ 127a8c34f91SShaohua Li struct list_head flushing_ios; /* io_units which are waiting for log 128a8c34f91SShaohua Li * cache flush */ 12904732f74SChristoph Hellwig struct list_head finished_ios; /* io_units which settle down in log disk */ 130a8c34f91SShaohua Li struct bio flush_bio; 131f6bed0efSShaohua Li 1325036c390SChristoph Hellwig struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ 1335036c390SChristoph Hellwig 134f6bed0efSShaohua Li struct kmem_cache *io_kc; 1355036c390SChristoph Hellwig mempool_t *io_pool; 136c38d29b3SChristoph Hellwig struct bio_set *bs; 137e8deb638SChristoph Hellwig mempool_t *meta_pool; 138f6bed0efSShaohua Li 1390576b1c6SShaohua Li struct md_thread *reclaim_thread; 1400576b1c6SShaohua Li unsigned long reclaim_target; /* number of space that need to be 1410576b1c6SShaohua Li * reclaimed. 
if it's 0, reclaim spaces 1420576b1c6SShaohua Li * used by io_units which are in 1430576b1c6SShaohua Li * IO_UNIT_STRIPE_END state (eg, reclaim 1440576b1c6SShaohua Li * doesn't wait for specific io_unit 1450576b1c6SShaohua Li * switching to IO_UNIT_STRIPE_END 1460576b1c6SShaohua Li * state) */ 1470fd22b45SShaohua Li wait_queue_head_t iounit_wait; 1480576b1c6SShaohua Li 149f6bed0efSShaohua Li struct list_head no_space_stripes; /* pending stripes, log has no space */ 150f6bed0efSShaohua Li spinlock_t no_space_stripes_lock; 15156fef7c6SChristoph Hellwig 15256fef7c6SChristoph Hellwig bool need_cache_flush; 1532ded3703SSong Liu 1542ded3703SSong Liu /* for r5c_cache */ 1552ded3703SSong Liu enum r5c_journal_mode r5c_journal_mode; 156a39f7afdSSong Liu 157a39f7afdSSong Liu /* all stripes in r5cache, in the order of seq at sh->log_start */ 158a39f7afdSSong Liu struct list_head stripe_in_journal_list; 159a39f7afdSSong Liu 160a39f7afdSSong Liu spinlock_t stripe_in_journal_lock; 161a39f7afdSSong Liu atomic_t stripe_in_journal_count; 162f6bed0efSShaohua Li }; 163f6bed0efSShaohua Li 164f6bed0efSShaohua Li /* 165f6bed0efSShaohua Li * an IO range starts from a meta data block and ends at the next meta data 166f6bed0efSShaohua Li * block. The io unit's meta data block tracks the data/parity that follows it. 
io 167f6bed0efSShaohua Li * unit is written to log disk with normal write, as we always flush log disk 168f6bed0efSShaohua Li * first and then start moving data to raid disks, there is no requirement to 169f6bed0efSShaohua Li * write io unit with FLUSH/FUA 170f6bed0efSShaohua Li */ 171f6bed0efSShaohua Li struct r5l_io_unit { 172f6bed0efSShaohua Li struct r5l_log *log; 173f6bed0efSShaohua Li 174f6bed0efSShaohua Li struct page *meta_page; /* store meta block */ 175f6bed0efSShaohua Li int meta_offset; /* current offset in meta_page */ 176f6bed0efSShaohua Li 177f6bed0efSShaohua Li struct bio *current_bio;/* current_bio accepting new data */ 178f6bed0efSShaohua Li 179f6bed0efSShaohua Li atomic_t pending_stripe;/* how many stripes not flushed to raid */ 180f6bed0efSShaohua Li u64 seq; /* seq number of the metablock */ 181f6bed0efSShaohua Li sector_t log_start; /* where the io_unit starts */ 182f6bed0efSShaohua Li sector_t log_end; /* where the io_unit ends */ 183f6bed0efSShaohua Li struct list_head log_sibling; /* log->running_ios */ 184f6bed0efSShaohua Li struct list_head stripe_list; /* stripes added to the io_unit */ 185f6bed0efSShaohua Li 186f6bed0efSShaohua Li int state; 1876143e2ceSChristoph Hellwig bool need_split_bio; 188f6bed0efSShaohua Li }; 189f6bed0efSShaohua Li 190f6bed0efSShaohua Li /* r5l_io_unit state */ 191f6bed0efSShaohua Li enum r5l_io_unit_state { 192f6bed0efSShaohua Li IO_UNIT_RUNNING = 0, /* accepting new IO */ 193f6bed0efSShaohua Li IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 194f6bed0efSShaohua Li * don't accept new bio */ 195f6bed0efSShaohua Li IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 196a8c34f91SShaohua Li IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 197f6bed0efSShaohua Li }; 198f6bed0efSShaohua Li 1992ded3703SSong Liu bool r5c_is_writeback(struct r5l_log *log) 2002ded3703SSong Liu { 2012ded3703SSong Liu return (log != NULL && 2022ded3703SSong Liu log->r5c_journal_mode == 
R5C_JOURNAL_MODE_WRITE_BACK); 2032ded3703SSong Liu } 2042ded3703SSong Liu 205f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 206f6bed0efSShaohua Li { 207f6bed0efSShaohua Li start += inc; 208f6bed0efSShaohua Li if (start >= log->device_size) 209f6bed0efSShaohua Li start = start - log->device_size; 210f6bed0efSShaohua Li return start; 211f6bed0efSShaohua Li } 212f6bed0efSShaohua Li 213f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 214f6bed0efSShaohua Li sector_t end) 215f6bed0efSShaohua Li { 216f6bed0efSShaohua Li if (end >= start) 217f6bed0efSShaohua Li return end - start; 218f6bed0efSShaohua Li else 219f6bed0efSShaohua Li return end + log->device_size - start; 220f6bed0efSShaohua Li } 221f6bed0efSShaohua Li 222f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 223f6bed0efSShaohua Li { 224f6bed0efSShaohua Li sector_t used_size; 225f6bed0efSShaohua Li 226f6bed0efSShaohua Li used_size = r5l_ring_distance(log, log->last_checkpoint, 227f6bed0efSShaohua Li log->log_start); 228f6bed0efSShaohua Li 229f6bed0efSShaohua Li return log->device_size > used_size + size; 230f6bed0efSShaohua Li } 231f6bed0efSShaohua Li 232f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 233f6bed0efSShaohua Li enum r5l_io_unit_state state) 234f6bed0efSShaohua Li { 235f6bed0efSShaohua Li if (WARN_ON(io->state >= state)) 236f6bed0efSShaohua Li return; 237f6bed0efSShaohua Li io->state = state; 238f6bed0efSShaohua Li } 239f6bed0efSShaohua Li 2401e6d690bSSong Liu static void 2411e6d690bSSong Liu r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, 2421e6d690bSSong Liu struct bio_list *return_bi) 2431e6d690bSSong Liu { 2441e6d690bSSong Liu struct bio *wbi, *wbi2; 2451e6d690bSSong Liu 2461e6d690bSSong Liu wbi = dev->written; 2471e6d690bSSong Liu dev->written = NULL; 2481e6d690bSSong Liu while (wbi && wbi->bi_iter.bi_sector < 
2491e6d690bSSong Liu dev->sector + STRIPE_SECTORS) { 2501e6d690bSSong Liu wbi2 = r5_next_bio(wbi, dev->sector); 2511e6d690bSSong Liu if (!raid5_dec_bi_active_stripes(wbi)) { 2521e6d690bSSong Liu md_write_end(conf->mddev); 2531e6d690bSSong Liu bio_list_add(return_bi, wbi); 2541e6d690bSSong Liu } 2551e6d690bSSong Liu wbi = wbi2; 2561e6d690bSSong Liu } 2571e6d690bSSong Liu } 2581e6d690bSSong Liu 2591e6d690bSSong Liu void r5c_handle_cached_data_endio(struct r5conf *conf, 2601e6d690bSSong Liu struct stripe_head *sh, int disks, struct bio_list *return_bi) 2611e6d690bSSong Liu { 2621e6d690bSSong Liu int i; 2631e6d690bSSong Liu 2641e6d690bSSong Liu for (i = sh->disks; i--; ) { 2651e6d690bSSong Liu if (sh->dev[i].written) { 2661e6d690bSSong Liu set_bit(R5_UPTODATE, &sh->dev[i].flags); 2671e6d690bSSong Liu r5c_return_dev_pending_writes(conf, &sh->dev[i], 2681e6d690bSSong Liu return_bi); 2691e6d690bSSong Liu bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2701e6d690bSSong Liu STRIPE_SECTORS, 2711e6d690bSSong Liu !test_bit(STRIPE_DEGRADED, &sh->state), 2721e6d690bSSong Liu 0); 2731e6d690bSSong Liu } 2741e6d690bSSong Liu } 2751e6d690bSSong Liu } 2761e6d690bSSong Liu 277a39f7afdSSong Liu /* Check whether we should flush some stripes to free up stripe cache */ 278a39f7afdSSong Liu void r5c_check_stripe_cache_usage(struct r5conf *conf) 279a39f7afdSSong Liu { 280a39f7afdSSong Liu int total_cached; 281a39f7afdSSong Liu 282a39f7afdSSong Liu if (!r5c_is_writeback(conf->log)) 283a39f7afdSSong Liu return; 284a39f7afdSSong Liu 285a39f7afdSSong Liu total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 286a39f7afdSSong Liu atomic_read(&conf->r5c_cached_full_stripes); 287a39f7afdSSong Liu 288a39f7afdSSong Liu /* 289a39f7afdSSong Liu * The following condition is true for either of the following: 290a39f7afdSSong Liu * - stripe cache pressure high: 291a39f7afdSSong Liu * total_cached > 3/4 min_nr_stripes || 292a39f7afdSSong Liu * empty_inactive_list_nr > 0 293a39f7afdSSong Liu * - 
stripe cache pressure moderate: 294a39f7afdSSong Liu * total_cached > 1/2 min_nr_stripes 295a39f7afdSSong Liu */ 296a39f7afdSSong Liu if (total_cached > conf->min_nr_stripes * 1 / 2 || 297a39f7afdSSong Liu atomic_read(&conf->empty_inactive_list_nr) > 0) 298a39f7afdSSong Liu r5l_wake_reclaim(conf->log, 0); 299a39f7afdSSong Liu } 300a39f7afdSSong Liu 301a39f7afdSSong Liu /* 302a39f7afdSSong Liu * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 303a39f7afdSSong Liu * stripes in the cache 304a39f7afdSSong Liu */ 305a39f7afdSSong Liu void r5c_check_cached_full_stripe(struct r5conf *conf) 306a39f7afdSSong Liu { 307a39f7afdSSong Liu if (!r5c_is_writeback(conf->log)) 308a39f7afdSSong Liu return; 309a39f7afdSSong Liu 310a39f7afdSSong Liu /* 311a39f7afdSSong Liu * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 312a39f7afdSSong Liu * or a full stripe (chunk size / 4k stripes). 313a39f7afdSSong Liu */ 314a39f7afdSSong Liu if (atomic_read(&conf->r5c_cached_full_stripes) >= 315a39f7afdSSong Liu min(R5C_FULL_STRIPE_FLUSH_BATCH, 316a39f7afdSSong Liu conf->chunk_sectors >> STRIPE_SHIFT)) 317a39f7afdSSong Liu r5l_wake_reclaim(conf->log, 0); 318a39f7afdSSong Liu } 319a39f7afdSSong Liu 320a39f7afdSSong Liu /* 321a39f7afdSSong Liu * Total log space (in sectors) needed to flush all data in cache 322a39f7afdSSong Liu * 323a39f7afdSSong Liu * Currently, writing-out phase automatically includes all pending writes 324a39f7afdSSong Liu * to the same sector. So the reclaim of each stripe takes up to 325a39f7afdSSong Liu * (conf->raid_disks + 1) pages of log space. 326a39f7afdSSong Liu * 327a39f7afdSSong Liu * To totally avoid deadlock due to log space, the code reserves 328a39f7afdSSong Liu * (conf->raid_disks + 1) pages for each stripe in cache, which is not 329a39f7afdSSong Liu * necessary in most cases. 
330a39f7afdSSong Liu * 331a39f7afdSSong Liu * To improve this, we will need writing-out phase to be able to NOT include 332a39f7afdSSong Liu * pending writes, which will reduce the requirement to 333a39f7afdSSong Liu * (conf->max_degraded + 1) pages per stripe in cache. 334a39f7afdSSong Liu */ 335a39f7afdSSong Liu static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 336a39f7afdSSong Liu { 337a39f7afdSSong Liu struct r5l_log *log = conf->log; 338a39f7afdSSong Liu 339a39f7afdSSong Liu if (!r5c_is_writeback(log)) 340a39f7afdSSong Liu return 0; 341a39f7afdSSong Liu 342a39f7afdSSong Liu return BLOCK_SECTORS * (conf->raid_disks + 1) * 343a39f7afdSSong Liu atomic_read(&log->stripe_in_journal_count); 344a39f7afdSSong Liu } 345a39f7afdSSong Liu 346a39f7afdSSong Liu /* 347a39f7afdSSong Liu * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 348a39f7afdSSong Liu * 349a39f7afdSSong Liu * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 350a39f7afdSSong Liu * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 351a39f7afdSSong Liu * device is less than 2x of reclaim_required_space. 
352a39f7afdSSong Liu */ 353a39f7afdSSong Liu static inline void r5c_update_log_state(struct r5l_log *log) 354a39f7afdSSong Liu { 355a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 356a39f7afdSSong Liu sector_t free_space; 357a39f7afdSSong Liu sector_t reclaim_space; 358a39f7afdSSong Liu 359a39f7afdSSong Liu if (!r5c_is_writeback(log)) 360a39f7afdSSong Liu return; 361a39f7afdSSong Liu 362a39f7afdSSong Liu free_space = r5l_ring_distance(log, log->log_start, 363a39f7afdSSong Liu log->last_checkpoint); 364a39f7afdSSong Liu reclaim_space = r5c_log_required_to_flush_cache(conf); 365a39f7afdSSong Liu if (free_space < 2 * reclaim_space) 366a39f7afdSSong Liu set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 367a39f7afdSSong Liu else 368a39f7afdSSong Liu clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 369a39f7afdSSong Liu if (free_space < 3 * reclaim_space) 370a39f7afdSSong Liu set_bit(R5C_LOG_TIGHT, &conf->cache_state); 371a39f7afdSSong Liu else 372a39f7afdSSong Liu clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 373a39f7afdSSong Liu } 374a39f7afdSSong Liu 3752ded3703SSong Liu /* 3762ded3703SSong Liu * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 3772ded3703SSong Liu * This function should only be called in write-back mode. 
3782ded3703SSong Liu */ 379a39f7afdSSong Liu void r5c_make_stripe_write_out(struct stripe_head *sh) 3802ded3703SSong Liu { 3812ded3703SSong Liu struct r5conf *conf = sh->raid_conf; 3822ded3703SSong Liu struct r5l_log *log = conf->log; 3832ded3703SSong Liu 3842ded3703SSong Liu BUG_ON(!r5c_is_writeback(log)); 3852ded3703SSong Liu 3862ded3703SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 3872ded3703SSong Liu clear_bit(STRIPE_R5C_CACHING, &sh->state); 3881e6d690bSSong Liu 3891e6d690bSSong Liu if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3901e6d690bSSong Liu atomic_inc(&conf->preread_active_stripes); 3911e6d690bSSong Liu 3921e6d690bSSong Liu if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 3931e6d690bSSong Liu BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 3941e6d690bSSong Liu atomic_dec(&conf->r5c_cached_partial_stripes); 3951e6d690bSSong Liu } 3961e6d690bSSong Liu 3971e6d690bSSong Liu if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 3981e6d690bSSong Liu BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 3991e6d690bSSong Liu atomic_dec(&conf->r5c_cached_full_stripes); 4001e6d690bSSong Liu } 4011e6d690bSSong Liu } 4021e6d690bSSong Liu 4031e6d690bSSong Liu static void r5c_handle_data_cached(struct stripe_head *sh) 4041e6d690bSSong Liu { 4051e6d690bSSong Liu int i; 4061e6d690bSSong Liu 4071e6d690bSSong Liu for (i = sh->disks; i--; ) 4081e6d690bSSong Liu if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 4091e6d690bSSong Liu set_bit(R5_InJournal, &sh->dev[i].flags); 4101e6d690bSSong Liu clear_bit(R5_LOCKED, &sh->dev[i].flags); 4111e6d690bSSong Liu } 4121e6d690bSSong Liu clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 4131e6d690bSSong Liu } 4141e6d690bSSong Liu 4151e6d690bSSong Liu /* 4161e6d690bSSong Liu * this journal write must contain full parity, 4171e6d690bSSong Liu * it may also contain some data pages 4181e6d690bSSong Liu */ 4191e6d690bSSong Liu static void 
r5c_handle_parity_cached(struct stripe_head *sh) 4201e6d690bSSong Liu { 4211e6d690bSSong Liu int i; 4221e6d690bSSong Liu 4231e6d690bSSong Liu for (i = sh->disks; i--; ) 4241e6d690bSSong Liu if (test_bit(R5_InJournal, &sh->dev[i].flags)) 4251e6d690bSSong Liu set_bit(R5_Wantwrite, &sh->dev[i].flags); 4262ded3703SSong Liu } 4272ded3703SSong Liu 4282ded3703SSong Liu /* 4292ded3703SSong Liu * Setting proper flags after writing (or flushing) data and/or parity to the 4302ded3703SSong Liu * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 4312ded3703SSong Liu */ 4322ded3703SSong Liu static void r5c_finish_cache_stripe(struct stripe_head *sh) 4332ded3703SSong Liu { 4342ded3703SSong Liu struct r5l_log *log = sh->raid_conf->log; 4352ded3703SSong Liu 4362ded3703SSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 4372ded3703SSong Liu BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 4382ded3703SSong Liu /* 4392ded3703SSong Liu * Set R5_InJournal for parity dev[pd_idx]. This means 4402ded3703SSong Liu * all data AND parity in the journal. For RAID 6, it is 4412ded3703SSong Liu * NOT necessary to set the flag for dev[qd_idx], as the 4422ded3703SSong Liu * two parities are written out together. 
4432ded3703SSong Liu */ 4442ded3703SSong Liu set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 4451e6d690bSSong Liu } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 4461e6d690bSSong Liu r5c_handle_data_cached(sh); 4471e6d690bSSong Liu } else { 4481e6d690bSSong Liu r5c_handle_parity_cached(sh); 4491e6d690bSSong Liu set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 4501e6d690bSSong Liu } 4512ded3703SSong Liu } 4522ded3703SSong Liu 453d8858f43SChristoph Hellwig static void r5l_io_run_stripes(struct r5l_io_unit *io) 454d8858f43SChristoph Hellwig { 455d8858f43SChristoph Hellwig struct stripe_head *sh, *next; 456d8858f43SChristoph Hellwig 457d8858f43SChristoph Hellwig list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 458d8858f43SChristoph Hellwig list_del_init(&sh->log_list); 4592ded3703SSong Liu 4602ded3703SSong Liu r5c_finish_cache_stripe(sh); 4612ded3703SSong Liu 462d8858f43SChristoph Hellwig set_bit(STRIPE_HANDLE, &sh->state); 463d8858f43SChristoph Hellwig raid5_release_stripe(sh); 464d8858f43SChristoph Hellwig } 465d8858f43SChristoph Hellwig } 466d8858f43SChristoph Hellwig 46756fef7c6SChristoph Hellwig static void r5l_log_run_stripes(struct r5l_log *log) 46856fef7c6SChristoph Hellwig { 46956fef7c6SChristoph Hellwig struct r5l_io_unit *io, *next; 47056fef7c6SChristoph Hellwig 47156fef7c6SChristoph Hellwig assert_spin_locked(&log->io_list_lock); 47256fef7c6SChristoph Hellwig 47356fef7c6SChristoph Hellwig list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 47456fef7c6SChristoph Hellwig /* don't change list order */ 47556fef7c6SChristoph Hellwig if (io->state < IO_UNIT_IO_END) 47656fef7c6SChristoph Hellwig break; 47756fef7c6SChristoph Hellwig 47856fef7c6SChristoph Hellwig list_move_tail(&io->log_sibling, &log->finished_ios); 47956fef7c6SChristoph Hellwig r5l_io_run_stripes(io); 48056fef7c6SChristoph Hellwig } 48156fef7c6SChristoph Hellwig } 48256fef7c6SChristoph Hellwig 4833848c0bcSChristoph Hellwig static void 
r5l_move_to_end_ios(struct r5l_log *log) 4843848c0bcSChristoph Hellwig { 4853848c0bcSChristoph Hellwig struct r5l_io_unit *io, *next; 4863848c0bcSChristoph Hellwig 4873848c0bcSChristoph Hellwig assert_spin_locked(&log->io_list_lock); 4883848c0bcSChristoph Hellwig 4893848c0bcSChristoph Hellwig list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 4903848c0bcSChristoph Hellwig /* don't change list order */ 4913848c0bcSChristoph Hellwig if (io->state < IO_UNIT_IO_END) 4923848c0bcSChristoph Hellwig break; 4933848c0bcSChristoph Hellwig list_move_tail(&io->log_sibling, &log->io_end_ios); 4943848c0bcSChristoph Hellwig } 4953848c0bcSChristoph Hellwig } 4963848c0bcSChristoph Hellwig 497f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio) 498f6bed0efSShaohua Li { 499f6bed0efSShaohua Li struct r5l_io_unit *io = bio->bi_private; 500f6bed0efSShaohua Li struct r5l_log *log = io->log; 501509ffec7SChristoph Hellwig unsigned long flags; 502f6bed0efSShaohua Li 5036e74a9cfSShaohua Li if (bio->bi_error) 5046e74a9cfSShaohua Li md_error(log->rdev->mddev, log->rdev); 5056e74a9cfSShaohua Li 506f6bed0efSShaohua Li bio_put(bio); 507e8deb638SChristoph Hellwig mempool_free(io->meta_page, log->meta_pool); 508f6bed0efSShaohua Li 509509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 510509ffec7SChristoph Hellwig __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 51156fef7c6SChristoph Hellwig if (log->need_cache_flush) 5123848c0bcSChristoph Hellwig r5l_move_to_end_ios(log); 51356fef7c6SChristoph Hellwig else 51456fef7c6SChristoph Hellwig r5l_log_run_stripes(log); 515509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 516509ffec7SChristoph Hellwig 51756fef7c6SChristoph Hellwig if (log->need_cache_flush) 518f6bed0efSShaohua Li md_wakeup_thread(log->rdev->mddev->thread); 519f6bed0efSShaohua Li } 520f6bed0efSShaohua Li 521f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log) 522f6bed0efSShaohua Li { 
523f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 524f6bed0efSShaohua Li struct r5l_meta_block *block; 525509ffec7SChristoph Hellwig unsigned long flags; 526f6bed0efSShaohua Li u32 crc; 527f6bed0efSShaohua Li 528f6bed0efSShaohua Li if (!io) 529f6bed0efSShaohua Li return; 530f6bed0efSShaohua Li 531f6bed0efSShaohua Li block = page_address(io->meta_page); 532f6bed0efSShaohua Li block->meta_size = cpu_to_le32(io->meta_offset); 5335cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 534f6bed0efSShaohua Li block->checksum = cpu_to_le32(crc); 535f6bed0efSShaohua Li 536f6bed0efSShaohua Li log->current_io = NULL; 537509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 538509ffec7SChristoph Hellwig __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 539509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 540f6bed0efSShaohua Li 5414e49ea4aSMike Christie submit_bio(io->current_bio); 542f6bed0efSShaohua Li } 543f6bed0efSShaohua Li 5446143e2ceSChristoph Hellwig static struct bio *r5l_bio_alloc(struct r5l_log *log) 545b349feb3SChristoph Hellwig { 546c38d29b3SChristoph Hellwig struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs); 547b349feb3SChristoph Hellwig 548796a5cf0SMike Christie bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 549b349feb3SChristoph Hellwig bio->bi_bdev = log->rdev->bdev; 5501e932a37SChristoph Hellwig bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 551b349feb3SChristoph Hellwig 552b349feb3SChristoph Hellwig return bio; 553b349feb3SChristoph Hellwig } 554b349feb3SChristoph Hellwig 555c1b99198SChristoph Hellwig static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 556c1b99198SChristoph Hellwig { 557c1b99198SChristoph Hellwig log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 558c1b99198SChristoph Hellwig 559a39f7afdSSong Liu r5c_update_log_state(log); 560c1b99198SChristoph Hellwig /* 561c1b99198SChristoph Hellwig * If we 
filled up the log device, start from the beginning again, 562c1b99198SChristoph Hellwig * which will require a new bio. 563c1b99198SChristoph Hellwig * 564c1b99198SChristoph Hellwig * Note: for this to work properly the log size needs to be a multiple 565c1b99198SChristoph Hellwig * of BLOCK_SECTORS. 566c1b99198SChristoph Hellwig */ 567c1b99198SChristoph Hellwig if (log->log_start == 0) 5686143e2ceSChristoph Hellwig io->need_split_bio = true; 569c1b99198SChristoph Hellwig 570c1b99198SChristoph Hellwig io->log_end = log->log_start; 571c1b99198SChristoph Hellwig } 572c1b99198SChristoph Hellwig 573f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 574f6bed0efSShaohua Li { 575f6bed0efSShaohua Li struct r5l_io_unit *io; 576f6bed0efSShaohua Li struct r5l_meta_block *block; 577f6bed0efSShaohua Li 5785036c390SChristoph Hellwig io = mempool_alloc(log->io_pool, GFP_ATOMIC); 5795036c390SChristoph Hellwig if (!io) 5805036c390SChristoph Hellwig return NULL; 5815036c390SChristoph Hellwig memset(io, 0, sizeof(*io)); 5825036c390SChristoph Hellwig 58351039cd0SChristoph Hellwig io->log = log; 58451039cd0SChristoph Hellwig INIT_LIST_HEAD(&io->log_sibling); 58551039cd0SChristoph Hellwig INIT_LIST_HEAD(&io->stripe_list); 58651039cd0SChristoph Hellwig io->state = IO_UNIT_RUNNING; 587f6bed0efSShaohua Li 588e8deb638SChristoph Hellwig io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); 589f6bed0efSShaohua Li block = page_address(io->meta_page); 590e8deb638SChristoph Hellwig clear_page(block); 591f6bed0efSShaohua Li block->magic = cpu_to_le32(R5LOG_MAGIC); 592f6bed0efSShaohua Li block->version = R5LOG_VERSION; 593f6bed0efSShaohua Li block->seq = cpu_to_le64(log->seq); 594f6bed0efSShaohua Li block->position = cpu_to_le64(log->log_start); 595f6bed0efSShaohua Li 596f6bed0efSShaohua Li io->log_start = log->log_start; 597f6bed0efSShaohua Li io->meta_offset = sizeof(struct r5l_meta_block); 5982b8ef16eSChristoph Hellwig io->seq = log->seq++; 599f6bed0efSShaohua Li 
6006143e2ceSChristoph Hellwig io->current_bio = r5l_bio_alloc(log); 6016143e2ceSChristoph Hellwig io->current_bio->bi_end_io = r5l_log_endio; 6026143e2ceSChristoph Hellwig io->current_bio->bi_private = io; 603b349feb3SChristoph Hellwig bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 604f6bed0efSShaohua Li 605c1b99198SChristoph Hellwig r5_reserve_log_entry(log, io); 606f6bed0efSShaohua Li 607f6bed0efSShaohua Li spin_lock_irq(&log->io_list_lock); 608f6bed0efSShaohua Li list_add_tail(&io->log_sibling, &log->running_ios); 609f6bed0efSShaohua Li spin_unlock_irq(&log->io_list_lock); 610f6bed0efSShaohua Li 611f6bed0efSShaohua Li return io; 612f6bed0efSShaohua Li } 613f6bed0efSShaohua Li 614f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 615f6bed0efSShaohua Li { 61622581f58SChristoph Hellwig if (log->current_io && 61722581f58SChristoph Hellwig log->current_io->meta_offset + payload_size > PAGE_SIZE) 618f6bed0efSShaohua Li r5l_submit_current_io(log); 619f6bed0efSShaohua Li 6205036c390SChristoph Hellwig if (!log->current_io) { 621f6bed0efSShaohua Li log->current_io = r5l_new_meta(log); 6225036c390SChristoph Hellwig if (!log->current_io) 6235036c390SChristoph Hellwig return -ENOMEM; 6245036c390SChristoph Hellwig } 6255036c390SChristoph Hellwig 626f6bed0efSShaohua Li return 0; 627f6bed0efSShaohua Li } 628f6bed0efSShaohua Li 629f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 630f6bed0efSShaohua Li sector_t location, 631f6bed0efSShaohua Li u32 checksum1, u32 checksum2, 632f6bed0efSShaohua Li bool checksum2_valid) 633f6bed0efSShaohua Li { 634f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 635f6bed0efSShaohua Li struct r5l_payload_data_parity *payload; 636f6bed0efSShaohua Li 637f6bed0efSShaohua Li payload = page_address(io->meta_page) + io->meta_offset; 638f6bed0efSShaohua Li payload->header.type = cpu_to_le16(type); 639f6bed0efSShaohua Li payload->header.flags = 
cpu_to_le16(0); 640f6bed0efSShaohua Li payload->size = cpu_to_le32((1 + !!checksum2_valid) << 641f6bed0efSShaohua Li (PAGE_SHIFT - 9)); 642f6bed0efSShaohua Li payload->location = cpu_to_le64(location); 643f6bed0efSShaohua Li payload->checksum[0] = cpu_to_le32(checksum1); 644f6bed0efSShaohua Li if (checksum2_valid) 645f6bed0efSShaohua Li payload->checksum[1] = cpu_to_le32(checksum2); 646f6bed0efSShaohua Li 647f6bed0efSShaohua Li io->meta_offset += sizeof(struct r5l_payload_data_parity) + 648f6bed0efSShaohua Li sizeof(__le32) * (1 + !!checksum2_valid); 649f6bed0efSShaohua Li } 650f6bed0efSShaohua Li 651f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 652f6bed0efSShaohua Li { 653f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 654f6bed0efSShaohua Li 6556143e2ceSChristoph Hellwig if (io->need_split_bio) { 6566143e2ceSChristoph Hellwig struct bio *prev = io->current_bio; 657f6bed0efSShaohua Li 6586143e2ceSChristoph Hellwig io->current_bio = r5l_bio_alloc(log); 6596143e2ceSChristoph Hellwig bio_chain(io->current_bio, prev); 6606143e2ceSChristoph Hellwig 6614e49ea4aSMike Christie submit_bio(prev); 662f6bed0efSShaohua Li } 663f6bed0efSShaohua Li 6646143e2ceSChristoph Hellwig if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 6656143e2ceSChristoph Hellwig BUG(); 6666143e2ceSChristoph Hellwig 667c1b99198SChristoph Hellwig r5_reserve_log_entry(log, io); 668f6bed0efSShaohua Li } 669f6bed0efSShaohua Li 6705036c390SChristoph Hellwig static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 671f6bed0efSShaohua Li int data_pages, int parity_pages) 672f6bed0efSShaohua Li { 673f6bed0efSShaohua Li int i; 674f6bed0efSShaohua Li int meta_size; 6755036c390SChristoph Hellwig int ret; 676f6bed0efSShaohua Li struct r5l_io_unit *io; 677f6bed0efSShaohua Li 678f6bed0efSShaohua Li meta_size = 679f6bed0efSShaohua Li ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 680f6bed0efSShaohua Li * data_pages) + 
681f6bed0efSShaohua Li sizeof(struct r5l_payload_data_parity) + 682f6bed0efSShaohua Li sizeof(__le32) * parity_pages; 683f6bed0efSShaohua Li 6845036c390SChristoph Hellwig ret = r5l_get_meta(log, meta_size); 6855036c390SChristoph Hellwig if (ret) 6865036c390SChristoph Hellwig return ret; 6875036c390SChristoph Hellwig 688f6bed0efSShaohua Li io = log->current_io; 689f6bed0efSShaohua Li 690f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 6911e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 6921e6d690bSSong Liu test_bit(R5_InJournal, &sh->dev[i].flags)) 693f6bed0efSShaohua Li continue; 694f6bed0efSShaohua Li if (i == sh->pd_idx || i == sh->qd_idx) 695f6bed0efSShaohua Li continue; 696f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 697f6bed0efSShaohua Li raid5_compute_blocknr(sh, i, 0), 698f6bed0efSShaohua Li sh->dev[i].log_checksum, 0, false); 699f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[i].page); 700f6bed0efSShaohua Li } 701f6bed0efSShaohua Li 7022ded3703SSong Liu if (parity_pages == 2) { 703f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 704f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 705f6bed0efSShaohua Li sh->dev[sh->qd_idx].log_checksum, true); 706f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 707f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 7082ded3703SSong Liu } else if (parity_pages == 1) { 709f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 710f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 711f6bed0efSShaohua Li 0, false); 712f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 7132ded3703SSong Liu } else /* Just writing data, not parity, in caching phase */ 7142ded3703SSong Liu BUG_ON(parity_pages != 0); 715f6bed0efSShaohua Li 716f6bed0efSShaohua Li list_add_tail(&sh->log_list, &io->stripe_list); 717f6bed0efSShaohua Li 
atomic_inc(&io->pending_stripe); 718f6bed0efSShaohua Li sh->log_io = io; 7195036c390SChristoph Hellwig 720a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 721a39f7afdSSong Liu return 0; 722a39f7afdSSong Liu 723a39f7afdSSong Liu if (sh->log_start == MaxSector) { 724a39f7afdSSong Liu BUG_ON(!list_empty(&sh->r5c)); 725a39f7afdSSong Liu sh->log_start = io->log_start; 726a39f7afdSSong Liu spin_lock_irq(&log->stripe_in_journal_lock); 727a39f7afdSSong Liu list_add_tail(&sh->r5c, 728a39f7afdSSong Liu &log->stripe_in_journal_list); 729a39f7afdSSong Liu spin_unlock_irq(&log->stripe_in_journal_lock); 730a39f7afdSSong Liu atomic_inc(&log->stripe_in_journal_count); 731a39f7afdSSong Liu } 7325036c390SChristoph Hellwig return 0; 733f6bed0efSShaohua Li } 734f6bed0efSShaohua Li 735a39f7afdSSong Liu /* add stripe to no_space_stripes, and then wake up reclaim */ 736a39f7afdSSong Liu static inline void r5l_add_no_space_stripe(struct r5l_log *log, 737a39f7afdSSong Liu struct stripe_head *sh) 738a39f7afdSSong Liu { 739a39f7afdSSong Liu spin_lock(&log->no_space_stripes_lock); 740a39f7afdSSong Liu list_add_tail(&sh->log_list, &log->no_space_stripes); 741a39f7afdSSong Liu spin_unlock(&log->no_space_stripes_lock); 742a39f7afdSSong Liu } 743a39f7afdSSong Liu 744f6bed0efSShaohua Li /* 745f6bed0efSShaohua Li * running in raid5d, where reclaim could wait for raid5d too (when it flushes 746f6bed0efSShaohua Li * data from log to raid disks), so we shouldn't wait for reclaim here 747f6bed0efSShaohua Li */ 748f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 749f6bed0efSShaohua Li { 750a39f7afdSSong Liu struct r5conf *conf = sh->raid_conf; 751f6bed0efSShaohua Li int write_disks = 0; 752f6bed0efSShaohua Li int data_pages, parity_pages; 753f6bed0efSShaohua Li int reserve; 754f6bed0efSShaohua Li int i; 7555036c390SChristoph Hellwig int ret = 0; 756a39f7afdSSong Liu bool wake_reclaim = false; 757f6bed0efSShaohua Li 758f6bed0efSShaohua Li 
if (!log) 759f6bed0efSShaohua Li return -EAGAIN; 760f6bed0efSShaohua Li /* Don't support stripe batch */ 761f6bed0efSShaohua Li if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 762f6bed0efSShaohua Li test_bit(STRIPE_SYNCING, &sh->state)) { 763f6bed0efSShaohua Li /* the stripe is written to log, we start writing it to raid */ 764f6bed0efSShaohua Li clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 765f6bed0efSShaohua Li return -EAGAIN; 766f6bed0efSShaohua Li } 767f6bed0efSShaohua Li 7682ded3703SSong Liu WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 7692ded3703SSong Liu 770f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 771f6bed0efSShaohua Li void *addr; 772f6bed0efSShaohua Li 7731e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 7741e6d690bSSong Liu test_bit(R5_InJournal, &sh->dev[i].flags)) 775f6bed0efSShaohua Li continue; 7761e6d690bSSong Liu 777f6bed0efSShaohua Li write_disks++; 778f6bed0efSShaohua Li /* checksum is already calculated in last run */ 779f6bed0efSShaohua Li if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 780f6bed0efSShaohua Li continue; 781f6bed0efSShaohua Li addr = kmap_atomic(sh->dev[i].page); 7825cb2fbd6SShaohua Li sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 783f6bed0efSShaohua Li addr, PAGE_SIZE); 784f6bed0efSShaohua Li kunmap_atomic(addr); 785f6bed0efSShaohua Li } 786f6bed0efSShaohua Li parity_pages = 1 + !!(sh->qd_idx >= 0); 787f6bed0efSShaohua Li data_pages = write_disks - parity_pages; 788f6bed0efSShaohua Li 789f6bed0efSShaohua Li set_bit(STRIPE_LOG_TRAPPED, &sh->state); 790253f9fd4SShaohua Li /* 791253f9fd4SShaohua Li * The stripe must enter state machine again to finish the write, so 792253f9fd4SShaohua Li * don't delay. 
793253f9fd4SShaohua Li */ 794253f9fd4SShaohua Li clear_bit(STRIPE_DELAYED, &sh->state); 795f6bed0efSShaohua Li atomic_inc(&sh->count); 796f6bed0efSShaohua Li 797f6bed0efSShaohua Li mutex_lock(&log->io_mutex); 798f6bed0efSShaohua Li /* meta + data */ 799f6bed0efSShaohua Li reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 800f6bed0efSShaohua Li 801a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 802a39f7afdSSong Liu if (!r5l_has_free_space(log, reserve)) { 803a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 804a39f7afdSSong Liu wake_reclaim = true; 8055036c390SChristoph Hellwig } else { 8065036c390SChristoph Hellwig ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 8075036c390SChristoph Hellwig if (ret) { 8085036c390SChristoph Hellwig spin_lock_irq(&log->io_list_lock); 809a39f7afdSSong Liu list_add_tail(&sh->log_list, 810a39f7afdSSong Liu &log->no_mem_stripes); 8115036c390SChristoph Hellwig spin_unlock_irq(&log->io_list_lock); 812f6bed0efSShaohua Li } 8135036c390SChristoph Hellwig } 814a39f7afdSSong Liu } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 815a39f7afdSSong Liu /* 816a39f7afdSSong Liu * log space critical, do not process stripes that are 817a39f7afdSSong Liu * not in cache yet (sh->log_start == MaxSector). 
818a39f7afdSSong Liu */ 819a39f7afdSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 820a39f7afdSSong Liu sh->log_start == MaxSector) { 821a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 822a39f7afdSSong Liu wake_reclaim = true; 823a39f7afdSSong Liu reserve = 0; 824a39f7afdSSong Liu } else if (!r5l_has_free_space(log, reserve)) { 825a39f7afdSSong Liu if (sh->log_start == log->last_checkpoint) 826a39f7afdSSong Liu BUG(); 827a39f7afdSSong Liu else 828a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 829a39f7afdSSong Liu } else { 830a39f7afdSSong Liu ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 831a39f7afdSSong Liu if (ret) { 832a39f7afdSSong Liu spin_lock_irq(&log->io_list_lock); 833a39f7afdSSong Liu list_add_tail(&sh->log_list, 834a39f7afdSSong Liu &log->no_mem_stripes); 835a39f7afdSSong Liu spin_unlock_irq(&log->io_list_lock); 836a39f7afdSSong Liu } 837a39f7afdSSong Liu } 838a39f7afdSSong Liu } 839f6bed0efSShaohua Li 8405036c390SChristoph Hellwig mutex_unlock(&log->io_mutex); 841a39f7afdSSong Liu if (wake_reclaim) 842a39f7afdSSong Liu r5l_wake_reclaim(log, reserve); 843f6bed0efSShaohua Li return 0; 844f6bed0efSShaohua Li } 845f6bed0efSShaohua Li 846f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log) 847f6bed0efSShaohua Li { 848f6bed0efSShaohua Li if (!log) 849f6bed0efSShaohua Li return; 850f6bed0efSShaohua Li mutex_lock(&log->io_mutex); 851f6bed0efSShaohua Li r5l_submit_current_io(log); 852f6bed0efSShaohua Li mutex_unlock(&log->io_mutex); 853f6bed0efSShaohua Li } 854f6bed0efSShaohua Li 855828cbe98SShaohua Li int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 856828cbe98SShaohua Li { 857828cbe98SShaohua Li if (!log) 858828cbe98SShaohua Li return -ENODEV; 859828cbe98SShaohua Li /* 860828cbe98SShaohua Li * we flush log disk cache first, then write stripe data to raid disks. 861828cbe98SShaohua Li * So if bio is finished, the log disk cache is flushed already. 
The 862828cbe98SShaohua Li * recovery guarantees we can recovery the bio from log disk, so we 863828cbe98SShaohua Li * don't need to flush again 864828cbe98SShaohua Li */ 865828cbe98SShaohua Li if (bio->bi_iter.bi_size == 0) { 866828cbe98SShaohua Li bio_endio(bio); 867828cbe98SShaohua Li return 0; 868828cbe98SShaohua Li } 8691eff9d32SJens Axboe bio->bi_opf &= ~REQ_PREFLUSH; 870828cbe98SShaohua Li return -EAGAIN; 871828cbe98SShaohua Li } 872828cbe98SShaohua Li 873f6bed0efSShaohua Li /* This will run after log space is reclaimed */ 874f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log) 875f6bed0efSShaohua Li { 876f6bed0efSShaohua Li struct stripe_head *sh; 877f6bed0efSShaohua Li 878f6bed0efSShaohua Li spin_lock(&log->no_space_stripes_lock); 879f6bed0efSShaohua Li while (!list_empty(&log->no_space_stripes)) { 880f6bed0efSShaohua Li sh = list_first_entry(&log->no_space_stripes, 881f6bed0efSShaohua Li struct stripe_head, log_list); 882f6bed0efSShaohua Li list_del_init(&sh->log_list); 883f6bed0efSShaohua Li set_bit(STRIPE_HANDLE, &sh->state); 884f6bed0efSShaohua Li raid5_release_stripe(sh); 885f6bed0efSShaohua Li } 886f6bed0efSShaohua Li spin_unlock(&log->no_space_stripes_lock); 887f6bed0efSShaohua Li } 888f6bed0efSShaohua Li 889a39f7afdSSong Liu /* 890a39f7afdSSong Liu * calculate new last_checkpoint 891a39f7afdSSong Liu * for write through mode, returns log->next_checkpoint 892a39f7afdSSong Liu * for write back, returns log_start of first sh in stripe_in_journal_list 893a39f7afdSSong Liu */ 894a39f7afdSSong Liu static sector_t r5c_calculate_new_cp(struct r5conf *conf) 895a39f7afdSSong Liu { 896a39f7afdSSong Liu struct stripe_head *sh; 897a39f7afdSSong Liu struct r5l_log *log = conf->log; 898a39f7afdSSong Liu sector_t new_cp; 899a39f7afdSSong Liu unsigned long flags; 900a39f7afdSSong Liu 901a39f7afdSSong Liu if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 902a39f7afdSSong Liu return log->next_checkpoint; 903a39f7afdSSong Liu 
904a39f7afdSSong Liu spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 905a39f7afdSSong Liu if (list_empty(&conf->log->stripe_in_journal_list)) { 906a39f7afdSSong Liu /* all stripes flushed */ 907a39f7afdSSong Liu spin_unlock(&log->stripe_in_journal_lock); 908a39f7afdSSong Liu return log->next_checkpoint; 909a39f7afdSSong Liu } 910a39f7afdSSong Liu sh = list_first_entry(&conf->log->stripe_in_journal_list, 911a39f7afdSSong Liu struct stripe_head, r5c); 912a39f7afdSSong Liu new_cp = sh->log_start; 913a39f7afdSSong Liu spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 914a39f7afdSSong Liu return new_cp; 915a39f7afdSSong Liu } 916a39f7afdSSong Liu 91717036461SChristoph Hellwig static sector_t r5l_reclaimable_space(struct r5l_log *log) 91817036461SChristoph Hellwig { 919a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 920a39f7afdSSong Liu 92117036461SChristoph Hellwig return r5l_ring_distance(log, log->last_checkpoint, 922a39f7afdSSong Liu r5c_calculate_new_cp(conf)); 92317036461SChristoph Hellwig } 92417036461SChristoph Hellwig 9255036c390SChristoph Hellwig static void r5l_run_no_mem_stripe(struct r5l_log *log) 9265036c390SChristoph Hellwig { 9275036c390SChristoph Hellwig struct stripe_head *sh; 9285036c390SChristoph Hellwig 9295036c390SChristoph Hellwig assert_spin_locked(&log->io_list_lock); 9305036c390SChristoph Hellwig 9315036c390SChristoph Hellwig if (!list_empty(&log->no_mem_stripes)) { 9325036c390SChristoph Hellwig sh = list_first_entry(&log->no_mem_stripes, 9335036c390SChristoph Hellwig struct stripe_head, log_list); 9345036c390SChristoph Hellwig list_del_init(&sh->log_list); 9355036c390SChristoph Hellwig set_bit(STRIPE_HANDLE, &sh->state); 9365036c390SChristoph Hellwig raid5_release_stripe(sh); 9375036c390SChristoph Hellwig } 9385036c390SChristoph Hellwig } 9395036c390SChristoph Hellwig 94004732f74SChristoph Hellwig static bool r5l_complete_finished_ios(struct r5l_log *log) 94117036461SChristoph Hellwig { 
94217036461SChristoph Hellwig struct r5l_io_unit *io, *next; 94317036461SChristoph Hellwig bool found = false; 94417036461SChristoph Hellwig 94517036461SChristoph Hellwig assert_spin_locked(&log->io_list_lock); 94617036461SChristoph Hellwig 94704732f74SChristoph Hellwig list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 94817036461SChristoph Hellwig /* don't change list order */ 94917036461SChristoph Hellwig if (io->state < IO_UNIT_STRIPE_END) 95017036461SChristoph Hellwig break; 95117036461SChristoph Hellwig 95217036461SChristoph Hellwig log->next_checkpoint = io->log_start; 95317036461SChristoph Hellwig log->next_cp_seq = io->seq; 95417036461SChristoph Hellwig 95517036461SChristoph Hellwig list_del(&io->log_sibling); 9565036c390SChristoph Hellwig mempool_free(io, log->io_pool); 9575036c390SChristoph Hellwig r5l_run_no_mem_stripe(log); 95817036461SChristoph Hellwig 95917036461SChristoph Hellwig found = true; 96017036461SChristoph Hellwig } 96117036461SChristoph Hellwig 96217036461SChristoph Hellwig return found; 96317036461SChristoph Hellwig } 96417036461SChristoph Hellwig 965509ffec7SChristoph Hellwig static void __r5l_stripe_write_finished(struct r5l_io_unit *io) 966509ffec7SChristoph Hellwig { 967509ffec7SChristoph Hellwig struct r5l_log *log = io->log; 968a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 969509ffec7SChristoph Hellwig unsigned long flags; 970509ffec7SChristoph Hellwig 971509ffec7SChristoph Hellwig spin_lock_irqsave(&log->io_list_lock, flags); 972509ffec7SChristoph Hellwig __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 97317036461SChristoph Hellwig 97404732f74SChristoph Hellwig if (!r5l_complete_finished_ios(log)) { 97585f2f9a4SShaohua Li spin_unlock_irqrestore(&log->io_list_lock, flags); 97685f2f9a4SShaohua Li return; 97785f2f9a4SShaohua Li } 978509ffec7SChristoph Hellwig 979a39f7afdSSong Liu if (r5l_reclaimable_space(log) > log->max_free_space || 980a39f7afdSSong Liu test_bit(R5C_LOG_TIGHT, 
&conf->cache_state)) 981509ffec7SChristoph Hellwig r5l_wake_reclaim(log, 0); 982509ffec7SChristoph Hellwig 983509ffec7SChristoph Hellwig spin_unlock_irqrestore(&log->io_list_lock, flags); 984509ffec7SChristoph Hellwig wake_up(&log->iounit_wait); 985509ffec7SChristoph Hellwig } 986509ffec7SChristoph Hellwig 9870576b1c6SShaohua Li void r5l_stripe_write_finished(struct stripe_head *sh) 9880576b1c6SShaohua Li { 9890576b1c6SShaohua Li struct r5l_io_unit *io; 9900576b1c6SShaohua Li 9910576b1c6SShaohua Li io = sh->log_io; 9920576b1c6SShaohua Li sh->log_io = NULL; 9930576b1c6SShaohua Li 994509ffec7SChristoph Hellwig if (io && atomic_dec_and_test(&io->pending_stripe)) 995509ffec7SChristoph Hellwig __r5l_stripe_write_finished(io); 9960576b1c6SShaohua Li } 9970576b1c6SShaohua Li 998a8c34f91SShaohua Li static void r5l_log_flush_endio(struct bio *bio) 999a8c34f91SShaohua Li { 1000a8c34f91SShaohua Li struct r5l_log *log = container_of(bio, struct r5l_log, 1001a8c34f91SShaohua Li flush_bio); 1002a8c34f91SShaohua Li unsigned long flags; 1003a8c34f91SShaohua Li struct r5l_io_unit *io; 1004a8c34f91SShaohua Li 10056e74a9cfSShaohua Li if (bio->bi_error) 10066e74a9cfSShaohua Li md_error(log->rdev->mddev, log->rdev); 10076e74a9cfSShaohua Li 1008a8c34f91SShaohua Li spin_lock_irqsave(&log->io_list_lock, flags); 1009d8858f43SChristoph Hellwig list_for_each_entry(io, &log->flushing_ios, log_sibling) 1010d8858f43SChristoph Hellwig r5l_io_run_stripes(io); 101104732f74SChristoph Hellwig list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 1012a8c34f91SShaohua Li spin_unlock_irqrestore(&log->io_list_lock, flags); 1013a8c34f91SShaohua Li } 1014a8c34f91SShaohua Li 10150576b1c6SShaohua Li /* 10160576b1c6SShaohua Li * Starting dispatch IO to raid. 10170576b1c6SShaohua Li * io_unit(meta) consists of a log. There is one situation we want to avoid. A 10180576b1c6SShaohua Li * broken meta in the middle of a log causes recovery can't find meta at the 10190576b1c6SShaohua Li * head of log. 
   If operations require meta at the head persistent in log, we
 * must make sure meta before it persistent in log too. A case is:
 *
 * stripe data/parity is in log, we start write stripe to raid disks. stripe
 * data/parity must be persistent in log before we do the write to raid disks.
 *
 * The solution is we restrictly maintain io_unit list order. In this case, we
 * only write stripes of an io_unit to raid disks till the io_unit is the first
 * one whose data/parity is in log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	/* skip entirely when the log device needs no explicit cache flush */
	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	/* move all completed-IO units to the flushing list in one shot */
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	/* reuse the embedded flush_bio to issue a cache flush to the log device */
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);

/*
 * Persist the new log tail in the superblock, then discard the
 * reclaimed region of the log device (possibly in two chunks when the
 * region wraps around the end of the device).
 */
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	/* nothing more to do if the device does not support discard */
	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure
	 * superblock is updated to new log tail. Updating superblock (either
	 * directly call md_update_sb() or depend on md thread) must hold
	 * reconfig mutex. On the other hand, raid5_quiesce is called with
	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
	 * for all IO finish, hence waitting for reclaim thread, while reclaim
	 * thread is calling this function and waitting for reconfig mutex. So
	 * there is a deadlock. We workaround this issue with a trylock.
	 * FIXME: we could miss discard if we can't take reconfig mutex
	 */
	set_mask_bits(&mddev->flags, 0,
		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		/* contiguous region: [last_checkpoint, end) */
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		/* region wraps: discard tail of device, then head up to end */
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	/* sanity: stripe must be parked on a cached list, in caching state */
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	assert_spin_locked(&conf->device_lock);

	/* detach from the cached list and take a reference for handling */
	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	r5c_make_stripe_write_out(sh);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);
	raid5_release_stripe(sh);
}

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If less than num full stripes are
 * flushed, flush some partial stripes until totally num stripes are
 * flushed or there is no more cached stripes.
 */
void r5c_flush_cache(struct r5conf *conf, int num)
{
	int count;
	struct stripe_head *sh, *next;

	/* caller must hold conf->device_lock (see r5c_flush_stripe) */
	assert_spin_locked(&conf->device_lock);
	if (!conf->log)
		return;

	count = 0;
	/* always flush every full stripe first */
	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		count++;
	}

	if (count >= num)
		return;
	/* then top up with partial stripes until num is reached */
	list_for_each_entry_safe(sh, next,
				 &conf->r5c_partial_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		if (++count >= num)
			break;
	}
}

/*
 * Write-back reclaim: pick how aggressively to flush cached stripes
 * based on stripe-cache pressure, and additionally push stripes out of
 * the journal when log space is tight.
 */
static void r5c_do_reclaim(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;
	struct stripe_head *sh;
	int count = 0;
	unsigned long flags;
	int total_cached;
	int stripes_to_flush;

	if (!r5c_is_writeback(log))
		return;

	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
		atomic_read(&conf->r5c_cached_full_stripes);

	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
	    atomic_read(&conf->empty_inactive_list_nr) > 0)
		/*
		 * if stripe cache pressure high, flush all full stripes and
		 * some partial stripes
		 */
		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
		 atomic_read(&conf->r5c_cached_full_stripes) >
		 R5C_FULL_STRIPE_FLUSH_BATCH)
		/*
		 * if stripe cache pressure moderate, or if there is many full
		 * stripes,flush all full stripes
		 */
		stripes_to_flush = 0;
	else
		/* no need to flush */
		stripes_to_flush = -1;

	if (stripes_to_flush >= 0) {
		spin_lock_irqsave(&conf->device_lock, flags);
		r5c_flush_cache(conf, stripes_to_flush);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}

	/* if log space is tight, flush stripes on stripe_in_journal_list */
	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
		/* lock order: stripe_in_journal_lock outside device_lock */
		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
		spin_lock(&conf->device_lock);
		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
			/*
			 * stripes on stripe_in_journal_list could be in any
			 * state of the stripe_cache state machine. In this
			 * case, we only want to flush stripe on
			 * r5c_cached_full/partial_stripes. The following
			 * condition makes sure the stripe is on one of the
			 * two lists.
			 */
			if (!list_empty(&sh->lru) &&
			    !test_bit(STRIPE_HANDLE, &sh->state) &&
			    atomic_read(&sh->count) == 0) {
				r5c_flush_stripe(conf, sh);
			}
			if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
				break;
		}
		spin_unlock(&conf->device_lock);
		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
	}
	md_wakeup_thread(conf->mddev->thread);
}

/*
 * Core log-space reclaim: wait until enough io_units have retired to
 * free reclaim_target sectors, then persist the new checkpoint (and
 * discard the freed region) and release stripes that were waiting for
 * log space.
 */
static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	bool write_super;

	spin_lock_irq(&log->io_list_lock);
	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
	/*
	 * move proper io_unit to reclaim list. We should not change the order.
	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
	 * shouldn't reuse space of an unreclaimable io_unit
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		md_wakeup_thread(log->rdev->mddev->thread);
		/* drops and retakes io_list_lock while sleeping */
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = r5c_calculate_new_cp(conf);
	spin_unlock_irq(&log->io_list_lock);

	/*
	 * NOTE(review): reclaimable is sector_t, presumably an unsigned
	 * type, which would make this check a no-op — confirm.
	 */
	BUG_ON(reclaimable < 0);

	if (reclaimable == 0 || !write_super)
		return;

	/*
	 * write_super will flush cache of each raid disk. We must write super
	 * here, because the log area might be reused soon and we don't want to
	 * confuse recovery
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	r5c_update_log_state(log);
	mutex_unlock(&log->io_mutex);

	r5l_run_no_space_stripes(log);
}

/* md_thread entry point: run both write-back and log-space reclaim */
static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5c_do_reclaim(conf);
	r5l_do_reclaim(log);
}

/*
 * Raise the reclaim target to at least @space sectors (monotonically,
 * via cmpxchg) and kick the reclaim thread.
 */
void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	if (!log)
		return;
	do {
		target = log->reclaim_target;
		/* never lower an already-larger pending target */
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}

/*
 * Quiesce hook: state 0 = resume (start reclaim thread), state 1 =
 * suspend (drain reclaim and stop the thread), state 2 = no-op.
 */
void r5l_quiesce(struct r5l_log *log, int state)
{
	struct mddev *mddev;

	if (!log || state == 2)
		return;
	if (state == 0) {
		/*
		 * This is a special case for hotadd. In suspend, the array has
		 * no journal. In resume, journal is initialized as well as the
		 * reclaim thread.
		 */
		if (log->reclaim_thread)
			return;
		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
					log->rdev->mddev, "reclaim");
		log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
	} else if (state == 1) {
		/* make sure r5l_write_super_and_discard_space exits */
		mddev = log->rdev->mddev;
		wake_up(&mddev->sb_wait);
		r5l_wake_reclaim(log, MaxSector);
		md_unregister_thread(&log->reclaim_thread);
		r5l_do_reclaim(log);
	}
}

/*
 * Report whether writes must be failed because the journal device is
 * missing or faulty.
 */
bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;
	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);

	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}

/* per-recovery-run scratch state shared by the recovery helpers */
struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
	int data_parity_stripes;	/* number of data_parity stripes */
	int data_only_stripes;		/* number of data_only stripes */
	struct list_head cached_list;
};

/*
 * Read and validate the meta block at ctx->pos: check magic, version,
 * sequence, position, CRC and declared size.  Returns 0 on success,
 * -EIO on read failure, -EINVAL on any validation failure.
 */
static int r5l_recovery_read_meta_block(struct r5l_log *log,
					struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
			  false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	/* zero the checksum field before recomputing the CRC over the page */
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}
/*
 * Initialize @page as an empty (payload-free) meta block at (@pos, @seq)
 * and stamp its CRC.
 *
 * NOTE(review): the CRC is computed here while mb->checksum is still zero
 * (from clear_page).  Any caller that modifies the block afterwards must
 * zero mb->checksum again before recomputing the CRC, or the reader's
 * verification (which clears the field first) will fail.
 */
static void
r5l_recovery_create_empty_meta_block(struct r5l_log *log,
				     struct page *page,
				     sector_t pos, u64 seq)
{
	struct r5l_meta_block *mb;
	u32 crc;

	mb = page_address(page);
	clear_page(mb);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);
}

/*
 * Synchronously write an empty meta block at (@pos, @seq) with FUA, so the
 * on-disk log has a terminating entry.  Returns 0, -ENOMEM or -EIO.
 */
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

/*
 * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
 * to mark valid (potentially not flushed) data in the journal.
 *
 * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
 * so there should not be any mismatch here.
 */
static void r5l_recovery_load_data(struct r5l_log *log,
				   struct stripe_head *sh,
				   struct r5l_recovery_ctx *ctx,
				   struct r5l_payload_data_parity *payload,
				   sector_t log_offset)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	int dd_idx;

	/* map the payload's array sector to a device slot in this stripe */
	raid5_compute_sector(conf,
			     le64_to_cpu(payload->location), 0,
			     &dd_idx, sh);
	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
		     sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
	sh->dev[dd_idx].log_checksum =
		le32_to_cpu(payload->checksum[0]);
	ctx->meta_total_blocks += BLOCK_SECTORS;

	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
	/* data without parity yet -> stripe is in caching phase */
	set_bit(STRIPE_R5C_CACHING, &sh->state);
}

/*
 * Load P (and for RAID6 the following Q) block(s) from the journal into the
 * stripe's parity slots.  Seeing parity means the stripe's data/parity set
 * is complete, so the caching flag is cleared.
 */
static void r5l_recovery_load_parity(struct r5l_log *log,
				     struct stripe_head *sh,
				     struct r5l_recovery_ctx *ctx,
				     struct r5l_payload_data_parity *payload,
				     sector_t log_offset)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;

	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
	sh->dev[sh->pd_idx].log_checksum =
		le32_to_cpu(payload->checksum[0]);
	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);

	if (sh->qd_idx >= 0) {
		/* RAID6: Q parity is stored in the next journal block */
		sync_page_io(log->rdev,
			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
			     REQ_OP_READ, 0, false);
		sh->dev[sh->qd_idx].log_checksum =
			le32_to_cpu(payload->checksum[1]);
		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
	}
	clear_bit(STRIPE_R5C_CACHING, &sh->state);
}

/* Clear all recovery state from a stripe so it can be reused in the scan. */
static void r5l_recovery_reset_stripe(struct stripe_head *sh)
{
	int i;

	sh->state = 0;
	sh->log_start = MaxSector;
	for (i = sh->disks; i--; )
		sh->dev[i].flags = 0;
}

/*
 * Write a recovered data+parity stripe back to the member disks (and their
 * replacements), then reset it.  Stripes that carry no data blocks need no
 * replay — see the comment below.
 */
static void
r5l_recovery_replay_one_stripe(struct r5conf *conf,
			       struct stripe_head *sh,
			       struct r5l_recovery_ctx *ctx)
{
	struct md_rdev *rdev, *rrdev;
	int disk_index;
	int data_count = 0;

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
			continue;
		data_count++;
	}

	/*
	 * stripes that only have parity must have been flushed
	 * before the crash that we are now recovering from, so
	 * there is nothing more to recover.
	 */
	if (data_count == 0)
		goto out;

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev) {
			/* pin the rdev outside RCU for the synchronous write */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rdev, rdev->mddev);
			rcu_read_lock();
		}
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev) {
			atomic_inc(&rrdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rrdev, rrdev->mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	ctx->data_parity_stripes++;
out:
	r5l_recovery_reset_stripe(sh);
}

static struct stripe_head *
r5c_recovery_alloc_stripe(struct r5conf *conf, 1561b4c625c6SSong Liu struct list_head *recovery_list, 1562b4c625c6SSong Liu sector_t stripe_sect, 1563b4c625c6SSong Liu sector_t log_start) 1564b4c625c6SSong Liu { 1565b4c625c6SSong Liu struct stripe_head *sh; 1566b4c625c6SSong Liu 1567b4c625c6SSong Liu sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); 1568b4c625c6SSong Liu if (!sh) 1569b4c625c6SSong Liu return NULL; /* no more stripe available */ 1570b4c625c6SSong Liu 1571b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1572b4c625c6SSong Liu sh->log_start = log_start; 1573b4c625c6SSong Liu 1574b4c625c6SSong Liu return sh; 1575b4c625c6SSong Liu } 1576b4c625c6SSong Liu 1577b4c625c6SSong Liu static struct stripe_head * 1578b4c625c6SSong Liu r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 1579b4c625c6SSong Liu { 1580b4c625c6SSong Liu struct stripe_head *sh; 1581b4c625c6SSong Liu 1582b4c625c6SSong Liu list_for_each_entry(sh, list, lru) 1583b4c625c6SSong Liu if (sh->sector == sect) 1584b4c625c6SSong Liu return sh; 1585b4c625c6SSong Liu return NULL; 1586b4c625c6SSong Liu } 1587b4c625c6SSong Liu 1588b4c625c6SSong Liu static void 1589b4c625c6SSong Liu r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1590b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1591b4c625c6SSong Liu { 1592b4c625c6SSong Liu struct stripe_head *sh, *next; 1593b4c625c6SSong Liu 1594b4c625c6SSong Liu list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1595b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1596b4c625c6SSong Liu list_del_init(&sh->lru); 1597b4c625c6SSong Liu raid5_release_stripe(sh); 1598b4c625c6SSong Liu } 1599b4c625c6SSong Liu } 1600b4c625c6SSong Liu 1601b4c625c6SSong Liu static void 1602b4c625c6SSong Liu r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1603b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1604b4c625c6SSong Liu { 1605b4c625c6SSong Liu struct stripe_head *sh, *next; 1606b4c625c6SSong Liu 1607b4c625c6SSong Liu 
list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1608b4c625c6SSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1609b4c625c6SSong Liu r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1610b4c625c6SSong Liu list_del_init(&sh->lru); 1611b4c625c6SSong Liu raid5_release_stripe(sh); 1612b4c625c6SSong Liu } 1613b4c625c6SSong Liu } 1614b4c625c6SSong Liu 1615b4c625c6SSong Liu /* if matches return 0; otherwise return -EINVAL */ 1616b4c625c6SSong Liu static int 1617b4c625c6SSong Liu r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1618b4c625c6SSong Liu sector_t log_offset, __le32 log_checksum) 1619b4c625c6SSong Liu { 1620b4c625c6SSong Liu void *addr; 1621b4c625c6SSong Liu u32 checksum; 1622b4c625c6SSong Liu 1623b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1624b4c625c6SSong Liu page, REQ_OP_READ, 0, false); 1625b4c625c6SSong Liu addr = kmap_atomic(page); 1626b4c625c6SSong Liu checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1627b4c625c6SSong Liu kunmap_atomic(addr); 1628b4c625c6SSong Liu return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; 1629b4c625c6SSong Liu } 1630b4c625c6SSong Liu 1631b4c625c6SSong Liu /* 1632b4c625c6SSong Liu * before loading data to stripe cache, we need verify checksum for all data, 1633b4c625c6SSong Liu * if there is mismatch for any data page, we drop all data in the mata block 1634b4c625c6SSong Liu */ 1635b4c625c6SSong Liu static int 1636b4c625c6SSong Liu r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 1637b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1638b4c625c6SSong Liu { 1639b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1640b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1641b4c625c6SSong Liu struct r5l_meta_block *mb = page_address(ctx->meta_page); 1642b4c625c6SSong Liu sector_t mb_offset = sizeof(struct r5l_meta_block); 1643b4c625c6SSong Liu sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1644b4c625c6SSong Liu struct page *page; 1645b4c625c6SSong Liu struct r5l_payload_data_parity *payload; 1646b4c625c6SSong Liu 1647b4c625c6SSong Liu page = alloc_page(GFP_KERNEL); 1648b4c625c6SSong Liu if (!page) 1649b4c625c6SSong Liu return -ENOMEM; 1650b4c625c6SSong Liu 1651b4c625c6SSong Liu while (mb_offset < le32_to_cpu(mb->meta_size)) { 1652b4c625c6SSong Liu payload = (void *)mb + mb_offset; 1653b4c625c6SSong Liu 1654b4c625c6SSong Liu if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1655b4c625c6SSong Liu if (r5l_recovery_verify_data_checksum( 1656b4c625c6SSong Liu log, page, log_offset, 1657b4c625c6SSong Liu payload->checksum[0]) < 0) 1658b4c625c6SSong Liu goto mismatch; 1659b4c625c6SSong Liu } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { 1660b4c625c6SSong Liu if (r5l_recovery_verify_data_checksum( 1661b4c625c6SSong Liu log, page, log_offset, 1662b4c625c6SSong Liu payload->checksum[0]) < 0) 1663b4c625c6SSong Liu goto mismatch; 1664b4c625c6SSong Liu if (conf->max_degraded == 2 && /* q for RAID 6 */ 1665b4c625c6SSong Liu r5l_recovery_verify_data_checksum( 1666b4c625c6SSong Liu log, page, 
1667b4c625c6SSong Liu r5l_ring_add(log, log_offset, 1668b4c625c6SSong Liu BLOCK_SECTORS), 1669b4c625c6SSong Liu payload->checksum[1]) < 0) 1670b4c625c6SSong Liu goto mismatch; 1671b4c625c6SSong Liu } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ 1672b4c625c6SSong Liu goto mismatch; 1673b4c625c6SSong Liu 1674b4c625c6SSong Liu log_offset = r5l_ring_add(log, log_offset, 1675b4c625c6SSong Liu le32_to_cpu(payload->size)); 1676b4c625c6SSong Liu 1677b4c625c6SSong Liu mb_offset += sizeof(struct r5l_payload_data_parity) + 1678b4c625c6SSong Liu sizeof(__le32) * 1679b4c625c6SSong Liu (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1680b4c625c6SSong Liu } 1681b4c625c6SSong Liu 1682b4c625c6SSong Liu put_page(page); 1683b4c625c6SSong Liu return 0; 1684b4c625c6SSong Liu 1685b4c625c6SSong Liu mismatch: 1686b4c625c6SSong Liu put_page(page); 1687b4c625c6SSong Liu return -EINVAL; 1688b4c625c6SSong Liu } 1689b4c625c6SSong Liu 1690b4c625c6SSong Liu /* 1691b4c625c6SSong Liu * Analyze all data/parity pages in one meta block 1692b4c625c6SSong Liu * Returns: 1693b4c625c6SSong Liu * 0 for success 1694b4c625c6SSong Liu * -EINVAL for unknown playload type 1695b4c625c6SSong Liu * -EAGAIN for checksum mismatch of data page 1696b4c625c6SSong Liu * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 1697b4c625c6SSong Liu */ 1698b4c625c6SSong Liu static int 1699b4c625c6SSong Liu r5c_recovery_analyze_meta_block(struct r5l_log *log, 1700b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1701b4c625c6SSong Liu struct list_head *cached_stripe_list) 1702b4c625c6SSong Liu { 1703b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1704b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1705b4c625c6SSong Liu struct r5l_meta_block *mb; 1706b4c625c6SSong Liu struct r5l_payload_data_parity *payload; 1707b4c625c6SSong Liu int mb_offset; 1708b4c625c6SSong Liu sector_t log_offset; 1709b4c625c6SSong Liu sector_t stripe_sect; 1710b4c625c6SSong Liu struct stripe_head *sh; 
1711b4c625c6SSong Liu int ret; 1712b4c625c6SSong Liu 1713b4c625c6SSong Liu /* 1714b4c625c6SSong Liu * for mismatch in data blocks, we will drop all data in this mb, but 1715b4c625c6SSong Liu * we will still read next mb for other data with FLUSH flag, as 1716b4c625c6SSong Liu * io_unit could finish out of order. 1717b4c625c6SSong Liu */ 1718b4c625c6SSong Liu ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 1719b4c625c6SSong Liu if (ret == -EINVAL) 1720b4c625c6SSong Liu return -EAGAIN; 1721b4c625c6SSong Liu else if (ret) 1722b4c625c6SSong Liu return ret; /* -ENOMEM duo to alloc_page() failed */ 1723b4c625c6SSong Liu 1724b4c625c6SSong Liu mb = page_address(ctx->meta_page); 1725b4c625c6SSong Liu mb_offset = sizeof(struct r5l_meta_block); 1726b4c625c6SSong Liu log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 1727b4c625c6SSong Liu 1728b4c625c6SSong Liu while (mb_offset < le32_to_cpu(mb->meta_size)) { 1729b4c625c6SSong Liu int dd; 1730b4c625c6SSong Liu 1731b4c625c6SSong Liu payload = (void *)mb + mb_offset; 1732b4c625c6SSong Liu stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? 
1733b4c625c6SSong Liu raid5_compute_sector( 1734b4c625c6SSong Liu conf, le64_to_cpu(payload->location), 0, &dd, 1735b4c625c6SSong Liu NULL) 1736b4c625c6SSong Liu : le64_to_cpu(payload->location); 1737b4c625c6SSong Liu 1738b4c625c6SSong Liu sh = r5c_recovery_lookup_stripe(cached_stripe_list, 1739b4c625c6SSong Liu stripe_sect); 1740b4c625c6SSong Liu 1741b4c625c6SSong Liu if (!sh) { 1742b4c625c6SSong Liu sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list, 1743b4c625c6SSong Liu stripe_sect, ctx->pos); 1744b4c625c6SSong Liu /* 1745b4c625c6SSong Liu * cannot get stripe from raid5_get_active_stripe 1746b4c625c6SSong Liu * try replay some stripes 1747b4c625c6SSong Liu */ 1748b4c625c6SSong Liu if (!sh) { 1749b4c625c6SSong Liu r5c_recovery_replay_stripes( 1750b4c625c6SSong Liu cached_stripe_list, ctx); 1751b4c625c6SSong Liu sh = r5c_recovery_alloc_stripe( 1752b4c625c6SSong Liu conf, cached_stripe_list, 1753b4c625c6SSong Liu stripe_sect, ctx->pos); 1754b4c625c6SSong Liu } 1755b4c625c6SSong Liu if (!sh) { 1756b4c625c6SSong Liu pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", 1757b4c625c6SSong Liu mdname(mddev), 1758b4c625c6SSong Liu conf->min_nr_stripes * 2); 1759b4c625c6SSong Liu raid5_set_cache_size(mddev, 1760b4c625c6SSong Liu conf->min_nr_stripes * 2); 1761b4c625c6SSong Liu sh = r5c_recovery_alloc_stripe( 1762b4c625c6SSong Liu conf, cached_stripe_list, stripe_sect, 1763b4c625c6SSong Liu ctx->pos); 1764b4c625c6SSong Liu } 1765b4c625c6SSong Liu if (!sh) { 1766b4c625c6SSong Liu pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. 
Recovery failed.\n", 1767b4c625c6SSong Liu mdname(mddev)); 1768b4c625c6SSong Liu return -ENOMEM; 1769b4c625c6SSong Liu } 1770b4c625c6SSong Liu list_add_tail(&sh->lru, cached_stripe_list); 1771b4c625c6SSong Liu } 1772b4c625c6SSong Liu 1773b4c625c6SSong Liu if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1774b4c625c6SSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1775b4c625c6SSong Liu r5l_recovery_replay_one_stripe(conf, sh, ctx); 1776b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1777b4c625c6SSong Liu sh->log_start = ctx->pos; 1778b4c625c6SSong Liu list_move_tail(&sh->lru, cached_stripe_list); 1779b4c625c6SSong Liu } 1780b4c625c6SSong Liu r5l_recovery_load_data(log, sh, ctx, payload, 1781b4c625c6SSong Liu log_offset); 1782b4c625c6SSong Liu } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) 1783b4c625c6SSong Liu r5l_recovery_load_parity(log, sh, ctx, payload, 1784b4c625c6SSong Liu log_offset); 1785b4c625c6SSong Liu else 1786b4c625c6SSong Liu return -EINVAL; 1787b4c625c6SSong Liu 1788b4c625c6SSong Liu log_offset = r5l_ring_add(log, log_offset, 1789b4c625c6SSong Liu le32_to_cpu(payload->size)); 1790b4c625c6SSong Liu 1791b4c625c6SSong Liu mb_offset += sizeof(struct r5l_payload_data_parity) + 1792b4c625c6SSong Liu sizeof(__le32) * 1793b4c625c6SSong Liu (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1794b4c625c6SSong Liu } 1795b4c625c6SSong Liu 1796b4c625c6SSong Liu return 0; 1797b4c625c6SSong Liu } 1798b4c625c6SSong Liu 1799b4c625c6SSong Liu /* 1800b4c625c6SSong Liu * Load the stripe into cache. The stripe will be written out later by 1801b4c625c6SSong Liu * the stripe cache state machine. 
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log,
					 struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5dev *dev;
	int i;

	for (i = sh->disks; i--; ) {
		dev = sh->dev + i;
		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
			/* valid journalled data, up to date in the cache */
			set_bit(R5_InJournal, &dev->flags);
			set_bit(R5_UPTODATE, &dev->flags);
		}
	}
	set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
	atomic_inc(&conf->r5c_cached_partial_stripes);
	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
}

/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripe
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripe
 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as these data are already flushed to
 * the array.
 *
 * At the end of the scan, we return the new journal_tail, which points to
 * first data-only stripe on the journal device, or next invalid meta block.
 */
static int r5c_recovery_flush_log(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh, *next;
	int ret = 0;

	/* scan through the log */
	while (1) {
		if (r5l_recovery_read_meta_block(log, ctx))
			break;

		ret = r5c_recovery_analyze_meta_block(log, ctx,
						      &ctx->cached_list);
		/*
		 * -EAGAIN means mismatch in data block, in this case, we still
		 * try scan the next metablock
		 */
		if (ret && ret != -EAGAIN)
			break;   /* ret == -EINVAL or -ENOMEM */
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}

	if (ret == -ENOMEM) {
		/* unrecoverable: drop everything we cached so far */
		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
		return ret;
	}

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5c_recovery_load_one_stripe(log, sh);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
		ctx->data_only_stripes++;
	}

	return 0;
}

/*
1882b4c625c6SSong Liu * we did a recovery. Now ctx.pos points to an invalid meta block. New 1883b4c625c6SSong Liu * log will start here. but we can't let superblock point to last valid 1884b4c625c6SSong Liu * meta block. The log might looks like: 1885b4c625c6SSong Liu * | meta 1| meta 2| meta 3| 1886b4c625c6SSong Liu * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If 1887b4c625c6SSong Liu * superblock points to meta 1, we write a new valid meta 2n. if crash 1888b4c625c6SSong Liu * happens again, new recovery will start from meta 1. Since meta 2n is 1889b4c625c6SSong Liu * valid now, recovery will think meta 3 is valid, which is wrong. 1890b4c625c6SSong Liu * The solution is we create a new meta in meta2 with its seq == meta 1891b4c625c6SSong Liu * 1's seq + 10 and let superblock points to meta2. The same recovery will 1892b4c625c6SSong Liu * not think meta 3 is a valid meta, because its seq doesn't match 1893b4c625c6SSong Liu */ 1894b4c625c6SSong Liu 1895b4c625c6SSong Liu /* 1896b4c625c6SSong Liu * Before recovery, the log looks like the following 1897b4c625c6SSong Liu * 1898b4c625c6SSong Liu * --------------------------------------------- 1899b4c625c6SSong Liu * | valid log | invalid log | 1900b4c625c6SSong Liu * --------------------------------------------- 1901b4c625c6SSong Liu * ^ 1902b4c625c6SSong Liu * |- log->last_checkpoint 1903b4c625c6SSong Liu * |- log->last_cp_seq 1904b4c625c6SSong Liu * 1905b4c625c6SSong Liu * Now we scan through the log until we see invalid entry 1906b4c625c6SSong Liu * 1907b4c625c6SSong Liu * --------------------------------------------- 1908b4c625c6SSong Liu * | valid log | invalid log | 1909b4c625c6SSong Liu * --------------------------------------------- 1910b4c625c6SSong Liu * ^ ^ 1911b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos 1912b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq 1913b4c625c6SSong Liu * 1914b4c625c6SSong Liu * From this point, we need to increase seq number by 10 to avoid 
1915b4c625c6SSong Liu * confusing next recovery. 1916b4c625c6SSong Liu * 1917b4c625c6SSong Liu * --------------------------------------------- 1918b4c625c6SSong Liu * | valid log | invalid log | 1919b4c625c6SSong Liu * --------------------------------------------- 1920b4c625c6SSong Liu * ^ ^ 1921b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos+1 1922b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq+11 1923b4c625c6SSong Liu * 1924b4c625c6SSong Liu * However, it is not safe to start the state machine yet, because data only 1925b4c625c6SSong Liu * parities are not yet secured in RAID. To save these data only parities, we 1926b4c625c6SSong Liu * rewrite them from seq+11. 1927b4c625c6SSong Liu * 1928b4c625c6SSong Liu * ----------------------------------------------------------------- 1929b4c625c6SSong Liu * | valid log | data only stripes | invalid log | 1930b4c625c6SSong Liu * ----------------------------------------------------------------- 1931b4c625c6SSong Liu * ^ ^ 1932b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos+n 1933b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq+10+n 1934b4c625c6SSong Liu * 1935b4c625c6SSong Liu * If failure happens again during this process, the recovery can safe start 1936b4c625c6SSong Liu * again from log->last_checkpoint. 1937b4c625c6SSong Liu * 1938b4c625c6SSong Liu * Once data only stripes are rewritten to journal, we move log_tail 1939b4c625c6SSong Liu * 1940b4c625c6SSong Liu * ----------------------------------------------------------------- 1941b4c625c6SSong Liu * | old log | data only stripes | invalid log | 1942b4c625c6SSong Liu * ----------------------------------------------------------------- 1943b4c625c6SSong Liu * ^ ^ 1944b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos+n 1945b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq+10+n 1946b4c625c6SSong Liu * 1947b4c625c6SSong Liu * Then we can safely start the state machine. 
If failure happens from this 1948b4c625c6SSong Liu * point on, the recovery will start from new log->last_checkpoint. 1949b4c625c6SSong Liu */ 1950b4c625c6SSong Liu static int 1951b4c625c6SSong Liu r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 1952b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1953b4c625c6SSong Liu { 1954b4c625c6SSong Liu struct stripe_head *sh; 1955b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1956b4c625c6SSong Liu struct page *page; 1957b4c625c6SSong Liu 1958b4c625c6SSong Liu page = alloc_page(GFP_KERNEL); 1959b4c625c6SSong Liu if (!page) { 1960b4c625c6SSong Liu pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 1961b4c625c6SSong Liu mdname(mddev)); 1962b4c625c6SSong Liu return -ENOMEM; 1963b4c625c6SSong Liu } 1964b4c625c6SSong Liu 1965b4c625c6SSong Liu ctx->seq += 10; 1966b4c625c6SSong Liu list_for_each_entry(sh, &ctx->cached_list, lru) { 1967b4c625c6SSong Liu struct r5l_meta_block *mb; 1968b4c625c6SSong Liu int i; 1969b4c625c6SSong Liu int offset; 1970b4c625c6SSong Liu sector_t write_pos; 1971b4c625c6SSong Liu 1972b4c625c6SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 1973b4c625c6SSong Liu r5l_recovery_create_empty_meta_block(log, page, 1974b4c625c6SSong Liu ctx->pos, ctx->seq); 1975b4c625c6SSong Liu mb = page_address(page); 1976b4c625c6SSong Liu offset = le32_to_cpu(mb->meta_size); 1977b4c625c6SSong Liu write_pos = ctx->pos + BLOCK_SECTORS; 1978b4c625c6SSong Liu 1979b4c625c6SSong Liu for (i = sh->disks; i--; ) { 1980b4c625c6SSong Liu struct r5dev *dev = &sh->dev[i]; 1981b4c625c6SSong Liu struct r5l_payload_data_parity *payload; 1982b4c625c6SSong Liu void *addr; 1983b4c625c6SSong Liu 1984b4c625c6SSong Liu if (test_bit(R5_InJournal, &dev->flags)) { 1985b4c625c6SSong Liu payload = (void *)mb + offset; 1986b4c625c6SSong Liu payload->header.type = cpu_to_le16( 1987b4c625c6SSong Liu R5LOG_PAYLOAD_DATA); 1988b4c625c6SSong Liu payload->size = BLOCK_SECTORS; 1989b4c625c6SSong Liu 
payload->location = cpu_to_le64( 1990b4c625c6SSong Liu raid5_compute_blocknr(sh, i, 0)); 1991b4c625c6SSong Liu addr = kmap_atomic(dev->page); 1992b4c625c6SSong Liu payload->checksum[0] = cpu_to_le32( 1993b4c625c6SSong Liu crc32c_le(log->uuid_checksum, addr, 1994b4c625c6SSong Liu PAGE_SIZE)); 1995b4c625c6SSong Liu kunmap_atomic(addr); 1996b4c625c6SSong Liu sync_page_io(log->rdev, write_pos, PAGE_SIZE, 1997b4c625c6SSong Liu dev->page, REQ_OP_WRITE, 0, false); 1998b4c625c6SSong Liu write_pos = r5l_ring_add(log, write_pos, 1999b4c625c6SSong Liu BLOCK_SECTORS); 2000b4c625c6SSong Liu offset += sizeof(__le32) + 2001b4c625c6SSong Liu sizeof(struct r5l_payload_data_parity); 2002b4c625c6SSong Liu 2003b4c625c6SSong Liu } 2004b4c625c6SSong Liu } 2005b4c625c6SSong Liu mb->meta_size = cpu_to_le32(offset); 2006b4c625c6SSong Liu mb->checksum = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2007b4c625c6SSong Liu sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 2008b4c625c6SSong Liu REQ_OP_WRITE, WRITE_FUA, false); 2009b4c625c6SSong Liu sh->log_start = ctx->pos; 2010b4c625c6SSong Liu ctx->pos = write_pos; 2011b4c625c6SSong Liu ctx->seq += 1; 2012b4c625c6SSong Liu } 2013b4c625c6SSong Liu __free_page(page); 2014b4c625c6SSong Liu return 0; 2015b4c625c6SSong Liu } 2016b4c625c6SSong Liu 2017f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log) 2018f6bed0efSShaohua Li { 2019*5aabf7c4SSong Liu struct mddev *mddev = log->rdev->mddev; 2020355810d1SShaohua Li struct r5l_recovery_ctx ctx; 2021*5aabf7c4SSong Liu int ret; 2022355810d1SShaohua Li 2023355810d1SShaohua Li ctx.pos = log->last_checkpoint; 2024355810d1SShaohua Li ctx.seq = log->last_cp_seq; 2025355810d1SShaohua Li ctx.meta_page = alloc_page(GFP_KERNEL); 2026b4c625c6SSong Liu ctx.data_only_stripes = 0; 2027b4c625c6SSong Liu ctx.data_parity_stripes = 0; 2028b4c625c6SSong Liu INIT_LIST_HEAD(&ctx.cached_list); 2029b4c625c6SSong Liu 2030355810d1SShaohua Li if (!ctx.meta_page) 2031355810d1SShaohua Li return -ENOMEM; 
2032355810d1SShaohua Li 2033*5aabf7c4SSong Liu ret = r5c_recovery_flush_log(log, &ctx); 2034355810d1SShaohua Li __free_page(ctx.meta_page); 2035355810d1SShaohua Li 2036355810d1SShaohua Li if (ret) 2037355810d1SShaohua Li return ret; 2038*5aabf7c4SSong Liu 2039*5aabf7c4SSong Liu if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2040*5aabf7c4SSong Liu pr_debug("md/raid:%s: starting from clean shutdown\n", 2041*5aabf7c4SSong Liu mdname(mddev)); 2042*5aabf7c4SSong Liu else { 2043*5aabf7c4SSong Liu pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n", 2044*5aabf7c4SSong Liu mdname(mddev), ctx.data_only_stripes, 2045*5aabf7c4SSong Liu ctx.data_parity_stripes); 2046*5aabf7c4SSong Liu 2047*5aabf7c4SSong Liu if (ctx.data_only_stripes > 0) 2048*5aabf7c4SSong Liu if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2049*5aabf7c4SSong Liu pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2050*5aabf7c4SSong Liu mdname(mddev)); 2051*5aabf7c4SSong Liu return -EIO; 2052*5aabf7c4SSong Liu } 2053*5aabf7c4SSong Liu } 2054*5aabf7c4SSong Liu 2055355810d1SShaohua Li log->log_start = ctx.pos; 2056*5aabf7c4SSong Liu log->next_checkpoint = ctx.pos; 2057355810d1SShaohua Li log->seq = ctx.seq; 2058*5aabf7c4SSong Liu r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq); 2059*5aabf7c4SSong Liu r5l_write_super(log, ctx.pos); 2060f6bed0efSShaohua Li return 0; 2061f6bed0efSShaohua Li } 2062f6bed0efSShaohua Li 2063f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp) 2064f6bed0efSShaohua Li { 2065f6bed0efSShaohua Li struct mddev *mddev = log->rdev->mddev; 2066f6bed0efSShaohua Li 2067f6bed0efSShaohua Li log->rdev->journal_tail = cp; 2068f6bed0efSShaohua Li set_bit(MD_CHANGE_DEVS, &mddev->flags); 2069f6bed0efSShaohua Li } 2070f6bed0efSShaohua Li 20712c7da14bSSong Liu static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 20722c7da14bSSong Liu { 20732c7da14bSSong Liu struct r5conf *conf = 
mddev->private; 20742c7da14bSSong Liu int ret; 20752c7da14bSSong Liu 20762c7da14bSSong Liu if (!conf->log) 20772c7da14bSSong Liu return 0; 20782c7da14bSSong Liu 20792c7da14bSSong Liu switch (conf->log->r5c_journal_mode) { 20802c7da14bSSong Liu case R5C_JOURNAL_MODE_WRITE_THROUGH: 20812c7da14bSSong Liu ret = snprintf( 20822c7da14bSSong Liu page, PAGE_SIZE, "[%s] %s\n", 20832c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 20842c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 20852c7da14bSSong Liu break; 20862c7da14bSSong Liu case R5C_JOURNAL_MODE_WRITE_BACK: 20872c7da14bSSong Liu ret = snprintf( 20882c7da14bSSong Liu page, PAGE_SIZE, "%s [%s]\n", 20892c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 20902c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 20912c7da14bSSong Liu break; 20922c7da14bSSong Liu default: 20932c7da14bSSong Liu ret = 0; 20942c7da14bSSong Liu } 20952c7da14bSSong Liu return ret; 20962c7da14bSSong Liu } 20972c7da14bSSong Liu 20982c7da14bSSong Liu static ssize_t r5c_journal_mode_store(struct mddev *mddev, 20992c7da14bSSong Liu const char *page, size_t length) 21002c7da14bSSong Liu { 21012c7da14bSSong Liu struct r5conf *conf = mddev->private; 21022c7da14bSSong Liu struct r5l_log *log = conf->log; 21032c7da14bSSong Liu int val = -1, i; 21042c7da14bSSong Liu int len = length; 21052c7da14bSSong Liu 21062c7da14bSSong Liu if (!log) 21072c7da14bSSong Liu return -ENODEV; 21082c7da14bSSong Liu 21092c7da14bSSong Liu if (len && page[len - 1] == '\n') 21102c7da14bSSong Liu len -= 1; 21112c7da14bSSong Liu for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) 21122c7da14bSSong Liu if (strlen(r5c_journal_mode_str[i]) == len && 21132c7da14bSSong Liu strncmp(page, r5c_journal_mode_str[i], len) == 0) { 21142c7da14bSSong Liu val = i; 21152c7da14bSSong Liu break; 21162c7da14bSSong Liu } 21172c7da14bSSong Liu if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || 21182c7da14bSSong Liu val > 
R5C_JOURNAL_MODE_WRITE_BACK) 21192c7da14bSSong Liu return -EINVAL; 21202c7da14bSSong Liu 21212c7da14bSSong Liu mddev_suspend(mddev); 21222c7da14bSSong Liu conf->log->r5c_journal_mode = val; 21232c7da14bSSong Liu mddev_resume(mddev); 21242c7da14bSSong Liu 21252c7da14bSSong Liu pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 21262c7da14bSSong Liu mdname(mddev), val, r5c_journal_mode_str[val]); 21272c7da14bSSong Liu return length; 21282c7da14bSSong Liu } 21292c7da14bSSong Liu 21302c7da14bSSong Liu struct md_sysfs_entry 21312c7da14bSSong Liu r5c_journal_mode = __ATTR(journal_mode, 0644, 21322c7da14bSSong Liu r5c_journal_mode_show, r5c_journal_mode_store); 21332c7da14bSSong Liu 21342ded3703SSong Liu /* 21352ded3703SSong Liu * Try handle write operation in caching phase. This function should only 21362ded3703SSong Liu * be called in write-back mode. 21372ded3703SSong Liu * 21382ded3703SSong Liu * If all outstanding writes can be handled in caching phase, returns 0 21392ded3703SSong Liu * If writes requires write-out phase, call r5c_make_stripe_write_out() 21402ded3703SSong Liu * and returns -EAGAIN 21412ded3703SSong Liu */ 21422ded3703SSong Liu int r5c_try_caching_write(struct r5conf *conf, 21432ded3703SSong Liu struct stripe_head *sh, 21442ded3703SSong Liu struct stripe_head_state *s, 21452ded3703SSong Liu int disks) 21462ded3703SSong Liu { 21472ded3703SSong Liu struct r5l_log *log = conf->log; 21481e6d690bSSong Liu int i; 21491e6d690bSSong Liu struct r5dev *dev; 21501e6d690bSSong Liu int to_cache = 0; 21512ded3703SSong Liu 21522ded3703SSong Liu BUG_ON(!r5c_is_writeback(log)); 21532ded3703SSong Liu 21541e6d690bSSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 21551e6d690bSSong Liu /* 21561e6d690bSSong Liu * There are two different scenarios here: 21571e6d690bSSong Liu * 1. The stripe has some data cached, and it is sent to 21581e6d690bSSong Liu * write-out phase for reclaim 21591e6d690bSSong Liu * 2. 
The stripe is clean, and this is the first write 21601e6d690bSSong Liu * 21611e6d690bSSong Liu * For 1, return -EAGAIN, so we continue with 21621e6d690bSSong Liu * handle_stripe_dirtying(). 21631e6d690bSSong Liu * 21641e6d690bSSong Liu * For 2, set STRIPE_R5C_CACHING and continue with caching 21651e6d690bSSong Liu * write. 21661e6d690bSSong Liu */ 21671e6d690bSSong Liu 21681e6d690bSSong Liu /* case 1: anything injournal or anything in written */ 21691e6d690bSSong Liu if (s->injournal > 0 || s->written > 0) 21701e6d690bSSong Liu return -EAGAIN; 21711e6d690bSSong Liu /* case 2 */ 21721e6d690bSSong Liu set_bit(STRIPE_R5C_CACHING, &sh->state); 21731e6d690bSSong Liu } 21741e6d690bSSong Liu 21751e6d690bSSong Liu for (i = disks; i--; ) { 21761e6d690bSSong Liu dev = &sh->dev[i]; 21771e6d690bSSong Liu /* if non-overwrite, use writing-out phase */ 21781e6d690bSSong Liu if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 21791e6d690bSSong Liu !test_bit(R5_InJournal, &dev->flags)) { 21802ded3703SSong Liu r5c_make_stripe_write_out(sh); 21812ded3703SSong Liu return -EAGAIN; 21822ded3703SSong Liu } 21831e6d690bSSong Liu } 21841e6d690bSSong Liu 21851e6d690bSSong Liu for (i = disks; i--; ) { 21861e6d690bSSong Liu dev = &sh->dev[i]; 21871e6d690bSSong Liu if (dev->towrite) { 21881e6d690bSSong Liu set_bit(R5_Wantwrite, &dev->flags); 21891e6d690bSSong Liu set_bit(R5_Wantdrain, &dev->flags); 21901e6d690bSSong Liu set_bit(R5_LOCKED, &dev->flags); 21911e6d690bSSong Liu to_cache++; 21921e6d690bSSong Liu } 21931e6d690bSSong Liu } 21941e6d690bSSong Liu 21951e6d690bSSong Liu if (to_cache) { 21961e6d690bSSong Liu set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 21971e6d690bSSong Liu /* 21981e6d690bSSong Liu * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 21991e6d690bSSong Liu * in ops_run_io(). 
STRIPE_LOG_TRAPPED will be cleared in 22001e6d690bSSong Liu * r5c_handle_data_cached() 22011e6d690bSSong Liu */ 22021e6d690bSSong Liu set_bit(STRIPE_LOG_TRAPPED, &sh->state); 22031e6d690bSSong Liu } 22041e6d690bSSong Liu 22051e6d690bSSong Liu return 0; 22061e6d690bSSong Liu } 22071e6d690bSSong Liu 22081e6d690bSSong Liu /* 22091e6d690bSSong Liu * free extra pages (orig_page) we allocated for prexor 22101e6d690bSSong Liu */ 22111e6d690bSSong Liu void r5c_release_extra_page(struct stripe_head *sh) 22121e6d690bSSong Liu { 22131e6d690bSSong Liu int i; 22141e6d690bSSong Liu 22151e6d690bSSong Liu for (i = sh->disks; i--; ) 22161e6d690bSSong Liu if (sh->dev[i].page != sh->dev[i].orig_page) { 22171e6d690bSSong Liu struct page *p = sh->dev[i].orig_page; 22181e6d690bSSong Liu 22191e6d690bSSong Liu sh->dev[i].orig_page = sh->dev[i].page; 22201e6d690bSSong Liu put_page(p); 22211e6d690bSSong Liu } 22221e6d690bSSong Liu } 22232ded3703SSong Liu 22242ded3703SSong Liu /* 22252ded3703SSong Liu * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 22262ded3703SSong Liu * stripe is committed to RAID disks. 
22272ded3703SSong Liu */ 22282ded3703SSong Liu void r5c_finish_stripe_write_out(struct r5conf *conf, 22292ded3703SSong Liu struct stripe_head *sh, 22302ded3703SSong Liu struct stripe_head_state *s) 22312ded3703SSong Liu { 22321e6d690bSSong Liu int i; 22331e6d690bSSong Liu int do_wakeup = 0; 22341e6d690bSSong Liu 22352ded3703SSong Liu if (!conf->log || 22362ded3703SSong Liu !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 22372ded3703SSong Liu return; 22382ded3703SSong Liu 22392ded3703SSong Liu WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 22402ded3703SSong Liu clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 22412ded3703SSong Liu 22422ded3703SSong Liu if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 22432ded3703SSong Liu return; 22441e6d690bSSong Liu 22451e6d690bSSong Liu for (i = sh->disks; i--; ) { 22461e6d690bSSong Liu clear_bit(R5_InJournal, &sh->dev[i].flags); 22471e6d690bSSong Liu if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 22481e6d690bSSong Liu do_wakeup = 1; 22491e6d690bSSong Liu } 22501e6d690bSSong Liu 22511e6d690bSSong Liu /* 22521e6d690bSSong Liu * analyse_stripe() runs before r5c_finish_stripe_write_out(), 22531e6d690bSSong Liu * We updated R5_InJournal, so we also update s->injournal. 
22541e6d690bSSong Liu */ 22551e6d690bSSong Liu s->injournal = 0; 22561e6d690bSSong Liu 22571e6d690bSSong Liu if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 22581e6d690bSSong Liu if (atomic_dec_and_test(&conf->pending_full_writes)) 22591e6d690bSSong Liu md_wakeup_thread(conf->mddev->thread); 22601e6d690bSSong Liu 22611e6d690bSSong Liu if (do_wakeup) 22621e6d690bSSong Liu wake_up(&conf->wait_for_overlap); 2263a39f7afdSSong Liu 2264a39f7afdSSong Liu if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 2265a39f7afdSSong Liu return; 2266a39f7afdSSong Liu 2267a39f7afdSSong Liu spin_lock_irq(&conf->log->stripe_in_journal_lock); 2268a39f7afdSSong Liu list_del_init(&sh->r5c); 2269a39f7afdSSong Liu spin_unlock_irq(&conf->log->stripe_in_journal_lock); 2270a39f7afdSSong Liu sh->log_start = MaxSector; 2271a39f7afdSSong Liu atomic_dec(&conf->log->stripe_in_journal_count); 22721e6d690bSSong Liu } 22731e6d690bSSong Liu 22741e6d690bSSong Liu int 22751e6d690bSSong Liu r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, 22761e6d690bSSong Liu struct stripe_head_state *s) 22771e6d690bSSong Liu { 2278a39f7afdSSong Liu struct r5conf *conf = sh->raid_conf; 22791e6d690bSSong Liu int pages = 0; 22801e6d690bSSong Liu int reserve; 22811e6d690bSSong Liu int i; 22821e6d690bSSong Liu int ret = 0; 22831e6d690bSSong Liu 22841e6d690bSSong Liu BUG_ON(!log); 22851e6d690bSSong Liu 22861e6d690bSSong Liu for (i = 0; i < sh->disks; i++) { 22871e6d690bSSong Liu void *addr; 22881e6d690bSSong Liu 22891e6d690bSSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 22901e6d690bSSong Liu continue; 22911e6d690bSSong Liu addr = kmap_atomic(sh->dev[i].page); 22921e6d690bSSong Liu sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 22931e6d690bSSong Liu addr, PAGE_SIZE); 22941e6d690bSSong Liu kunmap_atomic(addr); 22951e6d690bSSong Liu pages++; 22961e6d690bSSong Liu } 22971e6d690bSSong Liu WARN_ON(pages == 0); 22981e6d690bSSong Liu 22991e6d690bSSong Liu /* 23001e6d690bSSong Liu * 
The stripe must enter state machine again to call endio, so 23011e6d690bSSong Liu * don't delay. 23021e6d690bSSong Liu */ 23031e6d690bSSong Liu clear_bit(STRIPE_DELAYED, &sh->state); 23041e6d690bSSong Liu atomic_inc(&sh->count); 23051e6d690bSSong Liu 23061e6d690bSSong Liu mutex_lock(&log->io_mutex); 23071e6d690bSSong Liu /* meta + data */ 23081e6d690bSSong Liu reserve = (1 + pages) << (PAGE_SHIFT - 9); 23091e6d690bSSong Liu 2310a39f7afdSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 2311a39f7afdSSong Liu sh->log_start == MaxSector) 2312a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 2313a39f7afdSSong Liu else if (!r5l_has_free_space(log, reserve)) { 2314a39f7afdSSong Liu if (sh->log_start == log->last_checkpoint) 2315a39f7afdSSong Liu BUG(); 2316a39f7afdSSong Liu else 2317a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 23181e6d690bSSong Liu } else { 23191e6d690bSSong Liu ret = r5l_log_stripe(log, sh, pages, 0); 23201e6d690bSSong Liu if (ret) { 23211e6d690bSSong Liu spin_lock_irq(&log->io_list_lock); 23221e6d690bSSong Liu list_add_tail(&sh->log_list, &log->no_mem_stripes); 23231e6d690bSSong Liu spin_unlock_irq(&log->io_list_lock); 23241e6d690bSSong Liu } 23251e6d690bSSong Liu } 23261e6d690bSSong Liu 23271e6d690bSSong Liu mutex_unlock(&log->io_mutex); 23281e6d690bSSong Liu return 0; 23292ded3703SSong Liu } 23302ded3703SSong Liu 2331f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log) 2332f6bed0efSShaohua Li { 2333f6bed0efSShaohua Li struct md_rdev *rdev = log->rdev; 2334f6bed0efSShaohua Li struct page *page; 2335f6bed0efSShaohua Li struct r5l_meta_block *mb; 2336f6bed0efSShaohua Li sector_t cp = log->rdev->journal_tail; 2337f6bed0efSShaohua Li u32 stored_crc, expected_crc; 2338f6bed0efSShaohua Li bool create_super = false; 2339f6bed0efSShaohua Li int ret; 2340f6bed0efSShaohua Li 2341f6bed0efSShaohua Li /* Make sure it's valid */ 2342f6bed0efSShaohua Li if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 
2343f6bed0efSShaohua Li cp = 0; 2344f6bed0efSShaohua Li page = alloc_page(GFP_KERNEL); 2345f6bed0efSShaohua Li if (!page) 2346f6bed0efSShaohua Li return -ENOMEM; 2347f6bed0efSShaohua Li 2348796a5cf0SMike Christie if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 2349f6bed0efSShaohua Li ret = -EIO; 2350f6bed0efSShaohua Li goto ioerr; 2351f6bed0efSShaohua Li } 2352f6bed0efSShaohua Li mb = page_address(page); 2353f6bed0efSShaohua Li 2354f6bed0efSShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 2355f6bed0efSShaohua Li mb->version != R5LOG_VERSION) { 2356f6bed0efSShaohua Li create_super = true; 2357f6bed0efSShaohua Li goto create; 2358f6bed0efSShaohua Li } 2359f6bed0efSShaohua Li stored_crc = le32_to_cpu(mb->checksum); 2360f6bed0efSShaohua Li mb->checksum = 0; 23615cb2fbd6SShaohua Li expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 2362f6bed0efSShaohua Li if (stored_crc != expected_crc) { 2363f6bed0efSShaohua Li create_super = true; 2364f6bed0efSShaohua Li goto create; 2365f6bed0efSShaohua Li } 2366f6bed0efSShaohua Li if (le64_to_cpu(mb->position) != cp) { 2367f6bed0efSShaohua Li create_super = true; 2368f6bed0efSShaohua Li goto create; 2369f6bed0efSShaohua Li } 2370f6bed0efSShaohua Li create: 2371f6bed0efSShaohua Li if (create_super) { 2372f6bed0efSShaohua Li log->last_cp_seq = prandom_u32(); 2373f6bed0efSShaohua Li cp = 0; 237456056c2eSZhengyuan Liu r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 2375f6bed0efSShaohua Li /* 2376f6bed0efSShaohua Li * Make sure super points to correct address. Log might have 2377f6bed0efSShaohua Li * data very soon. 
If super hasn't correct log tail address, 2378f6bed0efSShaohua Li * recovery can't find the log 2379f6bed0efSShaohua Li */ 2380f6bed0efSShaohua Li r5l_write_super(log, cp); 2381f6bed0efSShaohua Li } else 2382f6bed0efSShaohua Li log->last_cp_seq = le64_to_cpu(mb->seq); 2383f6bed0efSShaohua Li 2384f6bed0efSShaohua Li log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 23850576b1c6SShaohua Li log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 23860576b1c6SShaohua Li if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 23870576b1c6SShaohua Li log->max_free_space = RECLAIM_MAX_FREE_SPACE; 2388f6bed0efSShaohua Li log->last_checkpoint = cp; 238928cd88e2SZhengyuan Liu log->next_checkpoint = cp; 2390a39f7afdSSong Liu mutex_lock(&log->io_mutex); 2391a39f7afdSSong Liu r5c_update_log_state(log); 2392a39f7afdSSong Liu mutex_unlock(&log->io_mutex); 2393f6bed0efSShaohua Li 2394f6bed0efSShaohua Li __free_page(page); 2395f6bed0efSShaohua Li 2396f6bed0efSShaohua Li return r5l_recovery_log(log); 2397f6bed0efSShaohua Li ioerr: 2398f6bed0efSShaohua Li __free_page(page); 2399f6bed0efSShaohua Li return ret; 2400f6bed0efSShaohua Li } 2401f6bed0efSShaohua Li 2402f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 2403f6bed0efSShaohua Li { 2404c888a8f9SJens Axboe struct request_queue *q = bdev_get_queue(rdev->bdev); 2405f6bed0efSShaohua Li struct r5l_log *log; 2406f6bed0efSShaohua Li 2407f6bed0efSShaohua Li if (PAGE_SIZE != 4096) 2408f6bed0efSShaohua Li return -EINVAL; 2409c757ec95SSong Liu 2410c757ec95SSong Liu /* 2411c757ec95SSong Liu * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 2412c757ec95SSong Liu * raid_disks r5l_payload_data_parity. 
2413c757ec95SSong Liu * 2414c757ec95SSong Liu * Write journal and cache does not work for very big array 2415c757ec95SSong Liu * (raid_disks > 203) 2416c757ec95SSong Liu */ 2417c757ec95SSong Liu if (sizeof(struct r5l_meta_block) + 2418c757ec95SSong Liu ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 2419c757ec95SSong Liu conf->raid_disks) > PAGE_SIZE) { 2420c757ec95SSong Liu pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 2421c757ec95SSong Liu mdname(conf->mddev), conf->raid_disks); 2422c757ec95SSong Liu return -EINVAL; 2423c757ec95SSong Liu } 2424c757ec95SSong Liu 2425f6bed0efSShaohua Li log = kzalloc(sizeof(*log), GFP_KERNEL); 2426f6bed0efSShaohua Li if (!log) 2427f6bed0efSShaohua Li return -ENOMEM; 2428f6bed0efSShaohua Li log->rdev = rdev; 2429f6bed0efSShaohua Li 2430c888a8f9SJens Axboe log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 243156fef7c6SChristoph Hellwig 24325cb2fbd6SShaohua Li log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 2433f6bed0efSShaohua Li sizeof(rdev->mddev->uuid)); 2434f6bed0efSShaohua Li 2435f6bed0efSShaohua Li mutex_init(&log->io_mutex); 2436f6bed0efSShaohua Li 2437f6bed0efSShaohua Li spin_lock_init(&log->io_list_lock); 2438f6bed0efSShaohua Li INIT_LIST_HEAD(&log->running_ios); 24390576b1c6SShaohua Li INIT_LIST_HEAD(&log->io_end_ios); 2440a8c34f91SShaohua Li INIT_LIST_HEAD(&log->flushing_ios); 244104732f74SChristoph Hellwig INIT_LIST_HEAD(&log->finished_ios); 2442a8c34f91SShaohua Li bio_init(&log->flush_bio); 2443f6bed0efSShaohua Li 2444f6bed0efSShaohua Li log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 2445f6bed0efSShaohua Li if (!log->io_kc) 2446f6bed0efSShaohua Li goto io_kc; 2447f6bed0efSShaohua Li 24485036c390SChristoph Hellwig log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc); 24495036c390SChristoph Hellwig if (!log->io_pool) 24505036c390SChristoph Hellwig goto io_pool; 24515036c390SChristoph Hellwig 2452c38d29b3SChristoph Hellwig log->bs = 
bioset_create(R5L_POOL_SIZE, 0); 2453c38d29b3SChristoph Hellwig if (!log->bs) 2454c38d29b3SChristoph Hellwig goto io_bs; 2455c38d29b3SChristoph Hellwig 2456e8deb638SChristoph Hellwig log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 2457e8deb638SChristoph Hellwig if (!log->meta_pool) 2458e8deb638SChristoph Hellwig goto out_mempool; 2459e8deb638SChristoph Hellwig 24600576b1c6SShaohua Li log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 24610576b1c6SShaohua Li log->rdev->mddev, "reclaim"); 24620576b1c6SShaohua Li if (!log->reclaim_thread) 24630576b1c6SShaohua Li goto reclaim_thread; 2464a39f7afdSSong Liu log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 2465a39f7afdSSong Liu 24660fd22b45SShaohua Li init_waitqueue_head(&log->iounit_wait); 24670576b1c6SShaohua Li 24685036c390SChristoph Hellwig INIT_LIST_HEAD(&log->no_mem_stripes); 24695036c390SChristoph Hellwig 2470f6bed0efSShaohua Li INIT_LIST_HEAD(&log->no_space_stripes); 2471f6bed0efSShaohua Li spin_lock_init(&log->no_space_stripes_lock); 2472f6bed0efSShaohua Li 24732ded3703SSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2474a39f7afdSSong Liu INIT_LIST_HEAD(&log->stripe_in_journal_list); 2475a39f7afdSSong Liu spin_lock_init(&log->stripe_in_journal_lock); 2476a39f7afdSSong Liu atomic_set(&log->stripe_in_journal_count, 0); 24772ded3703SSong Liu 2478f6bed0efSShaohua Li if (r5l_load_log(log)) 2479f6bed0efSShaohua Li goto error; 2480f6bed0efSShaohua Li 2481f6b6ec5cSShaohua Li rcu_assign_pointer(conf->log, log); 2482a62ab49eSShaohua Li set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 2483f6bed0efSShaohua Li return 0; 2484e8deb638SChristoph Hellwig 2485f6bed0efSShaohua Li error: 24860576b1c6SShaohua Li md_unregister_thread(&log->reclaim_thread); 24870576b1c6SShaohua Li reclaim_thread: 2488e8deb638SChristoph Hellwig mempool_destroy(log->meta_pool); 2489e8deb638SChristoph Hellwig out_mempool: 2490c38d29b3SChristoph Hellwig bioset_free(log->bs); 2491c38d29b3SChristoph Hellwig 
io_bs: 24925036c390SChristoph Hellwig mempool_destroy(log->io_pool); 24935036c390SChristoph Hellwig io_pool: 2494f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 2495f6bed0efSShaohua Li io_kc: 2496f6bed0efSShaohua Li kfree(log); 2497f6bed0efSShaohua Li return -EINVAL; 2498f6bed0efSShaohua Li } 2499f6bed0efSShaohua Li 2500f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log) 2501f6bed0efSShaohua Li { 25020576b1c6SShaohua Li md_unregister_thread(&log->reclaim_thread); 2503e8deb638SChristoph Hellwig mempool_destroy(log->meta_pool); 2504c38d29b3SChristoph Hellwig bioset_free(log->bs); 25055036c390SChristoph Hellwig mempool_destroy(log->io_pool); 2506f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 2507f6bed0efSShaohua Li kfree(log); 2508f6bed0efSShaohua Li } 2509