/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"

/*
 * metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, the reclaim runs every log->max_free_space.
 * This prevents recovery from scanning the log for too long.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH 256
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE 4
/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode has behavior identical to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};
/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */
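/* run-time state of the journal device (conf->log, one instance per array) */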
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if free space reaches
					 * this size */

	sector_t last_checkpoint;	/* log tail, where the recovery scan
					 * starts */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head, where new data is appended */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. If it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (e.g., reclaim
					 * doesn't wait for a specific io_unit
					 * to switch to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
};

/*
 * An IO range starts at a metadata block and ends at the next metadata
 * block. The io_unit's metadata block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; as we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
	struct bio *split_bio;

	unsigned int has_flush:1;	/* include flush request */
	unsigned int has_fua:1;		/* include fua request */
	unsigned int has_null_flush:1;	/* include empty flush request */
	/*
	 * io isn't sent yet; a flush/fua request can only be submitted
	 * once it is the first IO in the running_ios list
	 */
	unsigned int io_deferred:1;

	struct bio_list flush_barriers;   /* size == 0 flush bios */
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
				 * don't accept new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
};

bool r5c_is_writeback(struct r5l_log *log)
{
	return (log != NULL &&
		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}
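/*
 * The log is a ring: free space is everything between the head
 * (log->log_start) and the tail (log->last_checkpoint). For illustration,
 * with device_size = 100 sectors, last_checkpoint = 90 and log_start = 10,
 * r5l_ring_distance() reports 10 + 100 - 90 = 20 used sectors, so an
 * append of up to 79 sectors still has free space.
 */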
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
			      struct bio_list *return_bi)
{
	struct bio *wbi, *wbi2;

	wbi = dev->written;
	dev->written = NULL;
	while (wbi && wbi->bi_iter.bi_sector <
	       dev->sector + STRIPE_SECTORS) {
		wbi2 = r5_next_bio(wbi, dev->sector);
		if (!raid5_dec_bi_active_stripes(wbi)) {
			md_write_end(conf->mddev);
			bio_list_add(return_bi, wbi);
		}
		wbi = wbi2;
	}
}
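/*
 * Called when cached data of a stripe has safely hit the journal: mark
 * the written pages up to date, hand the pending bios back to the caller
 * and finish the bitmap accounting for the stripe.
 */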
void r5c_handle_cached_data_endio(struct r5conf *conf,
	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
	int i;

	for (i = sh->disks; i--; ) {
		if (sh->dev[i].written) {
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
			r5c_return_dev_pending_writes(conf, &sh->dev[i],
						      return_bi);
			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					STRIPE_SECTORS,
					!test_bit(STRIPE_DEGRADED, &sh->state),
					0);
		}
	}
}

/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
	int total_cached;

	if (!r5c_is_writeback(conf->log))
		return;

	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
		atomic_read(&conf->r5c_cached_full_stripes);

	/*
	 * The following condition is true for either of the following:
	 *   - stripe cache pressure high:
	 *          total_cached > 3/4 min_nr_stripes ||
	 *          empty_inactive_list_nr > 0
	 *   - stripe cache pressure moderate:
	 *          total_cached > 1/2 min_nr_stripes
	 */
	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
	    atomic_read(&conf->empty_inactive_list_nr) > 0)
		r5l_wake_reclaim(conf->log, 0);
}

/*
 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 * stripes in the cache
 */
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
	if (!r5c_is_writeback(conf->log))
		return;

	/*
	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
	 * or a full stripe (chunk size / 4k stripes)
	 */
	if (atomic_read(&conf->r5c_cached_full_stripes) >=
	    min(R5C_FULL_STRIPE_FLUSH_BATCH,
		conf->chunk_sectors >> STRIPE_SHIFT))
		r5l_wake_reclaim(conf->log, 0);
}

/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * Currently, writing-out phase automatically includes all pending writes
 * to the same sector. So the reclaim of each stripe takes up to
 * (conf->raid_disks + 1) pages of log space.
 *
 * To totally avoid deadlock due to log space, the code reserves
 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
 * necessary in most cases.
 *
 * To improve this, we will need writing-out phase to be able to NOT include
 * pending writes, which will reduce the requirement to
 * (conf->max_degraded + 1) pages per stripe in cache.
 */
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;

	if (!r5c_is_writeback(log))
		return 0;

	return BLOCK_SECTORS * (conf->raid_disks + 1) *
		atomic_read(&log->stripe_in_journal_count);
}

/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space.
 */
static inline void r5c_update_log_state(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t free_space;
	sector_t reclaim_space;
	bool wake_reclaim = false;

	if (!r5c_is_writeback(log))
		return;

	free_space = r5l_ring_distance(log, log->log_start,
				       log->last_checkpoint);
	reclaim_space = r5c_log_required_to_flush_cache(conf);
	if (free_space < 2 * reclaim_space)
		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	else {
		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
			wake_reclaim = true;
		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	}
	if (free_space < 3 * reclaim_space)
		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
	else
		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);

	if (wake_reclaim)
		r5l_wake_reclaim(log, 0);
}

/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5l_log *log = conf->log;

	BUG_ON(!r5c_is_writeback(log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);

	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
		atomic_dec(&conf->r5c_cached_partial_stripes);
	}

	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
		atomic_dec(&conf->r5c_cached_full_stripes);
	}
}
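/*
 * Data pages just committed to the journal are marked R5_InJournal and
 * unlocked; no write to the raid disks is scheduled while the stripe
 * stays in caching phase.
 */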
static void r5c_handle_data_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			set_bit(R5_InJournal, &sh->dev[i].flags);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
		}
	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
}

/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
static void r5c_handle_parity_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_bit(R5_InJournal, &sh->dev[i].flags))
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
}

/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		r5c_handle_data_cached(sh);
	} else {
		r5c_handle_parity_cached(sh);
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	}
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		r5c_finish_cache_stripe(sh);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}
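/*
 * need_cache_flush case: io_units that finished writing to the log are
 * parked on io_end_ios until r5l_flush_stripe_to_raid() has flushed the
 * log-device cache; only then may their stripes hit the raid disks.
 */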
static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_io_unit *io_deferred;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	if (!list_empty(&log->running_ios)) {
		/*
		 * FLUSH/FUA io_unit is deferred because of ordering, now we
		 * can dispatch it
		 */
		io_deferred = list_first_entry(&log->running_ios,
					       struct r5l_io_unit, log_sibling);
		if (io_deferred->io_deferred)
			schedule_work(&log->deferred_io_work);
	}

	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);

	if (io->has_null_flush) {
		struct bio *bi;

		WARN_ON(bio_list_empty(&io->flush_barriers));
		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
			bio_endio(bi);
			atomic_dec(&io->pending_stripe);
		}
		if (atomic_read(&io->pending_stripe) == 0)
			__r5l_stripe_write_finished(io);
	}
}
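/*
 * Submit the bio(s) of an io_unit. The FLUSH/FUA bits are applied here,
 * at actual submission time, so that deferred io_units keep the ordering
 * their flush/fua semantics require.
 */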
static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
{
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (io->has_flush)
		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
	if (io->has_fua)
		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
	submit_bio(io->current_bio);

	if (!io->split_bio)
		return;

	if (io->has_flush)
		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
	if (io->has_fua)
		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
	submit_bio(io->split_bio);
}

/* deferred io_unit will be dispatched here */
static void r5l_submit_io_async(struct work_struct *work)
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   deferred_io_work);
	struct r5l_io_unit *io = NULL;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	if (!list_empty(&log->running_ios)) {
		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
				      log_sibling);
		if (!io->io_deferred)
			io = NULL;
		else
			io->io_deferred = 0;
	}
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	if (io)
		r5l_do_submit_io(log, io);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct bio *bio;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;
	bool do_submit = true;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);
	bio = io->current_bio;

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	if (io->has_flush || io->has_fua) {
		if (io != list_first_entry(&log->running_ios,
					   struct r5l_io_unit, log_sibling)) {
			io->io_deferred = 1;
			do_submit = false;
		}
	}
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	if (do_submit)
		r5l_do_submit_io(log, io);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}
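/* advance the log head by one BLOCK_SECTORS block on behalf of @io */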
static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	r5c_update_log_state(log);
	/*
	 * If we filled up the log device start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	bio_list_init(&io->flush_barriers);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}
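/*
 * Make sure log->current_io has room for @payload_size more bytes of
 * metadata; otherwise submit the current io_unit and start a new one.
 * Returns -ENOMEM when a new io_unit cannot be allocated (the io_pool
 * allocation is GFP_ATOMIC and may fail).
 */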
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		BUG_ON(io->split_bio);
		io->split_bio = io->current_bio;
		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, io->split_bio);
		io->need_split_bio = false;
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}
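/*
 * Append the stripe's dirty data pages and its parity to the current
 * io_unit. Called with log->io_mutex held, see r5l_write_stripe().
 */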
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
		io->has_flush = 1;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
			io->has_fua = 1;
			/*
			 * we need to flush the journal to make sure recovery
			 * can reach the data with the fua flag
			 */
			io->has_flush = 1;
		}
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (parity_pages == 2) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else if (parity_pages == 1) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	} else /* Just writing data, not parity, in caching phase */
		BUG_ON(parity_pages != 0);

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return 0;

	if (sh->log_start == MaxSector) {
		BUG_ON(!list_empty(&sh->r5c));
		sh->log_start = io->log_start;
		spin_lock_irq(&log->stripe_in_journal_lock);
		list_add_tail(&sh->r5c,
			      &log->stripe_in_journal_list);
		spin_unlock_irq(&log->stripe_in_journal_lock);
		atomic_inc(&log->stripe_in_journal_count);
	}
	return 0;
}

/* add stripe to no_space_stripes, and then wake up reclaim */
static inline void r5l_add_no_space_stripe(struct r5l_log *log,
					   struct stripe_head *sh)
{
	spin_lock(&log->no_space_stripes_lock);
	list_add_tail(&sh->log_list, &log->no_space_stripes);
	spin_unlock(&log->no_space_stripes_lock);
}
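/*
 * Space accounting for r5l_write_stripe() below: a stripe writing W
 * pages reserves 1 + W blocks, i.e. (1 + W) << (PAGE_SHIFT - 9) sectors:
 * one meta block plus one block per data/parity page.
 */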
/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int write_disks = 0;
	int data_pages, parity_pages;
	int reserve;
	int i;
	int ret = 0;
	bool wake_reclaim = false;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;

		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		if (!r5l_has_free_space(log, reserve)) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	} else {	/* R5C_JOURNAL_MODE_WRITE_BACK */
		/*
		 * log space critical, do not process stripes that are
		 * not in cache yet (sh->log_start == MaxSector).
		 */
		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
		    sh->log_start == MaxSector) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
			reserve = 0;
		} else if (!r5l_has_free_space(log, reserve)) {
			if (sh->log_start == log->last_checkpoint)
				BUG();
			else
				r5l_add_no_space_stripe(log, sh);
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	}

	mutex_unlock(&log->io_mutex);
	if (wake_reclaim)
		r5l_wake_reclaim(log, reserve);
	return 0;
}
void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}
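/*
 * Handle a flush request from the upper layer. Returns 0 when the bio has
 * been completed here, -EAGAIN when the caller still needs to process it,
 * and -ENODEV when there is no journal.
 */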
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		/*
		 * in write through (journal only)
		 * we flush log disk cache first, then write stripe data to
		 * raid disks. So if bio is finished, the log disk cache is
		 * flushed already. The recovery guarantees we can recover
		 * the bio from the log disk, so we don't need to flush again.
		 */
		if (bio->bi_iter.bi_size == 0) {
			bio_endio(bio);
			return 0;
		}
		bio->bi_opf &= ~REQ_PREFLUSH;
	} else {
		/* write back (with cache) */
		if (bio->bi_iter.bi_size == 0) {
			mutex_lock(&log->io_mutex);
			r5l_get_meta(log, 0);
			bio_list_add(&log->current_io->flush_barriers, bio);
			log->current_io->has_flush = 1;
			log->current_io->has_null_flush = 1;
			atomic_inc(&log->current_io->pending_stripe);
			r5l_submit_current_io(log);
			mutex_unlock(&log->io_mutex);
			return 0;
		}
	}
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

/*
 * calculate new last_checkpoint
 * for write through mode, returns log->next_checkpoint
 * for write back, returns log_start of first sh in stripe_in_journal_list
 */
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
	struct stripe_head *sh;
	struct r5l_log *log = conf->log;
	sector_t new_cp;
	unsigned long flags;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return log->next_checkpoint;

	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
	if (list_empty(&conf->log->stripe_in_journal_list)) {
		/* all stripes flushed */
		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
		return log->next_checkpoint;
	}
	sh = list_first_entry(&conf->log->stripe_in_journal_list,
			      struct stripe_head, r5c);
	new_cp = sh->log_start;
	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
	return new_cp;
}
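/*
 * Sectors between the current log tail and the new checkpoint: this is
 * what reclaim can free by advancing log->last_checkpoint.
 */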
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;

	return r5l_ring_distance(log, log->last_checkpoint,
				 r5c_calculate_new_cp(conf));
}

static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	assert_spin_locked(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;

		list_del(&io->log_sibling);
		mempool_free(io, log->io_pool);
		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	struct r5conf *conf = log->rdev->mddev->private;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space ||
	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}
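/*
 * Called when a stripe finishes writing to the raid disks: drop the
 * stripe's reference on its io_unit and complete the io_unit once the
 * last reference is gone.
 */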
void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}
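/*
 * The log-device cache flush completed: the data of every io_unit on
 * flushing_ios is now on stable media, so their stripes may start
 * writing to the raid disks.
 */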
11580576b1c6SShaohua Li */
11590576b1c6SShaohua Li void r5l_flush_stripe_to_raid(struct r5l_log *log)
11600576b1c6SShaohua Li {
1161a8c34f91SShaohua Li bool do_flush;
116256fef7c6SChristoph Hellwig
116356fef7c6SChristoph Hellwig if (!log || !log->need_cache_flush)
11640576b1c6SShaohua Li return;
11650576b1c6SShaohua Li
1166a8c34f91SShaohua Li spin_lock_irq(&log->io_list_lock);
1167a8c34f91SShaohua Li /* flush bio is running */
1168a8c34f91SShaohua Li if (!list_empty(&log->flushing_ios)) {
1169a8c34f91SShaohua Li spin_unlock_irq(&log->io_list_lock);
11700576b1c6SShaohua Li return;
11710576b1c6SShaohua Li }
1172a8c34f91SShaohua Li list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
1173a8c34f91SShaohua Li do_flush = !list_empty(&log->flushing_ios);
11740576b1c6SShaohua Li spin_unlock_irq(&log->io_list_lock);
1175a8c34f91SShaohua Li
1176a8c34f91SShaohua Li if (!do_flush)
1177a8c34f91SShaohua Li return;
1178a8c34f91SShaohua Li bio_reset(&log->flush_bio);
1179a8c34f91SShaohua Li log->flush_bio.bi_bdev = log->rdev->bdev;
1180a8c34f91SShaohua Li log->flush_bio.bi_end_io = r5l_log_flush_endio;
1181796a5cf0SMike Christie bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
11824e49ea4aSMike Christie submit_bio(&log->flush_bio);
11830576b1c6SShaohua Li }
11840576b1c6SShaohua Li
11850576b1c6SShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp);
11864b482044SShaohua Li static void r5l_write_super_and_discard_space(struct r5l_log *log,
11874b482044SShaohua Li sector_t end)
11884b482044SShaohua Li {
11894b482044SShaohua Li struct block_device *bdev = log->rdev->bdev;
11904b482044SShaohua Li struct mddev *mddev;
11914b482044SShaohua Li
11924b482044SShaohua Li r5l_write_super(log, end);
11934b482044SShaohua Li
11944b482044SShaohua Li if (!blk_queue_discard(bdev_get_queue(bdev)))
11954b482044SShaohua Li return;
11964b482044SShaohua Li
11974b482044SShaohua Li mddev = log->rdev->mddev;
11984b482044SShaohua Li /*
11998e018c21SShaohua Li * Discard could zero data, so before discard we must make sure the
12008e018c21SShaohua Li * superblock is updated to the new log tail. Updating the superblock (either
12018e018c21SShaohua Li * by calling md_update_sb() directly or depending on the md thread) must hold
12028e018c21SShaohua Li * the reconfig mutex. On the other hand, raid5_quiesce is called with the
12038e018c21SShaohua Li * reconfig_mutex held. The first step of raid5_quiesce() is waiting
12048e018c21SShaohua Li * for all IO to finish, hence waiting for the reclaim thread, while the reclaim
12058e018c21SShaohua Li * thread is calling this function and waiting for the reconfig mutex. So
12068e018c21SShaohua Li * there is a deadlock. We work around this issue with a trylock.
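 *
 * The cycle, sketched (arrows mean "waits for"):
 *
 *	reclaim thread                      raid5_quiesce() caller
 *	--------------                      ----------------------
 *	r5l_write_super_and_discard_space   holds reconfig_mutex
 *	  -> wants reconfig_mutex           -> waits for all IO, i.e.
 *	                                       for the reclaim thread
 *
 * Taking the mutex unconditionally here would complete the cycle;
 * mddev_trylock() breaks it, at the cost of a possibly missed discard
 * (see the FIXME below).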
12078e018c21SShaohua Li * FIXME: we could miss discard if we can't take reconfig mutex
12084b482044SShaohua Li */
120985ad1d13SGuoqing Jiang set_mask_bits(&mddev->flags, 0,
121085ad1d13SGuoqing Jiang BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
12118e018c21SShaohua Li if (!mddev_trylock(mddev))
12128e018c21SShaohua Li return;
12134b482044SShaohua Li md_update_sb(mddev, 1);
12148e018c21SShaohua Li mddev_unlock(mddev);
12154b482044SShaohua Li
12166e74a9cfSShaohua Li /* discard IO error really doesn't matter, ignore it */
12174b482044SShaohua Li if (log->last_checkpoint < end) {
12184b482044SShaohua Li blkdev_issue_discard(bdev,
12194b482044SShaohua Li log->last_checkpoint + log->rdev->data_offset,
12204b482044SShaohua Li end - log->last_checkpoint, GFP_NOIO, 0);
12214b482044SShaohua Li } else {
12224b482044SShaohua Li blkdev_issue_discard(bdev,
12234b482044SShaohua Li log->last_checkpoint + log->rdev->data_offset,
12244b482044SShaohua Li log->device_size - log->last_checkpoint,
12254b482044SShaohua Li GFP_NOIO, 0);
12264b482044SShaohua Li blkdev_issue_discard(bdev, log->rdev->data_offset, end,
12274b482044SShaohua Li GFP_NOIO, 0);
12284b482044SShaohua Li }
12294b482044SShaohua Li }
12304b482044SShaohua Li
1231a39f7afdSSong Liu /*
1232a39f7afdSSong Liu * r5c_flush_stripe moves a stripe from the cached list to the handle_list. When called,
1233a39f7afdSSong Liu * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
1234a39f7afdSSong Liu *
1235a39f7afdSSong Liu * must hold conf->device_lock
1236a39f7afdSSong Liu */
1237a39f7afdSSong Liu static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
1238a39f7afdSSong Liu {
1239a39f7afdSSong Liu BUG_ON(list_empty(&sh->lru));
1240a39f7afdSSong Liu BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1241a39f7afdSSong Liu BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
1242a39f7afdSSong Liu
1243a39f7afdSSong Liu /*
1244a39f7afdSSong Liu * The stripe is not ON_RELEASE_LIST, so it is safe to call
1245a39f7afdSSong Liu * raid5_release_stripe() while holding conf->device_lock
1246a39f7afdSSong Liu */
1247a39f7afdSSong Liu BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
1248a39f7afdSSong Liu assert_spin_locked(&conf->device_lock);
1249a39f7afdSSong Liu
1250a39f7afdSSong Liu list_del_init(&sh->lru);
1251a39f7afdSSong Liu atomic_inc(&sh->count);
1252a39f7afdSSong Liu
1253a39f7afdSSong Liu set_bit(STRIPE_HANDLE, &sh->state);
1254a39f7afdSSong Liu atomic_inc(&conf->active_stripes);
1255a39f7afdSSong Liu r5c_make_stripe_write_out(sh);
1256a39f7afdSSong Liu
1257a39f7afdSSong Liu raid5_release_stripe(sh);
1258a39f7afdSSong Liu }
1259a39f7afdSSong Liu
1260a39f7afdSSong Liu /*
1261a39f7afdSSong Liu * if num == 0, flush all full stripes
1262a39f7afdSSong Liu * if num > 0, flush all full stripes. If fewer than num full stripes are
1263a39f7afdSSong Liu * flushed, flush some partial stripes until a total of num stripes are
1264a39f7afdSSong Liu * flushed or there are no more cached stripes.
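 *
 * Usage example (illustrative): r5c_flush_cache(conf, 0) flushes only the
 * full stripes, while r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP)
 * continues into the partial list until that many stripes have been sent
 * to the write-out phase.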
1265a39f7afdSSong Liu */
1266a39f7afdSSong Liu void r5c_flush_cache(struct r5conf *conf, int num)
1267a39f7afdSSong Liu {
1268a39f7afdSSong Liu int count;
1269a39f7afdSSong Liu struct stripe_head *sh, *next;
1270a39f7afdSSong Liu
1271a39f7afdSSong Liu assert_spin_locked(&conf->device_lock);
1272a39f7afdSSong Liu if (!conf->log)
1273a39f7afdSSong Liu return;
1274a39f7afdSSong Liu
1275a39f7afdSSong Liu count = 0;
1276a39f7afdSSong Liu list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
1277a39f7afdSSong Liu r5c_flush_stripe(conf, sh);
1278a39f7afdSSong Liu count++;
1279a39f7afdSSong Liu }
1280a39f7afdSSong Liu
1281a39f7afdSSong Liu if (count >= num)
1282a39f7afdSSong Liu return;
1283a39f7afdSSong Liu list_for_each_entry_safe(sh, next,
1284a39f7afdSSong Liu &conf->r5c_partial_stripe_list, lru) {
1285a39f7afdSSong Liu r5c_flush_stripe(conf, sh);
1286a39f7afdSSong Liu if (++count >= num)
1287a39f7afdSSong Liu break;
1288a39f7afdSSong Liu }
1289a39f7afdSSong Liu }
1290a39f7afdSSong Liu
1291a39f7afdSSong Liu static void r5c_do_reclaim(struct r5conf *conf)
1292a39f7afdSSong Liu {
1293a39f7afdSSong Liu struct r5l_log *log = conf->log;
1294a39f7afdSSong Liu struct stripe_head *sh;
1295a39f7afdSSong Liu int count = 0;
1296a39f7afdSSong Liu unsigned long flags;
1297a39f7afdSSong Liu int total_cached;
1298a39f7afdSSong Liu int stripes_to_flush;
1299a39f7afdSSong Liu
1300a39f7afdSSong Liu if (!r5c_is_writeback(log))
1301a39f7afdSSong Liu return;
1302a39f7afdSSong Liu
1303a39f7afdSSong Liu total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
1304a39f7afdSSong Liu atomic_read(&conf->r5c_cached_full_stripes);
1305a39f7afdSSong Liu
1306a39f7afdSSong Liu if (total_cached > conf->min_nr_stripes * 3 / 4 ||
1307a39f7afdSSong Liu atomic_read(&conf->empty_inactive_list_nr) > 0)
1308a39f7afdSSong Liu /*
1309a39f7afdSSong Liu * if stripe cache pressure is high, flush all full stripes and
1310a39f7afdSSong Liu * some partial stripes
1311a39f7afdSSong Liu */
1312a39f7afdSSong Liu stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1313a39f7afdSSong Liu else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1314a39f7afdSSong Liu atomic_read(&conf->r5c_cached_full_stripes) >
1315a39f7afdSSong Liu R5C_FULL_STRIPE_FLUSH_BATCH)
1316a39f7afdSSong Liu /*
1317a39f7afdSSong Liu * if stripe cache pressure is moderate, or if there are many full
1318a39f7afdSSong Liu * stripes, flush all full stripes
1319a39f7afdSSong Liu */
1320a39f7afdSSong Liu stripes_to_flush = 0;
1321a39f7afdSSong Liu else
1322a39f7afdSSong Liu /* no need to flush */
1323a39f7afdSSong Liu stripes_to_flush = -1;
1324a39f7afdSSong Liu
1325a39f7afdSSong Liu if (stripes_to_flush >= 0) {
1326a39f7afdSSong Liu spin_lock_irqsave(&conf->device_lock, flags);
1327a39f7afdSSong Liu r5c_flush_cache(conf, stripes_to_flush);
1328a39f7afdSSong Liu spin_unlock_irqrestore(&conf->device_lock, flags);
1329a39f7afdSSong Liu }
1330a39f7afdSSong Liu
1331a39f7afdSSong Liu /* if log space is tight, flush stripes on stripe_in_journal_list */
1332a39f7afdSSong Liu if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1333a39f7afdSSong Liu spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1334a39f7afdSSong Liu spin_lock(&conf->device_lock);
1335a39f7afdSSong Liu list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1336a39f7afdSSong Liu /*
1337a39f7afdSSong Liu * stripes on stripe_in_journal_list could be in any
1338a39f7afdSSong Liu * state of the stripe_cache state machine.
In this 1339a39f7afdSSong Liu * case, we only want to flush stripe on 1340a39f7afdSSong Liu * r5c_cached_full/partial_stripes. The following 1341a39f7afdSSong Liu * condition makes sure the stripe is on one of the 1342a39f7afdSSong Liu * two lists. 1343a39f7afdSSong Liu */ 1344a39f7afdSSong Liu if (!list_empty(&sh->lru) && 1345a39f7afdSSong Liu !test_bit(STRIPE_HANDLE, &sh->state) && 1346a39f7afdSSong Liu atomic_read(&sh->count) == 0) { 1347a39f7afdSSong Liu r5c_flush_stripe(conf, sh); 1348a39f7afdSSong Liu } 1349a39f7afdSSong Liu if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 1350a39f7afdSSong Liu break; 1351a39f7afdSSong Liu } 1352a39f7afdSSong Liu spin_unlock(&conf->device_lock); 1353a39f7afdSSong Liu spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 1354a39f7afdSSong Liu } 1355f687a33eSSong Liu 1356f687a33eSSong Liu if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 1357f687a33eSSong Liu r5l_run_no_space_stripes(log); 1358f687a33eSSong Liu 1359a39f7afdSSong Liu md_wakeup_thread(conf->mddev->thread); 1360a39f7afdSSong Liu } 1361a39f7afdSSong Liu 13620576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log) 13630576b1c6SShaohua Li { 1364a39f7afdSSong Liu struct r5conf *conf = log->rdev->mddev->private; 13650576b1c6SShaohua Li sector_t reclaim_target = xchg(&log->reclaim_target, 0); 136617036461SChristoph Hellwig sector_t reclaimable; 136717036461SChristoph Hellwig sector_t next_checkpoint; 1368a39f7afdSSong Liu bool write_super; 13690576b1c6SShaohua Li 13700576b1c6SShaohua Li spin_lock_irq(&log->io_list_lock); 1371a39f7afdSSong Liu write_super = r5l_reclaimable_space(log) > log->max_free_space || 1372a39f7afdSSong Liu reclaim_target != 0 || !list_empty(&log->no_space_stripes); 13730576b1c6SShaohua Li /* 13740576b1c6SShaohua Li * move proper io_unit to reclaim list. We should not change the order. 13750576b1c6SShaohua Li * reclaimable/unreclaimable io_unit can be mixed in the list, we 13760576b1c6SShaohua Li * shouldn't reuse space of an unreclaimable io_unit 13770576b1c6SShaohua Li */ 13780576b1c6SShaohua Li while (1) { 137917036461SChristoph Hellwig reclaimable = r5l_reclaimable_space(log); 138017036461SChristoph Hellwig if (reclaimable >= reclaim_target || 13810576b1c6SShaohua Li (list_empty(&log->running_ios) && 13820576b1c6SShaohua Li list_empty(&log->io_end_ios) && 1383a8c34f91SShaohua Li list_empty(&log->flushing_ios) && 138404732f74SChristoph Hellwig list_empty(&log->finished_ios))) 13850576b1c6SShaohua Li break; 13860576b1c6SShaohua Li 138717036461SChristoph Hellwig md_wakeup_thread(log->rdev->mddev->thread); 138817036461SChristoph Hellwig wait_event_lock_irq(log->iounit_wait, 138917036461SChristoph Hellwig r5l_reclaimable_space(log) > reclaimable, 139017036461SChristoph Hellwig log->io_list_lock); 13910576b1c6SShaohua Li } 139217036461SChristoph Hellwig 1393a39f7afdSSong Liu next_checkpoint = r5c_calculate_new_cp(conf); 13940576b1c6SShaohua Li spin_unlock_irq(&log->io_list_lock); 13950576b1c6SShaohua Li 139617036461SChristoph Hellwig BUG_ON(reclaimable < 0); 1397a39f7afdSSong Liu 1398a39f7afdSSong Liu if (reclaimable == 0 || !write_super) 13990576b1c6SShaohua Li return; 14000576b1c6SShaohua Li 14010576b1c6SShaohua Li /* 14020576b1c6SShaohua Li * write_super will flush cache of each raid disk. 
We must write super 14030576b1c6SShaohua Li * here, because the log area might be reused soon and we don't want to 14040576b1c6SShaohua Li * confuse recovery 14050576b1c6SShaohua Li */ 14064b482044SShaohua Li r5l_write_super_and_discard_space(log, next_checkpoint); 14070576b1c6SShaohua Li 14080576b1c6SShaohua Li mutex_lock(&log->io_mutex); 140917036461SChristoph Hellwig log->last_checkpoint = next_checkpoint; 1410a39f7afdSSong Liu r5c_update_log_state(log); 14110576b1c6SShaohua Li mutex_unlock(&log->io_mutex); 14120576b1c6SShaohua Li 141317036461SChristoph Hellwig r5l_run_no_space_stripes(log); 14140576b1c6SShaohua Li } 14150576b1c6SShaohua Li 14160576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread) 14170576b1c6SShaohua Li { 14180576b1c6SShaohua Li struct mddev *mddev = thread->mddev; 14190576b1c6SShaohua Li struct r5conf *conf = mddev->private; 14200576b1c6SShaohua Li struct r5l_log *log = conf->log; 14210576b1c6SShaohua Li 14220576b1c6SShaohua Li if (!log) 14230576b1c6SShaohua Li return; 1424a39f7afdSSong Liu r5c_do_reclaim(conf); 14250576b1c6SShaohua Li r5l_do_reclaim(log); 14260576b1c6SShaohua Li } 14270576b1c6SShaohua Li 1428a39f7afdSSong Liu void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 1429f6bed0efSShaohua Li { 14300576b1c6SShaohua Li unsigned long target; 14310576b1c6SShaohua Li unsigned long new = (unsigned long)space; /* overflow in theory */ 14320576b1c6SShaohua Li 1433a39f7afdSSong Liu if (!log) 1434a39f7afdSSong Liu return; 14350576b1c6SShaohua Li do { 14360576b1c6SShaohua Li target = log->reclaim_target; 14370576b1c6SShaohua Li if (new < target) 14380576b1c6SShaohua Li return; 14390576b1c6SShaohua Li } while (cmpxchg(&log->reclaim_target, target, new) != target); 14400576b1c6SShaohua Li md_wakeup_thread(log->reclaim_thread); 1441f6bed0efSShaohua Li } 1442f6bed0efSShaohua Li 1443e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state) 1444e6c033f7SShaohua Li { 14454b482044SShaohua Li struct mddev *mddev; 1446e6c033f7SShaohua Li if (!log || state == 2) 1447e6c033f7SShaohua Li return; 1448ce1ccd07SShaohua Li if (state == 0) 1449ce1ccd07SShaohua Li kthread_unpark(log->reclaim_thread->tsk); 1450ce1ccd07SShaohua Li else if (state == 1) { 14514b482044SShaohua Li /* make sure r5l_write_super_and_discard_space exits */ 14524b482044SShaohua Li mddev = log->rdev->mddev; 14534b482044SShaohua Li wake_up(&mddev->sb_wait); 1454ce1ccd07SShaohua Li kthread_park(log->reclaim_thread->tsk); 1455a39f7afdSSong Liu r5l_wake_reclaim(log, MaxSector); 1456e6c033f7SShaohua Li r5l_do_reclaim(log); 1457e6c033f7SShaohua Li } 1458e6c033f7SShaohua Li } 1459e6c033f7SShaohua Li 14606e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf) 14616e74a9cfSShaohua Li { 1462f6b6ec5cSShaohua Li struct r5l_log *log; 1463f6b6ec5cSShaohua Li bool ret; 14647dde2ad3SShaohua Li /* don't allow write if journal disk is missing */ 1465f6b6ec5cSShaohua Li rcu_read_lock(); 1466f6b6ec5cSShaohua Li log = rcu_dereference(conf->log); 1467f6b6ec5cSShaohua Li 1468f6b6ec5cSShaohua Li if (!log) 1469f6b6ec5cSShaohua Li ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1470f6b6ec5cSShaohua Li else 1471f6b6ec5cSShaohua Li ret = test_bit(Faulty, &log->rdev->flags); 1472f6b6ec5cSShaohua Li rcu_read_unlock(); 1473f6b6ec5cSShaohua Li return ret; 14746e74a9cfSShaohua Li } 14756e74a9cfSShaohua Li 1476355810d1SShaohua Li struct r5l_recovery_ctx { 1477355810d1SShaohua Li struct page *meta_page; /* current meta */ 1478355810d1SShaohua Li sector_t meta_total_blocks; /* total size of 
current meta and data */ 1479355810d1SShaohua Li sector_t pos; /* recovery position */ 1480355810d1SShaohua Li u64 seq; /* recovery position seq */ 1481b4c625c6SSong Liu int data_parity_stripes; /* number of data_parity stripes */ 1482b4c625c6SSong Liu int data_only_stripes; /* number of data_only stripes */ 1483b4c625c6SSong Liu struct list_head cached_list; 1484355810d1SShaohua Li }; 1485355810d1SShaohua Li 14869ed988f5SSong Liu static int r5l_recovery_read_meta_block(struct r5l_log *log, 1487355810d1SShaohua Li struct r5l_recovery_ctx *ctx) 1488355810d1SShaohua Li { 1489355810d1SShaohua Li struct page *page = ctx->meta_page; 1490355810d1SShaohua Li struct r5l_meta_block *mb; 1491355810d1SShaohua Li u32 crc, stored_crc; 1492355810d1SShaohua Li 1493796a5cf0SMike Christie if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1494796a5cf0SMike Christie false)) 1495355810d1SShaohua Li return -EIO; 1496355810d1SShaohua Li 1497355810d1SShaohua Li mb = page_address(page); 1498355810d1SShaohua Li stored_crc = le32_to_cpu(mb->checksum); 1499355810d1SShaohua Li mb->checksum = 0; 1500355810d1SShaohua Li 1501355810d1SShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1502355810d1SShaohua Li le64_to_cpu(mb->seq) != ctx->seq || 1503355810d1SShaohua Li mb->version != R5LOG_VERSION || 1504355810d1SShaohua Li le64_to_cpu(mb->position) != ctx->pos) 1505355810d1SShaohua Li return -EINVAL; 1506355810d1SShaohua Li 15075cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1508355810d1SShaohua Li if (stored_crc != crc) 1509355810d1SShaohua Li return -EINVAL; 1510355810d1SShaohua Li 1511355810d1SShaohua Li if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1512355810d1SShaohua Li return -EINVAL; 1513355810d1SShaohua Li 1514355810d1SShaohua Li ctx->meta_total_blocks = BLOCK_SECTORS; 1515355810d1SShaohua Li 1516355810d1SShaohua Li return 0; 1517355810d1SShaohua Li } 1518355810d1SShaohua Li 15199ed988f5SSong Liu static void 15209ed988f5SSong Liu r5l_recovery_create_empty_meta_block(struct r5l_log *log, 15219ed988f5SSong Liu struct page *page, 15229ed988f5SSong Liu sector_t pos, u64 seq) 1523355810d1SShaohua Li { 1524355810d1SShaohua Li struct r5l_meta_block *mb; 1525355810d1SShaohua Li u32 crc; 1526355810d1SShaohua Li 1527355810d1SShaohua Li mb = page_address(page); 15289ed988f5SSong Liu clear_page(mb); 1529355810d1SShaohua Li mb->magic = cpu_to_le32(R5LOG_MAGIC); 1530355810d1SShaohua Li mb->version = R5LOG_VERSION; 1531355810d1SShaohua Li mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 1532355810d1SShaohua Li mb->seq = cpu_to_le64(seq); 1533355810d1SShaohua Li mb->position = cpu_to_le64(pos); 15345cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1535355810d1SShaohua Li mb->checksum = cpu_to_le32(crc); 15369ed988f5SSong Liu } 1537355810d1SShaohua Li 15389ed988f5SSong Liu static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 15399ed988f5SSong Liu u64 seq) 15409ed988f5SSong Liu { 15419ed988f5SSong Liu struct page *page; 15429ed988f5SSong Liu 15439ed988f5SSong Liu page = alloc_page(GFP_KERNEL); 15449ed988f5SSong Liu if (!page) 15459ed988f5SSong Liu return -ENOMEM; 15469ed988f5SSong Liu r5l_recovery_create_empty_meta_block(log, page, pos, seq); 1547796a5cf0SMike Christie if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 1548796a5cf0SMike Christie WRITE_FUA, false)) { 1549355810d1SShaohua Li __free_page(page); 1550355810d1SShaohua Li return -EIO; 1551355810d1SShaohua Li } 1552355810d1SShaohua Li __free_page(page); 
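/*
 * The FUA write above guarantees the empty meta block is on stable
 * media before we report success to the caller.
 */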
1553355810d1SShaohua Li return 0; 1554355810d1SShaohua Li } 1555355810d1SShaohua Li 1556b4c625c6SSong Liu /* 1557b4c625c6SSong Liu * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 1558b4c625c6SSong Liu * to mark valid (potentially not flushed) data in the journal. 1559b4c625c6SSong Liu * 1560b4c625c6SSong Liu * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 1561b4c625c6SSong Liu * so there should not be any mismatch here. 1562b4c625c6SSong Liu */ 1563b4c625c6SSong Liu static void r5l_recovery_load_data(struct r5l_log *log, 1564b4c625c6SSong Liu struct stripe_head *sh, 1565b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1566b4c625c6SSong Liu struct r5l_payload_data_parity *payload, 1567b4c625c6SSong Liu sector_t log_offset) 1568b4c625c6SSong Liu { 1569b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1570b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1571b4c625c6SSong Liu int dd_idx; 1572b4c625c6SSong Liu 1573b4c625c6SSong Liu raid5_compute_sector(conf, 1574b4c625c6SSong Liu le64_to_cpu(payload->location), 0, 1575b4c625c6SSong Liu &dd_idx, sh); 1576b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1577b4c625c6SSong Liu sh->dev[dd_idx].page, REQ_OP_READ, 0, false); 1578b4c625c6SSong Liu sh->dev[dd_idx].log_checksum = 1579b4c625c6SSong Liu le32_to_cpu(payload->checksum[0]); 1580b4c625c6SSong Liu ctx->meta_total_blocks += BLOCK_SECTORS; 1581b4c625c6SSong Liu 1582b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 1583b4c625c6SSong Liu set_bit(STRIPE_R5C_CACHING, &sh->state); 1584b4c625c6SSong Liu } 1585b4c625c6SSong Liu 1586b4c625c6SSong Liu static void r5l_recovery_load_parity(struct r5l_log *log, 1587b4c625c6SSong Liu struct stripe_head *sh, 1588b4c625c6SSong Liu struct r5l_recovery_ctx *ctx, 1589b4c625c6SSong Liu struct r5l_payload_data_parity *payload, 1590b4c625c6SSong Liu sector_t log_offset) 1591b4c625c6SSong Liu { 1592b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev; 1593b4c625c6SSong Liu struct r5conf *conf = mddev->private; 1594b4c625c6SSong Liu 1595b4c625c6SSong Liu ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 1596b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1597b4c625c6SSong Liu sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); 1598b4c625c6SSong Liu sh->dev[sh->pd_idx].log_checksum = 1599b4c625c6SSong Liu le32_to_cpu(payload->checksum[0]); 1600b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 1601b4c625c6SSong Liu 1602b4c625c6SSong Liu if (sh->qd_idx >= 0) { 1603b4c625c6SSong Liu sync_page_io(log->rdev, 1604b4c625c6SSong Liu r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1605b4c625c6SSong Liu PAGE_SIZE, sh->dev[sh->qd_idx].page, 1606b4c625c6SSong Liu REQ_OP_READ, 0, false); 1607b4c625c6SSong Liu sh->dev[sh->qd_idx].log_checksum = 1608b4c625c6SSong Liu le32_to_cpu(payload->checksum[1]); 1609b4c625c6SSong Liu set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 1610b4c625c6SSong Liu } 1611b4c625c6SSong Liu clear_bit(STRIPE_R5C_CACHING, &sh->state); 1612b4c625c6SSong Liu } 1613b4c625c6SSong Liu 1614b4c625c6SSong Liu static void r5l_recovery_reset_stripe(struct stripe_head *sh) 1615b4c625c6SSong Liu { 1616b4c625c6SSong Liu int i; 1617b4c625c6SSong Liu 1618b4c625c6SSong Liu sh->state = 0; 1619b4c625c6SSong Liu sh->log_start = MaxSector; 1620b4c625c6SSong Liu for (i = sh->disks; i--; ) 1621b4c625c6SSong Liu sh->dev[i].flags = 0; 1622b4c625c6SSong Liu } 1623b4c625c6SSong Liu 1624b4c625c6SSong Liu static void 1625b4c625c6SSong Liu 
r5l_recovery_replay_one_stripe(struct r5conf *conf,
1626b4c625c6SSong Liu struct stripe_head *sh,
1627b4c625c6SSong Liu struct r5l_recovery_ctx *ctx)
1628b4c625c6SSong Liu {
1629b4c625c6SSong Liu struct md_rdev *rdev, *rrdev;
1630b4c625c6SSong Liu int disk_index;
1631b4c625c6SSong Liu int data_count = 0;
1632b4c625c6SSong Liu
1633b4c625c6SSong Liu for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1634b4c625c6SSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1635b4c625c6SSong Liu continue;
1636b4c625c6SSong Liu if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
1637b4c625c6SSong Liu continue;
1638b4c625c6SSong Liu data_count++;
1639b4c625c6SSong Liu }
1640b4c625c6SSong Liu
1641b4c625c6SSong Liu /*
1642b4c625c6SSong Liu * stripes that only have parity must have been flushed
1643b4c625c6SSong Liu * before the crash that we are now recovering from, so
1644b4c625c6SSong Liu * there is nothing more to recover.
1645b4c625c6SSong Liu */
1646b4c625c6SSong Liu if (data_count == 0)
1647b4c625c6SSong Liu goto out;
1648b4c625c6SSong Liu
1649b4c625c6SSong Liu for (disk_index = 0; disk_index < sh->disks; disk_index++) {
1650b4c625c6SSong Liu if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
1651b4c625c6SSong Liu continue;
1652b4c625c6SSong Liu
1653b4c625c6SSong Liu /* in case device is broken */
1654b4c625c6SSong Liu rcu_read_lock();
1655b4c625c6SSong Liu rdev = rcu_dereference(conf->disks[disk_index].rdev);
1656b4c625c6SSong Liu if (rdev) {
1657b4c625c6SSong Liu atomic_inc(&rdev->nr_pending);
1658b4c625c6SSong Liu rcu_read_unlock();
1659b4c625c6SSong Liu sync_page_io(rdev, sh->sector, PAGE_SIZE,
1660b4c625c6SSong Liu sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1661b4c625c6SSong Liu false);
1662b4c625c6SSong Liu rdev_dec_pending(rdev, rdev->mddev);
1663b4c625c6SSong Liu rcu_read_lock();
1664b4c625c6SSong Liu }
1665b4c625c6SSong Liu rrdev = rcu_dereference(conf->disks[disk_index].replacement);
1666b4c625c6SSong Liu if (rrdev) {
1667b4c625c6SSong Liu atomic_inc(&rrdev->nr_pending);
1668b4c625c6SSong Liu rcu_read_unlock();
1669b4c625c6SSong Liu sync_page_io(rrdev, sh->sector, PAGE_SIZE,
1670b4c625c6SSong Liu sh->dev[disk_index].page, REQ_OP_WRITE, 0,
1671b4c625c6SSong Liu false);
1672b4c625c6SSong Liu rdev_dec_pending(rrdev, rrdev->mddev);
1673b4c625c6SSong Liu rcu_read_lock();
1674b4c625c6SSong Liu }
1675b4c625c6SSong Liu rcu_read_unlock();
1676b4c625c6SSong Liu }
1677b4c625c6SSong Liu ctx->data_parity_stripes++;
1678b4c625c6SSong Liu out:
1679b4c625c6SSong Liu r5l_recovery_reset_stripe(sh);
1680b4c625c6SSong Liu }
1681b4c625c6SSong Liu
1682b4c625c6SSong Liu static struct stripe_head *
1683b4c625c6SSong Liu r5c_recovery_alloc_stripe(struct r5conf *conf,
1684b4c625c6SSong Liu sector_t stripe_sect,
1685b4c625c6SSong Liu sector_t log_start)
1686b4c625c6SSong Liu {
1687b4c625c6SSong Liu struct stripe_head *sh;
1688b4c625c6SSong Liu
1689b4c625c6SSong Liu sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
1690b4c625c6SSong Liu if (!sh)
1691b4c625c6SSong Liu return NULL; /* no more stripes available */
1692b4c625c6SSong Liu
1693b4c625c6SSong Liu r5l_recovery_reset_stripe(sh);
1694b4c625c6SSong Liu sh->log_start = log_start;
1695b4c625c6SSong Liu
1696b4c625c6SSong Liu return sh;
1697b4c625c6SSong Liu }
1698b4c625c6SSong Liu
1699b4c625c6SSong Liu static struct stripe_head *
1700b4c625c6SSong Liu r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
1701b4c625c6SSong Liu {
1702b4c625c6SSong Liu struct stripe_head *sh;
1703b4c625c6SSong Liu
1704b4c625c6SSong
Liu list_for_each_entry(sh, list, lru) 1705b4c625c6SSong Liu if (sh->sector == sect) 1706b4c625c6SSong Liu return sh; 1707b4c625c6SSong Liu return NULL; 1708b4c625c6SSong Liu } 1709b4c625c6SSong Liu 1710b4c625c6SSong Liu static void 1711b4c625c6SSong Liu r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 1712b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1713b4c625c6SSong Liu { 1714b4c625c6SSong Liu struct stripe_head *sh, *next; 1715b4c625c6SSong Liu 1716b4c625c6SSong Liu list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 1717b4c625c6SSong Liu r5l_recovery_reset_stripe(sh); 1718b4c625c6SSong Liu list_del_init(&sh->lru); 1719b4c625c6SSong Liu raid5_release_stripe(sh); 1720b4c625c6SSong Liu } 1721b4c625c6SSong Liu } 1722b4c625c6SSong Liu 1723b4c625c6SSong Liu static void 1724b4c625c6SSong Liu r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 1725b4c625c6SSong Liu struct r5l_recovery_ctx *ctx) 1726b4c625c6SSong Liu { 1727b4c625c6SSong Liu struct stripe_head *sh, *next; 1728b4c625c6SSong Liu 1729b4c625c6SSong Liu list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 1730b4c625c6SSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 1731b4c625c6SSong Liu r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 1732b4c625c6SSong Liu list_del_init(&sh->lru); 1733b4c625c6SSong Liu raid5_release_stripe(sh); 1734b4c625c6SSong Liu } 1735b4c625c6SSong Liu } 1736b4c625c6SSong Liu 1737b4c625c6SSong Liu /* if matches return 0; otherwise return -EINVAL */ 1738b4c625c6SSong Liu static int 1739b4c625c6SSong Liu r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, 1740b4c625c6SSong Liu sector_t log_offset, __le32 log_checksum) 1741b4c625c6SSong Liu { 1742b4c625c6SSong Liu void *addr; 1743b4c625c6SSong Liu u32 checksum; 1744b4c625c6SSong Liu 1745b4c625c6SSong Liu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1746b4c625c6SSong Liu page, REQ_OP_READ, 0, false); 1747b4c625c6SSong Liu addr = kmap_atomic(page); 1748b4c625c6SSong Liu checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 1749b4c625c6SSong Liu kunmap_atomic(addr); 1750b4c625c6SSong Liu return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL;
1751b4c625c6SSong Liu }
1752b4c625c6SSong Liu
1753b4c625c6SSong Liu /*
1754b4c625c6SSong Liu * before loading data to the stripe cache, we need to verify the checksum for all data;
1755b4c625c6SSong Liu * if there is a mismatch for any data page, we drop all data in the meta block
1756b4c625c6SSong Liu */
1757b4c625c6SSong Liu static int
1758b4c625c6SSong Liu r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
1759b4c625c6SSong Liu struct r5l_recovery_ctx *ctx)
1760b4c625c6SSong Liu {
1761b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev;
1762b4c625c6SSong Liu struct r5conf *conf = mddev->private;
1763b4c625c6SSong Liu struct r5l_meta_block *mb = page_address(ctx->meta_page);
1764b4c625c6SSong Liu sector_t mb_offset = sizeof(struct r5l_meta_block);
1765b4c625c6SSong Liu sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1766b4c625c6SSong Liu struct page *page;
1767b4c625c6SSong Liu struct r5l_payload_data_parity *payload;
1768b4c625c6SSong Liu
1769b4c625c6SSong Liu page = alloc_page(GFP_KERNEL);
1770b4c625c6SSong Liu if (!page)
1771b4c625c6SSong Liu return -ENOMEM;
1772b4c625c6SSong Liu
1773b4c625c6SSong Liu while (mb_offset < le32_to_cpu(mb->meta_size)) {
1774b4c625c6SSong Liu payload = (void *)mb + mb_offset;
1775b4c625c6SSong Liu
1776b4c625c6SSong Liu if (payload->header.type == R5LOG_PAYLOAD_DATA) {
1777b4c625c6SSong Liu if (r5l_recovery_verify_data_checksum(
1778b4c625c6SSong Liu log, page, log_offset,
1779b4c625c6SSong Liu payload->checksum[0]) < 0)
1780b4c625c6SSong Liu goto mismatch;
1781b4c625c6SSong Liu } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
1782b4c625c6SSong Liu if (r5l_recovery_verify_data_checksum(
1783b4c625c6SSong Liu log, page, log_offset,
1784b4c625c6SSong Liu payload->checksum[0]) < 0)
1785b4c625c6SSong Liu goto mismatch;
1786b4c625c6SSong Liu if (conf->max_degraded == 2 && /* q for RAID 6 */
1787b4c625c6SSong Liu r5l_recovery_verify_data_checksum(
1788b4c625c6SSong Liu log, page,
1789b4c625c6SSong Liu r5l_ring_add(log, log_offset,
1790b4c625c6SSong Liu BLOCK_SECTORS),
1791b4c625c6SSong Liu payload->checksum[1]) < 0)
1792b4c625c6SSong Liu goto mismatch;
1793b4c625c6SSong Liu } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
1794b4c625c6SSong Liu goto mismatch;
1795b4c625c6SSong Liu
1796b4c625c6SSong Liu log_offset = r5l_ring_add(log, log_offset,
1797b4c625c6SSong Liu le32_to_cpu(payload->size));
1798b4c625c6SSong Liu
1799b4c625c6SSong Liu mb_offset += sizeof(struct r5l_payload_data_parity) +
1800b4c625c6SSong Liu sizeof(__le32) *
1801b4c625c6SSong Liu (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
1802b4c625c6SSong Liu }
1803b4c625c6SSong Liu
1804b4c625c6SSong Liu put_page(page);
1805b4c625c6SSong Liu return 0;
1806b4c625c6SSong Liu
1807b4c625c6SSong Liu mismatch:
1808b4c625c6SSong Liu put_page(page);
1809b4c625c6SSong Liu return -EINVAL;
1810b4c625c6SSong Liu }
1811b4c625c6SSong Liu
1812b4c625c6SSong Liu /*
1813b4c625c6SSong Liu * Analyze all data/parity pages in one meta block
1814b4c625c6SSong Liu * Returns:
1815b4c625c6SSong Liu * 0 for success
1816b4c625c6SSong Liu * -EINVAL for unknown payload type
1817b4c625c6SSong Liu * -EAGAIN for checksum mismatch of data page
1818b4c625c6SSong Liu * -ENOMEM for running out of memory (alloc_page failed or ran out of stripes)
1819b4c625c6SSong Liu */
1820b4c625c6SSong Liu static int
1821b4c625c6SSong Liu r5c_recovery_analyze_meta_block(struct r5l_log *log,
1822b4c625c6SSong Liu struct r5l_recovery_ctx *ctx,
1823b4c625c6SSong Liu struct list_head *cached_stripe_list)
1824b4c625c6SSong Liu {
1825b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev;
1826b4c625c6SSong Liu struct r5conf *conf = mddev->private;
1827b4c625c6SSong Liu struct r5l_meta_block *mb;
1828b4c625c6SSong Liu struct r5l_payload_data_parity *payload;
1829b4c625c6SSong Liu int mb_offset;
1830b4c625c6SSong Liu sector_t log_offset;
1831b4c625c6SSong Liu sector_t stripe_sect;
1832b4c625c6SSong Liu struct stripe_head *sh;
1833b4c625c6SSong Liu int ret;
1834b4c625c6SSong Liu
1835b4c625c6SSong Liu /*
1836b4c625c6SSong Liu * for a mismatch in the data blocks, we will drop all data in this mb, but
1837b4c625c6SSong Liu * we will still read the next mb for other data with the FLUSH flag, as
1838b4c625c6SSong Liu * an io_unit could finish out of order.
1839b4c625c6SSong Liu */
1840b4c625c6SSong Liu ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
1841b4c625c6SSong Liu if (ret == -EINVAL)
1842b4c625c6SSong Liu return -EAGAIN;
1843b4c625c6SSong Liu else if (ret)
1844b4c625c6SSong Liu return ret; /* -ENOMEM due to alloc_page() failure */
1845b4c625c6SSong Liu
1846b4c625c6SSong Liu mb = page_address(ctx->meta_page);
1847b4c625c6SSong Liu mb_offset = sizeof(struct r5l_meta_block);
1848b4c625c6SSong Liu log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
1849b4c625c6SSong Liu
1850b4c625c6SSong Liu while (mb_offset < le32_to_cpu(mb->meta_size)) {
1851b4c625c6SSong Liu int dd;
1852b4c625c6SSong Liu
1853b4c625c6SSong Liu payload = (void *)mb + mb_offset;
1854b4c625c6SSong Liu stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
1855b4c625c6SSong Liu raid5_compute_sector(
1856b4c625c6SSong Liu conf, le64_to_cpu(payload->location), 0, &dd,
1857b4c625c6SSong Liu NULL)
1858b4c625c6SSong Liu : le64_to_cpu(payload->location);
1859b4c625c6SSong Liu
1860b4c625c6SSong Liu sh = r5c_recovery_lookup_stripe(cached_stripe_list,
1861b4c625c6SSong Liu stripe_sect);
1862b4c625c6SSong Liu
1863b4c625c6SSong Liu if (!sh) {
18649b69173eSJackieLiu sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
1865b4c625c6SSong Liu /*
1866b4c625c6SSong Liu * cannot get a stripe from raid5_get_active_stripe;
1867b4c625c6SSong Liu * try replaying some stripes
1868b4c625c6SSong Liu */
1869b4c625c6SSong Liu if (!sh) {
1870b4c625c6SSong Liu r5c_recovery_replay_stripes(
1871b4c625c6SSong Liu cached_stripe_list, ctx);
1872b4c625c6SSong Liu sh = r5c_recovery_alloc_stripe(
18739b69173eSJackieLiu conf, stripe_sect, ctx->pos);
1874b4c625c6SSong Liu }
1875b4c625c6SSong Liu if (!sh) {
1876b4c625c6SSong Liu pr_debug("md/raid:%s: Increasing stripe cache size to %d to recover data from journal.\n",
1877b4c625c6SSong Liu mdname(mddev),
1878b4c625c6SSong Liu conf->min_nr_stripes * 2);
1879b4c625c6SSong Liu raid5_set_cache_size(mddev,
1880b4c625c6SSong Liu conf->min_nr_stripes * 2);
1881b4c625c6SSong Liu sh = r5c_recovery_alloc_stripe(
18829b69173eSJackieLiu conf, stripe_sect, ctx->pos);
1883b4c625c6SSong Liu }
1884b4c625c6SSong Liu if (!sh) {
1885b4c625c6SSong Liu pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure.
Recovery failed.\n", 1886b4c625c6SSong Liu mdname(mddev)); 1887b4c625c6SSong Liu return -ENOMEM; 1888b4c625c6SSong Liu } 1889b4c625c6SSong Liu list_add_tail(&sh->lru, cached_stripe_list); 1890b4c625c6SSong Liu } 1891b4c625c6SSong Liu 1892b4c625c6SSong Liu if (payload->header.type == R5LOG_PAYLOAD_DATA) { 1893f7b7bee7SZhengyuan Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 1894f7b7bee7SZhengyuan Liu test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { 1895b4c625c6SSong Liu r5l_recovery_replay_one_stripe(conf, sh, ctx); 1896b4c625c6SSong Liu sh->log_start = ctx->pos; 1897b4c625c6SSong Liu list_move_tail(&sh->lru, cached_stripe_list); 1898b4c625c6SSong Liu } 1899b4c625c6SSong Liu r5l_recovery_load_data(log, sh, ctx, payload, 1900b4c625c6SSong Liu log_offset); 1901b4c625c6SSong Liu } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) 1902b4c625c6SSong Liu r5l_recovery_load_parity(log, sh, ctx, payload, 1903b4c625c6SSong Liu log_offset); 1904b4c625c6SSong Liu else 1905b4c625c6SSong Liu return -EINVAL; 1906b4c625c6SSong Liu 1907b4c625c6SSong Liu log_offset = r5l_ring_add(log, log_offset, 1908b4c625c6SSong Liu le32_to_cpu(payload->size)); 1909b4c625c6SSong Liu 1910b4c625c6SSong Liu mb_offset += sizeof(struct r5l_payload_data_parity) + 1911b4c625c6SSong Liu sizeof(__le32) * 1912b4c625c6SSong Liu (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1913b4c625c6SSong Liu } 1914b4c625c6SSong Liu 1915b4c625c6SSong Liu return 0; 1916b4c625c6SSong Liu } 1917b4c625c6SSong Liu 1918b4c625c6SSong Liu /* 1919b4c625c6SSong Liu * Load the stripe into cache. The stripe will be written out later by 1920b4c625c6SSong Liu * the stripe cache state machine. 1921b4c625c6SSong Liu */ 1922b4c625c6SSong Liu static void r5c_recovery_load_one_stripe(struct r5l_log *log, 1923b4c625c6SSong Liu struct stripe_head *sh) 1924b4c625c6SSong Liu { 1925b4c625c6SSong Liu struct r5dev *dev; 1926b4c625c6SSong Liu int i; 1927b4c625c6SSong Liu 1928b4c625c6SSong Liu for (i = sh->disks; i--; ) { 1929b4c625c6SSong Liu dev = sh->dev + i; 1930b4c625c6SSong Liu if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { 1931b4c625c6SSong Liu set_bit(R5_InJournal, &dev->flags); 1932b4c625c6SSong Liu set_bit(R5_UPTODATE, &dev->flags); 1933b4c625c6SSong Liu } 1934b4c625c6SSong Liu } 1935b4c625c6SSong Liu list_add_tail(&sh->r5c, &log->stripe_in_journal_list); 1936462eb7d8SZhengyuan Liu atomic_inc(&log->stripe_in_journal_count); 1937b4c625c6SSong Liu } 1938b4c625c6SSong Liu 1939b4c625c6SSong Liu /* 1940b4c625c6SSong Liu * Scan through the log for all to-be-flushed data 1941b4c625c6SSong Liu * 1942b4c625c6SSong Liu * For stripes with data and parity, namely Data-Parity stripe 1943b4c625c6SSong Liu * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. 1944b4c625c6SSong Liu * 1945b4c625c6SSong Liu * For stripes with only data, namely Data-Only stripe 1946b4c625c6SSong Liu * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. 1947b4c625c6SSong Liu * 1948b4c625c6SSong Liu * For a stripe, if we see data after parity, we should discard all previous 1949b4c625c6SSong Liu * data and parity for this stripe, as these data are already flushed to 1950b4c625c6SSong Liu * the array. 1951b4c625c6SSong Liu * 1952b4c625c6SSong Liu * At the end of the scan, we return the new journal_tail, which points to 1953b4c625c6SSong Liu * first data-only stripe on the journal device, or next invalid meta block. 
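 *
 * Example (hypothetical log contents): if the scan sees D1 P1 D1' for one
 * stripe, the data D1' arriving after the parity P1 proves D1/P1 were
 * already flushed to the array, so D1 and P1 are dropped and only D1' is
 * kept.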
1954b4c625c6SSong Liu */
1955b4c625c6SSong Liu static int r5c_recovery_flush_log(struct r5l_log *log,
1956b4c625c6SSong Liu struct r5l_recovery_ctx *ctx)
1957b4c625c6SSong Liu {
1958bc8f167fSJackieLiu struct stripe_head *sh;
1959b4c625c6SSong Liu int ret = 0;
1960b4c625c6SSong Liu
1961b4c625c6SSong Liu /* scan through the log */
1962b4c625c6SSong Liu while (1) {
1963b4c625c6SSong Liu if (r5l_recovery_read_meta_block(log, ctx))
1964b4c625c6SSong Liu break;
1965b4c625c6SSong Liu
1966b4c625c6SSong Liu ret = r5c_recovery_analyze_meta_block(log, ctx,
1967b4c625c6SSong Liu &ctx->cached_list);
1968b4c625c6SSong Liu /*
1969b4c625c6SSong Liu * -EAGAIN means a mismatch in a data block; in this case, we still
1970b4c625c6SSong Liu * try to scan the next meta block
1971b4c625c6SSong Liu */
1972b4c625c6SSong Liu if (ret && ret != -EAGAIN)
1973b4c625c6SSong Liu break; /* ret == -EINVAL or -ENOMEM */
1974b4c625c6SSong Liu ctx->seq++;
1975b4c625c6SSong Liu ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1976b4c625c6SSong Liu }
1977b4c625c6SSong Liu
1978b4c625c6SSong Liu if (ret == -ENOMEM) {
1979b4c625c6SSong Liu r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
1980b4c625c6SSong Liu return ret;
1981b4c625c6SSong Liu }
1982b4c625c6SSong Liu
1983b4c625c6SSong Liu /* replay data-parity stripes */
1984b4c625c6SSong Liu r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
1985b4c625c6SSong Liu
1986b4c625c6SSong Liu /* load data-only stripes to stripe cache */
1987bc8f167fSJackieLiu list_for_each_entry(sh, &ctx->cached_list, lru) {
1988b4c625c6SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
1989b4c625c6SSong Liu r5c_recovery_load_one_stripe(log, sh);
1990b4c625c6SSong Liu ctx->data_only_stripes++;
1991b4c625c6SSong Liu }
1992b4c625c6SSong Liu
1993b4c625c6SSong Liu return 0;
1994b4c625c6SSong Liu }
1995b4c625c6SSong Liu
1996b4c625c6SSong Liu /*
1997b4c625c6SSong Liu * We did a recovery. Now ctx.pos points to an invalid meta block. The new
1998b4c625c6SSong Liu * log will start here. But we can't let the superblock point to the last valid
1999b4c625c6SSong Liu * meta block. The log might look like:
2000b4c625c6SSong Liu * | meta 1| meta 2| meta 3|
2001b4c625c6SSong Liu * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
2002b4c625c6SSong Liu * superblock points to meta 1, we write a new valid meta 2n. If a crash
2003b4c625c6SSong Liu * happens again, the new recovery will start from meta 1. Since meta 2n is
2004b4c625c6SSong Liu * valid now, recovery will think meta 3 is valid, which is wrong.
2005b4c625c6SSong Liu * The solution is we create a new meta in meta 2 with its seq == meta
2006b4c625c6SSong Liu * 1's seq + 10 and let the superblock point to meta 2.
The same recovery will
2007b4c625c6SSong Liu * not think meta 3 is a valid meta, because its seq doesn't match.
2008b4c625c6SSong Liu */
2009b4c625c6SSong Liu
2010b4c625c6SSong Liu /*
2011b4c625c6SSong Liu * Before recovery, the log looks like the following
2012b4c625c6SSong Liu *
2013b4c625c6SSong Liu * ---------------------------------------------
2014b4c625c6SSong Liu * | valid log | invalid log |
2015b4c625c6SSong Liu * ---------------------------------------------
2016b4c625c6SSong Liu * ^
2017b4c625c6SSong Liu * |- log->last_checkpoint
2018b4c625c6SSong Liu * |- log->last_cp_seq
2019b4c625c6SSong Liu *
2020b4c625c6SSong Liu * Now we scan through the log until we see an invalid entry
2021b4c625c6SSong Liu *
2022b4c625c6SSong Liu * ---------------------------------------------
2023b4c625c6SSong Liu * | valid log | invalid log |
2024b4c625c6SSong Liu * ---------------------------------------------
2025b4c625c6SSong Liu * ^ ^
2026b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos
2027b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq
2028b4c625c6SSong Liu *
2029b4c625c6SSong Liu * From this point, we need to increase the seq number by 10 to avoid
2030b4c625c6SSong Liu * confusing the next recovery.
2031b4c625c6SSong Liu *
2032b4c625c6SSong Liu * ---------------------------------------------
2033b4c625c6SSong Liu * | valid log | invalid log |
2034b4c625c6SSong Liu * ---------------------------------------------
2035b4c625c6SSong Liu * ^ ^
2036b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos+1
2037b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq+11
2038b4c625c6SSong Liu *
2039b4c625c6SSong Liu * However, it is not safe to start the state machine yet, because the data only
2040b4c625c6SSong Liu * stripes are not yet secured in RAID. To save these data only stripes, we
2041b4c625c6SSong Liu * rewrite them from seq+11.
2042b4c625c6SSong Liu *
2043b4c625c6SSong Liu * -----------------------------------------------------------------
2044b4c625c6SSong Liu * | valid log | data only stripes | invalid log |
2045b4c625c6SSong Liu * -----------------------------------------------------------------
2046b4c625c6SSong Liu * ^ ^
2047b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos+n
2048b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq+10+n
2049b4c625c6SSong Liu *
2050b4c625c6SSong Liu * If failure happens again during this process, the recovery can safely start
2051b4c625c6SSong Liu * again from log->last_checkpoint.
2052b4c625c6SSong Liu *
2053b4c625c6SSong Liu * Once data only stripes are rewritten to journal, we move log_tail
2054b4c625c6SSong Liu *
2055b4c625c6SSong Liu * -----------------------------------------------------------------
2056b4c625c6SSong Liu * | old log | data only stripes | invalid log |
2057b4c625c6SSong Liu * -----------------------------------------------------------------
2058b4c625c6SSong Liu * ^ ^
2059b4c625c6SSong Liu * |- log->last_checkpoint |- ctx->pos+n
2060b4c625c6SSong Liu * |- log->last_cp_seq |- ctx->seq+10+n
2061b4c625c6SSong Liu *
2062b4c625c6SSong Liu * Then we can safely start the state machine. If failure happens from this
2063b4c625c6SSong Liu * point on, the recovery will start from the new log->last_checkpoint.
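 *
 * Worked example with made-up numbers: if last_cp_seq == 100 and the scan
 * stops at what would have been seq 103, the rewritten/empty blocks
 * restart at seq 113, so any stale block left on disk with seq 104 can
 * never chain onto the new log.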
2064b4c625c6SSong Liu */
2065b4c625c6SSong Liu static int
2066b4c625c6SSong Liu r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
2067b4c625c6SSong Liu struct r5l_recovery_ctx *ctx)
2068b4c625c6SSong Liu {
2069bc8f167fSJackieLiu struct stripe_head *sh, *next;
2070b4c625c6SSong Liu struct mddev *mddev = log->rdev->mddev;
2071b4c625c6SSong Liu struct page *page;
2072b4c625c6SSong Liu
2073b4c625c6SSong Liu page = alloc_page(GFP_KERNEL);
2074b4c625c6SSong Liu if (!page) {
2075b4c625c6SSong Liu pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
2076b4c625c6SSong Liu mdname(mddev));
2077b4c625c6SSong Liu return -ENOMEM;
2078b4c625c6SSong Liu }
2079b4c625c6SSong Liu
2080bc8f167fSJackieLiu list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
2081b4c625c6SSong Liu struct r5l_meta_block *mb;
2082b4c625c6SSong Liu int i;
2083b4c625c6SSong Liu int offset;
2084b4c625c6SSong Liu sector_t write_pos;
2085b4c625c6SSong Liu
2086b4c625c6SSong Liu WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
2087b4c625c6SSong Liu r5l_recovery_create_empty_meta_block(log, page,
2088b4c625c6SSong Liu ctx->pos, ctx->seq);
2089b4c625c6SSong Liu mb = page_address(page);
2090b4c625c6SSong Liu offset = le32_to_cpu(mb->meta_size);
2091fc833c2aSJackieLiu write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
2092b4c625c6SSong Liu
2093b4c625c6SSong Liu for (i = sh->disks; i--; ) {
2094b4c625c6SSong Liu struct r5dev *dev = &sh->dev[i];
2095b4c625c6SSong Liu struct r5l_payload_data_parity *payload;
2096b4c625c6SSong Liu void *addr;
2097b4c625c6SSong Liu
2098b4c625c6SSong Liu if (test_bit(R5_InJournal, &dev->flags)) {
2099b4c625c6SSong Liu payload = (void *)mb + offset;
2100b4c625c6SSong Liu payload->header.type = cpu_to_le16(
2101b4c625c6SSong Liu R5LOG_PAYLOAD_DATA);
2102b4c625c6SSong Liu payload->size = cpu_to_le32(BLOCK_SECTORS);
2103b4c625c6SSong Liu payload->location = cpu_to_le64(
2104b4c625c6SSong Liu raid5_compute_blocknr(sh, i, 0));
2105b4c625c6SSong Liu addr = kmap_atomic(dev->page);
2106b4c625c6SSong Liu payload->checksum[0] = cpu_to_le32(
2107b4c625c6SSong Liu crc32c_le(log->uuid_checksum, addr,
2108b4c625c6SSong Liu PAGE_SIZE));
2109b4c625c6SSong Liu kunmap_atomic(addr);
2110b4c625c6SSong Liu sync_page_io(log->rdev, write_pos, PAGE_SIZE,
2111b4c625c6SSong Liu dev->page, REQ_OP_WRITE, 0, false);
2112b4c625c6SSong Liu write_pos = r5l_ring_add(log, write_pos,
2113b4c625c6SSong Liu BLOCK_SECTORS);
2114b4c625c6SSong Liu offset += sizeof(__le32) +
2115b4c625c6SSong Liu sizeof(struct r5l_payload_data_parity);
2116b4c625c6SSong Liu
2117b4c625c6SSong Liu }
2118b4c625c6SSong Liu }
2119b4c625c6SSong Liu mb->meta_size = cpu_to_le32(offset);
2120b4c625c6SSong Liu mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, mb, PAGE_SIZE));
2121b4c625c6SSong Liu sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
2122b4c625c6SSong Liu REQ_OP_WRITE, WRITE_FUA, false);
2123b4c625c6SSong Liu sh->log_start = ctx->pos;
2124b4c625c6SSong Liu ctx->pos = write_pos;
2125b4c625c6SSong Liu ctx->seq += 1;
2126bc8f167fSJackieLiu
2127bc8f167fSJackieLiu list_del_init(&sh->lru);
2128bc8f167fSJackieLiu raid5_release_stripe(sh);
2129b4c625c6SSong Liu }
2130b4c625c6SSong Liu __free_page(page);
2131b4c625c6SSong Liu return 0;
2132b4c625c6SSong Liu }
2133b4c625c6SSong Liu
2134f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log)
2135f6bed0efSShaohua Li {
21365aabf7c4SSong Liu struct mddev *mddev = log->rdev->mddev;
2137355810d1SShaohua Li struct r5l_recovery_ctx ctx;
21385aabf7c4SSong Liu int ret;
2139*43b96748SJackieLiu
sector_t pos;
2140*43b96748SJackieLiu struct stripe_head *sh;
2141355810d1SShaohua Li
2142355810d1SShaohua Li ctx.pos = log->last_checkpoint;
2143355810d1SShaohua Li ctx.seq = log->last_cp_seq;
2144355810d1SShaohua Li ctx.meta_page = alloc_page(GFP_KERNEL);
2145b4c625c6SSong Liu ctx.data_only_stripes = 0;
2146b4c625c6SSong Liu ctx.data_parity_stripes = 0;
2147b4c625c6SSong Liu INIT_LIST_HEAD(&ctx.cached_list);
2148b4c625c6SSong Liu
2149355810d1SShaohua Li if (!ctx.meta_page)
2150355810d1SShaohua Li return -ENOMEM;
2151355810d1SShaohua Li
21525aabf7c4SSong Liu ret = r5c_recovery_flush_log(log, &ctx);
2153355810d1SShaohua Li __free_page(ctx.meta_page);
2154355810d1SShaohua Li
2155355810d1SShaohua Li if (ret)
2156355810d1SShaohua Li return ret;
21575aabf7c4SSong Liu
2158*43b96748SJackieLiu pos = ctx.pos;
2159*43b96748SJackieLiu ctx.seq += 10;
2160*43b96748SJackieLiu
2161*43b96748SJackieLiu if (ctx.data_only_stripes == 0) {
2162*43b96748SJackieLiu log->next_checkpoint = ctx.pos;
2163*43b96748SJackieLiu r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
2164*43b96748SJackieLiu ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
2165*43b96748SJackieLiu } else {
2166*43b96748SJackieLiu sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
2167*43b96748SJackieLiu log->next_checkpoint = sh->log_start;
2168*43b96748SJackieLiu }
2169*43b96748SJackieLiu
21705aabf7c4SSong Liu if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
21715aabf7c4SSong Liu pr_debug("md/raid:%s: starting from clean shutdown\n",
21725aabf7c4SSong Liu mdname(mddev));
21735aabf7c4SSong Liu else {
21745aabf7c4SSong Liu pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
21755aabf7c4SSong Liu mdname(mddev), ctx.data_only_stripes,
21765aabf7c4SSong Liu ctx.data_parity_stripes);
21775aabf7c4SSong Liu
21785aabf7c4SSong Liu if (ctx.data_only_stripes > 0)
21795aabf7c4SSong Liu if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
21805aabf7c4SSong Liu pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
21815aabf7c4SSong Liu mdname(mddev));
21825aabf7c4SSong Liu return -EIO;
21835aabf7c4SSong Liu }
21845aabf7c4SSong Liu }
21855aabf7c4SSong Liu
2186355810d1SShaohua Li log->log_start = ctx.pos;
2187355810d1SShaohua Li log->seq = ctx.seq;
2188*43b96748SJackieLiu log->last_checkpoint = pos;
2189*43b96748SJackieLiu r5l_write_super(log, pos);
2190f6bed0efSShaohua Li return 0;
2191f6bed0efSShaohua Li }
2192f6bed0efSShaohua Li
2193f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp)
2194f6bed0efSShaohua Li {
2195f6bed0efSShaohua Li struct mddev *mddev = log->rdev->mddev;
2196f6bed0efSShaohua Li
2197f6bed0efSShaohua Li log->rdev->journal_tail = cp;
2198f6bed0efSShaohua Li set_bit(MD_CHANGE_DEVS, &mddev->flags);
2199f6bed0efSShaohua Li }
2200f6bed0efSShaohua Li
22012c7da14bSSong Liu static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
22022c7da14bSSong Liu {
22032c7da14bSSong Liu struct r5conf *conf = mddev->private;
22042c7da14bSSong Liu int ret;
22052c7da14bSSong Liu
22062c7da14bSSong Liu if (!conf->log)
22072c7da14bSSong Liu return 0;
22082c7da14bSSong Liu
22092c7da14bSSong Liu switch (conf->log->r5c_journal_mode) {
22102c7da14bSSong Liu case R5C_JOURNAL_MODE_WRITE_THROUGH:
22112c7da14bSSong Liu ret = snprintf(
22122c7da14bSSong Liu page, PAGE_SIZE, "[%s] %s\n",
22132c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
22142c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
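/*
 * Example output (illustrative): in write-through mode this reads
 * "[write-through] write-back"; the brackets mark the active mode.
 */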
22152c7da14bSSong Liu break;
22162c7da14bSSong Liu case R5C_JOURNAL_MODE_WRITE_BACK:
22172c7da14bSSong Liu ret = snprintf(
22182c7da14bSSong Liu page, PAGE_SIZE, "%s [%s]\n",
22192c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
22202c7da14bSSong Liu r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
22212c7da14bSSong Liu break;
22222c7da14bSSong Liu default:
22232c7da14bSSong Liu ret = 0;
22242c7da14bSSong Liu }
22252c7da14bSSong Liu return ret;
22262c7da14bSSong Liu }
22272c7da14bSSong Liu
22282c7da14bSSong Liu static ssize_t r5c_journal_mode_store(struct mddev *mddev,
22292c7da14bSSong Liu const char *page, size_t length)
22302c7da14bSSong Liu {
22312c7da14bSSong Liu struct r5conf *conf = mddev->private;
22322c7da14bSSong Liu struct r5l_log *log = conf->log;
22332c7da14bSSong Liu int val = -1, i;
22342c7da14bSSong Liu int len = length;
22352c7da14bSSong Liu
22362c7da14bSSong Liu if (!log)
22372c7da14bSSong Liu return -ENODEV;
22382c7da14bSSong Liu
22392c7da14bSSong Liu if (len && page[len - 1] == '\n')
22402c7da14bSSong Liu len -= 1;
22412c7da14bSSong Liu for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
22422c7da14bSSong Liu if (strlen(r5c_journal_mode_str[i]) == len &&
22432c7da14bSSong Liu strncmp(page, r5c_journal_mode_str[i], len) == 0) {
22442c7da14bSSong Liu val = i;
22452c7da14bSSong Liu break;
22462c7da14bSSong Liu }
22472c7da14bSSong Liu if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
22482c7da14bSSong Liu val > R5C_JOURNAL_MODE_WRITE_BACK)
22492c7da14bSSong Liu return -EINVAL;
22502c7da14bSSong Liu
22512c7da14bSSong Liu mddev_suspend(mddev);
22522c7da14bSSong Liu conf->log->r5c_journal_mode = val;
22532c7da14bSSong Liu mddev_resume(mddev);
22542c7da14bSSong Liu
22552c7da14bSSong Liu pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
22562c7da14bSSong Liu mdname(mddev), val, r5c_journal_mode_str[val]);
22572c7da14bSSong Liu return length;
22582c7da14bSSong Liu }
22592c7da14bSSong Liu
22602c7da14bSSong Liu struct md_sysfs_entry
22612c7da14bSSong Liu r5c_journal_mode = __ATTR(journal_mode, 0644,
22622c7da14bSSong Liu r5c_journal_mode_show, r5c_journal_mode_store);
22632c7da14bSSong Liu
22642ded3703SSong Liu /*
22652ded3703SSong Liu * Try to handle a write operation in the caching phase. This function should only
22662ded3703SSong Liu * be called in write-back mode.
22672ded3703SSong Liu *
22682ded3703SSong Liu * If all outstanding writes can be handled in the caching phase, returns 0
22692ded3703SSong Liu * If the writes require the write-out phase, call r5c_make_stripe_write_out()
22702ded3703SSong Liu * and return -EAGAIN
22712ded3703SSong Liu */
22722ded3703SSong Liu int r5c_try_caching_write(struct r5conf *conf,
22732ded3703SSong Liu struct stripe_head *sh,
22742ded3703SSong Liu struct stripe_head_state *s,
22752ded3703SSong Liu int disks)
22762ded3703SSong Liu {
22772ded3703SSong Liu struct r5l_log *log = conf->log;
22781e6d690bSSong Liu int i;
22791e6d690bSSong Liu struct r5dev *dev;
22801e6d690bSSong Liu int to_cache = 0;
22812ded3703SSong Liu
22822ded3703SSong Liu BUG_ON(!r5c_is_writeback(log));
22832ded3703SSong Liu
22841e6d690bSSong Liu if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
22851e6d690bSSong Liu /*
22861e6d690bSSong Liu * There are two different scenarios here:
22871e6d690bSSong Liu * 1. The stripe has some data cached, and it is sent to
22881e6d690bSSong Liu * write-out phase for reclaim
22891e6d690bSSong Liu * 2.
The stripe is clean, and this is the first write 22901e6d690bSSong Liu * 22911e6d690bSSong Liu * For 1, return -EAGAIN, so we continue with 22921e6d690bSSong Liu * handle_stripe_dirtying(). 22931e6d690bSSong Liu * 22941e6d690bSSong Liu * For 2, set STRIPE_R5C_CACHING and continue with caching 22951e6d690bSSong Liu * write. 22961e6d690bSSong Liu */ 22971e6d690bSSong Liu 22981e6d690bSSong Liu /* case 1: anything injournal or anything in written */ 22991e6d690bSSong Liu if (s->injournal > 0 || s->written > 0) 23001e6d690bSSong Liu return -EAGAIN; 23011e6d690bSSong Liu /* case 2 */ 23021e6d690bSSong Liu set_bit(STRIPE_R5C_CACHING, &sh->state); 23031e6d690bSSong Liu } 23041e6d690bSSong Liu 23051e6d690bSSong Liu for (i = disks; i--; ) { 23061e6d690bSSong Liu dev = &sh->dev[i]; 23071e6d690bSSong Liu /* if non-overwrite, use writing-out phase */ 23081e6d690bSSong Liu if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 23091e6d690bSSong Liu !test_bit(R5_InJournal, &dev->flags)) { 23102ded3703SSong Liu r5c_make_stripe_write_out(sh); 23112ded3703SSong Liu return -EAGAIN; 23122ded3703SSong Liu } 23131e6d690bSSong Liu } 23141e6d690bSSong Liu 23151e6d690bSSong Liu for (i = disks; i--; ) { 23161e6d690bSSong Liu dev = &sh->dev[i]; 23171e6d690bSSong Liu if (dev->towrite) { 23181e6d690bSSong Liu set_bit(R5_Wantwrite, &dev->flags); 23191e6d690bSSong Liu set_bit(R5_Wantdrain, &dev->flags); 23201e6d690bSSong Liu set_bit(R5_LOCKED, &dev->flags); 23211e6d690bSSong Liu to_cache++; 23221e6d690bSSong Liu } 23231e6d690bSSong Liu } 23241e6d690bSSong Liu 23251e6d690bSSong Liu if (to_cache) { 23261e6d690bSSong Liu set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 23271e6d690bSSong Liu /* 23281e6d690bSSong Liu * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 23291e6d690bSSong Liu * in ops_run_io(). 
/*
 * Free the extra pages (orig_page) we allocated for prexor.
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	bool using_disk_info_extra_page;

	using_disk_info_extra_page =
		sh->dev[0].orig_page == conf->disks[0].extra_page;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			if (!using_disk_info_extra_page)
				put_page(p);
		}

	if (using_disk_info_extra_page) {
		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
		md_wakeup_thread(conf->mddev->thread);
	}
}

void r5c_use_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	struct r5dev *dev;

	for (i = sh->disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->orig_page != dev->page)
			put_page(dev->orig_page);
		dev->orig_page = conf->disks[i].extra_page;
	}
}
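/*
 * How the two helpers above pair up around a prexor run (sketch only;
 * this wrapper is hypothetical and the real call sites live in the
 * raid5 stripe-handling code): the shared disk_info extra_page backs
 * orig_page while prexor needs the old data, and is handed back when
 * the stripe is done with it.
 */
static void r5c_extra_page_cycle_sketch(struct stripe_head *sh)
{
	r5c_use_extra_page(sh);		/* orig_page := conf->disks[i].extra_page */
	/* ... prexor reads orig_page while new data lands in dev->page ... */
	r5c_release_extra_page(sh);	/* restore orig_page, release the pages */
}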
/*
 * Clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after
 * the stripe is committed to the RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	int i;
	int do_wakeup = 0;

	if (!conf->log ||
	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() runs before r5c_finish_stripe_write_out(),
	 * so the R5_InJournal bits cleared above are not yet reflected
	 * in *s; update s->injournal to match.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	spin_lock_irq(&conf->log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
	sh->log_start = MaxSector;
	atomic_dec(&conf->log->stripe_in_journal_count);
	r5c_update_log_state(conf->log);
}
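/*
 * Note the write-back bookkeeping above: dropping the stripe from
 * stripe_in_journal_list and resetting sh->log_start to MaxSector
 * lets r5c_cache_data() below treat a later write to the same stripe
 * as one with no journal space reserved yet (see the
 * sh->log_start == MaxSector checks).
 */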
int
r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
	       struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;
	int reserve;
	int i;
	int ret = 0;

	BUG_ON(!log);

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

	/*
	 * The stripe must enter the state machine again to call endio,
	 * so don't delay it.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
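/*
 * Worked example for the reservation above (this code requires
 * PAGE_SIZE == 4096, so PAGE_SHIFT - 9 == 3, i.e. 8 sectors per page):
 * caching 4 data pages reserves (1 meta + 4 data) << 3 = 40 sectors of
 * log space.
 */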
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure the super block points to the correct
		 * address. The log might have data very soon. If the
		 * super block doesn't hold the correct log tail address,
		 * recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;
	log->next_checkpoint = cp;
	mutex_lock(&log->io_mutex);
	r5c_update_log_state(log);
	mutex_unlock(&log->io_mutex);

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}
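/*
 * The on-disk CRC convention checked by r5l_load_log() above, shown in
 * isolation (sketch only; this helper is hypothetical): the checksum
 * field is summed as zero, so a verifier must zero it before
 * recomputing the crc32c over the whole block.
 */
static bool r5l_meta_crc_valid_sketch(struct r5l_log *log,
				      struct r5l_meta_block *mb)
{
	u32 stored = le32_to_cpu(mb->checksum);

	mb->checksum = 0;
	return crc32c_le(log->uuid_checksum, mb, PAGE_SIZE) == stored;
}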
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;

	/*
	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
	 * raid_disks r5l_payload_data_parity entries.
	 *
	 * The write journal and cache do not work for very big arrays
	 * (raid_disks > 203): each disk needs a payload descriptor plus
	 * a per-page checksum, and only 203 of those fit in one 4KB
	 * block alongside the meta block header.
	 */
	if (sizeof(struct r5l_meta_block) +
	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
	     conf->raid_disks) > PAGE_SIZE) {
		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
		       mdname(conf->mddev), conf->raid_disks);
		return -EINVAL;
	}

	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;

	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
	spin_lock_init(&log->stripe_in_journal_lock);
	atomic_set(&log->stripe_in_journal_count, 0);

	if (r5l_load_log(log))
		goto error;

	rcu_assign_pointer(conf->log, log);
	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}
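/*
 * Lifecycle note (sketch; the exact call sites live elsewhere in the
 * raid5 code): r5l_init_log() runs when an array with a journal device
 * is assembled, and r5l_exit_log() tears the log down when the array
 * stops, releasing resources in the reverse of the order init_log
 * acquired them, just like the goto-based error unwind above.
 */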