/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"

/*
 * Metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write-through mode, reclaim runs once every log->max_free_space,
 * which keeps recovery from having to scan too much of the log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH 256
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE 4
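/*
 * Illustrative example (not in the original source): with
 * RECLAIM_MAX_FREE_SPACE = 10 * 1024 * 1024 * 2 sectors = 10GiB, an 8GiB
 * log device (16777216 sectors) gets max_free_space =
 * min(16777216 >> RECLAIM_MAX_FREE_SPACE_SHIFT, RECLAIM_MAX_FREE_SPACE)
 * = 4194304 sectors (2GiB), while any device larger than 40GiB is capped
 * at the 10GiB ceiling. The min() itself is computed at log init, which
 * is outside this section.
 */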
/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode behaves identically to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */
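/*
 * Illustrative sketch (not in the original source) of how the phase bit
 * steers a write in write-back mode:
 *
 *	if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
 *		// caching phase: data goes only to the journal and the
 *		// bio is completed; no parity is computed yet
 *	} else {
 *		// writing-out phase: parity is computed, data + parity
 *		// are journaled, then written to the raid disks
 *	}
 */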
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;
	u64 next_cp_seq;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. if it's 0, reclaim space
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e.,
					 * reclaim doesn't wait for a specific
					 * io_unit to switch to the
					 * IO_UNIT_STRIPE_END state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;
};
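/*
 * Illustrative layout (not in the original source): the log device is a
 * ring of BLOCK_SECTORS-sized blocks. last_checkpoint is the tail and
 * log_start is the head; the span between them is live data:
 *
 *	0 ...... last_checkpoint ==live==> log_start ...... device_size
 *	          (recovery starts here)   (new writes append here)
 *
 * The live span may wrap around the end of the device.
 */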
/*
 * An IO range starts at a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; since we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no need to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
				 * doesn't accept new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
};

bool r5c_is_writeback(struct r5l_log *log)
{
	return (log != NULL &&
		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}
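/*
 * Worked example (not in the original source) of the ring arithmetic
 * above, with device_size = 1000 sectors:
 *
 *	r5l_ring_add(log, 990, 24)       -> 1014 - 1000 = 14
 *	r5l_ring_distance(log, 990, 14)  -> 14 + 1000 - 990 = 24
 *
 * Note r5l_ring_add() subtracts device_size at most once, so it assumes
 * inc never exceeds the device size.
 */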
static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
			      struct bio_list *return_bi)
{
	struct bio *wbi, *wbi2;

	wbi = dev->written;
	dev->written = NULL;
	while (wbi && wbi->bi_iter.bi_sector <
	       dev->sector + STRIPE_SECTORS) {
		wbi2 = r5_next_bio(wbi, dev->sector);
		if (!raid5_dec_bi_active_stripes(wbi)) {
			md_write_end(conf->mddev);
			bio_list_add(return_bi, wbi);
		}
		wbi = wbi2;
	}
}

void r5c_handle_cached_data_endio(struct r5conf *conf,
	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
	int i;

	for (i = sh->disks; i--; ) {
		if (sh->dev[i].written) {
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
			r5c_return_dev_pending_writes(conf, &sh->dev[i],
						      return_bi);
			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					STRIPE_SECTORS,
					!test_bit(STRIPE_DEGRADED, &sh->state),
					0);
		}
	}
}

/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
	int total_cached;

	if (!r5c_is_writeback(conf->log))
		return;

	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
		atomic_read(&conf->r5c_cached_full_stripes);

	/*
	 * The following condition is true for either of the following:
	 *   - stripe cache pressure high:
	 *		total_cached > 3/4 min_nr_stripes ||
	 *		empty_inactive_list_nr > 0
	 *   - stripe cache pressure moderate:
	 *		total_cached > 1/2 min_nr_stripes
	 */
	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
	    atomic_read(&conf->empty_inactive_list_nr) > 0)
		r5l_wake_reclaim(conf->log, 0);
}
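/*
 * Illustrative numbers (not in the original source): with
 * conf->min_nr_stripes = 256, reclaim is woken once more than 128 stripes
 * are cached (the moderate threshold), or as soon as any inactive list
 * runs empty. The woken reclaim thread then applies the high threshold
 * (3/4, i.e. 192 here) in r5c_do_reclaim() to decide how aggressively
 * to flush.
 */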
/*
 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 * stripes in the cache
 */
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
	if (!r5c_is_writeback(conf->log))
		return;

	/*
	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
	 * or a full stripe (chunk size / 4k stripes)
	 */
	if (atomic_read(&conf->r5c_cached_full_stripes) >=
	    min(R5C_FULL_STRIPE_FLUSH_BATCH,
		conf->chunk_sectors >> STRIPE_SHIFT))
		r5l_wake_reclaim(conf->log, 0);
}

/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * Currently, the writing-out phase automatically includes all pending writes
 * to the same sector. So the reclaim of each stripe takes up to
 * (conf->raid_disks + 1) pages of log space.
 *
 * To totally avoid deadlock due to log space, the code reserves
 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
 * necessary in most cases.
 *
 * To improve this, we will need the writing-out phase to be able to NOT
 * include pending writes, which will reduce the requirement to
 * (conf->max_degraded + 1) pages per stripe in cache.
 */
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;

	if (!r5c_is_writeback(log))
		return 0;

	return BLOCK_SECTORS * (conf->raid_disks + 1) *
		atomic_read(&log->stripe_in_journal_count);
}

/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space.
 */
static inline void r5c_update_log_state(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t free_space;
	sector_t reclaim_space;

	if (!r5c_is_writeback(log))
		return;

	free_space = r5l_ring_distance(log, log->log_start,
				       log->last_checkpoint);
	reclaim_space = r5c_log_required_to_flush_cache(conf);
	if (free_space < 2 * reclaim_space)
		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	else
		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	if (free_space < 3 * reclaim_space)
		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
	else
		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
}
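/*
 * Worked example (not in the original source): for a 6-device array
 * (conf->raid_disks = 6) with 100 stripes in the journal,
 * r5c_log_required_to_flush_cache() returns 8 * (6 + 1) * 100 = 5600
 * sectors (~2.7MiB). R5C_LOG_CRITICAL is then set when free log space
 * drops below 11200 sectors, and R5C_LOG_TIGHT below 16800 sectors.
 */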
/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5l_log *log = conf->log;

	BUG_ON(!r5c_is_writeback(log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);

	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
		atomic_dec(&conf->r5c_cached_partial_stripes);
	}

	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
		atomic_dec(&conf->r5c_cached_full_stripes);
	}
}

static void r5c_handle_data_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			set_bit(R5_InJournal, &sh->dev[i].flags);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
		}
	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
}

/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
static void r5c_handle_parity_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_bit(R5_InJournal, &sh->dev[i].flags))
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
}
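/*
 * Illustrative flag lifecycle (not in the original source) for a device
 * page in write-back mode:
 *
 *	caching phase:     R5_Wantwrite -> journaled -> R5_InJournal set,
 *	                   R5_Wantwrite and R5_LOCKED cleared (the data is
 *	                   safe in the log, so the bio can complete)
 *	writing-out phase: every R5_InJournal page gets R5_Wantwrite again
 *	                   so the data finally reaches the raid disks
 */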
/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		r5c_handle_data_cached(sh);
	} else {
		r5c_handle_parity_cached(sh);
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	}
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		r5c_finish_cache_stripe(sh);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}

static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	submit_bio(io->current_bio);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}
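/*
 * Illustrative note (not in the original source): the meta block checksum
 * above covers the whole 4k meta page, seeded with the array's
 * uuid_checksum, i.e.:
 *
 *	crc = crc32c_le(log->uuid_checksum, page_address(io->meta_page),
 *			PAGE_SIZE);
 *
 * Recovery recomputes the same crc32c to decide whether a meta block at a
 * candidate position is valid.
 */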
static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	r5c_update_log_state(log);
	/*
	 * If we filled up the log device start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a
	 * multiple of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		struct bio *prev = io->current_bio;

		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, prev);

		submit_bio(prev);
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}
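/*
 * Worked example (not in the original source) of the payload accounting
 * above: payload->size is in 512B sectors, so a single-checksum data
 * payload records 1 << (PAGE_SHIFT - 9) = 8 sectors (one 4k page), and a
 * RAID6 parity payload with two checksums records 16 sectors (two pages).
 * Meanwhile io->meta_offset grows by the payload struct plus one or two
 * __le32 checksums, and r5l_get_meta() closes the meta block once the
 * next payload would no longer fit in the 4k page.
 */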
static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (parity_pages == 2) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else if (parity_pages == 1) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	} else /* Just writing data, not parity, in caching phase */
		BUG_ON(parity_pages != 0);

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return 0;

	if (sh->log_start == MaxSector) {
		BUG_ON(!list_empty(&sh->r5c));
		sh->log_start = io->log_start;
		spin_lock_irq(&log->stripe_in_journal_lock);
		list_add_tail(&sh->r5c,
			      &log->stripe_in_journal_list);
		spin_unlock_irq(&log->stripe_in_journal_lock);
		atomic_inc(&log->stripe_in_journal_count);
	}
	return 0;
}

/* add stripe to no_space_stripes, and then wake up reclaim */
static inline void r5l_add_no_space_stripe(struct r5l_log *log,
					   struct stripe_head *sh)
{
	spin_lock(&log->no_space_stripes_lock);
	list_add_tail(&sh->log_list, &log->no_space_stripes);
	spin_unlock(&log->no_space_stripes_lock);
}
/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int write_disks = 0;
	int data_pages, parity_pages;
	int reserve;
	int i;
	int ret = 0;
	bool wake_reclaim = false;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;

		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		if (!r5l_has_free_space(log, reserve)) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	} else {	/* R5C_JOURNAL_MODE_WRITE_BACK */
		/*
		 * log space critical, do not process stripes that are
		 * not in cache yet (sh->log_start == MaxSector).
		 */
		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
		    sh->log_start == MaxSector) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
			reserve = 0;
		} else if (!r5l_has_free_space(log, reserve)) {
			if (sh->log_start == log->last_checkpoint)
				BUG();
			else
				r5l_add_no_space_stripe(log, sh);
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	}

	mutex_unlock(&log->io_mutex);
	if (wake_reclaim)
		r5l_wake_reclaim(log, reserve);
	return 0;
}

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}
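/*
 * Worked example (not in the original source) of the space reservation in
 * r5l_write_stripe(): for a RAID5 stripe writing 5 data pages plus parity,
 * write_disks = 6 and reserve = (1 + 6) << (PAGE_SHIFT - 9) = 7 * 8 = 56
 * sectors: one block for the meta page plus one per journaled page.
 */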
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * we flush log disk cache first, then write stripe data to raid disks.
	 * So if bio is finished, the log disk cache is flushed already. The
	 * recovery guarantees we can recover the bio from the log disk, so we
	 * don't need to flush again.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

/*
 * calculate new last_checkpoint
 * for write through mode, returns log->next_checkpoint
 * for write back, returns log_start of first sh in stripe_in_journal_list
 */
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
	struct stripe_head *sh;
	struct r5l_log *log = conf->log;
	sector_t new_cp;
	unsigned long flags;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return log->next_checkpoint;

	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
	if (list_empty(&conf->log->stripe_in_journal_list)) {
		/* all stripes flushed */
		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
		return log->next_checkpoint;
	}
	sh = list_first_entry(&conf->log->stripe_in_journal_list,
			      struct stripe_head, r5c);
	new_cp = sh->log_start;
	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
	return new_cp;
}

static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;

	return r5l_ring_distance(log, log->last_checkpoint,
				 r5c_calculate_new_cp(conf));
}
static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	assert_spin_locked(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;
		log->next_cp_seq = io->seq;

		list_del(&io->log_sibling);
		mempool_free(io, log->io_pool);
		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	struct r5conf *conf = log->rdev->mddev->private;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space ||
	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}
static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Starting dispatch IO to raid.
 * Each io_unit (meta) makes up a section of the log. There is one situation
 * we want to avoid: a broken meta block in the middle of the log prevents
 * recovery from finding the meta blocks ahead of it. So if an operation
 * requires a meta block to be persistent in the log, we must make sure every
 * meta block before it is persistent in the log too. A case is:
 *
 * stripe data/parity is in the log, and we start writing the stripe to the
 * raid disks. The stripe data/parity must be persistent in the log before we
 * do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. We only write
 * stripes of an io_unit to the raid disks while the io_unit is the first one
 * whose data/parity is already persistent in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure
	 * superblock is updated to new log tail. Updating superblock (either
	 * directly call md_update_sb() or depend on md thread) must hold
	 * reconfig mutex. On the other hand, raid5_quiesce is called with
	 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
	 * for all IO to finish, hence waiting for the reclaim thread, while
	 * the reclaim thread is calling this function and waiting for the
	 * reconfig mutex. So there is a deadlock. We work around this issue
	 * with a trylock.
	 * FIXME: we could miss discard if we can't take reconfig mutex
	 */
	set_mask_bits(&mddev->flags, 0,
		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}
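/*
 * Illustrative example (not in the original source) of the wraparound
 * handling above, with device_size = 1000 sectors: advancing the tail from
 * last_checkpoint = 900 to end = 100 discards [900, 1000) and then
 * [0, 100), both offset by rdev->data_offset; without a wrap
 * (last_checkpoint = 100, end = 300) a single discard of [100, 300)
 * suffices.
 */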
/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	assert_spin_locked(&conf->device_lock);

	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	r5c_make_stripe_write_out(sh);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);
	raid5_release_stripe(sh);
}

/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If less than num full stripes are
 * flushed, flush some partial stripes until totally num stripes are
 * flushed or there are no more cached stripes.
 */
void r5c_flush_cache(struct r5conf *conf, int num)
{
	int count;
	struct stripe_head *sh, *next;

	assert_spin_locked(&conf->device_lock);
	if (!conf->log)
		return;

	count = 0;
	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		count++;
	}

	if (count >= num)
		return;
	list_for_each_entry_safe(sh, next,
				 &conf->r5c_partial_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		if (++count >= num)
			break;
	}
}
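/*
 * Usage sketch (not in the original source): r5c_flush_cache(conf, 0)
 * flushes only the full stripes, while e.g.
 * r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP) flushes all full stripes
 * and then partial stripes until that many have been flushed in total.
 * Callers must hold conf->device_lock.
 */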
1177*a39f7afdSSong Liu 		/*
1178*a39f7afdSSong Liu 		 * if stripe cache pressure is high, flush all full stripes
1179*a39f7afdSSong Liu 		 * and some partial stripes
1180*a39f7afdSSong Liu 		 */
1181*a39f7afdSSong Liu 		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
1182*a39f7afdSSong Liu 	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
1183*a39f7afdSSong Liu 		 atomic_read(&conf->r5c_cached_full_stripes) >
1184*a39f7afdSSong Liu 		 R5C_FULL_STRIPE_FLUSH_BATCH)
1185*a39f7afdSSong Liu 		/*
1186*a39f7afdSSong Liu 		 * if stripe cache pressure is moderate, or if there are many
1187*a39f7afdSSong Liu 		 * full stripes, flush all full stripes
1188*a39f7afdSSong Liu 		 */
1189*a39f7afdSSong Liu 		stripes_to_flush = 0;
1190*a39f7afdSSong Liu 	else
1191*a39f7afdSSong Liu 		/* no need to flush */
1192*a39f7afdSSong Liu 		stripes_to_flush = -1;
1193*a39f7afdSSong Liu 
1194*a39f7afdSSong Liu 	if (stripes_to_flush >= 0) {
1195*a39f7afdSSong Liu 		spin_lock_irqsave(&conf->device_lock, flags);
1196*a39f7afdSSong Liu 		r5c_flush_cache(conf, stripes_to_flush);
1197*a39f7afdSSong Liu 		spin_unlock_irqrestore(&conf->device_lock, flags);
1198*a39f7afdSSong Liu 	}
1199*a39f7afdSSong Liu 
1200*a39f7afdSSong Liu 	/* if log space is tight, flush stripes on stripe_in_journal_list */
1201*a39f7afdSSong Liu 	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
1202*a39f7afdSSong Liu 		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
1203*a39f7afdSSong Liu 		spin_lock(&conf->device_lock);
1204*a39f7afdSSong Liu 		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
1205*a39f7afdSSong Liu 			/*
1206*a39f7afdSSong Liu 			 * stripes on stripe_in_journal_list could be in any
1207*a39f7afdSSong Liu 			 * state of the stripe_cache state machine. In this
1208*a39f7afdSSong Liu 			 * case, we only want to flush stripes on
1209*a39f7afdSSong Liu 			 * r5c_cached_full/partial_stripes. The following
1210*a39f7afdSSong Liu 			 * condition makes sure the stripe is on one of the
1211*a39f7afdSSong Liu 			 * two lists.
1212*a39f7afdSSong Liu 			 */
1213*a39f7afdSSong Liu 			if (!list_empty(&sh->lru) &&
1214*a39f7afdSSong Liu 			    !test_bit(STRIPE_HANDLE, &sh->state) &&
1215*a39f7afdSSong Liu 			    atomic_read(&sh->count) == 0) {
1216*a39f7afdSSong Liu 				r5c_flush_stripe(conf, sh);
1217*a39f7afdSSong Liu 			}
1218*a39f7afdSSong Liu 			if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
1219*a39f7afdSSong Liu 				break;
1220*a39f7afdSSong Liu 		}
1221*a39f7afdSSong Liu 		spin_unlock(&conf->device_lock);
1222*a39f7afdSSong Liu 		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
1223*a39f7afdSSong Liu 	}
1224*a39f7afdSSong Liu 	md_wakeup_thread(conf->mddev->thread);
1225*a39f7afdSSong Liu }
1226*a39f7afdSSong Liu 
12270576b1c6SShaohua Li static void r5l_do_reclaim(struct r5l_log *log)
12280576b1c6SShaohua Li {
1229*a39f7afdSSong Liu 	struct r5conf *conf = log->rdev->mddev->private;
12300576b1c6SShaohua Li 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
123117036461SChristoph Hellwig 	sector_t reclaimable;
123217036461SChristoph Hellwig 	sector_t next_checkpoint;
1233*a39f7afdSSong Liu 	bool write_super;
12340576b1c6SShaohua Li 
12350576b1c6SShaohua Li 	spin_lock_irq(&log->io_list_lock);
1236*a39f7afdSSong Liu 	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
1237*a39f7afdSSong Liu 		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
12380576b1c6SShaohua Li 	/*
12390576b1c6SShaohua Li 	 * move the proper io_units to the reclaim list. We should not change
12400576b1c6SShaohua Li 	 * the order: reclaimable and unreclaimable io_units can be mixed in
12410576b1c6SShaohua Li 	 * the list; we shouldn't reuse space of an unreclaimable io_unit
12420576b1c6SShaohua Li 	 */
12430576b1c6SShaohua Li 	while (1) {
124417036461SChristoph Hellwig 		reclaimable = r5l_reclaimable_space(log);
124517036461SChristoph Hellwig 		if (reclaimable >= reclaim_target ||
12460576b1c6SShaohua Li 		    (list_empty(&log->running_ios) &&
12470576b1c6SShaohua Li 		     list_empty(&log->io_end_ios) &&
1248a8c34f91SShaohua Li 		     list_empty(&log->flushing_ios) &&
124904732f74SChristoph Hellwig 		     list_empty(&log->finished_ios)))
12500576b1c6SShaohua Li 			break;
12510576b1c6SShaohua Li 
125217036461SChristoph Hellwig 		md_wakeup_thread(log->rdev->mddev->thread);
125317036461SChristoph Hellwig 		wait_event_lock_irq(log->iounit_wait,
125417036461SChristoph Hellwig 				    r5l_reclaimable_space(log) > reclaimable,
125517036461SChristoph Hellwig 				    log->io_list_lock);
12560576b1c6SShaohua Li 	}
125717036461SChristoph Hellwig 
1258*a39f7afdSSong Liu 	next_checkpoint = r5c_calculate_new_cp(conf);
12590576b1c6SShaohua Li 	spin_unlock_irq(&log->io_list_lock);
12600576b1c6SShaohua Li 
126117036461SChristoph Hellwig 	BUG_ON(reclaimable < 0);
1262*a39f7afdSSong Liu 
1263*a39f7afdSSong Liu 	if (reclaimable == 0 || !write_super)
12640576b1c6SShaohua Li 		return;
12650576b1c6SShaohua Li 
12660576b1c6SShaohua Li 	/*
12670576b1c6SShaohua Li 	 * write_super will flush the cache of each raid disk. We must write
12680576b1c6SShaohua Li 	 * super here, because the log area might be reused soon and we don't
12690576b1c6SShaohua Li 	 * want to confuse recovery
12700576b1c6SShaohua Li 	 */
12714b482044SShaohua Li 	r5l_write_super_and_discard_space(log, next_checkpoint);
12720576b1c6SShaohua Li 
12730576b1c6SShaohua Li 	mutex_lock(&log->io_mutex);
127417036461SChristoph Hellwig 	log->last_checkpoint = next_checkpoint;
1275*a39f7afdSSong Liu 	r5c_update_log_state(log);
12760576b1c6SShaohua Li 	mutex_unlock(&log->io_mutex);
12770576b1c6SShaohua Li 
127817036461SChristoph Hellwig 	r5l_run_no_space_stripes(log);
12790576b1c6SShaohua Li }
12800576b1c6SShaohua Li 
12810576b1c6SShaohua Li static void r5l_reclaim_thread(struct md_thread *thread)
12820576b1c6SShaohua Li {
12830576b1c6SShaohua Li 	struct mddev *mddev = thread->mddev;
12840576b1c6SShaohua Li 	struct r5conf *conf = mddev->private;
12850576b1c6SShaohua Li 	struct r5l_log *log = conf->log;
12860576b1c6SShaohua Li 
12870576b1c6SShaohua Li 	if (!log)
12880576b1c6SShaohua Li 		return;
1289*a39f7afdSSong Liu 	r5c_do_reclaim(conf);
12900576b1c6SShaohua Li 	r5l_do_reclaim(log);
12910576b1c6SShaohua Li }
12920576b1c6SShaohua Li 
1293*a39f7afdSSong Liu void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
1294f6bed0efSShaohua Li {
12950576b1c6SShaohua Li 	unsigned long target;
12960576b1c6SShaohua Li 	unsigned long new = (unsigned long)space; /* overflow in theory */
12970576b1c6SShaohua Li 
1298*a39f7afdSSong Liu 	if (!log)
1299*a39f7afdSSong Liu 		return;
13000576b1c6SShaohua Li 	do {
13010576b1c6SShaohua Li 		target = log->reclaim_target;
13020576b1c6SShaohua Li 		if (new < target)
13030576b1c6SShaohua Li 			return;
13040576b1c6SShaohua Li 	} while (cmpxchg(&log->reclaim_target, target, new) != target);
13050576b1c6SShaohua Li 	md_wakeup_thread(log->reclaim_thread);
1306f6bed0efSShaohua Li }
1307f6bed0efSShaohua Li 
1308e6c033f7SShaohua Li void r5l_quiesce(struct r5l_log *log, int state)
1309e6c033f7SShaohua Li {
13104b482044SShaohua Li 	struct mddev *mddev;
1311e6c033f7SShaohua Li 	if (!log || state == 2)
1312e6c033f7SShaohua Li 		return;
1313e6c033f7SShaohua Li 	if (state
== 0) { 131416a43f6aSShaohua Li /* 131516a43f6aSShaohua Li * This is a special case for hotadd. In suspend, the array has 131616a43f6aSShaohua Li * no journal. In resume, journal is initialized as well as the 131716a43f6aSShaohua Li * reclaim thread. 131816a43f6aSShaohua Li */ 131916a43f6aSShaohua Li if (log->reclaim_thread) 132016a43f6aSShaohua Li return; 1321e6c033f7SShaohua Li log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 1322e6c033f7SShaohua Li log->rdev->mddev, "reclaim"); 1323*a39f7afdSSong Liu log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 1324e6c033f7SShaohua Li } else if (state == 1) { 13254b482044SShaohua Li /* make sure r5l_write_super_and_discard_space exits */ 13264b482044SShaohua Li mddev = log->rdev->mddev; 13274b482044SShaohua Li wake_up(&mddev->sb_wait); 1328*a39f7afdSSong Liu r5l_wake_reclaim(log, MaxSector); 1329e6c033f7SShaohua Li md_unregister_thread(&log->reclaim_thread); 1330e6c033f7SShaohua Li r5l_do_reclaim(log); 1331e6c033f7SShaohua Li } 1332e6c033f7SShaohua Li } 1333e6c033f7SShaohua Li 13346e74a9cfSShaohua Li bool r5l_log_disk_error(struct r5conf *conf) 13356e74a9cfSShaohua Li { 1336f6b6ec5cSShaohua Li struct r5l_log *log; 1337f6b6ec5cSShaohua Li bool ret; 13387dde2ad3SShaohua Li /* don't allow write if journal disk is missing */ 1339f6b6ec5cSShaohua Li rcu_read_lock(); 1340f6b6ec5cSShaohua Li log = rcu_dereference(conf->log); 1341f6b6ec5cSShaohua Li 1342f6b6ec5cSShaohua Li if (!log) 1343f6b6ec5cSShaohua Li ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1344f6b6ec5cSShaohua Li else 1345f6b6ec5cSShaohua Li ret = test_bit(Faulty, &log->rdev->flags); 1346f6b6ec5cSShaohua Li rcu_read_unlock(); 1347f6b6ec5cSShaohua Li return ret; 13486e74a9cfSShaohua Li } 13496e74a9cfSShaohua Li 1350355810d1SShaohua Li struct r5l_recovery_ctx { 1351355810d1SShaohua Li struct page *meta_page; /* current meta */ 1352355810d1SShaohua Li sector_t meta_total_blocks; /* total size of current meta and data */ 1353355810d1SShaohua Li sector_t pos; /* recovery position */ 1354355810d1SShaohua Li u64 seq; /* recovery position seq */ 1355355810d1SShaohua Li }; 1356355810d1SShaohua Li 1357355810d1SShaohua Li static int r5l_read_meta_block(struct r5l_log *log, 1358355810d1SShaohua Li struct r5l_recovery_ctx *ctx) 1359355810d1SShaohua Li { 1360355810d1SShaohua Li struct page *page = ctx->meta_page; 1361355810d1SShaohua Li struct r5l_meta_block *mb; 1362355810d1SShaohua Li u32 crc, stored_crc; 1363355810d1SShaohua Li 1364796a5cf0SMike Christie if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0, 1365796a5cf0SMike Christie false)) 1366355810d1SShaohua Li return -EIO; 1367355810d1SShaohua Li 1368355810d1SShaohua Li mb = page_address(page); 1369355810d1SShaohua Li stored_crc = le32_to_cpu(mb->checksum); 1370355810d1SShaohua Li mb->checksum = 0; 1371355810d1SShaohua Li 1372355810d1SShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1373355810d1SShaohua Li le64_to_cpu(mb->seq) != ctx->seq || 1374355810d1SShaohua Li mb->version != R5LOG_VERSION || 1375355810d1SShaohua Li le64_to_cpu(mb->position) != ctx->pos) 1376355810d1SShaohua Li return -EINVAL; 1377355810d1SShaohua Li 13785cb2fbd6SShaohua Li crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1379355810d1SShaohua Li if (stored_crc != crc) 1380355810d1SShaohua Li return -EINVAL; 1381355810d1SShaohua Li 1382355810d1SShaohua Li if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 1383355810d1SShaohua Li return -EINVAL; 1384355810d1SShaohua Li 1385355810d1SShaohua Li ctx->meta_total_blocks = BLOCK_SECTORS; 
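	/*
	 * At this point only the meta block itself is accounted for;
	 * r5l_recovery_flush_one_stripe() grows ctx->meta_total_blocks as it
	 * walks the data/parity payloads described by this meta block.
	 */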
1386355810d1SShaohua Li 1387355810d1SShaohua Li return 0; 1388355810d1SShaohua Li } 1389355810d1SShaohua Li 1390355810d1SShaohua Li static int r5l_recovery_flush_one_stripe(struct r5l_log *log, 1391355810d1SShaohua Li struct r5l_recovery_ctx *ctx, 1392355810d1SShaohua Li sector_t stripe_sect, 13933fd880afSJackieLiu int *offset) 1394355810d1SShaohua Li { 1395355810d1SShaohua Li struct r5conf *conf = log->rdev->mddev->private; 1396355810d1SShaohua Li struct stripe_head *sh; 1397355810d1SShaohua Li struct r5l_payload_data_parity *payload; 1398355810d1SShaohua Li int disk_index; 1399355810d1SShaohua Li 1400355810d1SShaohua Li sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); 1401355810d1SShaohua Li while (1) { 14023fd880afSJackieLiu sector_t log_offset = r5l_ring_add(log, ctx->pos, 14033fd880afSJackieLiu ctx->meta_total_blocks); 1404355810d1SShaohua Li payload = page_address(ctx->meta_page) + *offset; 1405355810d1SShaohua Li 1406355810d1SShaohua Li if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 1407355810d1SShaohua Li raid5_compute_sector(conf, 1408355810d1SShaohua Li le64_to_cpu(payload->location), 0, 1409355810d1SShaohua Li &disk_index, sh); 1410355810d1SShaohua Li 14113fd880afSJackieLiu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1412796a5cf0SMike Christie sh->dev[disk_index].page, REQ_OP_READ, 0, 1413796a5cf0SMike Christie false); 1414355810d1SShaohua Li sh->dev[disk_index].log_checksum = 1415355810d1SShaohua Li le32_to_cpu(payload->checksum[0]); 1416355810d1SShaohua Li set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); 1417355810d1SShaohua Li } else { 1418355810d1SShaohua Li disk_index = sh->pd_idx; 14193fd880afSJackieLiu sync_page_io(log->rdev, log_offset, PAGE_SIZE, 1420796a5cf0SMike Christie sh->dev[disk_index].page, REQ_OP_READ, 0, 1421796a5cf0SMike Christie false); 1422355810d1SShaohua Li sh->dev[disk_index].log_checksum = 1423355810d1SShaohua Li le32_to_cpu(payload->checksum[0]); 1424355810d1SShaohua Li set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); 1425355810d1SShaohua Li 1426355810d1SShaohua Li if (sh->qd_idx >= 0) { 1427355810d1SShaohua Li disk_index = sh->qd_idx; 1428355810d1SShaohua Li sync_page_io(log->rdev, 14293fd880afSJackieLiu r5l_ring_add(log, log_offset, BLOCK_SECTORS), 1430355810d1SShaohua Li PAGE_SIZE, sh->dev[disk_index].page, 1431796a5cf0SMike Christie REQ_OP_READ, 0, false); 1432355810d1SShaohua Li sh->dev[disk_index].log_checksum = 1433355810d1SShaohua Li le32_to_cpu(payload->checksum[1]); 1434355810d1SShaohua Li set_bit(R5_Wantwrite, 1435355810d1SShaohua Li &sh->dev[disk_index].flags); 1436355810d1SShaohua Li } 1437355810d1SShaohua Li } 1438355810d1SShaohua Li 14393fd880afSJackieLiu ctx->meta_total_blocks += le32_to_cpu(payload->size); 1440355810d1SShaohua Li *offset += sizeof(struct r5l_payload_data_parity) + 1441355810d1SShaohua Li sizeof(__le32) * 1442355810d1SShaohua Li (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 1443355810d1SShaohua Li if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) 1444355810d1SShaohua Li break; 1445355810d1SShaohua Li } 1446355810d1SShaohua Li 1447355810d1SShaohua Li for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1448355810d1SShaohua Li void *addr; 1449355810d1SShaohua Li u32 checksum; 1450355810d1SShaohua Li 1451355810d1SShaohua Li if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 1452355810d1SShaohua Li continue; 1453355810d1SShaohua Li addr = kmap_atomic(sh->dev[disk_index].page); 14545cb2fbd6SShaohua Li checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 
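		/*
		 * Compare the checksum of the data just read back from the
		 * log with the checksum recorded in the payload. A mismatch
		 * typically means the log tail was torn by a crash, so
		 * recovery stops at this meta block instead of replaying
		 * possibly bogus data to the raid disks.
		 */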
1455355810d1SShaohua Li kunmap_atomic(addr); 1456355810d1SShaohua Li if (checksum != sh->dev[disk_index].log_checksum) 1457355810d1SShaohua Li goto error; 1458355810d1SShaohua Li } 1459355810d1SShaohua Li 1460355810d1SShaohua Li for (disk_index = 0; disk_index < sh->disks; disk_index++) { 1461355810d1SShaohua Li struct md_rdev *rdev, *rrdev; 1462355810d1SShaohua Li 1463355810d1SShaohua Li if (!test_and_clear_bit(R5_Wantwrite, 1464355810d1SShaohua Li &sh->dev[disk_index].flags)) 1465355810d1SShaohua Li continue; 1466355810d1SShaohua Li 1467355810d1SShaohua Li /* in case device is broken */ 1468354b445bSShaohua Li rcu_read_lock(); 1469355810d1SShaohua Li rdev = rcu_dereference(conf->disks[disk_index].rdev); 1470354b445bSShaohua Li if (rdev) { 1471354b445bSShaohua Li atomic_inc(&rdev->nr_pending); 1472354b445bSShaohua Li rcu_read_unlock(); 1473355810d1SShaohua Li sync_page_io(rdev, stripe_sect, PAGE_SIZE, 1474796a5cf0SMike Christie sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1475796a5cf0SMike Christie false); 1476354b445bSShaohua Li rdev_dec_pending(rdev, rdev->mddev); 1477354b445bSShaohua Li rcu_read_lock(); 1478354b445bSShaohua Li } 1479355810d1SShaohua Li rrdev = rcu_dereference(conf->disks[disk_index].replacement); 1480354b445bSShaohua Li if (rrdev) { 1481354b445bSShaohua Li atomic_inc(&rrdev->nr_pending); 1482354b445bSShaohua Li rcu_read_unlock(); 1483355810d1SShaohua Li sync_page_io(rrdev, stripe_sect, PAGE_SIZE, 1484796a5cf0SMike Christie sh->dev[disk_index].page, REQ_OP_WRITE, 0, 1485796a5cf0SMike Christie false); 1486354b445bSShaohua Li rdev_dec_pending(rrdev, rrdev->mddev); 1487354b445bSShaohua Li rcu_read_lock(); 1488354b445bSShaohua Li } 1489354b445bSShaohua Li rcu_read_unlock(); 1490355810d1SShaohua Li } 1491355810d1SShaohua Li raid5_release_stripe(sh); 1492355810d1SShaohua Li return 0; 1493355810d1SShaohua Li 1494355810d1SShaohua Li error: 1495355810d1SShaohua Li for (disk_index = 0; disk_index < sh->disks; disk_index++) 1496355810d1SShaohua Li sh->dev[disk_index].flags = 0; 1497355810d1SShaohua Li raid5_release_stripe(sh); 1498355810d1SShaohua Li return -EINVAL; 1499355810d1SShaohua Li } 1500355810d1SShaohua Li 1501355810d1SShaohua Li static int r5l_recovery_flush_one_meta(struct r5l_log *log, 1502355810d1SShaohua Li struct r5l_recovery_ctx *ctx) 1503355810d1SShaohua Li { 1504355810d1SShaohua Li struct r5conf *conf = log->rdev->mddev->private; 1505355810d1SShaohua Li struct r5l_payload_data_parity *payload; 1506355810d1SShaohua Li struct r5l_meta_block *mb; 1507355810d1SShaohua Li int offset; 1508355810d1SShaohua Li sector_t stripe_sector; 1509355810d1SShaohua Li 1510355810d1SShaohua Li mb = page_address(ctx->meta_page); 1511355810d1SShaohua Li offset = sizeof(struct r5l_meta_block); 1512355810d1SShaohua Li 1513355810d1SShaohua Li while (offset < le32_to_cpu(mb->meta_size)) { 1514355810d1SShaohua Li int dd; 1515355810d1SShaohua Li 1516355810d1SShaohua Li payload = (void *)mb + offset; 1517355810d1SShaohua Li stripe_sector = raid5_compute_sector(conf, 1518355810d1SShaohua Li le64_to_cpu(payload->location), 0, &dd, NULL); 1519355810d1SShaohua Li if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, 15203fd880afSJackieLiu &offset)) 1521355810d1SShaohua Li return -EINVAL; 1522355810d1SShaohua Li } 1523355810d1SShaohua Li return 0; 1524355810d1SShaohua Li } 1525355810d1SShaohua Li 1526355810d1SShaohua Li /* copy data/parity from log to raid disks */ 1527355810d1SShaohua Li static void r5l_recovery_flush_log(struct r5l_log *log, 1528355810d1SShaohua Li struct r5l_recovery_ctx 
*ctx)
1529355810d1SShaohua Li {
1530355810d1SShaohua Li 	while (1) {
1531355810d1SShaohua Li 		if (r5l_read_meta_block(log, ctx))
1532355810d1SShaohua Li 			return;
1533355810d1SShaohua Li 		if (r5l_recovery_flush_one_meta(log, ctx))
1534355810d1SShaohua Li 			return;
1535355810d1SShaohua Li 		ctx->seq++;
1536355810d1SShaohua Li 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
1537355810d1SShaohua Li 	}
1538355810d1SShaohua Li }
1539355810d1SShaohua Li 
1540355810d1SShaohua Li static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
1541355810d1SShaohua Li 					  u64 seq)
1542355810d1SShaohua Li {
1543355810d1SShaohua Li 	struct page *page;
1544355810d1SShaohua Li 	struct r5l_meta_block *mb;
1545355810d1SShaohua Li 	u32 crc;
1546355810d1SShaohua Li 
1547355810d1SShaohua Li 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1548355810d1SShaohua Li 	if (!page)
1549355810d1SShaohua Li 		return -ENOMEM;
1550355810d1SShaohua Li 	mb = page_address(page);
1551355810d1SShaohua Li 	mb->magic = cpu_to_le32(R5LOG_MAGIC);
1552355810d1SShaohua Li 	mb->version = R5LOG_VERSION;
1553355810d1SShaohua Li 	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
1554355810d1SShaohua Li 	mb->seq = cpu_to_le64(seq);
1555355810d1SShaohua Li 	mb->position = cpu_to_le64(pos);
15565cb2fbd6SShaohua Li 	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
1557355810d1SShaohua Li 	mb->checksum = cpu_to_le32(crc);
1558355810d1SShaohua Li 
1559796a5cf0SMike Christie 	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
1560796a5cf0SMike Christie 			  WRITE_FUA, false)) {
1561355810d1SShaohua Li 		__free_page(page);
1562355810d1SShaohua Li 		return -EIO;
1563355810d1SShaohua Li 	}
1564355810d1SShaohua Li 	__free_page(page);
1565355810d1SShaohua Li 	return 0;
1566355810d1SShaohua Li }
1567355810d1SShaohua Li 
1568f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log)
1569f6bed0efSShaohua Li {
1570355810d1SShaohua Li 	struct r5l_recovery_ctx ctx;
1571355810d1SShaohua Li 
1572355810d1SShaohua Li 	ctx.pos = log->last_checkpoint;
1573355810d1SShaohua Li 	ctx.seq = log->last_cp_seq;
1574355810d1SShaohua Li 	ctx.meta_page = alloc_page(GFP_KERNEL);
1575355810d1SShaohua Li 	if (!ctx.meta_page)
1576355810d1SShaohua Li 		return -ENOMEM;
1577355810d1SShaohua Li 
1578355810d1SShaohua Li 	r5l_recovery_flush_log(log, &ctx);
1579355810d1SShaohua Li 	__free_page(ctx.meta_page);
1580355810d1SShaohua Li 
1581355810d1SShaohua Li 	/*
1582355810d1SShaohua Li 	 * We did a recovery. Now ctx.pos points to an invalid meta block. The
1583355810d1SShaohua Li 	 * new log will start here, but we can't let the superblock point to
1584355810d1SShaohua Li 	 * the last valid meta block. The log might look like:
1585355810d1SShaohua Li 	 * | meta 1| meta 2| meta 3|
1586355810d1SShaohua Li 	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
1587355810d1SShaohua Li 	 * superblock points to meta 1, we write a new valid meta 2n. If a
1588355810d1SShaohua Li 	 * crash happens again, the new recovery will start from meta 1. Since
1589355810d1SShaohua Li 	 * meta 2n is valid now, recovery will think meta 3 is valid, which is
1590355810d1SShaohua Li 	 * wrong. The solution: we create a new meta in meta 2 with its seq ==
1591355810d1SShaohua Li 	 * meta 1's seq + 10 and let the superblock point to meta 2. The same recovery
1592355810d1SShaohua Li 	 * will not think meta 3 is a valid meta, because its seq doesn't match
1593355810d1SShaohua Li 	 */
15949a8b27faSShaohua Li 	if (ctx.seq > log->last_cp_seq) {
1595355810d1SShaohua Li 		int ret;
1596355810d1SShaohua Li 
1597355810d1SShaohua Li 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
1598355810d1SShaohua Li 		if (ret)
1599355810d1SShaohua Li 			return ret;
1600355810d1SShaohua Li 		log->seq = ctx.seq + 11;
1601355810d1SShaohua Li 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
1602355810d1SShaohua Li 		r5l_write_super(log, ctx.pos);
160328cd88e2SZhengyuan Liu 		log->last_checkpoint = ctx.pos;
160428cd88e2SZhengyuan Liu 		log->next_checkpoint = ctx.pos;
1605355810d1SShaohua Li 	} else {
1606355810d1SShaohua Li 		log->log_start = ctx.pos;
1607355810d1SShaohua Li 		log->seq = ctx.seq;
1608355810d1SShaohua Li 	}
1609f6bed0efSShaohua Li 	return 0;
1610f6bed0efSShaohua Li }
1611f6bed0efSShaohua Li 
1612f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp)
1613f6bed0efSShaohua Li {
1614f6bed0efSShaohua Li 	struct mddev *mddev = log->rdev->mddev;
1615f6bed0efSShaohua Li 
1616f6bed0efSShaohua Li 	log->rdev->journal_tail = cp;
1617f6bed0efSShaohua Li 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1618f6bed0efSShaohua Li }
1619f6bed0efSShaohua Li 
16202ded3703SSong Liu /*
16212ded3703SSong Liu  * Try to handle the write operation in caching phase. This function should
16222ded3703SSong Liu  * only be called in write-back mode.
16232ded3703SSong Liu  *
16242ded3703SSong Liu  * If all outstanding writes can be handled in caching phase, returns 0.
16252ded3703SSong Liu  * If the writes require write-out phase, calls r5c_make_stripe_write_out()
16262ded3703SSong Liu  * and returns -EAGAIN.
16272ded3703SSong Liu  */
16282ded3703SSong Liu int r5c_try_caching_write(struct r5conf *conf,
16292ded3703SSong Liu 			  struct stripe_head *sh,
16302ded3703SSong Liu 			  struct stripe_head_state *s,
16312ded3703SSong Liu 			  int disks)
16322ded3703SSong Liu {
16332ded3703SSong Liu 	struct r5l_log *log = conf->log;
16341e6d690bSSong Liu 	int i;
16351e6d690bSSong Liu 	struct r5dev *dev;
16361e6d690bSSong Liu 	int to_cache = 0;
16372ded3703SSong Liu 
16382ded3703SSong Liu 	BUG_ON(!r5c_is_writeback(log));
16391e6d690bSSong Liu 
16401e6d690bSSong Liu 	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
16411e6d690bSSong Liu 		/*
16421e6d690bSSong Liu 		 * There are two different scenarios here:
16431e6d690bSSong Liu 		 *  1. The stripe has some data cached, and it is sent to
16441e6d690bSSong Liu 		 *     the write-out phase for reclaim
16451e6d690bSSong Liu 		 *  2. The stripe is clean, and this is the first write
16461e6d690bSSong Liu 		 *
16471e6d690bSSong Liu 		 * For 1, return -EAGAIN, so we continue with
16481e6d690bSSong Liu 		 * handle_stripe_dirtying().
16491e6d690bSSong Liu 		 *
16501e6d690bSSong Liu 		 * For 2, set STRIPE_R5C_CACHING and continue with caching
16511e6d690bSSong Liu 		 * write.
16521e6d690bSSong Liu 		 */
16531e6d690bSSong Liu 
16541e6d690bSSong Liu 		/* case 1: anything in journal (s->injournal) or written (s->written) */
16551e6d690bSSong Liu 		if (s->injournal > 0 || s->written > 0)
16561e6d690bSSong Liu 			return -EAGAIN;
16571e6d690bSSong Liu 		/* case 2 */
16581e6d690bSSong Liu 		set_bit(STRIPE_R5C_CACHING, &sh->state);
16591e6d690bSSong Liu 	}
16601e6d690bSSong Liu 
16611e6d690bSSong Liu 	for (i = disks; i--; ) {
16621e6d690bSSong Liu 		dev = &sh->dev[i];
16631e6d690bSSong Liu 		/* if non-overwrite, use the writing-out phase */
16641e6d690bSSong Liu 		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
16651e6d690bSSong Liu 		    !test_bit(R5_InJournal, &dev->flags)) {
16662ded3703SSong Liu 			r5c_make_stripe_write_out(sh);
16672ded3703SSong Liu 			return -EAGAIN;
16682ded3703SSong Liu 		}
16691e6d690bSSong Liu 	}
16701e6d690bSSong Liu 
16711e6d690bSSong Liu 	for (i = disks; i--; ) {
16721e6d690bSSong Liu 		dev = &sh->dev[i];
16731e6d690bSSong Liu 		if (dev->towrite) {
16741e6d690bSSong Liu 			set_bit(R5_Wantwrite, &dev->flags);
16751e6d690bSSong Liu 			set_bit(R5_Wantdrain, &dev->flags);
16761e6d690bSSong Liu 			set_bit(R5_LOCKED, &dev->flags);
16771e6d690bSSong Liu 			to_cache++;
16781e6d690bSSong Liu 		}
16791e6d690bSSong Liu 	}
16801e6d690bSSong Liu 
16811e6d690bSSong Liu 	if (to_cache) {
16821e6d690bSSong Liu 		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
16831e6d690bSSong Liu 		/*
16841e6d690bSSong Liu 		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
16851e6d690bSSong Liu 		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
16861e6d690bSSong Liu 		 * r5c_handle_data_cached()
16871e6d690bSSong Liu 		 */
16881e6d690bSSong Liu 		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
16891e6d690bSSong Liu 	}
16901e6d690bSSong Liu 
16911e6d690bSSong Liu 	return 0;
16921e6d690bSSong Liu }
16931e6d690bSSong Liu 
16941e6d690bSSong Liu /*
16951e6d690bSSong Liu  * free the extra pages (orig_page) we allocated for prexor
16961e6d690bSSong Liu  */
16971e6d690bSSong Liu void r5c_release_extra_page(struct stripe_head *sh)
16981e6d690bSSong Liu {
16991e6d690bSSong Liu 	int i;
17001e6d690bSSong Liu 
17011e6d690bSSong Liu 	for (i = sh->disks; i--; )
17021e6d690bSSong Liu 		if (sh->dev[i].page != sh->dev[i].orig_page) {
17031e6d690bSSong Liu 			struct page *p = sh->dev[i].orig_page;
17041e6d690bSSong Liu 
17051e6d690bSSong Liu 			sh->dev[i].orig_page = sh->dev[i].page;
17061e6d690bSSong Liu 			put_page(p);
17071e6d690bSSong Liu 		}
17081e6d690bSSong Liu }
17092ded3703SSong Liu 
17102ded3703SSong Liu /*
17112ded3703SSong Liu  * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
17122ded3703SSong Liu  * stripe is committed to RAID disks.
17132ded3703SSong Liu  */
17142ded3703SSong Liu void r5c_finish_stripe_write_out(struct r5conf *conf,
17152ded3703SSong Liu 				 struct stripe_head *sh,
17162ded3703SSong Liu 				 struct stripe_head_state *s)
17172ded3703SSong Liu {
17181e6d690bSSong Liu 	int i;
17191e6d690bSSong Liu 	int do_wakeup = 0;
17201e6d690bSSong Liu 
17212ded3703SSong Liu 	if (!conf->log ||
17222ded3703SSong Liu 	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
17232ded3703SSong Liu 		return;
17242ded3703SSong Liu 
17252ded3703SSong Liu 	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
17262ded3703SSong Liu 	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
17272ded3703SSong Liu 
17282ded3703SSong Liu 	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
17292ded3703SSong Liu 		return;
17301e6d690bSSong Liu 
17311e6d690bSSong Liu 	for (i = sh->disks; i--; ) {
17321e6d690bSSong Liu 		clear_bit(R5_InJournal, &sh->dev[i].flags);
17331e6d690bSSong Liu 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
17341e6d690bSSong Liu 			do_wakeup = 1;
17351e6d690bSSong Liu 	}
17361e6d690bSSong Liu 
17371e6d690bSSong Liu 	/*
17381e6d690bSSong Liu 	 * analyse_stripe() runs before r5c_finish_stripe_write_out().
17391e6d690bSSong Liu 	 * We have just updated R5_InJournal, so we also update s->injournal.
17401e6d690bSSong Liu 	 */
17411e6d690bSSong Liu 	s->injournal = 0;
17421e6d690bSSong Liu 
17431e6d690bSSong Liu 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
17441e6d690bSSong Liu 		if (atomic_dec_and_test(&conf->pending_full_writes))
17451e6d690bSSong Liu 			md_wakeup_thread(conf->mddev->thread);
17461e6d690bSSong Liu 
17471e6d690bSSong Liu 	if (do_wakeup)
17481e6d690bSSong Liu 		wake_up(&conf->wait_for_overlap);
1749*a39f7afdSSong Liu 
1750*a39f7afdSSong Liu 	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
1751*a39f7afdSSong Liu 		return;
1752*a39f7afdSSong Liu 
1753*a39f7afdSSong Liu 	spin_lock_irq(&conf->log->stripe_in_journal_lock);
1754*a39f7afdSSong Liu 	list_del_init(&sh->r5c);
1755*a39f7afdSSong Liu 	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
1756*a39f7afdSSong Liu 	sh->log_start = MaxSector;
1757*a39f7afdSSong Liu 	atomic_dec(&conf->log->stripe_in_journal_count);
17581e6d690bSSong Liu }
17591e6d690bSSong Liu 
17601e6d690bSSong Liu int
17611e6d690bSSong Liu r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
17621e6d690bSSong Liu 	       struct stripe_head_state *s)
17631e6d690bSSong Liu {
1764*a39f7afdSSong Liu 	struct r5conf *conf = sh->raid_conf;
17651e6d690bSSong Liu 	int pages = 0;
17661e6d690bSSong Liu 	int reserve;
17671e6d690bSSong Liu 	int i;
17681e6d690bSSong Liu 	int ret = 0;
17691e6d690bSSong Liu 
17701e6d690bSSong Liu 	BUG_ON(!log);
17711e6d690bSSong Liu 
17721e6d690bSSong Liu 	for (i = 0; i < sh->disks; i++) {
17731e6d690bSSong Liu 		void *addr;
17741e6d690bSSong Liu 
17751e6d690bSSong Liu 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
17761e6d690bSSong Liu 			continue;
17771e6d690bSSong Liu 		addr = kmap_atomic(sh->dev[i].page);
17781e6d690bSSong Liu 		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
17791e6d690bSSong Liu 						    addr, PAGE_SIZE);
17801e6d690bSSong Liu 		kunmap_atomic(addr);
17811e6d690bSSong Liu 		pages++;
17821e6d690bSSong Liu 	}
17831e6d690bSSong Liu 	WARN_ON(pages == 0);
17841e6d690bSSong Liu 
17851e6d690bSSong Liu 	/*
17861e6d690bSSong Liu 	 * The stripe must enter the state machine again to call endio, so
17871e6d690bSSong Liu 	 * don't delay.
17881e6d690bSSong Liu */ 17891e6d690bSSong Liu clear_bit(STRIPE_DELAYED, &sh->state); 17901e6d690bSSong Liu atomic_inc(&sh->count); 17911e6d690bSSong Liu 17921e6d690bSSong Liu mutex_lock(&log->io_mutex); 17931e6d690bSSong Liu /* meta + data */ 17941e6d690bSSong Liu reserve = (1 + pages) << (PAGE_SHIFT - 9); 17951e6d690bSSong Liu 1796*a39f7afdSSong Liu if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 1797*a39f7afdSSong Liu sh->log_start == MaxSector) 1798*a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 1799*a39f7afdSSong Liu else if (!r5l_has_free_space(log, reserve)) { 1800*a39f7afdSSong Liu if (sh->log_start == log->last_checkpoint) 1801*a39f7afdSSong Liu BUG(); 1802*a39f7afdSSong Liu else 1803*a39f7afdSSong Liu r5l_add_no_space_stripe(log, sh); 18041e6d690bSSong Liu } else { 18051e6d690bSSong Liu ret = r5l_log_stripe(log, sh, pages, 0); 18061e6d690bSSong Liu if (ret) { 18071e6d690bSSong Liu spin_lock_irq(&log->io_list_lock); 18081e6d690bSSong Liu list_add_tail(&sh->log_list, &log->no_mem_stripes); 18091e6d690bSSong Liu spin_unlock_irq(&log->io_list_lock); 18101e6d690bSSong Liu } 18111e6d690bSSong Liu } 18121e6d690bSSong Liu 18131e6d690bSSong Liu mutex_unlock(&log->io_mutex); 18141e6d690bSSong Liu return 0; 18152ded3703SSong Liu } 18162ded3703SSong Liu 1817f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log) 1818f6bed0efSShaohua Li { 1819f6bed0efSShaohua Li struct md_rdev *rdev = log->rdev; 1820f6bed0efSShaohua Li struct page *page; 1821f6bed0efSShaohua Li struct r5l_meta_block *mb; 1822f6bed0efSShaohua Li sector_t cp = log->rdev->journal_tail; 1823f6bed0efSShaohua Li u32 stored_crc, expected_crc; 1824f6bed0efSShaohua Li bool create_super = false; 1825f6bed0efSShaohua Li int ret; 1826f6bed0efSShaohua Li 1827f6bed0efSShaohua Li /* Make sure it's valid */ 1828f6bed0efSShaohua Li if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 1829f6bed0efSShaohua Li cp = 0; 1830f6bed0efSShaohua Li page = alloc_page(GFP_KERNEL); 1831f6bed0efSShaohua Li if (!page) 1832f6bed0efSShaohua Li return -ENOMEM; 1833f6bed0efSShaohua Li 1834796a5cf0SMike Christie if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 1835f6bed0efSShaohua Li ret = -EIO; 1836f6bed0efSShaohua Li goto ioerr; 1837f6bed0efSShaohua Li } 1838f6bed0efSShaohua Li mb = page_address(page); 1839f6bed0efSShaohua Li 1840f6bed0efSShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 1841f6bed0efSShaohua Li mb->version != R5LOG_VERSION) { 1842f6bed0efSShaohua Li create_super = true; 1843f6bed0efSShaohua Li goto create; 1844f6bed0efSShaohua Li } 1845f6bed0efSShaohua Li stored_crc = le32_to_cpu(mb->checksum); 1846f6bed0efSShaohua Li mb->checksum = 0; 18475cb2fbd6SShaohua Li expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 1848f6bed0efSShaohua Li if (stored_crc != expected_crc) { 1849f6bed0efSShaohua Li create_super = true; 1850f6bed0efSShaohua Li goto create; 1851f6bed0efSShaohua Li } 1852f6bed0efSShaohua Li if (le64_to_cpu(mb->position) != cp) { 1853f6bed0efSShaohua Li create_super = true; 1854f6bed0efSShaohua Li goto create; 1855f6bed0efSShaohua Li } 1856f6bed0efSShaohua Li create: 1857f6bed0efSShaohua Li if (create_super) { 1858f6bed0efSShaohua Li log->last_cp_seq = prandom_u32(); 1859f6bed0efSShaohua Li cp = 0; 186056056c2eSZhengyuan Liu r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 1861f6bed0efSShaohua Li /* 1862f6bed0efSShaohua Li * Make sure super points to correct address. Log might have 1863f6bed0efSShaohua Li * data very soon. 
If the super doesn't have the
1864f6bed0efSShaohua Li 		 * correct log tail address, recovery can't find the log
1865f6bed0efSShaohua Li 		 */
1866f6bed0efSShaohua Li 		r5l_write_super(log, cp);
1867f6bed0efSShaohua Li 	} else
1868f6bed0efSShaohua Li 		log->last_cp_seq = le64_to_cpu(mb->seq);
1869f6bed0efSShaohua Li 
1870f6bed0efSShaohua Li 	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
18710576b1c6SShaohua Li 	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
18720576b1c6SShaohua Li 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
18730576b1c6SShaohua Li 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
1874f6bed0efSShaohua Li 	log->last_checkpoint = cp;
187528cd88e2SZhengyuan Liu 	log->next_checkpoint = cp;
1876*a39f7afdSSong Liu 	mutex_lock(&log->io_mutex);
1877*a39f7afdSSong Liu 	r5c_update_log_state(log);
1878*a39f7afdSSong Liu 	mutex_unlock(&log->io_mutex);
1879f6bed0efSShaohua Li 
1880f6bed0efSShaohua Li 	__free_page(page);
1881f6bed0efSShaohua Li 
1882f6bed0efSShaohua Li 	return r5l_recovery_log(log);
1883f6bed0efSShaohua Li ioerr:
1884f6bed0efSShaohua Li 	__free_page(page);
1885f6bed0efSShaohua Li 	return ret;
1886f6bed0efSShaohua Li }
1887f6bed0efSShaohua Li 
1888f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
1889f6bed0efSShaohua Li {
1890c888a8f9SJens Axboe 	struct request_queue *q = bdev_get_queue(rdev->bdev);
1891f6bed0efSShaohua Li 	struct r5l_log *log;
1892f6bed0efSShaohua Li 
1893f6bed0efSShaohua Li 	if (PAGE_SIZE != 4096)
1894f6bed0efSShaohua Li 		return -EINVAL;
1895c757ec95SSong Liu 
1896c757ec95SSong Liu 	/*
1897c757ec95SSong Liu 	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
1898c757ec95SSong Liu 	 * raid_disks r5l_payload_data_parity.
1899c757ec95SSong Liu 	 *
1900c757ec95SSong Liu 	 * Write journal and cache do not work for very big arrays
1901c757ec95SSong Liu 	 * (raid_disks > 203)
1902c757ec95SSong Liu 	 */
1903c757ec95SSong Liu 	if (sizeof(struct r5l_meta_block) +
1904c757ec95SSong Liu 	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
1905c757ec95SSong Liu 	     conf->raid_disks) > PAGE_SIZE) {
1906c757ec95SSong Liu 		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
1907c757ec95SSong Liu 		       mdname(conf->mddev), conf->raid_disks);
1908c757ec95SSong Liu 		return -EINVAL;
1909c757ec95SSong Liu 	}
1910c757ec95SSong Liu 
1911f6bed0efSShaohua Li 	log = kzalloc(sizeof(*log), GFP_KERNEL);
1912f6bed0efSShaohua Li 	if (!log)
1913f6bed0efSShaohua Li 		return -ENOMEM;
1914f6bed0efSShaohua Li 	log->rdev = rdev;
1915f6bed0efSShaohua Li 
1916c888a8f9SJens Axboe 	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
191756fef7c6SChristoph Hellwig 
19185cb2fbd6SShaohua Li 	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
1919f6bed0efSShaohua Li 				       sizeof(rdev->mddev->uuid));
1920f6bed0efSShaohua Li 
1921f6bed0efSShaohua Li 	mutex_init(&log->io_mutex);
1922f6bed0efSShaohua Li 
1923f6bed0efSShaohua Li 	spin_lock_init(&log->io_list_lock);
1924f6bed0efSShaohua Li 	INIT_LIST_HEAD(&log->running_ios);
19250576b1c6SShaohua Li 	INIT_LIST_HEAD(&log->io_end_ios);
1926a8c34f91SShaohua Li 	INIT_LIST_HEAD(&log->flushing_ios);
192704732f74SChristoph Hellwig 	INIT_LIST_HEAD(&log->finished_ios);
1928a8c34f91SShaohua Li 	bio_init(&log->flush_bio);
1929f6bed0efSShaohua Li 
1930f6bed0efSShaohua Li 	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
1931f6bed0efSShaohua Li 	if (!log->io_kc)
1932f6bed0efSShaohua Li 		goto io_kc;
1933f6bed0efSShaohua Li 
19345036c390SChristoph Hellwig 	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
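	/*
	 * A mempool (rather than plain slab allocations) keeps a small
	 * reserve of io_units, so log writeback can always make forward
	 * progress even when the system is under memory pressure.
	 */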
19355036c390SChristoph Hellwig if (!log->io_pool) 19365036c390SChristoph Hellwig goto io_pool; 19375036c390SChristoph Hellwig 1938c38d29b3SChristoph Hellwig log->bs = bioset_create(R5L_POOL_SIZE, 0); 1939c38d29b3SChristoph Hellwig if (!log->bs) 1940c38d29b3SChristoph Hellwig goto io_bs; 1941c38d29b3SChristoph Hellwig 1942e8deb638SChristoph Hellwig log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0); 1943e8deb638SChristoph Hellwig if (!log->meta_pool) 1944e8deb638SChristoph Hellwig goto out_mempool; 1945e8deb638SChristoph Hellwig 19460576b1c6SShaohua Li log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 19470576b1c6SShaohua Li log->rdev->mddev, "reclaim"); 19480576b1c6SShaohua Li if (!log->reclaim_thread) 19490576b1c6SShaohua Li goto reclaim_thread; 1950*a39f7afdSSong Liu log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 1951*a39f7afdSSong Liu 19520fd22b45SShaohua Li init_waitqueue_head(&log->iounit_wait); 19530576b1c6SShaohua Li 19545036c390SChristoph Hellwig INIT_LIST_HEAD(&log->no_mem_stripes); 19555036c390SChristoph Hellwig 1956f6bed0efSShaohua Li INIT_LIST_HEAD(&log->no_space_stripes); 1957f6bed0efSShaohua Li spin_lock_init(&log->no_space_stripes_lock); 1958f6bed0efSShaohua Li 19592ded3703SSong Liu log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 1960*a39f7afdSSong Liu INIT_LIST_HEAD(&log->stripe_in_journal_list); 1961*a39f7afdSSong Liu spin_lock_init(&log->stripe_in_journal_lock); 1962*a39f7afdSSong Liu atomic_set(&log->stripe_in_journal_count, 0); 19632ded3703SSong Liu 1964f6bed0efSShaohua Li if (r5l_load_log(log)) 1965f6bed0efSShaohua Li goto error; 1966f6bed0efSShaohua Li 1967f6b6ec5cSShaohua Li rcu_assign_pointer(conf->log, log); 1968a62ab49eSShaohua Li set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 1969f6bed0efSShaohua Li return 0; 1970e8deb638SChristoph Hellwig 1971f6bed0efSShaohua Li error: 19720576b1c6SShaohua Li md_unregister_thread(&log->reclaim_thread); 19730576b1c6SShaohua Li reclaim_thread: 1974e8deb638SChristoph Hellwig mempool_destroy(log->meta_pool); 1975e8deb638SChristoph Hellwig out_mempool: 1976c38d29b3SChristoph Hellwig bioset_free(log->bs); 1977c38d29b3SChristoph Hellwig io_bs: 19785036c390SChristoph Hellwig mempool_destroy(log->io_pool); 19795036c390SChristoph Hellwig io_pool: 1980f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 1981f6bed0efSShaohua Li io_kc: 1982f6bed0efSShaohua Li kfree(log); 1983f6bed0efSShaohua Li return -EINVAL; 1984f6bed0efSShaohua Li } 1985f6bed0efSShaohua Li 1986f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log) 1987f6bed0efSShaohua Li { 19880576b1c6SShaohua Li md_unregister_thread(&log->reclaim_thread); 1989e8deb638SChristoph Hellwig mempool_destroy(log->meta_pool); 1990c38d29b3SChristoph Hellwig bioset_free(log->bs); 19915036c390SChristoph Hellwig mempool_destroy(log->io_pool); 1992f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 1993f6bed0efSShaohua Li kfree(log); 1994f6bed0efSShaohua Li } 1995
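/*
 * Worked example of the r5c_do_reclaim() thresholds above (illustrative
 * only; min_nr_stripes is configuration dependent). Assuming
 * min_nr_stripes == 256:
 *
 *   total_cached > 192 (3/4), or some inactive list is empty:
 *	high pressure, flush R5C_RECLAIM_STRIPE_GROUP stripes
 *   total_cached > 128 (1/2), or more than R5C_FULL_STRIPE_FLUSH_BATCH
 *   full stripes are cached:
 *	moderate pressure, flush all full stripes (num == 0)
 *   otherwise:
 *	nothing to flush (stripes_to_flush == -1)
 */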